RemoveHTMLReader.java Source code

Introduction

Here is the source code for RemoveHTMLReader.java
Source

/*
 * Copyright (c) 2004 David Flanagan.  All rights reserved.
 * This code is from the book Java Examples in a Nutshell, 3nd Edition.
 * It is provided AS-IS, WITHOUT ANY WARRANTY either expressed or implied.
 * You may study, use, and modify it for any non-commercial purpose,
 * including teaching and use in open-source projects.
 * You may distribute it non-commercially as long as you retain this notice.
 * For a commercial use license, or to purchase the book, 
 * please visit http://www.davidflanagan.com/javaexamples3.
 */

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;

/**
 * A simple FilterReader that strips HTML tags (or anything between pairs of
 * angle brackets) out of a stream of characters.
 */
public class RemoveHTMLReader extends FilterReader {
    /**
     * Wraps the given reader; characters read through this object have
     * everything between '<' and '>' (inclusive) removed.
     *
     * NOTE: skip(), mark() and reset() are inherited from FilterReader and
     * are not overridden here, so they operate on the wrapped reader without
     * updating the tag-tracking state below. Stick to the read() methods.
     *
     * @param in the reader that supplies the unfiltered characters
     */
    public RemoveHTMLReader(Reader in) {
        super(in);
    }

    // True while the most recent '<' has not yet been matched by a '>'.
    // It is a field (not a local) because a tag may start in one read()
    // call and end in a later one.
    boolean intag = false;

    /**
     * Reads characters from the wrapped reader into buf and strips out the
     * HTML tags, compacting the surviving characters to the front of the
     * requested region. (in is a protected field of the superclass.)
     *
     * @param buf  destination buffer
     * @param from index in buf at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of non-tag characters stored starting at buf[from],
     *         or -1 if the end of the stream was reached; for a zero-length
     *         request, whatever the wrapped reader returns
     * @throws IOException if the wrapped reader throws it
     */
    public int read(char[] buf, int from, int len) throws IOException {
        // A zero-length request can never yield a character. Without this
        // guard the loop below would treat the wrapped reader's 0 result as
        // "everything was stripped, try again" and spin forever. Delegating
        // keeps the wrapped reader's own argument and state checks.
        if (len == 0) {
            return in.read(buf, from, len);
        }

        int kept = 0; // how many non-tag characters we can hand back
        // Loop, because we might read a bunch of characters, then strip them
        // all out, leaving us with zero characters to return.
        while (kept == 0) {
            int got = in.read(buf, from, len);
            if (got == -1) {
                return -1; // end of stream
            }

            // Walk the freshly read characters. Characters outside tags are
            // copied down over the slots vacated by stripped ones.
            int dest = from; // next slot for a surviving character
            int end = from + got;
            for (int i = from; i < end; i++) {
                char c = buf[i];
                if (intag) {
                    if (c == '>') {
                        intag = false; // tag ends; the '>' itself is dropped
                    }
                } else if (c == '<') {
                    intag = true; // tag starts; the '<' itself is dropped
                } else {
                    buf[dest++] = c;
                }
            }
            kept = dest - from;
        }
        return kept;
    }

    /**
     * Reads a single filtered character, implemented in terms of the
     * array-based read() above.
     *
     * @return the next non-tag character as an int, or -1 at end of stream
     * @throws IOException if the wrapped reader throws it
     */
    public int read() throws IOException {
        char[] one = new char[1];
        if (read(one, 0, 1) == -1) {
            return -1;
        }
        return one[0];
    }

    /**
     * The test program: read a text file, strip HTML, print to console.
     *
     * @param args exactly one element: the name of the file to read
     */
    public static void main(String[] args) {
        BufferedReader in = null;
        try {
            if (args.length != 1) {
                throw new IllegalArgumentException("Wrong number of args");
            }
            // Create a stream to read from the file and strip tags from it
            in = new BufferedReader(new RemoveHTMLReader(new FileReader(args[0])));
            // Read line by line, printing lines to the console
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java RemoveHTMLReader" + " <filename>");
        } finally {
            // Close even when reading or printing failed part-way through,
            // so the underlying file handle is not leaked.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    System.err.println(e);
                }
            }
        }
    }
}