com.liyo.html.HTMLParser.java Source code

Introduction

Here is the source code for com.liyo.html.HTMLParser.java
Source

// HTMLParser Library v0.7 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// For any questions or suggestions, you can write to me at :
// Email :somik@kizna.com
// 
// Postal Address : 
// Somik Raha
// R&D Team
// Kizna Corporation
// 2-1-17-6F, Sakamoto Bldg., Moto Azabu, Minato ku, Tokyo, 106 0046, JAPAN

package com.liyo.html;
//////////////////
// Java Imports //
//////////////////

import org.apache.commons.lang.StringUtils;

import java.io.*;
import java.net.*;
import java.util.*;

/**
 * This is the class that the user will use, either to get an iterator into
 * the html page or to directly parse the page and print the results
 */
public class HTMLParser {
    /**
     * The URL or filename to be parsed.
     */
    protected String resourceUrl;
    /**
     * The html reader associated with this parser
     */
    protected HTMLReader reader;
    /**
     * The last read HTML node.
     */
    protected HTMLNode node;
    /**
     * Keeps track of whether the first reading has been performed.
     */
    protected boolean readFlag = false;

    /**
     * Reader wrapping the input stream handed to the stream-based constructor.
     * It is not assigned when the parser is created from a resource location.
     */
    private InputStreamReader inputStreamReader;

    /**
     * Name of the charset used to decode content read from a URL or from a
     * caller-supplied input stream. Defaults to "SJIS"; local files are read
     * through a FileReader and do not use this value.
     */
    private String charSetName = "SJIS";

    /**
     * Creates a HTMLParser object with the location of the resource (URL or file)
     * and immediately tries to open it.
     * <p>
     * If the resource cannot be opened (file not found, malformed URL or another
     * I/O error) a message is printed to System.err and no reader is created; in
     * that case {@link #elements()} yields no nodes.
     *
     * @param resourceUrl Either the URL or the filename (autodetects)
     */
    public HTMLParser(String resourceUrl) {
        this.resourceUrl = resourceUrl;
        openConnection();
    }

    /**
     * Creates a HTMLParser that reads from an already opened stream.
     * <p>
     * An I/O failure while setting up the reader (including an unsupported charset
     * name) is reported on System.err and leaves the parser without a reader.
     *
     * @param inputStream the stream delivering the HTML content
     * @param url         the location the content came from; it is normalised with
     *                    {@link #removeEscapeCharacters(String)} and
     *                    {@link #checkEnding(String)} before being handed to the reader
     * @param charSet     name of the charset used to decode the stream; when blank,
     *                    the current default ("SJIS") is kept
     */
    public HTMLParser(InputStream inputStream, String url, String charSet) {
        resourceUrl = url;
        if (StringUtils.isNotBlank(charSet)) {
            charSetName = charSet;
        }
        try {
            // The reader is created first so that an unsupported charset is detected
            // before the location is rewritten.
            inputStreamReader = new InputStreamReader(inputStream, charSetName);
            resourceUrl = checkEnding(removeEscapeCharacters(resourceUrl));
            reader = new HTMLReader(inputStreamReader, resourceUrl);
        } catch (IOException ioFailure) {
            System.err.println("I/O Exception occured while reading " + resourceUrl);
        }
    }

    /**
     * Opens the connection with the resource to begin reading, by creating a HTML reader
     * object.
     * <p>
     * A location containing "http" or "www." is treated as a web address: it is
     * normalised, given an "http://" scheme if it has none, and read through a
     * URLConnection using the configured charset. Anything else is opened as a
     * local file. Failures are reported on System.err and leave {@code reader}
     * unset; no exception is propagated for them.
     */
    private void openConnection() {
        try {
            if (resourceUrl.indexOf("http") != -1 || resourceUrl.indexOf("www.") != -1) {
                // Its a web address
                resourceUrl = removeEscapeCharacters(resourceUrl);
                resourceUrl = checkEnding(resourceUrl);
                if (resourceUrl.indexOf("://") == -1) {
                    // java.net.URL rejects a spec without a protocol, so a bare
                    // "www.example.com" would otherwise always be reported as malformed.
                    resourceUrl = "http://" + resourceUrl;
                }
                URL url = new URL(resourceUrl);
                URLConnection uc = url.openConnection();
                reader = new HTMLReader(new InputStreamReader(uc.getInputStream(), charSetName), resourceUrl);
            } else {
                reader = new HTMLReader(new FileReader(resourceUrl), resourceUrl);
            }
        } catch (FileNotFoundException e) {
            System.err.println("Error! File " + resourceUrl + " not found!");
        } catch (MalformedURLException e) {
            System.err.println("Error! URL " + resourceUrl + " Malformed!");
        } catch (IOException e) {
            System.err.println("I/O Exception occured while reading " + resourceUrl);
        }
    }

    /**
     * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
     * string/link/image
     * <p>
     * The enumeration reads from this parser's reader with a one-node look-ahead:
     * {@code hasMoreElements()} may be called any number of times without skipping
     * nodes, and every {@code nextElement()} call hands out the next unread node,
     * whether or not {@code hasMoreElements()} was called first.
     * <p>
     * When there is no reader, the input is exhausted, or an I/O error occurs (which
     * is reported on System.err), {@code hasMoreElements()} returns false and
     * {@code nextElement()} returns null rather than throwing.
     *
     * @return an enumeration over the nodes that have not been read yet
     */
    public Enumeration elements() {
        return new Enumeration() {
            /** Node fetched ahead of time but not yet handed out. */
            private HTMLNode pending;
            /** Set once the reader returned null or failed; no further reads are attempted. */
            private boolean finished;

            /** Reads the next node into {@code pending} unless one is already waiting. */
            private void fetch() {
                if (pending != null || finished || reader == null) {
                    return;
                }
                try {
                    pending = reader.readElement();
                    // Keep the parser's protected bookkeeping fields up to date.
                    node = pending;
                    readFlag = true;
                } catch (IOException e) {
                    System.err.println("I/O Exception occured while reading " + resourceUrl);
                }
                if (pending == null) {
                    finished = true;
                }
            }

            public boolean hasMoreElements() {
                fetch();
                return pending != null;
            }

            public Object nextElement() {
                fetch();
                HTMLNode result = pending;
                // Clear the look-ahead so the same node is never returned twice.
                pending = null;
                return result;
            }
        };
    }

    /**
     * Walks over all nodes of the resource and prints them.
     * <p>
     * Without a filter every node is printed. With a filter, only tags whose
     * scanner reports exactly that filter string are printed; all other nodes
     * are skipped silently. A null node is reported on System.out.
     *
     * @param filter the scanner filter to match, or null to print everything
     */
    public void parse(String filter) {
        Enumeration nodes = elements();
        while (nodes.hasMoreElements()) {
            HTMLNode current = (HTMLNode) nodes.nextElement();
            if (current == null) {
                System.out.println("Node is null");
                continue;
            }
            if (filter == null) {
                current.print();
                continue;
            }
            // A filter was given: only tags that carry a scanner can match it.
            if (!(current instanceof HTMLTag)) {
                continue;
            }
            HTMLTagScanner scanner = ((HTMLTag) current).getThisScanner();
            if (scanner == null) {
                continue;
            }
            String tagFilter = scanner.getFilter();
            if (tagFilter != null && tagFilter.equals(filter)) {
                current.print();
            }
        }
    }

    /**
     * Appends "/index.html" to a link that neither mentions an HTML page nor ends
     * in a slash.
     * <p>
     * The page check is a substring search for "htm" (which also covers "html")
     * anywhere in the link, not a suffix test. A null or empty link is returned
     * unchanged.
     *
     * @param link the link to normalise; may be null or empty
     * @return the link, with "/index.html" appended when the rule above applies
     */
    public static String checkEnding(String link) {
        if (link == null || link.length() == 0) {
            // Nothing to inspect; also avoids charAt(-1) on an empty string.
            return link;
        }
        // "htm" is a substring of "html", so one search covers both spellings.
        boolean mentionsHtmlPage = link.indexOf("htm") != -1;
        if (!mentionsHtmlPage && link.charAt(link.length() - 1) != '/') {
            return link + "/index.html";
        }
        return link;
    }

    /**
     * Removes every occurrence of the character sequence "#38;" from the link, so
     * that for example "a&amp;#38;b" becomes "a&amp;b".
     * <p>
     * Only complete "#38;" sequences are removed. Any other text, including a lone
     * '#' or an incomplete sequence such as "#3", is kept exactly as it is.
     *
     * @param link the link to clean; may be null
     * @return the link without "#38;" sequences, or null if {@code link} is null
     */
    public static String removeEscapeCharacters(String link) {
        if (link == null) {
            return null;
        }
        // Literal (non-regex) replacement, scanning left to right in a single pass.
        return link.replace("#38;", "");
    }

    /**
     * The main program, which can be executed from the command line.
     * <p>
     * Prints the usage text and exits with status -1 when no argument or "-help"
     * is given. Otherwise the first argument is the resource to parse, and the
     * second argument is used as the filter only when exactly two arguments are
     * present.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        // Instantiated only for their side effects; presumably the scanners register
        // themselves for the "-l" / "-i" filters (confirm in the scanner classes).
        new HTMLLinkScanner("-l");
        new HTMLImageScanner("-i");

        boolean wantsHelp = args.length < 1 || args[0].equals("-help");
        if (wantsHelp) {
            System.out.println("java -jar Parse.jar <resourceLocn/website> -l");
            System.out.println(
                    "   <resourceLocn> the name of the file to be parsed (with complete path if not in current directory)");
            System.out.println("   -l Show only the link tags extracted from the document");
            System.out.println("   -i Show only the image tags extracted from the document");
            System.out.println("   -help This screen");
            System.exit(-1);
        }

        String location = args[0];
        boolean looksLikeWebsite = location.indexOf("http") != -1 || location.indexOf("www.") != -1;
        System.out.println(looksLikeWebsite
                ? "Parsing website " + location
                : "Parsing file " + location + "...");

        String filter = (args.length == 2) ? args[1] : null;
        new HTMLParser(location).parse(filter);
    }

}