com.cyberway.issue.crawler.extractor.JerichoExtractorHTML.java Source code

Introduction

Here is the source code for com.cyberway.issue.crawler.extractor.JerichoExtractorHTML.java
Source

/* JerichoExtractorHTML
 * 
 * Copyright (C) 2006 Olaf Freyer
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * $Id: JerichoExtractorHTML.java 5757 2008-02-06 07:44:20Z Gojomo $
 */
package com.cyberway.issue.crawler.extractor;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import com.cyberway.issue.crawler.datamodel.CoreAttributeConstants;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.datamodel.RobotsHonoringPolicy;
import com.cyberway.issue.net.UURI;
import com.cyberway.issue.net.UURIFactory;
import com.cyberway.issue.util.DevUtils;
import com.cyberway.issue.util.TextUtils;

import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Attributes;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.FormControl;
import au.id.jericho.lib.html.FormControlType;
import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.FormFields;
import au.id.jericho.lib.html.HTMLElementName;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTagType;

/**
 * Improved link-extraction from an HTML content-body using jericho-html parser.
 * This extractor extends ExtractorHTML and mimics its workflow - but has some
 * substantial differences when it comes to internal implementation. Instead
 * of heavily relying upon java regular expressions it uses a real html parser
 * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
 * Using this parser it can better handle broken html (i.e. missing quotes)
 * and also offer improved extraction of HTML form URLs (not only extract
 * the action of a form, but also its default values).
 * Unfortunately this parser also has one major drawback - it has to read the
 * whole document into memory for parsing, thus has an inherent OOME risk.
 * This OOME risk can be reduced/eliminated by limiting the size of documents
 * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
 * Also note that this extractor seems to have a lower overall memory 
 * consumption compared to ExtractorHTML. (still to be confirmed on a larger 
 * scale crawl) 
 * 
 * @author Olaf Freyer
 * @version $Date: 2008-02-06 07:44:20 +0000 (Wed, 06 Feb 2008) $ $Revision: 5757 $
 */
public class JerichoExtractorHTML extends ExtractorHTML implements CoreAttributeConstants {

    private static final long serialVersionUID = 1684681316546343615L;

    /**
     * Logger shared by all instances. It is deliberately static: this class
     * declares a serialVersionUID, and java.util.logging.Logger does not
     * implement Serializable, so keeping the logger as a non-transient
     * instance field would make serialization of an instance fail.
     * The logger is named after this class rather than the runtime class.
     */
    private static final Logger logger = Logger.getLogger(JerichoExtractorHTML.class.getName());

    /**
     * Number of forms for which processForm() went on to build a query URL
     * (forms skipped by the form-related settings are not counted). Shown
     * in report().
     */
    protected long numberOfFormsProcessed = 0;

    /**
     * Creates the extractor under the given name, delegating to the
     * two-argument constructor with this module's standard description.
     *
     * @param name
     *            name to give this extractor
     */
    public JerichoExtractorHTML(String name) {
        // The description is runtime text; it is only re-chunked here, the
        // resulting string is unchanged.
        this(name,
                "Jericho-HTML extractor. "
                + "Extracts links from HTML documents using Jericho HTML Parser. "
                + "Offers same basic functionality as ExtractorHTML but better handles "
                + "broken HTML and extraction of default values from HTML forms. "
                + "A word of warning: the used parser, the Jericho HTML Parser, reads "
                + "the whole document into memory for parsing - thus this extractor "
                + "has an inherent OOME risk. "
                + "This OOME risk can be reduced/eleminated by limiting the size of "
                + "documents to be parsed (i.e. using "
                + "NotExceedsDocumentLengthTresholdDecideRule). ");
    }

    /**
     * Creates the extractor with an explicit name and description; both are
     * passed unchanged to the superclass constructor.
     *
     * @param name
     *            name to give this extractor
     * @param description
     *            descriptive text for this extractor
     */
    public JerichoExtractorHTML(String name, String description) {
        super(name, description);
    }

    /**
     * Collects every attribute whose key starts with "on".
     *
     * @param attributes
     *            attributes to scan
     * @return the matching attributes in iteration order; empty when none
     *         match
     */
    private static List<Attribute> findOnAttributes(Attributes attributes) {
        final List<Attribute> handlers = new LinkedList<Attribute>();
        final Iterator candidates = attributes.iterator();
        while (candidates.hasNext()) {
            final Attribute candidate = (Attribute) candidates.next();
            final boolean isOnAttribute = candidate.getKey().startsWith("on");
            if (isOnAttribute) {
                handlers.add(candidate);
            }
        }
        return handlers;
    }

    /**
     * Extracts links from the attributes of a single element.
     * <p>
     * The following attributes are examined, each only when present with a
     * non-null value:
     * <ul>
     * <li>href - passed to processEmbed for "link" elements, to processLink
     * otherwise; on a "base" element the value is additionally set as the
     * base URI of <code>curi</code></li>
     * <li>action - passed to processLink unless form action URLs are
     * ignored</li>
     * <li>attributes whose key starts with "on" - value passed to
     * processScriptCode</li>
     * <li>src, lowsrc, background, cite, longdesc, usemap, profile, datasrc
     * - passed to processEmbed; frame/iframe elements use the navlink hop
     * type unless frames are treated as embeds</li>
     * <li>codebase - passed to processEmbed and remembered as the base for
     * the resource attributes below</li>
     * <li>classid, data, archive (whitespace separated), code - collected
     * as resources, resolved against codebase when one was given, and
     * passed to processEmbed</li>
     * <li>value - passed to processLink when value-attribute extraction is
     * enabled and the value matches LIKELY_URI_PATH</li>
     * <li>style - handed to ExtractorCSS.processStyleCode</li>
     * </ul>
     * Every URI-bearing attribute is handled independently, so an element
     * carrying several of them (for example both src and longdesc, or both
     * classid and data) yields a candidate for each one.
     *
     * @param curi
     *            CrawlURI being processed
     * @param element
     *            element the attributes belong to
     * @param attributes
     *            attributes of <code>element</code>
     */
    protected void processGeneralTag(CrawlURI curi, Element element, Attributes attributes) {
        Attribute attr;
        String attrValue;
        final String elementName = element.getName();

        // Just in case it's an OBJECT or APPLET tag
        String codebase = null;
        List<String> resources = null;

        final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS))
                .booleanValue();

        final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(curi, ATTR_IGNORE_FORM_ACTION_URLS))
                .booleanValue();

        final boolean overlyEagerLinkDetection = ((Boolean) getUncheckedAttribute(curi, EXTRACT_VALUE_ATTRIBUTES))
                .booleanValue();

        // HREF
        if (((attr = attributes.get("href")) != null) && ((attrValue = attr.getValue()) != null)) {
            CharSequence context = Link.elementContext(elementName, attr.getKey());
            if ("link".equals(elementName)) {
                // <LINK> elements treated as embeds (css, ico, etc)
                processEmbed(curi, attrValue, context);
            } else {
                // other HREFs treated as links
                processLink(curi, attrValue, context);
            }
            if ("base".equals(elementName)) {
                try {
                    curi.setBaseURI(attrValue);
                } catch (URIException e) {
                    if (getController() != null) {
                        // Controller can be null: e.g. when running
                        // ExtractorTool.
                        getController().logUriError(e, curi.getUURI(), attrValue);
                    } else {
                        logger.info("Failed set base uri: " + curi + ", " + attrValue + ": " + e.getMessage());
                    }
                }
            }
        }

        // ACTION
        if (((attr = attributes.get("action")) != null) && ((attrValue = attr.getValue()) != null)) {
            if (!ignoreFormActions) {
                CharSequence context = Link.elementContext(elementName, attr.getKey());
                processLink(curi, attrValue, context);
            }
        }

        // ON_
        for (Attribute onAttr : findOnAttributes(attributes)) {
            CharSequence valueSegment = onAttr.getValueSegment();
            if (valueSegment != null) {
                processScriptCode(curi, valueSegment);
            }
        }

        // SRC etc.
        // Each attribute is looked up on its own: a single short-circuited
        // condition would stop at the first attribute present and drop any
        // other URI attribute of the same element.
        final String[] embedAttrNames = { "src", "lowsrc", "background", "cite", "longdesc", "usemap", "profile",
                "datasrc" };
        // The hop type depends on the element only, not on the attribute.
        final char embedHopType;
        if (!framesAsEmbeds && ("frame".equals(elementName) || "iframe".equals(elementName))) {
            embedHopType = Link.NAVLINK_HOP;
        } else {
            embedHopType = Link.EMBED_HOP;
        }
        for (String embedAttrName : embedAttrNames) {
            attr = attributes.get(embedAttrName);
            if (attr == null) {
                continue;
            }
            attrValue = attr.getValue();
            if (attrValue == null) {
                continue;
            }
            CharSequence context = Link.elementContext(elementName, attr.getKey());
            processEmbed(curi, attrValue, context, embedHopType);
        }

        // CODEBASE
        if (((attr = attributes.get("codebase")) != null) && ((attrValue = attr.getValue()) != null)) {
            codebase = StringEscapeUtils.unescapeHtml(attrValue);
            CharSequence context = Link.elementContext(elementName, attr.getKey());
            processEmbed(curi, codebase, context);
        }

        // CLASSID DATA
        // Looked up separately so that "data" is not lost when "classid"
        // is present as well.
        final String[] objectAttrNames = { "classid", "data" };
        for (String objectAttrName : objectAttrNames) {
            attr = attributes.get(objectAttrName);
            if (attr == null) {
                continue;
            }
            attrValue = attr.getValue();
            if (attrValue == null) {
                continue;
            }
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(attrValue);
        }

        // ARCHIVE
        if (((attr = attributes.get("archive")) != null) && ((attrValue = attr.getValue()) != null)) {
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            String[] multi = TextUtils.split(WHITESPACE, attrValue);
            for (int i = 0; i < multi.length; i++) {
                resources.add(multi[i]);
            }
        }

        // CODE
        if (((attr = attributes.get("code")) != null) && ((attrValue = attr.getValue()) != null)) {
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
                resources.add(attrValue + CLASSEXT);
            } else {
                resources.add(attrValue);
            }
        }

        // VALUE
        if (((attr = attributes.get("value")) != null) && ((attrValue = attr.getValue()) != null)) {
            // Test the setting first so the pattern match is only run when
            // value-attribute extraction is actually enabled.
            if (overlyEagerLinkDetection && TextUtils.matches(LIKELY_URI_PATH, attrValue)) {
                CharSequence context = Link.elementContext(elementName, attr.getKey());
                processLink(curi, attrValue, context);
            }
        }

        // STYLE
        if (((attr = attributes.get("style")) != null) && ((attrValue = attr.getValue()) != null)) {
            // STYLE inline attribute
            // then, parse for URIs
            this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, attrValue, getController());
        }

        // handle codebase/resources
        if (resources == null) {
            return;
        }

        UURI codebaseURI = null;
        // Kept outside the loop so the failure handlers below can report
        // the resource that was being handled.
        String res = null;
        try {
            if (codebase != null) {
                // TODO: Pass in the charset.
                codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
            }
            for (String resource : resources) {
                res = resource;
                res = StringEscapeUtils.unescapeHtml(res);
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                // TODO: include attribute too
                processEmbed(curi, res, element);
            }
        } catch (URIException e) {
            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
        } catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING,
                    "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(),
                    e);
        }
    }

    /**
     * Handles a META element: records a robots directive and extracts the
     * target of a refresh directive.
     * <p>
     * The "name" and "http-equiv" values are compared case-insensitively,
     * so that markup such as <code>&lt;META HTTP-EQUIV="Refresh" ...&gt;</code>
     * or <code>&lt;META NAME="ROBOTS" ...&gt;</code> is recognised.
     * <p>
     * For a robots directive the content is stored on <code>curi</code>
     * under A_META_ROBOTS. For a refresh directive the text after the first
     * '=' of the content is added to <code>curi</code> as a link relative
     * to the base; content without any '=' carries no target and adds
     * nothing.
     *
     * @param curi
     *            CrawlURI being processed
     * @param element
     *            the META element
     * @return true if the robots content contains "nofollow" or "none" and
     *         the robots honoring policy is absent or neither IGNORE nor
     *         CUSTOM, meaning extraction should stop; false otherwise
     */
    protected boolean processMeta(CrawlURI curi, Element element) {
        String name = element.getAttributeValue("name");
        String httpEquiv = element.getAttributeValue("http-equiv");
        String content = element.getAttributeValue("content");

        if (content == null) {
            // Both directives handled below need a content value.
            return false;
        }

        if ("robots".equalsIgnoreCase(name)) {
            curi.putString(A_META_ROBOTS, content);
            RobotsHonoringPolicy policy = getSettingsHandler().getOrder().getRobotsHonoringPolicy();
            String contentLower = content.toLowerCase();
            boolean honorsMetaRobots = policy == null || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
                    && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM));
            if (honorsMetaRobots
                    && (contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0)) {
                // if 'nofollow' or 'none' is specified and the
                // honoring policy is not IGNORE or CUSTOM, end html extraction
                logger.fine("HTML extraction skipped due to robots meta-tag " + "for: " + curi.toString());
                return true;
            }
        }

        if ("refresh".equalsIgnoreCase(httpEquiv)) {
            int separator = content.indexOf('=');
            if (separator >= 0) {
                // Whitespace around the target (e.g. "0; URL= page.html")
                // is not part of the URI.
                String refreshUri = content.substring(separator + 1).trim();
                try {
                    curi.createAndAddLinkRelativeToBase(refreshUri, "meta", Link.REFER_HOP);
                } catch (URIException e) {
                    if (getController() != null) {
                        getController().logUriError(e, curi.getUURI(), refreshUri);
                    } else {
                        logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + element.toString()
                                + ", " + refreshUri + ": " + e);
                    }
                }
            }
            // Without '=' the content is only a delay (e.g. "5"): nothing
            // to extract.
        }
        return false;
    }

    /**
     * Handles a SCRIPT element: its attributes are processed like those of
     * any other tag, then its content is passed to processScriptCode.
     *
     * @param curi
     *            CrawlURI being processed
     * @param element
     *            the SCRIPT element
     */
    protected void processScript(CrawlURI curi, Element element) {
        // first, get attributes of script-open tag
        // as per any other tag
        processGeneralTag(curi, element, element.getAttributes());

        // then, apply best-effort string-analysis heuristics
        // against any code present (false positives are OK)
        processScriptCode(curi, element.getContent());

    }

    /**
     * Handles a STYLE element: its attributes are processed like those of
     * any other tag, then its content is handed to
     * ExtractorCSS.processStyleCode and the returned count is added to
     * numberOfLinksExtracted.
     *
     * @param curi
     *            CrawlURI being processed
     * @param element
     *            the STYLE element
     */
    protected void processStyle(CrawlURI curi, Element element) {
        // First, get attributes of style-open tag as per any other tag.
        processGeneralTag(curi, element, element.getAttributes());

        // then, parse the element content for URIs
        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, element.getContent(), getController());
    }

    /**
     * Handles a FORM element by building one URL from the form's action and
     * the values of its controls, and passing that URL to processLink.
     * <p>
     * Nothing is done when form action URLs are ignored, or when only GET
     * forms are to be extracted and the form's method is not GET (a missing
     * or empty method counts as GET). Otherwise numberOfFormsProcessed is
     * incremented and every control contributes "&amp;name=value" for each
     * of its values, or "&amp;name=" when it has none. The leading '&amp;'
     * becomes '?' unless the action already contains a '?'.
     *
     * @param curi
     *            CrawlURI being processed
     * @param element
     *            the FORM element
     */
    protected void processForm(CrawlURI curi, Element element) {
        final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(curi, ATTR_IGNORE_FORM_ACTION_URLS))
                .booleanValue();
        if (ignoreFormActions) {
            return;
        }

        // method-sensitive extraction
        String method = StringUtils.defaultIfEmpty(element.getAttributeValue("method"), "GET");
        if (((Boolean) getUncheckedAttribute(curi, ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()
                && !"GET".equalsIgnoreCase(method)) {
            return;
        }

        numberOfFormsProcessed++;

        String action = element.getAttributeValue("action");
        String name = element.getAttributeValue("name");

        // Accumulated in a builder: the query grows inside nested loops.
        StringBuilder query = new StringBuilder();

        // get all form fields
        FormFields formFields = element.findFormFields();
        for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
            // for each form field
            FormField formField = (FormField) fieldsIter.next();

            // for each form control
            for (Iterator controlIter = formField.getFormControls().iterator(); controlIter.hasNext();) {
                FormControl formControl = (FormControl) controlIter.next();

                // NOTE(review): name and values are appended exactly as the
                // parser returns them; no URL-encoding is applied here, so
                // reserved characters such as '&' or '=' inside a value end
                // up unescaped in the query - confirm whether that is
                // intended.
                String controlName = formControl.getName();

                // retrieve list of values - submit needs special handling
                Collection controlValues;
                if (formControl.getFormControlType() == FormControlType.SUBMIT) {
                    controlValues = formControl.getPredefinedValues();
                } else {
                    controlValues = formControl.getValues();
                }

                if (controlValues.size() > 0) {
                    // for each value set
                    for (Iterator valueIter = controlValues.iterator(); valueIter.hasNext();) {
                        String value = (String) valueIter.next();
                        query.append('&').append(controlName).append('=').append(value);
                    }
                } else {
                    query.append('&').append(controlName).append('=');
                }
            }
        }

        // clean up url: every appended piece starts with '&', so a non-empty
        // query always has its first '&' at index 0
        if (query.length() > 0 && (action == null || !action.contains("?"))) {
            query.setCharAt(0, '?');
        }
        String queryURL = (action == null) ? query.toString() : action + query.toString();

        CharSequence context = Link.elementContext(element.getName(), "name=" + name);
        processLink(curi, queryURL, context);
    }

    /**
     * Runs the extractor over a document. Package visible to ease testing.
     * <p>
     * The character sequence is parsed and each element with a normal start
     * tag is dispatched by element name: META, SCRIPT, STYLE and FORM go to
     * their dedicated handlers; any other element goes to processGeneralTag
     * when it has at least one attribute. Processing stops as soon as
     * processMeta returns true.
     *
     * @param curi
     *            CrawlURI we're processing.
     * @param cs
     *            Sequence from underlying ReplayCharSequence.
     */
    void extract(CrawlURI curi, CharSequence cs) {
        final Source document = new Source(cs);
        final Iterator startTags = document.findAllElements(StartTagType.NORMAL).iterator();

        while (startTags.hasNext()) {
            final Element current = (Element) startTags.next();
            final String tag = current.getName();

            if (tag.equals(HTMLElementName.META)) {
                if (processMeta(curi, current)) {
                    // meta tag included NOFOLLOW; abort processing
                    break;
                }
                continue;
            }
            if (tag.equals(HTMLElementName.SCRIPT)) {
                processScript(curi, current);
                continue;
            }
            if (tag.equals(HTMLElementName.STYLE)) {
                processStyle(curi, current);
                continue;
            }
            if (tag.equals(HTMLElementName.FORM)) {
                processForm(curi, current);
                continue;
            }

            // any other element: only worth a look if it has attributes
            final Attributes tagAttributes = current.getAttributes();
            if (!tagAttributes.isEmpty()) {
                processGeneralTag(curi, current, tagAttributes);
            }
        }
    }

    /*
     * (non-Javadoc)
     * 
     * Builds the textual report for this processor: its name and function
     * followed by the handled-URI, processed-form and extracted-link
     * counters.
     * 
     * @see com.cyberway.issue.crawler.framework.Processor#report()
     */
    public String report() {
        final StringBuilder text = new StringBuilder();
        text.append("Processor: com.cyberway.issue.crawler.extractor.JerichoExtractorHTML\n")
                .append("  Function:          Link extraction on HTML documents\n")
                .append("  CrawlURIs handled: ").append(this.numberOfCURIsHandled).append("\n")
                .append("  Forms processed:   ").append(this.numberOfFormsProcessed).append("\n")
                .append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return text.toString();
    }
}