com.nanocrawler.contentparser.HtmlContentParser.java Source code

Introduction

Here is the source code for com.nanocrawler.contentparser.HtmlContentParser.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Based on crawler4j project by Yasser Ganjisaffar
 */
package com.nanocrawler.contentparser;

import com.nanocrawler.data.Content;
import com.nanocrawler.data.HtmlContent;
import com.nanocrawler.data.Page;
import com.nanocrawler.urlmanipulation.URLCanonicalizer;
import com.nanocrawler.urlmanipulation.WebURL;
import com.nanocrawler.util.CrawlConfig;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import nu.validator.htmlparser.common.DoctypeExpectation;
import nu.validator.htmlparser.common.Heuristics;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.dom.HtmlDocumentBuilder;
import org.apache.commons.io.IOUtils;
import org.mozilla.universalchardet.UniversalDetector;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

// Parses HTML content (default content parser) using Validator.nu HTML parser
public class HtmlContentParser implements ContentParser {

    private final String BODY_ELEMENT = "body";
    private final String TITLE_ELEMENT = "title";
    private final String BASE_ELEMENT = "base";
    private final String LINK_ELEMENT = "link";
    private final String A_ELEMENT = "a";
    private final String IFRAME_ELEMENT = "iframe";
    private final String FRAME_ELEMENT = "frame";
    private final String EMBED_ELEMENT = "embed";
    private final String META_ELEMENT = "meta";

    private final String HREF_ATTRIB = "href";
    private final String SRC_ATTRIB = "src";

    private CrawlConfig config = null;

    // Constructor
    public HtmlContentParser(CrawlConfig config) {
        this.config = config;
    }

    @Override
    // This is default content parser if no other content parser handles the parsing
    public boolean canParseContent(String mimeType) {
        return true;
    }

    // Parses the web page using Validator.nu parser
    private Document parseHtml(Page page, HtmlContent c) throws SAXException, IOException {
        String html;

        // Handle char type conversions based on the byte stream, not based on what the server says
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(page.getContentData(), 0, page.getContentData().length);
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();

        if (encoding != null) {
            page.setContentCharset(encoding);
            try {
                html = new String(page.getContentData(), encoding);
            } catch (Exception ex) {
                html = new String(page.getContentData());
            }
        } else {
            html = new String(page.getContentData());
        }

        html = html.trim();
        c.setHtml(html);

        HtmlDocumentBuilder builder = new HtmlDocumentBuilder();
        builder.setCommentPolicy(XmlViolationPolicy.ALTER_INFOSET);
        builder.setContentNonXmlCharPolicy(XmlViolationPolicy.ALTER_INFOSET);
        builder.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
        builder.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
        builder.setStreamabilityViolationPolicy(XmlViolationPolicy.ALTER_INFOSET);
        builder.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
        builder.setMappingLangToXmlLang(true);
        builder.setHtml4ModeCompatibleWithXhtml1Schemata(true);
        builder.setHeuristics(Heuristics.ALL);
        builder.setCheckingNormalization(false);
        builder.setDoctypeExpectation(DoctypeExpectation.NO_DOCTYPE_ERRORS);
        builder.setIgnoringComments(true);
        builder.setScriptingEnabled(true);
        builder.setXmlPolicy(XmlViolationPolicy.ALTER_INFOSET);

        Document doc = builder.parse(IOUtils.toInputStream(html));
        if (doc.getElementsByTagName(BODY_ELEMENT).getLength() == 0) {
            throw new RuntimeException("Problem parsing document - invalid HTML, no body element found");
        }

        return doc;
    }

    // Returns <base> item if there is one
    private String getBaseUrl(Document doc) {
        String baseUrl = null;
        try {
            if (doc.getElementsByTagName(BASE_ELEMENT).getLength() > 0
                    && doc.getElementsByTagName(BASE_ELEMENT).item(0).hasAttributes()) {
                baseUrl = doc.getElementsByTagName(BASE_ELEMENT).item(0).getAttributes().getNamedItem(HREF_ATTRIB)
                        .getNodeValue().trim();
            }
        } catch (Exception ex) {
            baseUrl = null;
        }
        return baseUrl;
    }

    // Parses links of all element - attribute combos (e.g. <a> & "href") 
    private void getLinks(List<ExtractedUrlAnchorPair> outgoingUrls, Document doc, String elementName,
            String attribName, boolean getAnchorText) {
        if (doc.getElementsByTagName(elementName).getLength() > 0) {
            NodeList nl = doc.getElementsByTagName(elementName);
            for (int i = 0; i < nl.getLength(); i++) {
                Node n = nl.item(i);
                if (n.hasAttributes() && n.getAttributes().getNamedItem(attribName) != null) {
                    ExtractedUrlAnchorPair newUrl = new ExtractedUrlAnchorPair();
                    newUrl.setHref(n.getAttributes().getNamedItem(attribName).getNodeValue().trim());
                    if (getAnchorText) {
                        newUrl.setAnchor(n.getTextContent().trim());
                    } else {
                        newUrl.setAnchor("");
                    }

                    if (newUrl.getHref().trim().length() > 0) {
                        outgoingUrls.add(newUrl);
                    }
                }
            }
        }
    }

    // Parses various outgoing URLs 
    public List<ExtractedUrlAnchorPair> getOutgoingUrls(Document doc) {
        List<ExtractedUrlAnchorPair> outgoingUrls = new ArrayList<>();

        getLinks(outgoingUrls, doc, LINK_ELEMENT, HREF_ATTRIB, false);
        getLinks(outgoingUrls, doc, A_ELEMENT, HREF_ATTRIB, true);
        getLinks(outgoingUrls, doc, IFRAME_ELEMENT, SRC_ATTRIB, false);
        getLinks(outgoingUrls, doc, FRAME_ELEMENT, SRC_ATTRIB, false);
        getLinks(outgoingUrls, doc, EMBED_ELEMENT, SRC_ATTRIB, false);

        NodeList metaNodes = doc.getElementsByTagName(META_ELEMENT);
        if (metaNodes != null) {
            for (int i = 0; i < metaNodes.getLength(); i++) {
                Node n = metaNodes.item(i);
                if (n.hasAttributes()) {
                    String equiv = n.getAttributes().getNamedItem("http-equiv") != null
                            ? n.getAttributes().getNamedItem("http-equiv").getNodeValue().trim()
                            : null;
                    String content = n.getAttributes().getNamedItem("content") != null
                            ? n.getAttributes().getNamedItem("content").getNodeValue().trim()
                            : null;
                    if (equiv != null && content != null) {
                        equiv = equiv.toLowerCase();

                        // http-equiv="refresh" content="0;URL=http://foo.bar/..."
                        if (equiv.equalsIgnoreCase("refresh")) {
                            String metaRefresh = "";
                            int pos = content.toLowerCase().indexOf("url=");
                            if (pos != -1) {
                                metaRefresh = content.substring(pos + 4);
                            }

                            if (metaRefresh.length() > 0) {
                                ExtractedUrlAnchorPair newUrl = new ExtractedUrlAnchorPair();
                                newUrl.setHref(metaRefresh);
                                newUrl.setAnchor("");
                                outgoingUrls.add(newUrl);
                            }
                        }

                        // http-equiv="location" content="http://foo.bar/..."
                        if (equiv.equalsIgnoreCase("location")) {
                            if (content.length() > 0) {
                                ExtractedUrlAnchorPair newUrl = new ExtractedUrlAnchorPair();
                                newUrl.setHref(content);
                                outgoingUrls.add(newUrl);
                            }
                        }
                    }
                }
            }
        }

        return outgoingUrls;
    }

    @Override
    // Runs the whole parsing, extracts links from the page 
    public Content parseContent(Page page, String contextUrl) {
        HtmlContent c = new HtmlContent();
        try {
            Document doc = parseHtml(page, c);

            c.setText(doc.getElementsByTagName(BODY_ELEMENT).item(0).getTextContent().trim());
            if (doc.getElementsByTagName(TITLE_ELEMENT).getLength() > 0) {
                c.setTitle(doc.getElementsByTagName(TITLE_ELEMENT).item(0).getTextContent().trim());
            }

            String baseUrl = getBaseUrl(doc);
            if (baseUrl != null) {
                contextUrl = baseUrl;
            }

            List<ExtractedUrlAnchorPair> extractedUrls = getOutgoingUrls(doc);
            List<WebURL> outgoingUrls = new ArrayList<>();
            int urlCount = 0;
            for (ExtractedUrlAnchorPair urlAnchorPair : extractedUrls) {
                String href = urlAnchorPair.getHref();
                href = href.trim();
                if (href.length() == 0) {
                    continue;
                }

                String hrefWithoutProtocol = href.toLowerCase();
                if (href.startsWith("http://")) {
                    hrefWithoutProtocol = href.substring(7);
                }

                if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("mailto:")
                        && !hrefWithoutProtocol.contains("@")) {
                    String url = URLCanonicalizer.getCanonicalURL(href, contextUrl);
                    if (url != null) {
                        WebURL webURL = new WebURL();
                        webURL.setURL(url);
                        webURL.setAnchor(urlAnchorPair.getAnchor());
                        outgoingUrls.add(webURL);
                        urlCount++;
                        if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                            break;
                        }
                    }
                }
            }

            c.setOutgoingUrls(outgoingUrls);
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        return c;
    }
}