uk.bl.wa.parsers.HtmlFeatureParser.java Source code

Introduction

Here is the source code for uk.bl.wa.parsers.HtmlFeatureParser.java
Source

package uk.bl.wa.parsers;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.regex.Pattern;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.ParseError;
import org.jsoup.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;

public class HtmlFeatureParser extends AbstractParser {

    /** */
    private static final long serialVersionUID = 1631417895901342814L;

    private static Log log = LogFactory.getLog(HtmlFeatureParser.class);

    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), MediaType.application("xhtml"))));

    // The parser to use, preferring the XML variation as it does not 'fix' the
    // mark-up.
    private Parser parser = Parser.xmlParser();
    // Max errors to returm:
    private int max_errors;
    private final boolean normaliseLinks;
    private boolean extractImageLinks = false;

    public static final String ORIGINAL_PUB_DATE = "OriginalPublicationDate";
    // Explicit property to get faster link handling as it allows for set with multiple values (same as LINKS?)
    public static final Property LINK_LIST = Property.internalTextBag("LinkList");
    public static final Property LINKS = Property.internalTextBag("LINK-LIST");
    public static final String FIRST_PARAGRAPH = "FirstParagraph";
    public static final Property IMAGE_LINKS = Property.internalTextBag("ImageLinks");
    public static final Property DISTINCT_ELEMENTS = Property.internalTextBag("DISTINCT-ELEMENTS");
    public static final Property NUM_PARSE_ERRORS = Property.internalInteger("Html-Parse-Error-Count");
    public static final int DEFAULT_MAX_PARSE_ERRORS = 1000;

    // Setting this to true also adds the field url_norm to the Solr document in WARCIndexer
    public static final String CONF_LINKS_NORMALISE = "warc.index.extract.linked.normalise";
    public static final String CONF_LINKS_EXTRACT_IMAGE_LINKS = "warc.index.extract.linked.images";
    public static final boolean DEFAULT_LINKS_NORMALISE = false;

    /**
     * 
     */
    public HtmlFeatureParser() {
        this(ConfigFactory.empty());
    }

    public HtmlFeatureParser(Config conf) {
        normaliseLinks = conf.hasPath(CONF_LINKS_NORMALISE) ? conf.getBoolean(CONF_LINKS_NORMALISE)
                : DEFAULT_LINKS_NORMALISE;
        this.setMaxParseErrors(DEFAULT_MAX_PARSE_ERRORS);

        extractImageLinks = conf.hasPath(CONF_LINKS_EXTRACT_IMAGE_LINKS)
                ? conf.getBoolean(CONF_LINKS_EXTRACT_IMAGE_LINKS)
                : false;

    }

    /**
     * 
     * @param max_errors
     */
    public void setMaxParseErrors(int max_errors) {
        this.max_errors = max_errors;
        parser.setTrackErrors(max_errors);
    }

    /**
     * 
     * @return
     */
    public int getMaxParseErrors() {
        return this.max_errors;
    }

    /**
     * 
     * @return
     */
    public List<ParseError> getParseErrors() {
        return this.parser.getErrors();
    }

    /**
     * 
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * 
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final long start = System.nanoTime();
        // Pick up the URL:
        String url = metadata.get(Metadata.RESOURCE_NAME_KEY);

        // Parse it using JSoup
        Document doc = null;
        try {
            doc = Jsoup.parse(stream, null, url, parser);
        } catch (java.nio.charset.IllegalCharsetNameException e) {
            log.warn("Jsoup parse had to assume UTF-8: " + e);
            doc = Jsoup.parse(stream, "UTF-8", url);
        } catch (Exception e) {
            log.error("Jsoup parse failed: " + e);
        } finally {
            if (doc == null)
                return;
        }
        Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#jsoupparse", start);

        final long nonJsoupStart = System.nanoTime();
        // Record the number of errors found:
        if (parser.getErrors() != null)
            metadata.set(NUM_PARSE_ERRORS, parser.getErrors().size());

        // Get the links (no image links):
        Set<String> links = this.extractLinks(doc);
        if (links != null && links.size() > 0) {
            metadata.set(LINK_LIST, links.toArray(new String[links.size()]));
        }

        //get the image links
        if (extractImageLinks) {
            Set<String> imageLinks = this.extractImageLinks(doc);
            if (imageLinks != null && imageLinks.size() > 0) {
                metadata.set(IMAGE_LINKS, imageLinks.toArray(new String[imageLinks.size()]));
            }
        }

        // Get the publication date, from BBC pages:
        for (Element meta : doc.select("meta[name=OriginalPublicationDate]")) {
            metadata.set(ORIGINAL_PUB_DATE, meta.attr("content"));
            //log.debug(ORIGINAL_PUB_DATE + ": " + meta.attr("content"));
        }

        // Grab the first paragraph with text, and extract the text:
        for (Element p : doc.select("p")) {
            String pt = p.text();
            if (pt != null) {
                pt = pt.trim();
                if (pt.length() > 0) {
                    metadata.set(FIRST_PARAGRAPH, p.text());
                    //log.debug(FIRST_PARAGRAPH + ": " +p.text() );
                    break;
                }
            }
        }

        // Grab the list of distinct elements used in the page:
        Set<String> de = new HashSet<String>();
        for (Element e : doc.select("*")) {
            // ELEMENT_NAME matching to weed out the worst false positives caused by JavaScript
            // This handles cases such as '<script> if (3<a) console.log('something');' where 'a' would have been
            // seen as a tag, but does not handle cases such as '<script> if ( 3<a ) console.log('something');' where
            // the a is still seen as a tag because it is followed by a space
            if (!"#root".equals(e.tag().getName()) && ELEMENT_NAME.matcher(e.tag().getName()).matches()) {
                de.add(StringUtils.left(e.tag().getName().toLowerCase(Locale.ENGLISH), 100));
            }
        }
        // For some elements, dig deeper and record attributes too:
        for (Element e : doc.select("link")) {
            de.add("link/@rel=" + e.attr("rel"));
        }
        // Store them:
        metadata.set(DISTINCT_ELEMENTS, de.toArray(new String[] {}));

        // Licence field, following:
        // http://www.whatwg.org/specs/web-apps/current-work/multipage/links.html#link-type-license
        for (Element a : doc.select("a[rel=license]")) {
            metadata.add(Metadata.LICENSE_URL, a.attr("href"));
        }
        for (Element a : doc.select("link[rel=license]")) {
            metadata.add(Metadata.LICENSE_URL, a.attr("href"));
        }
        for (Element a : doc.select("area[rel=license]")) {
            metadata.add(Metadata.LICENSE_URL, a.attr("href"));
        }
        Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#featureextract", nonJsoupStart);
    }

    // https://www.w3.org/TR/html/syntax.html#tag-name
    private final Pattern ELEMENT_NAME = Pattern.compile("[a-zA-Z0-9]+");

    /**
     * Use a tolerant parser to extract all of the absolute a href links from a document.
     * 
     * Does not extract other links, e.g. stylesheets, etc. etc. Image extracting in another field
     * 
     * @param input The InputStream
     * @param charset The character set, e.g. "UTF-8". Value of "null" attempts to extract encoding from the document and falls-back on UTF-8.
     * @param baseUri base URI for the page, for resolving relative links. e.g. "http://www.example.com/"
     * @return
     * @throws java.io.IOException
     */
    private Set<String> extractLinks(Document doc) throws IOException {
        Set<String> linkset = new HashSet<String>();

        // All a with href
        for (Element link : doc.select("a[href]")) {
            linkset.add(normaliseLink(link.attr("abs:href")));
        }

        return linkset;
    }

    private Set<String> extractImageLinks(Document doc) throws IOException {
        Set<String> linkset = new HashSet<String>();
        for (Element link : doc.select("img[src]")) {
            linkset.add(normaliseLink(link.attr("abs:src")));
        }
        return linkset;
        // Example of use: all PNG references...
        //Elements pngs = doc.select("img[src$=.png]");

        //Element masthead = doc.select("div.masthead").first();
    }

    /**
     * Normalises links if the parser has been configured to do so.
     * @param link a link from a HTML document.
     * @return the link in normalised form or the unchanged link if normalisation has not been enabled.
     */
    public String normaliseLink(String link) {
        return normaliseLinks ? Normalisation.canonicaliseURL(link) : link;
    }

    public static Metadata extractMetadata(InputStream in, String url) {
        HtmlFeatureParser hfp = new HtmlFeatureParser();
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, url);
        try {
            hfp.parse(in, null, metadata, null);
        } catch (Exception e) {
            log.error("Failed to extract Metadata.");
        }
        return metadata;
    }
}