// Java tutorial
package gr.iti.mklab.bubing.parser; /* * Copyright (C) 2004-2013 Paolo Boldi, Massimo Santini, and Sebastiano Vigna * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ import gr.iti.mklab.bubing.ItiAgent; import gr.iti.mklab.image.Utils; import gr.iti.mklab.image.VisualIndexer; import gr.iti.mklab.simmo.items.Image; import gr.iti.mklab.visual.utilities.ImageIOGreyScale; import it.unimi.di.law.bubing.Agent; import it.unimi.di.law.bubing.parser.BinaryParser; import it.unimi.di.law.bubing.parser.Parser; import it.unimi.di.law.bubing.util.BURL; import it.unimi.di.law.bubing.util.ByteArrayCharSequence; import it.unimi.di.law.bubing.util.Util; import it.unimi.di.law.warc.filters.URIResponse; import it.unimi.di.law.warc.records.WarcHeader; import it.unimi.di.law.warc.records.WarcRecord; import it.unimi.di.law.warc.util.StringHttpMessages; import it.unimi.dsi.fastutil.io.InspectableFileCachedInputStream; import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap; import it.unimi.dsi.lang.ObjectParser; import it.unimi.dsi.util.TextPattern; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintStream; import 
java.lang.reflect.InvocationTargetException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URLConnection; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.security.NoSuchAlgorithmException; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.htmlparser.jericho.CharacterReference; import net.htmlparser.jericho.EndTag; import net.htmlparser.jericho.EndTagType; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.HTMLElements; import net.htmlparser.jericho.Segment; import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTagType; import net.htmlparser.jericho.StreamedSource; import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHeaders; import org.apache.http.HttpResponse; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.HttpClients; import org.bson.types.ObjectId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Charsets; import com.google.common.hash.HashFunction; import com.google.common.hash.Hasher; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.Switch; import com.martiansoftware.jsap.UnflaggedOption; import javax.imageio.ImageIO; // RELEASE-STATUS: DIST /** * An HTML parser with additional responsibilities. 
 * An instance of this class does some buffering that makes it possible to
 * parse quickly a {@link HttpResponse}. Instances are heavyweight&#8212;they
 * should be pooled and shared, since their usage is transitory and CPU-intensive.
 */
public class ITIHTMLParser<T> implements Parser<T> {

	private static final Logger LOGGER = LoggerFactory.getLogger(ITIHTMLParser.class);

	static {
		/* As suggested by Martin Jericho. This should speed up things and avoid problems with
		 * server tags embedded in weird places (e.g., JavaScript string literals). Server tags
		 * should not appear in generated HTML anyway. */
		StartTagType.SERVER_COMMON.deregister();
		StartTagType.SERVER_COMMON_COMMENT.deregister();
		StartTagType.SERVER_COMMON_ESCAPED.deregister();
	}

	/**
	 * An implementation of a {@link Parser.LinkReceiver} that accumulates the URLs in a public set.
	 */
	public final static class SetLinkReceiver implements LinkReceiver {
		/** The set of URLs gathered so far. Insertion order is preserved. */
		public final Set<URI> urls = new ObjectLinkedOpenHashSet<URI>();

		@Override
		public void location(URI location) {
			urls.add(location);
		}

		@Override
		public void metaLocation(URI location) {
			urls.add(location);
		}

		@Override
		public void metaRefresh(URI refresh) {
			urls.add(refresh);
		}

		@Override
		public void link(URI link) {
			urls.add(link);
		}

		// Clears the accumulated set at the start of each new page.
		@Override
		public void init(URI responseUrl) {
			urls.clear();
		}

		@Override
		public Iterator<URI> iterator() {
			return urls.iterator();
		}

		@Override
		public int size() {
			return urls.size();
		}
	}

	/**
	 * A class computing the digest of a page.
	 * <p/>
	 * <p>The page is somewhat simplified before being passed (as a sequence of bytes obtained
	 * by breaking each character into the upper and lower byte) to a {@link Hasher}.
	 * All start/end tags are case-normalized, and their whole content (except for the
	 * element-type name) is removed.
	 * An exception is made for <samp>SRC</samp> attribute of
	 * <samp>FRAME</samp> and <samp>IFRAME</samp> elements, as they are necessary to
	 * distinguish correctly framed pages without alternative text. The attributes will be resolved
	 * w.r.t. the {@linkplain #init(URI) URL associated to the page}.
	 * Moreover, non-HTML tags are substituted with a special tag <samp>unknown</samp>.
	 * <p/>
	 * <p>For what concerns the text, all digits are substituted by a whitespace, and nonempty whitespace maximal sequences are coalesced
	 * to a single space. Tags are considered as a non-whitespace character.
	 * <p/>
	 * <p>To avoid clashes between digests coming from different sites, you can optionally set a URL
	 * (passed to the {@link #init(URI)} method) whose scheme+authority will be used to update the digest before adding the actual text page.
	 */
	public final static class DigestAppendable implements Appendable {
		private static final boolean DEBUG = false;
		// Debug-only sink mirroring every character fed to the hasher; only set when DEBUG is true.
		private PrintStream debugStream;
		// Temporary file backing debugStream when DEBUG is enabled.
		private File debugFile;

		/**
		 * Cached byte representations of all opening tags. The map must be queried using {@linkplain HTMLElementName Jericho names}.
		 */
		protected static final Reference2ObjectOpenHashMap<String, byte[]> startTags;

		/**
		 * Cached byte representations of all closing tags. The map must be queried using {@linkplain HTMLElementName Jericho names}.
		 */
		protected static final Reference2ObjectOpenHashMap<String, byte[]> endTags;

		static {
			final List<String> elementNames = HTMLElements.getElementNames();
			startTags = new Reference2ObjectOpenHashMap<String, byte[]>(elementNames.size());
			endTags = new Reference2ObjectOpenHashMap<String, byte[]>(elementNames.size());
			// Set up defaults for bizarre element types: unknown tags all hash identically.
			startTags.defaultReturnValue(Util.toByteArray("<unknown>"));
			endTags.defaultReturnValue(Util.toByteArray("</unknown>"));
			// Scan all known element types and fill startTags/endTags.
			for (String name : elementNames) {
				startTags.put(name, Util.toByteArray("<" + name + ">"));
				endTags.put(name, Util.toByteArray("</" + name + ">"));
			}
		}

		/**
		 * The message digest used to compute the digest.
		 */
		protected final HashFunction hashFunction;

		/**
		 * The hasher currently used to compute the digest.
		 */
		protected Hasher hasher;

		/**
		 * True iff the last character appended was a space.
		 */
		protected boolean lastAppendedWasSpace;

		/**
		 * The last returned digest, or {@code null} if {@link #init(URI)} has been called but {@link #digest()} hasn't.
		 */
		protected byte[] digest;

		/**
		 * Create a digest appendable using a given hash function.
		 *
		 * @param hashFunction the hash function used to digest.
		 */
		public DigestAppendable(final HashFunction hashFunction) {
			this.hashFunction = hashFunction;
			if (DEBUG) try {
				debugStream = new PrintStream(debugFile = File.createTempFile("tempfile", ".tmp"));
				System.err.println("Debug file: " + debugFile);
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
		}

		/**
		 * Initializes the digest computation.
		 *
		 * @param url a URL, or {@code null} for no URL. In the former case, the host name will be used to initialize the digest.
		 */
		public void init(final URI url) {
			hasher = hashFunction.newHasher();
			digest = null;
			if (url != null) {
				// Note that we need to go directly to the hasher to encode explicit IP addresses
				hasher.putUnencodedChars(url.getHost());
				hasher.putByte((byte) 0);
				if (DEBUG) debugStream.append(url.getHost());
			}
			lastAppendedWasSpace = false;
		}

		@Override
		public Appendable append(CharSequence csq, int start, int end) {
			// Hopefully this will soon be inlined by the jvm: no need to duplicate the code! :-)
			for (int i = start; i < end; i++) append(csq.charAt(i));
			return this;
		}

		// Whitespace and digits are coalesced into single spaces; everything else is hashed verbatim.
		@Override
		public Appendable append(char c) {
			if (Character.isWhitespace(c) || Character.isDigit(c)) {
				if (!lastAppendedWasSpace) {
					hasher.putChar(' ');
					if (DEBUG) debugStream.append(' ');
					lastAppendedWasSpace = true;
				}
			} else {
				hasher.putChar(c);
				if (DEBUG) debugStream.append(c);
				lastAppendedWasSpace = false;
			}
			return this;
		}

		@Override
		public Appendable append(CharSequence csq) {
			return append(csq, 0, csq.length());
		}

		// Feeds raw bytes (cached tag representations) straight to the hasher.
		private void append(byte[] a) {
			hasher.putBytes(a);
			if (DEBUG) for (byte b : a) debugStream.append((char) b);
		}

		// Returns (and caches) the digest; valid until the next init().
		public byte[] digest() {
			if (digest == null) digest = hasher.hash().asBytes();
			return digest;
		}

		public void startTag(final StartTag startTag) {
			final String name = startTag.getName();
			append(startTags.get(name));
			// IFRAME or FRAME + SRC (src is hashed so framed pages without text stay distinguishable)
			if (name == HTMLElementName.IFRAME || name == HTMLElementName.FRAME) {
				String s = startTag.getAttributeValue("src");
				if (s != null) {
					append('\"');
					append(s);
					append('\"');
				}
			}
			lastAppendedWasSpace = false;
		}

		public void endTag(final EndTag endTag) {
			append(endTags.get(endTag.getName()));
			lastAppendedWasSpace = false;
		}
	}

	/**
	 * The pattern prefixing the URL in a <samp>META</samp> <samp>HTTP-EQUIV</samp> element of refresh type.
	 */
	protected static final TextPattern URLEQUAL_PATTERN = new TextPattern("URL=", TextPattern.CASE_INSENSITIVE);

	/**
	 * The size of the internal Jericho buffer.
 */
	public static final int CHAR_BUFFER_SIZE = 128 * 1024;

	/**
	 * The character buffer. It is set up at construction time, but it can be changed later.
	 */
	protected final char[] buffer;

	/**
	 * The charset we guessed for the last response.
	 */
	protected String guessedCharset;

	/**
	 * An object embodying the digest logic, or {@code null} for no digest computation.
	 */
	protected final DigestAppendable digestAppendable;

	/**
	 * A text processor, or {@code null}.
	 */
	protected final TextProcessor<T> textProcessor;

	/**
	 * The location URL from headers of the last response, if any, or {@code null}.
	 */
	protected URI location;

	/**
	 * The location URL from <samp>META</samp> elements of the last response, if any, or {@code null}.
	 */
	protected URI metaLocation;

	/**
	 * If <code>true</code>, pages with the same content but with different authorities are considered duplicates.
	 */
	protected boolean crossAuthorityDuplicates;

	/**
	 * Builds a parser for link extraction and, possibly, digesting a page. By default, only pages from within the same
	 * scheme+authority may be considered to be duplicates.
	 *
	 * @param hashFunction the hash function used to digest, {@code null} if no digesting will be performed.
	 */
	public ITIHTMLParser(final HashFunction hashFunction) {
		this(hashFunction, false);
	}

	/**
	 * Builds a parser for link extraction and, possibly, digesting a page.
	 *
	 * @param hashFunction the hash function used to digest, {@code null} if no digesting will be performed.
	 * @param textProcessor a text processor, or {@code null} if no text processing is required.
	 * @param crossAuthorityDuplicates if <code>true</code>, pages with different scheme+authority but with the same content will be considered to be duplicates, as long
	 * as they are assigned to the same {@link Agent}.
	 * @param bufferSize the fixed size of the internal buffer; if zero, the buffer will be dynamic.
	 */
	public ITIHTMLParser(final HashFunction hashFunction, final TextProcessor<T> textProcessor, final boolean crossAuthorityDuplicates, final int bufferSize) {
		buffer = bufferSize != 0 ? new char[bufferSize] : null;
		digestAppendable = hashFunction == null ? null : new DigestAppendable(hashFunction);
		this.textProcessor = textProcessor;
		this.crossAuthorityDuplicates = crossAuthorityDuplicates;
	}

	/**
	 * Builds a parser with a fixed buffer of {@link #CHAR_BUFFER_SIZE} characters for link extraction and, possibly, digesting a page.
	 *
	 * @param hashFunction the hash function used to digest, {@code null} if no digesting will be performed.
	 * @param crossAuthorityDuplicates if <code>true</code>, pages with different scheme+authority but with the same content will be considered to be duplicates, as long
	 * as they are assigned to the same {@link Agent}.
	 */
	public ITIHTMLParser(final HashFunction hashFunction, final boolean crossAuthorityDuplicates) {
		this(hashFunction, null, crossAuthorityDuplicates, CHAR_BUFFER_SIZE);
	}

	/**
	 * Builds a parser with a fixed buffer of {@link #CHAR_BUFFER_SIZE} characters for link extraction and, possibly, digesting a page.
	 *
	 * @param hashFunction the hash function used to digest, {@code null} if no digesting will be performed.
	 * @param textProcessor a text processor, or {@code null} if no text processing is required.
	 * @param crossAuthorityDuplicates if <code>true</code>, pages with different scheme+authority but with the same content will be considered to be duplicates, as long
	 * as they are assigned to the same {@link Agent}.
	 */
	public ITIHTMLParser(final HashFunction hashFunction, final TextProcessor<T> textProcessor, final boolean crossAuthorityDuplicates) {
		this(hashFunction, textProcessor, crossAuthorityDuplicates, CHAR_BUFFER_SIZE);
	}

	/**
	 * Builds a parser with a fixed buffer of {@link #CHAR_BUFFER_SIZE} characters for link extraction and, possibly, digesting a page.
	 * (No cross-authority duplicates are considered.)
	 *
	 * @param messageDigest the name of a message-digest algorithm, or the empty string if no digest will be computed.
	 * @throws NoSuchAlgorithmException
	 */
	public ITIHTMLParser(final String messageDigest) throws NoSuchAlgorithmException {
		this(BinaryParser.forName(messageDigest));
	}

	/**
	 * Builds a parser with a fixed buffer of {@link #CHAR_BUFFER_SIZE} characters for link extraction and, possibly, digesting a page.
	 *
	 * @param messageDigest the name of a message-digest algorithm, or the empty string if no digest will be computed.
	 * @param crossAuthorityDuplicates a string whose value can only be "true" or "false" that is used to determine if you want to check for cross-authority duplicates.
	 * @throws NoSuchAlgorithmException
	 */
	public ITIHTMLParser(final String messageDigest, final String crossAuthorityDuplicates) throws NoSuchAlgorithmException {
		this(BinaryParser.forName(messageDigest), Util.parseBoolean(crossAuthorityDuplicates));
	}

	/**
	 * Builds a parser with a fixed buffer of {@link #CHAR_BUFFER_SIZE} characters for link extraction and, possibly, digesting a page.
	 *
	 * @param messageDigest the name of a message-digest algorithm, or the empty string if no digest will be computed.
	 * @param textProcessorSpec the specification of a text processor that will be passed to an {@link ObjectParser}.
	 * @param crossAuthorityDuplicates a string whose value can only be "true" or "false" that is used to determine if you want to check for cross-authority duplicates.
	 * @throws NoSuchAlgorithmException
	 */
	@SuppressWarnings("unchecked")
	public ITIHTMLParser(final String messageDigest, final String textProcessorSpec, final String crossAuthorityDuplicates) throws NoSuchAlgorithmException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException, IOException {
		this(BinaryParser.forName(messageDigest), (TextProcessor<T>) ObjectParser.fromSpec(textProcessorSpec), Util.parseBoolean(crossAuthorityDuplicates));
	}

	/**
	 * Builds a parser for link extraction that does not compute digests.
	 */
	public ITIHTMLParser() {
		this(null, null, false, 0);
	}

	/**
	 * Pre-process a string that represents a raw link found in the page, trying to derelativize it. If it succeeds, the
	 * resulting URL is passed to the link receiver.
	 *
	 * @param linkReceiver the link receiver that will receive the resulting URL.
	 * @param base the base URL to be used to derelativize the link.
	 * @param s the raw link to be derelativized.
	 * @param text the text (e.g., alternative text) associated with the link, if any.
	 * @param checkImage whether the link should be checked for being an image URL and, if so, routed to image indexing.
*/ protected void process(final LinkReceiver linkReceiver, final URI base, final String s, final String text, boolean checkImage) { if (s == null) return; URI url = BURL.parse(s); if (url == null) return; if (checkImage && Utils.isImageUrl(s)) { try { processImageURL(url, base, s, text); } catch (Exception ex) { } } else linkReceiver.link(base.resolve(url)); } public void processImageURL(URI pageUri, URI base, String imageUri, String altText) throws MalformedURLException, IOException { URI url = BURL.parse(imageUri); if (url != null) { URI resolved = base.resolve(url); String resolvedStr = resolved.toString(); //avoid trying to index the same image multiple times if (!ItiAgent.UNIQUE_IMAGE_URLS.mightContain(resolvedStr)) { // Put it in the bloom filter even if it is not saved eventually // to avoid doing the same checks for the same image a second time ItiAgent.UNIQUE_IMAGE_URLS.put(resolvedStr); final URLConnection con = resolved.toURL().openConnection(); if (Utils.checkContentHeaders(con.getContentLength(), con.getContentType())) { InputStream is = con.getInputStream(); BufferedImage image = null; try { image = ImageIO.read(is); } catch (IllegalArgumentException e) { // this exception is probably thrown because of a greyscale jpeg image System.out.println("Exception: " + e.getMessage() + " | Image: " + imageUri); image = ImageIOGreyScale.read(is); // retry with the modified class } catch (MalformedURLException e) { System.out.println("Malformed url exception. 
Url: " + imageUri); } if (Utils.checkImage(image)) { Image item = new Image(); item.setUrl(resolvedStr); item.setTitle(altText); item.setWidth(image.getWidth()); item.setHeight(image.getHeight()); item.setWebPageUrl(pageUri.toString()); item.setLastModifiedDate(new Date(con.getLastModified())); item.setObjectId(new ObjectId()); try { VisualIndexer.getInstance().indexAndStore(image, item); } catch (Exception e) { System.out.println("HTMLImageParser parse exeption: " + e); } } } } } } @Override public byte[] parse(final URI uri, final HttpResponse httpResponse, final LinkReceiver linkReceiver) throws IOException { guessedCharset = "ISO-8859-1"; final HttpEntity entity = httpResponse.getEntity(); // TODO: check if it will make sense to use getValue() of entity // Try to guess using headers final Header contentTypeHeader = entity.getContentType(); if (contentTypeHeader != null) { final String headerCharset = getCharsetNameFromHeader(contentTypeHeader.getValue()); if (headerCharset != null) guessedCharset = headerCharset; } final InputStream contentStream = entity.getContent(); /* Note that the bubing-guessed-charset header and the header guessed by inspecting the entity content are complementary. The first is supposed to appear when parsing a store, the second while crawling. They should be aligned. This is a bit tricky, but we want to avoid the dependency on "rewindable" streams while parsing. */ final Header bubingGuessedCharsetHeader = httpResponse instanceof WarcRecord ? 
((WarcRecord) httpResponse).getWarcHeader(WarcHeader.Name.BUBING_GUESSED_CHARSET) : null; if (bubingGuessedCharsetHeader != null) guessedCharset = bubingGuessedCharsetHeader.getValue(); else { if (contentStream instanceof InspectableFileCachedInputStream) { final InspectableFileCachedInputStream inspectableStream = (InspectableFileCachedInputStream) contentStream; final String metaCharset = getCharsetName(inspectableStream.buffer, inspectableStream.inspectable); if (metaCharset != null) guessedCharset = metaCharset; } } if (LOGGER.isDebugEnabled()) LOGGER.debug("Guessing charset \"{}\" for URL {}", guessedCharset, uri); Charset charset = Charsets.ISO_8859_1; // Fallback try { charset = Charset.forName(guessedCharset); } catch (IllegalCharsetNameException e) { if (LOGGER.isDebugEnabled()) LOGGER.debug("Response for {} contained an illegal charset name: \"{}\"", uri, guessedCharset); } catch (UnsupportedCharsetException e) { if (LOGGER.isDebugEnabled()) LOGGER.debug("Response for {} contained an unsupported charset: \"{}\"", uri, guessedCharset); } linkReceiver.init(uri); if (textProcessor != null) textProcessor.init(uri); // Get location if present location = null; metaLocation = null; final Header locationHeader = httpResponse.getFirstHeader(HttpHeaders.LOCATION); if (locationHeader != null) { final URI location = BURL.parse(locationHeader.getValue()); if (location != null) { // This shouldn't happen by standard, but people unfortunately does it. if (!location.isAbsolute() && LOGGER.isDebugEnabled()) LOGGER.debug("Found relative header location URL: \"{}\"", location); linkReceiver.location(this.location = uri.resolve(location)); } } @SuppressWarnings("resource") final StreamedSource streamedSource = new StreamedSource(new InputStreamReader(contentStream, charset)); if (buffer != null) streamedSource.setBuffer(buffer); if (digestAppendable != null) digestAppendable.init(crossAuthorityDuplicates ? 
null : uri); URI base = uri; int lastSegmentEnd = 0; int inSpecialText = 0; for (Segment segment : streamedSource) { if (segment.getEnd() > lastSegmentEnd) { lastSegmentEnd = segment.getEnd(); if (segment instanceof StartTag) { final StartTag startTag = (StartTag) segment; if (startTag.getTagType() != StartTagType.NORMAL) continue; final String name = startTag.getName(); if ((name == HTMLElementName.STYLE || name == HTMLElementName.SCRIPT) && !startTag.isSyntacticalEmptyElementTag()) inSpecialText++; if (digestAppendable != null) digestAppendable.startTag(startTag); // TODO: detect flow breakers if (linkReceiver == null) continue; // No link receiver, nothing to do. // IFRAME or FRAME + SRC if (name == HTMLElementName.IFRAME || name == HTMLElementName.FRAME || name == HTMLElementName.EMBED) process(linkReceiver, base, startTag.getAttributeValue("src"), startTag.getAttributeValue("name"), true); else if (name == HTMLElementName.IMG) { processImageURL(uri, base, startTag.getAttributeValue("src"), startTag.getAttributeValue("alt")); } else if (name == HTMLElementName.SCRIPT) process(linkReceiver, base, startTag.getAttributeValue("src"), null, false); else if (name == HTMLElementName.OBJECT) process(linkReceiver, base, startTag.getAttributeValue("data"), startTag.getAttributeValue("name"), true); else if (name == HTMLElementName.A || name == HTMLElementName.AREA || name == HTMLElementName.LINK) process(linkReceiver, base, startTag.getAttributeValue("href"), null, true); else if (name == HTMLElementName.BASE) { String s = startTag.getAttributeValue("href"); if (s != null) { final URI link = BURL.parse(s); if (link != null) { if (link.isAbsolute()) base = link; else if (LOGGER.isDebugEnabled()) LOGGER.debug("Found relative BASE URL: \"{}\"", link); } } } // META REFRESH/LOCATION else if (name == HTMLElementName.META) { final String equiv = startTag.getAttributeValue("http-equiv"); final String content = startTag.getAttributeValue("content"); if (equiv != null && content 
!= null) { equiv.toLowerCase(); // http-equiv="refresh" content="0;URL=http://foo.bar/..." if (equiv.equals("refresh")) { final int pos = URLEQUAL_PATTERN.search(content); if (pos != -1) { final String urlPattern = content.substring(pos + URLEQUAL_PATTERN.length()); final URI refresh = BURL.parse(urlPattern); if (refresh != null) { // This shouldn't happen by standard, but people unfortunately does it. if (!refresh.isAbsolute() && LOGGER.isDebugEnabled()) LOGGER.debug("Found relative META refresh URL: \"{}\"", urlPattern); linkReceiver.metaRefresh(base.resolve(refresh)); } } } // http-equiv="location" content="http://foo.bar/..." if (equiv.equals("location")) { final URI metaLocation = BURL.parse(content); if (metaLocation != null) { // This shouldn't happen by standard, but people unfortunately does it. if (!metaLocation.isAbsolute() && LOGGER.isDebugEnabled()) LOGGER.debug("Found relative META location URL: \"{}\"", content); linkReceiver.metaLocation(this.metaLocation = base.resolve(metaLocation)); } } } } } else if (segment instanceof EndTag) { final EndTag endTag = (EndTag) segment; final String name = endTag.getName(); if (name == HTMLElementName.STYLE || name == HTMLElementName.SCRIPT) { inSpecialText = Math.max(0, inSpecialText - 1); // Ignore extra closing tags } if (digestAppendable != null) { if (endTag.getTagType() != EndTagType.NORMAL) continue; digestAppendable.endTag(endTag); } } else if (inSpecialText == 0) { if (textProcessor != null) { if (segment instanceof CharacterReference) ((CharacterReference) segment).appendCharTo(textProcessor); else textProcessor.append(segment); } if (digestAppendable != null) { if (segment instanceof CharacterReference) ((CharacterReference) segment).appendCharTo(digestAppendable); else digestAppendable.append(segment); } } } } if (DigestAppendable.DEBUG) if (digestAppendable != null) { System.err.println("Closing " + digestAppendable.debugFile + " for " + uri); digestAppendable.debugStream.close(); } return 
digestAppendable != null ? digestAppendable.digest() : null; } @Override public String guessedCharset() { return guessedCharset; } /** * Returns the BURL location header, if present; if it is not present, but the page contains a valid metalocation, the latter * is returned. Otherwise, {@code null} is returned. * * @return the location (or metalocation), if present; {@code null} otherwise. */ public URI location() { //TODO: see if we must derelativize if (location != null) return location; else if (metaLocation != null) return metaLocation; else return null; } /** * Used by {@link #getCharsetName(byte[], int)}. */ protected static final TextPattern META_PATTERN = new TextPattern("<meta", TextPattern.CASE_INSENSITIVE); /** * Used by {@link #getCharsetName(byte[], int)}. */ protected static final Pattern HTTP_EQUIV_PATTERN = Pattern .compile(".*http-equiv\\s*=\\s*('|\")?content-type('|\")?.*", Pattern.CASE_INSENSITIVE); /** * Used by {@link #getCharsetName(byte[], int)}. */ protected static final Pattern CONTENT_PATTERN = Pattern.compile(".*content\\s*=\\s*('|\")([^'\"]*)('|\").*", Pattern.CASE_INSENSITIVE); /** * Used by {@link #getCharsetName(byte[], int)}. */ protected static final Pattern CHARSET_PATTERN = Pattern.compile( ".*charset\\s*=\\s*(([\\041-\\0176&&[^<>\\{\\}\\\\/:,;@?=]])+|\"[^\"]*\").*", Pattern.CASE_INSENSITIVE); /** * Returns the charset name as indicated by a <samp>META</samp> * <samp>HTTP-EQUIV</samp> element, if * present, interpreting the provided byte array as a sequence of * ISO-8859-1-encoded characters. Only the first such occurrence is considered (even if * it might not correspond to a valid or available charset). * <p/> * <p><strong>Beware</strong>: it might not work if the * <em>value</em> of some attribute in a <code>meta</code> tag * contains a string matching (case insensitively) the r.e. * <code>http-equiv\s*=\s*('|")content-type('|")</code>, or * <code>content\s*=\s*('|")[^"']*('|")</code>. 
* * @param buffer a buffer containing raw bytes that will be interpreted as ISO-8859-1 characters. * @param length the number of significant bytes in the buffer. * @return the charset name, or {@code null} if no * charset is specified; note that the charset might be not valid or not available. */ public static String getCharsetName(final byte buffer[], final int length) { int start = 0; while ((start = META_PATTERN.search(buffer, start, length)) != -1) { /* Look for attribute http-equiv with value content-type, * if present, look for attribute content and, if present, * return its value. */ int end = start; while (end < length && buffer[end] != '>') end++; // Look for closing '>' if (end == length) return null; // No closing '>' final ByteArrayCharSequence tagContent = new ByteArrayCharSequence(buffer, start + META_PATTERN.length(), end - start - META_PATTERN.length()); if (HTTP_EQUIV_PATTERN.matcher(tagContent).matches()) { final Matcher m = CONTENT_PATTERN.matcher(tagContent); if (m.matches()) return getCharsetNameFromHeader(m.group(2)); // got it! } start = end + 1; } return null; // no '<meta' found } /** * Extracts the charset name from the header value of a <samp>content-type</samp> * header using a regular expression. * <p/> * <strong>Warning</strong>: it might not work if someone puts the string <samp>charset=</samp> * in a string inside some attribute/value pair. * * @param headerValue The value of a <samp>content-type</samp> header. * @return the charset name, or {@code null} if no * charset is specified; note that the charset might be not valid or not available. */ public static String getCharsetNameFromHeader(final String headerValue) { final Matcher m = CHARSET_PATTERN.matcher(headerValue); if (m.matches()) { final String s = m.group(1); int start = 0, end = s.length(); // TODO: we discard delimiting single/double quotes; is it necessary? 
if (end > 0 && (s.charAt(0) == '\"' || s.charAt(0) == '\'')) start = 1; if (end > 0 && (s.charAt(end - 1) == '\"' || s.charAt(end - 1) == '\'')) end--; if (start < end) return s.substring(start, end); } return null; } @Override public boolean apply(final URIResponse uriResponse) { final Header contentType = uriResponse.response().getEntity().getContentType(); return contentType != null && contentType.getValue().startsWith("text/"); } @Override public ITIHTMLParser<T> clone() { return new ITIHTMLParser<T>(digestAppendable == null ? null : digestAppendable.hashFunction, textProcessor == null ? null : textProcessor.copy(), crossAuthorityDuplicates, buffer.length); } @Override public ITIHTMLParser<T> copy() { return clone(); } @Override public T result() { return textProcessor == null ? null : textProcessor.result(); } public static void main(String arg[]) throws IllegalArgumentException, IOException, URISyntaxException, JSAPException, NoSuchAlgorithmException { final SimpleJSAP jsap = new SimpleJSAP(ITIHTMLParser.class.getName(), "Produce the digest of a page: the page is downloaded or passed as argument by specifying a file", new Parameter[] { new UnflaggedOption("url", JSAP.STRING_PARSER, JSAP.REQUIRED, "The url of the page."), new Switch("crossAuthorityDuplicates", 'c', "cross-authority-duplicates"), new FlaggedOption("charBufferSize", JSAP.INTSIZE_PARSER, Integer.toString(CHAR_BUFFER_SIZE), JSAP.NOT_REQUIRED, 'b', "buffer", "The size of the parser character buffer (0 for dynamic sizing)."), new FlaggedOption("file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "file", "The page to be processed."), new FlaggedOption("digester", JSAP.STRING_PARSER, "MD5", JSAP.NOT_REQUIRED, 'd', "digester", "The digester to be used.") }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) System.exit(1); final String url = jsapResult.getString("url"); final String digester = jsapResult.getString("digester"); final boolean crossAuthorityDuplicates = 
jsapResult.userSpecified("crossAuthorityDuplicates"); final int charBufferSize = jsapResult.getInt("charBufferSize"); final ITIHTMLParser<Void> htmlParser = new ITIHTMLParser<Void>(BinaryParser.forName(digester), (TextProcessor<Void>) null, crossAuthorityDuplicates, charBufferSize); final SetLinkReceiver linkReceiver = new SetLinkReceiver(); final byte[] digest; if (!jsapResult.userSpecified("file")) { final URI uri = new URI(url); final HttpGet request = new HttpGet(uri); request.setConfig(RequestConfig.custom().setRedirectsEnabled(false).build()); digest = htmlParser.parse(uri, HttpClients.createDefault().execute(request), linkReceiver); } else { final String file = jsapResult.getString("file"); String content = IOUtils.toString(new InputStreamReader(new FileInputStream(file))); digest = htmlParser.parse(BURL.parse(url), new StringHttpMessages.HttpResponse(content), linkReceiver); } System.out.println("DigestHexString: " + Hex.encodeHexString(digest)); System.out.println("Links: " + linkReceiver.urls); Set<String> urlStrings = new ObjectOpenHashSet<String>(); for (URI link : linkReceiver.urls) urlStrings.add(link.toString()); if (urlStrings.size() != linkReceiver.urls.size()) System.out.println( "There are " + linkReceiver.urls.size() + " URIs but " + urlStrings.size() + " strings"); } }