it.unimi.di.big.mg4j.document.WarcDocumentSequence_NYU.java Source code

Java tutorial

Introduction

Here is the source code for it.unimi.di.big.mg4j.document.WarcDocumentSequence_NYU.java

Source

package it.unimi.di.big.mg4j.document;

/*       
 * MG4J: Managing Gigabytes for Java (big)
 * 
 * Copyright (C) 2015 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import com.martiansoftware.jsap.*;
import it.unimi.di.law.bubing.parser.HTMLParser;
import it.unimi.di.law.warc.io.UncompressedWarcReader_NYU;
import it.unimi.di.law.warc.io.WarcReader;
import it.unimi.di.law.warc.records.HttpResponseWarcRecord;
import it.unimi.di.law.warc.records.WarcHeader.Name;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.InspectableFileCachedInputStream;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.zip.GZIPInputStream;

/**
 * A {@linkplain DocumentSequence document sequence} over a set of
 * (possibly compressed) Warc files.
 * <p>
 * <p>The metadata provided by the sequence include the {@linkplain PropertyBasedDocumentFactory.MetadataKeys#ENCODING encoding},
 * the {@linkplain PropertyBasedDocumentFactory.MetadataKeys#URI URI},
 * the {@linkplain PropertyBasedDocumentFactory.MetadataKeys#MIMETYPE MIME type}.
 * <p>
 * <p>If a Warc header with name &ldquo;WARC-TREC-ID&rdquo; is present, it will be used as
 * {@linkplain PropertyBasedDocumentFactory.MetadataKeys#TITLE TITLE}.
 * <p>
 * <p>This class will also fetch and use the {@linkplain Name#BUBING_GUESSED_CHARSET BUbiNG guessed charset}, if present.
 * <p>
 * <p>As a commodity, this class provides a main method for the creation of a serialized version
 * of the document sequence.
 */

public class WarcDocumentSequence_NYU extends AbstractDocumentSequence implements Serializable {

    /**
     * Default buffer size, set up after some edu.nyu.tandon.experiments.
     */
    public static final String DEFAULT_BUFFER_SIZE = "64Ki";
    private static final long serialVersionUID = 0L;
    private static final Logger LOGGER = LoggerFactory.getLogger(WarcDocumentSequence_NYU.class);
    /**
     * The user specified factory.
     */
    protected final DocumentFactory factory;
    /**
     * The buffer size used for reads.
     */
    protected final int bufferSize;
    /**
     * Whether the Warcfile are gzipped.
     */
    protected final boolean useGzip;
    /**
     * The list of WARC files
     */
    protected final String[] warcFile;

    protected WarcDocumentSequence_NYU(final WarcDocumentSequence_NYU prototype) {
        this.factory = prototype.factory;
        this.warcFile = prototype.warcFile;
        this.useGzip = prototype.useGzip;
        this.bufferSize = prototype.bufferSize;
    }

    public WarcDocumentSequence_NYU(final String[] warcFile, final DocumentFactory factory, final boolean useGzip,
            final int bufferSize) {
        this.warcFile = warcFile;
        this.useGzip = useGzip;
        this.bufferSize = bufferSize;
        this.factory = factory;
    }

    public static void main(String[] args) throws Exception {

        SimpleJSAP jsap = new SimpleJSAP(WarcDocumentSequence_NYU.class.getName(),
                "Saves a serialised Warc document sequence based on a set of file names.",
                new Parameter[] {
                        new FlaggedOption("factory", JSAP.CLASS_PARSER, IdentityDocumentFactory.class.getName(),
                                JSAP.NOT_REQUIRED, 'f', "factory",
                                "A document factory with a standard constructor."),
                        new FlaggedOption("property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p',
                                "property", "A 'key=value' specification, or the name of a property file")
                                        .setAllowMultipleDeclarations(true),
                        new Switch("gzip", 'z', "gzip",
                                "Expect gzip-ed WARC content (files should end in .warc.gz)."),
                        new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, JSAP.NOT_REQUIRED,
                                'b', "buffer-size", "The size of an I/O buffer."),
                        new UnflaggedOption("sequence", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The filename for the serialized sequence."),
                        new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                                JSAP.GREEDY,
                                "A list of basename files that will be indexed. If missing, a list of files will be read from standard input.") });

        final JSAPResult jsapResult = jsap.parse(args);
        if (jsap.messagePrinted())
            System.exit(1);

        final DocumentFactory factory = PropertyBasedDocumentFactory.getInstance(jsapResult.getClass("factory"),
                jsapResult.getStringArray("property"));
        final boolean isGZipped = jsapResult.getBoolean("gzip");

        String[] file = jsapResult.getStringArray("basename");
        if (file.length == 0)
            file = IOUtils.readLines(System.in).toArray(new String[0]);
        if (file.length == 0)
            LOGGER.warn("Empty fileset");

        BinIO.storeObject(new WarcDocumentSequence_NYU(file, factory, isGZipped, jsapResult.getInt("bufferSize")),
                jsapResult.getString("sequence"));
    }

    public DocumentFactory factory() {
        return factory;
    }

    protected Document getCurrentDocument(WarcRecord record) throws IOException {

        HttpResponseWarcRecord httpResponse = (HttpResponseWarcRecord) record;
        String guessedCharset = "ISO-8859-1";

        final HttpEntity entity = httpResponse.getEntity();

        // Try to guess using headers
        final Header contentTypeHeader = entity.getContentType();
        if (contentTypeHeader != null) {
            final String headerCharset = HTMLParser.getCharsetNameFromHeader(contentTypeHeader.getValue());
            if (headerCharset != null)
                guessedCharset = headerCharset;
        }

        final InputStream contentStream = entity.getContent();

        /* Note that the bubing-guessed-charset header and the header guessed by inspecting
        the entity content are complementary. The first is supposed to appear when parsing
           a store, the second while crawling. They should be aligned. This is a bit tricky,
           but we want to avoid the dependency on "rewindable" streams while parsing. */

        final Header bubingGuessedCharsetHeader = httpResponse.getWarcHeader(Name.BUBING_GUESSED_CHARSET);
        if (bubingGuessedCharsetHeader != null)
            guessedCharset = bubingGuessedCharsetHeader.getValue();
        else {
            if (contentStream instanceof InspectableFileCachedInputStream) {
                final InspectableFileCachedInputStream inspectableStream = (InspectableFileCachedInputStream) contentStream;
                final String metaCharset = HTMLParser.getCharsetName(inspectableStream.buffer,
                        inspectableStream.inspectable);
                if (metaCharset != null)
                    guessedCharset = metaCharset;
            }
        }

        final Reference2ObjectMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap<Enum<?>, Object>();
        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, guessedCharset);

        final Header trecId = httpResponse.getWarcHeaders().getFirstHeader("WARC-TREC-ID");
        if (trecId != null)
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, trecId.getValue());

        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, httpResponse.getWarcTargetURI());
        if (contentTypeHeader != null)
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, contentTypeHeader.getValue());

        return factory.getDocument(entity.getContent(), metadata);
    }

    public DocumentIterator iterator() throws IOException {

        return new AbstractDocumentIterator() {
            private InputStream currentStream;
            private int n;
            private WarcReader reader;

            @SuppressWarnings("resource")
            public Document nextDocument() throws IOException {
                for (;;) {
                    if (currentStream == null) {
                        if (n == warcFile.length)
                            return null;
                        currentStream = useGzip
                                ? new GZIPInputStream(new FileInputStream(warcFile[n++]), bufferSize)
                                : new FastBufferedInputStream(new FileInputStream(warcFile[n++]), bufferSize);
                        reader = new UncompressedWarcReader_NYU(currentStream);
                    }

                    for (;;) {
                        WarcRecord record = null;
                        try {
                            record = reader.read();
                        } catch (Exception ignore) {
                            LOGGER.error("Unexpected exception reading WARC file", ignore);
                            continue;
                        }

                        if (record == null) {
                            currentStream.close();
                            currentStream = null;
                            break;
                        }

                        if (record.getWarcType() == WarcRecord.Type.RESPONSE)
                            return getCurrentDocument(record);
                    }
                }
            }
        };
    }
}