org.archive.jbs.Parse.java Source code

Introduction

Here is the source code for org.archive.jbs.Parse.java, a Hadoop map-only job that parses the records of (W)ARC files and emits a JSON document for each parsed record.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.jbs;

import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
//import org.apache.nutch.parse.Parse;  // Don't import due to name conflict.
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.archive.io.warc.WARCConstants;
import org.archive.jbs.arc.ArcReader;
import org.archive.jbs.arc.ArchiveRecordProxy;
import org.archive.jbs.util.FilenameInputFormat;
import org.archive.jbs.util.PerMapOutputFormat;

import de.l3s.boilerpipe.BoilerpipeProcessingException;

/**
 * Parse the contents of a (W)ARC file and output a JSON
 * document for each parsed record.
 */
public class Parse extends Configured implements Tool {

    public static final Log LOG = LogFactory.getLog(Parse.class);

    public static class ParseMapper extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
        private JobConf jobConf;
        private ParseUtil parseUtil;

        /**
         * <p>Configures the mapper: stores the job configuration and creates
         * the Nutch <code>ParseUtil</code> used to parse record bodies.</p>
         * 
         * @param job The job configuration.
         */
        public void configure(JobConf job) {
            this.jobConf = job;
            this.parseUtil = new ParseUtil(jobConf);
        }
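
        // Configuration properties read by this mapper (all optional; the
        // defaults shown are the ones passed to JobConf in the code below):
        //
        //   jbs.parse.content.limit           - size limit handed to the ArcReader (default -1, unlimited)
        //   jbs.parse.content.limit.html      - truncation threshold for HTML/XHTML bodies (default -1)
        //   jbs.parse.content.limit.text      - truncation threshold for text/plain bodies (default -1)
        //   jbs.parse.boilerpipe              - run Boilerpipe on HTML bodies (default true)
        //   jbs.parse.abortOnArchiveReadError - rethrow archive read errors as IOExceptions (default true)
        //   jbs.parse.emitParseErrorRecords   - emit an "error" JSON document when parsing fails (default true)
        //   jbs.parse.emitOutlinks            - include outlinks in the emitted document (default true)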

        /**
         * Read the records from the (w)arc file named in the
         * <code>key</code>, parse each record (if possible) and emit a
         * JSON Document for the parsed record body.
         */
        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            String path = key.toString();

            LOG.info("Start: " + path);

            FSDataInputStream fis = null;
            try {
                fis = FileSystem.get(new java.net.URI(path), this.jobConf).open(new Path(path));

                ArcReader reader = new ArcReader(path, fis);

                reader.setSizeLimit(jobConf.getInt("jbs.parse.content.limit", -1));

                for (ArchiveRecordProxy record : reader) {
                    // If this is an HTTP response record, do all the parsing and stuff.
                    if (WARCConstants.WARCRecordType.response.toString().equals(record.getWARCRecordType())) {
                        if (WARCConstants.HTTP_RESPONSE_MIMETYPE.equals(record.getWARCContentType())) {
                            LOG.info("Process response: " + record.getUrl() + " digest:" + record.getDigest()
                                    + " date: " + record.getDate());

                            parseRecord(record, output);
                        } else {
                            LOG.info("Skip response: " + record.getUrl() + " response-type:"
                                    + record.getWARCContentType() + " date: " + record.getDate());
                        }
                    } else if (WARCConstants.WARCRecordType.resource.toString()
                            .equals(record.getWARCRecordType())) {
                        // We only care about "ftp://" resource records.  It's possible that the ArchiveRecordProxy will
                        // pass us resource records other than ftp, so we filter out non-ftp ones here.
                        //
                        // We also only care about the "application/octet-stream" content-type, which indicates the record
                        // is the ftp file download and not a directory listing.
                        if (record.getUrl().startsWith("ftp://")
                                && "application/octet-stream".equals(record.getWARCContentType())) {
                            LOG.info("Process resource: " + record.getUrl() + " digest:" + record.getDigest()
                                    + " date: " + record.getDate());

                            parseRecord(record, output);
                        } else {
                            LOG.info("Skip resource: " + record.getUrl() + " response-type:"
                                    + record.getWARCContentType() + " date: " + record.getDate());
                        }
                    } else if (WARCConstants.WARCRecordType.revisit.toString().equals(record.getWARCRecordType())) {
                        // If this is a revisit record, just create a JSON
                        // Document with the relevant info.  No parsing or
                        // anything needed.
                        LOG.info("Process revisit: " + record.getUrl() + " digest:" + record.getDigest() + " date: "
                                + record.getDate());

                        Text docKey = new Text(record.getUrl() + " " + record.getDigest());

                        Document doc = new Document();
                        doc.set("url", record.getUrl());
                        doc.set("digest", record.getDigest());
                        doc.set("date", record.getDate());

                        output.collect(docKey, new Text(doc.toString()));
                    } else {
                        LOG.info("Skip record: " + record.getUrl() + " record-type:" + record.getWARCRecordType()
                                + " date: " + record.getDate());
                    }

                    reporter.progress();
                }
            } catch (Exception e) {
                LOG.error("Error processing archive file: " + path, e);

                if (jobConf.getBoolean("jbs.parse.abortOnArchiveReadError", true)) {
                    throw new IOException(e);
                }
            } finally {
                LOG.info("Finish: " + path);
            }
        }

        private transient ExecutorService timeoutExecutor = Executors.newSingleThreadExecutor();

        private <T> T runWithTimeout(long timeout, TimeUnit timeUnit, Callable<T> callable)
                throws InterruptedException, ExecutionException, TimeoutException {
            Future<T> future = timeoutExecutor.submit(callable);
            try {
                return future.get(timeout, timeUnit);
            } catch (TimeoutException e) {
                future.cancel(true);
                try {
                    Thread.sleep(2000); // give it 2 seconds to try to cancel
                } catch (InterruptedException f) {
                    // Preserve the interrupt status; the TimeoutException is rethrown below regardless.
                    Thread.currentThread().interrupt();
                }
                // We don't actually expect the cancel to work since the parsing
                // library doesn't respond to interrupt. If we try to reuse this
                // executor it will keep timing out waiting for the job ahead of
                // it to finish. Create a new executor instead.
                timeoutExecutor.shutdownNow();
                timeoutExecutor = Executors.newSingleThreadExecutor();
                throw e;
            }
        }

        /**
         * Parse a single (W)ARC record: build a Nutch Content object (which
         * triggers Tika's MIME detection), truncate oversized HTML and text
         * bodies, optionally run Boilerpipe on HTML, then hand the content to
         * write() for Nutch parsing and JSON output.
         */
        private void parseRecord(final ArchiveRecordProxy record, OutputCollector<Text, Text> output) throws IOException {
            String key = record.getUrl() + " " + record.getDigest();

            try {
                Metadata contentMetadata = new Metadata();
                contentMetadata.set("url", record.getUrl());
                contentMetadata.set("date", record.getDate());
                contentMetadata.set("digest", record.getDigest());
                contentMetadata.set("length", String.valueOf(record.getLength()));
                contentMetadata.set("code", record.getHttpStatusCode());

                // The Nutch Content object will invoke Tika's magic/mime-detection.
                Content content = new Content(record.getUrl(), record.getUrl(), record.getHttpResponseBody(), null,
                        contentMetadata, this.jobConf);

                // Retain the auto-detected Content-Type/MIME-Type.
                contentMetadata.set("type", content.getContentType());

                // Limit the size of either the HTML or text document to avoid blowing up the parsers.
                // Also boilerpipe the HTML.
                if ("text/html".equals(content.getContentType())
                        || "application/xhtml+xml".equals(content.getContentType())
                        || "application/xhtml".equals(content.getContentType())) {
                    int size = jobConf.getInt("jbs.parse.content.limit.html", -1);
                    if (size > 0 && size < record.getLength()) {
                        LOG.warn("HTML file size exceeds threshold [" + size + "]: " + record.getUrl() + " ["
                                + record.getLength() + "]");

                        content.setContent(Arrays.copyOf(record.getHttpResponseBody(), size));
                    }

                    try {
                        if (jobConf.getBoolean("jbs.parse.boilerpipe", true)) {
                            // BoilerPipe! 
                            final org.xml.sax.InputSource inputSource = new org.xml.sax.InputSource(
                                    new java.io.ByteArrayInputStream(record.getHttpResponseBody()));
                            String boiled = runWithTimeout(60, TimeUnit.SECONDS, new Callable<String>() {
                                // Unused field kept for debugging: it shows which URL the extractor is stuck on in a heap/thread dump.
                                private ArchiveRecordProxy rec = record;

                                @Override
                                public String call() throws BoilerpipeProcessingException {
                                    return de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE
                                            .getText(inputSource);
                                }

                            });
                            contentMetadata.set("boiled", boiled);
                        }
                    } catch (TimeoutException e) {
                        LOG.warn("Timeout boilerpiping " + record.getUrl());
                    } catch (Exception e) {
                        LOG.warn("Error boilerpiping: " + record.getUrl(), e);
                    }
                }

                if ("text/plain".equals(content.getContentType())) {
                    int size = jobConf.getInt("jbs.parse.content.limit.text", -1);
                    if (size > 0 && size < record.getLength()) {
                        LOG.warn("Text file size exceeds threshold [" + size + "]: " + record.getUrl() + " ["
                                + record.getLength() + "]");

                        content.setContent(Arrays.copyOf(record.getHttpResponseBody(), size));
                    }
                }

                write(output, new Text(key), content);
            } catch (Throwable t) {
                if (jobConf.getBoolean("jbs.parse.emitParseErrorRecords", true)) {
                    Document doc = new Document();
                    doc.set("status", "error");
                    doc.set("errorMessage", "Failed to parse record: " + t.getMessage());

                    output.collect(new Text(key), new Text(doc.toString()));
                }
            }
        }

        /**
         * Writes the key and related content to the output collector.
         */
        private void write(OutputCollector<Text, Text> output, Text key, Content content) throws IOException {
            ParseResult parseResult = null;
            try {
                parseResult = this.parseUtil.parse(content);
            } catch (Throwable t) {
                if (jobConf.getBoolean("jbs.parse.emitParseErrorRecords", true)) {
                    Document doc = new Document();
                    doc.set("status", "error");
                    doc.set("errorMessage", "Failed to parse record: " + t.getMessage());

                    output.collect(key, new Text(doc.toString()));
                }
            }

            try {
                if (parseResult != null) {
                    for (Map.Entry<Text, org.apache.nutch.parse.Parse> entry : parseResult) {
                        // Text url = entry.getKey();
                        org.apache.nutch.parse.Parse parse = entry.getValue();
                        ParseStatus parseStatus = parse.getData().getStatus();

                        if (!parseStatus.isSuccess()) {
                            LOG.warn("Error parsing: " + key + ": " + parseStatus);
                            parse = parseStatus.getEmptyParse(this.jobConf);
                        }

                        String parsedText = parse.getText();

                        Document doc = new Document();

                        ParseData pd = parse.getData();

                        // Copy metadata fields.
                        Metadata meta = pd.getContentMeta();
                        for (String name : meta.names()) {
                            doc.set(name, meta.get(name));
                        }

                        // Ensure that the title comes from the ParseData.
                        doc.set("title", pd.getTitle());

                        // Optionally skip the outlinks.
                        if (jobConf.getBoolean("jbs.parse.emitOutlinks", true)) {
                            for (Outlink outlink : pd.getOutlinks()) {
                                doc.addLink(outlink.getToUrl(), outlink.getAnchor());
                            }
                        }

                        doc.set("content", parsedText);

                        // Emit JSON string
                        output.collect(key, new Text(doc.toString()));
                    }
                }
            } catch (Throwable t) {
                if (jobConf.getBoolean("jbs.parse.emitParseErrorRecords", true)) {
                    Document doc = new Document();
                    doc.set("status", "error");
                    doc.set("errorMessage", "Failed to parse record: " + t.getMessage());

                    output.collect(key, new Text(doc.toString()));
                }
            }
        }

    }

    /**
     * Run the job.
     */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            usage();
            return 1;
        }

        FileSystem fs = FileSystem.get(getConf());

        // Create a job configuration
        JobConf job = new JobConf(getConf());

        // Job name uses output dir to help identify it to the operator.
        job.setJobName("jbs.Parse " + args[0]);

        // The inputs are a list of filenames, use the
        // FilenameInputFormat to pass them to the mappers.
        job.setInputFormat(FilenameInputFormat.class);

        // This is a map-only job, no reducers.
        job.setNumReduceTasks(0);

        // Use the Parse-specific output format.
        job.setOutputFormat(PerMapOutputFormat.class);

        // Use our ParseMapper, with output keys and values of type
        // Text.
        job.setMapperClass(ParseMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Configure the input and output paths, from the command-line.
        Path outputDir = new Path(args[0]);
        FileOutputFormat.setOutputPath(job, outputDir);

        boolean atLeastOneInput = false;
        for (int i = 1; i < args.length; i++) {
            FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());

            for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
                Path inputPath = status.getPath();
                Path outputPath = new Path(outputDir, inputPath.getName());
                if (fs.exists(outputPath)) {
                    LOG.debug("Output path already exists: " + outputPath);
                } else {
                    atLeastOneInput = true;
                    LOG.info("Add input path: " + inputPath);
                    FileInputFormat.addInputPath(job, inputPath);
                }
            }
        }

        if (!atLeastOneInput) {
            LOG.info("No input files to parse.");
            return 0;
        }

        // Run the job!
        RunningJob rj = JobClient.runJob(job);

        if (!rj.isSuccessful()) {
            LOG.error("FAILED: " + rj.getID());
            return 2;
        }

        return 0;
    }

    /**
     * Emit usage information for command-line driver.
     */
    public void usage() {
        String usage = "Usage: Parse <outputDir> <(w)arcfile>...\n";

        System.out.println(usage);
    }

    /**
     * Command-line driver.  Runs the Parse as a Hadoop job.
     */
    public static void main(String args[]) throws Exception {
        JobConf conf = new JobConf(Parse.class);

        // Load the default set of config properties, including the
        // essential properties needed by the bits of Nutch that we are
        // still using.  These properties can still be overridden by
        // command-line args.
        conf.addResource("conf-parse.xml");

        int result = ToolRunner.run(conf, new Parse(), args);

        System.exit(result);
    }

}
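
Usage

The job is normally launched from the command line as "Parse <outputDir> <(w)arcfile>..." (see the usage() method above). The sketch below shows the equivalent programmatic launch through ToolRunner, mirroring what main() does; the ParseLauncher class name and the input/output paths are hypothetical placeholders, and the conf-parse.xml resource is assumed to be on the classpath.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;

import org.archive.jbs.Parse;

public class ParseLauncher {
    public static void main(String[] args) throws Exception {
        // Same defaults that Parse.main() loads for the Nutch components still in use.
        JobConf conf = new JobConf(Parse.class);
        conf.addResource("conf-parse.xml");

        // First argument is the output directory; the remaining arguments are
        // (W)ARC files or globs.  The paths here are placeholders.
        String[] jobArgs = { "/search/parsed", "/crawl-data/*.warc.gz" };

        int result = ToolRunner.run(conf, new Parse(), jobArgs);
        System.exit(result);
    }
}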