Source code for net.sf.nutchcontentexporter.NutchToWARCConverter.java

Java tutorial

Introduction

Here is the source code for net.sf.nutchcontentexporter.NutchToWARCConverter.java

Source

/*
 * Copyright 2015 Ivan Habernal
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.sf.nutchcontentexporter;

import net.sf.nutchcontentexporter.filter.ContentTypeFilter;
import net.sf.nutchcontentexporter.filter.ExportContentFilter;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.protocol.Content;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPoolSettingsData;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;

import java.io.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Converts a content from Nutch stored in a segment folder into a compressed WARC file
 *
 * @author Ivan Habernal
 */
public class NutchToWARCConverter extends Configured implements Tool {
    /**
     * Fallback WARC-Date value used when a crawled document carries no parsable
     * "Date" header. "1000" is the epoch-millisecond timestamp for
     * Thu, 01 Jan 1970 00:00:01 GMT.
     */
    private static final String DEFAULT_WARC_DATE = "1000";

    /** Rotate to a new output WARC file once roughly 1 GB (1e9 bytes) of content has been written. */
    private static final long FILE_LIMIT_1GB = (long) 10e8;

    /**
     * Nutch metadata keys that are copied into each WARC record as extra headers
     * (prefixed with "Nutch_"). Built once instead of per record.
     */
    private static final Set<String> EXTRA_HEADERS = new HashSet<String>(Arrays.asList(
            "nutch.crawl.score", "nutch.segment.name", "Set-Cookie", "Content-Type",
            "Server", "Pragma", "Cache-Control"));

    // counter: bytes of record content written to the current output file; reset on rotation
    private long totalBytesWritten = 0L;

    // counter: number of output files already completed; used as the file-name suffix index
    private int totalFilesWritten = 0;

    // counter: total WARC records written across all output files
    private long entriesCounter = 0L;

    /** Generates unique WARC-Record-ID values for each record and the warcinfo header. */
    protected RecordIDGenerator generator = new UUIDGenerator();

    /**
     * Filters for deciding whether a particular crawled document should be exported to the
     * final WARC file
     */
    private final Set<ExportContentFilter> filters = new HashSet<ExportContentFilter>();

    /**
     * Add the filters to the filter set
     *
     * @param filters filters
     */
    public void addFilters(ExportContentFilter... filters) {
        this.filters.addAll(Arrays.asList(filters));
    }

    /**
     * Converts content from Nutch stored in a segment folder into one or more compressed
     * WARC files, rotating to a new file whenever {@link #FILE_LIMIT_1GB} is exceeded.
     * <p>
     * The reader and the current writer are always closed, even when conversion fails
     * part-way through (previously both leaked on any exception).
     *
     * @param segmentFile   Nutch segment data file (Hadoop sequence file)
     * @param outDir        output directory for the WARC file(s)
     * @param fileName      base name of the output file(s)
     * @param fileExtension extension including the leading dot, e.g. ".warc.bz2"
     * @param conf          hadoop configuration
     * @param compressBz2   true for bz2 compression, false for the default gz
     * @throws IOException    on any read/write failure
     * @throws ParseException declared for overriders of {@link #write}; not thrown here
     */
    public void nutchSegmentToWARCFile(Path segmentFile, File outDir, String fileName, String fileExtension,
            Configuration conf, boolean compressBz2) throws IOException, ParseException {
        // reader for hadoop sequence file
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(segmentFile));

        WARCWriter writer = null;
        try {
            writer = prepareOutputWarcFile(outDir, fileName, this.totalFilesWritten, fileExtension,
                    compressBz2);

            Text key = new Text();
            Content content = new Content();

            while (reader.next(key, content)) {
                write(writer, content);

                // close file and create a new one if the size limit was reached
                if (this.totalBytesWritten > FILE_LIMIT_1GB) {
                    writer.close();
                    // null out so the finally block cannot double-close if the next open fails
                    writer = null;

                    // reset counters
                    this.totalFilesWritten++;
                    this.totalBytesWritten = 0;

                    // new output
                    writer = prepareOutputWarcFile(outDir, fileName, this.totalFilesWritten, fileExtension,
                            compressBz2);
                }
            }
        } finally {
            // close the writer first, but make sure the reader is closed even if that throws
            try {
                if (writer != null) {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

        System.out.println("Total WARC entries: " + entriesCounter);
    }

    /**
     * Creates a WARC writer for the next output file and writes its warcinfo record.
     * <p>
     * If the writer cannot be fully constructed, the underlying stream is closed instead
     * of being leaked. Both the gz and bz2 paths are now buffered.
     *
     * @param outputDir         output directory
     * @param fileName          base file name
     * @param totalFilesWritten index appended to the file name ("_%02d")
     * @param fileExtension     extension including the leading dot
     * @param compressBz2       true for external bz2 compression, false for built-in gz
     * @return an open {@link WARCWriter} positioned after its warcinfo record
     * @throws IOException if the file cannot be created or the warcinfo record written
     */
    public WARCWriter prepareOutputWarcFile(File outputDir, String fileName, int totalFilesWritten,
            String fileExtension, boolean compressBz2) throws IOException {
        File warc = new File(outputDir,
                fileName + String.format(Locale.ENGLISH, "_%02d%s", totalFilesWritten, fileExtension));

        System.out.println("Writing to " + warc);

        OutputStream outputStream = null;
        boolean success = false;
        try {
            if (compressBz2) {
                // we don't compress using the built-in GZ support, use bz2 instead
                outputStream = new BZip2CompressorOutputStream(
                        new BufferedOutputStream(new FileOutputStream(warc)));
            } else {
                // default compression (gz) is applied inside WARCWriter; just buffer the file
                outputStream = new BufferedOutputStream(new FileOutputStream(warc));
            }

            WARCWriter writer = new WARCWriter(new AtomicInteger(), outputStream, warc,
                    new WARCWriterPoolSettingsData("", "", -1, !compressBz2, null, null, generator));

            // warc info record
            writer.writeWarcinfoRecord(warc.getName(), "Made by " + this.getClass().getName() + "/" + getRevision());

            success = true;
            return writer;
        } finally {
            // avoid leaking the stream when writer construction or the warcinfo write fails
            if (!success && outputStream != null) {
                outputStream.close();
            }
        }
    }

    /** @return the revision string embedded in the warcinfo record */
    private static String getRevision() {
        return "1";
    }

    /**
     * Converts one Nutch {@link Content} entry into a WARC response record and writes it,
     * unless it is empty or rejected by one of the registered filters.
     *
     * @param writer  open WARC writer
     * @param content crawled document from the Nutch segment
     * @throws IOException    on write failure
     * @throws ParseException declared for overriders; date parse failures are handled internally
     */
    protected void write(final WARCWriter writer, final Content content) throws IOException, ParseException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setUrl(content.getUrl());

        byte[] byteContent = content.getContent();

        // skip empty records
        if (byteContent.length == 0) {
            return;
        }

        recordInfo.setContentStream(new ByteArrayInputStream(byteContent));
        recordInfo.setContentLength(byteContent.length);
        recordInfo.setEnforceLength(true);

        String warcDateString = DEFAULT_WARC_DATE;

        // convert the HTTP "Date" header to epoch milliseconds; fall back to the default on failure
        String date = content.getMetadata().get("Date");
        if (date != null) {
            try {
                warcDateString = String
                        .valueOf(new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ZZZ", Locale.ENGLISH).parse(date)
                                .getTime());
            } catch (ParseException ignored) {
                // unparsable date: keep DEFAULT_WARC_DATE rather than failing the record
            }
        }

        recordInfo.setCreate14DigitDate(warcDateString);

        recordInfo.setType(WARCConstants.WARCRecordType.response);
        recordInfo.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
        recordInfo.setRecordId(generator.getRecordID());

        // add some extra headers from nutch, prefixed to avoid clashing with standard WARC headers
        for (String extraHeader : EXTRA_HEADERS) {
            String value = content.getMetadata().get(extraHeader);
            if (value != null) {
                recordInfo.addExtraHeader("Nutch_" + extraHeader, value);
            }
        }

        // apply filters; every filter is consulted (no short-circuit) so filters may collect stats
        boolean acceptExport = true;
        for (ExportContentFilter filter : filters) {
            acceptExport &= filter.acceptContent(recordInfo);
        }

        // and write only if we accept this content
        if (acceptExport) {
            writer.writeRecord(recordInfo);

            totalBytesWritten += byteContent.length;
            entriesCounter++;
        }
    }

    /**
     * Input: Nutch segment folder (e.g. "20150303005802")
     * Output: gz/bz2 WARC file (e.g. "20150303005802.warc.gz/bz2")
     * Third parameter is an output file prefix (e.g. "prefix20150303005802.warc.gz")
     * <p/>
     * By default, the output is compressed with gz
     *
     * @param args args
     * @return 0 on success, 1 on missing arguments
     * @throws Exception on conversion failure (wrapped in RuntimeException)
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        // fail fast with a usage message instead of an ArrayIndexOutOfBoundsException
        if (otherArgs.length < 2) {
            System.err.println("Usage: <nutch segment dir> <output dir> [output file prefix] [bz2]");
            return 1;
        }

        try {
            FileSystem fs = FileSystem.get(conf);

            String segmentDir = otherArgs[0];

            File outDir = new File(otherArgs[1]);
            if (!outDir.exists()) {
                if (outDir.mkdirs()) {
                    System.out.println("Creating output dir " + outDir.getAbsolutePath());
                }
            }

            String outputFilePrefix = "";
            if (otherArgs.length >= 3) {
                outputFilePrefix = otherArgs[2];
            }

            boolean compressBz2 = false;
            // do we want bz2 output?
            if (otherArgs.length >= 4) {
                compressBz2 = "bz2".equals(otherArgs[3]);
            }

            Path file = new Path(segmentDir, Content.DIR_NAME + "/part-00000/data");

            String extension = ".warc." + (compressBz2 ? "bz2" : "gz");

            String segmentName = new File(segmentDir).getName();
            nutchSegmentToWARCFile(file, outDir, outputFilePrefix + segmentName, extension, conf, compressBz2);

            // NOTE(review): FileSystem.get() may return a process-wide cached instance;
            // closing it here matches the original behavior but would affect other users in-process
            fs.close();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        return 0;
    }

    /** Command-line entry point: registers the default content-type filter and runs the tool. */
    public static void main(String[] args) {
        try {
            NutchToWARCConverter nutchToWARCConverter = new NutchToWARCConverter();
            nutchToWARCConverter.addFilters(new ContentTypeFilter());
            ToolRunner.run(nutchToWARCConverter, args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}