de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter.java, a segmented WARC file writer from the dkpro-c4corpus project.

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Copyright (c) 2014 Martin Kleppmann
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io;

import de.tudarmstadt.ukp.dkpro.c4corpus.warc.io.WARCRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;

/**
 * Writes {@link WARCRecord}s to a WARC file, using Hadoop's filesystem APIs. (This means you
 * can write to HDFS, S3 or any other filesystem supported by Hadoop).
 * <br>
 * WARCFileWriter keeps track of how much data it has written (optionally gzip-compressed);
 * when the file becomes larger than some threshold, it is automatically closed and a
 * new segment is started. A segment number is appended to the filename for that purpose.
 * The segment number always starts at 00000, and by default a new segment is started when
 * the file size exceeds 1 GB. To change the target size for a segment, you can set the
 * {@code warc.output.segment.size} key in the Hadoop configuration to the number of bytes.
 * (Files may actually be a bit larger than this threshold, since we finish writing the
 * current record before opening a new file.)
 * <br>
 * Based on https://github.com/ept/warc-hadoop
 * <br>
 * Note: originally published under the MIT license, which is compatible with the Apache License; see
 * https://www.gnu.org/philosophy/license-list.html
 *
 * @author Martin Kleppmann
 * @author Ivan Habernal
 */
public class WARCFileWriter {
    private static final Logger logger = LoggerFactory.getLogger(WARCFileWriter.class);
    public static final long DEFAULT_MAX_SEGMENT_SIZE = 1000000000L; // 1 GB

    private final Configuration conf;
    private final CompressionCodec codec;
    private final Path workOutputPath;
    private final Progressable progress;
    private final String extensionFormat;
    private final long maxSegmentSize;
    private long segmentsCreated = 0;
    private long bytesWritten = 0;
    private DataOutputStream dataStream;

    /**
     * Creates a WARC file, and opens it for writing. If a file with the same name already
     * exists, the segment number in the filename is incremented until we find a name that
     * doesn't already exist.
     *
     * @param conf           The Hadoop configuration.
     * @param codec          If null, the file is uncompressed. If non-null, this compression codec
     *                       will be used. The codec's default file extension is appended to the filename.
     * @param workOutputPath The directory and filename prefix to which the data should be
     *                       written. We append a segment number and filename extensions to it.
     * @throws IOException I/O exception
     */
    public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath) throws IOException {
        this(conf, codec, workOutputPath, null);
    }

    /**
     * Creates a WARC file, and opens it for writing. If a file with the same name already
     * exists, the segment number in the filename is incremented until we find a name that
     * doesn't already exist, just as in the other constructor.
     *
     * @param conf           The Hadoop configuration.
     * @param codec          If null, the file is uncompressed. If non-null, this compression codec
     *                       will be used. The codec's default file extension is appended to the filename.
     * @param workOutputPath The directory and filename prefix to which the data should be
     *                       written. We append a segment number and filename extensions to it.
     * @param progress       An object used by the mapred API for tracking a task's progress.
     * @throws IOException I/O exception
     */
    public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress)
            throws IOException {
        this.conf = conf;
        this.codec = codec;
        this.workOutputPath = workOutputPath;
        this.progress = progress;
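        // Yields segment names like <prefix>.seg-00000.warc, or .seg-00000.warc.gz when a codec is set.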
        this.extensionFormat = ".seg-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension());
        this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
        createSegment();
    }

    /**
     * Instantiates a Hadoop codec for compressing and decompressing Gzip files. This is the
     * most common compression applied to WARC files.
     *
     * @param conf The Hadoop configuration.
     * @return codec instance
     */
    public static CompressionCodec getGzipCodec(Configuration conf) {
        try {
            return ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec")
                    .asSubclass(CompressionCodec.class), conf);
        } catch (ClassNotFoundException e) {
            logger.warn("GzipCodec could not be instantiated", e);
            return null;
        }
    }

    /**
     * Creates an output segment file and sets up the output streams to point at it.
     * If the file already exists, retries with a different filename. This is a bit nasty --
     * after all, {@link FileOutputFormat}'s work directory concept is supposed to prevent
     * filename clashes -- but it looks like Amazon Elastic MapReduce prevents use of per-task
     * work directories if the output of a job is on S3.
     */
    private void createSegment() throws IOException {
        bytesWritten = 0;

        Path path = workOutputPath.suffix(String.format(extensionFormat, segmentsCreated));
        FileSystem fs = path.getFileSystem(conf);

        // find a non-existing output path by increasing segment counter
        // see https://github.com/dkpro/dkpro-c4corpus/issues/13
        while (fs.exists(path)) {
            logger.warn("Output path " + path + " already exists; increasing segment counter");
            segmentsCreated++;
            path = workOutputPath.suffix(String.format(extensionFormat, segmentsCreated));
            fs = path.getFileSystem(conf);
        }

        FSDataOutputStream fsStream = (progress == null) ? fs.create(path, false) : fs.create(path, progress);
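        // Stream chain (outermost first): DataOutputStream -> optional codec -> CountingOutputStream
        // -> BufferedOutputStream -> filesystem stream. The counter sits beneath the codec, so
        // bytesWritten tracks the on-disk (compressed) size.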
        CountingOutputStream byteStream = new CountingOutputStream(new BufferedOutputStream(fsStream));
        dataStream = new DataOutputStream(codec == null ? byteStream : codec.createOutputStream(byteStream));
        segmentsCreated++;

        logger.info("Writing to output file: {}", path);
    }

    /**
     * Appends a {@link WARCRecord} to the file, in WARC/1.0 format.
     *
     * @param record The record to be written.
     * @throws IOException I/O exception
     */
    public void write(WARCRecord record) throws IOException {
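        // The size check happens before the write, so a segment may exceed the threshold
        // by at most one record (as noted in the class Javadoc).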
        if (bytesWritten > maxSegmentSize) {
            dataStream.close();
            createSegment();
        }
        record.write(dataStream);
    }

    /**
     * Appends a {@link WARCRecord} wrapped in a {@link WARCWritable} to the file.
     *
     * @param record The wrapper around the record to be written.
     * @throws IOException I/O exception
     */
    public void write(WARCWritable record) throws IOException {
        if (record.getRecord() != null) {
            write(record.getRecord());
        }
    }

    /**
     * Flushes any buffered data and closes the file.
     *
     * @throws IOException I/O exception
     */
    public void close() throws IOException {
        dataStream.close();
    }

    private class CountingOutputStream extends FilterOutputStream {
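        // Increments the enclosing writer's bytesWritten field on every write, so the
        // writer knows when to roll over to a new segment.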
        public CountingOutputStream(OutputStream out) {
            super(out);
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            out.write(b, off, len);
            bytesWritten += len;
        }

        @Override
        public void write(int b) throws IOException {
            out.write(b);
            bytesWritten++;
        }

        // Overriding close() because FilterOutputStream's close() method pre-JDK8 has bad behavior:
        // it silently ignores any exception thrown by flush(). Instead, just close the delegate stream.
        // It should flush itself if necessary. (Thanks to the Guava project for noticing this.)
        @Override
        public void close() throws IOException {
            out.close();
        }
    }
}
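
Example usage

In a MapReduce job this writer is normally created by an OutputFormat, but it can also be driven directly. The sketch below is a minimal illustration, not part of the library: the class name WARCWriteExample, the output prefix, and the 100 MB segment size are made up for the example, and the records are assumed to come from elsewhere (e.g. a WARC input format).

import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter;
import de.tudarmstadt.ukp.dkpro.c4corpus.warc.io.WARCRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.util.List;

public class WARCWriteExample {

    // Writes all records as gzip-compressed, segmented WARC files named
    // <outputPrefix>.seg-00000.warc.gz, <outputPrefix>.seg-00001.warc.gz, ...
    static void writeAll(List<WARCRecord> records, String outputPrefix) throws IOException {
        Configuration conf = new Configuration();
        // Roll over to a new segment after ~100 MB instead of the 1 GB default.
        conf.setLong("warc.output.segment.size", 100L * 1000 * 1000);

        WARCFileWriter writer = new WARCFileWriter(conf, WARCFileWriter.getGzipCodec(conf),
                new Path(outputPrefix));
        try {
            for (WARCRecord record : records) {
                writer.write(record);
            }
        } finally {
            // Flushes buffered data and closes the current segment.
            writer.close();
        }
    }
}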