de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone.WarcBoilerplateRemoval.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone.WarcBoilerplateRemoval.java
Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone;

import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.BoilerPlateRemoval;
import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl.JusTextBoilerplateRemoval;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.CharsetDetector;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.ICUCharsetDetectorWrapper;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileWriter;
import de.tudarmstadt.ukp.dkpro.c4corpus.warc.io.WARCRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

import java.io.*;
import java.nio.charset.Charset;
import java.util.Locale;

/**
 * Standalone tool that takes one warc.gz file as input, removes boilerplate from the content of
 * each record, and writes the results to an output warc.gz file.
 *
 * @author Ivan Habernal
 */
public class WarcBoilerplateRemoval {
    /** Blank line that separates the HTTP header from the payload inside a record's content. */
    private static final String HTTP_HEADER_END = "\r\n\r\n";

    /** Progress is reported to standard output every this many records. */
    private static final int PROGRESS_INTERVAL = 100;

    /**
     * Reads all WARC records from {@code input}, replaces the content of each record with its
     * boilerplate-free version and writes the records to {@code outFile}.
     * <p>
     * The writer produces a segment file named {@code <outFile>.seg-00000.warc.gz}; after all
     * records are written, this segment is renamed to {@code outFile} and the accompanying
     * {@code .crc} file (if any) is removed.
     *
     * @param input           warc.gz file to read
     * @param outFile         destination warc.gz file
     * @param keepMinimalHtml if {@code true}, records contain the minimal HTML produced by the
     *                        boilerplate remover; otherwise they contain plain text
     * @throws IOException if reading, writing, renaming or cleaning up fails
     */
    public static void processWarcGzFile(File input, File outFile, boolean keepMinimalHtml)
            throws IOException {
        System.out.printf("Reading from %s, writing to %s%n", input, outFile);

        Configuration conf = new Configuration();
        // Raise the segment size limit to 100x the default so that, in practice, everything
        // ends up in the single segment that is renamed below.
        conf.setLong("warc.output.segment.size", WARCFileWriter.DEFAULT_MAX_SEGMENT_SIZE * 100);

        CompressionCodec codec = WARCFileWriter.getGzipCodec(conf);
        long startTime = System.currentTimeMillis();
        long recordsWritten;

        DataInputStream dataStream = openInput(input, codec);
        try {
            Path outputPath = new Path(outFile.getAbsolutePath());
            WARCFileWriter warcFileWriter = new WARCFileWriter(conf, codec, outputPath);
            try {
                recordsWritten = copyRecords(dataStream, warcFileWriter, keepMinimalHtml,
                        startTime);
            } finally {
                // always release the writer, even if processing a record failed
                warcFileWriter.close();
            }
        } finally {
            // closing the outermost stream closes the whole input chain
            dataStream.close();
        }

        finalizeOutput(outFile);

        long elapsedMillis = System.currentTimeMillis() - startTime;
        System.out.printf(Locale.ENGLISH,
                "%d records written to %s, total time %.1f s (~%.1f entries per second)%n",
                recordsWritten, outFile.getName(), elapsedMillis / 1000.0,
                entriesPerSecond(recordsWritten, startTime));
    }

    /**
     * Opens {@code input} for reading, decompressing it with {@code codec} when one is given.
     * The underlying file stream is closed again if wrapping it fails.
     */
    private static DataInputStream openInput(File input, CompressionCodec codec)
            throws IOException {
        InputStream byteStream = new BufferedInputStream(new FileInputStream(input));
        boolean opened = false;
        try {
            DataInputStream result = new DataInputStream(
                    codec == null ? byteStream : codec.createInputStream(byteStream));
            opened = true;
            return result;
        } finally {
            if (!opened) {
                byteStream.close();
            }
        }
    }

    /**
     * Reads records from {@code dataStream} until the end of the stream, removes boilerplate
     * from each and writes the result to {@code warcFileWriter}.
     *
     * @return number of records written
     */
    private static long copyRecords(DataInputStream dataStream, WARCFileWriter warcFileWriter,
            boolean keepMinimalHtml, long startTime) throws IOException {
        BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();
        CharsetDetector charsetDetector = new ICUCharsetDetectorWrapper();

        long recordsWritten = 0;
        while (true) {
            WARCRecord wc;
            try {
                // Reads the next record from the file.
                wc = new WARCRecord(dataStream);
            } catch (EOFException e) {
                // End of input. Only the read is guarded, so that an EOFException raised while
                // processing or writing a record is not mistaken for a normal end of file.
                // NOTE(review): a record truncated mid-way presumably also ends up here.
                break;
            }

            // decode the raw content using the detected charset
            byte[] bytes = wc.getContent();
            Charset charset = charsetDetector.detectCharset(bytes);
            String html = stripHttpHeader(new String(bytes, charset));

            String plainText;
            if (keepMinimalHtml) {
                plainText = boilerPlateRemoval.getMinimalHtml(html, null);
            } else {
                plainText = boilerPlateRemoval.getPlainText(html, null);
            }

            // create copy of WarcRecord with the cleaned content
            WARCRecord newWarcRecord = new WARCRecord(wc);
            newWarcRecord.setContent(plainText);
            warcFileWriter.write(newWarcRecord);

            recordsWritten++;
            if (recordsWritten % PROGRESS_INTERVAL == 0) {
                System.out.printf(Locale.ENGLISH, "~%.1f entries per second%n",
                        entriesPerSecond(recordsWritten, startTime));
                System.out.printf(Locale.ENGLISH, "%d records processed%n", recordsWritten);
            }
        }
        return recordsWritten;
    }

    /**
     * Returns the part of {@code content} that follows the first blank line (the end of the HTTP
     * header). Content without such a separator is returned unchanged instead of being
     * truncated.
     */
    private static String stripHttpHeader(String content) {
        int headerEnd = content.indexOf(HTTP_HEADER_END);
        if (headerEnd < 0) {
            return content;
        }
        return content.substring(headerEnd + HTTP_HEADER_END.length());
    }

    /** Average number of records handled per second since {@code startTime} (in millis). */
    private static double entriesPerSecond(long records, long startTime) {
        // guard against a zero interval on very fast runs
        long elapsedMillis = Math.max(1L, System.currentTimeMillis() - startTime);
        return records * 1000.0 / elapsedMillis;
    }

    /**
     * Renames the written segment {@code <outFile>.seg-00000.warc.gz} to {@code outFile} and
     * removes the segment's {@code .crc} side file when present.
     *
     * @throws IOException if the segment is missing, or renaming or deleting fails
     */
    private static void finalizeOutput(File outFile) throws IOException {
        // rename from out.warc.gz.seg-00000.warc.gz to out.warc.gz
        File actualOutputFile = new File(outFile.getAbsolutePath() + ".seg-00000.warc.gz");
        if (!actualOutputFile.exists()) {
            throw new IOException("File " + actualOutputFile + " does not exist");
        }
        if (!actualOutputFile.renameTo(outFile)) {
            throw new IOException(
                    "Renaming file " + actualOutputFile + " to " + outFile + " failed");
        }

        // delete .crc file; a missing file is not an error, only a failed deletion is
        File crcFile = new File(actualOutputFile.getParentFile(),
                "." + actualOutputFile.getName() + ".crc");
        if (crcFile.exists() && !crcFile.delete()) {
            throw new IOException(crcFile + " was not deleted");
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} input warc.gz file, {@code args[1]} output warc.gz file,
     *             optional {@code args[2]}: the literal {@code html} to keep minimal HTML
     *             instead of plain text
     * @throws IOException              if processing fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: WarcBoilerplateRemoval <input.warc.gz> <output.warc.gz> [html]");
        }

        File input = new File(args[0]);
        File output = new File(args[1]);

        // keep minimal html?
        boolean keepMinimalHtml = args.length > 2 && "html".equals(args[2]);

        processWarcGzFile(input, output, keepMinimalHtml);
    }
}