de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.LangLicenseStatistics.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.LangLicenseStatistics.java
Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics;

import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.ConfigurationHelper;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCWritable;
import de.tudarmstadt.ukp.dkpro.c4corpus.warc.io.WARCRecord;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Collecting statistics over languages and licenses in processed Web corpus.
 *
 * @author Ivan Habernal
 */
public class LangLicenseStatistics extends Configured implements Tool {

    private static final Text KEY_DOCUMENTS = new Text("documents");
    private static final Text KEY_TOKENS = new Text("tokens");

    private static final Pattern PATTERN = Pattern.compile("\\s+");

    /**
     * Returns an approximate number of words based on number of whitespace blocks.
     *
     * @param string string
     * @return word count
     */
    public static int fastApproximateWordCount(String string) {
        Matcher m = PATTERN.matcher(string);

        int count = 1;
        while (m.find()) {
            count++;
        }

        return count;
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());

        ConfigurationHelper.configureJob(job, LangLicenseStatistics.class, MapperClass.class, ReducerClass.class,
                args[0], args[1]);

        // intermediate data
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MapWritable.class);

        // output data
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new LangLicenseStatistics(), args);
    }

    public static class MapperClass extends Mapper<LongWritable, WARCWritable, Text, MapWritable> {

        @Override
        protected void map(LongWritable key, WARCWritable value, Context context)
                throws IOException, InterruptedException {
            WARCRecord warcRecord = value.getRecord();

            // get metadata: language & license
            String lang = warcRecord.getHeader().getField(WARCRecord.WARCRecordFieldConstants.LANGUAGE);

            String license = warcRecord.getHeader().getField(WARCRecord.WARCRecordFieldConstants.LICENSE);

            if (lang == null) {
                throw new IOException(WARCRecord.WARCRecordFieldConstants.LANGUAGE
                        + " metadata not found. Run language identification first");
            }

            if (license == null) {
                throw new IOException(WARCRecord.WARCRecordFieldConstants.LICENSE
                        + " metadata not found. Run license identification first");
            }

            // get number of tokens
            String content = new String(warcRecord.getContent(), "utf-8");
            int tokenCount = fastApproximateWordCount(content);

            // collect output statistics into map
            MapWritable mapWritable = new MapWritable();
            mapWritable.put(KEY_DOCUMENTS, new IntWritable(1));
            mapWritable.put(KEY_TOKENS, new IntWritable(tokenCount));

            // create tab-delimited key
            Text outputKey = new Text(lang + "\t" + license);

            // write
            context.write(outputKey, mapWritable);
        }
    }

    public static class ReducerClass extends Reducer<Text, MapWritable, NullWritable, Text> {

        @Override
        protected void reduce(Text key, Iterable<MapWritable> values, Context context)
                throws IOException, InterruptedException {
            // collect statistics (int is too short for that!)
            long documentCounts = 0;
            long tokensCount = 0;
            for (MapWritable mapWritable : values) {
                documentCounts += ((IntWritable) mapWritable.get(KEY_DOCUMENTS)).get();
                tokensCount += ((IntWritable) mapWritable.get(KEY_TOKENS)).get();
            }

            // print as tab separated: language \t license \t documents \t tokens
            context.write(NullWritable.get(),
                    new Text(String.format("%s\t%d\t%d", key, documentCounts, tokensCount)));
        }

    }
}