de.l3s.content.timex.extracting.ClueWeb09TimexWriteToHDFS.java Source code

Introduction

Here is the source code for de.l3s.content.timex.extracting.ClueWeb09TimexWriteToHDFS.java
Source

package de.l3s.content.timex.extracting;

/*
 * TIMETool - Large-scale Temporal Search in MapReduce
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

/*
 * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
 * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * @author 
 */

import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Scanner;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.clueweb.clueweb09.ClueWeb09WarcRecord;
import org.clueweb.clueweb09.mapreduce.ClueWeb09InputFormat;

import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import de.l3s.content.timex.extracting.utils.DateUtil;
import de.unihd.dbs.heideltime.standalone.DocumentType;
import de.unihd.dbs.heideltime.standalone.HeidelTimeAnnotator;
import de.unihd.dbs.heideltime.standalone.HeidelTimeStandalone;
import de.unihd.dbs.heideltime.standalone.OutputType;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Triple;

public class ClueWeb09TimexWriteToHDFS extends Configured implements Tool {
    /** Logger shared by this tool and its nested mapper class. */
    private static final Logger LOG = Logger.getLogger(ClueWeb09TimexWriteToHDFS.class);

    // The HBase names below are not referenced by any code visible in this file
    // (this tool writes plain text to HDFS).
    // NOTE(review): presumably kept for sibling HBase-based tools - confirm before removing.

    // HBase column family for document content
    public static final String CONTENT_CF = "content_cf";
    // HBase content qualifier: cleaned text
    public static final String CLEANED = "cleaned";
    // HBase content qualifier: publication date
    public static final String PUBDATE = "pubdate";
    // HBase column family for temporal expressions
    public static final String TEMPEX_CF = "tempex";
    // HBase tempex qualifier: raw tags
    public static final String RAW = "raw";
    // HBase tempex qualifier: annotated tags
    public static final String ANNOTATED = "annotated";

    // Confidence labels attached to an extracted document date.
    // The mapper tests the label with contains("strong"), so STRONG,
    // MILDLY_STRONG and VERY_STRONG all count as "strong" there.
    public static final String VERY_WEAK = "very weak";
    public static final String WEAK = "weak";
    public static final String STRONG = "strong";
    public static final String MILDLY_STRONG = "mildly strong";
    public static final String VERY_STRONG = "very strong";

    /** Hadoop counter names; LINES is not incremented by any active code in this file. */
    public enum Counters {
        LINES
    }

    // Input formats tried in this order by parseDate once the "GMT" marker has
    // been stripped. SimpleDateFormat is not thread-safe, so these shared
    // instances must not be used from several threads at once.
    // Sample date: Mon, 14 Apr 2008 10:05:10 GMT
    static SimpleDateFormat simpleDateFormat = new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss", Locale.ROOT);
    // Sample date: Mon Apr 14 10:05:10 2008
    static SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("E MMM dd HH:mm:ss yyyy", Locale.ROOT);
    // Sample date: Mon, 14-Apr-2008 10:05:10
    static SimpleDateFormat simpleDateFormat3 = new SimpleDateFormat("E, dd-MMM-yyyy HH:mm:ss", Locale.ROOT);

    /**
     * Matches one TIMEX3 tag of type DATE in TimeML output.
     * Group 1 is the numeric part of the tid, group 2 the normalized value and
     * group 3 the tagged text. The tagged text is matched with {@code [^<]*}
     * so that expressions of any length are found; the previous {@code [^<]?}
     * only matched tags whose text was at most one character long.
     */
    private static final Pattern timex3Date = Pattern.compile(
            "<TIMEX3 tid=\"t(\\d+)\" type=\"DATE\" value=\"([^\"]+)\">([^<]*)</TIMEX3>", Pattern.MULTILINE);

    // Output format of parseDate. Locale.ROOT keeps the result independent of
    // the JVM default locale, consistent with the input formats above.
    static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT);

    // Constant count of 1; not used by any active code in this file.
    private final static IntWritable one = new IntWritable(1);

    /**
     * Normalizes an HTTP-style date header value to {@code yyyy-MM-dd}.
     *
     * <p>The "GMT" marker is removed, leading and trailing spaces are trimmed
     * and runs of spaces are collapsed to one. The result is then parsed with
     * {@code simpleDateFormat}, {@code simpleDateFormat2} and
     * {@code simpleDateFormat3}, in that order; the first format that parses
     * wins.
     *
     * <p>Not thread-safe: it uses the shared static {@link SimpleDateFormat}
     * instances of this class.
     *
     * @param gmtDate the raw header value; may be {@code null}
     * @return the date formatted as {@code yyyy-MM-dd}, or the fixed default
     *         {@code "2009-03-01"} when the input is {@code null} or matches
     *         none of the known formats
     */
    public static String parseDate(String gmtDate) {
        final String fallback = "2009-03-01";
        if (gmtDate == null) {
            return fallback;
        }

        String cleaned = gmtDate.replace(" GMT", "").replace("GMT", "");
        // Trim both ends and collapse every run of spaces into a single space.
        cleaned = cleaned.replaceAll("^ +| +$|( )+", "$1");

        SimpleDateFormat[] candidates = { simpleDateFormat, simpleDateFormat2, simpleDateFormat3 };
        for (SimpleDateFormat candidate : candidates) {
            try {
                Date parsed = candidate.parse(cleaned);
                return dateFormat.format(parsed);
            } catch (ParseException ignored) {
                // This format does not apply; fall through to the next one.
            }
        }
        return fallback;
    }

    private static class TMapper extends Mapper<LongWritable, ClueWeb09WarcRecord, Text, Writable> {

        HeidelTimeStandalone narrative = null;
        HeidelTimeStandalone colloguial = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            narrative = new HeidelTimeStandalone(Language.ENGLISH, DocumentType.NARRATIVES, OutputType.TIMEML);
            colloguial = new HeidelTimeStandalone(Language.ENGLISH, DocumentType.COLLOQUIAL, OutputType.TIMEML);
        }

        @Override
        public void map(LongWritable key, ClueWeb09WarcRecord doc, Context context)
                throws IOException, InterruptedException {

            String docid = doc.getDocid();
            String url = doc.getHeaderMetadataItem("WARC-Target-URI");
            /**
             if (url == null) return;
            String site = new URL(url).getHost();
                
            context.write(new Text(site), one);
             */
            Pair<String, String> docDate = null;
            if (doc.getHeaderMetadataItem("Last-Modified") != null)
                docDate = Pair.makePair(parseDate(doc.getHeaderMetadataItem("Last-Modified")), WEAK);
            else if (doc.getHeaderMetadataItem("Date") != null)
                docDate = Pair.makePair(parseDate(doc.getHeaderMetadataItem("Date")), VERY_WEAK);
            else
                docDate = Pair.makePair("2009-03-01", "N/A");
            if (docid != null) {
                try {
                    //clean html
                    LOG.info(docid);

                    String content = ArticleExtractor.INSTANCE.getText(doc.getContent());
                    if (content == null || content.equals("")) {
                        //do not save records with empty content
                        return;
                    }
                    Scanner contentScanner = new Scanner(content);
                    String firstLines = (contentScanner.hasNext()) ? contentScanner.nextLine()
                            : "" + ((contentScanner.hasNext()) ? contentScanner.nextLine() : "");
                    contentScanner.close();
                    //assume the publication date is from the first 2 lines
                    String pubDateTags = narrative.tag(content, firstLines);
                    Matcher date = (pubDateTags == null) ? null : timex3Date.matcher(pubDateTags);
                    //the first extracted absolute date is the publication date
                    if (date != null && date.find()) {
                        docDate = (date.group(2).length() == "yyyy-MM-dd".length())
                                ? Pair.makePair(date.group(2), STRONG)
                                : Pair.makePair(date.group(2), MILDLY_STRONG);
                    } else {
                        //get publication date from URL
                        Pair<String, String> docDateURL = DateUtil.extractDateFromURL_(url);
                        docDate = (docDateURL == null) ? docDate : docDateURL;
                    }
                    String timetags;
                    StringBuffer _timetags = new StringBuffer();
                    //if publication date is not extracted then very likely it is not a web article

                    LOG.info("Doc date:" + docDate.toString());
                    if (!docDate.second.contains("strong")) {
                        content = DefaultExtractor.INSTANCE.getText(doc.getContent());
                        //reference point is not important here
                        //for HeidelTime
                        timetags = narrative.tag(content, docDate.first);
                        //annotation is not necessary here 
                    } else {
                        timetags = colloguial.tag(content, docDate.first);
                        ArrayList<Triple<String, String, String>> triples = HeidelTimeAnnotator.annotate(content,
                                docDate.first);
                        for (Triple<String, String, String> triple : triples) {
                            _timetags.append(triple.toString());
                        }
                    }

                    LOG.info("Doc date: " + docDate.toString());
                    LOG.info("Time tags: " + timetags);

                    //context.getCounter(Counters.LINES).increment(1);

                    context.write(new Text(docid + "\t" + docDate.toString() + "\t" + timetags.toString() + "\t"
                            + _timetags.toString()), null);

                } catch (BoilerpipeProcessingException e) {
                } catch (ArrayIndexOutOfBoundsException ae) {
                }
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Name of the command-line option carrying the input path (required by run).
    public static final String INPUT_OPTION = "input";
    // Name of the command-line option carrying the output path (required by run).
    public static final String OUTPUT_OPTION = "output";
    // Not referenced by any code visible in this file.
    public static final String COLUMN = "column";

    /**
     * Runs this tool: parses the {@code -input} and {@code -output} options,
     * configures a map-only job that reads ClueWeb09 records with
     * {@code TMapper}, and waits for it to finish.
     *
     * <p>The job is built on the configuration supplied through
     * {@code getConf()}, so generic Hadoop options handled by
     * {@code ToolRunner} (such as {@code -D key=value}) take effect.
     *
     * @param args command-line arguments remaining after generic option parsing
     * @return 0 if the job succeeded, 1 if it failed, -1 if the arguments were
     *         malformed or a required option was missing
     * @throws Exception if the job cannot be configured or submitted
     */
    @SuppressWarnings("static-access")
    public int run(String[] args) throws Exception {
        Options options = new Options();

        options.addOption(
                OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));

        options.addOption(
                OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));

        CommandLine cmdline;
        CommandLineParser parser = new GnuParser();
        try {
            cmdline = parser.parse(options, args);
        } catch (org.apache.commons.cli.ParseException e) {
            // Fully qualified: this file imports java.text.ParseException.
            System.err.println("Error parsing command line: " + e.getMessage());
            printUsage(options);
            return -1;
        }

        if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
            printUsage(options);
            return -1;
        }

        String input = cmdline.getOptionValue(INPUT_OPTION);
        String output = cmdline.getOptionValue(OUTPUT_OPTION);

        LOG.info("Tool name: " + ClueWeb09TimexWriteToHDFS.class.getSimpleName());
        LOG.info(" - input: " + input);
        LOG.info(" - output: " + output);

        // Reuse the configuration injected by ToolRunner; fall back to a fresh
        // one only when the tool is invoked without it.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }
        // 36,000,000 ms = 10 hours per task before Hadoop considers it hung.
        long milliSeconds = 10L * 60 * 60 * 1000;
        conf.setLong("mapred.task.timeout", milliSeconds);

        Job job = Job.getInstance(conf, "extract CW tempex and output to HDFS");
        job.setJarByClass(ClueWeb09TimexWriteToHDFS.class);
        // Map-only job: mapper output goes straight to the output path.
        job.setNumReduceTasks(0);

        job.setInputFormatClass(ClueWeb09InputFormat.class);
        job.setMapperClass(TMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /** Prints this tool's options followed by the generic Hadoop options. */
    private void printUsage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
    }

    /**
     * Dispatches command-line arguments to the tool via the <code>ToolRunner</code>
     * and exits the JVM with the status the tool returns, so that shell scripts
     * and schedulers can detect a failed or misconfigured run.
     *
     * @param args raw command-line arguments, including generic Hadoop options
     * @throws Exception if the tool fails with an unhandled error
     */
    public static void main(String[] args) throws Exception {
        LOG.info("Running " + ClueWeb09TimexWriteToHDFS.class.getCanonicalName() + " with args "
                + Arrays.toString(args));
        int exitCode = ToolRunner.run(new ClueWeb09TimexWriteToHDFS(), args);
        System.exit(exitCode);
    }

}