org.opensextant.mapreduce.XponentsTaggerDemo.java Source code

Java tutorial

Introduction

Here is the source code for org.opensextant.mapreduce.XponentsTaggerDemo.java

Source

/*
 * This software was produced for the U. S. Government
 * under Basic Contract No. W15P7T-13-C-A802, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (FEB 2012)
 *
 * Copyright (C) 2016 The MITRE Corporation.
 * Copyright (C) 2016 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.mapreduce;

import java.io.IOException;
import java.util.Arrays;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.opensextant.util.GeonamesUtility;

import gnu.getopt.LongOpt;

public class XponentsTaggerDemo extends Configured implements Tool {
    /**
     * Job configuration key under which {@link #run(String[])} stores the value of the
     * {@code --log4j-extra-config} command-line option, when that option is given.
     */
    public static final String LOG4J_SUPPLEMENTARY_CONFIGURATION = "opensextantLog4jSupplementaryConfiguration";

    /**
     * Reads a date range written as {@code yyyyMMdd:yyyyMMdd} from a configuration property.
     *
     * @param key name of the configuration property that holds the range
     * @param cfg configuration to read the property from
     * @return the parsed range with both ends as epoch milliseconds, or {@code null}
     *         when the property is not set
     * @throws IOException if the value is not exactly two colon-separated fields, if either
     *         field cannot be parsed with the {@code yyyyMMdd} pattern, or if the resulting
     *         range is rejected by {@link DateRange#isValid()}
     */
    public static DateRange parseDateRange(String key, Configuration cfg) throws IOException {
        String d = cfg.get(key);
        if (d == null) {
            return null;
        }
        String[] dates = d.split(":");
        if (dates.length != 2) {
            // Without this check a value lacking ':' fails with ArrayIndexOutOfBoundsException.
            throw new IOException("Date Range must look like yyyyMMdd:yyyyMMdd, but was '" + d + "'");
        }
        DateTimeFormatter datefmt = DateTimeFormat.forPattern("yyyyMMdd");
        DateRange r = new DateRange();
        try {
            r.startDate = datefmt.parseDateTime(dates[0]).getMillis();
            r.endDate = datefmt.parseDateTime(dates[1]).getMillis();
        } catch (IllegalArgumentException parseErr) {
            // Report unparseable text through the declared exception type, keeping the cause.
            throw new IOException("Date Range has an unparseable date: '" + d + "'", parseErr);
        }
        if (r.isValid()) {
            return r;
        }
        throw new IOException("Date Range is incorrect");
    }

    /**
     * Reads a range of integer UTC values written as {@code start:end} from a configuration
     * property. The usage text of this tool shows the form {@code h:h}, e.g. {@code -12:-7}.
     *
     * @param key name of the configuration property that holds the range
     * @param cfg configuration to read the property from
     * @return the parsed range, or {@code null} when the property is not set
     * @throws IOException if the value is not exactly two colon-separated integers, or if
     *         the resulting range is rejected by {@link UTCRange#isValid()}
     */
    public static UTCRange parseUTCRange(String key, Configuration cfg) throws IOException {
        String u = cfg.get(key);
        if (u == null) {
            return null;
        }

        String[] u1u2 = u.split(":");
        if (u1u2.length != 2) {
            // Without this check a value lacking ':' fails with ArrayIndexOutOfBoundsException.
            throw new IOException("UTC Range must look like start:end, but was '" + u + "'");
        }
        UTCRange range = new UTCRange();
        try {
            range.startUTC = Integer.parseInt(u1u2[0]);
            range.endUTC = Integer.parseInt(u1u2[1]);
        } catch (NumberFormatException parseErr) {
            // Report non-numeric text through the declared exception type, keeping the cause.
            throw new IOException("UTC Range has a non-integer bound: '" + u + "'", parseErr);
        }
        // Same contract as parseDateRange: a range that can never match is rejected up front.
        if (!range.isValid()) {
            throw new IOException("UTC Range is incorrect");
        }
        return range;
    }

    /**
     * Inclusive range of integer UTC values, from {@link #startUTC} to {@link #endUTC}.
     * Both bounds are 0 until assigned.
     */
    public static class UTCRange {

        /** Lower bound, inclusive. */
        public int startUTC;
        /** Upper bound, inclusive. */
        public int endUTC;

        /**
         * @return true unless the start lies after the end
         */
        public boolean isValid() {
            return !(startUTC > endUTC);
        }

        /**
         * @param utc value to test
         * @return true when the value lies between the two bounds, bounds included
         */
        public boolean isWithin(int utc) {
            if (utc < startUTC) {
                return false;
            }
            return utc <= endUTC;
        }
    }

    /**
     * Inclusive range of instants held as epoch milliseconds: 1970-01-01 is 0, and values
     * may be negative or positive. A value of 0 generally means "not set", which is why
     * {@link #isValid()} rejects a start of 0.
     */
    public static class DateRange {
        /** Range start in epoch milliseconds, inclusive; 0 until assigned. */
        public long startDate = 0;
        /** Range end in epoch milliseconds, inclusive; 0 until assigned. */
        public long endDate = 0;

        /**
         * @return true when the end lies strictly after the start and the start is not 0
         */
        public boolean isValid() {
            // Short-circuit && replaces the original non-short-circuit &; the result is the same.
            return endDate > startDate && startDate != 0;
        }

        /**
         * @param epoch instant to test, in epoch milliseconds
         * @return true when the instant lies between start and end, bounds included
         */
        public boolean isWithin(long epoch) {
            return epoch >= startDate && epoch <= endDate;
        }

        /**
         * @return both bounds rendered through {@link Date#toString()}, joined by " to "
         */
        @Override
        public String toString() {
            return String.format("%s to %s", new Date(startDate), new Date(endDate));
        }
    }

    /** Shared geonames helper; remains null until {@link #initResources()} has been called. */
    public static GeonamesUtility geonames;

    /**
     * Creates a new {@link GeonamesUtility} and stores it in {@link #geonames}, replacing
     * any instance already held there.
     *
     * @throws IOException declared for failures while constructing the utility
     */
    public static void initResources() throws IOException {
        // Country metadata: codes, names, timezones.
        geonames = new GeonamesUtility();
    }

    /**
     * Parses the command line, configures the tagging job and runs it to completion.
     *
     * <pre>
     * Run:
     *    --in /input/path --out /output/path --phase PHASE --cc CC
     *    [--date-range yyyymmdd:yyyymmdd] [--utc-range h:h] [--log4j-extra-config CONFIG]
     *
     *    PHASE = geotag | xtax
     * </pre>
     *
     * @param args command-line arguments, long options only
     * @return 0 when the job completes successfully, 1 when the job fails, and -2 when the
     *         arguments are unusable (unknown option, missing required option, or an
     *         unsupported phase)
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    @Override
    public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        LongOpt[] options = { new LongOpt("in", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
                new LongOpt("out", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
                new LongOpt("phase", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
                new LongOpt("cc", LongOpt.REQUIRED_ARGUMENT, null, 'c'),
                new LongOpt("date-range", LongOpt.REQUIRED_ARGUMENT, null, 'd'),
                new LongOpt("utc-range", LongOpt.REQUIRED_ARGUMENT, null, 'u'),
                new LongOpt("log4j-extra-config", LongOpt.REQUIRED_ARGUMENT, null, 'L') };

        gnu.getopt.Getopt opts = new gnu.getopt.Getopt("socgeo", args, "", options);
        String inPath = null;
        String outPath = null;
        String phase = null;
        String cc = null;
        String dateRange = null;
        String utcRange = null;
        String log4jExtraConfig = null;

        System.out.println(Arrays.toString(args));
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {

            case 0:
                // 0 = Long opt processed.
                break;

            case 'i':
                inPath = opts.getOptarg();
                break;

            case 'o':
                outPath = opts.getOptarg();
                break;

            case 'p':
                phase = opts.getOptarg();
                break;

            case 'c':
                cc = opts.getOptarg();
                break;

            case 'd':
                dateRange = opts.getOptarg();
                break;

            case 'u':
                utcRange = opts.getOptarg();
                break;

            case 'L':
                log4jExtraConfig = opts.getOptarg();
                break;

            default:
                return -2;

            }
        }

        /* REQUIRED arguments. Everything is validated before any resource is loaded or
         * any job object is created, so a bad command line fails fast with -2.
         */
        if (cc == null || phase == null) {
            System.err.println("Phase and CC must be set");
            return -2;
        }
        if (inPath == null || outPath == null) {
            // Otherwise null reaches addInputPaths(...) / new Path(...) further down.
            System.err.println("Input and output paths must be set");
            return -2;
        }
        boolean geotagPhase = phase.equalsIgnoreCase("geotag");
        boolean xtaxPhase = phase.equalsIgnoreCase("xtax");
        if (!geotagPhase && !xtaxPhase) {
            // Otherwise the job would be submitted without any mapper class being set here.
            System.err.println("Unknown phase '" + phase + "'; expected geotag or xtax");
            return -2;
        }

        /* Helper resources -- possibly just replaced by existing SocGeoBase utilities.
         */
        XponentsTaggerDemo.initResources();

        /* Job App Configuration
         */
        Job job = Job.getInstance(getConf(), "Xponents-Tagging");
        job.setJarByClass(XponentsTaggerDemo.class);

        job.getConfiguration().set("phase", phase);
        job.getConfiguration().set("country", cc);
        if (dateRange != null) {
            job.getConfiguration().set("date-range", dateRange);
        }
        if (utcRange != null) {
            job.getConfiguration().set("utc-range", utcRange);
        }
        if (log4jExtraConfig != null) {
            job.getConfiguration().set(LOG4J_SUPPLEMENTARY_CONFIGURATION, log4jExtraConfig);
        }

        /* Mapper */
        if (geotagPhase) {
            job.setMapperClass(GeoTaggerMapper.class);
        } else {
            job.setMapperClass(KeywordTaggerMapper.class);
        }

        /* NOTE(review): the comment that used to sit here said "No Reduce step", yet the job
         * is configured with one reduce task and this method sets no reducer class.
         * A map-only job would use 0 reduce tasks -- confirm which of the two is intended.
         */
        job.setNumReduceTasks(1);

        /* Job Input */
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPaths(job, inPath);

        /* Map Phase output, compressed with Snappy */
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
        job.getConfiguration().set("mapreduce.map.output.compress.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");

        /* Job Output: block-compressed sequence files, Snappy codec */
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputPath(job, new Path(outPath));
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        job.getConfiguration().set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        return job.waitForCompletion(true) ? 0 : 1;

    }

    /**
     * Prints the command-line synopsis and one example invocation to stderr.
     * The option names shown are the long options that {@code run} registers.
     */
    public static void usage() {
        System.err.println("XponentsTaggerDemo --in HDFSPATH --out HDFSPATH --phase [ geotag | xtax ] \n"
                + "              --cc COUNTRYCODE [--date-range yyyymmdd:yyyymmdd] [--utc-range h:h] [--log4j-extra-config file:///path/to/config]"
                + "\nExample:\n\n"
                + "XponentsTaggerDemo --in /data/some/thing  --out /user/ubbs/output/some-thing-tagged --phase geotag \n"
                + "              --cc US  --date-range 20150101:20150301  --utc-range -12:-7");

    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner} with a fresh
     * {@link Configuration}, prints the usage text when the tool reports -2, and exits the
     * JVM with the tool's return value.
     *
     * @param args command-line arguments handed to the tool
     * @throws Exception whatever {@link ToolRunner#run} propagates
     */
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final XponentsTaggerDemo tool = new XponentsTaggerDemo();
        final int exitCode = ToolRunner.run(conf, tool, args);
        // -2 is the code run() uses for an unusable command line.
        if (exitCode == -2) {
            usage();
        }
        System.exit(exitCode);
    }

}