net.anthonypoon.ngram.rollingregression.Main.java Source code

Java tutorial

Introduction

Here is the source code for net.anthonypoon.ngram.rollingregression.Main.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package net.anthonypoon.ngram.rollingregression;

import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 *
 * @author ypoon
 */
public class Main {
    public static void main(String[] args) throws Exception {
        Options options = new Options();
        options.addOption("a", "action", true, "Action");
        options.addOption("i", "input", true, "input");
        options.addOption("o", "output", true, "output");
        //options.addOption("f", "format", true, "Format");
        options.addOption("u", "upbound", true, "Year up bound");
        options.addOption("l", "lowbound", true, "Year low bound");
        options.addOption("r", "range", true, "Range");
        options.addOption("T", "threshold", true, "Threshold - min count for regression");
        options.addOption("p", "positive-only", false, "Write positive slope only"); // default faluse
        CommandLineParser parser = new GnuParser();
        CommandLine cmd = parser.parse(options, args);
        Configuration conf = new Configuration();
        if (cmd.hasOption("range")) {
            conf.set("range", cmd.getOptionValue("range"));
        }
        if (cmd.hasOption("upbound")) {
            conf.set("upbound", cmd.getOptionValue("upbound"));
        } else {
            conf.set("upbound", "9999");
        }
        if (cmd.hasOption("lowbound")) {
            conf.set("lowbound", cmd.getOptionValue("lowbound"));
        } else {
            conf.set("lowbound", "0");
        }
        if (cmd.hasOption("threshold")) {
            conf.set("threshold", cmd.getOptionValue("threshold"));
        }
        if (cmd.hasOption("positive-only")) {
            conf.set("positive-only", "true");
        }
        Job job = Job.getInstance(conf);
        /**
        if (cmd.hasOption("format")) {
        switch (cmd.getOptionValue("format")) {
            case "compressed":
                job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
                break;
            case "text":
                job.setInputFormatClass(KeyValueTextInputFormat.class);
                break;
        }
            
        }**/
        job.setJarByClass(Main.class);
        switch (cmd.getOptionValue("action")) {
        case "get-regression":
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            for (String inputPath : cmd.getOptionValue("input").split(",")) {
                MultipleInputs.addInputPath(job, new Path(inputPath), KeyValueTextInputFormat.class,
                        RollingRegressionMapper.class);
            }
            job.setReducerClass(RollingRegressionReducer.class);
            break;
        default:
            throw new IllegalArgumentException("Missing action");
        }

        String timestamp = new SimpleDateFormat("yyyy_MM_dd_HH_mm_ss").format(new Date());

        //FileInputFormat.setInputPaths(job, new Path(cmd.getOptionValue("input")));
        FileOutputFormat.setOutputPath(job, new Path(cmd.getOptionValue("output") + "/" + timestamp));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        /**
        double[] nazismBaseLine = {3, 12, 12, 18, 233, 239, 386, 333, 593, 1244, 1925, 3013, 3120, 3215, 3002, 3355, 2130, 1828, 1406, 1751, 1433, 1033, 881, 1330, 1029, 760, 1288, 1013, 1014};
        InputStream inStream = Main.class.getResourceAsStream("/1g-matrix.txt");
        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
        String line = "";
        Map<String, Double> result = new HashMap();
        while ((line = br.readLine()) != null) {
        String[] strArray = line.split("\t");
        double[] compareArray = new double[nazismBaseLine.length];
        for (int i = 0; i < nazismBaseLine.length; i ++) {
            compareArray[i] = Double.valueOf(strArray[i + 24]);
        }
        result.put(strArray[0], new PearsonsCorrelation().correlation(nazismBaseLine, compareArray));
        }
        List<Map.Entry<String, Double>> toBeSorted = new ArrayList();
        for (Map.Entry pair : result.entrySet()) {
        toBeSorted.add(pair);
        }
        Collections.sort(toBeSorted, new Comparator<Map.Entry<String, Double>>(){
        @Override
        public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
        });
        for (Map.Entry<String, Double> pair : toBeSorted) {
        if (!Double.isNaN(pair.getValue())) {
            System.out.println(pair.getKey() + "\t" + pair.getValue());
        }
        }**/
    }
}