Example usage for java.util HashSet add

List of usage examples for java.util HashSet add

Introduction

On this page you can find usage examples for java.util.HashSet.add().

Prototype

public boolean add(E e) 

Document

Adds the specified element to this set if it is not already present.
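
The method returns true if the set did not already contain the element, and false otherwise, in which case the set is left unchanged. A minimal, self-contained sketch of this contract (class and element names are illustrative, not taken from the examples below):

import java.util.HashSet;

public class HashSetAddExample {
    public static void main(String[] args) {
        HashSet<String> set = new HashSet<String>();
        System.out.println(set.add("a")); // true: "a" was not present, so it is added
        System.out.println(set.add("a")); // false: "a" is already present; the set is unchanged
        System.out.println(set.size());   // prints 1
    }
}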

Usage

From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the index and events " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option thresholdOption = new Option("t", "use-custom-thresholds", false,
            "use custom thresholds for regular and rare events, defined in HDFS_HOME/"
                    + FrameworkUtils.thresholdDir + " file");
    thresholdOption.setRequired(false);
    options.addOption(thresholdOption);

    Option gOption = new Option("g", "group", true,
            "set group of datasets for which the indices and events" + " will be computed");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp("hadoop jar data-polygamy.jar "
                    + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetIndex = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();
    HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>();
    HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>();

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());
    BufferedReader br;

    boolean removeExistingFiles = cmd.hasOption("f");
    boolean isThresholdUserDefined = cmd.hasOption("t");

    for (String dataset : cmd.getOptionValues("g")) {

        // getting aggregates
        String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3);
        if (aggregate.length == 0) {
            System.out.println("No aggregates found for " + dataset + ".");
            continue;
        }

        // getting aggregates header
        String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3);
        if (aggregatesHeaderFileName == null) {
            System.out.println("No aggregate header for " + dataset);
            continue;
        }

        String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName;

        shortDataset.add(dataset);
        datasetId.put(dataset, null);

        if (s3) {
            path = new Path(aggregatesHeader);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader);
        }

        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        datasetAgg.put(dataset, br.readLine().split("\t")[1]);
        br.close();
        if (s3)
            fs.close();
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    // getting user defined thresholds

    if (isThresholdUserDefined) {
        if (s3) {
            path = new Path(s3bucket + FrameworkUtils.thresholdDir);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir);
        }
        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        line = br.readLine();
        while (line != null) {
            // getting dataset name
            String dataset = line.trim();
            HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>();
            HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>();
            line = br.readLine();
            while ((line != null) && (line.split("\t").length > 1)) {
                // getting attribute ids and thresholds
                String[] keyVals = line.trim().split("\t");
                int att = Integer.parseInt(keyVals[0].trim());
                regThresholds.put(att, Double.parseDouble(keyVals[1].trim()));
                rareThresholds.put(att, Double.parseDouble(keyVals[2].trim()));
                line = br.readLine();
            }
            datasetRegThreshold.put(dataset, regThresholds);
            datasetRareThreshold.put(dataset, rareThresholds);
        }
        br.close();
    }
    if (s3)
        fs.close();

    // datasets that will use existing merge tree
    ArrayList<String> useMergeTree = new ArrayList<String>();

    // creating index for each spatio-temporal resolution

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3);

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/";

        if (removeExistingFiles) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3);
            FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3);
        } else if (datasetRegThreshold.containsKey(dataset)) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) {
                useMergeTree.add(dataset);
            }
        }

        if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) {
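            // add() silently ignores a path that is already in the set, so each aggregates input directory is listed only once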
            input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset);
            shortDatasetIndex.add(dataset);
        }

    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have indices.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    String aggregateDatasets = "";
    it = input.iterator();
    while (it.hasNext()) {
        aggregateDatasets += it.next() + ",";
    }

    Job icJob = null;
    Configuration icConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "index";
    String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/";

    FrameworkUtils.removeFile(indexOutputDir, s3conf, s3);

    icConf.set("dataset-name", datasetNames);
    icConf.set("dataset-id", datasetIds);

    if (!useMergeTree.isEmpty()) {
        String useMergeTreeStr = "";
        for (String dt : useMergeTree) {
            useMergeTreeStr += dt + ",";
        }
        icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1));
    }

    for (int i = 0; i < shortDataset.size(); i++) {
        String dataset = shortDataset.get(i);
        String id = datasetId.get(dataset);
        icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset));
        if (datasetRegThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset);
            String thresholds = "";
            for (int att : regThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ",";
            }
            icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1));
        }

        if (datasetRareThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset);
            String thresholds = "";
            for (int att : rareThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ",";
            }
            icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1));
        }
    }

    icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    icConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    icConf.set("mapreduce.task.io.sort.mb", "200");
    icConf.set("mapreduce.task.io.sort.factor", "100");
    //icConf.set("mapreduce.task.timeout", "1800000");
    machineConf.setMachineConfiguration(icConf);

    if (s3) {
        machineConf.setMachineConfiguration(icConf);
        icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        icConf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    icJob = new Job(icConf);
    icJob.setJobName(jobName);

    icJob.setMapOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class);
    icJob.setOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setOutputValueClass(TopologyTimeSeriesWritable.class);
    //icJob.setOutputKeyClass(Text.class);
    //icJob.setOutputValueClass(Text.class);

    icJob.setMapperClass(IndexCreationMapper.class);
    icJob.setReducerClass(IndexCreationReducer.class);
    icJob.setNumReduceTasks(machineConf.getNumberReduces());

    icJob.setInputFormatClass(SequenceFileInputFormat.class);
    //icJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(icJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(icJob, true);
    FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1));
    FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir));

    icJob.setJarByClass(IndexCreation.class);

    long start = System.currentTimeMillis();
    icJob.submit();
    icJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetIndex) {
        String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}

From source file:edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques.java

/**
 * @param args
 * @throws ParseException 
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar "
                        + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar "
                        + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : new String[0];
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("First group of datasets (G1) is empty. " + "Doing G1 = G2.");
        firstGroup.addAll(secondGroup);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("Second group of datasets (G2) is empty. " + "Doing G2 = G1.");
        secondGroup.addAll(firstGroup);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.correlationTechniquesDir, s3conf, s3);

    String dataAttributesInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + FrameworkUtils.correlationTechniquesDir + "/"
                    + dataset1 + "-" + dataset2 + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
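                // the same dataset can appear in several pairs; add() keeps each aggregates directory at most once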
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset2);
            } else {
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        dataAttributesInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "correlation";
    String correlationOutputDir = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/";

    FrameworkUtils.removeFile(correlationOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(SpatioTemporalValueWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationTechniquesMapper.class);
    job.setReducerClass(CorrelationTechniquesReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job,
            dataAttributesInputDirs.substring(0, dataAttributesInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(correlationOutputDir));

    job.setJarByClass(CorrelationTechniques.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/" + dataset1 + "-"
                    + dataset2 + "/";
            String to = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2
                    + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:de.unileipzig.ub.indexer.App.java

public static void main(String[] args) throws IOException {

    // create Options object
    Options options = new Options();

    options.addOption("h", "help", false, "display this help");

    options.addOption("f", "filename", true, "name of the JSON file whose content should be indexed");
    options.addOption("i", "index", true, "the name of the target index");
    options.addOption("d", "doctype", true, "the name of the doctype (title, local, ...)");

    options.addOption("t", "host", true, "elasticsearch hostname (default: 0.0.0.0)");
    options.addOption("p", "port", true, "transport port (that's NOT the http port, default: 9300)");
    options.addOption("c", "cluster", true, "cluster name (default: elasticsearch_mdma)");

    options.addOption("b", "bulksize", true, "number of docs sent in one request (default: 3000)");
    options.addOption("v", "verbose", false, "show processing speed while indexing");
    options.addOption("s", "status", false, "only show status of index for file");

    options.addOption("r", "repair", false, "attempt to repair recoverable inconsistencies on the go");
    options.addOption("e", "debug", false, "set logging level to debug");
    options.addOption("l", "logfile", true, "logfile - if not specified, only log to stdout");

    options.addOption("m", "memcached", true, "host and port of memcached (default: localhost:11211)");
    options.addOption("z", "latest-flag-on", true,
            "enable latest flag according to field (within content, e.g. 001)");
    options.addOption("a", "flat", false, "flat-mode: do not check for inconsistencies");

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException ex) {
        logger.error(ex);
        System.exit(1);
    }

    // setup logging
    Properties systemProperties = System.getProperties();
    systemProperties.put("net.spy.log.LoggerImpl", "net.spy.memcached.compat.log.Log4JLogger");
    System.setProperties(systemProperties);
    Logger.getLogger("net.spy.memcached").setLevel(Level.ERROR);

    Properties props = new Properties();
    props.load(props.getClass().getResourceAsStream("/log4j.properties"));

    if (cmd.hasOption("debug")) {
        props.setProperty("log4j.logger.de.unileipzig", "DEBUG");
    }

    if (cmd.hasOption("logfile")) {
        props.setProperty("log4j.rootLogger", "INFO, stdout, F");
        props.setProperty("log4j.appender.F", "org.apache.log4j.FileAppender");
        props.setProperty("log4j.appender.F.File", cmd.getOptionValue("logfile"));
        props.setProperty("log4j.appender.F.layout", "org.apache.log4j.PatternLayout");
        props.setProperty("log4j.appender.F.layout.ConversionPattern", "%5p | %d | %F | %L | %m%n");
    }

    PropertyConfigurator.configure(props);

    InetAddress addr = InetAddress.getLocalHost();
    String memcachedHostAndPort = addr.getHostAddress() + ":11211";
    if (cmd.hasOption("m")) {
        memcachedHostAndPort = cmd.getOptionValue("m");
    }

    // setup caching
    try {
        if (memcachedClient == null) {
            memcachedClient = new MemcachedClient(
                    new ConnectionFactoryBuilder().setFailureMode(FailureMode.Cancel).build(),
                    AddrUtil.getAddresses(memcachedHostAndPort));
            try {
                // give client and server 300ms to settle
                Thread.sleep(300);
            } catch (InterruptedException ex) {
            }

            Collection availableServers = memcachedClient.getAvailableServers();
            logger.info(availableServers);
            if (availableServers.size() == 0) {
                logger.info("no memcached servers found");
                memcachedClient.shutdown();
                memcachedClient = null;
            } else {
                logger.info(availableServers.size() + " memcached server(s) detected, fine.");
            }
        }
    } catch (IOException ex) {
        logger.warn("couldn't create a connection, bailing out: " + ex.getMessage());
    }

    // process options

    if (cmd.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("indexer", options, true);
        quit(0);
    }

    boolean verbose = false;
    if (cmd.hasOption("verbose")) {
        verbose = true;
    }

    // ES options
    String[] hosts = new String[] { "0.0.0.0" };
    int port = 9300;
    String clusterName = "elasticsearch_mdma";
    int bulkSize = 3000;

    if (cmd.hasOption("host")) {
        hosts = cmd.getOptionValues("host");
    }
    if (cmd.hasOption("port")) {
        port = Integer.parseInt(cmd.getOptionValue("port"));
    }
    if (cmd.hasOption("cluster")) {
        clusterName = cmd.getOptionValue("cluster");
    }
    if (cmd.hasOption("bulksize")) {
        bulkSize = Integer.parseInt(cmd.getOptionValue("bulksize"));
        if (bulkSize < 1 || bulkSize > 100000) {
            logger.error("bulksize must be between 1 and 100,000");
            quit(1);
        }
    }

    // ES Client
    final Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "elasticsearch_mdma")
            .build();
    final TransportClient client = new TransportClient(settings);
    for (String host : hosts) {
        client.addTransportAddress(new InetSocketTransportAddress(host, port));
    }

    if (cmd.hasOption("filename") && cmd.hasOption("index") && cmd.hasOption("doctype")) {

        final String filename = cmd.getOptionValue("filename");

        final File _file = new File(filename);
        if (_file.length() == 0) {
            logger.info(_file.getAbsolutePath() + " is empty, skipping");
            quit(0); // file is empty
        }

        // for flat mode: leave a stampfile beside the json to 
        // indicate previous successful processing
        File directory = new File(filename).getParentFile();
        File stampfile = new File(directory, DigestUtils.shaHex(filename) + ".indexed");

        long start = System.currentTimeMillis();
        long lineCount = 0;

        final String indexName = cmd.getOptionValue("index");
        final String docType = cmd.getOptionValue("doctype");
        BulkRequestBuilder bulkRequest = client.prepareBulk();

        try {
            if (cmd.hasOption("flat")) {
                // flat mode
                // .........
                if (stampfile.exists()) {
                    logger.info("SKIPPING, since it seems this file has already " + "been imported (found: "
                            + stampfile.getAbsolutePath() + ")");
                    quit(0);
                }
            } else {

                final String srcSHA1 = extractSrcSHA1(filename);

                logger.debug(filename + " srcsha1: " + srcSHA1);

                long docsInIndex = getIndexedRecordCount(client, indexName, srcSHA1);
                logger.debug(filename + " indexed: " + docsInIndex);

                long docsInFile = getLineCount(filename);
                logger.debug(filename + " lines: " + docsInFile);

                // in non-flat-mode, indexing would take care
                // of inconsistencies
                if (docsInIndex == docsInFile) {
                    logger.info("UP-TO-DATE: " + filename + " (" + docsInIndex + ", " + srcSHA1 + ")");
                    client.close();
                    quit(0);
                }

                if (docsInIndex > 0) {
                    logger.warn("INCONSISTENCY DETECTED: " + filename + ": indexed:" + docsInIndex + " lines:"
                            + docsInFile);

                    if (!cmd.hasOption("r")) {
                        logger.warn(
                                "Please re-run indexer with --repair flag or delete residues first with: $ curl -XDELETE "
                                        + hosts[0] + ":9200/" + indexName
                                        + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1
                                        + "\" }}'");
                        client.close();
                        quit(1);
                    } else {
                        logger.info("Attempting to clear residues...");
                        // attempt to repair once
                        DeleteByQueryResponse dbqr = client.prepareDeleteByQuery(indexName)
                                .setQuery(termQuery("meta.srcsha1", srcSHA1)).execute().actionGet();

                        Iterator<IndexDeleteByQueryResponse> it = dbqr.iterator();
                        long deletions = 0;
                        while (it.hasNext()) {
                            IndexDeleteByQueryResponse response = it.next();
                            deletions += 1;
                        }
                        logger.info("Deleted residues of " + filename);
                        logger.info("Refreshing [" + indexName + "]");
                        RefreshResponse refreshResponse = client.admin().indices()
                                .refresh(new RefreshRequest(indexName)).actionGet();

                        long indexedAfterDelete = getIndexedRecordCount(client, indexName, srcSHA1);
                        logger.info(indexedAfterDelete + " docs remained");
                        if (indexedAfterDelete > 0) {
                            logger.warn("Not all residues cleaned. Try to fix this manually: $ curl -XDELETE "
                                    + hosts[0] + ":9200/" + indexName
                                    + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'");
                            quit(1);
                        } else {
                            logger.info("Residues are gone. Now trying to reindex: " + filename);
                        }
                    }
                }
            }

            logger.info("INDEXING-REQUIRED: " + filename);
            if (cmd.hasOption("status")) {
                quit(0);
            }

            HashSet<String> idsInBatch = new HashSet<String>();

            String idField = null;
            if (cmd.hasOption("z")) {
                idField = cmd.getOptionValue("z");
            }

            final FileReader fr = new FileReader(filename);
            final BufferedReader br = new BufferedReader(fr);

            String line;
            // one line is one document
            while ((line = br.readLine()) != null) {

                // "Latest-Flag" machine
                // This gets obsolete with a "flat" index
                if (cmd.hasOption("z")) {
                    // flag that indicates whether the document
                    // about to be indexed will be the latest
                    boolean willBeLatest = true;

                    // check if there is a previous (lower meta.timestamp) document with 
                    // the same identifier (whatever that may be - queried under "content")
                    final String contentIdentifier = getContentIdentifier(line, idField);
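                    // add() returns false and leaves the set unchanged for a duplicate id, so idsInBatch can end up smaller than the bulk size (checked below)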
                    idsInBatch.add(contentIdentifier);

                    // assumed in meta.timestamp
                    final Long timestamp = Long.parseLong(getTimestamp(line));

                    logger.debug("Checking whether record is latest (line: " + lineCount + ")");
                    logger.debug(contentIdentifier + ", " + timestamp);

                    // get all docs, which match the contentIdentifier
                    // by filter, which doesn't score
                    final TermFilterBuilder idFilter = new TermFilterBuilder("content." + idField,
                            contentIdentifier);
                    final TermFilterBuilder kindFilter = new TermFilterBuilder("meta.kind", docType);
                    final AndFilterBuilder afb = new AndFilterBuilder();
                    afb.add(idFilter).add(kindFilter);
                    final FilteredQueryBuilder fb = filteredQuery(matchAllQuery(), afb);

                    final SearchResponse searchResponse = client.prepareSearch(indexName)
                            .setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(fb).setFrom(0)
                            .setSize(1200) // 3 years and 105 days assuming daily updates at the most
                            .setExplain(false).execute().actionGet();

                    final SearchHits searchHits = searchResponse.getHits();

                    logger.debug("docs with this id in the index: " + searchHits.getTotalHits());

                    for (final SearchHit hit : searchHits.getHits()) {
                        final String docId = hit.id();
                        final Map<String, Object> source = hit.sourceAsMap();
                        final Map meta = (Map) source.get("meta");
                        final Long docTimestamp = Long.parseLong(meta.get("timestamp").toString());
                        // if the indexed doc timestamp is lower than (or equal to) the current one,
                        // remove any latest flag
                        if (timestamp >= docTimestamp) {
                            source.remove("latest");
                            final ObjectMapper mapper = new ObjectMapper();
                            // put the updated doc back
                            // IndexResponse response = 
                            client.prepareIndex(indexName, docType).setCreate(false).setId(docId)
                                    .setSource(mapper.writeValueAsBytes(source))
                                    .execute(new ActionListener<IndexResponse>() {
                                        public void onResponse(IndexResponse rspns) {
                                            logger.debug("Removed latest flag from " + contentIdentifier + ", "
                                                    + docTimestamp + ", " + hit.id() + " since (" + timestamp
                                                    + " > " + docTimestamp + ")");
                                        }

                                        public void onFailure(Throwable thrwbl) {
                                            logger.error("Could not remove flag from " + hit.id() + ", "
                                                    + contentIdentifier);
                                        }
                                    });
                            // .execute()
                            //.actionGet();
                        } else {
                            logger.debug("Doc " + hit.id() + " is newer (" + docTimestamp + ")");
                            willBeLatest = false;
                        }
                    }

                    if (willBeLatest) {
                        line = setLatestFlag(line);
                        logger.info("Setting latest flag on " + contentIdentifier + ", " + timestamp);
                    }

                    // end of latest-flag machine
                    // beware - this will be correct as long as there
                    // are no dups within one bulk!
                }

                bulkRequest.add(client.prepareIndex(indexName, docType).setSource(line));
                lineCount++;
                logger.debug("Added line " + lineCount + " to BULK");
                logger.debug(line);

                if (lineCount % bulkSize == 0) {

                    if (idsInBatch.size() != bulkSize && cmd.hasOption("z")) {
                        logger.error(
                                "This batch has duplications in the ID. That's not bad for the index, just makes the latest flag fuzzy");
                        logger.error(
                                "Bulk size was: " + bulkSize + ", but " + idsInBatch.size() + " IDs (only)");
                    }
                    idsInBatch.clear();

                    logger.debug("Issuing BULK request");

                    final long actionCount = bulkRequest.numberOfActions();
                    final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                    final long tookInMillis = bulkResponse.getTookInMillis();

                    if (bulkResponse.hasFailures()) {
                        logger.fatal("FAILED, bulk not indexed. exiting now.");
                        Iterator<BulkItemResponse> it = bulkResponse.iterator();
                        while (it.hasNext()) {
                            BulkItemResponse bir = it.next();
                            if (bir.isFailed()) {
                                Failure failure = bir.getFailure();
                                logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                        + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                            }
                        }
                        quit(1);
                    } else {
                        if (verbose) {
                            final double elapsed = System.currentTimeMillis() - start;
                            final double speed = (lineCount / elapsed * 1000);
                            logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount
                                    + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                        }
                    }
                    bulkRequest = client.prepareBulk();
                }
            }

            // handle the remaining items
            final long actionCount = bulkRequest.numberOfActions();
            if (actionCount > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                final long tookInMillis = bulkResponse.getTookInMillis();

                if (bulkResponse.hasFailures()) {
                    logger.fatal("FAILED, bulk not indexed. exiting now.");
                    Iterator<BulkItemResponse> it = bulkResponse.iterator();
                    while (it.hasNext()) {
                        BulkItemResponse bir = it.next();
                        if (bir.isFailed()) {
                            Failure failure = bir.getFailure();
                            logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage()
                                    + ", type: " + failure.getType() + ", index: " + failure.getIndex());
                        }
                    }
                    quit(1);
                } else {

                    // trigger update now
                    RefreshResponse refreshResponse = client.admin().indices()
                            .refresh(new RefreshRequest(indexName)).actionGet();

                    if (verbose) {
                        final double elapsed = System.currentTimeMillis() - start;
                        final double speed = (lineCount / elapsed * 1000);
                        logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/"
                                + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)");
                    }

                }

            }

            br.close();
            client.close();
            final double elapsed = (System.currentTimeMillis() - start) / 1000;
            final double speed = (lineCount / elapsed);
            logger.info("indexing (" + filename + ") " + lineCount + " docs took " + elapsed + "s (speed: "
                    + String.format("%.2f", speed) + "r/s)");
            if (cmd.hasOption("flat")) {
                try {
                    FileUtils.touch(stampfile);
                } catch (IOException ioe) {
                    logger.warn(".indexed file not created. Will reindex everything every time.");
                }
            }
        } catch (IOException e) {
            client.close();
            logger.error(e);
            quit(1);
        } finally {
            client.close();
        }
    }
    quit(0);
}

From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java

/**
 * @param args
 * @throws ParseException 
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option scoreOption = new Option("sc", "score", true, "set threshold for relationship score");
    scoreOption.setRequired(false);
    scoreOption.setArgName("SCORE THRESHOLD");
    options.addOption(scoreOption);

    Option strengthOption = new Option("st", "strength", true, "set threshold for relationship strength");
    strengthOption.setRequired(false);
    strengthOption.setArgName("STRENGTH THRESHOLD");
    options.addOption(strengthOption);

    Option completeRandomizationOption = new Option("c", "complete-randomization", false,
            "use complete randomization when performing significance tests");
    completeRandomizationOption.setRequired(false);
    options.addOption(completeRandomizationOption);

    Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes");
    idOption.setRequired(false);
    options.addOption(idOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    Option removeOption = new Option("r", "remove-not-significant", false,
            "remove relationships that are not " + "significant from the final output");
    removeOption.setRequired(false);
    options.addOption(removeOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeNotSignificant = cmd.hasOption("r");
    boolean removeExistingFiles = cmd.hasOption("f");
    boolean completeRandomization = cmd.hasOption("c");
    boolean hasScoreThreshold = cmd.hasOption("sc");
    boolean hasStrengthThreshold = cmd.hasOption("st");
    boolean outputIds = cmd.hasOption("id");
    String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : "";
    String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : "";

    // all datasets
    ArrayList<String> all_datasets = new ArrayList<String>();
    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        all_datasets.add(line.split("\t")[0]);
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();
    String[] all_datasets_array = new String[all_datasets.size()];
    all_datasets.toArray(all_datasets_array);

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : all_datasets_array;
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("No indices from datasets in G1.");
        System.exit(0);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("No indices from datasets in G2.");
        System.exit(0);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        all_datasets.add(dt[0]);
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    String relationshipsDir = "";
    if (outputIds) {
        relationshipsDir = FrameworkUtils.relationshipsIdsDir;
    } else {
        relationshipsDir = FrameworkUtils.relationshipsDir;
    }

    FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3);

    String random = completeRandomization ? "complete" : "restricted";

    String indexInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2
                    + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2);
            } else {
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        indexInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "relationship" + "-" + random;
    String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/";

    FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("complete-random", String.valueOf(completeRandomization));
    conf.set("output-ids", String.valueOf(outputIds));
    conf.set("complete-random-str", random);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    conf.set("remove-not-significant", String.valueOf(removeNotSignificant));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }
    if (hasScoreThreshold) {
        conf.set("score-threshold", scoreThreshold);
    }
    if (hasStrengthThreshold) {
        conf.set("strength-threshold", strengthThreshold);
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(TopologyTimeSeriesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationMapper.class);
    job.setReducerClass(CorrelationReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir));

    job.setJarByClass(Relationship.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/";
            String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}
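
The HashSet<String> dirs in the example above is where java.util.HashSet.add comes into play: every dataset pair adds its two index directories, and the set silently drops a directory that was already added by another pair, so indexInputDirs never lists the same path twice. Below is a minimal, self-contained sketch of that deduplication pattern; it is not part of the original source, and the dataset names are invented for illustration.

import java.util.HashSet;

public class UniqueIndexDirsSketch {
    public static void main(String[] args) {
        HashSet<String> dirs = new HashSet<String>();

        // Each pair contributes two index directories; add() ignores repeats.
        String[][] pairs = { { "taxi", "weather" }, { "taxi", "crime" }, { "weather", "crime" } };
        for (String[] pair : pairs) {
            dirs.add("index/" + pair[0]);
            dirs.add("index/" + pair[1]);
        }

        // Join the unique directories into the comma-separated list that
        // FileInputFormat.setInputPaths expects in the example above.
        StringBuilder indexInputDirs = new StringBuilder();
        for (String dir : dirs) {
            if (indexInputDirs.length() > 0) {
                indexInputDirs.append(',');
            }
            indexInputDirs.append(dir);
        }
        // HashSet order is not defined, e.g. index/crime,index/taxi,index/weather
        System.out.println(indexInputDirs);
    }
}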

From source file:edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the aggregate functions " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions"
            + " will be computed, followed by their temporal and spatial attribute indices");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";
    String preProcessingDatasets = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetAggregation = new ArrayList<String>();
    HashMap<String, String> datasetTempAtt = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>();
    HashMap<String, String> preProcessingDataset = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");
    String[] datasetArgs = cmd.getOptionValues("g");

    for (int i = 0; i < datasetArgs.length; i += 3) {
        String dataset = datasetArgs[i];

        // getting pre-processing
        String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3);
        if (tempPreProcessing == null) {
            System.out.println("No pre-processing available for " + dataset);
            continue;
        }
        preProcessingDataset.put(dataset, tempPreProcessing);

        shortDataset.add(dataset);
        datasetTempAtt.put(dataset, ("null".equals(datasetArgs[i + 1]) ? null : datasetArgs[i + 1]));
        datasetSpatialAtt.put(dataset, ("null".equals(datasetArgs[i + 2]) ? null : datasetArgs[i + 2]));

        datasetId.put(dataset, null);
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3);

    // getting smallest resolution

    HashMap<String, String> tempResMap = new HashMap<String, String>();
    HashMap<String, String> spatialResMap = new HashMap<String, String>();

    HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>();

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String[] datasetArray = preProcessingDataset.get(dataset).split("-");

        String datasetTemporalStr = datasetArray[datasetArray.length - 2];
        int datasetTemporal = utils.temporalResolution(datasetTemporalStr);

        String datasetSpatialStr = datasetArray[datasetArray.length - 1];
        int datasetSpatial = utils.spatialResolution(datasetSpatialStr);

        // finding all possible resolutions

        String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal);
        String[] spatialResolutions = FrameworkUtils.getAggSpatialResolutions(datasetSpatial);

        String temporalResolution = "";
        String spatialResolution = "";

        String tempRes = "";
        String spatialRes = "";

        boolean dataAdded = false;

        for (int i = 0; i < temporalResolutions.length; i++) {
            for (int j = 0; j < spatialResolutions.length; j++) {

                temporalResolution = temporalResolutions[i];
                spatialResolution = spatialResolutions[j];

                String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";

                if (removeExistingFiles) {
                    FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3);
                }

                if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) {

                    dataAdded = true;

                    tempRes += temporalResolution + "-";
                    spatialRes += spatialResolution + "-";
                }
            }
        }

        if (dataAdded) {
            input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
            shortDatasetAggregation.add(dataset);

            tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1));
            spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1));

            datasetTemporalStrMap.put(dataset, datasetTemporalStr);
            datasetSpatialStrMap.put(dataset, datasetSpatialStr);
        }
    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have aggregates.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    it = input.iterator();
    while (it.hasNext()) {
        preProcessingDatasets += it.next() + ",";
    }

    Job aggJob = null;
    String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/";
    String jobName = "aggregates";

    FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3);

    Configuration aggConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    aggConf.set("dataset-name", datasetNames);
    aggConf.set("dataset-id", datasetIds);

    for (int i = 0; i < shortDatasetAggregation.size(); i++) {
        String dataset = shortDatasetAggregation.get(i);
        String id = datasetId.get(dataset);
        aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset));
        aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset));
        aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset));

        if (s3)
            aggConf.set("dataset-" + id,
                    s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
        else
            aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/"
                    + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
    }

    aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    aggConf.set("mapreduce.task.io.sort.mb", "200");
    aggConf.set("mapreduce.task.io.sort.factor", "100");
    machineConf.setMachineConfiguration(aggConf);

    if (s3) {
        machineConf.setMachineConfiguration(aggConf);
        aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    if (snappyCompression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    aggJob = new Job(aggConf);
    aggJob.setJobName(jobName);

    aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setMapOutputValueClass(AggregationArrayWritable.class);
    aggJob.setOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setOutputValueClass(FloatArrayWritable.class);
    //aggJob.setOutputKeyClass(Text.class);
    //aggJob.setOutputValueClass(Text.class);

    aggJob.setMapperClass(AggregationMapper.class);
    aggJob.setCombinerClass(AggregationCombiner.class);
    aggJob.setReducerClass(AggregationReducer.class);
    aggJob.setNumReduceTasks(machineConf.getNumberReduces());

    aggJob.setInputFormatClass(SequenceFileInputFormat.class);
    //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(aggJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(aggJob, true);
    FileInputFormat.setInputPaths(aggJob,
            preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1));
    FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir));

    aggJob.setJarByClass(Aggregation.class);

    long start = System.currentTimeMillis();
    aggJob.submit();
    aggJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetAggregation) {
        String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}
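
In the aggregation example above, HashSet.add plays the same role for the input set: each dataset that still needs aggregates contributes its pre-processing directory, and duplicates are discarded before the paths are joined into preProcessingDatasets. The sketch below also shows the boolean that add returns; it is not part of the original source, and the paths are invented for illustration.

import java.util.HashSet;

public class UniquePreProcessingInputsSketch {
    public static void main(String[] args) {
        HashSet<String> input = new HashSet<String>();

        // add() returns true only the first time a path is inserted.
        String[] preProcessed = { "pre/taxi-hour-zip", "pre/weather-hour-zip", "pre/taxi-hour-zip" };
        for (String path : preProcessed) {
            if (!input.add(path)) {
                System.out.println("Skipping duplicate input: " + path);
            }
        }
        System.out.println("Unique inputs: " + input);
    }
}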

From source file:eqtlmappingpipeline.util.ModuleEqtlGeuvadisReplication.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, LdCalculatorException {

    System.out.println(HEADER);
    System.out.println();
    System.out.flush(); //flush to make sure header is before errors
    try {
        Thread.sleep(25); //Allows flush to complete
    } catch (InterruptedException ex) {
    }

    CommandLineParser parser = new PosixParser();
    final CommandLine commandLine;
    try {
        commandLine = parser.parse(OPTIONS, args, true);
    } catch (ParseException ex) {
        System.err.println("Invalid command line arguments: " + ex.getMessage());
        System.err.println();
        new HelpFormatter().printHelp(" ", OPTIONS);
        System.exit(1);
        return;
    }

    final String[] genotypesBasePaths = commandLine.getOptionValues("g");
    final RandomAccessGenotypeDataReaderFormats genotypeDataType;
    final String replicationQtlFilePath = commandLine.getOptionValue("e");
    final String interactionQtlFilePath = commandLine.getOptionValue("i");
    final String outputFilePath = commandLine.getOptionValue("o");
    final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld"));
    final int window = Integer.parseInt(commandLine.getOptionValue("w"));

    System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths));
    System.out.println("Interaction file: " + interactionQtlFilePath);
    System.out.println("Replication file: " + replicationQtlFilePath);
    System.out.println("Output: " + outputFilePath);
    System.out.println("LD: " + ldCutoff);
    System.out.println("Window: " + window);

    try {
        if (commandLine.hasOption("G")) {
            genotypeDataType = RandomAccessGenotypeDataReaderFormats
                    .valueOf(commandLine.getOptionValue("G").toUpperCase());
        } else {
            if (genotypesBasePaths[0].endsWith(".vcf")) {
                System.err.println(
                        "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
                System.exit(1);
                return;
            }
            try {
                genotypeDataType = RandomAccessGenotypeDataReaderFormats
                        .matchFormatToPath(genotypesBasePaths[0]);
            } catch (GenotypeDataException e) {
                System.err
                        .println("Unable to determine input 1 type based on specified path. Please specify -G");
                System.exit(1);
                return;
            }
        }
    } catch (IllegalArgumentException e) {
        System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G")
                + "\" is not a valid input data format");
        System.exit(1);
        return;
    }

    final RandomAccessGenotypeData genotypeData;

    try {
        genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null,
                0.8);
    } catch (TabixFileNotFoundException e) {
        LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n"
                + "Please see README on how to create a tabix file");
        System.exit(1);
        return;
    } catch (IOException e) {
        LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
        System.exit(1);
        return;
    } catch (IncompatibleMultiPartGenotypeDataException e) {
        LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e);
        System.exit(1);
        return;
    } catch (GenotypeDataException e) {
        LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
        System.exit(1);
        return;
    }

    ChrPosTreeMap<ArrayList<EQTL>> replicationQtls = new QTLTextFile(replicationQtlFilePath, false)
            .readQtlsAsTreeMap();

    int interactionSnpNotInGenotypeData = 0;
    int noReplicationQtlsInWindow = 0;
    int noReplicationQtlsInLd = 0;
    int multipleReplicationQtlsInLd = 0;
    int replicationTopSnpNotInGenotypeData = 0;

    final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0');
    final String[] outputLine = new String[14];
    int c = 0;
    outputLine[c++] = "Chr";
    outputLine[c++] = "Pos";
    outputLine[c++] = "SNP";
    outputLine[c++] = "Gene";
    outputLine[c++] = "Module";
    outputLine[c++] = "DiscoveryZ";
    outputLine[c++] = "ReplicationZ";
    outputLine[c++] = "DiscoveryZCorrected";
    outputLine[c++] = "ReplicationZCorrected";
    outputLine[c++] = "DiscoveryAlleleAssessed";
    outputLine[c++] = "ReplicationAlleleAssessed";
    outputLine[c++] = "bestLd";
    outputLine[c++] = "bestLd_dist";
    outputLine[c++] = "nextLd";
    outputWriter.writeNext(outputLine);

    HashSet<String> notFound = new HashSet<>();

    CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t');
    interactionQtlReader.readNext();//skip header
    String[] interactionQtlLine;
    while ((interactionQtlLine = interactionQtlReader.readNext()) != null) {

        String snp = interactionQtlLine[1];
        String chr = interactionQtlLine[2];
        int pos = Integer.parseInt(interactionQtlLine[3]);
        String gene = interactionQtlLine[4];
        String alleleAssessed = interactionQtlLine[9];
        String module = interactionQtlLine[12];
        double discoveryZ = Double.parseDouble(interactionQtlLine[10]);

        GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos);

        if (interactionQtlVariant == null) {
            System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos);
            ++interactionSnpNotInGenotypeData;
            continue;
        }

        EQTL bestMatch = null;
        double bestMatchR2 = Double.NaN;
        Ld bestMatchLd = null;
        double nextBestR2 = Double.NaN;

        ArrayList<EQTL> sameSnpQtls = replicationQtls.get(chr, pos);

        if (sameSnpQtls != null) {
            for (EQTL sameSnpQtl : sameSnpQtls) {
                if (sameSnpQtl.getProbe().equals(gene)) {
                    bestMatch = sameSnpQtl;
                    bestMatchR2 = 1;
                }
            }
        }

        NavigableMap<Integer, ArrayList<EQTL>> potentionalReplicationQtls = replicationQtls.getChrRange(chr,
                pos - window, true, pos + window, true);

        for (ArrayList<EQTL> potentialReplicationQtls : potentionalReplicationQtls.values()) {

            for (EQTL potentialReplicationQtl : potentialReplicationQtls) {

                if (!potentialReplicationQtl.getProbe().equals(gene)) {
                    continue;
                }

                GeneticVariant potentialReplicationQtlVariant = genotypeData.getSnpVariantByPos(
                        potentialReplicationQtl.getRsChr().toString(), potentialReplicationQtl.getRsChrPos());

                if (potentialReplicationQtlVariant == null) {
                    notFound.add(potentialReplicationQtl.getRsChr().toString() + ":"
                            + potentialReplicationQtl.getRsChrPos());
                    ++replicationTopSnpNotInGenotypeData;
                    continue;
                }

                Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant);
                double r2 = ld.getR2();

                if (r2 > 1) {
                    r2 = 1;
                }

                if (bestMatch == null) {
                    bestMatch = potentialReplicationQtl;
                    bestMatchR2 = r2;
                    bestMatchLd = ld;
                } else if (r2 > bestMatchR2) {
                    bestMatch = potentialReplicationQtl;
                    nextBestR2 = bestMatchR2;
                    bestMatchR2 = r2;
                    bestMatchLd = ld;
                }

            }
        }

        double replicationZ = Double.NaN;
        double replicationZCorrected = Double.NaN;
        double discoveryZCorrected = Double.NaN;

        String replicationAlleleAssessed = null;

        if (bestMatch != null) {
            replicationZ = bestMatch.getZscore();
            replicationAlleleAssessed = bestMatch.getAlleleAssessed();

            if (pos != bestMatch.getRsChrPos()) {

                String commonHap = null;
                double commonHapFreq = -1;
                for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) {

                    double f = hapFreq.getValue();

                    if (f > commonHapFreq) {
                        commonHapFreq = f;
                        commonHap = hapFreq.getKey();
                    }

                }

                String[] commonHapAlleles = StringUtils.split(commonHap, '/');

                discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1;
                replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ
                        : replicationZ * -1;

            } else {

                discoveryZCorrected = discoveryZ;
                replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ
                        : replicationZ * -1;

            }

        }

        c = 0;
        outputLine[c++] = chr;
        outputLine[c++] = String.valueOf(pos);
        outputLine[c++] = snp;
        outputLine[c++] = gene;
        outputLine[c++] = module;
        outputLine[c++] = String.valueOf(discoveryZ);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected);
        outputLine[c++] = alleleAssessed;
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAlleleAssessed());
        outputLine[c++] = String.valueOf(bestMatchR2);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getRsChrPos()));
        outputLine[c++] = String.valueOf(nextBestR2);
        outputWriter.writeNext(outputLine);

    }

    outputWriter.close();

    for (String e : notFound) {
        System.err.println("Not found: " + e);
    }

    System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData);
    System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow);
    System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd);
    System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd);
    System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData);

}
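
Here HashSet.add backs the notFound set: the same replication SNP position can fail the genotype lookup many times while the window around each interaction QTL is scanned, but the set keeps a single chr:pos string, so each missing variant is printed exactly once after the loop. A minimal sketch of that reporting pattern follows; it is not from the original source and the positions are made up.

import java.util.HashSet;

public class ReportMissingOnceSketch {
    public static void main(String[] args) {
        HashSet<String> notFound = new HashSet<>();

        // Repeated failures on the same position collapse to one entry.
        String[] failedLookups = { "1:12345", "2:99887", "1:12345" };
        for (String chrPos : failedLookups) {
            notFound.add(chrPos);
        }
        for (String e : notFound) {
            System.err.println("Not found: " + e);
        }
    }
}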

From source file:eqtlmappingpipeline.util.ModuleEqtlNeutrophilReplication.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, LdCalculatorException {

    System.out.println(HEADER);
    System.out.println();
    System.out.flush(); //flush to make sure header is before errors
    try {
        Thread.sleep(25); //Allows flush to complete
    } catch (InterruptedException ex) {
    }

    CommandLineParser parser = new PosixParser();
    final CommandLine commandLine;
    try {
        commandLine = parser.parse(OPTIONS, args, true);
    } catch (ParseException ex) {
        System.err.println("Invalid command line arguments: " + ex.getMessage());
        System.err.println();
        new HelpFormatter().printHelp(" ", OPTIONS);
        System.exit(1);
        return;
    }

    final String[] genotypesBasePaths = commandLine.getOptionValues("g");
    final RandomAccessGenotypeDataReaderFormats genotypeDataType;
    final String replicationQtlFilePath = commandLine.getOptionValue("e");
    final String interactionQtlFilePath = commandLine.getOptionValue("i");
    final String outputFilePath = commandLine.getOptionValue("o");
    final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld"));
    final int window = Integer.parseInt(commandLine.getOptionValue("w"));

    System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths));
    System.out.println("Interaction file: " + interactionQtlFilePath);
    System.out.println("Replication file: " + replicationQtlFilePath);
    System.out.println("Output: " + outputFilePath);
    System.out.println("LD: " + ldCutoff);
    System.out.println("Window: " + window);

    try {
        if (commandLine.hasOption("G")) {
            genotypeDataType = RandomAccessGenotypeDataReaderFormats
                    .valueOf(commandLine.getOptionValue("G").toUpperCase());
        } else {
            if (genotypesBasePaths[0].endsWith(".vcf")) {
                System.err.println(
                        "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
                System.exit(1);
                return;
            }
            try {
                genotypeDataType = RandomAccessGenotypeDataReaderFormats
                        .matchFormatToPath(genotypesBasePaths[0]);
            } catch (GenotypeDataException e) {
                System.err
                        .println("Unable to determine input 1 type based on specified path. Please specify -G");
                System.exit(1);
                return;
            }
        }
    } catch (IllegalArgumentException e) {
        System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G")
                + "\" is not a valid input data format");
        System.exit(1);
        return;
    }

    final RandomAccessGenotypeData genotypeData;

    try {
        genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null,
                0.8);
    } catch (TabixFileNotFoundException e) {
        LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n"
                + "Please see README on how to create a tabix file");
        System.exit(1);
        return;
    } catch (IOException e) {
        LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
        System.exit(1);
        return;
    } catch (IncompatibleMultiPartGenotypeDataException e) {
        LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e);
        System.exit(1);
        return;
    } catch (GenotypeDataException e) {
        LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
        System.exit(1);
        return;
    }

    ChrPosTreeMap<ArrayList<ReplicationQtl>> replicationQtls = new ChrPosTreeMap<>();

    CSVReader replicationQtlReader = new CSVReader(new FileReader(replicationQtlFilePath), '\t');
    replicationQtlReader.readNext();//skip header
    String[] replicationLine;
    while ((replicationLine = replicationQtlReader.readNext()) != null) {

        try {

            GeneticVariant variant = genotypeData.getSnpVariantByPos(replicationLine[REPLICATION_SNP_CHR_COL],
                    Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]));
            if (variant == null) {
                continue;
            }

            ReplicationQtl replicationQtl = new ReplicationQtl(replicationLine[REPLICATION_SNP_CHR_COL],
                    Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]),
                    replicationLine[REPLICATION_GENE_COL],
                    Double.parseDouble(replicationLine[REPLICATION_BETA_COL]),
                    variant.getAlternativeAlleles().get(0).getAlleleAsString());
            ArrayList<ReplicationQtl> posReplicationQtls = replicationQtls.get(replicationQtl.getChr(),
                    replicationQtl.getPos());
            if (posReplicationQtls == null) {
                posReplicationQtls = new ArrayList<>();
                replicationQtls.put(replicationQtl.getChr(), replicationQtl.getPos(), posReplicationQtls);
            }
            posReplicationQtls.add(replicationQtl);

        } catch (Exception e) {
            System.out.println(Arrays.toString(replicationLine));
            throw e;
        }
    }

    int interactionSnpNotInGenotypeData = 0;
    int noReplicationQtlsInWindow = 0;
    int noReplicationQtlsInLd = 0;
    int multipleReplicationQtlsInLd = 0;
    int replicationTopSnpNotInGenotypeData = 0;

    final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0');
    final String[] outputLine = new String[14];
    int c = 0;
    outputLine[c++] = "Chr";
    outputLine[c++] = "Pos";
    outputLine[c++] = "SNP";
    outputLine[c++] = "Gene";
    outputLine[c++] = "Module";
    outputLine[c++] = "DiscoveryZ";
    outputLine[c++] = "ReplicationZ";
    outputLine[c++] = "DiscoveryZCorrected";
    outputLine[c++] = "ReplicationZCorrected";
    outputLine[c++] = "DiscoveryAlleleAssessed";
    outputLine[c++] = "ReplicationAlleleAssessed";
    outputLine[c++] = "bestLd";
    outputLine[c++] = "bestLd_dist";
    outputLine[c++] = "nextLd";
    outputWriter.writeNext(outputLine);

    HashSet<String> notFound = new HashSet<>();

    CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t');
    interactionQtlReader.readNext();//skip header
    String[] interactionQtlLine;
    while ((interactionQtlLine = interactionQtlReader.readNext()) != null) {

        String snp = interactionQtlLine[1];
        String chr = interactionQtlLine[2];
        int pos = Integer.parseInt(interactionQtlLine[3]);
        String gene = interactionQtlLine[4];
        String alleleAssessed = interactionQtlLine[9];
        String module = interactionQtlLine[12];
        double discoveryZ = Double.parseDouble(interactionQtlLine[10]);

        GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos);

        if (interactionQtlVariant == null) {
            System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos);
            ++interactionSnpNotInGenotypeData;
            continue;
        }

        ReplicationQtl bestMatch = null;
        double bestMatchR2 = Double.NaN;
        Ld bestMatchLd = null;
        double nextBestR2 = Double.NaN;

        ArrayList<ReplicationQtl> sameSnpQtls = replicationQtls.get(chr, pos);

        if (sameSnpQtls != null) {
            for (ReplicationQtl sameSnpQtl : sameSnpQtls) {
                if (sameSnpQtl.getGene().equals(gene)) {
                    bestMatch = sameSnpQtl;
                    bestMatchR2 = 1;
                }
            }
        }

        NavigableMap<Integer, ArrayList<ReplicationQtl>> potentionalReplicationQtls = replicationQtls
                .getChrRange(chr, pos - window, true, pos + window, true);

        for (ArrayList<ReplicationQtl> potentialReplicationQtls : potentionalReplicationQtls.values()) {

            for (ReplicationQtl potentialReplicationQtl : potentialReplicationQtls) {

                if (!potentialReplicationQtl.getGene().equals(gene)) {
                    continue;
                }

                GeneticVariant potentialReplicationQtlVariant = genotypeData
                        .getSnpVariantByPos(potentialReplicationQtl.getChr(), potentialReplicationQtl.getPos());

                if (potentialReplicationQtlVariant == null) {
                    notFound.add(potentialReplicationQtl.getChr() + ":" + potentialReplicationQtl.getPos());
                    ++replicationTopSnpNotInGenotypeData;
                    continue;
                }

                Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant);
                double r2 = ld.getR2();

                if (r2 > 1) {
                    r2 = 1;
                }

                if (bestMatch == null) {
                    bestMatch = potentialReplicationQtl;
                    bestMatchR2 = r2;
                    bestMatchLd = ld;
                } else if (r2 > bestMatchR2) {
                    bestMatch = potentialReplicationQtl;
                    nextBestR2 = bestMatchR2;
                    bestMatchR2 = r2;
                    bestMatchLd = ld;
                }

            }
        }

        double replicationZ = Double.NaN;
        double replicationZCorrected = Double.NaN;
        double discoveryZCorrected = Double.NaN;

        String replicationAlleleAssessed = null;

        if (bestMatch != null) {
            replicationZ = bestMatch.getBeta();
            replicationAlleleAssessed = bestMatch.getAssessedAllele();

            if (pos != bestMatch.getPos()) {

                String commonHap = null;
                double commonHapFreq = -1;
                for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) {

                    double f = hapFreq.getValue();

                    if (f > commonHapFreq) {
                        commonHapFreq = f;
                        commonHap = hapFreq.getKey();
                    }

                }

                String[] commonHapAlleles = StringUtils.split(commonHap, '/');

                discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1;
                replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ
                        : replicationZ * -1;

            } else {

                discoveryZCorrected = discoveryZ;
                replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ
                        : replicationZ * -1;

            }

        }

        c = 0;
        outputLine[c++] = chr;
        outputLine[c++] = String.valueOf(pos);
        outputLine[c++] = snp;
        outputLine[c++] = gene;
        outputLine[c++] = module;
        outputLine[c++] = String.valueOf(discoveryZ);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected);
        outputLine[c++] = alleleAssessed;
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAssessedAllele());
        outputLine[c++] = String.valueOf(bestMatchR2);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getPos()));
        outputLine[c++] = String.valueOf(nextBestR2);
        outputWriter.writeNext(outputLine);

    }

    outputWriter.close();

    for (String e : notFound) {
        System.err.println("Not found: " + e);
    }

    System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData);
    System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow);
    System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd);
    System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd);
    System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData);

}
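
This example uses notFound the same way as the previous one. An alternative, sketched below, is to test the return value of HashSet.add and log a missing position the first time it is seen, instead of collecting everything and printing at the end; this is only an illustrative variation, not code from the original source.

import java.util.HashSet;

public class LogOnFirstMissSketch {
    public static void main(String[] args) {
        HashSet<String> seen = new HashSet<>();

        String[] failedLookups = { "3:555", "3:555", "7:42" };
        for (String chrPos : failedLookups) {
            // add() returns false for a position that was already recorded.
            if (seen.add(chrPos)) {
                System.err.println("Not found: " + chrPos);
            }
        }
    }
}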

From source file:eqtlmappingpipeline.util.ModuleEqtWestraReplication.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, LdCalculatorException {

    System.out.println(HEADER);
    System.out.println();
    System.out.flush(); //flush to make sure header is before errors
    try {
        Thread.sleep(25); //Allows flush to complete
    } catch (InterruptedException ex) {
    }

    CommandLineParser parser = new PosixParser();
    final CommandLine commandLine;
    try {
        commandLine = parser.parse(OPTIONS, args, true);
    } catch (ParseException ex) {
        System.err.println("Invalid command line arguments: " + ex.getMessage());
        System.err.println();
        new HelpFormatter().printHelp(" ", OPTIONS);
        System.exit(1);
        return;
    }

    final String[] genotypesBasePaths = commandLine.getOptionValues("g");
    final RandomAccessGenotypeDataReaderFormats genotypeDataType;
    final String replicationQtlFilePath = commandLine.getOptionValue("e");
    final String interactionQtlFilePath = commandLine.getOptionValue("i");
    final String outputFilePath = commandLine.getOptionValue("o");
    final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld"));
    final int window = Integer.parseInt(commandLine.getOptionValue("w"));

    System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths));
    System.out.println("Interaction file: " + interactionQtlFilePath);
    System.out.println("Replication file: " + replicationQtlFilePath);
    System.out.println("Output: " + outputFilePath);
    System.out.println("LD: " + ldCutoff);
    System.out.println("Window: " + window);

    try {
        if (commandLine.hasOption("G")) {
            genotypeDataType = RandomAccessGenotypeDataReaderFormats
                    .valueOf(commandLine.getOptionValue("G").toUpperCase());
        } else {
            if (genotypesBasePaths[0].endsWith(".vcf")) {
                System.err.println(
                        "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
                System.exit(1);
                return;
            }
            try {
                genotypeDataType = RandomAccessGenotypeDataReaderFormats
                        .matchFormatToPath(genotypesBasePaths[0]);
            } catch (GenotypeDataException e) {
                System.err
                        .println("Unable to determine input 1 type based on specified path. Please specify -G");
                System.exit(1);
                return;
            }
        }
    } catch (IllegalArgumentException e) {
        System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G")
                + "\" is not a valid input data format");
        System.exit(1);
        return;
    }

    final RandomAccessGenotypeData genotypeData;

    try {
        genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null,
                0.8);
    } catch (TabixFileNotFoundException e) {
        LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n"
                + "Please see README on how to create a tabix file");
        System.exit(1);
        return;
    } catch (IOException e) {
        LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
        System.exit(1);
        return;
    } catch (IncompatibleMultiPartGenotypeDataException e) {
        LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e);
        System.exit(1);
        return;
    } catch (GenotypeDataException e) {
        LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
        System.exit(1);
        return;
    }

    ChrPosTreeMap<ArrayList<ReplicationQtl>> replicationQtls = new ChrPosTreeMap<>();

    CSVReader replicationQtlReader = new CSVReader(new FileReader(replicationQtlFilePath), '\t');
    String[] replicationHeader = replicationQtlReader.readNext();
    String[] replicationLine;
    while ((replicationLine = replicationQtlReader.readNext()) != null) {

        try {

            GeneticVariant variant = genotypeData.getSnpVariantByPos(replicationLine[REPLICATION_SNP_CHR_COL],
                    Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]));
            if (variant == null) {
                continue;
            }

            Alleles variantAlleles = variant.getVariantAlleles();
            String[] replicationAllelesString = StringUtils.split(replicationLine[REPLICATION_ALLELES_COL],
                    '/');

            Alleles replicationAlleles = Alleles.createBasedOnString(replicationAllelesString[0],
                    replicationAllelesString[1]);
            Allele assessedAlleleReplication = Allele.create(replicationLine[REPLICATION_ALLELE_ASSESSED_COL]);

            boolean isAmbigous = replicationAlleles.isAtOrGcSnp();

            if (!variantAlleles.equals(replicationAlleles)) {
                if (variantAlleles.equals(replicationAlleles.getComplement())) {
                    assessedAlleleReplication = assessedAlleleReplication.getComplement();
                } else {
                    continue;
                }
            }

            ReplicationQtl replicationQtl = new ReplicationQtl(replicationLine[REPLICATION_SNP_CHR_COL],
                    Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]),
                    replicationLine[REPLICATION_GENE_COL],
                    Double.parseDouble(replicationLine[REPLICATION_BETA_COL]),
                    assessedAlleleReplication.getAlleleAsString(), replicationLine, isAmbigous);
            ArrayList<ReplicationQtl> posReplicationQtls = replicationQtls.get(replicationQtl.getChr(),
                    replicationQtl.getPos());
            if (posReplicationQtls == null) {
                posReplicationQtls = new ArrayList<>();
                replicationQtls.put(replicationQtl.getChr(), replicationQtl.getPos(), posReplicationQtls);
            }
            posReplicationQtls.add(replicationQtl);

        } catch (Exception e) {
            System.out.println(Arrays.toString(replicationLine));
            throw e;
        }
    }

    int interactionSnpNotInGenotypeData = 0;
    int noReplicationQtlsInWindow = 0;
    int noReplicationQtlsInLd = 0;
    int multipleReplicationQtlsInLd = 0;
    int replicationTopSnpNotInGenotypeData = 0;

    final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0');
    final String[] outputLine = new String[15 + EXTRA_COL_FROM_REPLICATION.length];
    int c = 0;
    outputLine[c++] = "Chr";
    outputLine[c++] = "Pos";
    outputLine[c++] = "SNP";
    outputLine[c++] = "Gene";
    outputLine[c++] = "Module";
    outputLine[c++] = "DiscoveryZ";
    outputLine[c++] = "ReplicationZ";
    outputLine[c++] = "DiscoveryZCorrected";
    outputLine[c++] = "ReplicationZCorrected";
    outputLine[c++] = "DiscoveryAlleleAssessed";
    outputLine[c++] = "ReplicationAlleleAssessed";
    outputLine[c++] = "bestLd";
    outputLine[c++] = "bestLd_dist";
    outputLine[c++] = "nextLd";
    outputLine[c++] = "replicationAmbigous";
    for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) {
        outputLine[c++] = replicationHeader[EXTRA_COL_FROM_REPLICATION[i]];
    }
    outputWriter.writeNext(outputLine);

    HashSet<String> notFound = new HashSet<>();

    CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t');
    interactionQtlReader.readNext();//skip header
    String[] interactionQtlLine;
    while ((interactionQtlLine = interactionQtlReader.readNext()) != null) {

        String snp = interactionQtlLine[1];
        String chr = interactionQtlLine[2];
        int pos = Integer.parseInt(interactionQtlLine[3]);
        String gene = interactionQtlLine[4];
        String alleleAssessed = interactionQtlLine[9];
        String module = interactionQtlLine[12];
        double discoveryZ = Double.parseDouble(interactionQtlLine[10]);

        GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos);

        if (interactionQtlVariant == null) {
            System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos);
            ++interactionSnpNotInGenotypeData;
            continue;
        }

        ReplicationQtl bestMatch = null;
        double bestMatchR2 = Double.NaN;
        Ld bestMatchLd = null;
        double nextBestR2 = Double.NaN;

        ArrayList<ReplicationQtl> sameSnpQtls = replicationQtls.get(chr, pos);

        if (sameSnpQtls != null) {
            for (ReplicationQtl sameSnpQtl : sameSnpQtls) {
                if (sameSnpQtl.getGene().equals(gene)) {
                    bestMatch = sameSnpQtl;
                    bestMatchR2 = 1;
                }
            }
        }

        NavigableMap<Integer, ArrayList<ReplicationQtl>> potentionalReplicationQtls = replicationQtls
                .getChrRange(chr, pos - window, true, pos + window, true);

        for (ArrayList<ReplicationQtl> potentialReplicationQtls : potentionalReplicationQtls.values()) {

            for (ReplicationQtl potentialReplicationQtl : potentialReplicationQtls) {

                if (!potentialReplicationQtl.getGene().equals(gene)) {
                    continue;
                }

                GeneticVariant potentialReplicationQtlVariant = genotypeData
                        .getSnpVariantByPos(potentialReplicationQtl.getChr(), potentialReplicationQtl.getPos());

                if (potentialReplicationQtlVariant == null) {
                    notFound.add(potentialReplicationQtl.getChr() + ":" + potentialReplicationQtl.getPos());
                    ++replicationTopSnpNotInGenotypeData;
                    continue;
                }

                Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant);
                double r2 = ld.getR2();

                if (r2 > 1) {
                    r2 = 1;
                }

                if (bestMatch == null) {
                    bestMatch = potentialReplicationQtl;
                    bestMatchR2 = r2;
                    bestMatchLd = ld;
                } else if (r2 > bestMatchR2) {
                    bestMatch = potentialReplicationQtl;
                    nextBestR2 = bestMatchR2;
                    bestMatchR2 = r2;
                    bestMatchLd = ld;
                }

            }
        }

        double replicationZ = Double.NaN;
        double replicationZCorrected = Double.NaN;
        double discoveryZCorrected = Double.NaN;

        String replicationAlleleAssessed = null;

        if (bestMatch != null) {
            replicationZ = bestMatch.getBeta();
            replicationAlleleAssessed = bestMatch.getAssessedAllele();

            if (pos != bestMatch.getPos()) {

                String commonHap = null;
                double commonHapFreq = -1;
                for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) {

                    double f = hapFreq.getValue();

                    if (f > commonHapFreq) {
                        commonHapFreq = f;
                        commonHap = hapFreq.getKey();
                    }

                }

                String[] commonHapAlleles = StringUtils.split(commonHap, '/');

                discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1;
                replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ
                        : replicationZ * -1;

            } else {

                discoveryZCorrected = discoveryZ;
                replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ
                        : replicationZ * -1;
                //replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) || alleleAssessed.equals(String.valueOf(Utils.getComplementNucleotide(replicationAlleleAssessed.charAt(0)))) ? replicationZ : replicationZ * -1;

            }

        }

        c = 0;
        outputLine[c++] = chr;
        outputLine[c++] = String.valueOf(pos);
        outputLine[c++] = snp;
        outputLine[c++] = gene;
        outputLine[c++] = module;
        outputLine[c++] = String.valueOf(discoveryZ);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected);
        outputLine[c++] = alleleAssessed;
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAssessedAllele());
        outputLine[c++] = String.valueOf(bestMatchR2);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getPos()));
        outputLine[c++] = String.valueOf(nextBestR2);
        outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.isIsAmbigous());

        if (bestMatch == null) {
            for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) {
                outputLine[c++] = "NA";
            }
        } else {
            for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) {
                outputLine[c++] = bestMatch.getLine()[EXTRA_COL_FROM_REPLICATION[i]];
            }
        }

        outputWriter.writeNext(outputLine);

    }

    outputWriter.close();

    for (String e : notFound) {
        System.err.println("Not found: " + e);
    }

    System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData);
    System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow);
    System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd);
    System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd);
    System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData);

}
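
The method above uses HashSet.add on the notFound set to collect the "chr:pos" identifiers of replication SNPs that are missing from the genotype data, so each missing location is printed only once at the end even though the replicationTopSnpNotInGenotypeData counter is incremented on every occurrence. A minimal sketch of that deduplication pattern, using hypothetical names and values, might look like:

    HashSet<String> notFound = new HashSet<>();
    int missingCount = 0;
    for (String id : new String[] { "1:1000", "1:1000", "2:500" }) {
        notFound.add(id); // add is a no-op if the id is already present
        ++missingCount;
    }
    System.out.println(missingCount);    // 3 occurrences
    System.out.println(notFound.size()); // 2 unique locations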

From source file:Main.java

public static int[] divide(int number, int number_of_parts) {
    HashSet<Integer> uniqueInts = new HashSet<Integer>();
    uniqueInts.add(0);
    uniqueInts.add(number);
    int array_size = number_of_parts + 1;
    // 'r' is not declared in this excerpt; a java.util.Random instance is assumed here.
    Random r = new Random();
    // Pick distinct cut points strictly between 0 and number (requires number > number_of_parts).
    while (uniqueInts.size() < array_size) {
        uniqueInts.add(1 + r.nextInt(number - 1));
    }
    Integer[] dividers = uniqueInts.toArray(new Integer[array_size]);
    Arrays.sort(dividers);
    int[] results = new int[number_of_parts];
    for (int i = 1, j = 0; i < dividers.length; ++i, ++j) {
        results[j] = dividers[i] - dividers[j];
    }
    return results;
}
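
A minimal, hypothetical call site for the helper above; the concrete values are only illustrative, and it assumes the Random instance described in the comment inside the method:

    int[] parts = divide(100, 4);               // e.g. {12, 37, 8, 43} -- values vary per run
    System.out.println(Arrays.toString(parts)); // the parts always sum back to 100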

From source file:Main.java

public static <A> Set<A> singleSet(A object) {
    HashSet<A> set = new HashSet<A>();
    set.add(object);
    return set;
}
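
A short usage sketch for the helper above. Note that the returned set is a plain, mutable HashSet, unlike the immutable set returned by java.util.Collections.singleton:

    Set<String> tags = singleSet("initial");
    tags.add("extra");                // allowed, because the set is a regular HashSet
    System.out.println(tags.size());  // prints 2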