Example usage for java.util HashMap get

List of usage examples for java.util HashMap get

Introduction

On this page you can find usage examples for java.util.HashMap.get.

Prototype

public V get(Object key) 

Document

Returns the value to which the specified key is mapped, or null if this map contains no mapping for the key.
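
A minimal, self-contained sketch of this contract (the class and map below are illustrative, not taken from the examples that follow):

import java.util.HashMap;

public class GetExample {
    public static void main(String[] args) {
        HashMap<String, Integer> ages = new HashMap<>();
        ages.put("alice", 30);

        Integer present = ages.get("alice"); // 30
        Integer absent = ages.get("bob");    // null: no mapping for "bob"

        // get() also returns null when a key is explicitly mapped to null;
        // containsKey or getOrDefault (Java 8+) disambiguate the two cases
        int withDefault = ages.getOrDefault("bob", 0); // 0

        System.out.println(present + " / " + absent + " / " + withDefault);
    }
}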

Usage

From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java

/**
 * @param args
 * @throws ParseException 
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option scoreOption = new Option("sc", "score", true, "set threshold for relationship score");
    scoreOption.setRequired(false);
    scoreOption.setArgName("SCORE THRESHOLD");
    options.addOption(scoreOption);

    Option strengthOption = new Option("st", "strength", true, "set threshold for relationship strength");
    strengthOption.setRequired(false);
    strengthOption.setArgName("STRENGTH THRESHOLD");
    options.addOption(strengthOption);

    Option completeRandomizationOption = new Option("c", "complete-randomization", false,
            "use complete randomization when performing significance tests");
    completeRandomizationOption.setRequired(false);
    options.addOption(completeRandomizationOption);

    Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes");
    idOption.setRequired(false);
    options.addOption(idOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    Option removeOption = new Option("r", "remove-not-significant", false,
            "remove relationships that are not" + "significant from the final output");
    removeOption.setRequired(false);
    options.addOption(removeOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeNotSignificant = cmd.hasOption("r");
    boolean removeExistingFiles = cmd.hasOption("f");
    boolean completeRandomization = cmd.hasOption("c");
    boolean hasScoreThreshold = cmd.hasOption("sc");
    boolean hasStrengthThreshold = cmd.hasOption("st");
    boolean outputIds = cmd.hasOption("id");
    String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : "";
    String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : "";

    // all datasets
    ArrayList<String> all_datasets = new ArrayList<String>();
    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        all_datasets.add(line.split("\t")[0]);
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();
    String[] all_datasets_array = new String[all_datasets.size()];
    all_datasets.toArray(all_datasets_array);

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : all_datasets_array;
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("No indices from datasets in G1.");
        System.exit(0);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("No indices from datasets in G2.");
        System.exit(0);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        all_datasets.add(dt[0]);
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
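        // each dataset was seeded into datasetId with a null value, so a null
        // from get() here means the datasets index supplied no id for it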
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    String relationshipsDir = "";
    if (outputIds) {
        relationshipsDir = FrameworkUtils.relationshipsIdsDir;
    } else {
        relationshipsDir = FrameworkUtils.relationshipsDir;
    }

    FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3);

    String random = completeRandomization ? "complete" : "restricted";

    String indexInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2
                    + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2);
            } else {
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        indexInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "relationship" + "-" + random;
    String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/";

    FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
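        // datasetAgg.get returns the comma-separated aggregate list stored by
        // addDatasets; get on a dataset missing from the map would return null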
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("complete-random", String.valueOf(completeRandomization));
    conf.set("output-ids", String.valueOf(outputIds));
    conf.set("complete-random-str", random);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    conf.set("remove-not-significant", String.valueOf(removeNotSignificant));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }
    if (hasScoreThreshold) {
        conf.set("score-threshold", scoreThreshold);
    }
    if (hasStrengthThreshold) {
        conf.set("strength-threshold", strengthThreshold);
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(TopologyTimeSeriesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationMapper.class);
    job.setReducerClass(CorrelationReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir));

    job.setJarByClass(Relationship.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/";
            String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step8GoldDataAggregator.java

public static void main(String[] args) throws Exception {
    String inputDir = args[0] + "/";
    // output dir
    File outputDir = new File(args[1]);
    File turkersConfidence = new File(args[2]);
    if (outputDir.exists()) {
        outputDir.delete();
    }
    outputDir.mkdir();

    List<String> annotatorsIDs = new ArrayList<>();
    //        for (File f : FileUtils.listFiles(new File(inputDir), new String[] { "xml" }, false)) {
    //            QueryResultContainer queryResultContainer = QueryResultContainer
    //                    .fromXML(FileUtils.readFileToString(f, "utf-8"));
    //            for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
    //                for (QueryResultContainer.MTurkRelevanceVote relevanceVote : rankedResults.mTurkRelevanceVotes) {
    //                    if (!annotatorsIDs.contains(relevanceVote.turkID))
    //                        annotatorsIDs.add(relevanceVote.turkID);
    //                }
    //            }
    //        }
    HashMap<String, Integer> countVotesForATurker = new HashMap<>();
    // creates a temporary file in the format expected by MACE
    // annotations map: the key is the id of a document and a sentence;
    // the value is an array votes[] of turker decisions: true or false (relevant or not).
    // The length of this array equals the number of annotators in List<String> annotatorsIDs.
    // If an annotator worked on the task, his decision is written in the array; otherwise the value is NULL

    // key: queryID + clueWebID + sentenceID
    // value: true and false annotations
    TreeMap<String, Annotations> annotations = new TreeMap<>();

    for (File f : FileUtils.listFiles(new File(inputDir), new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));
        System.out.println("Reading " + f.getName());
        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            String documentID = rankedResults.clueWebID;
            for (QueryResultContainer.MTurkRelevanceVote relevanceVote : rankedResults.mTurkRelevanceVotes) {
                Integer turkerID;
                if (!annotatorsIDs.contains(relevanceVote.turkID)) {
                    annotatorsIDs.add(relevanceVote.turkID);
                    turkerID = annotatorsIDs.size() - 1;
                } else {
                    turkerID = annotatorsIDs.indexOf(relevanceVote.turkID);
                }
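                // get() yields null the first time a turker is seen; the null check
                // below initializes the count before incrementing (a get/put idiom
                // that merge() or getOrDefault() can replace on Java 8+)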
                Integer count = countVotesForATurker.get(relevanceVote.turkID);
                if (count == null) {
                    count = 0;
                }
                count++;
                countVotesForATurker.put(relevanceVote.turkID, count);

                String id;
                List<Integer> trueVotes;
                List<Integer> falseVotes;
                for (QueryResultContainer.SingleSentenceRelevanceVote singleSentenceRelevanceVote : relevanceVote.singleSentenceRelevanceVotes)
                    if (!"".equals(singleSentenceRelevanceVote.sentenceID)) {

                        id = f.getName() + "_" + documentID + "_" + singleSentenceRelevanceVote.sentenceID;
                        Annotations turkerVotes = annotations.get(id);
                        if (turkerVotes == null) {
                            trueVotes = new ArrayList<>();
                            falseVotes = new ArrayList<>();
                            turkerVotes = new Annotations(trueVotes, falseVotes);
                        }
                        trueVotes = turkerVotes.trueAnnotations;
                        falseVotes = turkerVotes.falseAnnotations;
                        if ("true".equals(singleSentenceRelevanceVote.relevant)) {
                            // votes[turkerID] = true;
                            trueVotes.add(turkerID);
                        } else if ("false".equals(singleSentenceRelevanceVote.relevant)) {
                            //   votes[turkerID] = false;
                            falseVotes.add(turkerID);
                        } else {
                            throw new IllegalStateException("Annotation value of sentence "
                                    + singleSentenceRelevanceVote.sentenceID + " in " + rankedResults.clueWebID
                                    + " equals " + singleSentenceRelevanceVote.relevant);
                        }
                        try {
                            int allVotesCount = trueVotes.size() + falseVotes.size();
                            if (allVotesCount > 5) {
                                System.err.println(id + " doesn't have 5 annotators: true: " + trueVotes.size()
                                        + " false: " + falseVotes.size());

                                // nasty hack, we're gonna strip some data; true votes first
                                /* we can't do that, it breaks something down the line
                                int toRemove = allVotesCount - 5;
                                if (trueVotes.size() >= toRemove) {
                                trueVotes = trueVotes
                                        .subList(0, trueVotes.size() - toRemove);
                                }
                                else if (
                                    falseVotes.size() >= toRemove) {
                                falseVotes = falseVotes
                                        .subList(0, trueVotes.size() - toRemove);
                                }
                                */
                                System.err.println("Adjusted: " + id + " doesn't have 5 annotators: true: "
                                        + trueVotes.size() + " false: " + falseVotes.size());
                            }
                        } catch (IllegalStateException e) {
                            e.printStackTrace();
                        }
                        turkerVotes.trueAnnotations = trueVotes;
                        turkerVotes.falseAnnotations = falseVotes;
                        annotations.put(id, turkerVotes);
                    } else {
                        throw new IllegalStateException(
                                "Empty Sentence ID in " + f.getName() + " for turker " + turkerID);
                    }

            }
        }

    }
    File tmp = printHashMap(annotations, annotatorsIDs.size());

    String file = TEMP_DIR + "/" + tmp.getName();
    MACE.main(new String[] { "--prefix", file });

    //gets the keys of the documents and sentences
    ArrayList<String> lines = (ArrayList<String>) FileUtils.readLines(new File(file + ".prediction"));
    int i = 0;
    TreeMap<String, TreeMap<String, ArrayList<HashMap<String, String>>>> ids = new TreeMap<>();
    ArrayList<HashMap<String, String>> sentences;
    if (lines.size() != annotations.size()) {
        throw new IllegalStateException(
                "The size of prediction file is " + lines.size() + "but expected " + annotations.size());
    }
    for (Map.Entry entry : annotations.entrySet()) { //1001.xml_clueweb12-1905wb-13-07360_8783
        String key = (String) entry.getKey();
        String[] IDs = key.split("_");
        if (IDs.length > 2) {
            String queryID = IDs[0];
            String clueWebID = IDs[1];
            String sentenceID = IDs[2];
            TreeMap<String, ArrayList<HashMap<String, String>>> clueWebIDs = ids.get(queryID);
            if (clueWebIDs == null) {
                clueWebIDs = new TreeMap<>();
            }
            sentences = clueWebIDs.get(clueWebID);
            if (sentences == null) {
                sentences = new ArrayList<>();
            }
            HashMap<String, String> sentence = new HashMap<>();
            sentence.put(sentenceID, lines.get(i));
            sentences.add(sentence);
            clueWebIDs.put(clueWebID, sentences);
            ids.put(queryID, clueWebIDs);
        } else {
            throw new IllegalStateException("Wrong ID " + key);
        }

        i++;
    }

    for (Map.Entry entry : ids.entrySet()) {
        TreeMap<Integer, String> value = (TreeMap<Integer, String>) entry.getValue();
        String queryID = (String) entry.getKey();
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(new File(inputDir, queryID), "utf-8"));
        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            for (Map.Entry val : value.entrySet()) {
                String clueWebID = (String) val.getKey();
                if (clueWebID.equals(rankedResults.clueWebID)) {
                    List<QueryResultContainer.SingleSentenceRelevanceVote> goldEstimatedLabels = new ArrayList<>();
                    List<QueryResultContainer.SingleSentenceRelevanceVote> turkersVotes = new ArrayList<>();
                    int size = 0;
                    int hitSize = 0;
                    String hitID = "";
                    for (QueryResultContainer.MTurkRelevanceVote vote : rankedResults.mTurkRelevanceVotes) {
                        if (!hitID.equals(vote.hitID)) {
                            hitID = vote.hitID;
                            hitSize = vote.singleSentenceRelevanceVotes.size();
                            size = size + hitSize;
                            turkersVotes.addAll(vote.singleSentenceRelevanceVotes);
                        } else {
                            if (vote.singleSentenceRelevanceVotes.size() != hitSize) {
                                hitSize = vote.singleSentenceRelevanceVotes.size();
                                size = size + hitSize;
                                turkersVotes.addAll(vote.singleSentenceRelevanceVotes);
                            }
                        }
                    }
                    ArrayList<HashMap<String, String>> sentenceList = (ArrayList<HashMap<String, String>>) val
                            .getValue();
                    if (sentenceList.size() != turkersVotes.size()) {
                        try {
                            throw new IllegalStateException("Expected size of annotations is "
                                    + turkersVotes.size() + " but found " + sentenceList.size()
                                    + " for document " + rankedResults.clueWebID + " in " + queryID);
                        } catch (IllegalStateException ex) {
                            ex.printStackTrace();
                        }
                    }
                    for (QueryResultContainer.SingleSentenceRelevanceVote s : turkersVotes) {
                        String valSentence = null;
                        for (HashMap<String, String> anno : sentenceList) {
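                            // anno.keySet().contains(...) is equivalent to anno.containsKey(...);
                            // the subsequent get() returns the MACE prediction stored for that sentence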
                            if (anno.keySet().contains(s.sentenceID)) {
                                valSentence = anno.get(s.sentenceID);
                            }
                        }
                        QueryResultContainer.SingleSentenceRelevanceVote singleSentenceVote = new QueryResultContainer.SingleSentenceRelevanceVote();
                        singleSentenceVote.sentenceID = s.sentenceID;
                        if (("false").equals(valSentence)) {
                            singleSentenceVote.relevant = "false";
                        } else if (("true").equals(valSentence)) {
                            singleSentenceVote.relevant = "true";
                        } else {
                            throw new IllegalStateException("Annotation value of sentence "
                                    + singleSentenceVote.sentenceID + " equals " + val.getValue());
                        }
                        goldEstimatedLabels.add(singleSentenceVote);
                    }
                    rankedResults.goldEstimatedLabels = goldEstimatedLabels;
                }
            }
        }
        File outputFile = new File(outputDir, queryID);
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }

    ArrayList<String> annotators = (ArrayList<String>) FileUtils.readLines(new File(file + ".competence"));
    FileWriter fileWriter;
    StringBuilder sb = new StringBuilder();
    for (int j = 0; j < annotatorsIDs.size(); j++) {
        String[] s = annotators.get(0).split("\t");
        Float score = Float.parseFloat(s[j]);
        String turkerID = annotatorsIDs.get(j);
        System.out.println(turkerID + " " + score + " " + countVotesForATurker.get(turkerID));
        sb.append(turkerID).append(" ").append(score).append(" ").append(countVotesForATurker.get(turkerID))
                .append("\n");
    }
    fileWriter = new FileWriter(turkersConfidence);
    fileWriter.append(sb.toString());
    fileWriter.close();

}

From source file:apps.ParsedPost.java

public static void main(String args[]) {

    Options options = new Options();

    options.addOption(INPUT_PARAM, null, true, INPUT_DESC);
    options.addOption(OUTPUT_PARAM, null, true, OUTPUT_DESC);
    options.addOption(MAX_NUM_REC_PARAM, null, true, MAX_NUM_REC_DESC);
    options.addOption(DEBUG_PRINT_PARAM, null, false, DEBUG_PRINT_DESC);
    options.addOption(EXCLUDE_CODE_PARAM, null, false, EXCLUDE_CODE_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    HashMap<String, ParsedPost> hQuestions = new HashMap<String, ParsedPost>();

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFile = cmd.getOptionValue(INPUT_PARAM);

        if (null == inputFile)
            Usage("Specify: " + INPUT_PARAM, options);

        String outputFile = cmd.getOptionValue(OUTPUT_PARAM);

        if (null == outputFile)
            Usage("Specify: " + OUTPUT_PARAM, options);

        InputStream input = CompressUtils.createInputStream(inputFile);
        BufferedWriter output = new BufferedWriter(new FileWriter(new File(outputFile)));

        int maxNumRec = Integer.MAX_VALUE;

        String tmp = cmd.getOptionValue(MAX_NUM_REC_PARAM);

        if (tmp != null)
            maxNumRec = Integer.parseInt(tmp);

        boolean debug = cmd.hasOption(DEBUG_PRINT_PARAM);

        boolean excludeCode = cmd.hasOption(EXCLUDE_CODE_PARAM);

        System.out.println("Processing at most " + maxNumRec + " records, excluding code? " + excludeCode);

        XmlIterator xi = new XmlIterator(input, ROOT_POST_TAG);

        String elem;

        output.write("<?xml version='1.0' encoding='UTF-8'?><ystfeed>\n");

        for (int num = 1; num <= maxNumRec && !(elem = xi.readNext()).isEmpty(); ++num) {
            ParsedPost post = null;
            try {
                post = parsePost(elem, excludeCode);

                if (!post.mAcceptedAnswerId.isEmpty()) {
                    hQuestions.put(post.mId, post);
                } else if (post.mpostIdType.equals("2")) {
                    String parentId = post.mParentId;
                    String id = post.mId;
                    if (!parentId.isEmpty()) {
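                        // get() returns null unless the parent question (a post with
                        // an accepted answer) was stored in hQuestions earlier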
                        ParsedPost parentPost = hQuestions.get(parentId);
                        if (parentPost != null && parentPost.mAcceptedAnswerId.equals(id)) {
                            output.write(createYahooAnswersQuestion(parentPost, post));
                            hQuestions.remove(parentId);
                        }
                    }
                }

            } catch (Exception e) {
                e.printStackTrace();
                throw new Exception("Error parsing record # " + num + ", error message: " + e);
            }
            if (debug) {
                System.out.println(String.format("%s parentId=%s acceptedAnswerId=%s type=%s", post.mId,
                        post.mParentId, post.mAcceptedAnswerId, post.mpostIdType));
                System.out.println("================================");
                if (!post.mTitle.isEmpty()) {
                    System.out.println(post.mTitle);
                    System.out.println("--------------------------------");
                }
                System.out.println(post.mBody);
                System.out.println("================================");
            }
        }

        output.write("</ystfeed>\n");

        input.close();
        output.close();

    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }

}

From source file:edu.cmu.lti.oaqa.knn4qa.apps.ParsedPost.java

public static void main(String args[]) {

    Options options = new Options();

    options.addOption(INPUT_PARAM, null, true, INPUT_DESC);
    options.addOption(OUTPUT_PARAM, null, true, OUTPUT_DESC);
    options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC);
    options.addOption(DEBUG_PRINT_PARAM, null, false, DEBUG_PRINT_DESC);
    options.addOption(EXCLUDE_CODE_PARAM, null, false, EXCLUDE_CODE_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    HashMap<String, ParsedPost> hQuestions = new HashMap<String, ParsedPost>();

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFile = cmd.getOptionValue(INPUT_PARAM);

        if (null == inputFile)
            Usage("Specify: " + INPUT_PARAM, options);

        String outputFile = cmd.getOptionValue(OUTPUT_PARAM);

        if (null == outputFile)
            Usage("Specify: " + OUTPUT_PARAM, options);

        InputStream input = CompressUtils.createInputStream(inputFile);
        BufferedWriter output = new BufferedWriter(new FileWriter(new File(outputFile)));

        int maxNumRec = Integer.MAX_VALUE;

        String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM);

        if (tmp != null)
            maxNumRec = Integer.parseInt(tmp);

        boolean debug = cmd.hasOption(DEBUG_PRINT_PARAM);

        boolean excludeCode = cmd.hasOption(EXCLUDE_CODE_PARAM);

        System.out.println("Processing at most " + maxNumRec + " records, excluding code? " + excludeCode);

        XmlIterator xi = new XmlIterator(input, ROOT_POST_TAG);

        String elem;

        output.write("<?xml version='1.0' encoding='UTF-8'?><ystfeed>\n");

        for (int num = 1; num <= maxNumRec && !(elem = xi.readNext()).isEmpty(); ++num) {
            ParsedPost post = null;
            try {
                post = parsePost(elem, excludeCode);

                if (!post.mAcceptedAnswerId.isEmpty()) {
                    hQuestions.put(post.mId, post);
                } else if (post.mpostIdType.equals("2")) {
                    String parentId = post.mParentId;
                    String id = post.mId;
                    if (!parentId.isEmpty()) {
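                        // same lookup as in apps.ParsedPost: null from get() means no
                        // parent question with an accepted answer has been stored yet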
                        ParsedPost parentPost = hQuestions.get(parentId);
                        if (parentPost != null && parentPost.mAcceptedAnswerId.equals(id)) {
                            output.write(createYahooAnswersQuestion(parentPost, post));
                            hQuestions.remove(parentId);
                        }
                    }
                }

            } catch (Exception e) {
                e.printStackTrace();
                throw new Exception("Error parsing record # " + num + ", error message: " + e);
            }
            if (debug) {
                System.out.println(String.format("%s parentId=%s acceptedAnswerId=%s type=%s", post.mId,
                        post.mParentId, post.mAcceptedAnswerId, post.mpostIdType));
                System.out.println("================================");
                if (!post.mTitle.isEmpty()) {
                    System.out.println(post.mTitle);
                    System.out.println("--------------------------------");
                }
                System.out.println(post.mBody);
                System.out.println("================================");
            }
        }

        output.write("</ystfeed>\n");

        input.close();
        output.close();

    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }

}

From source file:gedi.lfc.quick.ShiroguchiCounter.java

public static void main(String[] args) throws IOException {

    String path = "/home/users/erhard/biostor/seq/ngade/shiroguchi_randombarcodes/data/";

    MemoryIntervalTreeStorage<int[]> reads = new MemoryIntervalTreeStorage<int[]>(int[].class);

    String[] files = { "Shiroguchi_A_collapsed.bed", "Shiroguchi_B_collapsed.bed",
            "Shiroguchi_A_uncollapsed.bed", "Shiroguchi_B_uncollapsed.bed" };
    for (int i = 0; i < 4; i++) {
        Iterator<String> it = new LineOrientedFile(path + files[i]).lineIterator();
        while (it.hasNext()) {
            String[] f = StringUtils.split(it.next(), '\t');
            Chromosome chr = Chromosome.obtain(f[0]);
            ArrayGenomicRegion region = new ArrayGenomicRegion(Integer.parseInt(f[1]), Integer.parseInt(f[2]));
            int c = Integer.parseInt(StringUtils.splitField(f[3], '|', 0));

            int[] counts = reads.getData(chr, region);
            if (counts == null)
                reads.add(chr, region, counts = new int[4]);

            counts[i] += c;
        }
    }

    HashMap<String, String> map = new HashMap<String, String>();
    new LineOrientedFile(path + "U00096.2.genes.csv").lineIterator().forEachRemaining(s -> {
        String[] f = StringUtils.split(s, '\t');
        map.put(f[0], f[7]);
    });

    LineOrientedFile fragments = new LineOrientedFile("fragments.csv");
    fragments.startWriting();
    fragments.writef("Gene\tonlyA\tonlyB\tBoth\tLength\n");

    LineOrientedFile bias = new LineOrientedFile("bias.csv");
    bias.startWriting();
    bias.writef("OriginalA\tBiasA\tOriginalB\tBiasB\n");

    IntArrayList biasFactors = new IntArrayList();
    ArrayList<GeneData> geneData = new ArrayList<GeneData>();

    MemoryIntervalTreeStorage<Transcript> genes = new BiomartExonFileReader(path + "U00096.2.exons.csv", false)
            .readIntoMemoryTakeFirst();
    for (ImmutableReferenceGenomicRegion<Transcript> g : genes.getReferenceGenomicRegions()) {
        ArrayList<ImmutableReferenceGenomicRegion<int[]>> frag = reads
                .getReferenceRegionsIntersecting(g.getReference().toStrandIndependent(), g.getRegion());

        GeneData gd = new GeneData();

        int l = g.getRegion().getTotalLength();
        for (ImmutableReferenceGenomicRegion<int[]> r : frag) {
            if (r.getData()[0] == 0)
                gd.onlyB++;
            if (r.getData()[1] == 0)
                gd.onlyA++;
            if (r.getData()[0] == 0 && r.getData()[1] == 0)
                throw new RuntimeException();

            bias.writef("%d\t%.0f\t%d\t%.0f\n", r.getData()[0], r.getData()[2] / (double) r.getData()[0],
                    r.getData()[1], r.getData()[3] / (double) r.getData()[1]);
            if (r.getData()[0] > 0) {
                biasFactors.add(r.getData()[2] / r.getData()[0]);
            }
            if (r.getData()[1] > 0) {
                biasFactors.add(r.getData()[3] / r.getData()[1]);
            }

        }
        gd.both = frag.size() - gd.onlyA - gd.onlyB;
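        // map.get looks up the gene name for the transcript id; if the id is
        // absent from the annotation file, get returns null and %s prints "null"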
        fragments.writef("%s\t%d\t%d\t%d\t%d\n", map.get(g.getData().getTranscriptId()), gd.onlyA, gd.onlyB,
                gd.both, l);

        if (gd.onlyA + gd.onlyB + gd.both > 0)
            geneData.add(gd);
    }

    fragments.finishWriting();
    bias.finishWriting();

    double fc = 1.4;
    int rep = 5;
    int nDiff = 1000;
    int n = 10000;
    int N = 6000;
    double noise = 0.05;

    LineOrientedFile countMatrix = new LineOrientedFile("countMatrix.csv");
    countMatrix.startWriting();

    LineOrientedFile downCountMatrix = new LineOrientedFile("countMatrix_downsampled.csv");
    downCountMatrix.startWriting();

    RandomNumbers rnd = new RandomNumbers();
    for (int i = 0; i < n; i++) {
        GeneData gd = geneData.get(rnd.getUnif(0, geneData.size()));

        //         int N = gd.both==0?Integer.MAX_VALUE/2:(int) (gd.onlyA+gd.onlyB+gd.both+gd.onlyA*gd.onlyB/gd.both);
        double p1 = (gd.onlyA + gd.both) / (double) N;
        double p2 = i < nDiff ? p1 / fc : p1;

        ArrayList<ReadData> list = new ArrayList<ReadData>();
        for (int r = 0; r < rep * 2; r++) {
            int k = rnd.getBinom(N, r < rep ? p1 : p2) + 1;
            int hit = N == -1 ? 0 : rnd.getBinom(k, list.size() / (double) N);
            rnd.shuffle(list);
            for (int x = 0; x < hit; x++)
                list.get(x).reads[r] = (int) rnd.getNormal(list.get(x).bias, list.get(x).bias * noise);
            for (int x = 0; x < k - hit; x++)
                list.add(new ReadData(biasFactors.getInt(rnd.getUnif(0, biasFactors.size())), rep * 2, r));
        }

        int[] c = new int[rep * 2];
        for (ReadData d : list) {
            for (int r = 0; r < c.length; r++) {
                c[r] += d.reads[r];
            }
        }

        double[] down = new double[rep * 2];
        for (ReadData d : list) {
            double max = ArrayUtils.max(d.reads);
            for (int r = 0; r < down.length; r++) {
                down[r] += d.reads[r] / max;
            }
        }

        countMatrix.writeLine(StringUtils.concat("\t", c));
        downCountMatrix.writeLine(StringUtils.concat("\t", down));

    }

    countMatrix.finishWriting();
    downCountMatrix.finishWriting();

}

From source file:edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the aggregate functions " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions"
            + " will be computed, followed by their temporal and spatial attribute indices");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";
    String preProcessingDatasets = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetAggregation = new ArrayList<String>();
    HashMap<String, String> datasetTempAtt = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>();
    HashMap<String, String> preProcessingDataset = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");
    String[] datasetArgs = cmd.getOptionValues("g");

    for (int i = 0; i < datasetArgs.length; i += 3) {
        String dataset = datasetArgs[i];

        // getting pre-processing
        String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3);
        if (tempPreProcessing == null) {
            System.out.println("No pre-processing available for " + dataset);
            continue;
        }
        preProcessingDataset.put(dataset, tempPreProcessing);

        shortDataset.add(dataset);
        datasetTempAtt.put(dataset, (("null".equals(datasetArgs[i + 1])) ? null : datasetArgs[i + 1]));
        datasetSpatialAtt.put(dataset, (("null".equals(datasetArgs[i + 2])) ? null : datasetArgs[i + 2]));

        datasetId.put(dataset, null);
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
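        // as above, datasetId was seeded with null values, so get() returning
        // null means the datasets index had no id for this dataset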
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3);

    // getting smallest resolution

    HashMap<String, String> tempResMap = new HashMap<String, String>();
    HashMap<String, String> spatialResMap = new HashMap<String, String>();

    HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>();

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String[] datasetArray = preProcessingDataset.get(dataset).split("-");

        String datasetTemporalStr = datasetArray[datasetArray.length - 2];
        int datasetTemporal = utils.temporalResolution(datasetTemporalStr);

        String datasetSpatialStr = datasetArray[datasetArray.length - 1];
        int datasetSpatial = utils.spatialResolution(datasetSpatialStr);

        // finding all possible resolutions

        String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal);
        String[] spatialResolutions = FrameworkUtils.getAggSpatialResolutions(datasetSpatial);

        String temporalResolution = "";
        String spatialResolution = "";

        String tempRes = "";
        String spatialRes = "";

        boolean dataAdded = false;

        for (int i = 0; i < temporalResolutions.length; i++) {
            for (int j = 0; j < spatialResolutions.length; j++) {

                temporalResolution = temporalResolutions[i];
                spatialResolution = spatialResolutions[j];

                String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";

                if (removeExistingFiles) {
                    FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3);
                }

                if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) {

                    dataAdded = true;

                    tempRes += temporalResolution + "-";
                    spatialRes += spatialResolution + "-";
                }
            }
        }

        if (dataAdded) {
            input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
            shortDatasetAggregation.add(dataset);

            tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1));
            spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1));

            datasetTemporalStrMap.put(dataset, datasetTemporalStr);
            datasetSpatialStrMap.put(dataset, datasetSpatialStr);
        }
    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have aggregates.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    it = input.iterator();
    while (it.hasNext()) {
        preProcessingDatasets += it.next() + ",";
    }

    Job aggJob = null;
    String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/";
    String jobName = "aggregates";

    FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3);

    Configuration aggConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    aggConf.set("dataset-name", datasetNames);
    aggConf.set("dataset-id", datasetIds);

    for (int i = 0; i < shortDatasetAggregation.size(); i++) {
        String dataset = shortDatasetAggregation.get(i);
        String id = datasetId.get(dataset);
        aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset));
        aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset));
        aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset));

        if (s3)
            aggConf.set("dataset-" + id,
                    s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
        else
            aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/"
                    + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
    }

    aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    aggConf.set("mapreduce.task.io.sort.mb", "200");
    aggConf.set("mapreduce.task.io.sort.factor", "100");
    machineConf.setMachineConfiguration(aggConf);

    if (s3) {
        machineConf.setMachineConfiguration(aggConf);
        aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    if (snappyCompression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    aggJob = new Job(aggConf);
    aggJob.setJobName(jobName);

    aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setMapOutputValueClass(AggregationArrayWritable.class);
    aggJob.setOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setOutputValueClass(FloatArrayWritable.class);
    //aggJob.setOutputKeyClass(Text.class);
    //aggJob.setOutputValueClass(Text.class);

    aggJob.setMapperClass(AggregationMapper.class);
    aggJob.setCombinerClass(AggregationCombiner.class);
    aggJob.setReducerClass(AggregationReducer.class);
    aggJob.setNumReduceTasks(machineConf.getNumberReduces());

    aggJob.setInputFormatClass(SequenceFileInputFormat.class);
    //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(aggJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(aggJob, true);
    FileInputFormat.setInputPaths(aggJob,
            preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1));
    FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir));

    aggJob.setJarByClass(Aggregation.class);

    long start = System.currentTimeMillis();
    aggJob.submit();
    aggJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetAggregation) {
        String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}
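
The example above stores per-dataset configuration in several HashMap<String, String> instances and reads it back with get while wiring up the job. A minimal, self-contained sketch of that lookup pattern (the class and key names here are illustrative only, not part of the framework above):

    import java.util.HashMap;

    public class DatasetIdLookup {
        public static void main(String[] args) {
            // Map dataset names to ids, as datasetId does in the example above.
            HashMap<String, String> datasetId = new HashMap<String, String>();
            datasetId.put("taxi", "0");
            datasetId.put("weather", "1");

            // get returns the mapped value, or null if there is no mapping.
            System.out.println(datasetId.get("taxi"));  // prints 0
            System.out.println(datasetId.get("crime")); // prints null
        }
    }

Since get returns null rather than throwing for an absent key, a null check is the natural way to detect an unregistered dataset before the job is configured.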

From source file:edu.msu.cme.rdp.seqmatch.cli.SeqMatchMain.java

public static void main(String[] args) throws Exception {

    if (args.length == 0) {
        System.err.println("USAGE: SeqMatchMain [train|seqmatch] <args>");
        return;
    }

    String cmd = args[0];
    args = Arrays.copyOfRange(args, 1, args.length);

    if (cmd.equals("train")) {
        if (args.length != 2) {
            System.err.println("USAGE: train <reference sequences> <trainee_out_file_prefix>"
                    + "\nMultiple trainee output files might be created, each containing maximum "
                    + Trainee.MAX_NUM_SEQ + " sequences");
            return;
        }

        File refSeqs = new File(args[0]);
        File traineeFileOut = new File(args[1]);

        // more than one trainee file may need to be created, depending on the number of sequences
        CreateMultiMatchFromFile.getMultiTrainee(refSeqs, traineeFileOut);
    } else if (cmd.equals("seqmatch")) {
        File refFile = null;
        File queryFile = null;
        HashMap<String, String> descMap = new HashMap<String, String>();
        PrintStream out = new PrintStream(System.out);
        int knn = 20;
        float minSab = .5f;

        try {
            CommandLine line = new PosixParser().parse(options, args);

            if (line.hasOption("knn")) {
                knn = Integer.parseInt(line.getOptionValue("knn"));
            }

            if (line.hasOption("sab")) {
                minSab = Float.parseFloat(line.getOptionValue("sab"));
            }
            if (line.hasOption("desc")) {
                descMap = readDesc(new File(line.getOptionValue("desc")));
            }
            if (line.hasOption("outFile")) {
                out = new PrintStream(new File(line.getOptionValue("outFile")));
            }

            args = line.getArgs();

            if (args.length != 2) {
                throw new Exception("Unexpected number of command line arguments");
            }

            refFile = new File(args[0]);
            queryFile = new File(args[1]);

        } catch (Exception e) {
            new HelpFormatter().printHelp("seqmatch <refseqs | trainee_file_or_dir> <query_file>\n"
                    + " trainee_file_or_dir is a single trainee file or a directory containing multiple trainee files",
                    options);
            System.err.println("Error: " + e.getMessage());
            return;
        }

        SeqMatch seqmatch = null;
        if (refFile.isDirectory()) { // a directory of trainee files
            List<SeqMatch> engineList = new ArrayList<SeqMatch>();
            for (File f : refFile.listFiles()) {
                if (!f.isHidden()) {
                    TwowaySeqMatch match = new TwowaySeqMatch(new SeqMatchEngine(new StorageTrainee(f)));
                    engineList.add(match);
                }
            }
            seqmatch = new MultiTraineeSeqMatch(engineList);
        } else { // a single fasta file or trainee file
            if (SeqUtils.guessFileFormat(refFile) == SequenceFormat.UNKNOWN) {
                seqmatch = CLISeqMatchFactory.trainTwowaySeqMatch(new StorageTrainee(refFile));
            } else {
                seqmatch = CreateMultiMatchFromFile.getMultiMatch(refFile);
            }
        }

        out.println("query name\tmatch seq\torientation\tS_ab score\tunique oligomers\tdescription");

        SeqReader reader = new SequenceReader(queryFile);
        Sequence seq;

        while ((seq = reader.readNextSequence()) != null) {
            SeqMatchResultSet resultSet = seqmatch.match(seq, knn);
            for (SeqMatchResult result : resultSet) {
                char r = '+';
                if (result.isReverse()) {
                    r = '-';
                }

                if (result.getScore() > minSab) {
                    out.println(seq.getSeqName() + "\t" + result.getSeqName() + "\t" + r + "\t"
                            + result.getScore() + "\t" + resultSet.getQueryWordCount() + "\t"
                            + descMap.get(result.getSeqName()));
                }
            }
        }

        out.close();
    } else {
        throw new IllegalArgumentException("USAGE: SeqMatchMain [train|seqmatch] <args>");
    }
}
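
In the seqmatch branch above, descMap.get(result.getSeqName()) is written straight into the output line, so a query with no description entry prints the literal string "null". A short sketch of that behavior together with a guarded alternative (names are illustrative only):

    import java.util.HashMap;

    public class DescriptionLookup {
        public static void main(String[] args) {
            HashMap<String, String> descMap = new HashMap<String, String>();
            descMap.put("seq1", "Escherichia coli strain K-12");

            // Concatenating a missing value prints the literal "null".
            System.out.println("seq2\t" + descMap.get("seq2"));

            // Guarding the lookup keeps the output clean.
            String desc = descMap.get("seq2");
            System.out.println("seq2\t" + (desc == null ? "" : desc));
        }
    }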

From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the index and events " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option thresholdOption = new Option("t", "use-custom-thresholds", false,
            "use custom thresholds for regular and rare events, defined in the HDFS_HOME/"
                    + FrameworkUtils.thresholdDir + " file");
    thresholdOption.setRequired(false);
    options.addOption(thresholdOption);

    Option gOption = new Option("g", "group", true,
            "set group of datasets for which the indices and events" + " will be computed");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp("hadoop jar data-polygamy.jar "
                    + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetIndex = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();
    HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>();
    HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>();

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());
    BufferedReader br;

    boolean removeExistingFiles = cmd.hasOption("f");
    boolean isThresholdUserDefined = cmd.hasOption("t");

    for (String dataset : cmd.getOptionValues("g")) {

        // getting aggregates
        String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3);
        if (aggregate.length == 0) {
            System.out.println("No aggregates found for " + dataset + ".");
            continue;
        }

        // getting aggregates header
        String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3);
        if (aggregatesHeaderFileName == null) {
            System.out.println("No aggregate header for " + dataset);
            continue;
        }

        String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName;

        shortDataset.add(dataset);
        datasetId.put(dataset, null);

        if (s3) {
            path = new Path(aggregatesHeader);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader);
        }

        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        datasetAgg.put(dataset, br.readLine().split("\t")[1]);
        br.close();
        if (s3)
            fs.close();
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    // getting user defined thresholds

    if (isThresholdUserDefined) {
        if (s3) {
            path = new Path(s3bucket + FrameworkUtils.thresholdDir);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir);
        }
        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        line = br.readLine();
        while (line != null) {
            // getting dataset name
            String dataset = line.trim();
            HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>();
            HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>();
            line = br.readLine();
            while ((line != null) && (line.split("\t").length > 1)) {
                // getting attribute ids and thresholds
                String[] keyVals = line.trim().split("\t");
                int att = Integer.parseInt(keyVals[0].trim());
                regThresholds.put(att, Double.parseDouble(keyVals[1].trim()));
                rareThresholds.put(att, Double.parseDouble(keyVals[2].trim()));
                line = br.readLine();
            }
            datasetRegThreshold.put(dataset, regThresholds);
            datasetRareThreshold.put(dataset, rareThresholds);
        }
        br.close();
    }
    if (s3)
        fs.close();

    // datasets that will use existing merge tree
    ArrayList<String> useMergeTree = new ArrayList<String>();

    // creating index for each spatio-temporal resolution

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3);

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/";

        if (removeExistingFiles) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3);
            FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3);
        } else if (datasetRegThreshold.containsKey(dataset)) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) {
                useMergeTree.add(dataset);
            }
        }

        if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) {
            input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset);
            shortDatasetIndex.add(dataset);
        }

    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have indices.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    String aggregateDatasets = "";
    it = input.iterator();
    while (it.hasNext()) {
        aggregateDatasets += it.next() + ",";
    }

    Job icJob = null;
    Configuration icConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "index";
    String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/";

    FrameworkUtils.removeFile(indexOutputDir, s3conf, s3);

    icConf.set("dataset-name", datasetNames);
    icConf.set("dataset-id", datasetIds);

    if (!useMergeTree.isEmpty()) {
        String useMergeTreeStr = "";
        for (String dt : useMergeTree) {
            useMergeTreeStr += dt + ",";
        }
        icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1));
    }

    for (int i = 0; i < shortDataset.size(); i++) {
        String dataset = shortDataset.get(i);
        String id = datasetId.get(dataset);
        icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset));
        if (datasetRegThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset);
            String thresholds = "";
            for (int att : regThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ",";
            }
            icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1));
        }

        if (datasetRareThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset);
            String thresholds = "";
            for (int att : rareThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ",";
            }
            icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1));
        }
    }

    icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    icConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    icConf.set("mapreduce.task.io.sort.mb", "200");
    icConf.set("mapreduce.task.io.sort.factor", "100");
    //icConf.set("mapreduce.task.timeout", "1800000");
    machineConf.setMachineConfiguration(icConf);

    if (s3) {
        icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        icConf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    icJob = new Job(icConf);
    icJob.setJobName(jobName);

    icJob.setMapOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class);
    icJob.setOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setOutputValueClass(TopologyTimeSeriesWritable.class);
    //icJob.setOutputKeyClass(Text.class);
    //icJob.setOutputValueClass(Text.class);

    icJob.setMapperClass(IndexCreationMapper.class);
    icJob.setReducerClass(IndexCreationReducer.class);
    icJob.setNumReduceTasks(machineConf.getNumberReduces());

    icJob.setInputFormatClass(SequenceFileInputFormat.class);
    //icJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(icJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(icJob, true);
    FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1));
    FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir));

    icJob.setJarByClass(IndexCreation.class);

    long start = System.currentTimeMillis();
    icJob.submit();
    icJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetIndex) {
        String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}
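
The index-creation driver above nests maps (HashMap<String, HashMap<Integer, Double>>) to hold per-dataset, per-attribute thresholds, which means reading a threshold takes a two-level get. A minimal sketch; because the outer get can return null, it must be checked before the inner lookup (names are illustrative only):

    import java.util.HashMap;

    public class ThresholdLookup {
        public static void main(String[] args) {
            HashMap<String, HashMap<Integer, Double>> regThreshold =
                    new HashMap<String, HashMap<Integer, Double>>();

            HashMap<Integer, Double> thresholds = new HashMap<Integer, Double>();
            thresholds.put(0, 0.75);
            regThreshold.put("taxi", thresholds);

            // The outer get may return null, so guard it before the inner get.
            HashMap<Integer, Double> attThresholds = regThreshold.get("taxi");
            if (attThresholds != null) {
                System.out.println(attThresholds.get(0)); // prints 0.75
            }
        }
    }

The driver above takes the containsKey(dataset) route instead, which is equivalent here because it never stores null values.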

From source file:InlineSchemaValidator.java

/** Main program entry point. */
public static void main(String[] argv) {

    // is there anything to do?
    if (argv.length == 0) {
        printUsage();
        System.exit(1);
    }

    // variables
    Vector schemas = null;
    Vector instances = null;
    HashMap prefixMappings = null;
    HashMap uriMappings = null;
    String docURI = argv[argv.length - 1];
    String schemaLanguage = DEFAULT_SCHEMA_LANGUAGE;
    int repetition = DEFAULT_REPETITION;
    boolean schemaFullChecking = DEFAULT_SCHEMA_FULL_CHECKING;
    boolean honourAllSchemaLocations = DEFAULT_HONOUR_ALL_SCHEMA_LOCATIONS;
    boolean validateAnnotations = DEFAULT_VALIDATE_ANNOTATIONS;
    boolean generateSyntheticAnnotations = DEFAULT_GENERATE_SYNTHETIC_ANNOTATIONS;
    boolean memoryUsage = DEFAULT_MEMORY_USAGE;

    // process arguments
    for (int i = 0; i < argv.length - 1; ++i) {
        String arg = argv[i];
        if (arg.startsWith("-")) {
            String option = arg.substring(1);
            if (option.equals("l")) {
                // get schema language name
                if (++i == argv.length) {
                    System.err.println("error: Missing argument to -l option.");
                } else {
                    schemaLanguage = argv[i];
                }
                continue;
            }
            if (option.equals("x")) {
                if (++i == argv.length) {
                    System.err.println("error: Missing argument to -x option.");
                    continue;
                }
                String number = argv[i];
                try {
                    int value = Integer.parseInt(number);
                    if (value < 1) {
                        System.err.println("error: Repetition must be at least 1.");
                        continue;
                    }
                    repetition = value;
                } catch (NumberFormatException e) {
                    System.err.println("error: invalid number (" + number + ").");
                }
                continue;
            }
            if (arg.equals("-a")) {
                // process -a: xpath expressions for schemas
                if (schemas == null) {
                    schemas = new Vector();
                }
                while (i + 1 < argv.length - 1 && !(arg = argv[i + 1]).startsWith("-")) {
                    schemas.add(arg);
                    ++i;
                }
                continue;
            }
            if (arg.equals("-i")) {
                // process -i: xpath expressions for instance documents
                if (instances == null) {
                    instances = new Vector();
                }
                while (i + 1 < argv.length - 1 && !(arg = argv[i + 1]).startsWith("-")) {
                    instances.add(arg);
                    ++i;
                }
                continue;
            }
            if (arg.equals("-nm")) {
                String prefix;
                String uri;
                while (i + 2 < argv.length - 1 && !(prefix = argv[i + 1]).startsWith("-")
                        && !(uri = argv[i + 2]).startsWith("-")) {
                    if (prefixMappings == null) {
                        prefixMappings = new HashMap();
                        uriMappings = new HashMap();
                    }
                    prefixMappings.put(prefix, uri);
                    HashSet prefixes = (HashSet) uriMappings.get(uri);
                    if (prefixes == null) {
                        prefixes = new HashSet();
                        uriMappings.put(uri, prefixes);
                    }
                    prefixes.add(prefix);
                    i += 2;
                }
                continue;
            }
            if (option.equalsIgnoreCase("f")) {
                schemaFullChecking = option.equals("f");
                continue;
            }
            if (option.equalsIgnoreCase("hs")) {
                honourAllSchemaLocations = option.equals("hs");
                continue;
            }
            if (option.equalsIgnoreCase("va")) {
                validateAnnotations = option.equals("va");
                continue;
            }
            if (option.equalsIgnoreCase("ga")) {
                generateSyntheticAnnotations = option.equals("ga");
                continue;
            }
            if (option.equalsIgnoreCase("m")) {
                memoryUsage = option.equals("m");
                continue;
            }
            if (option.equals("h")) {
                printUsage();
                continue;
            }
            System.err.println("error: unknown option (" + option + ").");
            continue;
        }
    }

    try {
        // Create new instance of inline schema validator.
        InlineSchemaValidator inlineSchemaValidator = new InlineSchemaValidator(prefixMappings, uriMappings);

        // Parse document containing schemas and validation roots
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        DocumentBuilder db = dbf.newDocumentBuilder();
        db.setErrorHandler(inlineSchemaValidator);
        Document doc = db.parse(docURI);

        // Create XPath factory for selecting schema and validation roots
        XPathFactory xpf = XPathFactory.newInstance();
        XPath xpath = xpf.newXPath();
        xpath.setNamespaceContext(inlineSchemaValidator);

        // Select schema roots from the DOM
        NodeList[] schemaNodes = new NodeList[schemas != null ? schemas.size() : 0];
        for (int i = 0; i < schemaNodes.length; ++i) {
            XPathExpression xpathSchema = xpath.compile((String) schemas.elementAt(i));
            schemaNodes[i] = (NodeList) xpathSchema.evaluate(doc, XPathConstants.NODESET);
        }

        // Select validation roots from the DOM
        NodeList[] instanceNodes = new NodeList[instances != null ? instances.size() : 0];
        for (int i = 0; i < instanceNodes.length; ++i) {
            XPathExpression xpathInstance = xpath.compile((String) instances.elementAt(i));
            instanceNodes[i] = (NodeList) xpathInstance.evaluate(doc, XPathConstants.NODESET);
        }

        // Create SchemaFactory and configure
        SchemaFactory factory = SchemaFactory.newInstance(schemaLanguage);
        factory.setErrorHandler(inlineSchemaValidator);

        try {
            factory.setFeature(SCHEMA_FULL_CHECKING_FEATURE_ID, schemaFullChecking);
        } catch (SAXNotRecognizedException e) {
            System.err.println("warning: SchemaFactory does not recognize feature ("
                    + SCHEMA_FULL_CHECKING_FEATURE_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println("warning: SchemaFactory does not support feature ("
                    + SCHEMA_FULL_CHECKING_FEATURE_ID + ")");
        }
        try {
            factory.setFeature(HONOUR_ALL_SCHEMA_LOCATIONS_ID, honourAllSchemaLocations);
        } catch (SAXNotRecognizedException e) {
            System.err.println("warning: SchemaFactory does not recognize feature ("
                    + HONOUR_ALL_SCHEMA_LOCATIONS_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println(
                    "warning: SchemaFactory does not support feature (" + HONOUR_ALL_SCHEMA_LOCATIONS_ID + ")");
        }
        try {
            factory.setFeature(VALIDATE_ANNOTATIONS_ID, validateAnnotations);
        } catch (SAXNotRecognizedException e) {
            System.err.println(
                    "warning: SchemaFactory does not recognize feature (" + VALIDATE_ANNOTATIONS_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println(
                    "warning: SchemaFactory does not support feature (" + VALIDATE_ANNOTATIONS_ID + ")");
        }
        try {
            factory.setFeature(GENERATE_SYNTHETIC_ANNOTATIONS_ID, generateSyntheticAnnotations);
        } catch (SAXNotRecognizedException e) {
            System.err.println("warning: SchemaFactory does not recognize feature ("
                    + GENERATE_SYNTHETIC_ANNOTATIONS_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println("warning: SchemaFactory does not support feature ("
                    + GENERATE_SYNTHETIC_ANNOTATIONS_ID + ")");
        }

        // Build Schema from sources
        Schema schema;
        {
            DOMSource[] sources;
            int size = 0;
            for (int i = 0; i < schemaNodes.length; ++i) {
                size += schemaNodes[i].getLength();
            }
            sources = new DOMSource[size];
            if (size == 0) {
                schema = factory.newSchema();
            } else {
                int count = 0;
                for (int i = 0; i < schemaNodes.length; ++i) {
                    NodeList nodeList = schemaNodes[i];
                    int nodeListLength = nodeList.getLength();
                    for (int j = 0; j < nodeListLength; ++j) {
                        sources[count++] = new DOMSource(nodeList.item(j));
                    }
                }
                schema = factory.newSchema(sources);
            }
        }

        // Setup validator and input source.
        Validator validator = schema.newValidator();
        validator.setErrorHandler(inlineSchemaValidator);

        try {
            validator.setFeature(SCHEMA_FULL_CHECKING_FEATURE_ID, schemaFullChecking);
        } catch (SAXNotRecognizedException e) {
            System.err.println(
                    "warning: Validator does not recognize feature (" + SCHEMA_FULL_CHECKING_FEATURE_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println(
                    "warning: Validator does not support feature (" + SCHEMA_FULL_CHECKING_FEATURE_ID + ")");
        }
        try {
            validator.setFeature(HONOUR_ALL_SCHEMA_LOCATIONS_ID, honourAllSchemaLocations);
        } catch (SAXNotRecognizedException e) {
            System.err.println(
                    "warning: Validator does not recognize feature (" + HONOUR_ALL_SCHEMA_LOCATIONS_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println(
                    "warning: Validator does not support feature (" + HONOUR_ALL_SCHEMA_LOCATIONS_ID + ")");
        }
        try {
            validator.setFeature(VALIDATE_ANNOTATIONS_ID, validateAnnotations);
        } catch (SAXNotRecognizedException e) {
            System.err
                    .println("warning: Validator does not recognize feature (" + VALIDATE_ANNOTATIONS_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println("warning: Validator does not support feature (" + VALIDATE_ANNOTATIONS_ID + ")");
        }
        try {
            validator.setFeature(GENERATE_SYNTHETIC_ANNOTATIONS_ID, generateSyntheticAnnotations);
        } catch (SAXNotRecognizedException e) {
            System.err.println("warning: Validator does not recognize feature ("
                    + GENERATE_SYNTHETIC_ANNOTATIONS_ID + ")");
        } catch (SAXNotSupportedException e) {
            System.err.println(
                    "warning: Validator does not support feature (" + GENERATE_SYNTHETIC_ANNOTATIONS_ID + ")");
        }

        // Validate instance documents
        for (int i = 0; i < instanceNodes.length; ++i) {
            NodeList nodeList = instanceNodes[i];
            int nodeListLength = nodeList.getLength();
            for (int j = 0; j < nodeListLength; ++j) {
                DOMSource source = new DOMSource(nodeList.item(j));
                source.setSystemId(docURI);
                inlineSchemaValidator.validate(validator, source, docURI, repetition, memoryUsage);
            }
        }
    } catch (SAXParseException e) {
        // ignore
    } catch (Exception e) {
        System.err.println("error: Parse error occurred - " + e.getMessage());
        if (e instanceof SAXException) {
            Exception nested = ((SAXException) e).getException();
            if (nested != null) {
                e = nested;
            }
        }
        e.printStackTrace(System.err);
    }
}
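
The -nm handling above uses the classic get-then-put idiom to build a multimap: get returns null the first time a URI is seen, which signals that the value set must be created and stored before adding the prefix. A compact sketch of the same idiom, written with generics for clarity (names are illustrative only):

    import java.util.HashMap;
    import java.util.HashSet;

    public class PrefixRegistry {
        public static void main(String[] args) {
            HashMap<String, HashSet<String>> uriMappings = new HashMap<String, HashSet<String>>();

            String uri = "http://www.w3.org/2001/XMLSchema";
            // get returns null on the first lookup, so create and store the set.
            HashSet<String> prefixes = uriMappings.get(uri);
            if (prefixes == null) {
                prefixes = new HashSet<String>();
                uriMappings.put(uri, prefixes);
            }
            prefixes.add("xs");

            System.out.println(uriMappings.get(uri)); // prints [xs]
        }
    }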

From source file:com.adobe.aem.demomachine.Json2Csv.java

public static void main(String[] args) throws IOException {

    String inputFile1 = null;
    String inputFile2 = null;
    String outputFile = null;

    HashMap<String, String> hmReportSuites = new HashMap<String, String>();

    // Command line options for this tool
    Options options = new Options();
    options.addOption("c", true, "Filename 1");
    options.addOption("r", true, "Filename 2");
    options.addOption("o", true, "Filename 3");
    CommandLineParser parser = new BasicParser();
    try {
        CommandLine cmd = parser.parse(options, args);

        if (cmd.hasOption("c")) {
            inputFile1 = cmd.getOptionValue("c");
        }

        if (cmd.hasOption("r")) {
            inputFile2 = cmd.getOptionValue("r");
        }

        if (cmd.hasOption("o")) {
            outputFile = cmd.getOptionValue("o");
        }

        if (inputFile1 == null || inputFile2 == null || outputFile == null) {
            System.exit(-1);
        }

    } catch (ParseException ex) {

        logger.error(ex.getMessage());

    }

    // List of customers and report suites for these customers
    String sInputFile1 = readFile(inputFile1, Charset.defaultCharset());
    sInputFile1 = sInputFile1.replaceAll("ObjectId\\(\"([0-9a-z]*)\"\\)", "\"$1\"");

    // Processing the list of report suites for each customer
    try {

        JSONArray jCustomers = new JSONArray(sInputFile1.trim());
        for (int i = 0, size = jCustomers.length(); i < size; i++) {
            JSONObject jCustomer = jCustomers.getJSONObject(i);
            Iterator<?> keys = jCustomer.keys();
            String companyName = null;
            while (keys.hasNext()) {
                String key = (String) keys.next();
                if (key.equals("company")) {
                    companyName = jCustomer.getString(key);
                }
            }
            keys = jCustomer.keys();
            while (keys.hasNext()) {
                String key = (String) keys.next();

                if (key.equals("report_suites")) {
                    JSONArray jReportSuites = jCustomer.getJSONArray(key);
                    for (int j = 0, rSize = jReportSuites.length(); j < rSize; j++) {
                        hmReportSuites.put(jReportSuites.getString(j), companyName);
                        System.out.println(jReportSuites.get(j) + " for company " + companyName);
                    }

                }
            }
        }

        // Creating the output file
        PrintWriter writer = new PrintWriter(outputFile, "UTF-8");
        writer.println("\"" + "Customer" + "\",\"" + "ReportSuite ID" + "\",\"" + "Number of Documents"
                + "\",\"" + "Last Updated" + "\"");

        // Processing the list of SOLR collections
        String sInputFile2 = readFile(inputFile2, Charset.defaultCharset());
        sInputFile2 = sInputFile2.replaceAll("NumberLong\\(\"([0-9a-z]*)\"\\)", "\"$1\"");

        JSONObject jResults = new JSONObject(sInputFile2.trim());
        JSONArray jCollections = jResults.getJSONArray("result");
        for (int i = 0, size = jCollections.length(); i < size; i++) {
            JSONObject jCollection = jCollections.getJSONObject(i);
            String id = null;
            String number = null;
            String lastupdate = null;

            Iterator<?> keys = jCollection.keys();
            while (keys.hasNext()) {
                String key = (String) keys.next();
                if (key.equals("_id")) {
                    id = jCollection.getString(key);
                }
            }

            keys = jCollection.keys();
            while (keys.hasNext()) {
                String key = (String) keys.next();
                if (key.equals("noOfDocs")) {
                    number = jCollection.getString(key);
                }
            }

            keys = jCollection.keys();
            while (keys.hasNext()) {
                String key = (String) keys.next();
                if (key.equals("latestUpdateDate")) {
                    lastupdate = jCollection.getString(key);
                }
            }

            Date d = new Date(Long.parseLong(lastupdate));
            System.out.println(hmReportSuites.get(id) + "," + id + "," + number + "," + lastupdate + ","
                    + new SimpleDateFormat("MM-dd-yyyy").format(d));
            writer.println("\"" + hmReportSuites.get(id) + "\",\"" + id + "\",\"" + number + "\",\""
                    + new SimpleDateFormat("MM-dd-yyyy").format(d) + "\"");

        }

        writer.close();

    } catch (JSONException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

}
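
The example above joins two data sources through a HashMap: report suite ids are first mapped to company names, then hmReportSuites.get(id) resolves each collection back to its customer when writing the CSV. A lookup miss returns null and would appear as the literal "null" in the output, as this sketch shows (names are illustrative only):

    import java.util.HashMap;

    public class ReportSuiteJoin {
        public static void main(String[] args) {
            HashMap<String, String> hmReportSuites = new HashMap<String, String>();
            hmReportSuites.put("rs123", "Acme Corp");

            System.out.println(hmReportSuites.get("rs123")); // prints Acme Corp
            System.out.println(hmReportSuites.get("rs999")); // prints null

            // Checking for null keeps unknown ids out of the report.
            String company = hmReportSuites.get("rs999");
            System.out.println(company == null ? "(unknown)" : company);
        }
    }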