Usage examples for java.util.HashSet.add
public boolean add(E e)
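add inserts the given element only if no equal element is already present, and it returns true exactly when the set was modified. A minimal sketch of that contract (class and element names are illustrative only):

import java.util.HashSet;
import java.util.Set;

public class HashSetAddContract {
    public static void main(String[] args) {
        Set<String> seen = new HashSet<String>();

        boolean first = seen.add("taxi");   // true: "taxi" was not in the set yet
        boolean second = seen.add("taxi");  // false: an equal element exists, set unchanged

        System.out.println(first);          // true
        System.out.println(second);         // false
        System.out.println(seen.size());    // 1
    }
}

The examples below use this behaviour mostly for deduplication: adding the same string twice leaves the set with a single copy.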
From source file: edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java
/** * @param args//from w ww . j a v a 2s.c o m */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the index and events " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option thresholdOption = new Option("t", "use-custom-thresholds", false, "use custom thresholds for regular and rare events, defined in HDFS_HOME/" + FrameworkUtils.thresholdDir + " file"); thresholdOption.setRequired(false); options.addOption(thresholdOption); Option gOption = new Option("g", "group", true, "set group of datasets for which the indices and events" + " will be computed"); gOption.setRequired(true); gOption.setArgName("GROUP"); gOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(gOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); 
awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } String datasetNames = ""; String datasetIds = ""; ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> shortDatasetIndex = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); HashMap<String, String> datasetId = new HashMap<String, String>(); HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>(); HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>(); Path path = null; FileSystem fs = FileSystem.get(new Configuration()); BufferedReader br; boolean removeExistingFiles = cmd.hasOption("f"); boolean isThresholdUserDefined = cmd.hasOption("t"); for (String dataset : cmd.getOptionValues("g")) { // getting aggregates String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3); if (aggregate.length == 0) { System.out.println("No aggregates found for " + dataset + "."); continue; } // getting aggregates header String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3); if (aggregatesHeaderFileName == null) { System.out.println("No aggregate header for " + dataset); continue; } String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName; shortDataset.add(dataset); datasetId.put(dataset, null); if (s3) { path = new Path(aggregatesHeader); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader); } br = new BufferedReader(new InputStreamReader(fs.open(path))); datasetAgg.put(dataset, br.readLine().split("\t")[1]); br.close(); if (s3) fs.close(); } if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } // getting dataset id if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } // getting user defined thresholds if (isThresholdUserDefined) { if (s3) { path = new Path(s3bucket + FrameworkUtils.thresholdDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); line = br.readLine(); while (line != null) { // getting dataset name String dataset = line.trim(); 
HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>(); HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>(); line = br.readLine(); while ((line != null) && (line.split("\t").length > 1)) { // getting attribute ids and thresholds String[] keyVals = line.trim().split("\t"); int att = Integer.parseInt(keyVals[0].trim()); regThresholds.put(att, Double.parseDouble(keyVals[1].trim())); rareThresholds.put(att, Double.parseDouble(keyVals[2].trim())); line = br.readLine(); } datasetRegThreshold.put(dataset, regThresholds); datasetRareThreshold.put(dataset, rareThresholds); } br.close(); } if (s3) fs.close(); // datasets that will use existing merge tree ArrayList<String> useMergeTree = new ArrayList<String>(); // creating index for each spatio-temporal resolution FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3); HashSet<String> input = new HashSet<String>(); for (String dataset : shortDataset) { String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/"; String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3); FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3); FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3); } else if (datasetRegThreshold.containsKey(dataset)) { FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3); if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) { useMergeTree.add(dataset); } } if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) { input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset); shortDatasetIndex.add(dataset); } } if (input.isEmpty()) { System.out.println("All the input datasets have indices."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } String aggregateDatasets = ""; it = input.iterator(); while (it.hasNext()) { aggregateDatasets += it.next() + ","; } Job icJob = null; Configuration icConf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "index"; String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/"; FrameworkUtils.removeFile(indexOutputDir, s3conf, s3); icConf.set("dataset-name", datasetNames); icConf.set("dataset-id", datasetIds); if (!useMergeTree.isEmpty()) { String useMergeTreeStr = ""; for (String dt : useMergeTree) { useMergeTreeStr += dt + ","; } icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1)); } for (int i = 0; i < shortDataset.size(); i++) { String dataset = shortDataset.get(i); String id = datasetId.get(dataset); icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset)); if (datasetRegThreshold.containsKey(dataset)) { HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset); String thresholds = ""; for (int att : regThresholds.keySet()) { thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ","; } icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1)); } if (datasetRareThreshold.containsKey(dataset)) { HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset); String thresholds = ""; for (int att : rareThresholds.keySet()) { thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ","; } icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1)); } } 
icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); icConf.set("mapreduce.input.fileinputformat.split.minsize", "0"); icConf.set("mapreduce.task.io.sort.mb", "200"); icConf.set("mapreduce.task.io.sort.factor", "100"); //icConf.set("mapreduce.task.timeout", "1800000"); machineConf.setMachineConfiguration(icConf); if (s3) { machineConf.setMachineConfiguration(icConf); icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); icConf.set("bucket", s3bucket); } if (snappyCompression) { icConf.set("mapreduce.map.output.compress", "true"); icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { icConf.set("mapreduce.map.output.compress", "true"); icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } icJob = new Job(icConf); icJob.setJobName(jobName); icJob.setMapOutputKeyClass(AttributeResolutionWritable.class); icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class); icJob.setOutputKeyClass(AttributeResolutionWritable.class); icJob.setOutputValueClass(TopologyTimeSeriesWritable.class); //icJob.setOutputKeyClass(Text.class); //icJob.setOutputValueClass(Text.class); icJob.setMapperClass(IndexCreationMapper.class); icJob.setReducerClass(IndexCreationReducer.class); icJob.setNumReduceTasks(machineConf.getNumberReduces()); icJob.setInputFormatClass(SequenceFileInputFormat.class); //icJob.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class); //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(icJob, true); SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK); FileInputFormat.setInputDirRecursive(icJob, true); FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1)); FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir)); icJob.setJarByClass(IndexCreation.class); long start = System.currentTimeMillis(); icJob.submit(); icJob.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (String dataset : shortDatasetIndex) { String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/"; String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } }
From source file: edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques.java
/** * @param args// w w w.ja v a2 s .c o m * @throws ParseException */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the relationship " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option g1Option = new Option("g1", "first-group", true, "set first group of datasets"); g1Option.setRequired(true); g1Option.setArgName("FIRST GROUP"); g1Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g1Option); Option g2Option = new Option("g2", "second-group", true, "set second group of datasets"); g2Option.setRequired(false); g2Option.setArgName("SECOND GROUP"); g2Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g2Option); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = 
cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } Path path = null; FileSystem fs = FileSystem.get(new Configuration()); ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> firstGroup = new ArrayList<String>(); ArrayList<String> secondGroup = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); boolean removeExistingFiles = cmd.hasOption("f"); String[] firstGroupCmd = cmd.getOptionValues("g1"); String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : new String[0]; addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } if (firstGroup.isEmpty()) { System.out.println("First group of datasets (G1) is empty. " + "Doing G1 = G2."); firstGroup.addAll(secondGroup); } if (secondGroup.isEmpty()) { System.out.println("Second group of datasets (G2) is empty. " + "Doing G2 = G1."); secondGroup.addAll(firstGroup); } // getting dataset ids String datasetNames = ""; String datasetIds = ""; HashMap<String, String> datasetId = new HashMap<String, String>(); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { datasetId.put(it.next(), null); } if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } String firstGroupStr = ""; String secondGroupStr = ""; for (String dataset : firstGroup) { firstGroupStr += datasetId.get(dataset) + ","; } for (String dataset : secondGroup) { secondGroupStr += datasetId.get(dataset) + ","; } firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1); secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1); FrameworkUtils.createDir(s3bucket + FrameworkUtils.correlationTechniquesDir, s3conf, s3); String dataAttributesInputDirs = ""; String noRelationship = ""; HashSet<String> dirs = new HashSet<String>(); String dataset1; String dataset2; String datasetId1; String datasetId2; for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = 
secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } datasetId1 = datasetId.get(dataset1); datasetId2 = datasetId.get(dataset2); if (dataset1.equals(dataset2)) continue; String correlationOutputFileName = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2 + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) { dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset1); dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset2); } else { noRelationship += datasetId1 + "-" + datasetId2 + ","; } } } if (dirs.isEmpty()) { System.out.println("All the relationships were already computed."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } for (String dir : dirs) { dataAttributesInputDirs += dir + ","; } Configuration conf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "correlation"; String correlationOutputDir = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/"; FrameworkUtils.removeFile(correlationOutputDir, s3conf, s3); for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i))); } for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size", Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length)); } conf.set("dataset-keys", datasetIds); conf.set("dataset-names", datasetNames); conf.set("first-group", firstGroupStr); conf.set("second-group", secondGroupStr); conf.set("main-dataset-id", datasetId.get(shortDataset.get(0))); if (noRelationship.length() > 0) { conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1)); } conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); conf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); conf.set("mapreduce.input.fileinputformat.split.minsize", "0"); conf.set("mapreduce.task.io.sort.mb", "200"); conf.set("mapreduce.task.io.sort.factor", "100"); conf.set("mapreduce.task.timeout", "2400000"); if (s3) { machineConf.setMachineConfiguration(conf); conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); conf.set("bucket", s3bucket); } if (snappyCompression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } Job job = new Job(conf); job.setJobName(jobName); job.setMapOutputKeyClass(PairAttributeWritable.class); job.setMapOutputValueClass(SpatioTemporalValueWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(CorrelationTechniquesMapper.class); 
job.setReducerClass(CorrelationTechniquesReducer.class); job.setNumReduceTasks(machineConf.getNumberReduces()); job.setInputFormatClass(SequenceFileInputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); FileInputFormat.setInputDirRecursive(job, true); FileInputFormat.setInputPaths(job, dataAttributesInputDirs.substring(0, dataAttributesInputDirs.length() - 1)); FileOutputFormat.setOutputPath(job, new Path(correlationOutputDir)); job.setJarByClass(CorrelationTechniques.class); long start = System.currentTimeMillis(); job.submit(); job.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } if (dataset1.equals(dataset2)) continue; String from = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/" + dataset1 + "-" + dataset2 + "/"; String to = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2 + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } } }
From source file: de.unileipzig.ub.indexer.App.java
public static void main(String[] args) throws IOException { // create Options object Options options = new Options(); options.addOption("h", "help", false, "display this help"); options.addOption("f", "filename", true, "name of the JSON file whose content should be indexed"); options.addOption("i", "index", true, "the name of the target index"); options.addOption("d", "doctype", true, "the name of the doctype (title, local, ...)"); options.addOption("t", "host", true, "elasticsearch hostname (default: 0.0.0.0)"); options.addOption("p", "port", true, "transport port (that's NOT the http port, default: 9300)"); options.addOption("c", "cluster", true, "cluster name (default: elasticsearch_mdma)"); options.addOption("b", "bulksize", true, "number of docs sent in one request (default: 3000)"); options.addOption("v", "verbose", false, "show processing speed while indexing"); options.addOption("s", "status", false, "only show status of index for file"); options.addOption("r", "repair", false, "attempt to repair recoverable inconsistencies on the go"); options.addOption("e", "debug", false, "set logging level to debug"); options.addOption("l", "logfile", true, "logfile - in not specified only log to stdout"); options.addOption("m", "memcached", true, "host and port of memcached (default: localhost:11211)"); options.addOption("z", "latest-flag-on", true, "enable latest flag according to field (within content, e.g. 001)"); options.addOption("a", "flat", false, "flat-mode: do not check for inconsistencies"); CommandLineParser parser = new PosixParser(); CommandLine cmd = null;/*from www .j ava 2s. c o m*/ try { cmd = parser.parse(options, args); } catch (ParseException ex) { logger.error(ex); System.exit(1); } // setup logging Properties systemProperties = System.getProperties(); systemProperties.put("net.spy.log.LoggerImpl", "net.spy.memcached.compat.log.Log4JLogger"); System.setProperties(systemProperties); Logger.getLogger("net.spy.memcached").setLevel(Level.ERROR); Properties props = new Properties(); props.load(props.getClass().getResourceAsStream("/log4j.properties")); if (cmd.hasOption("debug")) { props.setProperty("log4j.logger.de.unileipzig", "DEBUG"); } if (cmd.hasOption("logfile")) { props.setProperty("log4j.rootLogger", "INFO, stdout, F"); props.setProperty("log4j.appender.F", "org.apache.log4j.FileAppender"); props.setProperty("log4j.appender.F.File", cmd.getOptionValue("logfile")); props.setProperty("log4j.appender.F.layout", "org.apache.log4j.PatternLayout"); props.setProperty("log4j.appender.F.layout.ConversionPattern", "%5p | %d | %F | %L | %m%n"); } PropertyConfigurator.configure(props); InetAddress addr = InetAddress.getLocalHost(); String memcachedHostAndPort = addr.getHostAddress() + ":11211"; if (cmd.hasOption("m")) { memcachedHostAndPort = cmd.getOptionValue("m"); } // setup caching try { if (memcachedClient == null) { memcachedClient = new MemcachedClient( new ConnectionFactoryBuilder().setFailureMode(FailureMode.Cancel).build(), AddrUtil.getAddresses("0.0.0.0:11211")); try { // give client and server 500ms Thread.sleep(300); } catch (InterruptedException ex) { } Collection availableServers = memcachedClient.getAvailableServers(); logger.info(availableServers); if (availableServers.size() == 0) { logger.info("no memcached servers found"); memcachedClient.shutdown(); memcachedClient = null; } else { logger.info(availableServers.size() + " memcached server(s) detected, fine."); } } } catch (IOException ex) { logger.warn("couldn't create a connection, bailing out: " + 
ex.getMessage()); } // process options if (cmd.hasOption("h")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("indexer", options, true); quit(0); } boolean verbose = false; if (cmd.hasOption("verbose")) { verbose = true; } // ES options String[] hosts = new String[] { "0.0.0.0" }; int port = 9300; String clusterName = "elasticsearch_mdma"; int bulkSize = 3000; if (cmd.hasOption("host")) { hosts = cmd.getOptionValues("host"); } if (cmd.hasOption("port")) { port = Integer.parseInt(cmd.getOptionValue("port")); } if (cmd.hasOption("cluster")) { clusterName = cmd.getOptionValue("cluster"); } if (cmd.hasOption("bulksize")) { bulkSize = Integer.parseInt(cmd.getOptionValue("bulksize")); if (bulkSize < 1 || bulkSize > 100000) { logger.error("bulksize must be between 1 and 100,000"); quit(1); } } // ES Client final Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "elasticsearch_mdma") .build(); final TransportClient client = new TransportClient(settings); for (String host : hosts) { client.addTransportAddress(new InetSocketTransportAddress(host, port)); } if (cmd.hasOption("filename") && cmd.hasOption("index") && cmd.hasOption("doctype")) { final String filename = cmd.getOptionValue("filename"); final File _file = new File(filename); if (_file.length() == 0) { logger.info(_file.getAbsolutePath() + " is empty, skipping"); quit(0); // file is empty } // for flat mode: leave a stampfile beside the json to // indicate previous successful processing File directory = new File(filename).getParentFile(); File stampfile = new File(directory, DigestUtils.shaHex(filename) + ".indexed"); long start = System.currentTimeMillis(); long lineCount = 0; final String indexName = cmd.getOptionValue("index"); final String docType = cmd.getOptionValue("doctype"); BulkRequestBuilder bulkRequest = client.prepareBulk(); try { if (cmd.hasOption("flat")) { // flat mode // ......... 
if (stampfile.exists()) { logger.info("SKIPPING, since it seems this file has already " + "been imported (found: " + stampfile.getAbsolutePath() + ")"); quit(0); } } else { final String srcSHA1 = extractSrcSHA1(filename); logger.debug(filename + " srcsha1: " + srcSHA1); long docsInIndex = getIndexedRecordCount(client, indexName, srcSHA1); logger.debug(filename + " indexed: " + docsInIndex); long docsInFile = getLineCount(filename); logger.debug(filename + " lines: " + docsInFile); // in non-flat-mode, indexing would take care // of inconsistencies if (docsInIndex == docsInFile) { logger.info("UP-TO DATE: " + filename + " (" + docsInIndex + ", " + srcSHA1 + ")"); client.close(); quit(0); } if (docsInIndex > 0) { logger.warn("INCONSISTENCY DETECTED: " + filename + ": indexed:" + docsInIndex + " lines:" + docsInFile); if (!cmd.hasOption("r")) { logger.warn( "Please re-run indexer with --repair flag or delete residues first with: $ curl -XDELETE " + hosts[0] + ":9200/" + indexName + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'"); client.close(); quit(1); } else { logger.info("Attempting to clear residues..."); // attempt to repair once DeleteByQueryResponse dbqr = client.prepareDeleteByQuery(indexName) .setQuery(termQuery("meta.srcsha1", srcSHA1)).execute().actionGet(); Iterator<IndexDeleteByQueryResponse> it = dbqr.iterator(); long deletions = 0; while (it.hasNext()) { IndexDeleteByQueryResponse response = it.next(); deletions += 1; } logger.info("Deleted residues of " + filename); logger.info("Refreshing [" + indexName + "]"); RefreshResponse refreshResponse = client.admin().indices() .refresh(new RefreshRequest(indexName)).actionGet(); long indexedAfterDelete = getIndexedRecordCount(client, indexName, srcSHA1); logger.info(indexedAfterDelete + " docs remained"); if (indexedAfterDelete > 0) { logger.warn("Not all residues cleaned. Try to fix this manually: $ curl -XDELETE " + hosts[0] + ":9200/" + indexName + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'"); quit(1); } else { logger.info("Residues are gone. Now trying to reindex: " + filename); } } } } logger.info("INDEXING-REQUIRED: " + filename); if (cmd.hasOption("status")) { quit(0); } HashSet idsInBatch = new HashSet(); String idField = null; if (cmd.hasOption("z")) { idField = cmd.getOptionValue("z"); } final FileReader fr = new FileReader(filename); final BufferedReader br = new BufferedReader(fr); String line; // one line is one document while ((line = br.readLine()) != null) { // "Latest-Flag" machine // This gets obsolete with a "flat" index if (cmd.hasOption("z")) { // flag that indicates, whether the document // about to be indexed will be the latest boolean willBeLatest = true; // check if there is a previous (lower meta.timestamp) document with // the same identifier (whatever that may be - queried under "content") final String contentIdentifier = getContentIdentifier(line, idField); idsInBatch.add(contentIdentifier); // assumed in meta.timestamp final Long timestamp = Long.parseLong(getTimestamp(line)); logger.debug("Checking whether record is latest (line: " + lineCount + ")"); logger.debug(contentIdentifier + ", " + timestamp); // get all docs, which match the contentIdentifier // by filter, which doesn't score final TermFilterBuilder idFilter = new TermFilterBuilder("content." 
+ idField, contentIdentifier); final TermFilterBuilder kindFilter = new TermFilterBuilder("meta.kind", docType); final AndFilterBuilder afb = new AndFilterBuilder(); afb.add(idFilter).add(kindFilter); final FilteredQueryBuilder fb = filteredQuery(matchAllQuery(), afb); final SearchResponse searchResponse = client.prepareSearch(indexName) .setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(fb).setFrom(0) .setSize(1200) // 3 years and 105 days assuming daily updates at the most .setExplain(false).execute().actionGet(); final SearchHits searchHits = searchResponse.getHits(); logger.debug("docs with this id in the index: " + searchHits.getTotalHits()); for (final SearchHit hit : searchHits.getHits()) { final String docId = hit.id(); final Map<String, Object> source = hit.sourceAsMap(); final Map meta = (Map) source.get("meta"); final Long docTimestamp = Long.parseLong(meta.get("timestamp").toString()); // if the indexed doc timestamp is lower the the current one, // remove any latest flag if (timestamp >= docTimestamp) { source.remove("latest"); final ObjectMapper mapper = new ObjectMapper(); // put the updated doc back // IndexResponse response = client.prepareIndex(indexName, docType).setCreate(false).setId(docId) .setSource(mapper.writeValueAsBytes(source)) .execute(new ActionListener<IndexResponse>() { public void onResponse(IndexResponse rspns) { logger.debug("Removed latest flag from " + contentIdentifier + ", " + docTimestamp + ", " + hit.id() + " since (" + timestamp + " > " + docTimestamp + ")"); } public void onFailure(Throwable thrwbl) { logger.error("Could not remove flag from " + hit.id() + ", " + contentIdentifier); } }); // .execute() //.actionGet(); } else { logger.debug("Doc " + hit.id() + " is newer (" + docTimestamp + ")"); willBeLatest = false; } } if (willBeLatest) { line = setLatestFlag(line); logger.info("Setting latest flag on " + contentIdentifier + ", " + timestamp); } // end of latest-flag machine // beware - this will be correct as long as there // are no dups within one bulk! } bulkRequest.add(client.prepareIndex(indexName, docType).setSource(line)); lineCount++; logger.debug("Added line " + lineCount + " to BULK"); logger.debug(line); if (lineCount % bulkSize == 0) { if (idsInBatch.size() != bulkSize && cmd.hasOption("z")) { logger.error( "This batch has duplications in the ID. That's not bad for the index, just makes the latest flag fuzzy"); logger.error( "Bulk size was: " + bulkSize + ", but " + idsInBatch.size() + " IDs (only)"); } idsInBatch.clear(); logger.debug("Issuing BULK request"); final long actionCount = bulkRequest.numberOfActions(); final BulkResponse bulkResponse = bulkRequest.execute().actionGet(); final long tookInMillis = bulkResponse.getTookInMillis(); if (bulkResponse.hasFailures()) { logger.fatal("FAILED, bulk not indexed. 
exiting now."); Iterator<BulkItemResponse> it = bulkResponse.iterator(); while (it.hasNext()) { BulkItemResponse bir = it.next(); if (bir.isFailed()) { Failure failure = bir.getFailure(); logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage() + ", type: " + failure.getType() + ", index: " + failure.getIndex()); } } quit(1); } else { if (verbose) { final double elapsed = System.currentTimeMillis() - start; final double speed = (lineCount / elapsed * 1000); logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)"); } } bulkRequest = client.prepareBulk(); } } // handle the remaining items final long actionCount = bulkRequest.numberOfActions(); if (actionCount > 0) { final BulkResponse bulkResponse = bulkRequest.execute().actionGet(); final long tookInMillis = bulkResponse.getTookInMillis(); if (bulkResponse.hasFailures()) { logger.fatal("FAILED, bulk not indexed. exiting now."); Iterator<BulkItemResponse> it = bulkResponse.iterator(); while (it.hasNext()) { BulkItemResponse bir = it.next(); if (bir.isFailed()) { Failure failure = bir.getFailure(); logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage() + ", type: " + failure.getType() + ", index: " + failure.getIndex()); } } quit(1); } else { // trigger update now RefreshResponse refreshResponse = client.admin().indices() .refresh(new RefreshRequest(indexName)).actionGet(); if (verbose) { final double elapsed = System.currentTimeMillis() - start; final double speed = (lineCount / elapsed * 1000); logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)"); } } } br.close(); client.close(); final double elapsed = (System.currentTimeMillis() - start) / 1000; final double speed = (lineCount / elapsed); logger.info("indexing (" + filename + ") " + lineCount + " docs took " + elapsed + "s (speed: " + String.format("%.2f", speed) + "r/s)"); if (cmd.hasOption("flat")) { try { FileUtils.touch(stampfile); } catch (IOException ioe) { logger.warn(".indexed files not created. Will reindex everything everytime."); } } } catch (IOException e) { client.close(); logger.error(e); quit(1); } finally { client.close(); } } quit(0); }
From source file: edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java
/** * @param args//from w ww .j a va 2 s. c o m * @throws ParseException */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the relationship " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option scoreOption = new Option("sc", "score", true, "set threhsold for relationship score"); scoreOption.setRequired(false); scoreOption.setArgName("SCORE THRESHOLD"); options.addOption(scoreOption); Option strengthOption = new Option("st", "strength", true, "set threhsold for relationship strength"); strengthOption.setRequired(false); strengthOption.setArgName("STRENGTH THRESHOLD"); options.addOption(strengthOption); Option completeRandomizationOption = new Option("c", "complete-randomization", false, "use complete randomization when performing significance tests"); completeRandomizationOption.setRequired(false); options.addOption(completeRandomizationOption); Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes"); idOption.setRequired(false); options.addOption(idOption); Option g1Option = new Option("g1", "first-group", true, "set first group of datasets"); g1Option.setRequired(true); g1Option.setArgName("FIRST GROUP"); g1Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g1Option); Option g2Option = new Option("g2", "second-group", true, "set second group of datasets"); g2Option.setRequired(false); g2Option.setArgName("SECOND GROUP"); g2Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g2Option); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); Option removeOption = new Option("r", "remove-not-significant", false, "remove relationships that are not" + "significant from the final output"); removeOption.setRequired(false); options.addOption(removeOption); HelpFormatter formatter = new 
HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } Path path = null; FileSystem fs = FileSystem.get(new Configuration()); ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> firstGroup = new ArrayList<String>(); ArrayList<String> secondGroup = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); boolean removeNotSignificant = cmd.hasOption("r"); boolean removeExistingFiles = cmd.hasOption("f"); boolean completeRandomization = cmd.hasOption("c"); boolean hasScoreThreshold = cmd.hasOption("sc"); boolean hasStrengthThreshold = cmd.hasOption("st"); boolean outputIds = cmd.hasOption("id"); String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : ""; String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : ""; // all datasets ArrayList<String> all_datasets = new ArrayList<String>(); if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { all_datasets.add(line.split("\t")[0]); line = br.readLine(); } br.close(); if (s3) fs.close(); String[] all_datasets_array = new String[all_datasets.size()]; all_datasets.toArray(all_datasets_array); String[] firstGroupCmd = cmd.getOptionValues("g1"); String[] secondGroupCmd = cmd.hasOption("g2") ? 
cmd.getOptionValues("g2") : all_datasets_array; addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } if (firstGroup.isEmpty()) { System.out.println("No indices from datasets in G1."); System.exit(0); } if (secondGroup.isEmpty()) { System.out.println("No indices from datasets in G2."); System.exit(0); } // getting dataset ids String datasetNames = ""; String datasetIds = ""; HashMap<String, String> datasetId = new HashMap<String, String>(); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { datasetId.put(it.next(), null); } if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); all_datasets.add(dt[0]); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } String firstGroupStr = ""; String secondGroupStr = ""; for (String dataset : firstGroup) { firstGroupStr += datasetId.get(dataset) + ","; } for (String dataset : secondGroup) { secondGroupStr += datasetId.get(dataset) + ","; } firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1); secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1); String relationshipsDir = ""; if (outputIds) { relationshipsDir = FrameworkUtils.relationshipsIdsDir; } else { relationshipsDir = FrameworkUtils.relationshipsDir; } FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3); String random = completeRandomization ? 
"complete" : "restricted"; String indexInputDirs = ""; String noRelationship = ""; HashSet<String> dirs = new HashSet<String>(); String dataset1; String dataset2; String datasetId1; String datasetId2; for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } datasetId1 = datasetId.get(dataset1); datasetId2 = datasetId.get(dataset2); if (dataset1.equals(dataset2)) continue; String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) { dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1); dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2); } else { noRelationship += datasetId1 + "-" + datasetId2 + ","; } } } if (dirs.isEmpty()) { System.out.println("All the relationships were already computed."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } for (String dir : dirs) { indexInputDirs += dir + ","; } Configuration conf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "relationship" + "-" + random; String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/"; FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3); for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i))); } for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size", Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length)); } conf.set("dataset-keys", datasetIds); conf.set("dataset-names", datasetNames); conf.set("first-group", firstGroupStr); conf.set("second-group", secondGroupStr); conf.set("complete-random", String.valueOf(completeRandomization)); conf.set("output-ids", String.valueOf(outputIds)); conf.set("complete-random-str", random); conf.set("main-dataset-id", datasetId.get(shortDataset.get(0))); conf.set("remove-not-significant", String.valueOf(removeNotSignificant)); if (noRelationship.length() > 0) { conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1)); } if (hasScoreThreshold) { conf.set("score-threshold", scoreThreshold); } if (hasStrengthThreshold) { conf.set("strength-threshold", strengthThreshold); } conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); conf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); conf.set("mapreduce.input.fileinputformat.split.minsize", "0"); conf.set("mapreduce.task.io.sort.mb", "200"); conf.set("mapreduce.task.io.sort.factor", "100"); conf.set("mapreduce.task.timeout", "2400000"); if (s3) { machineConf.setMachineConfiguration(conf); conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); conf.set("bucket", s3bucket); } if (snappyCompression) { conf.set("mapreduce.map.output.compress", "true"); 
conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } Job job = new Job(conf); job.setJobName(jobName); job.setMapOutputKeyClass(PairAttributeWritable.class); job.setMapOutputValueClass(TopologyTimeSeriesWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(CorrelationMapper.class); job.setReducerClass(CorrelationReducer.class); job.setNumReduceTasks(machineConf.getNumberReduces()); job.setInputFormatClass(SequenceFileInputFormat.class); //job.setOutputFormatClass(TextOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); FileInputFormat.setInputDirRecursive(job, true); FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1)); FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir)); job.setJarByClass(Relationship.class); long start = System.currentTimeMillis(); job.submit(); job.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } if (dataset1.equals(dataset2)) continue; String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/"; String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } } }
From source file: edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java
/** * @param args//from w w w . j a v a 2 s . c o m */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the aggregate functions " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions" + " will be computed, followed by their temporal and spatial attribute indices"); gOption.setRequired(true); gOption.setArgName("GROUP"); gOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(gOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = 
cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } String datasetNames = ""; String datasetIds = ""; String preProcessingDatasets = ""; ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> shortDatasetAggregation = new ArrayList<String>(); HashMap<String, String> datasetTempAtt = new HashMap<String, String>(); HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>(); HashMap<String, String> preProcessingDataset = new HashMap<String, String>(); HashMap<String, String> datasetId = new HashMap<String, String>(); boolean removeExistingFiles = cmd.hasOption("f"); String[] datasetArgs = cmd.getOptionValues("g"); for (int i = 0; i < datasetArgs.length; i += 3) { String dataset = datasetArgs[i]; // getting pre-processing String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3); if (tempPreProcessing == null) { System.out.println("No pre-processing available for " + dataset); continue; } preProcessingDataset.put(dataset, tempPreProcessing); shortDataset.add(dataset); datasetTempAtt.put(dataset, ((datasetArgs[i + 1] == "null") ? null : datasetArgs[i + 1])); datasetSpatialAtt.put(dataset, ((datasetArgs[i + 2] == "null") ? null : datasetArgs[i + 2])); datasetId.put(dataset, null); } if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } // getting dataset id Path path = null; FileSystem fs = null; if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { fs = FileSystem.get(new Configuration()); path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3); // getting smallest resolution HashMap<String, String> tempResMap = new HashMap<String, String>(); HashMap<String, String> spatialResMap = new HashMap<String, String>(); HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>(); HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>(); HashSet<String> input = new HashSet<String>(); for (String dataset : shortDataset) { String[] datasetArray = preProcessingDataset.get(dataset).split("-"); String datasetTemporalStr = datasetArray[datasetArray.length - 2]; int datasetTemporal = utils.temporalResolution(datasetTemporalStr); String datasetSpatialStr = datasetArray[datasetArray.length - 1]; int datasetSpatial = utils.spatialResolution(datasetSpatialStr); // finding all possible resolutions String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal); String[] spatialResolutions = 
FrameworkUtils.getAggSpatialResolutions(datasetSpatial); String temporalResolution = ""; String spatialResolution = ""; String tempRes = ""; String spatialRes = ""; boolean dataAdded = false; for (int i = 0; i < temporalResolutions.length; i++) { for (int j = 0; j < spatialResolutions.length; j++) { temporalResolution = temporalResolutions[i]; spatialResolution = spatialResolutions[j]; String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) { dataAdded = true; tempRes += temporalResolution + "-"; spatialRes += spatialResolution + "-"; } } } if (dataAdded) { input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); shortDatasetAggregation.add(dataset); tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1)); spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1)); datasetTemporalStrMap.put(dataset, datasetTemporalStr); datasetSpatialStrMap.put(dataset, datasetSpatialStr); } } if (input.isEmpty()) { System.out.println("All the input datasets have aggregates."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } it = input.iterator(); while (it.hasNext()) { preProcessingDatasets += it.next() + ","; } Job aggJob = null; String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/"; String jobName = "aggregates"; FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3); Configuration aggConf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); aggConf.set("dataset-name", datasetNames); aggConf.set("dataset-id", datasetIds); for (int i = 0; i < shortDatasetAggregation.size(); i++) { String dataset = shortDatasetAggregation.get(i); String id = datasetId.get(dataset); aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset)); aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset)); aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset)); aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset)); aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset)); aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset)); if (s3) aggConf.set("dataset-" + id, s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); else aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/" + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); } aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0"); aggConf.set("mapreduce.task.io.sort.mb", "200"); aggConf.set("mapreduce.task.io.sort.factor", "100"); machineConf.setMachineConfiguration(aggConf); if (s3) { machineConf.setMachineConfiguration(aggConf); aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); } if (snappyCompression) { aggConf.set("mapreduce.map.output.compress", "true"); 
aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { aggConf.set("mapreduce.map.output.compress", "true"); aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } aggJob = new Job(aggConf); aggJob.setJobName(jobName); aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class); aggJob.setMapOutputValueClass(AggregationArrayWritable.class); aggJob.setOutputKeyClass(SpatioTemporalWritable.class); aggJob.setOutputValueClass(FloatArrayWritable.class); //aggJob.setOutputKeyClass(Text.class); //aggJob.setOutputValueClass(Text.class); aggJob.setMapperClass(AggregationMapper.class); aggJob.setCombinerClass(AggregationCombiner.class); aggJob.setReducerClass(AggregationReducer.class); aggJob.setNumReduceTasks(machineConf.getNumberReduces()); aggJob.setInputFormatClass(SequenceFileInputFormat.class); //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class); //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(aggJob, true); SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK); FileInputFormat.setInputDirRecursive(aggJob, true); FileInputFormat.setInputPaths(aggJob, preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1)); FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir)); aggJob.setJarByClass(Aggregation.class); long start = System.currentTimeMillis(); aggJob.submit(); aggJob.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (String dataset : shortDatasetAggregation) { String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/"; String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } }
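In the Aggregation driver above, HashSet.add collects the pre-processing directory of every dataset that still needs aggregates; because a set ignores duplicates, each path enters the job input at most once before being joined into the comma-separated string handed to FileInputFormat.setInputPaths. Below is a minimal, self-contained sketch of that de-duplication step, with made-up placeholder paths standing in for the FrameworkUtils directories:

import java.util.HashSet;

public class UniqueInputPaths {
    public static void main(String[] args) {
        // Hypothetical placeholder paths for the pre-processing directories
        // the driver resolves via FrameworkUtils.
        HashSet<String> input = new HashSet<String>();
        input.add("s3://bucket/pre-processing/taxi-hour-zip");
        input.add("s3://bucket/pre-processing/weather-hour-zip");
        input.add("s3://bucket/pre-processing/taxi-hour-zip"); // duplicate, silently ignored

        // Join the unique paths into the comma-separated list, mirroring the
        // input string the driver builds for FileInputFormat.setInputPaths.
        StringBuilder preProcessingDatasets = new StringBuilder();
        for (String path : input) {
            preProcessingDatasets.append(path).append(',');
        }
        String inputPaths = preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1);
        System.out.println(inputPaths); // prints the two distinct paths
    }
}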
From source file:eqtlmappingpipeline.util.ModuleEqtlGeuvadisReplication.java
/** * @param args the command line arguments *//* ww w . j a va2 s.co m*/ public static void main(String[] args) throws IOException, LdCalculatorException { System.out.println(HEADER); System.out.println(); System.out.flush(); //flush to make sure header is before errors try { Thread.sleep(25); //Allows flush to complete } catch (InterruptedException ex) { } CommandLineParser parser = new PosixParser(); final CommandLine commandLine; try { commandLine = parser.parse(OPTIONS, args, true); } catch (ParseException ex) { System.err.println("Invalid command line arguments: " + ex.getMessage()); System.err.println(); new HelpFormatter().printHelp(" ", OPTIONS); System.exit(1); return; } final String[] genotypesBasePaths = commandLine.getOptionValues("g"); final RandomAccessGenotypeDataReaderFormats genotypeDataType; final String replicationQtlFilePath = commandLine.getOptionValue("e"); final String interactionQtlFilePath = commandLine.getOptionValue("i"); final String outputFilePath = commandLine.getOptionValue("o"); final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld")); final int window = Integer.parseInt(commandLine.getOptionValue("w")); System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths)); System.out.println("Interaction file: " + interactionQtlFilePath); System.out.println("Replication file: " + replicationQtlFilePath); System.out.println("Output: " + outputFilePath); System.out.println("LD: " + ldCutoff); System.out.println("Window: " + window); try { if (commandLine.hasOption("G")) { genotypeDataType = RandomAccessGenotypeDataReaderFormats .valueOf(commandLine.getOptionValue("G").toUpperCase()); } else { if (genotypesBasePaths[0].endsWith(".vcf")) { System.err.println( "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file."); System.exit(1); return; } try { genotypeDataType = RandomAccessGenotypeDataReaderFormats .matchFormatToPath(genotypesBasePaths[0]); } catch (GenotypeDataException e) { System.err .println("Unable to determine input 1 type based on specified path. 
Please specify -G"); System.exit(1); return; } } } catch (IllegalArgumentException e) { System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G") + "\" is not a valid input data format"); System.exit(1); return; } final RandomAccessGenotypeData genotypeData; try { genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null, 0.8); } catch (TabixFileNotFoundException e) { LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n" + "Please see README on how to create a tabix file"); System.exit(1); return; } catch (IOException e) { LOGGER.fatal("Error reading input data: " + e.getMessage(), e); System.exit(1); return; } catch (IncompatibleMultiPartGenotypeDataException e) { LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e); System.exit(1); return; } catch (GenotypeDataException e) { LOGGER.fatal("Error reading input data: " + e.getMessage(), e); System.exit(1); return; } ChrPosTreeMap<ArrayList<EQTL>> replicationQtls = new QTLTextFile(replicationQtlFilePath, false) .readQtlsAsTreeMap(); int interactionSnpNotInGenotypeData = 0; int noReplicationQtlsInWindow = 0; int noReplicationQtlsInLd = 0; int multipleReplicationQtlsInLd = 0; int replicationTopSnpNotInGenotypeData = 0; final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0'); final String[] outputLine = new String[14]; int c = 0; outputLine[c++] = "Chr"; outputLine[c++] = "Pos"; outputLine[c++] = "SNP"; outputLine[c++] = "Gene"; outputLine[c++] = "Module"; outputLine[c++] = "DiscoveryZ"; outputLine[c++] = "ReplicationZ"; outputLine[c++] = "DiscoveryZCorrected"; outputLine[c++] = "ReplicationZCorrected"; outputLine[c++] = "DiscoveryAlleleAssessed"; outputLine[c++] = "ReplicationAlleleAssessed"; outputLine[c++] = "bestLd"; outputLine[c++] = "bestLd_dist"; outputLine[c++] = "nextLd"; outputWriter.writeNext(outputLine); HashSet<String> notFound = new HashSet<>(); CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t'); interactionQtlReader.readNext();//skip header String[] interactionQtlLine; while ((interactionQtlLine = interactionQtlReader.readNext()) != null) { String snp = interactionQtlLine[1]; String chr = interactionQtlLine[2]; int pos = Integer.parseInt(interactionQtlLine[3]); String gene = interactionQtlLine[4]; String alleleAssessed = interactionQtlLine[9]; String module = interactionQtlLine[12]; double discoveryZ = Double.parseDouble(interactionQtlLine[10]); GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos); if (interactionQtlVariant == null) { System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos); ++interactionSnpNotInGenotypeData; continue; } EQTL bestMatch = null; double bestMatchR2 = Double.NaN; Ld bestMatchLd = null; double nextBestR2 = Double.NaN; ArrayList<EQTL> sameSnpQtls = replicationQtls.get(chr, pos); if (sameSnpQtls != null) { for (EQTL sameSnpQtl : sameSnpQtls) { if (sameSnpQtl.getProbe().equals(gene)) { bestMatch = sameSnpQtl; bestMatchR2 = 1; } } } NavigableMap<Integer, ArrayList<EQTL>> potentionalReplicationQtls = replicationQtls.getChrRange(chr, pos - window, true, pos + window, true); for (ArrayList<EQTL> potentialReplicationQtls : potentionalReplicationQtls.values()) { for (EQTL potentialReplicationQtl : potentialReplicationQtls) { if (!potentialReplicationQtl.getProbe().equals(gene)) { continue; } GeneticVariant potentialReplicationQtlVariant = 
genotypeData.getSnpVariantByPos( potentialReplicationQtl.getRsChr().toString(), potentialReplicationQtl.getRsChrPos()); if (potentialReplicationQtlVariant == null) { notFound.add(potentialReplicationQtl.getRsChr().toString() + ":" + potentialReplicationQtl.getRsChrPos()); ++replicationTopSnpNotInGenotypeData; continue; } Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant); double r2 = ld.getR2(); if (r2 > 1) { r2 = 1; } if (bestMatch == null) { bestMatch = potentialReplicationQtl; bestMatchR2 = r2; bestMatchLd = ld; } else if (r2 > bestMatchR2) { bestMatch = potentialReplicationQtl; nextBestR2 = bestMatchR2; bestMatchR2 = r2; bestMatchLd = ld; } } } double replicationZ = Double.NaN; double replicationZCorrected = Double.NaN; double discoveryZCorrected = Double.NaN; String replicationAlleleAssessed = null; if (bestMatch != null) { replicationZ = bestMatch.getZscore(); replicationAlleleAssessed = bestMatch.getAlleleAssessed(); if (pos != bestMatch.getRsChrPos()) { String commonHap = null; double commonHapFreq = -1; for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) { double f = hapFreq.getValue(); if (f > commonHapFreq) { commonHapFreq = f; commonHap = hapFreq.getKey(); } } String[] commonHapAlleles = StringUtils.split(commonHap, '/'); discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1; replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ : replicationZ * -1; } else { discoveryZCorrected = discoveryZ; replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ : replicationZ * -1; } } c = 0; outputLine[c++] = chr; outputLine[c++] = String.valueOf(pos); outputLine[c++] = snp; outputLine[c++] = gene; outputLine[c++] = module; outputLine[c++] = String.valueOf(discoveryZ); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected); outputLine[c++] = alleleAssessed; outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAlleleAssessed()); outputLine[c++] = String.valueOf(bestMatchR2); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getRsChrPos())); outputLine[c++] = String.valueOf(nextBestR2); outputWriter.writeNext(outputLine); } outputWriter.close(); for (String e : notFound) { System.err.println("Not found: " + e); } System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData); System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow); System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd); System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd); System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData); }
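This tool, like the neutrophil and Westra variants below, keeps a HashSet<String> named notFound of "chr:pos" keys for replication top SNPs that are absent from the genotype data; because add silently drops duplicates, each missing variant is printed exactly once after the main loop, no matter how many interaction QTLs reference it. A minimal sketch of that report-once pattern, with invented positions for illustration:

import java.util.HashSet;

public class ReportMissingOnce {
    public static void main(String[] args) {
        HashSet<String> notFound = new HashSet<>();
        // Invented chr:pos keys; in the tool above a key is added whenever
        // getSnpVariantByPos returns null for a replication top SNP.
        String[] lookups = { "1:1205034", "2:874531", "1:1205034", "2:874531" };
        for (String chrPos : lookups) {
            notFound.add(chrPos); // duplicates are ignored by the set
        }
        for (String e : notFound) {
            System.err.println("Not found: " + e); // each missing variant reported once
        }
    }
}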
From source file:eqtlmappingpipeline.util.ModuleEqtlNeutrophilReplication.java
/** * @param args the command line arguments *//* w w w . java2s . c o m*/ public static void main(String[] args) throws IOException, LdCalculatorException { System.out.println(HEADER); System.out.println(); System.out.flush(); //flush to make sure header is before errors try { Thread.sleep(25); //Allows flush to complete } catch (InterruptedException ex) { } CommandLineParser parser = new PosixParser(); final CommandLine commandLine; try { commandLine = parser.parse(OPTIONS, args, true); } catch (ParseException ex) { System.err.println("Invalid command line arguments: " + ex.getMessage()); System.err.println(); new HelpFormatter().printHelp(" ", OPTIONS); System.exit(1); return; } final String[] genotypesBasePaths = commandLine.getOptionValues("g"); final RandomAccessGenotypeDataReaderFormats genotypeDataType; final String replicationQtlFilePath = commandLine.getOptionValue("e"); final String interactionQtlFilePath = commandLine.getOptionValue("i"); final String outputFilePath = commandLine.getOptionValue("o"); final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld")); final int window = Integer.parseInt(commandLine.getOptionValue("w")); System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths)); System.out.println("Interaction file: " + interactionQtlFilePath); System.out.println("Replication file: " + replicationQtlFilePath); System.out.println("Output: " + outputFilePath); System.out.println("LD: " + ldCutoff); System.out.println("Window: " + window); try { if (commandLine.hasOption("G")) { genotypeDataType = RandomAccessGenotypeDataReaderFormats .valueOf(commandLine.getOptionValue("G").toUpperCase()); } else { if (genotypesBasePaths[0].endsWith(".vcf")) { System.err.println( "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file."); System.exit(1); return; } try { genotypeDataType = RandomAccessGenotypeDataReaderFormats .matchFormatToPath(genotypesBasePaths[0]); } catch (GenotypeDataException e) { System.err .println("Unable to determine input 1 type based on specified path. 
Please specify -G"); System.exit(1); return; } } } catch (IllegalArgumentException e) { System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G") + "\" is not a valid input data format"); System.exit(1); return; } final RandomAccessGenotypeData genotypeData; try { genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null, 0.8); } catch (TabixFileNotFoundException e) { LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n" + "Please see README on how to create a tabix file"); System.exit(1); return; } catch (IOException e) { LOGGER.fatal("Error reading input data: " + e.getMessage(), e); System.exit(1); return; } catch (IncompatibleMultiPartGenotypeDataException e) { LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e); System.exit(1); return; } catch (GenotypeDataException e) { LOGGER.fatal("Error reading input data: " + e.getMessage(), e); System.exit(1); return; } ChrPosTreeMap<ArrayList<ReplicationQtl>> replicationQtls = new ChrPosTreeMap<>(); CSVReader replicationQtlReader = new CSVReader(new FileReader(replicationQtlFilePath), '\t'); replicationQtlReader.readNext();//skip header String[] replicationLine; while ((replicationLine = replicationQtlReader.readNext()) != null) { try { GeneticVariant variant = genotypeData.getSnpVariantByPos(replicationLine[REPLICATION_SNP_CHR_COL], Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL])); if (variant == null) { continue; } ReplicationQtl replicationQtl = new ReplicationQtl(replicationLine[REPLICATION_SNP_CHR_COL], Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]), replicationLine[REPLICATION_GENE_COL], Double.parseDouble(replicationLine[REPLICATION_BETA_COL]), variant.getAlternativeAlleles().get(0).getAlleleAsString()); ArrayList<ReplicationQtl> posReplicationQtls = replicationQtls.get(replicationQtl.getChr(), replicationQtl.getPos()); if (posReplicationQtls == null) { posReplicationQtls = new ArrayList<>(); replicationQtls.put(replicationQtl.getChr(), replicationQtl.getPos(), posReplicationQtls); } posReplicationQtls.add(replicationQtl); } catch (Exception e) { System.out.println(Arrays.toString(replicationLine)); throw e; } } int interactionSnpNotInGenotypeData = 0; int noReplicationQtlsInWindow = 0; int noReplicationQtlsInLd = 0; int multipleReplicationQtlsInLd = 0; int replicationTopSnpNotInGenotypeData = 0; final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0'); final String[] outputLine = new String[14]; int c = 0; outputLine[c++] = "Chr"; outputLine[c++] = "Pos"; outputLine[c++] = "SNP"; outputLine[c++] = "Gene"; outputLine[c++] = "Module"; outputLine[c++] = "DiscoveryZ"; outputLine[c++] = "ReplicationZ"; outputLine[c++] = "DiscoveryZCorrected"; outputLine[c++] = "ReplicationZCorrected"; outputLine[c++] = "DiscoveryAlleleAssessed"; outputLine[c++] = "ReplicationAlleleAssessed"; outputLine[c++] = "bestLd"; outputLine[c++] = "bestLd_dist"; outputLine[c++] = "nextLd"; outputWriter.writeNext(outputLine); HashSet<String> notFound = new HashSet<>(); CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t'); interactionQtlReader.readNext();//skip header String[] interactionQtlLine; while ((interactionQtlLine = interactionQtlReader.readNext()) != null) { String snp = interactionQtlLine[1]; String chr = interactionQtlLine[2]; int pos = Integer.parseInt(interactionQtlLine[3]); String gene = interactionQtlLine[4]; String 
alleleAssessed = interactionQtlLine[9]; String module = interactionQtlLine[12]; double discoveryZ = Double.parseDouble(interactionQtlLine[10]); GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos); if (interactionQtlVariant == null) { System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos); ++interactionSnpNotInGenotypeData; continue; } ReplicationQtl bestMatch = null; double bestMatchR2 = Double.NaN; Ld bestMatchLd = null; double nextBestR2 = Double.NaN; ArrayList<ReplicationQtl> sameSnpQtls = replicationQtls.get(chr, pos); if (sameSnpQtls != null) { for (ReplicationQtl sameSnpQtl : sameSnpQtls) { if (sameSnpQtl.getGene().equals(gene)) { bestMatch = sameSnpQtl; bestMatchR2 = 1; } } } NavigableMap<Integer, ArrayList<ReplicationQtl>> potentionalReplicationQtls = replicationQtls .getChrRange(chr, pos - window, true, pos + window, true); for (ArrayList<ReplicationQtl> potentialReplicationQtls : potentionalReplicationQtls.values()) { for (ReplicationQtl potentialReplicationQtl : potentialReplicationQtls) { if (!potentialReplicationQtl.getGene().equals(gene)) { continue; } GeneticVariant potentialReplicationQtlVariant = genotypeData .getSnpVariantByPos(potentialReplicationQtl.getChr(), potentialReplicationQtl.getPos()); if (potentialReplicationQtlVariant == null) { notFound.add(potentialReplicationQtl.getChr() + ":" + potentialReplicationQtl.getPos()); ++replicationTopSnpNotInGenotypeData; continue; } Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant); double r2 = ld.getR2(); if (r2 > 1) { r2 = 1; } if (bestMatch == null) { bestMatch = potentialReplicationQtl; bestMatchR2 = r2; bestMatchLd = ld; } else if (r2 > bestMatchR2) { bestMatch = potentialReplicationQtl; nextBestR2 = bestMatchR2; bestMatchR2 = r2; bestMatchLd = ld; } } } double replicationZ = Double.NaN; double replicationZCorrected = Double.NaN; double discoveryZCorrected = Double.NaN; String replicationAlleleAssessed = null; if (bestMatch != null) { replicationZ = bestMatch.getBeta(); replicationAlleleAssessed = bestMatch.getAssessedAllele(); if (pos != bestMatch.getPos()) { String commonHap = null; double commonHapFreq = -1; for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) { double f = hapFreq.getValue(); if (f > commonHapFreq) { commonHapFreq = f; commonHap = hapFreq.getKey(); } } String[] commonHapAlleles = StringUtils.split(commonHap, '/'); discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1; replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ : replicationZ * -1; } else { discoveryZCorrected = discoveryZ; replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ : replicationZ * -1; } } c = 0; outputLine[c++] = chr; outputLine[c++] = String.valueOf(pos); outputLine[c++] = snp; outputLine[c++] = gene; outputLine[c++] = module; outputLine[c++] = String.valueOf(discoveryZ); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected); outputLine[c++] = alleleAssessed; outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAssessedAllele()); outputLine[c++] = String.valueOf(bestMatchR2); outputLine[c++] = bestMatch == null ? 
"NA" : String.valueOf(Math.abs(pos - bestMatch.getPos())); outputLine[c++] = String.valueOf(nextBestR2); outputWriter.writeNext(outputLine); } outputWriter.close(); for (String e : notFound) { System.err.println("Not found: " + e); } System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData); System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow); System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd); System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd); System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData); }
From source file:eqtlmappingpipeline.util.ModuleEqtWestraReplication.java
/** * @param args the command line arguments */// w w w. j a v a 2 s. c o m public static void main(String[] args) throws IOException, LdCalculatorException { System.out.println(HEADER); System.out.println(); System.out.flush(); //flush to make sure header is before errors try { Thread.sleep(25); //Allows flush to complete } catch (InterruptedException ex) { } CommandLineParser parser = new PosixParser(); final CommandLine commandLine; try { commandLine = parser.parse(OPTIONS, args, true); } catch (ParseException ex) { System.err.println("Invalid command line arguments: " + ex.getMessage()); System.err.println(); new HelpFormatter().printHelp(" ", OPTIONS); System.exit(1); return; } final String[] genotypesBasePaths = commandLine.getOptionValues("g"); final RandomAccessGenotypeDataReaderFormats genotypeDataType; final String replicationQtlFilePath = commandLine.getOptionValue("e"); final String interactionQtlFilePath = commandLine.getOptionValue("i"); final String outputFilePath = commandLine.getOptionValue("o"); final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld")); final int window = Integer.parseInt(commandLine.getOptionValue("w")); System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths)); System.out.println("Interaction file: " + interactionQtlFilePath); System.out.println("Replication file: " + replicationQtlFilePath); System.out.println("Output: " + outputFilePath); System.out.println("LD: " + ldCutoff); System.out.println("Window: " + window); try { if (commandLine.hasOption("G")) { genotypeDataType = RandomAccessGenotypeDataReaderFormats .valueOf(commandLine.getOptionValue("G").toUpperCase()); } else { if (genotypesBasePaths[0].endsWith(".vcf")) { System.err.println( "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file."); System.exit(1); return; } try { genotypeDataType = RandomAccessGenotypeDataReaderFormats .matchFormatToPath(genotypesBasePaths[0]); } catch (GenotypeDataException e) { System.err .println("Unable to determine input 1 type based on specified path. 
Please specify -G"); System.exit(1); return; } } } catch (IllegalArgumentException e) { System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G") + "\" is not a valid input data format"); System.exit(1); return; } final RandomAccessGenotypeData genotypeData; try { genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null, 0.8); } catch (TabixFileNotFoundException e) { LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n" + "Please see README on how to create a tabix file"); System.exit(1); return; } catch (IOException e) { LOGGER.fatal("Error reading input data: " + e.getMessage(), e); System.exit(1); return; } catch (IncompatibleMultiPartGenotypeDataException e) { LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e); System.exit(1); return; } catch (GenotypeDataException e) { LOGGER.fatal("Error reading input data: " + e.getMessage(), e); System.exit(1); return; } ChrPosTreeMap<ArrayList<ReplicationQtl>> replicationQtls = new ChrPosTreeMap<>(); CSVReader replicationQtlReader = new CSVReader(new FileReader(replicationQtlFilePath), '\t'); String[] replicationHeader = replicationQtlReader.readNext(); String[] replicationLine; while ((replicationLine = replicationQtlReader.readNext()) != null) { try { GeneticVariant variant = genotypeData.getSnpVariantByPos(replicationLine[REPLICATION_SNP_CHR_COL], Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL])); if (variant == null) { continue; } Alleles variantAlleles = variant.getVariantAlleles(); String[] replicationAllelesString = StringUtils.split(replicationLine[REPLICATION_ALLELES_COL], '/'); Alleles replicationAlleles = Alleles.createBasedOnString(replicationAllelesString[0], replicationAllelesString[1]); Allele assessedAlleleReplication = Allele.create(replicationLine[REPLICATION_ALLELE_ASSESSED_COL]); boolean isAmbigous = replicationAlleles.isAtOrGcSnp(); if (!variantAlleles.equals(replicationAlleles)) { if (variantAlleles.equals(replicationAlleles.getComplement())) { assessedAlleleReplication = assessedAlleleReplication.getComplement(); } else { continue; } } ReplicationQtl replicationQtl = new ReplicationQtl(replicationLine[REPLICATION_SNP_CHR_COL], Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]), replicationLine[REPLICATION_GENE_COL], Double.parseDouble(replicationLine[REPLICATION_BETA_COL]), assessedAlleleReplication.getAlleleAsString(), replicationLine, isAmbigous); ArrayList<ReplicationQtl> posReplicationQtls = replicationQtls.get(replicationQtl.getChr(), replicationQtl.getPos()); if (posReplicationQtls == null) { posReplicationQtls = new ArrayList<>(); replicationQtls.put(replicationQtl.getChr(), replicationQtl.getPos(), posReplicationQtls); } posReplicationQtls.add(replicationQtl); } catch (Exception e) { System.out.println(Arrays.toString(replicationLine)); throw e; } } int interactionSnpNotInGenotypeData = 0; int noReplicationQtlsInWindow = 0; int noReplicationQtlsInLd = 0; int multipleReplicationQtlsInLd = 0; int replicationTopSnpNotInGenotypeData = 0; final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0'); final String[] outputLine = new String[15 + EXTRA_COL_FROM_REPLICATION.length]; int c = 0; outputLine[c++] = "Chr"; outputLine[c++] = "Pos"; outputLine[c++] = "SNP"; outputLine[c++] = "Gene"; outputLine[c++] = "Module"; outputLine[c++] = "DiscoveryZ"; outputLine[c++] = "ReplicationZ"; outputLine[c++] = "DiscoveryZCorrected"; outputLine[c++] 
= "ReplicationZCorrected"; outputLine[c++] = "DiscoveryAlleleAssessed"; outputLine[c++] = "ReplicationAlleleAssessed"; outputLine[c++] = "bestLd"; outputLine[c++] = "bestLd_dist"; outputLine[c++] = "nextLd"; outputLine[c++] = "replicationAmbigous"; for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) { outputLine[c++] = replicationHeader[EXTRA_COL_FROM_REPLICATION[i]]; } outputWriter.writeNext(outputLine); HashSet<String> notFound = new HashSet<>(); CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t'); interactionQtlReader.readNext();//skip header String[] interactionQtlLine; while ((interactionQtlLine = interactionQtlReader.readNext()) != null) { String snp = interactionQtlLine[1]; String chr = interactionQtlLine[2]; int pos = Integer.parseInt(interactionQtlLine[3]); String gene = interactionQtlLine[4]; String alleleAssessed = interactionQtlLine[9]; String module = interactionQtlLine[12]; double discoveryZ = Double.parseDouble(interactionQtlLine[10]); GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos); if (interactionQtlVariant == null) { System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos); ++interactionSnpNotInGenotypeData; continue; } ReplicationQtl bestMatch = null; double bestMatchR2 = Double.NaN; Ld bestMatchLd = null; double nextBestR2 = Double.NaN; ArrayList<ReplicationQtl> sameSnpQtls = replicationQtls.get(chr, pos); if (sameSnpQtls != null) { for (ReplicationQtl sameSnpQtl : sameSnpQtls) { if (sameSnpQtl.getGene().equals(gene)) { bestMatch = sameSnpQtl; bestMatchR2 = 1; } } } NavigableMap<Integer, ArrayList<ReplicationQtl>> potentionalReplicationQtls = replicationQtls .getChrRange(chr, pos - window, true, pos + window, true); for (ArrayList<ReplicationQtl> potentialReplicationQtls : potentionalReplicationQtls.values()) { for (ReplicationQtl potentialReplicationQtl : potentialReplicationQtls) { if (!potentialReplicationQtl.getGene().equals(gene)) { continue; } GeneticVariant potentialReplicationQtlVariant = genotypeData .getSnpVariantByPos(potentialReplicationQtl.getChr(), potentialReplicationQtl.getPos()); if (potentialReplicationQtlVariant == null) { notFound.add(potentialReplicationQtl.getChr() + ":" + potentialReplicationQtl.getPos()); ++replicationTopSnpNotInGenotypeData; continue; } Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant); double r2 = ld.getR2(); if (r2 > 1) { r2 = 1; } if (bestMatch == null) { bestMatch = potentialReplicationQtl; bestMatchR2 = r2; bestMatchLd = ld; } else if (r2 > bestMatchR2) { bestMatch = potentialReplicationQtl; nextBestR2 = bestMatchR2; bestMatchR2 = r2; bestMatchLd = ld; } } } double replicationZ = Double.NaN; double replicationZCorrected = Double.NaN; double discoveryZCorrected = Double.NaN; String replicationAlleleAssessed = null; if (bestMatch != null) { replicationZ = bestMatch.getBeta(); replicationAlleleAssessed = bestMatch.getAssessedAllele(); if (pos != bestMatch.getPos()) { String commonHap = null; double commonHapFreq = -1; for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) { double f = hapFreq.getValue(); if (f > commonHapFreq) { commonHapFreq = f; commonHap = hapFreq.getKey(); } } String[] commonHapAlleles = StringUtils.split(commonHap, '/'); discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1; replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? 
replicationZ : replicationZ * -1; } else { discoveryZCorrected = discoveryZ; replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ : replicationZ * -1; //replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) || alleleAssessed.equals(String.valueOf(Utils.getComplementNucleotide(replicationAlleleAssessed.charAt(0)))) ? replicationZ : replicationZ * -1; } } c = 0; outputLine[c++] = chr; outputLine[c++] = String.valueOf(pos); outputLine[c++] = snp; outputLine[c++] = gene; outputLine[c++] = module; outputLine[c++] = String.valueOf(discoveryZ); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected); outputLine[c++] = alleleAssessed; outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAssessedAllele()); outputLine[c++] = String.valueOf(bestMatchR2); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getPos())); outputLine[c++] = String.valueOf(nextBestR2); outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.isIsAmbigous()); if (bestMatch == null) { for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) { outputLine[c++] = "NA"; } } else { for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) { outputLine[c++] = bestMatch.getLine()[EXTRA_COL_FROM_REPLICATION[i]]; } } outputWriter.writeNext(outputLine); } outputWriter.close(); for (String e : notFound) { System.err.println("Not found: " + e); } System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData); System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow); System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd); System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd); System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData); }
From source file:Main.java
// Splits 'number' into 'number_of_parts' random positive parts that sum to 'number'.
// The shared Random instance is assumed to live elsewhere in Main.java; it is
// declared here so the snippet is self-contained.
private static final Random r = new Random();

public static int[] divide(int number, int number_of_parts) {
    // The HashSet guarantees unique cut points, so no part can end up empty.
    HashSet<Integer> uniqueInts = new HashSet<Integer>();
    uniqueInts.add(0);
    uniqueInts.add(number);
    int array_size = number_of_parts + 1;
    while (uniqueInts.size() < array_size) {
        uniqueInts.add(1 + r.nextInt(number - 1));
    }
    Integer[] dividers = uniqueInts.toArray(new Integer[array_size]);
    Arrays.sort(dividers);
    int[] results = new int[number_of_parts];
    // Consecutive differences of the sorted cut points are the part sizes.
    for (int i = 1, j = 0; i < dividers.length; ++i, ++j) {
        results[j] = dividers[i] - dividers[j];
    }
    return results;
}
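A short usage sketch for divide, assuming the Random field declared above: the unique, sorted cut points partition [0, number], so every part is at least 1 and the parts always sum back to the original number.

public static void main(String[] args) {
    int[] parts = divide(100, 4);   // e.g. [12, 38, 27, 23]; values differ per run
    int sum = 0;
    for (int p : parts) {
        sum += p;                   // each part is >= 1
    }
    System.out.println(sum);        // always prints 100
}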
From source file:Main.java
public static <A> Set<A> singleSet(A object) {
    HashSet<A> set = new HashSet<A>();
    set.add(object);
    return set;
}
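singleSet returns a mutable one-element set built with a single add call. When the caller never needs to modify the result, java.util.Collections.singleton is a lighter, immutable alternative; a brief comparison:

Set<String> tags = singleSet("hadoop");        // mutable: more elements can be added later
tags.add("mapreduce");

Set<String> one = java.util.Collections.singleton("hadoop"); // fixed at one element
// one.add("mapreduce");                        // would throw UnsupportedOperationException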