List of usage examples for org.apache.hadoop.conf Configuration setLong
public void setLong(String name, long value)
Sets the value of the name property to a long.
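Before the harvested examples below, a minimal self-contained sketch of the basic pattern: storing a long-valued property with setLong and reading it back with getLong. The property name "example.timeout.ms" is made up purely for illustration.

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store a long value under a hypothetical property name.
        conf.setLong("example.timeout.ms", 30000L);
        // Read it back; the second argument is the default returned when the key is absent.
        long timeout = conf.getLong("example.timeout.ms", 60000L);
        System.out.println("example.timeout.ms = " + timeout);
    }
}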
. From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java
License:Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }
    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0l);
    }
    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);
    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }
    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));
    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));
    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use a absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive where the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);
        //mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");
    //configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);
    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);
    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}
From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java
License:Open Source License
@Override public int run(String[] argv) throws Exception { try { Configuration conf; FileSystem srcFs, outFs, fs; Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath; int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100; FileStatus[] content; ClusterStatus status; int numNodes, mapSlotsPerNode; long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime, splitSize; float inputBufpcnt; FSDataOutputStream out; FSDataInputStream in; SAMFileReader fileReader; InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler; double sampling_frequency = 0.01; // Job object can be used for Aligner job if enabled conf = getConf(); Job job = new Job(conf); parseCommandLineArgs(argv, conf); maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks(); maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks(); if (!noalign) { System.out.println("Starting Alignment Job"); startTime = System.currentTimeMillis(); status = new JobClient(new JobConf(conf)).getClusterStatus(); numNodes = status.getTaskTrackers(); // Job specific setting of number of Reducers.. if (nReducers == 0) nReducers = numNodes; conf.setInt("mapred.reduce.tasks", nReducers); Path refPath = new Path(refFileLoc); fs = refPath.getFileSystem(conf); blockSize = fs.getFileStatus(refPath).getBlockSize(); splitSize = Math.round(fs.getFileStatus(refPath).getBlockSize()); if (reads_per_split == 0) { inputPath = new Path(readFile1); long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen(); long numSplits = Math.round(readSize / splitSize); if (numSplits < maxMapTasks) numSplits = maxMapTasks; if (numSplits < nReducers) numSplits = nReducers; long numReads = Math.round(readSize / (long) fq_read_size); reads_per_split = numReads / numSplits; // Total Order Partitioner if ((double) reads_per_split <= (1 / sampling_frequency)) { sampling_frequency = 1; granularity = 1; } else if (((double) reads_per_split > (1 / sampling_frequency)) && ((double) reads_per_split <= (1 / sampling_frequency * 100))) { sampling_frequency = 0.1; granularity = 10; } } job.setJarByClass(GATKJobClient.class); job.setInputFormatClass(NLineXInputFormat.class); FileInputFormat.addInputPath(job, new Path(fqInput)); FileOutputFormat.setOutputPath(job, new Path(BWAOutPath)); DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration()); if (!is_azure) { DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"), job.getConfiguration()); } else { DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"), job.getConfiguration()); }
DistributedCache.createSymlink(job.getConfiguration()); // Setting local.cache.size - Add up the size of the files // distributed through the cache cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen() + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen(); if (!is_azure) { cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen() + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen(); } if (cacheSize > 8 * 1024 * 1024 * 1024) { conf.setLong("local.cache.size", cacheSize + (1 * 1024 * 1024 * 1024)); } conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs.. conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); // conf.setBoolean("mapred.map.tasks.speculative.execution", false); // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(BWAPartitioner.class); job.setReducerClass(BWAReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); if (job.waitForCompletion(true)) { System.out.println("BWA Alignment done"); } content = fs.listStatus(new Path(BWAOutPath)); for (int i = 0; i < content.length; i++) { if (!((content[i].getPath().getName()).endsWith(".bam")) && !((content[i].getPath().getName()).startsWith("_"))) { fs.delete(content[i].getPath(), false); } } endTime = System.currentTimeMillis(); System.out.println("BWA Alignment took: " + (endTime - startTime)); startTime = System.currentTimeMillis(); System.out.println("Starting Splitting BAM Indexing Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(BWAOutPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1"); FileOutputFormat.setOutputPath(job, output); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setInt("mapred.reduce.tasks", 0); conf.setInt("gatk.hadoop.granularity", granularity); conf.setBoolean("gatk.hadoop.issindex", true); conf.setBoolean("gatk.hadoop.isindex", false); conf.setBoolean("gatk.hadoop.ismarkdup", false); job.setMapperClass(IndexMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(NullWritable.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("SplittingBAM Indexing job done"); } output.getFileSystem(conf).delete(output, true); endTime = System.currentTimeMillis(); System.out.println("Splitting BAM Indexing took: " + (endTime - startTime)); startTime = System.currentTimeMillis(); System.out.println("Starting Sort Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); if (norealign && nomarkdup && noqrecab && novariant && !nomresults) conf.setBoolean("gatk.hadoop.ismerge", true); inputPath = new Path(BWAOutPath); 
FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath)); job.setInputFormatClass(ContigInputFormat.class); job.setPartitionerClass(ContigPartitioner.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); fs = inputPath.getFileSystem(conf); content = fs.listStatus(inputPath); for (int i = 0; i < content.length; i++) { if (content[i].getPath().getName().endsWith(".bam")) { in = fs.open(content[i].getPath()); List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader()) .getSequenceDictionary().getSequences(); conf.setInt("mapred.reduce.tasks", sequences.size()); break; } } conf.setLong("mapred.task.timeout", 86400000L); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); //conf.setBoolean("mapred.map.tasks.speculative.execution", false); //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setOutputFormatClass(SortOutputFormat.class); if (job.waitForCompletion(true)) { System.out.println("Sort completed successfully"); } endTime = System.currentTimeMillis(); System.out.println("Sort job took: " + (endTime - startTime)); } if (!norealign) { if (!noalign) BAMInputPath = SortBWAOutPath; startTime = System.currentTimeMillis(); System.out.println("Starting Indexing Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2"); FileOutputFormat.setOutputPath(job, output); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setInt("mapred.reduce.tasks", 0); conf.setBoolean("gatk.hadoop.isindex", true); conf.setBoolean("gatk.hadoop.issindex", true); conf.setBoolean("gatk.hadoop.ismarkdup", false); job.setMapperClass(IndexMapper.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Indexing job done"); } output.getFileSystem(conf).delete(output, true); endTime = System.currentTimeMillis(); System.out.println("Indexing job took: " + (endTime - startTime)); startTime = System.currentTimeMillis(); System.out.println("Starting Realigner Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(BAMInputFormat.class); srcFs = new Path(outputDir).getFileSystem(conf); if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition"))) System.out.println("mkdir failed"); inputDir = new Path(outputDir + Path.SEPARATOR + "Partition"); inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf)); partition = new Path(inputDir, "_partition"); job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partition); try { URI partitionURI 
= new URI(partition.toString() + "#_partition"); DistributedCache.addCacheFile(partitionURI, conf); } catch (URISyntaxException e) { assert false; } if (nReducers == 0) { if (!nomarkdup || !noqrecab || !novariant) { conf.setInt("mapred.reduce.tasks", maxMapTasks); } else { conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10)); } } else { conf.setInt("mapred.reduce.tasks", nReducers); } conf.setLong("mapred.task.timeout", 86400000L); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 if (nomarkdup && noqrecab && novariant && !nomresults) conf.setBoolean("gatk.hadoop.ismerge", true); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); job.setMapperClass(IndelMapper.class); job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setOutputFormatClass(SortOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(IndelOutPath)); sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency, max_splits); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler); job.setInputFormatClass(LociInputFormat.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Indel realignment done"); } endTime = System.currentTimeMillis(); System.out.println("Indel Realigner took: " + (endTime - startTime)); } if (!nomarkdup || !noqrecab || !novariant) { /* * MarkDuplicate and Indexing Job * FixMateInformation is not required as it is handled * automatically by GATK after IndelRealignment. 
*/ System.out.println("Starting MarkDup/Indexing job"); startTime = System.currentTimeMillis(); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setInt("mapred.reduce.tasks", 0); if (!nomarkdup) { System.out.println("Starting MarkDuplicates job"); conf.setBoolean("gatk.hadoop.ismarkdup", true); FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath)); } if (!noqrecab || !novariant) { conf.setBoolean("gatk.hadoop.issindex", true); conf.setBoolean("gatk.hadoop.isindex", true); if (nomarkdup) { System.out.println("Starting Indexing job"); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3")); } } job.setMapperClass(IndexMapper.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Markdup/Indexing job done !!!"); } Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"); fs = toDelete.getFileSystem(conf); if (fs.exists(toDelete)) { fs.delete(toDelete, true); } if (!nomarkdup) { Path rmdupOutPath = new Path(RmdupOutPath); fs = rmdupOutPath.getFileSystem(conf); content = fs.listStatus(rmdupOutPath); for (int i = 0; i < content.length; i++) { if ((content[i].getPath().getName()).startsWith("part")) { fs.delete(content[i].getPath(), false); } } endTime = System.currentTimeMillis(); System.out.println("MarkDuplicates took: " + (endTime - startTime)); } else { endTime = System.currentTimeMillis(); System.out.println("Indexing took: " + (endTime - startTime)); } } if (!noqrecab) { startTime = System.currentTimeMillis(); System.out.println("Starting Recal - Count Covariates Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); if (!nomarkdup) inputPath = new Path(RmdupOutPath); else if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else inputPath = new Path(BAMInputPath); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(LociInputFormat.class); conf.setLong("local.cache.size", 20106127360L); conf.setInt("mapred.reduce.tasks", 1); conf.setLong("mapred.task.timeout", 86400000L); conf.set("gatk.hadoop.outputpath", outputDir); // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1); // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1); // conf.setBoolean("mapred.map.tasks.speculative.execution", false); // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); // conf.setBoolean("mapred.compress.map.output", true); // Default compression ration 3.5:1 conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); job.setMapperClass(RecalCovMapper.class); job.setCombinerClass(RecalCovCombiner.class); job.setReducerClass(RecalCovReducer.class); job.setMapOutputKeyClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + 
"CovariateOut")); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("CountCovariates done"); } endTime = System.currentTimeMillis(); System.out.println("CountCovariates took: " + (endTime - startTime)); } if (!noqrecab || !novariant) { startTime = System.currentTimeMillis(); System.out.println("Starting Table Recalibration / Unified Genotyper Job"); if (!nomarkdup) inputPath = new Path(RmdupOutPath); else if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else inputPath = new Path(BAMInputPath); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); FileInputFormat.addInputPath(job, inputPath); if (!noqrecab) { conf.setBoolean("gatk.hadoop.recab", true); if (norealign) { job.setInputFormatClass(BAMInputFormat.class); srcFs = new Path(outputDir).getFileSystem(conf); if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition"))) System.out.println("mkdir failed"); } else { job.setInputFormatClass(LociInputFormat.class); } inputDir = new Path(outputDir + "/" + "Partition"); inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf)); partition = new Path(inputDir, "_partition"); job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partition); try { URI partitionURI = new URI(partition.toString() + "#_partition"); DistributedCache.addCacheFile(partitionURI, conf); } catch (URISyntaxException e) { assert false; } if (nReducers == 0) { conf.setInt("mapred.reduce.tasks", maxMapTasks); } else { conf.setInt("mapred.reduce.tasks", nReducers); } conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1 conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); if (!nomresults) conf.setBoolean("gatk.hadoop.ismerge", true); job.setReducerClass(SortReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SAMRecordWritable.class); job.setOutputFormatClass(SortOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(RecalOutPath)); } else { job.setInputFormatClass(LociInputFormat.class); conf.setInt("mapred.reduce.tasks", 0); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4")); } job.setMapperClass(RecalMapper.class); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setInt("dfs.datanode.socket.write.timeout", 600000); conf.setInt("dfs.socket.timeout", 600000); conf.set("gatk.hadoop.outputpath", outputDir); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); if (!novariant) { conf.setBoolean("gatk.hadoop.variant", true); if (!nofvariant) conf.setBoolean("gatk.hadoop.fvariant", true); conf.setInt("gatk.hadoop.nthreads", nThreads); 
conf.setBoolean("gatk.hadoop.xvariant", xVariantCall); } if (!noqrecab && norealign) { sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency, max_splits); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler); job.setInputFormatClass(LociInputFormat.class); } DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("TableRecalibration Job done !!"); } endTime = System.currentTimeMillis(); Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"); fs = toDelete.getFileSystem(conf); if (fs.exists(toDelete)) { fs.delete(toDelete, true); } System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime)); } if (!novariant && !nomresults) { startTime = System.currentTimeMillis(); System.out.println("Merge Variant Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut"); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); conf.setInt("mapred.reduce.tasks", 1); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); job.setReducerClass(VariantReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut")); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Merge Variants done"); } endTime = System.currentTimeMillis(); System.out.println("MergeVariant job took: " + (endTime - startTime)); if (xVariantCall && !novariant && !nomresults) { startTime = System.currentTimeMillis(); System.out.println("Merge INDEL Variant Job"); job = new Job(); job.setJarByClass(GATKJobClient.class); conf = job.getConfiguration(); inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut"); FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(WholeFileInputFormat.class); conf.setInt("mapred.reduce.tasks", 1); conf.setLong("mapred.task.timeout", 86400000L); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setBoolean("gatk.hadoop", true); conf.setBoolean("gatk.hadoop.isazure", is_azure); 
job.setReducerClass(VariantReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut")); DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration()); // Standard inputs DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration()); DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); if (job.waitForCompletion(true)) { System.out.println("Merge INDEL Variants done"); } endTime = System.currentTimeMillis(); System.out.println("MergeINDELVariant job took: " + (endTime - startTime)); } } if (!nomresults) { startTime = System.currentTimeMillis(); System.out.println("Starting Merge BAM Job"); outputPath = new Path(FinalBAMPath); outFs = outputPath.getFileSystem(conf); if (!outFs.mkdirs(outputPath)) System.out.println("mkdir failed"); // Currently no support to merge output from MarkDuplicates // from Job Client. Need to have a separate MR job for it. if (!noqrecab) inputPath = new Path(RecalOutPath); else if (!norealign) inputPath = new Path(IndelOutPath); else if (!noalign) inputPath = new Path(SortBWAOutPath); else if (!nomarkdup) throw new Exception("Merge not implemented MarkDuplicates output."); else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant) inputPath = new Path(BAMInputPath); fs = inputPath.getFileSystem(conf); content = fs.listStatus(inputPath); mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam"); Path p = null; int nfiles = 0; for (int i = 0; i < content.length; i++) { p = content[i].getPath(); ++nfiles; } if (nfiles == 1) { boolean rename = fs.rename(p, mergeOutFile); } else { out = outFs.create(mergeOutFile, true); for (int i = 0; i < content.length; i++) { p = content[i].getPath(); if ((p.getName()).endsWith(".bam")) { in = fs.open(p); IOUtils.copyBytes(in, out, conf, false); in.close(); } } out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); out.close(); } endTime = System.currentTimeMillis(); System.out.println("Final Merge took: " + (endTime - startTime)); } System.out.println("JobCompleted"); } catch (IOException e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } catch (InterruptedException e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } catch (ClassNotFoundException e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } catch (Exception e) { System.err.printf("Hadoop Error : %s\n", e); return -1; } return 0; }
From source file:org.lilyproject.hadooptestfw.HBaseTestingUtilityFactory.java
License:Apache License
/**
 * Creates an HBaseTestingUtility with settings applied such that everything will be stored below the
 * supplied directory and makes (to some extent) use of standard port numbers.
 *
 * @param conf HBase conf to use, as created by HBaseConfiguration.create().
 * @param tmpDir directory under which data of dfs, zookeeper, mr, ... will be stored
 * @param clearData can data be cleared (at startup or shutdown), use true unless you need the data from a previous run
 */
public static HBaseTestingUtility create(Configuration conf, File tmpDir, boolean clearData) throws IOException {
    // This location will be used for dfs, zookeeper, ...
    conf.set(TEST_DIR_KEY, createSubDir(tmpDir, "hbase-test-util"));
    // This property is picked up by our fork of MiniMRCluster (the default implementation was hardcoded
    // to use build/test/mapred/local)
    System.setProperty("mapred.local.dir", createSubDir(tmpDir, "mapred-local"));
    conf.set("mapred.local.dir", createSubDir(tmpDir, "mapred-local"));
    // Properties used for MiniMRCluster
    conf.set("hadoop.log.dir", createSubDir(tmpDir, "hadoop-logs"));
    conf.set("hadoop.tmp.dir", createSubDir(tmpDir, "mapred-output"));
    conf.set("mapred.system.dir", "/tmp/hadoop/mapred/system");
    conf.set("mapreduce.jobtracker.staging.root.dir", "/tmp/hadoop/mapred/staging");
    // Only use one MR child VM, should be lighter on developer machines
    conf.set("mapred.tasktracker.map.tasks.maximum", "1");
    // Force default port numbers
    conf.set("hbase.master.info.port", "60010");
    conf.set("hbase.regionserver.info.port", "60030");
    // Allow more clients to connect concurrently (HBase default is 10)
    conf.set("hbase.regionserver.handler.count", "30");
    // Allow more clients to connect concurrently to hdfs (default is 3)
    conf.set("dfs.datanode.handler.count", "6");
    // Generic performance related settings
    conf.set("io.file.buffer.size", "65536");
    conf.set("hbase.hregion.memstore.flush.size", "268435456");
    // Disable the automatic closing of Hadoop FileSystem objects by its shutdown hook.
    // Otherwise, when stopping 'launch-test-lily' (LilyLauncher), the shutdown hook closes the filesystem
    // before HBase had the opportunity to flush its data. This then leads to (possibly long) recoveries
    // on the next startup (and even then, I've seen data loss, maybe sync is not active for the mini cluster?).
    conf.set("fs.automatic.close", "false");
    // Replication parameters needed for the SEP
    conf.set("hbase.replication", "true");
    conf.setFloat("replication.source.ratio", 1.0f);
    conf.set("replication.source.nb.capacity", "200");
    conf.set("replication.replicationsource.implementation", "com.ngdata.sep.impl.SepReplicationSource");
    // make replication react a little quicker
    conf.setLong("replication.source.sleepforretries", 200);
    // make retries in ZooKeeper a little quicker
    // This was added with CDH 4.2, where on shutdown HBase's snapshot manager closed a zookeeper
    // connection which later on was still used by another component, which then got into a retry loop,
    // leading to a slow shutdown.
    conf.setInt("zookeeper.recovery.retry.intervalmill", 100);
    return new HBaseTestingUtility(conf, clearData);
}
From source file:org.mrgeo.data.vector.VectorInputFormatContext.java
License:Apache License
public void save(final Configuration conf) {
    conf.setInt(INPUTS_COUNT, inputs.size());
    int inputIndex = 0;
    for (String input : inputs) {
        conf.set(INPUTS_PREFIX + inputIndex, input);
        inputIndex++;
    }
    conf.setLong(FEATURE_COUNT_KEY, featureCount);
    conf.setInt(MIN_FEATURES_PER_SPLIT_KEY, minFeaturesPerSplit);
    conf.set(PROVIDER_PROPERTY_KEY, ProviderProperties.toDelimitedString(inputProviderProperties));
}
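For context, values written by save() are normally read back from the same Configuration keys on the task side. The load() counterpart below is only a sketch, not part of the MrGeo source above; the field assignments, collection type, and default values are assumptions, and the provider-properties round trip is omitted.

// Hypothetical counterpart to save(); defaults and field types chosen only for illustration.
public void load(final Configuration conf) {
    final int inputCount = conf.getInt(INPUTS_COUNT, 0);
    inputs = new ArrayList<String>(inputCount);
    for (int i = 0; i < inputCount; i++) {
        inputs.add(conf.get(INPUTS_PREFIX + i));
    }
    // getLong is the retrieval side of setLong; the second argument is the default when the key is missing.
    featureCount = conf.getLong(FEATURE_COUNT_KEY, -1L);
    minFeaturesPerSplit = conf.getInt(MIN_FEATURES_PER_SPLIT_KEY, 1);
}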
From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java
License:Apache License
private boolean buildTiles() {
    try {
        final Job job = new Job(config);
        HadoopUtils.setJar(job, this.getClass());
        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());
        final String jobName = "BuildTiles_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);
        final Configuration conf = job.getConfiguration();
        conf.setInt(ZOOMLEVEL, zoomlevel);
        final int tilesize = Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT));
        conf.setInt(TILESIZE, tilesize);
        conf.set(OUTPUT, tmpDir.toString());
        conf.setInt(GRANULATIRY, granularity);
        conf.setLong(LATOFFSET, latOffset);
        conf.setLong(LONOFFSET, lonOffset);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        final Path tilesPath = new Path(tmpDir, TILEIDS + "/*/part*");
        HadoopVectorUtils.addInputPath(job, tilesPath);
        job.setReducerClass(ProcessTilesReducer.class);
        final Path output = new Path(tmpDir, VECTORTILES);
        HadoopFileUtils.delete(output);
        MrsImageOutputFormatProvider ofProvider = MrsImageDataProvider.setupMrsPyramidOutputFormat(job,
                output.toString(), datasetBounds, zoomlevel, tilesize, protectionLevel, providerProperties);
        //FileOutputFormat.setOutputPath(job, outputWithZoom);
        job.setMapOutputKeyClass(TileIdWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(VectorTileWritable.class);
        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);
            if (success) {
                ofProvider.teardown(job);
                MrsVectorPyramid.calculateMetadata(output.toString(), zoomlevel, tilesize, datasetBounds,
                        protectionLevel);
                return true;
            }
        } catch (final InterruptedException e) {
            e.printStackTrace();
        } catch (final ClassNotFoundException e) {
            e.printStackTrace();
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}
From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java
License:Apache License
private boolean processNodes() {
    try {
        final Job job = new Job(config);
        HadoopUtils.setJar(job, this.getClass());
        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());
        final String jobName = "ProcesNodes_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);
        final Configuration conf = job.getConfiguration();
        conf.setInt(ZOOMLEVEL, zoomlevel);
        conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));
        conf.set(OUTPUT, tmpDir.toString());
        conf.setInt(GRANULATIRY, granularity);
        conf.setLong(LATOFFSET, latOffset);
        conf.setLong(LONOFFSET, lonOffset);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        final Path nodesPath = new Path(tmpDir, NODES);
        HadoopVectorUtils.addInputPath(job, nodesPath);
        job.setReducerClass(ProcessNodesReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        final Path output = new Path(tmpDir, TILEIDS + "/" + NODES);
        HadoopFileUtils.delete(output);
        FileOutputFormat.setOutputPath(job, output);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(Text.class);
        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);
            if (success) {
                return true;
            }
        } catch (final InterruptedException e) {
            e.printStackTrace();
        } catch (final ClassNotFoundException e) {
            e.printStackTrace();
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}
From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java
License:Apache License
private boolean processRelations() {
    try {
        int runCnt = 1;
        while (true) {
            final Job job = new Job(config);
            HadoopUtils.setJar(job, this.getClass());
            final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());
            final String jobName = "ProcesRelations_" + runCnt + "_" + now + "_" + UUID.randomUUID().toString();
            job.setJobName(jobName);
            final Configuration conf = job.getConfiguration();
            conf.setInt(ZOOMLEVEL, zoomlevel);
            conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance().getProperty(
                    MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));
            conf.set(OUTPUT, tmpDir.toString());
            conf.setInt(GRANULATIRY, granularity);
            conf.setLong(LATOFFSET, latOffset);
            conf.setLong(LONOFFSET, lonOffset);
            conf.setInt(RELATION_RUN, runCnt);
            job.setInputFormatClass(SequenceFileInputFormat.class);
            final Path relationsPath;
            if (runCnt <= 1) {
                relationsPath = new Path(tmpDir, RELATIONS);
            } else {
                relationsPath = new Path(tmpDir, RELATIONS + "_" + (runCnt - 1));
            }
            HadoopVectorUtils.addInputPath(job, relationsPath);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            final Path output = new Path(tmpDir, TILEIDS + "/" + RELATIONS + "_" + runCnt);
            HadoopFileUtils.delete(output);
            FileOutputFormat.setOutputPath(job, output);
            job.setReducerClass(ProcessRelationsReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(TileIdWritable.class);
            job.setOutputValueClass(Text.class);
            boolean success = false;
            try {
                job.submit();
                success = job.waitForCompletion(true);
            } catch (final InterruptedException e) {
                e.printStackTrace();
            } catch (final ClassNotFoundException e) {
                e.printStackTrace();
            }
            if (success) {
                final Path rp = new Path(tmpDir, RELATIONS + "_" + runCnt);
                // did we make a relations file?
                if (!HadoopFileUtils.exists(rp)) {
                    return true;
                }
            }
            runCnt++;
            if (runCnt > 5) {
                return true;
            }
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}
From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java
License:Apache License
private boolean processWays() {
    try {
        final Job job = new Job(config);
        HadoopUtils.setJar(job, this.getClass());
        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());
        final String jobName = "ProcesWays_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);
        final Configuration conf = job.getConfiguration();
        conf.setInt(ZOOMLEVEL, zoomlevel);
        conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));
        conf.set(OUTPUT, tmpDir.toString());
        conf.setInt(GRANULATIRY, granularity);
        conf.setLong(LATOFFSET, latOffset);
        conf.setLong(LONOFFSET, lonOffset);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        final Path waysPath = new Path(tmpDir, WAYS);
        HadoopVectorUtils.addInputPath(job, waysPath);
        job.setReducerClass(ProcessWaysReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        final Path output = new Path(tmpDir, TILEIDS + "/" + WAYS);
        HadoopFileUtils.delete(output);
        FileOutputFormat.setOutputPath(job, output);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(Text.class);
        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);
            if (success) {
                return true;
            }
        } catch (final InterruptedException e) {
            e.printStackTrace();
        } catch (final ClassNotFoundException e) {
            e.printStackTrace();
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}
From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraGenJob.java
License:Apache License
static void setNumberOfRows(Configuration job, long numRows) { job.setLong("terasort.num-rows", numRows); }
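A setter like this is typically paired with a getLong-based accessor so the job's mappers can recover the row count from the same key. The companion below is a sketch of that pattern rather than verbatim TeraGen source; the default of 0 is an assumption chosen for illustration.

// Sketch of the matching accessor: returns the stored row count, or 0 if the key was never set.
static long getNumberOfRows(Configuration job) {
    return job.getLong("terasort.num-rows", 0);
}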
From source file:org.unigram.likelike.lsh.function.TestMinWiseFunction.java
License:Apache License
private MinWiseFunction createFunction(int depth, long seed) { Configuration conf = new Configuration(); conf.setLong(SelectClustersMapper.MINWISE_HASH_SEEDS, seed); conf.setInt(LikelikeConstants.FEATURE_DEPTH, depth); return new MinWiseFunction(conf); }