List of usage examples for org.apache.hadoop.fs.FileSystem.get
public static FileSystem get(URI uri, Configuration conf) throws IOException
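All of the examples below resolve a FileSystem for an explicit URI rather than relying on the default fs.defaultFS. A minimal, self-contained sketch of that pattern is shown here; the HDFS host, port, and path are hypothetical placeholders, not taken from the examples:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The URI scheme selects the file system implementation; "file:///" gives the local FS.
        FileSystem localFs = FileSystem.get(new URI("file:///"), conf);
        // A hypothetical HDFS URI; the namenode host and port are placeholders.
        FileSystem hdfs = FileSystem.get(new URI("hdfs://namenode:8020/"), conf);
        Path p = new Path("/tmp/example.txt"); // hypothetical path
        System.out.println("local exists: " + localFs.exists(p));
        System.out.println("hdfs exists: " + hdfs.exists(p));
        // Note: FileSystem.get() returns cached instances by default, so most code
        // leaves them open instead of closing the shared instance.
    }
}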
From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java
License:Apache License
/**
 * Reads the singleton items with their tid lists from the specified file.
 *
 * @param conf
 * @param path
 * @return
 * @throws IOException
 * @throws URISyntaxException
 */
private static List<Item> readTidLists(Configuration conf, Path path) throws IOException, URISyntaxException {
    SequenceFile.Reader r = new SequenceFile.Reader(FileSystem.get(new URI("file:///"), conf), path, conf);
    List<Item> items = newArrayList();
    IntWritable key = new IntWritable();
    IntMatrixWritable value = new IntMatrixWritable();
    while (r.next(key, value)) {
        final int[][] tids = value.toIntMatrix();
        int support = 0;
        for (int[] partTids : tids) {
            if (partTids != null) {
                support += partTids.length;
            }
        }
        items.add(new Item(key.get(), support, tids));
    }
    r.close();
    return items;
}
From source file:be.ugent.intec.halvade.hadoop.mapreduce.GATKReducer.java
License:Open Source License
protected void elPrepPreprocess(Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
        throws InterruptedException, IOException, QualityException, URISyntaxException {
    String dictF = ref.substring(0, ref.lastIndexOf('.')) + ".dict";
    String rg = createReadGroupRecordString(RGID, RGLB, RGPL, RGPU, RGSM);
    String preSamOut = tmpFileBase + "-p1.sam";
    String samOut = tmpFileBase + "-p2.sam";
    String fCounts = tmpFileBase + "-features.count";
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    Logger.DEBUG("call elPrep");
    context.setStatus("call elPrep");
    int reads;
    if (keep) {
        reads = tools.callElPrep(preSamOut, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    } else {
        reads = tools.streamElPrep(context, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    }
    Logger.DEBUG(reads + " reads processed in elPrep");
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    if (gff != null) {
        Logger.DEBUG("featureCounts");
        context.setStatus("featureCounts");
        tools.runFeatureCounts(gff, samOut, fCounts, threads);
        HalvadeFileUtils.uploadFileToHDFS(context,
                FileSystem.get(new URI(outputdir), context.getConfiguration()),
                fCounts, outputdir + context.getTaskAttemptID().toString() + ".count");
    }
    context.setStatus("convert SAM to BAM");
    Logger.DEBUG("convert SAM to BAM");
    tools.callSAMToBAM(samOut, output, threads);
    context.setStatus("build bam index");
    Logger.DEBUG("build bam index");
    tools.runBuildBamIndex(output);
    // remove temporary files
    HalvadeFileUtils.removeLocalFile(keep, preSamOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, samOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
}
From source file:be.ugent.intec.halvade.hadoop.mapreduce.GATKReducer.java
License:Open Source License
protected void PicardPreprocess(Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
        throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
        outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));
    long startTime = System.currentTimeMillis();
    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
        sam = input.next();
        writer.addAlignment(sam);
        count++;
    }
    int reads = count;
    writer.close();
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);
    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);
    if (gff != null) {
        // tmpOut3 is sam for htseq count!
        Logger.DEBUG("featureCounts");
        context.setStatus("featureCounts");
        tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
        HalvadeFileUtils.uploadFileToHDFS(context,
                FileSystem.get(new URI(outputdir), context.getConfiguration()),
                fCounts, outputdir + context.getTaskAttemptID().toString() + ".count");
    }
    if (!inputIsBam) {
        Logger.DEBUG("add read-group");
        context.setStatus("add read-group");
        tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
        context.setStatus("convert SAM to BAM");
        Logger.DEBUG("convert SAM to BAM");
        tools.callSAMToBAM(tmpOut3, output, threads);
    }
    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);
    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);
    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
}
From source file:be.ugent.intec.halvade.hadoop.mapreduce.GATKReducer.java
License:Open Source License
protected String makeRegionFile(Context context, ChromosomeRange r, PreprocessingTools tools, String region)
        throws URISyntaxException, IOException, InterruptedException {
    // if no exome bed file is set, write the region directly; for exome runs, filter the region on the exome bed file
    if (filterBedFile == null) {
        r.writeToPicardRegionFile(region);
    } else {
        String exomebed = tmpFileBase + "exome.bed";
        if (filterBedFile.endsWith(".gz")) {
            exomebed += ".gz";
        }
        HalvadeFileUtils.downloadFileFromHDFS(context,
                FileSystem.get(new URI(filterBedFile), context.getConfiguration()),
                filterBedFile, exomebed);
        if (exomebed.endsWith(".gz")) {
            exomebed = HalvadeFileUtils.Unzip(exomebed);
        }
        region = tools.filterExomeBed(exomebed, r);
        if (region == null) {
            Logger.DEBUG("empty region file, no vcf results!!");
            return null;
        }
        HalvadeFileUtils.removeLocalFile(keep, exomebed);
    }
    return region;
}
From source file:be.ugent.intec.halvade.hadoop.mapreduce.HalvadeReducer.java
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    super.cleanup(context);
    Logger.DEBUG("count: " + count);
    String output = null;
    if (variantFiles.size() > 1) {
        // should not happen -> multiple keys per reducer
        GATKTools gatk = new GATKTools(ref, bin);
        gatk.setThreads(threads);
        gatk.setContext(context);
        if (java != null)
            gatk.setJava(java);
        output = tmp + context.getTaskAttemptID().toString() + ".vcf";
        Logger.DEBUG("run CombineVariants");
        gatk.runCombineVariants(variantFiles.toArray(new String[variantFiles.size()]), output, ref);
        context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    } else if (variantFiles.size() == 1) {
        output = variantFiles.get(0);
    }
    if (output != null && checkVcfIsNotEmpty(output)) {
        try {
            HalvadeFileUtils.uploadFileToHDFS(context,
                    FileSystem.get(new URI(outputdir), context.getConfiguration()),
                    output, outputdir + context.getTaskAttemptID().toString() + ".vcf");
            HalvadeFileUtils.uploadFileToHDFS(context,
                    FileSystem.get(new URI(outputdir), context.getConfiguration()),
                    output + ".idx", outputdir + context.getTaskAttemptID().toString() + ".vcf.idx");
        } catch (URISyntaxException ex) {
            Logger.EXCEPTION(ex);
            throw new InterruptedException();
        }
    } else if (output != null) {
        Logger.DEBUG("empty vcf file, not uploaded to vcf to avoid error when merging.");
    }
    // delete the files from local scratch
    if (variantFiles.size() > 1) {
        for (String snps : variantFiles) {
            HalvadeFileUtils.removeLocalFile(keep, snps, context, HalvadeCounters.FOUT_GATK_VCF);
            HalvadeFileUtils.removeLocalFile(keep, snps + ".idx");
        }
    }
    if (output != null) {
        HalvadeFileUtils.removeLocalFile(keep, output, context, HalvadeCounters.FOUT_GATK_VCF);
        HalvadeFileUtils.removeLocalFile(keep, output + ".idx");
    }
}
From source file:be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.java
License:Open Source License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    Logger.DEBUG("total count: " + totalValCount);
    Logger.DEBUG("total keys: " + totalKeyCount);
    //for (Integer count : keyFactors) {
    //    int factor = Math.min(1, count / avg + 1);
    //    Logger.DEBUG("count: " + count + " factor: " + factor + " new count: " + (count / factor));
    //}
    FileSystem fs = null;
    try {
        fs = FileSystem.get(new URI(out), context.getConfiguration());
    } catch (URISyntaxException ex) {
        Logger.EXCEPTION(ex);
    }
    bw.close();
    File mergeFile = new File(mergeJS);
    Logger.DEBUG("written " + count + " lines to " + mergeJS);
    HalvadeFileUtils.uploadFileToHDFS(context, fs, mergeFile.getAbsolutePath(), out + mergeFile.getName());
    // build new genome ref
    String newGenomeDir = refDir + jobId + "-nsg/";
    File starOut = new File(newGenomeDir);
    starOut.mkdirs();
    long time = STARInstance.rebuildStarGenome(context, bin, newGenomeDir, ref, mergeJS, overhang, threads, mem);
    context.getCounter(HalvadeCounters.TIME_STAR_BUILD).increment(time);
    // upload to outputdir
    String pass2GenDir = HalvadeConf.getStarDirPass2HDFS(context.getConfiguration());
    File pass2check = new File(newGenomeDir + HalvadeFileUtils.HALVADE_STAR_SUFFIX_P2);
    pass2check.createNewFile();
    if (requireUploadToHDFS) {
        Logger.DEBUG("Uploading STAR genome to parallel filesystem...");
        fs.mkdirs(new Path(pass2GenDir));
        File[] genFiles = starOut.listFiles();
        for (File gen : genFiles) {
            HalvadeFileUtils.uploadFileToHDFS(context, fs, gen.getAbsolutePath(), pass2GenDir + gen.getName());
        }
        Logger.DEBUG("Finished uploading new reference to " + pass2GenDir);
    }
    HalvadeFileUtils.removeLocalFile(mergeJS);
}
From source file:be.ugent.intec.halvade.hadoop.mapreduce.VCFCombineReducer.java
License:Open Source License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    try {
        // read header from input
        outpFormat = new KeyIgnoringVCFOutputFormat(VCFFormat.VCF);
        String input = HalvadeConf.getInputDir(context.getConfiguration());
        String output = HalvadeConf.getOutDir(context.getConfiguration());
        reportBest = HalvadeConf.getReportAllVariant(context.getConfiguration());
        FileSystem fs = FileSystem.get(new URI(input), context.getConfiguration());
        Path firstVcfFile = null;
        if (fs.getFileStatus(new Path(input)).isDirectory()) {
            // get first file
            FileStatus[] files = fs.listStatus(new Path(input));
            int i = 0, l = files.length;
            while (i < l && !files[i].getPath().getName().endsWith(".vcf")) {
                i++;
            }
            if (i < l) {
                firstVcfFile = files[i].getPath();
            } else {
                throw new InterruptedException("VCFCombineReducer: No files in input folder.");
            }
        } else {
            throw new InterruptedException("VCFCombineReducer: Input directory is not a directory.");
        }
        Logger.DEBUG("first file: " + firstVcfFile);
        outpFormat.readHeaderFrom(firstVcfFile, fs);
        recordWriter = outpFormat.getRecordWriter(context, new Path(output + "HalvadeCombined.vcf"));
    } catch (URISyntaxException ex) {
        Logger.EXCEPTION(ex);
        throw new InterruptedException("URI for input directory is invalid.");
    }
}
From source file:be.ugent.intec.halvade.HalvadeOptions.java
License:Open Source License
protected double getInputSize(String input, Configuration conf) throws URISyntaxException, IOException {
    double size = 0;
    FileSystem fs = FileSystem.get(new URI(input), conf);
    if (fs.getFileStatus(new Path(input)).isDirectory()) {
        // add every file in directory
        FileStatus[] files = fs.listStatus(new Path(input));
        for (FileStatus file : files) {
            if (!file.isDirectory()) {
                size += file.getLen();
            }
        }
    } else {
        size += fs.getFileStatus(new Path(input)).getLen();
    }
    return (size / (1024 * 1024 * 1024));
}
From source file:be.ugent.intec.halvade.HalvadeOptions.java
License:Open Source License
protected void parseDictFile(Configuration conf) {
    be.ugent.intec.halvade.utils.Logger.DEBUG("parsing dictionary " + ref + DICT_SUFFIX);
    try {
        FileSystem fs = FileSystem.get(new URI(ref + DICT_SUFFIX), conf);
        FSDataInputStream stream = fs.open(new Path(ref + DICT_SUFFIX));
        String line = getLine(stream); // header
        dict = new SAMSequenceDictionary();
        line = getLine(stream);
        while (line != null) {
            String[] lineData = line.split("\\s+");
            String seqName = lineData[1].substring(lineData[1].indexOf(':') + 1);
            int seqLength = 0;
            try {
                seqLength = Integer.parseInt(lineData[2].substring(lineData[2].indexOf(':') + 1));
            } catch (NumberFormatException ex) {
                be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
            }
            SAMSequenceRecord seq = new SAMSequenceRecord(seqName, seqLength);
            // Logger.DEBUG("name: " + seq.getSequenceName() + " length: " + seq.getSequenceLength());
            dict.addSequence(seq);
            line = getLine(stream);
        }
        HalvadeConf.setSequenceDictionary(conf, dict);
    } catch (URISyntaxException | IOException ex) {
        be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
    }
}
From source file:be.ugent.intec.halvade.MapReduceRunner.java
License:Open Source License
protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    HalvadeConf.setIsPass2(pass1Conf, false);
    HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, true,
            halvadeOpts.useBamInput);
    Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline");
    pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf);
    try {
        if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) {
            // add every file in directory
            FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in));
            for (FileStatus file : files) {
                if (!file.isDirectory()) {
                    FileInputFormat.addInputPath(pass1Job, file.getPath());
                }
            }
        } else {
            FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in));
        }
    } catch (IOException | IllegalArgumentException e) {
        Logger.EXCEPTION(e);
    }
    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf);
    boolean skipPass1 = false;
    if (outFs.exists(new Path(tmpOutDir))) {
        // check if genome already exists
        skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS"));
        if (skipPass1)
            Logger.DEBUG("pass1 genome already created, skipping pass 1");
        else {
            Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
            Logger.INFO("ERROR: Please remove this directory before trying again.");
            System.exit(-2);
        }
    }
    if (!skipPass1) {
        FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir));
        pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);
        pass1Job.setInputFormatClass(HalvadeTextInputFormat.class);
        pass1Job.setMapOutputKeyClass(GenomeSJ.class);
        pass1Job.setMapOutputValueClass(Text.class);
        pass1Job.setSortComparatorClass(GenomeSJSortComparator.class);
        pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class);
        pass1Job.setNumReduceTasks(1);
        pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class);
        pass1Job.setOutputKeyClass(LongWritable.class);
        pass1Job.setOutputValueClass(Text.class);
        return runTimedJob(pass1Job, "Halvade pass 1 Job");
    } else
        return 0;
}