Example usage for org.apache.hadoop.fs FileSystem get

List of usage examples for org.apache.hadoop.fs FileSystem get

Introduction

On this page you can find usage examples for org.apache.hadoop.fs.FileSystem.get(URI, Configuration).

Prototype

public static FileSystem get(URI uri, Configuration conf) throws IOException 

Document

Get a FileSystem for this URI's scheme and authority.
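
Before the project examples below, here is a minimal, self-contained sketch of the call. The HDFS URI hdfs://namenode:8020/ and the root-directory listing are placeholders for illustration only; they are not taken from any of the examples on this page.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Resolve the FileSystem implementation for the URI's scheme and authority.
        // FileSystem.get caches instances per scheme, authority and user, so the
        // returned object may be shared; avoid closing it in long-running services.
        FileSystem fs = FileSystem.get(new URI("hdfs://namenode:8020/"), conf);
        // List the root directory of the resolved file system.
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
    }
}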

Usage

From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java

License:Apache License

/**
 * Reads the singleton items with their tid lists from the specified file.
 * @param conf the Hadoop configuration used to resolve the local file system
 * @param path the path of the sequence file containing the tid lists
 * @return the list of singleton items with their supports and tid lists
 * @throws IOException
 * @throws URISyntaxException
 */
private static List<Item> readTidLists(Configuration conf, Path path) throws IOException, URISyntaxException {
    SequenceFile.Reader r = new SequenceFile.Reader(FileSystem.get(new URI("file:///"), conf), path, conf);

    List<Item> items = newArrayList();

    IntWritable key = new IntWritable();
    IntMatrixWritable value = new IntMatrixWritable();

    while (r.next(key, value)) {
        final int[][] tids = value.toIntMatrix();
        int support = 0;

        for (int[] partTids : tids) {
            if (partTids != null) {
                support += partTids.length;
            }
        }

        items.add(new Item(key.get(), support, tids));
    }
    r.close();

    return items;
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.GATKReducer.java

License:Open Source License

protected void elPrepPreprocess(Context context, PreprocessingTools tools, SAMRecordIterator input,
        String output) throws InterruptedException, IOException, QualityException, URISyntaxException {
    String dictF = ref.substring(0, ref.lastIndexOf('.')) + ".dict";
    String rg = createReadGroupRecordString(RGID, RGLB, RGPL, RGPU, RGSM);
    String preSamOut = tmpFileBase + "-p1.sam";
    String samOut = tmpFileBase + "-p2.sam";
    String fCounts = tmpFileBase + "-features.count";

    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    Logger.DEBUG("call elPrep");
    context.setStatus("call elPrep");
    int reads;
    if (keep) {
        reads = tools.callElPrep(preSamOut, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    } else {
        reads = tools.streamElPrep(context, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    }

    Logger.DEBUG(reads + " reads processed in elPrep");
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);

    if (gff != null) {
        Logger.DEBUG("featureCounts");
        context.setStatus("featureCounts");
        tools.runFeatureCounts(gff, samOut, fCounts, threads);
        HalvadeFileUtils.uploadFileToHDFS(context,
                FileSystem.get(new URI(outputdir), context.getConfiguration()), fCounts,
                outputdir + context.getTaskAttemptID().toString() + ".count");
    }
    context.setStatus("convert SAM to BAM");
    Logger.DEBUG("convert SAM to BAM");
    tools.callSAMToBAM(samOut, output, threads);
    context.setStatus("build bam index");
    Logger.DEBUG("build bam index");
    tools.runBuildBamIndex(output);
    // remove temporary files
    HalvadeFileUtils.removeLocalFile(keep, preSamOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, samOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.GATKReducer.java

License:Open Source License

protected void PicardPreprocess(Context context, PreprocessingTools tools, SAMRecordIterator input,
        String output) throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
        outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

    long startTime = System.currentTimeMillis();

    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
        sam = input.next();
        writer.addAlignment(sam);
        count++;
    }
    int reads = count;
    writer.close();

    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

    if (gff != null) {
        // tmpOut3 is sam for htseq count!        
        Logger.DEBUG("featureCounts");
        context.setStatus("featureCounts");
        tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
        HalvadeFileUtils.uploadFileToHDFS(context,
                FileSystem.get(new URI(outputdir), context.getConfiguration()), fCounts,
                outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    if (!inputIsBam) {
        Logger.DEBUG("add read-group");
        context.setStatus("add read-group");
        tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
        context.setStatus("convert SAM to BAM");
        Logger.DEBUG("convert SAM to BAM");
        tools.callSAMToBAM(tmpOut3, output, threads);
    }

    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);

    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);

    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.GATKReducer.java

License:Open Source License

protected String makeRegionFile(Context context, ChromosomeRange r, PreprocessingTools tools, String region)
        throws URISyntaxException, IOException, InterruptedException {
    // if no exome BED filter is given, write the region file directly; otherwise filter the region on the exome BED file
    if (filterBedFile == null) {
        r.writeToPicardRegionFile(region);
    } else {
        String exomebed = tmpFileBase + "exome.bed";
        if (filterBedFile.endsWith(".gz")) {
            exomebed += ".gz";
        }
        HalvadeFileUtils.downloadFileFromHDFS(context,
                FileSystem.get(new URI(filterBedFile), context.getConfiguration()), filterBedFile, exomebed);
        if (exomebed.endsWith(".gz")) {
            exomebed = HalvadeFileUtils.Unzip(exomebed);
        }
        region = tools.filterExomeBed(exomebed, r);
        if (region == null) {
            Logger.DEBUG("empty region file, no vcf results!!");
            return null;
        }
        HalvadeFileUtils.removeLocalFile(keep, exomebed);
    }
    return region;
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.HalvadeReducer.java

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    super.cleanup(context);
    Logger.DEBUG("count: " + count);
    String output = null;
    if (variantFiles.size() > 1) { // should not happen -> multiple keys per reducer
        GATKTools gatk = new GATKTools(ref, bin);
        gatk.setThreads(threads);
        gatk.setContext(context);
        if (java != null)
            gatk.setJava(java);
        output = tmp + context.getTaskAttemptID().toString() + ".vcf";
        Logger.DEBUG("run CombineVariants");
        gatk.runCombineVariants(variantFiles.toArray(new String[variantFiles.size()]), output, ref);
        context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    } else if (variantFiles.size() == 1) {
        output = variantFiles.get(0);
    }
    if (output != null && checkVcfIsNotEmpty(output)) {
        try {
            HalvadeFileUtils.uploadFileToHDFS(context,
                    FileSystem.get(new URI(outputdir), context.getConfiguration()), output,
                    outputdir + context.getTaskAttemptID().toString() + ".vcf");
            HalvadeFileUtils.uploadFileToHDFS(context,
                    FileSystem.get(new URI(outputdir), context.getConfiguration()), output + ".idx",
                    outputdir + context.getTaskAttemptID().toString() + ".vcf.idx");
        } catch (URISyntaxException ex) {
            Logger.EXCEPTION(ex);
            throw new InterruptedException();
        }
    } else if (output != null) {
        Logger.DEBUG("empty vcf file, not uploaded to vcf to avoid error when merging.");
    }

    // delete the files from local scratch
    if (variantFiles.size() > 1) {
        for (String snps : variantFiles) {
            HalvadeFileUtils.removeLocalFile(keep, snps, context, HalvadeCounters.FOUT_GATK_VCF);
            HalvadeFileUtils.removeLocalFile(keep, snps + ".idx");
        }
    }
    if (output != null) {
        HalvadeFileUtils.removeLocalFile(keep, output, context, HalvadeCounters.FOUT_GATK_VCF);
        HalvadeFileUtils.removeLocalFile(keep, output + ".idx");
    }
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.java

License:Open Source License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    Logger.DEBUG("total count: " + totalValCount);
    Logger.DEBUG("total keys: " + totalKeyCount);
    //for(Integer count : keyFactors) {
    //    int factor = Math.min(1, count / avg + 1); 
    //    Logger.DEBUG("count: " + count + " factor: " + factor + " new count: " + (count/factor));
    //}

    FileSystem fs = null;
    try {
        fs = FileSystem.get(new URI(out), context.getConfiguration());
    } catch (URISyntaxException ex) {
        Logger.EXCEPTION(ex);
    }

    bw.close();
    File mergeFile = new File(mergeJS);
    Logger.DEBUG("written " + count + " lines to " + mergeJS);
    HalvadeFileUtils.uploadFileToHDFS(context, fs, mergeFile.getAbsolutePath(), out + mergeFile.getName());

    // build new genome ref
    String newGenomeDir = refDir + jobId + "-nsg/";
    File starOut = new File(newGenomeDir);
    starOut.mkdirs();

    long time = STARInstance.rebuildStarGenome(context, bin, newGenomeDir, ref, mergeJS, overhang, threads,
            mem);
    context.getCounter(HalvadeCounters.TIME_STAR_BUILD).increment(time);

    //upload to outputdir
    String pass2GenDir = HalvadeConf.getStarDirPass2HDFS(context.getConfiguration());
    File pass2check = new File(newGenomeDir + HalvadeFileUtils.HALVADE_STAR_SUFFIX_P2);
    pass2check.createNewFile();
    if (requireUploadToHDFS) {
        Logger.DEBUG("Uploading STAR genome to parallel filesystem...");
        fs.mkdirs(new Path(pass2GenDir));
        File[] genFiles = starOut.listFiles();
        for (File gen : genFiles) {
            HalvadeFileUtils.uploadFileToHDFS(context, fs, gen.getAbsolutePath(), pass2GenDir + gen.getName());
        }
        Logger.DEBUG("Finished uploading new reference to " + pass2GenDir);
    }
    HalvadeFileUtils.removeLocalFile(mergeJS);
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.VCFCombineReducer.java

License:Open Source License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    try {
        // read header from input
        outpFormat = new KeyIgnoringVCFOutputFormat(VCFFormat.VCF);
        String input = HalvadeConf.getInputDir(context.getConfiguration());
        String output = HalvadeConf.getOutDir(context.getConfiguration());
        reportBest = HalvadeConf.getReportAllVariant(context.getConfiguration());
        FileSystem fs = FileSystem.get(new URI(input), context.getConfiguration());
        Path firstVcfFile = null;
        if (fs.getFileStatus(new Path(input)).isDirectory()) {
            // get first file
            FileStatus[] files = fs.listStatus(new Path(input));
            int i = 0, l = files.length;
            while (i < l && !files[i].getPath().getName().endsWith(".vcf")) {
                i++;
            }
            if (i < l) {
                firstVcfFile = files[i].getPath();
            } else {
                throw new InterruptedException("VCFCombineReducer: No files in input folder.");
            }
        } else {
            throw new InterruptedException("VCFCombineReducer: Input directory is not a directory.");
        }
        Logger.DEBUG("first file: " + firstVcfFile);
        outpFormat.readHeaderFrom(firstVcfFile, fs);
        recordWriter = outpFormat.getRecordWriter(context, new Path(output + "HalvadeCombined.vcf"));
    } catch (URISyntaxException ex) {
        Logger.EXCEPTION(ex);
        throw new InterruptedException("URI for input directory is invalid.");
    }
}

From source file:be.ugent.intec.halvade.HalvadeOptions.java

License:Open Source License

protected double getInputSize(String input, Configuration conf) throws URISyntaxException, IOException {
    double size = 0;
    FileSystem fs = FileSystem.get(new URI(input), conf);
    if (fs.getFileStatus(new Path(input)).isDirectory()) {
        // add every file in directory
        FileStatus[] files = fs.listStatus(new Path(input));
        for (FileStatus file : files) {
            if (!file.isDirectory()) {
                size += file.getLen();
            }
        }
    } else {
        size += fs.getFileStatus(new Path(input)).getLen();
    }
    return (size / (1024 * 1024 * 1024));
}

From source file:be.ugent.intec.halvade.HalvadeOptions.java

License:Open Source License

protected void parseDictFile(Configuration conf) {
    be.ugent.intec.halvade.utils.Logger.DEBUG("parsing dictionary " + ref + DICT_SUFFIX);
    try {
        FileSystem fs = FileSystem.get(new URI(ref + DICT_SUFFIX), conf);
        FSDataInputStream stream = fs.open(new Path(ref + DICT_SUFFIX));
        String line = getLine(stream); // header
        dict = new SAMSequenceDictionary();
        line = getLine(stream);
        while (line != null) {
            String[] lineData = line.split("\\s+");
            String seqName = lineData[1].substring(lineData[1].indexOf(':') + 1);
            int seqLength = 0;
            try {
                seqLength = Integer.parseInt(lineData[2].substring(lineData[2].indexOf(':') + 1));
            } catch (NumberFormatException ex) {
                be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
            }
            SAMSequenceRecord seq = new SAMSequenceRecord(seqName, seqLength);
            //                Logger.DEBUG("name: " + seq.getSequenceName() + " length: " + seq.getSequenceLength());
            dict.addSequence(seq);
            line = getLine(stream);
        }
        HalvadeConf.setSequenceDictionary(conf, dict);
    } catch (URISyntaxException | IOException ex) {
        be.ugent.intec.halvade.utils.Logger.EXCEPTION(ex);
    }

}

From source file:be.ugent.intec.halvade.MapReduceRunner.java

License:Open Source License

protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    HalvadeConf.setIsPass2(pass1Conf, false);
    HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, true,
            halvadeOpts.useBamInput);
    Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline");
    pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf);
    try {
        if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) {
            // add every file in directory
            FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in));
            for (FileStatus file : files) {
                if (!file.isDirectory()) {
                    FileInputFormat.addInputPath(pass1Job, file.getPath());
                }
            }
        } else {
            FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in));
        }
    } catch (IOException | IllegalArgumentException e) {
        Logger.EXCEPTION(e);
    }

    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf);
    boolean skipPass1 = false;
    if (outFs.exists(new Path(tmpOutDir))) {
        // check if genome already exists
        skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS"));
        if (skipPass1)
            Logger.DEBUG("pass1 genome already created, skipping pass 1");
        else {
            Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
            Logger.INFO("ERROR: Please remove this directory before trying again.");
            System.exit(-2);
        }
    }
    if (!skipPass1) {
        FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir));
        pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);

        pass1Job.setInputFormatClass(HalvadeTextInputFormat.class);
        pass1Job.setMapOutputKeyClass(GenomeSJ.class);
        pass1Job.setMapOutputValueClass(Text.class);

        pass1Job.setSortComparatorClass(GenomeSJSortComparator.class);
        pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class);
        pass1Job.setNumReduceTasks(1);
        pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class);
        pass1Job.setOutputKeyClass(LongWritable.class);
        pass1Job.setOutputValueClass(Text.class);

        return runTimedJob(pass1Job, "Halvade pass 1 Job");
    } else
        return 0;
}