org.bgi.flexlab.gaea.util.SortUilts.java Source code

Java tutorial

Introduction

Here is the source code for org.bgi.flexlab.gaea.util.SortUilts.java

Source

/*******************************************************************************
 * Copyright (c) 2017, BGI-Shenzhen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 *
 * This file incorporates work covered by the following copyright and 
 * Permission notices:
 *
 * Copyright (c) 2010 Aalto University 
 *
 *     Permission is hereby granted, free of charge, to any person
 *     obtaining a copy of this software and associated documentation
 *     files (the "Software"), to deal in the Software without
 *     restriction, including without limitation the rights to use,
 *     copy, modify, merge, publish, distribute, sublicense, and/or sell
 *     copies of the Software, and to permit persons to whom the
 *     Software is furnished to do so, subject to the following
 *     conditions:
 *  
 *     The above copyright notice and this permission notice shall be
 *     included in all copies or substantial portions of the Software.
 *  
 *     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 *     OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 *     HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 *     WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 *     OTHER DEALINGS IN THE SOFTWARE.
 *******************************************************************************/
package org.bgi.flexlab.gaea.util;

import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.bgi.flexlab.gaea.data.structure.header.MultipleVCFHeader;
import org.bgi.flexlab.gaea.data.structure.header.SingleVCFHeader;
import org.bgi.flexlab.gaea.data.structure.reference.ReferenceShare;
import org.bgi.flexlab.gaea.framework.tools.mapreduce.BioJob;
import org.bgi.flexlab.gaea.tools.mapreduce.vcf.sort.VCFSortOptions;

import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

/**
 * Static helpers for the VCF sort job: registering the total-order partition
 * file with a job, and merging the sorted part files from the work directory
 * into one {@code .vcf} file per configured output.
 */
public class SortUilts {

    /**
     * Registers the partition file {@code _partitioningVCF} (resolved under
     * {@code outPath}) with {@link TotalOrderPartitioner} and, unless the
     * qualified partition path uses the {@code file} scheme, hands its URI
     * (with the file name as fragment) to
     * {@link ReferenceShare#distributeCache}.
     *
     * @param outPath directory under which the partition file path is built
     * @param job     job whose configuration receives the partition file setting
     * @param options not read by this method; kept for interface compatibility
     * @throws IOException      if the file system for {@code outPath} cannot be obtained
     * @throws RuntimeException wrapping a {@link URISyntaxException} if the
     *                          partition path cannot be turned into a URI
     */
    public static void configureSampling(Path outPath, BioJob job, VCFSortOptions options) throws IOException {
        Configuration conf = job.getConfiguration();
        final Path partition = outPath.getFileSystem(conf)
                .makeQualified(new Path(outPath, "_partitioning" + "VCF"));

        TotalOrderPartitioner.setPartitionFile(conf, partition);
        try {
            final URI partitionURI = new URI(partition.toString() + "#" + partition.getName());

            // Constant-first comparison so a URI without a scheme cannot
            // trigger a NullPointerException.
            if ("file".equals(partitionURI.getScheme()))
                return;

            ReferenceShare.distributeCache(partitionURI.toString(), job);
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }

    }

    /**
     * Builds one {@code <name>.vcf} file per entry of
     * {@code options.getMultiOutputs()} in the output directory: the matching
     * VCF header is written first, then every part file in the work directory
     * whose name matches {@code <name>-*-[0-9][0-9][0-9][0-9][0-9]*} is
     * appended and afterwards deleted.
     *
     * <p>An {@link IOException} during merging is reported on standard error
     * and not rethrown. All output streams are closed in every case.
     *
     * @param mVcfHeader source of the per-id VCF headers
     * @param options    supplies the output path, work path and id-to-name map
     * @param conf       Hadoop configuration used to resolve file systems and
     *                   the copy buffer size
     */
    public static void merge(MultipleVCFHeader mVcfHeader, VCFSortOptions options, Configuration conf) {
        // Declared outside the try so the finally block can release every
        // stream that was opened, even when merging fails part-way through.
        final Map<String, OutputStream> outs = new HashMap<String, OutputStream>();
        try {
            System.out.println("vcf-MultiSampleSort :: Merging output...");

            final Path outpath = new Path(options.getOutputPath());
            final Path wrkPath = new Path(options.getWorkPath());
            final FileSystem srcFS = wrkPath.getFileSystem(conf);
            final FileSystem dstFS = outpath.getFileSystem(conf);

            // Create every destination file up front.
            final Map<Integer, String> multiOutputs = options.getMultiOutputs();
            for (String result : multiOutputs.values()) {
                Path sPath = new Path(options.getOutputPath() + "/" + result + ".vcf");
                OutputStream os = dstFS.create(sPath);
                outs.put(result, os);
            }

            final VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
            final Map<Integer, SingleVCFHeader> id2VcfHeader = mVcfHeader.getID2SingleVcfHeader();
            for (int id : multiOutputs.keySet()) {
                final String name = multiOutputs.get(id);
                final OutputStream out = outs.get(name);

                // First, place the header. The wrapper turns close() into a
                // flush so closing the writer leaves the destination stream
                // open for the part files that follow.
                final VCFHeader newHeader = id2VcfHeader.get(id).getHeader();
                final VariantContextWriter writer = builder.setOutputStream(new FilterOutputStream(out) {
                    @Override
                    public void close() throws IOException {
                        this.out.flush();
                    }
                }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

                writer.writeHeader(newHeader);
                writer.close();

                final FileStatus[] parts = srcFS.globStatus(
                        new Path(options.getWorkPath(), name + "-*-[0-9][0-9][0-9][0-9][0-9]*"));

                // globStatus is documented as able to return null; treat that
                // the same as "no parts" instead of failing with an NPE.
                if (parts != null) {
                    int i = 0;

                    for (final FileStatus part : parts) {
                        System.out.printf("sort:: Merging part %d ( size %d)...\n", i++, part.getLen());
                        System.out.flush();

                        final FSDataInputStream ins = srcFS.open(part.getPath());
                        try {
                            IOUtils.copyBytes(ins, out, conf, false);
                        } finally {
                            // Release the input even when the copy fails.
                            IOUtils.closeStream(ins);
                        }
                    }
                    // Parts are removed only after all of them were appended.
                    for (final FileStatus part : parts)
                        srcFS.delete(part.getPath(), false);
                }

                // Explicit close on the success path so a failing flush/close
                // is reported through the catch block below.
                out.close();

            }
        } catch (IOException e) {
            System.err.printf("vcf-MultiSampleSort :: Output merging failed: %s\n", e);
        } finally {
            // Best-effort release of anything still open after a failure.
            // Re-closing an already closed stream is a no-op under the
            // java.io.Closeable contract, and closeStream ignores close errors.
            for (OutputStream os : outs.values())
                IOUtils.closeStream(os);
        }
    }

}