TestBufferStreamGenomicsDBImporter.java Source code

Introduction

Here is the source code for TestBufferStreamGenomicsDBImporter.java
Source

/**
 * The MIT License (MIT)
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of 
 * this software and associated documentation files (the "Software"), to deal in 
 * the Software without restriction, including without limitation the rights to 
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 
 * the Software, and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all 
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.json.simple.parser.ContainerFactory;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Iterator;
import com.intel.genomicsdb.GenomicsDBImporter;
import com.intel.genomicsdb.GenomicsDBException;

/**
 * Wrapper class to maintain stream state for the test driver program
 * The class maintains:
 * (a) VCFHeader
 * (b) CloseableTribbleIterator<VariantContext>
 * (c) mNextVC the next VariantContext object to be sent to GenomicsDBImporter iff the buffer
 *     interface of GenomicsDBImporter is used (addBufferStream()) and not the
 *     Iterator<VariantContext> interface (addSortedVariantContextIterator())
 */
public final class TestBufferStreamGenomicsDBImporter {
    /** Command line flag selecting the Iterator&lt;VariantContext&gt; interface. */
    private static final String ITERATORS_FLAG = "-iterators";

    /**
     * Per-stream state kept by the driver: the VCF header, the iterator over
     * the file's VariantContext objects, the stream index handed back by the
     * importer and - for the buffer interface only - the next VariantContext
     * that still has to be handed to the importer.
     */
    private static class VCFFileStreamInfo {
        /** Stream index returned by the importer; -1 until main() assigns it. */
        public int mStreamIdx = -1;
        /** Header read from the VCF file in the constructor. */
        public VCFHeader mVCFHeader = null;
        /** Iterator over the VariantContext objects of the file. */
        public Iterator<VariantContext> mIterator = null;
        /**
         * Next VariantContext waiting to be added to the importer. Only used by
         * the buffer stream code path in main(); null when nothing is pending.
         */
        public VariantContext mNextVC = null;

        /**
         * Constructor - opens the VCF file, reads its header and creates the iterator.
         * @param fileName path to VCF file
         * @param loaderJSONFile path to the loader JSON file; only passed on to
         *        GenomicsDBImporter.columnPartitionIterator()
         * @param rank rank (partition idx) of this process; only passed on to
         *        GenomicsDBImporter.columnPartitionIterator()
         * @param useMultiChromosomeIterator if true, iterate using
         *        GenomicsDBImporter.columnPartitionIterator(), else use the
         *        plain iterator of the feature reader
         * @throws IOException declared for the reader/iterator calls
         * @throws ParseException declared for the columnPartitionIterator() call
         */
        public VCFFileStreamInfo(final String fileName, final String loaderJSONFile, final int rank,
                final boolean useMultiChromosomeIterator) throws IOException, ParseException {
            //Third argument 'false': do not require an index for the VCF file
            //NOTE(review): the reader is intentionally left open here since mIterator
            // reads from it; nothing in this driver closes it explicitly
            AbstractFeatureReader<VariantContext, LineIterator> reader = AbstractFeatureReader
                    .getFeatureReader(fileName, new VCFCodec(), false);
            mVCFHeader = (VCFHeader) (reader.getHeader());
            if (useMultiChromosomeIterator) {
                mIterator = GenomicsDBImporter.columnPartitionIterator(reader, loaderJSONFile, rank);
            } else {
                mIterator = reader.iterator();
            }
        }
    }

    /**
     * Factory object to maintain order of keys in simple JSON parsing - use LinkedHashMap
     */
    private static class LinkedHashFactory implements ContainerFactory {
        //The misspelt name 'creatArrayContainer' is dictated by the ContainerFactory
        // interface of json-simple - do not "fix" it
        @Override
        public List creatArrayContainer() {
            return new ArrayList();
        }

        @Override
        public Map createObjectContainer() {
            return new LinkedHashMap();
        }
    }

    /**
     * Parses the JSON file that contains "stream_name": "vcf_file_path" entries.
     * The file is closed before this method returns.
     * @param mappingFile path to the JSON file
     * @return map from stream name to VCF file path, in the order of the JSON file
     * @throws IOException if the file cannot be opened or read
     * @throws ParseException if the file is not valid JSON
     */
    @SuppressWarnings("unchecked") //json-simple returns raw containers built by LinkedHashFactory
    private static Map<String, String> readStreamNameToFileName(final String mappingFile)
            throws IOException, ParseException {
        try (FileReader mappingReader = new FileReader(mappingFile)) {
            JSONParser parser = new JSONParser();
            //Cast through LinkedHashMap so that a non-object top level JSON value
            // still fails with a ClassCastException right here
            return (LinkedHashMap<String, String>) parser.parse(mappingReader, new LinkedHashFactory());
        }
    }

    /**
     * Sample driver code for testing Java VariantContext write API for GenomicsDB
     * The code shows two ways of using the API
     *   (a) Iterator<VariantContext>
     *   (b) Directly adding VariantContext objects
     * If "-iterators" is passed as the first argument, method (a) is used.
     *
     * Usage: [-iterators] &lt;loader.json&gt; &lt;stream_name_to_file.json&gt;
     *        [bufferCapacity rank lbRowIdx ubRowIdx useMultiChromosomeIterator]
     *
     * @param args command line arguments, see usage above
     * @throws IOException if an input file cannot be read
     * @throws GenomicsDBException propagated from the GenomicsDBImporter calls
     * @throws ParseException if a JSON file cannot be parsed
     */
    public static void main(final String[] args) throws IOException, GenomicsDBException, ParseException {
        final boolean useIterators = args.length > 0 && args[0].equals(ITERATORS_FLAG);
        //All positional arguments are shifted by one when the flag is present
        final int argsLoaderFileIdx = useIterators ? 1 : 0;
        //Both JSON files are mandatory - the optional flag must not count towards them
        if (args.length < argsLoaderFileIdx + 2) {
            System.err.println("For loading: [-iterators] <loader.json> "
                    + "<stream_name_to_file.json> [bufferCapacity rank lbRowIdx ubRowIdx useMultiChromosomeIterator]");
            System.exit(-1);
        }
        //Buffer capacity
        long bufferCapacity = (args.length >= argsLoaderFileIdx + 3) ? Long.parseLong(args[argsLoaderFileIdx + 2])
                : 1024;
        //Specify rank (or partition idx) of this process
        int rank = (args.length >= argsLoaderFileIdx + 4) ? Integer.parseInt(args[argsLoaderFileIdx + 3]) : 0;
        //Specify smallest row idx from which to start loading.
        // This is useful for incremental loading into existing array
        long lbRowIdx = (args.length >= argsLoaderFileIdx + 5) ? Long.parseLong(args[argsLoaderFileIdx + 4]) : 0;
        //Specify largest row idx up to which loading should be performed - for completeness
        long ubRowIdx = (args.length >= argsLoaderFileIdx + 6) ? Long.parseLong(args[argsLoaderFileIdx + 5])
                : Long.MAX_VALUE - 1;
        //Boolean to use MultipleChromosomeIterator
        boolean useMultiChromosomeIterator = (args.length >= argsLoaderFileIdx + 7)
                && Boolean.parseBoolean(args[argsLoaderFileIdx + 6]);
        //<loader.json> first arg
        String loaderJSONFile = args[argsLoaderFileIdx];
        GenomicsDBImporter loader = new GenomicsDBImporter(loaderJSONFile, rank, lbRowIdx, ubRowIdx);
        //<stream_name_to_file.json> - useful for the driver only
        //JSON file that contains "stream_name": "vcf_file_path" entries
        Map<String, String> streamNameToFileName = readStreamNameToFileName(args[argsLoaderFileIdx + 1]);
        List<VCFFileStreamInfo> streamInfoVec = new ArrayList<>();
        long rowIdx = 0;
        for (Map.Entry<String, String> entry : streamNameToFileName.entrySet()) {
            VCFFileStreamInfo currInfo = new VCFFileStreamInfo(entry.getValue(), loaderJSONFile, rank,
                    useMultiChromosomeIterator);

            /* The following 2 statements are not mandatory - use initializeSampleInfoMapFromHeader()
             * iff you know for sure that sample names in the VCF header are globally unique
             * across all streams/files. If not, you have 2 options:
             *   (a) specify your own mapping from sample index in the header to SampleInfo object
             *       (unique_name, rowIdx) OR
             *   (b) specify the mapping in the callset_mapping_file (JSON) and pass null to
             *       addSortedVariantContextIterator()
             */
            LinkedHashMap<Integer, GenomicsDBImporter.SampleInfo> sampleIndexToInfo = new LinkedHashMap<>();
            //The returned value is fed back in for the next stream's header
            rowIdx = GenomicsDBImporter.initializeSampleInfoMapFromHeader(sampleIndexToInfo, currInfo.mVCFHeader,
                    rowIdx);
            if (useIterators) {
                //pass sorted VC iterators
                currInfo.mStreamIdx = loader.addSortedVariantContextIterator(entry.getKey(), currInfo.mVCFHeader,
                        currInfo.mIterator, bufferCapacity, VariantContextWriterBuilder.OutputType.BCF_STREAM,
                        sampleIndexToInfo);
            } else {
                //use buffers - VCs will be provided by caller
                currInfo.mStreamIdx = loader.addBufferStream(entry.getKey(), currInfo.mVCFHeader, bufferCapacity,
                        VariantContextWriterBuilder.OutputType.BCF_STREAM, sampleIndexToInfo);
            }
            streamInfoVec.add(currInfo);
        }
        if (useIterators) {
            //Much simpler interface if using Iterator<VariantContext>
            loader.importBatch();
            //Only checked when the JVM runs with assertions enabled (-ea)
            assert loader.isDone();
        } else {
            //Must be called after all iterators/streams added - no more iterators/streams
            // can be added once this function is called
            loader.setupGenomicsDBImporter();
            //Counts and tracks buffer streams for which new data must be supplied
            //Initialized to all the buffer streams
            int numExhaustedBufferStreams = streamInfoVec.size();
            int[] exhaustedBufferStreamIdxs = new int[numExhaustedBufferStreams];
            for (int i = 0; i < numExhaustedBufferStreams; ++i) {
                exhaustedBufferStreamIdxs[i] = i;
            }
            while (!loader.isDone()) {
                //Add data for streams that were exhausted in the previous round
                for (int i = 0; i < numExhaustedBufferStreams; ++i) {
                    //NOTE(review): indexes streamInfoVec with the index reported by the loader -
                    // relies on that index matching the order in which streams were added
                    VCFFileStreamInfo currInfo = streamInfoVec.get(exhaustedBufferStreamIdxs[i]);
                    boolean added = true;
                    while (added && (currInfo.mIterator.hasNext() || currInfo.mNextVC != null)) {
                        if (currInfo.mNextVC != null) {
                            added = loader.add(currInfo.mNextVC, currInfo.mStreamIdx);
                        }
                        //Advance only if the pending VC was accepted (or none was pending);
                        // otherwise mNextVC is kept and retried in the next round
                        if (added) {
                            currInfo.mNextVC = currInfo.mIterator.hasNext() ? currInfo.mIterator.next() : null;
                        }
                    }
                }
                loader.importBatch();
                numExhaustedBufferStreams = (int) loader.getNumExhaustedBufferStreams();
                for (int i = 0; i < numExhaustedBufferStreams; ++i) {
                    exhaustedBufferStreamIdxs[i] = loader.getExhaustedBufferStreamIndex(i);
                }
            }
        }
    }
}