com.cloudera.science.quince.LoadVariantsTool.java Source code

Introduction

Here is the source code for com.cloudera.science.quince.LoadVariantsTool.java.
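
The tool reads variants from VCF, Avro, or Parquet (GA4GH) input, flattens the nested records, and writes a partitioned, sorted Parquet dataset with Kite, ready for querying with Hive or Impala. A usage sketch follows the listing below.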

Source

/*
 * Copyright (c) 2015, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.science.quince;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.beust.jcommander.Parameters;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.commons.codec.binary.Base64;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.Source;
import org.apache.crunch.TableSource;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.From;
import org.apache.crunch.io.parquet.AvroParquetFileSource;
import org.apache.crunch.types.avro.Avros;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.ga4gh.models.FlatVariant;
import org.ga4gh.models.Variant;
import org.kitesdk.data.CompressionType;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.View;
import org.kitesdk.data.crunch.CrunchDatasets;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;
import org.seqdoop.hadoop_bam.VCFInputFormat;
import org.seqdoop.hadoop_bam.VariantContextWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads variants stored in VCF, or in GA4GH Avro or Parquet format, into a Hadoop
 * filesystem, ready for querying with Hive or Impala.
 */
@Parameters(commandDescription = "Load variants tool")
public class LoadVariantsTool extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(LoadVariantsTool.class);

    @Parameter(description = "<input-path> <output-path>")
    private List<String> paths;

    @Parameter(names = "--overwrite", description = "Allow data for an existing sample group to be overwritten.")
    private boolean overwrite = false;

    @Parameter(names = "--sample-group", description = "An identifier for the group of samples being loaded.")
    private String sampleGroup = "sample1";

    @Parameter(names = "--segment-size", description = "The number of base pairs in each segment partition.")
    private long segmentSize = 1000000;

    @Override
    public int run(String[] args) throws Exception {
        JCommander jc = new JCommander(this);
        try {
            jc.parse(args);
        } catch (ParameterException e) {
            jc.usage();
            return 1;
        }

        if (paths == null || paths.size() != 2) {
            jc.usage();
            return 1;
        }

        String inputPath = paths.get(0);
        String outputPath = paths.get(1);

        Configuration conf = getConf();
        // Copy records to avoid a problem with Parquet string statistics not being
        // correct. This workaround can be removed once Parquet 1.8.0 is in use
        // (see https://issues.apache.org/jira/browse/PARQUET-251).
        conf.setBoolean(DatasetKeyOutputFormat.KITE_COPY_RECORDS, true);

        Path path = new Path(inputPath);

        if (path.getName().endsWith(".vcf")) {
            // Read the leading bytes of the VCF (assumed to hold the full header).
            int size = 500000;
            byte[] bytes = new byte[size];
            try (InputStream in = path.getFileSystem(conf).open(path)) {
                // read() may return fewer bytes than asked, so loop until full or EOF.
                int off = 0, n;
                while (off < size && (n = in.read(bytes, off, size - off)) != -1) {
                    off += n;
                }
            }
            conf.set(VariantContextToVariantFn.VARIANT_HEADER, Base64.encodeBase64String(bytes));
        }

        Pipeline pipeline = new MRPipeline(getClass(), conf);
        PCollection<Variant> records = readVariants(path, conf, pipeline);

        // Flatten the nested Variant records into FlatVariant records, which
        // Hive and Impala can query directly.
        PCollection<FlatVariant> flatRecords = records.parallelDo(new FlattenVariantFn(),
                Avros.specifics(FlatVariant.class));

        // Describe the output dataset: uncompressed Parquet files, partitioned by
        // chromosome, position segment, and sample group (see buildPartitionStrategy).
        DatasetDescriptor desc = new DatasetDescriptor.Builder().schema(FlatVariant.getClassSchema())
                .partitionStrategy(buildPartitionStrategy(segmentSize)).format(Formats.PARQUET)
                .compressionType(CompressionType.Uncompressed).build();

        // Create the dataset on first use, otherwise load the existing one; in both
        // cases restrict writes to the partition for this sample group.
        View<FlatVariant> dataset;
        if (Datasets.exists(outputPath)) {
            dataset = Datasets.load(outputPath, FlatVariant.class).getDataset().with("sample_group", sampleGroup);
        } else {
            dataset = Datasets.create(outputPath, desc, FlatVariant.class).getDataset().with("sample_group",
                    sampleGroup);
        }

        int numReducers = conf.getInt("mapreduce.job.reduces", 1);
        LOG.info("Number of reducers: {}", numReducers);

        // Within each partition, records are sorted by sample ID.
        final Schema sortKeySchema = SchemaBuilder.record("sortKey").fields().requiredString("sampleId")
                .endRecord();

        // Repartition and sort the records to match the dataset layout before writing.
        PCollection<FlatVariant> partitioned = CrunchDatasets.partitionAndSort(flatRecords, dataset,
                new FlatVariantRecordMapFn(sortKeySchema), sortKeySchema, numReducers, 1);

        try {
            Target.WriteMode writeMode = overwrite ? Target.WriteMode.OVERWRITE : Target.WriteMode.DEFAULT;
            pipeline.write(partitioned, CrunchDatasets.asTarget(dataset), writeMode);
        } catch (CrunchRuntimeException e) {
            LOG.error("Crunch runtime error", e);
            return 1;
        }

        PipelineResult result = pipeline.done();
        return result.succeeded() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new LoadVariantsTool(), args);
        System.exit(exitCode);
    }

    private static PCollection<Variant> readVariants(Path path, Configuration conf, Pipeline pipeline)
            throws IOException {
        // Infer the input format from the extension of a representative file.
        Path file = SchemaUtils.findFile(path, conf);
        if (file.getName().endsWith(".avro")) {
            return pipeline.read(From.avroFile(path, Avros.specifics(Variant.class)));
        } else if (file.getName().endsWith(".parquet")) {
            @SuppressWarnings("unchecked")
            Source<Variant> source = new AvroParquetFileSource(path, Avros.specifics(Variant.class));
            return pipeline.read(source);
        } else if (file.getName().endsWith(".vcf")) {
            TableSource<LongWritable, VariantContextWritable> vcfSource = From.formattedFile(path,
                    VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
            return pipeline.read(vcfSource).parallelDo(new VariantContextToVariantFn(),
                    Avros.specifics(Variant.class));
        }
        throw new IllegalStateException("Unrecognized format for " + file);
    }

    static PartitionStrategy buildPartitionStrategy(long segmentSize) throws IOException {
        return new PartitionStrategy.Builder().identity("referenceName", "chr")
                .fixedSizeRange("start", "pos", segmentSize).provided("sample_group", "string").build();
    }
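
    // Illustrative (not from the original source): with the default segment size
    // of 1,000,000 base pairs, a variant on chromosome 20 at position 1,234,567
    // for sample group "sample1" would land in a partition directory such as
    //   chr=20/pos=1000000/sample_group=sample1/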

    private static class FlatVariantRecordMapFn extends MapFn<FlatVariant, GenericData.Record> {

        // Avro Schema is not Serializable, so ship its JSON form and reparse it
        // once per task in initialize() rather than once per record in map().
        private final String sortKeySchemaString;

        private transient Schema sortKeySchema;

        public FlatVariantRecordMapFn(Schema sortKeySchema) {
            this.sortKeySchemaString = sortKeySchema.toString();
        }

        @Override
        public void initialize() {
            sortKeySchema = new Schema.Parser().parse(sortKeySchemaString);
        }

        @Override
        public GenericData.Record map(FlatVariant input) {
            GenericData.Record record = new GenericData.Record(sortKeySchema);
            record.put("sampleId", input.getCallSetId());
            return record;
        }
    }
}
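
Usage

A minimal sketch of invoking the tool programmatically, mirroring main() above. The sample group, input path, and Kite dataset URI are hypothetical placeholders, not taken from the original source:

import com.cloudera.science.quince.LoadVariantsTool;
import org.apache.hadoop.util.ToolRunner;

public class LoadVariantsExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical arguments: sample group id, input VCF on HDFS, and a
        // Kite dataset URI for the output.
        String[] toolArgs = {
            "--sample-group", "group1",
            "--overwrite",
            "/user/me/input/sample.vcf",
            "dataset:hdfs:/user/me/variants"
        };
        int exitCode = ToolRunner.run(new LoadVariantsTool(), toolArgs);
        System.exit(exitCode);
    }
}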