com.google.cloud.genomics.examples.TransformNonVariantSegmentData.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.genomics.examples.TransformNonVariantSegmentData.java.

Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.examples;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.logging.Logger;

import com.google.api.client.util.Preconditions;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.genomics.model.Call;
import com.google.api.services.genomics.model.SearchVariantsRequest;
import com.google.api.services.genomics.model.Variant;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.options.Validation;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.functions.JoinNonVariantSegmentsWithVariants;
import com.google.cloud.genomics.dataflow.utils.DataflowWorkarounds;
import com.google.cloud.genomics.dataflow.utils.GenomicsDatasetOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.utils.Contig.SexChromosomeFilter;
import com.google.cloud.genomics.utils.GenomicsFactory;
import com.google.common.base.CharMatcher;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;
import com.google.common.io.Files;

/**
 * Sample pipeline that transforms data with non-variant segments (such as data that was in source
 * format Genome VCF (gVCF) or Complete Genomics) to variant-only data with calls from
 * non-variant-segments merged into the variants with which they overlap. The resultant data is
 * emitted to a BigQuery table.
 *
 * This is currently done only for SNP variants. Indels and structural variants are left as-is.
 *
 * The data source is the Google Genomics Variants API. The data sink is BigQuery.
 *
 * <p>
 * The sample could be expanded upon to:
 * <ol>
 * <li>emit additional fields from the variants and calls
 * <li>perform additional data munging
 * <li>write data to a different sink such as Google Cloud Storage or Google Datastore
 * </ol>
 */
public class TransformNonVariantSegmentData {

    private static final Logger LOG = Logger.getLogger(TransformNonVariantSegmentData.class.getName());

    public static final String HAS_AMBIGUOUS_CALLS_FIELD = "ambiguousCalls";

    /**
     * Options supported by {@link TransformNonVariantSegmentData}.
     * <p>
     * Inherits standard configuration options for Genomics pipelines and datasets.
     */
    private static interface Options extends GenomicsDatasetOptions {
        @Description("BigQuery table to write to, specified as "
                + "<project_id>:<dataset_id>.<table_id>. The dataset must already exist.")
        @Validation.Required
        String getOutputTable();

        // Setter required by the PipelineOptions proxy mechanism; paired with getOutputTable().
        void setOutputTable(String value);

        @Description("A local file path to an optional list of newline-separated callset names "
                + "to exclude from the results of this pipeline.")
        String getCallSetNamesToExclude();

        // Setter required by the PipelineOptions proxy mechanism; paired with
        // getCallSetNamesToExclude(). May remain unset (null) if no exclusions are needed.
        void setCallSetNamesToExclude(String value);
    }

    /**
     * Construct the table schema for the output table.
     *
     * @return The schema for the destination table.
     */
    private static TableSchema getTableSchema() {
        // Nested record describing one genotype call within a variant row.
        List<TableFieldSchema> callFields = Arrays.asList(
                new TableFieldSchema().setName("call_set_name").setType("STRING"),
                new TableFieldSchema().setName("phaseset").setType("STRING"),
                new TableFieldSchema().setName("genotype").setType("INTEGER").setMode("REPEATED"),
                new TableFieldSchema().setName("genotype_likelihood").setType("FLOAT").setMode("REPEATED"));

        // Top-level variant columns, including the ambiguous-calls flag and the repeated
        // "call" record built above.
        List<TableFieldSchema> fields = Arrays.asList(
                new TableFieldSchema().setName("variant_id").setType("STRING"),
                new TableFieldSchema().setName("reference_name").setType("STRING"),
                new TableFieldSchema().setName("start").setType("INTEGER"),
                new TableFieldSchema().setName("end").setType("INTEGER"),
                new TableFieldSchema().setName("reference_bases").setType("STRING"),
                new TableFieldSchema().setName("alternate_bases").setType("STRING").setMode("REPEATED"),
                new TableFieldSchema().setName("names").setType("STRING").setMode("REPEATED"),
                new TableFieldSchema().setName("filter").setType("STRING").setMode("REPEATED"),
                new TableFieldSchema().setName("quality").setType("FLOAT"),
                new TableFieldSchema().setName(HAS_AMBIGUOUS_CALLS_FIELD).setType("BOOLEAN"),
                new TableFieldSchema().setName("call").setType("RECORD").setMode("REPEATED")
                        .setFields(callFields));

        return new TableSchema().setFields(fields);
    }

    /**
     * Pipeline function to prepare the data for writing to BigQuery.
     * 
     * It builds a TableRow object containing data from the variant mapped onto the schema to be used
     * for the destination table.
     */
    static class FormatVariantsFn extends DoFn<Variant, TableRow> {
        @Override
        public void processElement(ProcessContext c) {
            Variant v = c.element();

            // Map each call onto the nested "call" record of the schema. Repeated BigQuery
            // fields may not be null, so substitute empty lists for missing values.
            List<TableRow> calls = new ArrayList<>();
            if (v.getCalls() != null) {
                for (Call call : v.getCalls()) {
                    calls.add(new TableRow().set("call_set_name", call.getCallSetName())
                            .set("phaseset", call.getPhaseset()).set("genotype", call.getGenotype())
                            .set("genotype_likelihood", (call.getGenotypeLikelihood() == null) ? new ArrayList<Double>()
                                    : call.getGenotypeLikelihood()));
                }
            }

            // Normally FlagVariantsWithAmbiguousCallsFn has stored the flag in the info map
            // upstream; default to "false" rather than NPE if it is missing for any reason.
            String ambiguousCallsFlag = Boolean.FALSE.toString();
            if (v.getInfo() != null) {
                List<String> flagValues = v.getInfo().get(HAS_AMBIGUOUS_CALLS_FIELD);
                if (flagValues != null && !flagValues.isEmpty()) {
                    ambiguousCallsFlag = flagValues.get(0);
                }
            }

            // Repeated variant-level fields get the same null-to-empty-list treatment as calls.
            TableRow row = new TableRow().set("variant_id", v.getId()).set("reference_name", v.getReferenceName())
                    .set("start", v.getStart()).set("end", v.getEnd()).set("reference_bases", v.getReferenceBases())
                    .set("alternate_bases",
                            (v.getAlternateBases() == null) ? new ArrayList<String>() : v.getAlternateBases())
                    .set("names", (v.getNames() == null) ? new ArrayList<String>() : v.getNames())
                    .set("filter", (v.getFilter() == null) ? new ArrayList<String>() : v.getFilter())
                    .set("quality", v.getQuality()).set("call", calls)
                    .set(HAS_AMBIGUOUS_CALLS_FIELD, ambiguousCallsFlag);

            c.output(row);
        }
    }

    /**
     * Pipeline function to filter out calls for specific callSetNames.
     * 
     * One may want to do this for a list of callsets that have failed quality control checks.
     */
    public static final class FilterCallsFn extends DoFn<Variant, Variant> {

        // Callset names whose calls are dropped from every variant. Immutable, so the DoFn
        // is safe to serialize and share across workers.
        private final ImmutableSet<String> callSetNamesToExclude;

        /**
         * @param callSetNamesToExclude callset names whose calls should be removed
         */
        public FilterCallsFn(ImmutableSet<String> callSetNamesToExclude) {
            super();
            this.callSetNamesToExclude = callSetNamesToExclude;
        }

        @Override
        public void processElement(ProcessContext context) {
            Variant variant = context.element();
            List<Call> currentCalls = variant.getCalls();

            // We may have variants without calls if any callsets were deleted from the variant set;
            // those pass through untouched.
            if (currentCalls != null && !currentCalls.isEmpty()) {
                Predicate<Call> isRetained = new Predicate<Call>() {
                    @Override
                    public boolean apply(Call call) {
                        return !callSetNamesToExclude.contains(call.getCallSetName());
                    }
                };
                variant.setCalls(Lists.newArrayList(Iterables.filter(currentCalls, isRetained)));
            }

            context.output(variant);
        }
    }

    /**
     * Pipeline function to flag any variants with more than one call for a particular callSetName.
     * 
     * We don't see this for the tidy test datasets such as Platinum Genomes. But in practice, we have
     * seen datasets with the same individual sequenced twice, mistakes in data conversions prior to
     * this step, etc... So this function flags the particular variants with ambiguous calls for the
     * same individual, and also updates the UI with a count of such variants.
     */
    public static final class FlagVariantsWithAmbiguousCallsFn extends DoFn<Variant, Variant> {

        // Counter surfaced in the Dataflow monitoring UI; (re)created per bundle in startBundle.
        Aggregator<Long> variantsWithAmbiguousCallsCount;

        @Override
        public void startBundle(Context c) {
            variantsWithAmbiguousCallsCount = c.createAggregator("Number of variants containing ambiguous calls",
                    new Sum.SumLongFn());
        }

        @Override
        public void processElement(ProcessContext context) {
            Variant variant = context.element();

            // We may have variants without calls if any callsets were deleted from the variant set and/or
            // a filtering step removed all calls. Omit these variants from our output.
            if (null == variant.getCalls() || variant.getCalls().isEmpty()) {
                return;
            }

            // Gather calls together for the same callSetName.
            ListMultimap<String, Call> indexedCalls = Multimaps.index(variant.getCalls(),
                    new Function<Call, String>() {
                        @Override
                        public String apply(final Call c) {
                            return c.getCallSetName();
                        }
                    });

            // Identify and count variants with multiple calls per callSetName.
            boolean isAmbiguous = false;
            for (Entry<String, Collection<Call>> entry : indexedCalls.asMap().entrySet()) {
                if (1 < entry.getValue().size()) {
                    LOG.warning("Variant " + variant.getId() + " contains ambiguous calls for at least one genome: "
                            + entry.getValue().iterator().next().getCallSetName());
                    isAmbiguous = true;
                    // Use the uppercase 'L' suffix; a lowercase 'l' is easily misread as the digit 1.
                    variantsWithAmbiguousCallsCount.addValue(1L);
                    break; // No need to look for additional ambiguity; one instance is enough to warrant the flag.
                }
            }

            // Record the flag in the info map, where FormatVariantsFn reads it downstream.
            if (variant.getInfo() == null) {
                variant.setInfo(new HashMap<String, List<String>>());
            }
            variant.getInfo().put(HAS_AMBIGUOUS_CALLS_FIELD, Arrays.asList(Boolean.toString(isAmbiguous)));

            context.output(variant);
        }
    }

    /**
     * Entry point: parses options, optionally loads a callset-exclusion list, then builds and
     * runs the Dataflow pipeline (join non-variant segments, filter/flag calls, write BigQuery).
     */
    public static void main(String[] args) throws IOException, GeneralSecurityException {
        // Register the options so that they show up via --help.
        PipelineOptionsFactory.register(TransformNonVariantSegmentData.Options.class);
        TransformNonVariantSegmentData.Options pipelineOptions = PipelineOptionsFactory.fromArgs(args)
                .withValidation().as(TransformNonVariantSegmentData.Options.class);

        // Option validation is not yet automatic, we make an explicit call here.
        GenomicsDatasetOptions.Methods.validateOptions(pipelineOptions);
        Preconditions.checkState(pipelineOptions.getHasNonVariantSegments(),
                "This job is only valid for data containing non-variant segments. "
                        + "Set the --hasNonVariantSegments command line option accordingly.");

        // Grab and parse our optional list of genomes to skip.
        ImmutableSet<String> excludedCallSetNames = null;
        String exclusionFilePath = pipelineOptions.getCallSetNamesToExclude();
        if (exclusionFilePath != null) {
            String exclusionFileContents = Files.toString(new File(exclusionFilePath), Charset.defaultCharset());
            Iterable<String> parsedNames = Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings()
                    .trimResults().split(exclusionFileContents);
            excludedCallSetNames = ImmutableSet.<String>builder().addAll(parsedNames).build();
            LOG.info("The pipeline will skip " + excludedCallSetNames.size() + " genomes with callSetNames: "
                    + excludedCallSetNames);
        }

        GenomicsFactory.OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(pipelineOptions);
        List<SearchVariantsRequest> variantRequests = GenomicsDatasetOptions.Methods
                .getVariantRequests(pipelineOptions, auth, SexChromosomeFilter.INCLUDE_XY);

        Pipeline pipeline = Pipeline.create(pipelineOptions);
        DataflowWorkarounds.registerGenomicsCoders(pipeline);

        PCollection<SearchVariantsRequest> requestCollection = pipeline.begin().apply(Create.of(variantRequests));

        // Create a collection of data with non-variant segments omitted but calls from overlapping
        // non-variant segments added to SNPs.
        PCollection<Variant> joinedVariants =
                JoinNonVariantSegmentsWithVariants.joinVariantsTransform(requestCollection, auth);

        // For each variant flag whether or not it has ambiguous calls for a particular sample and
        // optionally filter calls.
        PCollection<Variant> flaggedVariants;
        if (excludedCallSetNames == null) {
            flaggedVariants = joinedVariants.apply(ParDo.of(new FlagVariantsWithAmbiguousCallsFn()));
        } else {
            flaggedVariants = joinedVariants.apply(ParDo.of(new FilterCallsFn(excludedCallSetNames)))
                    .apply(ParDo.of(new FlagVariantsWithAmbiguousCallsFn()));
        }

        // Emit the variants to BigQuery.
        flaggedVariants.apply(ParDo.of(new FormatVariantsFn()))
                .apply(BigQueryIO.Write.to(pipelineOptions.getOutputTable()).withSchema(getTableSchema())
                        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

        pipeline.run();
    }
}