org.opencb.biodata.tools.variant.converter.VariantTabix.java Source code

Introduction

Here is the source code for org.opencb.biodata.tools.variant.converter.VariantTabix.java, a converter from the Variant model object to a VcfRecord protobuf message. A brief usage sketch follows; the full listing is under Source.
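A minimal usage sketch of this converter is shown below. The sample names, the default FORMAT/INFO keys and the chunk size of 1000 are chosen purely for illustration, and the Variant instance is assumed to be obtained elsewhere (e.g. from a VCF parser) with exactly one StudyEntry, as the converter requires.

import java.util.Arrays;

import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.protobuf.VcfTabixProto.VcfRecord;
import org.opencb.biodata.tools.variant.converter.VariantTabix;

public class VariantTabixExample {

    /** Convert a Variant (obtained elsewhere) into a VcfRecord protobuf message. */
    public static VcfRecord toRecord(Variant variant) {
        VariantTabix converter = new VariantTabix();

        // Converter state that callers configure up front: the sample order and the
        // "default" FORMAT/INFO keys that a record may leave implicit when they match.
        converter.setSamples(Arrays.asList("sampleA", "sampleB"));     // assumed sample names
        converter.setDefaultFormatKeys(Arrays.asList("GT", "DP"));     // assumed default FORMAT keys
        converter.setDefaultInfoKeys(Arrays.asList("AC", "AF", "AN")); // assumed default INFO keys (sorted)

        // Convert with a chunk size of 1000 bp, so start/end are stored relative
        // to the enclosing slice (see getSliceOffset in the listing below).
        return converter.convert(variant, 1000);
    }
}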

Source

/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.biodata.tools.variant.converter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.protobuf.VcfTabixProto.VcfRecord;
import org.opencb.biodata.models.variant.protobuf.VcfTabixProto.VcfRecord.Builder;
import org.opencb.biodata.models.variant.protobuf.VcfTabixProto.VcfSample;

/**
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public class VariantTabix implements Converter<Variant, VcfRecord> {

    //   private static final char STRING_JOIN_SEP = '~';
    public static final String ATTRIBUTE_SRC = "src";
    public static final String ATTRIBUTE_ORI = "ori";
    public static final String ATTRIBUTE_QUAL = "QUAL";
    public static final String ATTRIBUTE_FILTER = "FILTER";
    //   public static final String ILLUMINA_GVCF_BLOCK_END = "END";

    private final ConcurrentMap<String, Integer> sample_to_index = new ConcurrentHashMap<String, Integer>();
    private final List<String> samples = new CopyOnWriteArrayList<String>();
    private final AtomicReference<String> defaultFilterKeys = new AtomicReference<String>();
    //   private final AtomicReference<String> defaultInfoKeys = new AtomicReference<String>();
    private final List<String> defaultInfoKeys = new CopyOnWriteArrayList<String>();
    private final List<String> defaultFormatKeys = new CopyOnWriteArrayList<String>();

    /**
     * Default constructor.
     */
    public VariantTabix() {
        // do nothing
    }

    @Override
    public VcfRecord convert(Variant variant) {
        return convert(variant, -1);
    }

    public VcfRecord convert(Variant variant, int chunkSize) {
        Builder recordBuilder = VcfRecord.newBuilder()
                .setRelativeStart(getSliceOffset(variant.getStart().intValue(), chunkSize))
                .setRelativeEnd(getSliceOffset(variant.getEnd().intValue(), chunkSize))
                .setReference(variant.getReference()).setAlternate(variant.getAlternate())
                .addAllIdNonDefault(decodeIds(variant.getIds()));

        /* Get Study (only one expected) */
        //        Map<String, VariantSourceEntry> sourceEntries = variant.getStudies();
        List<StudyEntry> sourceEntries = variant.getStudies();

        if (null == sourceEntries || sourceEntries.size() == 0) {
            throw new UnsupportedOperationException(String.format("No Study found for variant: %s %s",
                    variant.getChromosome(), variant.getStart()));
        }
        if (sourceEntries.size() > 1) {
            throw new UnsupportedOperationException(
                    String.format("Only one study supported, but %s studies were found", sourceEntries.size()));
        }
        //        Entry<String, VariantSourceEntry> entry = sourceEntries.entrySet().iterator().next();
        //        VariantSourceEntry study = entry.getValue();
        StudyEntry study = sourceEntries.get(0);

        Map<String, String> attr = study.getAttributes();
        /* Filter */
        recordBuilder.setFilterNonDefault(decodeFilter(attr.remove(ATTRIBUTE_FILTER)));

        /* QUAL */
        recordBuilder.setQuality(decodeQual(attr.remove(ATTRIBUTE_QUAL)));

        /* INFO */
        // remove possible other columns
        attr.remove(ATTRIBUTE_ORI);
        attr.remove(ATTRIBUTE_SRC);

        List<String> infoKeys = decodeInfoKeys(attr);
        boolean isInfoDefault = isDefaultInfoKeys(infoKeys);
        List<String> infoValues = decodeInfoValues(attr, infoKeys);
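        // Only store the INFO keys explicitly when they differ from the configured defaults;
        // an empty key list in the record means "use the default INFO keys".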
        if (!isInfoDefault) {
            recordBuilder.addAllInfoKey(infoKeys);
        } else {
            recordBuilder.addAllInfoKey(Collections.emptyList());
        }
        recordBuilder.addAllInfoValue(infoValues);

        /* FORMAT */
        List<String> formatLst = decodeFormat(study.getFormat().stream().collect(Collectors.joining(":"))); // FORMAT column
        if (!isDefaultFormat(formatLst)) {
            recordBuilder.addAllSampleFormatNonDefault(formatLst); // maybe empty if default
        }
        recordBuilder.addAllSamples(decodeSamples(formatLst, study.getSamplesData()));

        // TODO check all worked
        return recordBuilder.build();
    }

    /**
     * Calculate the slice index for a given position and a chunk size &gt; 0; if chunk size &lt;= 0, the position itself is returned.
     *
     * @param position  genomic position
     * @param chunkSize chunk size
     * @return slice calculated using position and chunk size
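     *         (e.g. position 123456 with chunkSize 1000 falls in slice 123)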
     */
    public long getSlicePosition(long position, int chunkSize) {
        return chunkSize > 0 ? position / (long) chunkSize : position;
    }

    /**
     * Calculate the offset of a position from its slice start for a chunk size &gt; 0; if chunk size &lt;= 0, the position itself is returned.
     *
     * @param position  genomic position
     * @param chunkSize chunk size
     * @return offset calculated to the slice start position
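     *         (e.g. position 123456 with chunkSize 1000 has offset 456 from its slice start)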
     */
    public int getSliceOffset(int position, int chunkSize) {
        return chunkSize > 0 ? position % chunkSize : position;
    }

    private Iterable<String> decodeIds(List<String> ids) {
        // TODO check whether "." placeholder IDs are removed
        return ids.stream().map(x -> x.toString()).collect(Collectors.toList());
    }

    private String decodeQual(String value) {
        if (null != value) {
            return value.toString();
        }
        return StringUtils.EMPTY;
    }

    private String decodeFilter(String filter) {
        if (null != filter) {
            return filter.toString();
        }
        return StringUtils.EMPTY;
    }

    private List<String> decodeInfoValues(Map<String, String> attr, List<String> infoKeys) {
        //      infoKeys.stream().map(x -> attr.get(x)).collect(Collectors.toList()); // not sure if order is protected
        List<String> values = new ArrayList<String>(infoKeys.size());
        for (String key : infoKeys) {
            values.add(attr.get(key).toString());
        }
        return values;
    }

    /**
     * Creates a sorted list of strings from the INFO KEYs
     *
     * @param attr {@link Map} of key-value info pairs
     * @return {@link List}
     */
    private List<String> decodeInfoKeys(Map<String, String> attr) {
        // sorted key list
        List<String> keyList = attr.keySet().stream().map(x -> x.toString()).sorted().collect(Collectors.toList());
        return keyList;
    }

    private boolean isDefaultInfoKeys(List<String> keyList) {
        return this.getDefaultInfoKeys().equals(keyList);
        //      String str = StringUtils.join(keyList,STRING_JOIN_SEP);
        //      return StringUtils.equals(str, getDefaultInfoKeys());
    }

    /**
     * Creates a list of strings in the original order from the FORMAT string provided, splitting the string by ":"
     *
     * @param format Format string with the keys separated by ":"
     * @return {@link List}
     */
    public List<String> decodeFormat(String format) {
        return Arrays.asList(format.split(":"));
    }

    public boolean isDefaultFormat(List<String> keyList) {
        return getDefaultFormatKeys().equals(keyList);
    }

    public List<VcfSample> decodeSamples(List<String> formatLst, List<List<String>> samplesData) {
        List<String> samples = getSamples(); // samplesData.keySet()
        List<VcfSample> ret = new ArrayList<>(samples.size());
        //        for(String s : samples){
        //            Map<String, String> map = samplesData.get(s);
        //            ret.add(decodeSample(formatLst,map));
        //        }
        for (int i = 0; i < samples.size(); i++) {
            // samplesData should have fields in the same order as formatLst
            ret.add(VcfSample.newBuilder().addAllSampleValues(samplesData.get(i)).build());
        }

        return ret;
    }

    public VcfSample decodeSample(List<String> formatLst, Map<String, String> data) {
        List<String> values = new ArrayList<>(formatLst.size());
        for (String f : formatLst) {
            values.add(data.getOrDefault(f, StringUtils.EMPTY).toString());
        }
        return VcfSample.newBuilder().addAllSampleValues(values).build();
    }

    public List<String> getSamples() {
        return samples;
    }

    public void setSamples(List<String> samples) {
        this.samples.clear();
        this.samples.addAll(samples);
    }

    public String getDefaultFilterKeys() {
        return defaultFilterKeys.get();
    }

    public void setDefaultFilterKeys(String value) {
        defaultFilterKeys.set(value);
    }

    public List<String> getDefaultInfoKeys() {
        return defaultInfoKeys;
    }

    public void setDefaultInfoKeys(List<String> keys) {
        defaultInfoKeys.clear();
        defaultInfoKeys.addAll(keys);
    }

    public List<String> getDefaultFormatKeys() {
        return defaultFormatKeys;
    }

    public void setDefaultFormatKeys(List<String> keys) {
        defaultFormatKeys.clear();
        defaultFormatKeys.addAll(keys);
    }

}