fr.cirad.mgdb.exporting.markeroriented.GFFExportHandler.java Source code

Introduction

Here is the source code for fr.cirad.mgdb.exporting.markeroriented.GFFExportHandler.java, a marker-oriented MGDB export handler that writes genotype data to a zipped GFF3 file (plus a -REMARKS.txt entry when export warnings are raised).
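
As a rough illustration with made-up identifiers (sequence chr1, variant ID variant1, individual IND1, a single homozygous A/A call at an A/G SNP), each exported variant becomes one tab-separated GFF3 feature line of the form:

chr1		SO:0000694	12345	12345	.	+	.	ID=variant1;alleles=A/G;refallele=A;acounts=IND1:A 1 2 G 0 0 2;

The second (source) column is left empty by this handler, start and end both carry the variant's start site, and one acounts=<individual>:... entry is appended per exported individual.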

Source

/*******************************************************************************
 * MGDB Export - Mongo Genotype DataBase, export handlers
 * Copyright (C) 2016 <CIRAD>
 *     
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * See <http://www.gnu.org/licenses/gpl-3.0.html> for details about
 * GNU Affero General Public License V3.
 *******************************************************************************/
package fr.cirad.mgdb.exporting.markeroriented;

import htsjdk.variant.variantcontext.VariantContext.Type;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.springframework.data.mongodb.core.MongoTemplate;

import com.mongodb.DBCursor;
import com.mongodb.DBObject;

import fr.cirad.mgdb.model.mongo.maintypes.Individual;
import fr.cirad.mgdb.model.mongo.maintypes.VariantData;
import fr.cirad.mgdb.model.mongo.subtypes.ReferencePosition;
import fr.cirad.mgdb.model.mongo.subtypes.VariantRunData;
import fr.cirad.mgdb.model.mongo.subtypes.SampleGenotype;
import fr.cirad.mgdb.model.mongo.subtypes.SampleId;
import fr.cirad.mgdb.model.mongodao.MgdbDao;
import fr.cirad.tools.Helper;
import fr.cirad.tools.ProgressIndicator;
import fr.cirad.tools.mongo.MongoTemplateManager;

/**
 * The Class GFFExportHandler.
 * 
 * @author Guilhem SEMPERE, Florian PHILIPPE
 */
public class GFFExportHandler extends AbstractMarkerOrientedExportHandler {

    /** The Constant LOG. */
    private static final Logger LOG = Logger.getLogger(GFFExportHandler.class);

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getExportFormatName()
     */
    @Override
    public String getExportFormatName() {
        return "GFF3";
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getExportFormatDescription()
     */
    @Override
    public String getExportFormatDescription() {
        return "Exports data in GFF3 Format based on Sequence Ontology. See <a target='_blank' href='http://rice.bio.indiana.edu:7082/annot/gff3.html'>http://rice.bio.indiana.edu:7082/annot/gff3.html</a> and <a target='_blank' href='http://www.sequenceontology.org/resources/gff3.html'>http://www.sequenceontology.org/resources/gff3.html</a>";
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.markeroriented.AbstractMarkerOrientedExportHandler#exportData(java.io.OutputStream, java.lang.String, java.util.List, fr.cirad.tools.ProgressIndicator, com.mongodb.DBCursor, java.util.Map, int, int, java.util.Map)
     */
    @Override
    public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs,
            ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms,
            int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles)
            throws Exception {
        MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
        ZipOutputStream zos = new ZipOutputStream(outputStream);

        if (readyToExportFiles != null)
            for (String readyToExportFile : readyToExportFiles.keySet()) {
                zos.putNextEntry(new ZipEntry(readyToExportFile));
                InputStream inputStream = readyToExportFiles.get(readyToExportFile);
                byte[] dataBlock = new byte[1024];
                int count = inputStream.read(dataBlock, 0, 1024);
                while (count != -1) {
                    zos.write(dataBlock, 0, count);
                    count = inputStream.read(dataBlock, 0, 1024);
                }
            }

        File warningFile = File.createTempFile("export_warnings_", "");
        FileWriter warningFileWriter = new FileWriter(warningFile);

        int markerCount = markerCursor.count();

        List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs);
        ArrayList<String> individualList = new ArrayList<String>();
        for (int i = 0; i < sampleIDs.size(); i++) {
            Individual individual = individuals.get(i);
            if (!individualList.contains(individual.getId())) {
                individualList.add(individual.getId());
            }
        }

        String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals";
        zos.putNextEntry(new ZipEntry(exportName + ".gff3"));
        String header = "##gff-version 3" + LINE_SEPARATOR;
        zos.write(header.getBytes());

        // Map HTSJDK variant types to Sequence Ontology accessions, written in GFF3 column 3 (type)
        TreeMap<String, String> typeToOntology = new TreeMap<String, String>();
        typeToOntology.put(Type.SNP.toString(), "SO:0000694");
        typeToOntology.put(Type.INDEL.toString(), "SO:1000032");
        typeToOntology.put(Type.MIXED.toString(), "SO:0001059");
        typeToOntology.put(Type.SYMBOLIC.toString(), "SO:0000109");
        typeToOntology.put(Type.MNP.toString(), "SO:0001059");

        // Derive how many variants to fetch per chunk from the average VariantRunData document size,
        // so that each batch read from MongoDB stays around nMaxChunkSizeInMb
        int avgObjSize = (Integer) mongoTemplate
                .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
        int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;
        short nProgress = 0, nPreviousProgress = 0;
        long nLoadedMarkerCount = 0;

        while (markerCursor.hasNext()) {
            int nLoadedMarkerCountInLoop = 0;
            Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>();
            boolean fStartingNewChunk = true;
            markerCursor.batchSize(nChunkSize);
            while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) {
                DBObject exportVariant = markerCursor.next();
                DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
                markerChromosomalPositions.put((Comparable) exportVariant.get("_id"),
                        refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":"
                                + refPos.get(ReferencePosition.FIELDNAME_START_SITE));
                nLoadedMarkerCountInLoop++;
                fStartingNewChunk = false;
            }

            List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet());
            LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes(
                    mongoTemplate, sampleIDs, currentMarkers, true,
                    null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes
            for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample)
            {
                Comparable variantId = variant.getId();
                List<String> variantDataOrigin = new ArrayList<String>();

                Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>();
                Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>();
                Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>();
                List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":");
                if (chromAndPos.size() == 0)
                    LOG.warn("Chromosomal position not found for marker " + variantId);
                // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR);
                if (markerSynonyms != null) {
                    Comparable syn = markerSynonyms.get(variantId);
                    if (syn != null)
                        variantId = syn;
                }

                Collection<VariantRunData> runs = variantsAndRuns.get(variant);
                if (runs != null)
                    for (VariantRunData run : runs)
                        for (Integer sampleIndex : run.getSampleGenotypes().keySet()) {
                            SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex);
                            String individualId = individuals
                                    .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex)))
                                    .getId();

                            Integer gq = null;
                            try {
                                gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ);
                            } catch (Exception ignored) {
                            }
                            if (gq != null && gq < nMinimumGenotypeQuality)
                                continue;

                            Integer dp = null;
                            try {
                                dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP);
                            } catch (Exception ignored) {
                            }
                            if (dp != null && dp < nMinimumReadDepth)
                                continue;

                            String gtCode = sampleGenotype.getCode();
                            List<String> storedIndividualGenotypes = individualGenotypes.get(individualId);
                            if (storedIndividualGenotypes == null) {
                                storedIndividualGenotypes = new ArrayList<String>();
                                individualGenotypes.put(individualId, storedIndividualGenotypes);
                            }
                            storedIndividualGenotypes.add(gtCode);
                        }

                // Write GFF3 columns 1-8: seqid, source (left empty here), type (SO accession), start, end,
                // score (.), strand (+) and phase (.); the attributes column is appended below
                zos.write((chromAndPos.get(0) + "\t" + StringUtils.join(variantDataOrigin, ";") /*source*/ + "\t"
                        + typeToOntology.get(variant.getType()) + "\t" + Long.parseLong(chromAndPos.get(1)) + "\t"
                        + Long.parseLong(chromAndPos.get(1)) + "\t" + "." + "\t" + "+" + "\t" + "." + "\t")
                                .getBytes());
                Comparable syn = markerSynonyms == null ? null : markerSynonyms.get(variant.getId());
                zos.write(("ID=" + variant.getId() + ";" + (syn != null ? "Name=" + syn + ";" : "") + "alleles="
                        + StringUtils.join(variant.getKnownAlleleList(), "/") + ";" + "refallele="
                        + variant.getKnownAlleleList().get(0) + ";").getBytes());

                for (int j = 0; j < individualList.size(); j++) { // we use this list because it has the proper ordering

                    NumberFormat nf = NumberFormat.getInstance(Locale.US);
                    nf.setMaximumFractionDigits(4);
                    HashMap<String, Integer> compt1 = new HashMap<String, Integer>();
                    int highestGenotypeCount = 0;
                    int sum = 0;

                    String individualId = individualList.get(j);
                    List<String> genotypes = individualGenotypes.get(individualId);
                    HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // counts each distinct genotype so the most frequent one can be exported and dissimilar calls flagged

                    String mostFrequentGenotype = null;
                    if (genotypes != null)
                        for (String genotype : genotypes) {
                            if (genotype.length() == 0)
                                continue; /* skip missing genotypes */

                            // Tally, in compt1, how this genotype's alleles match the variant's known alleles;
                            // sum accumulates the compt1 totals used below for the acounts frequencies
                            int count = 0;
                            for (String t : variant.getAllelesFromGenotypeCode(genotype)) {
                                for (String t1 : variant.getKnownAlleleList()) {
                                    if (t.equals(t1) && !(compt1.containsKey(t1))) {
                                        count++;
                                        compt1.put(t1, count);
                                    } else if (t.equals(t1) && compt1.containsKey(t1)) {
                                        if (compt1.get(t1) != 0) {
                                            count++;
                                            compt1.put(t1, count);
                                        } else
                                            compt1.put(t1, count);
                                    } else if (!(compt1.containsKey(t1))) {
                                        compt1.put(t1, 0);
                                    }
                                }
                            }
                            for (int countValue : compt1.values()) {
                                sum += countValue;
                            }

                            int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                            if (gtCount > highestGenotypeCount) {
                                highestGenotypeCount = gtCount;
                                mostFrequentGenotype = genotype;
                            }
                            genotypeCounts.put(genotype, gtCount);
                        }

                    List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>()
                            : variant.getAllelesFromGenotypeCode(mostFrequentGenotype);

                    // Write the per-individual attribute: acounts=<individual>:<known allele> <frequency> <count> ..., ended with the number of alleles in the most frequent genotype
                    if (alleles.size() != 0) {
                        zos.write(("acounts=" + individualId + ":").getBytes());

                        for (String knowAllelesCompt : compt1.keySet()) {
                            zos.write(
                                    (knowAllelesCompt + " " + nf.format(compt1.get(knowAllelesCompt) / (float) sum)
                                            + " " + compt1.get(knowAllelesCompt) + " ").getBytes());
                        }
                        zos.write((alleles.size() + ";").getBytes());
                    }
                    if (genotypeCounts.size() > 1) {
                        Comparable sVariantId = markerSynonyms != null ? markerSynonyms.get(variant.getId())
                                : variant.getId();
                        warningFileWriter.write("- Dissimilar genotypes found for variant "
                                + (sVariantId == null ? variant.getId() : sVariantId) + ", individual "
                                + individualId + ". Exporting most frequent: " + StringUtils.join(alleles, ",")
                                + "\n");
                    }
                }
                zos.write((LINE_SEPARATOR).getBytes());
            }

            if (progress.hasAborted())
                return;

            nLoadedMarkerCount += nLoadedMarkerCountInLoop;
            nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
            if (nProgress > nPreviousProgress) {
                //            if (nProgress%5 == 0)
                //               LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s");
                progress.setCurrentStepProgress(nProgress);
                nPreviousProgress = nProgress;
            }
        }

        warningFileWriter.close();
        if (warningFile.length() > 0) {
            zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
            int nWarningCount = 0;
            BufferedReader in = new BufferedReader(new FileReader(warningFile));
            String sLine;
            while ((sLine = in.readLine()) != null) {
                zos.write((sLine + "\n").getBytes());
                nWarningCount++;
            }
            LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
            in.close();
        }
        warningFile.delete();

        zos.close();
        progress.setCurrentStepProgress((short) 100);
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getStepList()
     */
    @Override
    public List<String> getStepList() {
        return Arrays.asList("Exporting data to GFF3 format");
    }
}
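
For context, here is a minimal, hypothetical usage sketch: it assumes the caller already holds the sample list, a DBCursor over the variant collection and a ProgressIndicator obtained through the usual MGDB plumbing, and the class and method names below (GffExportExample, exportToFile) are made up for illustration.

import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.List;

import com.mongodb.DBCursor;

import fr.cirad.mgdb.exporting.markeroriented.GFFExportHandler;
import fr.cirad.mgdb.model.mongo.subtypes.SampleId;
import fr.cirad.tools.ProgressIndicator;

public class GffExportExample {

    /** Hypothetical helper: writes a zipped GFF3 export for the given module to a local file. */
    public static void exportToFile(String sModule, List<SampleId> sampleIDs,
            DBCursor markerCursor, ProgressIndicator progress) throws Exception {
        GFFExportHandler handler = new GFFExportHandler();
        try (OutputStream os = new FileOutputStream(sModule + "_export.zip")) {
            // A null synonym map and a null extra-file map are accepted by exportData;
            // thresholds of 0 effectively disable the GQ / DP genotype filters
            handler.exportData(os, sModule, sampleIDs, progress, markerCursor,
                    null /* markerSynonyms */, 0 /* nMinimumGenotypeQuality */, 0 /* nMinimumReadDepth */,
                    null /* readyToExportFiles */);
        }
        // exportData wraps the supplied stream in a ZipOutputStream and closes it when finished
    }
}

Passing thresholds greater than 0 would drop genotypes whose GQ or DP annotations fall below them, exactly as handled in exportData above.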