Java tutorial: exporting genotypes to EIGENSTRAT format with EigenstratExportHandler (MGDB export handlers)
/*******************************************************************************
 * MGDB Export - Mongo Genotype DataBase, export handlers
 * Copyright (C) 2016 <CIRAD>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * See <http://www.gnu.org/licenses/gpl-3.0.html> for details about
 * GNU Affero General Public License V3.
 *******************************************************************************/
package fr.cirad.mgdb.exporting.markeroriented;

import fr.cirad.mgdb.model.mongo.maintypes.Individual;
import fr.cirad.mgdb.model.mongo.maintypes.VariantData;
import fr.cirad.mgdb.model.mongo.subtypes.ReferencePosition;
import fr.cirad.mgdb.model.mongo.subtypes.SampleGenotype;
import fr.cirad.mgdb.model.mongo.subtypes.SampleId;
import fr.cirad.mgdb.model.mongo.subtypes.VariantRunData;
import fr.cirad.mgdb.model.mongodao.MgdbDao;
import fr.cirad.tools.Helper;
import fr.cirad.tools.ProgressIndicator;
import fr.cirad.tools.mongo.MongoTemplateManager;
import htsjdk.variant.variantcontext.VariantContext.Type;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.log4j.Logger;
import org.springframework.data.mongodb.core.MongoTemplate;

import com.mongodb.DBCursor;
import com.mongodb.DBObject;

/**
 * The Class EigenstratExportHandler.
 */
public class EigenstratExportHandler extends AbstractMarkerOrientedExportHandler {

    /** The Constant LOG. */
    private static final Logger LOG = Logger.getLogger(EigenstratExportHandler.class);

    /** The Constant EIGENSTRAT_FORMAT. */
    public static final String EIGENSTRAT_FORMAT = "EIGENSTRAT";

    /** The supported variant types. */
    private static List<String> supportedVariantTypes;

    static {
        supportedVariantTypes = new ArrayList<String>();
        supportedVariantTypes.add(Type.SNP.toString());
    }

    /**
     * Gets the individual gender code.
     *
     * @param sModule the module
     * @param individual the individual
     * @return the individual gender code
     */
    protected String getIndividualGenderCode(String sModule, String individual) {
        return "U";
    }

    /**
     * Gets the populations from samples.
     *
     * @param sModule the module
     * @param sampleIDs the sample IDs
     * @return the populations from samples
     */
    @SuppressWarnings("unchecked")
    protected List<String> getPopulationsFromSamples(final String sModule, final List<SampleId> sampleIDs) {
        ArrayList<String> result = new ArrayList<String>();
        for (Individual individual : getIndividualsFromSamples(sModule, sampleIDs))
            result.add(individual.getPopulation());
        return result;
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getExportFormatName()
     */
    @Override
    public String getExportFormatName() {
        return EIGENSTRAT_FORMAT;
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getExportFormatDescription()
     */
    @Override
    public String getExportFormatDescription() {
        return "Exports zipped ind, snp and eigenstratgeno files, along with an optional remark-file. See <a target='_blank' href='http://genepath.med.harvard.edu/~reich/InputFileFormats.htm'>http://genepath.med.harvard.edu/~reich/InputFileFormats.htm</a> for more details";
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.markeroriented.AbstractMarkerOrientedExportHandler#getSupportedVariantTypes()
     */
    @Override
    public List<String> getSupportedVariantTypes() {
        return supportedVariantTypes;
    }

//    public static void logMemUsage(Logger log)
//    {
//        long maxMem = Runtime.getRuntime().maxMemory()/(1024*1024);
//        long freeMem = Runtime.getRuntime().freeMemory()/(1024*1024);
//        long totalMem = Runtime.getRuntime().totalMemory()/(1024*1024);
//        log.debug("total: " + totalMem + ", free: " + freeMem + ", max: " + maxMem);
//    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.markeroriented.AbstractMarkerOrientedExportHandler#exportData(java.io.OutputStream, java.lang.String, java.util.List, fr.cirad.tools.ProgressIndicator, com.mongodb.DBCursor, java.util.Map, int, int, java.util.Map)
     */
    @Override
    public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception {
        // long before = System.currentTimeMillis();
        File warningFile = File.createTempFile("export_warnings_", "");
        FileWriter warningFileWriter = new FileWriter(warningFile);
        File snpFile = null;

        try {
            snpFile = File.createTempFile("snpFile", "");
            FileWriter snpFileWriter = new FileWriter(snpFile);

            ZipOutputStream zos = new ZipOutputStream(outputStream);
            if (ByteArrayOutputStream.class.isAssignableFrom(outputStream.getClass()))
                zos.setLevel(ZipOutputStream.STORED);

            // copy any already-prepared files into the archive first
            if (readyToExportFiles != null)
                for (String readyToExportFile : readyToExportFiles.keySet()) {
                    zos.putNextEntry(new ZipEntry(readyToExportFile));
                    InputStream inputStream = readyToExportFiles.get(readyToExportFile);
                    byte[] dataBlock = new byte[1024];
                    int count = inputStream.read(dataBlock, 0, 1024);
                    while (count != -1) {
                        zos.write(dataBlock, 0, count);
                        count = inputStream.read(dataBlock, 0, 1024);
                    }
                }

            MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
            int markerCount = markerCursor.count();

            List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs);
            ArrayList<String> individualList = new ArrayList<String>();
            StringBuffer indFileContents = new StringBuffer();
            // .ind file: one line per individual (id, gender code, population)
            for (int i = 0; i < sampleIDs.size(); i++) {
                Individual individual = individuals.get(i);
                if (!individualList.contains(individual.getId())) {
                    individualList.add(individual.getId());
                    indFileContents.append(individual.getId() + "\t" + getIndividualGenderCode(sModule, individual.getId()) + "\t" + (individual.getPopulation() == null ? "." : individual.getPopulation()) + LINE_SEPARATOR);
                }
            }

            String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals";
            zos.putNextEntry(new ZipEntry(exportName + ".ind"));
            zos.write(indFileContents.toString().getBytes());

            zos.putNextEntry(new ZipEntry(exportName + ".eigenstratgeno"));

            int avgObjSize = (Integer) mongoTemplate.getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
            int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;
            short nProgress = 0, nPreviousProgress = 0;
            long nLoadedMarkerCount = 0;

            while (markerCursor.hasNext()) {
                int nLoadedMarkerCountInLoop = 0;
                Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>();
                boolean fStartingNewChunk = true;
                markerCursor.batchSize(nChunkSize);
                while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) {
                    DBObject exportVariant = markerCursor.next();
                    DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
                    markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE));
                    nLoadedMarkerCountInLoop++;
                    fStartingNewChunk = false;
                }

                List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet());
                LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes(mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/);	// query mongo db for matching genotypes

                for (VariantData variant : variantsAndRuns.keySet())	// read data and write results into temporary files (one per sample)
                {
                    Comparable variantId = variant.getId();
                    List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":");
                    if (chromAndPos.size() == 0)
                        LOG.warn("Chromosomal position not found for marker " + variantId);
                    // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR);
                    if (markerSynonyms != null) {
                        Comparable syn = markerSynonyms.get(variantId);
                        if (syn != null)
                            variantId = syn;
                    }
                    snpFileWriter.write(variantId + "\t" + (chromAndPos.size() == 0 ? "0" : chromAndPos.get(0)) + "\t" + 0 + "\t" + (chromAndPos.size() == 0 ? 0L : Long.parseLong(chromAndPos.get(1))) + LINE_SEPARATOR);

                    Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>();
                    Collection<VariantRunData> runs = variantsAndRuns.get(variant);
                    if (runs != null)
                        for (VariantRunData run : runs)
                            for (Integer sampleIndex : run.getSampleGenotypes().keySet()) {
                                SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex);
                                String individualId = individuals.get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))).getId();

                                Integer gq = null;
                                try {
                                    gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ);
                                } catch (Exception ignored) {
                                }
                                if (gq != null && gq < nMinimumGenotypeQuality)
                                    continue;

                                Integer dp = null;
                                try {
                                    dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP);
                                } catch (Exception ignored) {
                                }
                                if (dp != null && dp < nMinimumReadDepth)
                                    continue;

                                String gtCode = sampleGenotype.getCode();
                                List<String> storedIndividualGenotypes = individualGenotypes.get(individualId);
                                if (storedIndividualGenotypes == null) {
                                    storedIndividualGenotypes = new ArrayList<String>();
                                    individualGenotypes.put(individualId, storedIndividualGenotypes);
                                }
                                storedIndividualGenotypes.add(gtCode);
                            }

                    for (int j = 0; j < individualList.size(); j++ /* we use this list because it has the proper ordering*/) {
                        String individualId = individualList.get(j);
                        List<String> genotypes = individualGenotypes.get(individualId);
                        HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>();	// will help us to keep track of missing genotypes
                        int highestGenotypeCount = 0;
                        String mostFrequentGenotype = null;
                        if (genotypes != null)
                            for (String genotype : genotypes) {
                                if (genotype.length() == 0)
                                    continue;	/* skip missing genotypes */
                                int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                                if (gtCount > highestGenotypeCount) {
                                    highestGenotypeCount = gtCount;
                                    mostFrequentGenotype = genotype;
                                }
                                genotypeCounts.put(genotype, gtCount);
                            }

                        List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>() : variant.getAllelesFromGenotypeCode(mostFrequentGenotype);

                        int nOutputCode = 0;
                        if (mostFrequentGenotype == null)
                            nOutputCode = 9;
                        else
                            for (String all : Helper.split(mostFrequentGenotype, "/"))
                                if ("0".equals(all))
                                    nOutputCode++;

                        if (j == 0 && variant.getKnownAlleleList().size() > 2)
                            warningFileWriter.write("- Variant " + variant.getId() + " is multi-allelic. Make sure Eigenstrat genotype encoding specifications are suitable for you.\n");

                        zos.write(("" + nOutputCode).getBytes());

                        if (genotypeCounts.size() > 1 || alleles.size() > 2) {
                            if (genotypeCounts.size() > 1)
                                warningFileWriter.write("- Dissimilar genotypes found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". Exporting most frequent: " + nOutputCode + "\n");
                            if (alleles.size() > 2)
                                warningFileWriter.write("- More than 2 alleles found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". Exporting only the first 2 alleles.\n");
                        }
                    }
                    zos.write((LINE_SEPARATOR).getBytes());
                }

                if (progress.hasAborted())
                    return;

                nLoadedMarkerCount += nLoadedMarkerCountInLoop;
                nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
                if (nProgress > nPreviousProgress) {
                    // if (nProgress%5 == 0)
                    //     LOG.info("============= exportData: " + nProgress + "% =============" + (System.currentTimeMillis() - before)/1000 + "s");
                    progress.setCurrentStepProgress(nProgress);
                    nPreviousProgress = nProgress;
                }
            }

            snpFileWriter.close();
            zos.putNextEntry(new ZipEntry(exportName + ".snp"));
            BufferedReader in = new BufferedReader(new FileReader(snpFile));
            String sLine;
            while ((sLine = in.readLine()) != null)
                zos.write((sLine + "\n").getBytes());
            in.close();

            warningFileWriter.close();
            if (warningFile.length() > 0) {
                zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
                int nWarningCount = 0;
                in = new BufferedReader(new FileReader(warningFile));
                while ((sLine = in.readLine()) != null) {
                    zos.write((sLine + "\n").getBytes());
                    nWarningCount++;
                }
                LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
                in.close();
            }
            warningFile.delete();

            zos.close();
            progress.setCurrentStepProgress((short) 100);
        } finally {
            if (snpFile != null && snpFile.exists())
                snpFile.delete();
        }
    }

    /* (non-Javadoc)
     * @see fr.cirad.mgdb.exporting.IExportHandler#getStepList()
     */
    @Override
    public List<String> getStepList() {
        return Arrays.asList(new String[] { "Exporting data to EIGENSTRAT format" });
    }
}