List of usage examples for com.mongodb DBCursor count
public int count()
From source file:fr.cirad.mgdb.exporting.individualoriented.DARwinExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles, boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); GenotypingProject aProject = mongoTemplate.findOne( new Query(Criteria.where(GenotypingProject.FIELDNAME_PLOIDY_LEVEL).exists(true)), GenotypingProject.class); if (aProject == null) LOG.warn("Unable to find a project containing ploidy level information! Assuming ploidy level is 2."); int ploidy = aProject == null ? 2 : aProject.getPloidyLevel(); File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); int markerCount = markerCursor.count(); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }//w w w .j a v a 2 s. c o m } String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size() + "individuals"; StringBuffer donFileContents = new StringBuffer( "@DARwin 5.0 - DON -" + LINE_SEPARATOR + individualExportFiles.size() + "\t" + 1 + LINE_SEPARATOR + "N" + "\t" + "individual" + LINE_SEPARATOR); int count = 0; String missingGenotype = ""; for (int j = 0; j < ploidy; j++) missingGenotype += "\tN"; zos.putNextEntry(new ZipEntry(exportName + ".var")); zos.write(("@DARwin 5.0 - ALLELIC - " + ploidy + LINE_SEPARATOR + individualExportFiles.size() + "\t" + markerCount * ploidy + LINE_SEPARATOR + "N").getBytes()); DBCursor markerCursorCopy = markerCursor.copy(); // dunno how expensive this is, but seems safer than keeping all IDs in memory at any time short nProgress = 0, nPreviousProgress = 0; int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; markerCursorCopy.batchSize(nChunkSize); int nMarkerIndex = 0; while (markerCursorCopy.hasNext()) { DBObject exportVariant = markerCursorCopy.next(); Comparable markerId = (Comparable) exportVariant.get("_id"); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(markerId); if (syn != null) markerId = syn; } for (int j = 0; j < ploidy; j++) zos.write(("\t" + markerId).getBytes()); } TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>(); ArrayList<String> distinctAlleles = new ArrayList<String>(); // the index of each allele will be used as its code int i = 0; for (File f : individualExportFiles) { BufferedReader in = new BufferedReader(new FileReader(f)); try { String individualId, line = in.readLine(); // read sample id if (line != null) individualId = line; else throw new Exception("Unable to read first line of temp export file " + f.getName()); donFileContents.append(++count + "\t" + individualId + LINE_SEPARATOR); zos.write((LINE_SEPARATOR + count).getBytes()); nMarkerIndex = 0; while ((line = in.readLine()) != null) { List<String> genotypes = MgdbDao.split(line, "|"); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } if (genotypeCounts.size() > 1) { warningFileWriter.write("- Dissimilar genotypes found for variant __" + nMarkerIndex + "__, individual " + individualId + ". Exporting most frequent: " + mostFrequentGenotype + "\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } String codedGenotype = ""; if (mostFrequentGenotype != null) for (String allele : mostFrequentGenotype.split(" ")) { if (!distinctAlleles.contains(allele)) distinctAlleles.add(allele); codedGenotype += "\t" + distinctAlleles.indexOf(allele); } else codedGenotype = missingGenotype.replaceAll("N", "-1"); // missing data is coded as -1 zos.write(codedGenotype.getBytes()); nMarkerIndex++; } } catch (Exception e) { LOG.error("Error exporting data", e); progress.setError("Error exporting data: " + e.getClass().getSimpleName() + (e.getMessage() != null ? " - " + e.getMessage() : "")); return; } finally { in.close(); } if (progress.hasAborted()) return; nProgress = (short) (++i * 100 / individualExportFiles.size()); if (nProgress > nPreviousProgress) { // LOG.debug("============= doDARwinExport (" + i + "): " + nProgress + "% ============="); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } if (!f.delete()) { f.deleteOnExit(); LOG.info("Unable to delete tmp export file " + f.getAbsolutePath()); } } zos.putNextEntry(new ZipEntry(exportName + ".don")); zos.write(donFileContents.toString().getBytes()); // now read variant names for those that induced warnings nMarkerIndex = 0; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext()) { DBObject exportVariant = markerCursor.next(); if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) { Comparable markerId = (Comparable) exportVariant.get("_id"); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(markerId); if (syn != null) markerId = syn; } for (int j = 0; j < ploidy; j++) zos.write(("\t" + markerId).getBytes()); problematicMarkerIndexToNameMap.put(nMarkerIndex, markerId); } } warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet()) sLine = sLine.replaceAll("__" + aMarkerIndex + "__", problematicMarkerIndexToNameMap.get(aMarkerIndex).toString()); zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.individualoriented.PLinkExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles, boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles) throws Exception { File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }/*from ww w. j ava 2s .com*/ } MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); int markerCount = markerCursor.count(); String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".ped")); TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>(); short nProgress = 0, nPreviousProgress = 0; int i = 0; for (File f : individualExportFiles) { BufferedReader in = new BufferedReader(new FileReader(f)); try { String individualId, line = in.readLine(); // read sample id if (line != null) { individualId = line; String population = getIndividualPopulation(sModule, line); String individualInfo = (population == null ? "." : population) + " " + individualId; zos.write((individualInfo + " 0 0 0 " + getIndividualGenderCode(sModule, individualId)) .getBytes()); } else throw new Exception("Unable to read first line of temp export file " + f.getName()); int nMarkerIndex = 0; while ((line = in.readLine()) != null) { List<String> genotypes = MgdbDao.split(line, "|"); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } if (genotypeCounts.size() > 1) { warningFileWriter.write("- Dissimilar genotypes found for variant " + nMarkerIndex + ", individual " + individualId + ". Exporting most frequent: " + mostFrequentGenotype + "\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } String[] alleles = mostFrequentGenotype == null ? new String[0] : mostFrequentGenotype.split(" "); if (alleles.length > 2) { warningFileWriter.write("- More than 2 alleles found for variant " + nMarkerIndex + ", individual " + individualId + ". Exporting only the first 2 alleles.\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } String all1 = alleles.length == 0 ? "0" : alleles[0]; String all2 = alleles.length == 0 ? "0" : alleles[alleles.length == 1 ? 0 : 1]; if (all1.length() != 1 || all2.length() != 1) { warningFileWriter .write("- SNP expected, but alleles are not coded on a single char for variant " + nMarkerIndex + ", individual " + individualId + ". Ignoring this genotype.\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } else zos.write((" " + all1 + " " + all2).getBytes()); nMarkerIndex++; } } catch (Exception e) { LOG.error("Error exporting data", e); progress.setError("Error exporting data: " + e.getClass().getSimpleName() + (e.getMessage() != null ? " - " + e.getMessage() : "")); return; } finally { in.close(); } if (progress.hasAborted()) return; nProgress = (short) (++i * 100 / individualExportFiles.size()); if (nProgress > nPreviousProgress) { progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } zos.write('\n'); if (!f.delete()) { f.deleteOnExit(); LOG.info("Unable to delete tmp export file " + f.getAbsolutePath()); } } warningFileWriter.close(); zos.putNextEntry(new ZipEntry(exportName + ".map")); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; markerCursor.batchSize(nChunkSize); int nMarkerIndex = 0; while (markerCursor.hasNext()) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); Comparable markerId = (Comparable) exportVariant.get("_id"); String chrom = (String) refPos.get(ReferencePosition.FIELDNAME_SEQUENCE); Long pos = ((Number) refPos.get(ReferencePosition.FIELDNAME_START_SITE)).longValue(); if (chrom == null) LOG.warn("Chromosomal position not found for marker " + markerId); Comparable exportedId = markerSynonyms == null ? markerId : markerSynonyms.get(markerId); zos.write(((chrom == null ? "0" : chrom) + " " + exportedId + " " + 0 + " " + (pos == null ? 0 : pos) + LINE_SEPARATOR).getBytes()); if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) { // we are going to need this marker's name for the warning file Comparable variantName = markerId; if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(markerId); if (syn != null) variantName = syn; } problematicMarkerIndexToNameMap.put(nMarkerIndex, variantName); } nMarkerIndex++; } if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet()) sLine = sLine.replaceAll("__" + aMarkerIndex + "__", problematicMarkerIndexToNameMap.get(aMarkerIndex).toString()); zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.BEDExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }//from w w w .j a va 2 s . c o m } int markerCount = markerCursor.count(); List<String> selectedIndividualList = new ArrayList<String>(); for (Individual ind : getIndividualsFromSamples(sModule, sampleIDs)) selectedIndividualList.add(ind.getId()); String exportName = sModule + "_" + markerCount + "variants_" + selectedIndividualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".bed")); short nProgress = 0, nPreviousProgress = 0; int nChunkSize = Math.min(2000, markerCount), nLoadedMarkerCount = 0; while (markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } for (Comparable variantId : markerChromosomalPositions.keySet()) // read data and write results into temporary files (one per sample) { String[] chromAndPos = markerChromosomalPositions.get(variantId).split(":"); zos.write((chromAndPos[0] + "\t" + (Long.parseLong(chromAndPos[1]) - 1) + "\t" + (Long.parseLong(chromAndPos[1]) - 1) + "\t" + variantId + "\t" + "0" + "\t" + "+") .getBytes()); zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.EigenstratExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { // long before = System.currentTimeMillis(); File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); File snpFile = null;//from ww w . jav a2 s . c om try { snpFile = File.createTempFile("snpFile", ""); FileWriter snpFileWriter = new FileWriter(snpFile); ZipOutputStream zos = new ZipOutputStream(outputStream); if (ByteArrayOutputStream.class.isAssignableFrom(outputStream.getClass())) zos.setLevel(ZipOutputStream.STORED); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); } } MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); int markerCount = markerCursor.count(); List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); ArrayList<String> individualList = new ArrayList<String>(); StringBuffer indFileContents = new StringBuffer(); for (int i = 0; i < sampleIDs.size(); i++) { Individual individual = individuals.get(i); if (!individualList.contains(individual.getId())) { individualList.add(individual.getId()); indFileContents .append(individual.getId() + "\t" + getIndividualGenderCode(sModule, individual.getId()) + "\t" + (individual.getPopulation() == null ? "." : individual.getPopulation()) + LINE_SEPARATOR); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".ind")); zos.write(indFileContents.toString().getBytes()); zos.putNextEntry(new ZipEntry(exportName + ".eigenstratgeno")); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats() .get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; while (markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet()); LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample) { Comparable variantId = variant.getId(); List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":"); if (chromAndPos.size() == 0) LOG.warn("Chromosomal position not found for marker " + variantId); // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(variantId); if (syn != null) variantId = syn; } snpFileWriter.write(variantId + "\t" + (chromAndPos.size() == 0 ? "0" : chromAndPos.get(0)) + "\t" + 0 + "\t" + (chromAndPos.size() == 0 ? 0l : Long.parseLong(chromAndPos.get(1))) + LINE_SEPARATOR); Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>(); Collection<VariantRunData> runs = variantsAndRuns.get(variant); if (runs != null) for (VariantRunData run : runs) for (Integer sampleIndex : run.getSampleGenotypes().keySet()) { SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex); String individualId = individuals .get(sampleIDs .indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))) .getId(); Integer gq = null; try { gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ); } catch (Exception ignored) { } if (gq != null && gq < nMinimumGenotypeQuality) continue; Integer dp = null; try { dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP); } catch (Exception ignored) { } if (dp != null && dp < nMinimumReadDepth) continue; String gtCode = sampleGenotype.getCode(); List<String> storedIndividualGenotypes = individualGenotypes.get(individualId); if (storedIndividualGenotypes == null) { storedIndividualGenotypes = new ArrayList<String>(); individualGenotypes.put(individualId, storedIndividualGenotypes); } storedIndividualGenotypes.add(gtCode); } for (int j = 0; j < individualList .size(); j++ /* we use this list because it has the proper ordering*/) { String individualId = individualList.get(j); List<String> genotypes = individualGenotypes.get(individualId); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; if (genotypes != null) for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>() : variant.getAllelesFromGenotypeCode(mostFrequentGenotype); int nOutputCode = 0; if (mostFrequentGenotype == null) nOutputCode = 9; else for (String all : Helper.split(mostFrequentGenotype, "/")) if ("0".equals(all)) nOutputCode++; if (j == 0 && variant.getKnownAlleleList().size() > 2) warningFileWriter.write("- Variant " + variant.getId() + " is multi-allelic. Make sure Eigenstrat genotype encoding specifications are suitable for you.\n"); zos.write(("" + nOutputCode).getBytes()); if (genotypeCounts.size() > 1 || alleles.size() > 2) { if (genotypeCounts.size() > 1) warningFileWriter.write("- Dissimilar genotypes found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". Exporting most frequent: " + nOutputCode + "\n"); if (alleles.size() > 2) warningFileWriter.write("- More than 2 alleles found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". Exporting only the first 2 alleles.\n"); } } zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { // if (nProgress%5 == 0) // LOG.info("============= exportData: " + nProgress + "% =============" + (System.currentTimeMillis() - before)/1000 + "s"); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } snpFileWriter.close(); zos.putNextEntry(new ZipEntry(exportName + ".snp")); BufferedReader in = new BufferedReader(new FileReader(snpFile)); String sLine; while ((sLine = in.readLine()) != null) zos.write((sLine + "\n").getBytes()); in.close(); warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; in = new BufferedReader(new FileReader(warningFile)); while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); } finally { if (snpFile != null && snpFile.exists()) snpFile.delete(); } }
From source file:fr.cirad.mgdb.exporting.markeroriented.GFFExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }/* w w w. java 2s. c o m*/ } File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); int markerCount = markerCursor.count(); List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); ArrayList<String> individualList = new ArrayList<String>(); for (int i = 0; i < sampleIDs.size(); i++) { Individual individual = individuals.get(i); if (!individualList.contains(individual.getId())) { individualList.add(individual.getId()); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".gff3")); String header = "##gff-version 3" + LINE_SEPARATOR; zos.write(header.getBytes()); TreeMap<String, String> typeToOntology = new TreeMap<String, String>(); typeToOntology.put(Type.SNP.toString(), "SO:0000694"); typeToOntology.put(Type.INDEL.toString(), "SO:1000032"); typeToOntology.put(Type.MIXED.toString(), "SO:0001059"); typeToOntology.put(Type.SYMBOLIC.toString(), "SO:0000109"); typeToOntology.put(Type.MNP.toString(), "SO:0001059"); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; while (markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet()); LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample) { Comparable variantId = variant.getId(); List<String> variantDataOrigin = new ArrayList<String>(); Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>(); List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":"); if (chromAndPos.size() == 0) LOG.warn("Chromosomal position not found for marker " + variantId); // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(variantId); if (syn != null) variantId = syn; } Collection<VariantRunData> runs = variantsAndRuns.get(variant); if (runs != null) for (VariantRunData run : runs) for (Integer sampleIndex : run.getSampleGenotypes().keySet()) { SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex); String individualId = individuals .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))) .getId(); Integer gq = null; try { gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ); } catch (Exception ignored) { } if (gq != null && gq < nMinimumGenotypeQuality) continue; Integer dp = null; try { dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP); } catch (Exception ignored) { } if (dp != null && dp < nMinimumReadDepth) continue; String gtCode = sampleGenotype.getCode(); List<String> storedIndividualGenotypes = individualGenotypes.get(individualId); if (storedIndividualGenotypes == null) { storedIndividualGenotypes = new ArrayList<String>(); individualGenotypes.put(individualId, storedIndividualGenotypes); } storedIndividualGenotypes.add(gtCode); } zos.write((chromAndPos.get(0) + "\t" + StringUtils.join(variantDataOrigin, ";") /*source*/ + "\t" + typeToOntology.get(variant.getType()) + "\t" + Long.parseLong(chromAndPos.get(1)) + "\t" + Long.parseLong(chromAndPos.get(1)) + "\t" + "." + "\t" + "+" + "\t" + "." + "\t") .getBytes()); Comparable syn = markerSynonyms == null ? null : markerSynonyms.get(variant.getId()); zos.write(("ID=" + variant.getId() + ";" + (syn != null ? "Name=" + syn + ";" : "") + "alleles=" + StringUtils.join(variant.getKnownAlleleList(), "/") + ";" + "refallele=" + variant.getKnownAlleleList().get(0) + ";").getBytes()); for (int j = 0; j < individualList .size(); j++ /* we use this list because it has the proper ordering*/) { NumberFormat nf = NumberFormat.getInstance(Locale.US); nf.setMaximumFractionDigits(4); HashMap<String, Integer> compt1 = new HashMap<String, Integer>(); int highestGenotypeCount = 0; int sum = 0; String individualId = individualList.get(j); List<String> genotypes = individualGenotypes.get(individualId); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes String mostFrequentGenotype = null; if (genotypes != null) for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int count = 0; for (String t : variant.getAllelesFromGenotypeCode(genotype)) { for (String t1 : variant.getKnownAlleleList()) { if (t.equals(t1) && !(compt1.containsKey(t1))) { count++; compt1.put(t1, count); } else if (t.equals(t1) && compt1.containsKey(t1)) { if (compt1.get(t1) != 0) { count++; compt1.put(t1, count); } else compt1.put(t1, count); } else if (!(compt1.containsKey(t1))) { compt1.put(t1, 0); } } } for (int countValue : compt1.values()) { sum += countValue; } int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>() : variant.getAllelesFromGenotypeCode(mostFrequentGenotype); if (alleles.size() != 0) { zos.write(("acounts=" + individualId + ":").getBytes()); for (String knowAllelesCompt : compt1.keySet()) { zos.write( (knowAllelesCompt + " " + nf.format(compt1.get(knowAllelesCompt) / (float) sum) + " " + compt1.get(knowAllelesCompt) + " ").getBytes()); } zos.write((alleles.size() + ";").getBytes()); } if (genotypeCounts.size() > 1) { Comparable sVariantId = markerSynonyms != null ? markerSynonyms.get(variant.getId()) : variant.getId(); warningFileWriter.write("- Dissimilar genotypes found for variant " + (sVariantId == null ? variant.getId() : sVariantId) + ", individual " + individualId + ". Exporting most frequent: " + StringUtils.join(alleles, ",") + "\n"); } } zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { // if (nProgress%5 == 0) // LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s"); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.HapMapExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); int markerCount = markerCursor.count(); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }//from w ww. j av a 2s .co m } List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); ArrayList<String> individualList = new ArrayList<String>(); for (int i = 0; i < sampleIDs.size(); i++) { Individual individual = individuals.get(i); if (!individualList.contains(individual.getId())) { individualList.add(individual.getId()); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".hapmap")); String header = "rs#" + "\t" + "alleles" + "\t" + "chrom" + "\t" + "pos" + "\t" + "strand" + "\t" + "assembly#" + "\t" + "center" + "\t" + "protLSID" + "\t" + "assayLSID" + "\t" + "panelLSID" + "\t" + "QCcode"; zos.write(header.getBytes()); for (int i = 0; i < individualList.size(); i++) { zos.write(("\t" + individualList.get(i)).getBytes()); } zos.write((LINE_SEPARATOR).getBytes()); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; while (markerCursor == null || markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet()); LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample) { Comparable variantId = variant.getId(); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(variantId); if (syn != null) variantId = syn; } boolean fIsSNP = variant.getType().equals(Type.SNP.toString()); byte[] missingGenotype = ("\t" + "NN").getBytes(); String[] chromAndPos = markerChromosomalPositions.get(variant.getId()).split(":"); zos.write(((variantId == null ? variant.getId() : variantId) + "\t" + StringUtils.join(variant.getKnownAlleleList(), "/") + "\t" + chromAndPos[0] + "\t" + Long.parseLong(chromAndPos[1]) + "\t" + "+").getBytes()); for (int j = 0; j < 6; j++) zos.write(("\t" + "NA").getBytes()); Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>(); Collection<VariantRunData> runs = variantsAndRuns.get(variant); if (runs != null) for (VariantRunData run : runs) for (Integer sampleIndex : run.getSampleGenotypes().keySet()) { SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex); String gtCode = run.getSampleGenotypes().get(sampleIndex).getCode(); String individualId = individuals .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))) .getId(); List<String> storedIndividualGenotypes = individualGenotypes.get(individualId); if (storedIndividualGenotypes == null) { storedIndividualGenotypes = new ArrayList<String>(); individualGenotypes.put(individualId, storedIndividualGenotypes); } storedIndividualGenotypes.add(gtCode); gqValueForSampleId.put(individualId, (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ)); dpValueForSampleId.put(individualId, (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP)); } int writtenGenotypeCount = 0; for (String individualId : individualList /* we use this list because it has the proper ordering */) { int individualIndex = individualList.indexOf(individualId); while (writtenGenotypeCount < individualIndex - 1) { zos.write(missingGenotype); writtenGenotypeCount++; } List<String> genotypes = individualGenotypes.get(individualId); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; if (genotypes != null) for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ Integer gqValue = gqValueForSampleId.get(individualId); if (gqValue != null && gqValue < nMinimumGenotypeQuality) continue; /* skip this sample because its GQ is under the threshold */ Integer dpValue = dpValueForSampleId.get(individualId); if (dpValue != null && dpValue < nMinimumReadDepth) continue; /* skip this sample because its DP is under the threshold */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } byte[] exportedGT = mostFrequentGenotype == null ? missingGenotype : ("\t" + StringUtils.join(variant.getAllelesFromGenotypeCode(mostFrequentGenotype), fIsSNP ? "" : "/")).getBytes(); zos.write(exportedGT); writtenGenotypeCount++; if (genotypeCounts.size() > 1) warningFileWriter.write("- Dissimilar genotypes found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". Exporting most frequent: " + new String(exportedGT) + "\n"); } while (writtenGenotypeCount < individualList.size()) { zos.write(missingGenotype); writtenGenotypeCount++; } zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { // if (nProgress%5 == 0) // LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s"); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.VcfExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { Integer projectId = null;/*w w w . j a v a 2 s .c o m*/ for (SampleId spId : sampleIDs) { if (projectId == null) projectId = spId.getProject(); else if (projectId != spId.getProject()) { projectId = 0; break; // more than one project are involved: no header will be written } } File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); int markerCount = markerCursor.count(); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); } } LinkedHashMap<SampleId, String> sampleIDToIndividualIdMap = new LinkedHashMap<SampleId, String>(); ArrayList<String> individualList = new ArrayList<String>(); List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); for (int i = 0; i < sampleIDs.size(); i++) { String individualId = individuals.get(i).getId(); sampleIDToIndividualIdMap.put(sampleIDs.get(i), individualId); if (!individualList.contains(individualId)) { individualList.add(individualId); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".vcf")); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nQueryChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; VariantContextWriter writer = null; try { List<String> distinctSequenceNames = new ArrayList<String>(); String sequenceSeqCollName = MongoTemplateManager.getMongoCollectionName(Sequence.class); if (mongoTemplate.collectionExists(sequenceSeqCollName)) { DBCursor markerCursorCopy = markerCursor.copy(); markerCursorCopy.batchSize(nQueryChunkSize); while (markerCursorCopy.hasNext()) { int nLoadedMarkerCountInLoop = 0; boolean fStartingNewChunk = true; while (markerCursorCopy.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nQueryChunkSize != 0)) { DBObject exportVariant = markerCursorCopy.next(); String chr = (String) ((DBObject) exportVariant .get(VariantData.FIELDNAME_REFERENCE_POSITION)) .get(ReferencePosition.FIELDNAME_SEQUENCE); if (!distinctSequenceNames.contains(chr)) distinctSequenceNames.add(chr); } } markerCursorCopy.close(); } Collections.sort(distinctSequenceNames, new AlphaNumericStringComparator()); SAMSequenceDictionary dict = createSAMSequenceDictionary(sModule, distinctSequenceNames); writer = new CustomVCFWriter(null, zos, dict, false, false, true); // VariantContextWriterBuilder vcwb = new VariantContextWriterBuilder(); // vcwb.unsetOption(Options.INDEX_ON_THE_FLY); // vcwb.unsetOption(Options.DO_NOT_WRITE_GENOTYPES); // vcwb.setOption(Options.USE_ASYNC_IOINDEX_ON_THE_FLY); // vcwb.setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER); // vcwb.setReferenceDictionary(dict); // writer = vcwb.build(); // writer = new AsyncVariantContextWriter(writer, 3000); progress.moveToNextStep(); // done with dictionary DBCursor headerCursor = mongoTemplate .getCollection(MongoTemplateManager.getMongoCollectionName(DBVCFHeader.class)) .find(new BasicDBObject("_id." + VcfHeaderId.FIELDNAME_PROJECT, projectId)); Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>(); boolean fWriteCommandLine = true, fWriteEngineHeaders = true; // default values while (headerCursor.hasNext()) { DBVCFHeader dbVcfHeader = DBVCFHeader.fromDBObject(headerCursor.next()); headerLines.addAll(dbVcfHeader.getHeaderLines()); // Add sequence header lines (not stored in our vcf header collection) BasicDBObject projection = new BasicDBObject(SequenceStats.FIELDNAME_SEQUENCE_LENGTH, true); int nSequenceIndex = 0; for (String sequenceName : distinctSequenceNames) { String sequenceInfoCollName = MongoTemplateManager.getMongoCollectionName(SequenceStats.class); boolean fCollectionExists = mongoTemplate.collectionExists(sequenceInfoCollName); if (fCollectionExists) { DBObject record = mongoTemplate.getCollection(sequenceInfoCollName).findOne( new Query(Criteria.where("_id").is(sequenceName)).getQueryObject(), projection); if (record == null) { LOG.warn("Sequence '" + sequenceName + "' not found in collection " + sequenceInfoCollName); continue; } Map<String, String> sequenceLineData = new LinkedHashMap<String, String>(); sequenceLineData.put("ID", (String) record.get("_id")); sequenceLineData.put("length", ((Number) record.get(SequenceStats.FIELDNAME_SEQUENCE_LENGTH)).toString()); headerLines.add(new VCFContigHeaderLine(sequenceLineData, nSequenceIndex++)); } } fWriteCommandLine = headerCursor.size() == 1 && dbVcfHeader.getWriteCommandLine(); // wouldn't make sense to include command lines for several runs if (!dbVcfHeader.getWriteEngineHeaders()) fWriteEngineHeaders = false; } headerCursor.close(); VCFHeader header = new VCFHeader(headerLines, individualList); header.setWriteCommandLine(fWriteCommandLine); header.setWriteEngineHeaders(fWriteEngineHeaders); writer.writeHeader(header); short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; HashMap<SampleId, Comparable /*phID*/> phasingIDsBySample = new HashMap<SampleId, Comparable>(); while (markerCursor.hasNext()) { if (progress.hasAborted()) return; int nLoadedMarkerCountInLoop = 0; boolean fStartingNewChunk = true; markerCursor.batchSize(nQueryChunkSize); List<Comparable> currentMarkers = new ArrayList<Comparable>(); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nQueryChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); currentMarkers.add((Comparable) exportVariant.get("_id")); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) { VariantContext vc = variant.toVariantContext(variantsAndRuns.get(variant), !ObjectId.isValid(variant.getId().toString()), sampleIDToIndividualIdMap, phasingIDsBySample, nMinimumGenotypeQuality, nMinimumReadDepth, warningFileWriter, markerSynonyms == null ? variant.getId() : markerSynonyms.get(variant.getId())); try { writer.add(vc); } catch (Throwable t) { Exception e = new Exception("Unable to convert to VariantContext: " + variant.getId(), t); LOG.debug("error", e); throw e; } if (nLoadedMarkerCountInLoop > currentMarkers.size()) LOG.error("Bug: writing variant number " + nLoadedMarkerCountInLoop + " (only " + currentMarkers.size() + " variants expected)"); } nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } progress.setCurrentStepProgress((short) 100); } catch (Exception e) { LOG.error("Error exporting", e); progress.setError(e.getMessage()); return; } finally { warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); if (writer != null) try { writer.close(); } catch (Throwable ignored) { } } }
From source file:fr.eolya.crawler.queue.mongodb.MongoDBSourceItemsQueue.java
License:Apache License
/** * Read queue state/*from ww w . j av a 2 s . c o m*/ * * starttime -> timestamp (log value) of the current crawl start * * for an item of the collection : * timestamp < starttime => not in queue * timestamp > starttime => in queue * timestamp = starttime => done * * @return last start time */ private Long readState() { // read start time BasicDBObject docsearch = new BasicDBObject(); docsearch.put("_id", new ObjectId(stateId)); DBCursor cur = coll.getColl().find(docsearch); if (cur.count() > 1) return null; if (cur.count() == 0) return null; BasicDBObject doc = (BasicDBObject) cur.next(); startTime = doc.getLong("starttime"); // read sizes if (startTime == 0 || rescan || startDepth > 0) { // TODO v4 : in fact startTime never = 0 !!! if (!rescan) { // previous crawl terminated fine if (startDepth > 0) { //String queryTimeStamp = "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}"; //String queryMode = "{\"crawl_mode\":a}"; //String query = "{\"$and\": [" + queryTimeStamp + ", " + queryMode + "]}"; String query = "{\"depth\":" + String.valueOf(startDepth) + "}"; size = count(query); } else { size = 0; } } else { // get queue size : timestamp != starttime => in queue //String query = "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}"; String queryTimeStamp = "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}"; String queryMode = "{\"crawl_mode\":\"a\"}"; String query200 = "{\"crawl_status\":200}"; String query = "{\"$and\": [" + queryTimeStamp + ", " + queryMode + ", " + query200 + "]}"; size = count(query); } doneCount = 0; } else { // previous crawl was not terminated // get queue size : timestamp > starttime => in queue String query = "{\"" + timestampFieldName + "\": {\"$gt\": " + String.valueOf(startTime) + "}}"; //docsearch = MongoDBHelper.JSON2BasicDBObject(query); //cur = coll.getColl().find(docsearch); //size = cur.size(); size = count(query); // get done count : timestamp = starttime => done query = "{\"" + timestampFieldName + "\": " + String.valueOf(startTime) + "}"; //docsearch = MongoDBHelper.JSON2BasicDBObject(query); //cur = coll.getColl().find(docsearch); //doneCount = cur.size(); doneCount = count(query); } return startTime; }
From source file:fr.eolya.crawler.queue.mongodb.MongoDBSourceItemsQueue.java
License:Apache License
public int getCurrentMaxDepth() { //String queryTimeStamp = "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}"; //String queryMode = "{\"crawl_mode\":a}"; //String query = "{\"$and\": [" + queryTimeStamp + ", " + queryMode + "]}"; String query1 = "{\"crawl_mode\":\"a\"}"; String query2 = "{\"crawl_status\":200}"; String query = "{\"$and\": [" + query1 + ", " + query2 + "]}"; BasicDBObject docsearch = new BasicDBObject(); docsearch = MongoDBHelper.JSON2BasicDBObject(query); DBCursor cur = coll.getColl().find(docsearch).sort(new BasicDBObject("depth", -1)); if (cur.count() == 0) return 0; BasicDBObject doc = (BasicDBObject) cur.next(); return doc.getInt("depth"); }
From source file:fr.eolya.crawler.queue.mongodb.MongoDBSourceItemsQueue.java
License:Apache License
/** * Push a new item/*w w w .j a v a 2s . co m*/ * * @return success or not */ public boolean push(Map<String, Object> item) throws QueueIncoherenceException, QueueInvalidDataException { boolean ret = true; BasicDBObject doc = new BasicDBObject(item); String keyValue = doc.getString(uniqueKeyFieldName); String depth = doc.getString(depthFieldName); String sourceId = doc.getString(sourceIdFieldName); if (sourceId == null || keyValue == null || depth == null) throw new QueueInvalidDataException("Missing fields in json"); if (Integer.parseInt(sourceId) != this.sourceId) throw new QueueInvalidDataException("Invalid source id in json"); String referer = doc.getString(refererFieldName); // Get existing item in queue String currentDepth = null; String currentReferers = null; long currentTimestamp = 0; BasicDBObject docsearch = new BasicDBObject(); docsearch.put(sourceIdFieldName, Integer.parseInt(sourceId)); docsearch.put(hashFieldName, keyValue.hashCode()); synchronized (collMonitor) { BasicDBObject curDoc = null; DBCursor cur = coll.getColl().find(docsearch); if (cur.count() > 0) { while (cur.hasNext() && curDoc == null) { curDoc = (BasicDBObject) cur.next(); if (!keyValue.equals(doc.getString(uniqueKeyFieldName))) { curDoc = null; } } if (curDoc != null) { currentDepth = curDoc.getString(depthFieldName); currentReferers = curDoc.getString(referersFieldName); currentTimestamp = curDoc.getLong(timestampFieldName); /* * Remember : for an item of the collection : * timestamp < starttime => not in queue * timestamp > starttime => in queue * timestamp = starttime => done */ if ((Long.parseLong(depth) >= Long.parseLong(currentDepth)) && (currentTimestamp >= startTime)) return false; } } // build new doc doc.put(hashFieldName, keyValue.hashCode()); doc.put(timestampFieldName, new Date().getTime()); if (referer != null) { if (currentReferers == null) { currentReferers = referer; } else { currentReferers += "/n" + referer; } } if (currentReferers != null) { doc.put(referersFieldName, currentReferers); } if (curDoc != null) { doc.put("content_type", curDoc.get("content_type")); doc.put("crawl_last_time", curDoc.get("crawl_last_time")); doc.put("condget_last_modified", curDoc.get("condget_last_modified")); doc.put("condget_etag", curDoc.get("condget_etag")); coll.update(curDoc, doc); // TODO : decrease done size in some case ??? } else { doc.put(createdFieldName, new Date().getTime()); coll.add(doc); } size++; return ret; } }