List of usage examples for com.mongodb DBCursor batchSize
public DBCursor batchSize(final int numberOfElements)
Limits the number of elements returned in one batch.
From source file:com.edgytech.umongo.CollectionPanel.java
License:Apache License
private void exportToFile(final DBCollection col, final DBObject query, final DBObject fields, final DBObject sort, final int skip, final int limit, final int batchSize) { ExportDialog dia = UMongo.instance.getGlobalStore().getExportDialog(); if (!dia.show()) { return;// ww w .j a v a 2 s .c o m } final DocumentSerializer ds = dia.getDocumentSerializer(); final boolean continueOnError = dia.getBooleanFieldValue(ExportDialog.Item.continueOnError); new DbJob() { @Override public Object doRun() throws Exception { try { try { DBCursor cur = col.find(query, fields); if (skip > 0) { cur.skip(skip); } if (batchSize != 0) { cur.batchSize(batchSize); } if (sort != null) { cur.sort(sort); } if (limit > 0) { cur.limit(limit); } while (cur.hasNext() && !stopped) { ds.writeObject(cur.next()); } } catch (Exception e) { if (continueOnError) { getLogger().log(Level.WARNING, null, e); } else { throw e; } } } finally { ds.close(); } return null; } @Override public String getNS() { return col.getFullName(); } @Override public String getShortName() { return "Export"; } }.addJob(); }
From source file:com.eharmony.matching.seeking.executor.mongodb.MongoQueryExecutor.java
License:Apache License
/**
 * Wraps a cursor in a {@code MongoResults}, applying this executor's batch size first
 * when it differs from {@code DEFAULT_MONGODB_BATCH_SIZE}.
 *
 * @param cursor cursor produced for the query
 * @param query  query whose return type is handed to the result wrapper
 * @return results backed by the (possibly batch-size-adjusted) cursor
 */
private <T, R> MongoResults<R> fetch(DBCursor cursor, Query<T, R> query) {
    final boolean usesCustomBatchSize = batchSize != DEFAULT_MONGODB_BATCH_SIZE;
    final DBCursor effectiveCursor = usesCustomBatchSize ? cursor.batchSize(batchSize) : cursor;
    return new MongoResults<R>(effectiveCursor, mapper, query.getReturnType(), cache);
}
From source file:eu.artist.postmigration.nfrvt.strategy.benchmark.AvailabilityCalculator.java
License:Open Source License
public static MultipleDCResults calculateAvailability(int DefinedQuantumofTimeInSeconds, int startYear, int startMonth, int startDay, int stopYear, int stopMonth, int stopDay) {// (Date thisDate, double OVERALL_MONTH_INTERVAL_SECONDS = 0; ///*from ww w.j a v a2 s . co m*/ try { Properties propertiesFile = BenchmarkConstants.getProperties(); String databaseIP = propertiesFile.getProperty("3alibIP"); MultipleDCResults response = new MultipleDCResults(); Mongo mongoClient; System.out.println("DB NoSQL:" + databaseIP); mongoClient = new Mongo(databaseIP); DB db = mongoClient.getDB("3alib"); System.out.println("Host address:" + databaseIP); DBCollection coll = db.getCollection("log_samples"); Date date = new Date(); Calendar calendarFrom = Calendar.getInstance(); calendarFrom.setTime(date); calendarFrom.set(startYear, startMonth - 1, startDay, 0, 0, 0); Date dateFrom = calendarFrom.getTime(); Calendar calendarTo = Calendar.getInstance(); calendarTo.setTime(date); calendarTo.set(stopYear, stopMonth - 1, stopDay, 23, 59, 59); Date dateTo = calendarTo.getTime(); System.out.println("Date beginning:" + dateFrom.toString()); System.out.println("Date ending:" + dateTo.toString()); ObjectId from = new ObjectId(dateFrom); ObjectId to = new ObjectId(dateTo); List<?> distinctTemplates = coll.distinct("location.parent.id");//distinct("imageId"); for (int i = 0; i < distinctTemplates.size(); i++) { String index = "-1"; System.out.println("Distinct Region IDs:" + distinctTemplates.get(i).toString()); // query based on date to filter needed month BasicDBObject query = new BasicDBObject("_id", new BasicDBObject("$gte", from).append("$lte", to)) .append("location.parent.id", distinctTemplates.get(i).toString()); DBCursor cursor = coll.find(query); cursor.addOption(com.mongodb.Bytes.QUERYOPTION_NOTIMEOUT); cursor.batchSize(100); try { long startID = 0; long stopID = 0; long diffSeconds = 0; double PREDEFINED_LOGICAL_SAMPLE_INTERVAL_IN_SECONDS = 500;//interval in which we would logically 
//have at least one sample if the daemon is running DBObject thisObject = cursor.next(); System.out.println("First object:" + thisObject.toString()); int cursorCount = cursor.count(); System.out.println("Cursor count:" + cursor.count()); DBObject previousObject = thisObject; int k = 0; while (k < (cursorCount + 1)) { if ((k % 1000) == 0) { System.out.println("Progress:" + k + " from " + cursorCount + " overall records"); } if (((thisObject.get("reachability")).equals("UNREACHABLE")) && index.equals("-1")) { //if it is the first unavailable sample System.out.println("Changing index to 1..."); startID = ((ObjectId) thisObject.get("_id")).getTime();//this line's id System.out.println("StartID is: " + startID); index = "1"; } if (((thisObject.get("reachability")).equals("UNREACHABLE")) && (!(index.equals("-1")))) { long gapstopID = ((ObjectId) thisObject.get("_id")).getTime(); long gapstartID = ((ObjectId) previousObject.get("_id")).getTime(); long GapdiffSeconds = (gapstopID - gapstartID) / 1000; // 60; if (GapdiffSeconds > PREDEFINED_LOGICAL_SAMPLE_INTERVAL_IN_SECONDS) { System.out.println("Found gap..."); stopID = ((ObjectId) previousObject.get("_id")).getTime();//this line's id to end interval System.out.println("StopID is previous: " + stopID); diffSeconds = (stopID - startID) / 1000; if (diffSeconds > DefinedQuantumofTimeInSeconds) { OVERALL_MONTH_INTERVAL_SECONDS = OVERALL_MONTH_INTERVAL_SECONDS + diffSeconds; System.out.println("Overall month interval in seconds now:" + OVERALL_MONTH_INTERVAL_SECONDS); } startID = ((ObjectId) thisObject.get("_id")).getTime();//this line's id } else { //standard logic to cover generic case of consecutive unavailable samples } } if (((((thisObject.get("reachability")).equals("REACHABLE")) || (!(cursor.hasNext())))) && (!(index.equals("-1")))) { if (!(cursor.hasNext())) { System.out.println("FINAL ELEMENT REACHED"); } stopID = ((ObjectId) previousObject.get("_id")).getTime(); diffSeconds = (stopID - startID) / 1000; // 60; if 
(diffSeconds > DefinedQuantumofTimeInSeconds) { OVERALL_MONTH_INTERVAL_SECONDS = OVERALL_MONTH_INTERVAL_SECONDS + diffSeconds; System.out.println( "Overall month interval in seconds now:" + OVERALL_MONTH_INTERVAL_SECONDS); } System.out.println("Resetting index to -1..."); index = "-1"; } if ((cursor.hasNext())) { previousObject = thisObject; thisObject = cursor.next(); } k++; } System.out.println("Final Overall month unavailable interval in seconds now:" + OVERALL_MONTH_INTERVAL_SECONDS); double OverallUnavailableIntervalInMinutes = OVERALL_MONTH_INTERVAL_SECONDS / 60; System.out .println("OverallUnavailableIntervalInMinutes:" + OverallUnavailableIntervalInMinutes); double OverallIntervalInSeconds = (dateTo.getTime() - dateFrom.getTime()) / 1000; double OverallIntervalInMinutes = OverallIntervalInSeconds / 60; double finalAvailabilityPercentage = 100.0 * ((OverallIntervalInMinutes - OverallUnavailableIntervalInMinutes) / OverallIntervalInMinutes); double downtimeInPercent = 100.0 - finalAvailabilityPercentage; response.DC.add(distinctTemplates.get(i).toString()); response.availability.add((Double) downtimeInPercent); System.out.println( "Final percentage of availability based on provider definition in the given interval:" + finalAvailabilityPercentage); } catch (NoSuchElementException e2) { System.out.println("No available data for this period..."); } catch (Exception e1) { e1.printStackTrace(); } finally { cursor.close(); } } return response; } catch (UnknownHostException e) { e.printStackTrace(); return null; } catch (MongoException e) { System.out.println("No available data for this period..."); return null; } }
From source file:fr.cirad.mgdb.exporting.individualoriented.DARwinExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles, boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); GenotypingProject aProject = mongoTemplate.findOne( new Query(Criteria.where(GenotypingProject.FIELDNAME_PLOIDY_LEVEL).exists(true)), GenotypingProject.class); if (aProject == null) LOG.warn("Unable to find a project containing ploidy level information! Assuming ploidy level is 2."); int ploidy = aProject == null ? 2 : aProject.getPloidyLevel(); File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); int markerCount = markerCursor.count(); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }// ww w . 
j a va 2s .co m } String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size() + "individuals"; StringBuffer donFileContents = new StringBuffer( "@DARwin 5.0 - DON -" + LINE_SEPARATOR + individualExportFiles.size() + "\t" + 1 + LINE_SEPARATOR + "N" + "\t" + "individual" + LINE_SEPARATOR); int count = 0; String missingGenotype = ""; for (int j = 0; j < ploidy; j++) missingGenotype += "\tN"; zos.putNextEntry(new ZipEntry(exportName + ".var")); zos.write(("@DARwin 5.0 - ALLELIC - " + ploidy + LINE_SEPARATOR + individualExportFiles.size() + "\t" + markerCount * ploidy + LINE_SEPARATOR + "N").getBytes()); DBCursor markerCursorCopy = markerCursor.copy(); // dunno how expensive this is, but seems safer than keeping all IDs in memory at any time short nProgress = 0, nPreviousProgress = 0; int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; markerCursorCopy.batchSize(nChunkSize); int nMarkerIndex = 0; while (markerCursorCopy.hasNext()) { DBObject exportVariant = markerCursorCopy.next(); Comparable markerId = (Comparable) exportVariant.get("_id"); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(markerId); if (syn != null) markerId = syn; } for (int j = 0; j < ploidy; j++) zos.write(("\t" + markerId).getBytes()); } TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>(); ArrayList<String> distinctAlleles = new ArrayList<String>(); // the index of each allele will be used as its code int i = 0; for (File f : individualExportFiles) { BufferedReader in = new BufferedReader(new FileReader(f)); try { String individualId, line = in.readLine(); // read sample id if (line != null) individualId = line; else throw new Exception("Unable to read first line of temp export file " + f.getName()); donFileContents.append(++count + "\t" + individualId + 
LINE_SEPARATOR); zos.write((LINE_SEPARATOR + count).getBytes()); nMarkerIndex = 0; while ((line = in.readLine()) != null) { List<String> genotypes = MgdbDao.split(line, "|"); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } if (genotypeCounts.size() > 1) { warningFileWriter.write("- Dissimilar genotypes found for variant __" + nMarkerIndex + "__, individual " + individualId + ". Exporting most frequent: " + mostFrequentGenotype + "\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } String codedGenotype = ""; if (mostFrequentGenotype != null) for (String allele : mostFrequentGenotype.split(" ")) { if (!distinctAlleles.contains(allele)) distinctAlleles.add(allele); codedGenotype += "\t" + distinctAlleles.indexOf(allele); } else codedGenotype = missingGenotype.replaceAll("N", "-1"); // missing data is coded as -1 zos.write(codedGenotype.getBytes()); nMarkerIndex++; } } catch (Exception e) { LOG.error("Error exporting data", e); progress.setError("Error exporting data: " + e.getClass().getSimpleName() + (e.getMessage() != null ? 
" - " + e.getMessage() : "")); return; } finally { in.close(); } if (progress.hasAborted()) return; nProgress = (short) (++i * 100 / individualExportFiles.size()); if (nProgress > nPreviousProgress) { // LOG.debug("============= doDARwinExport (" + i + "): " + nProgress + "% ============="); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } if (!f.delete()) { f.deleteOnExit(); LOG.info("Unable to delete tmp export file " + f.getAbsolutePath()); } } zos.putNextEntry(new ZipEntry(exportName + ".don")); zos.write(donFileContents.toString().getBytes()); // now read variant names for those that induced warnings nMarkerIndex = 0; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext()) { DBObject exportVariant = markerCursor.next(); if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) { Comparable markerId = (Comparable) exportVariant.get("_id"); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(markerId); if (syn != null) markerId = syn; } for (int j = 0; j < ploidy; j++) zos.write(("\t" + markerId).getBytes()); problematicMarkerIndexToNameMap.put(nMarkerIndex, markerId); } } warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet()) sLine = sLine.replaceAll("__" + aMarkerIndex + "__", problematicMarkerIndexToNameMap.get(aMarkerIndex).toString()); zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.individualoriented.PLinkExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles, boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles) throws Exception { File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }// www . ja v a2s.co m } MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); int markerCount = markerCursor.count(); String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".ped")); TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>(); short nProgress = 0, nPreviousProgress = 0; int i = 0; for (File f : individualExportFiles) { BufferedReader in = new BufferedReader(new FileReader(f)); try { String individualId, line = in.readLine(); // read sample id if (line != null) { individualId = line; String population = getIndividualPopulation(sModule, line); String individualInfo = (population == null ? "." 
: population) + " " + individualId; zos.write((individualInfo + " 0 0 0 " + getIndividualGenderCode(sModule, individualId)) .getBytes()); } else throw new Exception("Unable to read first line of temp export file " + f.getName()); int nMarkerIndex = 0; while ((line = in.readLine()) != null) { List<String> genotypes = MgdbDao.split(line, "|"); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } if (genotypeCounts.size() > 1) { warningFileWriter.write("- Dissimilar genotypes found for variant " + nMarkerIndex + ", individual " + individualId + ". Exporting most frequent: " + mostFrequentGenotype + "\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } String[] alleles = mostFrequentGenotype == null ? new String[0] : mostFrequentGenotype.split(" "); if (alleles.length > 2) { warningFileWriter.write("- More than 2 alleles found for variant " + nMarkerIndex + ", individual " + individualId + ". Exporting only the first 2 alleles.\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } String all1 = alleles.length == 0 ? "0" : alleles[0]; String all2 = alleles.length == 0 ? "0" : alleles[alleles.length == 1 ? 0 : 1]; if (all1.length() != 1 || all2.length() != 1) { warningFileWriter .write("- SNP expected, but alleles are not coded on a single char for variant " + nMarkerIndex + ", individual " + individualId + ". 
Ignoring this genotype.\n"); problematicMarkerIndexToNameMap.put(nMarkerIndex, ""); } else zos.write((" " + all1 + " " + all2).getBytes()); nMarkerIndex++; } } catch (Exception e) { LOG.error("Error exporting data", e); progress.setError("Error exporting data: " + e.getClass().getSimpleName() + (e.getMessage() != null ? " - " + e.getMessage() : "")); return; } finally { in.close(); } if (progress.hasAborted()) return; nProgress = (short) (++i * 100 / individualExportFiles.size()); if (nProgress > nPreviousProgress) { progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } zos.write('\n'); if (!f.delete()) { f.deleteOnExit(); LOG.info("Unable to delete tmp export file " + f.getAbsolutePath()); } } warningFileWriter.close(); zos.putNextEntry(new ZipEntry(exportName + ".map")); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; markerCursor.batchSize(nChunkSize); int nMarkerIndex = 0; while (markerCursor.hasNext()) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); Comparable markerId = (Comparable) exportVariant.get("_id"); String chrom = (String) refPos.get(ReferencePosition.FIELDNAME_SEQUENCE); Long pos = ((Number) refPos.get(ReferencePosition.FIELDNAME_START_SITE)).longValue(); if (chrom == null) LOG.warn("Chromosomal position not found for marker " + markerId); Comparable exportedId = markerSynonyms == null ? markerId : markerSynonyms.get(markerId); zos.write(((chrom == null ? "0" : chrom) + " " + exportedId + " " + 0 + " " + (pos == null ? 
0 : pos) + LINE_SEPARATOR).getBytes()); if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) { // we are going to need this marker's name for the warning file Comparable variantName = markerId; if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(markerId); if (syn != null) variantName = syn; } problematicMarkerIndexToNameMap.put(nMarkerIndex, variantName); } nMarkerIndex++; } if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet()) sLine = sLine.replaceAll("__" + aMarkerIndex + "__", problematicMarkerIndexToNameMap.get(aMarkerIndex).toString()); zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.BEDExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }/*from w w w . j ava 2 s .co m*/ } int markerCount = markerCursor.count(); List<String> selectedIndividualList = new ArrayList<String>(); for (Individual ind : getIndividualsFromSamples(sModule, sampleIDs)) selectedIndividualList.add(ind.getId()); String exportName = sModule + "_" + markerCount + "variants_" + selectedIndividualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".bed")); short nProgress = 0, nPreviousProgress = 0; int nChunkSize = Math.min(2000, markerCount), nLoadedMarkerCount = 0; while (markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + 
refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } for (Comparable variantId : markerChromosomalPositions.keySet()) // read data and write results into temporary files (one per sample) { String[] chromAndPos = markerChromosomalPositions.get(variantId).split(":"); zos.write((chromAndPos[0] + "\t" + (Long.parseLong(chromAndPos[1]) - 1) + "\t" + (Long.parseLong(chromAndPos[1]) - 1) + "\t" + variantId + "\t" + "0" + "\t" + "+") .getBytes()); zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.EigenstratExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { // long before = System.currentTimeMillis(); File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); File snpFile = null;//from ww w . j a v a 2 s . co m try { snpFile = File.createTempFile("snpFile", ""); FileWriter snpFileWriter = new FileWriter(snpFile); ZipOutputStream zos = new ZipOutputStream(outputStream); if (ByteArrayOutputStream.class.isAssignableFrom(outputStream.getClass())) zos.setLevel(ZipOutputStream.STORED); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); } } MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); int markerCount = markerCursor.count(); List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); ArrayList<String> individualList = new ArrayList<String>(); StringBuffer indFileContents = new StringBuffer(); for (int i = 0; i < sampleIDs.size(); i++) { Individual individual = individuals.get(i); if (!individualList.contains(individual.getId())) { individualList.add(individual.getId()); indFileContents .append(individual.getId() + "\t" + getIndividualGenderCode(sModule, individual.getId()) + "\t" + (individual.getPopulation() == null ? "." 
: individual.getPopulation()) + LINE_SEPARATOR); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".ind")); zos.write(indFileContents.toString().getBytes()); zos.putNextEntry(new ZipEntry(exportName + ".eigenstratgeno")); int avgObjSize = (Integer) mongoTemplate .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats() .get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; while (markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet()); LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." 
+ ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample) { Comparable variantId = variant.getId(); List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":"); if (chromAndPos.size() == 0) LOG.warn("Chromosomal position not found for marker " + variantId); // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(variantId); if (syn != null) variantId = syn; } snpFileWriter.write(variantId + "\t" + (chromAndPos.size() == 0 ? "0" : chromAndPos.get(0)) + "\t" + 0 + "\t" + (chromAndPos.size() == 0 ? 0l : Long.parseLong(chromAndPos.get(1))) + LINE_SEPARATOR); Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>(); Collection<VariantRunData> runs = variantsAndRuns.get(variant); if (runs != null) for (VariantRunData run : runs) for (Integer sampleIndex : run.getSampleGenotypes().keySet()) { SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex); String individualId = individuals .get(sampleIDs .indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))) .getId(); Integer gq = null; try { gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ); } catch (Exception ignored) { } if (gq != null && gq < nMinimumGenotypeQuality) continue; Integer dp = null; try { dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP); } catch (Exception ignored) { } if (dp != null && dp < nMinimumReadDepth) continue; String gtCode = sampleGenotype.getCode(); List<String> storedIndividualGenotypes = individualGenotypes.get(individualId); if (storedIndividualGenotypes == null) { storedIndividualGenotypes = new 
ArrayList<String>(); individualGenotypes.put(individualId, storedIndividualGenotypes); } storedIndividualGenotypes.add(gtCode); } for (int j = 0; j < individualList .size(); j++ /* we use this list because it has the proper ordering*/) { String individualId = individualList.get(j); List<String> genotypes = individualGenotypes.get(individualId); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = null; if (genotypes != null) for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>() : variant.getAllelesFromGenotypeCode(mostFrequentGenotype); int nOutputCode = 0; if (mostFrequentGenotype == null) nOutputCode = 9; else for (String all : Helper.split(mostFrequentGenotype, "/")) if ("0".equals(all)) nOutputCode++; if (j == 0 && variant.getKnownAlleleList().size() > 2) warningFileWriter.write("- Variant " + variant.getId() + " is multi-allelic. Make sure Eigenstrat genotype encoding specifications are suitable for you.\n"); zos.write(("" + nOutputCode).getBytes()); if (genotypeCounts.size() > 1 || alleles.size() > 2) { if (genotypeCounts.size() > 1) warningFileWriter.write("- Dissimilar genotypes found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". Exporting most frequent: " + nOutputCode + "\n"); if (alleles.size() > 2) warningFileWriter.write("- More than 2 alleles found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". 
Exporting only the first 2 alleles.\n"); } } zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { // if (nProgress%5 == 0) // LOG.info("============= exportData: " + nProgress + "% =============" + (System.currentTimeMillis() - before)/1000 + "s"); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } snpFileWriter.close(); zos.putNextEntry(new ZipEntry(exportName + ".snp")); BufferedReader in = new BufferedReader(new FileReader(snpFile)); String sLine; while ((sLine = in.readLine()) != null) zos.write((sLine + "\n").getBytes()); in.close(); warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; in = new BufferedReader(new FileReader(warningFile)); while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); } finally { if (snpFile != null && snpFile.exists()) snpFile.delete(); } }
From source file:fr.cirad.mgdb.exporting.markeroriented.GFFExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }//from www .j av a 2 s. c o m } File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); int markerCount = markerCursor.count(); List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); ArrayList<String> individualList = new ArrayList<String>(); for (int i = 0; i < sampleIDs.size(); i++) { Individual individual = individuals.get(i); if (!individualList.contains(individual.getId())) { individualList.add(individual.getId()); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".gff3")); String header = "##gff-version 3" + LINE_SEPARATOR; zos.write(header.getBytes()); TreeMap<String, String> typeToOntology = new TreeMap<String, String>(); typeToOntology.put(Type.SNP.toString(), "SO:0000694"); typeToOntology.put(Type.INDEL.toString(), "SO:1000032"); typeToOntology.put(Type.MIXED.toString(), "SO:0001059"); typeToOntology.put(Type.SYMBOLIC.toString(), "SO:0000109"); typeToOntology.put(Type.MNP.toString(), "SO:0001059"); int avgObjSize = (Integer) mongoTemplate 
.getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; while (markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet()); LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." 
+ ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample) { Comparable variantId = variant.getId(); List<String> variantDataOrigin = new ArrayList<String>(); Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>(); List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":"); if (chromAndPos.size() == 0) LOG.warn("Chromosomal position not found for marker " + variantId); // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(variantId); if (syn != null) variantId = syn; } Collection<VariantRunData> runs = variantsAndRuns.get(variant); if (runs != null) for (VariantRunData run : runs) for (Integer sampleIndex : run.getSampleGenotypes().keySet()) { SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex); String individualId = individuals .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))) .getId(); Integer gq = null; try { gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ); } catch (Exception ignored) { } if (gq != null && gq < nMinimumGenotypeQuality) continue; Integer dp = null; try { dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP); } catch (Exception ignored) { } if (dp != null && dp < nMinimumReadDepth) continue; String gtCode = sampleGenotype.getCode(); List<String> storedIndividualGenotypes = individualGenotypes.get(individualId); if (storedIndividualGenotypes == null) { storedIndividualGenotypes = new 
ArrayList<String>(); individualGenotypes.put(individualId, storedIndividualGenotypes); } storedIndividualGenotypes.add(gtCode); } zos.write((chromAndPos.get(0) + "\t" + StringUtils.join(variantDataOrigin, ";") /*source*/ + "\t" + typeToOntology.get(variant.getType()) + "\t" + Long.parseLong(chromAndPos.get(1)) + "\t" + Long.parseLong(chromAndPos.get(1)) + "\t" + "." + "\t" + "+" + "\t" + "." + "\t") .getBytes()); Comparable syn = markerSynonyms == null ? null : markerSynonyms.get(variant.getId()); zos.write(("ID=" + variant.getId() + ";" + (syn != null ? "Name=" + syn + ";" : "") + "alleles=" + StringUtils.join(variant.getKnownAlleleList(), "/") + ";" + "refallele=" + variant.getKnownAlleleList().get(0) + ";").getBytes()); for (int j = 0; j < individualList .size(); j++ /* we use this list because it has the proper ordering*/) { NumberFormat nf = NumberFormat.getInstance(Locale.US); nf.setMaximumFractionDigits(4); HashMap<String, Integer> compt1 = new HashMap<String, Integer>(); int highestGenotypeCount = 0; int sum = 0; String individualId = individualList.get(j); List<String> genotypes = individualGenotypes.get(individualId); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes String mostFrequentGenotype = null; if (genotypes != null) for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ int count = 0; for (String t : variant.getAllelesFromGenotypeCode(genotype)) { for (String t1 : variant.getKnownAlleleList()) { if (t.equals(t1) && !(compt1.containsKey(t1))) { count++; compt1.put(t1, count); } else if (t.equals(t1) && compt1.containsKey(t1)) { if (compt1.get(t1) != 0) { count++; compt1.put(t1, count); } else compt1.put(t1, count); } else if (!(compt1.containsKey(t1))) { compt1.put(t1, 0); } } } for (int countValue : compt1.values()) { sum += countValue; } int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > 
highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>() : variant.getAllelesFromGenotypeCode(mostFrequentGenotype); if (alleles.size() != 0) { zos.write(("acounts=" + individualId + ":").getBytes()); for (String knowAllelesCompt : compt1.keySet()) { zos.write( (knowAllelesCompt + " " + nf.format(compt1.get(knowAllelesCompt) / (float) sum) + " " + compt1.get(knowAllelesCompt) + " ").getBytes()); } zos.write((alleles.size() + ";").getBytes()); } if (genotypeCounts.size() > 1) { Comparable sVariantId = markerSynonyms != null ? markerSynonyms.get(variant.getId()) : variant.getId(); warningFileWriter.write("- Dissimilar genotypes found for variant " + (sVariantId == null ? variant.getId() : sVariantId) + ", individual " + individualId + ". Exporting most frequent: " + StringUtils.join(alleles, ",") + "\n"); } } zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { // if (nProgress%5 == 0) // LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s"); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.HapMapExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); int markerCount = markerCursor.count(); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); }// w w w.ja v a 2 s .c o m } List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); ArrayList<String> individualList = new ArrayList<String>(); for (int i = 0; i < sampleIDs.size(); i++) { Individual individual = individuals.get(i); if (!individualList.contains(individual.getId())) { individualList.add(individual.getId()); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".hapmap")); String header = "rs#" + "\t" + "alleles" + "\t" + "chrom" + "\t" + "pos" + "\t" + "strand" + "\t" + "assembly#" + "\t" + "center" + "\t" + "protLSID" + "\t" + "assayLSID" + "\t" + "panelLSID" + "\t" + "QCcode"; zos.write(header.getBytes()); for (int i = 0; i < individualList.size(); i++) { zos.write(("\t" + individualList.get(i)).getBytes()); } zos.write((LINE_SEPARATOR).getBytes()); int avgObjSize = (Integer) mongoTemplate 
.getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; while (markerCursor == null || markerCursor.hasNext()) { int nLoadedMarkerCountInLoop = 0; Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>(); boolean fStartingNewChunk = true; markerCursor.batchSize(nChunkSize); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION); markerChromosomalPositions.put((Comparable) exportVariant.get("_id"), refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":" + refPos.get(ReferencePosition.FIELDNAME_START_SITE)); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet()); LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample) { Comparable variantId = variant.getId(); if (markerSynonyms != null) { Comparable syn = markerSynonyms.get(variantId); if (syn != null) variantId = syn; } boolean fIsSNP = variant.getType().equals(Type.SNP.toString()); byte[] missingGenotype = ("\t" + "NN").getBytes(); String[] chromAndPos = markerChromosomalPositions.get(variant.getId()).split(":"); zos.write(((variantId == null ? 
variant.getId() : variantId) + "\t" + StringUtils.join(variant.getKnownAlleleList(), "/") + "\t" + chromAndPos[0] + "\t" + Long.parseLong(chromAndPos[1]) + "\t" + "+").getBytes()); for (int j = 0; j < 6; j++) zos.write(("\t" + "NA").getBytes()); Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>(); Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>(); Collection<VariantRunData> runs = variantsAndRuns.get(variant); if (runs != null) for (VariantRunData run : runs) for (Integer sampleIndex : run.getSampleGenotypes().keySet()) { SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex); String gtCode = run.getSampleGenotypes().get(sampleIndex).getCode(); String individualId = individuals .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex))) .getId(); List<String> storedIndividualGenotypes = individualGenotypes.get(individualId); if (storedIndividualGenotypes == null) { storedIndividualGenotypes = new ArrayList<String>(); individualGenotypes.put(individualId, storedIndividualGenotypes); } storedIndividualGenotypes.add(gtCode); gqValueForSampleId.put(individualId, (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ)); dpValueForSampleId.put(individualId, (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP)); } int writtenGenotypeCount = 0; for (String individualId : individualList /* we use this list because it has the proper ordering */) { int individualIndex = individualList.indexOf(individualId); while (writtenGenotypeCount < individualIndex - 1) { zos.write(missingGenotype); writtenGenotypeCount++; } List<String> genotypes = individualGenotypes.get(individualId); HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes int highestGenotypeCount = 0; String mostFrequentGenotype = 
null; if (genotypes != null) for (String genotype : genotypes) { if (genotype.length() == 0) continue; /* skip missing genotypes */ Integer gqValue = gqValueForSampleId.get(individualId); if (gqValue != null && gqValue < nMinimumGenotypeQuality) continue; /* skip this sample because its GQ is under the threshold */ Integer dpValue = dpValueForSampleId.get(individualId); if (dpValue != null && dpValue < nMinimumReadDepth) continue; /* skip this sample because its DP is under the threshold */ int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype); if (gtCount > highestGenotypeCount) { highestGenotypeCount = gtCount; mostFrequentGenotype = genotype; } genotypeCounts.put(genotype, gtCount); } byte[] exportedGT = mostFrequentGenotype == null ? missingGenotype : ("\t" + StringUtils.join(variant.getAllelesFromGenotypeCode(mostFrequentGenotype), fIsSNP ? "" : "/")).getBytes(); zos.write(exportedGT); writtenGenotypeCount++; if (genotypeCounts.size() > 1) warningFileWriter.write("- Dissimilar genotypes found for variant " + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId + ". 
Exporting most frequent: " + new String(exportedGT) + "\n"); } while (writtenGenotypeCount < individualList.size()) { zos.write(missingGenotype); writtenGenotypeCount++; } zos.write((LINE_SEPARATOR).getBytes()); } if (progress.hasAborted()) return; nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { // if (nProgress%5 == 0) // LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s"); progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); in.readLine(); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); zos.close(); progress.setCurrentStepProgress((short) 100); }
From source file:fr.cirad.mgdb.exporting.markeroriented.VcfExportHandler.java
License:Open Source License
@Override public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs, ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms, int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles) throws Exception { Integer projectId = null;/*from w w w . j av a 2 s . co m*/ for (SampleId spId : sampleIDs) { if (projectId == null) projectId = spId.getProject(); else if (projectId != spId.getProject()) { projectId = 0; break; // more than one project are involved: no header will be written } } File warningFile = File.createTempFile("export_warnings_", ""); FileWriter warningFileWriter = new FileWriter(warningFile); MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule); int markerCount = markerCursor.count(); ZipOutputStream zos = new ZipOutputStream(outputStream); if (readyToExportFiles != null) for (String readyToExportFile : readyToExportFiles.keySet()) { zos.putNextEntry(new ZipEntry(readyToExportFile)); InputStream inputStream = readyToExportFiles.get(readyToExportFile); byte[] dataBlock = new byte[1024]; int count = inputStream.read(dataBlock, 0, 1024); while (count != -1) { zos.write(dataBlock, 0, count); count = inputStream.read(dataBlock, 0, 1024); } } LinkedHashMap<SampleId, String> sampleIDToIndividualIdMap = new LinkedHashMap<SampleId, String>(); ArrayList<String> individualList = new ArrayList<String>(); List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs); for (int i = 0; i < sampleIDs.size(); i++) { String individualId = individuals.get(i).getId(); sampleIDToIndividualIdMap.put(sampleIDs.get(i), individualId); if (!individualList.contains(individualId)) { individualList.add(individualId); } } String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals"; zos.putNextEntry(new ZipEntry(exportName + ".vcf")); int avgObjSize = (Integer) mongoTemplate 
.getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize"); int nQueryChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize; VariantContextWriter writer = null; try { List<String> distinctSequenceNames = new ArrayList<String>(); String sequenceSeqCollName = MongoTemplateManager.getMongoCollectionName(Sequence.class); if (mongoTemplate.collectionExists(sequenceSeqCollName)) { DBCursor markerCursorCopy = markerCursor.copy(); markerCursorCopy.batchSize(nQueryChunkSize); while (markerCursorCopy.hasNext()) { int nLoadedMarkerCountInLoop = 0; boolean fStartingNewChunk = true; while (markerCursorCopy.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nQueryChunkSize != 0)) { DBObject exportVariant = markerCursorCopy.next(); String chr = (String) ((DBObject) exportVariant .get(VariantData.FIELDNAME_REFERENCE_POSITION)) .get(ReferencePosition.FIELDNAME_SEQUENCE); if (!distinctSequenceNames.contains(chr)) distinctSequenceNames.add(chr); } } markerCursorCopy.close(); } Collections.sort(distinctSequenceNames, new AlphaNumericStringComparator()); SAMSequenceDictionary dict = createSAMSequenceDictionary(sModule, distinctSequenceNames); writer = new CustomVCFWriter(null, zos, dict, false, false, true); // VariantContextWriterBuilder vcwb = new VariantContextWriterBuilder(); // vcwb.unsetOption(Options.INDEX_ON_THE_FLY); // vcwb.unsetOption(Options.DO_NOT_WRITE_GENOTYPES); // vcwb.setOption(Options.USE_ASYNC_IOINDEX_ON_THE_FLY); // vcwb.setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER); // vcwb.setReferenceDictionary(dict); // writer = vcwb.build(); // writer = new AsyncVariantContextWriter(writer, 3000); progress.moveToNextStep(); // done with dictionary DBCursor headerCursor = mongoTemplate .getCollection(MongoTemplateManager.getMongoCollectionName(DBVCFHeader.class)) .find(new BasicDBObject("_id." 
+ VcfHeaderId.FIELDNAME_PROJECT, projectId)); Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>(); boolean fWriteCommandLine = true, fWriteEngineHeaders = true; // default values while (headerCursor.hasNext()) { DBVCFHeader dbVcfHeader = DBVCFHeader.fromDBObject(headerCursor.next()); headerLines.addAll(dbVcfHeader.getHeaderLines()); // Add sequence header lines (not stored in our vcf header collection) BasicDBObject projection = new BasicDBObject(SequenceStats.FIELDNAME_SEQUENCE_LENGTH, true); int nSequenceIndex = 0; for (String sequenceName : distinctSequenceNames) { String sequenceInfoCollName = MongoTemplateManager.getMongoCollectionName(SequenceStats.class); boolean fCollectionExists = mongoTemplate.collectionExists(sequenceInfoCollName); if (fCollectionExists) { DBObject record = mongoTemplate.getCollection(sequenceInfoCollName).findOne( new Query(Criteria.where("_id").is(sequenceName)).getQueryObject(), projection); if (record == null) { LOG.warn("Sequence '" + sequenceName + "' not found in collection " + sequenceInfoCollName); continue; } Map<String, String> sequenceLineData = new LinkedHashMap<String, String>(); sequenceLineData.put("ID", (String) record.get("_id")); sequenceLineData.put("length", ((Number) record.get(SequenceStats.FIELDNAME_SEQUENCE_LENGTH)).toString()); headerLines.add(new VCFContigHeaderLine(sequenceLineData, nSequenceIndex++)); } } fWriteCommandLine = headerCursor.size() == 1 && dbVcfHeader.getWriteCommandLine(); // wouldn't make sense to include command lines for several runs if (!dbVcfHeader.getWriteEngineHeaders()) fWriteEngineHeaders = false; } headerCursor.close(); VCFHeader header = new VCFHeader(headerLines, individualList); header.setWriteCommandLine(fWriteCommandLine); header.setWriteEngineHeaders(fWriteEngineHeaders); writer.writeHeader(header); short nProgress = 0, nPreviousProgress = 0; long nLoadedMarkerCount = 0; HashMap<SampleId, Comparable /*phID*/> phasingIDsBySample = new HashMap<SampleId, Comparable>(); 
while (markerCursor.hasNext()) { if (progress.hasAborted()) return; int nLoadedMarkerCountInLoop = 0; boolean fStartingNewChunk = true; markerCursor.batchSize(nQueryChunkSize); List<Comparable> currentMarkers = new ArrayList<Comparable>(); while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nQueryChunkSize != 0)) { DBObject exportVariant = markerCursor.next(); currentMarkers.add((Comparable) exportVariant.get("_id")); nLoadedMarkerCountInLoop++; fStartingNewChunk = false; } LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes( mongoTemplate, sampleIDs, currentMarkers, true, null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes for (VariantData variant : variantsAndRuns.keySet()) { VariantContext vc = variant.toVariantContext(variantsAndRuns.get(variant), !ObjectId.isValid(variant.getId().toString()), sampleIDToIndividualIdMap, phasingIDsBySample, nMinimumGenotypeQuality, nMinimumReadDepth, warningFileWriter, markerSynonyms == null ? 
variant.getId() : markerSynonyms.get(variant.getId())); try { writer.add(vc); } catch (Throwable t) { Exception e = new Exception("Unable to convert to VariantContext: " + variant.getId(), t); LOG.debug("error", e); throw e; } if (nLoadedMarkerCountInLoop > currentMarkers.size()) LOG.error("Bug: writing variant number " + nLoadedMarkerCountInLoop + " (only " + currentMarkers.size() + " variants expected)"); } nLoadedMarkerCount += nLoadedMarkerCountInLoop; nProgress = (short) (nLoadedMarkerCount * 100 / markerCount); if (nProgress > nPreviousProgress) { progress.setCurrentStepProgress(nProgress); nPreviousProgress = nProgress; } } progress.setCurrentStepProgress((short) 100); } catch (Exception e) { LOG.error("Error exporting", e); progress.setError(e.getMessage()); return; } finally { warningFileWriter.close(); if (warningFile.length() > 0) { zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt")); int nWarningCount = 0; BufferedReader in = new BufferedReader(new FileReader(warningFile)); String sLine; while ((sLine = in.readLine()) != null) { zos.write((sLine + "\n").getBytes()); nWarningCount++; } LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount); in.close(); } warningFile.delete(); if (writer != null) try { writer.close(); } catch (Throwable ignored) { } } }