Example usage for com.mongodb DBCursor count

List of usage examples for com.mongodb DBCursor count

Introduction

In this page you can find the example usage for com.mongodb DBCursor count.

Prototype

public int count() 

Source Link

Document

Counts the number of objects matching the query.

Usage

From source file:fr.cirad.mgdb.exporting.individualoriented.DARwinExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles,
        boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor,
        Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
    GenotypingProject aProject = mongoTemplate.findOne(
            new Query(Criteria.where(GenotypingProject.FIELDNAME_PLOIDY_LEVEL).exists(true)),
            GenotypingProject.class);
    if (aProject == null)
        LOG.warn("Unable to find a project containing ploidy level information! Assuming ploidy level is 2.");

    int ploidy = aProject == null ? 2 : aProject.getPloidyLevel();

    File warningFile = File.createTempFile("export_warnings_", "");
    FileWriter warningFileWriter = new FileWriter(warningFile);

    int markerCount = markerCursor.count();

    ZipOutputStream zos = new ZipOutputStream(outputStream);

    if (readyToExportFiles != null)
        for (String readyToExportFile : readyToExportFiles.keySet()) {
            zos.putNextEntry(new ZipEntry(readyToExportFile));
            InputStream inputStream = readyToExportFiles.get(readyToExportFile);
            byte[] dataBlock = new byte[1024];
            int count = inputStream.read(dataBlock, 0, 1024);
            while (count != -1) {
                zos.write(dataBlock, 0, count);
                count = inputStream.read(dataBlock, 0, 1024);
            }//w w w .j a v a  2  s. c  o m
        }

    String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size()
            + "individuals";

    StringBuffer donFileContents = new StringBuffer(
            "@DARwin 5.0 - DON -" + LINE_SEPARATOR + individualExportFiles.size() + "\t" + 1 + LINE_SEPARATOR
                    + "N" + "\t" + "individual" + LINE_SEPARATOR);

    int count = 0;
    String missingGenotype = "";
    for (int j = 0; j < ploidy; j++)
        missingGenotype += "\tN";

    zos.putNextEntry(new ZipEntry(exportName + ".var"));
    zos.write(("@DARwin 5.0 - ALLELIC - " + ploidy + LINE_SEPARATOR + individualExportFiles.size() + "\t"
            + markerCount * ploidy + LINE_SEPARATOR + "N").getBytes());

    DBCursor markerCursorCopy = markerCursor.copy(); // dunno how expensive this is, but seems safer than keeping all IDs in memory at any time

    short nProgress = 0, nPreviousProgress = 0;
    int avgObjSize = (Integer) mongoTemplate
            .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
    int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;
    markerCursorCopy.batchSize(nChunkSize);

    int nMarkerIndex = 0;
    while (markerCursorCopy.hasNext()) {
        DBObject exportVariant = markerCursorCopy.next();
        Comparable markerId = (Comparable) exportVariant.get("_id");

        if (markerSynonyms != null) {
            Comparable syn = markerSynonyms.get(markerId);
            if (syn != null)
                markerId = syn;
        }
        for (int j = 0; j < ploidy; j++)
            zos.write(("\t" + markerId).getBytes());
    }

    TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>();
    ArrayList<String> distinctAlleles = new ArrayList<String>(); // the index of each allele will be used as its code
    int i = 0;
    for (File f : individualExportFiles) {
        BufferedReader in = new BufferedReader(new FileReader(f));
        try {
            String individualId, line = in.readLine(); // read sample id

            if (line != null)
                individualId = line;
            else
                throw new Exception("Unable to read first line of temp export file " + f.getName());

            donFileContents.append(++count + "\t" + individualId + LINE_SEPARATOR);

            zos.write((LINE_SEPARATOR + count).getBytes());
            nMarkerIndex = 0;

            while ((line = in.readLine()) != null) {
                List<String> genotypes = MgdbDao.split(line, "|");
                HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes
                int highestGenotypeCount = 0;
                String mostFrequentGenotype = null;
                for (String genotype : genotypes) {
                    if (genotype.length() == 0)
                        continue; /* skip missing genotypes */

                    int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                    if (gtCount > highestGenotypeCount) {
                        highestGenotypeCount = gtCount;
                        mostFrequentGenotype = genotype;
                    }
                    genotypeCounts.put(genotype, gtCount);
                }

                if (genotypeCounts.size() > 1) {
                    warningFileWriter.write("- Dissimilar genotypes found for variant __" + nMarkerIndex
                            + "__, individual " + individualId + ". Exporting most frequent: "
                            + mostFrequentGenotype + "\n");
                    problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                }

                String codedGenotype = "";
                if (mostFrequentGenotype != null)
                    for (String allele : mostFrequentGenotype.split(" ")) {
                        if (!distinctAlleles.contains(allele))
                            distinctAlleles.add(allele);
                        codedGenotype += "\t" + distinctAlleles.indexOf(allele);
                    }
                else
                    codedGenotype = missingGenotype.replaceAll("N", "-1"); // missing data is coded as -1
                zos.write(codedGenotype.getBytes());

                nMarkerIndex++;
            }
        } catch (Exception e) {
            LOG.error("Error exporting data", e);
            progress.setError("Error exporting data: " + e.getClass().getSimpleName()
                    + (e.getMessage() != null ? " - " + e.getMessage() : ""));
            return;
        } finally {
            in.close();
        }

        if (progress.hasAborted())
            return;

        nProgress = (short) (++i * 100 / individualExportFiles.size());
        if (nProgress > nPreviousProgress) {
            //            LOG.debug("============= doDARwinExport (" + i + "): " + nProgress + "% =============");
            progress.setCurrentStepProgress(nProgress);
            nPreviousProgress = nProgress;
        }

        if (!f.delete()) {
            f.deleteOnExit();
            LOG.info("Unable to delete tmp export file " + f.getAbsolutePath());
        }
    }

    zos.putNextEntry(new ZipEntry(exportName + ".don"));
    zos.write(donFileContents.toString().getBytes());

    // now read variant names for those that induced warnings
    nMarkerIndex = 0;
    markerCursor.batchSize(nChunkSize);
    while (markerCursor.hasNext()) {
        DBObject exportVariant = markerCursor.next();
        if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) {
            Comparable markerId = (Comparable) exportVariant.get("_id");

            if (markerSynonyms != null) {
                Comparable syn = markerSynonyms.get(markerId);
                if (syn != null)
                    markerId = syn;
            }
            for (int j = 0; j < ploidy; j++)
                zos.write(("\t" + markerId).getBytes());

            problematicMarkerIndexToNameMap.put(nMarkerIndex, markerId);
        }
    }

    warningFileWriter.close();
    if (warningFile.length() > 0) {
        zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
        int nWarningCount = 0;
        BufferedReader in = new BufferedReader(new FileReader(warningFile));
        String sLine;
        while ((sLine = in.readLine()) != null) {
            for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet())
                sLine = sLine.replaceAll("__" + aMarkerIndex + "__",
                        problematicMarkerIndexToNameMap.get(aMarkerIndex).toString());
            zos.write((sLine + "\n").getBytes());
            in.readLine();
            nWarningCount++;
        }
        LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
        in.close();
    }
    warningFile.delete();

    zos.close();
    progress.setCurrentStepProgress((short) 100);
}

From source file:fr.cirad.mgdb.exporting.individualoriented.PLinkExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, Collection<File> individualExportFiles,
        boolean fDeleteSampleExportFilesOnExit, ProgressIndicator progress, DBCursor markerCursor,
        Map<Comparable, Comparable> markerSynonyms, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    File warningFile = File.createTempFile("export_warnings_", "");
    FileWriter warningFileWriter = new FileWriter(warningFile);

    ZipOutputStream zos = new ZipOutputStream(outputStream);

    if (readyToExportFiles != null)
        for (String readyToExportFile : readyToExportFiles.keySet()) {
            zos.putNextEntry(new ZipEntry(readyToExportFile));
            InputStream inputStream = readyToExportFiles.get(readyToExportFile);
            byte[] dataBlock = new byte[1024];
            int count = inputStream.read(dataBlock, 0, 1024);
            while (count != -1) {
                zos.write(dataBlock, 0, count);
                count = inputStream.read(dataBlock, 0, 1024);
            }/*from ww  w.  j ava 2s  .com*/
        }

    MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
    int markerCount = markerCursor.count();

    String exportName = sModule + "_" + markerCount + "variants_" + individualExportFiles.size()
            + "individuals";
    zos.putNextEntry(new ZipEntry(exportName + ".ped"));

    TreeMap<Integer, Comparable> problematicMarkerIndexToNameMap = new TreeMap<Integer, Comparable>();
    short nProgress = 0, nPreviousProgress = 0;
    int i = 0;
    for (File f : individualExportFiles) {
        BufferedReader in = new BufferedReader(new FileReader(f));
        try {
            String individualId, line = in.readLine(); // read sample id
            if (line != null) {
                individualId = line;
                String population = getIndividualPopulation(sModule, line);
                String individualInfo = (population == null ? "." : population) + " " + individualId;
                zos.write((individualInfo + " 0 0 0 " + getIndividualGenderCode(sModule, individualId))
                        .getBytes());
            } else
                throw new Exception("Unable to read first line of temp export file " + f.getName());

            int nMarkerIndex = 0;
            while ((line = in.readLine()) != null) {
                List<String> genotypes = MgdbDao.split(line, "|");
                HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes
                int highestGenotypeCount = 0;
                String mostFrequentGenotype = null;
                for (String genotype : genotypes) {
                    if (genotype.length() == 0)
                        continue; /* skip missing genotypes */

                    int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                    if (gtCount > highestGenotypeCount) {
                        highestGenotypeCount = gtCount;
                        mostFrequentGenotype = genotype;
                    }
                    genotypeCounts.put(genotype, gtCount);
                }

                if (genotypeCounts.size() > 1) {
                    warningFileWriter.write("- Dissimilar genotypes found for variant " + nMarkerIndex
                            + ", individual " + individualId + ". Exporting most frequent: "
                            + mostFrequentGenotype + "\n");
                    problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                }

                String[] alleles = mostFrequentGenotype == null ? new String[0]
                        : mostFrequentGenotype.split(" ");
                if (alleles.length > 2) {
                    warningFileWriter.write("- More than 2 alleles found for variant " + nMarkerIndex
                            + ", individual " + individualId + ". Exporting only the first 2 alleles.\n");
                    problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                }

                String all1 = alleles.length == 0 ? "0" : alleles[0];
                String all2 = alleles.length == 0 ? "0" : alleles[alleles.length == 1 ? 0 : 1];
                if (all1.length() != 1 || all2.length() != 1) {
                    warningFileWriter
                            .write("- SNP expected, but alleles are not coded on a single char for variant "
                                    + nMarkerIndex + ", individual " + individualId
                                    + ". Ignoring this genotype.\n");
                    problematicMarkerIndexToNameMap.put(nMarkerIndex, "");
                } else
                    zos.write((" " + all1 + " " + all2).getBytes());

                nMarkerIndex++;
            }
        } catch (Exception e) {
            LOG.error("Error exporting data", e);
            progress.setError("Error exporting data: " + e.getClass().getSimpleName()
                    + (e.getMessage() != null ? " - " + e.getMessage() : ""));
            return;
        } finally {
            in.close();
        }

        if (progress.hasAborted())
            return;

        nProgress = (short) (++i * 100 / individualExportFiles.size());
        if (nProgress > nPreviousProgress) {
            progress.setCurrentStepProgress(nProgress);
            nPreviousProgress = nProgress;
        }
        zos.write('\n');

        if (!f.delete()) {
            f.deleteOnExit();
            LOG.info("Unable to delete tmp export file " + f.getAbsolutePath());
        }
    }
    warningFileWriter.close();

    zos.putNextEntry(new ZipEntry(exportName + ".map"));

    int avgObjSize = (Integer) mongoTemplate
            .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
    int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;

    markerCursor.batchSize(nChunkSize);
    int nMarkerIndex = 0;
    while (markerCursor.hasNext()) {
        DBObject exportVariant = markerCursor.next();
        DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
        Comparable markerId = (Comparable) exportVariant.get("_id");
        String chrom = (String) refPos.get(ReferencePosition.FIELDNAME_SEQUENCE);
        Long pos = ((Number) refPos.get(ReferencePosition.FIELDNAME_START_SITE)).longValue();

        if (chrom == null)
            LOG.warn("Chromosomal position not found for marker " + markerId);
        Comparable exportedId = markerSynonyms == null ? markerId : markerSynonyms.get(markerId);
        zos.write(((chrom == null ? "0" : chrom) + " " + exportedId + " " + 0 + " " + (pos == null ? 0 : pos)
                + LINE_SEPARATOR).getBytes());

        if (problematicMarkerIndexToNameMap.containsKey(nMarkerIndex)) { // we are going to need this marker's name for the warning file
            Comparable variantName = markerId;
            if (markerSynonyms != null) {
                Comparable syn = markerSynonyms.get(markerId);
                if (syn != null)
                    variantName = syn;
            }
            problematicMarkerIndexToNameMap.put(nMarkerIndex, variantName);
        }
        nMarkerIndex++;
    }

    if (warningFile.length() > 0) {
        zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
        int nWarningCount = 0;
        BufferedReader in = new BufferedReader(new FileReader(warningFile));
        String sLine;
        while ((sLine = in.readLine()) != null) {
            for (Integer aMarkerIndex : problematicMarkerIndexToNameMap.keySet())
                sLine = sLine.replaceAll("__" + aMarkerIndex + "__",
                        problematicMarkerIndexToNameMap.get(aMarkerIndex).toString());
            zos.write((sLine + "\n").getBytes());
            in.readLine();
            nWarningCount++;
        }
        LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
        in.close();
    }
    warningFile.delete();

    zos.close();
    progress.setCurrentStepProgress((short) 100);
}

From source file:fr.cirad.mgdb.exporting.markeroriented.BEDExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs,
        ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms,
        int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
    ZipOutputStream zos = new ZipOutputStream(outputStream);

    if (readyToExportFiles != null)
        for (String readyToExportFile : readyToExportFiles.keySet()) {
            zos.putNextEntry(new ZipEntry(readyToExportFile));
            InputStream inputStream = readyToExportFiles.get(readyToExportFile);
            byte[] dataBlock = new byte[1024];
            int count = inputStream.read(dataBlock, 0, 1024);
            while (count != -1) {
                zos.write(dataBlock, 0, count);
                count = inputStream.read(dataBlock, 0, 1024);
            }//from  w w  w .j a  va 2 s .  c o m
        }

    int markerCount = markerCursor.count();

    List<String> selectedIndividualList = new ArrayList<String>();
    for (Individual ind : getIndividualsFromSamples(sModule, sampleIDs))
        selectedIndividualList.add(ind.getId());

    String exportName = sModule + "_" + markerCount + "variants_" + selectedIndividualList.size()
            + "individuals";
    zos.putNextEntry(new ZipEntry(exportName + ".bed"));

    short nProgress = 0, nPreviousProgress = 0;
    int nChunkSize = Math.min(2000, markerCount), nLoadedMarkerCount = 0;
    while (markerCursor.hasNext()) {
        int nLoadedMarkerCountInLoop = 0;
        Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>();
        boolean fStartingNewChunk = true;
        markerCursor.batchSize(nChunkSize);
        while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) {
            DBObject exportVariant = markerCursor.next();
            DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
            markerChromosomalPositions.put((Comparable) exportVariant.get("_id"),
                    refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":"
                            + refPos.get(ReferencePosition.FIELDNAME_START_SITE));
            nLoadedMarkerCountInLoop++;
            fStartingNewChunk = false;
        }

        for (Comparable variantId : markerChromosomalPositions.keySet()) // read data and write results into temporary files (one per sample)
        {
            String[] chromAndPos = markerChromosomalPositions.get(variantId).split(":");
            zos.write((chromAndPos[0] + "\t" + (Long.parseLong(chromAndPos[1]) - 1) + "\t"
                    + (Long.parseLong(chromAndPos[1]) - 1) + "\t" + variantId + "\t" + "0" + "\t" + "+")
                            .getBytes());
            zos.write((LINE_SEPARATOR).getBytes());
        }

        if (progress.hasAborted())
            return;

        nLoadedMarkerCount += nLoadedMarkerCountInLoop;
        nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
        if (nProgress > nPreviousProgress) {
            progress.setCurrentStepProgress(nProgress);
            nPreviousProgress = nProgress;
        }
    }

    zos.close();
    progress.setCurrentStepProgress((short) 100);
}

From source file:fr.cirad.mgdb.exporting.markeroriented.EigenstratExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs,
        ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms,
        int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    // long before = System.currentTimeMillis();

    File warningFile = File.createTempFile("export_warnings_", "");
    FileWriter warningFileWriter = new FileWriter(warningFile);
    File snpFile = null;//from  ww  w .  jav  a2 s  . c  om

    try {
        snpFile = File.createTempFile("snpFile", "");
        FileWriter snpFileWriter = new FileWriter(snpFile);

        ZipOutputStream zos = new ZipOutputStream(outputStream);
        if (ByteArrayOutputStream.class.isAssignableFrom(outputStream.getClass()))
            zos.setLevel(ZipOutputStream.STORED);

        if (readyToExportFiles != null)
            for (String readyToExportFile : readyToExportFiles.keySet()) {
                zos.putNextEntry(new ZipEntry(readyToExportFile));
                InputStream inputStream = readyToExportFiles.get(readyToExportFile);
                byte[] dataBlock = new byte[1024];
                int count = inputStream.read(dataBlock, 0, 1024);
                while (count != -1) {
                    zos.write(dataBlock, 0, count);
                    count = inputStream.read(dataBlock, 0, 1024);
                }
            }

        MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
        int markerCount = markerCursor.count();

        List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs);

        ArrayList<String> individualList = new ArrayList<String>();
        StringBuffer indFileContents = new StringBuffer();

        for (int i = 0; i < sampleIDs.size(); i++) {
            Individual individual = individuals.get(i);
            if (!individualList.contains(individual.getId())) {
                individualList.add(individual.getId());
                indFileContents
                        .append(individual.getId() + "\t" + getIndividualGenderCode(sModule, individual.getId())
                                + "\t" + (individual.getPopulation() == null ? "." : individual.getPopulation())
                                + LINE_SEPARATOR);
            }
        }

        String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals";
        zos.putNextEntry(new ZipEntry(exportName + ".ind"));
        zos.write(indFileContents.toString().getBytes());

        zos.putNextEntry(new ZipEntry(exportName + ".eigenstratgeno"));

        int avgObjSize = (Integer) mongoTemplate
                .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats()
                .get("avgObjSize");
        int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;
        short nProgress = 0, nPreviousProgress = 0;
        long nLoadedMarkerCount = 0;

        while (markerCursor.hasNext()) {
            int nLoadedMarkerCountInLoop = 0;
            Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>();
            boolean fStartingNewChunk = true;
            markerCursor.batchSize(nChunkSize);
            while (markerCursor.hasNext()
                    && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) {
                DBObject exportVariant = markerCursor.next();
                DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
                markerChromosomalPositions.put((Comparable) exportVariant.get("_id"),
                        refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":"
                                + refPos.get(ReferencePosition.FIELDNAME_START_SITE));
                nLoadedMarkerCountInLoop++;
                fStartingNewChunk = false;
            }

            List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet());
            LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes(
                    mongoTemplate, sampleIDs, currentMarkers, true,
                    null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes
            for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample)
            {
                Comparable variantId = variant.getId();

                List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":");
                if (chromAndPos.size() == 0)
                    LOG.warn("Chromosomal position not found for marker " + variantId);
                // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR);
                if (markerSynonyms != null) {
                    Comparable syn = markerSynonyms.get(variantId);
                    if (syn != null)
                        variantId = syn;
                }
                snpFileWriter.write(variantId + "\t" + (chromAndPos.size() == 0 ? "0" : chromAndPos.get(0))
                        + "\t" + 0 + "\t" + (chromAndPos.size() == 0 ? 0l : Long.parseLong(chromAndPos.get(1)))
                        + LINE_SEPARATOR);

                Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>();
                Collection<VariantRunData> runs = variantsAndRuns.get(variant);
                if (runs != null)
                    for (VariantRunData run : runs)
                        for (Integer sampleIndex : run.getSampleGenotypes().keySet()) {
                            SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex);
                            String individualId = individuals
                                    .get(sampleIDs
                                            .indexOf(new SampleId(run.getId().getProjectId(), sampleIndex)))
                                    .getId();

                            Integer gq = null;
                            try {
                                gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ);
                            } catch (Exception ignored) {
                            }
                            if (gq != null && gq < nMinimumGenotypeQuality)
                                continue;

                            Integer dp = null;
                            try {
                                dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP);
                            } catch (Exception ignored) {
                            }
                            if (dp != null && dp < nMinimumReadDepth)
                                continue;

                            String gtCode = sampleGenotype.getCode();
                            List<String> storedIndividualGenotypes = individualGenotypes.get(individualId);
                            if (storedIndividualGenotypes == null) {
                                storedIndividualGenotypes = new ArrayList<String>();
                                individualGenotypes.put(individualId, storedIndividualGenotypes);
                            }
                            storedIndividualGenotypes.add(gtCode);
                        }

                for (int j = 0; j < individualList
                        .size(); j++ /* we use this list because it has the proper ordering*/) {
                    String individualId = individualList.get(j);
                    List<String> genotypes = individualGenotypes.get(individualId);
                    HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes
                    int highestGenotypeCount = 0;
                    String mostFrequentGenotype = null;
                    if (genotypes != null)
                        for (String genotype : genotypes) {
                            if (genotype.length() == 0)
                                continue; /* skip missing genotypes */

                            int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                            if (gtCount > highestGenotypeCount) {
                                highestGenotypeCount = gtCount;
                                mostFrequentGenotype = genotype;
                            }
                            genotypeCounts.put(genotype, gtCount);
                        }

                    List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>()
                            : variant.getAllelesFromGenotypeCode(mostFrequentGenotype);

                    int nOutputCode = 0;
                    if (mostFrequentGenotype == null)
                        nOutputCode = 9;
                    else
                        for (String all : Helper.split(mostFrequentGenotype, "/"))
                            if ("0".equals(all))
                                nOutputCode++;
                    if (j == 0 && variant.getKnownAlleleList().size() > 2)
                        warningFileWriter.write("- Variant " + variant.getId()
                                + " is multi-allelic. Make sure Eigenstrat genotype encoding specifications are suitable for you.\n");
                    zos.write(("" + nOutputCode).getBytes());

                    if (genotypeCounts.size() > 1 || alleles.size() > 2) {
                        if (genotypeCounts.size() > 1)
                            warningFileWriter.write("- Dissimilar genotypes found for variant "
                                    + (variantId == null ? variant.getId() : variantId) + ", individual "
                                    + individualId + ". Exporting most frequent: " + nOutputCode + "\n");
                        if (alleles.size() > 2)
                            warningFileWriter.write("- More than 2 alleles found for variant "
                                    + (variantId == null ? variant.getId() : variantId) + ", individual "
                                    + individualId + ". Exporting only the first 2 alleles.\n");
                    }
                }
                zos.write((LINE_SEPARATOR).getBytes());
            }

            if (progress.hasAborted())
                return;

            nLoadedMarkerCount += nLoadedMarkerCountInLoop;
            nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
            if (nProgress > nPreviousProgress) {
                // if (nProgress%5 == 0)
                //    LOG.info("============= exportData: " + nProgress + "% =============" + (System.currentTimeMillis() - before)/1000 + "s");
                progress.setCurrentStepProgress(nProgress);
                nPreviousProgress = nProgress;
            }
        }

        snpFileWriter.close();
        zos.putNextEntry(new ZipEntry(exportName + ".snp"));
        BufferedReader in = new BufferedReader(new FileReader(snpFile));
        String sLine;
        while ((sLine = in.readLine()) != null)
            zos.write((sLine + "\n").getBytes());
        in.close();

        warningFileWriter.close();
        if (warningFile.length() > 0) {
            zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
            int nWarningCount = 0;
            in = new BufferedReader(new FileReader(warningFile));
            while ((sLine = in.readLine()) != null) {
                zos.write((sLine + "\n").getBytes());
                nWarningCount++;
            }
            LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
            in.close();
        }
        warningFile.delete();

        zos.close();
        progress.setCurrentStepProgress((short) 100);
    } finally {
        if (snpFile != null && snpFile.exists())
            snpFile.delete();
    }
}

From source file:fr.cirad.mgdb.exporting.markeroriented.GFFExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs,
        ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms,
        int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
    ZipOutputStream zos = new ZipOutputStream(outputStream);

    if (readyToExportFiles != null)
        for (String readyToExportFile : readyToExportFiles.keySet()) {
            zos.putNextEntry(new ZipEntry(readyToExportFile));
            InputStream inputStream = readyToExportFiles.get(readyToExportFile);
            byte[] dataBlock = new byte[1024];
            int count = inputStream.read(dataBlock, 0, 1024);
            while (count != -1) {
                zos.write(dataBlock, 0, count);
                count = inputStream.read(dataBlock, 0, 1024);
            }/* w w w. java  2s. c  o m*/
        }

    File warningFile = File.createTempFile("export_warnings_", "");
    FileWriter warningFileWriter = new FileWriter(warningFile);

    int markerCount = markerCursor.count();

    List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs);
    ArrayList<String> individualList = new ArrayList<String>();
    for (int i = 0; i < sampleIDs.size(); i++) {
        Individual individual = individuals.get(i);
        if (!individualList.contains(individual.getId())) {
            individualList.add(individual.getId());
        }
    }

    String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals";
    zos.putNextEntry(new ZipEntry(exportName + ".gff3"));
    String header = "##gff-version 3" + LINE_SEPARATOR;
    zos.write(header.getBytes());

    TreeMap<String, String> typeToOntology = new TreeMap<String, String>();
    typeToOntology.put(Type.SNP.toString(), "SO:0000694");
    typeToOntology.put(Type.INDEL.toString(), "SO:1000032");
    typeToOntology.put(Type.MIXED.toString(), "SO:0001059");
    typeToOntology.put(Type.SYMBOLIC.toString(), "SO:0000109");
    typeToOntology.put(Type.MNP.toString(), "SO:0001059");

    int avgObjSize = (Integer) mongoTemplate
            .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
    int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;
    short nProgress = 0, nPreviousProgress = 0;
    long nLoadedMarkerCount = 0;

    while (markerCursor.hasNext()) {
        int nLoadedMarkerCountInLoop = 0;
        Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>();
        boolean fStartingNewChunk = true;
        markerCursor.batchSize(nChunkSize);
        while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) {
            DBObject exportVariant = markerCursor.next();
            DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
            markerChromosomalPositions.put((Comparable) exportVariant.get("_id"),
                    refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":"
                            + refPos.get(ReferencePosition.FIELDNAME_START_SITE));
            nLoadedMarkerCountInLoop++;
            fStartingNewChunk = false;
        }

        List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet());
        LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes(
                mongoTemplate, sampleIDs, currentMarkers, true,
                null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes
        for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample)
        {
            Comparable variantId = variant.getId();
            List<String> variantDataOrigin = new ArrayList<String>();

            Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>();
            Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>();
            Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>();
            List<String> chromAndPos = Helper.split(markerChromosomalPositions.get(variantId), ":");
            if (chromAndPos.size() == 0)
                LOG.warn("Chromosomal position not found for marker " + variantId);
            // LOG.debug(marker + "\t" + (chromAndPos.length == 0 ? "0" : chromAndPos[0]) + "\t" + 0 + "\t" + (chromAndPos.length == 0 ? 0l : Long.parseLong(chromAndPos[1])) + LINE_SEPARATOR);
            if (markerSynonyms != null) {
                Comparable syn = markerSynonyms.get(variantId);
                if (syn != null)
                    variantId = syn;
            }

            Collection<VariantRunData> runs = variantsAndRuns.get(variant);
            if (runs != null)
                for (VariantRunData run : runs)
                    for (Integer sampleIndex : run.getSampleGenotypes().keySet()) {
                        SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex);
                        String individualId = individuals
                                .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex)))
                                .getId();

                        Integer gq = null;
                        try {
                            gq = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ);
                        } catch (Exception ignored) {
                        }
                        if (gq != null && gq < nMinimumGenotypeQuality)
                            continue;

                        Integer dp = null;
                        try {
                            dp = (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP);
                        } catch (Exception ignored) {
                        }
                        if (dp != null && dp < nMinimumReadDepth)
                            continue;

                        String gtCode = sampleGenotype.getCode();
                        List<String> storedIndividualGenotypes = individualGenotypes.get(individualId);
                        if (storedIndividualGenotypes == null) {
                            storedIndividualGenotypes = new ArrayList<String>();
                            individualGenotypes.put(individualId, storedIndividualGenotypes);
                        }
                        storedIndividualGenotypes.add(gtCode);
                    }

            zos.write((chromAndPos.get(0) + "\t" + StringUtils.join(variantDataOrigin, ";") /*source*/ + "\t"
                    + typeToOntology.get(variant.getType()) + "\t" + Long.parseLong(chromAndPos.get(1)) + "\t"
                    + Long.parseLong(chromAndPos.get(1)) + "\t" + "." + "\t" + "+" + "\t" + "." + "\t")
                            .getBytes());
            Comparable syn = markerSynonyms == null ? null : markerSynonyms.get(variant.getId());
            zos.write(("ID=" + variant.getId() + ";" + (syn != null ? "Name=" + syn + ";" : "") + "alleles="
                    + StringUtils.join(variant.getKnownAlleleList(), "/") + ";" + "refallele="
                    + variant.getKnownAlleleList().get(0) + ";").getBytes());

            for (int j = 0; j < individualList
                    .size(); j++ /* we use this list because it has the proper ordering*/) {

                NumberFormat nf = NumberFormat.getInstance(Locale.US);
                nf.setMaximumFractionDigits(4);
                HashMap<String, Integer> compt1 = new HashMap<String, Integer>();
                int highestGenotypeCount = 0;
                int sum = 0;

                String individualId = individualList.get(j);
                List<String> genotypes = individualGenotypes.get(individualId);
                HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes

                String mostFrequentGenotype = null;
                if (genotypes != null)
                    for (String genotype : genotypes) {
                        if (genotype.length() == 0)
                            continue; /* skip missing genotypes */

                        int count = 0;
                        for (String t : variant.getAllelesFromGenotypeCode(genotype)) {
                            for (String t1 : variant.getKnownAlleleList()) {
                                if (t.equals(t1) && !(compt1.containsKey(t1))) {
                                    count++;
                                    compt1.put(t1, count);
                                } else if (t.equals(t1) && compt1.containsKey(t1)) {
                                    if (compt1.get(t1) != 0) {
                                        count++;
                                        compt1.put(t1, count);
                                    } else
                                        compt1.put(t1, count);
                                } else if (!(compt1.containsKey(t1))) {
                                    compt1.put(t1, 0);
                                }
                            }
                        }
                        for (int countValue : compt1.values()) {
                            sum += countValue;
                        }

                        int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                        if (gtCount > highestGenotypeCount) {
                            highestGenotypeCount = gtCount;
                            mostFrequentGenotype = genotype;
                        }
                        genotypeCounts.put(genotype, gtCount);
                    }

                List<String> alleles = mostFrequentGenotype == null ? new ArrayList<String>()
                        : variant.getAllelesFromGenotypeCode(mostFrequentGenotype);

                if (alleles.size() != 0) {
                    zos.write(("acounts=" + individualId + ":").getBytes());

                    for (String knowAllelesCompt : compt1.keySet()) {
                        zos.write(
                                (knowAllelesCompt + " " + nf.format(compt1.get(knowAllelesCompt) / (float) sum)
                                        + " " + compt1.get(knowAllelesCompt) + " ").getBytes());
                    }
                    zos.write((alleles.size() + ";").getBytes());
                }
                if (genotypeCounts.size() > 1) {
                    Comparable sVariantId = markerSynonyms != null ? markerSynonyms.get(variant.getId())
                            : variant.getId();
                    warningFileWriter.write("- Dissimilar genotypes found for variant "
                            + (sVariantId == null ? variant.getId() : sVariantId) + ", individual "
                            + individualId + ". Exporting most frequent: " + StringUtils.join(alleles, ",")
                            + "\n");
                }
            }
            zos.write((LINE_SEPARATOR).getBytes());
        }

        if (progress.hasAborted())
            return;

        nLoadedMarkerCount += nLoadedMarkerCountInLoop;
        nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
        if (nProgress > nPreviousProgress) {
            //            if (nProgress%5 == 0)
            //               LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s");
            progress.setCurrentStepProgress(nProgress);
            nPreviousProgress = nProgress;
        }
    }

    warningFileWriter.close();
    if (warningFile.length() > 0) {
        zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
        int nWarningCount = 0;
        BufferedReader in = new BufferedReader(new FileReader(warningFile));
        String sLine;
        while ((sLine = in.readLine()) != null) {
            zos.write((sLine + "\n").getBytes());
            in.readLine();
            nWarningCount++;
        }
        LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
        in.close();
    }
    warningFile.delete();

    zos.close();
    progress.setCurrentStepProgress((short) 100);
}

From source file:fr.cirad.mgdb.exporting.markeroriented.HapMapExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs,
        ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms,
        int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
    File warningFile = File.createTempFile("export_warnings_", "");
    FileWriter warningFileWriter = new FileWriter(warningFile);

    int markerCount = markerCursor.count();

    ZipOutputStream zos = new ZipOutputStream(outputStream);

    if (readyToExportFiles != null)
        for (String readyToExportFile : readyToExportFiles.keySet()) {
            zos.putNextEntry(new ZipEntry(readyToExportFile));
            InputStream inputStream = readyToExportFiles.get(readyToExportFile);
            byte[] dataBlock = new byte[1024];
            int count = inputStream.read(dataBlock, 0, 1024);
            while (count != -1) {
                zos.write(dataBlock, 0, count);
                count = inputStream.read(dataBlock, 0, 1024);
            }//from  w  ww. j av a  2s .co  m
        }

    List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs);
    ArrayList<String> individualList = new ArrayList<String>();
    for (int i = 0; i < sampleIDs.size(); i++) {
        Individual individual = individuals.get(i);
        if (!individualList.contains(individual.getId())) {
            individualList.add(individual.getId());
        }
    }

    String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals";
    zos.putNextEntry(new ZipEntry(exportName + ".hapmap"));
    String header = "rs#" + "\t" + "alleles" + "\t" + "chrom" + "\t" + "pos" + "\t" + "strand" + "\t"
            + "assembly#" + "\t" + "center" + "\t" + "protLSID" + "\t" + "assayLSID" + "\t" + "panelLSID" + "\t"
            + "QCcode";
    zos.write(header.getBytes());
    for (int i = 0; i < individualList.size(); i++) {
        zos.write(("\t" + individualList.get(i)).getBytes());
    }
    zos.write((LINE_SEPARATOR).getBytes());

    int avgObjSize = (Integer) mongoTemplate
            .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
    int nChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;
    short nProgress = 0, nPreviousProgress = 0;
    long nLoadedMarkerCount = 0;

    while (markerCursor == null || markerCursor.hasNext()) {
        int nLoadedMarkerCountInLoop = 0;
        Map<Comparable, String> markerChromosomalPositions = new LinkedHashMap<Comparable, String>();
        boolean fStartingNewChunk = true;
        markerCursor.batchSize(nChunkSize);
        while (markerCursor.hasNext() && (fStartingNewChunk || nLoadedMarkerCountInLoop % nChunkSize != 0)) {
            DBObject exportVariant = markerCursor.next();
            DBObject refPos = (DBObject) exportVariant.get(VariantData.FIELDNAME_REFERENCE_POSITION);
            markerChromosomalPositions.put((Comparable) exportVariant.get("_id"),
                    refPos.get(ReferencePosition.FIELDNAME_SEQUENCE) + ":"
                            + refPos.get(ReferencePosition.FIELDNAME_START_SITE));
            nLoadedMarkerCountInLoop++;
            fStartingNewChunk = false;
        }

        List<Comparable> currentMarkers = new ArrayList<Comparable>(markerChromosomalPositions.keySet());
        LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes(
                mongoTemplate, sampleIDs, currentMarkers, true,
                null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes
        for (VariantData variant : variantsAndRuns.keySet()) // read data and write results into temporary files (one per sample)
        {
            Comparable variantId = variant.getId();
            if (markerSynonyms != null) {
                Comparable syn = markerSynonyms.get(variantId);
                if (syn != null)
                    variantId = syn;
            }

            boolean fIsSNP = variant.getType().equals(Type.SNP.toString());
            byte[] missingGenotype = ("\t" + "NN").getBytes();

            String[] chromAndPos = markerChromosomalPositions.get(variant.getId()).split(":");
            zos.write(((variantId == null ? variant.getId() : variantId) + "\t"
                    + StringUtils.join(variant.getKnownAlleleList(), "/") + "\t" + chromAndPos[0] + "\t"
                    + Long.parseLong(chromAndPos[1]) + "\t" + "+").getBytes());
            for (int j = 0; j < 6; j++)
                zos.write(("\t" + "NA").getBytes());

            Map<String, Integer> gqValueForSampleId = new LinkedHashMap<String, Integer>();
            Map<String, Integer> dpValueForSampleId = new LinkedHashMap<String, Integer>();
            Map<String, List<String>> individualGenotypes = new LinkedHashMap<String, List<String>>();
            Collection<VariantRunData> runs = variantsAndRuns.get(variant);
            if (runs != null)
                for (VariantRunData run : runs)
                    for (Integer sampleIndex : run.getSampleGenotypes().keySet()) {
                        SampleGenotype sampleGenotype = run.getSampleGenotypes().get(sampleIndex);
                        String gtCode = run.getSampleGenotypes().get(sampleIndex).getCode();
                        String individualId = individuals
                                .get(sampleIDs.indexOf(new SampleId(run.getId().getProjectId(), sampleIndex)))
                                .getId();
                        List<String> storedIndividualGenotypes = individualGenotypes.get(individualId);
                        if (storedIndividualGenotypes == null) {
                            storedIndividualGenotypes = new ArrayList<String>();
                            individualGenotypes.put(individualId, storedIndividualGenotypes);
                        }
                        storedIndividualGenotypes.add(gtCode);
                        gqValueForSampleId.put(individualId,
                                (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_GQ));
                        dpValueForSampleId.put(individualId,
                                (Integer) sampleGenotype.getAdditionalInfo().get(VariantData.GT_FIELD_DP));
                    }

            int writtenGenotypeCount = 0;
            for (String individualId : individualList /* we use this list because it has the proper ordering */) {
                int individualIndex = individualList.indexOf(individualId);
                while (writtenGenotypeCount < individualIndex - 1) {
                    zos.write(missingGenotype);
                    writtenGenotypeCount++;
                }

                List<String> genotypes = individualGenotypes.get(individualId);
                HashMap<Object, Integer> genotypeCounts = new HashMap<Object, Integer>(); // will help us to keep track of missing genotypes
                int highestGenotypeCount = 0;
                String mostFrequentGenotype = null;
                if (genotypes != null)
                    for (String genotype : genotypes) {
                        if (genotype.length() == 0)
                            continue; /* skip missing genotypes */

                        Integer gqValue = gqValueForSampleId.get(individualId);
                        if (gqValue != null && gqValue < nMinimumGenotypeQuality)
                            continue; /* skip this sample because its GQ is under the threshold */

                        Integer dpValue = dpValueForSampleId.get(individualId);
                        if (dpValue != null && dpValue < nMinimumReadDepth)
                            continue; /* skip this sample because its DP is under the threshold */

                        int gtCount = 1 + MgdbDao.getCountForKey(genotypeCounts, genotype);
                        if (gtCount > highestGenotypeCount) {
                            highestGenotypeCount = gtCount;
                            mostFrequentGenotype = genotype;
                        }
                        genotypeCounts.put(genotype, gtCount);
                    }

                byte[] exportedGT = mostFrequentGenotype == null ? missingGenotype
                        : ("\t" + StringUtils.join(variant.getAllelesFromGenotypeCode(mostFrequentGenotype),
                                fIsSNP ? "" : "/")).getBytes();
                zos.write(exportedGT);
                writtenGenotypeCount++;

                if (genotypeCounts.size() > 1)
                    warningFileWriter.write("- Dissimilar genotypes found for variant "
                            + (variantId == null ? variant.getId() : variantId) + ", individual " + individualId
                            + ". Exporting most frequent: " + new String(exportedGT) + "\n");
            }

            while (writtenGenotypeCount < individualList.size()) {
                zos.write(missingGenotype);
                writtenGenotypeCount++;
            }
            zos.write((LINE_SEPARATOR).getBytes());
        }

        if (progress.hasAborted())
            return;

        nLoadedMarkerCount += nLoadedMarkerCountInLoop;
        nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
        if (nProgress > nPreviousProgress) {
            //            if (nProgress%5 == 0)
            //               LOG.info("========================= exportData: " + nProgress + "% =========================" + (System.currentTimeMillis() - before)/1000 + "s");
            progress.setCurrentStepProgress(nProgress);
            nPreviousProgress = nProgress;
        }
    }

    warningFileWriter.close();
    if (warningFile.length() > 0) {
        zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
        int nWarningCount = 0;
        BufferedReader in = new BufferedReader(new FileReader(warningFile));
        String sLine;
        while ((sLine = in.readLine()) != null) {
            zos.write((sLine + "\n").getBytes());
            in.readLine();
            nWarningCount++;
        }
        LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
        in.close();
    }
    warningFile.delete();

    zos.close();
    progress.setCurrentStepProgress((short) 100);
}

From source file:fr.cirad.mgdb.exporting.markeroriented.VcfExportHandler.java

License:Open Source License

@Override
public void exportData(OutputStream outputStream, String sModule, List<SampleId> sampleIDs,
        ProgressIndicator progress, DBCursor markerCursor, Map<Comparable, Comparable> markerSynonyms,
        int nMinimumGenotypeQuality, int nMinimumReadDepth, Map<String, InputStream> readyToExportFiles)
        throws Exception {
    Integer projectId = null;/*w w  w . j  a  v a 2 s .c  o m*/
    for (SampleId spId : sampleIDs) {
        if (projectId == null)
            projectId = spId.getProject();
        else if (projectId != spId.getProject()) {
            projectId = 0;
            break; // more than one project are involved: no header will be written
        }
    }

    File warningFile = File.createTempFile("export_warnings_", "");
    FileWriter warningFileWriter = new FileWriter(warningFile);

    MongoTemplate mongoTemplate = MongoTemplateManager.get(sModule);
    int markerCount = markerCursor.count();
    ZipOutputStream zos = new ZipOutputStream(outputStream);

    if (readyToExportFiles != null)
        for (String readyToExportFile : readyToExportFiles.keySet()) {
            zos.putNextEntry(new ZipEntry(readyToExportFile));
            InputStream inputStream = readyToExportFiles.get(readyToExportFile);
            byte[] dataBlock = new byte[1024];
            int count = inputStream.read(dataBlock, 0, 1024);
            while (count != -1) {
                zos.write(dataBlock, 0, count);
                count = inputStream.read(dataBlock, 0, 1024);
            }
        }

    LinkedHashMap<SampleId, String> sampleIDToIndividualIdMap = new LinkedHashMap<SampleId, String>();
    ArrayList<String> individualList = new ArrayList<String>();
    List<Individual> individuals = getIndividualsFromSamples(sModule, sampleIDs);
    for (int i = 0; i < sampleIDs.size(); i++) {
        String individualId = individuals.get(i).getId();
        sampleIDToIndividualIdMap.put(sampleIDs.get(i), individualId);
        if (!individualList.contains(individualId)) {
            individualList.add(individualId);
        }
    }

    String exportName = sModule + "_" + markerCount + "variants_" + individualList.size() + "individuals";
    zos.putNextEntry(new ZipEntry(exportName + ".vcf"));

    int avgObjSize = (Integer) mongoTemplate
            .getCollection(mongoTemplate.getCollectionName(VariantRunData.class)).getStats().get("avgObjSize");
    int nQueryChunkSize = nMaxChunkSizeInMb * 1024 * 1024 / avgObjSize;

    VariantContextWriter writer = null;
    try {
        List<String> distinctSequenceNames = new ArrayList<String>();

        String sequenceSeqCollName = MongoTemplateManager.getMongoCollectionName(Sequence.class);
        if (mongoTemplate.collectionExists(sequenceSeqCollName)) {
            DBCursor markerCursorCopy = markerCursor.copy();
            markerCursorCopy.batchSize(nQueryChunkSize);
            while (markerCursorCopy.hasNext()) {
                int nLoadedMarkerCountInLoop = 0;
                boolean fStartingNewChunk = true;
                while (markerCursorCopy.hasNext()
                        && (fStartingNewChunk || nLoadedMarkerCountInLoop % nQueryChunkSize != 0)) {
                    DBObject exportVariant = markerCursorCopy.next();
                    String chr = (String) ((DBObject) exportVariant
                            .get(VariantData.FIELDNAME_REFERENCE_POSITION))
                                    .get(ReferencePosition.FIELDNAME_SEQUENCE);
                    if (!distinctSequenceNames.contains(chr))
                        distinctSequenceNames.add(chr);
                }
            }
            markerCursorCopy.close();
        }

        Collections.sort(distinctSequenceNames, new AlphaNumericStringComparator());
        SAMSequenceDictionary dict = createSAMSequenceDictionary(sModule, distinctSequenceNames);
        writer = new CustomVCFWriter(null, zos, dict, false, false, true);
        //         VariantContextWriterBuilder vcwb = new VariantContextWriterBuilder();
        //         vcwb.unsetOption(Options.INDEX_ON_THE_FLY);
        //         vcwb.unsetOption(Options.DO_NOT_WRITE_GENOTYPES);
        //         vcwb.setOption(Options.USE_ASYNC_IOINDEX_ON_THE_FLY);
        //         vcwb.setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
        //         vcwb.setReferenceDictionary(dict);
        //         writer = vcwb.build();
        //         writer = new AsyncVariantContextWriter(writer, 3000);

        progress.moveToNextStep(); // done with dictionary
        DBCursor headerCursor = mongoTemplate
                .getCollection(MongoTemplateManager.getMongoCollectionName(DBVCFHeader.class))
                .find(new BasicDBObject("_id." + VcfHeaderId.FIELDNAME_PROJECT, projectId));
        Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
        boolean fWriteCommandLine = true, fWriteEngineHeaders = true; // default values

        while (headerCursor.hasNext()) {
            DBVCFHeader dbVcfHeader = DBVCFHeader.fromDBObject(headerCursor.next());
            headerLines.addAll(dbVcfHeader.getHeaderLines());

            // Add sequence header lines (not stored in our vcf header collection)
            BasicDBObject projection = new BasicDBObject(SequenceStats.FIELDNAME_SEQUENCE_LENGTH, true);
            int nSequenceIndex = 0;
            for (String sequenceName : distinctSequenceNames) {
                String sequenceInfoCollName = MongoTemplateManager.getMongoCollectionName(SequenceStats.class);
                boolean fCollectionExists = mongoTemplate.collectionExists(sequenceInfoCollName);
                if (fCollectionExists) {
                    DBObject record = mongoTemplate.getCollection(sequenceInfoCollName).findOne(
                            new Query(Criteria.where("_id").is(sequenceName)).getQueryObject(), projection);
                    if (record == null) {
                        LOG.warn("Sequence '" + sequenceName + "' not found in collection "
                                + sequenceInfoCollName);
                        continue;
                    }

                    Map<String, String> sequenceLineData = new LinkedHashMap<String, String>();
                    sequenceLineData.put("ID", (String) record.get("_id"));
                    sequenceLineData.put("length",
                            ((Number) record.get(SequenceStats.FIELDNAME_SEQUENCE_LENGTH)).toString());
                    headerLines.add(new VCFContigHeaderLine(sequenceLineData, nSequenceIndex++));
                }
            }
            fWriteCommandLine = headerCursor.size() == 1 && dbVcfHeader.getWriteCommandLine(); // wouldn't make sense to include command lines for several runs
            if (!dbVcfHeader.getWriteEngineHeaders())
                fWriteEngineHeaders = false;
        }
        headerCursor.close();

        VCFHeader header = new VCFHeader(headerLines, individualList);
        header.setWriteCommandLine(fWriteCommandLine);
        header.setWriteEngineHeaders(fWriteEngineHeaders);
        writer.writeHeader(header);

        short nProgress = 0, nPreviousProgress = 0;
        long nLoadedMarkerCount = 0;
        HashMap<SampleId, Comparable /*phID*/> phasingIDsBySample = new HashMap<SampleId, Comparable>();

        while (markerCursor.hasNext()) {
            if (progress.hasAborted())
                return;

            int nLoadedMarkerCountInLoop = 0;
            boolean fStartingNewChunk = true;
            markerCursor.batchSize(nQueryChunkSize);
            List<Comparable> currentMarkers = new ArrayList<Comparable>();
            while (markerCursor.hasNext()
                    && (fStartingNewChunk || nLoadedMarkerCountInLoop % nQueryChunkSize != 0)) {
                DBObject exportVariant = markerCursor.next();
                currentMarkers.add((Comparable) exportVariant.get("_id"));
                nLoadedMarkerCountInLoop++;
                fStartingNewChunk = false;
            }

            LinkedHashMap<VariantData, Collection<VariantRunData>> variantsAndRuns = MgdbDao.getSampleGenotypes(
                    mongoTemplate, sampleIDs, currentMarkers, true,
                    null /*new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_SEQUENCE).and(new Sort(VariantData.FIELDNAME_REFERENCE_POSITION + "." + ChromosomalPosition.FIELDNAME_START_SITE))*/); // query mongo db for matching genotypes
            for (VariantData variant : variantsAndRuns.keySet()) {
                VariantContext vc = variant.toVariantContext(variantsAndRuns.get(variant),
                        !ObjectId.isValid(variant.getId().toString()), sampleIDToIndividualIdMap,
                        phasingIDsBySample, nMinimumGenotypeQuality, nMinimumReadDepth, warningFileWriter,
                        markerSynonyms == null ? variant.getId() : markerSynonyms.get(variant.getId()));
                try {
                    writer.add(vc);
                } catch (Throwable t) {
                    Exception e = new Exception("Unable to convert to VariantContext: " + variant.getId(), t);
                    LOG.debug("error", e);
                    throw e;
                }

                if (nLoadedMarkerCountInLoop > currentMarkers.size())
                    LOG.error("Bug: writing variant number " + nLoadedMarkerCountInLoop + " (only "
                            + currentMarkers.size() + " variants expected)");
            }

            nLoadedMarkerCount += nLoadedMarkerCountInLoop;
            nProgress = (short) (nLoadedMarkerCount * 100 / markerCount);
            if (nProgress > nPreviousProgress) {
                progress.setCurrentStepProgress(nProgress);
                nPreviousProgress = nProgress;
            }
        }
        progress.setCurrentStepProgress((short) 100);

    } catch (Exception e) {
        LOG.error("Error exporting", e);
        progress.setError(e.getMessage());
        return;
    } finally {
        warningFileWriter.close();
        if (warningFile.length() > 0) {
            zos.putNextEntry(new ZipEntry(exportName + "-REMARKS.txt"));
            int nWarningCount = 0;
            BufferedReader in = new BufferedReader(new FileReader(warningFile));
            String sLine;
            while ((sLine = in.readLine()) != null) {
                zos.write((sLine + "\n").getBytes());
                nWarningCount++;
            }
            LOG.info("Number of Warnings for export (" + exportName + "): " + nWarningCount);
            in.close();
        }
        warningFile.delete();
        if (writer != null)
            try {
                writer.close();
            } catch (Throwable ignored) {
            }
    }
}

From source file:fr.eolya.crawler.queue.mongodb.MongoDBSourceItemsQueue.java

License:Apache License

/** 
 * Read queue state/*from ww  w .  j  av  a  2 s .  c o m*/
 * 
 * starttime -> timestamp (log value) of the current crawl start
 * 
 * for an item of the collection :
 *       timestamp < starttime   => not in queue
 *       timestamp > starttime   => in queue
 *       timestamp = starttime   => done
 * 
 * @return last start time
 */
private Long readState() {

    // read start time
    BasicDBObject docsearch = new BasicDBObject();
    docsearch.put("_id", new ObjectId(stateId));
    DBCursor cur = coll.getColl().find(docsearch);
    if (cur.count() > 1)
        return null;
    if (cur.count() == 0)
        return null;

    BasicDBObject doc = (BasicDBObject) cur.next();
    startTime = doc.getLong("starttime");

    // read sizes
    if (startTime == 0 || rescan || startDepth > 0) {
        // TODO v4 : in fact startTime never = 0 !!!
        if (!rescan) {
            // previous crawl terminated fine

            if (startDepth > 0) {
                //String queryTimeStamp =   "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}";      
                //String queryMode = "{\"crawl_mode\":a}";         
                //String query = "{\"$and\": [" + queryTimeStamp + ", " + queryMode + "]}";
                String query = "{\"depth\":" + String.valueOf(startDepth) + "}";
                size = count(query);
            } else {
                size = 0;
            }
        } else {
            // get queue size : timestamp != starttime => in queue
            //String query = "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}";      

            String queryTimeStamp = "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime)
                    + "}}";
            String queryMode = "{\"crawl_mode\":\"a\"}";
            String query200 = "{\"crawl_status\":200}";
            String query = "{\"$and\": [" + queryTimeStamp + ", " + queryMode + ", " + query200 + "]}";

            size = count(query);
        }
        doneCount = 0;
    } else {
        // previous crawl was not terminated

        // get queue size : timestamp > starttime => in queue
        String query = "{\"" + timestampFieldName + "\": {\"$gt\": " + String.valueOf(startTime) + "}}";
        //docsearch = MongoDBHelper.JSON2BasicDBObject(query);
        //cur = coll.getColl().find(docsearch);
        //size = cur.size();
        size = count(query);

        // get done count : timestamp = starttime => done
        query = "{\"" + timestampFieldName + "\": " + String.valueOf(startTime) + "}";
        //docsearch = MongoDBHelper.JSON2BasicDBObject(query);
        //cur = coll.getColl().find(docsearch);
        //doneCount = cur.size();
        doneCount = count(query);
    }
    return startTime;
}

From source file:fr.eolya.crawler.queue.mongodb.MongoDBSourceItemsQueue.java

License:Apache License

public int getCurrentMaxDepth() {
    //String queryTimeStamp =   "{\"" + timestampFieldName + "\": {\"$ne\": " + String.valueOf(startTime) + "}}";      
    //String queryMode = "{\"crawl_mode\":a}";         
    //String query = "{\"$and\": [" + queryTimeStamp + ", " + queryMode + "]}";

    String query1 = "{\"crawl_mode\":\"a\"}";
    String query2 = "{\"crawl_status\":200}";
    String query = "{\"$and\": [" + query1 + ", " + query2 + "]}";

    BasicDBObject docsearch = new BasicDBObject();
    docsearch = MongoDBHelper.JSON2BasicDBObject(query);

    DBCursor cur = coll.getColl().find(docsearch).sort(new BasicDBObject("depth", -1));

    if (cur.count() == 0)
        return 0;
    BasicDBObject doc = (BasicDBObject) cur.next();
    return doc.getInt("depth");
}

From source file:fr.eolya.crawler.queue.mongodb.MongoDBSourceItemsQueue.java

License:Apache License

/** 
 * Push a new item/*w w  w  .j  a v a  2s .  co  m*/
 * 
 * @return success or not
 */
public boolean push(Map<String, Object> item) throws QueueIncoherenceException, QueueInvalidDataException {

    boolean ret = true;

    BasicDBObject doc = new BasicDBObject(item);

    String keyValue = doc.getString(uniqueKeyFieldName);
    String depth = doc.getString(depthFieldName);
    String sourceId = doc.getString(sourceIdFieldName);

    if (sourceId == null || keyValue == null || depth == null)
        throw new QueueInvalidDataException("Missing fields in json");
    if (Integer.parseInt(sourceId) != this.sourceId)
        throw new QueueInvalidDataException("Invalid source id in json");

    String referer = doc.getString(refererFieldName);

    // Get existing item in queue
    String currentDepth = null;
    String currentReferers = null;
    long currentTimestamp = 0;
    BasicDBObject docsearch = new BasicDBObject();
    docsearch.put(sourceIdFieldName, Integer.parseInt(sourceId));
    docsearch.put(hashFieldName, keyValue.hashCode());

    synchronized (collMonitor) {
        BasicDBObject curDoc = null;
        DBCursor cur = coll.getColl().find(docsearch);
        if (cur.count() > 0) {
            while (cur.hasNext() && curDoc == null) {
                curDoc = (BasicDBObject) cur.next();
                if (!keyValue.equals(doc.getString(uniqueKeyFieldName))) {
                    curDoc = null;
                }
            }
            if (curDoc != null) {
                currentDepth = curDoc.getString(depthFieldName);
                currentReferers = curDoc.getString(referersFieldName);
                currentTimestamp = curDoc.getLong(timestampFieldName);

                /*
                 * Remember : for an item of the collection :
                 *       timestamp < starttime   => not in queue
                 *       timestamp > starttime   => in queue
                 *       timestamp = starttime   => done
                 */
                if ((Long.parseLong(depth) >= Long.parseLong(currentDepth)) && (currentTimestamp >= startTime))
                    return false;
            }
        }

        // build new doc
        doc.put(hashFieldName, keyValue.hashCode());
        doc.put(timestampFieldName, new Date().getTime());

        if (referer != null) {
            if (currentReferers == null) {
                currentReferers = referer;
            } else {
                currentReferers += "/n" + referer;
            }
        }
        if (currentReferers != null) {
            doc.put(referersFieldName, currentReferers);
        }
        if (curDoc != null) {

            doc.put("content_type", curDoc.get("content_type"));
            doc.put("crawl_last_time", curDoc.get("crawl_last_time"));
            doc.put("condget_last_modified", curDoc.get("condget_last_modified"));
            doc.put("condget_etag", curDoc.get("condget_etag"));

            coll.update(curDoc, doc);
            // TODO : decrease done size in some case ???
        } else {
            doc.put(createdFieldName, new Date().getTime());
            coll.add(doc);
        }
        size++;
        return ret;
    }
}