List of usage examples for com.mongodb.client MongoCollection distinct
<TResult> DistinctIterable<TResult> distinct(String fieldName, Bson filter, Class<TResult> resultClass);
<TResult> DistinctIterable<TResult> distinct(ClientSession clientSession, String fieldName, Class<TResult> resultClass);
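All four examples below call the filter-taking overload, which returns a DistinctIterable<TResult> that is typically drained with into(...). A minimal, self-contained sketch for a 3.7+ driver (connection string, database, collection, and field names are illustrative assumptions, not taken from the examples):

    import java.util.ArrayList;
    import java.util.List;
    import org.bson.Document;
    import com.mongodb.client.MongoClient;
    import com.mongodb.client.MongoClients;
    import com.mongodb.client.MongoCollection;
    import com.mongodb.client.model.Filters;

    public class DistinctSketch {
        public static void main(String[] args) {
            // Assumed local server and names, for illustration only.
            try (MongoClient client = MongoClients.create("mongodb://localhost:27017")) {
                MongoCollection<Document> collection = client.getDatabase("test").getCollection("samples");
                // Distinct values of a (possibly dotted) field, restricted by a filter,
                // collected into a list via DistinctIterable.into(...).
                List<String> platforms = collection
                        .distinct("exp_group.id_platform", Filters.eq("organism", "Homo sapiens"), String.class)
                        .into(new ArrayList<String>());
                System.out.println(platforms);
            }
        }
    }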
From source file:module.ImportArrayExpress.java
License:Open Source License
public ImportArrayExpress() {
    // ===== Connection =====
    MongoClient mongoClient = MongoUtil.buildMongoClient();
    MongoDatabase db = mongoClient.getDatabase("epimed_experiments");
    MongoCollection<Document> collectionSeries = db.getCollection("series");
    MongoCollection<Document> collectionSamples = db.getCollection("sample");

    // ===== Pattern =====
    String patternText = "\\[[\\p{Print}\\p{Space}]+\\]";
    Pattern pattern = Pattern.compile(patternText);

    // ===== Series =====
    for (String accession : listAccessions) {
        List<String> accessionAsList = new ArrayList<String>();
        accessionAsList.add(accession);
        String urlString = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + accession + ".idf.txt";
        System.out.println(urlString);
        String text = webService.loadUrl(urlString);
        String[] parts = text.split(lineSeparator);
        List<String> dataSeries = new ArrayList<String>(Arrays.asList(parts));
        AESeries series = new AESeries(dataSeries);
        System.out.println(series);

        // ===== Check if already imported as a GSE =====
        boolean isGseFound = false;
        String gseNumber = null;
        for (String secondaryAccession : series.getListAccessions()) {
            if (secondaryAccession.startsWith("GSE")) {
                gseNumber = secondaryAccession;
                Document gse = db.getCollection("series").find(Filters.eq("_id", secondaryAccession)).first();
                isGseFound = gse != null;
            }
        }

        int nbImportedSamples = 0;
        if (!isGseFound) {
            // ===== Create Mongo series =====
            Document docSeries = mongoService.createSeries(accession, series.getTitle(), null,
                    series.getSubmissionDate(), series.getSubmissionDate());
            if (series.getListAccessions() != null && !series.getListAccessions().isEmpty()) {
                docSeries.put("secondary_accessions", series.getListAccessions());
            }
            if (commit) {
                UpdateResult updateResult = collectionSeries.updateOne(Filters.eq("_id", accession),
                        new Document("$set", docSeries));
                if (updateResult.getMatchedCount() == 0) {
                    collectionSeries.insertOne(docSeries);
                }
            }
            System.out.println(docSeries);

            // ===== Import clinical data =====
            String url = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + series.getSdrf();
            System.out.println(url);
            String clindata = webService.loadUrl(url);
            String[] clinparts = clindata.split(lineSeparator);
            List<String> data = new ArrayList<String>(Arrays.asList(clinparts));

            // ===== Recognize samples =====
            List<String> header = this.createHeader(data.get(0), pattern);
            System.out.println(header);
            for (int i = 1; i < data.size(); i++) {
                Integer nbSamples = data.size() - 1;
                Map<String, Object> mapParameters = this.createMapParameters(data.get(i), header);
                String idSample = this.createIdSample(mapParameters);
                if (idSample == null) {
                    System.err.println("ERROR: idSample is not recognized for " + accession);
                    System.out.println("Line " + i);
                    System.out.println(mapParameters);
                    mongoClient.close();
                    System.exit(0);
                } else {
                    if (formatIdSample) {
                        idSample = accession + "-" + idSample;
                        idSample = idSample.trim().replaceAll(" ", "-");
                    }
                }
                idSample = idSample.split(" ")[0].trim();

                // === Organism ===
                String organism = (String) mapParameters.get("organism");
                if (organism == null || organism.isEmpty()) {
                    organism = defaultOrganism;
                }

                // === Platform ===
                String platform = (String) mapParameters.get("LIBRARY_STRATEGY");
                if (platform != null && !platform.isEmpty()) {
                    platform = platform.toLowerCase().trim();
                } else {
                    platform = defaultPlatform;
                }

                Document docSampleExist = collectionSamples.find(Filters.eq("_id", idSample)).first();
                boolean docAlreadyExist = docSampleExist != null;
                boolean analysed = false;
                if (docAlreadyExist) {
                    analysed = (Boolean) docSampleExist.get("analyzed");
                }

                // ===== Sample Document =====
                Document docSample = mongoService.createSample(idSample, (String) docSeries.get("_id"),
                        accessionAsList, organism, (Date) docSeries.get("submission_date"),
                        (Date) docSeries.get("last_update"), analysed);
                Document expGroup = null;
                Document parameters = null;

                if (docAlreadyExist) {
                    // === ID sample already exists ===
                    System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample
                            + ": already exists in the database, analyzed=" + analysed);
                    expGroup = docSampleExist.get("exp_group", Document.class);
                    parameters = mongoService.updateParameters(docSampleExist, mapParameters);
                } else {
                    // === New sample ===
                    System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample);
                    expGroup = mongoService.createExpGroup(docSample, platform, null, null, organism);
                    parameters = mongoService.createParameters(docSample, mapParameters);
                    nbImportedSamples++;
                }

                // === Update sample_title, sample_source, layout ===
                expGroup.put("sample_title", parameters.getString("organism part"));
                expGroup.put("sample_source", parameters.getString("Source Name"));
                expGroup.put("layout", parameters.getString("LIBRARY_LAYOUT"));
                docSample.append("exp_group", expGroup);
                docSample.append("parameters", parameters);

                if (commit) {
                    // === Update old if it already exists ===
                    if (docAlreadyExist) {
                        // collectionSamples.deleteOne(eq("_id", idSample));
                        collectionSamples.updateOne(Filters.eq("_id", idSample), new Document("$set", docSample));
                    } else {
                        // ===== Insert data =====
                        collectionSamples.insertOne(docSample);
                    }
                    // ===== Update series for platforms =====
                    List<String> listPlatforms = collectionSamples
                            .distinct("exp_group.id_platform", Filters.in("series", accession), String.class)
                            .into(new ArrayList<String>());
                    docSeries.append("platforms", listPlatforms);
                    collectionSeries.updateOne(Filters.eq("_id", accession), new Document("$set", docSeries));
                }
            }
        } else {
            System.out.println("GEO accession " + gseNumber + " corresponding to " + accession
                    + " already exists. Skipping import.");
        }
        System.out.println("Number of imported samples: " + nbImportedSamples);
    }
    mongoClient.close();
}
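Note how the distinct call in the commit branch works: Filters.in("series", accession) builds { "series": { "$in": [ accession ] } }, matching samples whose series array contains the accession, while the dotted field path reaches into the embedded exp_group document. That step in isolation (names as in the example above):

    Bson filter = Filters.in("series", accession); // matches docs whose "series" array contains accession
    List<String> listPlatforms = collectionSamples
            .distinct("exp_group.id_platform", filter, String.class) // dotted path into the embedded doc
            .into(new ArrayList<String>());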
From source file:module.ImportArrayExpressInit.java
License:Open Source License
public ImportArrayExpressInit() {
    // ===== Connection =====
    MongoClient mongoClient = MongoUtil.buildMongoClient();
    MongoDatabase db = mongoClient.getDatabase("epimed_experiments");
    MongoCollection<Document> collectionSeries = db.getCollection("series");
    MongoCollection<Document> collectionSamples = db.getCollection("sample");

    // ===== Pattern =====
    String patternText = "\\[[\\p{Print}\\p{Space}]+\\]";
    Pattern pattern = Pattern.compile(patternText);

    // ===== Series =====
    for (String accession : listAccessions) {
        String urlString = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + accession + ".idf.txt";
        System.out.println(urlString);
        String text = webService.loadUrl(urlString);
        String[] parts = text.split(lineSeparator);
        List<String> dataSeries = new ArrayList<String>(Arrays.asList(parts));
        AESeries series = new AESeries(dataSeries);
        System.out.println(series);

        // ===== Check if already imported as a GSE =====
        boolean isGseFound = false;
        String gseNumber = null;
        for (String secondaryAccession : series.getListAccessions()) {
            if (secondaryAccession.startsWith("GSE")) {
                gseNumber = secondaryAccession;
                Document gse = db.getCollection("series").find(Filters.eq("_id", secondaryAccession)).first();
                isGseFound = gse != null;
                // System.out.println("GEO accession " + gseNumber + " found: " + isGseFound);
            }
        }

        if (!isGseFound) {
            // ===== Create Mongo series =====
            List<String> listSeriesAcc = new ArrayList<String>();
            listSeriesAcc.add(accession);
            Document docSeries = mongoService.createSeries(accession, series.getTitle(), null,
                    series.getSubmissionDate(), series.getSubmissionDate());
            if (series.getListAccessions() != null && !series.getListAccessions().isEmpty()) {
                listSeriesAcc.addAll(series.getListAccessions());
            }
            docSeries.put("accessions", listSeriesAcc);
            UpdateResult updateResult = collectionSeries.updateOne(Filters.eq("_id", accession),
                    new Document("$set", docSeries));
            if (updateResult.getMatchedCount() == 0) {
                collectionSeries.insertOne(docSeries);
            }
            System.out.println(docSeries);

            // ===== Import clinical data =====
            String url = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + series.getSdrf();
            System.out.println(url);
            String clindata = webService.loadUrl(url);
            String[] clinparts = clindata.split(lineSeparator);
            List<String> data = new ArrayList<String>(Arrays.asList(clinparts));

            // ===== Samples =====
            List<String> header = this.createHeader(data.get(0), pattern);
            System.out.println(header);
            for (int i = 1; i < data.size(); i++) {
                Integer nbSamples = data.size() - 1;
                Map<String, Object> mapParameters = this.createParameters(data.get(i), header);
                String idSample = this.createIdSample(mapParameters);
                if (idSample == null) {
                    System.err.println("idSample is not recognized for " + mapParameters);
                    mongoClient.close();
                    System.exit(0);
                }

                String organism = (String) mapParameters.get("organism");
                if (organism == null || organism.isEmpty()) {
                    organism = "Homo sapiens";
                }
                String platform = (String) mapParameters.get("LIBRARY_STRATEGY");
                if (platform != null && !platform.isEmpty()) {
                    platform = platform.toLowerCase().trim();
                } else {
                    platform = "rna-seq";
                }
                String layout = (String) mapParameters.get("LIBRARY_LAYOUT");
                if (layout != null && !layout.isEmpty()) {
                    layout = layout.toLowerCase().trim();
                }

                Document docSampleExist = collectionSamples.find(Filters.eq("_id", idSample)).first();
                boolean docAlreadyExist = docSampleExist != null;
                boolean analysed = false;
                if (docAlreadyExist) {
                    analysed = (Boolean) docSampleExist.get("analyzed");
                    System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample
                            + ": already exists in the database, analyzed=" + analysed);
                } else {
                    System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample);
                }

                // ===== Sample Document =====
                Document docSample = mongoService.createSample(idSample, (String) docSeries.get("_id"),
                        listSeriesAcc, organism, (Date) docSeries.get("submission_date"),
                        (Date) docSeries.get("last_update"), analysed);

                // ===== Mandatory parameters =====
                // Preserve "exp_group" if the document already exists
                Document expGroup = null;
                if (docAlreadyExist) {
                    expGroup = (Document) docSampleExist.get("exp_group");
                } else {
                    expGroup = mongoService.createExpGroup(docSample, platform,
                            (String) mapParameters.get("organism part"),
                            (String) mapParameters.get("Source Name"), organism);
                    if (layout != null) {
                        expGroup.append("layout", layout);
                        // run_name
                        int j = 0;
                        boolean isFound = false;
                        String runName = null;
                        while (!isFound && j < listRunNameParameters.length) {
                            runName = (String) mapParameters.get(listRunNameParameters[j]);
                            isFound = runName != null;
                            j++;
                        }
                        if (runName != null) {
                            expGroup.append("run_name", runName);
                        }
                    }
                }
                docSample.append("exp_group", expGroup);

                // ===== Supplementary parameters =====
                Document parameters = mongoService.createParameters(docSample, mapParameters);
                docSample.append("parameters", parameters);

                // === Delete if it already exists ===
                collectionSamples.deleteOne(Filters.eq("_id", idSample));

                // ===== Insert data =====
                collectionSamples.insertOne(docSample);

                // ===== Update series for platforms =====
                List<String> listPlatforms = collectionSamples
                        .distinct("exp_group.id_platform", Filters.in("series", accession), String.class)
                        .into(new ArrayList<String>());
                docSeries.append("platforms", listPlatforms);
                collectionSeries.updateOne(Filters.eq("_id", accession), new Document("$set", docSeries));
            }
        } else {
            System.out.println("GEO accession " + gseNumber + " corresponding to " + accession
                    + " already exists. Skipping import.");
        }
    }
    mongoClient.close();
}
From source file:module.test.CustomExport.java
License:Open Source License
public CustomExport() {
    // ===== Connection =====
    MongoClient mongoClient = MongoUtil.buildMongoClient();
    MongoDatabase db = mongoClient.getDatabase("epimed_experiments");
    MongoCollection<Document> collection = db.getCollection("samples");

    // ===== Find exp_group in the database =====

    // === Query 1 ===
    /*
    String queryName = "breast_cancer_GPL570";
    List<Bson> filters = new ArrayList<Bson>();
    filters.add(Filters.eq("exp_group.id_platform", "GPL570"));
    filters.add(Filters.eq("exp_group.id_topology_group", "C50"));
    filters.add(Filters.eq("exp_group.id_tissue_status", 3)); // tumoral
    */

    // === Query 2 ===
    /*
    String queryName = "breast_normal_GPL570";
    List<Bson> filters = new ArrayList<Bson>();
    filters.add(Filters.eq("exp_group.id_platform", "GPL570"));
    filters.add(Filters.eq("exp_group.id_topology_group", "C50"));
    filters.add(Filters.eq("exp_group.id_tissue_status", 1)); // normal
    */

    // === Query 3 ===
    String queryName = "breast_cancer_with_survival_GPL570";
    List<Bson> filters = new ArrayList<Bson>();
    filters.add(Filters.eq("exp_group.id_platform", "GPL570"));
    filters.add(Filters.eq("exp_group.id_topology_group", "C50"));
    filters.add(Filters.eq("exp_group.id_tissue_status", 3)); // tumoral
    filters.add(Filters.or(Filters.ne("exp_group.os_months", null), Filters.ne("exp_group.dfss_months", null),
            Filters.ne("exp_group.relapsed", null), Filters.ne("exp_group.dead", null)));

    Bson filter = Filters.and(filters);
    Long nbSamples = collection.count(filter);
    List<String> listSeries = collection.distinct("exp_group.main_gse_number", filter, String.class)
            .into(new ArrayList<String>());
    queryName = queryName + "_" + nbSamples + "_samples_" + listSeries.size() + "_series";

    List<Document> docExpGroup = collection.find(filter)
            .projection(Projections.fields(Projections.include("exp_group"), Projections.excludeId()))
            .into(new ArrayList<Document>());
    List<Document> docParam = collection.find(filter)
            .projection(Projections.fields(Projections.include("parameters"), Projections.excludeId()))
            .into(new ArrayList<Document>());
    mongoClient.close();

    // ===== Load exp_group into a matrix =====
    List<String> headerExpGroup = new ArrayList<String>();
    List<Object> dataExpGroup = new ArrayList<Object>();
    for (int i = 0; i < docExpGroup.size(); i++) {
        Map<String, String> expGroup = (Map<String, String>) docExpGroup.get(i).get("exp_group");
        if (i == 0) {
            headerExpGroup.addAll(expGroup.keySet());
        }
        Object[] dataLine = new Object[headerExpGroup.size()];
        for (int j = 0; j < headerExpGroup.size(); j++) {
            dataLine[j] = expGroup.get(headerExpGroup.get(j));
        }
        dataExpGroup.add(dataLine);
    }

    // ===== Load parameters into a matrix =====
    Set<String> headerParamSet = new HashSet<String>();
    List<String> headerParam = new ArrayList<String>();
    List<Object> dataParam = new ArrayList<Object>();
    for (int i = 0; i < docParam.size(); i++) {
        Map<String, String> param = (Map<String, String>) docParam.get(i).get("parameters");
        headerParamSet.addAll(param.keySet());
    }
    headerParam.addAll(headerParamSet);
    Collections.sort(headerParam);
    for (int i = 0; i < docParam.size(); i++) {
        Map<String, String> param = (Map<String, String>) docParam.get(i).get("parameters");
        Object[] dataLine = new Object[headerParam.size()];
        for (int j = 0; j < headerParam.size(); j++) {
            dataLine[j] = param.get(headerParam.get(j));
        }
        // System.out.println(Arrays.toString(dataLine));
        dataParam.add(dataLine);
    }

    // === Output ===
    String fileName = this.getOutputDirectory() + this.getDirSeparator() + "EpiMed_database_" + queryName + "_"
            + dateFormat.format(new Date()) + ".xlsx";
    System.out.println(fileName);
    XSSFWorkbook workbook = fileService.createWorkbook();
    fileService.addSheet(workbook, "exp_group_" + dateFormat.format(new Date()), headerExpGroup, dataExpGroup);
    fileService.addSheet(workbook, "parameters_" + dateFormat.format(new Date()), headerParam, dataParam);
    fileService.writeWorkbook(workbook, fileName);
}
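A side note on this example: MongoCollection.count(Bson) was deprecated in driver 3.8 and removed in the 4.x drivers. On newer drivers the equivalent is countDocuments, while the distinct call is unchanged:

    long nbSamples = collection.countDocuments(filter); // replaces the deprecated collection.count(filter)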
From source file:rapture.table.mongodb.MongoIndexHandler.java
License:Open Source License
public TableQueryResult query(final IndexQuery indexQuery) {
    if (log.isDebugEnabled()) {
        log.debug("Parsed query " + indexQuery);
    }
    TableQueryResult res = new TableQueryResult();
    final Document mongoQuery = getClause(indexQuery.getWhere());
    final MongoCollection<Document> collection = MongoDBFactory.getCollection(instanceName, tableName);
    List<List<Object>> rows = new ArrayList<>();
    List<String> fieldList = indexQuery.getSelect().getFieldList();

    // Mongo can't do distinct based on multiple fields for some reason
    if (!indexQuery.isDistinct()) {
        // What fields to return
        final Document fields = new Document();
        for (String fieldName : indexQuery.getSelect().getFieldList()) {
            log.debug("Adding return field " + fieldName);
            fields.put(fieldName, 1);
        }
        res.setColumnNames(indexQuery.getSelect().getFieldList());
        fields.put(KEY, 1);

        MongoRetryWrapper<List<List<Object>>> wrapper = new MongoRetryWrapper<List<List<Object>>>() {

            @Override
            public FindIterable<Document> makeCursor() {
                FindIterable<Document> ret;
                if (fields.isEmpty()) {
                    ret = collection.find(mongoQuery);
                } else {
                    fields.put(KEY, 1);
                    ret = collection.find(mongoQuery).projection(fields);
                }
                if (indexQuery.getOrderBy().getFieldList().size() > 0) {
                    Document sort = new Document();
                    for (String field : indexQuery.getOrderBy().getFieldList()) {
                        sort.put(field, indexQuery.getDirection() != OrderDirection.DESC ? 1 : -1);
                    }
                    ret = ret.sort(sort);
                }
                int skip = indexQuery.getSkip();
                if (skip > 0) {
                    ret = ret.skip(skip);
                }
                int limit = indexQuery.getLimit();
                if (limit > 0) {
                    // By specifying a negative limit we tell Mongo that it can close the cursor
                    // after returning a single batch.
                    ret = ret.limit(-(limit));
                }
                return ret;
            }

            @Override
            public List<List<Object>> action(FindIterable<Document> cursor) {
                List<List<Object>> rows = new ArrayList<>();
                for (Document obj : cursor) {
                    List<Object> row = new ArrayList<>();
                    for (String field : indexQuery.getSelect().getFieldList()) {
                        row.add(obj.get(field));
                    }
                    rows.add(row);
                }
                return rows;
            }
        };
        res.setRows(wrapper.doAction());
        return res; // We are done.
    } else if (fieldList.size() > 1) {
        // What fields to return
        final Document fields = new Document();
        for (String fieldName : indexQuery.getSelect().getFieldList()) {
            log.debug("Adding return field " + fieldName);
            fields.put(fieldName, 1);
        }
        res.setColumnNames(indexQuery.getSelect().getFieldList());
        fields.put(KEY, 1);

        MongoRetryWrapper<List<List<Object>>> wrapper = new MongoRetryWrapper<List<List<Object>>>() {

            @Override
            public FindIterable<Document> makeCursor() {
                FindIterable<Document> ret;
                if (fields.isEmpty()) {
                    ret = collection.find(mongoQuery);
                } else {
                    fields.put(KEY, 1);
                    ret = collection.find(mongoQuery).projection(fields);
                }
                if (indexQuery.getOrderBy().getFieldList().size() > 0) {
                    Document sort = new Document();
                    for (String field : indexQuery.getOrderBy().getFieldList()) {
                        sort.put(field, indexQuery.getDirection() != OrderDirection.DESC ? 1 : -1);
                    }
                    ret = ret.sort(sort);
                }
                // We can't apply SKIP and LIMIT here because we must drop the fields that aren't distinct;
                // Mongo doesn't appear to support distinct on multiple keys
                return ret;
            }

            @Override
            public List<List<Object>> action(FindIterable<Document> cursor) {
                int limit = (indexQuery.getSkip()) + (indexQuery.getLimit());
                if (limit == 0) {
                    limit = Integer.MAX_VALUE;
                }
                List<List<Object>> rows = new ArrayList<>();
                for (Document obj : cursor) {
                    List<Object> row = new ArrayList<>();
                    for (String field : indexQuery.getSelect().getFieldList()) {
                        row.add(obj.get(field));
                    }
                    if (indexQuery.isDistinct() && rows.contains(row)) {
                        continue;
                    }
                    rows.add(row);
                    if (rows.size() > limit) {
                        break;
                    }
                }
                return rows;
            }
        };
        rows = wrapper.doAction();
        // We are not done - still need to apply skip and limit
    } else {
        String key = fieldList.get(0);
        DistinctIterable<String> values = collection.distinct(key, mongoQuery, String.class);
        for (String v : values) {
            rows.add(ImmutableList.of(v));
        }
        res.setColumnNames(ImmutableList.of(key));
        if (indexQuery.getOrderBy().getFieldList().size() > 0) {
            List<String> columnNames = indexQuery.getSelect().getFieldList();
            Collections.sort(rows, RowComparatorFactory.createComparator(indexQuery.getOrderBy().getFieldList(),
                    columnNames, indexQuery.getDirection()));
            if (indexQuery.getDirection() == OrderDirection.DESC) {
                Collections.reverse(rows);
            }
        }
    }
    int skip = (indexQuery.getSkip());
    if (skip < rows.size()) {
        int limit = indexQuery.getLimit();
        if ((limit > 0) && (rows.size() - skip > limit)) {
            res.setRows(rows.subList(skip, skip + limit));
        } else {
            res.setRows(rows);
        }
    }
    // else all rows are skipped
    return res;
}
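As the comments in the multi-field branch note, the distinct command operates on a single field, which is why this code falls back to fetching full rows and deduplicating them by hand with rows.contains(row). An alternative (a sketch, not part of the Rapture source; field names are illustrative) is to let the server deduplicate by grouping on a compound _id in the aggregation pipeline:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import org.bson.Document;
    import com.mongodb.client.MongoCollection;

    public class MultiFieldDistinct {
        // Returns the distinct (fieldA, fieldB) pairs among documents matching mongoQuery.
        static List<Document> distinctPairs(MongoCollection<Document> collection, Document mongoQuery) {
            List<Document> pipeline = Arrays.asList(
                    new Document("$match", mongoQuery),
                    // Grouping on a compound _id deduplicates on both fields at once,
                    // server-side, instead of in application memory.
                    new Document("$group", new Document("_id",
                            new Document("a", "$fieldA").append("b", "$fieldB"))));
            return collection.aggregate(pipeline).into(new ArrayList<Document>());
        }
    }

With this approach $skip and $limit stages could also be pushed into the pipeline after the $group, rather than applied to the in-memory row list as the code above does.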