Java tutorial
/** * EpiMed - Information system for bioinformatics developments in the field of epigenetics * * This software is a computer program which performs the data management * for EpiMed platform of the Institute for Advances Biosciences (IAB) * * Copyright University of Grenoble Alps (UGA) * GNU GENERAL PUBLIC LICENSE * Please check LICENSE file * * Author: Ekaterina Bourova-Flin * */ package module.script; import static com.mongodb.client.model.Filters.eq; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.bson.Document; import com.mongodb.MongoClient; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import com.mongodb.client.model.Filters; import com.mongodb.client.result.UpdateResult; import config.MongoUtil; import model.bind.AESeries; import service.MongoService; import service.WebService; public class ImportArrayExpress1733 { private static String columnSeparator = "\t"; private static String lineSeparator = "\n"; private static String defaultOrganism = "Homo sapiens"; private static String defaultPlatform = "rna-seq"; private String[] listAccessions = { "E-MTAB-1733" }; private boolean commit = true; private boolean formatIdSample = true; // concatenation with accession (recommended true) private WebService webService = new WebService(); private MongoService mongoService = new MongoService(); public ImportArrayExpress1733() { // ===== Connection ===== MongoClient mongoClient = MongoUtil.buildMongoClient(); MongoDatabase db = mongoClient.getDatabase("epimed_experiments"); MongoCollection<Document> collectionSeries = db.getCollection("series"); MongoCollection<Document> collectionSamples = db.getCollection("samples"); // ===== Pattern ===== String patternText = "\\[[\\p{Print}\\p{Space}]+\\]"; ; Pattern pattern = Pattern.compile(patternText); // ===== Series ===== for (String accession : listAccessions) { List<String> accessionAsList = new ArrayList<String>(); accessionAsList.add(accession); String urlString = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + accession + ".idf.txt"; System.out.println(urlString); String text = webService.loadUrl(urlString); String[] parts = text.split(lineSeparator); List<String> dataSeries = new ArrayList<String>(Arrays.asList(parts)); AESeries series = new AESeries(dataSeries); System.out.println(series); // ===== Check if already imported as a GSE ===== boolean isGseFound = false; String gseNumber = null; for (String secondaryAccession : series.getListAccessions()) { if (secondaryAccession.startsWith("GSE")) { gseNumber = secondaryAccession; Document gse = db.getCollection("series").find(Filters.eq("_id", secondaryAccession)).first(); isGseFound = gse != null; } } int nbImportedSamples = 0; if (!isGseFound) { // ===== Create Mongo series ===== Document docSeries = mongoService.createSeries(accession, series.getTitle(), null, series.getSubmissionDate(), series.getSubmissionDate()); if (series.getListAccessions() != null && !series.getListAccessions().isEmpty()) { docSeries.put("secondary_accessions", series.getListAccessions()); } if (false) { UpdateResult updateResult = collectionSeries.updateOne(Filters.eq("_id", accession), new Document("$set", docSeries)); if (updateResult.getMatchedCount() == 0) { collectionSeries.insertOne(docSeries); } } System.out.println(docSeries); // ===== Import clinical data ===== String url = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + series.getSdrf(); System.out.println(url); String clindata = webService.loadUrl(url); String[] clinparts = clindata.split(lineSeparator); List<String> data = new ArrayList<String>(Arrays.asList(clinparts)); // ===== Recognize samples ===== List<String> header = this.createHeader(data.get(0), pattern); System.out.println(header); for (int i = 1; i < data.size(); i++) { Integer nbSamples = data.size() - 1; Map<String, Object> mapParameters = this.createMapParameters(data.get(i), header); String idSample = this.createIdSample(mapParameters); if (idSample == null) { System.err.println("ERROR: idSample is not recongnized for " + accession); System.out.println("Line " + i); System.out.println(mapParameters); mongoClient.close(); System.exit(0); } else { if (formatIdSample) { idSample = "E-MTAB-2836" + "-" + idSample; idSample = idSample.trim().replaceAll(" ", "-"); } } idSample = idSample.split(" ")[0].trim(); // === Organism === String organism = (String) mapParameters.get("organism"); if (organism == null || organism.isEmpty()) { organism = defaultOrganism; } // === Platform === String platform = (String) mapParameters.get("LIBRARY_STRATEGY"); if (platform != null && !platform.isEmpty()) { platform = platform.toLowerCase().trim(); } else { platform = defaultPlatform; } Document docSampleExist = collectionSamples.find(Filters.eq("_id", idSample)).first(); boolean docAlreadyExist = docSampleExist != null; System.out.println("docAlreadyExist " + docAlreadyExist); // === Delete old if already exist === if (docAlreadyExist) { List<String> listSeries = (List<String>) docSampleExist.get("series"); Set<String> setSeries = new HashSet<String>(); listSeries.add(accession); setSeries.addAll(listSeries); listSeries.clear(); listSeries.addAll(setSeries); docSampleExist.append("series", listSeries); System.out.println(docSampleExist); if (commit) { collectionSamples.deleteOne(eq("_id", docSampleExist.get("_id"))); collectionSamples.insertOne(docSampleExist); } } } } else { System.out.println("GEO accession " + gseNumber + " corresponding to " + accession + " exists already. Skip import."); } System.out.println("Number of imported samples: " + nbImportedSamples); } mongoClient.close(); } /** =============================================================== */ public String createIdSample(Map<String, Object> mapParameters) { String[] listKeySample = { "Source Name", "ENA_RUN", "RUN_NAME", "Scan Name" }; // run_name int j = 0; boolean isFound = false; String idSample = null; while (!isFound && j < listKeySample.length) { idSample = (String) mapParameters.get(listKeySample[j]); isFound = idSample != null; j++; } return idSample; } /** =============================================================== */ public Map<String, Object> createMapParameters(String dataline, List<String> header) { Map<String, Object> mapParameters = new HashMap<String, Object>(); String[] parts = dataline.split(columnSeparator); for (int i = 0; i < parts.length; i++) { String current = parts[i]; String existing = (String) mapParameters.get(header.get(i)); String merged = mongoService.mergeParameter(current, existing); mapParameters.put(header.get(i), merged); } return mapParameters; } /** =============================================================== */ public List<String> createHeader(String headerText, Pattern pattern) { List<String> list = new ArrayList<String>(); String[] parts = headerText.split(columnSeparator); for (String part : parts) { Matcher matcher = pattern.matcher(part); boolean isPatternFound = matcher.find(); if (isPatternFound) { String value = matcher.group(); list.add(value.replaceAll("[\\]\\[]", "").trim()); } else { list.add(part.trim()); } } return list; } /** =============================================================== */ public static void main(String[] args) { new ImportArrayExpress1733(); } /** ============================================================== */ }