module.ImportArrayExpress.java Source code

Java tutorial

Introduction

Here is the source code for module.ImportArrayExpress.java

Source

/**
 * EpiMed - Information system for bioinformatics developments in the field of epigenetics
 * 
 * This software is a computer program which performs the data management 
 * for EpiMed platform of the Institute for Advances Biosciences (IAB)
 *
 * Copyright University of Grenoble Alps (UGA)
 * GNU GENERAL PUBLIC LICENSE
 * Please check LICENSE file
 *
 * Author: Ekaterina Bourova-Flin 
 *
 */
package module;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.bson.Document;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.result.UpdateResult;

import config.MongoUtil;
import model.bind.AESeries;
import service.MongoService;
import service.WebService;

public class ImportArrayExpress {

    private static String columnSeparator = "\t";
    private static String lineSeparator = "\n";
    private static String defaultOrganism = "Homo sapiens";
    private static String defaultPlatform = "rna-seq";

    private String[] listAccessions = { "E-MTAB-2919" };
    private boolean commit = true;
    private boolean formatIdSample = false; // concatenation with accession (recommended true)

    private WebService webService = new WebService();
    private MongoService mongoService = new MongoService();

    public ImportArrayExpress() {

        // ===== Connection =====

        MongoClient mongoClient = MongoUtil.buildMongoClient();
        MongoDatabase db = mongoClient.getDatabase("epimed_experiments");
        MongoCollection<Document> collectionSeries = db.getCollection("series");
        MongoCollection<Document> collectionSamples = db.getCollection("sample");

        // ===== Pattern =====
        String patternText = "\\[[\\p{Print}\\p{Space}]+\\]";
        ;
        Pattern pattern = Pattern.compile(patternText);

        // ===== Series =====

        for (String accession : listAccessions) {

            List<String> accessionAsList = new ArrayList<String>();
            accessionAsList.add(accession);

            String urlString = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + accession
                    + ".idf.txt";
            System.out.println(urlString);
            String text = webService.loadUrl(urlString);

            String[] parts = text.split(lineSeparator);
            List<String> dataSeries = new ArrayList<String>(Arrays.asList(parts));

            AESeries series = new AESeries(dataSeries);
            System.out.println(series);

            // ===== Check if already imported as a GSE ===== 
            boolean isGseFound = false;
            String gseNumber = null;
            for (String secondaryAccession : series.getListAccessions()) {
                if (secondaryAccession.startsWith("GSE")) {
                    gseNumber = secondaryAccession;
                    Document gse = db.getCollection("series").find(Filters.eq("_id", secondaryAccession)).first();
                    isGseFound = gse != null;

                }
            }

            int nbImportedSamples = 0;

            if (!isGseFound) {

                // ===== Create Mongo series =====

                Document docSeries = mongoService.createSeries(accession, series.getTitle(), null,
                        series.getSubmissionDate(), series.getSubmissionDate());

                if (series.getListAccessions() != null && !series.getListAccessions().isEmpty()) {
                    docSeries.put("secondary_accessions", series.getListAccessions());
                }

                if (commit) {
                    UpdateResult updateResult = collectionSeries.updateOne(Filters.eq("_id", accession),
                            new Document("$set", docSeries));
                    if (updateResult.getMatchedCount() == 0) {
                        collectionSeries.insertOne(docSeries);
                    }
                }

                System.out.println(docSeries);

                // ===== Import clinical data =====

                String url = "https://www.ebi.ac.uk/arrayexpress/files/" + accession + "/" + series.getSdrf();
                System.out.println(url);
                String clindata = webService.loadUrl(url);

                String[] clinparts = clindata.split(lineSeparator);
                List<String> data = new ArrayList<String>(Arrays.asList(clinparts));

                // ===== Recognize samples =====

                List<String> header = this.createHeader(data.get(0), pattern);
                System.out.println(header);

                for (int i = 1; i < data.size(); i++) {

                    Integer nbSamples = data.size() - 1;

                    Map<String, Object> mapParameters = this.createMapParameters(data.get(i), header);
                    String idSample = this.createIdSample(mapParameters);

                    if (idSample == null) {
                        System.err.println("ERROR: idSample is not recongnized for " + accession);
                        System.out.println("Line " + i);
                        System.out.println(mapParameters);
                        mongoClient.close();
                        System.exit(0);
                    } else {
                        if (formatIdSample) {
                            idSample = accession + "-" + idSample;
                            idSample = idSample.trim().replaceAll(" ", "-");
                        }
                    }
                    idSample = idSample.split(" ")[0].trim();

                    // === Organism ===
                    String organism = (String) mapParameters.get("organism");
                    if (organism == null || organism.isEmpty()) {
                        organism = defaultOrganism;
                    }

                    // === Platform ===
                    String platform = (String) mapParameters.get("LIBRARY_STRATEGY");
                    if (platform != null && !platform.isEmpty()) {
                        platform = platform.toLowerCase().trim();
                    } else {
                        platform = defaultPlatform;
                    }

                    Document docSampleExist = collectionSamples.find(Filters.eq("_id", idSample)).first();
                    boolean docAlreadyExist = docSampleExist != null;

                    boolean analysed = false;

                    if (docAlreadyExist) {
                        analysed = (Boolean) docSampleExist.get("analyzed");
                    }

                    // ===== Sample Document =====

                    Document docSample = mongoService.createSample(idSample, (String) docSeries.get("_id"),
                            accessionAsList, organism, (Date) docSeries.get("submission_date"),
                            (Date) docSeries.get("last_update"), analysed);

                    Document expGroup = null;
                    Document parameters = null;

                    // System.out.println("------------------------------------------------------------------");

                    if (docAlreadyExist) {
                        // === ID sample alredy exists ===
                        System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample
                                + ":  already exists in the database, analyzed=" + analysed);
                        expGroup = docSampleExist.get("exp_group", Document.class);
                        parameters = mongoService.updateParameters(docSampleExist, mapParameters);
                    } else {
                        // === New sample ===
                        System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample);
                        expGroup = mongoService.createExpGroup(docSample, platform, null, null, organism);
                        parameters = mongoService.createParameters(docSample, mapParameters);
                        nbImportedSamples++;
                    }

                    // === Update sample_title, sample_source, layout ===
                    expGroup.put("sample_title", parameters.getString("organism part"));
                    expGroup.put("sample_source", parameters.getString("Source Name"));
                    expGroup.put("layout", parameters.getString("LIBRARY_LAYOUT"));

                    docSample.append("exp_group", expGroup);
                    docSample.append("parameters", parameters);

                    if (commit) {

                        // === Update old if already exist ===
                        if (docAlreadyExist) {
                            // collectionSamples.deleteOne(eq("_id", idSample));
                            collectionSamples.updateOne(Filters.eq("_id", idSample),
                                    new Document("$set", docSample));
                        } else {
                            // ===== Insert data =====
                            collectionSamples.insertOne(docSample);
                        }

                        // ===== Update series for platforms =====
                        List<String> listPlatforms = collectionSamples
                                .distinct("exp_group.id_platform", Filters.in("series", accession), String.class)
                                .into(new ArrayList<String>());
                        docSeries.append("platforms", listPlatforms);
                        collectionSeries.updateOne(Filters.eq("_id", accession), new Document("$set", docSeries));
                    }

                }

            } else {
                System.out.println("GEO accession " + gseNumber + " corresponding to  " + accession
                        + " exists already. Skip import.");
            }

            System.out.println("Number of imported samples: " + nbImportedSamples);

        }

        mongoClient.close();

    }

    /** =============================================================== */

    public String createIdSample(Map<String, Object> mapParameters) {

        String[] listKeySample = { "Source Name", "BioSD_Sample", "ENA_RUN", "RUN_NAME", "Scan Name" };

        // run_name
        int j = 0;
        boolean isFound = false;
        String idSample = null;
        while (!isFound && j < listKeySample.length) {
            idSample = (String) mapParameters.get(listKeySample[j]);
            isFound = idSample != null;
            j++;
        }

        return idSample;

    }

    /** =============================================================== */

    public Map<String, Object> createMapParameters(String dataline, List<String> header) {

        Map<String, Object> mapParameters = new HashMap<String, Object>();

        String[] parts = dataline.split(columnSeparator);

        for (int i = 0; i < parts.length; i++) {
            String current = parts[i];
            String existing = (String) mapParameters.get(header.get(i));
            String merged = mongoService.mergeParameter(current, existing);
            mapParameters.put(header.get(i), merged);
        }

        return mapParameters;
    }

    /** =============================================================== */

    public List<String> createHeader(String headerText, Pattern pattern) {

        List<String> list = new ArrayList<String>();
        String[] parts = headerText.split(columnSeparator);

        for (String part : parts) {
            Matcher matcher = pattern.matcher(part);
            boolean isPatternFound = matcher.find();
            if (isPatternFound) {
                String value = matcher.group();
                list.add(value.replaceAll("[\\]\\[]", "").trim());
            } else {
                list.add(part.trim());
            }
        }

        return list;

    }

    /** =============================================================== */

    public static void main(String[] args) {
        new ImportArrayExpress();
    }

    /** ============================================================== */

}