module.ImportArrayExpressInit.java Source code

Java tutorial

Introduction

Here is the source code for module.ImportArrayExpressInit.java

Source

/**
 * EpiMed - Information system for bioinformatics developments in the field of epigenetics
 * 
 * This software is a computer program which performs the data management 
 * for the EpiMed platform of the Institute for Advanced Biosciences (IAB)
 *
 * Copyright University of Grenoble Alps (UGA)
 * GNU GENERAL PUBLIC LICENSE
 * Please check LICENSE file
 *
 * Author: Ekaterina Bourova-Flin 
 *
 */
package module;

import static com.mongodb.client.model.Filters.eq;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.bson.Document;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.result.UpdateResult;

import config.MongoUtil;
import model.bind.AESeries;
import service.MongoService;
import service.WebService;

/**
 * Imports ArrayExpress experiments into the "epimed_experiments" Mongo database.
 *
 * <p>For every accession in {@link #listAccessions} the importer downloads the
 * IDF file, creates or updates the corresponding document in the "series"
 * collection, then downloads the SDRF file and writes one document per data
 * line into the "sample" collection. An experiment is skipped when one of its
 * secondary "GSE" accessions is already present in the "series" collection.
 *
 * <p>The whole import runs from the constructor; {@link #main(String[])} simply
 * instantiates the class.
 */
public class ImportArrayExpressInit {

    private static final String COLUMN_SEPARATOR = "\t";
    private static final String LINE_SEPARATOR = "\n";
    private static final String FASTQ_EXTENSION = ".fastq";
    private static final String BASE_URL = "https://www.ebi.ac.uk/arrayexpress/files/";

    /** Parameter names probed, in this order, to find the run name of a sample. */
    private static final String[] RUN_NAME_PARAMETERS = { "RUN_NAME", "ENA_RUN", "Scan Name" };

    /** Matches the bracketed part of an SDRF column title, e.g. "[organism part]". */
    private static final Pattern HEADER_BRACKET_PATTERN = Pattern.compile("\\[[\\p{Print}\\p{Space}]+\\]");

    private String[] listAccessions = { "E-MTAB-2919" };

    private WebService webService = new WebService();
    private MongoService mongoService = new MongoService();

    /**
     * Runs the import for every accession of {@link #listAccessions}.
     *
     * <p>The Mongo client is always closed, including when an exception is thrown.
     * If a sample identifier cannot be derived from an SDRF line, the import
     * stops and the JVM exits with status 1 (after the client has been closed).
     */
    public ImportArrayExpressInit() {

        boolean aborted = false;

        // ===== Connection =====

        MongoClient mongoClient = MongoUtil.buildMongoClient();
        try {
            MongoDatabase db = mongoClient.getDatabase("epimed_experiments");
            MongoCollection<Document> collectionSeries = db.getCollection("series");
            MongoCollection<Document> collectionSamples = db.getCollection("sample");

            // ===== Series =====

            for (String accession : listAccessions) {
                if (!this.importAccession(accession, collectionSeries, collectionSamples)) {
                    aborted = true;
                    break;
                }
            }
        } finally {
            mongoClient.close();
        }

        if (aborted) {
            // Non-zero status: an unrecognized sample is a failure, not a normal termination.
            System.exit(1);
        }
    }

    /**
     * Imports one ArrayExpress experiment: the series document first, then its samples.
     *
     * @param accession         ArrayExpress accession, used to build the IDF URL and as series "_id"
     * @param collectionSeries  the "series" collection
     * @param collectionSamples the "sample" collection
     * @return {@code false} when the import must be aborted because a sample identifier
     *         could not be recognized, {@code true} otherwise (including skipped experiments)
     */
    private boolean importAccession(String accession, MongoCollection<Document> collectionSeries,
            MongoCollection<Document> collectionSamples) {

        String urlString = BASE_URL + accession + "/" + accession + ".idf.txt";
        System.out.println(urlString);
        String text = webService.loadUrl(urlString);
        if (text == null) {
            System.err.println("Unable to load " + urlString + ". Skip import.");
            return true;
        }

        List<String> dataSeries = new ArrayList<String>(Arrays.asList(text.split(LINE_SEPARATOR)));

        AESeries series = new AESeries(dataSeries);
        System.out.println(series);

        // ===== Check if already imported as a GSE =====

        String gseNumber = this.findImportedGse(series.getListAccessions(), collectionSeries);
        if (gseNumber != null) {
            System.out.println("GEO accession " + gseNumber + " corresponding to  " + accession
                    + " exists already. Skip import.");
            return true;
        }

        // ===== Create Mongo series =====

        List<String> listSeriesAcc = new ArrayList<String>();
        listSeriesAcc.add(accession);

        Document docSeries = mongoService.createSeries(accession, series.getTitle(), null,
                series.getSubmissionDate(), series.getSubmissionDate());

        if (series.getListAccessions() != null && !series.getListAccessions().isEmpty()) {
            listSeriesAcc.addAll(series.getListAccessions());
        }

        docSeries.put("accessions", listSeriesAcc);

        // Update the series in place when it exists, insert it otherwise
        UpdateResult updateResult = collectionSeries.updateOne(Filters.eq("_id", accession),
                new Document("$set", docSeries));
        if (updateResult.getMatchedCount() == 0) {
            collectionSeries.insertOne(docSeries);
        }

        System.out.println(docSeries);

        // ===== Import clinical data =====

        String url = BASE_URL + accession + "/" + series.getSdrf();
        System.out.println(url);
        String clindata = webService.loadUrl(url);
        if (clindata == null) {
            System.err.println("Unable to load " + url + ". Skip import.");
            return true;
        }

        List<String> data = new ArrayList<String>(Arrays.asList(clindata.split(LINE_SEPARATOR)));

        // ===== Samples =====

        List<String> header = this.createHeader(data.get(0), HEADER_BRACKET_PATTERN);
        System.out.println(header);

        return this.importSamples(accession, docSeries, listSeriesAcc, data, header, collectionSeries,
                collectionSamples);
    }

    /**
     * Looks for a secondary accession starting with "GSE" that already exists
     * as "_id" in the "series" collection.
     *
     * @param secondaryAccessions secondary accessions of the experiment, may be {@code null}
     * @param collectionSeries    the "series" collection
     * @return the first GSE accession found in the collection, or {@code null} if there is none
     */
    private String findImportedGse(Iterable<String> secondaryAccessions,
            MongoCollection<Document> collectionSeries) {

        if (secondaryAccessions == null) {
            return null;
        }

        for (String secondaryAccession : secondaryAccessions) {
            if (secondaryAccession != null && secondaryAccession.startsWith("GSE")) {
                Document gse = collectionSeries.find(Filters.eq("_id", secondaryAccession)).first();
                if (gse != null) {
                    // Stop at the first hit so that a later, unknown GSE cannot hide it
                    return secondaryAccession;
                }
            }
        }
        return null;
    }

    /**
     * Writes one sample document per SDRF data line, then refreshes the list of
     * platforms stored in the series document.
     *
     * @param accession         accession of the series being imported
     * @param docSeries         series document, read for "_id", "submission_date" and "last_update"
     * @param listSeriesAcc     all accessions attached to the series
     * @param data              SDRF lines; index 0 is the header line and is not imported
     * @param header            column names matching the SDRF header line
     * @param collectionSeries  the "series" collection
     * @param collectionSamples the "sample" collection
     * @return {@code false} if a sample identifier could not be recognized, {@code true} otherwise
     */
    private boolean importSamples(String accession, Document docSeries, List<String> listSeriesAcc,
            List<String> data, List<String> header, MongoCollection<Document> collectionSeries,
            MongoCollection<Document> collectionSamples) {

        int nbSamples = data.size() - 1;
        boolean isSampleImported = false;

        for (int i = 1; i < data.size(); i++) {

            String dataline = data.get(i);
            if (dataline == null || dataline.trim().isEmpty()) {
                // Blank line (e.g. a lone "\r" left by Windows line endings): nothing to import
                continue;
            }

            Map<String, Object> mapParameters = this.createParameters(dataline, header);
            String idSample = this.createIdSample(mapParameters);

            if (idSample == null) {
                System.err.println("idSample is not recongnized for " + mapParameters);
                if (isSampleImported) {
                    // Keep the series consistent with the samples written so far
                    this.updateSeriesPlatforms(accession, docSeries, collectionSeries, collectionSamples);
                }
                return false;
            }

            String organism = (String) mapParameters.get("organism");
            if (organism == null || organism.isEmpty()) {
                organism = "Homo sapiens";
            }

            String platform = (String) mapParameters.get("LIBRARY_STRATEGY");
            if (platform != null && !platform.isEmpty()) {
                platform = platform.toLowerCase().trim();
            } else {
                platform = "rna-seq";
            }

            String layout = (String) mapParameters.get("LIBRARY_LAYOUT");
            if (layout != null && !layout.isEmpty()) {
                layout = layout.toLowerCase().trim();
            }

            Document docSampleExist = collectionSamples.find(Filters.eq("_id", idSample)).first();
            boolean docAlreadyExist = docSampleExist != null;

            boolean analysed = false;

            if (docAlreadyExist) {
                // A missing "analyzed" field is read as false instead of failing on unboxing
                analysed = Boolean.TRUE.equals(docSampleExist.get("analyzed"));
                System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample
                        + ":  already exists in the database, analyzed=" + analysed);
            } else {
                System.out.println(i + "/" + nbSamples + "\t " + docSeries.get("_id") + "\t " + idSample);
            }

            // ===== Sample Document =====

            Document docSample = mongoService.createSample(idSample, (String) docSeries.get("_id"),
                    listSeriesAcc, organism, (Date) docSeries.get("submission_date"),
                    (Date) docSeries.get("last_update"), analysed);

            // ===== Mandatory parameters =====

            // Preserve "exp_group" if the document exists already
            Document expGroup = null;
            if (docAlreadyExist) {
                expGroup = (Document) docSampleExist.get("exp_group");
            } else {
                expGroup = mongoService.createExpGroup(docSample, platform,
                        (String) mapParameters.get("organism part"),
                        (String) mapParameters.get("Source Name"), organism);

                // NOTE(review): "run_name" is only looked up when a layout is present,
                // as in the initial implementation - confirm this coupling is intended.
                if (layout != null) {
                    expGroup.append("layout", layout);

                    String runName = null;
                    for (String runNameParameter : RUN_NAME_PARAMETERS) {
                        runName = (String) mapParameters.get(runNameParameter);
                        if (runName != null) {
                            break;
                        }
                    }
                    if (runName != null) {
                        expGroup.append("run_name", runName);
                    }
                }
            }

            docSample.append("exp_group", expGroup);

            // ===== Supplementary parameters =====

            Document parameters = mongoService.createParameters(docSample, mapParameters);
            docSample.append("parameters", parameters);

            // ===== Replace the sample: delete if already exists, then insert =====

            collectionSamples.deleteOne(Filters.eq("_id", idSample));
            collectionSamples.insertOne(docSample);
            isSampleImported = true;
        }

        // ===== Update series for platforms =====

        if (isSampleImported) {
            this.updateSeriesPlatforms(accession, docSeries, collectionSeries, collectionSamples);
        }

        return true;
    }

    /**
     * Stores in the series document the distinct "exp_group.id_platform" values
     * of the samples whose "series" field contains the accession.
     *
     * @param accession         accession of the series to update
     * @param docSeries         series document, receives the "platforms" field
     * @param collectionSeries  the "series" collection
     * @param collectionSamples the "sample" collection
     */
    private void updateSeriesPlatforms(String accession, Document docSeries,
            MongoCollection<Document> collectionSeries, MongoCollection<Document> collectionSamples) {

        List<String> listPlatforms = collectionSamples
                .distinct("exp_group.id_platform", Filters.in("series", accession), String.class)
                .into(new ArrayList<String>());
        docSeries.append("platforms", listPlatforms);
        collectionSeries.updateOne(Filters.eq("_id", accession), new Document("$set", docSeries));
    }

    /** =============================================================== */

    /**
     * Derives a sample identifier from the parameter values of one SDRF line.
     *
     * <p>The first String value containing ".fastq" or ".gz" is used: its last
     * path segment (after "/" or "\") is cut at the first literal ".fastq" and
     * trimmed. Values which are not Strings, or which lead to an empty
     * identifier, are ignored. When several values qualify, the one that is
     * picked depends on the iteration order of the map.
     *
     * @param mapParameters parameters of a sample, keyed by column name; may be {@code null}
     * @return the sample identifier, or {@code null} if no value looks like a file name
     */
    public String createIdSample(Map<String, Object> mapParameters) {

        if (mapParameters == null) {
            return null;
        }

        for (Map.Entry<String, Object> entry : mapParameters.entrySet()) {

            if (!(entry.getValue() instanceof String)) {
                continue;
            }

            String value = (String) entry.getValue();
            if (!value.contains(FASTQ_EXTENSION) && !value.contains(".gz")) {
                continue;
            }

            String[] parts = value.split("[/\\\\]");
            if (parts.length == 0) {
                // The value contains only path separators
                continue;
            }

            String fileName = parts[parts.length - 1];

            // Literal search: String.split(".fastq") would treat "." as "any character"
            int extensionStart = fileName.indexOf(FASTQ_EXTENSION);
            String idSample = extensionStart >= 0 ? fileName.substring(0, extensionStart) : fileName;
            idSample = idSample.trim();

            if (!idSample.isEmpty()) {
                return idSample;
            }
        }
        return null;
    }

    /** =============================================================== */

    /**
     * Maps the non-empty, trimmed values of a tab-separated data line to their column names.
     *
     * <p>Columns beyond the end of the header are ignored. When the header
     * contains the same name twice, the value of the last such column wins.
     *
     * @param dataline tab-separated data line; {@code null} gives an empty map
     * @param header   column names, in the column order of the data line; {@code null} gives an empty map
     * @return a new map from column name to trimmed value
     */
    public Map<String, Object> createParameters(String dataline, List<String> header) {

        Map<String, Object> mapParameters = new HashMap<String, Object>();

        if (dataline == null || header == null) {
            return mapParameters;
        }

        String[] parts = dataline.split(COLUMN_SEPARATOR);
        int nbColumns = Math.min(parts.length, header.size());

        for (int i = 0; i < nbColumns; i++) {
            String value = parts[i].trim();
            if (!value.isEmpty()) {
                mapParameters.put(header.get(i), value);
            }
        }

        return mapParameters;
    }

    /** =============================================================== */

    /**
     * Builds the list of column names from a tab-separated header line.
     *
     * <p>When the pattern is found in a column title, the matched text without
     * any "[" and "]" characters is used as the column name; otherwise the
     * whole title is used. Names are trimmed.
     *
     * @param headerText tab-separated header line
     * @param pattern    pattern locating the part of a column title to keep
     * @return the column names, in the order of the header line
     */
    public List<String> createHeader(String headerText, Pattern pattern) {

        List<String> list = new ArrayList<String>();

        for (String part : headerText.split(COLUMN_SEPARATOR)) {
            Matcher matcher = pattern.matcher(part);
            if (matcher.find()) {
                String value = matcher.group().replace("[", "").replace("]", "");
                list.add(value.trim());
            } else {
                list.add(part.trim());
            }
        }

        return list;

    }

    /** =============================================================== */

    /**
     * Command line entry point: runs the import.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        new ImportArrayExpressInit();
    }

    /** ============================================================== */

}