module.script.ImportArrayExpress1733.java Source code

Java tutorial

Introduction

Here is the source code for module.script.ImportArrayExpress1733.java

Source

/**
 * EpiMed - Information system for bioinformatics developments in the field of epigenetics
 * 
 * This software is a computer program which performs the data management 
 * for EpiMed platform of the Institute for Advanced Biosciences (IAB)
 *
 * Copyright University of Grenoble Alps (UGA)
 * GNU GENERAL PUBLIC LICENSE
 * Please check LICENSE file
 *
 * Author: Ekaterina Bourova-Flin 
 *
 */
package module.script;

import static com.mongodb.client.model.Filters.eq;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.bson.Document;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.result.UpdateResult;

import config.MongoUtil;
import model.bind.AESeries;
import service.MongoService;
import service.WebService;

/**
 * One-off script that links the samples of the ArrayExpress series listed in
 * {@link #listAccessions} to sample documents already stored in the
 * {@code epimed_experiments} MongoDB database.
 *
 * <p>For each accession the script:
 * <ol>
 * <li>downloads the {@code .idf.txt} file and parses it into an {@link AESeries};</li>
 * <li>skips the accession when one of its secondary {@code GSE} accessions is
 * already present in the {@code series} collection;</li>
 * <li>otherwise builds the series document (written only if {@link #commitSeries}
 * is enabled), downloads the SDRF file and, for every data row whose sample id
 * matches an existing document of the {@code samples} collection, adds the
 * accession to the {@code series} list of that document.</li>
 * </ol>
 *
 * <p>Rows whose sample id is not found in {@code samples} are left alone: this
 * script never creates new sample documents.
 *
 * <p>All the work is done by the constructor; {@link #main(String[])} simply
 * instantiates the class.
 */
public class ImportArrayExpress1733 {

    private static final String COLUMN_SEPARATOR = "\t";
    private static final String LINE_SEPARATOR = "\n";
    private static final String ARRAY_EXPRESS_FILES_URL = "https://www.ebi.ac.uk/arrayexpress/files/";

    /** Matches the bracketed part of a column title, e.g. {@code [organism]}. Compiled once. */
    private static final Pattern BRACKET_PATTERN = Pattern.compile("\\[[\\p{Print}\\p{Space}]+\\]");

    /**
     * Accession used as prefix of the formatted sample ids.
     * NOTE(review): this is not the accession being processed (E-MTAB-1733);
     * presumably the samples were first imported under E-MTAB-2836 and this
     * script only attaches them to the second series -- confirm before changing.
     */
    private static final String SAMPLE_ID_PREFIX = "E-MTAB-2836";

    private final String[] listAccessions = { "E-MTAB-1733" };
    private final boolean commit = true; // write the updated sample documents
    private final boolean commitSeries = false; // series upsert was disabled ("if (false)") in the original script
    private final boolean formatIdSample = true; // concatenation with accession (recommended true)

    private final WebService webService = new WebService();
    private final MongoService mongoService = new MongoService();

    /**
     * Runs the whole import for every accession of {@link #listAccessions}.
     *
     * <p>The Mongo client is closed in a {@code finally} block so the connection
     * is released even when a download or a database call throws.
     */
    public ImportArrayExpress1733() {

        // ===== Connection =====

        MongoClient mongoClient = MongoUtil.buildMongoClient();
        try {
            MongoDatabase db = mongoClient.getDatabase("epimed_experiments");
            MongoCollection<Document> collectionSeries = db.getCollection("series");
            MongoCollection<Document> collectionSamples = db.getCollection("samples");

            // ===== Series =====

            for (String accession : listAccessions) {
                this.importAccession(accession, collectionSeries, collectionSamples, mongoClient);
            }
        } finally {
            mongoClient.close();
        }

    }

    /** =============================================================== */

    /**
     * Processes one ArrayExpress accession: series check, series document and
     * update of the matching sample documents.
     *
     * @param accession         ArrayExpress accession, e.g. {@code E-MTAB-1733}
     * @param collectionSeries  the {@code series} collection
     * @param collectionSamples the {@code samples} collection
     * @param mongoClient       client to close before the JVM is stopped on a fatal error
     */
    private void importAccession(String accession, MongoCollection<Document> collectionSeries,
            MongoCollection<Document> collectionSamples, MongoClient mongoClient) {

        String urlString = ARRAY_EXPRESS_FILES_URL + accession + "/" + accession + ".idf.txt";
        System.out.println(urlString);
        String text = webService.loadUrl(urlString);

        List<String> dataSeries = new ArrayList<String>(Arrays.asList(text.split(LINE_SEPARATOR)));

        AESeries series = new AESeries(dataSeries);
        System.out.println(series);

        // ===== Check if already imported as a GSE =====
        // Stop at the first GSE accession that exists in the database, so that a
        // later, missing GSE cannot reset the result and gseNumber names the
        // series that was actually found.
        String gseNumber = null;
        if (series.getListAccessions() != null) {
            for (String secondaryAccession : series.getListAccessions()) {
                if (secondaryAccession.startsWith("GSE")
                        && collectionSeries.find(Filters.eq("_id", secondaryAccession)).first() != null) {
                    gseNumber = secondaryAccession;
                    break;
                }
            }
        }

        int nbImportedSamples = 0;

        if (gseNumber == null) {

            // ===== Create Mongo series =====

            Document docSeries = mongoService.createSeries(accession, series.getTitle(), null,
                    series.getSubmissionDate(), series.getSubmissionDate());

            if (series.getListAccessions() != null && !series.getListAccessions().isEmpty()) {
                docSeries.put("secondary_accessions", series.getListAccessions());
            }

            if (commitSeries) {
                UpdateResult updateResult = collectionSeries.updateOne(Filters.eq("_id", accession),
                        new Document("$set", docSeries));
                if (updateResult.getMatchedCount() == 0) {
                    collectionSeries.insertOne(docSeries);
                }
            }

            System.out.println(docSeries);

            // ===== Import clinical data =====

            String url = ARRAY_EXPRESS_FILES_URL + accession + "/" + series.getSdrf();
            System.out.println(url);
            String clindata = webService.loadUrl(url);

            List<String> data = new ArrayList<String>(Arrays.asList(clindata.split(LINE_SEPARATOR)));

            // ===== Recognize samples =====

            List<String> header = this.createHeader(data.get(0), BRACKET_PATTERN);
            System.out.println(header);

            for (int i = 1; i < data.size(); i++) {

                Map<String, Object> mapParameters = this.createMapParameters(data.get(i), header);
                String idSample = this.createIdSample(mapParameters);

                if (idSample == null) {
                    System.err.println("ERROR: idSample is not recongnized for " + accession);
                    System.out.println("Line " + i);
                    System.out.println(mapParameters);
                    // System.exit does not run finally blocks: close explicitly.
                    mongoClient.close();
                    // Non-zero status so that calling scripts can detect the failure.
                    System.exit(1);
                } else {
                    idSample = this.formatSampleId(idSample);

                    Document docSampleExist = collectionSamples.find(Filters.eq("_id", idSample)).first();
                    boolean docAlreadyExist = docSampleExist != null;

                    System.out.println("docAlreadyExist " + docAlreadyExist);

                    // === Attach this series to the sample if it already exists ===
                    if (docAlreadyExist) {
                        this.addSeries(docSampleExist, accession);

                        System.out.println(docSampleExist);

                        if (commit) {
                            // Single replace instead of delete + insert: the sample
                            // cannot be lost if the second operation fails.
                            collectionSamples.replaceOne(eq("_id", docSampleExist.get("_id")), docSampleExist);
                            nbImportedSamples++;
                        }
                    }
                }

            }

        } else {
            System.out.println("GEO accession " + gseNumber + " corresponding to  " + accession
                    + " exists already. Skip import.");
        }

        System.out.println("Number of imported samples: " + nbImportedSamples);

    }

    /** =============================================================== */

    /**
     * Builds the identifier used as {@code _id} in the {@code samples} collection.
     *
     * <p>When {@link #formatIdSample} is enabled the raw id is prefixed with
     * {@link #SAMPLE_ID_PREFIX} and inner spaces are replaced by dashes. In all
     * cases only the text before the first remaining space is kept.
     *
     * @param rawId sample id as read from the SDRF row, never {@code null}
     * @return the formatted sample id
     */
    private String formatSampleId(String rawId) {
        String idSample = rawId;
        if (formatIdSample) {
            idSample = SAMPLE_ID_PREFIX + "-" + idSample;
            idSample = idSample.trim().replaceAll(" ", "-");
        }
        return idSample.split(" ")[0].trim();
    }

    /** =============================================================== */

    /**
     * Adds {@code accession} to the {@code series} field of a sample document,
     * removing duplicates. A missing {@code series} field is treated as empty.
     *
     * @param docSample sample document to modify in place
     * @param accession series accession to add
     */
    private void addSeries(Document docSample, String accession) {
        // presumably stored as a list of accession strings, as the original cast assumed
        @SuppressWarnings("unchecked")
        List<String> existingSeries = (List<String>) docSample.get("series");

        Set<String> setSeries = new HashSet<String>();
        if (existingSeries != null) {
            setSeries.addAll(existingSeries);
        }
        setSeries.add(accession);

        docSample.append("series", new ArrayList<String>(setSeries));
    }

    /** =============================================================== */

    /**
     * Picks the sample identifier of an SDRF row.
     *
     * <p>The columns {@code Source Name}, {@code ENA_RUN}, {@code RUN_NAME} and
     * {@code Scan Name} are tried in that order; the first one holding a
     * non-null value wins.
     *
     * @param mapParameters column name to value map of one SDRF row
     * @return the first non-null candidate value, or {@code null} if none of the
     *         columns is present
     */
    public String createIdSample(Map<String, Object> mapParameters) {

        String[] listKeySample = { "Source Name", "ENA_RUN", "RUN_NAME", "Scan Name" };

        for (String key : listKeySample) {
            String idSample = (String) mapParameters.get(key);
            if (idSample != null) {
                return idSample;
            }
        }

        return null;

    }

    /** =============================================================== */

    /**
     * Converts one tab-separated SDRF data line into a column name to value map.
     *
     * <p>When several columns share the same header name, their values are
     * combined with {@code mongoService.mergeParameter(current, existing)}.
     * Cells beyond the last header column are ignored instead of raising an
     * {@link IndexOutOfBoundsException}.
     *
     * @param dataline one data row of the SDRF file
     * @param header   column names as produced by {@link #createHeader(String, Pattern)}
     * @return the parameters of the row, keyed by column name
     */
    public Map<String, Object> createMapParameters(String dataline, List<String> header) {

        Map<String, Object> mapParameters = new HashMap<String, Object>();

        String[] parts = dataline.split(COLUMN_SEPARATOR);

        // A row may be shorter than the header (String.split drops trailing empty
        // cells) or, in a malformed file, longer: only read cells that have a name.
        int nbColumns = Math.min(parts.length, header.size());

        for (int i = 0; i < nbColumns; i++) {
            String key = header.get(i);
            String existing = (String) mapParameters.get(key);
            String merged = mongoService.mergeParameter(parts[i], existing);
            mapParameters.put(key, merged);
        }

        return mapParameters;
    }

    /** =============================================================== */

    /**
     * Extracts the column names from the tab-separated SDRF header line.
     *
     * <p>For a title containing text matched by {@code pattern} (a bracketed part
     * such as {@code Characteristics[organism]}), the matched text without its
     * square brackets is used ({@code organism}); otherwise the whole title is
     * used. Names are trimmed.
     *
     * @param headerText first line of the SDRF file
     * @param pattern    pattern locating the bracketed part of a title
     * @return one name per column, in file order
     */
    public List<String> createHeader(String headerText, Pattern pattern) {

        List<String> list = new ArrayList<String>();

        for (String part : headerText.split(COLUMN_SEPARATOR)) {
            Matcher matcher = pattern.matcher(part);
            if (matcher.find()) {
                list.add(matcher.group().replaceAll("[\\]\\[]", "").trim());
            } else {
                list.add(part.trim());
            }
        }

        return list;

    }

    /** =============================================================== */

    /**
     * Entry point: runs the import by instantiating the class.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        new ImportArrayExpress1733();
    }

    /** ============================================================== */

}