act.installer.GenbankInstaller.java Source code

Introduction

Here is the source code for act.installer.GenbankInstaller.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer;

import act.installer.sequence.GenbankSeqEntry;
import act.installer.sequence.GenbankSeqEntryFactory;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Organism;
import act.shared.Seq;
import com.act.biointerpretation.Utils.OrgMinimalPrefixGenerator;
import com.act.utils.parser.GenbankInterpreter;
import com.mongodb.DBObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Command-line driver that writes sequence data from a Genbank file to the database. Sequences without a matching
 * database entry are written as new entries; for sequences that already have matching entries, the stored
 * metadata and references are merged with the information from the file.
 */
public class GenbankInstaller {
    // Formatter logger: log messages in this class use printf-style placeholders such as %s.
    private static final Logger LOGGER = LogManager.getFormatterLogger(GenbankInstaller.class);
    // Factory that turns parsed sequences into GenbankSeqEntry objects.
    private static final GenbankSeqEntryFactory seqEntryFactory = new GenbankSeqEntryFactory();

    // Short names of the command-line options.
    private static final String OPTION_GENBANK_PATH = "p";
    private static final String OPTION_DB_NAME = "d";
    private static final String OPTION_SEQ_TYPE = "s";

    // Keys of the metadata and reference JSON objects read and written by this class.
    private static final String ACCESSION = "accession";
    private static final String NAME = "name";
    private static final String COUNTRY_CODE = "country_code";
    private static final String PATENT_NUMBER = "patent_number";
    private static final String PATENT_YEAR = "patent_year";
    private static final String SYNONYMS = "synonyms";
    private static final String PRODUCT_NAMES = "product_names";

    // Sequence type value for nucleotide files, and the feature type / qualifier key inspected for such files.
    private static final String DNA = "DNA";
    private static final String CDS = "CDS";
    private static final String PROTEIN_ID = "protein_id";
    // Sequence type value for protein files.
    private static final String PROTEIN = "Protein";

    // Reference objects keep their value under "val" and their source under "src"; the source is compared against
    // "PMID" and "Patent" when references are merged.
    private static final String VAL = "val";
    private static final String SRC = "src";
    private static final String PMID = "PMID";
    private static final String PATENT = "Patent";

    // Accession formats are listed at http://www.ncbi.nlm.nih.gov/Sequin/acc.html
    // Protein accessions accepted here: three letters followed by five digits.
    public static final Pattern PROTEIN_ACCESSION_PATTERN = Pattern.compile("[a-zA-Z]{3}\\d{5}");
    // Nucleotide accessions accepted here: one letter + five digits, two letters + six digits, four letters + eight
    // to ten digits, or five letters + seven digits.
    // matches WGS and MGA sequence accession patterns since they appear in Nucleotide files as well
    public static final Pattern NUCLEOTIDE_ACCESSION_PATTERN = Pattern
            .compile("[a-zA-Z]\\d{5}|[a-zA-Z]{2}\\d{6}|[a-zA-Z]{4}\\d{8,10}|[a-zA-Z]{5}\\d{7}");

    /** Description of this tool; handed to the help formatter whenever the usage text is printed. */
    public static final String HELP_MESSAGE = String.join("",
            "This class is the driver to write sequence data from a Genbank file to our database. It can be used on the ",
            "command line with a file path as a parameter.");

    /**
     * Builders for the command-line options accepted by {@link #main(String[])}: the Genbank file, the database
     * name and the sequence type (all required, each taking an argument) plus the help flag.
     */
    public static final List<Option.Builder> OPTION_BUILDERS = createOptionBuilders();

    /**
     * Assembles the option builders in a plain {@link ArrayList}. This replaces the former double-brace
     * initialization, which declared an anonymous ArrayList subclass just to populate the list.
     *
     * @return a mutable list holding the builders in the order genbank, database, sequence, help
     */
    private static List<Option.Builder> createOptionBuilders() {
        List<Option.Builder> builders = new ArrayList<>();

        builders.add(Option.builder(OPTION_GENBANK_PATH).argName("genbank file")
                .desc("genbank file containing sequence and annotations").hasArg().longOpt("genbank")
                .required());
        builders.add(Option.builder(OPTION_DB_NAME).argName("db name").desc("name of the database to be queried")
                .hasArg().longOpt("database").required());
        builders.add(Option.builder(OPTION_SEQ_TYPE).argName("sequence type")
                .desc("declares whether the sequence type is DNA or Protein").hasArg().longOpt("sequence")
                .required());
        builders.add(Option.builder("h").argName("help").desc("Example of usage: -p filepath.gb -d marvin -s DNA")
                .longOpt("help"));

        return builders;
    }

    // Shared formatter used by main() to print the usage text.
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        // Set the width of the help output to 100.
        HELP_FORMATTER.setWidth(100);
    }

    // Genbank file handed to the GenbankInterpreter in init().
    File genbankFile;
    // Sequence type of the file; init() only acts on the values "DNA" and "Protein".
    String seqType;
    // Database handle passed to the sequence entry factory and used to write and update entries.
    MongoDB db;
    // Mapping generated by OrgMinimalPrefixGenerator; passed through to the sequence entry factory.
    Map<String, String> minimalPrefixMapping;

    /**
     * Creates an installer for one Genbank file. Nothing is read or written until {@link #init()} is called.
     *
     * @param genbankFile the Genbank file whose sequences should be installed
     * @param seqType the sequence type of the file
     * @param db the database to install into
     * @param minimalPrefixMapping the mapping generated by OrgMinimalPrefixGenerator
     */
    public GenbankInstaller(File genbankFile, String seqType, MongoDB db,
            Map<String, String> minimalPrefixMapping) {
        this.db = db;
        this.minimalPrefixMapping = minimalPrefixMapping;
        this.seqType = seqType;
        this.genbankFile = genbankFile;
    }

    /**
     * Reads the sequences of the Genbank file and installs them in the database. For a DNA file one entry is
     * installed per CDS feature that carries a protein_id qualifier; for a Protein file one entry is installed per
     * sequence. For any other sequence type nothing is installed. The number of installed entries is logged once
     * all sequences have been processed.
     *
     * @throws Exception propagated from the calls made here (reading the file, building or storing entries)
     */
    public void init() throws Exception {
        GenbankInterpreter interpreter = new GenbankInterpreter(genbankFile, seqType);
        interpreter.init();
        List<AbstractSequence> parsedSequences = interpreter.getSequences();

        int installedCount = 0;

        for (AbstractSequence sequence : parsedSequences) {
            if (seqType.equals(DNA)) {
                List<FeatureInterface<AbstractSequence<Compound>, Compound>> features =
                        (List<FeatureInterface<AbstractSequence<Compound>, Compound>>) sequence.getFeatures();

                for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) {
                    // only coding sequences that name a protein are installed
                    boolean isCodingSequence = feature.getType().equals(CDS);
                    if (!isCodingSequence || !feature.getQualifiers().containsKey(PROTEIN_ID)) {
                        continue;
                    }

                    GenbankSeqEntry dnaEntry = seqEntryFactory.createFromDNASequenceReference(sequence,
                            feature.getQualifiers(), db, minimalPrefixMapping);
                    addSeqEntryToDb(dnaEntry, db);
                    installedCount++;
                }
            } else if (seqType.equals(PROTEIN)) {
                GenbankSeqEntry proteinEntry = seqEntryFactory.createFromProteinSequenceReference(sequence, db,
                        minimalPrefixMapping);
                addSeqEntryToDb(proteinEntry, db);
                installedCount++;
            }
        }

        LOGGER.info("%s sequences installed in the db", installedCount);
    }

    /**
     * Verifies the accession string according to the standard Genbank/Uniprot accession qualifications. The whole
     * string has to match the pattern; a match on only part of the string is rejected.
     *
     * @param proteinAccession the accession string to be validated (used for every accession type, not only
     *                         protein accessions); may be null
     * @param accessionPattern the pattern that the accession string should match
     * @return true if the accession is non-null and matches the pattern in full, false otherwise
     */
    public static boolean verifyAccession(String proteinAccession, Pattern accessionPattern) {
        // a missing accession can never be valid; report it as such instead of failing with a NullPointerException
        if (proteinAccession == null) {
            return false;
        }

        return accessionPattern.matcher(proteinAccession).matches();
    }

    /**
     * Adds a value to an array field of the metadata unless the array already holds it. A null or empty value
     * leaves the metadata untouched. Existing elements are compared through their string representation.
     *
     * @param field the key referring to the array in the metadata we wish to update
     * @param value the value we wish to add to the array
     * @param data the metadata
     * @return the metadata JSONObject, with the value appended if it was not present yet
     */
    public static JSONObject updateArrayField(String field, String value, JSONObject data) {
        boolean nothingToAdd = value == null || value.isEmpty();
        if (nothingToAdd) {
            return data;
        }

        // no such field yet: appending creates it
        if (!data.has(field)) {
            return data.append(field, value);
        }

        JSONArray existingValues = data.getJSONArray(field);
        int index = 0;
        while (index < existingValues.length()) {
            String existing = existingValues.get(index).toString();
            if (existing.equals(value)) {
                return data;
            }
            index++;
        }

        return data.append(field, value);
    }

    /**
     * Merges the accessions of one accession type into the accession object stored in the metadata. Accessions
     * that do not match the given pattern are logged as errors and skipped; accessions that are already stored are
     * not added a second time.
     *
     * @param newAccessionObject the new accession object to load in the new accessions of the given type
     * @param metadata contains the accession object to be updated
     * @param accType the type of accessions to update
     * @param accessionPattern the accession pattern to validate the accession string according to Genbank/Uniprot
     *                         standards
     * @return the metadata containing the updated accession mapping
     */
    public static JSONObject updateAccessions(JSONObject newAccessionObject, JSONObject metadata,
            Seq.AccType accType, Pattern accessionPattern) {
        JSONObject mergedAccessions = metadata.getJSONObject(ACCESSION);
        String typeKey = accType.toString();

        // nothing of this type in the new data: store the accession object back unchanged
        if (!newAccessionObject.has(typeKey)) {
            return metadata.put(ACCESSION, mergedAccessions);
        }

        JSONArray candidates = newAccessionObject.getJSONArray(typeKey);
        for (int idx = 0; idx < candidates.length(); idx++) {
            String candidate = candidates.getString(idx);

            if (verifyAccession(candidate, accessionPattern)) {
                mergedAccessions = updateArrayField(typeKey, candidate, mergedAccessions);
            } else {
                LOGGER.error("%s accession not the right format: %s\n", typeKey, candidate);
            }
        }

        return metadata.put(ACCESSION, mergedAccessions);
    }

    /**
     * Updates metadata and reference fields with the information extracted from file. If no sequence in the
     * database matches the entry, the entry is written as a new sequence. Otherwise the metadata and the references
     * of every matching sequence are merged with the entry's data and written back.
     *
     * @param se an instance of the GenbankSeqEntry class that extracts all the relevant information from a sequence
     *           object
     * @param db reference to the database that should be queried and updated
     */
    private void addSeqEntryToDb(GenbankSeqEntry se, MongoDB db) {
        List<Seq> seqs = se.getMatchingSeqs();

        // no prior data on this sequence
        if (seqs.isEmpty()) {
            se.writeToDB(db, Seq.AccDB.genbank);
            return;
        }

        // update prior data
        for (Seq seq : seqs) {
            seq.setMetadata(mergeMetadata(se, seq.getMetadata()));
            db.updateMetadata(seq);

            List<JSONObject> oldRefs = seq.getReferences();
            if (oldRefs == null || oldRefs.isEmpty()) {
                // nothing stored yet: take over the references of the entry as they are
                seq.setReferences(se.getRefs());
            } else {
                mergeReferences(se, oldRefs);
                seq.setReferences(oldRefs);
            }

            if (seq.getReferences() != null) {
                db.updateReferences(seq);
            }
        }
    }

    /**
     * Merges accessions, gene name, gene synonyms and the first product name of the entry into the metadata.
     *
     * @param se the entry providing the new values
     * @param metadata the stored metadata of a matching sequence
     * @return the merged metadata
     */
    private static JSONObject mergeMetadata(GenbankSeqEntry se, JSONObject metadata) {
        JSONObject accessions = se.getAccession();

        if (!metadata.has(ACCESSION)) {
            metadata.put(ACCESSION, accessions);
        } else {
            metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_nucleotide,
                    NUCLEOTIDE_ACCESSION_PATTERN);
            metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_protein,
                    PROTEIN_ACCESSION_PATTERN);
        }

        // Work on a copy: the gene name may be added below, and that must neither alter the entry's own synonym
        // list nor carry over to the next matching sequence.
        List<String> geneSynonyms = new ArrayList<>();
        List<String> entrySynonyms = se.getGeneSynonyms();
        if (entrySynonyms != null) {
            geneSynonyms.addAll(entrySynonyms);
        }

        String geneName = se.getGeneName();
        if (geneName != null) {
            // isNull covers both a missing key and an explicit JSON null value
            if (metadata.isNull(NAME)) {
                metadata.put(NAME, geneName);
            } else if (!geneName.equals(metadata.get(NAME))) {
                geneSynonyms.add(geneName);
            }
        }

        // opt instead of get: the metadata may not have a name at all, which must not abort the update
        Object currentName = metadata.opt(NAME);
        for (String geneSynonym : geneSynonyms) {
            if (geneSynonym != null && !geneSynonym.equals(currentName)) {
                metadata = updateArrayField(SYNONYMS, geneSynonym, metadata);
            }
        }

        List<String> productNames = se.getProductName();
        if (productNames != null && !productNames.isEmpty()) {
            metadata = updateArrayField(PRODUCT_NAMES, productNames.get(0), metadata);
        }

        return metadata;
    }

    /**
     * Adds the entry's PMID and patent references to the stored references, skipping those already present.
     *
     * @param se the entry providing the new references
     * @param oldRefs the stored, non-empty reference list; new references are appended to it in place
     */
    private static void mergeReferences(GenbankSeqEntry se, List<JSONObject> oldRefs) {
        Set<String> knownPmids = new HashSet<>();

        for (JSONObject oldRef : oldRefs) {
            if (PMID.equals(oldRef.opt(SRC))) {
                knownPmids.add(oldRef.getString(VAL));
            }
        }

        for (JSONObject newPmidRef : se.getPmids()) {
            // add() returns false for a PMID seen before, which also filters duplicates among the new references
            if (knownPmids.add(newPmidRef.getString(VAL))) {
                oldRefs.add(newPmidRef);
            }
        }

        for (JSONObject newPatentRef : se.getPatents()) {
            if (!containsPatent(oldRefs, newPatentRef)) {
                oldRefs.add(newPatentRef);
            }
        }
    }

    /**
     * Checks whether the references already hold a patent with the same country code, number and year.
     *
     * @param refs the references to search
     * @param patent the patent reference to look for
     * @return true if an equivalent patent reference is present
     */
    private static boolean containsPatent(List<JSONObject> refs, JSONObject patent) {
        String countryCode = (String) patent.get(COUNTRY_CODE);
        String patentNumber = (String) patent.get(PATENT_NUMBER);
        String patentYear = (String) patent.get(PATENT_YEAR);

        for (JSONObject ref : refs) {
            // opt: a stored reference lacking one of the fields simply does not match
            if (PATENT.equals(ref.opt(SRC)) && countryCode.equals(ref.opt(COUNTRY_CODE))
                    && patentNumber.equals(ref.opt(PATENT_NUMBER))
                    && patentYear.equals(ref.opt(PATENT_YEAR))) {
                return true;
            }
        }

        return false;
    }

    /**
     * Command-line entry point. Parses the options, checks that the Genbank file exists and that the sequence type
     * is one this installer handles, then connects to the database on localhost:27017, builds the minimal prefix
     * mapping from the organisms in the database and runs the installer.
     *
     * @param args the command-line arguments, e.g. {@code -p filepath.gb -d marvin -s DNA}
     * @throws RuntimeException if the Genbank file does not exist
     * @throws IllegalArgumentException if the sequence type is neither DNA nor Protein
     * @throws Exception propagated from the installer run
     */
    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            HELP_FORMATTER.printHelp(GenbankInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
            // not reached at runtime; tells the compiler that cl is always assigned past this point
            return;
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(GenbankInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        File genbankFile = new File(cl.getOptionValue(OPTION_GENBANK_PATH));
        String dbName = cl.getOptionValue(OPTION_DB_NAME);
        String seqType = cl.getOptionValue(OPTION_SEQ_TYPE);

        if (!genbankFile.exists()) {
            String msg = String.format("Genbank file does not exist: %s", genbankFile.getAbsolutePath());
            // the message goes in as an argument so that a '%' in the path is not read as a format specifier
            LOGGER.error("%s", msg);
            throw new RuntimeException(msg);
        }

        // init() installs nothing for any other type, so reject it before the database is touched
        if (!DNA.equals(seqType) && !PROTEIN.equals(seqType)) {
            String msg = String.format("Unsupported sequence type '%s': expected %s or %s", seqType, DNA, PROTEIN);
            LOGGER.error("%s", msg);
            throw new IllegalArgumentException(msg);
        }

        MongoDB db = new MongoDB("localhost", 27017, dbName);

        DBIterator iter = db.getDbIteratorOverOrgs();

        // adapts the database iterator to an Iterator over Organism objects; the database iterator is closed as
        // soon as it reports that no elements are left
        Iterator<Organism> orgIterator = new Iterator<Organism>() {
            @Override
            public boolean hasNext() {
                boolean hasNext = iter.hasNext();
                if (!hasNext) {
                    iter.close();
                }
                return hasNext;
            }

            @Override
            public Organism next() {
                DBObject o = iter.next();
                return db.convertDBObjectToOrg(o);
            }

        };

        OrgMinimalPrefixGenerator prefixGenerator = new OrgMinimalPrefixGenerator(orgIterator);
        Map<String, String> minimalPrefixMapping = prefixGenerator.getMinimalPrefixMapping();

        GenbankInstaller installer = new GenbankInstaller(genbankFile, seqType, db, minimalPrefixMapping);
        installer.init();
    }
}