act.installer.UniprotInstaller.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.UniprotInstaller.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer;

import act.installer.sequence.UniprotSeqEntry;
import act.installer.sequence.UniprotSeqEntryFactory;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Organism;
import act.shared.Seq;
import com.act.biointerpretation.Utils.OrgMinimalPrefixGenerator;
import com.act.utils.parser.UniprotInterpreter;
import com.mongodb.DBObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Command-line driver that writes the sequence data found in a Uniprot XML file to the database.
 *
 * <p>If no sequence already in the database matches the parsed entry, the entry is written as a new sequence.
 * Otherwise the metadata (accessions, name, synonyms, product names, catalytic activity) and the PMID references of
 * every matching sequence are merged with the information extracted from the file.
 */
public class UniprotInstaller {
    private static final Logger LOGGER = LogManager.getFormatterLogger(UniprotInstaller.class);
    private static final UniprotSeqEntryFactory seqEntryFactory = new UniprotSeqEntryFactory();

    // Command-line option names.
    private static final String OPTION_UNIPROT_PATH = "p";
    private static final String OPTION_DB_NAME = "d";

    // Keys used in the sequence metadata and reference JSON objects.
    private static final String NAME = "name";
    private static final String ACCESSION = "accession";
    private static final String SYNONYMS = "synonyms";
    private static final String PRODUCT_NAMES = "product_names";
    private static final String VAL = "val";
    private static final String SRC = "src";
    private static final String PMID = "PMID";
    private static final String CATALYTIC_ACTIVITY = "catalytic_activity";

    //  http://www.uniprot.org/help/accession_numbers
    public static final Pattern UNIPROT_ACCESSION_PATTERN = Pattern
            .compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}");

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class is the driver to write sequence data from a Uniprot file to our database. It can be used on the ",
            "command line with a file path as a parameter." }, "");

    public static final List<Option.Builder> OPTION_BUILDERS = createOptionBuilders();

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    File uniprotFile;
    MongoDB db;
    Map<String, String> minimalPrefixMapping;

    /**
     * Creates an installer for one Uniprot file.
     * @param uniprotFile the Uniprot file to be parsed
     * @param db reference to the database that should be queried and updated
     * @param minimalPrefixMapping the mapping generated by OrgMinimalPrefixGenerator
     */
    public UniprotInstaller(File uniprotFile, MongoDB db, Map<String, String> minimalPrefixMapping) {
        this.uniprotFile = uniprotFile;
        this.db = db;
        this.minimalPrefixMapping = minimalPrefixMapping;
    }

    /**
     * Builds the command-line option definitions. A plain ArrayList is used rather than double-brace
     * initialization so that no anonymous ArrayList subclass is created.
     * @return the builders for the uniprot file, database name and help options
     */
    private static List<Option.Builder> createOptionBuilders() {
        List<Option.Builder> builders = new ArrayList<>();
        builders.add(Option.builder(OPTION_UNIPROT_PATH).argName("uniprot file")
                .desc("uniprot file containing sequence and annotations").hasArg().longOpt("uniprot")
                .required());
        builders.add(Option.builder(OPTION_DB_NAME).argName("db name").desc("name of the database to be queried")
                .hasArg().longOpt("database").required());
        builders.add(Option.builder("h").argName("help").desc("Example of usage: -p filepath.xml -d marvin")
                .longOpt("help"));
        return builders;
    }

    /**
     * Parses the Uniprot file, builds the corresponding sequence entry and installs it in the database.
     */
    public void init() throws IOException, SAXException, ParserConfigurationException, CompoundNotFoundException {
        UniprotInterpreter uniprotInterpreter = new UniprotInterpreter(uniprotFile);
        uniprotInterpreter.init();

        UniprotSeqEntry seqEntry = seqEntryFactory.createFromDocumentReference(uniprotInterpreter.getXmlDocument(),
                db, minimalPrefixMapping);
        addSeqEntryToDb(seqEntry, db);
    }

    /**
     * Verifies the accession string according to the standard Genbank/Uniprot accession qualifications
     * @param proteinAccession the accession string to be validated
     * @param accessionPattern the pattern that the accession string should match
     * @return true if the entire accession string matches the pattern, false otherwise
     */
    private boolean verifyAccession(String proteinAccession, Pattern accessionPattern) {
        return accessionPattern.matcher(proteinAccession).matches();
    }

    /**
     * Checks if the new value already exists in the field. If so, doesn't update the metadata. If it doesn't exist,
     * appends the new value to the data. Null and empty values are ignored.
     * @param field the key referring to the array in the metadata we wish to update
     * @param value the value we wish to add to the array
     * @param data the metadata
     * @return the updated metadata JSONObject
     */
    private JSONObject updateArrayField(String field, String value, JSONObject data) {
        if (value == null || value.isEmpty()) {
            return data;
        }

        if (data.has(field)) {
            JSONArray fieldData = data.getJSONArray(field);

            for (int i = 0; i < fieldData.length(); i++) {
                if (fieldData.get(i).toString().equals(value)) {
                    return data;
                }
            }
        }

        return data.append(field, value);
    }

    /**
     * Updates the accession JSONObject for the given accessions type
     * @param newAccessionObject the new accession object to load in the new accessions of the given type
     * @param metadata contains the accession object to be updated
     * @param accType the type of accessions to update
     * @param accessionPattern the accession pattern to validate the accession string according to Genbank/Uniprot
     *                         standards
     * @return the metadata containing the updated accession mapping
     */
    private JSONObject updateAccessions(JSONObject newAccessionObject, JSONObject metadata, Seq.AccType accType,
            Pattern accessionPattern) {
        JSONObject oldAccessionObject = metadata.getJSONObject(ACCESSION);
        String accTypeKey = accType.toString();

        if (newAccessionObject.has(accTypeKey)) {
            JSONArray newAccTypeAccessions = newAccessionObject.getJSONArray(accTypeKey);

            for (int i = 0; i < newAccTypeAccessions.length(); i++) {
                String accession = newAccTypeAccessions.getString(i);

                // malformed accessions are reported and skipped rather than stored
                if (!verifyAccession(accession, accessionPattern)) {
                    LOGGER.error("%s accession not the right format: %s\n", accTypeKey, accession);
                    continue;
                }

                oldAccessionObject = updateArrayField(accTypeKey, accession, oldAccessionObject);
            }
        }

        return metadata.put(ACCESSION, oldAccessionObject);
    }

    /**
     * Updates metadata and reference fields with the information extracted from file
     * @param se an instance of the UniprotSeqEntry class that extracts all the relevant information from a sequence
     *           object
     * @param db reference to the database that should be queried and updated
     */
    private void addSeqEntryToDb(UniprotSeqEntry se, MongoDB db) {
        List<Seq> seqs = se.getMatchingSeqs();

        // no prior data on this sequence
        if (seqs.isEmpty()) {
            se.writeToDB(db, Seq.AccDB.uniprot);
            return;
        }

        // update prior data
        for (Seq seq : seqs) {
            mergeMetadata(se, seq);
            db.updateMetadata(seq);

            mergeReferences(se, seq);
            if (seq.getReferences() != null) {
                db.updateReferences(seq);
            }
        }
    }

    /**
     * Merges the accessions, gene name, synonyms, product names and catalytic activity of the Uniprot entry into the
     * metadata of an existing sequence and stores the result back on the sequence object.
     * @param se the Uniprot entry holding the new information
     * @param seq the existing sequence whose metadata is updated
     */
    private void mergeMetadata(UniprotSeqEntry se, Seq seq) {
        JSONObject metadata = seq.getMetadata();

        JSONObject accessions = se.getAccession();

        // a missing or JSON-null accession entry is simply replaced; anything else is merged per accession type
        if (!metadata.has(ACCESSION) || metadata.isNull(ACCESSION)) {
            metadata.put(ACCESSION, accessions);
        } else {
            metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_nucleotide,
                    GenbankInstaller.NUCLEOTIDE_ACCESSION_PATTERN);
            metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_protein,
                    GenbankInstaller.PROTEIN_ACCESSION_PATTERN);
            metadata = updateAccessions(accessions, metadata, Seq.AccType.uniprot, UNIPROT_ACCESSION_PATTERN);
        }

        // work on a copy so the list owned by the entry is not grown once per matching sequence
        List<String> geneSynonyms = new ArrayList<>(se.getGeneSynonyms());
        String geneName = se.getGeneName();

        if (geneName != null) {
            if (!metadata.has(NAME) || metadata.isNull(NAME)) {
                metadata.put(NAME, geneName);
            } else if (!geneName.equals(metadata.get(NAME))) {
                // the sequence already has a different name, so the Uniprot name becomes a synonym
                geneSynonyms.add(geneName);
            }
        }

        // opt() rather than get(): the metadata may still have no name when the entry provides none
        Object currentName = metadata.opt(NAME);

        for (String geneSynonym : geneSynonyms) {
            if (!geneSynonym.equals(currentName)) {
                metadata = updateArrayField(SYNONYMS, geneSynonym, metadata);
            }
        }

        for (String productName : se.getProductName()) {
            metadata = updateArrayField(PRODUCT_NAMES, productName, metadata);
        }

        if (se.getCatalyticActivity() != null) {
            metadata.put(CATALYTIC_ACTIVITY, se.getCatalyticActivity());
        }

        seq.setMetadata(metadata);
    }

    /**
     * Adds the references of the Uniprot entry to an existing sequence. When the sequence has no references yet, the
     * entry's references are used as they are; otherwise only references whose value is not already present among
     * the sequence's PMID references are appended.
     * @param se the Uniprot entry holding the new references
     * @param seq the existing sequence whose references are updated
     */
    private void mergeReferences(UniprotSeqEntry se, Seq seq) {
        List<JSONObject> oldRefs = seq.getReferences();
        List<JSONObject> newPmidRefs = se.getRefs();

        if (oldRefs == null || oldRefs.isEmpty()) {
            seq.setReferences(newPmidRefs);
            return;
        }

        // a null list of new references means there is nothing to merge
        if (newPmidRefs != null) {
            Set<String> oldPmids = new HashSet<>();

            for (JSONObject oldRef : oldRefs) {
                // opt() so that a reference without a source is treated as a non-PMID reference
                if (PMID.equals(oldRef.opt(SRC))) {
                    oldPmids.add(oldRef.getString(VAL));
                }
            }

            for (JSONObject newPmidRef : newPmidRefs) {
                if (!oldPmids.contains(newPmidRef.getString(VAL))) {
                    oldRefs.add(newPmidRef);
                }
            }
        }

        seq.setReferences(oldRefs);
    }

    /**
     * Command-line entry point: parses the options, connects to the database on localhost:27017, builds the
     * organism minimal prefix mapping and installs the given Uniprot file.
     * @param args the command-line arguments (see OPTION_BUILDERS)
     */
    public static void main(String[] args)
            throws IOException, SAXException, ParserConfigurationException, CompoundNotFoundException {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            HELP_FORMATTER.printHelp(UniprotInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(UniprotInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        File uniprotFile = new File(cl.getOptionValue(OPTION_UNIPROT_PATH));
        String dbName = cl.getOptionValue(OPTION_DB_NAME);

        if (!uniprotFile.exists()) {
            String msg = String.format("Uniprot file does not exist: %s", uniprotFile.getAbsolutePath());
            // logged through an explicit %s so that a '%' in the path is never read as a format specifier
            LOGGER.error("%s", msg);
            throw new RuntimeException(msg);
        }

        MongoDB db = new MongoDB("localhost", 27017, dbName);

        DBIterator iter = db.getDbIteratorOverOrgs();

        // adapts the raw DB iterator to an Iterator over Organism objects
        Iterator<Organism> orgIterator = new Iterator<Organism>() {
            @Override
            public boolean hasNext() {
                boolean hasNext = iter.hasNext();
                // the underlying DB iterator is closed as soon as it reports exhaustion
                if (!hasNext) {
                    iter.close();
                }
                return hasNext;
            }

            @Override
            public Organism next() {
                DBObject o = iter.next();
                return db.convertDBObjectToOrg(o);
            }

        };

        OrgMinimalPrefixGenerator prefixGenerator = new OrgMinimalPrefixGenerator(orgIterator);
        Map<String, String> minimalPrefixMapping = prefixGenerator.getMinimalPrefixMapping();

        UniprotInstaller installer = new UniprotInstaller(uniprotFile, db, minimalPrefixMapping);
        installer.init();
    }

}