com.act.biointerpretation.metadata.ProteinMetadataFactory.java Source code

Java tutorial

Introduction

Here is the source code for com.act.biointerpretation.metadata.ProteinMetadataFactory.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.metadata;

import act.server.NoSQLAPI;
import act.shared.Reaction;
import org.apache.commons.io.FileUtils;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.File;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Builds {@link ProteinMetadata} objects from the JSON protein annotations attached to reactions.
 * <p>
 * Requires a newer database:
 * mongod --dbpath /Users/jca20n/Downloads/2016-11-22-full-run-w-validation & disown %1
 * <p>
 * This one also includes the modification and cloning data:
 * mongod --dbpath /Users/jca20n/Downloads/2016-12-08-actv01_vijay_proteins_only & disown %1
 * <p>
 * Created by jca20n on 12/1/16.
 */
public class ProteinMetadataFactory {
    // Localization terms seen in handleLocalization that had no entry in termToHostLocalization;
    // main() writes them to output/ProteinMetadata/Factory_output.txt.
    private Set<String> dataList = new HashSet<>();
    // Occurrence counts keyed by the tab-joined leading words collected in printoutHosts;
    // main() writes them to output/ProteinMetadata/Factory_output_map.txt.
    private Map<String, Integer> dataMap = new HashMap<>();

    //Data used in algorithms
    // Lower-cased modification terms whose flag in the terms file parsed as true.
    private Set<String> modificationTermsTrue;
    // Lower-cased modification terms whose flag in the terms file did not parse as true.
    private Set<String> modificationTermsFalse;
    // "cloned" term -> (host -> Genus.similarity score), pre-computed in initiate().
    private Map<String, Map<Host, Integer>> clonedtermToScore;
    // Localization term -> (host -> predicted Localization), loaded in initiate().
    private Map<String, Map<Host, Localization>> termToHostLocalization;

    // Private: instances are obtained through initiate(), which fills in the lookup tables.
    private ProteinMetadataFactory() {
    }

    /**
     * Creates a factory preloaded with the term tables that the {@code handle*} methods consult.
     * <p>
     * Three tab-delimited files are read relative to the working directory; the first line of
     * each is treated as a header and skipped, and blank lines are ignored:
     * <ul>
     * <li>modification terms: term, boolean flag</li>
     * <li>cloned term to genus: term, genus name</li>
     * <li>localization: term followed by one {@link Localization} name per host column</li>
     * </ul>
     *
     * @return a factory whose lookup tables are populated
     * @throws Exception if a data file cannot be read or the genus data cannot be parsed
     */
    public static ProteinMetadataFactory initiate() throws Exception {
        //Construct the Sets to hold modification data terms
        Set<String> modTrue = new HashSet<>();
        Set<String> modFalse = new HashSet<>();

        // TODO: Move this to resources directory?
        for (String[] tabs : readTableRows("data/ProteinMetadata/2016_12_07-modification_terms.txt", 2)) {
            String term = tabs[0].toLowerCase();
            if (Boolean.parseBoolean(tabs[1])) {
                modTrue.add(term);
            } else {
                modFalse.add(term);
            }
        }

        //Construct term data for 'cloned'
        Map<String, String> termToGenus = new HashMap<>();

        // TODO: Move this to resources directory?
        for (String[] tabs : readTableRows("data/ProteinMetadata/2016_12_07-cloned_term_to_genus.txt", 1)) {
            // Rows without both a term and a genus carry no information
            if (tabs.length < 2 || tabs[0].isEmpty() || tabs[1].isEmpty()) {
                continue;
            }
            // handleCloned looks terms up by lower-cased word, so a key containing
            // upper-case characters could never be matched; normalize it here.
            termToGenus.put(tabs[0].toLowerCase(), tabs[1]);
        }

        //Pre-Compute distance to hosts for all terms for "cloned"
        Map<String, Genus> nameToGenus = Genus.parseGenuses();

        // The genus of each host does not depend on the term, so look it up once
        Map<Host, Genus> hostGenera = new HashMap<>();
        for (Host host : Host.values()) {
            hostGenera.put(host, nameToGenus.get(host.toString()));
        }

        Map<String, Map<Host, Integer>> termToScore = new HashMap<>();
        for (Map.Entry<String, String> entry : termToGenus.entrySet()) {
            // NOTE(review): termGenus is null when the genus name is not in nameToGenus;
            // this relies on Genus.similarity accepting null - confirm.
            Genus termGenus = nameToGenus.get(entry.getValue());

            Map<Host, Integer> hostToScore = new HashMap<>();
            for (Host host : Host.values()) {
                Integer score = Genus.similarity(termGenus, hostGenera.get(host));
                hostToScore.put(host, score);
            }
            termToScore.put(entry.getKey(), hostToScore);
        }

        //Construct the data to handle localization
        // TODO: Move this to resources directory?
        Map<String, Map<Host, Localization>> locMap =
                loadLocalizationTable("data/ProteinMetadata/2016_12_06-localization.txt");

        //Create the factory and put in data
        ProteinMetadataFactory factory = new ProteinMetadataFactory();
        factory.modificationTermsTrue = modTrue;
        factory.modificationTermsFalse = modFalse;
        factory.clonedtermToScore = termToScore;
        factory.termToHostLocalization = locMap;

        return factory;
    }

    /**
     * Reads a tab-delimited file and returns its data rows split into columns.
     * The first line (header) is skipped, as are blank lines and lines with fewer than
     * {@code minColumns} columns (the latter are reported on stderr).
     */
    private static List<String[]> readTableRows(String path, int minColumns) throws Exception {
        String data = FileUtils.readFileToString(new File(path));

        // "\r\n" has to be tried before a lone "\r": with "\\r|\\r?\\n" a CRLF pair is consumed as
        // two separate breaks, which yields an empty line after every real line.
        String[] lines = data.split("\\r\\n|\\r|\\n");

        List<String[]> rows = new ArrayList<>();
        for (int i = 1; i < lines.length; i++) {
            String line = lines[i];
            if (line.trim().isEmpty()) {
                continue;
            }
            String[] tabs = line.split("\t");
            if (tabs.length < minColumns) {
                System.err.println("Skipping malformed line " + (i + 1) + " of " + path + ": " + line);
                continue;
            }
            rows.add(tabs);
        }
        return rows;
    }

    /**
     * Loads the localization table: each row maps a term to a per-host {@link Localization}.
     * Hosts whose cell is missing, empty or unrecognized stay at {@code questionable}.
     */
    private static Map<String, Map<Host, Localization>> loadLocalizationTable(String path) throws Exception {
        // Host for each data column, in file order (column 1 is the first entry)
        Host[] columnHosts = {
            Host.Ecoli, Host.Bsubtilis, Host.Cglutamicum, Host.Scerevisiae,
            Host.Ppasteuris, Host.Aniger, Host.Hsapiens, Host.Sfrugiperda
        };

        Map<String, Map<Host, Localization>> locMap = new HashMap<>();
        for (String[] tabs : readTableRows(path, 1)) {
            //Create the host map for each term and initially assume all values are 'questionable'
            Map<Host, Localization> hostToLoc = new HashMap<>();
            for (Host host : Host.values()) {
                hostToLoc.put(host, Localization.questionable);
            }

            //Put in replacement values per host; columns beyond the known hosts are ignored
            String term = tabs[0];
            int lastColumn = Math.min(tabs.length - 1, columnHosts.length);
            for (int x = 1; x <= lastColumn; x++) {
                String sloc = tabs[x];
                if (sloc.isEmpty()) {
                    continue;
                }
                try {
                    hostToLoc.put(columnHosts[x - 1], Localization.valueOf(sloc));
                } catch (IllegalArgumentException err) {
                    System.err.println("Unrecognized localization '" + sloc + "' for term '" + term
                            + "'; leaving it as questionable");
                }
            }

            locMap.put(term, hostToLoc);
        }
        return locMap;
    }

    /**
     * Interprets one protein-annotation JSON object and packages the results.
     * Every {@code handle*} interpreter is run on the same input before the result object is built.
     *
     * @param json the protein annotation to interpret
     * @return a new {@link ProteinMetadata} holding the interpreted values
     * @throws Exception declared for callers; the handlers propagate unchecked failures
     */
    public ProteinMetadata create(JSONObject json) throws Exception {
        // Run all interpreters first, in a fixed order
        Double bestKcatKm = handleKcatKm(json);
        Double bestSpecificActivity = handleSpecificActivity(json);
        Boolean hasHeteroSubunits = handlesubunits(json);
        Boolean hasModifications = handleModifications(json);
        Map<Host, Integer> cloneScores = handleCloned(json);
        Map<Host, Localization> hostLocalization = handleLocalization(json);
        List<Long> sequenceIds = handleSequences(json);

        // Then copy the results into a fresh metadata object
        ProteinMetadata metadata = new ProteinMetadata();
        metadata.sequences = sequenceIds;
        metadata.localization = hostLocalization;
        metadata.cloned = cloneScores;
        metadata.modifications = hasModifications;
        metadata.heteroSubunits = hasHeteroSubunits;
        metadata.specificActivity = bestSpecificActivity;
        metadata.kcatkm = bestKcatKm;
        return metadata;
    }

    /**
     * Extracts the sequence ids listed under the "sequences" key.
     *
     * @param json the protein annotation
     * @return the ids in array order; empty when the key is absent, not an array, or empty
     */
    private List<Long> handleSequences(JSONObject json) {
        List<Long> seqIds = new ArrayList<>();

        //Try to pull the data
        JSONArray jarray;
        try {
            jarray = json.getJSONArray("sequences");
        } catch (Exception ignored) {
            // No usable "sequences" array: report no sequences
            return seqIds;
        }

        for (int i = 0; i < jarray.length(); i++) {
            // getLong accepts any numeric element; a blind (Long) cast fails with
            // ClassCastException when the element is held as an Integer.
            seqIds.add(jarray.getLong(i));
        }

        return seqIds;
    }

    /**
     * Returns the kcat/km value for the protein, taken from the "kcat/km" observations.
     *
     * @param json the protein annotation
     * @return the selected value, or null when none is available
     */
    private Double handleKcatKm(JSONObject json) {
        /*TODO:  the name of chemical in the substrate field could be matched to the substrate of the reaction
        Currently I am just picking the highest value, but it would be best to pick the highest value for the
        substrate being examined.  This complicates the wiring, so I didn't try to implement
         */
        return highestObservedValue(json, "kcat/km");
    }

    /**
     * Returns the specific activity for the protein, taken from the "specific_activity" observations.
     *
     * @param json the protein annotation
     * @return the selected value, or null when none is available
     */
    private Double handleSpecificActivity(JSONObject json) {
        return highestObservedValue(json, "specific_activity");
    }

    /**
     * Picks a single "val" out of the array of observations stored under {@code key}.
     * <ul>
     * <li>key absent, not an array, or empty array: null</li>
     * <li>one observation: its "val", or null if it cannot be read as a double</li>
     * <li>several observations: the highest readable "val", or null unless that is above zero</li>
     * </ul>
     */
    private static Double highestObservedValue(JSONObject json, String key) {
        //Try to pull the data
        JSONArray jarray;
        try {
            jarray = json.getJSONArray(key);
        } catch (Exception ignored) {
            // Key missing or not an array: the value is undefined
            return null;
        }

        //If there is no data, the value is undefined
        if (jarray.length() == 0) {
            return null;
        }

        //A lone observation is returned as-is
        // NOTE(review): unlike the multi-observation path below, this path does not reject
        // values <= 0 - confirm whether that asymmetry is intended.
        if (jarray.length() == 1) {
            try {
                return jarray.getJSONObject(0).getDouble("val");
            } catch (Exception ignored) {
                return null;
            }
        }

        //If many observations are given for the protein, return the highest value observed
        double highest = -1.0;
        for (int i = 0; i < jarray.length(); i++) {
            try {
                double dval = jarray.getJSONObject(i).getDouble("val");
                if (dval > highest) {
                    highest = dval;
                }
            } catch (Exception ignored) {
                // Unreadable observation: skip it and keep scanning
            }
        }

        return highest > 0 ? Double.valueOf(highest) : null;
    }

    /**
     * Decides whether the protein has hetero subunits from the "subunits" observations.
     * A single true observation wins; otherwise false is returned if any observation said
     * false; otherwise the answer is unknown.
     *
     * @param json the protein annotation
     * @return true, false, or null when nothing is known
     */
    private Boolean handlesubunits(JSONObject json) {
        //Try to pull the data
        JSONArray jarray;
        try {
            jarray = json.getJSONArray("subunits");
        } catch (Exception ignored) {
            // Key missing or not an array: nothing is known
            return null;
        }

        //Scan the observations (zero, one or many) with a true value dominating
        Boolean out = null;
        for (int i = 0; i < jarray.length(); i++) {
            Boolean bval;
            try {
                bval = assignSubunitHelper(jarray.getJSONObject(i));
            } catch (Exception ignored) {
                // Element is not a JSON object: treat it as uninformative
                continue;
            }

            // Check for null explicitly; comparing a null Boolean with == true would unbox it
            // and throw a NullPointerException.
            if (bval == null) {
                continue;
            }
            if (bval) {
                return true;
            }
            out = false;
        }

        return out;
    }

    /**
     * Classifies one subunit observation using its "val" and "comment" strings.
     *
     * @param json a single subunit observation
     * @return true for hetero subunits, false otherwise, or null when "val" is "?" or either
     *         field cannot be read
     */
    private Boolean assignSubunitHelper(JSONObject json) {
        final String value;
        final String note;
        try {
            value = json.getString("val");
            note = json.getString("comment");
        } catch (Exception err) {
            // Both fields are required; without them the observation is uninformative
            return null;
        }

        // An explicit question mark means the source did not know
        if (value.equals("?")) {
            return null;
        }

        // Any mention of "hetero" takes precedence over everything below
        if (value.contains("hetero") || note.contains("hetero")) {
            return true;
        }

        // Monomers and homo-oligomers are not hetero
        if (value.contains("monomer") || value.startsWith("homo")) {
            return false;
        }

        // Last resort: a comment naming both alpha and beta is taken as hetero
        return note.contains("alpha") && note.contains("beta");
    }

    /**
     * Self-test for {@code handlesubunits}, driven by a tab-delimited test-set file.
     * Each data line holds a JSON array of subunit observations in the first column and the
     * expected outcome ("true", "false" or "null") in the second. The first line is a header.
     *
     * @return true if every case matches; false at the first mismatch (details go to stderr)
     * @throws Exception if the file cannot be read, a line is malformed, or the JSON is invalid
     */
    private boolean testHandlesubunits() throws Exception {
        // TODO: Move this to resources directory?
        File testfile = new File("data/ProteinMetadata/2016_12_07-subunit_testset.txt");
        String data = FileUtils.readFileToString(testfile);

        // Collapse doubled quotes left by the spreadsheet-style export
        data = data.replace("\"\"", "\"");

        // "\r\n" must be tried before a lone "\r", otherwise a CRLF pair is split twice and
        // every real line is followed by an empty one.
        String[] lines = data.split("\\r\\n|\\r|\\n");
        for (int i = 1; i < lines.length; i++) {
            String line = lines[i];
            if (line.trim().isEmpty()) {
                continue;
            }

            String[] tabs = line.split("\t");
            if (tabs.length < 2) {
                throw new IllegalArgumentException(
                        "Subunit test set line " + (i + 1) + " has no expected value: " + line);
            }

            //Pull out the json for each test and re-wrap it
            String arrayStr = tabs[0];
            if (arrayStr.startsWith("\"")) {
                arrayStr = arrayStr.substring(1);
            }
            if (arrayStr.endsWith("\"")) {
                arrayStr = arrayStr.substring(0, arrayStr.length() - 1);
            }

            String jsonstr = "{\"subunits\":" + arrayStr + "}";
            JSONObject json = new JSONObject(jsonstr);

            //Run the test and compare result to expected
            Boolean result = this.handlesubunits(json);
            Boolean expected = null;
            if (!tabs[1].equals("null")) {
                expected = Boolean.parseBoolean(tabs[1]);
            }

            // Compare by value; == / != on Boolean objects compares references
            boolean matches = (result == null) ? (expected == null) : result.equals(expected);
            if (!matches) {
                System.err.println("Subunit testing error:\n" + json.toString() + "  " + tabs[1]);
                System.err.println("Expect: " + expected + " Found: " + result);
                return false;
            }
        }

        return true;
    }

    /**
     * Decides whether the protein is post-translationally modified from the
     * "post_translational_modification" observations. A single true observation wins;
     * otherwise false is returned if any observation said false; otherwise nothing is known.
     *
     * @param json the protein annotation
     * @return true, false, or null when nothing is known
     */
    private Boolean handleModifications(JSONObject json) {
        //Try to pull the data
        JSONArray jarray;
        try {
            jarray = json.getJSONArray("post_translational_modification");
        } catch (Exception ignored) {
            // Key missing or not an array: nothing is known
            return null;
        }

        //Scan the observations (zero, one or many) with a true value dominating
        Boolean out = null;
        for (int i = 0; i < jarray.length(); i++) {
            Boolean bval;
            try {
                bval = assignModificationHelper(jarray.getJSONObject(i));
            } catch (Exception ignored) {
                // Element is not a JSON object: treat it as uninformative
                continue;
            }

            // Check for null explicitly; comparing a null Boolean with == true would unbox it
            // and throw a NullPointerException.
            if (bval == null) {
                continue;
            }
            if (bval) {
                return true;
            }
            out = false;
        }

        return out;
    }

    /**
     * Classifies one modification observation by looking its term up in the loaded term sets.
     *
     * @param json a single observation carrying a "post_translational_modification" string
     * @return true or false if the term is in the corresponding set; null if the term is
     *         unlisted or the field cannot be read
     */
    private Boolean assignModificationHelper(JSONObject json) {
        try {
            // The term sets are populated with lower-cased terms, so normalize the same way here
            String ptm = json.getString("post_translational_modification").toLowerCase();
            if (this.modificationTermsTrue.contains(ptm)) {
                return true;
            }
            if (this.modificationTermsFalse.contains(ptm)) {
                return false;
            }
            return null;
        } catch (Exception err) {
            // Field missing or not a string: uninformative observation
            return null;
        }
    }

    /**
     * Scores each host from the "cloned" observations. Every word of every observation's
     * comment is looked up in the pre-computed term scores, and the highest score seen per
     * host is kept.
     *
     * @param json the protein annotation
     * @return a map containing every host; the value is null where no prediction was made
     */
    private Map<Host, Integer> handleCloned(JSONObject json) {
        //Populate output with null for all hosts (no prediction)
        Map<Host, Integer> out = new HashMap<>();
        for (Host host : Host.values()) {
            out.put(host, null);
        }

        JSONArray jarray;
        try {
            jarray = json.getJSONArray("cloned");
        } catch (Exception ignored) {
            // Key missing or not an array: no predictions
            return out;
        }

        //Read in any data from JSON and interpret prediction based on phylogenetic distance to host
        for (int i = 0; i < jarray.length(); i++) {
            String comment;
            try {
                comment = jarray.getJSONObject(i).getString("comment");
            } catch (Exception ignored) {
                // One unreadable observation must not discard the ones after it
                continue;
            }

            for (String word : comment.toLowerCase().split("[\\s,;]+")) {
                Map<Host, Integer> hosttoint = this.clonedtermToScore.get(word);
                if (hosttoint == null) {
                    continue;
                }
                for (Map.Entry<Host, Integer> entry : hosttoint.entrySet()) {
                    Integer newval = entry.getValue();
                    if (newval == null) {
                        continue;
                    }
                    // A host without a score yet is treated as -99999, so scores at or below
                    // that floor are never recorded.
                    Integer currval = out.get(entry.getKey());
                    int floor = (currval == null) ? -99999 : currval;
                    if (newval > floor) {
                        out.put(entry.getKey(), newval);
                    }
                }
            }
        }

        return out;
    }

    // Dead code: This was being called from within `handleCloned` but not now.
    /**
     * Tallies, in {@code dataMap}, the first two words of each "cloned" comment starting at
     * its first occurrence of "in ". Comments without "in " are ignored. Any failure while
     * reading the JSON silently ends the scan.
     */
    private void printoutHosts(JSONObject json) {
        try {
            JSONArray observations = json.getJSONArray("cloned");

            for (int idx = 0; idx < observations.length(); idx++) {
                String comment = observations.getJSONObject(idx).getString("comment");

                int start = comment.indexOf("in ");
                if (start >= 0) {
                    String[] tokens = comment.substring(start).toLowerCase().split("[\\s,;]+");

                    // Key is the first two tokens (fewer if unavailable), each followed by a tab
                    StringBuilder key = new StringBuilder();
                    for (int t = 0; t < tokens.length && t < 2; t++) {
                        key.append(tokens[t]).append("\t");
                    }
                    String datapoint = key.toString();

                    // Bump the occurrence count for this key
                    Integer seen = dataMap.get(datapoint);
                    dataMap.put(datapoint, seen == null ? 1 : seen + 1);
                }
            }
        } catch (Exception err) {
            // Deliberately ignored: counts gathered so far are kept
        }
    }

    /**
     * Predicts, per host, where the protein localizes, from the "localization" observations.
     * Each observation's "val" term is looked up in the loaded localization table. Terms that
     * are not in the table are printed, remembered in {@code dataList}, and skipped.
     *
     * @param json the protein annotation
     * @return a map containing every host; {@code unknown} where no observation applied and
     *         {@code questionable} where observations disagreed or the table said so
     */
    private Map<Host, Localization> handleLocalization(JSONObject json) {
        //Construct the output with unknown for all Hosts
        Map<Host, Localization> out = new HashMap<>();
        for (Host host : Host.values()) {
            out.put(host, Localization.unknown);
        }

        //Parse out localization information from json
        JSONArray jarray;
        try {
            jarray = json.getJSONArray("localization");
        } catch (Exception ignored) {
            // If there is no metadata, all values should be "unknown". Return here:
            // falling through would dereference a null array.
            return out;
        }

        //Scan through each observation
        for (int i = 0; i < jarray.length(); i++) {
            try {
                JSONObject obj = jarray.getJSONObject(i);
                String term = obj.getString("val");

                Map<Host, Localization> hostToLoc = this.termToHostLocalization.get(term);
                if (hostToLoc == null) {
                    System.out.println("missing term: " + term);
                    dataList.add(term);
                    continue;
                }

                for (Map.Entry<Host, Localization> entry : hostToLoc.entrySet()) {
                    Host host = entry.getKey();
                    Localization currval = out.get(host); // Whatever is currently in the Map
                    Localization newval = entry.getValue(); // The potential new value

                    if (currval == Localization.unknown) {
                        //Nothing recorded yet: take the value from this term
                        out.put(host, newval);
                    } else if (currval != Localization.questionable && currval != newval) {
                        //Two different concrete predictions: downgrade to questionable.
                        //(An existing "questionable" is left as it is.)
                        out.put(host, Localization.questionable);
                    }
                }
            } catch (Exception err) {
                // Unreadable observation: report it and continue with the next one
                err.printStackTrace();
            }
        }

        return out;
    }

    /**
     * Scans the reactions in the database, builds a {@link ProteinMetadata} for every protein
     * annotation on BRENDA reactions, writes the collected diagnostics to two files under
     * output/ProteinMetadata/, and prints summary statistics to stdout.
     *
     * @param args unused
     * @throws Exception if the factory cannot be initialised or a report cannot be written
     */
    public static void main(String[] args) throws Exception {
        // TODO: This is referencing a temporary collection. Change it!
        // TODO: FIX THIS BEFORE MERGE!
        NoSQLAPI api = new NoSQLAPI("actv01_vijay_proteins", "actv01_vijay_proteins");
        Iterator<Reaction> iterator = api.readRxnsFromInKnowledgeGraph();

        //Create a single instance of the factory method to use for all json
        ProteinMetadataFactory factory = ProteinMetadataFactory.initiate();

        //Run some tests
        try {
            if (factory.testHandlesubunits()) {
                System.out.println("Subunit test OK");
            }
        } catch (Exception err) {
            System.err.println("Failed to test subunits");
        }

        //Create a list to aggregate the results of the database scan
        List<ProteinMetadata> agg = new ArrayList<>();

        //Scan the database and store ProteinMetadata objects
        while (iterator.hasNext()) {
            Reaction rxn = iterator.next();

            // Constant-first comparison so a reaction without a data source is skipped
            // instead of causing a NullPointerException
            if (!Reaction.RxnDataSource.BRENDA.equals(rxn.getDataSource())) {
                continue;
            }

            for (JSONObject json : rxn.getProteinData()) {
                agg.add(factory.create(json));
            }
        }

        //Write out any messages to file
        StringBuilder sb = new StringBuilder();
        for (String aline : factory.dataList) {
            sb.append(aline).append("\n");
        }
        writeReport("output/ProteinMetadata/Factory_output.txt", sb.toString());

        sb = new StringBuilder();
        for (Map.Entry<String, Integer> entry : factory.dataMap.entrySet()) {
            sb.append(entry.getKey()).append("\t").append(entry.getValue()).append("\n");
        }
        writeReport("output/ProteinMetadata/Factory_output_map.txt", sb.toString());

        System.out.println("Total # protein metadata: " + agg.size());
        System.out.println();
        printModificationStats(agg);
        printClonedStats(agg);
    }

    /** Writes {@code contents} to {@code path}, removing any existing file first. */
    private static void writeReport(String path, String contents) throws Exception {
        File outfile = new File(path);
        if (outfile.exists()) {
            outfile.delete();
        }
        FileUtils.writeStringToFile(outfile, contents);
    }

    /** Prints how many entries have modifications true, false and unknown (null). */
    private static void printModificationStats(List<ProteinMetadata> agg) {
        int falsecount = 0;
        int truecount = 0;
        int nullcount = 0;

        for (ProteinMetadata datum : agg) {
            if (datum == null) {
                System.err.println("null datum");
                continue;
            }
            if (datum.modifications == null) {
                nullcount++;
            } else if (datum.modifications) {
                truecount++;
            } else {
                falsecount++;
            }
        }

        System.out.println("modification true count: " + truecount);
        System.out.println("modification false count: " + falsecount);
        System.out.println("modification null count: " + nullcount);
        System.out.println();
    }

    /** Prints how many entries have a positive cloned score for E. coli, human, and both. */
    private static void printClonedStats(List<ProteinMetadata> agg) {
        int nullcount = 0;
        int emptycount = 0;
        int colicount = 0;
        int humancount = 0;
        int bothcount = 0;

        for (ProteinMetadata datum : agg) {
            if (datum == null) {
                System.err.println("null datum");
                continue;
            }
            if (datum.cloned == null) {
                nullcount++;
                continue;
            }
            if (datum.cloned.isEmpty()) {
                emptycount++;
                continue;
            }

            Integer human = datum.cloned.get(Host.Hsapiens);
            Integer coli = datum.cloned.get(Host.Ecoli);
            boolean inHuman = human != null && human > 0;
            boolean inColi = coli != null && coli > 0;

            if (inHuman) {
                humancount++;
            }
            if (inColi) {
                colicount++;
            }
            if (inHuman && inColi) {
                bothcount++;
            }
        }

        System.out.println("cloned null count: " + nullcount);
        System.out.println("cloned empty count: " + emptycount);
        System.out.println("cloned coli count: " + colicount);
        System.out.println("cloned human count: " + humancount);
        System.out.println("cloned both count: " + bothcount);
        System.out.println();
    }
}