Java tutorial
package org.intermine.bio.dataconversion; /* * Copyright (C) 2002-2013 FlyMine * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. See the LICENSE file for more * information or http://www.gnu.org/copyleft/lesser.html. * */ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.intermine.bio.util.OrganismRepository; import org.intermine.objectstore.ObjectStoreException; import org.intermine.sql.writebatch.Batch; import org.intermine.sql.writebatch.BatchWriterPostgresCopyImpl; import org.intermine.util.Util; import org.intermine.xml.full.Attribute; import org.intermine.xml.full.Item; import org.intermine.xml.full.Reference; import org.intermine.xml.full.ReferenceList; /** * Create items from the modENCODE metadata extensions to the chado schema. * @author Kim Rutherford,sc,rns */ public class ModEncodeMetaDataProcessor extends ChadoProcessor { private static final Logger LOG = Logger.getLogger(ModEncodeMetaDataProcessor.class); private static final String DATA_IDS_TABLE_NAME = "data_ids"; private static final String WIKI_URL = "http://wiki.modencode.org/project/index.php?title="; private static final String FILE_URL = "http://submit.modencode.org/submit/public/get_file/"; private static final String DCC_PREFIX = "modENCODE_"; private static final String NA_PROP = "not applicable"; private static final Set<String> DB_RECORD_TYPES = Collections.unmodifiableSet(new HashSet<String>( Arrays.asList("GEO_record", "ArrayExpress_record", "TraceArchive_record", "dbEST_record", "ShortReadArchive_project_ID (SRA)", "ShortReadArchive_project_ID_list (SRA)"))); // preferred synonyms private static final String STRAIN = "strain"; private static final String DEVSTAGE = "developmental stage"; // submission maps // --------------- private Map<Integer, String> submissionOrganismMap = new HashMap<Integer, String>(); // maps from chado identifier to lab/project details private Map<Integer, SubmissionDetails> submissionMap = new HashMap<Integer, SubmissionDetails>(); // subId to dcc id private Map<Integer, String> dccIdMap = new HashMap<Integer, String>(); // superseded/deleted subId to dcc id: to be checked in case we need to skip // loading a sub private Map<Integer, String> deletedSubMap = new HashMap<Integer, String>(); // applied_protocol/data/attribute maps // ------------------- // chado submission id to chado data_id private Map<Integer, List<Integer>> submissionDataMap = new HashMap<Integer, List<Integer>>(); // chado data id to chado submission id private Map<Integer, Integer> dataSubmissionMap = new HashMap<Integer, Integer>(); // to store protocol data until we create applied protocols private Map<Integer, Protocol> protocolMap = new HashMap<Integer, Protocol>(); // used when traversing dag of applied protocols private Map<Integer, AppliedProtocol> appliedProtocolMap = new HashMap<Integer, AppliedProtocol>(); // used when traversing dag of applied protocols private Map<Integer, AppliedData> appliedDataMap = new HashMap<Integer, AppliedData>(); // project/lab/experiment/submission maps // -------------------------------------- // for projects, the maps link the project name with the identifiers... private Map<String, Integer> projectIdMap = new HashMap<String, Integer>(); private Map<String, String> projectIdRefMap = new HashMap<String, String>(); // for labs, the maps link the lab name with the identifiers... private Map<String, Integer> labIdMap = new HashMap<String, Integer>(); private Map<String, String> labIdRefMap = new HashMap<String, String>(); // for experiment, the maps link the exp name (description!) with the identifiers... private Map<String, Integer> experimentIdMap = new HashMap<String, Integer>(); private Map<String, String> experimentIdRefMap = new HashMap<String, String>(); private Map<String, List<Integer>> expSubMap = new HashMap<String, List<Integer>>(); // ...we need a further map to link to submission private Map<Integer, String> submissionProjectMap = new HashMap<Integer, String>(); private Map<Integer, String> submissionLabMap = new HashMap<Integer, String>(); // to store the category in experiment private Map<Integer, String> submissionExpCatMap = new HashMap<Integer, String>(); // to check if experiment type is set private Set<Integer> submissionWithExpTypeSet = new HashSet<Integer>(); // submission/applied_protocol/protocol maps // ----------------------------------------- private Map<String, String> protocolsMap = new HashMap<String, String>(); private Map<Integer, String> protocolItemIds = new HashMap<Integer, String>(); private Map<String, Integer> protocolItemToObjectId = new HashMap<String, Integer>(); // submission chado id to item identifier of Protocol used to generate GFF private Map<Integer, String> scoreProtocols = new HashMap<Integer, String>(); private Map<Integer, String> protocolTypesMap = new HashMap<Integer, String>(); private Map<Integer, Integer> appliedProtocolIdMap = new HashMap<Integer, Integer>(); private Map<Integer, String> appliedProtocolIdRefMap = new HashMap<Integer, String>(); // list of firstAppliedProtocols, first level of the DAG linking // the applied protocols through the data (and giving the flow of data) private List<Integer> firstAppliedProtocols = new ArrayList<Integer>(); private Map<Integer, Integer> publicationIdMap = new HashMap<Integer, Integer>(); private Map<Integer, String> publicationIdRefMap = new HashMap<Integer, String>(); // experimental factor maps // ------------------------ // chado submission id to list of top level attributes, e.g. dev stage, organism_part private Map<Integer, ExperimentalFactor> submissionEFMap = new HashMap<Integer, ExperimentalFactor>(); private Map<Integer, List<String[]>> submissionEFactorMap2 = new HashMap<Integer, List<String[]>>(); private Map<String, Integer> eFactorIdMap = new HashMap<String, Integer>(); private Map<String, String> eFactorIdRefMap = new HashMap<String, String>(); private Map<Integer, List<String>> submissionEFactorMap = new HashMap<Integer, List<String>>(); // caches // ------ // cache cv term names by id private Map<String, String> cvtermCache = new HashMap<String, String>(); private Map<String, String> devStageTerms = new HashMap<String, String>(); private Map<String, String> devOntologies = new HashMap<String, String>(); // just for debugging, itemIdentifier, type private Map<String, String> debugMap = new HashMap<String, String>(); private Map<String, Item> nonWikiSubmissionProperties = new HashMap<String, Item>(); private Map<String, Item> subItemsMap = new HashMap<String, Item>(); Map<Integer, List<SubmissionReference>> submissionRefs = null; protected IdResolver rslv; private Map<String, String> geneToItemIdentifier = new HashMap<String, String>(); // DbRecords Integer=objectId String=submission.itemId // the second map is used directly to build the references private Map<DatabaseRecordKey, Integer> dbRecords = new HashMap<DatabaseRecordKey, Integer>(); private Map<Integer, List<String>> dbRecordIdSubItems = new HashMap<Integer, List<String>>(); private static final class SubmissionDetails { // the identifier assigned to Item eg. "0_23" private String itemIdentifier; // the object id of the stored Item private Integer interMineObjectId; // the identifier assigned to lab Item for this object private String labItemIdentifier; private String title; private SubmissionDetails() { // don't instantiate } } /** * AppliedProtocol class to reconstruct the flow of submission data */ private static final class AppliedProtocol { private Integer submissionId; // chado private Integer protocolId; private Integer step; // the level in the dag for the AP // the output data associated to this applied protocol private List<Integer> outputs = new ArrayList<Integer>(); private List<Integer> inputs = new ArrayList<Integer>(); private AppliedProtocol() { // don't instantiate } } /** * Protocol class to store protocol data */ private static final class Protocol { private Integer protocolId; // possibly we don't need this (map) private String name; private String description; private String wikiLink; private Integer version; // the level in the dag for the AP private Protocol() { // don't instantiate } } /** * AppliedData class * to reconstruct the flow of submission data * */ private static final class AppliedData { private String itemIdentifier; private Integer intermineObjectId; private Integer dataId; private String value; private String actualValue; private String type; private String name; private String url; // in particular, it stores the dccid of the related sub needed for // linking to a result file // the list of applied protocols for which this data item is an input private List<Integer> nextAppliedProtocols = new ArrayList<Integer>(); // the list of applied protocols for which this data item is an output private List<Integer> previousAppliedProtocols = new ArrayList<Integer>(); private AppliedData() { // don't instantiate } } /** * Experimental Factor class * to store the couples (type, name/value) of EF * note that in chado sometime the name is given, other times is the value */ private static final class ExperimentalFactor { private Map<String, String> efTypes = new HashMap<String, String>(); private List<String> efNames = new ArrayList<String>(); private ExperimentalFactor() { // don't instantiate } } /** * Create a new ChadoProcessor object * @param chadoDBConverter the converter that created this Processor */ public ModEncodeMetaDataProcessor(ChadoDBConverter chadoDBConverter) { super(chadoDBConverter); } /** * {@inheritDoc} */ @Override public void process(Connection connection) throws Exception { processDeleted(connection); processProjectTable(connection); processLabTable(connection); processSubmissionOrganism(connection); processSubmission(connection); processSubmissionAttributes(connection); processProtocolTable(connection); processAppliedProtocolTable(connection); processProtocolAttributes(connection); processDag(connection); processAppliedData(connection); processAppliedDataAttributes(connection); processExperiment(connection); findScoreProtocols(); processFeatures(connection, submissionMap); // set references setSubmissionRefs(connection); setSubmissionExperimentRefs(connection); setDAGRefs(connection); // create DatabaseRecords where necessary for each submission createDatabaseRecords(connection); // create result files per submission createResultFiles(connection); // for high level attributes and experimental factors (EF) // TODO: clean up processEFactor(connection); // create id resolvers if (rslv == null) { rslv = IdResolverService.getFlyIdResolver(); rslv = IdResolverService.getWormIdResolver(); } processSubmissionProperties(connection); createRelatedSubmissions(connection); setSubmissionProtocolsRefs(connection); setSubmissionEFactorsRefs(connection); setSubmissionPublicationRefs(connection); } /** * ========================= * DELETED SUBS in CHADO * ========================= * To build a map of deleted subs (submissionId, dcc) * * @param connection * @throws SQLException * @throws ObjectStoreException */ private void processDeleted(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getDeleted(connection); while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); String value = res.getString("value"); deletedSubMap.put(submissionId, value); } res.close(); LOG.info("found " + deletedSubMap.size() + " deleted submissions to skip in the build."); LOG.info("PROCESS TIME deleted subs: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return deleted subs in the chado db. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getDeleted(Connection connection) throws SQLException { String query = "SELECT distinct a.experiment_id, a.value " + " FROM experiment_prop a " + " where a.name = 'deleted' "; return doQuery(connection, query, "getDeleted"); } /** * * ============== * FEATURES * ============== * * @param connection * @param submissionMap * @throws Exception */ private void processFeatures(Connection connection, Map<Integer, SubmissionDetails> submissionMap) throws Exception { long bT = System.currentTimeMillis(); // to monitor time spent in the process // keep map of feature to submissions it has been referenced by, some features appear in // more than one submission Map<Integer, List<String>> subCollections = new HashMap<Integer, List<String>>(); // hold features that should only be processed once across all submissions, initialise // processor with this map each time Map<Integer, FeatureData> commonFeaturesMap = new HashMap<Integer, FeatureData>(); for (Map.Entry<Integer, SubmissionDetails> entry : submissionMap.entrySet()) { Integer chadoExperimentId = entry.getKey(); if (deletedSubMap.containsKey(chadoExperimentId)) { continue; } Map<Integer, FeatureData> subFeatureMap = new HashMap<Integer, FeatureData>(); SubmissionDetails submissionDetails = entry.getValue(); String submissionItemIdentifier = submissionDetails.itemIdentifier; String labItemIdentifier = submissionDetails.labItemIdentifier; String submissionTitle = submissionDetails.title; List<Integer> thisSubmissionDataIds = submissionDataMap.get(chadoExperimentId); LOG.info("DATA IDS for " + dccIdMap.get(chadoExperimentId) + ": " + thisSubmissionDataIds.size()); // Create a temporary table with the feature ids related to this submission // based on the data_feature table String dataIdsTempTable = createDataIdsTempTable(connection, chadoExperimentId, thisSubmissionDataIds); ModEncodeFeatureProcessor processor = new ModEncodeFeatureProcessor(getChadoDBConverter(), submissionItemIdentifier, labItemIdentifier, dataIdsTempTable, submissionTitle, scoreProtocols.get(chadoExperimentId)); processor.initialiseCommonFeatures(commonFeaturesMap); processor.process(connection); // all features related to this submission subFeatureMap.putAll(processor.getFeatureMap()); // features common across many submissions commonFeaturesMap.putAll(processor.getCommonFeaturesMap()); LOG.info("COMMON FEATURES: " + commonFeaturesMap.size()); if (subFeatureMap.keySet().size() == 0) { LOG.error("FEATMAP: submission " + chadoExperimentId + " has no featureMap keys."); continue; } LOG.info("FEATMAP: submission " + chadoExperimentId + "|" + "featureMap: " + subFeatureMap.keySet().size()); // Populate map of submissions to features, some features are in multiple submissions processDataFeatureTable(connection, subCollections, subFeatureMap, chadoExperimentId, dataIdsTempTable); dropDataIdsTempTable(connection, dataIdsTempTable); // 1- generate a map of gene-identifiers so we can re-use the same item identifiers // when creating antibody/strain target genes late // 2- fill the 'ChromatinState' state with the secondaryId additionalProcessing(processor, subFeatureMap); } storeSubmissionsCollections(subCollections); LOG.info("PROCESS TIME features: " + (System.currentTimeMillis() - bT) + " ms"); } private void storeSubmissionsCollections(Map<Integer, List<String>> subCollections) throws ObjectStoreException { for (Map.Entry<Integer, List<String>> entry : subCollections.entrySet()) { Integer featureObjectId = entry.getKey(); ReferenceList collection = new ReferenceList("submissions", entry.getValue()); getChadoDBConverter().store(collection, featureObjectId); } } private void additionalProcessing(ModEncodeFeatureProcessor processor, Map<Integer, FeatureData> subFeatureMap) throws ObjectStoreException { for (FeatureData fData : subFeatureMap.values()) { // 1- generate a map of gene-identifiers so we can re-use the same item identifiers // when creating antibody/strain target genes late if ("Gene".equals(fData.getInterMineType())) { String geneIdentifier = processor.fixIdentifier(fData, fData.getUniqueName()); geneToItemIdentifier.put(geneIdentifier, fData.getItemIdentifier()); } // 2- fill the 'ChromatinState' state with the secondaryId if ("ChromatinState".equals(fData.getInterMineType())) { String state = fData.getChadoFeatureName(); Integer imObjectId = fData.getIntermineObjectId(); setAttribute(imObjectId, "state", state); } } } private void processDataFeatureTable(Connection connection, Map<Integer, List<String>> subCols, Map<Integer, FeatureData> featureMap, Integer chadoExperimentId, String dataIdTable) throws SQLException { long bT = System.currentTimeMillis(); // to monitor time spent in the process String submissionItemId = submissionMap.get(chadoExperimentId).itemIdentifier; bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getDataFeature(connection, dataIdTable); while (res.next()) { Integer dataId = new Integer(res.getInt("data_id")); Integer featureId = new Integer(res.getInt("feature_id")); FeatureData featureData = featureMap.get(featureId); if (featureData == null) { LOG.debug("Check feature type: no data for feature_id: " + featureId + " in processDataFeatureTable(), data_id =" + dataId); continue; } Integer featureObjectId = featureData.getIntermineObjectId(); List<String> subs = subCols.get(featureObjectId); if (subs == null) { subs = new ArrayList<String>(); subCols.put(featureObjectId, subs); } subs.add(submissionItemId); } LOG.info("DATA IDS PROCESS TIME data_feature table: " + (System.currentTimeMillis() - bT)); } private String createDataIdsTempTable(Connection connection, Integer chadoExperimentId, List<Integer> dataIds) throws SQLException { // the batch writer system doesn't like to have duplicate named tables String tableName = DATA_IDS_TABLE_NAME + "_" + chadoExperimentId + "_" + System.currentTimeMillis(); long bT = System.currentTimeMillis(); String query = " CREATE TEMPORARY TABLE " + tableName + " (data_id int)"; Statement stmt = connection.createStatement(); LOG.info("executing: " + query); stmt.execute(query); try { BatchWriterPostgresCopyImpl batchWriter = new BatchWriterPostgresCopyImpl(); Batch batch = new Batch(batchWriter); HashSet<Integer> uniqueDataIds = new HashSet<Integer>(dataIds); String[] colNames = new String[] { "data_id" }; for (Integer dataId : uniqueDataIds) { batch.addRow(connection, tableName, dataId, colNames, new Object[] { dataId }); } batch.flush(connection); batch.close(connection); LOG.info("CREATED DATA IDS TABLE: " + tableName + " with " + uniqueDataIds.size() + " data ids in " + (System.currentTimeMillis() - bT) + "ms"); String idIndexQuery = "CREATE INDEX " + tableName + "_data_id_index ON " + tableName + "(data_id)"; LOG.info("DATA IDS executing: " + idIndexQuery); long bT1 = System.currentTimeMillis(); stmt.execute(idIndexQuery); LOG.info("DATA IDS TIME creating INDEX: " + (System.currentTimeMillis() - bT1) + "ms"); String analyze = "ANALYZE " + tableName; LOG.info("executing: " + analyze); long bT2 = System.currentTimeMillis(); stmt.execute(analyze); LOG.info("DATA IDS TIME analyzing: " + (System.currentTimeMillis() - bT2) + "ms"); } catch (SQLException e) { // the batch writer system doesn't like to have duplicate named tables query = "DROP TABLE " + tableName; stmt.execute(query); throw e; } return tableName; } private void dropDataIdsTempTable(Connection connection, String dataIdsTableName) throws SQLException { long bT = System.currentTimeMillis(); String query = " DROP TABLE " + dataIdsTableName; LOG.info("executing: " + query); Statement stmt = connection.createStatement(); stmt.execute(query); LOG.info("DATA IDS TIME dropping table '" + dataIdsTableName + "': " + (System.currentTimeMillis() - bT)); } private ResultSet getDataFeature(Connection connection, String dataIdTable) throws SQLException { String query = "SELECT df.data_id, df.feature_id" + " FROM data_feature df, " + dataIdTable + " d" + " WHERE df.data_id = d.data_id"; return doQuery(connection, query, "getDataFeature"); } /** * * ==================== * DAG * ==================== * * In chado, Applied protocols in a submission are linked to each other via * the flow of data (output of a parent AP are input to a child AP). * The method process the data from chado to build the objects * (SubmissionDetails, AppliedProtocol, AppliedData) and their * respective maps to chado identifiers needed to traverse the DAG. * It then traverse the DAG, assigning the experiment_id to all data. * * @param connection * @throws SQLException * @throws ObjectStoreException */ private void processDag(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getDAG(connection); AppliedProtocol node = new AppliedProtocol(); AppliedData branch = null; Integer count = new Integer(0); Integer actualSubmissionId = new Integer(0); // to store the experiment id (see below) Integer previousAppliedProtocolId = new Integer(0); boolean isADeletedSub = false; while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); Integer protocolId = new Integer(res.getInt("protocol_id")); Integer appliedProtocolId = new Integer(res.getInt("applied_protocol_id")); Integer dataId = new Integer(res.getInt("data_id")); String direction = res.getString("direction"); LOG.debug("DAG: " + submissionId + " p:" + protocolId + " ap:" + appliedProtocolId + " d:" + dataId + " | " + direction); // the results are ordered, first ap have a subId // if we find a deleted sub, we know that subsequent records with null // subId belongs to the deleted sub // note that while the subId is null in the database, it is = 0 here if (submissionId == 0) { if (isADeletedSub) { LOG.debug("DEL: skipping" + isADeletedSub); continue; } } else { if (deletedSubMap.containsKey(submissionId)) { isADeletedSub = true; LOG.debug("DEL: " + submissionId + " ->" + isADeletedSub); continue; } else { isADeletedSub = false; LOG.debug("DEL: " + submissionId + " ->" + isADeletedSub); } } // build a data node for each iteration if (appliedDataMap.containsKey(dataId)) { branch = appliedDataMap.get(dataId); } else { branch = new AppliedData(); } // could use > (order by apid, apdataid, direction) // NB: using isLast() is expensive if (!appliedProtocolId.equals(previousAppliedProtocolId) || res.isLast()) { // the submissionId != null for the first applied protocol if (submissionId > 0) { firstAppliedProtocols.add(appliedProtocolId); LOG.debug("DAG fap subId:" + submissionId + " apID: " + appliedProtocolId); // set actual submission id // we can either be at a first applied protocol (submissionId > 0).. actualSubmissionId = submissionId; } else { // ..or already down the dag, and we use the stored id. submissionId = actualSubmissionId; } // last one: fill the list of outputs // and add to the general list of data ids for the submission, // used to fetch features if (res.isLast()) { if ("output".equalsIgnoreCase(direction)) { node.outputs.add(dataId); mapSubmissionAndData(submissionId, dataId); } } // if it is not the first iteration, let's store it if (previousAppliedProtocolId > 0) { appliedProtocolMap.put(previousAppliedProtocolId, node); } // new node AppliedProtocol newNode = new AppliedProtocol(); newNode.protocolId = protocolId; newNode.submissionId = submissionId; if (direction.startsWith("in")) { // add this applied protocol to the list of nextAppliedProtocols branch.nextAppliedProtocols.add(appliedProtocolId); // ..and update the map updateAppliedDataMap(branch, dataId); // .. and add the dataId to the list of input Data for this applied protocol newNode.inputs.add(dataId); mapSubmissionAndData(submissionId, dataId); //*** } else if (direction.startsWith("out")) { // add the dataId to the list of output Data for this applied protocol: // it will be used to link to the next set of applied protocols newNode.outputs.add(dataId); if (previousAppliedProtocolId > 0) { branch.previousAppliedProtocols.add(previousAppliedProtocolId); updateAppliedDataMap(branch, dataId); //*** mapSubmissionAndData(submissionId, dataId); //**** } } else { // there is some problem with the strings 'input' or 'output' throw new IllegalArgumentException( "Data direction not valid for dataId: " + dataId + "|" + direction + "|"); } // for the new round.. node = newNode; previousAppliedProtocolId = appliedProtocolId; } else { // keep feeding IN et OUT if (direction.startsWith("in")) { node.inputs.add(dataId); if (submissionId > 0) { // initial data mapSubmissionAndData(submissionId, dataId); } // as above branch.nextAppliedProtocols.add(appliedProtocolId); updateAppliedDataMap(branch, dataId); } else if (direction.startsWith("out")) { node.outputs.add(dataId); branch.previousAppliedProtocols.add(appliedProtocolId); updateAppliedDataMap(branch, dataId); //*** } else { throw new IllegalArgumentException( "Data direction not valid for dataId: " + dataId + "|" + direction + "|"); } } count++; } LOG.info("created " + appliedProtocolMap.size() + "(" + count + " applied data points) DAG nodes (= applied protocols) in map"); res.close(); // now traverse the DAG, and associate submission with all the applied protocols traverseDag(); // set the dag level as an attribute to applied protocol setAppliedProtocolSteps(connection); LOG.info("PROCESS TIME DAG: " + (System.currentTimeMillis() - bT) + " ms"); } /** * @param newAD * @param dataId */ private void updateAppliedDataMap(AppliedData newAD, Integer dataId) { if (appliedDataMap.containsKey(dataId)) { appliedDataMap.remove(dataId); } appliedDataMap.put(dataId, newAD); } /** * @param newAD the new appliedData * @param dataId the data id * @param intermineObjectId just a flag to do an update of attributes instead of a replecament */ private void updateADMap(AppliedData newAD, Integer dataId, Integer intermineObjectId) { if (appliedDataMap.containsKey(dataId)) { AppliedData datum = appliedDataMap.get(dataId); datum.intermineObjectId = newAD.intermineObjectId; datum.itemIdentifier = newAD.itemIdentifier; datum.value = newAD.value; datum.actualValue = newAD.actualValue; datum.dataId = dataId; datum.type = newAD.type; datum.name = newAD.name; datum.url = newAD.url; } else { appliedDataMap.put(dataId, newAD); } } /** * to set the step attribute for the applied protocols */ private void setAppliedProtocolSteps(Connection connection) throws ObjectStoreException { for (Integer appliedProtocolId : appliedProtocolMap.keySet()) { Integer step = appliedProtocolMap.get(appliedProtocolId).step; if (step != null) { Attribute attr = new Attribute("step", step.toString()); getChadoDBConverter().store(attr, appliedProtocolIdMap.get(appliedProtocolId)); } else { AppliedProtocol ap = appliedProtocolMap.get(appliedProtocolId); LOG.warn("AppliedProtocol.step not set for chado id: " + appliedProtocolId + " sub " + dccIdMap.get(ap.submissionId) + " inputs " + ap.inputs + " outputs " + ap.outputs); } } } // Look for protocols that were used to generated GFF files, these are passed to the feature // processor, if features have a score the protocol is set as the scoreProtocol reference. // NOTE this could equally be done with data, data_feature and applied_protocol_data private void findScoreProtocols() { for (Map.Entry<Integer, AppliedData> entry : appliedDataMap.entrySet()) { Integer dataId = entry.getKey(); AppliedData aData = entry.getValue(); if ("Result File".equals(aData.type) && (aData.value.endsWith(".gff") || aData.value.endsWith("gff3"))) { for (Integer papId : aData.previousAppliedProtocols) { AppliedProtocol aProtocol = appliedProtocolMap.get(papId); String protocolItemId = protocolItemIds.get(aProtocol.protocolId); scoreProtocols.put(dataSubmissionMap.get(dataId), protocolItemId); } } } } /** * Return the rows needed to construct the DAG of the data/protocols. * The reference to the submission is available only for the first set * of applied protocols, hence the outer join. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getDAG(Connection connection) throws SQLException { String query = "SELECT eap.experiment_id, ap.protocol_id, apd.applied_protocol_id" + " , apd.data_id, apd.applied_protocol_data_id, apd.direction" + " FROM applied_protocol ap LEFT JOIN experiment_applied_protocol eap" + " ON (eap.first_applied_protocol_id = ap.applied_protocol_id )" + " , applied_protocol_data apd" + " WHERE apd.applied_protocol_id = ap.applied_protocol_id" + " ORDER By 3,5,6"; return doQuery(connection, query, "getDAG"); } /** * Applies iteratively buildADaglevel * * @throws SQLException * @throws ObjectStoreException */ private void traverseDag() throws ObjectStoreException { List<Integer> currentIterationAP = firstAppliedProtocols; List<Integer> nextIterationAP = new ArrayList<Integer>(); Integer step = 1; // DAG level while (currentIterationAP.size() > 0) { nextIterationAP = buildADagLevel(currentIterationAP, step); currentIterationAP = nextIterationAP; step++; } } /** * This method is given a set of applied protocols (already associated with a submission) * and produces the next set of applied protocols. The latter are the protocols attached to the * output data of the starting set (output data for a applied protocol is the input data for the * next one). * It also fills the map linking directly results ('leaf' output data) with submission * * @param previousAppliedProtocols * @return the next batch of appliedProtocolId * @throws SQLException * @throws ObjectStoreException */ private List<Integer> buildADagLevel(List<Integer> previousAppliedProtocols, Integer step) throws ObjectStoreException { List<Integer> nextIterationProtocols = new ArrayList<Integer>(); Iterator<Integer> pap = previousAppliedProtocols.iterator(); while (pap.hasNext()) { List<Integer> outputs = new ArrayList<Integer>(); List<Integer> inputs = new ArrayList<Integer>(); Integer currentId = pap.next(); // add the DAG level here only if these are the first AP if (step == 1) { appliedProtocolMap.get(currentId).step = step; } outputs.addAll(appliedProtocolMap.get(currentId).outputs); Integer submissionId = appliedProtocolMap.get(currentId).submissionId; Iterator<Integer> od = outputs.iterator(); while (od.hasNext()) { Integer currentOD = od.next(); List<Integer> nextProtocols = new ArrayList<Integer>(); // build map submission-data mapSubmissionAndData(submissionId, currentOD); if (appliedDataMap.containsKey(currentOD)) { // fill the list of next (children) protocols nextProtocols.addAll(appliedDataMap.get(currentOD).nextAppliedProtocols); if (appliedDataMap.get(currentOD).nextAppliedProtocols.isEmpty()) { // this is a leaf!! LOG.debug("DAG leaf: " + submissionId + " dataId: " + currentOD); } } // to fill submission-dataId map // this is needed, otherwise inputs to AP that are not outputs // of a previous protocol are not considered inputs.addAll(appliedProtocolMap.get(currentId).inputs); Iterator<Integer> in = inputs.iterator(); while (in.hasNext()) { Integer currentIn = in.next(); // build map submission-data mapSubmissionAndData(submissionId, currentIn); } // build the list of children applied protocols chado identifiers // as input for the next iteration Iterator<Integer> nap = nextProtocols.iterator(); while (nap.hasNext()) { // and fill the map with the chado experiment_id and the DAG level Integer currentAPId = nap.next(); appliedProtocolMap.get(currentAPId).submissionId = submissionId; appliedProtocolMap.get(currentAPId).step = step + 1; nextIterationProtocols.add(currentAPId); // and set the reference from applied protocol to the submission Reference reference = new Reference(); reference.setName("submission"); reference.setRefId(submissionMap.get(submissionId).itemIdentifier); getChadoDBConverter().store(reference, appliedProtocolIdMap.get(currentAPId)); } } } return nextIterationProtocols; } /** * ============== * ORGANISM * ============== * Organism for a submission is derived from the organism associated with * the protocol of the first applied protocol (of the submission). * it is the name. a request to associate the submission directly with * the taxonid has been made to chado people. * * @param connection * @throws SQLException * @throws ObjectStoreException */ private void processSubmissionOrganism(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getSubmissionOrganism(connection); while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); if (deletedSubMap.containsKey(submissionId)) { continue; } String value = res.getString("value"); submissionOrganismMap.put(submissionId, value); LOG.debug("TAXID " + submissionId + "|" + value); } res.close(); LOG.info("found an organism for " + submissionOrganismMap.size() + " submissions."); LOG.info("PROCESS TIME organisms: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the row needed for the organism. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getSubmissionOrganism(Connection connection) throws SQLException { String query = "select distinct eap.experiment_id, a.value " + " from experiment_applied_protocol eap, applied_protocol ap, " + " protocol_attribute pa, attribute a " + " where eap.first_applied_protocol_id = ap.applied_protocol_id " + " and ap.protocol_id=pa.protocol_id " + " and pa.attribute_id=a.attribute_id " + " and a.heading='species' "; return doQuery(connection, query, "getSubmissionOrganism"); } /** * ============== * PROJECT * ============== * Projects are loaded statically. A map is built between submissionId and * project's name and used for the references. 2 maps store intermine * objectId and itemId, with key the project name. * Note: the project name in chado is the surname of the PI * * @param connection * @throws SQLException * @throws ObjectStoreException */ private void processProjectTable(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getProjects(connection); while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); String value = res.getString("value"); if (deletedSubMap.containsKey(submissionId)) { continue; } submissionProjectMap.put(submissionId, value); } res.close(); Set<Integer> exp = submissionProjectMap.keySet(); Iterator<Integer> i = exp.iterator(); while (i.hasNext()) { Integer thisExp = i.next(); String project = submissionProjectMap.get(thisExp); if (projectIdMap.containsKey(project)) { continue; } LOG.debug("PROJECT: " + project); Item pro = getChadoDBConverter().createItem("Project"); pro.setAttribute("surnamePI", project); Integer intermineObjectId = getChadoDBConverter().store(pro); storeInProjectMaps(pro, project, intermineObjectId); } LOG.info("created " + projectIdMap.size() + " project"); LOG.info("PROCESS TIME projects: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the project name. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getProjects(Connection connection) throws SQLException { String query = "SELECT distinct a.experiment_id, a.value " + " FROM experiment_prop a " + " where a.name = 'Project' " + " AND rank=0"; return doQuery(connection, query, "getProjects"); } /** * ============== * LAB * ============== * Labs are also loaded statically (affiliation is not given in the chado file). * A map is built between submissionId and * lab's name and used for the references. 2 maps store intermine * objectId and itemId, with key the lab name. * TODO: do project and lab together (1 query, 1 process) * * @param connection * @throws SQLException * @throws ObjectStoreException */ private void processLabTable(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getLabs(connection); while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); String value = res.getString("value"); if (deletedSubMap.containsKey(submissionId)) { continue; } submissionLabMap.put(submissionId, value); } res.close(); Set<Integer> exp = submissionLabMap.keySet(); Iterator<Integer> i = exp.iterator(); while (i.hasNext()) { Integer thisExp = i.next(); String prov = submissionLabMap.get(thisExp); String project = submissionProjectMap.get(thisExp); if (labIdMap.containsKey(prov)) { continue; } LOG.debug("PROV: " + prov); Item lab = getChadoDBConverter().createItem("Lab"); lab.setAttribute("surname", prov); lab.setReference("project", projectIdRefMap.get(project)); Integer intermineObjectId = getChadoDBConverter().store(lab); storeInLabMaps(lab, prov, intermineObjectId); } LOG.info("created " + labIdMap.size() + " labs"); LOG.info("PROCESS TIME labs: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the lab name. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getLabs(Connection connection) throws SQLException { String query = "SELECT distinct a.experiment_id, a.name, a.value " + " FROM experiment_prop a " + " where a.name = 'Lab' " + " AND a.rank=0"; return doQuery(connection, query, "getLabs"); } /** * ================ * EXPERIMENT * ================ * Experiment is a collection of submissions. They all share the same description. * It has been added later to the model. * A map is built between submissionId and experiment name. * 2 maps store intermine objectId and itemId, with key the experiment name. * They are probably not needed. * */ private void processExperiment(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getExperimentTitles(connection); Map<String, String> expProMap = new HashMap<String, String>(); while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); if (deletedSubMap.containsKey(submissionId)) { continue; } String name = cleanWikiLinks(res.getString("name")); Util.addToListMap(expSubMap, name, submissionId); expProMap.put(name, submissionProjectMap.get(submissionId)); } res.close(); Set<String> experiment = expSubMap.keySet(); Iterator<String> i = experiment.iterator(); while (i.hasNext()) { String name = i.next(); Item exp = getChadoDBConverter().createItem("Experiment"); exp.setAttribute("name", name); // find experiment category from map (take first available) // use the commented lines to get a report of assignments String category = null; for (Integer ii : expSubMap.get(name)) { // String dccId = dccIdMap.get(ii); category = submissionExpCatMap.get(ii); if (category != null && !category.isEmpty()) { // LOG.info("ECS " + name + "|" + dccId + ": " + category); exp.setAttribute("category", category); break; } //else { //LOG.warn("ECS " + name + "|" + dccId + ": no category"); //} } String project = expProMap.get(name); exp.setReference("project", projectIdRefMap.get(project)); // note: the reference to submission collection is in a separate method Integer intermineObjectId = getChadoDBConverter().store(exp); experimentIdMap.put(name, intermineObjectId); experimentIdRefMap.put(name, exp.getIdentifier()); } LOG.info("created " + expSubMap.size() + " experiments"); LOG.info("PROCESS TIME experiments: " + (System.currentTimeMillis() - bT) + " ms"); } /** * method to clean a wiki reference (url to a named page) in chado * used also for experiment names * @param w the wiki reference */ private String cleanWikiLinks(String w) { String url = "http://wiki.modencode.org/project/index.php?title="; // we are stripping from first ':', maybe we want to include project suffix // e.g.: // original "http://...?title=Gene Model Prediction:SC:1&oldid=12356" // now: Gene Model Prediction // maybe? Gene Model Prediction:SC:1 // (:->&) String w1 = StringUtils.replace(w, url, ""); String s1 = null; if (w1.contains(":")) { s1 = StringUtils.substringBefore(w1, ":"); } else { // for links missing the : char, e.g. // MacAlpine Early Origin of Replication Identification&oldid=10464 s1 = StringUtils.substringBefore(w1, "&"); } String s = s1.replace('"', ' ').trim(); if (s.contains("%E2%80%99")) { // prime: for the Piano experiment String s2 = s.replace("%E2%80%99", "'"); return s2; } if (s.contains("%28A%29%2B")) { // this is (A)+, in // Stranded Cell Line Transcriptional Profiling Using Illumina poly%28A%29%2B RNA-seq String s2 = s.replace("%28A%29%2B", "(A)+"); return s2; } if (s.contains("%2B")) { // +: for Celniker experiment "Tissue-specific Transcriptional Profiling..." String s2 = s.replace("%2B", "+"); return s2; } return s; } /** * Return the rows needed for experiment from the experiment_prop table. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getExperimentTitles(Connection connection) throws SQLException { // TODO use standard SQl and deal with string in java String query = "select e.experiment_id, " + " translate(x.accession, '_', ' ') as name " + " from experiment_prop e, dbxref x " + " where e.dbxref_id = x.dbxref_id " + " and e.name='Experiment Description' "; return doQuery(connection, query, "getExperimentTitles"); } /** * ================ * SUBMISSION * ================ */ private void processSubmission(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getSubmissions(connection); int count = 0; while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); if (deletedSubMap.containsKey(submissionId)) { continue; } String name = res.getString("uniquename"); Item submission = getChadoDBConverter().createItem("Submission"); submission.setAttribute("title", name); String project = submissionProjectMap.get(submissionId); String projectItemIdentifier = projectIdRefMap.get(project); submission.setReference("project", projectItemIdentifier); String labName = submissionLabMap.get(submissionId); String labItemIdentifier = labIdRefMap.get(labName); submission.setReference("lab", labItemIdentifier); String organismName = submissionOrganismMap.get(submissionId); int divPos = organismName.indexOf(' '); String genus = organismName.substring(0, divPos); String species = organismName.substring(divPos + 1); OrganismRepository or = OrganismRepository.getOrganismRepository(); Integer taxId = Integer.valueOf(or.getOrganismDataByGenusSpecies(genus, species).getTaxonId()); LOG.debug("SPECIES: " + organismName + "|" + taxId); String organismItemIdentifier = getChadoDBConverter() .getOrganismItem(or.getOrganismDataByGenusSpecies(genus, species).getTaxonId()).getIdentifier(); submission.setReference("organism", organismItemIdentifier); // ..store all Integer intermineObjectId = getChadoDBConverter().store(submission); // ..and fill the SubmissionDetails object SubmissionDetails details = new SubmissionDetails(); details.interMineObjectId = intermineObjectId; details.itemIdentifier = submission.getIdentifier(); details.labItemIdentifier = labItemIdentifier; details.title = name; submissionMap.put(submissionId, details); debugMap.put(details.itemIdentifier, submission.getClassName()); count++; } LOG.info("created " + count + " submissions"); res.close(); LOG.info("PROCESS TIME submissions: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the rows needed for the submission table. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getSubmissions(Connection connection) throws SQLException { String query = "SELECT experiment_id, uniquename " + "FROM experiment"; return doQuery(connection, query, "getSubmissions"); } /** * submission attributes (table experiment_prop) * * @param connection * @throws SQLException * @throws ObjectStoreException */ private void processSubmissionAttributes(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getExperimentProperties(connection); int count = 0; while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); if (deletedSubMap.containsKey(submissionId)) { continue; } String heading = res.getString("name"); String value = res.getString("value"); // TODO this is a temporary hack to make sure we get properly matched Experiment.factors // EF are dealt with separately if (heading.startsWith("Experimental Factor")) { continue; } String fieldName = FIELD_NAME_MAP.get(heading); if (fieldName == null) { LOG.error("NOT FOUND in FIELD_NAME_MAP: " + heading + " [experiment]"); continue; } else if (fieldName == NOT_TO_BE_LOADED) { continue; } if ("DCCid".equals(fieldName)) { value = DCC_PREFIX + value; LOG.debug("DCC: " + submissionId + ", " + value); dccIdMap.put(submissionId, value); } if ("category".equals(fieldName)) { // Data Type, stored in experiment submissionExpCatMap.put(submissionId, value); continue; } if (fieldName.endsWith("Read Count")) { // all read counts are considered a collection for submission Item readCount = getChadoDBConverter().createItem("ReadCount"); readCount.setAttribute("name", fieldName); readCount.setAttribute("value", value); // setting references to SubmissionData readCount.setReference("submission", submissionMap.get(submissionId).itemIdentifier); Integer intermineObjectId = getChadoDBConverter().store(readCount); continue; } if ("experimentType".equals(fieldName)) { // Assay Type submissionWithExpTypeSet.add(submissionId); } if ("pubMedId".equals(fieldName)) { // sometime in the form PMID:16938558 if (value.contains(":")) { value = value.substring(value.indexOf(':') + 1); } Item pub = getChadoDBConverter().createItem("Publication"); pub.setAttribute(fieldName, value); Integer intermineObjectId = getChadoDBConverter().store(pub); publicationIdMap.put(submissionId, intermineObjectId); publicationIdRefMap.put(submissionId, pub.getIdentifier()); continue; } setAttribute(submissionMap.get(submissionId).interMineObjectId, fieldName, value); count++; } LOG.info("created " + count + " submissions attributes"); res.close(); LOG.info("PROCESS TIME submissions attributes: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the rows needed for submission from the experiment_prop table. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getExperimentProperties(Connection connection) throws SQLException { String query = "SELECT ep.experiment_id, ep.name, ep.value, ep.rank " + "from experiment_prop ep "; return doQuery(connection, query, "getExperimentProperties"); } /** * ========================== * EXPERIMENTAL FACTORS * ========================== */ private void processEFactor(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getEFactors(connection); int count = 0; int prevRank = -1; int prevSub = -1; ExperimentalFactor ef = null; String name = null; while (res.next()) { Integer submissionId = new Integer(res.getInt("experiment_id")); if (deletedSubMap.containsKey(submissionId)) { continue; } Integer rank = new Integer(res.getInt("rank")); String value = res.getString("value"); // the data is alternating between EF types and names, in order. if (submissionId != prevSub) { // except for the first record, this is a new EF object if (!res.isFirst()) { submissionEFMap.put(prevSub, ef); LOG.info("EF MAP: " + dccIdMap.get(prevSub) + "|" + ef.efNames); LOG.info("EF MAP types: " + rank + "|" + ef.efTypes); } ef = new ExperimentalFactor(); } if (rank != prevRank || submissionId != prevSub) { // this is a name if (getPreferredSynonym(value) != null) { value = getPreferredSynonym(value); } ef.efNames.add(value); name = value; count++; } else { // this is a type ef.efTypes.put(name, value); name = null; if (res.isLast()) { submissionEFMap.put(submissionId, ef); LOG.debug("EF MAP last: " + submissionId + "|" + rank + "|" + ef.efNames); } } prevRank = rank; prevSub = submissionId; } res.close(); LOG.info("created " + count + " experimental factors"); LOG.info("PROCESS TIME experimental factors: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the rows needed for the experimental factors. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getEFactors(Connection connection) throws SQLException { String query = "SELECT ep.experiment_id, ep.name, ep.value, ep.rank " + " FROM experiment_prop ep " + " where ep.name = 'Experimental Factor Name' " + " OR ep.name = 'Experimental Factor Type' " + " ORDER BY 1,4,2"; return doQuery(connection, query, "getEFactors"); } /** * ============== * PROTOCOL * ============== */ private void processProtocolTable(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getProtocols(connection); int count = 0; while (res.next()) { Integer protocolChadoId = new Integer(res.getInt("protocol_id")); String name = res.getString("name"); String description = res.getString("description"); String wikiLink = res.getString("accession"); Integer version = res.getInt("version"); // needed: it breaks otherwise if (description.length() == 0) { description = "N/A"; } Protocol thisProt = new Protocol(); thisProt.protocolId = protocolChadoId; // rm? thisProt.name = name; thisProt.description = description; thisProt.wikiLink = wikiLink; thisProt.version = version; protocolMap.put(protocolChadoId, thisProt); // we'll do it when creating AP //createProtocol(protocolChadoId, name, description, wikiLink, version); count++; } res.close(); LOG.info("found " + count + " protocols"); LOG.info("PROCESS TIME protocols: " + (System.currentTimeMillis() - bT) + " ms"); } // now doing it with applied protocol (to avoid creating it for delete subs) private String createProtocol(Integer chadoId, String name, String description, String wikiLink, Integer version) throws ObjectStoreException { String protocolItemId = protocolsMap.get(wikiLink); // rename? if (protocolItemId == null) { Item protocol = getChadoDBConverter().createItem("Protocol"); protocol.setAttribute("name", name); protocol.setAttribute("description", description); protocol.setAttribute("wikiLink", wikiLink); protocol.setAttribute("version", "" + version); Integer intermineObjectId = getChadoDBConverter().store(protocol); protocolItemId = protocol.getIdentifier(); protocolItemToObjectId.put(protocolItemId, intermineObjectId); protocolsMap.put(wikiLink, protocolItemId); } protocolItemIds.put(chadoId, protocolItemId); return protocolItemId; } private Integer getProtocolInterMineId(Integer chadoId) { return protocolItemToObjectId.get(protocolItemIds.get(chadoId)); } /** * Return the rows needed from the protocol table. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getProtocols(Connection connection) throws SQLException { String query = "SELECT protocol_id, name, protocol.description, accession, protocol.version" + " FROM protocol, dbxref" + " WHERE protocol.dbxref_id = dbxref.dbxref_id"; return doQuery(connection, query, "getProtocols"); } /** * to store protocol attributes */ private void processProtocolAttributes(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getProtocolAttributes(connection); int count = 0; while (res.next()) { Integer protocolId = new Integer(res.getInt("protocol_id")); String heading = res.getString("heading"); String value = res.getString("value"); String fieldName = FIELD_NAME_MAP.get(heading); if (fieldName == null) { LOG.error("NOT FOUND in FIELD_NAME_MAP: " + heading + " [protocol]"); continue; } else if (fieldName == NOT_TO_BE_LOADED) { continue; } if (getProtocolInterMineId(protocolId) != null) { // in case of deleted sub setAttribute(getProtocolInterMineId(protocolId), fieldName, value); if ("type".equals(fieldName)) { protocolTypesMap.put(protocolId, value); } count++; } } LOG.info("created " + count + " protocol attributes"); res.close(); LOG.info("PROCESS TIME protocol attributes: " + (System.currentTimeMillis() - bT) + " ms"); } /** * Return the rows needed for protocols from the attribute table. * This is a protected method so that it can be overridden for testing * @param connection database connection * @return rows needed for protocols from the attribute table * @throws SQLException if something goes wrong. */ protected ResultSet getProtocolAttributes(Connection connection) throws SQLException { String query = "SELECT p.protocol_id, a.heading, a.value " + "from protocol p, attribute a, protocol_attribute pa " + "where pa.attribute_id = a.attribute_id " + "and pa.protocol_id = p.protocol_id "; return doQuery(connection, query, "getProtocolAttributes"); } /** * ====================== * APPLIED PROTOCOL * ====================== */ private void processAppliedProtocolTable(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getAppliedProtocols(connection); int count = 0; boolean isADeletedSub = false; while (res.next()) { Integer appliedProtocolId = new Integer(res.getInt("applied_protocol_id")); Integer protocolId = new Integer(res.getInt("protocol_id")); Integer submissionId = new Integer(res.getInt("experiment_id")); // the results are ordered, first ap have a subId // if we find a deleted sub, we know that subsequent records with null // subId belongs to the deleted sub if (submissionId == 0) { boolean t = true; if (isADeletedSub == t) { continue; } } else { if (deletedSubMap.containsKey(submissionId)) { isADeletedSub = true; continue; } else { isADeletedSub = false; } } Item appliedProtocol = getChadoDBConverter().createItem("AppliedProtocol"); // creating and setting references to protocols // String protocolItemId = protocolItemIds.get(protocolId); if (protocolId != null) { Protocol qq = protocolMap.get(protocolId); String protocolItemId = createProtocol(qq); appliedProtocol.setReference("protocol", protocolItemId); } if (submissionId > 0) { // setting reference to submission // probably to rm (we do it later anyway). TODO: check appliedProtocol.setReference("submission", submissionMap.get(submissionId).itemIdentifier); } // store it and add to maps Integer intermineObjectId = getChadoDBConverter().store(appliedProtocol); appliedProtocolIdMap.put(appliedProtocolId, intermineObjectId); appliedProtocolIdRefMap.put(appliedProtocolId, appliedProtocol.getIdentifier()); count++; } LOG.info("created " + count + " appliedProtocol"); res.close(); LOG.info("PROCESS TIME applied protocols: " + (System.currentTimeMillis() - bT) + " ms"); } private String createProtocol(Protocol p) throws ObjectStoreException { String protocolItemId = protocolsMap.get(p.wikiLink); // rename map? if (protocolItemId == null) { Item protocol = getChadoDBConverter().createItem("Protocol"); protocol.setAttribute("name", p.name); protocol.setAttribute("description", p.description); protocol.setAttribute("wikiLink", p.wikiLink); protocol.setAttribute("version", "" + p.version); Integer intermineObjectId = getChadoDBConverter().store(protocol); protocolItemId = protocol.getIdentifier(); protocolItemToObjectId.put(protocolItemId, intermineObjectId); protocolsMap.put(p.wikiLink, protocolItemId); } protocolItemIds.put(p.protocolId, protocolItemId); return protocolItemId; } /** * Return the rows needed from the appliedProtocol table. * This is a protected method so that it can be overridden for testing * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getAppliedProtocols(Connection connection) throws SQLException { String query = "SELECT eap.experiment_id ,ap.applied_protocol_id, ap.protocol_id" + " FROM applied_protocol ap" + " LEFT JOIN experiment_applied_protocol eap" + " ON (eap.first_applied_protocol_id = ap.applied_protocol_id )"; return doQuery(connection, query, "getAppliedProtocols"); } /** * ====================== * APPLIED DATA * ====================== */ private void processAppliedData(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getAppliedData(connection); int count = 0; while (res.next()) { Integer dataId = new Integer(res.getInt("data_id")); // check if not belonging to a deleted sub Integer submissionId = dataSubmissionMap.get(dataId); if (submissionId == null || deletedSubMap.containsKey(submissionId)) { continue; } String name = res.getString("name"); String heading = res.getString("heading"); String value = res.getString("value"); String typeId = res.getString("type_id"); String url = res.getString("url"); // check if this datum has an official name: ResultSet oName = getOfficialName(connection, dataId); String officialName = null; while (oName.next()) { officialName = oName.getString(1); } // if there is one, use it instead of the value String datumType = getCvterm(connection, typeId); name = getCvterm(connection, typeId); if (!StringUtils.isEmpty(officialName) && doReplaceWithOfficialName(heading, datumType)) { value = officialName; } Item submissionData = getChadoDBConverter().createItem("SubmissionData"); if (name != null && !"".equals(name)) { submissionData.setAttribute("name", name); } // if no name for attribute fetch the cvterm of the type if ((name == null || "".equals(name)) && typeId != null) { name = getCvterm(connection, typeId); submissionData.setAttribute("name", name); } if (!StringUtils.isEmpty(value)) { submissionData.setAttribute("value", value); } submissionData.setAttribute("type", heading); // store it and add to object and maps Integer intermineObjectId = getChadoDBConverter().store(submissionData); AppliedData aData = new AppliedData(); aData.intermineObjectId = intermineObjectId; aData.itemIdentifier = submissionData.getIdentifier(); aData.value = value; aData.actualValue = res.getString("value"); aData.dataId = dataId; aData.type = heading; aData.name = name; aData.url = url; updateADMap(aData, dataId, intermineObjectId); //appliedDataMap.put(dataId, aData); count++; } LOG.info("created " + count + " SubmissionData"); res.close(); LOG.info("PROCESS TIME submission data: " + (System.currentTimeMillis() - bT) + " ms"); } // For some data types we don't want to replace with official name - e.g. file names and // database record ids. It looks like the official name shouldn't actually be present. private boolean doReplaceWithOfficialName(String heading, String type) { if ("Result File".equals(heading)) { return false; } if ("Result Value".equals(heading) && DB_RECORD_TYPES.contains(type)) { return false; } return true; } /** * Return the rows needed for data from the applied_protocol_data table. * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getAppliedData(Connection connection) throws SQLException { String query = "SELECT d.data_id," + " d.heading, d.name, d.value, d.type_id, z.url" + " FROM data d" + " LEFT JOIN dbxref as y ON (d.dbxref_id = y.dbxref_id)" + " LEFT JOIN db as z ON (y.db_id = z.db_id)"; return doQuery(connection, query, "getAppliedData"); } /** * Return the rows needed for data from the applied_protocol_data table. * * @param connection the db connection * @param dataId the dataId * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getOfficialName(Connection connection, Integer dataId) throws SQLException { String query = "SELECT a.value " + " from attribute a, data_attribute da " + " where a.attribute_id=da.attribute_id " + " and da.data_id=" + dataId + " and a.heading='official name'"; return doQuery(connection, query); } /** * Fetch a cvterm by id and cache results in cvtermCache. Returns null if the cv terms isn't * found. * @param connection to chado database * @param cvtermId internal chado id for a cvterm * @return the cvterm name or null if not found * @throws SQLException if database access problem */ private String getCvterm(Connection connection, String cvtermId) throws SQLException { String cvTerm = cvtermCache.get(cvtermId); if (cvTerm == null) { String query = "SELECT c.name " + " from cvterm c" + " where c.cvterm_id=" + cvtermId; Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); while (res.next()) { cvTerm = res.getString("name"); } cvtermCache.put(cvtermId, cvTerm); } return cvTerm; } /** * ===================== * DATA ATTRIBUTES * ===================== */ private void processAppliedDataAttributesNEW(Connection connection) throws SQLException, ObjectStoreException { // attempts to collate attributes // TODO check! long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getAppliedDataAttributes(connection); int count = 0; Integer previousDataId = 0; String previousName = null; String value = null; String type = null; while (res.next()) { Integer dataId = new Integer(res.getInt("data_id")); // check if not belonging to a deleted sub // better way? Integer submissionId = dataSubmissionMap.get(dataId); if (submissionId == null || deletedSubMap.containsKey(submissionId)) { continue; } String name = res.getString("heading"); // LOG.info("DA " + dataId + ": " + name); if (previousDataId == 0) { //first pass value = res.getString("value"); type = res.getString("name"); previousDataId = dataId; previousName = name; LOG.info("DA0 " + dataId + ": " + name + "|" + value); continue; } if (dataId > previousDataId) { Item dataAttribute = storeDataAttribute(value, type, previousDataId, previousName); value = res.getString("value"); type = res.getString("name"); count++; previousDataId = dataId; previousName = name; LOG.info("DA1 new: " + previousDataId + ": " + previousName + "|" + value); continue; } if (!name.equalsIgnoreCase(previousName)) { Item dataAttribute = storeDataAttribute(value, type, dataId, previousName); // LOG.info("DA2 store: " + dataId + ": " + previousName + "|" + value); count++; value = res.getString("value"); previousName = name; LOG.info("DA2 new: " + dataId + ": " + previousName + "|" + value); } else { value = value + ", " + res.getString("value"); } type = res.getString("name"); previousDataId = dataId; if (res.isLast()) { Item dataAttribute = storeDataAttribute(value, type, dataId, name); count++; } } LOG.info("created " + count + " data attributes"); res.close(); LOG.info("PROCESS TIME data attributes: " + (System.currentTimeMillis() - bT) + " ms"); } /** * @param value * @param type * @param dataId * @param name * @return * @throws ObjectStoreException */ private Item storeDataAttribute(String value, String type, Integer dataId, String name) throws ObjectStoreException { Item dataAttribute = getChadoDBConverter().createItem("SubmissionDataAttribute"); if (name != null && !"".equals(name)) { dataAttribute.setAttribute("name", name); } if (!StringUtils.isEmpty(value)) { dataAttribute.setAttribute("value", value); } if (!StringUtils.isEmpty(type)) { dataAttribute.setAttribute("type", type); } // setting references to SubmissionData dataAttribute.setReference("submissionData", appliedDataMap.get(dataId).itemIdentifier); getChadoDBConverter().store(dataAttribute); return dataAttribute; } private void processAppliedDataAttributes(Connection connection) throws SQLException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getAppliedDataAttributes(connection); int count = 0; while (res.next()) { Integer dataId = new Integer(res.getInt("data_id")); // check if not belonging to a deleted sub // better way? Integer submissionId = dataSubmissionMap.get(dataId); if (submissionId == null || deletedSubMap.containsKey(submissionId)) { continue; } String name = res.getString("heading"); String value = res.getString("value"); String type = res.getString("name"); Item dataAttribute = storeDataAttribute(value, type, dataId, name); count++; } LOG.info("created " + count + " data attributes"); res.close(); LOG.info("PROCESS TIME data attributes: " + (System.currentTimeMillis() - bT) + " ms"); } // first value in the list of synonyms is the 'preferred' value private static String[][] synonyms = new String[][] { new String[] { "developmental stage", "stage", "developmental_stage", "dev stage", "devstage" }, new String[] { "strain", "strain_or_line" }, new String[] { "cell line", "cell_line", "Cell line", "cell id" }, new String[] { "array", "adf" }, new String[] { "compound", "Compound" }, new String[] { "incubation time", "Incubation Time" }, new String[] { "RNAi reagent", "RNAi_reagent", "dsRNA" }, new String[] { "temperature", "temp" } }; private static List<String> makeLookupList(String initialLookup) { for (String[] synonymType : synonyms) { for (String synonym : synonymType) { if (synonym.equals(initialLookup)) { return Arrays.asList(synonymType); } } } return new ArrayList<String>(Collections.singleton(initialLookup)); } private static String getPreferredSynonym(String initialLookup) { return makeLookupList(initialLookup).get(0); } private static Set<String> unifyFactorNames(Collection<String> original) { Set<String> unified = new HashSet<String>(); for (String name : original) { unified.add(getPreferredSynonym(name)); } return unified; } private class SubmissionReference { public SubmissionReference(Integer referencedSubmissionId, String dataValue) { this.referencedSubmissionId = referencedSubmissionId; this.dataValue = dataValue; } private Integer referencedSubmissionId; private String dataValue; } // process new query // get DCC id // add antibody to types private void processSubmissionProperties(Connection connection) throws SQLException, IOException, ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process ResultSet res = getAppliedDataAll(connection); final String comma = ","; String reportName = "build/" + getChadoDBConverter().getDatabase().getName() + "_subs_report.csv"; File f = new File(reportName); FileWriter writer = new FileWriter(f); writeHeader(comma, writer); SubmissionProperty buildSubProperty = null; Integer lastDataId = new Integer(-1); Map<String, SubmissionProperty> props = new HashMap<String, SubmissionProperty>(); Map<Integer, Map<String, List<SubmissionProperty>>> subToTypes = new HashMap<Integer, Map<String, List<SubmissionProperty>>>(); submissionRefs = new HashMap<Integer, List<SubmissionReference>>(); while (res.next()) { Integer dataId = new Integer(res.getInt("data_id")); String dataHeading = res.getString("data_heading"); String dataName = res.getString("data_name"); String wikiPageUrl = res.getString("data_value"); String cvTerm = res.getString("cv_term"); String attHeading = res.getString("att_heading"); String attName = res.getString("att_name"); String attValue = res.getString("att_value"); String attDbxref = res.getString("att_dbxref"); int attRank = res.getInt("att_rank"); Integer submissionId = dataSubmissionMap.get(dataId); String dccId = dccIdMap.get(submissionId); writer.write(dccId + comma + dataHeading + comma + dataName + comma + wikiPageUrl + comma + cvTerm + comma + attHeading + comma + attName + comma + attValue + comma + attDbxref + System.getProperty("line.separator")); if (submissionId == null) { LOG.warn("Failed to find a submission id for data id " + dataId + " - this probably" + " means there is a problem with the applied_protocol DAG strucuture."); continue; } // Currently using attValue for referenced submission DCC id, should be dbUrl but seems // to be filled in incorrectly if (attHeading != null && attHeading.startsWith("modENCODE Reference")) { attValue = checkRefSub(wikiPageUrl, attValue, submissionId, dccId); } // we are starting a new data row if (dataId.intValue() != lastDataId.intValue()) { // have we seen this modencodewiki entry before? if (props.containsKey(wikiPageUrl)) { buildSubProperty = null; } else { buildSubProperty = new SubmissionProperty(getPreferredSynonym(dataName), wikiPageUrl); props.put(wikiPageUrl, buildSubProperty); } // submissionId -> [type -> SubmissionProperty] addToSubToTypes(subToTypes, submissionId, props.get(wikiPageUrl)); } if (buildSubProperty != null) { // we are building a new submission attribute, this is the first time we have // seen a data.value that points to modencodewiki buildSubProperty.addDetail(attHeading, attValue, attRank); } lastDataId = dataId; } writer.flush(); writer.close(); // Characteristics are modelled differently to protocol inputs/outputs, read in extra // properties here addSubmissionPropsFromCharacteristics(subToTypes, connection); // some submissions use reagents created in reference submissions, find the properties // of the reagents and add to referencing submission addSubmissionPropsFromReferencedSubmissions(subToTypes, props, submissionRefs); // create and store properties of submission storeSubProperties(subToTypes); LOG.info("PROCESS TIME submission properties: " + (System.currentTimeMillis() - bT) + " ms"); } /** * @param subToTypes * @throws ObjectStoreException */ private void storeSubProperties(Map<Integer, Map<String, List<SubmissionProperty>>> subToTypes) throws ObjectStoreException { for (Integer submissionId : subToTypes.keySet()) { Integer storedSubmissionId = submissionMap.get(submissionId).interMineObjectId; if (deletedSubMap.containsKey(submissionId)) { continue; } Map<String, List<SubmissionProperty>> typeToProp = subToTypes.get(submissionId); String dccId = dccIdMap.get(submissionId); ExperimentalFactor ef = submissionEFMap.get(submissionId); if (ef == null) { LOG.warn("No experimental factors found for submission: " + dccId); continue; } Set<String> exFactorNames = unifyFactorNames(ef.efNames); LOG.info("PROPERTIES " + dccId + " typeToProp keys: " + typeToProp.keySet()); List<Item> allPropertyItems = new ArrayList<Item>(); // DEVELOPMENTAL STAGE List<Item> devStageItems = new ArrayList<Item>(); devStageItems.addAll(createFromWikiPage(dccId, "DevelopmentalStage", typeToProp, makeLookupList("developmental stage"))); if (devStageItems.isEmpty()) { devStageItems.addAll(lookForAttributesInOtherWikiPages(dccId, "DevelopmentalStage", typeToProp, new String[] { "developmental stage.developmental stage", "tissue.developmental stage", "tissue source.developmental stage", "cell line.developmental stage", "cell id.developmental stage" //, "RNA extract.Cell Type" })); } if (!devStageItems.isEmpty() && exFactorNames.contains("developmental stage")) { createExperimentalFactors(submissionId, "developmental stage", devStageItems); exFactorNames.remove("developmental stage"); } if (devStageItems.isEmpty()) { addNotApplicable("DevelopmentalStage", "developmental stage"); } storeSubmissionCollection(storedSubmissionId, "developmentalStages", devStageItems); allPropertyItems.addAll(devStageItems); // STRAIN List<Item> strainItems = new ArrayList<Item>(); strainItems.addAll(createFromWikiPage(dccId, "Strain", typeToProp, makeLookupList("strain"))); if (!strainItems.isEmpty() && exFactorNames.contains("strain")) { createExperimentalFactors(submissionId, "strain", strainItems); exFactorNames.remove("strain"); } if (strainItems.isEmpty()) { addNotApplicable("Strain", "strain"); } storeSubmissionCollection(storedSubmissionId, "strains", strainItems); allPropertyItems.addAll(strainItems); // ARRAY List<Item> arrayItems = new ArrayList<Item>(); arrayItems.addAll(createFromWikiPage(dccId, "Array", typeToProp, makeLookupList("array"))); if (arrayItems.isEmpty()) { arrayItems.addAll(lookForAttributesInOtherWikiPages(dccId, "Array", typeToProp, new String[] { "adf.official name" })); if (!arrayItems.isEmpty()) { LOG.debug("Attribute found in other wiki pages: " + dccId + " ARRAY "); } } if (!arrayItems.isEmpty() && exFactorNames.contains("array")) { createExperimentalFactors(submissionId, "array", arrayItems); exFactorNames.remove("array"); } if (arrayItems.isEmpty()) { addNotApplicable("Array", "array"); } storeSubmissionCollection(storedSubmissionId, "arrays", arrayItems); allPropertyItems.addAll(arrayItems); // CELL LINE List<Item> lineItems = new ArrayList<Item>(); lineItems.addAll(createFromWikiPage(dccId, "CellLine", typeToProp, makeLookupList("cell line"))); if (!lineItems.isEmpty() && exFactorNames.contains("cell line")) { createExperimentalFactors(submissionId, "cell line", lineItems); exFactorNames.remove("cell line"); } if (lineItems.isEmpty()) { addNotApplicable("CellLine", "cell line"); } storeSubmissionCollection(storedSubmissionId, "cellLines", lineItems); allPropertyItems.addAll(lineItems); // RNAi REAGENT List<Item> reagentItems = new ArrayList<Item>(); reagentItems .addAll(createFromWikiPage(dccId, "SubmissionProperty", typeToProp, makeLookupList("dsRNA"))); if (!reagentItems.isEmpty() && exFactorNames.contains("RNAi reagent")) { createExperimentalFactors(submissionId, "RNAi reagent", reagentItems); exFactorNames.remove("RNAi reagent"); } allPropertyItems.addAll(reagentItems); // ANTIBODY List<Item> antibodyItems = new ArrayList<Item>(); antibodyItems.addAll(createFromWikiPage(dccId, "Antibody", typeToProp, makeLookupList("antibody"))); if (antibodyItems.isEmpty()) { LOG.debug("ANTIBODY: " + typeToProp.get("antibody")); antibodyItems.addAll(lookForAttributesInOtherWikiPages(dccId, "Antibody", typeToProp, new String[] { "antibody.official name" })); if (!antibodyItems.isEmpty()) { LOG.debug("Attribute found in other wiki pages: " + dccId + " ANTIBODY "); } } if (!antibodyItems.isEmpty() && exFactorNames.contains("antibody")) { createExperimentalFactors(submissionId, "antibody", antibodyItems); exFactorNames.remove("antibody"); } if (antibodyItems.isEmpty()) { addNotApplicable("Antibody", "antibody"); } storeSubmissionCollection(storedSubmissionId, "antibodies", antibodyItems); allPropertyItems.addAll(antibodyItems); // TISSUE List<Item> tissueItems = new ArrayList<Item>(); tissueItems.addAll(createFromWikiPage(dccId, "Tissue", typeToProp, makeLookupList("tissue"))); if (tissueItems.isEmpty()) { tissueItems.addAll(lookForAttributesInOtherWikiPages(dccId, "Tissue", typeToProp, new String[] { "stage.tissue", "cell line.tissue", "cell id.tissue" })); if (!tissueItems.isEmpty()) { LOG.info("Attribute found in other wiki pages: " + dccId + " TISSUE"); } } if (!tissueItems.isEmpty() && exFactorNames.contains("tissue")) { createExperimentalFactors(submissionId, "tissue", tissueItems); exFactorNames.remove("tissue"); } if (tissueItems.isEmpty()) { addNotApplicable("Tissue", "tissue"); } storeSubmissionCollection(storedSubmissionId, "tissues", tissueItems); allPropertyItems.addAll(tissueItems); // There may be some other experimental factors that require SubmissionProperty objects // but don't fall into the categories above. Create them here and set experimental // factors. ArrayList<String> extraPropNames = new ArrayList<String>(exFactorNames); for (String exFactor : extraPropNames) { List<Item> extraPropItems = new ArrayList<Item>(); extraPropItems.addAll(lookForAttributesInOtherWikiPages(dccId, "SubmissionProperty", typeToProp, new String[] { exFactor })); allPropertyItems.addAll(extraPropItems); createExperimentalFactors(submissionId, exFactor, extraPropItems); exFactorNames.remove(exFactor); } // deal with remaining factor names (e.g. the ones for which we did // not find a corresponding attribute for (String exFactor : exFactorNames) { String type = ef.efTypes.get(exFactor); createEFItem(submissionId, type, exFactor, null); } storeSubmissionCollection(storedSubmissionId, "properties", allPropertyItems); } } /** * @param wikiPageUrl * @param attValue * @param submissionId * @param dccId * @return */ private String checkRefSub(String wikiPageUrl, String attValue, Integer submissionId, String dccId) { if (attValue.indexOf(":") > 0) { attValue = attValue.substring(0, attValue.indexOf(":")); } attValue = DCC_PREFIX + attValue; Integer referencedSubId = getSubmissionIdFromDccId(attValue); if (referencedSubId != null) { SubmissionReference subRef = new SubmissionReference(referencedSubId, wikiPageUrl); Util.addToListMap(submissionRefs, submissionId, subRef); LOG.info("Submission " + dccId + " (" + submissionId + ") has reference to " + attValue + " (" + referencedSubId + ")"); } else { LOG.warn("Could not find submission " + attValue + " referenced by " + dccId); } return attValue; } /** * @param comma * @param writer * @throws IOException */ private void writeHeader(final String comma, FileWriter writer) throws IOException { writer.write("submission" + comma); writer.write("data_heading" + comma); writer.write("data_name" + comma); writer.write("data_value" + comma); writer.write("cv_term" + comma); writer.write("att_heading" + comma); writer.write("att_name" + comma); writer.write("att_value" + comma); writer.write(System.getProperty("line.separator")); } // /** // * @param submissionId // * @param exFactorNames // * @param devStageItems // * @param clsName // * @param propName // * @throws ObjectStoreException // */ // private void finishProp(Integer submissionId, Integer storedSubmissionId, // Set<String> exFactorNames, // List<Item> devStageItems, String clsName, String propName) // throws ObjectStoreException { // if (devStageItems.isEmpty()) { // addNotApplicable(devStageItems, clsName, propName); // storeSubmissionCollection(storedSubmissionId, "developmentalStages", devStageItems); // } else { // if (exFactorNames.contains(propName)) { // createExperimentalFactors(submissionId, propName, devStageItems); // exFactorNames.remove(propName); // } // } // } /** * @param clsName the class name of the property (e.g. Strain) * @param propName the property name (e.g. strain) * @throws ObjectStoreException */ private void addNotApplicable(String clsName, String propName) throws ObjectStoreException { Item subProperty = getChadoDBConverter().createItem(clsName); subProperty.setAttribute("type", propName); subProperty.setAttribute("name", NA_PROP); getChadoDBConverter().store(subProperty); } // Traverse DAG following previous applied protocol links to build a list of all AppliedData private void findAppliedProtocolsAndDataFromEarlierInDag(Integer startDataId, List<AppliedData> foundAppliedData, List<AppliedProtocol> foundAppliedProtocols) { AppliedData aData = appliedDataMap.get(startDataId); if (foundAppliedData != null) { foundAppliedData.add(aData); } for (Integer previousAppliedProtocolId : aData.previousAppliedProtocols) { AppliedProtocol ap = appliedProtocolMap.get(previousAppliedProtocolId); if (foundAppliedProtocols != null) { foundAppliedProtocols.add(ap); } for (Integer previousDataId : ap.inputs) { findAppliedProtocolsAndDataFromEarlierInDag(previousDataId, foundAppliedData, foundAppliedProtocols); } } } private void createExperimentalFactors(Integer submissionId, String type, Collection<Item> items) throws ObjectStoreException { for (Item item : items) { createEFItem(submissionId, type, item.getAttribute("name").getValue(), item.getIdentifier()); } } private void createEFItemNEW(Integer current, String type, String efName, String propertyIdentifier) throws ObjectStoreException { // don't create an EF if it is a primer if (type.endsWith("primer")) { return; } String preferredType = getPreferredSynonym(type); String key = efName + preferredType; String[] efTok = { efName, preferredType }; // create the EF, if not there already if (!eFactorIdMap.containsKey(key)) { Item ef = getChadoDBConverter().createItem("ExperimentalFactor"); ef.setAttribute("type", preferredType); ef.setAttribute("name", efName); if (propertyIdentifier != null) { ef.setReference("property", propertyIdentifier); } LOG.info("ExFactor created for sub " + dccIdMap.get(current) + ":" + efName + "|" + type); Integer intermineObjectId = getChadoDBConverter().store(ef); eFactorIdMap.put(key, intermineObjectId); eFactorIdRefMap.put(key, ef.getIdentifier()); } // if pertinent to the current sub, add to the map for the references Util.addToListMap(submissionEFactorMap2, current, efTok); } private void createEFItem(Integer current, String type, String efName, String propertyIdentifier) throws ObjectStoreException { // don't create an EF if it is a primer if (type.endsWith("primer")) { return; } // create the EF, if not there already if (!eFactorIdMap.containsKey(efName)) { Item ef = getChadoDBConverter().createItem("ExperimentalFactor"); String preferredType = getPreferredSynonym(type); ef.setAttribute("type", preferredType); ef.setAttribute("name", efName); if (propertyIdentifier != null) { ef.setReference("property", propertyIdentifier); } LOG.info("ExFactor created for sub " + current + ":" + efName + "|" + type); Integer intermineObjectId = getChadoDBConverter().store(ef); eFactorIdMap.put(efName, intermineObjectId); eFactorIdRefMap.put(efName, ef.getIdentifier()); } // if pertinent to the current sub, add to the map for the references Util.addToListMap(submissionEFactorMap, current, efName); } private void addToSubToTypes(Map<Integer, Map<String, List<SubmissionProperty>>> subToTypes, Integer submissionId, SubmissionProperty prop) { // submissionId -> [type -> SubmissionProperty] if (submissionId == null) { LOG.error("MISSING SUB: " + prop); return; //throw new RuntimeException("Called addToSubToTypes with a null sub id!"); } Map<String, List<SubmissionProperty>> typeToSubProp = subToTypes.get(submissionId); if (typeToSubProp == null) { typeToSubProp = new HashMap<String, List<SubmissionProperty>>(); subToTypes.put(submissionId, typeToSubProp); } List<SubmissionProperty> subProps = typeToSubProp.get(prop.type); if (subProps == null) { subProps = new ArrayList<SubmissionProperty>(); typeToSubProp.put(prop.type, subProps); } subProps.add(prop); } private void addSubmissionPropsFromCharacteristics( Map<Integer, Map<String, List<SubmissionProperty>>> subToTypes, Connection connection) throws SQLException { ResultSet res = getAppliedDataCharacteristics(connection); Integer lastAttDbXref = new Integer(-1); Integer lastDataId = new Integer(-1); Map<Integer, SubmissionProperty> createdProps = new HashMap<Integer, SubmissionProperty>(); SubmissionProperty buildSubProperty = null; boolean isValidCharacteristic = false; Integer currentSubId = null; // we need those to attach the property to the correct sub Integer previousSubId = null; while (res.next()) { Integer dataId = new Integer(res.getInt("data_id")); String attHeading = res.getString("att_heading"); String attName = res.getString("att_name"); String attValue = res.getString("att_value"); Integer attDbxref = new Integer(res.getInt("att_dbxref")); int attRank = res.getInt("att_rank"); currentSubId = dataSubmissionMap.get(dataId); if (currentSubId == null) { LOG.info("DSM failing dataId: " + dataId + " - " + attHeading + "|" + attName + "|" + attValue); } if (dataId.intValue() != lastDataId.intValue() || attDbxref.intValue() != lastAttDbXref.intValue() || currentSubId != previousSubId) { // store the last build property if created, type is set only if we found an // attHeading of Characteristics // note: dbxref can remain the same in different subs -> or if (buildSubProperty != null && buildSubProperty.type != null) { createdProps.put(lastAttDbXref, buildSubProperty); addToSubToTypes(subToTypes, previousSubId, buildSubProperty); } // set up for next attDbxref if (createdProps.containsKey(attDbxref) && isValidCharacteristic) { // seen this property before so just add for this submission, don't build again buildSubProperty = null; isValidCharacteristic = false; addToSubToTypes(subToTypes, currentSubId, createdProps.get(attDbxref)); } else { buildSubProperty = new SubmissionProperty(); isValidCharacteristic = false; } } if (attHeading.startsWith("Characteristic")) { isValidCharacteristic = true; } // OR // if (buildSubProperty != null) { // if (attHeading.startsWith("Characteristic")) { // buildSubProperty.type = getPreferredSynonym(attName); // buildSubProperty.wikiPageUrl = attValue; // // add detail here as some Characteristics don't reference a wiki page // // but have all information on single row // buildSubProperty.addDetail(attName, attValue, attRank); // } else { // buildSubProperty.addDetail(attHeading, attValue, attRank); // } // } if (buildSubProperty != null) { if (attHeading.startsWith("Characteristic")) { String type = getPreferredSynonym(attName); String wikiUrl = attValue; String wikiType = checkWikiType(type, wikiUrl, currentSubId); buildSubProperty.type = wikiType; buildSubProperty.wikiPageUrl = wikiUrl; // add detail here as some Characteristics don't reference a wiki page // but have all information on single row buildSubProperty.addDetail(wikiType, attValue, attRank); } else { buildSubProperty.addDetail(attHeading, attValue, attRank); } } previousSubId = currentSubId; lastAttDbXref = attDbxref; lastDataId = dataId; } if (buildSubProperty != null && buildSubProperty.type != null) { createdProps.put(lastAttDbXref, buildSubProperty); addToSubToTypes(subToTypes, currentSubId, buildSubProperty); } } private String checkWikiType(String type, String wikiLink, Integer subId) { // for devstages and strain check the type on the wikilink and use it if different // from the declared one. LOG.info("WIKILINK: " + wikiLink + " -- type: " + type); if (wikiLink != null && wikiLink.contains(":")) { String wikiType = wikiLink.substring(0, wikiLink.indexOf(':')); if (type.equals(STRAIN) || type.equals(DEVSTAGE)) { if (!congruentType(type, wikiType)) { LOG.warn("WIKILINK " + dccIdMap.get(subId) + ": " + type + " but in wiki url: " + wikiType); // not strictly necessary (code would deal with wikiType) if (wikiType.contains("Strain")) { return STRAIN; } if (wikiType.contains("Stage")) { return DEVSTAGE; } } } } return type; } private Boolean congruentType(String type, String wikiType) { // check only strain and devstages if (wikiType.contains("Strain") && !type.equals(STRAIN)) { return false; } if (wikiType.contains("Stage") && !type.equalsIgnoreCase(DEVSTAGE)) { return false; } return true; } // Some submission mention e.g. an RNA Sample but the details of how that sample was created, // stage, strain, etc are in a previous submission. There are references to previous submission // DCC ids where a sample with the corresponding name can be found. We then need to traverse // backwards along the AppliedProtocol DAG to find the stage, strain, etc wiki pages. These // should already have been processed so the properties can just be added to the referencing // submission. private void addSubmissionPropsFromReferencedSubmissions( Map<Integer, Map<String, List<SubmissionProperty>>> subToTypes, Map<String, SubmissionProperty> props, Map<Integer, List<SubmissionReference>> submissionRefs) { for (Map.Entry<Integer, List<SubmissionReference>> entry : submissionRefs.entrySet()) { Integer submissionId = entry.getKey(); List<SubmissionReference> lref = entry.getValue(); Iterator<SubmissionReference> i = lref.iterator(); while (i.hasNext()) { SubmissionReference ref = i.next(); List<AppliedData> refAppliedData = findAppliedDataFromReferencedSubmission(ref); for (AppliedData aData : refAppliedData) { String possibleWikiUrl = aData.actualValue; if (possibleWikiUrl != null && props.containsKey(possibleWikiUrl)) { //LOG.debug("EEFF possible wikiurl: " + possibleWikiUrl); SubmissionProperty propFromReferencedSub = props.get(possibleWikiUrl); if (propFromReferencedSub != null) { addToSubToTypes(subToTypes, submissionId, propFromReferencedSub); LOG.debug("EEFF from referenced sub: " + propFromReferencedSub.type + ": " + propFromReferencedSub.details); } } } } } } private List<AppliedData> findAppliedDataFromReferencedSubmission(SubmissionReference subRef) { List<AppliedData> foundAppliedData = new ArrayList<AppliedData>(); findAppliedProtocolsAndDataFromReferencedSubmission(subRef, foundAppliedData, null); return foundAppliedData; } private List<AppliedProtocol> findAppliedProtocolsFromReferencedSubmission(SubmissionReference subRef) { List<AppliedProtocol> foundAppliedProtocols = new ArrayList<AppliedProtocol>(); findAppliedProtocolsAndDataFromReferencedSubmission(subRef, null, foundAppliedProtocols); return foundAppliedProtocols; } private void findAppliedProtocolsAndDataFromReferencedSubmission(SubmissionReference subRef, List<AppliedData> foundAppliedData, List<AppliedProtocol> foundAppliedProtocols) { String refDataValue = subRef.dataValue; Integer refSubId = subRef.referencedSubmissionId; for (AppliedData aData : appliedDataMap.values()) { String currentDataValue = aData.value; Integer currentDataSubId = dataSubmissionMap.get(aData.dataId); // added check that referenced and referring are not the same. if (refDataValue.equals(currentDataValue) && refSubId.equals(currentDataSubId)) { LOG.info("Found a matching data value: " + currentDataValue + " in referenced sub " + dccIdMap.get(currentDataSubId) + " for value " + aData.actualValue); Integer foundDataId = aData.dataId; findAppliedProtocolsAndDataFromEarlierInDag(foundDataId, foundAppliedData, foundAppliedProtocols); } } } private List<Item> createFromWikiPage(String dccId, String clsName, Map<String, List<SubmissionProperty>> typeToProp, List<String> types) throws ObjectStoreException { List<Item> items = new ArrayList<Item>(); List<SubmissionProperty> props = new ArrayList<SubmissionProperty>(); for (String type : types) { if (typeToProp.containsKey(type)) { props.addAll(typeToProp.get(type)); } } items.addAll(createItemsForSubmissionProperties(dccId, clsName, props)); return items; } private void storeSubmissionCollection(Integer storedSubmissionId, String name, List<Item> items) throws ObjectStoreException { if (!items.isEmpty()) { ReferenceList refList = new ReferenceList(name, getIdentifiersFromItems(items)); getChadoDBConverter().store(refList, storedSubmissionId); } } private List<String> getIdentifiersFromItems(Collection<Item> items) { List<String> ids = new ArrayList<String>(); for (Item item : items) { ids.add(item.getIdentifier()); } return ids; } private List<Item> createItemsForSubmissionProperties(String dccId, String clsName, List<SubmissionProperty> subProps) throws ObjectStoreException { List<Item> items = new ArrayList<Item>(); for (SubmissionProperty subProp : subProps) { Item item = getItemForSubmissionProperty(clsName, subProp, dccId); if (item != null) { items.add(item); } } return items; } private Item getItemForSubmissionProperty(String clsName, SubmissionProperty prop, String dccId) throws ObjectStoreException { Item propItem = subItemsMap.get(prop.wikiPageUrl); if (propItem == null) { if (clsName != null) { List<String> checkOfficialName = prop.details.get("official name"); if (checkOfficialName == null) { LOG.warn("No 'official name', using 'name' instead for: " + prop.wikiPageUrl); checkOfficialName = prop.details.get("name"); } if (checkOfficialName == null) { LOG.info("Official name - missing for property: " + prop.type + ", " + prop.wikiPageUrl); return null; } else if (checkOfficialName.size() != 1) { LOG.info("Official name - multiple times for property: " + prop.type + ", " + prop.wikiPageUrl + ", " + checkOfficialName); } String officialName = getCorrectedOfficialName(prop); propItem = createSubmissionProperty(clsName, officialName); propItem.setAttribute("type", getPreferredSynonym(prop.type)); propItem.setAttribute("wikiLink", WIKI_URL + prop.wikiPageUrl); if ("DevelopmentalStage".equals(clsName)) { setAttributeOnProp(prop, propItem, "sex", "sex"); List<String> devStageValues = prop.details.get("developmental stage"); if (devStageValues != null) { for (String devStageValue : devStageValues) { propItem.addToCollection("ontologyTerms", getDevStageTerm(devStageValue, dccId)); } } else { LOG.error("METADATA FAIL on " + dccId + ": no 'developmental stage' values for wiki page: " + prop.wikiPageUrl); } } else if ("Antibody".equals(clsName)) { setAttributeOnProp(prop, propItem, "antigen", "antigen"); setAttributeOnProp(prop, propItem, "host", "hostOrganism"); setAttributeOnProp(prop, propItem, "target name", "targetName"); setGeneItem(dccId, prop, propItem, "Antibody"); } else if ("Array".equals(clsName)) { setAttributeOnProp(prop, propItem, "platform", "platform"); setAttributeOnProp(prop, propItem, "resolution", "resolution"); setAttributeOnProp(prop, propItem, "genome", "genome"); } else if ("CellLine".equals(clsName)) { setAttributeOnProp(prop, propItem, "sex", "sex"); setAttributeOnProp(prop, propItem, "short description", "description"); setAttributeOnProp(prop, propItem, "species", "species"); setAttributeOnProp(prop, propItem, "tissue", "tissue"); setAttributeOnProp(prop, propItem, "cell type", "cellType"); setAttributeOnProp(prop, propItem, "target name", "targetName"); setGeneItem(dccId, prop, propItem, "CellLine"); } else if ("Strain".equals(clsName)) { setAttributeOnProp(prop, propItem, "species", "species"); setAttributeOnProp(prop, propItem, "source", "source"); // the following 2 should be mutually exclusive setAttributeOnProp(prop, propItem, "Description", "description"); setAttributeOnProp(prop, propItem, "details", "description"); setAttributeOnProp(prop, propItem, "aliases", "name"); setAttributeOnProp(prop, propItem, "reference", "reference"); setAttributeOnProp(prop, propItem, "target name", "targetName"); setGeneItem(dccId, prop, propItem, "Strain"); } else if ("Tissue".equals(clsName)) { setAttributeOnProp(prop, propItem, "species", "species"); setAttributeOnProp(prop, propItem, "sex", "sex"); setAttributeOnProp(prop, propItem, "organismPart", "organismPart"); } getChadoDBConverter().store(propItem); } subItemsMap.put(prop.wikiPageUrl, propItem); } return propItem; } private void setGeneItem(String dccId, SubmissionProperty prop, Item propItem, String source) throws ObjectStoreException { String targetText = null; String[] possibleTypes = new String[] { "target id" }; boolean tooMany = false; for (String targetType : possibleTypes) { if (prop.details.containsKey(targetType)) { if (prop.details.get(targetType).size() != 1) { // we used to complain if multiple values, now only // if they don't have the same value if (sameTargetValue(prop, source, targetType)) { tooMany = true; break; } } if (!tooMany) { targetText = prop.details.get(targetType).get(0); } break; } } if (targetText != null) { // if no target name was found use the target id if (!propItem.hasAttribute("targetName")) { propItem.setAttribute("targetName", targetText); } String geneItemId = getTargetGeneItemIdentfier(targetText, dccId); if (geneItemId != null) { propItem.setReference("target", geneItemId); } } } private boolean sameTargetValue(SubmissionProperty prop, String source, String targetType) { String value = prop.details.get(targetType).get(0); for (int i = 1; i < prop.details.get(targetType).size(); i++) { String newValue = prop.details.get(targetType).get(i); if (!newValue.equals(value)) { LOG.error(source + " (" + prop.wikiPageUrl + ") has more than 1 value for '" + targetType + "' field: " + prop.details.get(targetType)); //throw new RuntimeException(source + " should only have one value for '" // + targetType + "' field: " + prop.details.get(targetType)); return true; } } return false; } private void setAttributeOnProp(SubmissionProperty subProp, Item item, String metadataName, String attributeName) { if (subProp.details.containsKey(metadataName)) { if ("aliases".equalsIgnoreCase(metadataName)) { for (String s : subProp.details.get(metadataName)) { if ("yellow cinnabar brown speck".equalsIgnoreCase(s)) { // swapping name with fullName String full = item.getAttribute("name").getValue(); item.setAttribute("fullName", full); item.setAttribute(attributeName, s); break; } } } else if ("description".equalsIgnoreCase(metadataName) || "details".equalsIgnoreCase(metadataName)) { // description is often split in more than 1 line, details should be correct order StringBuffer sb = new StringBuffer(); for (String desc : subProp.details.get(metadataName)) { sb.append(desc); } if (sb.length() > 0) { item.setAttribute(attributeName, sb.toString()); } } else { String value = subProp.details.get(metadataName).get(0); item.setAttribute(attributeName, value); } } } private String getTargetGeneItemIdentfier(String geneTargetIdText, String dccId) throws ObjectStoreException { // TODO check: why not using only the else? String taxonId = ""; String originalId = null; String flyPrefix = "fly_genes:"; String wormPrefix = "worm_genes:"; if (geneTargetIdText.startsWith(flyPrefix)) { originalId = geneTargetIdText.substring(flyPrefix.length()); taxonId = "7227"; } else if (geneTargetIdText.startsWith(wormPrefix)) { originalId = geneTargetIdText.substring(wormPrefix.length()); taxonId = "6239"; } else { // attempt to work out the organism from the submission taxonId = getTaxonIdForSubmission(dccId); originalId = geneTargetIdText; LOG.debug("RESOLVER: found taxon " + taxonId + " for submission " + dccId); } if (!"7227".equals(taxonId) && !"6239".equals(taxonId)) { LOG.info("RESOLVER: unable to work out organism for target id text: " + geneTargetIdText + " in submission " + dccId); } String geneItemId = null; String primaryIdentifier = resolveGene(originalId, taxonId); if (primaryIdentifier != null) { geneItemId = geneToItemIdentifier.get(primaryIdentifier); if (geneItemId == null) { Item gene = getChadoDBConverter().createItem("Gene"); geneItemId = gene.getIdentifier(); gene.setAttribute("primaryIdentifier", primaryIdentifier); getChadoDBConverter().store(gene); geneToItemIdentifier.put(primaryIdentifier, geneItemId); } else { LOG.info("RESOLVER fetched gene from cache: " + primaryIdentifier); } } return geneItemId; } private String resolveGene(String originalId, String taxonId) { String primaryIdentifier = null; int resCount = rslv.countResolutions(taxonId, originalId); if (resCount != 1) { LOG.info("RESOLVER: failed to resolve gene to one identifier, ignoring " + "gene: " + originalId + " for organism " + taxonId + " count: " + resCount + " found ids: " + rslv.resolveId(taxonId, originalId) + "."); } else { primaryIdentifier = rslv.resolveId(taxonId, originalId).iterator().next(); LOG.info("RESOLVER found gene " + primaryIdentifier + " for original id: " + originalId); } return primaryIdentifier; } private List<Item> lookForAttributesInOtherWikiPages(String dccId, String clsName, Map<String, List<SubmissionProperty>> typeToProp, String[] lookFor) throws ObjectStoreException { List<Item> items = new ArrayList<Item>(); for (String typeProp : lookFor) { if (typeProp.indexOf(".") > 0) { String[] bits = StringUtils.split(typeProp, '.'); String type = bits[0]; String propName = bits[1]; if (typeToProp.containsKey(type)) { for (SubmissionProperty subProp : typeToProp.get(type)) { if (subProp.details.containsKey(propName)) { for (String value : subProp.details.get(propName)) { items.add(createNonWikiSubmissionPropertyItem(dccId, clsName, getPreferredSynonym(propName), correctAttrValue(value))); } } } if (!items.isEmpty()) { break; } } } else { // no attribute type given so use the data.value (SubmissionProperty.wikiPageUrl) // which probably won't be a wiki page if (typeToProp.containsKey(typeProp)) { for (SubmissionProperty subProp : typeToProp.get(typeProp)) { String value = subProp.wikiPageUrl; // This is an ugly special case to deal with 'exposure time/24 hours' if (subProp.details.containsKey("Unit")) { String unit = subProp.details.get("Unit").get(0); value = value + " " + unit + (unit.endsWith("s") ? "" : "s"); } items.add(createNonWikiSubmissionPropertyItem(dccId, clsName, subProp.type, correctAttrValue(value))); } } } } return items; } private String correctAttrValue(String value) { if (value == null) { return null; } value = value.replace("", "-"); return value; } private Item createNonWikiSubmissionPropertyItem(String dccId, String clsName, String type, String name) throws ObjectStoreException { if ("DevelopmentalStage".equals(clsName)) { name = correctDevStageTerm(name); } Item item = nonWikiSubmissionProperties.get(name); if (item == null) { item = createSubmissionProperty(clsName, name); item.setAttribute("type", getPreferredSynonym(type)); if ("DevelopmentalStage".equals(clsName)) { String ontologyTermId = getDevStageTerm(name, dccId); item.addToCollection("ontologyTerms", ontologyTermId); } getChadoDBConverter().store(item); nonWikiSubmissionProperties.put(name, item); } return item; } private Item createSubmissionProperty(String clsName, String name) { Item subProp = getChadoDBConverter().createItem(clsName); if (name != null) { subProp.setAttribute("name", name); } return subProp; } private String getCorrectedOfficialName(SubmissionProperty prop) { String preferredType = getPreferredSynonym(prop.type); String name = null; if (prop.details.containsKey("official name")) { name = prop.details.get("official name").get(0); } else if (prop.details.containsKey("name")) { name = prop.details.get("name").get(0); } else { // no official name so maybe there is a key that matches the type - sometimes the // setup for Characteristics for (String lookup : makeLookupList(prop.type)) { if (prop.details.containsKey(lookup)) { name = prop.details.get(lookup).get(0); } } } return correctOfficialName(name, preferredType); } /** * Unify variations on similar official names. * @param name the original 'official name' value * @param type the treatment depends on the type * @return a unified official name */ protected String correctOfficialName(String name, String type) { if (name == null) { return null; } if ("developmental stage".equals(type)) { name = name.replace("_", " "); name = name.replaceFirst("embryo", "Embryo"); name = name.replaceFirst("Embyro", "Embryo"); if (name.matches("E\\d.*")) { name = name.replaceFirst("^E", "Embryo "); } if (name.matches("Embryo.*\\d")) { name = name + " h"; } if (name.matches(".*hr")) { name = name.replace("hr", "h"); } if (name.matches("Embryo.*\\dh")) { name = name.replaceFirst("h", " h"); } if (name.startsWith("DevStage:")) { name = name.replaceFirst("DevStage:", "").trim(); } if (name.matches("L\\d")) { name = name + " stage larvae"; } if (name.matches(".*L\\d")) { name = name + " stage larvae"; } if (name.matches("WPP.*")) { name = name.replaceFirst("WPP", "White prepupae (WPP)"); } } return name; } private String getDevStageTerm(String value, String dccId) throws ObjectStoreException { value = correctDevStageTerm(value); // there may be duplicate terms for fly and worm, include taxon in key String taxonId = getTaxonIdForSubmission(dccId); OrganismRepository or = OrganismRepository.getOrganismRepository(); String genus = or.getOrganismDataByTaxon(Integer.parseInt(taxonId)).getGenus(); String key = value + "_" + genus; String identifier = devStageTerms.get(key); if (identifier == null) { Item term = getChadoDBConverter().createItem("OntologyTerm"); term.setAttribute("name", value); String ontologyRef = getDevelopmentOntologyByTaxon(taxonId); if (ontologyRef != null) { term.setReference("ontology", ontologyRef); } getChadoDBConverter().store(term); devStageTerms.put(key, term.getIdentifier()); identifier = term.getIdentifier(); } return identifier; } private String correctDevStageTerm(String value) { // some terms are prefixed with ontology namespace String prefix = "FlyBase development CV:"; if (value.startsWith(prefix)) { value = value.substring(prefix.length()); } return value; } private String getTaxonIdForSubmission(String dccId) { Integer subChadoId = getSubmissionIdFromDccId(dccId); String organism = submissionOrganismMap.get(subChadoId); OrganismRepository or = OrganismRepository.getOrganismRepository(); return "" + or.getOrganismDataByFullName(organism).getTaxonId(); } private String getDevelopmentOntologyByTaxon(String taxonId) throws ObjectStoreException { if (taxonId == null) { return null; } String ontologyName = null; OrganismRepository or = OrganismRepository.getOrganismRepository(); String genus = or.getOrganismDataByTaxon(Integer.parseInt(taxonId)).getGenus(); if ("Drosophila".equals(genus)) { ontologyName = "Fly Development"; } else { ontologyName = "Worm Development"; } String ontologyId = devOntologies.get(ontologyName); if (ontologyId == null) { Item ontology = getChadoDBConverter().createItem("Ontology"); ontology.setAttribute("name", ontologyName); getChadoDBConverter().store(ontology); ontologyId = ontology.getIdentifier(); devOntologies.put(ontologyName, ontologyId); } return ontologyId; } private Integer getSubmissionIdFromDccId(String dccId) { for (Map.Entry<Integer, String> entry : dccIdMap.entrySet()) { if (entry.getValue().equals(dccId)) { return entry.getKey(); } } return null; } /** * Return the rows needed for data from the applied_protocol_data table. * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getAppliedDataAll(Connection connection) throws SQLException { String sraAcc = "SRA acc"; String query = "SELECT d.data_id, d.heading as data_heading," + " d.name as data_name, d.value as data_value," + " c.name as cv_term," + " a.attribute_id, a.heading as att_heading, a.name as att_name," + " a.value as att_value," + " a.dbxref_id as att_dbxref, a.rank as att_rank" + " FROM data d" + " LEFT JOIN data_attribute da ON (d.data_id = da.data_id)" + " LEFT JOIN attribute a ON (da.attribute_id = a.attribute_id)" + " LEFT JOIN cvterm c ON (d.type_id = c.cvterm_id)" + " LEFT JOIN dbxref as x ON (a.dbxref_id = x.dbxref_id)" + " WHERE d.name != '" + sraAcc + "'" + " AND d.value != '' " + " ORDER BY d.data_id"; return doQuery(connection, query, "getAppliedDataAll"); } /** * Return the rows needed for data from the applied_protocol_data table. * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getAppliedDataCharacteristics(Connection connection) throws SQLException { String query = "select d.data_id, d.heading as data_heading," + " d.name as data_name, d.value as data_value," + " a.attribute_id, a.heading as att_heading, a.name as att_name," + " a.value as att_value," + " a.dbxref_id as att_dbxref, a.rank as att_rank" + " FROM data d, data_attribute da, attribute a, dbxref ax, db" + " WHERE d.data_id = da.data_id" + " AND da.attribute_id = a.attribute_id" + " AND a.dbxref_id = ax.dbxref_id" + " AND ax.db_id = db.db_id" + " ORDER BY d.data_id, a.dbxref_id "; return doQuery(connection, query, "getAppliedDataCharacteristics"); } private class SubmissionProperty { protected String type; protected String wikiPageUrl; protected Map<String, List<String>> details; protected SubmissionProperty() { details = new HashMap<String, List<String>>(); } public SubmissionProperty(String type, String wikiPageUrl) { this.type = type; this.wikiPageUrl = wikiPageUrl; details = new HashMap<String, List<String>>(); } public void addDetail(String type, String value, int rank) { List<String> values = details.get(type); if (values == null) { values = new ArrayList<String>(); details.put(type, values); } while (values.size() <= rank) { values.add(null); } values.set(rank, value); } public String toString() { return this.type + ": " + this.wikiPageUrl + this.details.entrySet(); } } private final class DatabaseRecordConfig { private String dbName; private String dbDescrition; private String dbURL; private Set<String> types = new HashSet<String>(); private DatabaseRecordConfig() { // don't } } private Set<DatabaseRecordConfig> initDatabaseRecordConfigs() { Set<DatabaseRecordConfig> configs = new HashSet<DatabaseRecordConfig>(); DatabaseRecordConfig geo = new DatabaseRecordConfig(); geo.dbName = "GEO"; geo.dbDescrition = "Gene Expression Omnibus (NCBI)"; geo.dbURL = "http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc="; geo.types.add("GEO_record"); configs.add(geo); DatabaseRecordConfig ae = new DatabaseRecordConfig(); ae.dbName = "ArrayExpress"; ae.dbDescrition = "ArrayExpress (EMBL-EBI)"; ae.dbURL = "http://www.ebi.ac.uk/microarray-as/ae/browse.html?keywords="; ae.types.add("ArrayExpress_record"); configs.add(ae); DatabaseRecordConfig sra = new DatabaseRecordConfig(); sra.dbName = "SRA"; sra.dbDescrition = "Sequence Read Archive (NCBI)"; sra.dbURL = "http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=viewer&m=data&s=viewer&run="; sra.types.add("ShortReadArchive_project_ID_list (SRA)"); sra.types.add("ShortReadArchive_project_ID (SRA)"); configs.add(sra); DatabaseRecordConfig ta = new DatabaseRecordConfig(); ta.dbName = "Trace Archive"; ta.dbDescrition = "Trace Archive (NCBI)"; ta.dbURL = "http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?&cmd=retrieve&val="; ta.types.add("TraceArchive_record"); configs.add(ta); DatabaseRecordConfig de = new DatabaseRecordConfig(); de.dbName = "dbEST"; de.dbDescrition = "Expressed Sequence Tags database (NCBI)"; de.dbURL = "http://www.ncbi.nlm.nih.gov/nucest/"; de.types.add("dbEST_record"); configs.add(de); return configs; } /** * Query to get data attributes. * * @param connection the db connection * @return the SQL result set * @throws SQLException if a database problem occurs */ protected ResultSet getAppliedDataAttributes(Connection connection) throws SQLException { String query = "select da.data_id, a.heading, a.value, a.name " + " from data_attribute da, attribute a" + " where da.attribute_id = a.attribute_id"; return doQuery(connection, query, "getAppliedDataAttributes"); } /** * ================ * REFERENCES * ================ * to store references between submission and submissionData * (1 to many) */ private void setSubmissionRefs(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process // note: the map should contain only live submissions for (Integer submissionId : submissionDataMap.keySet()) { for (Integer dataId : submissionDataMap.get(submissionId)) { LOG.debug("DAG subRef subid: " + submissionId + " dataId: " + dataId); if (appliedDataMap.get(dataId).intermineObjectId == null) { continue; } Reference reference = new Reference(); reference.setName("submission"); reference.setRefId(submissionMap.get(submissionId).itemIdentifier); getChadoDBConverter().store(reference, appliedDataMap.get(dataId).intermineObjectId); } } LOG.info("TIME setting submission-data references: " + (System.currentTimeMillis() - bT) + " ms"); } /** * ===================== * DATABASE RECORDS * ===================== */ private void createDatabaseRecords(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process Set<DatabaseRecordConfig> configs = initDatabaseRecordConfigs(); for (Integer submissionId : submissionDataMap.keySet()) { LOG.info("DB RECORD for sub " + dccIdMap.get(submissionId) + "..."); List<Integer> submissionDbRecords = new ArrayList<Integer>(); for (Integer dataId : submissionDataMap.get(submissionId)) { AppliedData ad = appliedDataMap.get(dataId); if (ad.type.equalsIgnoreCase("Result Value")) { for (DatabaseRecordConfig conf : configs) { for (String type : conf.types) { if (ad.name.equals(type)) { submissionDbRecords.addAll(createDatabaseRecords(ad.value, conf)); } } } } // prepare map for setting references if (!submissionDbRecords.isEmpty()) { for (Integer dbRecord : submissionDbRecords) { addToMap(dbRecordIdSubItems, dbRecord, submissionMap.get(submissionId).itemIdentifier); } } } } LOG.info("TIME creating DatabaseRecord objects: " + (System.currentTimeMillis() - bT) + " ms"); bT = System.currentTimeMillis(); // set references // NB: we are setting them from dbRecord to submissions because more efficient // (a few subs have a big collection of dbRecords) for (Integer dbRecordId : dbRecordIdSubItems.keySet()) { ReferenceList col = new ReferenceList("submissions", dbRecordIdSubItems.get(dbRecordId)); getChadoDBConverter().store(col, dbRecordId); } LOG.info("TIME creating refs DatabaseRecord objects: " + (System.currentTimeMillis() - bT) + " ms"); } private List<Integer> createDatabaseRecords(String accession, DatabaseRecordConfig config) throws ObjectStoreException { List<Integer> dbRecordIds = new ArrayList<Integer>(); String defaultURL = config.dbURL; Set<String> cleanAccessions = new HashSet<String>(); // NOTE - this is a special case to deal with a very strange SRA accession format in some // Celniker submissions. The 'accession' is provided as e.g. // SRR013492.225322.1;SRR013492.462158.1;... // We just want the unique SRR ids if ("SRA".equals(config.dbName) && (accession.indexOf(';') != -1 || accession.indexOf('.') != -1)) { for (String part : accession.split(";")) { if (part.indexOf('.') != -1) { cleanAccessions.add(part.substring(0, part.indexOf('.'))); } else { cleanAccessions.add(part); } } } else if ("SRA".equals(config.dbName) && (accession.startsWith("SRX"))) { config.dbURL = "http://www.ncbi.nlm.nih.gov/sra/"; dbRecordIds.add(createDatabaseRecord(accession, config)); config.dbURL = defaultURL; } else { cleanAccessions.add(accession); } for (String cleanAccession : cleanAccessions) { dbRecordIds.add(createDatabaseRecord(cleanAccession, config)); } return dbRecordIds; } private Integer createDatabaseRecord(String accession, DatabaseRecordConfig config) throws ObjectStoreException { DatabaseRecordKey key = new DatabaseRecordKey(config.dbName, accession); Integer dbRecordId = dbRecords.get(key); if (dbRecordId == null) { Item dbRecord = getChadoDBConverter().createItem("DatabaseRecord"); dbRecord.setAttribute("database", config.dbName); dbRecord.setAttribute("description", config.dbDescrition); if (StringUtils.isEmpty(accession)) { dbRecord.setAttribute("accession", "To be confirmed"); } else { dbRecord.setAttribute("url", config.dbURL + accession); dbRecord.setAttribute("accession", accession); } dbRecordId = getChadoDBConverter().store(dbRecord); dbRecords.put(key, dbRecordId); } return dbRecordId; } private class DatabaseRecordKey { private String db; private String accession; /** * Construct with the database and accession * @param db database name * @param accession id in database */ public DatabaseRecordKey(String db, String accession) { this.db = db; this.accession = accession; } /** * {@inheritDoc} */ public boolean equals(Object o) { if (o instanceof DatabaseRecordKey) { DatabaseRecordKey otherKey = (DatabaseRecordKey) o; if (StringUtils.isNotEmpty(accession) && StringUtils.isNotEmpty(otherKey.accession)) { return this.db.equals(otherKey.db) && this.accession.equals(otherKey.accession); } } return false; } /** * {@inheritDoc} */ public int hashCode() { return db.hashCode() + 3 * accession.hashCode(); } } /** * ===================== * RESULT FILES * ===================== */ private void createResultFiles(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process for (Integer submissionId : submissionDataMap.keySet()) { // the applied data is repeated for each protocol // so we want to uniquefy the created object Set<String> subFiles = new HashSet<String>(); for (Integer dataId : submissionDataMap.get(submissionId)) { AppliedData ad = appliedDataMap.get(dataId); // now checking only for 'file', not 'result file' if (StringUtils.containsIgnoreCase(ad.type, "file")) { if (!StringUtils.isBlank(ad.value) && !subFiles.contains(ad.value)) { String direction = null; if (StringUtils.containsIgnoreCase(ad.type, "result")) { direction = "result"; } else { direction = "input"; } createResultFile(ad.value, ad.name, ad.url, direction, submissionId); subFiles.add(ad.value); } } } } LOG.info("TIME creating ResultFile objects: " + (System.currentTimeMillis() - bT) + " ms"); } private void createResultFile(String fileName, String type, String relDccId, String direction, Integer submissionId) throws ObjectStoreException { Item resultFile = getChadoDBConverter().createItem("ResultFile"); resultFile.setAttribute("name", unversionName(fileName)); String url = null; if (fileName.startsWith("http") || fileName.startsWith("ftp")) { url = fileName; } else { if (relDccId != null) { // the file actually belongs to a related sub url = FILE_URL + relDccId + "/extracted/" + unversionName(fileName); } else { // note: on ftp site submission directories are named with the digits only String dccId = dccIdMap.get(submissionId).substring(DCC_PREFIX.length()); url = FILE_URL + dccId + "/extracted/" + unversionName(fileName); } } resultFile.setAttribute("url", url); resultFile.setAttribute("type", type); resultFile.setAttribute("direction", direction); resultFile.setReference("submission", submissionMap.get(submissionId).itemIdentifier); getChadoDBConverter().store(resultFile); } /** * @param fileName */ private String unversionName(String fileName) { // String versionRegex = "\\.*_*[WS|ws]+\\d\\d\\d+"; String versionRegex = "\\.*_*[Ww][Ss]+\\d\\d\\d+"; LOG.debug("FFFF: " + fileName + "--->>" + "====>" + fileName.replaceAll(versionRegex, "")); return fileName.replaceAll(versionRegex, ""); } private void createRelatedSubmissions(Connection connection) throws ObjectStoreException { Map<Integer, Set<String>> relatedSubs = new HashMap<Integer, Set<String>>(); for (Map.Entry<Integer, List<SubmissionReference>> entry : submissionRefs.entrySet()) { Integer submissionId = entry.getKey(); List<SubmissionReference> lref = entry.getValue(); Iterator<SubmissionReference> i = lref.iterator(); while (i.hasNext()) { SubmissionReference ref = i.next(); addRelatedSubmissions(relatedSubs, submissionId, ref.referencedSubmissionId); addRelatedSubmissions(relatedSubs, ref.referencedSubmissionId, submissionId); } LOG.debug("RRSS11 " + relatedSubs.size() + "|" + relatedSubs.keySet() + "|" + relatedSubs.values()); } for (Map.Entry<Integer, Set<String>> entry : relatedSubs.entrySet()) { ReferenceList related = new ReferenceList("relatedSubmissions", new ArrayList<String>(entry.getValue())); getChadoDBConverter().store(related, entry.getKey()); } } private void addRelatedSubmissions(Map<Integer, Set<String>> relatedSubs, Integer subId, Integer relatedId) { Integer subIdObjectId = submissionMap.get(subId).interMineObjectId; Set<String> itemIds = relatedSubs.get(subIdObjectId); if (itemIds == null) { itemIds = new HashSet<String>(); relatedSubs.put(submissionMap.get(subId).interMineObjectId, itemIds); } itemIds.add(submissionMap.get(relatedId).itemIdentifier); } //sub -> prot private void setSubmissionProtocolsRefs(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process Map<Integer, List<Integer>> submissionProtocolMap = new HashMap<Integer, List<Integer>>(); Iterator<Integer> apId = appliedProtocolMap.keySet().iterator(); while (apId.hasNext()) { Integer thisAP = apId.next(); AppliedProtocol ap = appliedProtocolMap.get(thisAP); Util.addToListMap(submissionProtocolMap, ap.submissionId, ap.protocolId); } Iterator<Integer> subs = submissionProtocolMap.keySet().iterator(); while (subs.hasNext()) { Integer thisSubmissionId = subs.next(); List<Integer> protocolChadoIds = submissionProtocolMap.get(thisSubmissionId); ReferenceList collection = new ReferenceList(); collection.setName("protocols"); for (Integer protocolChadoId : protocolChadoIds) { collection.addRefId(protocolItemIds.get(protocolChadoId)); } Integer storedSubmissionId = submissionMap.get(thisSubmissionId).interMineObjectId; getChadoDBConverter().store(collection, storedSubmissionId); // TODO use Item? // if the experiment type is not set in the db, check protocols if (!submissionWithExpTypeSet.contains(thisSubmissionId)) { LOG.warn("EXPERIMENT TYPE NOT SET in chado for submission: " + dccIdMap.get(thisSubmissionId)); // may need protocols from referenced submissions to work out experiment type List<Integer> relatedSubsProtocolIds = new ArrayList<Integer>( findProtocolIdsFromReferencedSubmissions(thisSubmissionId)); if (relatedSubsProtocolIds != null) { protocolChadoIds.addAll(relatedSubsProtocolIds); } String piName = submissionProjectMap.get(thisSubmissionId); setSubmissionExperimentType(storedSubmissionId, protocolChadoIds, piName); } } LOG.info("TIME setting submission-protocol references: " + (System.currentTimeMillis() - bT) + " ms"); } // store Submission.experimentType if it can be inferred from protocols private void setSubmissionExperimentType(Integer storedSubId, List<Integer> protocolIds, String piName) throws ObjectStoreException { Set<String> protocolTypes = new HashSet<String>(); for (Integer protocolId : protocolIds) { protocolTypes.add(protocolTypesMap.get(protocolId).trim()); } String experimentType = inferExperimentType(protocolTypes, piName); if (experimentType != null) { Attribute expTypeAtt = new Attribute("experimentType", experimentType); getChadoDBConverter().store(expTypeAtt, storedSubId); } } // Fetch protocols used to create reagents that are inputs to this submission, these are // found in referenced submissions private List<Integer> findProtocolIdsFromReferencedSubmissions(Integer submissionId) { List<Integer> protocolIds = new ArrayList<Integer>(); if (submissionRefs == null) { throw new RuntimeException("Attempting to access submissionRefs before it has been" + " populated, this method needs to be called after" + " processSubmissionProperties"); } List<SubmissionReference> refs = submissionRefs.get(submissionId); if (refs == null) { return protocolIds; } for (SubmissionReference subRef : refs) { LOG.info("RRSSprot: " + subRef.referencedSubmissionId + "|" + subRef.dataValue); for (AppliedProtocol aProtocol : findAppliedProtocolsFromReferencedSubmission(subRef)) { LOG.info("RRSSprotId: " + aProtocol.protocolId); protocolIds.add(aProtocol.protocolId); } } return protocolIds; } /** * Work out an experiment type give the combination of protocols used for the * submission. e.g. *immunoprecipitation + hybridization = chIP-chip * @param protocolTypes the protocol types * @param piName name of PI * @return a short experiment type */ protected String inferExperimentType(Set<String> protocolTypes, String piName) { // extraction + sequencing + reverse transcription - ChIP = RTPCR // extraction + sequencing - reverse transcription - ChIP = RNA-seq if (containsMatch(protocolTypes, "nucleic_acid_extraction|RNA extraction") && containsMatch(protocolTypes, "sequencing(_protocol)?") && !containsMatch(protocolTypes, "chromatin_immunoprecipitation")) { if (containsMatch(protocolTypes, "reverse_transcription")) { return "RTPCR"; } else { return "RNA-seq"; } } // reverse transcription + PCR + RACE = RACE // reverse transcription + PCR - RACE = RTPCR if (containsMatch(protocolTypes, "reverse_transcription") && containsMatch(protocolTypes, "PCR(_amplification)?")) { if (containsMatch(protocolTypes, "RACE")) { return "RACE"; } else { return "RTPCR"; } } // ChIP + hybridization = ChIP-chip // ChIP - hybridization = ChIP-seq if (containsMatch(protocolTypes, "(.*)?immunoprecipitation")) { if (containsMatch(protocolTypes, "hybridization")) { return "ChIP-chip"; } else { return "ChIP-seq"; } } // hybridization - ChIP = // Celniker: RNA tiling array // Henikoff: Chromatin-chip // otherwise: Tiling array if (containsMatch(protocolTypes, "hybridization") && !containsMatch(protocolTypes, "immunoprecipitation")) { if ("Celniker".equals(piName)) { return "RNA tiling array"; } else if ("Henikoff".equals(piName)) { return "Chromatin-chip"; } else { return "Tiling array"; } } // annotation = Computational annotation if (containsMatch(protocolTypes, "annotation")) { return "Computational annotation"; } // If we haven't found a type yet, and there is a growth protocol, then // this is probably an RNA sample creation experiment from Celniker if (containsMatch(protocolTypes, "grow")) { return "RNA sample creation"; } return null; } // utility method for looking up in a set by regular expression private boolean containsMatch(Set<String> testSet, String regex) { boolean matches = false; Pattern p = Pattern.compile(regex); for (String test : testSet) { Matcher m = p.matcher(test); if (m.matches()) { matches = true; } } return matches; } //sub -> exp private void setSubmissionExperimentRefs(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process // the map should contain only live submissions Iterator<String> exp = expSubMap.keySet().iterator(); while (exp.hasNext()) { String thisExp = exp.next(); List<Integer> subs = expSubMap.get(thisExp); Iterator<Integer> s = subs.iterator(); while (s.hasNext()) { Integer thisSubId = s.next(); Reference reference = new Reference(); reference.setName("experiment"); reference.setRefId(experimentIdRefMap.get(thisExp)); getChadoDBConverter().store(reference, submissionMap.get(thisSubId).interMineObjectId); } } LOG.info("TIME setting submission-experiment references: " + (System.currentTimeMillis() - bT) + " ms"); } //sub -> ef private void setSubmissionEFactorsRefsNEW(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process Iterator<Integer> subs = submissionEFactorMap2.keySet().iterator(); while (subs.hasNext()) { Integer thisSubmissionId = subs.next(); List<String[]> eFactors = submissionEFactorMap2.get(thisSubmissionId); LOG.info("EF REFS: " + thisSubmissionId + " (" + eFactors.get(0) + ")"); Iterator<String[]> ef = eFactors.iterator(); ReferenceList collection = new ReferenceList(); collection.setName("experimentalFactors"); while (ef.hasNext()) { String[] thisEF = ef.next(); String efName = thisEF[0]; String efType = thisEF[1]; String key = efName + efType; collection.addRefId(eFactorIdRefMap.get(efName + efType)); LOG.info("EF REFS!!: ->" + efName + " ref: " + eFactorIdRefMap.get(key)); } if (!collection.equals(null)) { LOG.info("EF REFS: ->" + thisSubmissionId + "|" + submissionMap.get(thisSubmissionId).interMineObjectId); getChadoDBConverter().store(collection, submissionMap.get(thisSubmissionId).interMineObjectId); } } LOG.info("TIME setting submission-exFactors references: " + (System.currentTimeMillis() - bT) + " ms"); } //sub -> ef private void setSubmissionEFactorsRefs(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process Iterator<Integer> subs = submissionEFactorMap.keySet().iterator(); while (subs.hasNext()) { Integer thisSubmissionId = subs.next(); List<String> eFactors = submissionEFactorMap.get(thisSubmissionId); LOG.info("EF REFS: " + thisSubmissionId + " (" + eFactors + ")"); Iterator<String> ef = eFactors.iterator(); ReferenceList collection = new ReferenceList(); collection.setName("experimentalFactors"); while (ef.hasNext()) { String currentEF = ef.next(); collection.addRefId(eFactorIdRefMap.get(currentEF)); LOG.debug("EF REFS: ->" + currentEF + " ref: " + eFactorIdRefMap.get(currentEF)); } if (!collection.equals(null)) { LOG.info("EF REFS: ->" + thisSubmissionId + "|" + submissionMap.get(thisSubmissionId).interMineObjectId); getChadoDBConverter().store(collection, submissionMap.get(thisSubmissionId).interMineObjectId); } } LOG.info("TIME setting submission-exFactors references: " + (System.currentTimeMillis() - bT) + " ms"); } //sub -> publication private void setSubmissionPublicationRefs(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process Iterator<Integer> subs = publicationIdMap.keySet().iterator(); while (subs.hasNext()) { Integer thisSubmissionId = subs.next(); Reference reference = new Reference(); reference.setName("publication"); reference.setRefId(publicationIdRefMap.get(thisSubmissionId)); getChadoDBConverter().store(reference, submissionMap.get(thisSubmissionId).interMineObjectId); } LOG.info("TIME setting submission-publication references: " + (System.currentTimeMillis() - bT) + " ms"); } /** * to store references between applied protocols and their input data * reverse reference: data -> next appliedProtocols * and between applied protocols and their output data * reverse reference: data -> previous appliedProtocols * (many to many) */ private void setDAGRefs(Connection connection) throws ObjectStoreException { long bT = System.currentTimeMillis(); // to monitor time spent in the process for (Integer thisAP : appliedProtocolMap.keySet()) { AppliedProtocol ap = appliedProtocolMap.get(thisAP); if (!ap.inputs.isEmpty()) { ReferenceList collection = new ReferenceList("inputs"); for (Integer inputId : ap.inputs) { collection.addRefId(appliedDataMap.get(inputId).itemIdentifier); //if (collection.getRefIds().contains(null)) { // LOG.info("Applied Protocol " + thisAP + " of protocol " + ap.protocolId // + " and inputs " + ap.inputs ); // } } if (collection.getRefIds().contains(null)) { LOG.warn("Applied Protocol " + thisAP + " has only inputs not corresponding" + " to any output in previous protocol" + " and cannot be linked in the DAG."); continue; } getChadoDBConverter().store(collection, appliedProtocolIdMap.get(thisAP)); } if (!ap.outputs.isEmpty()) { ReferenceList collection = new ReferenceList("outputs"); for (Integer outputId : ap.outputs) { collection.addRefId(appliedDataMap.get(outputId).itemIdentifier); } if (collection.getRefIds().contains(null)) { LOG.warn("Applied Protocol " + thisAP + " has null AD.itemidentifiers"); continue; } getChadoDBConverter().store(collection, appliedProtocolIdMap.get(thisAP)); } } LOG.info("TIME setting DAG references: " + (System.currentTimeMillis() - bT) + " ms"); } /** * maps from chado field names to ours. * * TODO: check if up to date * * if a field is not needed it is marked with NOT_TO_BE_LOADED * a check is performed and fields unaccounted for are logged. */ private static final Map<String, String> FIELD_NAME_MAP = new HashMap<String, String>(); private static final String NOT_TO_BE_LOADED = "this is ; illegal - anyway"; static { // experiment // swapping back to uniquename in experiment table // FIELD_NAME_MAP.put("Investigation Title", "title"); FIELD_NAME_MAP.put("Investigation Title", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Project", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Project URL", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Lab", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Experiment Description", "description"); FIELD_NAME_MAP.put("Experimental Design", "design"); FIELD_NAME_MAP.put("Experimental Factor Type", "factorType"); FIELD_NAME_MAP.put("Experimental Factor Name", "factorName"); FIELD_NAME_MAP.put("Quality Control Type", "qualityControl"); FIELD_NAME_MAP.put("Replicate Type", "replicate"); FIELD_NAME_MAP.put("Date of Experiment", "experimentDate"); FIELD_NAME_MAP.put("Public Release Date", "publicReleaseDate"); FIELD_NAME_MAP.put("Embargo Date", "embargoDate"); FIELD_NAME_MAP.put("dcc_id", "DCCid"); FIELD_NAME_MAP.put("replaces", "replacesSubmission"); FIELD_NAME_MAP.put("PubMed ID", "pubMedId"); FIELD_NAME_MAP.put("Person First Name", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Mid Initials", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Last Name", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Affiliation", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Address", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Phone", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Email", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Person Roles", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Data Type", "category"); FIELD_NAME_MAP.put("Assay Type", "experimentType"); FIELD_NAME_MAP.put("Release Reservations", "notice"); FIELD_NAME_MAP.put("RNAsize", "RNAsize"); // these are names in name/value couples for ReadCount FIELD_NAME_MAP.put("Total Read Count", "totalReadCount"); FIELD_NAME_MAP.put("Total Mapped Read Count", "totalMappedReadCount"); FIELD_NAME_MAP.put("Multiply Mapped Read Count", "multiplyMappedReadCount"); FIELD_NAME_MAP.put("Uniquely Mapped Read Count", "uniquelyMappedReadCount"); // data: parameter values FIELD_NAME_MAP.put("Array Data File", "arrayDataFile"); FIELD_NAME_MAP.put("Array Design REF", "arrayDesignRef"); FIELD_NAME_MAP.put("Derived Array Data File", "derivedArrayDataFile"); FIELD_NAME_MAP.put("Result File", "resultFile"); // protocol FIELD_NAME_MAP.put("Protocol Type", "type"); FIELD_NAME_MAP.put("url protocol", "url"); FIELD_NAME_MAP.put("Characteristics", "characteristics"); FIELD_NAME_MAP.put("species", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("references", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("lab", NOT_TO_BE_LOADED); FIELD_NAME_MAP.put("Comment", NOT_TO_BE_LOADED); } /** * to store identifiers in project maps. * @param i * @param chadoId * @param intermineObjectId * @throws ObjectStoreException */ private void storeInProjectMaps(Item i, String surnamePI, Integer intermineObjectId) throws ObjectStoreException { if ("Project".equals(i.getClassName())) { projectIdMap.put(surnamePI, intermineObjectId); projectIdRefMap.put(surnamePI, i.getIdentifier()); } else { throw new IllegalArgumentException( "Type mismatch: expecting Project, getting " + i.getClassName().substring(37) + " with intermineObjectId = " + intermineObjectId + ", project = " + surnamePI); } debugMap.put(i.getIdentifier(), i.getClassName()); } /** * to store identifiers in lab maps. * @param i * @param chadoId * @param intermineObjectId * @throws ObjectStoreException */ private void storeInLabMaps(Item i, String labName, Integer intermineObjectId) throws ObjectStoreException { if ("Lab".equals(i.getClassName())) { labIdMap.put(labName, intermineObjectId); labIdRefMap.put(labName, i.getIdentifier()); } else { throw new IllegalArgumentException( "Type mismatch: expecting Lab, getting " + i.getClassName().substring(37) + " with intermineObjectId = " + intermineObjectId + ", lab = " + labName); } debugMap.put(i.getIdentifier(), i.getClassName()); } private void mapSubmissionAndData(Integer submissionId, Integer dataId) { Util.addToListMap(submissionDataMap, submissionId, dataId); dataSubmissionMap.put(dataId, submissionId); } /** * ===================== * UTILITY METHODS * ===================== */ /** * method to wrap the execution of a query, without log info * @param connection * @param query * @return the result set * @throws SQLException */ private ResultSet doQuery(Connection connection, String query) throws SQLException { Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); return res; } /** * method to wrap the execution of a query with log info) * @param connection * @param query * @param comment for not logging * @return the result set * @throws SQLException */ private ResultSet doQuery(Connection connection, String query, String comment) throws SQLException { // we could avoid passing comment if we trace the calling method // new Throwable().fillInStackTrace().getStackTrace()[1].getMethodName() LOG.info("executing: " + query); long bT = System.currentTimeMillis(); Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); LOG.info("QUERY TIME " + comment + ": " + (System.currentTimeMillis() - bT) + " ms"); return res; } /** * adds an element to a list which is the value of a map * @param m the map (<Integer, List<String>>) * @param key the key for the map * @param value the list */ private static void addToMap(Map<Integer, List<String>> m, Integer key, String value) { List<String> ids = new ArrayList<String>(); if (m.containsKey(key)) { ids = m.get(key); } if (!ids.contains(value)) { ids.add(value); m.put(key, ids); } } }