Java tutorial
package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2002-2011 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.keyvalue.MultiKey;
import org.apache.commons.collections.map.MultiKeyMap;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.intermine.bio.chado.ChadoCV;
import org.intermine.bio.chado.ChadoCVFactory;
import org.intermine.bio.chado.ChadoCVTerm;
import org.intermine.bio.chado.config.ConfigAction;
import org.intermine.bio.chado.config.CreateSynonymAction;
import org.intermine.bio.chado.config.SetFieldConfigAction;
import org.intermine.bio.util.OrganismData;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.util.IntPresentSet;
import org.intermine.util.XmlUtil;
import org.intermine.xml.full.Item;
import org.intermine.xml.full.Reference;
import org.intermine.xml.full.ReferenceList;

/**
 * A converter for chado that handles FlyBase specific configuration.
 * @author Kim Rutherford
 */
public class FlyBaseProcessor extends SequenceProcessor {
    /**
     * The cv.name for the wild type class term. For chromosome_structure_variations, used to
     * identify the "Feature type" from the "Class of aberration" section of a FlyBase aberration
     * page.
     */
    private static final String WT_CLASS_CVTERM = "wt_class";

    private static final String FLYBASE_DB_NAME = "FlyBase";

    /**
     * The cv.name for the FlyBase miscellaneous CV.
     */
    protected static final String FLYBASE_MISCELLANEOUS_CV = FLYBASE_DB_NAME + " miscellaneous CV";

    /**
     * The cv.name for the sequence ontology (SO) CV in the FlyBase chado database.
     */
    protected static final String FLYBASE_SO_CV_NAME = "SO";

    private static final String FLYBASE_ANATOMY_TERM_PREFIX = "FBbt";

    // a pattern that matches attributes stored in FlyBase properties, eg. "@FBcv0000289:hypomorph@"
    private static final String FLYBASE_PROP_ATTRIBUTE_PATTERN = "@([^@]+)@";

    // interactions use this - UNKNOWN
    private static final String RELATIONSHIP_TYPE = "MI:0499";

    private static final String DEFAULT_ROLE = "unspecified";

    /**
     * A ConfigAction that overrides processValue() to change FlyBase attribute tags
     * (like "@FBcv0000289:hypomorph@") to text like: "hypomorph"
     * @author Kim Rutherford
     */
    private class AlleleClassSetFieldAction extends SetFieldConfigAction {
        /**
         * Create a new AlleleClassSetFieldAction
         * @param fieldName the fieldName to process with this object.
         */
        AlleleClassSetFieldAction(String fieldName) {
            super(fieldName);
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public String processValue(String value) {
            Pattern p = Pattern.compile(FLYBASE_PROP_ATTRIBUTE_PATTERN);
            Matcher m = p.matcher(value);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String field = m.group(1);
                int colonPos = field.indexOf(':');
                if (colonPos == -1) {
                    m.appendReplacement(sb, field);
                } else {
                    String text = field.substring(colonPos + 1);
                    m.appendReplacement(sb, text);
                }
            }
            m.appendTail(sb);
            return sb.toString();
        }
    }

    private static final Logger LOG = Logger.getLogger(FlyBaseProcessor.class);

    // the configuration for this processor, set when getConfig() is called the first time
    private final Map<Integer, MultiKeyMap> config = new HashMap<Integer, MultiKeyMap>();

    // a set of feature_ids for those genes that have a location in the featureloc table, set by
    // the constructor
    private final IntPresentSet locatedGeneIds;

    // a map from the uniquename of each allele to its item identifier
    private Map<String, String> alleleIdMap = new HashMap<String, String>();

    // a map from the uniquename of each cdna clone to its item identifier
    private Map<String, FeatureData> cdnaCloneMap = new HashMap<String, FeatureData>();

    // an object representing the FlyBase miscellaneous CV
    private ChadoCV flyBaseMiscCv = null;

    // an object representing the sequence ontology, as stored in the FlyBase chado database
    private ChadoCV sequenceOntologyCV = null;

    // a map from mutagen description to Mutagen Item identifier
    private Map<String, String> mutagensMap = new HashMap<String, String>();

    // a map from featureId to seqlen
    // private Map<Integer, Integer> cdnaLengths = null;

    private final Map<Integer, Integer> chromosomeStructureVariationTypes;

    private Map<String, String> interactionExperiments = new HashMap<String, String>();

    private static final String LOCATED_GENES_TEMP_TABLE_NAME = "intermine_located_genes_temp";
    private static final String ALLELE_TEMP_TABLE_NAME = "intermine_flybase_allele_temp";
    private static final String INSERTION_TEMP_TABLE_NAME = "intermine_flybase_insertion_temp";

    // pattern to match the names of Exelixis insertions
    // - matches "f07705" in "PBac{WH}f07705"
    // - matches "f07705" in "PBac{WH}tam[f07705]"
    private static final Pattern PB_INSERTION_PATTERN =
        Pattern.compile(".*\\{.*\\}(?:.*\\[)?([def]\\d+)(?:\\])?");

    private static final Map<String, String> CHROMOSOME_STRUCTURE_VARIATION_SO_MAP =
        new HashMap<String, String>();

    private final Map<String, FeatureData> proteinFeatureDataMap = new HashMap<String, FeatureData>();

    static {
        CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.put("chromosomal_deletion", "ChromosomalDeletion");
        CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.put("chromosomal_duplication",
                "ChromosomalDuplication");
        CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.put("chromosomal_inversion", "ChromosomalInversion");
        CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.put("chromosomal_translocation",
                "ChromosomalTranslocation");
        CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.put("transposition", "ChromosomalTransposition");
    }

    private static final String CHROMOSOME_STRUCTURE_VARIATION_SO_NAME =
        "chromosome_structure_variation";

    /**
     * Create a new FlyBaseProcessor.
     * @param chadoDBConverter the converter that created this object
     */
    public FlyBaseProcessor(ChadoDBConverter chadoDBConverter) {
        super(chadoDBConverter);
        Connection connection = getChadoDBConverter().getConnection();
        try {
            flyBaseMiscCv = getFlyBaseMiscCV(connection);
        } catch (SQLException e) {
            throw new RuntimeException("can't execute query for flybase cv terms", e);
        }
        try {
            sequenceOntologyCV = getFlyBaseSequenceOntologyCV(connection);
        } catch (SQLException e) {
            throw new RuntimeException("can't execute query for so cv terms", e);
        }
        try {
            createLocatedGenesTempTable(connection);
        } catch (SQLException e) {
            throw new RuntimeException("can't execute query for located genes", e);
        }
        locatedGeneIds = getLocatedGeneIds(connection);
        chromosomeStructureVariationTypes = getChromosomeStructureVariationTypes(connection);
        // try {
        //     cdnaLengths = makeCDNALengthMap(connection);
        // } catch (SQLException e) {
        //     e.printStackTrace();
        // }
    }

    /**
     * @param connection database connection
     * @return map of feature_id to seqlen
     */
    // protected Map<Integer, Integer> getLengths(Connection connection) {
    //     if (cdnaLengths == null) {
    //         try {
    //             cdnaLengths = makeCDNALengthMap(connection);
    //         } catch (SQLException e) {
    //             // TODO Auto-generated catch block
    //             e.printStackTrace();
    //         }
    //     }
    //     return cdnaLengths;
    // }

    /**
     * Return a map from chromosome_structure_variation feature_ids to the cvterm_id of the
     * associated cvtermprop. This is needed because the exact type of the
     * chromosome_structure_variation objects is not used as the type_id of the feature, instead
     * it's stored in the cvtermprop table.
     */
    private Map<Integer, Integer> getChromosomeStructureVariationTypes(Connection connection) {
        Map<Integer, Integer> retVal = new HashMap<Integer, Integer>();
        ResultSet res;
        try {
            res = getChromosomeStructureVariationResultSet(connection);
        } catch (SQLException e) {
            throw new RuntimeException("can't execute query for chromosome_structure_variation "
                    + "types", e);
        }
        try {
            while (res.next()) {
                int featureId = res.getInt("feature_id");
                int cvtermId = res.getInt("cvterm_id");
                retVal.put(new Integer(featureId), new Integer(cvtermId));
            }
        } catch (SQLException e) {
            throw new RuntimeException("problem while reading chromosome_structure_variation "
                    + "types", e);
        }
        return retVal;
    }

    /**
     * Return the results of running a query for the chromosome_structure_variation feature types.
     * @param connection the connection
     * @return the results
     * @throws SQLException if there is a database problem
     */
    protected ResultSet getChromosomeStructureVariationResultSet(Connection connection)
        throws SQLException {
        String query = " SELECT feature.feature_id, cvterm.cvterm_id"
            + " FROM feature, feature_cvterm, cvterm feature_type, cvterm, cv,"
            + " feature_cvtermprop, cvterm prop_term"
            + " WHERE feature.type_id = feature_type.cvterm_id"
            + " AND feature_type.name = '" + CHROMOSOME_STRUCTURE_VARIATION_SO_NAME + "' "
            + " AND feature_cvterm.feature_id = feature.feature_id"
            + " AND feature_cvterm.cvterm_id = cvterm.cvterm_id AND cvterm.cv_id = cv.cv_id"
            + " AND cv.name = 'SO' "
            + " AND feature_cvtermprop.feature_cvterm_id = feature_cvterm.feature_cvterm_id"
            + " AND feature_cvtermprop.type_id = prop_term.cvterm_id AND prop_term.name = '"
            + WT_CLASS_CVTERM + "'";
        LOG.info("executing getChromosomeStructureVariationResultSet(): " + query);
        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    /**
     * Return a set of ids of those genes that have a location in the featureloc table.
*/ private IntPresentSet getLocatedGeneIds(Connection connection) { IntPresentSet retVal = new IntPresentSet(); ResultSet res; try { res = getLocatedGenesResultSet(connection); } catch (SQLException e) { throw new RuntimeException("can't execute query for located genes", e); } try { while (res.next()) { int featureId = res.getInt("feature_id"); retVal.set(featureId, true); } } catch (SQLException e) { throw new RuntimeException("problem while reading located genes", e); } return retVal; } /** * Create a temporary table containing the ids of the located genes. This is a protected * method so that it can be overridden for testing * @param connection the Connection * @throws SQLException if there is a database problem */ protected void createLocatedGenesTempTable(Connection connection) throws SQLException { String organismConstraint = getOrganismConstraint(); String orgConstraintForQuery = ""; if (!StringUtils.isEmpty(organismConstraint)) { orgConstraintForQuery = " AND " + organismConstraint; } String query = "CREATE TEMPORARY TABLE " + LOCATED_GENES_TEMP_TABLE_NAME + " AS SELECT feature.feature_id FROM feature, cvterm" + " WHERE feature.type_id = cvterm.cvterm_id" + " AND cvterm.name = 'gene' " + " AND NOT feature.is_obsolete " + " AND feature.feature_id IN " + " (SELECT l.feature_id " + " FROM featureloc l, feature c " + " WHERE l.srcfeature_id = c.feature_id and NOT c.is_obsolete)" + orgConstraintForQuery; Statement stmt = connection.createStatement(); LOG.info("executing createLocatedGenesTempTable(): " + query); stmt.execute(query); String idIndexQuery = "CREATE INDEX " + LOCATED_GENES_TEMP_TABLE_NAME + "_feature_index ON " + LOCATED_GENES_TEMP_TABLE_NAME + "(feature_id)"; LOG.info("executing: " + idIndexQuery); stmt.execute(idIndexQuery); String analyze = "ANALYZE " + LOCATED_GENES_TEMP_TABLE_NAME; LOG.info("executing: " + analyze); stmt.execute(analyze); } /** * Create a temporary table of allele feature_ids. The table will only have allele of genes * with locations. * @param connection the connection * @throws SQLException if there is a database problem */ protected void createAllelesTempTable(Connection connection) throws SQLException { String organismConstraint = getOrganismConstraint(); String orgConstraintForQuery = ""; if (!StringUtils.isEmpty(organismConstraint)) { orgConstraintForQuery = " AND " + organismConstraint; } String query = " CREATE TEMPORARY TABLE " + ALLELE_TEMP_TABLE_NAME + " AS SELECT feature_id" + " FROM feature, cvterm feature_type " + " WHERE feature_type.name = 'gene'" + " AND type_id = feature_type.cvterm_id" + " AND uniquename LIKE 'FBal%'" + " AND NOT feature.is_obsolete" + " AND feature_id IN (SELECT feature_id FROM feature WHERE " + getLocatedGeneAllesSql() + ")" + orgConstraintForQuery; Statement stmt = connection.createStatement(); LOG.info("executing createAllelesTempTable(): " + query); stmt.execute(query); String idIndexQuery = "CREATE INDEX " + ALLELE_TEMP_TABLE_NAME + "_feature_index ON " + ALLELE_TEMP_TABLE_NAME + "(feature_id)"; LOG.info("executing: " + idIndexQuery); stmt.execute(idIndexQuery); String analyze = "ANALYZE " + ALLELE_TEMP_TABLE_NAME; LOG.info("executing: " + analyze); stmt.execute(analyze); } /** * Create a temporary table from pairs of insertions (eg. "FBti0027974" => "FBti0023081") * containing the feature_ids of the pair (the object_id, subject_id in the relation table) * and the fmin and fmax of the first insertion in the pair (ie. the progenitor / object from * the feature_relationship table). 
* The second in the pair is the "Modified descendant of" the first. The pairs are found using * the 'modified_descendant_of' relation type. All insertions are from DrosDel. * @param connection the connection * @throws SQLException if there is a database problem */ protected void createInsertionTempTable(Connection connection) throws SQLException { String query = " CREATE TEMPORARY TABLE " + INSERTION_TEMP_TABLE_NAME + " AS SELECT obj.feature_id AS obj_id, sub.feature_id AS sub_id," + " obj_loc.fmin, obj_loc.fmax," + " obj_loc.srcfeature_id as chr_feature_id" + " FROM feature sub, cvterm sub_type, feature_relationship rel, cvterm rel_type, " + " feature obj, cvterm obj_type, featureloc obj_loc" + " WHERE sub.feature_id = rel.subject_id AND rel.object_id = obj.feature_id" + " AND sub_type.cvterm_id = sub.type_id AND obj_type.cvterm_id = obj.type_id" + " AND sub_type.name = 'transposable_element_insertion_site' " + " AND obj_type.name = 'transposable_element_insertion_site' " + " AND rel.type_id = rel_type.cvterm_id" + " AND rel_type.name = 'modified_descendant_of'" + " AND sub.feature_id in (select feature_id from feature_pub where pub_id =" + " (SELECT pub_id FROM pub" + " WHERE title = " + "'The DrosDel collection: a set of P-element insertions for " + "generating custom chromosomal aberrations in Drosophila melanogaster.')) " + " AND obj.feature_id = obj_loc.feature_id"; Statement stmt = connection.createStatement(); LOG.info("executing createInsertionTempTable(): " + query); stmt.execute(query); String idIndexQuery = "CREATE INDEX " + INSERTION_TEMP_TABLE_NAME + "index ON " + INSERTION_TEMP_TABLE_NAME + "(sub_id)"; LOG.info("executing: " + idIndexQuery); stmt.execute(idIndexQuery); String analyze = "ANALYZE " + INSERTION_TEMP_TABLE_NAME; LOG.info("executing: " + analyze); stmt.execute(analyze); } /** * Get ChadoCV object representing the FlyBase misc cv. * This is a protected method so that it can be overriden for testing * @param connection the database Connection * @return the cv * @throws SQLException if there is a database problem */ protected ChadoCV getFlyBaseMiscCV(Connection connection) throws SQLException { ChadoCVFactory cvFactory = new ChadoCVFactory(connection); return cvFactory.getChadoCV(FLYBASE_MISCELLANEOUS_CV); } /** * Get ChadoCV object representing SO from FlyBase. 
 * This is a protected method so that it can be overridden for testing
     * @param connection the database Connection
     * @return the cv
     * @throws SQLException if there is a database problem
     */
    protected ChadoCV getFlyBaseSequenceOntologyCV(Connection connection) throws SQLException {
        ChadoCVFactory cvFactory = new ChadoCVFactory(connection);
        return cvFactory.getChadoCV(FLYBASE_SO_CV_NAME);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected Integer store(Item feature, int taxonId) throws ObjectStoreException {
        processItem(feature, new Integer(taxonId));
        Integer itemId = super.store(feature, taxonId);
        return itemId;
    }

    /**
     * note: featureId is needed only by modMine
     * {@inheritDoc}
     */
    @Override
    protected Item makeLocation(int start, int end, int strand, FeatureData srcFeatureData,
            FeatureData featureData, int taxonId, int featureId) throws ObjectStoreException {
        Item location = super.makeLocation(start, end, strand, srcFeatureData, featureData,
                taxonId, 0);
        processItem(location, new Integer(taxonId));
        return location;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected Item createSynonym(FeatureData fdat, String identifier) throws ObjectStoreException {
        Item synonym = super.createSynonym(fdat, identifier);
        /* synonym can be null if it's been created earlier. this would happen only if
         * the synonym was created when another protein was created in favour of this one. */
        if (synonym != null) {
            OrganismData od = fdat.getOrganismData();
            processItem(synonym, new Integer(od.getTaxonId()));
        }
        return synonym;
    }

    /**
     * Return from chado the feature_ids of the genes with entries in the featureloc table.
     * @param connection the db connection
     * @return the SQL result set
     * @throws SQLException if a database problem occurs
     */
    protected ResultSet getLocatedGenesResultSet(Connection connection) throws SQLException {
        String query = getLocatedGenesSql();
        LOG.info("executing getLocatedGenesResultSet(): " + query);
        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    /**
     * Return a query that gets the feature_ids of genes that have locations.
*/ private String getLocatedGenesSql() { return "SELECT feature_id FROM " + LOCATED_GENES_TEMP_TABLE_NAME; } /** * {@inheritDoc} */ @Override protected Map<MultiKey, List<ConfigAction>> getConfig(int taxonId) { MultiKeyMap map = config.get(new Integer(taxonId)); if (map == null) { map = new MultiKeyMap(); config.put(new Integer(taxonId), map); // synomym configuration example: for features of class "Gene", if the type name of // the synonym is "fullname" and "is_current" is true, set the "name" attribute of // the new Gene to be this synonym and then make a Synonym object map.put(new MultiKey("synonym", "Gene", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "Gene", "fullname", Boolean.FALSE), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("synonym", "Gene", "symbol", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("symbol"))); map.put(new MultiKey("synonym", "Gene", "symbol", Boolean.FALSE), Arrays.asList(CREATE_SYNONYM_ACTION)); // dbxref table configuration example: for features of class "Gene", where the // db.name is "FlyBase Annotation IDs" and "is_current" is true, set the // "secondaryIdentifier" attribute of the new Gene to be this dbxref and then make a // Synonym object map.put(new MultiKey("dbxref", "Gene", FLYBASE_DB_NAME + " Annotation IDs", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("secondaryIdentifier"))); map.put(new MultiKey("dbxref", "Gene", FLYBASE_DB_NAME + " Annotation IDs", Boolean.FALSE), Arrays.asList(CREATE_SYNONYM_ACTION)); // null for the "is_current" means either TRUE or FALSE is OK. map.put(new MultiKey("dbxref", "Gene", FLYBASE_DB_NAME, null), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("dbxref", "MRNA", FLYBASE_DB_NAME + " Annotation IDs", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("secondaryIdentifier"))); map.put(new MultiKey("dbxref", "TransposableElementInsertionSite", "drosdel", null), Arrays.asList(new SetFieldConfigAction("symbol"))); map.put(new MultiKey("synonym", "ChromosomeStructureVariation", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "ChromosomalDeletion", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "ChromosomalDuplication", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "ChromosomalInversion", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "ChromosomalTranslocation", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "ChromosomalTransposition", "fullname", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("name"))); map.put(new MultiKey("synonym", "MRNA", "symbol", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("symbol"))); map.put(new MultiKey("synonym", "MRNA", "symbol", Boolean.FALSE), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("dbxref", "MRNA", FLYBASE_DB_NAME + " Annotation IDs", null), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("dbxref", "MRNA", FLYBASE_DB_NAME, null), Arrays.asList(CREATE_SYNONYM_ACTION)); // set the Allele.gene when there is an alleleof relationship between Allele and Gene map.put(new MultiKey("relationship", "Allele", "alleleof", "Gene"), Arrays.asList(new SetFieldConfigAction("gene"))); // Set the protein reference in the MRNA - "rev_relationship" 
means that the // relationship table actually has Protein, producedby, MRNA. We configure like // this so we can set a reference in MRNA rather than protein map.put(new MultiKey("rev_relationship", "MRNA", "producedby", "Protein"), Arrays.asList(new SetFieldConfigAction("protein"))); map.put(new MultiKey("relationship", "CDNAClone", "derived_assoc_cdna_clone", "Gene"), Arrays.asList(new SetFieldConfigAction("gene"))); map.put(new MultiKey("relationship", "Gene", "producedby", "Protein"), Arrays.asList(new SetFieldConfigAction("proteins"))); // featureprop configuration example: for features of class "Gene", if the type name // of the prop is "cyto_range", set the "cytoLocation" attribute of the // new Gene to be this property map.put(new MultiKey("prop", "Gene", "cyto_range"), Arrays.asList(new SetFieldConfigAction("cytoLocation"))); map.put(new MultiKey("prop", "Gene", "symbol"), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("prop", "TransposableElementInsertionSite", "curated_cytological_location"), Arrays.asList(new SetFieldConfigAction("cytoLocation"))); ConfigAction alleleClassConfigAction = new AlleleClassSetFieldAction("alleleClass"); map.put(new MultiKey("prop", "Allele", "promoted_allele_class"), Arrays.asList(alleleClassConfigAction)); // library config example: for features of class "CDNAClone", if the type name // of the library is "stage", set the "stage" attribute of the // new CDNAClone to be this property map.put(new MultiKey("library", "CDNAClone", "stage"), Arrays.asList(new SetFieldConfigAction("stage"))); // anatomy term config example: for features of class "CDNAClone" if there is an // anatomy term, set a reference in CDNAClone.tissueSource // See #2173 // map.put(new MultiKey("anatomyterm", "CDNAClone", null), // Arrays.asList(new SetFieldConfigAction("tissueSource"))); // feature_cvterm example for Transposition: we create a featureTerms collection in the // Transposition objects containing SequenceOntologyTerm objects. For the current // feature we create one SequenceOntologyTerm object for each associated "SO" cvterm. // We set the "name" field of the SequenceOntologyTerm to be the name from the cvterm // table. 
// TODO fixme // List<String> chromosomeStructureVariationClassNames = // Arrays.asList("ChromosomeStructureVariation", "ChromosomalDeletion", // "ChromosomalDuplication", "ChromosomalInversion", // "ChromosomalTranslocation", "ChromosomalTransposition"); // for (String className: chromosomeStructureVariationClassNames) { // map.put(new MultiKey("cvterm", className, "SO"), // Arrays.asList(new CreateCollectionAction("SOTerm", "abberationSOTerms", // "name", true))); // } // feature configuration example: for features of class "Exon", from "FlyBase", // set the Gene.symbol to be the "name" field from the chado feature map.put(new MultiKey("feature", "Exon", FLYBASE_DB_NAME, "name"), Arrays.asList(new SetFieldConfigAction("symbol"))); map.put(new MultiKey("feature", "Allele", FLYBASE_DB_NAME, "name"), Arrays.asList(new SetFieldConfigAction("symbol"))); // DO_NOTHING_ACTION means skip the name from this feature map.put(new MultiKey("feature", "Chromosome", FLYBASE_DB_NAME, "name"), Arrays.asList(DO_NOTHING_ACTION)); map.put(new MultiKey("feature", "ChromosomeBand", FLYBASE_DB_NAME, "name"), Arrays.asList(DO_NOTHING_ACTION)); map.put(new MultiKey("feature", "TransposableElementInsertionSite", FLYBASE_DB_NAME, "name"), Arrays.asList(new SetFieldConfigAction("symbol", PB_INSERTION_PATTERN), new SetFieldConfigAction("secondaryIdentifier"))); map.put(new MultiKey("feature", "Gene", FLYBASE_DB_NAME, "uniquename"), Arrays.asList(new SetFieldConfigAction("primaryIdentifier"))); map.put(new MultiKey("feature", "Gene", FLYBASE_DB_NAME, "name"), Arrays.asList(DO_NOTHING_ACTION)); map.put(new MultiKey("feature", "ChromosomeStructureVariation", FLYBASE_DB_NAME, "name"), Arrays.asList(new SetFieldConfigAction("secondaryIdentifier"))); // just make a Synonym because the secondaryIdentifier and the symbol are set from the // dbxref and synonym tables map.put(new MultiKey("feature", "MRNA", FLYBASE_DB_NAME, "name"), Arrays.asList(new CreateSynonymAction())); map.put(new MultiKey("feature", "PointMutation", FLYBASE_DB_NAME, "uniquename"), Arrays.asList(new SetFieldConfigAction("name"), new SetFieldConfigAction("primaryIdentifier"))); // name isn't set in flybase: map.put(new MultiKey("feature", "PointMutation", FLYBASE_DB_NAME, "name"), Arrays.asList(DO_NOTHING_ACTION)); map.put(new MultiKey("dbxref", "Protein", FLYBASE_DB_NAME + " Annotation IDs", Boolean.TRUE), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("feature", "Protein", FLYBASE_DB_NAME, "name"), Arrays.asList(CREATE_SYNONYM_ACTION)); map.put(new MultiKey("feature", "Protein", FLYBASE_DB_NAME, "uniquename"), Arrays.asList(new SetFieldConfigAction("secondaryIdentifier"))); map.put(new MultiKey("dbxref", "Protein", "GB_protein", Boolean.TRUE), Arrays.asList(new SetFieldConfigAction("genbankIdentifier"), CREATE_SYNONYM_ACTION)); // transposable_element and natural_transposable_element map.put(new MultiKey("feature", "TransposableElement", FLYBASE_DB_NAME, "name"), Arrays .asList(new SetFieldConfigAction("secondaryIdentifier"), new SetFieldConfigAction("symbol"))); map.put(new MultiKey("feature", "NaturalTransposableElement", FLYBASE_DB_NAME, "name"), Arrays .asList(new SetFieldConfigAction("secondaryIdentifier"), new SetFieldConfigAction("symbol"))); map.put(new MultiKey("relationship", "TransposableElement", "producedby", "NaturalTransposableElement"), Arrays.asList(new SetFieldConfigAction("insertedElement"))); map.put(new MultiKey("synonym", "NaturalTransposableElement", "fullname", Boolean.TRUE), Arrays.asList(new 
SetFieldConfigAction("name"))); } return map; } /** * {@inheritDoc} */ @Override protected String getExtraFeatureConstraint() { return "NOT ((cvterm.name = 'golden_path_region'" + " OR cvterm.name = 'ultra_scaffold')" + " AND (uniquename LIKE 'Unknown_%' OR uniquename LIKE '%_groupMISC'))" + " AND " + getLocatedGeneAllesSql(); } /** * Query that returns only allele of located genes. */ private String getLocatedGeneAllesSql() { return "(NOT (uniquename LIKE 'FBal%') OR feature_id IN" + " (SELECT subject_id" + " FROM feature_relationship, cvterm" + " WHERE type_id = cvterm.cvterm_id" + " AND cvterm.name = 'alleleof'" + " AND object_id IN (" + getLocatedGenesSql() + ")))"; } /** * {@inheritDoc} */ @Override protected Item makeFeature(Integer featureId, String chadoFeatureType, String interMineType, String name, String uniqueName, int seqlen, int taxonId) { String realInterMineType = interMineType; if ("protein".equals(chadoFeatureType) && !uniqueName.startsWith("FBpp")) { return null; } if ("gene".equals(chadoFeatureType)) { if (uniqueName.startsWith("FBal")) { // fix type of allele "gene" features realInterMineType = "Allele"; } else { if (!locatedGeneIds.contains(featureId.intValue())) { // ignore genes with no location return null; } } } // ignore unknown chromosome from dpse if (uniqueName.startsWith("Unknown_")) { return null; } if (taxonId != 7227 && "chromosome_arm".equals(chadoFeatureType)) { // nothing is located on a chromosome_arm return null; } if ("chromosome".equals(chadoFeatureType) && !"dmel_mitochondrion_genome".equals(uniqueName)) { // ignore Chromosomes from flybase - features are located on ChromosomeArms except // for mitochondrial features return null; } if ("chromosome_arm".equals(chadoFeatureType) || "ultra_scaffold".equals(chadoFeatureType)) { if ("dmel_mitochondrion_genome".equals(uniqueName)) { // ignore - all features are on the Chromosome object with uniqueName // "dmel_mitochondrion_genome" return null; } realInterMineType = "Chromosome"; } if ("golden_path_region".equals(chadoFeatureType)) { // For organisms other than D. melanogaster sometimes we can convert a // golden_path_region to an actual chromosome: if name is 2L, 4, etc if (taxonId == 7237) { // chromosomes are stored as golden_path_region realInterMineType = "Chromosome"; } else { if (taxonId != 7227 && !uniqueName.contains("_")) { realInterMineType = "Chromosome"; } else { // golden_path_fragment is the actual SO term (call scaffold instead?) 
realInterMineType = "GoldenPathFragment"; } } } if (chadoFeatureType.equals(CHROMOSOME_STRUCTURE_VARIATION_SO_NAME)) { Integer cvtermId = chromosomeStructureVariationTypes.get(featureId); if (cvtermId != null) { ChadoCVTerm term = sequenceOntologyCV.getByChadoId(cvtermId); for (String soName : CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.keySet()) { if (termOrChildrenNameMatches(term, soName)) { realInterMineType = CHROMOSOME_STRUCTURE_VARIATION_SO_MAP.get(soName); break; } } } } if ("transposable_element_insertion_site".equals(chadoFeatureType) && name == null && !uniqueName.startsWith("FBti")) { // ignore this feature as it doesn't have an FBti identifier and there will be // another feature for the same transposable_element_insertion_site that does have // the FBti identifier return null; } if ("mRNA".equals(chadoFeatureType) && seqlen == 0) { // flybase has > 7000 mRNA features that have no sequence and don't appear in their // webapp so we filter them out return null; } if ("protein".equals(chadoFeatureType) && seqlen == 0) { // flybase has ~ 2100 protein features that don't appear in their webapp so we // filter them out return null; } Item feature = getChadoDBConverter().createItem(realInterMineType); if ("Allele".equals(realInterMineType)) { alleleIdMap.put(uniqueName, feature.getIdentifier()); } return feature; } /** * Return true iff the given term or one of its children is named termName. */ private boolean termOrChildrenNameMatches(ChadoCVTerm term, String termName) { if (term.getName().equals(termName)) { return true; } Set<ChadoCVTerm> children = term.getAllChildren(); for (ChadoCVTerm childTerm : children) { if (childTerm.getName().equals(termName)) { return true; } } return false; } private static final List<String> FEATURES = Arrays.asList("gene", "mRNA", "transcript", "protein", "intron", "exon", "regulatory_region", "enhancer", "EST", "cDNA_clone", "miRNA", "snRNA", "ncRNA", "rRNA", "ncRNA", "snoRNA", "tRNA", "chromosome_band", "transposable_element_insertion_site", CHROMOSOME_STRUCTURE_VARIATION_SO_NAME, "point_mutation", "natural_transposable_element", "transposable_element"); /** * Get a list of the chado/so types of the LocatedSequenceFeatures we wish to load. The list * will not include chromosome-like features. * @return the list of features */ @Override protected List<String> getFeatures() { return FEATURES; } /** * For objects that have primaryIdentifier == null, set the primaryIdentifier to be the * uniquename column from chado. 
* {@inheritDoc} */ @Override protected void extraProcessing(Connection connection, Map<Integer, FeatureData> features) throws ObjectStoreException, SQLException { createAllelesTempTable(connection); createInsertionTempTable(connection); for (FeatureData featureData : features.values()) { if (!featureData.getFlag(FeatureData.IDENTIFIER_SET)) { setAttribute(featureData.getIntermineObjectId(), "primaryIdentifier", featureData.getChadoFeatureUniqueName()); } } if (FEATURES.contains("gene")) { processAlleleProps(connection, features); Map<Integer, List<String>> mutagenMap = makeMutagenMap(connection); for (Integer alleleFeatureId : mutagenMap.keySet()) { FeatureData alleleDat = features.get(alleleFeatureId); List<String> mutagenRefIds = new ArrayList<String>(); for (String mutagenDescription : mutagenMap.get(alleleFeatureId)) { String mutagenIdentifier = getMutagen(mutagenDescription); mutagenRefIds.add(mutagenIdentifier); } ReferenceList referenceList = new ReferenceList(); referenceList.setName("mutagens"); referenceList.setRefIds(mutagenRefIds); getChadoDBConverter().store(referenceList, alleleDat.getIntermineObjectId()); } createIndelReferences(connection); createDeletionLocations(connection); copyInsertionLocations(connection); createInteractions(connection); } } private Item getInteraction(Map<MultiKey, Item> interactions, String refId, String gene2RefId) throws ObjectStoreException { MultiKey key = new MultiKey(refId, gene2RefId); Item item = interactions.get(key); if (item == null) { item = getChadoDBConverter().createItem("Interaction"); item.setReference("gene1", refId); item.setReference("gene2", gene2RefId); interactions.put(key, item); } return item; } /** * Create Interaction objects. */ private void createInteractions(Connection connection) throws SQLException, ObjectStoreException { Map<MultiKey, Item> seenInteractions = new HashMap<MultiKey, Item>(); ResultSet res = getInteractionResultSet(connection); String typeId = getRelationshipType(); while (res.next()) { Integer featureId = new Integer(res.getInt("feature_id")); Integer otherFeatureId = new Integer(res.getInt("other_feature_id")); String pubTitle = res.getString("pub_title"); Integer pubmedId = new Integer(res.getInt("pubmed_id")); FeatureData featureData = getFeatureMap().get(featureId); FeatureData otherFeatureData = getFeatureMap().get(otherFeatureId); OrganismData od = otherFeatureData.getOrganismData(); Item dataSetItem = getChadoDBConverter().getDataSetItem(od.getTaxonId()); String publicationItemId = makePublication(pubmedId); String name = "FlyBase:" + featureData.getChadoFeatureUniqueName() + "_" + otherFeatureData.getChadoFeatureUniqueName(); Item interaction = getInteraction(seenInteractions, featureData.getItemIdentifier(), otherFeatureData.getItemIdentifier()); createDetail(dataSetItem, pubTitle, publicationItemId, interaction, name, typeId); name = "FlyBase:" + otherFeatureData.getChadoFeatureUniqueName() + "_" + featureData.getChadoFeatureUniqueName(); interaction = getInteraction(seenInteractions, otherFeatureData.getItemIdentifier(), featureData.getItemIdentifier()); createDetail(dataSetItem, pubTitle, publicationItemId, interaction, name, typeId); } for (Item item : seenInteractions.values()) { getChadoDBConverter().store(item); } } private String getRelationshipType() throws ObjectStoreException { Item item = getChadoDBConverter().createItem("InteractionTerm"); item.setAttribute("identifier", RELATIONSHIP_TYPE); getChadoDBConverter().store(item); return item.getIdentifier(); } private void 
createDetail(Item dataSetItem, String pubTitle, String publicationItemId,
            Item interaction, String name, String typeId)
        throws SQLException, ObjectStoreException {
        Item detail = getChadoDBConverter().createItem("InteractionDetail");
        detail.setAttribute("name", name);
        detail.setAttribute("type", "genetic");
        detail.setAttribute("role1", DEFAULT_ROLE);
        detail.setAttribute("role2", DEFAULT_ROLE);
        String experimentItemIdentifier = makeInteractionExperiment(pubTitle, publicationItemId);
        detail.setReference("experiment", experimentItemIdentifier);
        detail.setReference("interaction", interaction);
        detail.setReference("relationshipType", typeId);
        detail.addToCollection("dataSets", dataSetItem);
        getChadoDBConverter().store(detail);
    }

    /**
     * Return the item identifier of the InteractionExperiment Item for the given experiment
     * title, creating the Item if necessary.
     * @param experimentTitle the new title
     * @param publicationItemIdentifier the item identifier of the publication for this experiment
     * @return the interaction experiment item identifier
     * @throws ObjectStoreException if the item can't be stored
     */
    protected String makeInteractionExperiment(String experimentTitle,
            String publicationItemIdentifier) throws ObjectStoreException {
        if (interactionExperiments.containsKey(experimentTitle)) {
            return interactionExperiments.get(experimentTitle);
        }
        Item experiment = getChadoDBConverter().createItem("InteractionExperiment");
        experiment.setAttribute("name", experimentTitle);
        experiment.setReference("publication", publicationItemIdentifier);
        getChadoDBConverter().store(experiment);
        String experimentId = experiment.getIdentifier();
        interactionExperiments.put(experimentTitle, experimentId);
        return experimentId;
    }

    /**
     * Create Location objects for deletions (chromosome_structure_variation) as they don't have
     * locations in the featureloc table.
* @throws ObjectStoreException */ private void createDeletionLocations(Connection connection) throws SQLException, ObjectStoreException { ResultSet res = getDeletionLocationResultSet(connection); while (res.next()) { Integer delId = new Integer(res.getInt("deletion_feature_id")); FeatureData delFeatureData = getFeatureMap().get(delId); if (delFeatureData == null) { LOG.info("can't find deletion " + delId + " in feature map"); continue; } String chromosomeName = res.getString("chromosome_name"); String startString = res.getString("fmin"); String endString = res.getString("fmax"); String strandString = res.getString("strand"); // Df(3L)ZN47/FBab0000006 and some others don't have a strand // I don't know why, but for now we'll just give them a default if (StringUtils.isEmpty(strandString)) { strandString = "1"; } if (StringUtils.isEmpty(startString) || StringUtils.isEmpty(endString)) { continue; } Integer organismId = new Integer(res.getInt("deletion_organism_id")); int start = Integer.parseInt(startString); int end = Integer.parseInt(endString); int strand = Integer.parseInt(strandString); if (start > end) { int tmp = start; start = end; end = tmp; } int taxonId = delFeatureData.getOrganismData().getTaxonId(); Integer chrFeatureId = getChromosomeFeatureMap(organismId).get(chromosomeName); if (chrFeatureId == null) { String msg = "Can't find chromosome " + chromosomeName + " in feature map"; LOG.warn(msg); continue; } FeatureData chrFeatureData = getFeatureMap().get(chrFeatureId); if (chrFeatureData == null) { String msg = "chrFeatureData is null " + chrFeatureId + " for feature " + delId; LOG.warn(msg); continue; } makeAndStoreLocation(chrFeatureId, delFeatureData, start, end, strand, taxonId); } } private void makeAndStoreLocation(Integer chrFeatureId, FeatureData subjectFeatureData, int start, int end, int strand, int taxonId) throws ObjectStoreException { FeatureData chrFeatureData = getFeatureMap().get(chrFeatureId); Item location = getChadoDBConverter().makeLocation(chrFeatureData.getItemIdentifier(), subjectFeatureData.getItemIdentifier(), start, end, strand, taxonId); Item dataSetItem = getChadoDBConverter().getDataSetItem(taxonId); location.addToCollection("dataSets", dataSetItem); Reference chrLocReference = new Reference(); chrLocReference.setName("chromosomeLocation"); chrLocReference.setRefId(location.getIdentifier()); getChadoDBConverter().store(chrLocReference, subjectFeatureData.getIntermineObjectId()); getChadoDBConverter().store(location); } /** * Create the ChromosomalDeletion.element1 and element2 references (to * TransposableElementInsertionSite objects) */ private void createIndelReferences(Connection connection) throws ObjectStoreException, SQLException { ResultSet res = getIndelResultSet(connection); int featureWarnings = 0; while (res.next()) { Integer delId = new Integer(res.getInt("deletion_feature_id")); Integer insId = new Integer(res.getInt("insertion_feature_id")); String breakType = res.getString("breakpoint_type"); Reference reference = new Reference(); if ("bk1".equals(breakType)) { reference.setName("element1"); } else { reference.setName("element2"); } FeatureData insFeatureData = getFeatureMap().get(insId); if (insFeatureData == null) { if (featureWarnings <= 20) { if (featureWarnings < 20) { LOG.warn("insertion " + insId + " was not found in the feature table"); } else { LOG.warn("further warnings ignored"); } featureWarnings++; } continue; } reference.setRefId(insFeatureData.getItemIdentifier()); FeatureData delFeatureData = getFeatureMap().get(delId); 
if (delFeatureData == null) { if (featureWarnings <= 20) { if (featureWarnings < 20) { LOG.warn("deletion " + delId + " was not found in the feature table"); } else { LOG.warn("further warnings ignored"); } featureWarnings++; } continue; } getChadoDBConverter().store(reference, delFeatureData.getIntermineObjectId()); } } private String getMutagen(String description) throws ObjectStoreException { if (mutagensMap.containsKey(description)) { return mutagensMap.get(description); } Item mutagen = getChadoDBConverter().createItem("Mutagen"); mutagen.setAttribute("description", description); mutagensMap.put(description, mutagen.getIdentifier()); store(mutagen); return mutagen.getIdentifier(); } /** * @param connection */ private void copyInsertionLocations(Connection connection) throws ObjectStoreException, SQLException { ResultSet res = getInsertionLocationsResultSet(connection); while (res.next()) { int subId = res.getInt("sub_id"); int chrId = res.getInt("chr_feature_id"); int fmin = res.getInt("fmin"); int fmax = res.getInt("fmax"); int start = fmin + 1; int end = fmax; FeatureData subFeatureData = getFeatureMap().get(new Integer(subId)); if (subFeatureData != null) { // this is a hack - we should make sure that we only query for features that are in // the feature map, ie. those for the current organism int taxonId = subFeatureData.getOrganismData().getTaxonId(); makeAndStoreLocation(new Integer(chrId), subFeatureData, start, end, 1, taxonId); } } } private void store(Item item) throws ObjectStoreException { getChadoDBConverter().store(item); } // map from anatomy identifier (eg. "FBbt0001234") to Item identifier private Map<String, String> anatomyTermMap = new HashMap<String, String>(); // map from development term identifier (eg. "FBdv0001234") to Item identifier private Map<String, String> developmentTermMap = new HashMap<String, String>(); // map from FlyBase cv identifier (eg. "FBcv0001234") to Item identifier private Map<String, String> cvTermMap = new HashMap<String, String>(); private void processAlleleProps(Connection connection, Map<Integer, FeatureData> features) throws SQLException, ObjectStoreException { Map<Integer, List<String>> annotationPubMap = makeAnnotationPubMap(connection); ResultSet res = getAllelePropResultSet(connection); while (res.next()) { Integer featureId = new Integer(res.getInt("feature_id")); String value = res.getString("value"); String propType = res.getString("type_name"); Integer featurePropId = new Integer(res.getInt("featureprop_id")); FeatureData alleleFeatureData = features.get(featureId); OrganismData od = alleleFeatureData.getOrganismData(); Item dataSetItem = getChadoDBConverter().getDataSetItem(od.getTaxonId()); String alleleItemIdentifier = alleleFeatureData.getItemIdentifier(); Item phenotypeAnnotation = null; if ("derived_pheno_manifest".equals(propType)) { phenotypeAnnotation = makePhenotypeAnnotation(alleleItemIdentifier, value, dataSetItem, annotationPubMap.get(featurePropId)); phenotypeAnnotation.setAttribute("annotationType", "manifest in"); } else { if ("derived_pheno_class".equals(propType)) { phenotypeAnnotation = makePhenotypeAnnotation(alleleItemIdentifier, value, dataSetItem, annotationPubMap.get(featurePropId)); phenotypeAnnotation.setAttribute("annotationType", "phenotype class"); } } if (phenotypeAnnotation != null) { getChadoDBConverter().store(phenotypeAnnotation); } } } /** * Return a Map from allele feature_id to mutagen. 
The mutagen is found by looking at cvterms
     * that are associated with each feature and saving those terms that have "origin of mutation"
     * as a parent term.
     */
    private Map<Integer, List<String>> makeMutagenMap(Connection connection) throws SQLException {
        Map<Integer, List<String>> retMap = new HashMap<Integer, List<String>>();
        ResultSet res = getAlleleCVTermsResultSet(connection);
        RESULTS: while (res.next()) {
            Integer featureId = new Integer(res.getInt("feature_id"));
            Integer cvtermId = new Integer(res.getInt("cvterm_id"));
            ChadoCVTerm cvterm = flyBaseMiscCv.getByChadoId(cvtermId);
            Set<ChadoCVTerm> parents = cvterm.getAllParents();
            for (ChadoCVTerm parent : parents) {
                if ("origin of mutation".equals(parent.getName())) {
                    String fixedName = XmlUtil.fixEntityNames(cvterm.getName());
                    List<String> mutagens;
                    if (retMap.containsKey(featureId)) {
                        mutagens = retMap.get(featureId);
                    } else {
                        mutagens = new ArrayList<String>();
                        retMap.put(featureId, mutagens);
                    }
                    mutagens.add(fixedName);
                    continue RESULTS;
                }
            }
        }
        return retMap;
    }

    /**
     * Get result set of feature_id, cvterm_id pairs for the alleles in flybase chado.
     * @param connection the Connection
     * @return the cvterms
     * @throws SQLException if there is a database problem
     */
    protected ResultSet getAlleleCVTermsResultSet(Connection connection) throws SQLException {
        String query = "SELECT DISTINCT feature.feature_id, cvterm.cvterm_id"
            + " FROM feature, feature_cvterm, cvterm"
            + " WHERE feature.feature_id = feature_cvterm.feature_id"
            + " AND feature.feature_id IN (" + getAlleleFeaturesSql() + ")"
            + " AND feature_cvterm.cvterm_id = cvterm.cvterm_id";
        LOG.info("executing getAlleleCVTermsResultSet(): " + query);
        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    /**
     * Return a map from featureprop_id for alleles to publication item identifier
     */
    private Map<Integer, List<String>> makeAnnotationPubMap(Connection connection)
        throws SQLException, ObjectStoreException {
        Map<Integer, List<String>> retMap = new HashMap<Integer, List<String>>();
        ResultSet res = getAllelePropPubResultSet(connection);
        while (res.next()) {
            Integer featurePropId = new Integer(res.getInt("featureprop_id"));
            String pubDbId = res.getString("pub_db_identifier");
            Integer n = new Integer(Integer.parseInt(pubDbId));
            String publicationItemIdentifier = makePublication(n);
            if (!retMap.containsKey(featurePropId)) {
                retMap.put(featurePropId, new ArrayList<String>());
            }
            retMap.get(featurePropId).add(publicationItemIdentifier);
        }
        return retMap;
    }

    /**
     * Return a map from feature_id to seqlen
     * @throws SQLException if something goes wrong
     */
    // private Map<Integer, Integer> makeCDNALengthMap(Connection connection)
    //     throws SQLException {
    //     Map<Integer, Integer> retMap = new HashMap();
    //
    //     ResultSet res = getCDNALengthResultSet(connection);
    //     while (res.next()) {
    //         Integer featureId = new Integer(res.getInt("feature_id"));
    //         Integer seqlen = new Integer(res.getInt("seqlen"));
    //         retMap.put(featureId, seqlen);
    //     }
    //     return retMap;
    // }

    private Item makePhenotypeAnnotation(String alleleItemIdentifier, String value,
            Item dataSetItem, List<String> publicationsItemIdList) throws ObjectStoreException {
        Item phenotypeAnnotation = getChadoDBConverter().createItem("PhenotypeAnnotation");
        phenotypeAnnotation.addToCollection("dataSets", dataSetItem);
        Pattern p = Pattern.compile(FLYBASE_PROP_ATTRIBUTE_PATTERN);
        Matcher m = p.matcher(value);
        StringBuffer sb = new StringBuffer();
        List<String> dbAnatomyTermIdentifiers = new ArrayList<String>();
        List<String> dbDevelopmentTermIdentifiers = new ArrayList<String>();
        List<String> dbCVTermIdentifiers = new ArrayList<String>();
        while (m.find()) {
            String field = m.group(1);
            int colonPos = field.indexOf(':');
            if (colonPos == -1) {
                m.appendReplacement(sb, field);
            } else {
                String identifier = field.substring(0, colonPos);
                if (identifier.startsWith(FLYBASE_ANATOMY_TERM_PREFIX)) {
                    dbAnatomyTermIdentifiers.add(addCVTermColon(identifier));
                } else {
                    if (identifier.startsWith("FBdv")) {
                        dbDevelopmentTermIdentifiers.add(addCVTermColon(identifier));
                    } else {
                        if (identifier.startsWith("FBcv")) {
                            dbCVTermIdentifiers.add(addCVTermColon(identifier));
                        }
                    }
                }
                String text = field.substring(colonPos + 1);
                m.appendReplacement(sb, text);
            }
        }
        m.appendTail(sb);

        /*
         * ignore with for now because the with text is wrong in chado - see ticket #889
        List<String> withAlleleIdentifiers = findWithAllele(value);
        if (withAlleleIdentifiers.size() > 0) {
            phenotypeAnnotation.setCollection("with", withAlleleIdentifiers);
        }
         */

        String valueNoRefs = sb.toString();
        String valueNoUps = valueNoRefs.replaceAll("<up>", "[").replaceAll("</up>", "]");
        phenotypeAnnotation.setAttribute("description", valueNoUps);
        phenotypeAnnotation.setReference("allele", alleleItemIdentifier);
        if (publicationsItemIdList != null && publicationsItemIdList.size() > 0) {
            ReferenceList pubReferenceList =
                new ReferenceList("publications", publicationsItemIdList);
            phenotypeAnnotation.addCollection(pubReferenceList);
        }
        if (dbAnatomyTermIdentifiers.size() == 1) {
            String anatomyIdentifier = dbAnatomyTermIdentifiers.get(0);
            String anatomyTermItemId = makeAnatomyTerm(anatomyIdentifier);
            phenotypeAnnotation.setReference("anatomyTerm", anatomyTermItemId);
        } else {
            if (dbAnatomyTermIdentifiers.size() > 1) {
                throw new RuntimeException("more than one anatomy term: "
                        + dbAnatomyTermIdentifiers);
            }
        }
        if (dbDevelopmentTermIdentifiers.size() == 1) {
            String developmentTermIdentifier = dbDevelopmentTermIdentifiers.get(0);
            String developmentTermItemId = makeDevelopmentTerm(developmentTermIdentifier);
            phenotypeAnnotation.setReference("developmentTerm", developmentTermItemId);
        } else {
            if (dbDevelopmentTermIdentifiers.size() > 1) {
                throw new RuntimeException("more than one development term: "
                        + dbDevelopmentTermIdentifiers);
            }
        }
        if (dbCVTermIdentifiers.size() > 0) {
            for (String cvTermIdentifier : dbCVTermIdentifiers) {
                String cvTermItemId = makeCVTerm(cvTermIdentifier);
                phenotypeAnnotation.addToCollection("cvTerms", cvTermItemId);
            }
        }
        return phenotypeAnnotation;
    }

    private static final Pattern FLYBASE_TERM_IDENTIFIER_PATTERN =
        Pattern.compile("^FB[^\\d][^\\d]\\d+");

    /**
     * For a FlyBase cvterm identifier like "FBbt00000001", add a colon in the middle and return:
     * "FBbt:00000001"
     * @param identifier the identifier from chado
     * @return the public identifier
     */
    protected static String addCVTermColon(String identifier) {
        Matcher m = FLYBASE_TERM_IDENTIFIER_PATTERN.matcher(identifier);
        if (m.matches()) {
            return identifier.substring(0, 4) + ":" + identifier.substring(4);
        }
        return identifier;
    }

    /**
     * Return the item identifiers of the alleles mentioned in the with clauses of the argument.
* Currently unused because flybase with clauses are wrong - see ticket #889 */ // @SuppressWarnings("unused") // private List<String> findWithAllele(String value) { // Pattern p = Pattern.compile("with @(FBal\\d+):"); // Matcher m = p.matcher(value); // // List<String> foundIdentifiers = new ArrayList<String>(); // // while (m.find()) { // String identifier = m.group(1); // if (identifier.startsWith("FBal")) { // foundIdentifiers.add(identifier); // } else { // throw new RuntimeException("identifier in a with must start: \"FBal\" not: " // + identifier); // } // } // // List<String> alleleItemIdentifiers = new ArrayList<String>(); // // for (String foundIdentifier: foundIdentifiers) { // if (alleleIdMap.containsKey(foundIdentifier)) { // alleleItemIdentifiers.add(alleleIdMap.get(foundIdentifier)); // } else { // // this allele wasn't stored so probably it didn't have the right organism - some // // GAL4 alleles have cerevisiae as organism, eg. FBal0060667:Scer\GAL4[sd-SG29.1] // // referenced by FBal0038994 Rac1[N17.Scer\UAS] // } // } // // return alleleItemIdentifiers; // } /** * phenotype annotation creates and stores anatomy terms. so does librarycvterm * @param identifier identifier for anatomy term * @return refId for anatomy term object * @throws ObjectStoreException if term can't be stored */ protected String makeAnatomyTerm(String identifier) throws ObjectStoreException { String newIdentifier = identifier; if (!newIdentifier.startsWith(FLYBASE_ANATOMY_TERM_PREFIX)) { newIdentifier = FLYBASE_ANATOMY_TERM_PREFIX + identifier; newIdentifier = addCVTermColon(newIdentifier); } if (anatomyTermMap.containsKey(newIdentifier)) { return anatomyTermMap.get(newIdentifier); } Item anatomyTerm = getChadoDBConverter().createItem("AnatomyTerm"); anatomyTerm.setAttribute("identifier", newIdentifier); getChadoDBConverter().store(anatomyTerm); anatomyTermMap.put(identifier, anatomyTerm.getIdentifier()); return anatomyTerm.getIdentifier(); } private String makeDevelopmentTerm(String identifier) throws ObjectStoreException { if (developmentTermMap.containsKey(identifier)) { return developmentTermMap.get(identifier); } Item developmentTerm = getChadoDBConverter().createItem("DevelopmentTerm"); developmentTerm.setAttribute("identifier", identifier); getChadoDBConverter().store(developmentTerm); developmentTermMap.put(identifier, developmentTerm.getIdentifier()); return developmentTerm.getIdentifier(); } private String makeCVTerm(String identifier) throws ObjectStoreException { if (cvTermMap.containsKey(identifier)) { return cvTermMap.get(identifier); } Item cvTerm = getChadoDBConverter().createItem("CVTerm"); cvTerm.setAttribute("identifier", identifier); getChadoDBConverter().store(cvTerm); cvTermMap.put(identifier, cvTerm.getIdentifier()); return cvTerm.getIdentifier(); } /** * Return a result set containing the interaction genes pairs, the title of the publication * that reported the interaction and its pubmed id. The method is protected * so that is can be overridden for testing. 
* @param connection the Connection * @throws SQLException if there is a database problem * @return the ResultSet */ protected ResultSet getInteractionResultSet(Connection connection) throws SQLException { String query = " SELECT feature.feature_id as feature_id, " + " other_feature.feature_id as other_feature_id, " + " pub.title as pub_title, dbx.accession as pubmed_id " + " FROM feature, cvterm cvt, feature other_feature, " + " feature_relationship_pub frpb, pub, " + " feature_relationship fr, pub_dbxref pdbx, dbxref dbx, db " + " WHERE feature.feature_id = subject_id " + " AND object_id = other_feature.feature_id " + " AND fr.type_id = cvt.cvterm_id AND cvt.name = 'interacts_genetically' " + " AND fr.feature_relationship_id = frpb.feature_relationship_id " + " AND frpb.pub_id = pub.pub_id AND db.name='pubmed' " + " AND pdbx.is_current=true AND pub.pub_id=pdbx.pub_id " + " AND pdbx.dbxref_id = dbx.dbxref_id AND dbx.db_id=db.db_id " + " AND NOT feature.is_obsolete AND NOT other_feature.is_obsolete " + " AND feature.feature_id IN (" + getLocatedGenesSql() + ")" + " AND other_feature.feature_id IN (" + getLocatedGenesSql() + ")"; LOG.info("executing getInteractionResultSet(): " + query); Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); return res; } /** * Return a result set containing the alleles and their featureprops. The method is protected * so that is can be overridden for testing. * @param connection the Connection * @throws SQLException if there is a database problem * @return the ResultSet */ protected ResultSet getAllelePropResultSet(Connection connection) throws SQLException { String query = "SELECT feature_id, value, cvterm.name AS type_name, featureprop_id" + " FROM featureprop, cvterm" + " WHERE featureprop.type_id = cvterm.cvterm_id" + " AND feature_id IN (" + getAlleleFeaturesSql() + ")" + " ORDER BY feature_id"; LOG.info("executing getAllelePropResultSet(): " + query); Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); return res; } /** * Return a result set containing pairs of chromosome_structure_variation (deletions) and * transposable_element_insertion_site (insertions). The method is protected * so that is can be overridden for testing. 
* @param connection the Connection * @throws SQLException if there is a database problem * @return the ResultSet */ protected ResultSet getIndelResultSet(Connection connection) throws SQLException { String query = "SELECT del.feature_id as deletion_feature_id," + " ins.feature_id as insertion_feature_id," + " substring(break.uniquename FROM ':([^:]+)$') AS breakpoint_type" + " FROM feature del, cvterm del_type, feature_relationship del_rel," + " cvterm del_rel_type," + " feature break, cvterm break_type," + " feature_relationship ins_rel, cvterm ins_rel_type," + " feature ins, cvterm ins_type" + " WHERE del_rel.object_id = del.feature_id" + " AND del_rel.subject_id = break.feature_id" + " AND ins_rel.subject_id = break.feature_id" + " AND ins_rel.object_id = ins.feature_id" + " AND del.type_id = del_type.cvterm_id" + " AND ins.type_id = ins_type.cvterm_id" + " AND del_type.name = 'chromosome_structure_variation'" + " AND ins_type.name = 'transposable_element_insertion_site'" + " AND del_rel.type_id = del_rel_type.cvterm_id" + " AND del_rel_type.name = 'break_of'" + " AND ins_rel.type_id = ins_rel_type.cvterm_id" + " AND ins_rel_type.name = 'progenitor'" + " AND break.type_id = break_type.cvterm_id" + " AND break_type.name = 'breakpoint'" // ignore the progenitors so we only set element1 and element2 to be the "descendants" + " AND ins.feature_id NOT IN (SELECT obj_id FROM " + INSERTION_TEMP_TABLE_NAME + ")"; LOG.info("executing getIndelResultSet(): " + query); Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); return res; } /** * Return a result set containing pairs of insertion feature_ids (eg. for "FBti0027974" => * "FBti0023081") and the fmin and fmax of the first insertion in the pair (ie. the progenitor). * The second in the pair is the "Modified descendant of" the first. The pairs are found using * the 'modified_descendant_of' relation type. All insertions are from DrosDel. * The method is protected so that is can be overridden for testing. * @param connection the Connection * @throws SQLException if there is a database problem * @return the ResultSet */ protected ResultSet getInsertionLocationsResultSet(Connection connection) throws SQLException { String query = "SELECT * from " + INSERTION_TEMP_TABLE_NAME; LOG.info("executing getInsertionLocationsResultSet(): " + query); Statement stmt = connection.createStatement(); ResultSet res = stmt.executeQuery(query); return res; } /** * Return a result set containing location for deletions (chromosome_structure_variation) * objects. The locations are in the featureprop able in the form: * 2R:12716549..12984803 (53D11;53F8) * The method is protected so that is can be overridden for testing. 
    /**
     * Return a result set containing locations for deletion (chromosome_structure_variation)
     * objects. The locations are in the featureprop table in the form:
     * 2R:12716549..12984803 (53D11;53F8)
     * The method is protected so that it can be overridden for testing.
     * @param connection the Connection
     * @throws SQLException if there is a database problem
     * @return the ResultSet
     */
    protected ResultSet getDeletionLocationResultSet(Connection connection) throws SQLException {
        String query =
            "SELECT f.feature_id as deletion_feature_id, f.organism_id as deletion_organism_id, "
            + "c.name as chromosome_name, fl.fmin, fl.fmax, fl.strand "
            + "FROM feature f, feature b, feature_relationship fr, cvterm cvt1, cvterm cvt2, "
            + " featureloc fl, feature c "
            + "WHERE f.feature_id = fr.object_id "
            + " AND fr.type_id = cvt1.cvterm_id "
            + " AND cvt1.name = 'break_of' "
            + " AND fr.subject_id = b.feature_id "
            + " AND b.type_id = cvt2.cvterm_id "
            + " AND cvt2.name = 'breakpoint' "
            + " AND b.feature_id = fl.feature_id "
            + " AND f.name ~ '^Df.+' "
            + " AND f.uniquename like 'FBab%' "
            + " AND f.is_obsolete = false "
            + " AND fl.srcfeature_id = c.feature_id ";
        LOG.info("executing getDeletionLocationResultSet(): " + query);
        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    /**
     * Return a result set containing the featureprop_id and the publication identifier of the
     * featureprops for all alleles. The method is protected so that it can be overridden for
     * testing.
     * @param connection the Connection
     * @throws SQLException if there is a database problem
     * @return the ResultSet
     */
    protected ResultSet getAllelePropPubResultSet(Connection connection) throws SQLException {
        String query =
            "SELECT DISTINCT featureprop_pub.featureprop_id, dbxref.accession as pub_db_identifier"
            + " FROM featureprop, featureprop_pub, dbxref, db, pub, pub_dbxref"
            + " WHERE featureprop_pub.pub_id = pub.pub_id"
            + " AND featureprop.featureprop_id = featureprop_pub.featureprop_id"
            + " AND pub.pub_id = pub_dbxref.pub_id"
            + " AND pub_dbxref.dbxref_id = dbxref.dbxref_id"
            + " AND dbxref.db_id = db.db_id"
            + " AND db.name = 'pubmed'"
            + " AND feature_id IN (" + getAlleleFeaturesSql() + ")"
            + " ORDER BY featureprop_id";
        LOG.info("executing getAllelePropPubResultSet(): " + query);
        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    /**
     * Return a result set containing the feature_id and its seqlen.
     * The method is protected so that it can be overridden for testing.
     * @param connection the Connection
     * @throws SQLException if there is a database problem
     * @return the ResultSet
     */
    protected ResultSet getCDNALengthResultSet(Connection connection) throws SQLException {
        String query = "SELECT cl.feature_id, fls.seqlen "
            + "FROM feature cl, feature fls, feature_relationship fr, cvterm fls_type "
            + "WHERE fls_type.name IN ('cDNA','BAC_cloned_genomic_insert') "
            + " AND cl.feature_id=fr.object_id "
            + " AND fr.subject_id=fls.feature_id "
            + " AND fls.type_id=fls_type.cvterm_id ";
        LOG.info("executing getCDNALengthResultSet(): " + query);
        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }
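    // Illustrative sketch only, not part of the original FlyBaseProcessor: a hypothetical
    // helper showing how getCDNALengthResultSet() above could be read into a
    // featureId -> seqlen map, the kind of in-memory lookup a processor might keep.
    private Map<Integer, Integer> readCDNALengths(Connection connection) throws SQLException {
        Map<Integer, Integer> lengths = new HashMap<Integer, Integer>();
        ResultSet res = getCDNALengthResultSet(connection);
        while (res.next()) {
            lengths.put(res.getInt("feature_id"), res.getInt("seqlen"));
        }
        return lengths;
    }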
    /**
     * Convert ISO entities from FlyBase to HTML entities.
     * {@inheritDoc}
     */
    @Override
    protected String fixIdentifier(FeatureData fdat, String identifier) {
        if (StringUtils.isBlank(identifier)) {
            return identifier;
        }
        return XmlUtil.fixEntityNames(identifier);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected FeatureData makeFeatureData(int featureId, String type, String uniqueName,
            String name, String md5checksum, int seqlen, int organismId)
        throws ObjectStoreException {
        if ("protein".equals(type)) {
            if (!uniqueName.startsWith("FBpp")) {
                return null;
            }
            FeatureData protein = proteinFeatureDataMap.get(md5checksum);
            // make a synonym for the protein we're about to discard
            if (protein != null) {
                if (StringUtils.isNotEmpty(uniqueName)
                    && !protein.getExistingSynonyms().contains(uniqueName)) {
                    Item synonym = createSynonym(protein, uniqueName);
                    store(synonym);
                }
                if (StringUtils.isNotEmpty(name)
                    && !protein.getExistingSynonyms().contains(name)) {
                    Item synonym = createSynonym(protein, name);
                    store(synonym);
                }
                return protein;
            }
            FeatureData fdat = super.makeFeatureData(featureId, type, uniqueName, name,
                    md5checksum, seqlen, organismId);
            proteinFeatureDataMap.put(md5checksum, fdat);
            return fdat;
        }
        if ("cDNA_clone".equals(type)) {
            // FlyBase has duplicates; to merge with BDGP we need to discard duplicates and
            // make a synonym
            FeatureData cdnaClone = cdnaCloneMap.get(name);
            if (cdnaClone != null) {
                if (StringUtils.isNotEmpty(name)) {
                    Item synonym = createSynonym(cdnaClone, name);
                    if (synonym != null) {
                        store(synonym);
                    }
                }
                return cdnaClone;
            }
            FeatureData fdat = super.makeFeatureData(featureId, type, uniqueName, name,
                    md5checksum, seqlen, organismId);
            cdnaCloneMap.put(name, fdat);
            return fdat;
        }
        return super.makeFeatureData(featureId, type, uniqueName, name, md5checksum, seqlen,
                organismId);
    }

    /**
     * Return a query that gets the feature_ids of the alleles in the feature table.
     */
    private String getAlleleFeaturesSql() {
        return "SELECT feature_id FROM " + ALLELE_TEMP_TABLE_NAME;
    }

    /**
     * Add DataSets and DataSources to items before storing.
     */
    private void processItem(Item item, Integer taxonId) {
        String className = item.getClassName();
        if ("DataSource".equals(className) || "DataSet".equals(className)
            || "Organism".equals(className) || "Sequence".equals(className)) {
            return;
        }
        if (taxonId == null) {
            ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
            ClassLoader classLoader = getClass().getClassLoader();
            Thread.currentThread().setContextClassLoader(classLoader);
            try {
                throw new RuntimeException("getCurrentTaxonId() returned null while processing "
                    + item);
            } finally {
                Thread.currentThread().setContextClassLoader(currentClassLoader);
            }
        }
        ChadoDBConverter converter = getChadoDBConverter();
        BioStoreHook.setDataSets(getModel(), item,
            converter.getDataSetItem(taxonId.intValue()).getIdentifier(),
            converter.getDataSourceItem().getIdentifier());
    }
}
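// Illustrative sketch only, not part of the original file. The Javadoc above repeatedly notes
// that the get*ResultSet() methods are protected so that they can be overridden for testing;
// a test could therefore subclass FlyBaseProcessor and substitute a canned query. The
// constructor signature (taking a ChadoDBConverter) is an assumption based on the
// getChadoDBConverter() call above, and "test_cdna_lengths" is a hypothetical table that a
// test harness would create and populate itself.
class TestFlyBaseProcessor extends FlyBaseProcessor {
    TestFlyBaseProcessor(ChadoDBConverter converter) {
        super(converter);
    }

    @Override
    protected ResultSet getCDNALengthResultSet(Connection connection) throws SQLException {
        // query a small, hand-made table instead of the full chado feature tables
        Statement stmt = connection.createStatement();
        return stmt.executeQuery("SELECT feature_id, seqlen FROM test_cdna_lengths");
    }
}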