org.intermine.bio.dataconversion.IdResolver.java Source code

Introduction

Here is the source code for org.intermine.bio.dataconversion.IdResolver.java
Source

package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2002-2013 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.collections.keyvalue.MultiKey;
import org.apache.commons.collections.map.MultiKeyMap;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

/**
 * Hold data about primary identifiers and synonyms for a particular class in the
 * data model and provide methods to resolved synonyms into corresponding
 * primary identifier(s).
 *
 * @author rns
 * @author Fengyuan Hu
 */
public class IdResolver {
    private static final Logger LOG = Logger.getLogger(IdResolver.class);

    private String clsName;

    @SuppressWarnings("unchecked")
    protected Map<MultiKey, Map<String, Set<String>>> orgIdMaps = new MultiKeyMap();
    @SuppressWarnings("unchecked")
    protected Map<MultiKey, Map<String, Set<String>>> orgSynMaps = new MultiKeyMap();
    @SuppressWarnings("unchecked")
    protected Map<MultiKey, Map<String, Set<String>>> orgMainMaps = new MultiKeyMap();
    @SuppressWarnings("unchecked")
    protected Map<MultiKey, Map<String, Set<String>>> orgIdMainMaps = new MultiKeyMap();
    @SuppressWarnings("unchecked")
    protected Map<MultiKey, Map<String, Set<String>>> orgIdSynMaps = new MultiKeyMap();

    /**
     * Construct and empty IdResolver
     */
    public IdResolver() {
    }

    /**
     * Construct and empty IdResolver
     * @param clsName the class to resolve identifiers for
     */
    public IdResolver(String clsName) {
        this.clsName = clsName;
    }

    // check that the given taxon id has some data for it
    // if an exception thrown, there must be something wrong with resolver factory.
    protected void checkTaxonId(String taxonId, String clsName) {
        if (!orgIdMaps.containsKey(new MultiKey(taxonId, clsName))) {
            throw new IllegalArgumentException(
                    clsName + " IdResolver has " + "no data for taxonId: " + taxonId + ".");
        }
    }

    /**
     * Check whether the given id is a primary identifier for this taxonId
     * @param taxonId the organism to look up
     * @param clsName go term
     * @param id an identifier
     * @return true if id is a primaryIdentifier
     */
    public boolean isPrimaryIdentifier(String taxonId, String clsName, String id) {
        checkTaxonId(taxonId, clsName);
        return orgIdMaps.get(new MultiKey(taxonId, clsName)).containsKey(id);
    }

    /**
     * Check whether the given id is a primary identifier for this taxonId
     * @param taxonId the organism to look up
     * @param id an identifier
     * @return true if id is a primaryIdentifier
     */
    public boolean isPrimaryIdentifier(String taxonId, String id) {
        return isPrimaryIdentifier(taxonId, this.clsName, id);
    }

    /**
     * For the given id return a set of matching primary identifiers in the given
     * taxonId.  In many cases the set will have just one element. Some will have
     * zero element.
     * @param taxonId the organism to search within
     * @param clsName go term
     * @param id the identifier to resolve
     * @return a set of matching primary identifiers
     */
    public Set<String> resolveId(String taxonId, String clsName, String id) {
        checkTaxonId(taxonId, clsName);
        // if this is a primary identifier, just return it
        if (isPrimaryIdentifier(taxonId, clsName, id)) {
            return Collections.singleton(id);
        }
        if (orgMainMaps.containsKey(new MultiKey(taxonId, clsName))
                && orgMainMaps.get(new MultiKey(taxonId, clsName)).containsKey(id)) {
            return orgMainMaps.get(new MultiKey(taxonId, clsName)).get(id);
        }
        if (orgSynMaps.containsKey(new MultiKey(taxonId, clsName))
                && orgSynMaps.get(new MultiKey(taxonId, clsName)).containsKey(id)) {
            return orgSynMaps.get(new MultiKey(taxonId, clsName)).get(id);
        }
        return Collections.emptySet();
    }

    /**
     * For the given set of ids return a map of matching primary identifiers in the given
     * taxonId.  In many cases the set will have just one element. Some will have
     * zero element.
     * @param taxonId the organism to search within
     * @param clsName go term
     * @param ids the identifier set to resolve
     * @return a set of common pid
     */
    public String resolveIds(String taxonId, String clsName, List<String> ids) {
        Set<String> common = new LinkedHashSet<String>();
        for (int i = 0; i < ids.size(); i++) {
            Set<String> resovledSet = resolveId(taxonId, clsName, ids.get(i));
            common.addAll(resovledSet);
        }

        common.remove(null);
        if (common.size() != 1) {
            LOG.info("Not resolve to an unique identifier: " + common);
            return null;
        } else {
            return common.iterator().next();
        }
    }

    /**
     * For the given id return a set of matching primary identifiers in the given
     * taxonId.  In many cases the set will have just one element. Some will have
     * zero element.
     * @param taxonId the organism to search within
     * @param id the identifier to resolve
     * @return a set of matching primary identifiers
     */
    public Set<String> resolveId(String taxonId, String id) {
        return resolveId(taxonId, this.clsName, id);
    }

    /**
     * For the given id set return a map of matching primary identifiers in the given
     * taxonId.  In many cases the set will have just one element. Some will have
     * zero element.
     * @param taxonId the organism to search within
     * @param ids the identifier set to resolve
     * @return a map of matching primary identifiers
     */
    public String resolveIds(String taxonId, List<String> ids) {
        return resolveIds(taxonId, this.clsName, ids);
    }

    /**
     * For a particular primary identifier fetch a set of synonyms or return
     * null if id is not a primary identifier for the taxonId given.
     * @param taxonId the organism to do a lookup for
     * @param clsName go term
     * @param id the primary identifier to look up
     * @return a set of synonyms or null if id is not a primary identifier
     */
    public Set<String> getSynonyms(String taxonId, String clsName, String primaryIdentifier) {
        checkTaxonId(taxonId, clsName);
        if (!isPrimaryIdentifier(taxonId, clsName, primaryIdentifier)) {
            return null;
        }
        return orgIdMaps.get(new MultiKey(taxonId, clsName)).get(primaryIdentifier);
    }

    /**
     * For a particular primary identifier fetch a set of synonyms or return
     * null if id is not a primary identifier for the taxonId given.
     * @param taxonId the organism to do a lookup for
     * @param id the primary identifier to look up
     * @return a set of synonyms or null if id is not a primary identifier
     */
    public Set<String> getSynonyms(String taxonId, String primaryIdentifier) {
        return getSynonyms(taxonId, this.clsName, primaryIdentifier);
    }

    /**
     * Return the count of matching primary identifiers for a particular identifier
     * @param taxonId the organism to check for
     * @param clsName go term
     * @param id the identifier to look up
     * @return a count of the resolutions for this identifier
     */
    public int countResolutions(String taxonId, String clsName, String id) {
        checkTaxonId(taxonId, clsName);
        Set<String> resolvedIds = resolveId(taxonId, clsName, id);
        return resolvedIds == null ? 0 : resolvedIds.size();
    }

    /**
     * Return the count of matching primary identifiers for a particular identifier
     * @param taxonId the organism to check for
     * @param id the identifier to look up
     * @return a count of the resolutions for this identifier
     */
    public int countResolutions(String taxonId, String id) {
        return countResolutions(taxonId, this.clsName, id);
    }

    /**
     * Return true if the idResolver contains information about this taxon id.
     * @param taxonId an organism to check for
     * @return true if data about this taxon id
     */
    public boolean hasTaxon(String taxonId) {
        return hasTaxons(new HashSet<String>(Arrays.asList(new String[] { taxonId })));
    }

    /**
     * Return true if the idResolver contains information about a collection of taxon id.
     * @param taxonIds a collection of organism to check for
     * @return true if data about this taxon id
     */
    public boolean hasTaxons(Set<String> taxonIds) {
        Set<String> taxonIdSet = new HashSet<String>();
        for (MultiKey key : orgIdMaps.keySet()) {
            taxonIdSet.add((String) key.getKey(0));
        }
        return taxonIdSet.containsAll(taxonIds);
    }

    /**
     * Return a set of taxon id.
     * @return all taxon ids in resolver
     */
    public Set<String> getTaxons() {
        Set<String> taxonIdSet = new LinkedHashSet<String>();
        for (MultiKey key : orgIdMaps.keySet()) {
            taxonIdSet.add((String) key.getKey(0));
        }
        return taxonIdSet;
    }

    /**
     * Return true if the idResolver contains information about this class name.
     * @param clsName an go term to check for
     * @return true if has this term
     */
    public boolean hasClassName(String clsName) {
        Set<String> clsNameSet = new HashSet<String>();
        for (MultiKey key : orgIdMaps.keySet()) {
            clsNameSet.add((String) key.getKey(1));
        }
        return clsNameSet.contains(clsName);
    }

    /**
     * Return a set of class names the reslover holds
     * @return a set of class names
     */
    public Set<String> getClassNames() {
        Set<String> clsNameSet = new HashSet<String>();
        for (MultiKey key : orgIdMaps.keySet()) {
            clsNameSet.add((String) key.getKey(1));
        }
        return clsNameSet;
    }

    /**
     * Check if resolver has taxon id and class name
     * @param taxonId taxon id as string
     * @param clsName class name as string
     */
    public boolean hasTaxonAndClassName(String taxonId, String clsName) {
        return orgIdMaps.keySet().contains(new MultiKey(taxonId, clsName));
    }

    /**
     * Check if resolver has taxon id and class name
     * @param taxonId taxon id as string
     * @param clsName class name as string
     */
    public boolean hasTaxonAndClassNames(String taxonId, Set<String> clsNames) {
        Map<String, Set<String>> taxonIdAndClsNameMap = new HashMap<String, Set<String>>();
        taxonIdAndClsNameMap.put(taxonId, clsNames);
        return hasTaxonsAndClassNames(taxonIdAndClsNameMap);
    }

    /**
     * Check if resolver has taxon id and class name
     * @param taxonId taxon id as string
     * @param clsName class name as string
     */
    public boolean hasTaxonsAndClassName(Set<String> taxonIds, String clsName) {
        Map<String, Set<String>> taxonIdAndClsNameMap = new HashMap<String, Set<String>>();
        for (String taxonId : taxonIds) {
            taxonIdAndClsNameMap.put(taxonId, new HashSet<String>(Arrays.asList(new String[] { clsName })));
        }
        return hasTaxonsAndClassNames(taxonIdAndClsNameMap);
    }

    /**
     * Check if resolver has a set of keys (taxon id + class name)
     * @param taxonIdAndClsNameMap data structure to hold keys
     * @return boolean value
     */
    public boolean hasTaxonsAndClassNames(Map<String, Set<String>> taxonIdAndClsNameMap) {
        Set<MultiKey> keySet = new HashSet<MultiKey>();
        for (Entry<String, Set<String>> e : taxonIdAndClsNameMap.entrySet()) {
            for (String clsName : e.getValue()) {
                keySet.add(new MultiKey(e.getKey(), clsName));
            }
        }

        return orgIdMaps.keySet().containsAll(keySet);
    }

    /**
     * Get a set of keys (taxon id + class name) resolver holds
     * @return a set of MultiKey, parse it to use, e.g. Map<taxonid, Set<clsName>>
     */
    public Map<String, Set<String>> getTaxonsAndClassNames() {
        Map<String, Set<String>> taxonIdAndClsNameMap = new HashMap<String, Set<String>>();
        for (MultiKey key : orgIdMaps.keySet()) {
            String taxonId = (String) key.getKey(0);
            String clsName = (String) key.getKey(1);
            if (taxonIdAndClsNameMap.get(taxonId) == null) {
                taxonIdAndClsNameMap.put(taxonId, new HashSet<String>(Arrays.asList(new String[] { clsName })));
            } else {
                taxonIdAndClsNameMap.get(taxonId).add(clsName);
            }
        }

        return taxonIdAndClsNameMap;
    }

    /**
     * Add alternative main identifiers for a primary identifier to the IdResolver.
     * @param taxonId the organism of the identifier
     * @param clsName go term
     * @param primaryIdentifier the main identifier
     * @param ids a set of alternative main identifiers
     */
    protected void addMainIds(String taxonId, String clsName, String primaryIdentifier, Set<String> ids) {
        addEntry(taxonId, clsName, primaryIdentifier, ids, Boolean.TRUE);
    }

    /**
     * Add alternative main identifiers for a primary identifier to the IdResolver.
     * @param taxonId the organism of the identifier
     * @param primaryIdentifier the main identifier
     * @param ids a set of alternative main identifiers
     */
    protected void addMainIds(String taxonId, String primaryIdentifier, Set<String> ids) {
        addMainIds(taxonId, this.clsName, primaryIdentifier, ids);
    }

    /**
     * Add synonyms for a primary identifier to the IdResolver
     * @param taxonId the organism of the identifier
     * @param clsName go term
     * @param primaryIdentifier the main identifier
     * @param ids a set synonyms
     */
    protected void addSynonyms(String taxonId, String clsName, String primaryIdentifier, Set<String> ids) {
        addEntry(taxonId, clsName, primaryIdentifier, ids, Boolean.FALSE);
    }

    /**
     * Add synonyms for a primary identifier to the IdResolver
     * @param taxonId the organism of the identifier
     * @param primaryIdentifier the main identifier
     * @param ids a set synonyms
     */
    protected void addSynonyms(String taxonId, String primaryIdentifier, Set<String> ids) {
        addSynonyms(taxonId, this.clsName, primaryIdentifier, ids);
    }

    /**
     * Create entries for the IdResolver, these will be added when getIdResolver
     * is called.
     * @param taxonId the organism of identifiers
     * @param clsName go term
     * @param primaryId main identifier
     * @param synonyms synonyms for the main identifier
     */
    public void addResolverEntry(String taxonId, String clsName, String primaryId, Set<String> synonyms) {
        addSynonyms(taxonId, clsName, primaryId, synonyms);
    }

    /**
     * Create entries for the IdResolver, these will be added when getIdResolver
     * is called.
     * @param taxonId the organism of identifiers
     * @param primaryId main identifier
     * @param synonyms synonyms for the main identifier
     */
    public void addResolverEntry(String taxonId, String primaryId, Set<String> synonyms) {
        addResolverEntry(taxonId, this.clsName, primaryId, synonyms);
    }

    /**
     * Add an entry to the IdResolver, a primary identifier and any number of synonyms.
     * @param taxonId the organism of the identifier
     * @param clsName go term
     * @param primaryIdentifier the main identifier
     * @param synonyms a set of synonyms
     * @param mainId if true these are main ids, otherwise synonms
    */
    protected void addEntry(String taxonId, String clsName, String primaryIdentifier, Collection<String> ids,
            Boolean mainId) {
        Map<String, Set<String>> idMap = orgIdMaps.get(new MultiKey(taxonId, clsName));
        if (idMap == null) {
            idMap = new LinkedHashMap<String, Set<String>>();
            orgIdMaps.put(new MultiKey(taxonId, clsName), idMap);
        }

        addToMapList(idMap, primaryIdentifier, ids);

        Map<String, Set<String>> lookupMap = null;
        Map<String, Set<String>> reverseMap = null;
        if (mainId.booleanValue()) {
            lookupMap = orgMainMaps.get(new MultiKey(taxonId, clsName));
            if (lookupMap == null) {
                lookupMap = new HashMap<String, Set<String>>();
                orgMainMaps.put(new MultiKey(taxonId, clsName), lookupMap);
            }

            reverseMap = orgIdMainMaps.get(new MultiKey(taxonId, clsName));
            if (reverseMap == null) {
                reverseMap = new LinkedHashMap<String, Set<String>>();
                orgIdMainMaps.put(new MultiKey(taxonId, clsName), reverseMap);
            }
        } else {
            // these ids are synonyms
            lookupMap = orgSynMaps.get(new MultiKey(taxonId, clsName));
            if (lookupMap == null) {
                lookupMap = new LinkedHashMap<String, Set<String>>();
                orgSynMaps.put(new MultiKey(taxonId, clsName), lookupMap);
            }

            reverseMap = orgIdSynMaps.get(new MultiKey(taxonId, clsName));
            if (reverseMap == null) {
                reverseMap = new LinkedHashMap<String, Set<String>>();
                orgIdSynMaps.put(new MultiKey(taxonId, clsName), reverseMap);
            }
        }

        // map from primaryId back to main/synonym ids
        addToMapList(reverseMap, primaryIdentifier, ids);

        for (String id : ids) {
            addToMapList(lookupMap, id, Collections.singleton(primaryIdentifier));
        }
    }

    /**
     * Write IdResolver contents to a flat file
     * @param f the file to write to
     * @throws IOException if fail to write
     */
    public void writeToFile(File f) throws IOException {
        LOG.info("Writing id resolver to file: " + f.getName());
        FileWriter fw = new FileWriter(f, true); // append if true
        //        FileWriter fw = new FileWriter(f);
        for (MultiKey key : orgIdMaps.keySet()) {

            // get maps for this organism
            Map<String, Set<String>> idMap = orgIdMaps.get(key);
            Map<String, Set<String>> mainIdsMap = orgIdMainMaps.get(key);
            Map<String, Set<String>> synonymMap = orgIdSynMaps.get(key);

            for (Map.Entry<String, Set<String>> idMapEntry : idMap.entrySet()) {
                StringBuffer sb = new StringBuffer();

                String primaryId = idMapEntry.getKey();

                sb.append((String) key.getKey(0) + "\t"); // write taxon id
                sb.append((String) key.getKey(1) + "\t"); // write class name
                sb.append(primaryId + "\t"); // write primary id

                if (mainIdsMap != null && mainIdsMap.containsKey(primaryId)) {
                    boolean first = true;
                    for (String mainId : mainIdsMap.get(primaryId)) {
                        if (!first) {
                            sb.append(",");
                        } else {
                            first = false;
                        }
                        sb.append(mainId);
                    }
                }

                if (synonymMap != null && synonymMap.containsKey(primaryId)) {
                    boolean first = true;
                    sb.append("\t");
                    for (String synonym : synonymMap.get(primaryId)) {
                        if (!first) {
                            sb.append(",");
                        } else {
                            first = false;
                        }
                        sb.append(synonym);
                    }
                }
                sb.append(System.getProperty("line.separator"));
                fw.write(sb.toString());
            }
        }
        fw.flush();
        fw.close();
    }

    /**
     * Read contents of an IdResolver from file, allows for caching during a build.
     * @param f the file to read from
     * @throws IOException if problem reading from file
     */
    public void populateFromFile(File f) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(f));
        String line = null;
        while ((line = reader.readLine()) != null) {
            String[] cols = line.split("\t");
            String taxonId = cols[0];
            String clsName = cols[1];
            String primaryId = cols[2];

            String mainIdsStr = cols[3];
            if (!StringUtils.isBlank(mainIdsStr)) {
                String[] mainIds = mainIdsStr.split(",");
                addEntry(taxonId, clsName, primaryId, Arrays.asList(mainIds), Boolean.TRUE);
            }

            // read synonyms if they are present
            if (cols.length >= 5) {
                String synonymsStr = cols[4];
                if (!StringUtils.isBlank(synonymsStr)) {
                    String[] synonyms = synonymsStr.split(",");
                    addEntry(taxonId, clsName, primaryId, Arrays.asList(synonyms), Boolean.FALSE);
                }
            }
        }
        reader.close();
    }

    // TODO populate part from file with given taxons and classes, what if there
    // are some data nonexists? Maybe not a good idea...

    // add a new list to a map or add elements of set to existing map entry
    private void addToMapList(Map<String, Set<String>> map, String key, Collection<String> values) {
        Set<String> set = map.get(key);
        if (set == null) {
            set = new LinkedHashSet<String>();
            map.put(key, set);
        }
        set.addAll(values);
    }
}