org.intermine.bio.dataconversion.EnsemblSnpDbConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.intermine.bio.dataconversion.EnsemblSnpDbConverter.java

Source

package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2002-2013 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.intermine.dataconversion.ItemWriter;
import org.intermine.metadata.Model;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.sql.Database;
import org.intermine.util.StringUtil;
import org.intermine.xml.full.Item;
import org.intermine.xml.full.ReferenceList;

/**
 * Read Ensembl SNP data directly from MySQL variation database.
 * Ref to Ensembl variation schema -
 * http://www.ensembl.org/info/docs/variation/variation_schema.html
 *
 * @author Richard Smith
 * @author Fengyuan Hu
 */
public class EnsemblSnpDbConverter extends BioDBConverter {
    private static final String DATASET_TITLE = "Ensembl SNP data";
    private static final String DATA_SOURCE_NAME = "Ensembl";
    // pendingSnpConsequences: key - rs number, value- a collection of consequences
    // The snp has been stored but it will appear multiple times, so hang consequences over
    private final Map<Integer, Set<String>> pendingSnpVariationIdToConsequencesMap = new HashMap<Integer, Set<String>>();

    // storedSnpIds: key - rs number, value - IM object id
    private final Map<Integer, Integer> storedSnpVariationIdToInterMineIdMap = new HashMap<Integer, Integer>();

    private Set<String> snpSourceIds = null;

    // store a mapping from variation_id in ensembl database to stored SNP item id
    private Map<Integer, String> snpVariationIdToItemIdMap = new HashMap<Integer, String>();

    // default to human or take value set by parser
    Integer taxonId = null;
    private static final int PLANT = 3702;

    // There may be SNPs from multiple sources in the database, optionally restrict them
    Set<String> snpSources = new HashSet<String>();

    // Edit to restrict to loading fewer chromosomes
    private static final int MIN_CHROMOSOME = 1;
    private Set<String> processedChrNames = new HashSet<String>();

    private Map<String, String> sources = new HashMap<String, String>();
    private Map<String, String> states = new HashMap<String, String>();
    private Map<String, String> transcripts = new HashMap<String, String>();
    private Map<String, String> consequenceTypes = new HashMap<String, String>();

    private Map<Integer, Integer> strainIds = new HashMap<Integer, Integer>();
    private Map<Integer, String> popIdentifiers = new HashMap<Integer, String>();

    private boolean isEmptyResultSet = false;

    private static final Logger LOG = Logger.getLogger(EnsemblSnpDbConverter.class);

    /**
     * Construct a new EnsemblSnpDbConverter.
     * @param database the database to read from
     * @param model the Model used by the object store we will write to with the ItemWriter
     * @param writer an ItemWriter used to handle Items created
     */
    public EnsemblSnpDbConverter(Database database, Model model, ItemWriter writer) {
        super(database, model, writer, DATA_SOURCE_NAME, DATASET_TITLE);
    }

    /**
     * Set the organism to load
     * @param taxonId the organism to load
     */
    public void setOrganism(String taxonId) {
        this.taxonId = Integer.valueOf(taxonId);
    }

    /**
     * Optionally restrict the sources of SNPs to load by entries in source table, e.g. to dbSNP.
     * @param sourceStr a space-separated list of sources
     */
    public void setSources(String sourceStr) {
        for (String source : sourceStr.split(" ")) {
            snpSources.add(source.trim());
        }
    }

    /**
     * {@inheritDoc}
     */
    public void process() throws Exception {
        // a database has been initialised from properties starting with db.ensembl-snp-db
        if (this.taxonId == null) {
            throw new IllegalArgumentException("Must supply a taxon id for this variation database"
                    + " set the 'organism' property in project.xml");
        }

        Connection connection = getDatabase().getConnection();

        List<String> chrNames = new ArrayList<String>();
        for (int i = MIN_CHROMOSOME; i <= 22; i++) {
            chrNames.add("" + i);
        }
        chrNames.add("X");
        chrNames.add("Y");
        chrNames.add("MT");
        chrNames.add("Mt");
        chrNames.add("Pt");

        for (String chrName : chrNames) {
            System.out.println("Starting to process chromosome " + chrName);
            LOG.info("Starting to process chromosome " + chrName);
            ResultSet res = queryVariation(connection, chrName);
            process(res, chrName);
            createSynonyms(connection, chrName);
        }
        storeFinalSnps();

        if (PLANT == this.taxonId.intValue()) {
            processGenotypes(connection);
            processPopulations(connection);
            processStrainPopulationReferences(connection);
        }
        connection.close();
    }

    /**
     * Store the consequences as reference list to a SNP
     * @throws Exception exception
     */
    protected void storeFinalSnps() throws Exception {
        LOG.info("storeFinalSnps() pendingSnpRsNumberToConsequencesMap.size(): "
                + pendingSnpVariationIdToConsequencesMap.size());
        LOG.info("storeFinalSnps() storedSnpRsNumberToInterMineIdMap.size(): "
                + storedSnpVariationIdToInterMineIdMap.size());
        for (Integer variationId : pendingSnpVariationIdToConsequencesMap.keySet()) {
            Integer storedSnpInterMineId = storedSnpVariationIdToInterMineIdMap.get(variationId);
            Set<String> consequenceItemIdSet = pendingSnpVariationIdToConsequencesMap.get(variationId);
            storeSnpConsequenceCollections(storedSnpInterMineId, consequenceItemIdSet);
        }
    }

    /**
     * {@inheritDoc}
     */
    public void process(ResultSet res, String chrName) throws Exception {

        if (processedChrNames.contains(chrName.toUpperCase())) {
            LOG.info("Chr " + chrName + " has been processed...");
            return;
        } else {
            processedChrNames.add(chrName.toUpperCase());
        }

        // If empty set
        if (!res.next()) {
            setEmptyResultSet(true);
            LOG.info("Empty result for chr " + chrName);
            return;
        }

        int counter = 0; // result counter
        int snpCounter = 0;
        int consequenceCounter = 0;
        Set<String> seenLocsForSnp = new HashSet<String>(); // stored locations
        Boolean previousUniqueLocation = true; // whether previous snp has an unique location
        String previousTranscriptStableId = null;
        Set<String> consequenceItemIdSet = new HashSet<String>();
        boolean storeSnp = false;
        Integer previousVariationId = null;
        Integer currentVariationId = null;
        Item currentSnpItem = null; // the snp item to be stored
        String currentSnpItemId = null;
        Map<String, Integer> nonTranscriptConsequences = new HashMap<String, Integer>();

        // This code is complicated because not all SNPs map to a unique location and often have
        // locations on multiple chromosomes - we're processing one chromosome at a time for faster
        // queries to MySQL.
        res.previous(); // roll back one position
        while (res.next()) {

            counter++;

            currentVariationId = res.getInt("variation_id");

            // a new snp can't be the previous one OR one with pending consequences
            boolean newSnp = currentVariationId.equals(previousVariationId)
                    || pendingSnpVariationIdToConsequencesMap.containsKey(currentVariationId) ? false : true;

            // For the first snp, the if condition is false
            if (pendingSnpVariationIdToConsequencesMap.containsKey(currentVariationId)) {
                previousUniqueLocation = false;
            }

            if (newSnp) {
                // starting a new SNP, store the one just finished - previousRsNumber

                {
                    Integer storedSnpInterMineId = storedSnpVariationIdToInterMineIdMap.get(previousVariationId);

                    // if we didn't get back a storedSnpId this was the first time we found this
                    // (previous) SNP, so store it now
                    // For the first snp, there is no previous one, storeSnp is false, if condition
                    // is false
                    if (storeSnp && storedSnpInterMineId == null) {
                        storedSnpInterMineId = store(currentSnpItem);
                        storedSnpVariationIdToInterMineIdMap.put(previousVariationId, storedSnpInterMineId);
                        snpCounter++;
                    }

                    if (previousUniqueLocation) {
                        // the SNP we just stored has only one location so we won't see it again
                        // For the first snp, previousUniqueLocation is true, consequenceItemIdSet is
                        // empty, nothing to store
                        storeSnpConsequenceCollections(storedSnpInterMineId, consequenceItemIdSet);
                    } else {
                        // we'll see the previous SNP multiple times so hang onto data
                        Set<String> snpConsequences = pendingSnpVariationIdToConsequencesMap
                                .get(previousVariationId);

                        if (snpConsequences == null) {
                            snpConsequences = new HashSet<String>();
                            pendingSnpVariationIdToConsequencesMap.put(previousVariationId, snpConsequences);
                            snpConsequences.addAll(consequenceItemIdSet);
                        }
                    }
                }

                // START NEW SNP
                previousVariationId = currentVariationId;
                seenLocsForSnp = new HashSet<String>();
                consequenceItemIdSet = new HashSet<String>();
                storeSnp = true;

                // map weight is the number of chromosome locations for the SNP, in practice there
                // are sometimes fewer locations than the map_weight indicates
                int mapWeight = res.getInt("map_weight");
                boolean currentUniqueLocation = (mapWeight == 1) ? true : false;
                previousUniqueLocation = currentUniqueLocation;

                // if not a unique location and we've seen the SNP before (e.g. on a
                // different chromosome), don't store, because the snp has been stored
                if (!currentUniqueLocation
                        && pendingSnpVariationIdToConsequencesMap.containsKey(currentVariationId)) {
                    storeSnp = false;
                }

                if (storeSnp) {
                    currentSnpItem = createItem("SNP");
                    currentSnpItemId = currentSnpItem.getIdentifier();

                    snpVariationIdToItemIdMap.put(currentVariationId, currentSnpItemId);

                    currentSnpItem.setAttribute("primaryIdentifier", res.getString("variation_name"));
                    currentSnpItem.setReference("organism", getOrganismItem(taxonId));
                    currentSnpItem.setAttribute("uniqueLocation", "" + currentUniqueLocation);

                    String alleles = res.getString("allele_string");
                    if (!StringUtils.isBlank(alleles)) {
                        currentSnpItem.setAttribute("alleles", alleles);
                    }

                    String type = determineType(alleles);
                    if (type != null) {
                        currentSnpItem.setAttribute("type", type);
                    }

                    // CHROMOSOME AND LOCATION
                    // if SNP is mapped to multiple locations don't set chromosome and
                    // chromosomeLocation references
                    int start = res.getInt("seq_region_start");
                    int end = res.getInt("seq_region_end");
                    int chrStrand = res.getInt("seq_region_strand");

                    int chrStart = Math.min(start, end);
                    int chrEnd = Math.max(start, end);

                    Item loc = storeLocation("" + chrStart, "" + chrEnd, "" + chrStrand, currentSnpItemId, chrName);

                    // if mapWeight is 1 there is only one chromosome location, so set shortcuts
                    if (currentUniqueLocation) {
                        currentSnpItem.setReference("chromosome", getChromosome(chrName, taxonId));
                        currentSnpItem.setReference("chromosomeLocation", loc);
                    }
                    seenLocsForSnp.add(chrName + ":" + chrStart);

                    // SOURCE
                    String source = res.getString("source_name");
                    currentSnpItem.setReference("source", getSourceIdentifier(source));

                    // VALIDATION STATES
                    String validationStatus = res.getString("validation_status");
                    List<String> validationStates = getValidationStateCollection(validationStatus);
                    if (!validationStates.isEmpty()) {
                        currentSnpItem.setCollection("validations", validationStates);
                    }
                }
            }

            currentSnpItemId = snpVariationIdToItemIdMap.get(currentVariationId);

            if (currentSnpItemId == null) {
                LOG.error("currentSNP is null. vf.variation_feature_id: " + res.getInt("variation_feature_id")
                        + " CurrentVariationId: " + currentVariationId + " previousVariationId: "
                        + previousVariationId + " storeSnp: " + storeSnp);
            } else {
                // we're on the same SNP but maybe a new location

                int start = res.getInt("seq_region_start");
                int end = res.getInt("seq_region_end");
                int strand = res.getInt("seq_region_strand");

                int chrStart = Math.min(start, end);
                int chrEnd = Math.max(start, end);

                String chrLocStr = chrName + ":" + chrStart;
                if (!seenLocsForSnp.contains(chrLocStr)) {
                    seenLocsForSnp.add(chrLocStr);
                    // if this location is on a chromosome we want, store it
                    storeLocation("" + chrStart, "" + chrEnd, "" + strand, currentSnpItemId, chrName);
                }
            }

            // CONSEQUENCE TYPES
            // for SNPs without a uniqueLocation there will be different consequences at each one.
            // some consequences will need to stored at the end
            String currentTranscriptStableId = res.getString("feature_stable_id");

            if (!StringUtils.isBlank(currentTranscriptStableId)) {
                // In Ensembl 66, there are records with same transcript different allel_string
                // | variation_feature_id | feature_stable_id | allele_string | consequence_types     |
                // |             53025155 | ENST00000465814   | A/T           | nc_transcript_variant |
                // |             53025155 | ENST00000465814   | A/C           | nc_transcript_variant |
                // |             53025155 | ENST00000465814   | A/G           | nc_transcript_variant |

                // a new consequence type should not belong to same snp and same transcript
                boolean newConsequenceType = currentTranscriptStableId.equals(previousTranscriptStableId)
                        && currentVariationId.equals(previousVariationId) ? false : true;
                if (newConsequenceType) {
                    previousTranscriptStableId = currentTranscriptStableId;
                    String type = res.getString("transcript_variation_consequence_types");
                    // Seen one example so far where consequence type is an empty string
                    if (StringUtils.isBlank(type)) {
                        type = "UNKNOWN";
                    }

                    Item consequenceItem = createItem("Consequence");
                    consequenceItem.setAttribute("description", type);
                    for (String individualType : type.split(",")) {
                        consequenceItem.addToCollection("types", getConsequenceType(individualType.trim()));
                    }
                    setAttIfValue(consequenceItem, "peptideAlleles", res.getString("pep_allele_string"));
                    setAttIfValue(consequenceItem, "siftPrediction", res.getString("sift_prediction"));
                    setAttIfValue(consequenceItem, "siftScore", res.getString("sift_score"));
                    setAttIfValue(consequenceItem, "polyphenPrediction", res.getString("polyphen_prediction"));
                    setAttIfValue(consequenceItem, "polyphenScore", res.getString("polyphen_score"));

                    consequenceItem.setReference("transcript", getTranscriptIdentifier(currentTranscriptStableId));

                    consequenceItemIdSet.add(consequenceItem.getIdentifier());
                    store(consequenceItem);
                    consequenceCounter++;
                }
            } else { // transcriptStableId is empty, log it
                // variation_feature_consequence_types is full list of consequence types which
                // should include transcript_variation_consequence_types
                String variationConsequences = res.getString("variation_feature_consequence_types");
                Integer consequenceCount = nonTranscriptConsequences.get(variationConsequences);

                if (consequenceCount == null) {
                    consequenceCount = new Integer(0);
                }

                // nonTranscriptConsequences is for log message only
                nonTranscriptConsequences.put(variationConsequences, new Integer(consequenceCount + 1));
            }

            if (counter % 100000 == 0) {
                LOG.info("Read " + counter + " rows total, stored " + snpCounter + " SNPs. for chr " + chrName);
            }
        }

        // The last record to store on the chromosome
        Integer storedSnpInterMineId;

        if (previousUniqueLocation) {
            storedSnpInterMineId = store(currentSnpItem);
            storedSnpVariationIdToInterMineIdMap.put(currentVariationId, storedSnpInterMineId);
            storeSnpConsequenceCollections(storedSnpInterMineId, consequenceItemIdSet);
        } else {
            storedSnpInterMineId = storedSnpVariationIdToInterMineIdMap.get(currentVariationId);

            if (storedSnpInterMineId == null) {
                storedSnpInterMineId = store(currentSnpItem);
                storedSnpVariationIdToInterMineIdMap.put(currentVariationId, storedSnpInterMineId);

                Set<String> snpConsequences = pendingSnpVariationIdToConsequencesMap.get(currentVariationId);
                if (snpConsequences == null) {
                    snpConsequences = new HashSet<String>();
                    pendingSnpVariationIdToConsequencesMap.put(currentVariationId, snpConsequences);
                    snpConsequences.addAll(consequenceItemIdSet);
                }
            } else {
                pendingSnpVariationIdToConsequencesMap.get(currentVariationId).addAll(consequenceItemIdSet);
            }
        }

        LOG.info("Finished " + counter + " rows total, stored " + snpCounter + " SNPs for chr " + chrName);
        LOG.info("storedSnpRsNumberToInterMineIdMap.size() = " + storedSnpVariationIdToInterMineIdMap.size());
        LOG.info("Consequence count: " + consequenceCounter);
        LOG.info("Consequence types (consequence type to count) without transcript on Chromosome " + chrName + " : "
                + nonTranscriptConsequences);
    }

    private void setAttIfValue(Item item, String attName, String attValue) {
        if (!StringUtils.isBlank(attValue)) {
            item.setAttribute(attName, attValue);
        }
    }

    private String getConsequenceType(String type) throws ObjectStoreException {
        if (!consequenceTypes.containsKey(type)) {
            Item consequenceType = createItem("ConsequenceType");
            consequenceType.setAttribute("type", type);
            store(consequenceType);
            consequenceTypes.put(type, consequenceType.getIdentifier());
        }
        return consequenceTypes.get(type);
    }

    // This has to be called after process() called for the chromosome because it needs
    // variationIdToItemIdentifier to be populated.
    private void createSynonyms(Connection connection, String chrName) throws SQLException, ObjectStoreException {
        ResultSet res = querySynonyms(connection, chrName);

        int synonymCounter = 0;
        while (res.next()) {
            Integer variationId = res.getInt("variation_id");
            String synonym = res.getString("name");

            if (!StringUtils.isBlank(synonym)) {
                synonymCounter++;
                createSynonym(snpVariationIdToItemIdMap.get(variationId), synonym, true);
            }
        }
        LOG.info("Created " + synonymCounter + " synonyms for chr " + chrName);
    }

    private void processGenotypes(Connection connection) throws Exception {
        // query for strains
        ResultSet res = queryStrains(connection);
        while (res.next()) {
            Integer strainId = res.getInt("sample_id");
            String strainName = res.getString("name");

            Item strain = createItem("Strain");
            strain.setAttribute("name", strainName);
            Integer storedStrainId = store(strain);
            strainIds.put(strainId, storedStrainId);
            LOG.warn("Read strain: " + strainId);

            // for each strain query and store genotypes
            processGenotypesForStrain(connection, strainId, strain.getIdentifier());

            //strainCounter++;
            //if (strainCounter >= 100) {
            //    break;
            //}
        }
    }

    private void processGenotypesForStrain(Connection connection, Integer strainId, String strainIdentifier)
            throws Exception {
        // One table contains SNPs and once contains bigger indels, etc.
        ResultSet res = queryGenotypesForStrainSingleBp(connection, strainId);
        createGeneotypesForStrain(res, strainId, strainIdentifier);

        res = queryGenotypesForStrainMultipleBp(connection, strainId);
        createGeneotypesForStrain(res, strainId, strainIdentifier);
    }

    private void createGeneotypesForStrain(ResultSet res, Integer strainId, String strainIdentifier)
            throws Exception {
        int snpReferenceCount = 0;
        int ignoredCount = 0;
        while (res.next()) {
            Integer variationId = res.getInt("variation_id");
            String allele1 = res.getString("allele_1");
            String allele2 = res.getString("allele_2");

            String snpItemIdentifier = snpVariationIdToItemIdMap.get(variationId);

            Item genotype = createItem("StrainGenotype");
            genotype.setAttribute("allele1", allele1);
            genotype.setAttribute("allele2", allele2);
            if (snpItemIdentifier != null) {
                genotype.setReference("snp", snpItemIdentifier);
                snpReferenceCount++;

            } else {
                ignoredCount++;
            }
            genotype.setReference("strain", strainIdentifier);

            store(genotype);
        }
        String message = "For strain " + strainId + " snp ref: " + snpReferenceCount + ", no ref: " + ignoredCount;
        LOG.info(message);
        System.out.println(message);
    }

    private void processPopulations(Connection connection) throws Exception {

        ResultSet res = queryPopulations(connection);
        while (res.next()) {
            Integer popId = res.getInt("sample_id");
            String popName = res.getString("name");
            String popDesc = res.getString("description");

            Item pop = createItem("Population");
            pop.setAttribute("name", popName);
            pop.setAttribute("description", popDesc);
            store(pop);
            popIdentifiers.put(popId, pop.getIdentifier());
            LOG.warn("Processing population: " + popId);
            System.out.println("Processing population: " + popId);

            // for each population query and store genotypes
            processAllelesForPopulation(connection, popId, pop.getIdentifier());
        }
    }

    private void processAllelesForPopulation(Connection connection, Integer popId, String popIdentifier)
            throws SQLException, ObjectStoreException {
        ResultSet res = queryAllelesForPopulation(connection, popId);

        int snpReferenceCount = 0;
        int ignoredCount = 0;
        int counter = 0;
        while (res.next()) {
            Integer variationId = res.getInt("variation_id");
            String allele = res.getString("allele");
            String frequency = res.getString("frequency");

            String snpItemIdentifier = snpVariationIdToItemIdMap.get(variationId);

            Item genotype = createItem("PopulationGenotype");
            genotype.setAttribute("allele", allele);
            genotype.setAttribute("frequency", frequency);
            if (snpItemIdentifier != null) {
                genotype.setReference("snp", snpItemIdentifier);
                snpReferenceCount++;
            } else {
                ignoredCount++;
            }
            genotype.setReference("population", popIdentifier);

            store(genotype);

            counter++;
            if (counter % 100000 == 0) {
                String message = "Read " + counter + " alleles.";
                LOG.info(message);
            }

        }
        String message = "For population " + popId + " snp ref: " + snpReferenceCount + ", no ref: " + ignoredCount;
        LOG.info(message);
    }

    private void processStrainPopulationReferences(Connection connection)
            throws SQLException, ObjectStoreException {
        ResultSet res = queryStrainPopulationReferences(connection);

        Map<Integer, List<String>> strainToPopulation = new HashMap<Integer, List<String>>();
        while (res.next()) {
            Integer strainId = res.getInt("individual_sample_id");
            Integer popId = res.getInt("population_sample_id");

            List<String> strainPopIdentifiers = strainToPopulation.get(strainId);
            if (strainPopIdentifiers == null) {
                strainPopIdentifiers = new ArrayList<String>();
                strainToPopulation.put(strainId, strainPopIdentifiers);
            }
            strainPopIdentifiers.add(popIdentifiers.get(popId));
        }
        for (Integer strainId : strainToPopulation.keySet()) {
            ReferenceList populations = new ReferenceList("populations", strainToPopulation.get(strainId));
            if (strainIds.containsKey(strainId)) {
                store(populations, strainIds.get(strainId));
            }
        }
    }

    private ResultSet queryStrainPopulationReferences(Connection connection) throws SQLException {
        String query = "SELECT individual_sample_id, population_sample_id" + " FROM individual_population";
        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private ResultSet queryGenotypesForStrainSingleBp(Connection connection, Integer strainId) throws SQLException {
        String query = "SELECT variation_id, allele_1, allele_2" + " FROM tmp_individual_genotype_single_bp"
                + " WHERE sample_id = " + strainId;
        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private ResultSet queryGenotypesForStrainMultipleBp(Connection connection, Integer strainId)
            throws SQLException {
        String query = "SELECT variation_id, allele_1, allele_2" + " FROM individual_genotype_multiple_bp"
                + " WHERE sample_id = " + strainId;
        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private ResultSet queryAllelesForPopulation(Connection connection, Integer popId) throws SQLException {
        String query = "SELECT a.variation_id, a.sample_id, ac.allele, a.frequency"
                + " FROM allele a, allele_code ac" + " WHERE a.sample_id = " + popId
                + " AND a.allele_code_id = ac.allele_code_id";
        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private void storeSnpConsequenceCollections(Integer storedSnpInterMineId, Set<String> consequenceItemIdSet)
            throws ObjectStoreException {
        if (!consequenceItemIdSet.isEmpty()) {
            ReferenceList col = new ReferenceList("consequences", new ArrayList<String>(consequenceItemIdSet));
            store(col, storedSnpInterMineId);
        }
    }

    /**
     * Given an allele string read from the database determine the type of variation, e.g. snp,
     * in-del, etc.  This is a re-implementation of code from the Ensembl perl API, see:
     * http://www.ensembl.org/info/docs/Doxygen/variation-api/
     *    classBio_1_1EnsEMBL_1_1Variation_1_1Utils_1_1Sequence.html
     * @param alleleStr the alleles to determine the type for
     * @return a variation class or null if none can be determined
     */
    protected String determineType(String alleleStr) {
        String type = null;

        final String validBases = "ATUGCYRSWKMBDHVN";

        alleleStr = alleleStr.toUpperCase();
        if (!StringUtils.isBlank(alleleStr)) {
            // snp if e.g. A/C or A|C
            if (alleleStr.matches("^[" + validBases + "]([\\/\\|\\\\][" + validBases + "])+$")) {
                type = "snp";
            } else if ("CNV".equals(alleleStr)) {
                type = alleleStr.toLowerCase();
            } else if ("CNV_PROBE".equals(alleleStr)) {
                type = "cnv probe";
            } else if ("HGMD_MUTATION".equals(alleleStr)) {
                type = alleleStr.toLowerCase();
            } else {
                String[] alleles = alleleStr.split("[\\|\\/\\\\]");

                if (alleles.length == 1) {
                    type = "het";
                } else if (alleles.length == 2) {
                    if ((StringUtils.containsOnly(alleles[0], validBases) && "-".equals(alleles[1]))
                            || (StringUtils.containsOnly(alleles[1], validBases) && "-".equals(alleles[0]))) {
                        type = "in-del";
                    } else if (containsOneOf(alleles[0], "LARGE", "INS", "DEL")
                            || containsOneOf(alleles[1], "LARGE", "INS", "DEL")) {
                        type = "named";
                    } else if ((StringUtils.containsOnly(alleles[0], validBases) && alleles[0].length() > 1)
                            || (StringUtils.containsOnly(alleles[1], validBases) && alleles[1].length() > 1)) {
                        // AA/GC 2 alleles
                        type = "substitution";
                    }
                } else if (alleles.length > 2) {
                    if (containsDigit(alleles[0])) {
                        type = "microsat";
                    } else if (anyContainChar(alleles, "-")) {
                        type = "mixed";
                    }
                }
                if (type == null) {
                    LOG.warn("Failed to work out allele type for: " + alleleStr);
                }
            }
        }

        return type;
    }

    private String getSourceIdentifier(String name) throws ObjectStoreException {
        String sourceIdentifier = sources.get(name);
        if (sourceIdentifier == null) {
            Item source = createItem("Source");
            source.setAttribute("name", name);
            store(source);
            sourceIdentifier = source.getIdentifier();
            sources.put(name, sourceIdentifier);
        }
        return sourceIdentifier;
    }

    private String getTranscriptIdentifier(String transcriptStableId) throws ObjectStoreException {
        String transcriptIdentifier = transcripts.get(transcriptStableId);
        if (transcriptIdentifier == null) {
            Item transcript = createItem("Transcript");
            transcript.setAttribute("primaryIdentifier", transcriptStableId);
            store(transcript);
            transcriptIdentifier = transcript.getIdentifier();
            transcripts.put(transcriptStableId, transcriptIdentifier);
        }
        return transcriptIdentifier;
    }

    private List<String> getValidationStateCollection(String input) throws ObjectStoreException {
        List<String> stateIdentifiers = new ArrayList<String>();
        if (!StringUtils.isBlank(input)) {
            for (String state : input.split(",")) {
                stateIdentifiers.add(getStateIdentifier(state.trim()));
            }
        }
        return stateIdentifiers;
    }

    private String getStateIdentifier(String name) throws ObjectStoreException {
        String stateIdentifier = states.get(name);
        if (stateIdentifier == null) {
            Item state = createItem("ValidationState");
            state.setAttribute("name", name);
            store(state);
            stateIdentifier = state.getIdentifier();
            states.put(name, stateIdentifier);
        }
        return stateIdentifier;
    }

    private ResultSet queryVariation(Connection connection, String chrName) throws SQLException {
        // ensembl "variation_feature" table:
        // Doc: http://www.ensembl.org/info/docs/variation/variation_schema.html#variation_feature
        // Column: consequence_type (not integrated to intermine)
        // Description: The SO accession(s) representing the 'worst' consequence(s) of the
        //              variation in a transcript or regulatory region

        // ensembl "transcript_variation" table
        // Doc:http://www.ensembl.org/info/docs/variation/variation_schema.html#transcript_variation
        // Column: consequence_types
        // Description: The consequence(s) of the variant allele on this transcript

        /*
        String query = "SELECT vf.variation_feature_id, vf.variation_name, vf.variation_id,"
        + " vf.allele_string, sr.name,"
        + " vf.map_weight, vf.seq_region_start, vf.seq_region_end, vf.seq_region_strand, "
        + " s.name,"
        + " vf.validation_status,"
        + " vf.consequence_type,"
        + " tv.cdna_start,tv.consequence_types,tv.pep_allele_string,tv.feature_stable_id,"
        + " tv.sift_prediction, tv.sift_score, tv.polyphen_prediction, tv.polyphen_score"
        //+ " tv.sift_prediction, tv.polyphen_prediction"
        + " FROM seq_region sr, source s, variation_feature vf "
        + " LEFT JOIN (transcript_variation tv)"
        + " ON (vf.variation_feature_id = tv.variation_feature_id"
        + "    AND tv.consequence_types NOT IN ('5KB_downstream_variant',"
        + "    '5KB_upstream_variant','500B_downstream_variant','2KB_upstream_variant'))"
        + " WHERE vf.seq_region_id = sr.seq_region_id"
        + " AND vf.source_id = s.source_id"
        + " AND sr.name = '" + chrName + "'"
        + " ORDER BY vf.variation_id";
        */

        /*
         * NOTE: from Ensembl 67, we start to create precompute tables for
         * variations in MySQL before hand:
         *
         * 1) CREATE TABLE mM_snp_tmp_no_order_chr_all SELECT
         * vf.variation_feature_id, vf.variation_name, vf.variation_id,
         * vf.allele_string, sr.name AS seq_region_name, vf.map_weight,
         * vf.seq_region_start, vf.seq_region_end, vf.seq_region_strand, s.name
         * AS source_name, vf.validation_status, vf.consequence_type,
         * tv.cdna_start,tv.consequence_types
         * ,tv.pep_allele_string,tv.feature_stable_id, tv.sift_prediction,
         * tv.sift_score, tv.polyphen_prediction, tv.polyphen_score FROM
         * seq_region sr, source s, variation_feature vf LEFT JOIN
         * (transcript_variation tv) ON (vf.variation_feature_id =
         * tv.variation_feature_id AND tv.consequence_types NOT IN
         * ('5KB_downstream_variant',
         * '5KB_upstream_variant','500B_downstream_variant','2KB_upstream_variant'))
         * WHERE vf.seq_region_id = sr.seq_region_id AND vf.source_id =
         * s.source_id;
         *
         * 2) CREATE TABLE mM_snp_tmp_ordered_chr_all SELECT * FROM
         * mM_snp_tmp_no_order_chr_all ORDER BY seq_region_name, variation_id;
         */

        // in mysql, string comparisons are case insensitive by default, use BINARY
        // we had a loading issue with "MT" and "Mt", a snp was created twice
        String query = "SELECT *" + " FROM mM_snp_tmp_ordered_chr_all" + " WHERE BINARY seq_region_name = '"
                + chrName + "'";

        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private ResultSet querySynonyms(Connection connection, String chrName) throws SQLException {
        String query = "SELECT vs.variation_id, vs.name"
                + " FROM variation_synonym vs, variation_feature vf, seq_region sr"
                + " WHERE vs.variation_id = vf.variation_id" + " AND vf.seq_region_id = sr.seq_region_id"
                + " AND sr.name = '" + chrName + "'" + " AND vs.source_id IN ("
                + StringUtil.join(getSnpSourceIds(connection), ",") + ")" + " ORDER BY vs.variation_id";

        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private Set<String> getSnpSourceIds(Connection connection) throws SQLException {
        if (snpSourceIds == null) {
            snpSourceIds = new HashSet<String>();
            String sql = "SELECT source_id FROM source";
            if (snpSources != null && !snpSources.isEmpty()) {
                sql += " WHERE name IN (" + makeInList(snpSources) + ")";
            }
            Statement stmt = connection.createStatement();
            ResultSet res = stmt.executeQuery(sql);
            while (res.next()) {
                snpSourceIds.add(res.getString("source_id"));
            }
            if (snpSourceIds.isEmpty()) {
                throw new RuntimeException("Failed to retrieve source_ids for dbSNP source");
            }
        }
        return snpSourceIds;
    }

    private String makeInList(Collection<String> strings) {
        Set<String> quoted = new HashSet<String>();
        for (String s : strings) {
            quoted.add("\"" + s + "\"");
        }
        return StringUtil.join(quoted, ",");
    }

    private ResultSet queryStrains(Connection connection) throws SQLException {

        String query = "SELECT s.sample_id, s.name" + " FROM sample s, individual i"
                + " WHERE i.sample_id = s.sample_id";
        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    private ResultSet queryPopulations(Connection connection) throws SQLException {

        String query = "SELECT s.sample_id, s.name, s.description" + " FROM sample s, population p"
                + " WHERE p.sample_id = s.sample_id";
        LOG.warn(query);

        Statement stmt = connection.createStatement();
        ResultSet res = stmt.executeQuery(query);
        return res;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String getDataSetTitle(int taxonId) {
        return DATASET_TITLE;
    }

    private boolean containsOneOf(String target, String... substrings) {
        for (String substring : substrings) {
            if (target.contains(substring)) {
                return true;
            }
        }
        return false;
    }

    private boolean anyContainChar(String[] targets, String substring) {
        for (String target : targets) {
            if (target.contains(substring)) {
                return true;
            }
        }
        return false;
    }

    private boolean containsDigit(String target) {
        for (int i = 0; i < target.length(); i++) {
            if (Character.isDigit(target.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * For unit test perpose, when result set is empty, set to true
     * @return isEmptyResultSet a boolean value
     */
    public boolean isEmptyResultSet() {
        return isEmptyResultSet;
    }

    /**
     * For unit test perpose, when result set is empty, set to true
     * @param isEmptyResultSet a boolean value, default value false
     */
    public void setEmptyResultSet(boolean isEmptyResultSet) {
        this.isEmptyResultSet = isEmptyResultSet;
    }

    private Item storeLocation(String chrStart, String chrEnd, String strand, String currentSnpItemId,
            String chrName) throws ObjectStoreException {
        Item loc = createItem("Location");
        loc.setAttribute("start", "" + chrStart);
        loc.setAttribute("end", "" + chrEnd);
        loc.setAttribute("strand", "" + strand);
        // The magic part, location can reference back to snp, for snp with multiple
        // locations, no need to set reference in snp
        loc.setReference("feature", currentSnpItemId);
        loc.setReference("locatedOn", getChromosome(chrName, taxonId));
        store(loc);
        return loc;
    }
}