ubic.gemma.loader.expression.geo.GeoConverterImpl.java Source code

Introduction

Here is the source code for ubic.gemma.loader.expression.geo.GeoConverterImpl.java
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.expression.geo;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;

import ubic.basecode.io.ByteArrayConverter;
import ubic.gemma.genome.taxon.service.TaxonService;
import ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingServiceImpl;
import ubic.gemma.loader.expression.geo.model.GeoChannel;
import ubic.gemma.loader.expression.geo.model.GeoContact;
import ubic.gemma.loader.expression.geo.model.GeoData;
import ubic.gemma.loader.expression.geo.model.GeoDataset;
import ubic.gemma.loader.expression.geo.model.GeoDataset.ExperimentType;
import ubic.gemma.loader.expression.geo.model.GeoDataset.PlatformType;
import ubic.gemma.loader.expression.geo.model.GeoPlatform;
import ubic.gemma.loader.expression.geo.model.GeoReplication;
import ubic.gemma.loader.expression.geo.model.GeoReplication.ReplicationType;
import ubic.gemma.loader.expression.geo.model.GeoSample;
import ubic.gemma.loader.expression.geo.model.GeoSeries;
import ubic.gemma.loader.expression.geo.model.GeoSubset;
import ubic.gemma.loader.expression.geo.model.GeoValues;
import ubic.gemma.loader.expression.geo.model.GeoVariable;
import ubic.gemma.loader.expression.geo.model.GeoVariable.VariableType;
import ubic.gemma.loader.expression.geo.util.GeoConstants;
import ubic.gemma.loader.util.parser.ExternalDatabaseUtils;
import ubic.gemma.model.association.GOEvidenceCode;
import ubic.gemma.model.common.auditAndSecurity.Contact;
import ubic.gemma.model.common.auditAndSecurity.Person;
import ubic.gemma.model.common.description.BibliographicReference;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.DatabaseType;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.common.description.ExternalDatabaseService;
import ubic.gemma.model.common.description.LocalFile;
import ubic.gemma.model.common.description.VocabCharacteristic;
import ubic.gemma.model.common.quantitationtype.PrimitiveType;
import ubic.gemma.model.common.quantitationtype.QuantitationType;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.TechnologyType;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.bioAssayData.BioAssayDimension;
import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector;
import ubic.gemma.model.expression.biomaterial.BioMaterial;
import ubic.gemma.model.expression.biomaterial.Treatment;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.ExperimentalDesign;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.FactorType;
import ubic.gemma.model.expression.experiment.FactorValue;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.PolymerType;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.ontology.providers.MgedOntologyService;
import ubic.gemma.util.ConfigUtils;

/**
 * Convert GEO domain objects into Gemma objects. Usually we trigger this by passing in GeoSeries objects.
 * <p>
 * GEO has four basic kinds of objects: Platforms (ArrayDesigns), Samples (BioAssays), Series (Experiments) and DataSets
 * (which are curated Experiments). Note that a sample can belong to more than one series. A series can include more
 * than one dataset. GEO also supports the concept of a superseries. See
 * http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html.
 * <p>
 * A curated expression data set is at first represented by a GEO "GDS" number (a curated dataset), which maps to a
 * series (GSE). HOWEVER, multiple datasets may go together to form a series (GSE). This can happen when the "A" and "B"
 * arrays were both run on the same samples. Thus we actually normally go by GSE.
 * <p>
 * This service can be used in database-aware or unaware states. However, it has prototype scope as it has some 'global'
 * data structures used during processing.
 * 
 * @author keshav
 * @author pavlidis
 * @version $Id: GeoConverterImpl.java,v 1.26 2013/05/02 16:43:04 paul Exp $
 */
@Component
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class GeoConverterImpl implements GeoConverter {

    private static final int DEFAULT_DEFINITION_OF_TOO_MANY_ELEMENTS = 100000;

    /**
     * This string is inserted into the descriptions of constructed biomaterials.
     */
    private static final String BIOMATERIAL_DESCRIPTION_PREFIX = "BioMat:";

    /**
     * This string is inserted into the names of constructed biomaterials, so you get names like GSE5929_BioMat_58.
     */
    private static final String BIOMATERIAL_NAME_TAG = "_Biomat_";

    /**
     * How often we tell the user about data processing (items per update)
     */
    private static final int LOGGING_VECTOR_COUNT_UPDATE = 2000;

    private static Log log = LogFactory.getLog(ArrayDesignSequenceProcessingServiceImpl.class.getName());

    /**
     * Initial guess at how many designelementdatavectors to allocate space for.
     */
    private static final int INITIAL_VECTOR_CAPACITY = 10000;

    @Autowired
    private ExternalDatabaseService externalDatabaseService;

    @Autowired
    private TaxonService taxonService;

    private ByteArrayConverter byteArrayConverter = new ByteArrayConverter();

    private ExternalDatabase geoDatabase;

    private Map<String, Map<String, CompositeSequence>> platformDesignElementMap = new HashMap<String, Map<String, CompositeSequence>>();

    private Map<String, Taxon> taxonScientificNameMap = new HashMap<String, Taxon>();

    private Map<String, Taxon> taxonAbbreviationMap = new HashMap<String, Taxon>();

    private Collection<Object> results = new HashSet<Object>();

    private Map<String, ArrayDesign> seenPlatforms = new HashMap<String, ArrayDesign>();

    private ExternalDatabase genbank;

    /**
     * `
     */
    private boolean splitByPlatform = false;

    /**
     * The scientific name used for rat species. FIXME this should be updated elsewhere; avoid this hardcoding.
     */
    private static final String RAT = "Rattus norvegicus";

    /**
     * More than this and we apply stricter selection criteria for choosing elements to keep on a platform.
     */
    int tooManyElements = ConfigUtils.getInt("geo.platform.import.maxelements",
            DEFAULT_DEFINITION_OF_TOO_MANY_ELEMENTS);

    private boolean forceConvertElements = false;

    private static Map<String, String> organismDatabases = new HashMap<String, String>();

    static {
        organismDatabases.put("Saccharomyces cerevisiae", "SGD");
        organismDatabases.put("Schizosaccharomyces pombe", "GeneDB");
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.expression.geo.GeoConverter#clear()
     */
    @Override
    public void clear() {
        results = new HashSet<Object>();
        seenPlatforms = new HashMap<String, ArrayDesign>();
        platformDesignElementMap = new HashMap<String, Map<String, CompositeSequence>>();
        taxonAbbreviationMap.clear();
        taxonScientificNameMap.clear();
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.expression.geo.GeoConverter#convert(java.util.Collection)
     */
    @Override
    public Collection<Object> convert(Collection<? extends GeoData> geoObjects) {
        for (Object geoObject : geoObjects) {
            Object convertedObject = convert((GeoData) geoObject);
            if (convertedObject != null) {
                if (convertedObject instanceof Collection) {
                    results.addAll((Collection<?>) convertedObject);
                } else {
                    results.add(convertedObject);
                }
            }
        }

        log.info("Converted object tally:\n" + this);

        // log.debug( "Detailed object tree:" );
        // log.debug( PrettyPrinter.print( results ) );

        return results;
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.expression.geo.GeoConverter#convert(java.lang.Object)
     */
    @Override
    @SuppressWarnings("unchecked")
    public Object convert(GeoData geoObject) {
        if (geoObject == null) {
            log.warn("Null object");
            return null;
        }
        if (geoObject instanceof Collection) {
            return convert((Collection<GeoData>) geoObject);
        } else if (geoObject instanceof GeoDataset) {
            return convertDataset((GeoDataset) geoObject);
        } else if (geoObject instanceof GeoSeries) { // typically we start here, with a series.
            return convertSeries((GeoSeries) geoObject);
        } else if (geoObject instanceof GeoSubset) {
            throw new IllegalArgumentException(
                    "Can't deal with " + geoObject.getClass().getName() + " ('" + geoObject + "')");
        } else if (geoObject instanceof GeoSample) {
            throw new IllegalArgumentException(
                    "Can't deal with " + geoObject.getClass().getName() + " ('" + geoObject + "')");
        } else if (geoObject instanceof GeoPlatform) {
            return convertPlatform((GeoPlatform) geoObject);
        } else {
            throw new IllegalArgumentException(
                    "Can't deal with " + geoObject.getClass().getName() + " ('" + geoObject + "')");
        }

    }

    /**
     * Convert a vector of strings into a byte[] for saving in the database. . Blanks(missing values) are treated as NAN
     * (double), 0 (integer), false (booleans) or just empty strings (strings). Other invalid values are treated the
     * same way as missing data (to keep the parser from failing when dealing with strange GEO files that have values
     * like "Error" for an expression value).
     * 
     * @param vector of Strings to be converted to primitive values (double, int etc)
     * @param qt The quantitation type for the values to be converted.
     * @return
     */
    @Override
    public byte[] convertData(List<Object> vector, QuantitationType qt) {

        if (vector == null || vector.size() == 0)
            return null;

        boolean containsAtLeastOneNonNull = false;
        for (Object string : vector) {
            if (string != null) {
                containsAtLeastOneNonNull = true;
                break;
            }
        }

        if (!containsAtLeastOneNonNull) {
            if (log.isDebugEnabled()) {
                log.debug("No data for " + qt + " in vector of length " + vector.size());
            }
            return null;
        }

        List<Object> toConvert = new ArrayList<Object>();
        PrimitiveType pt = qt.getRepresentation();
        int numMissing = 0;
        for (Object rawValue : vector) {
            if (rawValue == null) {
                numMissing++;
                handleMissing(toConvert, pt);
            } else if (rawValue instanceof String) { // needs to be coverted.
                String valueString = (String) rawValue;
                if (StringUtils.isBlank(valueString)) {
                    numMissing++;
                    handleMissing(toConvert, pt);
                    continue;
                }
                try {
                    if (pt.equals(PrimitiveType.DOUBLE)) {
                        toConvert.add(Double.parseDouble(valueString));
                    } else if (pt.equals(PrimitiveType.STRING)) {
                        toConvert.add(rawValue);
                    } else if (pt.equals(PrimitiveType.CHAR)) {
                        if (valueString.length() != 1) {
                            throw new IllegalStateException(
                                    "Attempt to cast a string of length " + valueString.length() + " to a char: "
                                            + rawValue + "(quantitation type =" + qt);
                        }
                        toConvert.add(valueString.toCharArray()[0]);
                    } else if (pt.equals(PrimitiveType.INT)) {
                        toConvert.add(Integer.parseInt(valueString));
                    } else if (pt.equals(PrimitiveType.BOOLEAN)) {
                        toConvert.add(Boolean.parseBoolean(valueString));
                    } else {
                        throw new UnsupportedOperationException("Data vectors of type " + pt + " not supported");
                    }
                } catch (NumberFormatException e) {
                    numMissing++;
                    handleMissing(toConvert, pt);
                }
            } else { // use as is.
                toConvert.add(rawValue);
            }
        }

        if (numMissing == vector.size()) {
            return null;
        }

        byte[] bytes = byteArrayConverter.toBytes(toConvert.toArray());

        /*
         * Debugging - absolutely make sure we can convert the data back.
         */
        if (pt.equals(PrimitiveType.DOUBLE)) {
            double[] byteArrayToDoubles = byteArrayConverter.byteArrayToDoubles(bytes);
            if (byteArrayToDoubles.length != vector.size()) {
                throw new IllegalStateException(
                        "Expected " + vector.size() + " got " + byteArrayToDoubles.length + " doubles");
            }
        } else if (pt.equals(PrimitiveType.INT)) {
            int[] byteArrayToInts = byteArrayConverter.byteArrayToInts(bytes);
            if (byteArrayToInts.length != vector.size()) {
                throw new IllegalStateException(
                        "Expected " + vector.size() + " got " + byteArrayToInts.length + " ints");
            }
        } else if (pt.equals(PrimitiveType.BOOLEAN)) {
            boolean[] byteArrayToBooleans = byteArrayConverter.byteArrayToBooleans(bytes);
            if (byteArrayToBooleans.length != vector.size()) {
                throw new IllegalStateException(
                        "Expected " + vector.size() + " got " + byteArrayToBooleans.length + " booleans");
            }
        }

        return bytes;
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.expression.geo.GeoConverter#convertSubsetToExperimentalFactor(ubic.gemma.model.expression.
     * experiment.ExpressionExperiment, ubic.gemma.loader.expression.geo.model.GeoSubset)
     */
    @Override
    public void convertSubsetToExperimentalFactor(ExpressionExperiment expExp, GeoSubset geoSubSet) {

        ExperimentalDesign experimentalDesign = expExp.getExperimentalDesign();
        Collection<ExperimentalFactor> existingExperimentalFactors = experimentalDesign.getExperimentalFactors();

        ExperimentalFactor experimentalFactor = ExperimentalFactor.Factory.newInstance();
        experimentalFactor.setName(geoSubSet.getType().toString());
        VocabCharacteristic term = convertVariableType(geoSubSet.getType());
        term.setDescription("Converted from GEO subset " + geoSubSet.getGeoAccession());
        term.setValue(term.getCategory());

        term.setValueUri(term.getCategoryUri());

        experimentalFactor.setCategory(term);
        experimentalFactor.setType(FactorType.CATEGORICAL);
        experimentalFactor.setDescription("Converted from GEO subset " + geoSubSet.getGeoAccession());

        boolean duplicateExists = false;
        for (ExperimentalFactor existingExperimentalFactor : existingExperimentalFactors) {
            if ((experimentalFactor.getName()).equalsIgnoreCase(existingExperimentalFactor.getName())) {
                duplicateExists = true;
                experimentalFactor = existingExperimentalFactor;
                if (log.isDebugEnabled())
                    log.debug(experimentalFactor.getName()
                            + " already exists.  Not adding to list of experimental factors.");
                break;
            }
        }

        if (!duplicateExists) {
            experimentalDesign.getExperimentalFactors().add(experimentalFactor);
        }

        /* bi-directional ... don't forget this. */
        experimentalFactor.setExperimentalDesign(experimentalDesign);

        FactorValue factorValue = convertSubsetDescriptionToFactorValue(geoSubSet, experimentalFactor);
        addFactorValueToBioMaterial(expExp, geoSubSet, factorValue);
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.expression.geo.GeoConverter#convertSupplementaryFileToLocalFile(java.lang.Object)
     */
    @Override
    public LocalFile convertSupplementaryFileToLocalFile(Object object) {

        URL remoteFileUrl = null;
        LocalFile remoteFile = null;

        if (object instanceof GeoSeries) {
            GeoSeries series = (GeoSeries) object;
            String file = series.getSupplementaryFile();
            if (!StringUtils.isEmpty(file) && !StringUtils.equalsIgnoreCase(file, "NONE")) {
                try {
                    remoteFile = LocalFile.Factory.newInstance();
                    remoteFileUrl = new URL(file);
                } catch (MalformedURLException e) {
                    reportUrlError(remoteFileUrl, e);
                }
            }
        }

        else if (object instanceof GeoSample) {
            GeoSample sample = (GeoSample) object;
            String file = sample.getSupplementaryFile();
            if (!StringUtils.isEmpty(file) && !StringUtils.equalsIgnoreCase(file, "NONE")) {
                try {
                    remoteFile = LocalFile.Factory.newInstance();
                    remoteFileUrl = new URL(file);
                } catch (MalformedURLException e) {
                    reportUrlError(remoteFileUrl, e);
                }
            }
        }

        else if (object instanceof GeoPlatform) {
            GeoPlatform platform = (GeoPlatform) object;
            String file = platform.getSupplementaryFile();
            if (!StringUtils.isEmpty(file) && !StringUtils.equalsIgnoreCase(file, "NONE")) {
                try {
                    remoteFile = LocalFile.Factory.newInstance();
                    remoteFileUrl = new URL(file);
                } catch (MalformedURLException e) {
                    reportUrlError(remoteFileUrl, e);
                }
            }
        }

        /* nulls allowed in remoteFile ... deal with later. */
        if (remoteFile != null)
            remoteFile.setRemoteURL(remoteFileUrl);

        return remoteFile;
    }

    /**
     * This method determines the primary taxon on the array: There are 4 main branches of logic. 1.First it checks if
     * there is only one platform taxon defined on the GEO submission: If there is that is the primary taxon. 2.If
     * multiple taxa are given for the platform then the taxa are checked to see if they share a common parent if so
     * that is the primary taxon e.g. salmonid where atlantic salmon and rainbow trout are given. 3.Finally the
     * probeTaxa are looked at and the most common probe taxa is calculated as the primary taxon 4. No taxon found
     * throws an error
     * 
     * @param platformTaxa Collection of taxa that were given on the GEO array submission as platform taxa
     * @param probeTaxa Collection of taxa strings defining the taxon of each probe on the array.
     * @return Primary taxon of array as determined by this method
     * @exception Thrown if no primary taxon can be determined for array.
     */
    @Override
    public Taxon getPrimaryArrayTaxon(Collection<Taxon> platformTaxa, Collection<String> probeTaxa)
            throws IllegalArgumentException {

        if (platformTaxa == null || platformTaxa.isEmpty()) {
            return null;
        }

        // if there is only 1 taxon on the platform submission then this is the primary taxon
        if (platformTaxa.size() == 1) {
            log.debug("Only 1 taxon given on GEO platform: " + platformTaxa.iterator().next());
            return platformTaxa.iterator().next();
        }

        // If there are multiple taxa on array
        else if (platformTaxa.size() > 1) {
            log.debug(platformTaxa.size() + " taxa in GEO platform");
            // check if they share a common parent taxon to use as primary taxa.
            Collection<Taxon> parentTaxa = new HashSet<Taxon>();
            for (Taxon platformTaxon : platformTaxa) {
                // thaw to get parent taxon
                this.taxonService.thaw(platformTaxon);
                Taxon platformParentTaxon = platformTaxon.getParentTaxon();
                parentTaxa.add(platformParentTaxon);
            }
            // check now if we only have one parent taxon and check if not null, if a null then there was a taxon with
            // no
            // parent
            if (!(parentTaxa.contains(null)) && parentTaxa.size() == 1) {
                log.debug("Parent taxon found " + parentTaxa);
                return parentTaxa.iterator().next();
            }
            // No common parent then calculate based on probe taxa:

            log.debug("Looking at probe taxa to determine 'primary' taxon");
            // create a hashmap keyed on taxon with a counter to count the number of probes for that taxon.
            Map<String, Integer> taxonProbeNumberList = new HashMap<String, Integer>();

            if (probeTaxa != null) {
                for (String probeTaxon : probeTaxa) {
                    // reset each iteration so if no probes already processed set to 1
                    Integer counter = 1;
                    if (taxonProbeNumberList.containsKey(probeTaxon)) {
                        counter = taxonProbeNumberList.get(probeTaxon) + 1;
                        taxonProbeNumberList.put(probeTaxon, counter);
                    }
                    taxonProbeNumberList.put(probeTaxon, counter);
                }
            }

            String primaryTaxonName = "";
            Integer highestScore = 0;
            for (String taxon : taxonProbeNumberList.keySet()) {
                // filter out those probes that have no taxon set control spots. Here's that 'n/a' again, kind of
                // ugly but we see it in some arrays
                if (!taxon.equals("n/a") && StringUtils.isNotBlank(taxon)
                        && taxonProbeNumberList.get(taxon) > highestScore) {
                    primaryTaxonName = taxon;
                    highestScore = taxonProbeNumberList.get(taxon);
                }
            }
            if (StringUtils.isNotBlank(primaryTaxonName)) {
                return this.convertProbeOrganism(primaryTaxonName);
            }

        }
        // error no taxon on array submission

        throw new IllegalArgumentException("No taxon could be determined for GEO platform ");

    }

    /*
     * This is really only here for tests, at the moment.
     */
    @Override
    public void setElementLimitForStrictness(int tooManyElements) {
        this.tooManyElements = tooManyElements;
    }

    @Override
    public void setForceConvertElements(boolean forceConvertElements) {
        this.forceConvertElements = forceConvertElements;
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.expression.geo.GeoConverter#setSplitByPlatform(boolean)
     */
    @Override
    public void setSplitByPlatform(boolean splitByPlatform) {
        this.splitByPlatform = splitByPlatform;
    }

    @Override
    public String toString() {
        StringBuilder buf = new StringBuilder();
        Map<String, Integer> tally = new HashMap<String, Integer>();
        for (Object element : results) {
            String clazz = element.getClass().getName();
            if (!tally.containsKey(clazz)) {
                tally.put(clazz, new Integer(0));
            }
            tally.put(clazz, new Integer((tally.get(clazz)).intValue() + 1));
        }
        for (String clazz : tally.keySet()) {
            buf.append(tally.get(clazz) + " " + clazz + "s\n");
        }

        return buf.toString();
    }

    /**
     * @param expExp
     * @param geoSubSet
     * @param factorValue
     */
    private void addFactorValueToBioMaterial(ExpressionExperiment expExp, GeoSubset geoSubSet,
            FactorValue factorValue) {
        // fill in biomaterial-->factorvalue.
        for (GeoSample sample : geoSubSet.getSamples()) {

            // find the matching biomaterial(s) in the expression experiment.
            for (BioAssay bioAssay : expExp.getBioAssays()) {
                if (bioAssay.getAccession().getAccession().equals(sample.getGeoAccession())) {
                    BioMaterial material = bioAssay.getSampleUsed();
                    if (log.isDebugEnabled()) {
                        log.debug("Adding " + factorValue.getExperimentalFactor() + " : " + factorValue + " to "
                                + material);
                    }
                    material.getFactorValues().add(factorValue);
                }

            }

        }
    }

    /**
     * @param bioMaterial
     * @param experimentalFactor
     * @return true if the biomaterial already has a factorvalue for the given experimentalFactor; false otherwise.
     */
    private boolean alreadyHasFactorValueForFactor(BioMaterial bioMaterial, ExperimentalFactor experimentalFactor) {
        for (FactorValue fv : bioMaterial.getFactorValues()) {
            ExperimentalFactor existingEf = fv.getExperimentalFactor();
            // This is a weak form of 'equals' - we just check the name.
            if (existingEf.getName().equals(experimentalFactor.getName())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Flag as unneeded data that are not from experiments types that we support, such as ChIP.
     * 
     * @param series
     * @param dataSetsToSkip
     * @param samplesToSkip
     */
    private void checkForDataToSkip(GeoSeries series, Collection<String> dataSetsToSkip,
            Collection<GeoSample> samplesToSkip) {

        for (GeoDataset dataset : series.getDatasets()) {
            // This doesn't cover every possibility...
            if (dataset.getExperimentType() == ExperimentType.arrayCGH
                    || dataset.getExperimentType() == ExperimentType.ChIPChip
                    || dataset.getExperimentType() == ExperimentType.geneExpressionSAGEbased) {
                log.warn("Gemma does not know how to handle " + dataset.getExperimentType());

                if (series.getDatasets().size() == 1) {
                    log.warn("Because the experiment type cannot be handled, "
                            + "and there is only one data set in this series, nothing will be returned!");
                }
                samplesToSkip.addAll(this.getDatasetSamples(dataset));
                dataSetsToSkip.add(dataset.getGeoAccession());
            } else {
                log.info("Data from " + dataset + " is of type " + dataset.getExperimentType() + ", "
                        + getDatasetSamples(dataset).size() + " samples.");
            }
        }

        // /*
        // * Skip samples that are for a nonsupported taxon, when there is more than one taxon. We would still have to
        // * split series if there are two supported taxa.
        // */
        // Collection<String> seriesTaxa = new HashSet<String>();
        // for ( GeoSample sample : series.getSamples() ) {
        // String o = sample.getOrganism();
        // seriesTaxa.add( o );
        // }
        //
        // if ( seriesTaxa.size() > 1 ) {
        // for ( GeoSample sample : series.getSamples() ) {
        // String o = sample.getOrganism();
        // if ( taxonService.findByScientificName( o ) == null ) {
        // log.info( "Skipping sample " + sample + " that has taxon=" + o );
        // samplesToSkip.add( sample );
        // }
        //
        // /*
        // * TODO Might rarely need to worry about removing all samples from a data set.
        // */
        // }
        //
        // }

    }

    /**
     * Used for the case where we want to split the GSE into two (or more) separate ExpressionExperiments based on
     * platform. This is necessary when the two platforms are completely incompatible.
     * 
     * @param series
     * @param converted
     * @param platformDatasetMap
     * @param i
     * @param platform
     */
    private void convertByPlatform(GeoSeries series, Collection<ExpressionExperiment> converted,
            Map<GeoPlatform, Collection<GeoData>> platformDatasetMap, int i, GeoPlatform platform) {
        GeoSeries platformSpecific = new GeoSeries();

        Collection<GeoData> datasets = platformDatasetMap.get(platform);
        assert datasets.size() > 0;

        for (GeoSample sample : series.getSamples()) {
            // ugly, we have to assume there is only one platform per sampl.
            if (sample.getPlatforms().iterator().next().equals(platform)) {
                platformSpecific.addSample(sample);
            }
        }

        // strip out samples that aren't from this platform.
        for (GeoData dataset : datasets) {
            if (dataset instanceof GeoDataset) {
                ((GeoDataset) dataset).dissociateFromSeries(series);
                platformSpecific.addDataSet((GeoDataset) dataset);
            }
        }

        /*
         * Basically copy over most of the information
         */
        platformSpecific.setContact(series.getContact());
        platformSpecific.setContributers(series.getContributers());
        platformSpecific.setGeoAccession(series.getGeoAccession() + "." + i);
        platformSpecific.setKeyWords(series.getKeyWords());
        platformSpecific.setOverallDesign(series.getOverallDesign());
        platformSpecific.setPubmedIds(series.getPubmedIds());
        platformSpecific.setReplicates(series.getReplicates());
        platformSpecific.setSampleCorrespondence(series.getSampleCorrespondence());
        platformSpecific.setSummaries(series.getSummaries());
        platformSpecific.setTitle(series.getTitle() + " - " + platform.getGeoAccession());
        platformSpecific.setWebLinks(series.getWebLinks());
        platformSpecific.setValues(series.getValues());

        converted.add(convertSeries(platformSpecific, null));

    }

    /**
     * GEO does not keep track of 'biomaterials' that make up different channels. Therefore the two channels effectively
     * make up a single biomaterial, as far as we're concerned. We're losing information here.
     * 
     * @param sample
     * @param channel
     * @return
     */
    private BioMaterial convertChannel(GeoSample sample, GeoChannel channel, BioMaterial bioMaterial) {
        if (bioMaterial == null)
            return null;
        log.debug("Sample: " + sample.getGeoAccession() + " - Converting channel " + channel.getSourceName());

        bioMaterial.setDescription((bioMaterial.getDescription() == null ? "" : bioMaterial.getDescription() + ";")
                + "Channel " + channel.getChannelNumber());

        if (!StringUtils.isBlank(channel.getGrowthProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " treatment");
            treatment.setDescription(channel.getGrowthProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        if (!StringUtils.isBlank(channel.getTreatmentProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " growth");
            treatment.setDescription(channel.getTreatmentProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        if (!StringUtils.isBlank(channel.getExtractProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " extraction");
            treatment.setDescription(channel.getExtractProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        if (!StringUtils.isBlank(channel.getLabelProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " labeling");
            treatment.setDescription(channel.getLabelProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        for (String characteristic : channel.getCharacteristics()) {

            characteristic = trimString(characteristic);

            /*
             * Sometimes values are like Age:8 weeks, so we can try to convert them.
             */
            String[] fields = characteristic.split(":");
            String defaultDescription = "GEO Sample characteristic";
            if (fields.length == 2) {

                String category = fields[0].trim();
                String value = fields[1].trim();

                try {
                    VocabCharacteristic gemmaChar = convertVariableType(GeoVariable.convertStringToType(category));
                    gemmaChar.setDescription(defaultDescription);
                    gemmaChar.setValue(value);
                    gemmaChar.setEvidenceCode(GOEvidenceCode.IIA);
                    bioMaterial.getCharacteristics().add(gemmaChar);
                } catch (Exception e) {
                    // conversion didn't work, fall back.
                    Characteristic gemmaChar = Characteristic.Factory.newInstance();
                    gemmaChar.setValue(characteristic);
                    gemmaChar.setDescription(defaultDescription);
                    gemmaChar.setEvidenceCode(GOEvidenceCode.IIA);
                    bioMaterial.getCharacteristics().add(gemmaChar);
                }

            } else {
                // no colon, just use raw (same as fallback above)
                Characteristic gemmaChar = Characteristic.Factory.newInstance();
                gemmaChar.setValue(characteristic);
                gemmaChar.setDescription(defaultDescription);
                gemmaChar.setEvidenceCode(GOEvidenceCode.IIA);
                bioMaterial.getCharacteristics().add(gemmaChar);
            }

        }

        if (StringUtils.isNotBlank(channel.getSourceName())) {
            VocabCharacteristic sourceChar = VocabCharacteristic.Factory.newInstance();
            sourceChar.setDescription("GEO Sample source");
            String characteristic = trimString(channel.getSourceName());
            sourceChar.setCategory("BioSource");
            sourceChar.setCategoryUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#BioSource");
            sourceChar.setValue(characteristic);
            sourceChar.setEvidenceCode(GOEvidenceCode.IIA);
            bioMaterial.getCharacteristics().add(sourceChar);
        }

        if (StringUtils.isNotBlank(channel.getOrganism())) {
            // if we have a case where the two channels have different taxon throw an exception.
            String currentChannelTaxon = channel.getOrganism();
            if (bioMaterial.getSourceTaxon() != null) {
                String previousChannelTaxon = bioMaterial.getSourceTaxon().getScientificName();
                if (previousChannelTaxon != null && !(previousChannelTaxon.equals(currentChannelTaxon))) {
                    throw new IllegalArgumentException(
                            "Channel 1 taxon is " + bioMaterial.getSourceTaxon().getScientificName()
                                    + " Channel 2 taxon is " + currentChannelTaxon
                                    + " Check that is expected for sample " + sample.getGeoAccession());
                }

            } else {
                // get it from the channel.
                Taxon taxon = Taxon.Factory.newInstance();
                taxon.setIsSpecies(true);
                taxon.setScientificName(channel.getOrganism());
                taxon.setIsGenesUsable(true); // plausible default, doesn't matter.
                bioMaterial.setSourceTaxon(taxon);
            }

        }

        if (channel.getMolecule() != null) {
            // this we can convert automatically pretty easily.
            Characteristic c = channel.getMoleculeAsCharacteristic();
            bioMaterial.getCharacteristics().add(c);
        }

        if (StringUtils.isNotBlank(channel.getLabel())) {
            String characteristic = trimString(channel.getLabel());
            // This is typically something like "biotin-labeled nucleotides", which we can convert later.
            VocabCharacteristic labelChar = VocabCharacteristic.Factory.newInstance();
            labelChar.setDescription("GEO Sample label");
            labelChar.setCategory("LabelCompound");
            labelChar.setCategoryUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#LabelCompound");
            labelChar.setValue(characteristic);
            labelChar.setEvidenceCode(GOEvidenceCode.IIA);
            bioMaterial.getCharacteristics().add(labelChar);
        }

        return bioMaterial;
    }

    /**
     * @param contact
     * @return
     */
    private Person convertContact(GeoContact contact) {
        Person result = Person.Factory.newInstance();

        /*
         * Note: removed address conversion. We don't normally get that info from GEO nor do we need it.
         */

        result.setPhone(contact.getPhone());
        result.setName(contact.getName());
        result.setEmail(contact.getEmail());
        result.setFax(contact.getFax());
        result.setURL(contact.getWebLink());

        return result;
    }

    /**
     * Take contact and contributer information from a GeoSeries and put it in the ExpressionExperiment.
     * 
     * @param series
     * @param expExp
     */
    private void convertContacts(GeoSeries series, ExpressionExperiment expExp) {
        expExp.getInvestigators().add(convertContact(series.getContact()));
        if (series.getContributers().size() > 0) {
            expExp.setDescription(expExp.getDescription() + "\nContributers: ");
            for (GeoContact contributer : series.getContributers()) {
                expExp.setDescription(expExp.getDescription() + " " + contributer.getName());
                expExp.getInvestigators().add(convertContact(contributer));
            }
            expExp.setDescription(expExp.getDescription() + "\n");
        }
    }

    /**
     * Often-needed generation of a valid databaseentry object.
     * 
     * @param geoData
     * @return
     */
    private DatabaseEntry convertDatabaseEntry(GeoData geoData) {
        DatabaseEntry result = DatabaseEntry.Factory.newInstance();

        initGeoExternalDatabase();

        result.setExternalDatabase(this.geoDatabase);

        // remove trailing ".1" etc. in case it was split.
        result.setAccession(geoData.getGeoAccession().replaceAll("\\.[0-9]+$", ""));

        return result;
    }

    /**
     * @param geoDataset
     */
    private ExpressionExperiment convertDataset(GeoDataset geoDataset) {

        if (geoDataset.getSeries().size() == 0) {
            throw new IllegalArgumentException("GEO Dataset must have associated series");
        }

        if (geoDataset.getSeries().size() > 1) {
            throw new UnsupportedOperationException("GEO Dataset can only be associated with one series");
        }

        Collection<ExpressionExperiment> seriesResults = this
                .convertSeries(geoDataset.getSeries().iterator().next());
        assert seriesResults.size() == 1; // unless we have multiple species, not possible.
        return seriesResults.iterator().next();
    }

    /**
     * @param dataset
     * @param expExp
     */
    private ExpressionExperiment convertDataset(GeoDataset geoDataset, ExpressionExperiment expExp) {

        /*
         * First figure out of there are any samples for this data set. It could be that they were duplicates of ones
         * found in other series, so were skipped. See GeoService
         */
        if (this.getDatasetSamples(geoDataset).size() == 0) {
            log.info("No samples remain for " + geoDataset + ", nothing to do");
            return expExp;
        }

        log.info("Converting dataset:" + geoDataset);

        convertDatasetDescriptions(geoDataset, expExp);

        GeoPlatform platform = geoDataset.getPlatform();
        ArrayDesign ad = seenPlatforms.get(platform.getGeoAccession());
        if (ad == null) {
            /*
             * See bug 1672. Sometimes the platform for the dataset is wrong so we should just go on. The exception was
             * otherwise catching a case we don't see under normal use.
             */
            throw new IllegalStateException("ArrayDesigns must be converted before datasets - didn't find "
                    + geoDataset.getPlatform() + "; possibly dataset has incorrect platform?");
        }
        ad.setDescription(ad.getDescription() + "\nFrom " + platform.getGeoAccession() + "\nLast Updated: "
                + platform.getLastUpdateDate());

        LocalFile arrayDesignRawFile = convertSupplementaryFileToLocalFile(platform);
        if (arrayDesignRawFile != null) {
            Collection<LocalFile> arrayDesignLocalFiles = ad.getLocalFiles();
            if (arrayDesignLocalFiles == null) {
                arrayDesignLocalFiles = new HashSet<LocalFile>();
            }
            arrayDesignLocalFiles.add(arrayDesignRawFile);
            ad.setLocalFiles(arrayDesignLocalFiles);
        }

        convertDataSetDataVectors(geoDataset.getSeries().iterator().next().getValues(), geoDataset, expExp);

        convertSubsetAssociations(expExp, geoDataset);
        return expExp;

    }

    /**
     * Convert the GEO data into DesignElementDataVectors associated with the ExpressionExperiment
     * 
     * @param geoDataset Source of the data
     * @param expExp ExpressionExperiment to fill in.
     */
    private void convertDataSetDataVectors(GeoValues values, GeoDataset geoDataset, ExpressionExperiment expExp) {
        List<GeoSample> datasetSamples = new ArrayList<GeoSample>(getDatasetSamples(geoDataset));
        log.info(datasetSamples.size() + " samples in " + geoDataset);
        GeoPlatform geoPlatform = geoDataset.getPlatform();

        convertVectorsForPlatform(values, expExp, datasetSamples, geoPlatform);

        values.clear(geoPlatform);
    }

    /**
     * @param geoDataset
     * @param expExp
     */
    private void convertDatasetDescriptions(GeoDataset geoDataset, ExpressionExperiment expExp) {
        if (StringUtils.isEmpty(expExp.getDescription())) {
            expExp.setDescription(geoDataset.getDescription()); // probably not empty.
        }

        expExp.setDescription(expExp.getDescription() + "\nIncludes " + geoDataset.getGeoAccession() + ".\n");
        if (StringUtils.isNotEmpty(geoDataset.getUpdateDate())) {
            expExp.setDescription(expExp.getDescription() + " Update date: " + geoDataset.getUpdateDate() + ".\n");
        }

        if (StringUtils.isEmpty(expExp.getName())) {
            expExp.setName(geoDataset.getTitle());
        } else {
            expExp.setDescription(expExp.getDescription() + " Dataset description " + geoDataset.getGeoAccession()
                    + ": " + geoDataset.getTitle() + ".\n");
        }
    }

    /**
     * @param geoDataset
     * @param expExp
     * @param bioAssayDimension
     * @param designElementName
     * @param dataVector to convert.
     * @return vector, or null if the dataVector was null or empty.
     */
    private RawExpressionDataVector convertDesignElementDataVector(GeoPlatform geoPlatform,
            ExpressionExperiment expExp, BioAssayDimension bioAssayDimension, String designElementName,
            List<Object> dataVector, QuantitationType qt) {

        if (dataVector == null || dataVector.size() == 0)
            return null;

        int numValuesExpected = bioAssayDimension.getBioAssays().size();
        if (dataVector.size() != numValuesExpected) {
            throw new IllegalArgumentException(
                    "Expected " + numValuesExpected + " in bioassaydimension, data contains " + dataVector.size());
        }
        byte[] blob = convertData(dataVector, qt);
        if (blob == null) { // all missing etc.
            if (log.isDebugEnabled())
                log.debug("All missing values for DE=" + designElementName + " QT=" + qt);
            return null;
        }
        if (log.isDebugEnabled()) {
            log.debug(blob.length + " bytes for " + dataVector.size() + " raw elements");
        }

        ArrayDesign p = convertPlatform(geoPlatform);
        assert p != null;

        Map<String, CompositeSequence> designMap = platformDesignElementMap.get(p.getShortName());
        assert designMap != null;

        /*
         * Replace name with the one we're using in the array design after conversion. This information gets filled in
         * earlier in the conversion process (see GeoService)
         */
        String mappedName = geoPlatform.getProbeNamesInGemma().get(designElementName);

        if (mappedName == null) {
            // Sigh..this is unlikely to work in general, but see bug 1709.
            mappedName = geoPlatform.getProbeNamesInGemma().get(designElementName.toUpperCase());
        }

        if (mappedName == null) {
            throw new IllegalStateException("There is  no probe matching " + designElementName);
        }

        CompositeSequence compositeSequence = designMap.get(mappedName);

        if (compositeSequence == null)
            throw new IllegalStateException("No composite sequence " + designElementName);

        if (compositeSequence.getBiologicalCharacteristic() != null
                && compositeSequence.getBiologicalCharacteristic().getSequenceDatabaseEntry() != null
                && compositeSequence.getBiologicalCharacteristic().getSequenceDatabaseEntry().getExternalDatabase()
                        .getName() == null) {
            // this is obscure.
            throw new IllegalStateException(compositeSequence + " sequence accession external database lacks name");
        }

        if (log.isDebugEnabled())
            log.debug("Associating " + compositeSequence + " with dedv");
        RawExpressionDataVector vector = RawExpressionDataVector.Factory.newInstance();
        vector.setDesignElement(compositeSequence);
        vector.setExpressionExperiment(expExp);

        vector.setBioAssayDimension(bioAssayDimension);
        vector.setQuantitationType(qt);
        vector.setData(blob);
        return vector;
    }

    /**
     * @param datasetSamples List of GeoSamples to be matched up with BioAssays.
     * @param expExp ExpresssionExperiment
     * @return BioAssayDimension representing the samples.
     */
    private BioAssayDimension convertGeoSampleList(List<GeoSample> datasetSamples, ExpressionExperiment expExp) {
        BioAssayDimension resultBioAssayDimension = BioAssayDimension.Factory.newInstance();

        StringBuilder bioAssayDimName = new StringBuilder();
        Collections.sort(datasetSamples);
        bioAssayDimName.append(expExp.getShortName() + ": ");
        for (GeoSample sample : datasetSamples) {
            boolean found = false;
            String sampleAcc = sample.getGeoAccession();
            bioAssayDimName.append(sampleAcc + ","); // FIXME this is rather silly!
            found = matchSampleToBioAssay(expExp, resultBioAssayDimension, sampleAcc);
            if (!found) {
                // this is normal because not all headings are
                // sample ids.
                log.warn("No bioassay match for " + sampleAcc);
            }
        }
        log.debug(resultBioAssayDimension.getBioAssays().size() + " Bioassays in biodimension");
        resultBioAssayDimension.setName(formatName(bioAssayDimName));
        resultBioAssayDimension.setDescription(bioAssayDimName.toString());
        return resultBioAssayDimension;
    }

    /**
     * Given an organisms name from GEO, create or find the taxon in the DB.
     * 
     * @param organisms name as provided by GEO presumed to be a scientific name
     * @return Taxon details
     */
    private Taxon convertOrganismToTaxon(String taxonScientificName) {
        assert taxonScientificName != null;

        /* if not, either create a new one and persist, or get from db and put in map. */

        if (taxonScientificName.toLowerCase().startsWith(GeoConverterImpl.RAT)) {
            taxonScientificName = GeoConverterImpl.RAT; // we don't distinguish between species.
        }

        Taxon taxon = Taxon.Factory.newInstance();
        taxon.setScientificName(taxonScientificName);
        taxon.setIsSpecies(true);
        taxon.setIsGenesUsable(false);
        if (taxonService != null) {
            Taxon t = taxonService.findOrCreate(taxon);
            if (t != null) {
                taxon = t;
            }
        }

        taxonScientificNameMap.put(taxonScientificName, taxon);
        return taxon;

    }

    /**
     * @param platform
     */
    private ArrayDesign convertPlatform(GeoPlatform platform) {

        if (seenPlatforms.containsKey(platform.getGeoAccession())) {
            return (seenPlatforms.get(platform.getGeoAccession()));
        }

        ArrayDesign arrayDesign = createMinimalArrayDesign(platform);

        log.info("Converting platform: " + platform.getGeoAccession());
        platformDesignElementMap.put(arrayDesign.getShortName(), new HashMap<String, CompositeSequence>());

        // convert the design element information.
        String identifier = platform.getIdColumnName();
        if (identifier == null && !platform.getColumnNames().isEmpty()) {
            throw new IllegalStateException("Cannot determine the platform design element id column for " + platform
                    + "; " + platform.getColumnNames().size() + " column names available.");
        }

        Collection<String> externalReferences = determinePlatformExternalReferenceIdentifier(platform);
        String descriptionColumn = determinePlatformDescriptionColumn(platform);
        String sequenceColumn = determinePlatformSequenceColumn(platform);
        ExternalDatabase externalDb = determinePlatformExternalDatabase(platform);

        List<String> descriptions = platform.getColumnData(descriptionColumn);

        List<String> sequences = null;
        if (sequenceColumn != null) {
            sequences = platform.getColumnData(sequenceColumn);
        }
        // The primary taxon for the array: this should be a taxon that is listed as the platform taxon on geo
        // submission
        String probeOrganismColumn = determinePlatformProbeOrganismColumn(platform);
        Collection<Taxon> platformTaxa = convertPlatformOrganisms(platform, probeOrganismColumn);

        // represent taxa for the probes
        List<String> probeOrganism = null;
        if (probeOrganismColumn != null) {
            log.debug("Organism details found for probes on array " + platform.getGeoAccession());
            probeOrganism = platform.getColumnData(probeOrganismColumn);
        }

        // The primary taxon for the array: either taxon listed on geo submission, or parent taxon listed on geo
        // submission or predominant probe taxon
        // calcualted using platformTaxa or probeOrganismColumn
        Taxon primaryTaxon = this.getPrimaryArrayTaxon(platformTaxa, probeOrganism);

        if (primaryTaxon == null) {
            throw new IllegalStateException("No taxon could be determined for platform: " + arrayDesign);
        }

        arrayDesign.setPrimaryTaxon(primaryTaxon);

        // We don't get reporters from GEO SOFT files.
        // arrayDesign.setReporters( new HashSet() );

        if (StringUtils.isNotBlank(platform.getManufacturer())) {
            Contact manufacturer = Contact.Factory.newInstance();
            manufacturer.setName(platform.getManufacturer());
            arrayDesign.setDesignProvider(manufacturer);
        }

        arrayDesign.getExternalReferences().add(convertDatabaseEntry(platform));

        seenPlatforms.put(platform.getGeoAccession(), arrayDesign);

        if (identifier == null) {
            // we don't get any probe information; e.g., MPSS, SAGE, Exon arrays.
            log.warn("No identifiers, so platform elements will be skipped");
            return arrayDesign;
        }

        convertPlatformElements(identifier, platform, arrayDesign, externalReferences, probeOrganismColumn,
                externalDb, descriptions, sequences, probeOrganism, primaryTaxon);

        return arrayDesign;
    }

    /**
     * Convert the elements/probes.
     * 
     * @param identifier
     * @param platform
     * @param arrayDesign
     * @param externalReferences
     * @param probeOrganismColumn
     * @param externalDb
     * @param descriptions
     * @param sequences
     * @param probeOrganism
     * @param primaryTaxon
     */
    private void convertPlatformElements(String identifier, GeoPlatform platform, ArrayDesign arrayDesign,
            Collection<String> externalReferences, String probeOrganismColumn, ExternalDatabase externalDb,
            List<String> descriptions, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon) {

        /*
         * This is a very commonly found column name in files, it seems standard in GEO. If we don't find it, it's okay.
         */
        List<String> cloneIdentifiers = platform.getColumnData("CLONE_ID");
        List<String> identifiers = platform.getColumnData(identifier);

        if (identifiers == null) {
            // we don't get any probe information; e.g., MPSS, SAGE, Exon arrays.
            log.warn("No identifiers, so platform elements will be skipped");
            return;
        }

        if (!platform.useDataFromGeo() && !forceConvertElements) {
            log.warn("Will not convert elements for this platform - set forceConvertElements to override");
            return;
        }

        assert cloneIdentifiers == null || cloneIdentifiers.size() == identifiers.size();

        List<List<String>> externalRefs = null;
        if (externalReferences != null) {
            externalRefs = platform.getColumnData(externalReferences);
        }

        if (externalRefs != null) {
            assert externalRefs.iterator().next().size() == identifiers
                    .size() : "Unequal numbers of identifiers and external references! "
                            + externalRefs.iterator().next().size() + " != " + identifiers.size();
        }

        if (log.isDebugEnabled()) {
            log.debug("Converting " + identifiers.size() + " probe identifiers on GEO platform "
                    + platform.getGeoAccession());
        }

        Iterator<String> descIter = null;

        if (descriptions != null) {
            descIter = descriptions.iterator();
        }

        Pattern refSeqAccessionPattern = Pattern.compile("^[A-Z]{2}_");

        boolean strictSelection = false;

        if (identifiers.size() > tooManyElements) {
            // something odd like an exon array.
            log.warn("Platform has more elements than expected, turning on strict selection method");
            strictSelection = true;
        }

        List<String> skipped = new ArrayList<String>();
        Collection<CompositeSequence> compositeSequences = new ArrayList<CompositeSequence>(5000);
        int i = 0; // to get sequences, if we have them, and clone identifiers.
        for (String id : identifiers) {
            String externalAccession = null;
            if (externalRefs != null) {
                externalAccession = getExternalAccession(externalRefs, i);
            }

            if (strictSelection && StringUtils.isBlank(externalAccession)) {

                // currently this is crafted to deal with affymetrix exon arrays, but could be expanded.

                // mrna_assignment is less strict than gene_assignement

                // salvage it if it has a gene assignment.
                // String filteringColumn = "gene_assignment";
                String filteringColumn = "gene_assignment";
                if (platform.getColumnNames().contains(filteringColumn)) {
                    String cd = platform.getColumnData(filteringColumn).get(i);
                    if (StringUtils.isBlank(cd) || cd.equals("---")) {

                        skipped.add(id);
                        if (skipped.size() % 10000 == 0) {
                            log.info("Skipped " + skipped.size() + " elements due to strict selection; last was "
                                    + id);
                        }
                        i++;
                        continue;
                    }
                    // keep it.
                } else {
                    // we just skip ones that don't have an external accession.
                    continue;
                }

                // remaining case here: externalAccession is blank, but there is another column that we think saves it.
            }

            String cloneIdentifier = cloneIdentifiers == null ? null : cloneIdentifiers.get(i);

            String description = "";
            if (externalAccession != null) {
                String[] refs = externalAccession.split(",");
                if (refs.length > 1) {
                    description = "Multiple external sequence references: " + externalAccession + "; ";
                    externalAccession = refs[0];
                }
            }

            if (descIter != null)
                description = description + " " + descIter.next();

            CompositeSequence cs = CompositeSequence.Factory.newInstance();
            String probeName = platform.getProbeNamesInGemma().get(id);
            if (probeName == null) {
                probeName = id;
                if (log.isDebugEnabled())
                    log.debug("Probe retaining original name: " + probeName);
                platform.getProbeNamesInGemma().put(id, id); // must make sure this is populated.
            } else {
                if (log.isDebugEnabled())
                    log.debug("Found probe: " + probeName);
            }

            cs.setName(probeName);
            cs.setDescription(description);
            cs.setArrayDesign(arrayDesign);

            // LMD:1647- If There is a Organism Column given for the probe then set taxon from that overwriting platform
            // if probeOrganismColumn is set but for this probe no taxon do not set probeTaxon and thus create no
            // biosequence
            Taxon probeTaxon = Taxon.Factory.newInstance();
            if (probeOrganism != null && StringUtils.isNotBlank(probeOrganism.get(i))) {
                probeTaxon = convertProbeOrganism(probeOrganism.get(i));
            }
            // if there are no probe taxons then all the probes should take the taxon from the primary taxon
            if (probeOrganismColumn == null) {
                probeTaxon = primaryTaxon;
            }

            BioSequence bs = createMinimalBioSequence(probeTaxon);

            boolean isRefseq = false;
            // ExternalDB will be null if it's IMAGE (this is really pretty messy, sorry)
            if (externalAccession != null && externalDb != null && externalDb.getName().equals("Genbank")
                    && StringUtils.isNotBlank(externalAccession)) {
                // http://www.ncbi.nlm.nih.gov/RefSeq/key.html#accessions : "RefSeq accession numbers can be
                // distinguished from GenBank accessions by their prefix distinct format of [2 characters|underbar]"
                Matcher refSeqAccessionMatcher = refSeqAccessionPattern.matcher(externalAccession);
                isRefseq = refSeqAccessionMatcher.matches();
            }

            boolean isImage = false;
            if (cloneIdentifier != null) {
                bs.setName(cloneIdentifier);
                isImage = cloneIdentifier.startsWith("IMAGE");
            }

            /*
             * If we are given a sequence, we don't need the genbank identifier, which is probably not correct anyway.
             */
            if (sequences != null && StringUtils.isNotBlank(sequences.get(i))) {
                bs.setSequence(sequences.get(i));
                bs.setIsApproximateLength(false);
                bs.setLength(new Long(bs.getSequence().length()));
                bs.setType(SequenceType.DNA);
                // bs.setName( platform.getGeoAccession() + "_" + id );
                bs.setName(id);
                bs.setDescription(
                        "Sequence from platform " + platform.getGeoAccession() + " provided by manufacturer. "
                                + (externalAccession != null ? "Used in leiu of " + externalAccession
                                        : "No external accession provided"));
            } else if (externalAccession != null && !isRefseq && !isImage && externalDb != null) {
                /*
                 * We don't use this if we have an IMAGE clone because the accession might be wrong (e.g., for a
                 * Refseq). During persisting the IMAGE clone will be replaced with the 'real' thing.
                 */

                /*
                 * We also don't store them if they are refseq ids, because refseq ids are generally not the actual
                 * sequences put on arrays.
                 */

                DatabaseEntry dbe = createDatabaseEntry(externalDb, externalAccession, bs);
                bs.setSequenceDatabaseEntry(dbe);
            }

            /*
             * If we have no basis for describing the sequence, we have to skip it.
             */
            if (StringUtils.isBlank(externalAccession) && StringUtils.isBlank(cloneIdentifier)) {
                if (log.isDebugEnabled()) {
                    log.debug("Blank external reference and clone id for " + cs + " on " + arrayDesign
                            + ", no biological characteristic can be added.");
                }
            } else if (probeTaxon == null) {
                /*
                 * FIXME we might want to just skip the probe entirely.
                 */
                if (log.isDebugEnabled()) {
                    log.debug("No valid taxon identified for " + cs + " on " + arrayDesign
                            + ", no biological characteristic can be added.");
                }
            } else if (probeTaxon.getId() != null) {
                // IF there is no taxon given for probe do not create a biosequence otherwise bombs as there is no taxon
                // to persist
                cs.setBiologicalCharacteristic(bs);
            }

            compositeSequences.add(cs);
            platformDesignElementMap.get(arrayDesign.getShortName()).put(probeName, cs);

            i++;
        }
        arrayDesign.setCompositeSequences(new HashSet<CompositeSequence>(compositeSequences));
        arrayDesign.setAdvertisedNumberOfDesignElements(compositeSequences.size());

        if (!skipped.isEmpty()) {
            log.info("Skipped " + skipped.size() + " elements due to strict selection; last was "
                    + skipped.get(skipped.size() - 1));
        }

        if (arrayDesign.getCompositeSequences().size() > tooManyElements) {
            // this is just a safeguard; perhaps temporary.
            throw new IllegalStateException(
                    "Platform has too many elements to be loaded. " + arrayDesign.getCompositeSequences().size());
        }

        log.info(arrayDesign.getCompositeSequences().size() + " elements on the platform");
    }

    /**
     * Retrieve full taxon details for a platform given the organism's scientific name in GEO. If multiple organisms are
     * recorded against an array only first taxon details are returned. Warning is given when no column is found to give
     * the taxa for the probes
     * 
     * @param platform GEO platform details
     * @param probeTaxonColumnName Column name of probe taxa
     * @return List of taxa on platform
     */
    private Collection<Taxon> convertPlatformOrganisms(GeoPlatform platform, String probeTaxonColumnName) {
        Collection<String> organisms = platform.getOrganisms();
        Collection<Taxon> platformTaxa = new HashSet<Taxon>();
        StringBuffer taxaOnPlatform = new StringBuffer();

        if (organisms.isEmpty()) {
            return platformTaxa;
        }

        for (String taxonScientificName : organisms) {
            if (taxonScientificName == null)
                continue;
            taxaOnPlatform.append(": " + taxonScientificName);
            // make sure add scientific name to map for platform
            if (taxonScientificNameMap.containsKey(taxonScientificName)) {
                platformTaxa.add(taxonScientificNameMap.get(taxonScientificName));
            } else {
                platformTaxa.add(convertOrganismToTaxon(taxonScientificName));
            }
        }

        // multiple organisms are found on the platform yet there is no column defined to represent taxon for the
        // probes.
        if (platformTaxa.size() > 1 && probeTaxonColumnName == null) {
            /*
             * This is okay if all the platformTaxa have the same parent. Here we're just doing a check.
             */
            Taxon parentTaxon = null;
            for (Taxon taxon : platformTaxa) {
                this.taxonService.thaw(taxon);
                if (taxon.getParentTaxon() != null) {
                    if (parentTaxon != null && !parentTaxon.equals(taxon.getParentTaxon())) {
                        throw new IllegalArgumentException(
                                platformTaxa.size() + " taxon found on platform" + taxaOnPlatform
                                        + " but there is no probe specific taxon Column found for platform "
                                        + platform + " and the parentTaxon is not the same for the taxa.");
                    }
                    parentTaxon = taxon.getParentTaxon();

                }
            }

        }
        // no platform organism given
        if (platformTaxa.size() == 0) {
            throw new IllegalArgumentException("No organisms found on platform  " + platform);
        }
        return platformTaxa;

    }

    /**
     * Retrieve taxon details for a probe given an abbreviation or scientific name. All scientific names should be in
     * the map as they were set there by the convertPlatform method. If the abbreviation is not found in the database
     * then stop processing as the organism name is likely to be an unknown abbreviation.
     * 
     * @param probeOrganism scientific name, common name or abbreviation of organism associated to a biosequence.
     * @return Taxon of biosequence.
     * @throws IllegalArgumentException taxon supplied has not been processed before, it does not match the scientific
     *         names used in platform definition and does not match a known abbreviation in the database.
     */
    private Taxon convertProbeOrganism(String probeOrganism) {
        Taxon taxon = Taxon.Factory.newInstance();
        // Check if we have processed this organism before as defined by scientific or abbreviation definition.
        assert probeOrganism != null;

        /*
         * Detect blank taxon. We support 'n/a' here .... a little kludgy but shows up in some files.
         */
        if (StringUtils.isBlank(probeOrganism) || probeOrganism.equals("n/a")) {
            return null;
        }
        if (taxonScientificNameMap.containsKey(probeOrganism)) {
            return taxonScientificNameMap.get(probeOrganism);
        }
        if (taxonAbbreviationMap.containsKey(probeOrganism)) {
            return taxonAbbreviationMap.get(probeOrganism);
        }

        taxon.setAbbreviation(probeOrganism);
        // taxon not processed before check database.
        if (taxonService != null) {
            Taxon t = taxonService.findByAbbreviation(probeOrganism.toLowerCase());

            if (t != null) {
                taxon = t;
                taxonAbbreviationMap.put(taxon.getAbbreviation(), t);
            } else {

                t = taxonService.findByCommonName(probeOrganism.toLowerCase());

                if (t != null) {
                    taxon = t;
                    taxonAbbreviationMap.put(taxon.getAbbreviation(), t);
                } else {

                    // if probe organism can not be found i.e it is not a known abbreviation or scientific name
                    // and it was not already created during platform organism processing then warn user. Examples would
                    // be "taxa" like "ILMN Controls". See bug 3207 (we used to throw an exception)
                    log.warn("'" + probeOrganism + "' is not recognized as a taxon in Gemma");
                    return null;
                }
            }
        }
        return taxon;

    }

    /**
     * @param series
     * @param expExp
     */
    private void convertPubMedIds(GeoSeries series, ExpressionExperiment expExp) {
        Collection<String> ids = series.getPubmedIds();
        if (ids == null || ids.size() == 0)
            return;
        for (String string : ids) {
            BibliographicReference bibRef = BibliographicReference.Factory.newInstance();
            DatabaseEntry pubAccession = DatabaseEntry.Factory.newInstance();
            pubAccession.setAccession(string);
            ExternalDatabase ed = ExternalDatabase.Factory.newInstance();
            ed.setName("PubMed");
            pubAccession.setExternalDatabase(ed);
            bibRef.setPubAccession(pubAccession);
            expExp.setPrimaryPublication(bibRef);
            break; // usually just one...
        }
    }

    /**
     * @param repType
     * @return
     */
    private VocabCharacteristic convertReplicatationType(ReplicationType repType) {
        VocabCharacteristic result = VocabCharacteristic.Factory.newInstance();
        result.setCategory("ReplicateDescriptionType");
        result.setCategoryUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#ReplicateDescriptionType");
        result.setEvidenceCode(GOEvidenceCode.IIA);
        ExternalDatabase mged = ExternalDatabase.Factory.newInstance();
        mged.setName("MGED Ontology");
        mged.setType(DatabaseType.ONTOLOGY);

        if (repType.equals(ReplicationType.biologicalReplicate)) {
            result.setValue("biological_replicate");
            result.setValueUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#biological_replicate");
        } else if (repType.equals(ReplicationType.technicalReplicateExtract)) {
            result.setValue("technical_replicate");
            result.setValueUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#technical_replicate");
        } else if (repType.equals(ReplicationType.technicalReplicateLabeledExtract)) {
            result.setValue("technical_replicate");
            result.setValueUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#technical_replicate"); // MGED doesn't have
            // a
            // term to distinguish
            // these.
        } else {
            throw new IllegalStateException("Unhandled replication type: " + repType);
        }

        return result;

    }

    /**
     * Convert a variable into a ExperimentalFactor
     * 
     * @param variable
     * @return
     */
    private ExperimentalFactor convertReplicationToFactor(GeoReplication replication) {
        log.debug("Converting replication " + replication.getType());
        ExperimentalFactor result = ExperimentalFactor.Factory.newInstance();
        result.setName(replication.getType().toString());
        result.setDescription(replication.getDescription());
        result.setType(FactorType.CATEGORICAL);
        VocabCharacteristic term = convertReplicatationType(replication.getType());

        result.setCategory(term);
        return result;

    }

    /**
     * @param replication
     * @return
     */
    private FactorValue convertReplicationToFactorValue(GeoReplication replication) {
        FactorValue factorValue = FactorValue.Factory.newInstance();
        VocabCharacteristic term = convertReplicatationType(replication.getType());
        factorValue.setValue(term.getValue());
        factorValue.getCharacteristics().add(term);
        return factorValue;
    }

    /**
     * @param variable
     * @param factor
     */
    private void convertReplicationToFactorValue(GeoReplication replication, ExperimentalFactor factor) {
        FactorValue factorValue = convertReplicationToFactorValue(replication);
        factor.getFactorValues().add(factorValue);
    }

    /**
     * A Sample corresponds to a BioAssay; the channels correspond to BioMaterials.
     * 
     * @param sample
     */
    private BioAssay convertSample(GeoSample sample, BioMaterial bioMaterial,
            ExperimentalDesign experimentalDesign) {
        if (sample == null) {
            log.warn("Null sample");
            return null;
        }

        if (sample.getGeoAccession() == null || sample.getGeoAccession().length() == 0) {
            log.error("No GEO accession for sample");
            return null;
        }

        log.debug("Converting sample: " + sample.getGeoAccession());

        BioAssay bioAssay = BioAssay.Factory.newInstance();
        String title = sample.getTitle();
        if (StringUtils.isBlank(title)) {
            // throw new IllegalArgumentException( "Title cannot be blank for sample " + sample );
            log.warn("Blank title for sample " + sample + ", using accession number instead.");
            sample.setTitle(sample.getGeoAccession());
        }
        bioAssay.setName(sample.getTitle());
        bioAssay.setDescription(sample.getDescription());
        bioAssay.setAccession(convertDatabaseEntry(sample));
        bioAssay.setIsOutlier(false);
        bioAssay.setSequencePairedReads(false);

        /*
         * NOTE - according to GEO (http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html) "variable information is
         * optional and does not appear in Series records or downloads, but will be used to assemble corresponding GEO
         * DataSet records" If we would get that information we would pass it into this method as
         * expExp.getExperimentalDesign().getExperimentalFactors().
         */

        // : use the ones from the ExperimentalFactor. In other words, these factor values should correspond to
        // experimentalfactors
        Collection<ExperimentalFactor> experimentalFactors = experimentalDesign.getExperimentalFactors();
        for (GeoReplication replication : sample.getReplicates()) {
            matchSampleReplicationToExperimentalFactorValue(bioMaterial, experimentalFactors, replication);
        }

        // : use the ones from the ExperimentalFactor.
        for (GeoVariable variable : sample.getVariables()) {
            matchSampleVariableToExperimentalFactorValue(bioMaterial, experimentalFactors, variable);
        }

        for (GeoChannel channel : sample.getChannels()) {
            /*
             * In reality GEO does not have information about the samples run on each channel. We're just making it up.
             * So we need to just add the channel information to the biomaterials we have already. Note taxon is now
             * taken from sample FIXME this is no longer accurate; GEO has species information for each channel.
             * 
             * Actually this has changed. GEO does store channel information. However, we don't use it (see bug 2902).
             */
            if (bioAssay.getSampleUsed() != null) {
                bioMaterial = bioAssay.getSampleUsed();
                log.info("Multi-sample information stored in biomaterial " + bioMaterial);
            }
            convertChannel(sample, channel, bioMaterial);
            bioAssay.setSampleUsed(bioMaterial);
        }

        // Taxon lastTaxon = null;

        for (GeoPlatform platform : sample.getPlatforms()) {
            ArrayDesign arrayDesign;
            if (seenPlatforms.containsKey(platform.getGeoAccession())) {
                arrayDesign = seenPlatforms.get(platform.getGeoAccession());
            } else {
                // platform not exist yet
                arrayDesign = convertPlatform(platform);
            }

            bioAssay.setArrayDesignUsed(arrayDesign);

        }

        return bioAssay;
    }

    /**
     * Convert a GEO series into one or more ExpressionExperiments. The "more than one" case comes up if the are
     * platforms from more than one organism represented in the series, or if 'split by platform' is set, or if multiple
     * species were run on a single platform. If the series is split into two or more ExpressionExperiments, each refers
     * to a modified GEO accession such as GSE2393.1, GSE2393.2 etc for each organism/platform
     * <p>
     * Similarly, because there is no concept of "biomaterial" in GEO, samples that are inferred to have been run using
     * the same biomaterial. The biomaterials are given names after the GSE and the bioAssays (GSMs) such as
     * GSE2939_biomaterial_1|GSM12393|GSN12394.
     * 
     * @param series
     * @return
     */
    private Collection<ExpressionExperiment> convertSeries(GeoSeries series) {

        Collection<ExpressionExperiment> converted = new HashSet<ExpressionExperiment>();

        // figure out if there are multiple species involved here.

        Map<String, Collection<GeoData>> organismDatasetMap = getOrganismDatasetMap(series);
        Map<GeoPlatform, Collection<GeoData>> platformDatasetMap = getPlatformDatasetMap(series);
        Map<String, Collection<GeoSample>> organismSampleMap = getOrganismSampleMap(series);
        // get map of platform to dataset.

        if (organismDatasetMap.size() > 1) {
            log.warn("**** Multiple-species series, with multiple datasets. This series will be split into "
                    + organismDatasetMap.size() + " experiments. ****");
            int i = 1;
            for (String organism : organismDatasetMap.keySet()) {
                convertSpeciesSpecific(series, converted, organismDatasetMap, i, organism);
                i++;
            }
        } else if (organismSampleMap.size() > 1) {
            log.warn("**** Multiple-species series. This series will be split into " + organismSampleMap.size()
                    + " experiments. ****");
            int i = 1;
            for (String organism : organismSampleMap.keySet()) {
                convertSpeciesSpecificSamples(series, converted, organismSampleMap, i, organism);
                i++;
            }
        } else if (platformDatasetMap.size() > 1 && this.splitByPlatform) {
            int i = 1;
            for (GeoPlatform platform : platformDatasetMap.keySet()) {
                convertByPlatform(series, converted, platformDatasetMap, i, platform);
                i++;
            }
        } else {
            converted.add(this.convertSeries(series, null));
        }

        return converted;
    }

    /**
     * @param series
     * @param resultToAddTo
     * @return
     * @see convertSeries
     */
    private ExpressionExperiment convertSeries(GeoSeries series, ExpressionExperiment resultToAddTo) {
        if (series == null)
            return null;
        log.info("Converting series: " + series.getGeoAccession());

        Collection<GeoDataset> dataSets = series.getDatasets();
        Collection<String> dataSetsToSkip = new HashSet<String>();
        Collection<GeoSample> samplesToSkip = new HashSet<GeoSample>();
        checkForDataToSkip(series, dataSetsToSkip, samplesToSkip);
        if (dataSets.size() > 0 && dataSetsToSkip.size() == dataSets.size()) {
            return null;
        }

        ExpressionExperiment expExp;

        if (resultToAddTo == null) {
            expExp = ExpressionExperiment.Factory.newInstance();
            expExp.setDescription("");
        } else {
            expExp = resultToAddTo;
        }

        expExp.setDescription(series.getSummaries() + (series.getSummaries().endsWith("\n") ? "" : "\n"));
        if (series.getLastUpdateDate() != null) {
            expExp.setDescription(
                    expExp.getDescription() + "Last Updated (by provider): " + series.getLastUpdateDate() + "\n");
        }

        expExp.setName(series.getTitle());
        expExp.setShortName(series.getGeoAccession());

        convertContacts(series, expExp);

        convertPubMedIds(series, expExp);

        expExp.setAccession(convertDatabaseEntry(series));

        LocalFile expExpRawDataFile = convertSupplementaryFileToLocalFile(series);
        expExp.setRawDataFile(expExpRawDataFile);

        ExperimentalDesign design = ExperimentalDesign.Factory.newInstance();
        design.setDescription("");
        design.setName("");
        Collection<GeoVariable> variables = series.getVariables().values();
        for (GeoVariable variable : variables) {
            log.debug("Adding variable " + variable);
            ExperimentalFactor ef = convertVariableToFactor(variable);
            convertVariableToFactorValue(variable, ef);
            design.getExperimentalFactors().add(ef);
            design.setName(variable.getDescription() + " " + design.getName());
        }

        if (series.getKeyWords().size() > 0) {
            for (String keyWord : series.getKeyWords()) {
                // design.setDescription( design.getDescription() + " Keyword: " + keyWord );
                Characteristic o = Characteristic.Factory.newInstance();
                o.setDescription("GEO Keyword");
                o.setValue(keyWord);
                o.setEvidenceCode(GOEvidenceCode.IIA);
                o.setDescription("Keyword from GEO series definition file.");
            }
        }

        if (series.getOverallDesign() != null) {
            design.setDescription(design.getDescription() + " Overall design: " + series.getOverallDesign());
        }

        Collection<GeoReplication> replication = series.getReplicates().values();
        for (GeoReplication replicate : replication) {
            log.debug("Adding replication " + replicate);
            ExperimentalFactor ef = convertReplicationToFactor(replicate);
            convertReplicationToFactorValue(replicate, ef);
            design.getExperimentalFactors().add(ef);
        }

        expExp.setExperimentalDesign(design);

        // GEO does not have the concept of a biomaterial.
        Collection<GeoSample> allSeriesSamples = series.getSamples();
        log.info("Series has " + series.getSamples().size() + " samples");
        if (samplesToSkip.size() > 0) {
            log.info(samplesToSkip.size() + " samples will be skipped");
        }
        expExp.setBioAssays(new HashSet<BioAssay>());

        if (series.getSampleCorrespondence().size() == 0) {
            throw new IllegalArgumentException("No sample correspondence!");
        }

        // spits out a big summary of the correspondence.
        if (log.isDebugEnabled())
            log.debug(series.getSampleCorrespondence());
        int numBioMaterials = 0;

        /*
         * For each _set_ of "corresponding" samples (from the same RNA, or so we think) we make up a new BioMaterial.
         */

        Collection<String> seen = new HashSet<String>();
        for (Iterator<Set<String>> iter = series.getSampleCorrespondence().iterator(); iter.hasNext();) {

            Set<String> correspondingSamples = iter.next();
            if (correspondingSamples.isEmpty())
                continue; // can happen after removing samples (multitaxon)

            BioMaterial bioMaterial = BioMaterial.Factory.newInstance();
            String bioMaterialName = getBiomaterialPrefix(series, ++numBioMaterials);
            String bioMaterialDescription = BIOMATERIAL_DESCRIPTION_PREFIX;

            // From the series samples, find the sample that corresponds and convert it.
            for (String cSample : correspondingSamples) {
                boolean found = false;
                for (GeoSample sample : allSeriesSamples) {
                    if (sample == null || sample.getGeoAccession() == null) {
                        log.warn("Null sample or no accession for " + sample);
                        continue;
                    }

                    if (samplesToSkip.contains(sample)) {
                        continue;
                    }

                    String accession = sample.getGeoAccession();

                    if (accession.equals(cSample)) {

                        if (seen.contains(accession)) {
                            log.error("Got " + accession + " twice, this time in set " + correspondingSamples);
                        }
                        seen.add(accession);

                        BioAssay ba = convertSample(sample, bioMaterial, expExp.getExperimentalDesign());

                        LocalFile rawDataFile = convertSupplementaryFileToLocalFile(sample);
                        ba.setRawDataFile(rawDataFile);// deal with null at UI

                        // TODO these custom string prefixes should be made into constants, need to make public for use
                        // by ExpressionExperimentAnnotator

                        ba.setDescription(ba.getDescription() + "\nSource GEO sample is " + sample.getGeoAccession()
                                + "\nLast updated (according to GEO): " + sample.getLastUpdateDate());

                        assert ba.getSampleUsed() != null;
                        bioMaterial.getBioAssaysUsedIn().add(ba);
                        bioMaterialDescription = bioMaterialDescription + "," + sample;
                        expExp.getBioAssays().add(ba);
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    if (log.isDebugEnabled())
                        log.debug("No sample found in " + series + " to match " + cSample
                                + "; this can happen if some samples were not run on all platforms.");

                }
            }
            bioMaterial.setName(bioMaterialName);
            bioMaterial.setDescription(bioMaterialDescription);
        }

        log.info("Expression Experiment from " + series + " has " + expExp.getBioAssays().size() + " bioassays and "
                + numBioMaterials + " biomaterials.");

        int expectedNumSamples = series.getSamples().size() - samplesToSkip.size();
        int actualNumSamples = expExp.getBioAssays().size();
        if (expectedNumSamples > actualNumSamples) {
            log.warn((expectedNumSamples - actualNumSamples) + " samples were not in the 'sample correspondence'"
                    + " and have been omitted. Possibly they were in the Series (GSE) but not in the corresponding Dataset (GDS)?");
        }

        // Dataset has additional information about the samples.

        if (dataSets.size() == 0) {
            // we miss extra description and the subset information.
            convertSeriesDataVectors(series, expExp);
        } else {
            for (GeoDataset dataset : dataSets) {
                if (dataSetsToSkip.contains(dataset.getGeoAccession()))
                    continue;
                convertDataset(dataset, expExp);
            }
        }

        return expExp;
    }

    /**
     * Use this when we don't have a GDS for a GSE.
     * 
     * @param geoSeries
     * @param expExp
     */
    private void convertSeriesDataVectors(GeoSeries geoSeries, ExpressionExperiment expExp) {
        /*
         * Tricky thing is that series contains data from multiple platforms.
         */
        Map<GeoPlatform, List<GeoSample>> platformSamples = DatasetCombiner.getPlatformSampleMap(geoSeries);

        for (GeoPlatform platform : platformSamples.keySet()) {
            List<GeoSample> samples = platformSamples.get(platform);
            log.debug(samples.size() + " samples on " + platform);
            convertVectorsForPlatform(geoSeries.getValues(), expExp, samples, platform);
            geoSeries.getValues().clear(platform);
        }

    }

    /**
     * @param series
     * @param converted
     * @param organismDatasetMap
     * @param i
     * @param organism
     */
    private void convertSpeciesSpecific(GeoSeries series, Collection<ExpressionExperiment> converted,
            Map<String, Collection<GeoData>> organismDatasetMap, int i, String organism) {
        GeoSeries speciesSpecific = new GeoSeries();

        Collection<GeoData> datasets = organismDatasetMap.get(organism);
        assert datasets.size() > 0;

        for (GeoSample sample : series.getSamples()) {
            // ugly, we have to assume there is only one platform and one organism...
            if (sample.getPlatforms().iterator().next().getOrganisms().iterator().next().equals(organism)) {
                speciesSpecific.addSample(sample);
            }
        }

        // strip out samples that aren't from this organism.

        for (GeoData dataset : datasets) {
            if (dataset instanceof GeoDataset) {
                ((GeoDataset) dataset).dissociateFromSeries(series);
                speciesSpecific.addDataSet((GeoDataset) dataset);
            }
        }

        /*
         * Basically copy over most of the information
         */
        speciesSpecific.setContact(series.getContact());
        speciesSpecific.setContributers(series.getContributers());
        speciesSpecific.setGeoAccession(series.getGeoAccession() + "." + i);
        speciesSpecific.setKeyWords(series.getKeyWords());
        speciesSpecific.setOverallDesign(series.getOverallDesign());
        speciesSpecific.setPubmedIds(series.getPubmedIds());
        speciesSpecific.setReplicates(series.getReplicates());
        speciesSpecific.setSampleCorrespondence(series.getSampleCorrespondence());
        speciesSpecific.setSummaries(series.getSummaries());
        speciesSpecific.setTitle(series.getTitle() + " - " + organism);
        speciesSpecific.setWebLinks(series.getWebLinks());
        speciesSpecific.setValues(series.getValues());

        converted.add(convertSeries(speciesSpecific, null));
    }

    /**
     * Handle the case where a single series has samples from more than one species.
     * 
     * @param series
     * @param organismSampleMap the samples divvied up by organism
     * @param converted
     */
    private void convertSpeciesSpecificSamples(GeoSeries series, Collection<ExpressionExperiment> converted,
            Map<String, Collection<GeoSample>> organismSampleMap, int i, String organism) {

        GeoSeries speciesSpecific = new GeoSeries();

        Collection<GeoSample> samples = organismSampleMap.get(organism);

        for (GeoSample s : samples) {
            speciesSpecific.addSample(s);
        }

        /*
         * Strip out sample correspondence for samples not for this organism.
         */
        GeoSampleCorrespondence sampleCorrespondence = series.getSampleCorrespondence().copy();

        for (String o : organismSampleMap.keySet()) {
            if (o.equals(organism)) {
                continue;
            }
            for (GeoSample s : organismSampleMap.get(o)) {
                sampleCorrespondence.removeSample(s.getGeoAccession());
            }
        }

        /*
         * Basically copy over most of the information
         */
        speciesSpecific.setContact(series.getContact());
        speciesSpecific.setContributers(series.getContributers());
        speciesSpecific.setGeoAccession(series.getGeoAccession() + "." + i);
        speciesSpecific.setKeyWords(series.getKeyWords());
        speciesSpecific.setOverallDesign(series.getOverallDesign());
        speciesSpecific.setPubmedIds(series.getPubmedIds());
        speciesSpecific.setReplicates(series.getReplicates());
        speciesSpecific.setSampleCorrespondence(sampleCorrespondence);
        speciesSpecific.setSummaries(series.getSummaries());
        speciesSpecific.setTitle(series.getTitle() + " - " + organism);
        speciesSpecific.setWebLinks(series.getWebLinks());
        speciesSpecific.setValues(series.getValues(speciesSpecific.getSamples()));

        converted.add(convertSeries(speciesSpecific, null));

    }

    /**
     * @param result
     * @param geoDataset
     */
    private void convertSubsetAssociations(ExpressionExperiment result, GeoDataset geoDataset) {
        for (GeoSubset subset : geoDataset.getSubsets()) {
            if (log.isDebugEnabled())
                log.debug("Converting subset to experimentalFactor" + subset.getType());
            convertSubsetToExperimentalFactor(result, subset);
        }
    }

    /**
     * Creates a new factorValue, or identifies an existing one, matching the subset. If it is a new one it adds it to
     * the given experimentalFactor.
     * 
     * @param geoSubSet
     * @param experimentalFactor
     * @return
     */
    private FactorValue convertSubsetDescriptionToFactorValue(GeoSubset geoSubSet,
            ExperimentalFactor experimentalFactor) {
        // By definition each subset defines a new factor value.
        FactorValue factorValue = FactorValue.Factory.newInstance();
        Characteristic term = convertVariableType(geoSubSet.getType());
        term.setValue(geoSubSet.getDescription());
        term.setDescription("Converted from GEO subset " + geoSubSet.getGeoAccession());
        factorValue.getCharacteristics().add(term);
        factorValue.setExperimentalFactor(experimentalFactor);
        factorValue.setValue(term.getValue());

        /* Check that there isn't already a factor value for this in the factor */

        for (FactorValue fv : experimentalFactor.getFactorValues()) {
            if (fv.equals(factorValue)) {
                log.debug(factorValue + " is matched by existing factorValue for " + experimentalFactor);
                return fv;
            }
        }
        experimentalFactor.getFactorValues().add(factorValue);
        return factorValue;
    }

    /**
     * @param variable
     * @param type
     * @return
     */
    private FactorValue convertTypeToFactorValue(VariableType type, String value) {
        FactorValue factorValue = FactorValue.Factory.newInstance();
        Characteristic term = convertVariableType(type);
        term.setValue(value); // TODO map onto an ontology.
        factorValue.setValue(term.getValue());
        factorValue.getCharacteristics().add(term);
        return factorValue;
    }

    /**
     * Convert a variable into a ExperimentalFactor
     * 
     * @param variable
     * @return
     */
    private ExperimentalFactor convertVariableToFactor(GeoVariable variable) {
        log.debug("Converting variable " + variable.getType());
        ExperimentalFactor result = ExperimentalFactor.Factory.newInstance();
        result.setName(variable.getType().toString());
        result.setType(FactorType.CATEGORICAL);
        result.setDescription(variable.getDescription());
        Characteristic term = convertVariableType(variable.getType());
        result.setCategory(term);
        return result;
    }

    /**
     * @param variable
     * @return Category will be filled in with a URI but value will just be plain text.
     */
    private FactorValue convertVariableToFactorValue(GeoVariable variable) {
        log.info("Converting variable " + variable);
        VariableType type = variable.getType();
        FactorValue factorValue = convertTypeToFactorValue(type, variable.getDescription());
        return factorValue;
    }

    /**
     * @param variable
     * @param factor
     */
    private void convertVariableToFactorValue(GeoVariable variable, ExperimentalFactor factor) {
        FactorValue factorValue = convertVariableToFactorValue(variable);
        factor.getFactorValues().add(factorValue);
    }

    /**
     * Convert a variable
     * 
     * @param variable
     * @return a VocabCharacteristic with the category URI and category filled in.
     */
    private VocabCharacteristic convertVariableType(VariableType varType) {

        String mgedTerm = null;
        if (varType.equals(VariableType.age)) {
            mgedTerm = "Age";
        } else if (varType.equals(VariableType.agent)) {
            mgedTerm = "Compound"; // THERE IS no such term as 'Agent' in MGED.
        } else if (varType.equals(VariableType.cellLine)) {
            mgedTerm = "CellLine";
        } else if (varType.equals(VariableType.cellType)) {
            mgedTerm = "CellType";
        } else if (varType.equals(VariableType.developmentStage)) {
            mgedTerm = "DevelopmentalStage";
        } else if (varType.equals(VariableType.diseaseState)) {
            mgedTerm = "DiseaseState";
        } else if (varType.equals(VariableType.dose)) {
            mgedTerm = "Dose";
        } else if (varType.equals(VariableType.gender)) {
            mgedTerm = "Sex";
        } else if (varType.equals(VariableType.genotypeOrVariation)) {
            mgedTerm = "IndividualGeneticCharacteristics";
        } else if (varType.equals(VariableType.growthProtocol)) {
            mgedTerm = "GrowthCondition";
        } else if (varType.equals(VariableType.individual)) {
            mgedTerm = "Individual";
        } else if (varType.equals(VariableType.infection)) {
            mgedTerm = "Phenotype";
        } else if (varType.equals(VariableType.isolate)) {
            mgedTerm = "Age";
        } else if (varType.equals(VariableType.metabolism)) {
            mgedTerm = "Metabolism";
        } else if (varType.equals(VariableType.other)) {
            mgedTerm = "Other";
        } else if (varType.equals(VariableType.protocol)) {
            mgedTerm = "Protocol";
        } else if (varType.equals(VariableType.shock)) {
            mgedTerm = "EnvironmentalStress";
        } else if (varType.equals(VariableType.species)) {
            mgedTerm = "Organism";
        } else if (varType.equals(VariableType.specimen)) {
            mgedTerm = "BioSample";
        } else if (varType.equals(VariableType.strain)) {
            mgedTerm = "StrainOrLine";
        } else if (varType.equals(VariableType.stress)) {
            mgedTerm = "EnvironmentalStress";
        } else if (varType.equals(VariableType.temperature)) {
            mgedTerm = "Temperature";
        } else if (varType.equals(VariableType.time)) {
            mgedTerm = "Time";
        } else if (varType.equals(VariableType.tissue)) {
            mgedTerm = "OrganismPart";
        } else {
            throw new IllegalStateException();
        }

        log.debug("Category term: " + mgedTerm + " ");
        return setCategory(mgedTerm);

    }

    /**
     * For data coming from a single platform, create vectors.
     * 
     * @param values A GeoValues object holding the parsed results.
     * @param expExp
     * @param datasetSamples
     * @param geoPlatform
     */
    private void convertVectorsForPlatform(GeoValues values, ExpressionExperiment expExp,
            List<GeoSample> datasetSamples, GeoPlatform geoPlatform) {
        assert datasetSamples.size() > 0 : "No samples in dataset";

        log.info("Converting vectors for " + geoPlatform.getGeoAccession() + ", " + datasetSamples.size()
                + " samples.");

        BioAssayDimension bioAssayDimension = convertGeoSampleList(datasetSamples, expExp);

        if (bioAssayDimension.getBioAssays().size() == 0)
            throw new IllegalStateException("No bioAssays in the BioAssayDimension");

        sanityCheckQuantitationTypes(datasetSamples);

        List<String> quantitationTypes = datasetSamples.iterator().next().getColumnNames();
        List<String> quantitationTypeDescriptions = datasetSamples.iterator().next().getColumnDescriptions();
        boolean first = true;

        /*
         * For the data that are put in 'datasets' (GDS), we know the type of data, but it can be misleading (e.g., Affy
         * data is 'counts'). For others we just have free text in the column descriptions
         */

        for (String quantitationType : quantitationTypes) {

            // skip the first quantitationType, it's the ID or ID_REF.
            if (first) {
                first = false;
                continue;
            }

            int columnAccordingToSample = quantitationTypes.indexOf(quantitationType);

            int quantitationTypeIndex = values.getQuantitationTypeIndex(geoPlatform, quantitationType);
            log.debug("Processing " + quantitationType + " (column=" + quantitationTypeIndex
                    + " - according to sample, it's " + columnAccordingToSample + ")");

            Map<String, List<Object>> dataVectors = makeDataVectors(values, datasetSamples, quantitationTypeIndex);

            if (dataVectors == null || dataVectors.size() == 0) {
                log.debug("No data for " + quantitationType + " (column=" + quantitationTypeIndex + ")");
                continue;
            }
            log.info(dataVectors.size() + " data vectors for " + quantitationType);

            Object exampleValue = dataVectors.values().iterator().next().iterator().next();

            QuantitationType qt = QuantitationType.Factory.newInstance();
            qt.setName(quantitationType);
            String description = quantitationTypeDescriptions.get(columnAccordingToSample);
            qt.setDescription(description);
            QuantitationTypeParameterGuesser.guessQuantitationTypeParameters(qt, quantitationType, description,
                    exampleValue);

            int count = 0;
            int skipped = 0;
            for (String designElementName : dataVectors.keySet()) {
                List<Object> dataVector = dataVectors.get(designElementName);
                if (dataVector == null || dataVector.size() == 0)
                    continue;

                RawExpressionDataVector vector = convertDesignElementDataVector(geoPlatform, expExp,
                        bioAssayDimension, designElementName, dataVector, qt);

                if (vector == null) {
                    skipped++;
                    if (log.isDebugEnabled())
                        log.debug("Null vector for DE=" + designElementName + " QT=" + quantitationType);
                    continue;
                }

                if (log.isTraceEnabled()) {
                    log.trace(designElementName + " " + qt.getName() + " " + qt.getRepresentation() + " "
                            + dataVector.size() + " elements in vector");
                }

                expExp.getRawExpressionDataVectors().add(vector);

                if (++count % LOGGING_VECTOR_COUNT_UPDATE == 0 && log.isDebugEnabled()) {
                    log.debug(count + " Data vectors added");
                }
            }

            if (count > 0) {
                expExp.getQuantitationTypes().add(qt);
                if (log.isDebugEnabled() && count > 1000) {
                    log.debug(count + " Data vectors added for '" + quantitationType + "'");
                }
            } else {
                log.info("No vectors were retained for " + quantitationType
                        + " -- usually this is due to all values being missing.");
            }

            if (skipped > 0) {
                log.info("Skipped " + skipped + " vectors");
            }
        }
        log.info("Total of " + expExp.getRawExpressionDataVectors().size() + " vectors on platform " + geoPlatform
                + ", " + expExp.getQuantitationTypes().size() + " quantitation types.");
    }

    private DatabaseEntry createDatabaseEntry(ExternalDatabase externalDb, String externalRef, BioSequence bs) {
        DatabaseEntry dbe;
        if (externalDb.getName().equalsIgnoreCase("genbank")) {
            // deal with accessions in the form XXXXX.N
            dbe = ExternalDatabaseUtils.getGenbankAccession(externalRef);
            dbe.setExternalDatabase(externalDb); // make sure it matches the one used here.
            bs.setName(dbe.getAccession()); // trimmed version.
        } else {
            bs.setName(externalRef);
            dbe = DatabaseEntry.Factory.newInstance();
            dbe.setAccession(externalRef);
            dbe.setExternalDatabase(externalDb);
        }
        return dbe;
    }

    /**
     * @param platform
     * @return
     */
    private ArrayDesign createMinimalArrayDesign(GeoPlatform platform) {
        ArrayDesign arrayDesign = ArrayDesign.Factory.newInstance();
        arrayDesign.setName(platform.getTitle());
        arrayDesign.setShortName(platform.getGeoAccession());
        arrayDesign.setDescription(platform.getDescriptions());
        PlatformType technology = platform.getTechnology();
        if (technology == PlatformType.dualChannel || technology == PlatformType.dualChannelGenomic
                || technology == PlatformType.spottedOligonucleotide
                || technology == PlatformType.spottedDNAOrcDNA) {
            arrayDesign.setTechnologyType(TechnologyType.TWOCOLOR);
        } else if (technology == PlatformType.singleChannel || technology == PlatformType.oligonucleotideBeads
                || technology == PlatformType.inSituOligonucleotide) {
            arrayDesign.setTechnologyType(TechnologyType.ONECOLOR);
        } else if (technology == null) {
            log.warn("No technology type available for " + platform + ", provisionally setting to 'dual mode'");
            arrayDesign.setTechnologyType(TechnologyType.DUALMODE);
        } else if (technology.equals(PlatformType.MPSS)) {
            // we don't support this directly
            arrayDesign.setTechnologyType(TechnologyType.NONE);
        } else if (technology.equals(PlatformType.SAGE) || technology.equals(PlatformType.SAGENlaIII)
                || technology.equals(PlatformType.SAGERsaI) || technology.equals(PlatformType.SAGESau3A)
                || technology.equals(PlatformType.other)) {
            // we don't support this directly
            arrayDesign.setTechnologyType(TechnologyType.NONE);
        } else {
            throw new IllegalArgumentException("Don't know how to interpret technology type " + technology);
        }
        return arrayDesign;
    }

    /**
     * @param taxon Can be null, we will discard this
     * @return
     */
    private BioSequence createMinimalBioSequence(Taxon taxon) {
        BioSequence bs = BioSequence.Factory.newInstance();
        bs.setTaxon(taxon);
        bs.setPolymerType(PolymerType.DNA);
        bs.setType(SequenceType.DNA);
        return bs;
    }

    /**
     * @param platform
     * @return
     */
    private String determinePlatformDescriptionColumn(GeoPlatform platform) {
        Collection<String> columnNames = platform.getColumnNames();
        int index = 0;
        for (String string : columnNames) {
            if (GeoConstants.likelyProbeDescription(string)) {
                log.debug(string + " appears to indicate the  probe descriptions in column " + index
                        + " for platform " + platform);
                return string;
            }
            index++;
        }
        log.debug("No platform element description column found for " + platform);
        return null;
    }

    /**
     * @param platform
     * @return
     */
    private ExternalDatabase determinePlatformExternalDatabase(GeoPlatform platform) {
        ExternalDatabase result = ExternalDatabase.Factory.newInstance();

        Collection<String> likelyExternalDatabaseIdentifiers = determinePlatformExternalReferenceIdentifier(
                platform);
        String dbIdentifierDescription = getDbIdentifierDescription(platform);

        String url = null;
        if (dbIdentifierDescription == null) {
            return null;
        } else if (dbIdentifierDescription.indexOf("LINK_PRE:") >= 0) {
            // example: #ORF = ORF reference LINK_PRE:"http://genome-www4.stanford.edu/cgi-bin/SGD/locus.pl?locus="
            url = dbIdentifierDescription.substring(dbIdentifierDescription.indexOf("LINK_PRE:"));
            result.setWebUri(url);
        }

        if (likelyExternalDatabaseIdentifiers == null || likelyExternalDatabaseIdentifiers.size() == 0) {
            throw new IllegalStateException("No external database identifier column was identified");
        }

        String likelyExternalDatabaseIdentifier = likelyExternalDatabaseIdentifiers.iterator().next();
        if (likelyExternalDatabaseIdentifier.equals("GB_ACC") || likelyExternalDatabaseIdentifier.equals("GB_LIST")
                || likelyExternalDatabaseIdentifier.toLowerCase().equals("genbank")) {
            if (genbank == null) {
                if (externalDatabaseService != null) {
                    genbank = externalDatabaseService.find("Genbank");
                } else {
                    result.setName("Genbank");
                    result.setType(DatabaseType.SEQUENCE);
                    genbank = result;
                }
            }
            result = genbank;
        } else if (likelyExternalDatabaseIdentifier.equals("ORF")) {
            String organism = platform.getOrganisms().iterator().next();

            result.setType(DatabaseType.GENOME);

            if (organismDatabases.containsKey(organism)) {
                result.setName(organismDatabases.get(organism));
            } else {
                // Placeholder
                result.setName(organism + " ORFs");
                log.warn("External database is " + result);
            }

            // } else if ( likelyExternalDatabaseIdentifier.equals( "CLONE_ID" ) ) {
            // String sample = platform.getColumnData( "CLONE_ID" ).iterator().next();
            // if ( sample.startsWith( "IMAGE" ) ) {
            // result.setType( DatabaseType.SEQUENCE );
            // result.setName( "IMAGE" );
            // } else {
            // throw new IllegalStateException( "No external database was identified, but had CLONE_ID" );
            // }
        }
        if (result == null || result.getName() == null) {
            throw new IllegalStateException("No external database was identified");
        }
        return result;
    }

    /**
     * @param platform
     * @return
     */
    private Collection<String> determinePlatformExternalReferenceIdentifier(GeoPlatform platform) {
        Collection<String> columnNames = platform.getColumnNames();
        int index = 0;
        Collection<String> matches = new HashSet<String>();
        for (String string : columnNames) {
            if (GeoConstants.likelyExternalReference(string)) {
                log.debug(string + " appears to indicate a possible external reference identifier in column "
                        + index + " for platform " + platform);
                matches.add(string);

            }
            index++;
        }

        if (matches.size() == 0) {
            return null;
        }
        return matches;

    }

    /**
     * Allow multiple taxa for a platform. Method retrieves from parsed GEO file the header column name which contains
     * the species/organism used to create probe.
     * 
     * @param platform Parsed GEO platform details.
     * @return Column name in GEO used to identify column containing species/organism used to create probe
     */
    private String determinePlatformProbeOrganismColumn(GeoPlatform platform) {
        Collection<String> columnNames = platform.getColumnNames();
        int index = 0;
        for (String columnName : columnNames) {
            if (GeoConstants.likelyProbeOrganism(columnName)) {
                log.debug("'" + columnName + "' appears to indicate the sequences in column " + index
                        + " for platform " + platform);
                return columnName;
            }
            index++;
        }
        log.debug("No platform organism description column found for " + platform);
        return null;
    }

    /**
     * @param platform
     * @return
     */
    private String determinePlatformSequenceColumn(GeoPlatform platform) {
        Collection<String> columnNames = platform.getColumnNames();
        int index = 0;
        for (String columnName : columnNames) {
            if (GeoConstants.likelySequence(columnName)) {
                log.debug("'" + columnName + "' appears to indicate the sequences in column " + index
                        + " for platform " + platform);
                return columnName;
            }
            index++;
        }
        log.debug("No platform sequence description column found for " + platform);
        return null;
    }

    /**
     * @param experimentalFactors
     * @param convertVariableToFactorValue
     * @return
     */
    private FactorValue findMatchingExperimentalFactorValue(Collection<ExperimentalFactor> experimentalFactors,
            FactorValue convertVariableToFactorValue) {
        Collection<Characteristic> characteristics = convertVariableToFactorValue.getCharacteristics();
        if (characteristics.size() > 1)
            throw new UnsupportedOperationException(
                    "Can't handle factor values with multiple characteristics in GEO conversion");
        Characteristic c = characteristics.iterator().next();

        FactorValue matchingFactorValue = null;
        factors: for (ExperimentalFactor factor : experimentalFactors) {
            for (FactorValue fv : factor.getFactorValues()) {
                for (Characteristic m : fv.getCharacteristics()) {
                    if (m.getCategory().equals(c.getCategory()) && m.getValue().equals(c.getValue())) {
                        matchingFactorValue = fv;
                        break factors;
                    }

                }
            }
        }
        return matchingFactorValue;
    }

    /**
     * Turn a rough-cut dimension name into something of reasonable length.
     * 
     * @param dimensionName
     * @return
     */
    private String formatName(StringBuilder dimensionName) {
        return dimensionName.length() > 100 ? dimensionName.toString().substring(0, 100)
                : dimensionName.toString() + "...";
    }

    /**
     * @param series
     * @param i
     * @return
     */
    private String getBiomaterialPrefix(GeoSeries series, int i) {
        String bioMaterialName = series.getGeoAccession() + BIOMATERIAL_NAME_TAG + i;
        return bioMaterialName;
    }

    /**
     * @param geoDataset
     * @return
     */
    private Collection<GeoSample> getDatasetSamples(GeoDataset geoDataset) {
        Collection<GeoSample> seriesSamples = getSeriesSamplesForDataset(geoDataset);

        // get just the samples used in this dataset
        Collection<GeoSample> datasetSamples = new ArrayList<GeoSample>();

        for (GeoSample sample : seriesSamples) {
            if (geoDataset.getColumnNames().contains(sample.getGeoAccession())) {
                if (log.isDebugEnabled()) {
                    log.debug("Dataset " + geoDataset + " includes sample " + sample + " on platform "
                            + sample.getPlatforms().iterator().next());
                }
                datasetSamples.add(sample);
            }

            if (log.isDebugEnabled()) {
                log.debug("Dataset " + geoDataset + " DOES NOT include sample " + sample + " on platform "
                        + sample.getPlatforms().iterator().next());
            }
        }

        return datasetSamples;
    }

    /**
     * @param platform
     * @return
     */
    private String getDbIdentifierDescription(GeoPlatform platform) {
        Collection<String> columnNames = platform.getColumnNames();
        int index = 0;
        for (String string : columnNames) {
            if (GeoConstants.likelyExternalReference(string)) {
                return platform.getColumnDescriptions().get(index);
            }
            index++;
        }
        return null;
    }

    private String getExternalAccession(List<List<String>> externalRefs, int i) {
        for (List<String> refs : externalRefs) {
            if (StringUtils.isNotBlank(refs.get(i))) {
                return refs.get(i);
            }
        }
        return null;
    }

    /**
     * @param series
     * @return map of organisms to a collection of either datasets or platforms.
     */
    private Map<String, Collection<GeoData>> getOrganismDatasetMap(GeoSeries series) {
        Map<String, Collection<GeoData>> organisms = new HashMap<String, Collection<GeoData>>();

        if (series.getDatasets() == null || series.getDatasets().size() == 0) {
            for (GeoSample sample : series.getSamples()) {

                assert sample.getPlatforms().size() > 0 : sample + " has no platform";
                assert sample.getPlatforms().size() == 1 : sample + " has multiple platforms: "
                        + StringUtils.join(sample.getPlatforms().toArray(), ",");
                String organism = sample.getPlatforms().iterator().next().getOrganisms().iterator().next();

                if (!organisms.containsKey(organism)) {
                    organisms.put(organism, new HashSet<GeoData>());
                }
                organisms.get(organism).add(sample.getPlatforms().iterator().next());
            }
        } else {
            for (GeoDataset dataset : series.getDatasets()) {
                String organism = dataset.getOrganism();
                if (organisms.get(organism) == null) {
                    organisms.put(organism, new HashSet<GeoData>());
                }
                organisms.get(organism).add(dataset);
            }
        }
        return organisms;
    }

    /**
     * Based on the sample organisms, not the platforms. For rare cases where more than one species is run on a platform
     * (e.g., chimp and human run on a human platform)
     * 
     * @param series
     * @return
     */
    private Map<String, Collection<GeoSample>> getOrganismSampleMap(GeoSeries series) {
        Map<String, Collection<GeoSample>> result = new HashMap<String, Collection<GeoSample>>();
        for (GeoSample sample : series.getSamples()) {
            String organism = sample.getOrganism();
            if (!result.containsKey(organism)) {
                result.put(organism, new HashSet<GeoSample>());
            }
            result.get(organism).add(sample);
        }
        return result;
    }

    /**
     * @param series
     * @return
     */
    private Map<GeoPlatform, Collection<GeoData>> getPlatformDatasetMap(GeoSeries series) {
        Map<GeoPlatform, Collection<GeoData>> platforms = new HashMap<GeoPlatform, Collection<GeoData>>();

        if (series.getDatasets() == null || series.getDatasets().size() == 0) {
            for (GeoSample sample : series.getSamples()) {
                assert sample.getPlatforms().size() > 0 : sample + " has no platform";
                assert sample.getPlatforms().size() == 1 : sample + " has multiple platforms: "
                        + StringUtils.join(sample.getPlatforms().toArray(), ",");
                GeoPlatform platform = sample.getPlatforms().iterator().next();

                if (platforms.get(platform) == null) {
                    platforms.put(platform, new HashSet<GeoData>());
                }
                // This is a bit silly, but made coding this easier.
                platforms.get(platform).add(sample.getPlatforms().iterator().next());
            }
        } else {
            for (GeoDataset dataset : series.getDatasets()) {
                GeoPlatform platform = dataset.getPlatform();
                if (platforms.get(platform) == null) {
                    platforms.put(platform, new HashSet<GeoData>());
                }
                platforms.get(platform).add(dataset);
            }
        }
        return platforms;
    }

    /**
     * Assumes that all samples have the same platform. If not, throws an exception.
     * 
     * @param datasetSamples
     * @return
     */
    private GeoPlatform getPlatformForSamples(List<GeoSample> datasetSamples) {
        GeoPlatform platform = null;
        for (GeoSample sample : datasetSamples) {
            Collection<GeoPlatform> platforms = sample.getPlatforms();
            assert platforms.size() != 0;
            if (platforms.size() > 1) {
                throw new UnsupportedOperationException(
                        "Can't handle GEO sample ids associated with multiple platforms just yet");
            }
            GeoPlatform nextPlatform = platforms.iterator().next();
            if (platform == null)
                platform = nextPlatform;
            else if (!platform.equals(nextPlatform))
                throw new IllegalArgumentException("All samples here must use the same platform");
        }
        return platform;
    }

    private Collection<GeoSample> getSeriesSamplesForDataset(GeoDataset geoDataset) {
        Collection<GeoSample> seriesSamples = null;
        Collection<GeoSeries> series = geoDataset.getSeries();

        // this is highly defensive programming prompted by a bug that caused the same series to be listed more than
        // once, but empty in one case.

        if (series == null || series.size() == 0) {
            throw new IllegalStateException("No series for " + geoDataset);
        }

        if (series.size() > 1) {
            log.warn("More than one series for a data set, probably some kind of parsing bug!");
        }

        boolean found = false;
        for (GeoSeries series2 : series) {
            if (series2.getSamples() != null && series2.getSamples().size() > 0) {
                if (found == true) {
                    throw new IllegalStateException(
                            "More than one of the series for " + geoDataset + " has samples: " + series2);
                }
                seriesSamples = series2.getSamples();
                found = true;
            }
        }

        if (seriesSamples == null || seriesSamples.size() == 0) {
            throw new IllegalStateException("No series had samples for " + geoDataset);
        }

        return seriesSamples;
    }

    /**
     * Deal with missing values, identified by nulls or number format exceptions.
     * 
     * @param toConvert
     * @param pt
     */
    private void handleMissing(List<Object> toConvert, PrimitiveType pt) {
        if (pt.equals(PrimitiveType.DOUBLE)) {
            toConvert.add(Double.NaN);
        } else if (pt.equals(PrimitiveType.STRING)) {
            toConvert.add("");
        } else if (pt.equals(PrimitiveType.INT)) {
            toConvert.add(0);
        } else if (pt.equals(PrimitiveType.BOOLEAN)) {
            toConvert.add(false);
        } else {
            throw new UnsupportedOperationException(
                    "Missing values in data vectors of type " + pt + " not supported");
        }
    }

    /**
     * 
     */
    private void initGeoExternalDatabase() {
        if (geoDatabase == null) {
            if (externalDatabaseService != null) {
                ExternalDatabase ed = externalDatabaseService.find("GEO");
                if (ed != null) {
                    geoDatabase = ed;
                }
            } else {
                geoDatabase = ExternalDatabase.Factory.newInstance();
                geoDatabase.setName("GEO");
                geoDatabase.setType(DatabaseType.EXPRESSION);
            }
        }
    }

    /**
     * Check to see if we got any data. If not, we should return null. This can happen if the quantitation type was
     * filtered during parsing.
     */
    private boolean isPopulated(Map<String, List<Object>> dataVectors) {
        boolean filledIn = false;
        for (List<Object> vector : dataVectors.values()) {
            for (Object object : vector) {
                if (object != null) {
                    filledIn = true;
                    break;
                }
            }
            if (filledIn == true) {
                break;
            }
        }
        return filledIn;
    }

    /**
     * Convert the by-sample data for a given quantitation type to by-designElement data vectors.
     * 
     * @param datasetSamples The samples we want to get data for. These should all have been run on the same platform.
     * @param quantitationTypeIndex - first index is 0
     * @return A map of Strings (design element names) to Lists of Strings containing the data.
     * @throws IllegalArgumentException if the columnNumber is not valid
     */
    private Map<String, List<Object>> makeDataVectors(GeoValues values, List<GeoSample> datasetSamples,
            Integer quantitationTypeIndex) {
        Map<String, List<Object>> dataVectors = new HashMap<String, List<Object>>(INITIAL_VECTOR_CAPACITY);
        Collections.sort(datasetSamples);
        GeoPlatform platform = getPlatformForSamples(datasetSamples);

        // the locations of the data we need in the target vectors (mostly reordering)
        Integer[] indices = values.getIndices(platform, datasetSamples, quantitationTypeIndex);

        if (indices == null || indices.length == 0)
            return null; // can happen if quantitation type was filtered out.

        assert indices.length == datasetSamples.size();

        String identifier = platform.getIdColumnName();
        List<String> designElements = platform.getColumnData(identifier);

        if (designElements == null) {
            return dataVectors;
        }

        for (String designElementName : designElements) {
            /*
             * Note: null data can happen if the platform has probes that aren't in the data, or if this is a
             * quantitation type that was filtered out during parsing, or absent from some samples.
             */
            List<Object> ob = values.getValues(platform, quantitationTypeIndex, designElementName, indices);
            if (ob == null || ob.size() == 0)
                continue;
            assert ob.size() == datasetSamples.size();
            dataVectors.put(designElementName, ob);
        }

        boolean filledIn = isPopulated(dataVectors);

        values.clear(platform, datasetSamples, quantitationTypeIndex);

        if (!filledIn)
            return null;

        return dataVectors;
    }

    /**
     * @param bioMaterial
     * @param experimentalFactors
     * @param variable
     */
    private void matchSampleReplicationToExperimentalFactorValue(BioMaterial bioMaterial,
            Collection<ExperimentalFactor> experimentalFactors, GeoReplication replication) {
        // find the experimentalFactor that matches this.
        FactorValue convertVariableToFactorValue = convertReplicationToFactorValue(replication);
        FactorValue matchingFactorValue = findMatchingExperimentalFactorValue(experimentalFactors,
                convertVariableToFactorValue);
        if (matchingFactorValue != null) {
            bioMaterial.getFactorValues().add(matchingFactorValue);
        } else {
            throw new IllegalStateException("Could not find matching factor value for " + replication
                    + " in experimental design for sample " + bioMaterial);
        }
    }

    /**
     * @param expExp ExpressionExperiment to be searched for matching BioAssays
     * @param bioAssayDimension BioAssayDimension to be added to
     * @param sampleAcc The GEO accession id for the sample. This is compared to the external accession recorded for the
     *        BioAssay
     * @return
     */
    private boolean matchSampleToBioAssay(ExpressionExperiment expExp, BioAssayDimension bioAssayDimension,
            String sampleAcc) {

        for (BioAssay bioAssay : expExp.getBioAssays()) {
            if (sampleAcc.equals(bioAssay.getAccession().getAccession())) {
                bioAssayDimension.getBioAssays().add(bioAssay);
                log.debug("Found sample match for bioAssay " + bioAssay.getAccession().getAccession());
                return true;
            }
        }
        return false;
    }

    /**
     * @param bioMaterial
     * @param experimentalFactors
     * @param variable
     */
    private void matchSampleVariableToExperimentalFactorValue(BioMaterial bioMaterial,
            Collection<ExperimentalFactor> experimentalFactors, GeoVariable variable) {

        // find the experimentalFactor that matches this.
        FactorValue convertVariableToFactorValue = convertVariableToFactorValue(variable);
        FactorValue matchingFactorValue = findMatchingExperimentalFactorValue(experimentalFactors,
                convertVariableToFactorValue);

        if (matchingFactorValue == null) {
            throw new IllegalStateException("Could not find matching factor value for " + variable
                    + " in experimental design for sample " + bioMaterial);
        }

        // make sure we don't put the factor value on more than once.
        if (alreadyHasFactorValueForFactor(bioMaterial, matchingFactorValue.getExperimentalFactor())) {
            return;
        }

        bioMaterial.getFactorValues().add(matchingFactorValue);

    }

    /**
     * @param remoteFileUrl
     * @param e
     */
    private void reportUrlError(URL remoteFileUrl, MalformedURLException e) {
        log.error("Problems with url: " + remoteFileUrl
                + ".  Will not store the url of the raw data file.  Full error is: ");
        e.printStackTrace();
    }

    /**
     * Sanity check.
     * 
     * @param datasetSamples
     */
    private void sanityCheckQuantitationTypes(List<GeoSample> datasetSamples) {
        List<String> reference = new ArrayList<String>();

        // Choose a reference that is populated ...
        boolean expectingData = true;
        for (GeoSample sample : datasetSamples) {
            if (sample.hasUsableData()) {
                reference = sample.getColumnNames();
                if (!reference.isEmpty())
                    break;
            } else {
                expectingData = false;
            }
        }

        if (!expectingData) {
            log.warn("Not expecting any data, so quantitation type checking is skipped.");
            return;
        }

        if (reference.isEmpty()) {
            throw new IllegalStateException("None of the samples have any quantitation type names");
        }

        boolean someDidntMatch = false;
        String lastError = "";
        for (GeoSample sample : datasetSamples) {
            List<String> columnNames = sample.getColumnNames();

            assert !columnNames.isEmpty();

            if (!reference.equals(columnNames)) {

                StringBuilder buf = new StringBuilder();
                buf.append("\nSample " + sample.getGeoAccession() + ":");
                for (String string : columnNames) {
                    buf.append(" " + string);
                }
                buf.append("\nReference " + datasetSamples.iterator().next().getGeoAccession() + ":");
                for (String string : reference) {
                    buf.append(" " + string);
                }
                someDidntMatch = true;

                lastError = "*** Sample quantitation type names do not match: " + buf.toString();
                log.debug(lastError);
            }
        }
        if (someDidntMatch) {
            log.warn("Samples do not have consistent quantification type names. Last error was: " + lastError);
        }
    }

    /**
     * @param mgedTerm
     * @return
     */
    private VocabCharacteristic setCategory(String mgedTerm) {
        VocabCharacteristic categoryTerm = VocabCharacteristic.Factory.newInstance();
        categoryTerm.setCategory(mgedTerm);
        categoryTerm.setCategoryUri(MgedOntologyService.MGED_ONTO_BASE_URL + "#" + mgedTerm);
        categoryTerm.setEvidenceCode(GOEvidenceCode.IIA);
        return categoryTerm;
    }

    /**
     * @param characteristic
     * @return
     */
    private String trimString(String characteristic) {
        if (characteristic.length() > 255) {
            log.warn("** Characteristic too long: " + characteristic + " - will truncate - ****");
            characteristic = characteristic.substring(0, 199) + " (truncated at 200 characters)";
        }
        return characteristic;
    }

}