ubic.gemma.core.loader.expression.geo.GeoConverterImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.expression.geo.GeoConverterImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.expression.geo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;

import ubic.basecode.io.ByteArrayConverter;
import ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignSequenceProcessingServiceImpl;
import ubic.gemma.core.loader.expression.geo.model.GeoChannel;
import ubic.gemma.core.loader.expression.geo.model.GeoContact;
import ubic.gemma.core.loader.expression.geo.model.GeoData;
import ubic.gemma.core.loader.expression.geo.model.GeoDataset;
import ubic.gemma.core.loader.expression.geo.model.GeoDataset.ExperimentType;
import ubic.gemma.core.loader.expression.geo.model.GeoDataset.PlatformType;
import ubic.gemma.core.loader.expression.geo.model.GeoPlatform;
import ubic.gemma.core.loader.expression.geo.model.GeoReplication;
import ubic.gemma.core.loader.expression.geo.model.GeoReplication.ReplicationType;
import ubic.gemma.core.loader.expression.geo.model.GeoSample;
import ubic.gemma.core.loader.expression.geo.model.GeoSeries;
import ubic.gemma.core.loader.expression.geo.model.GeoSeries.SeriesType;
import ubic.gemma.core.loader.expression.geo.model.GeoSubset;
import ubic.gemma.core.loader.expression.geo.model.GeoValues;
import ubic.gemma.core.loader.expression.geo.model.GeoVariable;
import ubic.gemma.core.loader.expression.geo.model.GeoVariable.VariableType;
import ubic.gemma.core.loader.expression.geo.util.GeoConstants;
import ubic.gemma.core.loader.util.parser.ExternalDatabaseUtils;
import ubic.gemma.model.association.GOEvidenceCode;
import ubic.gemma.model.common.auditAndSecurity.Contact;
import ubic.gemma.model.common.description.BibliographicReference;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.DatabaseType;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.common.quantitationtype.PrimitiveType;
import ubic.gemma.model.common.quantitationtype.QuantitationType;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.TechnologyType;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.bioAssayData.BioAssayDimension;
import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector;
import ubic.gemma.model.expression.biomaterial.BioMaterial;
import ubic.gemma.model.expression.biomaterial.Treatment;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.ExperimentalDesign;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.FactorType;
import ubic.gemma.model.expression.experiment.FactorValue;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.PolymerType;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.model.genome.gene.phenotype.valueObject.CharacteristicBasicValueObject;
import ubic.gemma.persistence.service.common.description.ExternalDatabaseService;
import ubic.gemma.persistence.service.genome.taxon.TaxonService;
import ubic.gemma.persistence.util.Settings;

/**
 * Convert GEO domain objects into Gemma objects. Usually we trigger this by passing in GeoSeries objects.
 * GEO has four basic kinds of objects: Platforms (ArrayDesigns), Samples (BioAssays), Series (Experiments) and DataSets
 * (which are curated Experiments). Note that a sample can belong to more than one series. A series can include more
 * than one dataset. GEO also supports the concept of a superseries. See
 * http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html.
 * A curated expression data set is at first represented by a GEO "GDS" number (a curated dataset), which maps to a
 * series (GSE). HOWEVER, multiple datasets may go together to form a series (GSE). This can happen when the "A" and "B"
 * arrays were both run on the same samples. Thus we actually normally go by GSE.
 * This service can be used in database-aware or unaware states. However, it has prototype scope as it has some 'global'
 * data structures used during processing.
 *
 * @author keshav
 * @author pavlidis
 */
@Component
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class GeoConverterImpl implements GeoConverter {

    /**
     * Default passed to {@link Settings#getInt} for {@link #tooManyElements} when looking up
     * "geo.platform.import.maxelements".
     */
    private static final int DEFAULT_DEFINITION_OF_TOO_MANY_ELEMENTS = 100000;

    /**
     * This string is inserted into the descriptions of constructed biomaterials.
     */
    private static final String BIOMATERIAL_DESCRIPTION_PREFIX = "Generated by Gemma for: ";

    /**
     * This string is inserted into the names of constructed biomaterials, so you get names like GSE5929_Biomat_58.
     */
    private static final String BIOMATERIAL_NAME_TAG = "_Biomat_";

    /**
     * How often we tell the user about data processing (items per update)
     */
    private static final int LOGGING_VECTOR_COUNT_UPDATE = 2000;
    /**
     * Initial guess at how many designelementdatavectors to allocate space for.
     */
    private static final int INITIAL_VECTOR_CAPACITY = 10000;
    /**
     * The scientific name used for rat species.
     */
    private static final String RAT = "Rattus norvegicus";
    /**
     * Logger for this class. (Previously created for ArrayDesignSequenceProcessingServiceImpl, which filed this
     * converter's messages under the wrong logger name.)
     */
    private static final Log log = LogFactory.getLog(GeoConverterImpl.class.getName());
    /**
     * Organism scientific name mapped to the name of an organism-specific database.
     */
    private static final Map<String, String> organismDatabases = new HashMap<>();

    static {
        GeoConverterImpl.organismDatabases.put("Saccharomyces cerevisiae", "SGD");
        GeoConverterImpl.organismDatabases.put("Schizosaccharomyces pombe", "GeneDB");
    }

    private final ByteArrayConverter byteArrayConverter = new ByteArrayConverter();
    // Taxon lookup maps; both are emptied by clear().
    private final Map<String, Taxon> taxonScientificNameMap = new HashMap<>();
    private final Map<String, Taxon> taxonCommonNameMap = new HashMap<>();
    /**
     * More than this and we apply stricter selection criteria for choosing elements to keep on a platform.
     */
    private int tooManyElements = Settings.getInt("geo.platform.import.maxelements",
            GeoConverterImpl.DEFAULT_DEFINITION_OF_TOO_MANY_ELEMENTS);
    @Autowired
    private ExternalDatabaseService externalDatabaseService;
    @Autowired
    private TaxonService taxonService;
    private ExternalDatabase geoDatabase;
    // Per-conversion working state; replaced with fresh instances by clear().
    private Map<String, Map<String, CompositeSequence>> platformDesignElementMap = new HashMap<>();
    /**
     * Accumulates everything produced by convert(Collection); also the source of the tally in toString().
     */
    private Collection<Object> results = new HashSet<>();
    private Map<String, ArrayDesign> seenPlatforms = new HashMap<>();
    private ExternalDatabase genbank;
    private boolean splitByPlatform = false;
    private boolean forceConvertElements = false;

    /**
     * Discards all state accumulated by earlier conversions: the taxon lookup maps are emptied and the result,
     * platform and design-element holders are replaced by fresh, empty instances (so a collection previously
     * returned from convert is not mutated).
     */
    @Override
    public void clear() {
        this.taxonScientificNameMap.clear();
        this.taxonCommonNameMap.clear();
        this.platformDesignElementMap = new HashMap<>();
        this.seenPlatforms = new HashMap<>();
        this.results = new HashSet<>();
    }

    /**
     * Converts each GEO object in turn and accumulates the outcome in the shared result collection. A conversion
     * that yields a collection contributes its members individually; a null outcome is ignored.
     *
     * @param  geoObjects the GEO objects to convert
     * @return            the accumulated results (the converter's own collection, not a copy)
     */
    @Override
    public Collection<Object> convert(Collection<? extends GeoData> geoObjects) {
        for (GeoData geoObject : geoObjects) {
            Object converted = this.convert(geoObject);
            if (converted == null) {
                continue;
            }

            if (converted instanceof Collection) {
                results.addAll((Collection<?>) converted);
            } else {
                results.add(converted);
            }
        }

        GeoConverterImpl.log.info("Converted object tally:\n" + this);
        return results;
    }

    /**
     * Dispatches a single GEO object to the matching converter. Datasets, series and platforms are supported;
     * subsets, samples and any other type are rejected.
     *
     * @param  geoObject the object to convert; null is tolerated and yields null (with a warning)
     * @return           the converted object(s)
     * @throws IllegalArgumentException if the object is of a type that cannot be converted on its own
     */
    @Override
    public Object convert(GeoData geoObject) {
        if (geoObject == null) {
            GeoConverterImpl.log.warn("Null object");
            return null;
        }

        if (geoObject instanceof Collection) {
            //noinspection unchecked
            return this.convert((Collection<GeoData>) geoObject);
        }
        if (geoObject instanceof GeoDataset) {
            return this.convertDataset((GeoDataset) geoObject);
        }
        if (geoObject instanceof GeoSeries) {
            // typically we start here, with a series.
            return this.convertSeries((GeoSeries) geoObject);
        }

        // Subsets and samples are checked before platforms, exactly as in the original dispatch order.
        boolean explicitlyUnsupported = geoObject instanceof GeoSubset || geoObject instanceof GeoSample;
        if (!explicitlyUnsupported && geoObject instanceof GeoPlatform) {
            return this.convertPlatform((GeoPlatform) geoObject);
        }

        throw new IllegalArgumentException(
                "Can't deal with " + geoObject.getClass().getName() + " ('" + geoObject + "')");
    }

    /**
     * Turns a GEO subset into a categorical experimental factor on the experiment's design (re-using an existing
     * factor whose name matches, ignoring case), then derives a factor value from the subset and attaches it to the
     * biomaterials of the subset's samples.
     *
     * @param expExp    experiment whose experimental design receives the factor
     * @param geoSubSet the GEO subset to convert
     */
    @Override
    public void convertSubsetToExperimentalFactor(ExpressionExperiment expExp, GeoSubset geoSubSet) {

        ExperimentalDesign design = expExp.getExperimentalDesign();
        Collection<ExperimentalFactor> knownFactors = design.getExperimentalFactors();

        // Build the candidate factor; it is dropped below if an equivalent factor is already present.
        ExperimentalFactor factor = ExperimentalFactor.Factory.newInstance();
        factor.setName(geoSubSet.getType().toString());

        Characteristic category = Characteristic.Factory.newInstance();
        this.convertVariableType(category, geoSubSet.getType());
        category.setDescription("Converted from GEO subset " + geoSubSet.getGeoAccession());
        category.setValue(category.getCategory());// is this right?
        category.setValueUri(category.getCategoryUri()); // is this right?

        factor.setCategory(category);
        factor.setType(FactorType.CATEGORICAL);
        factor.setDescription("Converted from GEO subset " + geoSubSet.getGeoAccession());

        ExperimentalFactor alreadyPresent = null;
        for (ExperimentalFactor candidate : knownFactors) {
            if ((factor.getName()).equalsIgnoreCase(candidate.getName())) {
                alreadyPresent = candidate;
                break;
            }
        }

        if (alreadyPresent == null) {
            design.getExperimentalFactors().add(factor);
        } else {
            factor = alreadyPresent;
            if (GeoConverterImpl.log.isDebugEnabled()) {
                GeoConverterImpl.log.debug(
                        factor.getName() + " already exists.  Not adding to list of experimental factors.");
            }
        }

        /* bi-directional ... don't forget this. */
        factor.setExperimentalDesign(design);

        FactorValue value = this.convertSubsetDescriptionToFactorValue(geoSubSet, factor);
        this.addFactorValueToBioMaterial(expExp, geoSubSet, value);
    }

    /**
     * Determines the primary taxon of an array platform, in this order:
     * <ol>
     * <li>If no platform taxa are given, null is returned.</li>
     * <li>If exactly one platform taxon is given on the GEO submission, that taxon is returned.</li>
     * <li>Otherwise every platform taxon is thawed and the probe taxa are tallied; the most frequent probe taxon
     * (ignoring null, blank and "n/a" entries, which are seen on control spots) is converted and returned. When
     * several taxa tie, which one wins depends on map iteration order.</li>
     * <li>If no usable probe taxon exists, an exception is thrown.</li>
     * </ol>
     * NOTE(review): an earlier description mentioned choosing a common parent of the platform taxa; the code only
     * thaws the platform taxa and never compares parents. The thaw is kept unchanged.
     *
     * @param  platformTaxa Collection of taxa that were given on the GEO array submission as platform taxa
     * @param  probeTaxa    Collection of taxa strings defining the taxon of each probe on the array; may be null
     *                      and may contain null or blank entries
     * @return              Primary taxon of array as determined by this method, or null if no platform taxa were
     *                      supplied
     * @throws IllegalArgumentException if several platform taxa exist but no usable probe taxon is found
     */
    @Override
    public Taxon getPrimaryArrayTaxon(Collection<Taxon> platformTaxa, Collection<String> probeTaxa)
            throws IllegalArgumentException {

        if (platformTaxa == null || platformTaxa.isEmpty()) {
            return null;
        }

        // if there is only 1 taxon on the platform submission then this is the primary taxon
        if (platformTaxa.size() == 1) {
            Taxon onlyTaxon = platformTaxa.iterator().next();
            GeoConverterImpl.log.debug("Only 1 taxon given on GEO platform: " + onlyTaxon);
            return onlyTaxon;
        }

        // If there are multiple taxa on array
        GeoConverterImpl.log.debug(platformTaxa.size() + " taxa in GEO platform");
        for (Taxon platformTaxon : platformTaxa) {
            // thaw to get parent taxon
            this.taxonService.thaw(platformTaxon);
        }

        // calculate based on probe taxa:
        GeoConverterImpl.log.debug("Looking at probe taxa to determine 'primary' taxon");

        // Number of probes seen for each probe taxon string.
        Map<String, Integer> probeCountsByTaxon = new HashMap<>();
        if (probeTaxa != null) {
            for (String probeTaxon : probeTaxa) {
                Integer seenSoFar = probeCountsByTaxon.get(probeTaxon);
                probeCountsByTaxon.put(probeTaxon, seenSoFar == null ? 1 : seenSoFar + 1);
            }
        }

        String primaryTaxonName = "";
        int highestScore = 0;
        for (Map.Entry<String, Integer> entry : probeCountsByTaxon.entrySet()) {
            String taxon = entry.getKey();
            // filter out those probes that have no taxon set (control spots). Here's that 'n/a' again, kind of
            // ugly but we see it in some arrays. The constant-first comparison keeps a null key (HashMap allows
            // one) from raising a NullPointerException; isNotBlank then rejects it.
            if (!"n/a".equals(taxon) && StringUtils.isNotBlank(taxon) && entry.getValue() > highestScore) {
                primaryTaxonName = taxon;
                highestScore = entry.getValue();
            }
        }

        if (StringUtils.isNotBlank(primaryTaxonName)) {
            return this.convertProbeOrganism(primaryTaxonName);
        }

        // error no taxon on array submission
        throw new IllegalArgumentException("No taxon could be determined for GEO platform ");

    }

    /**
     * Stores the split-by-platform flag on this converter.
     * NOTE(review): presumably controls whether a multi-platform series is converted into one experiment per
     * platform; the code that reads the flag is outside this view - confirm.
     *
     * @param splitByPlatform new value of the flag
     */
    @Override
    public void setSplitByPlatform(boolean splitByPlatform) {
        this.splitByPlatform = splitByPlatform;
    }

    /**
     * Convert a vector of raw values into a byte[] for saving in the database. Null and blank values are recorded as
     * missing via handleMissing, as are strings that fail numeric parsing (to keep the parser from failing when
     * dealing with strange GEO files that have values like "Error" for an expression value). Values that are not
     * strings are passed through unchanged.
     *
     * @param  vector of values (usually Strings) to be converted to primitive values (double, int etc)
     * @param  qt     The quantitation type for the values to be converted; its representation selects the parser.
     * @return        the encoded bytes, or null if the vector is null, empty, or contains only missing values
     * @throws IllegalStateException         if a CHAR value is not exactly one character long, or if the encoded
     *                                       bytes do not decode back to the expected number of values
     * @throws UnsupportedOperationException if the representation is not one of the handled primitive types
     */
    @Override
    public byte[] convertData(List<Object> vector, QuantitationType qt) {

        if (vector == null || vector.isEmpty()) {
            return null;
        }

        // Cheap pre-scan: bail out early when every entry is null.
        boolean anyPresent = false;
        Iterator<Object> scan = vector.iterator();
        while (!anyPresent && scan.hasNext()) {
            anyPresent = scan.next() != null;
        }
        if (!anyPresent) {
            if (GeoConverterImpl.log.isDebugEnabled()) {
                GeoConverterImpl.log.debug("No data for " + qt + " in vector of length " + vector.size());
            }
            return null;
        }

        final int expectedLength = vector.size();
        List<Object> toConvert = new ArrayList<>();
        PrimitiveType pt = qt.getRepresentation();
        int numMissing = 0;

        for (Object rawValue : vector) {
            if (rawValue != null && !(rawValue instanceof String)) {
                // use as is.
                toConvert.add(rawValue);
                continue;
            }

            // Either null or a String from here on; isBlank covers null, empty and whitespace-only.
            String valueString = (String) rawValue;
            if (StringUtils.isBlank(valueString)) {
                numMissing++;
                this.handleMissing(toConvert, pt);
                continue;
            }

            try {
                if (pt.equals(PrimitiveType.DOUBLE)) {
                    toConvert.add(Double.parseDouble(valueString));
                } else if (pt.equals(PrimitiveType.STRING)) {
                    toConvert.add(valueString);
                } else if (pt.equals(PrimitiveType.CHAR)) {
                    if (valueString.length() != 1) {
                        throw new IllegalStateException(
                                "Attempt to cast a string of length " + valueString.length() + " to a char: "
                                        + rawValue + "(quantitation type =" + qt);
                    }
                    toConvert.add(valueString.charAt(0));
                } else if (pt.equals(PrimitiveType.INT)) {
                    toConvert.add(Integer.parseInt(valueString));
                } else if (pt.equals(PrimitiveType.BOOLEAN)) {
                    toConvert.add(Boolean.parseBoolean(valueString));
                } else {
                    throw new UnsupportedOperationException("Data vectors of type " + pt + " not supported");
                }
            } catch (NumberFormatException e) {
                // unparseable number: treat like a missing value
                numMissing++;
                this.handleMissing(toConvert, pt);
            }
        }

        if (numMissing == expectedLength) {
            return null;
        }

        byte[] bytes = byteArrayConverter.toBytes(toConvert.toArray());

        /*
         * Debugging - absolutely make sure we can convert the data back.
         */
        if (pt.equals(PrimitiveType.DOUBLE)) {
            int decoded = byteArrayConverter.byteArrayToDoubles(bytes).length;
            if (decoded != expectedLength) {
                throw new IllegalStateException("Expected " + expectedLength + " got " + decoded + " doubles");
            }
        } else if (pt.equals(PrimitiveType.INT)) {
            int decoded = byteArrayConverter.byteArrayToInts(bytes).length;
            if (decoded != expectedLength) {
                throw new IllegalStateException("Expected " + expectedLength + " got " + decoded + " ints");
            }
        } else if (pt.equals(PrimitiveType.BOOLEAN)) {
            int decoded = byteArrayConverter.byteArrayToBooleans(bytes).length;
            if (decoded != expectedLength) {
                throw new IllegalStateException("Expected " + expectedLength + " got " + decoded + " booleans");
            }
        }

        return bytes;
    }

    /**
     * Stores the force-convert-elements flag on this converter.
     * NOTE(review): the code that reads this flag is outside this view; presumably it forces platform elements to
     * be converted even when they would otherwise be skipped - confirm.
     *
     * @param forceConvertElements new value of the flag
     */
    @Override
    public void setForceConvertElements(boolean forceConvertElements) {
        this.forceConvertElements = forceConvertElements;
    }

    /**
     * Overrides the element-count threshold above which stricter selection criteria are applied when choosing
     * elements to keep on a platform (initially read from the "geo.platform.import.maxelements" setting).
     *
     * @param tooManyElements the new threshold
     */
    @Override
    public void setElementLimitForStrictness(int tooManyElements) {
        this.tooManyElements = tooManyElements;
    }

    /**
     * Summarises the accumulated results as one line per class: the number of instances, a space, and the fully
     * qualified class name followed by "s".
     *
     * @return the tally, or an empty string if nothing has been converted
     */
    @Override
    public String toString() {
        Map<String, Integer> countsByClass = new HashMap<>();
        for (Object element : results) {
            String className = element.getClass().getName();
            Integer soFar = countsByClass.get(className);
            countsByClass.put(className, soFar == null ? 1 : soFar + 1);
        }

        StringBuilder summary = new StringBuilder();
        for (Map.Entry<String, Integer> entry : countsByClass.entrySet()) {
            summary.append(entry.getValue()).append(" ").append(entry.getKey()).append("s\n");
        }
        return summary.toString();
    }

    /**
     * Attaches the factor value to the biomaterial of every bioassay in the experiment whose accession equals the
     * GEO accession of one of the subset's samples (biomaterial --> factorvalue).
     *
     * @param expExp      experiment whose bioassays are searched
     * @param geoSubSet   subset providing the samples
     * @param factorValue factor value to add to each matching biomaterial
     */
    private void addFactorValueToBioMaterial(ExpressionExperiment expExp, GeoSubset geoSubSet,
            FactorValue factorValue) {
        for (GeoSample sample : geoSubSet.getSamples()) {
            // find the matching biomaterial(s) in the expression experiment.
            for (BioAssay bioAssay : expExp.getBioAssays()) {
                String assayAccession = bioAssay.getAccession().getAccession();
                if (!assayAccession.equals(sample.getGeoAccession())) {
                    continue;
                }

                BioMaterial material = bioAssay.getSampleUsed();
                if (GeoConverterImpl.log.isDebugEnabled()) {
                    GeoConverterImpl.log.debug(
                            "Adding " + factorValue.getExperimentalFactor() + " : " + factorValue + " to "
                                    + material);
                }
                material.getFactorValues().add(factorValue);
            }
        }
    }

    /**
     * Checks, by factor name only, whether the biomaterial already carries a value for the given factor.
     *
     * @param  bioMaterial        biomaterial whose factor values are inspected
     * @param  experimentalFactor factor to look for
     * @return                    true if one of the biomaterial's factor values belongs to a factor with the same
     *                            name as the given experimentalFactor; false otherwise.
     */
    private boolean alreadyHasFactorValueForFactor(BioMaterial bioMaterial, ExperimentalFactor experimentalFactor) {
        Iterator<FactorValue> values = bioMaterial.getFactorValues().iterator();
        while (values.hasNext()) {
            ExperimentalFactor owner = values.next().getExperimentalFactor();
            // A weak form of 'equals' - only the names are compared.
            boolean sameName = owner.getName().equals(experimentalFactor.getName());
            if (sameName) {
                return true;
            }
        }
        return false;
    }

    /**
     * Flag as unneeded data that are not from experiment types that we support, such as ChIP.
     * Datasets of type arrayCGH, ChIPChip, geneExpressionSAGEbased or Other are added (with their samples) to the
     * skip collections. Independently of datasets, a sample is kept only if its type is "RNA", or if its type is
     * "SRA" or "MPSS" with library source "transcriptomic" and one of the recognised RNA sequencing strategies;
     * every other sample is added to samplesToSkip. A sample without a library strategy is skipped (it previously
     * caused a NullPointerException when its library source was "transcriptomic").
     *
     * @param series         series to inspect
     * @param dataSetsToSkip receives the GEO accessions of datasets to skip
     * @param samplesToSkip  receives the samples to skip
     */
    private void checkForDataToSkip(GeoSeries series, Collection<String> dataSetsToSkip,
            Collection<GeoSample> samplesToSkip) {

        for (GeoDataset dataset : series.getDatasets()) {
            // This doesn't cover every possibility...
            if (dataset.getExperimentType().equals(ExperimentType.arrayCGH)
                    || dataset.getExperimentType().equals(ExperimentType.ChIPChip)
                    || dataset.getExperimentType().equals(ExperimentType.geneExpressionSAGEbased)
                    || dataset.getExperimentType().equals(ExperimentType.Other)) {
                GeoConverterImpl.log
                        .warn("Gemma does not know how to handle experiment type=" + dataset.getExperimentType());

                if (series.getDatasets().size() == 1) {
                    GeoConverterImpl.log.warn("Because the experiment type cannot be handled, "
                            + "and there is only one data set in this series, nothing will be returned!");
                }
                samplesToSkip.addAll(this.getDatasetSamples(dataset));
                dataSetsToSkip.add(dataset.getGeoAccession());
            } else {
                GeoConverterImpl.log.info("Data from " + dataset + " is of type " + dataset.getExperimentType()
                        + ", " + this.getDatasetSamples(dataset).size() + " samples.");
            }
        }

        for (GeoSample sample : series.getSamples()) {
            if (sample.getType().equals("RNA")) {
                // this is apparently what we get for microarrays
                continue;
            }

            if ((sample.getType().equals("SRA") || sample.getType().equals("MPSS"))
                    && "transcriptomic".equals(sample.getLibSource())) {
                // have to drill down. Constant-first comparisons so that a missing strategy simply fails to match
                // instead of throwing.
                String libStrategy = sample.getLibStrategy();
                if ("RNA-Seq".equals(libStrategy) || "ncRNA-Seq".equals(libStrategy)
                        || "miRNA-Seq".equals(libStrategy) || "ssRNA-seq".equals(libStrategy)) {
                    continue;
                }
            }

            // some MPSS might not have libSource filled in. Other possibilities we know about for type are
            // 'other', 'SAGE' and 'mixed';
            GeoConverterImpl.log
                    .info("Skipping ineligible sample: " + sample.getGeoAccession() + ": Type=" + sample.getType()
                            + " LibSource=" + sample.getLibSource() + " LibStrategy=" + sample.getLibStrategy());
            samplesToSkip.add(sample);
        }

    }

    /**
     * Used for the case where we want to split the GSE into two (or more) separate ExpressionExperiments based on
     * platform. This is necessary when the two platforms are completely incompatible. A new series is assembled
     * from the samples and datasets belonging to the given platform, most descriptive information is copied from
     * the original series, and the converted result is added to the output collection.
     *
     * @param series             the original series being split
     * @param converted          receives the experiment converted from the platform-specific series
     * @param platformDatasetMap datasets grouped by platform
     * @param i                  suffix appended (after a ".") to the series accession of the new series
     * @param platform           the platform to extract
     */
    private void convertByPlatform(GeoSeries series, Collection<ExpressionExperiment> converted,
            Map<GeoPlatform, Collection<GeoData>> platformDatasetMap, int i, GeoPlatform platform) {
        GeoSeries subSeries = new GeoSeries();

        Collection<GeoData> platformDatasets = platformDatasetMap.get(platform);
        assert !platformDatasets.isEmpty();

        // keep only the samples that were run on this platform.
        for (GeoSample sample : series.getSamples()) {
            // ugly, we have to assume there is only one platform per sample.
            GeoPlatform samplePlatform = sample.getPlatforms().iterator().next();
            if (samplePlatform.equals(platform)) {
                subSeries.addSample(sample);
            }
        }

        // move this platform's datasets from the original series to the new one.
        for (GeoData candidate : platformDatasets) {
            if (!(candidate instanceof GeoDataset)) {
                continue;
            }
            GeoDataset dataset = (GeoDataset) candidate;
            dataset.dissociateFromSeries(series);
            subSeries.addDataSet(dataset);
        }

        /*
         * Basically copy over most of the information
         */
        subSeries.setContact(series.getContact());
        subSeries.setContributers(series.getContributers());
        subSeries.setGeoAccession(series.getGeoAccession() + "." + i);
        subSeries.setKeyWords(series.getKeyWords());
        subSeries.setOverallDesign(series.getOverallDesign());
        subSeries.setPubmedIds(series.getPubmedIds());
        subSeries.setReplicates(series.getReplicates());
        subSeries.setSampleCorrespondence(series.getSampleCorrespondence());
        subSeries.setSummaries(series.getSummaries());
        subSeries.setTitle(series.getTitle() + " - " + platform.getGeoAccession());
        subSeries.setWebLinks(series.getWebLinks());
        subSeries.setValues(series.getValues());
        subSeries.getSeriesTypes().addAll(series.getSeriesTypes());

        ExpressionExperiment experiment = this.convertSeriesSingle(subSeries);
        converted.add(experiment);

    }

    /**
     * Collect information about the sample into biomaterial characteristics and Treatments.
     * GEO does not keep track of 'biomaterials' that make up different channels. Therefore the two channels effectively
     * make up a single biomaterial, as far as we're concerned. We're losing information in those cases.
     *
     * @param sample      sample
     * @param channel     channel
     * @param bioMaterial BA
     */
    private void convertChannel(GeoSample sample, GeoChannel channel, BioMaterial bioMaterial) {
        if (bioMaterial == null)
            return;
        GeoConverterImpl.log
                .debug("Sample: " + sample.getGeoAccession() + " - Converting channel " + channel.getSourceName());

        bioMaterial.setDescription((bioMaterial.getDescription() == null ? "" : bioMaterial.getDescription() + ";")
                + "Channel " + channel.getChannelNumber());

        if (!StringUtils.isBlank(channel.getGrowthProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " treatment");
            treatment.setDescription(channel.getGrowthProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        if (!StringUtils.isBlank(channel.getTreatmentProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " growth");
            treatment.setDescription(channel.getTreatmentProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        if (!StringUtils.isBlank(channel.getExtractProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " extraction");
            treatment.setDescription(channel.getExtractProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        if (!StringUtils.isBlank(channel.getLabelProtocol())) {
            Treatment treatment = Treatment.Factory.newInstance();
            treatment.setName(sample.getGeoAccession() + " channel " + channel.getChannelNumber() + " labeling");
            treatment.setDescription(channel.getLabelProtocol());
            bioMaterial.getTreatments().add(treatment);
        }

        for (String characteristic : channel.getCharacteristics()) {

            characteristic = this.trimString(characteristic);

            parseGEOSampleCharacteristicString(characteristic, bioMaterial);

        }

        if (StringUtils.isNotBlank(channel.getSourceName())) {
            Characteristic sourceChar = Characteristic.Factory.newInstance();
            sourceChar.setDescription("GEO Sample source");
            String characteristic = this.trimString(channel.getSourceName());

            /*
             * We once considered this like "organism part" but Biosource in GEO often (usually?) has information
             * besides organism part
             * or else can be cell type. Best to leave it blank.
             */
            sourceChar.setCategory("BioSource");
            // sourceChar.setCategoryUri( "http://www.ebi.ac.uk/efo/EFO_0000635"  );

            sourceChar.setValue(characteristic);
            sourceChar.setOriginalValue(characteristic);
            sourceChar.setEvidenceCode(GOEvidenceCode.IIA);
            bioMaterial.getCharacteristics().add(sourceChar);
        }

        if (StringUtils.isNotBlank(channel.getOrganism())) {
            // if we have a case where the two channels have different taxon throw an exception.
            String currentChannelTaxon = channel.getOrganism();
            if (bioMaterial.getSourceTaxon() != null) {
                String previousChannelTaxon = bioMaterial.getSourceTaxon().getScientificName();
                if (previousChannelTaxon != null && !(previousChannelTaxon.equals(currentChannelTaxon))) {
                    throw new IllegalArgumentException(
                            "Channel 1 taxon is " + bioMaterial.getSourceTaxon().getScientificName()
                                    + " Channel 2 taxon is " + currentChannelTaxon
                                    + " Check that is expected for sample " + sample.getGeoAccession());
                }

            } else {
                // get it from the channel.
                Taxon taxon = Taxon.Factory.newInstance();
                taxon.setScientificName(channel.getOrganism());
                taxon.setIsGenesUsable(true); // plausible default, doesn't matter.
                bioMaterial.setSourceTaxon(taxon);
            }

        }

        if (channel.getMolecule() != null) {
            // this we can convert automatically pretty easily.
            Characteristic c = channel.getMoleculeAsCharacteristic();
            bioMaterial.getCharacteristics().add(c);
        }

        if (StringUtils.isNotBlank(channel.getLabel())) {
            String characteristic = this.trimString(channel.getLabel());
            // This is typically something like "biotin-labeled nucleotides", which we can convert later.
            Characteristic labelChar = Characteristic.Factory.newInstance();
            labelChar.setDescription("GEO Sample label");
            labelChar.setCategory("labelling"); /* used to be LabelCompound */
            labelChar.setCategoryUri("http://www.ebi.ac.uk/efo/EFO_0000562");
            labelChar.setValue(characteristic);
            labelChar.setOriginalValue(characteristic);
            labelChar.setEvidenceCode(GOEvidenceCode.IIA);
            bioMaterial.getCharacteristics().add(labelChar);
        }
    }

    /**
     * Parse a submitter-provided GEO sample descriptor into Characteristics on the given BioMaterial.
     *
     * <p>
     * The string is first cut into fields at every ';' or ','; each field is then cut at every ':' or '='. Inputs
     * this is meant to cope with look like:
     * <ul>
     * <li>Sex=M or Sex:M
     * <li>Sex=M;Tissue=brain
     * <li>Sex:M,Tissue:brain
     * <li>and variants thereof.
     * </ul>
     *
     * <p>
     * Only a field that splits into exactly two pieces is treated as a category/value pair. The category has to be
     * recognized by {@code GeoVariable.convertStringToType} (a hard-coded, limited list); the value is then looked up
     * in the preset ontology mappings for that category. Any field that is not a pair, whose category is not
     * recognized, or whose conversion throws, is handed to {@code doFallback} instead.
     *
     * <p>
     * Nothing sophisticated (e.g., stemming) is attempted and the mappings are maintained by hand, so many strings
     * will not be matched. Characteristics on biomaterials are not mission-critical, so that is acceptable.
     *
     * @param characteristic string to be parsed
     * @param bioMaterial    to which characteristics will be added
     */
    void parseGEOSampleCharacteristicString(String characteristic, BioMaterial bioMaterial) {
        final String fallbackDescription = "GEO Sample characteristic";

        // Strings can look like "Age :8 weeks; Sex: M": several characteristics separated by ';' (or ',').
        for (String rawField : characteristic.split("[;,]")) {

            // Within one field the category and value are separated by ':' (sometimes '=').
            String[] pieces = rawField.split("[:=]");
            if (pieces.length != 2) {
                // not a simple category/value pair: keep the raw text
                this.doFallback(bioMaterial, rawField, fallbackDescription);
                continue;
            }

            String category = pieces[0].trim().replaceAll("\t", " ").replaceAll("_", " ");
            String value = pieces[1].trim().replaceAll("\t", " ").replaceAll("_", " ")
                    .replaceFirst("^(human|mouse|rat|murine|mus musculus|homo sapiens)\\s", "");

            Characteristic gemmaChar = Characteristic.Factory.newInstance();
            gemmaChar.setOriginalValue(value);
            gemmaChar.setEvidenceCode(GOEvidenceCode.IIA);
            gemmaChar.setDescription(fallbackDescription);

            VariableType vartype = GeoVariable.convertStringToType(category);
            if (vartype == null) {
                log.debug("Could not parse into VariableType: " + category + " (in: " + characteristic + ")");
                this.doFallback(bioMaterial, value, fallbackDescription);
                continue;
            }

            this.convertVariableType(gemmaChar, vartype);

            CharacteristicBasicValueObject presetTerm = ontologyLookupSampleCharacteristic(value,
                    gemmaChar.getCategory());

            try {
                if (presetTerm == null) {
                    // No preset term; there may not be a category either, but that's okay.
                    gemmaChar.setValue(value);
                } else {
                    gemmaChar.setValue(presetTerm.getValue());
                    gemmaChar.setValueUri(presetTerm.getValueUri());
                    gemmaChar.setCategory(presetTerm.getCategory());
                    gemmaChar.setCategoryUri(presetTerm.getCategoryUri());
                }
                bioMaterial.getCharacteristics().add(gemmaChar);
            } catch (Exception e) {
                // conversion didn't work, fall back.
                this.doFallback(bioMaterial, value, fallbackDescription);
            }
        }
    }

    /**
     * Attempt to identify a preset value (ontology term) for certain strings found in GEO data sets. The presets are
     * stored in valueStringToOntologyTermMappings.txt and are loaded whenever the shared mapping table is found empty.
     *
     * @param  value    raw value string; it is lower-cased before being looked up
     * @param  category category under which to look; may be null
     * @return          the preset term, or null if the category is null, has no presets, or has no preset for this
     *                  value
     */
    private CharacteristicBasicValueObject ontologyLookupSampleCharacteristic(String value, String category) {
        if (term2OntologyMappings.isEmpty()) {
            initializeTerm2OntologyMappings();
        }

        if (category == null) {
            return null;
        }

        Map<String, CharacteristicBasicValueObject> presetsForCategory = term2OntologyMappings.get(category);
        return presetsForCategory == null ? null : presetsForCategory.get(value.toLowerCase());
    }

    /**
     * Load the preset string-to-ontology-term mappings from the classpath resource
     * valueStringToOntologyTermMappings.txt into term2OntologyMappings.
     *
     * <p>
     * Each usable line has at least five tab-separated fields: input string, value, value URI, category, category
     * URI. Blank lines and lines starting with '#' are ignored; lines with too few fields and repeated input strings
     * (within a category) are skipped with a warning. Loading is best-effort: if the resource is missing or cannot be
     * read an error is logged and the mappings are left as they are.
     *
     * <p>
     * See also GeoChannel, in which we have canned values for some sample characteristics.
     * See also convertVariableType where we map some to some categories.
     *
     * @throws IllegalArgumentException if a line has a blank value, value URI, category or category URI
     */
    private void initializeTerm2OntologyMappings() {
        final String resource = "/ubic/gemma/core/ontology/valueStringToOntologyTermMappings.txt";

        InputStream r = this.getClass().getResourceAsStream(resource);
        if (r == null) {
            // getResourceAsStream signals "not found" with null; wrapping that in a reader would throw an NPE.
            log.error("Ontology terms mapped from strings failed to initialize: resource not found: " + resource);
            return;
        }

        // Decode explicitly so the result does not depend on the platform default charset.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(r, java.nio.charset.StandardCharsets.UTF_8))) {
            String rawLine;
            // readLine() returning null is the end-of-stream signal; ready() only says whether a read would block.
            while ((rawLine = in.readLine()) != null) {
                String line = rawLine.trim();
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }

                String[] split = StringUtils.split(line, "\t");

                if (split.length < 5) {
                    log.warn("Did not get expected fields for line: " + line);
                    continue;
                }

                String inputValue = split[0].toLowerCase();

                String value = split[1];
                String valueUri = split[2];
                String category = split[3];
                String categoryUri = split[4];

                if (StringUtils.isBlank(value) || StringUtils.isBlank(valueUri) || StringUtils.isBlank(category)
                        || StringUtils.isBlank(categoryUri)) {
                    throw new IllegalArgumentException("Invalid line had blank field(s): " + line);
                }

                Map<String, CharacteristicBasicValueObject> forCategory = term2OntologyMappings.get(category);
                if (forCategory == null) {
                    forCategory = new HashMap<>();
                    term2OntologyMappings.put(category, forCategory);
                }

                if (forCategory.containsKey(inputValue)) {
                    // first definition wins
                    log.warn("Duplicate value: " + inputValue + ", ignoring");
                    continue;
                }

                CharacteristicBasicValueObject c = new CharacteristicBasicValueObject(null, value, valueUri,
                        category, categoryUri);
                forCategory.put(inputValue, c);
            }
        } catch (IOException e) {
            // keep the cause so the failure can be diagnosed
            log.error("Ontology terms mapped from strings failed to initialize from file", e);
        }

    }

    /**
     * Preset ontology terms shared by all instances, keyed by category and then by lower-cased input string.
     * Populated by initializeTerm2OntologyMappings() when a lookup finds it empty.
     */
    private static Map<String, Map<String, CharacteristicBasicValueObject>> term2OntologyMappings = new ConcurrentHashMap<>();

    /**
     * Append the names of the contributors of a GeoSeries to the description of the ExpressionExperiment, as
     * "\nContributors: name1; name2; ...". Does nothing if the series lists no contributors.
     *
     * @param series series
     * @param expExp ee
     */
    private void convertContacts(GeoSeries series, ExpressionExperiment expExp) {
        if (series.getContributers().isEmpty()) {
            return;
        }

        List<String> names = new ArrayList<>();
        for (GeoContact contributer : series.getContributers()) {
            names.add(contributer.getName());
        }

        // StringUtils.join takes the collection first and the separator second. With the arguments the other way
        // around the call binds to the varargs overload and yields "; [a, b]" instead of "a; b".
        expExp.setDescription(expExp.getDescription() + "\nContributors: " + StringUtils.join(names, "; "));
    }

    /**
     * Create a DatabaseEntry that points at the GEO external database and carries the accession of the given GEO
     * object.
     *
     * @param  geoData GEO object whose accession is used
     * @return         new entry; any trailing ".&lt;digits&gt;" suffix of the accession is dropped
     */
    private DatabaseEntry convertDatabaseEntry(GeoData geoData) {
        this.initGeoExternalDatabase();

        // remove trailing ".1" etc. in case it was split.
        String accession = geoData.getGeoAccession().replaceAll("\\.[0-9]+$", "");

        DatabaseEntry entry = DatabaseEntry.Factory.newInstance();
        entry.setExternalDatabase(this.geoDatabase);
        entry.setAccession(accession);
        return entry;
    }

    /**
     * Convert a GEO dataset by converting the single series it belongs to.
     *
     * @param  geoDataset dataset; must be associated with exactly one series
     * @return            the first ExpressionExperiment produced for that series
     * @throws IllegalArgumentException      if the dataset has no series
     * @throws UnsupportedOperationException if the dataset has more than one series
     */
    private ExpressionExperiment convertDataset(GeoDataset geoDataset) {

        final int seriesCount = geoDataset.getSeries().size();

        if (seriesCount == 0) {
            throw new IllegalArgumentException("GEO Dataset must have associated series");
        }

        if (seriesCount > 1) {
            throw new UnsupportedOperationException("GEO Dataset can only be associated with one series");
        }

        Collection<ExpressionExperiment> converted = this.convertSeries(geoDataset.getSeries().iterator().next());

        // unless we have multiple species, more than one result is not possible.
        assert converted.size() == 1;
        return converted.iterator().next();
    }

    /**
     * Fold the information from a GEO dataset into an existing ExpressionExperiment: descriptions, a provenance
     * note on the (already converted) platform, data vectors and subset associations.
     *
     * @param  geoDataset dataset to convert
     * @param  expExp     experiment to update
     * @throws IllegalStateException if the platform of the dataset has not been converted yet
     */
    private void convertDataset(GeoDataset geoDataset, ExpressionExperiment expExp) {

        /*
         * The samples of this data set may all have been skipped because they duplicate ones found in other
         * series (see GeoService); in that case there is nothing left to convert.
         */
        if (this.getDatasetSamples(geoDataset).isEmpty()) {
            log.info("No samples remain for " + geoDataset + ", nothing to do");
            return;
        }

        log.info("Converting dataset:" + geoDataset);

        this.convertDatasetDescriptions(geoDataset, expExp);

        GeoPlatform platform = geoDataset.getPlatform();
        String platformAccession = platform.getGeoAccession();

        ArrayDesign ad = seenPlatforms.get(platformAccession);
        if (ad == null) {
            /*
             * See bug 1672. Sometimes the platform recorded for the dataset is wrong; we don't see that under
             * normal use, so it is reported as an error here.
             */
            throw new IllegalStateException("ArrayDesigns must be converted before datasets - didn't find "
                    + platform + "; possibly dataset has incorrect platform?");
        }

        String provenance = "\nFrom " + platformAccession + "\nLast Updated: " + platform.getLastUpdateDate();
        ad.setDescription(ad.getDescription() + provenance);

        this.convertDataSetDataVectors(geoDataset.getSeries().iterator().next().getValues(), geoDataset, expExp);

        this.convertSubsetAssociations(expExp, geoDataset);

    }

    /**
     * Convert the GEO data into DesignElementDataVectors associated with the ExpressionExperiment. Once the
     * vectors for the platform of the dataset have been converted, the values held for that platform are cleared.
     *
     * @param values     the raw values to convert
     * @param geoDataset Source of the data
     * @param expExp     ExpressionExperiment to fill in.
     */
    private void convertDataSetDataVectors(GeoValues values, GeoDataset geoDataset, ExpressionExperiment expExp) {
        List<GeoSample> samples = new ArrayList<>(this.getDatasetSamples(geoDataset));
        log.info(samples.size() + " samples in " + geoDataset);

        GeoPlatform platformOfDataset = geoDataset.getPlatform();
        this.convertVectorsForPlatform(values, expExp, samples, platformOfDataset);

        // the values for this platform are no longer needed
        values.clear(platformOfDataset);
    }

    /**
     * Merge the descriptive text of a GEO dataset into the ExpressionExperiment.
     *
     * <p>
     * If the experiment has no description yet, the dataset's is used as the starting point. A note naming the
     * dataset accession (and the provider's update date, if present) is always appended. If the experiment has no
     * name the dataset title becomes the name; otherwise the title is appended to the description.
     *
     * @param geoDataset source of the descriptions
     * @param expExp     experiment to update
     */
    private void convertDatasetDescriptions(GeoDataset geoDataset, ExpressionExperiment expExp) {
        String description = expExp.getDescription();
        if (StringUtils.isEmpty(description)) {
            description = geoDataset.getDescription(); // probably not empty.
        }

        description = description + "\nIncludes " + geoDataset.getGeoAccession();

        if (StringUtils.isNotEmpty(geoDataset.getUpdateDate())) {
            description = description + " (Last updated by provider at import time: "
                    + geoDataset.getUpdateDate() + ")\n";
        }

        if (StringUtils.isEmpty(expExp.getName())) {
            expExp.setName(geoDataset.getTitle());
        } else {
            description = description + " Dataset description " + geoDataset.getGeoAccession() + ": "
                    + geoDataset.getTitle() + "\n";
        }

        expExp.setDescription(description);
    }

    /**
     * Build the RawExpressionDataVector for a single design element from its raw values.
     *
     * @param  geoPlatform       platform the element belongs to
     * @param  expExp            experiment the vector is for
     * @param  bioAssayDimension dimension describing the samples; must have as many bioassays as there are values
     * @param  designElementName name of the element as it appears in the data
     * @param  dataVector        raw values, one per bioassay
     * @param  qt                quantitation type of the values
     * @return                   the vector, or null if there is no data, the data converted to nothing (all missing
     *                           etc.), or the element cannot be matched to a CompositeSequence of the converted
     *                           platform
     * @throws IllegalArgumentException if the number of values does not match the number of bioassays
     * @throws IllegalStateException    if the matched element has a sequence database entry whose external database
     *                                  has no name
     */
    private RawExpressionDataVector convertDesignElementDataVector(GeoPlatform geoPlatform,
            ExpressionExperiment expExp, BioAssayDimension bioAssayDimension, String designElementName,
            List<Object> dataVector, QuantitationType qt) {

        if (dataVector == null || dataVector.isEmpty()) {
            return null;
        }

        int expectedCount = bioAssayDimension.getBioAssays().size();
        int actualCount = dataVector.size();
        if (actualCount != expectedCount) {
            throw new IllegalArgumentException(
                    "Expected " + expectedCount + " in bioassaydimension, data contains " + actualCount);
        }

        byte[] bytes = this.convertData(dataVector, qt);
        if (bytes == null) { // all missing etc.
            if (log.isDebugEnabled()) {
                log.debug("All missing values for DE=" + designElementName + " QT=" + qt);
            }
            return null;
        }
        if (log.isDebugEnabled()) {
            log.debug(bytes.length + " bytes for " + dataVector.size() + " raw elements");
        }

        ArrayDesign convertedPlatform = this.convertPlatform(geoPlatform);
        assert convertedPlatform != null;

        Map<String, CompositeSequence> elementsByName = platformDesignElementMap
                .get(convertedPlatform.getShortName());
        assert elementsByName != null;

        /*
         * Replace name with the one we're using in the array design after conversion. This information gets filled
         * in earlier in the conversion process (see GeoService)
         */
        String mappedName = geoPlatform.getProbeNamesInGemma().get(designElementName);
        if (mappedName == null) {
            // Sigh..this is unlikely to work in general, but see bug 1709.
            mappedName = geoPlatform.getProbeNamesInGemma().get(designElementName.toUpperCase());

            if (mappedName == null) {
                /*
                 * This situation can be okay, it can happen in cases where (for example) the element was filtered
                 * out when the platform was being created, but it appears in the data.
                 */
                return null;
            }
        }

        CompositeSequence element = elementsByName.get(mappedName);
        if (element == null) {
            /*
             * This could be an error, but also can happen if we are on a platform for which we expect to replace
             * data.
             */
            return null;
        }

        boolean hasSequenceEntry = element.getBiologicalCharacteristic() != null
                && element.getBiologicalCharacteristic().getSequenceDatabaseEntry() != null;
        if (hasSequenceEntry && element.getBiologicalCharacteristic().getSequenceDatabaseEntry()
                .getExternalDatabase().getName() == null) {
            // this is obscure.
            throw new IllegalStateException(element + " sequence accession external database lacks name");
        }

        if (log.isDebugEnabled()) {
            log.debug("Associating " + element + " with dedv");
        }

        RawExpressionDataVector result = RawExpressionDataVector.Factory.newInstance();
        result.setDesignElement(element);
        result.setExpressionExperiment(expExp);
        result.setBioAssayDimension(bioAssayDimension);
        result.setQuantitationType(qt);
        result.setData(bytes);
        return result;
    }

    /**
     * Create the BioAssayDimension for a list of samples. The list is sorted in place, each sample accession is
     * matched against the bioassays of the experiment, and the dimension is named after the experiment short name
     * followed by the accessions.
     *
     * @param  datasetSamples List of GeoSamples to be matched up with BioAssays; sorted as a side effect.
     * @param  expExp         ExpresssionExperiment
     * @return                BioAssayDimension representing the samples.
     */
    private BioAssayDimension convertGeoSampleList(List<GeoSample> datasetSamples, ExpressionExperiment expExp) {
        Collections.sort(datasetSamples);

        BioAssayDimension dimension = BioAssayDimension.Factory.newInstance();

        StringBuilder label = new StringBuilder();
        label.append(expExp.getShortName()).append(": ");

        for (GeoSample sample : datasetSamples) {
            String accession = sample.getGeoAccession();
            label.append(accession).append(",");

            if (!this.matchSampleToBioAssay(expExp, dimension, accession)) {
                // this is normal because not all headings are
                // sample ids, and we may have skipped samples
                log.warn("No bioassay match for " + accession);
            }
        }

        log.debug(dimension.getBioAssays().size() + " Bioassays in biodimension");

        dimension.setName(this.formatName(label));
        dimension.setDescription(label.toString());
        return dimension;
    }

    /**
     * Given an organism name from GEO, produce the matching Taxon. A new Taxon (genes not usable) is built for the
     * name; when a taxon service is available it is passed through findOrCreate and the result, if any, is used
     * instead. The taxon is also recorded in taxonScientificNameMap under the name.
     *
     * @param  taxonScientificName name as provided by GEO presumed to be a scientific name
     * @return                     Taxon details
     */
    private Taxon convertOrganismToTaxon(String taxonScientificName) {
        assert taxonScientificName != null;

        // we don't distinguish between rat species: anything starting with the RAT name is treated as RAT.
        String name = taxonScientificName.toLowerCase().startsWith(GeoConverterImpl.RAT) ? GeoConverterImpl.RAT
                : taxonScientificName;

        Taxon result = Taxon.Factory.newInstance();
        result.setScientificName(name);
        result.setIsGenesUsable(false);

        if (taxonService != null) {
            Taxon persistent = taxonService.findOrCreate(result);
            if (persistent != null) {
                result = persistent;
            }
        }

        taxonScientificNameMap.put(name, result);
        return result;

    }

    /**
     * Convert a GEO platform into an ArrayDesign, converting each platform only once.
     *
     * <p>
     * If the accession is already in seenPlatforms the stored ArrayDesign is returned. Otherwise a minimal array
     * design is created, its primary taxon, manufacturer (if given) and GEO database entry are set, and it is stored
     * in seenPlatforms before its elements are converted. When the platform has no id column the array design is
     * returned without any elements.
     *
     * @param  platform GEO platform to convert
     * @return          the array design for the platform
     * @throws IllegalStateException if the platform has columns but no id column can be determined, or if no primary
     *                               taxon can be determined
     */
    private ArrayDesign convertPlatform(GeoPlatform platform) {

        // already converted: reuse
        if (seenPlatforms.containsKey(platform.getGeoAccession())) {
            return (seenPlatforms.get(platform.getGeoAccession()));
        }

        ArrayDesign arrayDesign = this.createMinimalArrayDesign(platform);

        GeoConverterImpl.log.info("Converting platform: " + platform.getGeoAccession());
        // register an (initially empty) name -> element map for this array design, keyed by its short name
        platformDesignElementMap.put(arrayDesign.getShortName(), new HashMap<String, CompositeSequence>());

        // convert the design element information.
        String identifier = platform.getIdColumnName();
        // a platform with no columns at all is tolerated (handled below); columns without an id column is an error
        if (identifier == null && !platform.getColumnNames().isEmpty()) {
            throw new IllegalStateException("Cannot determine the platform design element id column for " + platform
                    + "; " + platform.getColumnNames().size() + " column names available.");
        }

        Collection<String> externalReferences = this.determinePlatformExternalReferenceIdentifier(platform);
        String descriptionColumn = this.determinePlatformDescriptionColumn(platform);
        String sequenceColumn = this.determinePlatformSequenceColumn(platform);
        ExternalDatabase externalDb = this.determinePlatformExternalDatabase(platform);

        List<String> descriptions = platform.getColumnData(descriptionColumn);

        List<String> sequences = null;
        if (sequenceColumn != null) {
            sequences = platform.getColumnData(sequenceColumn);
        }
        // The primary taxon for the array: this should be a taxon that is listed as the platform taxon on geo
        // submission
        String probeOrganismColumn = this.determinePlatformProbeOrganismColumn(platform);
        Collection<Taxon> platformTaxa = this.convertPlatformOrganisms(platform, probeOrganismColumn);

        // represent taxa for the probes
        List<String> probeOrganism = null;
        if (probeOrganismColumn != null) {
            GeoConverterImpl.log.debug("Organism details found for probes on array " + platform.getGeoAccession());
            probeOrganism = platform.getColumnData(probeOrganismColumn);
        }

        // The primary taxon for the array: either taxon listed on geo submission, or parent taxon listed on geo
        // submission or predominant probe taxon
        // calculated using platformTaxa or probeOrganismColumn
        Taxon primaryTaxon = this.getPrimaryArrayTaxon(platformTaxa, probeOrganism);

        if (primaryTaxon == null) {
            throw new IllegalStateException("No taxon could be determined for platform: " + arrayDesign);
        }

        arrayDesign.setPrimaryTaxon(primaryTaxon);

        // We don't get reporters from GEO SOFT files.
        // arrayDesign.setReporters( new HashSet() );

        if (StringUtils.isNotBlank(platform.getManufacturer())) {
            Contact manufacturer = Contact.Factory.newInstance();
            manufacturer.setName(platform.getManufacturer());
            arrayDesign.setDesignProvider(manufacturer);
        }

        arrayDesign.getExternalReferences().add(this.convertDatabaseEntry(platform));

        // record the platform as seen before converting elements, so later calls get this same instance
        seenPlatforms.put(platform.getGeoAccession(), arrayDesign);

        if (identifier == null) {
            // we don't get any probe information; e.g., MPSS, SAGE, Exon arrays.
            GeoConverterImpl.log.warn("No identifiers, so platform elements will be skipped");
            return arrayDesign;
        }

        boolean fullyUsable = this.convertPlatformElements(identifier, platform, arrayDesign, externalReferences,
                probeOrganismColumn, externalDb, descriptions, sequences, probeOrganism, primaryTaxon);

        if (!fullyUsable) {
            GeoConverterImpl.log.warn("Some or all identifiers may have been skipped during parse");
        }

        return arrayDesign;
    }

    /**
     * Convert the rows of a GEO platform into CompositeSequences and attach them to the array design.
     *
     * <p>
     * Nothing is converted (and false is returned) when the id column has no data, or when the platform says not to
     * use data from GEO and forceConvertElements is not set. When the platform lists more than tooManyElements
     * identifiers, "strict selection" is switched on, which lets processId skip elements.
     *
     * @param  identifier          name of the platform column holding the element identifiers
     * @param  platform            GEO platform
     * @param  arrayDesign         array design
     * @param  externalReferences  names of columns holding external references; may be null
     * @param  probeOrganismColumn probe organism column
     * @param  externalDb          external db
     * @param  descriptions        descriptions; may be null
     * @param  sequences           sequences
     * @param  probeOrganism       probe organism
     * @param  primaryTaxon        primary taxon
     * @return                     true if we expect this platform to be fully usable; false if some or all elements
     *                             may have been omitted in our parse (so how freaked out later in processing should
     *                             we be if an element in the data doesn't match)
     * @throws IllegalStateException if more than tooManyElements elements remain after conversion
     */
    private boolean convertPlatformElements(String identifier, GeoPlatform platform, ArrayDesign arrayDesign,
            Collection<String> externalReferences, String probeOrganismColumn, ExternalDatabase externalDb,
            List<String> descriptions, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon) {

        /*
         * This is a very commonly found column name in files, it seems standard in GEO. If we don't find it, it's okay.
         */
        List<String> cloneIdentifiers = platform.getColumnData("CLONE_ID");
        List<String> identifiers = platform.getColumnData(identifier);

        if (identifiers == null) {
            // we don't get any probe information; e.g., MPSS, SAGE, Exon arrays.
            GeoConverterImpl.log.warn("No identifiers, so platform elements will be skipped");
            return false;
        }

        if (!platform.useDataFromGeo() && !forceConvertElements) {
            GeoConverterImpl.log
                    .warn("Will not convert elements for this platform - set forceConvertElements to override");
            return false;
        }

        // the columns are parallel lists: same length as the identifier column
        assert cloneIdentifiers == null || cloneIdentifiers.size() == identifiers.size();

        List<List<String>> externalRefs = null;
        if (externalReferences != null) {
            externalRefs = platform.getColumnData(externalReferences);
        }

        // only the first external reference column is checked for length
        assert externalRefs == null || externalRefs.iterator().next().size() == identifiers
                .size() : "Unequal numbers of identifiers and external references! "
                        + externalRefs.iterator().next().size() + " != " + identifiers.size();

        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Converting " + identifiers.size() + " probe identifiers on GEO platform "
                    + platform.getGeoAccession());
        }

        // descriptions are consumed through an iterator that is passed to processId for every identifier
        Iterator<String> descIter = null;

        if (descriptions != null) {
            descIter = descriptions.iterator();
        }

        // http://www.ncbi.nlm.nih.gov/RefSeq/key.html#accessions : "RefSeq accession numbers can be
        // distinguished from GenBank accessions by their prefix distinct format of [2 characters|underbar]"
        Pattern refSeqAccessionPattern = Pattern.compile("^[A-Z]{2}_");

        boolean strictSelection = false;

        if (identifiers.size() > tooManyElements) {
            // something odd like an exon array in GEO, there are lots of unused probes (data sets don't use them)
            GeoConverterImpl.log
                    .warn("Platform " + platform.getGeoAccession() + " has more elements than expected ("
                            + identifiers.size() + "), turning on strict selection method");
            strictSelection = true;
        }

        List<String> skipped = new ArrayList<>();
        Collection<CompositeSequence> compositeSequences = new ArrayList<>(5000);
        int i = 0; // to get sequences, if we have them, and clone identifiers.
        for (String id : identifiers) {
            // processId returns the row index to use for the next identifier
            i = this.processId(platform, arrayDesign, probeOrganismColumn, externalDb, sequences, probeOrganism,
                    primaryTaxon, cloneIdentifiers, externalRefs, descIter, refSeqAccessionPattern, strictSelection,
                    skipped, compositeSequences, i, id);
        }
        // NOTE(review): the set may hold fewer elements than the list if CompositeSequence equality collapses
        // duplicates, while the advertised number is taken from the list - confirm this is intended.
        arrayDesign.setCompositeSequences(new HashSet<>(compositeSequences));
        arrayDesign.setAdvertisedNumberOfDesignElements(compositeSequences.size());

        if (!skipped.isEmpty()) {
            GeoConverterImpl.log.info("Skipped " + skipped.size() + " elements due to strict selection; last was "
                    + skipped.get(skipped.size() - 1) + "; retained: " + compositeSequences.size());
        }

        if (arrayDesign.getCompositeSequences().size() > tooManyElements) {
            // this is just a safeguard; perhaps temporary.
            throw new IllegalStateException("Platform " + arrayDesign.getShortName()
                    + " has too many elements to be loaded. " + arrayDesign.getCompositeSequences().size());
        }

        GeoConverterImpl.log.info(arrayDesign.getCompositeSequences().size() + " elements on the platform");
        // with strict selection on, elements may have been dropped, so the platform is not "fully usable"
        return !strictSelection;
    }

    private int processId(GeoPlatform platform, ArrayDesign arrayDesign, String probeOrganismColumn,
            ExternalDatabase externalDb, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon,
            List<String> cloneIdentifiers, List<List<String>> externalRefs, Iterator<String> descIter,
            Pattern refSeqAccessionPattern, boolean strictSelection, List<String> skipped,
            Collection<CompositeSequence> compositeSequences, int i, String id) {
        String externalAccession = null;
        if (externalRefs != null) {
            externalAccession = this.getExternalAccession(externalRefs, i);
        }

        if (strictSelection && StringUtils.isBlank(externalAccession)) {

            // currently this is crafted to deal with affymetrix exon arrays from GEO, but could be expanded.
            // The problem is data sets in GEO use only a fraction of the probes listed for the gene-level version of platforms.
            // example: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL5175, has 300k probes, but data sets using it have only 17.7k
            // mrna_assignment is less strict than gene_assignement

            // salvage it if it has a gene assignment.
            String filteringColumn = "gene_assignment";
            if (platform.getColumnNames().contains(filteringColumn)) {
                String cd = platform.getColumnData(filteringColumn).get(i);
                if (StringUtils.isBlank(cd) || cd.equals("---")) {

                    skipped.add(id);
                    if (skipped.size() % 50000 == 0) {
                        GeoConverterImpl.log.info(
                                "Skipped " + skipped.size() + " elements due to strict selection; last was " + id);
                    }
                    i++;
                    return i;
                }
                // keep it.
            } else {
                // we just skip ones that don't have an external accession.
                return i;
            }
            // remaining case here: externalAccession is blank, but there is another column that we think saves it.
        }

        String cloneIdentifier = cloneIdentifiers == null ? null : cloneIdentifiers.get(i);

        String description = "";
        if (externalAccession != null) {
            String[] refs = externalAccession.split(",");
            if (refs.length > 1) {
                description = "Multiple external sequence references: " + externalAccession + "; ";
                externalAccession = refs[0];
            }
        }

        if (descIter != null)
            description = description + " " + descIter.next();

        CompositeSequence cs = CompositeSequence.Factory.newInstance();
        String probeName = platform.getProbeNamesInGemma().get(id);
        if (probeName == null) {
            probeName = id;
            if (GeoConverterImpl.log.isDebugEnabled())
                GeoConverterImpl.log.debug("Probe retaining original name: " + probeName);
            platform.getProbeNamesInGemma().put(id, id); // must make sure this is populated.
        } else {
            if (GeoConverterImpl.log.isDebugEnabled())
                GeoConverterImpl.log.debug("Found probe: " + probeName);
        }

        cs.setName(probeName);
        cs.setDescription(description);
        cs.setArrayDesign(arrayDesign);

        // LMD:1647- If There is a Organism Column given for the probe then set taxon from that overwriting platform
        // if probeOrganismColumn is set but for this probe no taxon do not set probeTaxon and thus create no
        // biosequence
        Taxon probeTaxon = Taxon.Factory.newInstance();
        if (probeOrganism != null && StringUtils.isNotBlank(probeOrganism.get(i))) {
            probeTaxon = this.convertProbeOrganism(probeOrganism.get(i));
        }
        // if there are no probe taxons then all the probes should take the taxon from the primary taxon
        if (probeOrganismColumn == null) {
            probeTaxon = primaryTaxon;
        }

        BioSequence bs = this.createMinimalBioSequence(probeTaxon);
        this.setBsProps(platform, externalDb, sequences, refSeqAccessionPattern, i, id, externalAccession,
                cloneIdentifier, bs);

        this.checkCs(arrayDesign, externalAccession, cloneIdentifier, cs, probeTaxon, bs);

        compositeSequences.add(cs);
        platformDesignElementMap.get(arrayDesign.getShortName()).put(probeName, cs);

        i++;
        return i;
    }

    /**
     * Decides whether the given biosequence can be attached to the probe as its biological characteristic, and
     * attaches it if so.
     * <p>
     * Nothing is attached when both the external accession and the clone identifier are blank, when no taxon was
     * identified for the probe, or when the taxon has no id.
     *
     * @param arrayDesign       platform the probe belongs to (only used for log messages)
     * @param externalAccession external sequence accession for the probe, may be blank
     * @param cloneIdentifier   clone identifier for the probe, may be blank
     * @param cs                probe that may receive the biosequence
     * @param probeTaxon        taxon determined for the probe, may be null
     * @param bs                candidate biosequence
     */
    private void checkCs(ArrayDesign arrayDesign, String externalAccession, String cloneIdentifier,
            CompositeSequence cs, Taxon probeTaxon, BioSequence bs) {
        boolean describable = StringUtils.isNotBlank(externalAccession) || StringUtils.isNotBlank(cloneIdentifier);

        if (!describable) {
            // If we have no basis for describing the sequence, we have to skip it.
            if (GeoConverterImpl.log.isDebugEnabled()) {
                GeoConverterImpl.log.debug("Blank external reference and clone id for " + cs + " on " + arrayDesign
                        + ", no biological characteristic can be added.");
            }
            return;
        }

        if (probeTaxon == null) {
            if (GeoConverterImpl.log.isDebugEnabled()) {
                GeoConverterImpl.log.debug("No valid taxon identified for " + cs + " on " + arrayDesign
                        + ", no biological characteristic can be added.");
            }
            return;
        }

        if (probeTaxon.getId() == null) {
            // A taxon without an id means none was given for the probe; do not create a biosequence, otherwise
            // it bombs as there is no taxon to persist.
            return;
        }

        cs.setBiologicalCharacteristic(bs);
    }

    /**
     * Fills in the name and, where available, the sequence or database entry of a probe's biosequence.
     * <p>
     * The name is the external accession when it is non-blank and the external database passes
     * {@code isGenbank}; otherwise the clone identifier if non-blank; otherwise the probe id. When the platform
     * supplies an actual sequence for the probe, that sequence is stored and the biosequence is renamed to the probe
     * id. Failing that, a database entry is attached for accessions that did not match the RefSeq pattern.
     *
     * @param platform               platform being converted (its GEO accession goes into the description)
     * @param externalDb             external database the accessions refer to, may be null
     * @param sequences              per-probe sequences from the platform, may be null
     * @param refSeqAccessionPattern pattern used to recognise RefSeq accessions
     * @param i                      index of the probe within the platform columns
     * @param id                     probe id
     * @param externalAccession      external accession for the probe, may be null or blank
     * @param cloneIdentifier        clone identifier for the probe, may be null or blank
     * @param bs                     biosequence to populate
     */
    private void setBsProps(GeoPlatform platform, ExternalDatabase externalDb, List<String> sequences,
            Pattern refSeqAccessionPattern, int i, String id, String externalAccession, String cloneIdentifier,
            BioSequence bs) {
        boolean refSeqAccession = false;
        String sequenceName = id;

        // ExternalDB will be null if it's IMAGE (this is really pretty messy, sorry)
        if (StringUtils.isNotBlank(externalAccession) && this.isGenbank(externalDb)) {
            refSeqAccession = refSeqAccessionPattern.matcher(externalAccession).matches();
            sequenceName = externalAccession;
        } else if (StringUtils.isNotBlank(cloneIdentifier)) {
            sequenceName = cloneIdentifier;
        }
        bs.setName(sequenceName);

        String providedSequence = sequences == null ? null : sequences.get(i);

        if (StringUtils.isNotBlank(providedSequence)) {
            // If we are given a sequence (as in, AGTC), we don't need the genbank identifier, which is probably
            // not correct anyway.
            String accessionNote;
            if (externalAccession != null) {
                accessionNote = "Used in leiu of " + externalAccession;
            } else {
                accessionNote = "No external accession provided";
            }

            bs.setSequence(providedSequence);
            bs.setIsApproximateLength(false);
            bs.setLength((long) bs.getSequence().length());
            bs.setType(SequenceType.DNA);
            bs.setName(id);
            bs.setDescription("Sequence from platform " + platform.getGeoAccession() + " provided by manufacturer. "
                    + accessionNote);
            return;
        }

        // We also don't store them if they are refseq ids, because refseq ids are generally not the actual
        // sequences put on arrays.
        boolean storeDatabaseEntry = externalAccession != null && !refSeqAccession && externalDb != null;
        if (storeDatabaseEntry) {
            bs.setSequenceDatabaseEntry(this.createDatabaseEntry(externalDb, externalAccession, bs));
        }
    }

    /**
     * Convert the organism names GEO records for a platform into taxa. Names already present in
     * {@code taxonScientificNameMap} reuse the cached taxon; other names are passed to
     * {@code convertOrganismToTaxon}. Null organism names are ignored.
     *
     * @param  platform                 GEO platform details
     * @param  probeTaxonColumnName     Column name of probe taxa, or null if the platform has no such column
     * @return                          taxa on the platform; empty if the platform lists no organisms
     * @throws IllegalArgumentException if more than one taxon is found on the platform but there is no column
     *                                  giving the taxon of each probe
     */
    private Collection<Taxon> convertPlatformOrganisms(GeoPlatform platform, String probeTaxonColumnName) {
        Collection<String> organisms = platform.getOrganisms();
        Collection<Taxon> platformTaxa = new HashSet<>();

        if (organisms.isEmpty()) {
            return platformTaxa;
        }

        // names are collected only so the error message below can say what was found
        List<String> taxaOnPlatform = new ArrayList<>();

        for (String taxonScientificName : organisms) {
            if (taxonScientificName == null)
                continue;
            taxaOnPlatform.add(taxonScientificName);
            // make sure add scientific name to map for platform
            if (taxonScientificNameMap.containsKey(taxonScientificName)) {
                platformTaxa.add(taxonScientificNameMap.get(taxonScientificName));
            } else {
                platformTaxa.add(this.convertOrganismToTaxon(taxonScientificName));
            }
        }

        // multiple organisms are found on the platform yet there is no column defined to represent taxon for the
        // probes.
        if (platformTaxa.size() > 1 && probeTaxonColumnName == null) {
            throw new IllegalArgumentException(platformTaxa.size() + " taxa found on platform " + platform + " ("
                    + StringUtils.join(taxaOnPlatform, ", ")
                    + ") but there is no column identifying the taxon of each probe");
        }
        return platformTaxa;

    }

    /**
     * Retrieve taxon details for a probe given the organism name found in the platform's probe organism column.
     * The name is first looked up in the scientific-name and common-name caches; if it is in neither and a taxon
     * service is available, the database is queried by lower-cased common name.
     * <p>
     * A taxon found in the database is cached both under its stored common name and under the name as it was
     * written in the platform, so later probes using the same spelling do not trigger another database query.
     *
     * @param  probeOrganism scientific name or common name of organism associated to a biosequence.
     * @return               Taxon of biosequence; null if the name is blank or "n/a", or if it is not recognized
     *                       (a warning is logged in the latter case; examples would be "taxa" like "ILMN
     *                       Controls", see bug 3207). If no taxon service is available, a new taxon carrying only
     *                       the given common name is returned.
     */
    private Taxon convertProbeOrganism(String probeOrganism) {
        assert probeOrganism != null;

        /*
         * Detect blank taxon. We support 'n/a' here .... a little kludgy but shows up in some files.
         */
        if (StringUtils.isBlank(probeOrganism) || probeOrganism.equals("n/a")) {
            return null;
        }

        // Check if we have processed this organism before as defined by scientific or common definition.
        if (taxonScientificNameMap.containsKey(probeOrganism)) {
            return taxonScientificNameMap.get(probeOrganism);
        }
        if (taxonCommonNameMap.containsKey(probeOrganism)) {
            return taxonCommonNameMap.get(probeOrganism);
        }

        if (taxonService == null) {
            // no way to check the database; hand back a taxon that only carries the name we were given.
            Taxon unchecked = Taxon.Factory.newInstance();
            unchecked.setCommonName(probeOrganism);
            return unchecked;
        }

        // taxon not processed before check database.
        Taxon found = taxonService.findByCommonName(probeOrganism.toLowerCase());
        if (found == null) {
            // if probe organism can not be found i.e it is not a known common or scientific name
            // and it was not already created during platform organism processing then warn user.
            // (we used to throw an exception)
            GeoConverterImpl.log.warn("'" + probeOrganism + "' is not recognized as a taxon in Gemma");
            return null;
        }

        taxonCommonNameMap.put(found.getCommonName(), found);
        // the cache is consulted with the name as written in the platform, which may differ (e.g. in case) from
        // the stored common name; cache that spelling too.
        taxonCommonNameMap.put(probeOrganism, found);
        return found;

    }

    /**
     * Sets the primary publication of the experiment from the series' PubMed ids. Only the first id is used
     * (usually there is just one); nothing happens if the series has none.
     *
     * @param series series providing the PubMed ids
     * @param expExp experiment that receives the primary publication
     */
    private void convertPubMedIds(GeoSeries series, ExpressionExperiment expExp) {
        Collection<String> pubmedIds = series.getPubmedIds();
        if (pubmedIds == null || pubmedIds.isEmpty()) {
            return;
        }

        // usually just one...
        String firstId = pubmedIds.iterator().next();

        ExternalDatabase pubmed = ExternalDatabase.Factory.newInstance();
        pubmed.setName("PubMed");

        DatabaseEntry accession = DatabaseEntry.Factory.newInstance();
        accession.setAccession(firstId);
        accession.setExternalDatabase(pubmed);

        BibliographicReference reference = BibliographicReference.Factory.newInstance();
        reference.setPubAccession(accession);
        expExp.setPrimaryPublication(reference);
    }

    /**
     * Build the "replicate" characteristic describing a GEO replication type. Both kinds of technical replicate
     * (extract and labeled extract) map to the same "technical replicate" term.
     * <p>
     * Used by {@code convertReplicationToFactor} and {@code convertReplicationToFactorValue}.
     *
     * @param  repType               replication type from GEO
     * @return                       characteristic with category, value, their URIs, evidence code and original
     *                               value set
     * @throws IllegalStateException if the replication type is not one of the handled types
     */
    private Characteristic convertReplicatationType(ReplicationType repType) {
        Characteristic result = Characteristic.Factory.newInstance();
        result.setCategory("replicate");
        result.setCategoryUri("http://www.ebi.ac.uk/efo/EFO_0000683" /* replicate */ );
        result.setEvidenceCode(GOEvidenceCode.IIA);

        if (repType.equals(ReplicationType.biologicalReplicate)) {
            result.setValue("biological replicate");
            result.setValueUri("http://www.ebi.ac.uk/efo/EFO_0002091" /* biological replicate */ );
        } else if (repType.equals(ReplicationType.technicalReplicateExtract)
                || repType.equals(ReplicationType.technicalReplicateLabeledExtract)) {
            result.setValue("technical replicate");
            result.setValueUri("http://www.ebi.ac.uk/efo/EFO_0002090" /* technical replicate */ );
        } else {
            throw new IllegalStateException("Unhandled replication type: " + repType);
        }

        result.setOriginalValue(result.getValue());

        return result;

    }

    /**
     * Convert a GEO replication record into a categorical ExperimentalFactor. The factor is named after the
     * replication type, takes the replication's description, and uses the replicate characteristic as category.
     *
     * @param  replication GEO replication record
     * @return             exp factor
     */
    private ExperimentalFactor convertReplicationToFactor(GeoReplication replication) {
        ReplicationType replicationType = replication.getType();
        GeoConverterImpl.log.debug("Converting replication " + replicationType);

        ExperimentalFactor factor = ExperimentalFactor.Factory.newInstance();
        factor.setName(replicationType.toString());
        factor.setDescription(replication.getDescription());
        factor.setType(FactorType.CATEGORICAL);
        factor.setCategory(this.convertReplicatationType(replicationType));

        return factor;
    }

    /**
     * Convert a GEO replication record into a FactorValue whose value and single characteristic come from the
     * replicate characteristic for the replication's type.
     *
     * @param  replication GEO replication record
     * @return             factor value
     */
    private FactorValue convertReplicationToFactorValue(GeoReplication replication) {
        FactorValue fv = FactorValue.Factory.newInstance();

        Characteristic replicateTerm = this.convertReplicatationType(replication.getType());
        fv.setValue(replicateTerm.getValue());
        fv.getCharacteristics().add(replicateTerm);

        return fv;
    }

    /**
     * Convert a GEO replication record into a FactorValue and add it to the given factor's values.
     *
     * @param replication GEO replication record
     * @param factor      factor that receives the new value
     */
    private void convertReplicationToFactorValue(GeoReplication replication, ExperimentalFactor factor) {
        factor.getFactorValues().add(this.convertReplicationToFactorValue(replication));
    }

    /**
     * Convert a GEO sample into a BioAssay. A Sample corresponds to a BioAssay; the channels correspond to
     * BioMaterials. If the sample has a blank title, its title is set to its GEO accession before conversion.
     *
     * @param  sample             sample
     * @param  bioMaterial        biomaterial that receives the replicate, variable and channel information
     * @param  experimentalDesign experimental design whose factors are matched against the sample's replicates and
     *                            variables
     * @return                    BA, or null if the sample is null or has no GEO accession
     */
    private BioAssay convertSample(GeoSample sample, BioMaterial bioMaterial,
            ExperimentalDesign experimentalDesign) {
        if (sample == null) {
            GeoConverterImpl.log.warn("Null sample");
            return null;
        }

        String sampleAccession = sample.getGeoAccession();
        if (StringUtils.isEmpty(sampleAccession)) {
            GeoConverterImpl.log.error("No GEO accession for sample");
            return null;
        }

        GeoConverterImpl.log.debug("Converting sample: " + sampleAccession);

        BioAssay assay = BioAssay.Factory.newInstance();

        if (StringUtils.isBlank(sample.getTitle())) {
            // formerly an IllegalArgumentException; now we fall back on the accession.
            GeoConverterImpl.log.warn("Blank title for sample " + sample + ", using accession number instead.");
            sample.setTitle(sampleAccession);
        }

        assay.setName(sample.getTitle());
        assay.setDescription(sample.getDescription());
        assay.setAccession(this.convertDatabaseEntry(sample));
        assay.setIsOutlier(false);
        assay.setSequencePairedReads(false);

        /*
         * NOTE - according to GEO (http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html) "variable information is
         * optional and does not appear in Series records or downloads, but will be used to assemble corresponding GEO
         * DataSet records" If we would get that information we would pass it into this method as
         * expExp.getExperimentalDesign().getExperimentalFactors().
         */

        // The factor values used for the sample should correspond to the design's experimental factors.
        Collection<ExperimentalFactor> designFactors = experimentalDesign.getExperimentalFactors();

        for (GeoReplication sampleReplication : sample.getReplicates()) {
            this.matchSampleReplicationToExperimentalFactorValue(bioMaterial, designFactors, sampleReplication);
        }

        for (GeoVariable sampleVariable : sample.getVariables()) {
            this.matchSampleVariableToExperimentalFactorValue(bioMaterial, designFactors, sampleVariable);
        }

        /*
         * In reality GEO does not have information about the samples run on each channel. We're just making it up.
         * So we need to just add the channel information to the biomaterials we have already. Note taxon is now
         * taken from sample FIXME this is no longer accurate; GEO has species information for each channel.
         *
         * Actually this has changed. GEO does store channel information. However, we don't use it (see bug 2902).
         */
        for (GeoChannel channel : sample.getChannels()) {
            BioMaterial alreadyUsed = assay.getSampleUsed();
            if (alreadyUsed != null) {
                bioMaterial = alreadyUsed;
                GeoConverterImpl.log.info("Multi-sample information stored in biomaterial " + bioMaterial);
            }
            this.convertChannel(sample, channel, bioMaterial);
            assay.setSampleUsed(bioMaterial);
        }

        for (GeoPlatform platform : sample.getPlatforms()) {
            String platformAccession = platform.getGeoAccession();
            // convert the platform only if it has not been seen yet
            ArrayDesign design = seenPlatforms.containsKey(platformAccession)
                    ? seenPlatforms.get(platformAccession)
                    : this.convertPlatform(platform);
            assay.setArrayDesignUsed(design);
        }

        return assay;
    }

    /**
     * Convert a GEO series into one or more ExpressionExperiments. The series is split when its datasets cover more
     * than one organism, else when its samples cover more than one organism, else when it has datasets on more than
     * one platform and 'split by platform' is set. Otherwise it is converted as a single experiment, which is left
     * out of the result if the conversion yields null.
     * <p>
     * Each part of a split is converted with a 1-based part number; the species-specific conversion uses it to form
     * accessions such as GSE2393.1, GSE2393.2.
     *
     * @param  series series
     * @return        ees
     */
    private Collection<ExpressionExperiment> convertSeries(GeoSeries series) {

        Collection<ExpressionExperiment> experiments = new HashSet<>();

        // figure out if there are multiple species or platforms involved here.
        Map<String, Collection<GeoData>> datasetsByOrganism = this.getOrganismDatasetMap(series);
        Map<GeoPlatform, Collection<GeoData>> datasetsByPlatform = this.getPlatformDatasetMap(series);
        Map<String, Collection<GeoSample>> samplesByOrganism = this.getOrganismSampleMap(series);

        if (datasetsByOrganism.size() > 1) {
            GeoConverterImpl.log
                    .warn("**** Multiple-species series, with multiple datasets. This series will be split into "
                            + datasetsByOrganism.size() + " experiments. ****");
            int part = 0;
            for (String organism : datasetsByOrganism.keySet()) {
                this.convertSpeciesSpecific(series, experiments, datasetsByOrganism, ++part, organism);
            }
            return experiments;
        }

        if (samplesByOrganism.size() > 1) {
            GeoConverterImpl.log.warn("**** Multiple-species series. This series will be split into "
                    + samplesByOrganism.size() + " experiments. ****");
            int part = 0;
            for (String organism : samplesByOrganism.keySet()) {
                this.convertSpeciesSpecificSamples(series, experiments, samplesByOrganism, ++part, organism);
            }
            return experiments;
        }

        if (datasetsByPlatform.size() > 1 && this.splitByPlatform) {
            int part = 0;
            for (GeoPlatform platform : datasetsByPlatform.keySet()) {
                this.convertByPlatform(series, experiments, datasetsByPlatform, ++part, platform);
            }
            return experiments;
        }

        ExpressionExperiment single = this.convertSeriesSingle(series);
        if (single != null) {
            experiments.add(single);
        }
        return experiments;
    }

    /**
     * Convert the data vectors held by the series itself, one platform at a time. Use this when we don't have a GDS
     * for a GSE. After each platform is converted, the series' values for that platform are cleared.
     *
     * @param geoSeries geo series
     * @param expExp    ee
     */
    private void convertSeriesDataVectors(GeoSeries geoSeries, ExpressionExperiment expExp) {
        // Tricky thing is that series contains data from multiple platforms.
        Map<GeoPlatform, List<GeoSample>> samplesByPlatform = DatasetCombiner.getPlatformSampleMap(geoSeries);

        for (Map.Entry<GeoPlatform, List<GeoSample>> entry : samplesByPlatform.entrySet()) {
            GeoPlatform currentPlatform = entry.getKey();
            List<GeoSample> samplesOnPlatform = entry.getValue();

            GeoConverterImpl.log.debug(samplesOnPlatform.size() + " samples on " + currentPlatform);
            this.convertVectorsForPlatform(geoSeries.getValues(), expExp, samplesOnPlatform, currentPlatform);
            geoSeries.getValues().clear(currentPlatform);
        }

    }

    /**
     * Main method that converts a single (mono-species) GEO series to an ExpressionExperiment.
     * <p>
     * Steps, in order: determine datasets/samples to skip; build the experiment's description, name, contacts,
     * publication and accession; build the experimental design from the series' variables and replicates; create one
     * BioMaterial per set of corresponding samples and one BioAssay per usable sample; remove the skipped samples
     * from the series; and finally convert the data, from the series' own values if it has no datasets and otherwise
     * from each dataset that is not skipped.
     *
     * @param  series                   series to convert; may be null
     * @return                          ExpressionExperiment, or null if the series is null, all of its datasets are
     *                                  to be skipped, it is not usable (wrong sample type, etc.), or all of its
     *                                  samples are to be skipped
     * @throws IllegalArgumentException if the series has an empty sample correspondence
     */
    private ExpressionExperiment convertSeriesSingle(GeoSeries series) {
        if (series == null)
            return null;
        GeoConverterImpl.log.info("Converting series: " + series.getGeoAccession());

        Collection<GeoDataset> dataSets = series.getDatasets();
        Collection<String> dataSetsToSkip = new HashSet<>();
        Collection<GeoSample> samplesToSkip = new HashSet<>();
        // fills dataSetsToSkip and samplesToSkip
        this.checkForDataToSkip(series, dataSetsToSkip, samplesToSkip);
        // there are datasets, but every one of them is to be skipped: nothing to convert.
        if (dataSets.size() > 0 && dataSetsToSkip.size() == dataSets.size()) {
            return null;
        }

        if (!this.isUsable(series)) {
            GeoConverterImpl.log
                    .warn("Series was not usable: types=" + StringUtils.join(series.getSeriesTypes(), " "));
            return null;
        }

        // GEO does not have the concept of a biomaterial.
        Collection<GeoSample> allSeriesSamples = series.getSamples();
        GeoConverterImpl.log.info("Series has " + allSeriesSamples.size() + " samples in total");

        // NOTE(review): this is also true for a series with no samples at all (0 == 0).
        if (samplesToSkip.size() == allSeriesSamples.size()) {
            GeoConverterImpl.log.info("Series has no usable samples, conversion will be aborted");
            return null;
        }

        if (samplesToSkip.size() > 0) {
            GeoConverterImpl.log.info(samplesToSkip.size() + " samples will be skipped");
        }

        ExpressionExperiment expExp = ExpressionExperiment.Factory.newInstance();
        // NOTE(review): redundant; the description is overwritten by the next statement.
        expExp.setDescription("");

        // description is the series summaries, newline-terminated, plus the provider's last update date if known.
        expExp.setDescription(series.getSummaries() + (series.getSummaries().endsWith("\n") ? "" : "\n"));
        if (series.getLastUpdateDate() != null) {
            expExp.setDescription(expExp.getDescription() + "At time of import, last updated (by provider) on: "
                    + series.getLastUpdateDate() + "\n");
        }

        expExp.setName(series.getTitle());
        expExp.setShortName(series.getGeoAccession());

        this.convertContacts(series, expExp);

        this.convertPubMedIds(series, expExp);

        expExp.setAccession(this.convertDatabaseEntry(series));

        ExperimentalDesign design = ExperimentalDesign.Factory.newInstance();
        design.setDescription("");
        design.setName("");
        // one experimental factor (with its factor value) per series variable; each variable's description is
        // prepended to the design name.
        Collection<GeoVariable> variables = series.getVariables().values();
        for (GeoVariable variable : variables) {
            GeoConverterImpl.log.debug("Adding variable " + variable);
            ExperimentalFactor ef = this.convertVariableToFactor(variable);
            this.convertVariableToFactorValue(variable, ef);
            design.getExperimentalFactors().add(ef);
            design.setName(variable.getDescription() + " " + design.getName());
        }

        // NOTE(review): the Characteristic built for each keyword is never attached to the experiment or the design
        // within this method, so the keywords currently have no effect on the result. The first setDescription call
        // is also overwritten by the second.
        if (series.getKeyWords().size() > 0) {
            for (String keyWord : series.getKeyWords()) {
                // design.setDescription( design.getDescription() + " Keyword: " + keyWord );
                Characteristic o = Characteristic.Factory.newInstance();
                o.setDescription("GEO Keyword");
                o.setValue(keyWord);
                o.setOriginalValue(keyWord); // preserve
                o.setEvidenceCode(GOEvidenceCode.IIA);
                o.setDescription("Keyword from GEO series definition file.");
            }
        }

        if (series.getOverallDesign() != null) {
            design.setDescription(design.getDescription() + " Overall design: " + series.getOverallDesign());
        }

        // one experimental factor (with its factor value) per series replication record.
        Collection<GeoReplication> replication = series.getReplicates().values();
        for (GeoReplication replicate : replication) {
            GeoConverterImpl.log.debug("Adding replication " + replicate);
            ExperimentalFactor ef = this.convertReplicationToFactor(replicate);
            this.convertReplicationToFactorValue(replicate, ef);
            design.getExperimentalFactors().add(ef);
        }

        expExp.setExperimentalDesign(design);

        expExp.setBioAssays(new HashSet<BioAssay>());

        if (series.getSampleCorrespondence().size() == 0) {
            throw new IllegalArgumentException("No sample correspondence!");
        }

        // spits out a big summary of the correspondence.
        if (GeoConverterImpl.log.isDebugEnabled())
            GeoConverterImpl.log.debug(series.getSampleCorrespondence());
        int numBioMaterials = 0;
        int numSkippedBioMaterials = 0;
        /*
         * For each _set_ of "corresponding" samples (from the same RNA, or so we think) we make up a new BioMaterial.
         */

        // accessions already converted, used to report a sample that appears in more than one correspondence set.
        Collection<String> seen = new HashSet<>();
        for (Iterator<Set<String>> iter = series.getSampleCorrespondence().iterator(); iter.hasNext();) {

            Set<String> correspondingSamples = iter.next();
            if (correspondingSamples.isEmpty())
                continue; // can happen after removing samples (multitaxon)

            BioMaterial bioMaterial = BioMaterial.Factory.newInstance();
            String bioMaterialName = this.getBiomaterialPrefix(series, ++numBioMaterials);
            StringBuilder bioMaterialDescription = new StringBuilder(
                    GeoConverterImpl.BIOMATERIAL_DESCRIPTION_PREFIX + series.getGeoAccession());

            // From the series samples, find the sample that corresponds and convert it.
            for (String cSample : correspondingSamples) {
                boolean found = false;
                // linear search of the series samples for the accession named in the correspondence set.
                for (GeoSample sample : allSeriesSamples) {
                    if (sample == null || sample.getGeoAccession() == null) {
                        GeoConverterImpl.log.warn("Null sample or no accession for " + sample);
                        continue;
                    }

                    if (samplesToSkip.contains(sample)) {
                        continue;
                    }

                    String accession = sample.getGeoAccession();

                    if (accession.equals(cSample)) {

                        // a repeated accession is reported but still converted again.
                        if (seen.contains(accession)) {
                            GeoConverterImpl.log
                                    .error("Got " + accession + " twice, this time in set " + correspondingSamples);
                        }
                        seen.add(accession);

                        BioAssay ba = this.convertSample(sample, bioMaterial, expExp.getExperimentalDesign());

                        assert (ba != null);
                        ba.setDescription(ba.getDescription() + "\nSource GEO sample is " + sample.getGeoAccession()
                                + "\nLast updated (according to GEO): " + sample.getLastUpdateDate());

                        assert ba.getSampleUsed() != null;
                        bioMaterial.getBioAssaysUsedIn().add(ba);
                        bioMaterialDescription.append(",").append(sample);
                        expExp.getBioAssays().add(ba);
                        expExp.setTaxon(bioMaterial.getSourceTaxon()); // denormalization
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    if (GeoConverterImpl.log.isDebugEnabled())
                        GeoConverterImpl.log.debug("No sample found in " + series + " to match " + cSample
                                + "; this can happen if some samples were not run on all platforms.");

                }
            }
            bioMaterial.setName(bioMaterialName);
            bioMaterial.setDescription(bioMaterialDescription.toString());

            // a biomaterial none of whose samples were converted only counts towards the skipped total.
            if (bioMaterial.getBioAssaysUsedIn().isEmpty()) {
                GeoConverterImpl.log.info("Unused/skipped bioMaterial: " + bioMaterial);
                numSkippedBioMaterials++;
            }
        }

        GeoConverterImpl.log.info("Expression Experiment from " + series + " has " + expExp.getBioAssays().size()
                + " bioassays and " + (numBioMaterials - numSkippedBioMaterials) + " biomaterials.");

        int expectedNumSamples = series.getSamples().size() - samplesToSkip.size();
        int actualNumSamples = expExp.getBioAssays().size();
        expExp.setNumberOfSamples(actualNumSamples);

        if (expectedNumSamples > actualNumSamples) {
            GeoConverterImpl.log.warn((expectedNumSamples - actualNumSamples)
                    + " samples were not in the 'sample correspondence'"
                    + " and have been omitted. Possibly they were in the Series (GSE) but not in the corresponding Dataset (GDS)?");
        }

        // this is mostly only needed for converting data vectors, which will be confused by the extra ones
        series.removeSamples(samplesToSkip);

        // Dataset has additional information about the samples.

        if (dataSets.size() == 0) {
            // we miss extra description and the subset information.
            if (series.getValues().hasData()) {
                this.convertSeriesDataVectors(series, expExp);
            }
        } else {
            for (GeoDataset dataset : dataSets) {
                if (dataSetsToSkip.contains(dataset.getGeoAccession()))
                    continue;
                this.convertDataset(dataset, expExp);
            }
        }

        return expExp;
    }

    /**
     * Convert the part of a multi-species series that belongs to one organism. A new series is assembled from the
     * samples whose (first) platform lists the organism first and from the organism's datasets, most of the
     * remaining information is copied over from the original series, and the result is converted as a single
     * experiment with accession {@code <original accession>.<i>}.
     * <p>
     * If the species-specific series cannot be converted (the conversion yields null), nothing is added to
     * {@code converted}.
     *
     * @param series             the original multi-species series
     * @param converted          collection that receives the converted experiment
     * @param organismDatasetMap datasets of the series keyed by organism
     * @param i                  part number used as suffix of the GEO accession
     * @param organism           organism to extract; must be a key of organismDatasetMap
     */
    private void convertSpeciesSpecific(GeoSeries series, Collection<ExpressionExperiment> converted,
            Map<String, Collection<GeoData>> organismDatasetMap, int i, String organism) {
        GeoSeries speciesSpecific = new GeoSeries();

        Collection<GeoData> datasets = organismDatasetMap.get(organism);
        assert datasets.size() > 0;

        // keep only the samples that are from this organism.
        for (GeoSample sample : series.getSamples()) {
            // ugly, we have to assume there is only one platform and one organism...
            if (sample.getPlatforms().iterator().next().getOrganisms().iterator().next().equals(organism)) {
                speciesSpecific.addSample(sample);
            }
        }

        // hand this organism's datasets over from the original series to the species-specific one.
        for (GeoData dataset : datasets) {
            if (dataset instanceof GeoDataset) {
                ((GeoDataset) dataset).dissociateFromSeries(series);
                speciesSpecific.addDataSet((GeoDataset) dataset);
            }
        }

        /*
         * Basically copy over most of the information
         */
        speciesSpecific.setContact(series.getContact());
        speciesSpecific.setContributers(series.getContributers());
        speciesSpecific.setGeoAccession(series.getGeoAccession() + "." + i);
        speciesSpecific.setKeyWords(series.getKeyWords());
        speciesSpecific.setOverallDesign(series.getOverallDesign());
        speciesSpecific.setPubmedIds(series.getPubmedIds());
        speciesSpecific.setReplicates(series.getReplicates());
        speciesSpecific.setSampleCorrespondence(series.getSampleCorrespondence());
        speciesSpecific.setSummaries(series.getSummaries());
        speciesSpecific.setTitle(series.getTitle() + " - " + organism);
        speciesSpecific.setWebLinks(series.getWebLinks());
        speciesSpecific.setValues(series.getValues());
        speciesSpecific.getSeriesTypes().addAll(series.getSeriesTypes()); // even though this might apply to samples left behind in other part.

        // convertSeriesSingle returns null when the series cannot be converted; never put null in the results.
        ExpressionExperiment ee = this.convertSeriesSingle(speciesSpecific);
        if (ee == null) {
            GeoConverterImpl.log.warn("No experiment could be converted for the " + organism + " part of "
                    + series.getGeoAccession());
            return;
        }
        converted.add(ee);
    }

    /**
     * Builds a new series holding only the samples of one organism (organism taken from the sample map), copies the
     * descriptive information over from the original series, converts it and adds the result to {@code converted}.
     *
     * @param series            the original (multi-organism) series
     * @param converted         collection receiving the converted experiment
     * @param organismSampleMap organism name to the samples for that organism
     * @param i                 suffix appended to the GEO accession of the new series
     * @param organism          the organism to extract
     */
    private void convertSpeciesSpecificSamples(GeoSeries series, Collection<ExpressionExperiment> converted,
            Map<String, Collection<GeoSample>> organismSampleMap, int i, String organism) {

        GeoSeries organismSeries = new GeoSeries();

        for (GeoSample own : organismSampleMap.get(organism)) {
            organismSeries.addSample(own);
        }

        // Work on a copy of the correspondence and drop every sample that belongs to another organism.
        GeoSampleCorrespondence trimmedCorrespondence = series.getSampleCorrespondence().copy();
        for (Map.Entry<String, Collection<GeoSample>> entry : organismSampleMap.entrySet()) {
            if (entry.getKey().equals(organism)) {
                continue;
            }
            for (GeoSample foreign : entry.getValue()) {
                trimmedCorrespondence.removeSample(foreign.getGeoAccession());
            }
        }

        // Basically copy over most of the information; accession and title are made organism-specific.
        organismSeries.setContact(series.getContact());
        organismSeries.setContributers(series.getContributers());
        organismSeries.setGeoAccession(series.getGeoAccession() + "." + i);
        organismSeries.setKeyWords(series.getKeyWords());
        organismSeries.setOverallDesign(series.getOverallDesign());
        organismSeries.setPubmedIds(series.getPubmedIds());
        organismSeries.setReplicates(series.getReplicates());
        organismSeries.setSampleCorrespondence(trimmedCorrespondence);
        organismSeries.setSummaries(series.getSummaries());
        organismSeries.setTitle(series.getTitle() + " - " + organism);
        organismSeries.setWebLinks(series.getWebLinks());
        // Values are requested for the samples retained in the new series.
        organismSeries.setValues(series.getValues(organismSeries.getSamples()));
        organismSeries.getSeriesTypes().addAll(series.getSeriesTypes());

        ExpressionExperiment organismExperiment = this.convertSeriesSingle(organismSeries);
        converted.add(organismExperiment);

    }

    /**
     * Converts every subset of the dataset into an experimental factor on the given experiment.
     *
     * @param result     experiment passed on to the per-subset conversion
     * @param geoDataset dataset whose subsets are converted
     */
    private void convertSubsetAssociations(ExpressionExperiment result, GeoDataset geoDataset) {
        for (GeoSubset geoSubset : geoDataset.getSubsets()) {
            boolean debugging = GeoConverterImpl.log.isDebugEnabled();
            if (debugging) {
                GeoConverterImpl.log.debug("Converting subset to experimentalFactor" + geoSubset.getType());
            }
            this.convertSubsetToExperimentalFactor(result, geoSubset);
        }
    }

    /**
     * Builds a factor value from the subset's type and description. If the experimental factor already holds an equal
     * factor value, that existing one is returned; otherwise the new one is added to the factor and returned.
     *
     * @param  geoSubSet          geo subset
     * @param  experimentalFactor ef
     * @return                    FV
     */
    private FactorValue convertSubsetDescriptionToFactorValue(GeoSubset geoSubSet,
            ExperimentalFactor experimentalFactor) {
        // By definition each subset defines a new factor value.
        FactorValue candidate = FactorValue.Factory.newInstance();

        Characteristic categoryTerm = Characteristic.Factory.newInstance();
        this.convertVariableType(categoryTerm, geoSubSet.getType());
        boolean hasCategory = categoryTerm.getCategory() != null;
        if (hasCategory) {
            // Only attach a characteristic when the subset type maps to a category.
            categoryTerm.setValue(geoSubSet.getDescription());
            categoryTerm.setDescription("Converted from GEO subset " + geoSubSet.getGeoAccession());
            candidate.getCharacteristics().add(categoryTerm);
        }

        candidate.setExperimentalFactor(experimentalFactor);
        candidate.setValue(geoSubSet.getDescription());

        // Reuse an equal factor value if the factor already has one.
        for (FactorValue existing : experimentalFactor.getFactorValues()) {
            if (!existing.equals(candidate)) {
                continue;
            }
            GeoConverterImpl.log
                    .debug(candidate + " is matched by existing factorValue for " + experimentalFactor);
            return existing;
        }

        experimentalFactor.getFactorValues().add(candidate);
        return candidate;
    }

    /**
     * Creates a factor value for the given GEO variable type and free-text value.
     * <p>
     * When the variable type maps to a category (see {@code convertVariableType}), a characteristic carrying that
     * category and the value is attached to the factor value. When there is no category, the factor value only
     * carries the plain text value and no characteristic.
     *
     * @param  type  GEO variable type
     * @param  value free-text value
     * @return       the new factor value
     */
    private FactorValue convertTypeToFactorValue(VariableType type, String value) {
        FactorValue factorValue = FactorValue.Factory.newInstance();
        Characteristic term = Characteristic.Factory.newInstance();
        this.convertVariableType(term, type);
        // The check used to be inverted (!= null), which dropped the characteristic exactly when a category was
        // available and attached a category-less characteristic otherwise.
        if (term.getCategory() == null) {
            factorValue.setValue(value);
            return factorValue;
        }
        term.setValue(value);
        term.setOriginalValue(value);
        factorValue.setValue(term.getValue());
        factorValue.getCharacteristics().add(term);
        return factorValue;
    }

    /**
     * Converts a GEO variable into a categorical ExperimentalFactor named after the variable type. The factor's
     * category is only set when the variable type maps to a category.
     *
     * @param  variable the GEO variable to convert
     * @return          ef
     */
    private ExperimentalFactor convertVariableToFactor(GeoVariable variable) {
        GeoConverterImpl.log.debug("Converting variable " + variable.getType());

        ExperimentalFactor factor = ExperimentalFactor.Factory.newInstance();
        factor.setName(variable.getType().toString());
        factor.setType(FactorType.CATEGORICAL);
        factor.setDescription(variable.getDescription());

        Characteristic categoryTerm = Characteristic.Factory.newInstance();
        this.convertVariableType(categoryTerm, variable.getType());
        if (categoryTerm.getCategory() != null) {
            factor.setCategory(categoryTerm);
        }

        return factor;
    }

    /**
     * Converts a GEO variable into a factor value, using the variable's type and description.
     *
     * @param  variable variable
     * @return          Category will be filled in with a URI but value will just be plain text.
     */
    private FactorValue convertVariableToFactorValue(GeoVariable variable) {
        GeoConverterImpl.log.info("Converting variable " + variable);
        return this.convertTypeToFactorValue(variable.getType(), variable.getDescription());
    }

    /**
     * Converts the variable into a factor value and adds it to the factor's factor values.
     *
     * @param variable variable to convert
     * @param factor   factor receiving the new factor value
     */
    private void convertVariableToFactorValue(GeoVariable variable, ExperimentalFactor factor) {
        final FactorValue converted = this.convertVariableToFactorValue(variable);
        factor.getFactorValues().add(converted);
    }

    /**
     * Fill in the category and category URI of a characteristic from a GEO variable type. Each known variable type is
     * mapped to a fixed term/URI pair. For "other" and "species" no mapping is made, so both category and category URI
     * end up null. In every case that does not throw, the evidence code is set to IIA.
     *
     * @param  c                     to be modified; its category is first reset to null
     * @param  varType               var type
     * @throws IllegalStateException if it's a variable type we don't know how to handle.
     */
    @SuppressWarnings("StatementWithEmptyBody") // Better readability
    private void convertVariableType(Characteristic c, VariableType varType) {
        c.setCategory(null);
        // term/uri stay null for variable types that deliberately have no category.
        String term = null;
        String uri = null;
        if (varType.equals(VariableType.age)) {
            term = "age";
            uri = "http://www.ebi.ac.uk/efo/EFO_0000246";
        } else if (varType.equals(VariableType.agent)) {
            uri = "http://purl.obolibrary.org/obo/CHEBI_23367";
            term = "molecular entity";
        } else if (varType.equals(VariableType.cellLine)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000322";
            term = "cell line";
        } else if (varType.equals(VariableType.cellType)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000324";
            term = "cell type";
        } else if (varType.equals(VariableType.developmentStage)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000399";
            term = "developmental stage";
        } else if (varType.equals(VariableType.diseaseState)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000408";
            term = "disease";
        } else if (varType.equals(VariableType.dose)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000428";
            term = "dose";
        } else if (varType.equals(VariableType.gender)) {
            // see bug 4317
            uri = "http://purl.obolibrary.org/obo/PATO_0000047";
            term = "biological sex";
        } else if (varType.equals(VariableType.genotypeOrVariation)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000513";
            term = "genotype";
        } else if (varType.equals(VariableType.growthProtocol)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000523";
            term = "growth condition";
        } else if (varType.equals(VariableType.individual)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000542";
            term = "individual";
        } else if (varType.equals(VariableType.infection)) {
            // Same term/URI as 'metabolism' below.
            uri = "http://www.ebi.ac.uk/efo/EFO_0000651";
            term = "phenotype";
        } else if (varType.equals(VariableType.isolate)) {
            // NOTE(review): 'isolate' gets the same term/URI as 'age' above -- confirm this mapping is intended.
            uri = "http://www.ebi.ac.uk/efo/EFO_0000246";
            term = "age";
        } else if (varType.equals(VariableType.metabolism)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000651";
            term = "phenotype";
        } else if (varType.equals(VariableType.other)) {
            // NO-OP
        } else if (varType.equals(VariableType.protocol)) {
            uri = "http://purl.obolibrary.org/obo/OBI_0000272";
            term = "protocol";
        } else if (varType.equals(VariableType.shock)) {
            // Same term/URI as 'stress' below.
            uri = "http://www.ebi.ac.uk/efo/EFO_0000470";
            term = "environmental stress";
        } else if (varType.equals(VariableType.species)) {
            // Shouldn't be using this
        } else if (varType.equals(VariableType.specimen)) {
            uri = "http://purl.obolibrary.org/obo/OBI_0100051";
            term = "specimen";
        } else if (varType.equals(VariableType.strain)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0005135";
            term = "strain";
        } else if (varType.equals(VariableType.stress)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000470";
            term = "environmental stress";
        } else if (varType.equals(VariableType.temperature)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0001702";
            term = "Temperature";
        } else if (varType.equals(VariableType.time)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000724";
            term = "timepoint";
        } else if (varType.equals(VariableType.organismPart)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000635";
            term = "organism part";
        } else if (varType.equals(VariableType.treatment)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0000727";
            term = "treatment";
        } else if (varType.equals(VariableType.environmentalHistory)) {
            uri = "http://www.ebi.ac.uk/efo/EFO_0004444";
            term = "environmental history";
        } else {
            throw new IllegalStateException("No action for " + varType);
        }

        if (GeoConverterImpl.log.isDebugEnabled())
            GeoConverterImpl.log.debug("Category term: " + term + " ");
        // Applied even when term/uri are null (the 'other' and 'species' cases).
        c.setCategory(term);
        c.setCategoryUri(uri);
        c.setEvidenceCode(GOEvidenceCode.IIA);

    }

    /**
     * For data coming from a single platform, create vectors. One quantitation type is created per data column of the
     * samples (the first column is skipped), and for each design element with data a raw expression data vector is
     * added to the experiment. A quantitation type is only added to the experiment if at least one vector was kept.
     * Nothing is done if the platform reports that data from GEO should not be used.
     *
     * @param  values                A GeoValues object holding the parsed results.
     * @param  expExp                experiment receiving the vectors and quantitation types
     * @param  datasetSamples        samples to convert; column names and descriptions are read from the first one
     * @param  geoPlatform           platform the samples were run on
     * @throws IllegalStateException if the bioassay dimension built from the samples has no bioassays
     */
    private void convertVectorsForPlatform(GeoValues values, ExpressionExperiment expExp,
            List<GeoSample> datasetSamples, GeoPlatform geoPlatform) {

        assert datasetSamples.size() > 0 : "No samples in dataset";

        if (!geoPlatform.useDataFromGeo()) {
            // see bug 4181
            GeoConverterImpl.log.warn(
                    "Platform characteristics indicate data from GEO should be ignored or will not be present anyway ("
                            + geoPlatform + ")");
            return;
        }

        GeoConverterImpl.log.info("Converting vectors for " + geoPlatform.getGeoAccession() + ", "
                + datasetSamples.size() + " samples.");

        BioAssayDimension bioAssayDimension = this.convertGeoSampleList(datasetSamples, expExp);

        if (bioAssayDimension.getBioAssays().size() == 0)
            throw new IllegalStateException("No bioAssays in the BioAssayDimension");

        this.sanityCheckQuantitationTypes(datasetSamples);

        // The first sample is used as the reference for column names and descriptions.
        List<String> quantitationTypes = datasetSamples.iterator().next().getColumnNames();
        List<String> quantitationTypeDescriptions = datasetSamples.iterator().next().getColumnDescriptions();
        boolean first = true;

        /*
         * For the data that are put in 'datasets' (GDS), we know the type of data, but it can be misleading (e.g., Affy
         * data is 'counts'). For others we just have free text in the column descriptions
         */

        for (String quantitationType : quantitationTypes) {

            // skip the first quantitationType, it's the ID or ID_REF.
            if (first) {
                first = false;
                continue;
            }

            // Position of the column in the sample's own column list; indexOf returns the first occurrence of the name.
            int columnAccordingToSample = quantitationTypes.indexOf(quantitationType);

            // Position of the column according to the parsed values; may differ from the sample's position.
            int quantitationTypeIndex = values.getQuantitationTypeIndex(geoPlatform, quantitationType);
            GeoConverterImpl.log.debug("Processing " + quantitationType + " (column=" + quantitationTypeIndex
                    + " - according to sample, it's " + columnAccordingToSample + ")");

            // Keyed by design element name.
            Map<String, List<Object>> dataVectors = this.makeDataVectors(values, datasetSamples,
                    quantitationTypeIndex);

            if (dataVectors == null || dataVectors.size() == 0) {
                GeoConverterImpl.log
                        .debug("No data for " + quantitationType + " (column=" + quantitationTypeIndex + ")");
                continue;
            }
            GeoConverterImpl.log.info(dataVectors.size() + " data vectors for " + quantitationType);

            // NOTE(review): takes the first element of the first vector; this throws if that vector is empty,
            // although empty vectors are tolerated in the loop below -- confirm makeDataVectors never returns one first.
            Object exampleValue = dataVectors.values().iterator().next().iterator().next();

            QuantitationType qt = QuantitationType.Factory.newInstance();
            qt.setName(quantitationType);
            String description = quantitationTypeDescriptions.get(columnAccordingToSample);
            qt.setDescription(description);
            QuantitationTypeParameterGuesser.guessQuantitationTypeParameters(qt, quantitationType, description,
                    exampleValue);

            int count = 0; // vectors added for this quantitation type
            int skipped = 0; // vectors for which conversion returned null
            for (String designElementName : dataVectors.keySet()) {
                List<Object> dataVector = dataVectors.get(designElementName);
                // Empty vectors are silently ignored (not counted as skipped).
                if (dataVector == null || dataVector.size() == 0)
                    continue;

                RawExpressionDataVector vector = this.convertDesignElementDataVector(geoPlatform, expExp,
                        bioAssayDimension, designElementName, dataVector, qt);

                if (vector == null) {
                    skipped++;
                    if (GeoConverterImpl.log.isDebugEnabled())
                        GeoConverterImpl.log
                                .debug("Null vector for DE=" + designElementName + " QT=" + quantitationType);
                    continue;
                }

                if (GeoConverterImpl.log.isTraceEnabled()) {
                    GeoConverterImpl.log.trace(designElementName + " " + qt.getName() + " " + qt.getRepresentation()
                            + " " + dataVector.size() + " elements in vector");
                }

                expExp.getRawExpressionDataVectors().add(vector);

                // Periodic progress message.
                if (++count % GeoConverterImpl.LOGGING_VECTOR_COUNT_UPDATE == 0
                        && GeoConverterImpl.log.isDebugEnabled()) {
                    GeoConverterImpl.log.debug(count + " Data vectors added");
                }
            }

            // Only keep the quantitation type if it actually has vectors.
            if (count > 0) {
                expExp.getQuantitationTypes().add(qt);
                if (GeoConverterImpl.log.isDebugEnabled() && count > 1000) {
                    GeoConverterImpl.log.debug(count + " Data vectors added for '" + quantitationType + "'");
                }
            } else {
                GeoConverterImpl.log.info("No vectors were retained for " + quantitationType
                        + " -- usually this is due to all values being missing.");
            }

            if (skipped > 0) {
                GeoConverterImpl.log.info("Skipped " + skipped + " vectors");
            }
        }
        GeoConverterImpl.log
                .info("Total of " + expExp.getRawExpressionDataVectors().size() + " vectors on platform "
                        + geoPlatform + ", " + expExp.getQuantitationTypes().size() + " quantitation types.");
    }

    /**
     * Creates a database entry for an external reference and names the biosequence after the accession.
     *
     * @param  externalDb  database the reference belongs to
     * @param  externalRef the external reference (accession) text
     * @param  bs          biosequence whose name is set as a side effect
     * @return             the database entry, associated with {@code externalDb}
     */
    private DatabaseEntry createDatabaseEntry(ExternalDatabase externalDb, String externalRef, BioSequence bs) {
        if (!this.isGenbank(externalDb)) {
            bs.setName(externalRef);
            DatabaseEntry plainEntry = DatabaseEntry.Factory.newInstance();
            plainEntry.setAccession(externalRef);
            plainEntry.setExternalDatabase(externalDb);
            return plainEntry;
        }

        // deal with accessions in the form XXXXX.N
        DatabaseEntry genbankEntry = ExternalDatabaseUtils.getGenbankAccession(externalRef);
        genbankEntry.setExternalDatabase(externalDb); // make sure it matches the one used here.
        bs.setName(genbankEntry.getAccession()); // trimmed version.
        return genbankEntry;
    }

    /**
     * Creates an array design carrying the platform's title, accession and descriptions, with the technology type
     * derived from the platform's technology.
     *
     * @param  platform                 the GEO platform
     * @return                          the new array design
     * @throws IllegalArgumentException if the platform technology is not one of the handled types
     */
    private ArrayDesign createMinimalArrayDesign(GeoPlatform platform) {
        ArrayDesign design = ArrayDesign.Factory.newInstance();
        design.setName(platform.getTitle());
        design.setShortName(platform.getGeoAccession());
        design.setDescription(platform.getDescriptions());

        PlatformType platformType = platform.getTechnology();
        TechnologyType resolved;
        if (platformType == PlatformType.dualChannel || platformType == PlatformType.dualChannelGenomic
                || platformType == PlatformType.spottedOligonucleotide
                || platformType == PlatformType.spottedDNAOrcDNA) {
            resolved = TechnologyType.TWOCOLOR;
        } else if (platformType == PlatformType.singleChannel || platformType == PlatformType.oligonucleotideBeads
                || platformType == PlatformType.inSituOligonucleotide) {
            resolved = TechnologyType.ONECOLOR;
        } else if (platformType == null) {
            GeoConverterImpl.log
                    .warn("No technology type available for " + platform + ", provisionally setting to 'other'");
            resolved = TechnologyType.OTHER;
        } else if (platformType.equals(PlatformType.MPSS) || platformType.equals(PlatformType.SAGE)
                || platformType.equals(PlatformType.SAGENlaIII) || platformType.equals(PlatformType.SAGERsaI)
                || platformType.equals(PlatformType.SAGESau3A)) {
            // MPSS and all SAGE variants are sequencing-based.
            resolved = TechnologyType.SEQUENCING;
        } else if (platformType.equals(PlatformType.other)) {
            // We don't know....
            resolved = TechnologyType.OTHER;
        } else {
            throw new IllegalArgumentException("Don't know how to interpret technology type " + platformType);
        }

        design.setTechnologyType(resolved);
        return design;
    }

    /**
     * Creates a biosequence for the given taxon with polymer type and sequence type both set to DNA.
     *
     * @param  taxon Can be null, we will discard this
     * @return       the new biosequence
     */
    private BioSequence createMinimalBioSequence(Taxon taxon) {
        final BioSequence sequence = BioSequence.Factory.newInstance();
        sequence.setTaxon(taxon);
        sequence.setPolymerType(PolymerType.DNA);
        sequence.setType(SequenceType.DNA);
        return sequence;
    }

    /**
     * Finds the first platform column whose name looks like a probe description column.
     *
     * @param  platform Parsed GEO platform details.
     * @return          the matching column name, or null if no column matches
     */
    private String determinePlatformDescriptionColumn(GeoPlatform platform) {
        int position = -1; // only used for the log message
        for (String candidate : platform.getColumnNames()) {
            position++;
            if (!GeoConstants.likelyProbeDescription(candidate)) {
                continue;
            }
            GeoConverterImpl.log.debug(candidate + " appears to indicate the  probe descriptions in column "
                    + position + " for platform " + platform);
            return candidate;
        }
        GeoConverterImpl.log.debug("No platform element description column found for " + platform);
        return null;
    }

    /**
     * Works out which external database the platform's external reference column refers to.
     * <p>
     * Genbank-style columns (GB_ACC, GB_LIST, genbank) resolve to the cached Genbank database, which is looked up via
     * the external database service when one is available and otherwise created on the fly. An ORF column resolves
     * to a genome database named after the platform's first organism.
     *
     * @param  platform              Parsed GEO platform details.
     * @return                       the external database, or null if no external reference column description exists
     * @throws IllegalStateException if no external reference column or no database (name) could be identified
     */
    private ExternalDatabase determinePlatformExternalDatabase(GeoPlatform platform) {
        ExternalDatabase database = ExternalDatabase.Factory.newInstance();

        Collection<String> referenceColumns = this.determinePlatformExternalReferenceIdentifier(platform);
        String referenceDescription = this.getDbIdentifierDescription(platform);

        if (referenceDescription == null) {
            return null;
        }

        if (referenceDescription.contains("LINK_PRE:")) {
            // example: #ORF = ORF reference LINK_PRE:"http://genome-www4.stanford.edu/cgi-bin/SGD/locus.pl?locus="
            String linkText = referenceDescription.substring(referenceDescription.indexOf("LINK_PRE:"));
            database.setWebUri(linkText);
        }

        if (referenceColumns == null || referenceColumns.size() == 0) {
            throw new IllegalStateException("No external database identifier column was identified");
        }

        // Only the first candidate column is considered.
        String referenceColumn = referenceColumns.iterator().next();
        boolean looksLikeGenbank = referenceColumn.equals("GB_ACC") || referenceColumn.equals("GB_LIST")
                || referenceColumn.toLowerCase().equals("genbank");

        if (looksLikeGenbank) {
            if (genbank == null) {
                if (externalDatabaseService == null) {
                    // No service to look it up with: use the fresh instance as the Genbank database.
                    database.setName("Genbank");
                    database.setType(DatabaseType.SEQUENCE);
                    genbank = database;
                } else {
                    genbank = externalDatabaseService.findByName("Genbank");
                }
            }
            database = genbank;
        } else if (referenceColumn.equals("ORF")) {
            String organism = platform.getOrganisms().iterator().next();

            database.setType(DatabaseType.GENOME);

            if (!GeoConverterImpl.organismDatabases.containsKey(organism)) {
                // Placeholder
                database.setName(organism + " ORFs");
                GeoConverterImpl.log.warn("External database is " + database);
            } else {
                database.setName(GeoConverterImpl.organismDatabases.get(organism));
            }
        }

        if (database == null || database.getName() == null) {
            throw new IllegalStateException("No external database was identified");
        }
        return database;
    }

    /**
     * Collects the names of all platform columns that look like external reference identifiers.
     *
     * @param  platform Parsed GEO platform details.
     * @return          the matching column names, or null if there are none
     */
    private Collection<String> determinePlatformExternalReferenceIdentifier(GeoPlatform platform) {
        Collection<String> hits = new HashSet<>();
        int position = 0; // only used for the log message
        for (String candidate : platform.getColumnNames()) {
            if (GeoConstants.likelyExternalReference(candidate)) {
                GeoConverterImpl.log
                        .debug(candidate + " appears to indicate a possible external reference identifier in column "
                                + position + " for platform " + platform);
                hits.add(candidate);
            }
            position++;
        }

        return hits.size() == 0 ? null : hits;

    }

    /**
     * Allow multiple taxa for a platform. Method retrieves from parsed GEO file the header column name which contains
     * the species/organism used to create probe. The first matching column wins.
     *
     * @param  platform Parsed GEO platform details.
     * @return          Column name in GEO used to identify column containing species/organism used to create probe,
     *                  or null if no such column is found
     */
    private String determinePlatformProbeOrganismColumn(GeoPlatform platform) {
        Collection<String> columnNames = platform.getColumnNames();
        int index = 0; // only used for the log message
        for (String columnName : columnNames) {
            if (GeoConstants.likelyProbeOrganism(columnName)) {
                // The message used to say "sequences" (copied from the sequence column lookup), which was misleading.
                GeoConverterImpl.log.debug("'" + columnName + "' appears to indicate the probe organism in column "
                        + index + " for platform " + platform);
                return columnName;
            }
            index++;
        }
        GeoConverterImpl.log.debug("No platform organism description column found for " + platform);
        return null;
    }

    /**
     * Finds the first platform column whose name looks like a sequence column.
     *
     * @param  platform Parsed GEO platform details.
     * @return          the matching column name, or null if no column matches
     */
    private String determinePlatformSequenceColumn(GeoPlatform platform) {
        int position = -1; // only used for the log message
        for (String candidate : platform.getColumnNames()) {
            position++;
            if (!GeoConstants.likelySequence(candidate)) {
                continue;
            }
            GeoConverterImpl.log.debug("'" + candidate + "' appears to indicate the sequences in column "
                    + position + " for platform " + platform);
            return candidate;
        }
        GeoConverterImpl.log.debug("No platform sequence description column found for " + platform);
        return null;
    }

    /**
     * Adds a characteristic without a category to the biomaterial, using the given text as both value and original
     * value.
     *
     * @param bioMaterial        biomaterial receiving the characteristic
     * @param value              text used as value and original value
     * @param defaultDescription description for the characteristic
     */
    private void doFallback(BioMaterial bioMaterial, String value, String defaultDescription) {
        final Characteristic fallback = Characteristic.Factory.newInstance();
        fallback.setValue(value);
        fallback.setOriginalValue(value);
        fallback.setDescription(defaultDescription);
        fallback.setEvidenceCode(GOEvidenceCode.IIA);

        bioMaterial.getCharacteristics().add(fallback);
    }

    /**
     * Looks through the factor values of the given experimental factors for one that has a characteristic with the
     * same category and value as the (single) characteristic of the given factor value.
     * <p>
     * A factor value without any characteristics cannot be matched, so null is returned for it. Existing
     * characteristics that lack a category or a value are never considered a match.
     *
     * @param  experimentalFactors           factors whose factor values are searched
     * @param  convertVariableToFactorValue  the factor value to find a match for
     * @return                               the first matching factor value, or null if there is none
     * @throws UnsupportedOperationException if the given factor value has more than one characteristic
     */
    private FactorValue findMatchingExperimentalFactorValue(Collection<ExperimentalFactor> experimentalFactors,
            FactorValue convertVariableToFactorValue) {
        Collection<Characteristic> characteristics = convertVariableToFactorValue.getCharacteristics();
        if (characteristics.size() > 1)
            throw new UnsupportedOperationException(
                    "Can't handle factor values with multiple characteristics in GEO conversion");

        // Nothing to compare against (e.g. a variable type without a category): previously this blew up with a
        // NoSuchElementException on iterator().next().
        if (characteristics.isEmpty()) {
            return null;
        }
        Characteristic c = characteristics.iterator().next();

        for (ExperimentalFactor factor : experimentalFactors) {
            for (FactorValue fv : factor.getFactorValues()) {
                for (Characteristic m : fv.getCharacteristics()) {
                    // Guard against characteristics lacking category/value instead of failing with an NPE.
                    if (m.getCategory() == null || m.getValue() == null) {
                        continue;
                    }
                    if (m.getCategory().equals(c.getCategory()) && m.getValue().equals(c.getValue())) {
                        return fv;
                    }
                }
            }
        }
        return null;
    }

    /**
     * Turn a rough-cut dimension name into something of reasonable length (abbreviated to at most 100 characters).
     *
     * @param  dimensionName the full name
     * @return               the abbreviated name
     */
    private String formatName(StringBuilder dimensionName) {
        final String fullName = dimensionName.toString();
        return StringUtils.abbreviate(fullName, 100);
    }

    /**
     * Builds a biomaterial name prefix from the series accession, the biomaterial name tag and a number.
     *
     * @param  series series providing the GEO accession
     * @param  i      number appended after the tag
     * @return        the prefix
     */
    private String getBiomaterialPrefix(GeoSeries series, int i) {
        String accessionWithTag = series.getGeoAccession() + GeoConverterImpl.BIOMATERIAL_NAME_TAG;
        return accessionWithTag + i;
    }

    /**
     * Gets the samples of the dataset's series that are actually used in the dataset, i.e. whose GEO accession is
     * among the dataset's column names.
     *
     * @param  geoDataset the dataset
     * @return            the series samples included in the dataset (possibly empty)
     */
    private Collection<GeoSample> getDatasetSamples(GeoDataset geoDataset) {
        Collection<GeoSample> seriesSamples = this.getSeriesSamplesForDataset(geoDataset);

        // get just the samples used in this dataset
        Collection<GeoSample> datasetSamples = new ArrayList<>();

        for (GeoSample sample : seriesSamples) {
            boolean included = geoDataset.getColumnNames().contains(sample.getGeoAccession());
            if (included) {
                datasetSamples.add(sample);
            }

            // Log exactly one message per sample; previously the "DOES NOT include" message was also emitted for
            // samples that had just been included.
            if (GeoConverterImpl.log.isDebugEnabled()) {
                GeoConverterImpl.log.debug("Dataset " + geoDataset
                        + (included ? " includes sample " : " DOES NOT include sample ") + sample
                        + " on platform " + sample.getPlatforms().iterator().next());
            }
        }

        return datasetSamples;
    }

    /**
     * Gets the column description of the first platform column whose name looks like an external reference.
     *
     * @param  platform Parsed GEO platform details.
     * @return          the description at the matching column's position, or null if no column matches
     */
    private String getDbIdentifierDescription(GeoPlatform platform) {
        int position = -1;
        for (String candidate : platform.getColumnNames()) {
            position++;
            if (!GeoConstants.likelyExternalReference(candidate)) {
                continue;
            }
            return platform.getColumnDescriptions().get(position);
        }
        return null;
    }

    /**
     * Picks the first non-blank entry at position {@code i} across the given lists of external references.
     *
     * @param  externalRefs lists of external references, searched in order
     * @param  i            position to read in each list
     * @return              the first non-blank value found, or null if all are blank
     */
    private String getExternalAccession(List<List<String>> externalRefs, int i) {
        for (List<String> column : externalRefs) {
            String accession = column.get(i);
            if (StringUtils.isNotBlank(accession)) {
                return accession;
            }
        }
        return null;
    }

    /**
     * Groups the data of a series by organism. If the series has datasets, they are grouped by the dataset's
     * organism; otherwise each sample's (single) platform is grouped under that platform's first organism.
     *
     * @param  series the series
     * @return        map of organisms to a collection of either datasets or platforms.
     */
    private Map<String, Collection<GeoData>> getOrganismDatasetMap(GeoSeries series) {
        Map<String, Collection<GeoData>> byOrganism = new HashMap<>();

        boolean noDatasets = series.getDatasets() == null || series.getDatasets().size() == 0;
        if (noDatasets) {
            // No datasets: fall back to the platforms of the samples.
            for (GeoSample sample : series.getSamples()) {

                assert sample.getPlatforms().size() > 0 : sample + " has no platform";
                assert sample.getPlatforms().size() == 1 : sample + " has multiple platforms: "
                        + StringUtils.join(sample.getPlatforms().toArray(), ",");
                String organism = sample.getPlatforms().iterator().next().getOrganisms().iterator().next();

                Collection<GeoData> bucket = byOrganism.get(organism);
                if (bucket == null) {
                    bucket = new HashSet<GeoData>();
                    byOrganism.put(organism, bucket);
                }
                bucket.add(sample.getPlatforms().iterator().next());
            }
            return byOrganism;
        }

        for (GeoDataset dataset : series.getDatasets()) {
            String organism = dataset.getOrganism();
            Collection<GeoData> bucket = byOrganism.get(organism);
            if (bucket == null) {
                bucket = new HashSet<GeoData>();
                byOrganism.put(organism, bucket);
            }
            bucket.add(dataset);
        }
        return byOrganism;
    }

    /**
     * Groups the samples of a series by the sample's own organism, not the platform's. For rare cases where more than
     * one species is run on a platform (e.g., chimp and human run on a human platform)
     *
     * @param  series the series
     * @return        map of organism to the samples of that organism
     */
    private Map<String, Collection<GeoSample>> getOrganismSampleMap(GeoSeries series) {
        Map<String, Collection<GeoSample>> byOrganism = new HashMap<>();
        for (GeoSample sample : series.getSamples()) {
            String organism = sample.getOrganism();
            Collection<GeoSample> bucket = byOrganism.get(organism);
            if (bucket == null) {
                bucket = new HashSet<GeoSample>();
                byOrganism.put(organism, bucket);
            }
            bucket.add(sample);
        }
        return byOrganism;
    }

    /**
     * Groups the data of a series by platform. If the series has datasets, they are grouped by the dataset's
     * platform; otherwise each sample's (single) platform is mapped to a collection containing that platform itself.
     *
     * @param  series the series
     * @return        map of platform to a collection of either datasets or the platform itself
     */
    private Map<GeoPlatform, Collection<GeoData>> getPlatformDatasetMap(GeoSeries series) {
        Map<GeoPlatform, Collection<GeoData>> byPlatform = new HashMap<>();

        boolean noDatasets = series.getDatasets() == null || series.getDatasets().size() == 0;
        if (noDatasets) {
            for (GeoSample sample : series.getSamples()) {
                assert sample.getPlatforms().size() > 0 : sample + " has no platform";
                assert sample.getPlatforms().size() == 1 : sample + " has multiple platforms: "
                        + StringUtils.join(sample.getPlatforms().toArray(), ",");
                GeoPlatform samplePlatform = sample.getPlatforms().iterator().next();

                Collection<GeoData> bucket = byPlatform.get(samplePlatform);
                if (bucket == null) {
                    bucket = new HashSet<GeoData>();
                    byPlatform.put(samplePlatform, bucket);
                }
                // This is a bit silly, but made coding this easier.
                bucket.add(sample.getPlatforms().iterator().next());
            }
            return byPlatform;
        }

        for (GeoDataset dataset : series.getDatasets()) {
            GeoPlatform datasetPlatform = dataset.getPlatform();
            Collection<GeoData> bucket = byPlatform.get(datasetPlatform);
            if (bucket == null) {
                bucket = new HashSet<GeoData>();
                byPlatform.put(datasetPlatform, bucket);
            }
            bucket.add(dataset);
        }
        return byPlatform;
    }

    /**
     * Determines the single platform shared by all the given samples.
     *
     * @param  datasetSamples                the samples to inspect
     * @return                               the platform common to all samples, or {@code null} if the list is
     *                                       empty
     * @throws UnsupportedOperationException if any sample is associated with more than one platform
     * @throws IllegalArgumentException      if the samples do not all use the same platform
     */
    private GeoPlatform getPlatformForSamples(List<GeoSample> datasetSamples) {
        GeoPlatform shared = null;
        for (GeoSample geoSample : datasetSamples) {
            Collection<GeoPlatform> samplePlatforms = geoSample.getPlatforms();
            assert samplePlatforms.size() != 0;
            if (samplePlatforms.size() > 1) {
                throw new UnsupportedOperationException(
                        "Can't handle GEO sample ids associated with multiple platforms just yet");
            }

            GeoPlatform candidate = samplePlatforms.iterator().next();
            if (shared == null) {
                // nothing to compare against yet; adopt this sample's platform
                shared = candidate;
                continue;
            }
            if (!shared.equals(candidate)) {
                throw new IllegalArgumentException("All samples here must use the same platform");
            }
        }
        return shared;
    }

    /**
     * Finds the samples belonging to the series of a dataset. Exactly one of the dataset's series is expected to
     * carry samples.
     *
     * @param  geoDataset            the dataset whose series are searched
     * @return                       the samples of the one series that has any
     * @throws IllegalStateException if the dataset has no series, if more than one series has samples, or if none
     *                               does
     */
    private Collection<GeoSample> getSeriesSamplesForDataset(GeoDataset geoDataset) {
        Collection<GeoSeries> candidates = geoDataset.getSeries();

        // Highly defensive on purpose: a bug once caused the same series to be listed more than once, with one of
        // the copies being empty.
        if (candidates == null || candidates.isEmpty()) {
            throw new IllegalStateException("No series for " + geoDataset);
        }

        if (candidates.size() > 1) {
            GeoConverterImpl.log.warn("More than one series for a data set, probably some kind of parsing bug!");
        }

        Collection<GeoSample> samples = null;
        for (GeoSeries candidate : candidates) {
            Collection<GeoSample> candidateSamples = candidate.getSamples();
            if (candidateSamples == null || candidateSamples.isEmpty()) {
                continue; // empty duplicate, ignore
            }
            if (samples != null) {
                throw new IllegalStateException(
                        "More than one of the series for " + geoDataset + " has samples: " + candidate);
            }
            samples = candidateSamples;
        }

        if (samples == null || samples.isEmpty()) {
            throw new IllegalStateException("No series had samples for " + geoDataset);
        }

        return samples;
    }

    /**
     * Appends the placeholder used for a missing value (identified by nulls or number format exceptions) to the
     * list being built. The placeholder depends on the primitive type: {@code NaN} for doubles, the empty string for
     * strings, {@code 0} for ints and {@code false} for booleans.
     *
     * @param  toConvert                     list the placeholder is appended to
     * @param  pt                            primitive type of the data vector
     * @throws UnsupportedOperationException if missing values are not supported for the given type
     */
    private void handleMissing(List<Object> toConvert, PrimitiveType pt) {
        final Object placeholder;
        if (pt.equals(PrimitiveType.DOUBLE)) {
            placeholder = Double.NaN;
        } else if (pt.equals(PrimitiveType.STRING)) {
            placeholder = "";
        } else if (pt.equals(PrimitiveType.INT)) {
            placeholder = 0;
        } else if (pt.equals(PrimitiveType.BOOLEAN)) {
            placeholder = false;
        } else {
            throw new UnsupportedOperationException(
                    "Missing values in data vectors of type " + pt + " not supported");
        }
        toConvert.add(placeholder);
    }

    /**
     * Lazily sets up {@code geoDatabase}; does nothing if it is already set.
     * <p>
     * When an {@code externalDatabaseService} is available, the database named "GEO" is looked up through it (and
     * {@code geoDatabase} stays unset if the lookup finds nothing). Without a service, a new "GEO" database of type
     * {@code DatabaseType.EXPRESSION} is created locally.
     */
    private void initGeoExternalDatabase() {
        if (geoDatabase != null) {
            return;
        }

        if (externalDatabaseService == null) {
            geoDatabase = ExternalDatabase.Factory.newInstance();
            geoDatabase.setName("GEO");
            geoDatabase.setType(DatabaseType.EXPRESSION);
            return;
        }

        ExternalDatabase found = externalDatabaseService.findByName("GEO");
        if (found != null) {
            geoDatabase = found;
        }
    }

    /**
     * Tests whether the given external database is Genbank, judged by a case-insensitive match of its name.
     *
     * @param  externalDb database to test; may be {@code null}
     * @return            {@code true} if the database is non-null and its name equals "Genbank" ignoring case;
     *                    {@code false} otherwise, including when the database has no name
     */
    private boolean isGenbank(ExternalDatabase externalDb) {
        // Compare from the constant so that a database without a name yields false instead of a
        // NullPointerException.
        return externalDb != null && "Genbank".equalsIgnoreCase(externalDb.getName());
    }

    /**
     * Checks whether we got any data at all, i.e. whether at least one vector holds a non-null value. Nothing
     * being populated can happen if the quantitation type was filtered during parsing.
     *
     * @param  dataVectors vectors to inspect
     * @return             {@code true} as soon as any non-null element is found; {@code false} if there are no
     *                     vectors or every element of every vector is {@code null}
     */
    private boolean isPopulated(Map<String, List<Object>> dataVectors) {
        for (List<Object> elements : dataVectors.values()) {
            for (Object element : elements) {
                if (element != null) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Decides whether a series is of a type we can use. Note that a series can have more than one type, if it has
     * mixed samples; if at least one type matches one we can use, we keep it.
     *
     * @param  series series
     * @return        {@code true} if the series types include gene expression by array or gene expression by
     *                sequencing
     */
    private boolean isUsable(GeoSeries series) {
        if (series.getSeriesTypes().contains(SeriesType.geneExpressionByArray)) {
            return true;
        }
        return series.getSeriesTypes().contains(SeriesType.geneExpressionBySequencing);
    }

    /**
     * Convert the by-sample data for a given quantitation type to by-designElement data vectors.
     * <p>
     * Side effects: {@code datasetSamples} is sorted in place, and once the vectors have been extracted the
     * corresponding data is cleared from {@code values}.
     *
     * @param  values                        source of the data
     * @param  datasetSamples                The samples we want to get data for. These should all have been run on
     *                                       the same platform.
     * @param  quantitationTypeIndex         - first index is 0
     * @return                               A map of design element names to Lists of values, one value per
     *                                       sample. {@code null} if no indices are available for the quantitation
     *                                       type (it can have been filtered out) or if no non-null value was found;
     *                                       an empty map if the platform has no data in its id column.
     * @throws IllegalArgumentException      if the samples do not all use the same platform
     * @throws UnsupportedOperationException if a sample is associated with more than one platform
     */
    private Map<String, List<Object>> makeDataVectors(GeoValues values, List<GeoSample> datasetSamples,
            Integer quantitationTypeIndex) {
        Map<String, List<Object>> vectorsByElement = new HashMap<>(GeoConverterImpl.INITIAL_VECTOR_CAPACITY);
        Collections.sort(datasetSamples);
        GeoPlatform samplePlatform = this.getPlatformForSamples(datasetSamples);

        // where the data we need sits in the target vectors (mostly a reordering)
        Integer[] sampleIndices = values.getIndices(samplePlatform, datasetSamples, quantitationTypeIndex);
        if (sampleIndices == null || sampleIndices.length == 0) {
            return null; // can happen if quantitation type was filtered out.
        }
        assert sampleIndices.length == datasetSamples.size();

        List<String> elementNames = samplePlatform.getColumnData(samplePlatform.getIdColumnName());
        if (elementNames == null) {
            return vectorsByElement;
        }

        for (String elementName : elementNames) {
            /*
             * Note: null data can happen if the platform has probes that aren't in the data, or if this is a
             * quantitation type that was filtered out during parsing, or absent from some samples.
             */
            List<Object> elementValues = values.getValues(samplePlatform, quantitationTypeIndex, elementName,
                    sampleIndices);
            if (elementValues != null && elementValues.size() != 0) {
                assert elementValues.size() == datasetSamples.size();
                vectorsByElement.put(elementName, elementValues);
            }
        }

        // Decide on the result before the source data is released.
        boolean hasData = this.isPopulated(vectorsByElement);
        values.clear(samplePlatform, datasetSamples, quantitationTypeIndex);

        return hasData ? vectorsByElement : null;
    }

    /**
     * Attaches to the biomaterial the factor value of the experimental design that corresponds to the given GEO
     * replication.
     *
     * @param  bioMaterial           biomaterial that receives the factor value
     * @param  experimentalFactors   factors of the experimental design to search
     * @param  replication           GEO replication to match
     * @throws IllegalStateException if the experimental design has no matching factor value
     */
    private void matchSampleReplicationToExperimentalFactorValue(BioMaterial bioMaterial,
            Collection<ExperimentalFactor> experimentalFactors, GeoReplication replication) {
        // Convert the replication to a factor value, then look for its counterpart in the design.
        FactorValue probe = this.convertReplicationToFactorValue(replication);
        FactorValue match = this.findMatchingExperimentalFactorValue(experimentalFactors, probe);

        if (match == null) {
            throw new IllegalStateException("Could not find matching factor value for " + replication
                    + " in experimental design for sample " + bioMaterial);
        }

        bioMaterial.getFactorValues().add(match);
    }

    /**
     * Looks for the BioAssay of the experiment whose external accession equals the given sample accession and, if
     * found, adds it to the dimension. Only the first match is used.
     *
     * @param  expExp            ExpressionExperiment to be searched for matching BioAssays
     * @param  bioAssayDimension BioAssayDimension to be added to
     * @param  sampleAcc         The GEO accession id for the sample. This is compared to the external accession
     *                           recorded for the BioAssay
     * @return                   {@code true} if a matching BioAssay was found and added, {@code false} otherwise
     */
    private boolean matchSampleToBioAssay(ExpressionExperiment expExp, BioAssayDimension bioAssayDimension,
            String sampleAcc) {

        for (BioAssay candidate : expExp.getBioAssays()) {
            if (!sampleAcc.equals(candidate.getAccession().getAccession())) {
                continue;
            }
            bioAssayDimension.getBioAssays().add(candidate);
            GeoConverterImpl.log
                    .debug("Found sample match for bioAssay " + candidate.getAccession().getAccession());
            return true;
        }
        return false;
    }

    /**
     * Attaches to the biomaterial the factor value of the experimental design that corresponds to the given GEO
     * variable, unless the biomaterial already has a factor value for the same experimental factor.
     *
     * @param  bioMaterial           biomaterial that receives the factor value
     * @param  experimentalFactors   factors of the experimental design to search
     * @param  variable              GEO variable to match
     * @throws IllegalStateException if the experimental design has no matching factor value
     */
    private void matchSampleVariableToExperimentalFactorValue(BioMaterial bioMaterial,
            Collection<ExperimentalFactor> experimentalFactors, GeoVariable variable) {

        // Convert the variable to a factor value, then look for its counterpart in the design.
        FactorValue probe = this.convertVariableToFactorValue(variable);
        FactorValue match = this.findMatchingExperimentalFactorValue(experimentalFactors, probe);

        if (match == null) {
            throw new IllegalStateException("Could not find matching factor value for " + variable
                    + " in experimental design for sample " + bioMaterial);
        }

        // Only add it if the factor is not represented yet, so a factor value never goes on more than once.
        if (!this.alreadyHasFactorValueForFactor(bioMaterial, match.getExperimentalFactor())) {
            bioMaterial.getFactorValues().add(match);
        }
    }

    /**
     * Sanity check: verifies that the samples agree on their quantitation type (column) names.
     * <p>
     * The first sample that has usable data and a non-empty list of column names serves as the reference. Every
     * sample whose column names differ from the reference is reported at debug level; if there was any mismatch, a
     * single warning carrying the last mismatch is logged. Mismatches are only logged, never thrown.
     * <p>
     * The check is skipped (with a warning) if a sample without usable data is encountered while looking for the
     * reference.
     *
     * @param  datasetSamples        the samples to compare
     * @throws IllegalStateException if data is expected but no sample provides any quantitation type names
     */
    private void sanityCheckQuantitationTypes(List<GeoSample> datasetSamples) {
        List<String> reference = new ArrayList<>();
        // The sample the reference names were taken from, so that mismatch reports name the right sample.
        GeoSample referenceSample = null;

        // Choose a reference that is populated ...
        boolean expectingData = true;
        for (GeoSample sample : datasetSamples) {
            if (sample.hasUsableData()) {
                reference = sample.getColumnNames();
                if (!reference.isEmpty()) {
                    referenceSample = sample;
                    break;
                }
            } else {
                expectingData = false;
            }
        }

        if (!expectingData) {
            GeoConverterImpl.log.warn("Not expecting any data, so quantitation type checking is skipped.");
            return;
        }

        if (reference.isEmpty()) {
            throw new IllegalStateException("None of the samples have any quantitation type names");
        }

        // From here on the reference is non-empty, which can only happen via the branch that also set
        // referenceSample.
        boolean someDidntMatch = false;
        String lastError = "";
        for (GeoSample sample : datasetSamples) {
            List<String> columnNames = sample.getColumnNames();

            assert !columnNames.isEmpty();

            if (!reference.equals(columnNames)) {

                StringBuilder buf = new StringBuilder();
                buf.append("\nSample ").append(sample.getGeoAccession()).append(":");
                for (String string : columnNames) {
                    buf.append(" ").append(string);
                }
                // Label the reference with the sample it really came from; that is not necessarily the first
                // sample in the list.
                buf.append("\nReference ").append(referenceSample.getGeoAccession()).append(":");
                for (String string : reference) {
                    buf.append(" ").append(string);
                }
                someDidntMatch = true;

                lastError = "*** Sample quantitation type names do not match: " + buf.toString();
                GeoConverterImpl.log.debug(lastError);
            }
        }
        if (someDidntMatch) {
            GeoConverterImpl.log
                    .warn("Samples do not have consistent quantification type names. Last error was: " + lastError);
        }
    }

    /**
     * Shortens a characteristic that is too long. Values longer than 255 characters are cut down to their first
     * 200 characters, followed by a marker noting the truncation; a warning is logged with the full value. Values
     * of up to 255 characters are returned unchanged.
     *
     * @param  characteristic the text to check; must not be {@code null}
     * @return                the original text, or its first 200 characters plus the truncation marker
     */
    private String trimString(String characteristic) {
        final int maxLength = 255;
        final int keptLength = 200;
        if (characteristic.length() > maxLength) {
            GeoConverterImpl.log.warn("** Characteristic too long: " + characteristic + " - will truncate - ****");
            // substring's end index is exclusive, so (0, keptLength) keeps exactly the 200 characters that the
            // marker announces.
            characteristic = characteristic.substring(0, keptLength) + " (truncated at 200 characters)";
        }
        return characteristic;
    }

}