ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingServiceImpl.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.expression.arrayDesign;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import ubic.gemma.analysis.report.ArrayDesignReportService;
import ubic.gemma.analysis.sequence.SequenceManipulation;
import ubic.gemma.loader.genome.FastaCmd;
import ubic.gemma.loader.genome.FastaParser;
import ubic.gemma.loader.genome.ProbeSequenceParser;
import ubic.gemma.loader.genome.SimpleFastaCmd;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.common.description.ExternalDatabaseService;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.BioSequenceService;
import ubic.gemma.model.genome.biosequence.PolymerType;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.persistence.Persister;

/**
 * Handles collapsing the sequences, attaching sequences to DesignElements, either from provided input or via a fetch.
 * 
 * @author pavlidis
 * @version $Id: ArrayDesignSequenceProcessingServiceImpl.java,v 1.9 2013/02/08 22:03:05 paul Exp $
 */
@Component
public class ArrayDesignSequenceProcessingServiceImpl implements ArrayDesignSequenceProcessingService {

    /**
     * After seeing more than this number of compositeSequences lacking sequences we don't give a detailed warning.
     */
    private static final int MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS = 20;

    private static final int BATCH_SIZE = 100;

    private static Log log = LogFactory.getLog(ArrayDesignSequenceProcessingServiceImpl.class.getName());

    @Autowired
    private ArrayDesignService arrayDesignService;

    @Autowired
    private Persister persisterHelper;

    @Autowired
    private BioSequenceService bioSequenceService;

    @Autowired
    private ExternalDatabaseService externalDatabaseService;

    @Autowired
    ArrayDesignReportService arrayDesignReportService;

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#assignSequencesToDesignElements
     * (java.util.Collection, java.util.Collection)
     */
    @Override
    public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements,
            Collection<BioSequence> sequences) {

        Map<String, BioSequence> nameMap = new HashMap<String, BioSequence>();
        for (BioSequence sequence : sequences) {
            nameMap.put(this.deMangleProbeId(sequence.getName()), sequence);
        }

        int numNotFound = 0;
        for (CompositeSequence designElement : designElements) {
            if (!nameMap.containsKey(designElement.getName())) {
                log.debug("No sequence matches " + designElement.getName());
                numNotFound++;
                continue;
            }

            designElement.setBiologicalCharacteristic(nameMap.get(designElement.getName()));

        }

        log.info(sequences.size() + " sequences processed for " + designElements.size() + " design elements");
        if (numNotFound > 0) {
            log.warn(numNotFound + " probes had no matching sequence");
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#assignSequencesToDesignElements
     * (java.util.Collection, java.io.File)
     */
    @Override
    public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, File fastaFile)
            throws IOException {

        FastaParser fp = new FastaParser();
        fp.parse(fastaFile);
        Collection<BioSequence> sequences = fp.getResults();
        log.debug("Parsed " + sequences.size() + " sequences");

        assignSequencesToDesignElements(designElements, sequences);
    }

    /**
     * Associate sequences with an array design. It is assumed that the name of the sequences can be matched to the name
     * of a design element. Provided for testing purposes.
     * 
     * @param designElements
     * @param fastaFile
     * @throws IOException
     */
    @Override
    public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, InputStream fastaFile)
            throws IOException {

        FastaParser fp = new FastaParser();
        fp.parse(fastaFile);
        Collection<BioSequence> sequences = fp.getResults();
        log.debug("Parsed " + sequences.size() + " sequences");

        assignSequencesToDesignElements(designElements, sequences);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#processAffymetrixDesign(ubic.gemma
     * .model.expression.arrayDesign.ArrayDesign, java.io.InputStream, ubic.gemma.model.genome.Taxon, boolean)
     */
    @Override
    public Collection<BioSequence> processAffymetrixDesign(ArrayDesign arrayDesign, InputStream probeSequenceFile,
            Taxon taxon, boolean force) throws IOException {

        log.info("Processing Affymetrix design");
        // arrayDesignService.thaw( arrayDesign );
        boolean wasOriginallyLackingCompositeSequences = arrayDesign.getCompositeSequences().size() == 0;
        taxon = validateTaxon(taxon, arrayDesign);
        Collection<BioSequence> bioSequences = new HashSet<BioSequence>();

        int done = 0;
        int percent = 0;

        AffyProbeReader apr = new AffyProbeReader();
        apr.parse(probeSequenceFile);
        Collection<CompositeSequence> compositeSequencesFromProbes = apr.getKeySet();

        int total = compositeSequencesFromProbes.size();

        Map<String, CompositeSequence> quickFindMap = new HashMap<String, CompositeSequence>();
        List<BioSequence> sequenceBuffer = new ArrayList<BioSequence>();
        Map<String, CompositeSequence> csBuffer = new HashMap<String, CompositeSequence>();
        for (CompositeSequence newCompositeSequence : compositeSequencesFromProbes) {

            // these composite sequences are just use
            newCompositeSequence.setArrayDesign(arrayDesign);
            BioSequence collapsed = SequenceManipulation.collapse(apr.get(newCompositeSequence));
            String sequenceName = newCompositeSequence.getName() + "_collapsed";
            collapsed.setName(sequenceName);
            collapsed.setType(SequenceType.AFFY_COLLAPSED);
            collapsed.setPolymerType(PolymerType.DNA);
            collapsed.setTaxon(taxon);

            sequenceBuffer.add(collapsed);
            if (csBuffer.containsKey(sequenceName))
                throw new IllegalArgumentException("All probes must have unique names");
            csBuffer.put(sequenceName, newCompositeSequence);
            if (sequenceBuffer.size() == BATCH_SIZE) {
                flushBuffer(bioSequences, sequenceBuffer, csBuffer);
            }

            if (wasOriginallyLackingCompositeSequences) {
                arrayDesign.getCompositeSequences().add(newCompositeSequence);
            } else {
                if (force) {
                    // FIXME I don't think 'force' works right for affymetrix arrays - old sequences are still there.
                    // Anyway, we don't need it very badly as it was easy to clean up 'manually'.
                    collapsed = persistSequence(collapsed);
                    assert collapsed.getTaxon().equals(taxon);
                }
                quickFindMap.put(newCompositeSequence.getName(), newCompositeSequence);
            }

            if (++done % 1000 == 0) {
                percent = updateProgress(total, done, percent);
            }
        }
        flushBuffer(bioSequences, sequenceBuffer, csBuffer);
        updateProgress(total, done, percent);

        if (!wasOriginallyLackingCompositeSequences) {
            percent = 0;
            done = 0;
            int numWithNoSequence = 0;
            for (CompositeSequence originalCompositeSequence : arrayDesign.getCompositeSequences()) {
                // go back and fill this information into the composite sequences, namely the database entry
                // information.

                CompositeSequence compositeSequenceFromParse = quickFindMap
                        .get(originalCompositeSequence.getName());
                if (compositeSequenceFromParse == null) {
                    numWithNoSequence++;
                    notifyAboutMissingSequences(numWithNoSequence, originalCompositeSequence);
                    continue;
                }

                log.debug(originalCompositeSequence + " matches " + compositeSequenceFromParse + " seq is "
                        + compositeSequenceFromParse.getBiologicalCharacteristic());

                originalCompositeSequence
                        .setBiologicalCharacteristic(compositeSequenceFromParse.getBiologicalCharacteristic());

                assert originalCompositeSequence.getBiologicalCharacteristic().getId() != null;

                originalCompositeSequence.setArrayDesign(compositeSequenceFromParse.getArrayDesign());

                if (++done % 1000 == 0) {
                    percent = updateProgress(total, done, percent);
                }
            }
            log.info(numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size()
                    + " probes could not be matched to a sequence");
        }

        arrayDesign.setAdvertisedNumberOfDesignElements(compositeSequencesFromProbes.size());
        log.info("Updating " + arrayDesign);

        arrayDesignService.update(arrayDesign);

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        log.info("Done adding sequence information!");
        return bioSequences;
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, java.io.InputStream, ubic.gemma.model.genome.biosequence.SequenceType)
     */
    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile,
            SequenceType sequenceType) throws IOException {
        return this.processArrayDesign(arrayDesign, sequenceFile, sequenceType, null);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, java.io.InputStream, ubic.gemma.model.genome.biosequence.SequenceType,
     * ubic.gemma.model.genome.Taxon)
     */
    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile,
            SequenceType sequenceType, Taxon taxon) throws IOException {

        // arrayDesign = arrayDesignService.thaw( arrayDesign );

        if (sequenceType.equals(SequenceType.AFFY_PROBE)) {
            return this.processAffymetrixDesign(arrayDesign, sequenceFile, taxon, true);
        } else if (sequenceType.equals(SequenceType.OLIGO)) {
            return this.processOligoDesign(arrayDesign, sequenceFile, taxon);
        }
        taxon = validateTaxon(taxon, arrayDesign);

        checkForCompositeSequences(arrayDesign);

        FastaParser fastaParser = new FastaParser();
        fastaParser.parse(sequenceFile);
        Collection<BioSequence> bioSequences = fastaParser.getResults();

        // make two maps: one for genbank ids, one for the sequence name.
        Map<String, BioSequence> gbIdMap = new HashMap<String, BioSequence>();
        Map<String, BioSequence> nameMap = new HashMap<String, BioSequence>();

        int total = bioSequences.size() + arrayDesign.getCompositeSequences().size();
        int done = 0;
        int percent = 0;

        for (BioSequence sequence : bioSequences) {

            sequence.setType(sequenceType);
            sequence.setPolymerType(PolymerType.DNA);
            sequence.setTaxon(taxon);
            sequence = persistSequence(sequence);

            addToMaps(gbIdMap, nameMap, sequence);

        }

        log.info("Sequences done, updating composite sequences");

        int numWithNoSequence = 0;
        int numMatchedByAccession = 0;
        int numMatchedByProbeName = 0;
        String mungeRegex = ArrayDesignProbeRenamingService.DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR + ".+$";

        for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {

            if (log.isTraceEnabled())
                log.trace("Looking for sequence for: " + compositeSequence.getName());

            BioSequence match = null;
            if (nameMap.containsKey(compositeSequence.getName())) {
                match = nameMap.get(compositeSequence.getName());
                numMatchedByProbeName++;
            } else if (compositeSequence.getName().matches(mungeRegex)) {
                String unMungedName = compositeSequence.getName().replaceFirst(mungeRegex, "");
                if (nameMap.containsKey(unMungedName)) {
                    match = nameMap.get(unMungedName);
                    numMatchedByProbeName++;
                    continue;
                }
            } else {
                BioSequence biologicalCharacteristic = compositeSequence.getBiologicalCharacteristic();
                if (biologicalCharacteristic != null) {
                    biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
                    if (biologicalCharacteristic.getSequenceDatabaseEntry() != null && gbIdMap
                            .containsKey(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession())) {
                        match = gbIdMap.get(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession());
                        numMatchedByAccession++;
                    } else {
                        compositeSequence.setBiologicalCharacteristic(null);
                        numWithNoSequence++;
                        notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                    }
                } else {
                    numWithNoSequence++;
                    notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                }
            }

            if (match != null) {
                // overwrite the existing characteristic if necessary.
                compositeSequence.setBiologicalCharacteristic(match);
                compositeSequence.setArrayDesign(arrayDesign);
            }

            if (++done % 1000 == 0) {
                percent = updateProgress(total, done, percent);
            }
        }

        log.info(numMatchedByAccession + "/" + arrayDesign.getCompositeSequences().size()
                + " composite sequences were matched to sequences by Genbank accession");
        log.info(numMatchedByProbeName + "/" + arrayDesign.getCompositeSequences().size()
                + " composite sequences were matched to sequences by probe name");

        if (numWithNoSequence > 0)
            log.info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size()
                    + " composite sequences with no associated biological characteristic");

        log.info("Updating sequences on arrayDesign");
        arrayDesignService.update(arrayDesign);

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        return bioSequences;

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, java.io.InputStream, java.lang.String[], java.lang.String,
     * ubic.gemma.model.genome.Taxon, boolean)
     */
    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceIdentifierFile,
            String[] databaseNames, String blastDbHome, Taxon taxon, boolean force) throws IOException {
        checkForCompositeSequences(arrayDesign);

        Map<String, String> probe2acc = parseAccessionFile(sequenceIdentifierFile);
        Collection<BioSequence> finalResult = new HashSet<BioSequence>();
        Collection<String> notFound = new HashSet<String>();

        // values that were enot found
        notFound.addAll(probe2acc.values());

        // the actual thing values to search for (with version numbers)
        Collection<String> accessionsToFetch = new HashSet<String>();
        accessionsToFetch.addAll(probe2acc.values());

        // only 1 taxon should be on array design if taxon not supplied on command line
        taxon = validateTaxon(taxon, arrayDesign);

        /*
         * Fill in sequences from BLAST databases.
         */
        int versionNumber = 1;
        int numSwitched = 0;
        while (versionNumber < MAX_VERSION_NUMBER) {
            Collection<BioSequence> retrievedSequences = searchBlastDbs(databaseNames, blastDbHome, notFound);

            // map of accessions to sequence.
            Map<String, BioSequence> found = findOrUpdateSequences(accessionsToFetch, retrievedSequences, taxon,
                    force);

            finalResult.addAll(retrievedSequences);

            // replace the sequences.
            for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
                String probeName = cs.getName();
                String acc = probe2acc.get(probeName);
                if (found.containsKey(acc)) {
                    numSwitched++;
                    log.debug("Setting seq. for " + cs + " to " + found.get(acc));
                    cs.setBiologicalCharacteristic(found.get(acc));
                }
            }

            notFound = getUnFound(notFound, found);

            if (notFound.isEmpty()) {
                break; // we're done!
            }

            // bump up the version numbers for ones we haven't found yet.

            for (String accession : notFound) {
                if (log.isTraceEnabled())
                    log.trace(accession + " not found, increasing version number to " + versionNumber);
                accessionsToFetch.remove(accession);

                // add or increase the version number.
                accession = accession.replaceFirst("\\.\\d+$", "");
                accession = accession + "." + Integer.toString(versionNumber);
                accessionsToFetch.add(accession);
            }
            notFound = accessionsToFetch;

            ++versionNumber;

        }

        if (!notFound.isEmpty() && taxon != null) {

            Collection<String> stillLooking = new HashSet<String>();
            stillLooking.addAll(notFound);
            notFound.clear();

            /*
             * clear the version number.
             */
            for (String accession : stillLooking) {
                notFound.remove(accession);
                accession = accession.replaceFirst("\\.\\d+$", "");
                notFound.add(accession);
            }
            assert notFound.size() > 0;
            /*
             * See if they're already in Gemma. This is good for sequences that are not in genbank but have been loaded
             * previously.
             */
            Map<String, BioSequence> found = findLocalSequences(notFound, taxon);
            finalResult.addAll(found.values());

            for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
                String probeName = cs.getName();
                String acc = probe2acc.get(probeName);
                if (found.containsKey(acc)) {
                    numSwitched++;
                    log.debug("Setting seq. for " + cs + " to " + found.get(acc));
                    cs.setBiologicalCharacteristic(found.get(acc));
                }
            }
            notFound = getUnFound(notFound, found);
        }

        if (!notFound.isEmpty()) {
            logMissingSequences(arrayDesign, notFound);
        }

        log.info(numSwitched + " composite sequences had their biologicalCharacteristics changed");

        arrayDesignService.update(arrayDesign);

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        return finalResult;

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, java.lang.String[], boolean)
     */
    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames,
            boolean force) {
        return this.processArrayDesign(arrayDesign, databaseNames, null, force);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceProcessingService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, java.lang.String[], java.lang.String, boolean)
     */
    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames,
            String blastDbHome, boolean force) {

        Map<String, BioSequence> accessionsToFetch = initializeFetchList(arrayDesign, force);

        if (accessionsToFetch.size() == 0) {
            log.info("No accessions to fetch, no processing will be done");
            return null;
        }

        Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());
        // not taxon found
        if (taxaOnArray.size() == 0) {
            throw new IllegalArgumentException(
                    taxaOnArray.size() + " taxon found for " + arrayDesign + "please specifiy which taxon to run");
        }

        Collection<String> notFound = accessionsToFetch.keySet();
        Collection<BioSequence> finalResult = new HashSet<BioSequence>();

        int versionNumber = 1;
        while (versionNumber < MAX_VERSION_NUMBER) {
            Collection<BioSequence> retrievedSequences = searchBlastDbs(databaseNames, blastDbHome, notFound);

            // we can loop through the taxa as we can ignore sequence when retrieved and arraydesign taxon not match.

            Map<String, BioSequence> found = findOrUpdateSequences(accessionsToFetch, retrievedSequences,
                    taxaOnArray, force);

            finalResult.addAll(found.values());
            notFound = getUnFound(notFound, found);

            if (notFound.isEmpty()) {
                break;
            }

            // bump up the version numbers.

            for (String accession : notFound) {
                if (log.isTraceEnabled())
                    log.trace(accession + " not found, increasing version number to " + versionNumber);
                // remove the version number and increase it
                BioSequence bs = accessionsToFetch.get(accession);
                accessionsToFetch.remove(accession);

                // add or increase the version number.
                accession = accession.replaceFirst("\\.\\d+$", "");
                accession = accession + "." + Integer.toString(versionNumber);
                accessionsToFetch.put(accession, bs);
            }
            notFound = accessionsToFetch.keySet();
            ++versionNumber;
        }

        if (!notFound.isEmpty()) {
            logMissingSequences(arrayDesign, notFound);
        }

        log.info(finalResult.size() + " sequences found");
        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

        return finalResult;

    }

    /**
     * Update a single sequence in the system.
     * 
     * @param sequenceId
     * @param databaseNames
     * @param blastDbHome
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten.
     * @return persistent BioSequence.
     */
    @Override
    public BioSequence processSingleAccession(String sequenceId, String[] databaseNames, String blastDbHome,
            boolean force) {
        BioSequence found = this.searchBlastDbs(databaseNames, blastDbHome, sequenceId);
        if (found == null)
            return null;
        return createOrUpdateGenbankSequence(found, force);

    }

    /**
     * If taxon is null then it has not been provided on the command line, then deduce the taxon from the arrayDesign.
     * If there are 0 or more than one taxon on the array design throw an error as this programme can only be run for 1
     * taxon at a time if processing from a file.
     * 
     * @param taxon Taxon as passed in on the command line
     * @param arrayDesign Array design to process
     * @return taxon Taxon to process
     * @throws IllegalArgumentException Thrown when there is not exactly 1 taxon.
     */
    @Override
    public Taxon validateTaxon(Taxon taxon, ArrayDesign arrayDesign) throws IllegalArgumentException {

        if (arrayDesign == null) {
            throw new IllegalArgumentException("Array design cannot be null");
        }

        if (taxon == null) {
            Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());

            if (taxaOnArray.size() == 1 && taxaOnArray.iterator().next() != null) {
                return taxaOnArray.iterator().next();
            }
            throw new IllegalArgumentException(
                    taxaOnArray.size() + " taxa found for " + arrayDesign + "please specify which taxon to run");
        }
        return taxon;
    }

    /**
     * @param nameMap
     * @param sequence
     */
    private void addToMaps(Map<String, BioSequence> gbIdMap, Map<String, BioSequence> nameMap,
            BioSequence sequence) {
        nameMap.put(this.deMangleProbeId(sequence.getName()), sequence);

        if (sequence.getSequenceDatabaseEntry() != null) {
            gbIdMap.put(sequence.getSequenceDatabaseEntry().getAccession(), sequence);
        } else {
            if (log.isTraceEnabled())
                log.trace("No sequence database entry for " + sequence.getName());
        }
    }

    private void checkForCompositeSequences(ArrayDesign arrayDesign) {
        boolean wasOriginallyLackingCompositeSequences = arrayDesign.getCompositeSequences().size() == 0;

        if (wasOriginallyLackingCompositeSequences) {
            throw new IllegalArgumentException(
                    "You need to pass in an array design that already has compositeSequences filled in.");
        }
    }

    /**
     * @param found a new (nonpersistent) biosequence that can be used to create a new entry or update an existing one
     *        with the sequence. The sequence would have come from Genbank.
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten. Otherwise, the sequence will only be updated if the
     *        actual sequence information was missing in our DB and 'found' has a sequence.
     * @return persistent BioSequence.
     */
    private BioSequence createOrUpdateGenbankSequence(BioSequence found, boolean force) {
        assert found != null;
        DatabaseEntry sequenceDatabaseEntry = found.getSequenceDatabaseEntry();

        assert sequenceDatabaseEntry != null; // this should always be the case because the sequences comes from
        // genbank (blastdb)
        assert sequenceDatabaseEntry.getExternalDatabase() != null;

        BioSequence existing = null;

        existing = bioSequenceService.findByAccession(sequenceDatabaseEntry);

        BioSequence result = null;
        if (existing == null) {
            if (log.isDebugEnabled())
                log.debug("Find (or creating) new sequence " + found);

            result = bioSequenceService.find(found); // there still might be a match.

            if (result == null) {
                result = bioSequenceService.create(found);
            }
        } else {
            result = existing;
        }

        assert result != null;
        // note that no matter what we make sure the database entry is filled in.
        if (force || (StringUtils.isBlank(result.getSequence()) && !StringUtils.isBlank(found.getSequence()))) {
            result = updateExistingWithSequenceData(found, result);
        } else {
            fillInDatabaseEntry(found, result);
        }

        // result = bioSequenceService.thaw( result );

        return result;
    }

    /**
     * When the probe id is in the format ArrayName:ProbeId, just return the ProbeId. For anything else return the
     * entire string.
     * 
     * @param probeId
     * @return
     */
    private String deMangleProbeId(String probeId) {
        String[] toks = StringUtils.split(probeId, ":");
        if (toks.length > 1) {
            return toks[1];
        }
        return probeId;
    }

    /**
     * Unfortunately we have to make sure the database entry is filled in, even if we aren't using 'force'. This seems
     * due in part to an old bug in the system that left these blank. This is our opporunity to fill them in.
     */
    private void fillInDatabaseEntry(BioSequence found, BioSequence existing) {
        if (existing.getSequenceDatabaseEntry() == null) {
            existing.setSequenceDatabaseEntry(found.getSequenceDatabaseEntry());
            bioSequenceService.update(existing);
        }
    }

    /**
     * @param retrievedSequences
     */
    private void fillInGenbank(Collection<BioSequence> retrievedSequences) {
        ExternalDatabase genbank = getGenbank();
        assert genbank.getId() != null;
        for (BioSequence bioSequence : retrievedSequences) {
            if (bioSequence.getSequenceDatabaseEntry() == null) {
                log.warn("No database entry for " + bioSequence);
                continue;
            }
            if (bioSequence.getSequenceDatabaseEntry().getExternalDatabase().getId() == null) {
                bioSequence.getSequenceDatabaseEntry().setExternalDatabase(genbank);
            }
        }
    }

    /**
     * @param identifiersToSearch
     * @param taxon
     * @return
     */
    private Map<String, BioSequence> findLocalSequences(Collection<String> identifiersToSearch, Taxon taxon) {
        Map<String, BioSequence> found = new HashMap<String, BioSequence>();
        for (String id : identifiersToSearch) {
            BioSequence template = BioSequence.Factory.newInstance();
            template.setTaxon(taxon);
            template.setName(id);
            BioSequence seq = bioSequenceService.find(template);
            if (seq != null) {
                found.put(id, seq);
            }
        }
        return found;
    }

    /**
     * Copy sequences into the original versions, or create new sequences in the DB, as needed.
     * 
     * @param accessionsToFetch
     * @param retrievedSequences
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten.
     * @return Items that were found.
     */
    private Map<String, BioSequence> findOrUpdateSequences(Collection<String> accessionsToFetch,
            Collection<BioSequence> retrievedSequences, Taxon taxon, boolean force) {

        Map<String, BioSequence> found = new HashMap<String, BioSequence>();
        for (BioSequence sequence : retrievedSequences) {
            if (log.isDebugEnabled())
                log.debug("Processing retrieved sequence: " + sequence);
            sequence.setTaxon(taxon);
            sequence = createOrUpdateGenbankSequence(sequence, force);
            String accession = sequence.getSequenceDatabaseEntry().getAccession();
            found.put(accession, sequence);
            accessionsToFetch.remove(accession);
        }
        return found;
    }

    /**
     * Copy sequences into the original versions, or create new sequences in the DB, as needed.
     * 
     * @param accessionsToFetch accessions that we need to fill in
     * @param retrievedSequences candidate sequence information for copying into the database.
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten.
     * @param taxa Representing taxa on array
     * @return Items that were found.
     */
    private Map<String, BioSequence> findOrUpdateSequences(Map<String, BioSequence> accessionsToFetch,
            Collection<BioSequence> retrievedSequences, Collection<Taxon> taxa, boolean force) {

        Map<String, BioSequence> found = new HashMap<String, BioSequence>();
        for (Taxon taxon : taxa) {

            if (taxon == null) {
                log.warn("Null taxon ..."); // probably should be an exception
            }
            assert taxon != null;

            boolean warned = false;
            for (BioSequence sequence : retrievedSequences) {
                if (sequence.getTaxon() == null) {
                    if (!warned) {
                        log.warn("Sequence taxon is null [" + sequence + "], copying array taxon " + taxon
                                + " ; further warnings for this array taxon are suppressed.");
                    }
                    warned = true;
                } else if (!sequence.getTaxon().equals(taxon)) {
                    continue;
                }

                sequence.setTaxon(taxon);
                if (sequence.getSequenceDatabaseEntry() == null) {
                    log.warn("Sequence from BLAST db lacks database entry: " + sequence + "; skipping");
                    continue;
                }
                sequence = createOrUpdateGenbankSequence(sequence, force);
                sequence = this.bioSequenceService.thaw(sequence);
                String accession = sequence.getSequenceDatabaseEntry().getAccession();
                found.put(accession, sequence);
                accessionsToFetch.remove(accession);
            }
        }
        return found;
    }

    private void flushBuffer(Collection<BioSequence> bioSequences, Collection<BioSequence> sequenceBuffer,
            Map<String, CompositeSequence> csBuffer) {
        Collection<BioSequence> newOnes = bioSequenceService.findOrCreate(sequenceBuffer);
        bioSequences.addAll(newOnes);
        for (BioSequence sequence : newOnes) {
            CompositeSequence cs = csBuffer.get(sequence.getName());
            assert cs != null;
            cs.setBiologicalCharacteristic(sequence);
        }
        csBuffer.clear();
        sequenceBuffer.clear();
    }

    /**
     * Used to check if an IMAGE clone exists to use for an accession. If the IMAGE clone is used instead, we update the
     * composite sequence.
     * 
     * @param cs
     * @return
     */
    private String getAccession(CompositeSequence cs) {
        BioSequence bs = cs.getBiologicalCharacteristic();
        if (bs.getSequenceDatabaseEntry() == null) {
            return null;
        }
        bs = this.bioSequenceService.thaw(bs);
        return bs.getSequenceDatabaseEntry().getAccession();
    }

    private ExternalDatabase getGenbank() {
        return this.externalDatabaseService.find("Genbank");
    }

    /**
     * @param accessionsToFetch
     * @param found
     * @return
     */
    private Collection<String> getUnFound(Collection<String> accessionsToFetch, Map<String, BioSequence> found) {
        Collection<String> notFound = new HashSet<String>();
        for (String accession : accessionsToFetch) {
            if (!found.containsKey(accession)) {
                notFound.add(accession);
            }
        }
        return notFound;
    }

    /**
     * @param arrayDesign
     * @param accessionsToFetch
     * @param sequenceProvided
     * @param noSequence
     */
    private void informAboutFetchListResults(ArrayDesign arrayDesign, Map<String, BioSequence> accessionsToFetch,
            int sequenceProvided, int noSequence) {
        log.info("Array Design has " + accessionsToFetch.size() + " accessions to fetch for "
                + arrayDesign.getCompositeSequences().size() + " compositeSequences");
        log.info(sequenceProvided + " had sequences already and will not be replaced");
        log.info(noSequence + " have no BioSequence association at all and will not be processed further.");
    }

    /**
     * @param arrayDesign
     * @param accessionsToFetch
     * @param force if true, sequence will be replaced even if it is already there.
     * @return map of biosequence accessions to BioSequences (the existing ones)
     */
    private Map<String, BioSequence> initializeFetchList(ArrayDesign arrayDesign, boolean force) {
        Map<String, BioSequence> accessionsToFetch = new HashMap<String, BioSequence>();
        int sequenceProvided = 0;
        int noSequence = 0;
        boolean warned = false;
        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            BioSequence bs = cs.getBiologicalCharacteristic();
            if (bs == null) {
                warned = warnAboutMissingSequence(noSequence, warned, cs);
                noSequence++;
                continue;
            }

            if (!force && StringUtils.isNotBlank(bs.getSequence())) {
                sequenceProvided++;
                continue;
            }

            String accession = getAccession(cs);

            if (accession == null) {
                if (log.isDebugEnabled())
                    log.debug("No accession for " + cs + ": " + bs);
                continue;
            }

            accessionsToFetch.put(accession, bs);
        }
        informAboutFetchListResults(arrayDesign, accessionsToFetch, sequenceProvided, noSequence);
        return accessionsToFetch;
    }

    /**
     * @param arrayDesign
     * @param notFound
     * @return
     */
    private void logMissingSequences(ArrayDesign arrayDesign, Collection<String> notFound) {
        log.warn(notFound.size() + " sequences were not found (or were already filled in) for " + arrayDesign);
        StringBuilder buf = new StringBuilder();
        buf.append("Missing (or already present) sequences for following accessions " + "at version numbers up to "
                + MAX_VERSION_NUMBER + " : ");
        int i = 0;
        for (String string : notFound) {
            string = string.replaceFirst("\\.\\d$", "");
            buf.append(string + " ");
            if (++i > 10) {
                buf.append("... (skipping logging of rest)");
                break;
            }
        }
        log.info(buf.toString());
    }

    /**
     * @param numWithNoSequence
     * @param compositeSequence
     */
    private void notifyAboutMissingSequences(int numWithNoSequence, CompositeSequence compositeSequence) {
        if (numWithNoSequence == MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS) {
            log.warn("More than " + MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS
                    + " compositeSequences do not have" + " biologicalCharacteristics, skipping further details.");
        } else if (numWithNoSequence < MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS) {
            log.warn("No sequence match for " + compositeSequence + " (Description="
                    + compositeSequence.getDescription() + "); it will not have a biologicalCharacteristic!");
        }
    }

    /**
     * @param sequenceIdentifierFile with two columns: first is probe id, second is genbank accession.
     * @return
     * @throws IOException
     */
    private Map<String, String> parseAccessionFile(InputStream sequenceIdentifierFile) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(sequenceIdentifierFile));

        String line = null;

        StopWatch timer = new StopWatch();
        timer.start();

        Map<String, String> probe2acc = new HashMap<String, String>();
        int count = 0;
        int totalLines = 0;
        while ((line = br.readLine()) != null) {
            String[] fields = line.split("\t");
            ++totalLines;
            if (fields.length < 2) {
                continue;
            }

            String probeName = fields[0];
            String seqAcc = fields[1];

            if (StringUtils.isBlank(seqAcc)) {
                continue;
            }

            probe2acc.put(probeName, seqAcc);
            if (++count % 2000 == 0 && timer.getTime() > 10000) {
                log.info(count + " / " + totalLines + " probes read so far have identifiers");
            }
        }
        br.close();
        log.info(count + " / " + totalLines + " probes have accessions");
        return probe2acc;

    }

    /**
     * If the sequence already exists
     * 
     * @param sequence
     * @return
     */
    private BioSequence persistSequence(BioSequence sequence) {
        return (BioSequence) persisterHelper.persistOrUpdate(sequence);
    }

    /**
     * @param arrayDesign
     * @param sequenceFile; the expected format is described in {@link ProbeSequenceParser}
     * @param taxon
     * @return
     * @see ProbeSequenceParser
     */
    private Collection<BioSequence> processOligoDesign(ArrayDesign arrayDesign, InputStream sequenceFile,
            Taxon taxon) throws IOException {
        checkForCompositeSequences(arrayDesign);

        ProbeSequenceParser parser = new ProbeSequenceParser();
        parser.parse(sequenceFile);

        int total = arrayDesign.getCompositeSequences().size();
        int done = 0;
        int percent = 0;
        taxon = validateTaxon(taxon, arrayDesign);

        log.info("Sequences done, updating composite sequences");

        int numWithNoSequence = 0;

        Collection<BioSequence> res = new HashSet<BioSequence>();
        for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {

            if (log.isTraceEnabled())
                log.trace("Looking for sequence for: " + compositeSequence.getName());
            BioSequence sequence = parser.get(compositeSequence.getName());

            if (sequence != null) {
                // overwrite the existing characteristic if necessary.
                assert sequence.getSequence() != null;
                sequence.setType(SequenceType.OLIGO);
                sequence.setPolymerType(PolymerType.DNA);
                sequence.setTaxon(taxon);
                sequence = persistSequence(sequence);
                compositeSequence.setBiologicalCharacteristic(sequence);
                compositeSequence.setArrayDesign(arrayDesign);
                res.add(sequence);
            } else {
                numWithNoSequence++;
                notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
            }

            if (++done % 1000 == 0) {
                percent = updateProgress(total, done, percent);
            }
        }

        if (numWithNoSequence > 0)
            log.info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size()
                    + " composite sequences with no associated biological characteristic");

        log.info("Updating sequences on arrayDesign");
        arrayDesignService.update(arrayDesign);
        return res;
    }

    /**
     * @param databaseNames
     * @param blastDbHome
     * @param accessionsToFetch
     * @return
     */
    private Collection<BioSequence> searchBlastDbs(String[] databaseNames, String blastDbHome,
            Collection<String> accessionsToFetch) {
        // search the databases.
        FastaCmd fc = new SimpleFastaCmd();
        Collection<BioSequence> retrievedSequences = new HashSet<BioSequence>();
        for (String dbname : databaseNames) {
            Collection<BioSequence> moreBioSequences;
            if (blastDbHome != null) {
                moreBioSequences = fc.getBatchAccessions(accessionsToFetch, dbname, blastDbHome);
            } else {
                moreBioSequences = fc.getBatchAccessions(accessionsToFetch, dbname);
            }

            if (log.isDebugEnabled())
                log.debug(moreBioSequences.size() + " sequences of " + accessionsToFetch.size() + " fetched "
                        + " from " + dbname);
            retrievedSequences.addAll(moreBioSequences);
        }

        fillInGenbank(retrievedSequences);

        return retrievedSequences;
    }

    /**
     * Search for a single accession
     * 
     * @param databaseNames
     * @param blastDbHome
     * @param accessionToFetch
     * @return
     */
    private BioSequence searchBlastDbs(String[] databaseNames, String blastDbHome, String accessionToFetch) {
        FastaCmd fc = new SimpleFastaCmd();
        for (String dbname : databaseNames) {
            BioSequence moreBioSequence;
            if (blastDbHome != null) {
                moreBioSequence = fc.getByAccession(accessionToFetch, dbname, blastDbHome);
            } else {
                moreBioSequence = fc.getByAccession(accessionToFetch, dbname, null);
            }
            if (moreBioSequence != null)
                return moreBioSequence;
        }
        return null;

    }

    /**
     * Replace information in "existing" with data from "found".
     * 
     * @param found
     * @param existing
     * @return
     */
    private BioSequence updateExistingWithSequenceData(BioSequence found, BioSequence existing) {
        assert found != null;
        assert existing != null;

        if (existing.getType() == null)
            existing.setType(found.getType()); // generic...
        existing.setLength(found.getLength());
        assert found.getSequence() != null;

        existing.setSequence(found.getSequence());
        existing.setIsApproximateLength(found.getIsApproximateLength());

        if (existing.getSequenceDatabaseEntry() == null) {
            log.debug("Inserting database entry into existing sequence " + existing);
            existing.setSequenceDatabaseEntry(found.getSequenceDatabaseEntry());
        }

        // This is just for debugging purposes -- some array designs give suspicious sequence information.
        if (existing.getSequence().length() > 10e4) {
            log.warn(existing + " - new sequence is very long for an expression probe ("
                    + existing.getSequence().length() + " bases)");
        }

        bioSequenceService.update(existing);
        if (log.isDebugEnabled())
            log.debug(
                    "Updated " + existing + " with sequence " + StringUtils.abbreviate(existing.getSequence(), 20));

        assert found.getSequenceDatabaseEntry().getExternalDatabase() != null;
        return existing;
    }

    /**
     * FIXME - factor this out, it might be useful elsewhere.
     * 
     * @param totalThingsToDo
     * @param howManyAreDone
     * @param percentDoneLastTimeWeChecked
     * @return
     */
    private int updateProgress(int totalThingsToDo, int howManyAreDone, int percentDoneLastTimeWeChecked) {
        int newPercent = (int) Math.ceil((100.00 * howManyAreDone / totalThingsToDo));
        if (newPercent > percentDoneLastTimeWeChecked) {
            log.info(howManyAreDone + " sequence+probes of " + totalThingsToDo + " processed.");
        }

        return newPercent;
    }

    /**
     * @param noSequence
     * @param warned
     * @param cs
     * @return
     */
    private boolean warnAboutMissingSequence(int noSequence, boolean warned, CompositeSequence cs) {
        if (!warned) {
            if (noSequence < 20) {
                log.warn(cs + " has no biosequence");
            } else {
                log.warn("...More than 20 are missing sequences, details omitted");
                warned = true;
            }
        }
        return warned;
    }

}