ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignSequenceProcessingServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignSequenceProcessingServiceImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.expression.arrayDesign;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.gemma.core.analysis.report.ArrayDesignReportService;
import ubic.gemma.core.analysis.sequence.SequenceManipulation;
import ubic.gemma.core.loader.genome.FastaCmd;
import ubic.gemma.core.loader.genome.FastaParser;
import ubic.gemma.core.loader.genome.ProbeSequenceParser;
import ubic.gemma.core.loader.genome.SimpleFastaCmd;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.PolymerType;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.persistence.persister.Persister;
import ubic.gemma.persistence.service.common.description.ExternalDatabaseService;
import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.persistence.service.genome.biosequence.BioSequenceService;

import java.io.*;
import java.util.*;

/**
 * Handles collapsing the sequences, attaching sequences to DesignElements, either from provided input or via a fetch.
 *
 * @author pavlidis
 */
@Component

public class ArrayDesignSequenceProcessingServiceImpl implements ArrayDesignSequenceProcessingService {

    /**
     * When we encounter two probes with the same name, we add this string along with a unique identifier to the end of
     * the name. This comes into play when the probe name is the sequence name, and the same sequence is used multiple
     * times on the array design.
     */
    public static final String DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR = "___";
    private static final int BATCH_SIZE = 100;
    /**
     * After seeing more than this number of compositeSequences lacking sequences we don't give a detailed warning.
     */
    private static final int MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS = 20;
    private static final Log log = LogFactory.getLog(ArrayDesignSequenceProcessingServiceImpl.class.getName());
    private final ArrayDesignReportService arrayDesignReportService;
    private final ArrayDesignService arrayDesignService;
    private final BioSequenceService bioSequenceService;
    private final ExternalDatabaseService externalDatabaseService;
    private final Persister persisterHelper;

    @Autowired
    public ArrayDesignSequenceProcessingServiceImpl(ArrayDesignReportService arrayDesignReportService,
            ArrayDesignService arrayDesignService, BioSequenceService bioSequenceService,
            ExternalDatabaseService externalDatabaseService, Persister persisterHelper) {
        this.arrayDesignReportService = arrayDesignReportService;
        this.arrayDesignService = arrayDesignService;
        this.bioSequenceService = bioSequenceService;
        this.externalDatabaseService = externalDatabaseService;
        this.persisterHelper = persisterHelper;
    }

    @Override
    public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements,
            Collection<BioSequence> sequences) {

        Map<String, BioSequence> nameMap = new HashMap<>();
        for (BioSequence sequence : sequences) {
            nameMap.put(this.deMangleProbeId(sequence.getName()), sequence);
        }

        int numNotFound = 0;
        for (CompositeSequence designElement : designElements) {
            if (!nameMap.containsKey(designElement.getName())) {
                ArrayDesignSequenceProcessingServiceImpl.log
                        .debug("No sequence matches " + designElement.getName());
                numNotFound++;
                continue;
            }

            designElement.setBiologicalCharacteristic(nameMap.get(designElement.getName()));

        }

        ArrayDesignSequenceProcessingServiceImpl.log
                .info(sequences.size() + " sequences processed for " + designElements.size() + " design elements");
        if (numNotFound > 0) {
            ArrayDesignSequenceProcessingServiceImpl.log.warn(numNotFound + " probes had no matching sequence");
        }
    }

    @Override
    public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, File fastaFile)
            throws IOException {
        try (FileInputStream fis = new FileInputStream(fastaFile)) {
            this.assignSequencesToDesignElements(designElements, fis);
        }
    }

    /**
     * Associate sequences with an array design. It is assumed that the name of the sequences can be matched to the name
     * of a design element. Provided for testing purposes.
     */
    @Override
    public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, InputStream fastaFile)
            throws IOException {
        FastaParser fp = new FastaParser();
        fp.parse(fastaFile);
        Collection<BioSequence> sequences = fp.getResults();
        ArrayDesignSequenceProcessingServiceImpl.log.debug("Parsed " + sequences.size() + " sequences");

        this.assignSequencesToDesignElements(designElements, sequences);
    }

    @Override
    public Collection<BioSequence> processAffymetrixDesign(ArrayDesign arrayDesign, InputStream probeSequenceFile,
            Taxon taxon) throws IOException {

        ArrayDesignSequenceProcessingServiceImpl.log.info("Processing Affymetrix design");
        boolean wasOriginallyLackingCompositeSequences = arrayDesign.getCompositeSequences().size() == 0; // this would be unusual
        log.info("Platform has " + arrayDesign.getCompositeSequences().size() + " elements");

        taxon = this.validateTaxon(taxon, arrayDesign);
        Collection<BioSequence> bioSequences = new HashSet<>();

        int done = 0;
        int percent = 0;

        AffyProbeReader apr = new AffyProbeReader();
        apr.parse(probeSequenceFile);
        Collection<CompositeSequence> compositeSequencesFromProbes = apr.getKeySet();

        int total = compositeSequencesFromProbes.size();

        Map<String, CompositeSequence> quickFindMap = new HashMap<>();
        List<BioSequence> sequenceBuffer = new ArrayList<>();
        Map<String, CompositeSequence> csBuffer = new HashMap<>();
        for (CompositeSequence newCompositeSequence : compositeSequencesFromProbes) {

            // these composite sequences are just use
            newCompositeSequence.setArrayDesign(arrayDesign);
            BioSequence collapsed = SequenceManipulation.collapse(apr.get(newCompositeSequence));
            String sequenceName = newCompositeSequence.getName() + "_collapsed";
            collapsed.setName(sequenceName);
            collapsed.setType(SequenceType.AFFY_COLLAPSED);
            collapsed.setPolymerType(PolymerType.DNA);
            collapsed.setTaxon(taxon);

            if (log.isDebugEnabled()) {
                System.err.println(newCompositeSequence.getName() + " " + collapsed.getSequence() + "\n");
            }

            if (wasOriginallyLackingCompositeSequences) {
                arrayDesign.getCompositeSequences().add(newCompositeSequence);
            } else {
                /*
                 * usual case. If it already exists, we try to update the sequence itself by default. This is generally
                 * safe for affymetrix probes because affy doesn't reuse probe names. These updates actually only affect
                 * the sequence itself in situations where we have a misparse.
                 */
                collapsed = this.persistSequence(collapsed);
                quickFindMap.put(newCompositeSequence.getName(), newCompositeSequence);
            }

            sequenceBuffer.add(collapsed);
            if (csBuffer.containsKey(sequenceName))
                throw new IllegalArgumentException("All probes must have unique names");
            csBuffer.put(sequenceName, newCompositeSequence);
            if (sequenceBuffer.size() == ArrayDesignSequenceProcessingServiceImpl.BATCH_SIZE) {
                this.flushBuffer(bioSequences, sequenceBuffer, csBuffer);
            }

            if (++done % 1000 == 0) {
                percent = this.updateProgress(total, done, percent);
            }
        }

        this.flushBuffer(bioSequences, sequenceBuffer, csBuffer);
        this.updateProgress(total, done, percent);

        /*
         * 
         */
        if (!wasOriginallyLackingCompositeSequences) {
            // usual case.
            percent = 0;
            done = 0;
            int numWithNoSequence = 0;
            for (CompositeSequence originalCompositeSequence : arrayDesign.getCompositeSequences()) {

                CompositeSequence compositeSequenceFromParse = quickFindMap
                        .get(originalCompositeSequence.getName());
                if (compositeSequenceFromParse == null) {
                    numWithNoSequence++;
                    this.notifyAboutMissingSequences(numWithNoSequence, originalCompositeSequence);
                    continue;
                }

                if (log.isDebugEnabled())
                    log.debug(originalCompositeSequence + " matches " + compositeSequenceFromParse + " seq is "
                            + compositeSequenceFromParse.getBiologicalCharacteristic());

                originalCompositeSequence
                        .setBiologicalCharacteristic(compositeSequenceFromParse.getBiologicalCharacteristic());

                assert originalCompositeSequence.getBiologicalCharacteristic().getId() != null;

                originalCompositeSequence.setArrayDesign(compositeSequenceFromParse.getArrayDesign());

                if (++done % 1000 == 0) {
                    percent = this.updateProgress(total, done, percent);
                }
            }
            ArrayDesignSequenceProcessingServiceImpl.log.info(numWithNoSequence + "/"
                    + arrayDesign.getCompositeSequences().size() + " probes could not be matched to a sequence");
        }

        arrayDesign.setAdvertisedNumberOfDesignElements(compositeSequencesFromProbes.size());
        ArrayDesignSequenceProcessingServiceImpl.log.info("Updating " + arrayDesign);

        arrayDesignService.update(arrayDesign);

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        ArrayDesignSequenceProcessingServiceImpl.log.info("Done adding sequence information!");
        return bioSequences;
    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile,
            SequenceType sequenceType) throws IOException {
        return this.processArrayDesign(arrayDesign, sequenceFile, sequenceType, null);
    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile,
            SequenceType sequenceType, Taxon taxon) throws IOException {

        // arrayDesign = arrayDesignService.thawRawAndProcessed( arrayDesign );

        if (sequenceType.equals(SequenceType.AFFY_PROBE)) {
            return this.processAffymetrixDesign(arrayDesign, sequenceFile, taxon);
        } else if (sequenceType.equals(SequenceType.OLIGO)) {
            return this.processOligoDesign(arrayDesign, sequenceFile, taxon);
        }
        taxon = this.validateTaxon(taxon, arrayDesign);

        this.checkForCompositeSequences(arrayDesign);

        FastaParser fastaParser = new FastaParser();
        fastaParser.parse(sequenceFile);
        Collection<BioSequence> bioSequences = fastaParser.getResults();

        // make two maps: one for genbank ids, one for the sequence name.
        Map<String, BioSequence> gbIdMap = new HashMap<>();
        Map<String, BioSequence> nameMap = new HashMap<>();

        int total = bioSequences.size() + arrayDesign.getCompositeSequences().size();
        int done = 0;
        int percent = 0;

        for (BioSequence sequence : bioSequences) {

            sequence.setType(sequenceType);
            sequence.setPolymerType(PolymerType.DNA);
            sequence.setTaxon(taxon);
            sequence = this.persistSequence(sequence);

            this.addToMaps(gbIdMap, nameMap, sequence);

        }

        ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");

        int numWithNoSequence = 0;
        int numMatchedByAccession = 0;
        int numMatchedByProbeName = 0;
        String mungeRegex = ArrayDesignSequenceProcessingServiceImpl.DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR + ".+$";

        for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {

            if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log
                        .trace("Looking for sequence for: " + compositeSequence.getName());

            BioSequence match = null;
            if (nameMap.containsKey(compositeSequence.getName())) {
                match = nameMap.get(compositeSequence.getName());
                numMatchedByProbeName++;
            } else if (compositeSequence.getName().matches(mungeRegex)) {
                String unMungedName = compositeSequence.getName().replaceFirst(mungeRegex, "");
                if (nameMap.containsKey(unMungedName)) {
                    numMatchedByProbeName++;
                    continue;
                }
            } else {
                BioSequence biologicalCharacteristic = compositeSequence.getBiologicalCharacteristic();
                if (biologicalCharacteristic != null) {
                    biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
                    if (biologicalCharacteristic.getSequenceDatabaseEntry() != null && gbIdMap
                            .containsKey(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession())) {
                        match = gbIdMap.get(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession());
                        numMatchedByAccession++;
                    } else {
                        compositeSequence.setBiologicalCharacteristic(null);
                        numWithNoSequence++;
                        this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                    }
                } else {
                    numWithNoSequence++;
                    this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                }
            }

            if (match != null) {
                // overwrite the existing characteristic if necessary.
                compositeSequence.setBiologicalCharacteristic(match);
                compositeSequence.setArrayDesign(arrayDesign);
            }

            if (++done % 1000 == 0) {
                percent = this.updateProgress(total, done, percent);
            }
        }

        ArrayDesignSequenceProcessingServiceImpl.log
                .info(numMatchedByAccession + "/" + arrayDesign.getCompositeSequences().size()
                        + " composite sequences were matched to sequences by Genbank accession");
        ArrayDesignSequenceProcessingServiceImpl.log
                .info(numMatchedByProbeName + "/" + arrayDesign.getCompositeSequences().size()
                        + " composite sequences were matched to sequences by probe name");

        if (numWithNoSequence > 0)
            ArrayDesignSequenceProcessingServiceImpl.log
                    .info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size()
                            + " composite sequences with no associated biological characteristic");

        ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
        arrayDesignService.update(arrayDesign);

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        return bioSequences;

    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceIdentifierFile,
            String[] databaseNames, String blastDbHome, Taxon taxon, boolean force) throws IOException {
        return this.processArrayDesign(arrayDesign, sequenceIdentifierFile, databaseNames, blastDbHome, taxon,
                force, null);
    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceIdentifierFile,
            String[] databaseNames, String blastDbHome, Taxon taxon, boolean force, FastaCmd fc)
            throws IOException {
        this.checkForCompositeSequences(arrayDesign);

        Map<String, String> probe2acc = this.parseAccessionFile(sequenceIdentifierFile);

        // values that were not found
        Collection<String> notFound = new HashSet<>(probe2acc.values());

        // the actual thing values to search for (with version numbers)
        Collection<String> accessionsToFetch = new HashSet<>(probe2acc.values());

        // only 1 taxon should be on array design if taxon not supplied on command line
        taxon = this.validateTaxon(taxon, arrayDesign);

        /*
         * Fill in sequences from BLAST databases.
         */
        int numSwitched = 0;
        if (fc == null)
            fc = new SimpleFastaCmd();

        Collection<BioSequence> retrievedSequences = this.searchBlastDbs(databaseNames, blastDbHome, notFound, fc);

        // map of accessions to sequence.
        Map<String, BioSequence> found = this.findOrUpdateSequences(accessionsToFetch, retrievedSequences, taxon,
                force);

        Collection<BioSequence> finalResult = new HashSet<>(retrievedSequences);

        // replace the sequences.
        numSwitched = this.replaceSequences(arrayDesign, probe2acc, numSwitched, found);

        notFound = this.getUnFound(notFound, found);

        if (!notFound.isEmpty() && taxon != null) {

            Collection<String> stillLooking = new HashSet<>(notFound);
            notFound.clear();

            /*
             * clear the version number.
             */
            for (String accession : stillLooking) {
                notFound.remove(accession);
                accession = accession.replaceFirst("\\.\\d+$", "");
                notFound.add(accession);
            }
            assert notFound.size() > 0;
            /*
             * See if they're already in Gemma. This is good for sequences that are not in genbank but have been loaded
             * previously.
             */
            found = this.findLocalSequences(notFound, taxon);
            finalResult.addAll(found.values());

            numSwitched = this.replaceSequences(arrayDesign, probe2acc, numSwitched, found);
            notFound = this.getUnFound(notFound, found);
        }

        if (!notFound.isEmpty()) {
            this.logMissingSequences(arrayDesign, notFound);
        }

        ArrayDesignSequenceProcessingServiceImpl.log
                .info(numSwitched + " composite sequences had their biologicalCharacteristics changed");

        arrayDesignService.update(arrayDesign);

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        return finalResult;

    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames,
            boolean force) {
        return this.processArrayDesign(arrayDesign, databaseNames, null, force);
    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames,
            String blastDbHome, boolean force) {
        return this.processArrayDesign(arrayDesign, databaseNames, blastDbHome, force, null);
    }

    @Override
    public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames,
            String blastDbHome, boolean force, FastaCmd fc) {

        Map<String, BioSequence> accessionsToFetch = this.initializeFetchList(arrayDesign, force);

        if (accessionsToFetch.size() == 0) {
            ArrayDesignSequenceProcessingServiceImpl.log.info("No accessions to fetch, no processing will be done");
            return null;
        }

        Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());
        // not taxon found
        if (taxaOnArray.size() == 0) {
            throw new IllegalArgumentException(
                    taxaOnArray.size() + " taxon found for " + arrayDesign + "please specify which taxon to run");
        }

        Collection<String> notFound = accessionsToFetch.keySet();

        if (fc == null)
            fc = new SimpleFastaCmd();
        Collection<BioSequence> retrievedSequences = this.searchBlastDbs(databaseNames, blastDbHome, notFound, fc);

        Map<String, BioSequence> found = this.findOrUpdateSequences(accessionsToFetch, retrievedSequences,
                taxaOnArray, force);

        Collection<BioSequence> finalResult = new HashSet<>(found.values());
        notFound = this.getUnFound(notFound, found);

        if (!notFound.isEmpty()) {
            this.logMissingSequences(arrayDesign, notFound);
        }

        ArrayDesignSequenceProcessingServiceImpl.log.info(finalResult.size() + " sequences found");
        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

        return finalResult;

    }

    /**
     * Update a single sequence in the system.
     *
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten.
     * @return persistent BioSequence.
     */
    @Override
    public BioSequence processSingleAccession(String sequenceId, String[] databaseNames, String blastDbHome,
            boolean force) {
        BioSequence found = this.searchBlastDbs(databaseNames, blastDbHome, sequenceId, new SimpleFastaCmd());
        if (found == null)
            return null;
        return this.createOrUpdateGenbankSequence(found, force);

    }

    /**
     * If taxon is null then it has not been provided on the command line, then deduce the taxon from the arrayDesign.
     * If there are 0 or more than one taxon on the array design throw an error as this programme can only be run for 1
     * taxon at a time if processing from a file.
     *
     * @param taxon Taxon as passed in on the command line
     * @param arrayDesign Array design to process
     * @return taxon Taxon to process
     * @throws IllegalArgumentException Thrown when there is not exactly 1 taxon.
     */
    @Override
    public Taxon validateTaxon(Taxon taxon, ArrayDesign arrayDesign) throws IllegalArgumentException {

        if (arrayDesign == null) {
            throw new IllegalArgumentException("Array design cannot be null");
        }

        if (taxon == null) {
            Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());

            if (taxaOnArray.size() == 1 && taxaOnArray.iterator().next() != null) {
                return taxaOnArray.iterator().next();
            }
            throw new IllegalArgumentException(
                    taxaOnArray.size() + " taxa found for " + arrayDesign + "please specify which taxon to run");
        }
        return taxon;
    }

    private int replaceSequences(ArrayDesign arrayDesign, Map<String, String> probe2acc, int numSwitched,
            Map<String, BioSequence> found) {
        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            String probeName = cs.getName();
            String acc = probe2acc.get(probeName);
            if (found.containsKey(acc)) {
                numSwitched++;
                ArrayDesignSequenceProcessingServiceImpl.log
                        .debug("Setting seq. for " + cs + " to " + found.get(acc));
                cs.setBiologicalCharacteristic(found.get(acc));
            }
        }
        return numSwitched;
    }

    private void addToMaps(Map<String, BioSequence> gbIdMap, Map<String, BioSequence> nameMap,
            BioSequence sequence) {
        nameMap.put(this.deMangleProbeId(sequence.getName()), sequence);

        if (sequence.getSequenceDatabaseEntry() != null) {
            gbIdMap.put(sequence.getSequenceDatabaseEntry().getAccession(), sequence);
        } else {
            if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log
                        .trace("No sequence database entry for " + sequence.getName());
        }
    }

    private void checkForCompositeSequences(ArrayDesign arrayDesign) {
        boolean wasOriginallyLackingCompositeSequences = arrayDesign.getCompositeSequences().size() == 0;

        if (wasOriginallyLackingCompositeSequences) {
            throw new IllegalArgumentException(
                    "You need to pass in an array design that already has compositeSequences filled in.");
        }
    }

    /**
     * @param found a new (non-persistent) biosequence that can be used to create a new entry or update an existing one
     *        with the sequence. The sequence would have come from Genbank.
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten. Otherwise, the sequence will only be updated if the
     *        actual sequence information was missing in our DB and 'found' has a sequence.
     * @return persistent BioSequence.
     */
    private BioSequence createOrUpdateGenbankSequence(BioSequence found, boolean force) {
        assert found != null;
        DatabaseEntry sequenceDatabaseEntry = found.getSequenceDatabaseEntry();

        assert sequenceDatabaseEntry != null; // this should always be the case because the sequences comes from
        // genbank (blastDb)
        assert sequenceDatabaseEntry.getExternalDatabase() != null;

        BioSequence existing;

        existing = bioSequenceService.findByAccession(sequenceDatabaseEntry);

        BioSequence result;
        if (existing == null) {
            if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log.debug("Find (or creating) new sequence " + found);

            result = bioSequenceService.find(found); // there still might be a match.

            if (result == null) {
                result = bioSequenceService.create(found);
            }
        } else {
            result = existing;
        }

        assert result != null;
        // note that no matter what we make sure the database entry is filled in.
        if (force || (StringUtils.isBlank(result.getSequence()) && !StringUtils.isBlank(found.getSequence()))) {
            result = this.updateExistingWithSequenceData(found, result);
        } else {
            this.fillInDatabaseEntry(found, result);
        }

        // result = bioSequenceService.thawRawAndProcessed( result );

        return result;
    }

    /**
     * When the probe id is in the format ArrayName:ProbeId, just return the ProbeId. For anything else return the
     * entire string.
     */
    private String deMangleProbeId(String probeId) {
        String[] split = StringUtils.split(probeId, ":");
        if (split.length > 1) {
            return split[1];
        }
        return probeId;
    }

    /**
     * Unfortunately we have to make sure the database entry is filled in, even if we aren't using 'force'. This seems
     * due in part to an old bug in the system that left these blank. This is our opportunity to fill them in.
     */
    private void fillInDatabaseEntry(BioSequence found, BioSequence existing) {
        if (existing.getSequenceDatabaseEntry() == null) {
            existing.setSequenceDatabaseEntry(found.getSequenceDatabaseEntry());
            bioSequenceService.update(existing);
        }
    }

    private void fillInGenbank(Collection<BioSequence> retrievedSequences) {
        ExternalDatabase genbank = this.getGenbank();
        assert genbank.getId() != null;
        for (BioSequence bioSequence : retrievedSequences) {
            if (bioSequence.getSequenceDatabaseEntry() == null) {
                ArrayDesignSequenceProcessingServiceImpl.log.warn("No database entry for " + bioSequence);
                continue;
            }
            if (bioSequence.getSequenceDatabaseEntry().getExternalDatabase().getId() == null) {
                bioSequence.getSequenceDatabaseEntry().setExternalDatabase(genbank);
            }
        }
    }

    private Map<String, BioSequence> findLocalSequences(Collection<String> identifiersToSearch, Taxon taxon) {
        Map<String, BioSequence> found = new HashMap<>();
        for (String id : identifiersToSearch) {
            BioSequence template = BioSequence.Factory.newInstance();
            template.setTaxon(taxon);
            template.setName(id);
            BioSequence seq = bioSequenceService.find(template);
            if (seq != null) {
                found.put(id, seq);
            }
        }
        return found;
    }

    /**
     * Copy sequences into the original versions, or create new sequences in the DB, as needed.
     *
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten.
     * @return Items that were found.
     */
    private Map<String, BioSequence> findOrUpdateSequences(Collection<String> accessionsToFetch,
            Collection<BioSequence> retrievedSequences, Taxon taxon, boolean force) {

        Map<String, BioSequence> found = new HashMap<>();
        for (BioSequence sequence : retrievedSequences) {
            if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log.debug("Processing retrieved sequence: " + sequence);
            sequence.setTaxon(taxon);
            sequence = this.createOrUpdateGenbankSequence(sequence, force);
            String accession = sequence.getSequenceDatabaseEntry().getAccession();
            found.put(accession, sequence);
            accessionsToFetch.remove(accession);
        }
        return found;
    }

    /**
     * Copy sequences into the original versions, or create new sequences in the DB, as needed.
     *
     * @param accessionsToFetch accessions that we need to fill in
     * @param retrievedSequences candidate sequence information for copying into the database.
     * @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
     *        information in the BioSequence will be overwritten.
     * @param taxa Representing taxa on array
     * @return Items that were found.
     */
    private Map<String, BioSequence> findOrUpdateSequences(Map<String, BioSequence> accessionsToFetch,
            Collection<BioSequence> retrievedSequences, Collection<Taxon> taxa, boolean force) {

        Map<String, BioSequence> found = new HashMap<>();
        for (Taxon taxon : taxa) {

            if (taxon == null) {
                ArrayDesignSequenceProcessingServiceImpl.log.warn("Null taxon ..."); // probably should be an exception
            }
            assert taxon != null;

            boolean warned = false;
            for (BioSequence sequence : retrievedSequences) {
                if (sequence.getTaxon() == null) {
                    if (!warned) {
                        ArrayDesignSequenceProcessingServiceImpl.log
                                .warn("Sequence taxon is null [" + sequence + "], copying array taxon " + taxon
                                        + " ; further warnings for this array taxon are suppressed.");
                    }
                    warned = true;
                } else if (!sequence.getTaxon().equals(taxon)) {
                    continue;
                }

                sequence.setTaxon(taxon);
                if (sequence.getSequenceDatabaseEntry() == null) {
                    ArrayDesignSequenceProcessingServiceImpl.log
                            .warn("Sequence from BLAST db lacks database entry: " + sequence + "; skipping");
                    continue;
                }
                sequence = this.createOrUpdateGenbankSequence(sequence, force);
                sequence = this.bioSequenceService.thaw(sequence);
                String accession = sequence.getSequenceDatabaseEntry().getAccession();
                found.put(accession, sequence);
                accessionsToFetch.remove(accession);
            }
        }
        return found;
    }

    /**
     * for affymetrix processing
     * 
     * @param bioSequences bio sequences
     * @param sequenceBuffer sequence buffer
     * @param csBuffer cs buffer
     */
    private void flushBuffer(Collection<BioSequence> bioSequences, Collection<BioSequence> sequenceBuffer,
            Map<String, CompositeSequence> csBuffer) {
        Collection<BioSequence> newOnes = bioSequenceService.findOrCreate(sequenceBuffer);
        bioSequences.addAll(newOnes);
        for (BioSequence sequence : newOnes) {
            CompositeSequence cs = csBuffer.get(sequence.getName());
            assert cs != null;
            if (log.isDebugEnabled()) {
                log.debug("Updating " + cs + " to sequence " + sequence + ": " + sequence.getSequence());
            }
            cs.setBiologicalCharacteristic(sequence);
        }
        csBuffer.clear();
        sequenceBuffer.clear();
    }

    /**
     * Used to check if an IMAGE clone exists to use for an accession. If the IMAGE clone is used instead, we update the
     * composite sequence.
     */
    private String getAccession(CompositeSequence cs) {
        BioSequence bs = cs.getBiologicalCharacteristic();
        if (bs.getSequenceDatabaseEntry() == null) {
            return null;
        }
        bs = this.bioSequenceService.thaw(bs);
        return bs.getSequenceDatabaseEntry().getAccession();
    }

    private ExternalDatabase getGenbank() {
        return this.externalDatabaseService.findByName("Genbank");
    }

    private Collection<String> getUnFound(Collection<String> accessionsToFetch, Map<String, BioSequence> found) {
        Collection<String> notFound = new HashSet<>();
        for (String accession : accessionsToFetch) {
            if (!found.containsKey(accession)) {
                notFound.add(accession);
            }
        }
        return notFound;
    }

    private void informAboutFetchListResults(ArrayDesign arrayDesign, Map<String, BioSequence> accessionsToFetch,
            int sequenceProvided, int noSequence) {
        ArrayDesignSequenceProcessingServiceImpl.log.info("Array Design has " + accessionsToFetch.size()
                + " accessions to fetch for " + arrayDesign.getCompositeSequences().size() + " compositeSequences");
        ArrayDesignSequenceProcessingServiceImpl.log
                .info(sequenceProvided + " had sequences already and will not be replaced");
        ArrayDesignSequenceProcessingServiceImpl.log
                .info(noSequence + " have no BioSequence association at all and will not be processed further.");
    }

    /**
     * @param force if true, sequence will be replaced even if it is already there.
     * @return map of biosequence accessions to BioSequences (the existing ones)
     */
    private Map<String, BioSequence> initializeFetchList(ArrayDesign arrayDesign, boolean force) {
        Map<String, BioSequence> accessionsToFetch = new HashMap<>();
        int sequenceProvided = 0;
        int noSequence = 0;
        boolean warned = false;
        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            BioSequence bs = cs.getBiologicalCharacteristic();
            if (bs == null) {
                warned = this.warnAboutMissingSequence(noSequence, warned, cs);
                noSequence++;
                continue;
            }

            if (!force && StringUtils.isNotBlank(bs.getSequence())) {
                sequenceProvided++;
                continue;
            }

            String accession = this.getAccession(cs);

            if (accession == null) {
                if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled())
                    ArrayDesignSequenceProcessingServiceImpl.log.debug("No accession for " + cs + ": " + bs);
                continue;
            }

            accessionsToFetch.put(accession, bs);
        }
        this.informAboutFetchListResults(arrayDesign, accessionsToFetch, sequenceProvided, noSequence);
        return accessionsToFetch;
    }

    private void logMissingSequences(ArrayDesign arrayDesign, Collection<String> notFound) {
        ArrayDesignSequenceProcessingServiceImpl.log
                .warn(notFound.size() + " sequences were not found (or were already filled in) for " + arrayDesign);
        StringBuilder buf = new StringBuilder();
        buf.append("Missing (or already present) sequences for following accessions : ");
        int i = 0;
        for (String string : notFound) {
            string = string.replaceFirst("\\.\\d$", "");
            buf.append(string).append(" ");
            if (++i > 10) {
                buf.append("... (skipping logging of rest)");
                break;
            }
        }
        ArrayDesignSequenceProcessingServiceImpl.log.info(buf.toString());
    }

    private void notifyAboutMissingSequences(int numWithNoSequence, CompositeSequence compositeSequence) {
        if (numWithNoSequence == ArrayDesignSequenceProcessingServiceImpl.MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS) {
            ArrayDesignSequenceProcessingServiceImpl.log.warn("More than "
                    + ArrayDesignSequenceProcessingServiceImpl.MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS
                    + " compositeSequences do not have" + " biologicalCharacteristics, skipping further details.");
        } else if (numWithNoSequence < ArrayDesignSequenceProcessingServiceImpl.MAX_NUM_WITH_NO_SEQUENCE_FOR_DETAILED_WARNINGS) {
            ArrayDesignSequenceProcessingServiceImpl.log.warn("No sequence match for " + compositeSequence
                    + " (Description=" + compositeSequence.getDescription()
                    + "); it will not have a biologicalCharacteristic!");
        }
    }

    /**
     * @param sequenceIdentifierFile with two columns: first is probe id, second is genbank accession.
     */
    private Map<String, String> parseAccessionFile(InputStream sequenceIdentifierFile) throws IOException {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(sequenceIdentifierFile))) {

            String line;

            StopWatch timer = new StopWatch();
            timer.start();

            Map<String, String> probe2acc = new HashMap<>();
            int count = 0;
            int totalLines = 0;
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\t");
                ++totalLines;
                if (fields.length < 2) {
                    continue;
                }

                String probeName = fields[0];
                String seqAcc = fields[1];

                if (StringUtils.isBlank(seqAcc)) {
                    continue;
                }

                probe2acc.put(probeName, seqAcc);
                if (++count % 2000 == 0 && timer.getTime() > 10000) {
                    ArrayDesignSequenceProcessingServiceImpl.log
                            .info(count + " / " + totalLines + " probes read so far have identifiers");
                }
            }
            ArrayDesignSequenceProcessingServiceImpl.log
                    .info(count + " / " + totalLines + " probes have accessions");
            return probe2acc;
        }

    }

    /**
     * If the sequence already exists
     */
    private BioSequence persistSequence(BioSequence sequence) {
        return (BioSequence) persisterHelper.persistOrUpdate(sequence);
    }

    /**
     * @param sequenceFile; the expected format is described in {@link ProbeSequenceParser}
     * @see ProbeSequenceParser
     */
    private Collection<BioSequence> processOligoDesign(ArrayDesign arrayDesign, InputStream sequenceFile,
            Taxon taxon) throws IOException {
        this.checkForCompositeSequences(arrayDesign);

        ProbeSequenceParser parser = new ProbeSequenceParser();
        parser.parse(sequenceFile);

        int total = arrayDesign.getCompositeSequences().size();
        int done = 0;
        int percent = 0;
        taxon = this.validateTaxon(taxon, arrayDesign);

        ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");

        int numWithNoSequence = 0;

        Collection<BioSequence> res = new HashSet<>();
        for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {

            if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log
                        .trace("Looking for sequence for: " + compositeSequence.getName());
            BioSequence sequence = parser.get(compositeSequence.getName());

            if (sequence != null) {
                // overwrite the existing characteristic if necessary.
                assert sequence.getSequence() != null;
                sequence.setType(SequenceType.OLIGO);
                sequence.setPolymerType(PolymerType.DNA);
                sequence.setTaxon(taxon);
                sequence = this.persistSequence(sequence);
                compositeSequence.setBiologicalCharacteristic(sequence);
                compositeSequence.setArrayDesign(arrayDesign);
                res.add(sequence);
            } else {
                numWithNoSequence++;
                this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
            }

            if (++done % 1000 == 0) {
                percent = this.updateProgress(total, done, percent);
            }
        }

        if (numWithNoSequence > 0)
            ArrayDesignSequenceProcessingServiceImpl.log
                    .info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size()
                            + " composite sequences with no associated biological characteristic");

        ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
        arrayDesignService.update(arrayDesign);
        return res;
    }

    private Collection<BioSequence> searchBlastDbs(String[] databaseNames, String blastDbHome,
            Collection<String> accessionsToFetch, FastaCmd fc) {

        Collection<BioSequence> retrievedSequences = new HashSet<>();
        for (String dbName : databaseNames) {
            Collection<BioSequence> moreBioSequences;
            if (blastDbHome != null) {
                moreBioSequences = fc.getBatchAccessions(accessionsToFetch, dbName, blastDbHome);
            } else {
                moreBioSequences = fc.getBatchAccessions(accessionsToFetch, dbName);
            }

            if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log.debug(moreBioSequences.size() + " sequences of "
                        + accessionsToFetch.size() + " fetched " + " from " + dbName);
            retrievedSequences.addAll(moreBioSequences);
        }

        this.fillInGenbank(retrievedSequences);

        return retrievedSequences;
    }

    /**
     * Search for a single accession
     */
    private BioSequence searchBlastDbs(String[] databaseNames, String blastDbHome, String accessionToFetch,
            FastaCmd fc) {

        for (String dbName : databaseNames) {
            BioSequence moreBioSequence;
            if (blastDbHome != null) {
                moreBioSequence = fc.getByAccession(accessionToFetch, dbName, blastDbHome);
            } else {
                moreBioSequence = fc.getByAccession(accessionToFetch, dbName, null);
            }
            if (moreBioSequence != null)
                return moreBioSequence;
        }
        return null;

    }

    /**
     * Replace information in "existing" with data from "found".
     */
    private BioSequence updateExistingWithSequenceData(BioSequence found, BioSequence existing) {
        assert found != null;
        assert existing != null;

        if (existing.getType() == null)
            existing.setType(found.getType()); // generic...
        existing.setLength(found.getLength());
        assert found.getSequence() != null;

        existing.setSequence(found.getSequence());
        existing.setIsApproximateLength(found.getIsApproximateLength());

        if (existing.getSequenceDatabaseEntry() == null) {
            ArrayDesignSequenceProcessingServiceImpl.log
                    .debug("Inserting database entry into existing sequence " + existing);
            existing.setSequenceDatabaseEntry(found.getSequenceDatabaseEntry());
        }

        // This is just for debugging purposes -- some array designs give suspicious sequence information.
        if (existing.getSequence().length() > 10e4) {
            ArrayDesignSequenceProcessingServiceImpl.log
                    .warn(existing + " - new sequence is very long for an expression probe ("
                            + existing.getSequence().length() + " bases)");
        }

        bioSequenceService.update(existing);
        if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled())
            ArrayDesignSequenceProcessingServiceImpl.log.debug(
                    "Updated " + existing + " with sequence " + StringUtils.abbreviate(existing.getSequence(), 20));

        assert found.getSequenceDatabaseEntry().getExternalDatabase() != null;
        return existing;
    }

    private int updateProgress(int totalThingsToDo, int howManyAreDone, int percentDoneLastTimeWeChecked) {
        int newPercent = (int) Math.ceil((100.00 * howManyAreDone / totalThingsToDo));
        if (newPercent > percentDoneLastTimeWeChecked) {
            ArrayDesignSequenceProcessingServiceImpl.log
                    .info(howManyAreDone + " sequence+probes of " + totalThingsToDo + " processed.");
        }

        return newPercent;
    }

    private boolean warnAboutMissingSequence(int noSequence, boolean warned, CompositeSequence cs) {
        if (!warned) {
            if (noSequence < 20) {
                ArrayDesignSequenceProcessingServiceImpl.log.warn(cs + " has no biosequence");
            } else {
                ArrayDesignSequenceProcessingServiceImpl.log
                        .warn("...More than 20 are missing sequences, details omitted");
                warned = true;
            }
        }
        return warned;
    }

}