ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2008 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.expression.simple;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import ubic.basecode.ontology.model.OntologyTerm;
import ubic.gemma.datastructure.matrix.ExpressionDataWriterUtils;
import ubic.gemma.expression.experiment.service.ExperimentalDesignService;
import ubic.gemma.expression.experiment.service.ExpressionExperimentService;
import ubic.gemma.model.association.GOEvidenceCode;
import ubic.gemma.model.common.description.VocabCharacteristic;
import ubic.gemma.model.common.measurement.Measurement;
import ubic.gemma.model.common.measurement.MeasurementType;
import ubic.gemma.model.common.quantitationtype.PrimitiveType;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.biomaterial.BioMaterial;
import ubic.gemma.model.expression.biomaterial.BioMaterialService;
import ubic.gemma.model.expression.experiment.ExperimentalDesign;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.FactorType;
import ubic.gemma.model.expression.experiment.FactorValue;
import ubic.gemma.model.expression.experiment.FactorValueService;
import ubic.gemma.ontology.OntologyService;
import ubic.gemma.ontology.providers.MgedOntologyService;

/**
 * See interface for docs.
 * 
 * @author Paul
 * @version $Id: ExperimentalDesignImporterImpl.java,v 1.21 2013/05/03 00:17:35 paul Exp $
 */
@Service
public class ExperimentalDesignImporterImpl implements ExperimentalDesignImporter {

    // Number of columns, beyond one per experimental factor, that the validate* methods tolerate in the header
    // and in each data line (the comments there mention the id column and the external id column).
    private static final int NUMBER_OF_EXTRA_COLUMNS_ALLOWED = 2;

    // Prefix of a line that describes an experimental factor, e.g. "#$Run time : Category=... Type=...".
    public static final String EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR = "#$";

    private static Log log = LogFactory.getLog(ExperimentalDesignImporterImpl.class.getName());

    // Used to fetch the experiment's biomaterials and to update them once factor values have been attached.
    @Autowired
    private BioMaterialService bioMaterialService;

    // Used to update the experimental design after the new factors have been added to it.
    @Autowired
    private ExperimentalDesignService experimentalDesignService;

    // Source of the MGED ontology service, which is fetched at the start of every import.
    @Autowired
    private OntologyService ontologyService;

    // NOTE(review): not referenced by any method visible in this class; package-private and explicitly
    // initialised to null before injection.
    @Autowired
    FactorValueService factorValueServiceService = null;

    // NOTE(review): not referenced by any method visible in this class; package-private.
    @Autowired
    ExpressionExperimentService expressionExperimentService;

    // Assigned from ontologyService at the start of importDesign(experiment, is, dryRun); not injected directly.
    private MgedOntologyService mgedOntologyService;

    /**
     * Imports the design described by the stream as a regular (non-dry) run. This is shorthand for
     * {@link #importDesign(ExpressionExperiment, InputStream, boolean)} with {@code dryRun} set to
     * {@code false}.
     * 
     * @see ubic.gemma.loader.expression.simple.ExperimentalDesignImporter#importDesign(ubic.gemma.model.expression.experiment
     *      .ExpressionExperiment, java.io.InputStream)
     */
    @Override
    public void importDesign(ExpressionExperiment experiment, InputStream is) throws IOException {
        final boolean dryRun = false;
        importDesign(experiment, is, dryRun);
    }

    /**
     * Parses an experimental design file and attaches the factors and factor values it describes to the
     * experiment's design and biomaterials.
     * <p>
     * Lines are classified as follows: lines starting with
     * {@link #EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR} describe experimental factors; any other line
     * starting with '#' and every blank line is ignored; the first remaining line is the sample header; all
     * further lines hold a biomaterial name followed by its factor values.
     * <p>
     * The stream is read to its end but is not closed here; closing it is left to the caller. The stream is
     * decoded with the platform default charset.
     * 
     * @param experiment the experiment whose design and biomaterials are to be populated
     * @param is tab-delimited design file
     * @param dryRun if true, the file is parsed and validated, but neither the design nor the biomaterials are
     *        modified or updated. NOTE(review): the parameter was previously ignored; confirm these semantics
     *        against the interface documentation.
     * @throws IOException if the stream cannot be read or the file fails format validation
     * @throws IllegalStateException if the MGED ontology service is unavailable or the experiment has no
     *         experimental design
     * @throws IllegalArgumentException if a biomaterial named in the file does not match the experiment
     * @see ubic.gemma.loader.expression.simple.ExperimentalDesignImporter#importDesign(ubic.gemma.model.expression.experiment
     *      .ExpressionExperiment, java.io.InputStream, boolean)
     */
    @Override
    public void importDesign(ExpressionExperiment experiment, InputStream is, boolean dryRun) throws IOException {
        this.mgedOntologyService = this.ontologyService.getMgedOntologyService();
        if (mgedOntologyService == null) {
            throw new IllegalStateException("Please set the MGED OntologyService, thanks.");
        }

        ExperimentalDesign experimentalDesign = experiment.getExperimentalDesign();
        if (experimentalDesign == null) {
            throw new IllegalStateException(
                    "Experiment has no experimental design to import into: " + experiment);
        }

        if (experimentalDesign.getExperimentalFactors() != null
                && !experimentalDesign.getExperimentalFactors().isEmpty()) {
            log.warn("Experimental design already has factors, import will add new ones");
        }

        log.debug("Parsing input file");

        List<String> experimentalFactorLines = new ArrayList<String>();
        String sampleHeaderLine = "";
        List<String> factorValueLines = new ArrayList<String>();
        boolean readHeader = false;

        // The reader is deliberately not closed: that would also close the caller's stream.
        BufferedReader r = new BufferedReader(new InputStreamReader(is));
        String line;
        while ((line = r.readLine()) != null) {
            if (line.startsWith(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) {
                experimentalFactorLines.add(line);
            } else if (line.startsWith("#") || StringUtils.isBlank(line)) {
                continue;
            } else if (!readHeader) {
                sampleHeaderLine = line;
                readHeader = true;
            } else {
                factorValueLines.add(line);
            }
        }
        String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t");

        Collection<BioMaterial> experimentBioMaterials = this.bioMaterialService.findByExperiment(experiment);

        validateFileComponents(experimentalFactorLines, sampleHeaderLine, factorValueLines);
        validateExperimentalFactorFileContent(experimentalFactorLines, sampleHeaderLine);
        validateFactorFileContent(experimentalFactorLines.size(), factorValueLines);
        validateBioMaterialFileContent(experiment, experimentBioMaterials, factorValueLines);

        if (dryRun) {
            // Validation is all a dry run does; stop before anything is modified or updated.
            log.debug("Dry run: design file validated, nothing was changed");
            return;
        }

        // Only touch the design once the file is known to be valid.
        experimentalDesign.setDescription("Parsed from file.");

        // build up the composite: create experimental factor then add the experimental value
        addExperimentalFactorsToExperimentalDesign(experimentalDesign, experimentalFactorLines, headerFields,
                factorValueLines);

        experimentalDesignService.update(experimentalDesign);

        // a bit tricky as there is an assumption that the first biomaterial in the bioassay set is the relevant one;
        // safer to use biomaterial collection returned; cannot guarantee order of objects in collection.
        Collection<BioMaterial> bioMaterialsWithFactorValues = addFactorValuesToBioMaterialsInExpressionExperiment(
                experiment, experimentBioMaterials, experimentalDesign, factorValueLines, headerFields);

        for (BioMaterial bioMaterial : bioMaterialsWithFactorValues) {
            this.bioMaterialService.update(bioMaterial);

            // just a debugging sanity check; the reload is skipped entirely unless debug logging is on.
            if (log.isDebugEnabled()) {
                BioMaterial bbm = this.bioMaterialService.load(bioMaterial.getId());
                log.debug(bbm + ": " + bbm.getFactorValues().size() + " factor values: "
                        + StringUtils.join(bbm.getFactorValues(), " ; "));
            }
        }

    }

    /**
     * Reads the experimental factor description lines, e.g.
     * {@code #$Run time : Category=EnvironmentalHistory Type=categorical}, creates an experimental factor (with
     * its factor values) for each one and adds it to the experimental design unless the design already has a
     * factor of the same name.
     * <p>
     * Blocks until the MGED ontology reports that it is loaded, polling once per second and giving up after
     * roughly 100 polls.
     * 
     * @param experimentalDesign Experimental design for this expression experiment
     * @param experimentalFactorFileLines List of strings representing lines from input file containing experimental
     *        factors
     * @param headerFields Sample header line split on tab.
     * @param factorValueLines Lines containing biomaterial names and their factor values
     * @throws RuntimeException if the ontology does not load in time or the wait is interrupted
     * @throws IllegalArgumentException if a description lacks the {@code key=value} form or names a category
     *         with no matching MGED term
     */
    private void addExperimentalFactorsToExperimentalDesign(ExperimentalDesign experimentalDesign,
            List<String> experimentalFactorFileLines, String[] headerFields, List<String> factorValueLines) {

        int maxWait = 0;
        while (!mgedOntologyService.isOntologyLoaded()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Restore the flag and abort: carrying on would either spin or ignore the request to stop.
                Thread.currentThread().interrupt();
                throw new RuntimeException("Interrupted while waiting for MGED to load", e);
            }
            if (maxWait++ > 100) {
                throw new RuntimeException("MGED is not loaded and gave up waiting");
            }
        }

        log.info("Addding experimental factors to experimental design: " + experimentalDesign.getId());

        Collection<OntologyTerm> terms = mgedOntologyService.getMgedTermsByKey("factor");
        if (experimentalDesign.getExperimentalFactors() == null) {
            experimentalDesign.setExperimentalFactors(new HashSet<ExperimentalFactor>());
        }

        Map<String, Set<String>> mapFactorSampleValues = getMapFactorSampleValues(headerFields, factorValueLines);

        for (String experimentalFactorFileLine : experimentalFactorFileLines) {

            // $Run time : Category=EnvironmentalHistory Type=categorical
            String[] experimentalFactorfields = experimentalFactorFileLine.split(":");

            // Text before the colon, minus the "#$" indicator, is the factor's name.
            String factorName = (StringUtils.strip(experimentalFactorfields[0]
                    .replaceFirst(Pattern.quote(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR) + "\\s*", "")))
                            .trim();
            String categoryAndType = StringUtils.strip(experimentalFactorfields[1]);
            String[] categoryAndTypeFields = StringUtils.split(categoryAndType);

            // e.g. Category=EnvironmentalHistory
            String[] categoryParts = StringUtils.split(categoryAndTypeFields[0], "=");
            // e.g. Type=categorical
            String[] typeParts = StringUtils.split(categoryAndTypeFields[1], "=");
            if (categoryParts.length < 2 || typeParts.length < 2) {
                throw new IllegalArgumentException(
                        "EF details should have the format 'Category=CATEGORY Type=TYPE' ("
                                + experimentalFactorFileLine + ")");
            }

            // e.g. EnvironmentalHistory
            String categoryValue = categoryParts[1];
            // e.g. categorical
            String factorType = typeParts[1];

            ExperimentalFactor experimentalFactorFromFile = ExperimentalFactor.Factory.newInstance();
            experimentalFactorFromFile.setExperimentalDesign(experimentalDesign);
            VocabCharacteristic vc = mgedLookup(categoryValue, terms);

            experimentalFactorFromFile.setCategory(vc);
            experimentalFactorFromFile.setName(factorName);
            experimentalFactorFromFile.setDescription(factorName);
            // Anything other than "categorical" (ignoring case) is treated as continuous.
            experimentalFactorFromFile.setType(
                    factorType.equalsIgnoreCase("CATEGORICAL") ? FactorType.CATEGORICAL : FactorType.CONTINUOUS);

            addFactorValuesToExperimentalFactor(experimentalFactorFromFile, mapFactorSampleValues, factorType);

            if (!checkForDuplicateExperimentalFactorOnExperimentalDesign(experimentalDesign,
                    experimentalFactorFromFile)) {
                experimentalDesign.getExperimentalFactors().add(experimentalFactorFromFile);
                log.debug("Added experimental factor value " + experimentalFactorFromFile
                        + " to experimental design " + experimentalDesign);

            }
        }

    }

    /**
     * Attaches to each biomaterial named in the file the factor values given on its line.
     * <p>
     * The first column of each line names the biomaterial. If the second header column is "ExternalID"
     * (ignoring case), the second column is used as an alternative identifier and factor values start in the
     * third column; otherwise they start in the second. Blank cells are treated as missing values and skipped.
     * A value with no matching factor value on the factor is logged as an error and skipped.
     * 
     * @param experiment the experiment being imported into
     * @param experimentBioMaterials Current expression experiment's biomaterials.
     * @param experimentalDesign experimental design whose factors supply the factor values
     * @param factorValueLines Lines from file containing factor values and biomaterial ids
     * @param headerFields Sample header line split on tab.
     * @return the biomaterials that had at least one non-blank factor value cell in the file
     * @throws IllegalStateException if a line's biomaterial cannot be found or a column matches no factor
     * @throws IllegalArgumentException if a biomaterial occurs twice or a line has a value beyond the last header
     *         column
     */
    private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment(
            ExpressionExperiment experiment, Collection<BioMaterial> experimentBioMaterials,
            ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) {
        log.debug("Adding factors values to biomaterials: " + experimentalDesign.getId());
        Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign
                .getExperimentalFactors();
        Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<BioMaterial>();

        Collection<BioMaterial> seenBioMaterials = new HashSet<BioMaterial>();

        Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<ExperimentalFactor, Collection<BioMaterial>>();

        // The header is the same for every line, so work out the layout once. equalsIgnoreCase avoids the
        // locale-sensitive toUpperCase(), and the length check covers a header with a single column.
        boolean hasExternalId = headerFields.length > 1
                && "EXTERNALID".equalsIgnoreCase(StringUtils.strip(headerFields[1]));
        int start = hasExternalId ? 2 : 1;

        for (String factorValueLine : factorValueLines) {
            String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t");

            String externalId = null;
            if (hasExternalId && factorValueFields.length > 1) {
                externalId = factorValueFields[1];
            }
            BioMaterial currentBioMaterial = getBioMaterialFromExpressionExperiment(experiment,
                    experimentBioMaterials, factorValueFields[0], externalId);

            if (currentBioMaterial == null) {
                throw new IllegalStateException("No biomaterial for " + factorValueFields[0]);
            }

            if (seenBioMaterials.contains(currentBioMaterial)) {
                throw new IllegalArgumentException(
                        "A biomaterial occurred more than once in the file: " + currentBioMaterial);
            }

            seenBioMaterials.add(currentBioMaterial);

            for (int i = start; i < factorValueFields.length; i++) {
                String currentFactorValueValue = StringUtils.strip(factorValueFields[i]);

                if (StringUtils.isBlank(currentFactorValueValue)) {
                    // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings.
                    continue;
                }

                if (i >= headerFields.length) {
                    // A real value with no header column cannot be assigned to any factor.
                    throw new IllegalArgumentException("Line has a value in column " + (i + 1)
                            + " but the header has only " + headerFields.length + " columns: " + factorValueLine);
                }
                String currentExperimentalFactorName = StringUtils.strip(headerFields[i]);

                ExperimentalFactor currentExperimentalFactor = null;
                for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) {
                    if (experimentalFactor.getName().equals(currentExperimentalFactorName)) {
                        currentExperimentalFactor = experimentalFactor;
                    }
                }

                if (currentExperimentalFactor == null)
                    throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName);

                Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor
                        .getFactorValues();

                FactorValue currentFactorValue = null;
                for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) {
                    // Factor values without a textual value cannot match a cell; skip them rather than fail.
                    String existingValue = factorValue.getValue();
                    if (existingValue != null
                            && existingValue.trim().equalsIgnoreCase(currentFactorValueValue.trim())) {
                        currentFactorValue = factorValue;
                    }
                }

                if (currentFactorValue == null) {
                    log.error("Current factor value not found " + currentExperimentalFactor
                            + currentFactorValueValue);
                } else if (!checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) {
                    currentBioMaterial.getFactorValues().add(currentFactorValue);
                    log.debug("Added factor value " + currentFactorValue + " to biomaterial "
                            + currentBioMaterial);
                }
                // otherwise the duplicate check has already logged a warning.

                // The biomaterial is returned (and recorded against the factor) even when no value was added.
                biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial);

                if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) {
                    factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>());
                }
                factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial);

            }

        }

        /*
         * Check if every biomaterial got used. Worth a warning, at least.
         */
        for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) {
            if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) {
                log.warn(
                        "File did not contain values for all factor - biomaterial combinations: Missing at least one for "
                                + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/"
                                + experimentBioMaterials.size() + " ]");
            }
        }

        return biomaterialsWithFactorValuesInExperiment;
    }

    /**
     * Creates one factor value per distinct sample value of the given experimental factor and links the two in
     * both directions. Categorical values get a characteristic copying the factor's category; all other values
     * get a measurement.
     * 
     * @param experimentalFactor The experimental factor to add the factor values to
     * @param factorSampleValues A map of factor value names keyed on experimental factor name
     * @param factorType Whether the factor is continuous or categorical; anything other than "categorical"
     *        (ignoring case) is handled as continuous
     * @throws IllegalArgumentException if the map has no entry for the factor, i.e. no header column carries
     *         the factor's name
     */
    private void addFactorValuesToExperimentalFactor(ExperimentalFactor experimentalFactor,
            Map<String, Set<String>> factorSampleValues, String factorType) {
        log.debug("Addding factors values to experimental factor: " + experimentalFactor.getName());
        VocabCharacteristic category = (VocabCharacteristic) experimentalFactor.getCategory();

        Set<String> values = factorSampleValues.get(experimentalFactor.getName());
        if (values == null) {
            // Previously a NullPointerException in the loop below; report the actual problem instead.
            throw new IllegalArgumentException("No column in the design file matches the experimental factor '"
                    + experimentalFactor.getName() + "'");
        }

        boolean categorical = factorType.equalsIgnoreCase("CATEGORICAL");

        for (String value : values) {

            FactorValue factorValue = FactorValue.Factory.newInstance();
            factorValue.setValue(value);

            if (categorical) {
                log.debug("Factor is categorical");
                VocabCharacteristic newVc = VocabCharacteristic.Factory.newInstance();
                String category2 = category.getCategory();
                assert category2 != null;
                newVc.setCategory(category2);
                newVc.setCategoryUri(category.getCategoryUri());
                newVc.setValue(value);
                newVc.setEvidenceCode(GOEvidenceCode.IC);
                factorValue.getCharacteristics().add(newVc);
            } else {
                log.debug("Factor is continous");
                addMeasurementToFactorValueOfTypeContinous(factorValue);
            }
            // set bidirectional relationship
            experimentalFactor.getFactorValues().add(factorValue);
            factorValue.setExperimentalFactor(experimentalFactor);
            log.debug("Added factor value " + factorValue + " to experimental factor " + experimentalFactor);
        }

    }

    /**
     * Gives a factor value of a continuous factor an absolute measurement holding the factor value's own
     * value. The measurement is represented as a double when the value parses as one and as a string
     * otherwise.
     * 
     * @param factorValue the factor value to attach the measurement to
     */
    private void addMeasurementToFactorValueOfTypeContinous(FactorValue factorValue) {
        Measurement measurement = Measurement.Factory.newInstance();
        measurement.setType(MeasurementType.ABSOLUTE);
        measurement.setValue(factorValue.getValue());

        // Parsing is only a numeric check; the parsed number itself is discarded.
        PrimitiveType representation;
        try {
            Double.parseDouble(factorValue.getValue());
            representation = PrimitiveType.DOUBLE;
        } catch (NumberFormatException notANumber) {
            representation = PrimitiveType.STRING;
        }
        measurement.setRepresentation(representation);

        factorValue.setMeasurement(measurement);
        log.debug("Created " + factorValue + " for experimental factor ");

    }

    /**
     * Tells whether the experimental design already contains a factor with the same name as the one read from
     * the file. Only names are compared; the factor from the file is not modified or replaced.
     * 
     * @param experimentalDesign Existing experimental design.
     * @param experimentalFactorFromFile The experimental factor in the file
     * @return true if a factor of that name is already on the design
     */
    private boolean checkForDuplicateExperimentalFactorOnExperimentalDesign(ExperimentalDesign experimentalDesign,
            ExperimentalFactor experimentalFactorFromFile) {

        String nameFromFile = experimentalFactorFromFile.getName();
        if (nameFromFile == null) {
            return false;
        }

        for (ExperimentalFactor existingExperimentalFactor : experimentalDesign.getExperimentalFactors()) {
            // Comparing from the known non-null side tolerates existing factors without a name.
            if (nameFromFile.equals(existingExperimentalFactor.getName())) {
                log.info(experimentalFactorFromFile + " matches existing " + existingExperimentalFactor);
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the biomaterial already carries the given factor value, or any other value of the same
     * experimental factor. A warning is logged for the first such value found.
     * 
     * @param bioMaterial the biomaterial whose current factor values are inspected
     * @param factorValue the factor value about to be added
     * @return true if the biomaterial already has a value for that factor
     */
    private boolean checkForDuplicateFactorOnBioMaterial(BioMaterial bioMaterial, FactorValue factorValue) {
        // make sure we don't add two values.
        for (FactorValue current : bioMaterial.getFactorValues()) {
            boolean clash = factorValue.equals(current);
            if (!clash) {
                clash = current.getExperimentalFactor().equals(factorValue.getExperimentalFactor());
            }
            if (!clash) {
                continue;
            }

            log.warn(bioMaterial + " already has a factorvalue for " + factorValue.getExperimentalFactor()
                    + " [" + factorValue + " matched existing: " + current + "]");
            return true;
        }

        return false;
    }

    /**
     * Finds the biomaterial that a name from the input file refers to. Lookups are tried in this order: the
     * name exactly as given, the name reformatted by
     * {@code SimpleExpressionDataLoaderServiceImpl.makeBioMaterialName}, and finally each '/'-separated part of
     * the external id. The lookup table covers every string produced by
     * {@link #mapBioMaterialsToNamePossibilities(Collection)}.
     * 
     * @param ee The current expression experiment
     * @param bioMaterials the biomaterials to search
     * @param biomaterialNameFromFile the biomaterial name taken from the first column of a factor value line
     * @param externalId the external id stored in the file, which might not be available (so this can be null or
     *        blank); several ids grouped together are separated by '/'
     * @return the matching biomaterial, or null if none of the lookups succeeds
     */
    private BioMaterial getBioMaterialFromExpressionExperiment(ExpressionExperiment ee,
            Collection<BioMaterial> bioMaterials, String biomaterialNameFromFile, String externalId) {

        Map<String, BioMaterial> lookup = mapBioMaterialsToNamePossibilities(bioMaterials);

        // the same name, formatted gemma style
        String gemmaStyleName = SimpleExpressionDataLoaderServiceImpl.makeBioMaterialName(ee,
                biomaterialNameFromFile);

        BioMaterial match = lookup.get(biomaterialNameFromFile);
        if (match != null) {
            return match;
        }

        // try alternative format...
        match = lookup.get(gemmaStyleName);
        if (match != null || StringUtils.isBlank(externalId)) {
            return match;
        }

        // If there are two or more GSM's grouped together they are listed in the file separated by '/'.
        for (String id : StringUtils.split(externalId, "/")) {
            BioMaterial byExternalId = lookup.get(id);
            if (byExternalId != null) {
                return byExternalId;
            }
        }

        return null;
    }

    /**
     * Collects, for every header column after the first, the distinct non-blank values found in that column
     * across all factor value lines.
     * <p>
     * Blank cells, and cells missing because a line is shorter than the header, are skipped; the rest of this
     * class treats blank cells as missing values. A column always gets an entry once at least one line has
     * been read, even if all of its cells are blank. An external id column, if present, is collected like any
     * other column.
     * 
     * @param headerFields Sample header line split on tab.
     * @param factorValueLines Lines containing biomaterial names and their factor values
     * @return map of stripped column values keyed on stripped column heading
     */
    private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields,
            List<String> factorValueLines) {
        Map<String, Set<String>> factorSampleValues = new HashMap<String, Set<String>>();
        for (String factorValueLine : factorValueLines) {
            String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t");

            for (int i = 1; i < headerFields.length; i++) {

                // get the key
                String factorName = StringUtils.strip(headerFields[i]);
                Set<String> valuesForFactor = factorSampleValues.get(factorName);
                if (valuesForFactor == null) {
                    valuesForFactor = new HashSet<String>();
                    factorSampleValues.put(factorName, valuesForFactor);
                }

                if (i >= factorValueFields.length) {
                    // Line is shorter than the header: the trailing cells are simply missing.
                    continue;
                }

                String factorValue = StringUtils.strip(factorValueFields[i]);
                if (StringUtils.isBlank(factorValue)) {
                    // A blank cell is a missing value, not a factor value named "".
                    continue;
                }
                valuesForFactor.add(factorValue);

            }

        }
        return factorSampleValues;

    }

    /**
     * Builds a lookup from every string a design file might use for a sample to the biomaterial it denotes:
     * the biomaterial's name, the accession (when not blank) and the name of each bioassay the biomaterial is
     * used in, and the name built by {@code ExpressionDataWriterUtils.constructBioAssayName}. When two
     * biomaterials share a key, the one processed later wins.
     * 
     * @param bioMaterials the biomaterials to index
     * @return map from candidate identifier to biomaterial
     */
    private Map<String, BioMaterial> mapBioMaterialsToNamePossibilities(Collection<BioMaterial> bioMaterials) {
        Map<String, BioMaterial> byIdentifier = new HashMap<String, BioMaterial>();

        // The whole table is rebuilt on every call. No big deal, but could be more efficient.
        for (BioMaterial material : bioMaterials) {
            byIdentifier.put(material.getName(), material);

            // multiple bioassays per biomaterial are allowed - e.g. two platforms run on the same sample
            for (BioAssay assay : material.getBioAssaysUsedIn()) {

                /*
                 * Allow matches to the accession (external id) of the bioassay; trying to be flexible! This
                 * _could_ cause problems if there are multiple bioassays per biomaterial.
                 */
                String accession = assay.getAccession() == null ? null : assay.getAccession().getAccession();
                if (StringUtils.isNotBlank(accession)) {
                    byIdentifier.put(accession, material);
                }

                /*
                 * Similarly allow match on the bioassay name
                 */
                byIdentifier.put(assay.getName(), material);
            }

            /*
             * Also put in the very-mangled name used in the 'native' Gemma export format. This includes the ID,
             * so not useful for tests.
             */
            String exportName = ExpressionDataWriterUtils.constructBioAssayName(material,
                    material.getBioAssaysUsedIn());
            byIdentifier.put(exportName, material);

        }
        return byIdentifier;
    }

    /**
     * Turns a category name into a characteristic backed by the first ontology term whose label equals that
     * name exactly. The term supplies both the category and the value (label and URI) of the characteristic.
     * 
     * @param category the term label to look for
     * @param terms the candidate ontology terms
     * @return a new characteristic describing the matching term
     * @throws IllegalArgumentException if no term has that label
     */
    private VocabCharacteristic mgedLookup(String category, Collection<OntologyTerm> terms) {

        for (OntologyTerm candidate : terms) {
            if (!candidate.getTerm().equals(category)) {
                continue;
            }

            // First hit wins.
            VocabCharacteristic characteristic = VocabCharacteristic.Factory.newInstance();
            characteristic.setCategoryUri(candidate.getUri());
            characteristic.setCategory(candidate.getTerm());
            characteristic.setValueUri(candidate.getUri());
            characteristic.setValue(candidate.getTerm());
            characteristic.setEvidenceCode(GOEvidenceCode.IC);
            return characteristic;
        }

        throw new IllegalArgumentException("No MGED term matches '" + category + "'");
    }

    /**
     * Checks that every factor value line has at least two tab-separated columns and names a biomaterial of
     * the experiment. It is arguable whether an unknown biomaterial should be an exception. I think it has to be
     * to make sure that simple errors in the format are caught. But it's inconvenient for cases where a single
     * 'design' file is to be used for multiple microarray studies. Biomaterial ids should match what is stored.
     * <p>
     * The second column is always offered to the lookup as a possible external id, whether or not the header
     * declares an external id column.
     * 
     * @param experiment Current experiment
     * @param bioMaterials the experiment's biomaterials
     * @param factorValueLines Lines containing biomaterial names and their factor values
     * @throws IllegalArgumentException if a line has fewer than two columns or its biomaterial is unknown
     */
    private void validateBioMaterialFileContent(ExpressionExperiment experiment,
            Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException {

        for (String factorValueLine : factorValueLines) {
            String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t');
            if (vals.length < 2) {
                throw new IllegalArgumentException(
                        "Expected a file with at least two columns separated by tabs, got " + factorValueLine);
            }
            BioMaterial bioMaterialInFile = getBioMaterialFromExpressionExperiment(experiment, bioMaterials,
                    vals[0], vals[1]);
            if (bioMaterialInFile == null) {
                // The message used to end in an unfinished "(formatted based on on input: " fragment.
                throw new IllegalArgumentException(
                        "The uploaded file has a biomaterial name that does not match the study: " + vals[0]);
            }
        }
    }

    /**
     * Validates the experimental factor description lines, which should look like
     * {@code #$Run time : Category=EnvironmentalHistory Type=categorical}. Each line must contain exactly one
     * colon, followed by exactly two whitespace-separated details, each of the form {@code key=value}. The
     * factor names found are then checked against the sample header.
     * 
     * @param experimentalFactorLines The lines in the file corresponding to experimental factors.
     * @param sampleHeaderLine Line in file giving the order of the experimental factors
     * @throws IOException Experimental factor lines were not correctly formatted, or the header does not agree
     *         with them.
     */
    private void validateExperimentalFactorFileContent(List<String> experimentalFactorLines,
            String sampleHeaderLine) throws IOException {
        Set<String> experimentalFactorValueNames = new HashSet<String>();
        // validate experimental factor lines
        for (String line : experimentalFactorLines) {
            String[] fields = line.split(":");
            if (fields.length != 2) {
                throw new IOException(
                        "EF description must have two fields with a single ':' in between (" + line + ")");
            }
            String factorName = StringUtils.strip(fields[0]
                    .replaceFirst(Pattern.quote(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR) + "\\s*", ""));

            experimentalFactorValueNames.add(factorName);
            String category = StringUtils.strip(fields[1]);

            String[] descriptions = StringUtils.split(category);

            if (descriptions.length != 2) {
                throw new IOException("EF details should have the format 'Category=CATEGORY Type=TYPE'");
            }

            // The import later reads the text after '='; reject details without it here instead of failing
            // there with an index error.
            for (String description : descriptions) {
                if (StringUtils.split(description, "=").length < 2) {
                    throw new IOException(
                            "EF details should have the format 'Category=CATEGORY Type=TYPE' (" + line + ")");
                }
            }

        }

        validateSampleHeaderFileContent(experimentalFactorValueNames, experimentalFactorLines.size(),
                sampleHeaderLine);

    }

    /**
     * Checks the column count of every factor value line against the number of experimental factors: a line
     * needs more columns than there are factors, and may exceed the factor count by at most
     * {@link #NUMBER_OF_EXTRA_COLUMNS_ALLOWED}.
     * 
     * @param numberOfExperimentalFactors number of experimental factors described in the file
     * @param factorValueList Represents lines of file containing factor values for a biomaterial
     * @throws IOException if a line has too many or too few columns
     */
    private void validateFactorFileContent(Integer numberOfExperimentalFactors, List<String> factorValueList)
            throws IOException {
        for (String factorValueLine : factorValueList) {
            int columnCount = StringUtils.splitPreserveAllTokens(factorValueLine, "\t").length;

            int mostAllowed = numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED;
            if (columnCount > mostAllowed) {
                throw new IOException("Expected no more than " + mostAllowed
                        + " columns based on EF descriptions (plus id column), got " + columnCount);
            }

            if (columnCount <= numberOfExperimentalFactors) {
                int fewestAllowed = numberOfExperimentalFactors + 1;
                throw new IOException("Expected at least " + fewestAllowed
                        + " columns based on EF descriptions (plus id column), got " + columnCount);
            }
        }
    }

    /**
     * Checks that all three parts of a design file were found: experimental factor descriptions, the sample
     * header, and factor value lines. The first missing part, in that order, is reported.
     * 
     * @param experimentalFactorLines Lines identified by EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR (#$) detailing
     *        experimental factor values.
     * @param sampleHeaderLine Header giving order of experimental factor values in the file
     * @param factorValues The factor values in this file
     * @throws IOException File was not in correct format.
     */
    private void validateFileComponents(List<String> experimentalFactorLines, String sampleHeaderLine,
            List<String> factorValues) throws IOException {
        String problem = null;

        if (experimentalFactorLines.isEmpty()) {
            problem = "No experimentalFactorLine definitions found in the design file.";
        } else if (StringUtils.isBlank(sampleHeaderLine)) {
            problem = "No Sample header found";
        } else if (factorValues.isEmpty()) {
            problem = "No factorValues definitions found in the design file.";
        }

        if (problem != null) {
            throw new IOException(problem);
        }

    }

    /**
     * Validates the sample header against the experimental factor descriptions. The header may have at most
     * {@link #NUMBER_OF_EXTRA_COLUMNS_ALLOWED} columns more than there are factors; every column after the
     * first must either be an external id column or carry the name of a described factor; and every described
     * factor must have a column.
     * <p>
     * "ExternalID" is accepted in any position, as before. In the second column it is also accepted in any
     * letter case, which is how the import itself recognises that column.
     * 
     * @param experimentalFactorValueNames names of the factors described in the file
     * @param numberOfExperimentalFactors number of factor description lines in the file
     * @param sampleHeaderLine the header line, tab-delimited
     * @throws IOException Validation fails.
     */
    private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames,
            Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException {
        String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t");

        // we might have the ids, and the external id.
        if (headerFields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
            throw new IOException("Expected " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                    + " columns based on EF descriptions (plus id column), got " + headerFields.length);
        }

        Set<String> columnHeadings = new HashSet<String>();

        for (int i = 1; i < headerFields.length; i++) {

            String value = StringUtils.strip(headerFields[i]);
            columnHeadings.add(value);

            if (value.equals("ExternalID") || (i == 1 && value.equalsIgnoreCase("ExternalID"))) {
                // that's fine.
                continue;
            }

            if (!experimentalFactorValueNames.contains(value)) {
                throw new IOException("Expected to find an EF matching the column heading '" + value + "'");
            }

        }

        // A described factor without a column has no values to read and would fail later with a null
        // pointer; report it as a format problem instead.
        for (String factorName : experimentalFactorValueNames) {
            if (!columnHeadings.contains(factorName)) {
                throw new IOException("Expected to find a column heading matching the EF '" + factorName + "'");
            }
        }

    }

}