ubic.gemma.core.analysis.service.ExpressionDataFileServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.analysis.service.ExpressionDataFileServiceImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2007 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.analysis.service;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.basecode.util.FileTools;
import ubic.basecode.util.StringUtil;
import ubic.gemma.core.analysis.expression.diff.DifferentialExpressionAnalysisConfig;
import ubic.gemma.core.analysis.preprocess.ExpressionDataMatrixBuilder;
import ubic.gemma.core.analysis.preprocess.filter.FilterConfig;
import ubic.gemma.core.datastructure.matrix.ExperimentalDesignWriter;
import ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix;
import ubic.gemma.core.datastructure.matrix.ExpressionDataMatrix;
import ubic.gemma.core.datastructure.matrix.MatrixWriter;
import ubic.gemma.model.analysis.expression.diff.ContrastResult;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysisResult;
import ubic.gemma.model.analysis.expression.diff.ExpressionAnalysisResultSet;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.common.quantitationtype.QuantitationType;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.*;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.persistence.service.analysis.expression.diff.DifferentialExpressionAnalysisService;
import ubic.gemma.persistence.service.association.coexpression.CoexpressionService;
import ubic.gemma.persistence.service.association.coexpression.CoexpressionValueObject;
import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.persistence.service.expression.bioAssayData.RawExpressionDataVectorService;
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
import ubic.gemma.persistence.util.DifferentialExpressionAnalysisResultComparator;
import ubic.gemma.persistence.util.EntityUtils;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

/**
 * Supports the creation and location of 'flat file' versions of data in the system, for download by users. Files are
 * cached on the filesystem and reused if possible, rather than recreating them every time.
 *
 * @author paul
 */
@Component
public class ExpressionDataFileServiceImpl implements ExpressionDataFileService {

    /** Format string applied to the numeric values (p-values, coefficients, statistics) written by this class. */
    private static final String DECIMAL_FORMAT = "%.4g";
    // Logger is registered under this class so that log output is attributed to the right component
    // (it was previously created for ArrayDesignAnnotationServiceImpl).
    private static final Log log = LogFactory.getLog(ExpressionDataFileServiceImpl.class.getName());
    // Log message templates; %s is replaced with the path of the file in question.
    private static final String MSG_FILE_EXISTS = " File (%s) exists, not regenerating";
    private static final String MSG_FILE_FORCED = "Forcing file (%s) regeneration";
    private static final String MSG_FILE_NOT_EXISTS = "File (%s) does not exist or can not be accessed ";
    private static final String MSG_FILE_OUTDATED = "File (%s) outdated, regenerating";

    /**
     * Resolves the expression experiment behind a bioassay set.
     *
     * @param  bas an experiment, or a subset of one
     * @return     the source experiment when {@code bas} is a subset, otherwise {@code bas} itself cast to an
     *             experiment
     */
    private static ExpressionExperiment experimentForBioAssaySet(BioAssaySet bas) {
        if (!(bas instanceof ExpressionExperimentSubSet)) {
            return (ExpressionExperiment) bas;
        }
        ExpressionExperimentSubSet subSet = (ExpressionExperimentSubSet) bas;
        return subSet.getSourceExperiment();
    }

    // Collaborating services, injected by Spring.

    @Autowired
    private ArrayDesignService arrayDesignService;
    // Used to list, load and thaw differential expression analyses.
    @Autowired
    private DifferentialExpressionAnalysisService differentialExpressionAnalysisService = null;
    // Supplies the filtered / processed data matrices.
    @Autowired
    private ExpressionDataMatrixService expressionDataMatrixService;
    // Used to thaw experiments and look up platforms and update dates.
    @Autowired
    private ExpressionExperimentService expressionExperimentService;
    @Autowired
    private CoexpressionService gene2geneCoexpressionService = null;

    // Used to fetch the vectors for a quantitation type.
    @Autowired
    private RawExpressionDataVectorService rawExpressionDataVectorService;

    /**
     * Appends a tab-delimited table for several result sets to {@code buf}: first the column headers contributed by
     * each result set, then one line per element, in the sorted order of the first result set.
     *
     * @param  results               the result sets to write, each contributing its own columns
     * @param  geneAnnotations       annotation fields keyed by element (probe) id
     * @param  buf                   the buffer the table is appended to
     * @throws IllegalStateException if {@code results} is empty, in which case {@code buf} is left untouched by this
     *                               check
     */
    @Override
    public void analysisResultSetsToString(Collection<ExpressionAnalysisResultSet> results,
            Map<Long, String[]> geneAnnotations, StringBuilder buf) {
        // One row buffer per element id, filled in column by column as each result set is processed.
        Map<Long, StringBuilder> probe2String = new HashMap<>();

        List<DifferentialExpressionAnalysisResult> sortedFirstColumnOfResults = null;

        for (ExpressionAnalysisResultSet ears : results) {
            sortedFirstColumnOfResults = this.analysisResultSetToString(ears, geneAnnotations, buf, probe2String,
                    sortedFirstColumnOfResults);
        }

        // Fail before terminating the header line, with a message that actually says what went wrong.
        if (sortedFirstColumnOfResults == null) {
            throw new IllegalStateException("No result sets were provided, so there are no results to write");
        }

        buf.append("\n");

        // Dump the probe data in the sorted order of the 1st column that we originally sorted
        for (DifferentialExpressionAnalysisResult sortedResult : sortedFirstColumnOfResults) {

            CompositeSequence cs = sortedResult.getProbe();
            StringBuilder sb = probe2String.get(cs.getId());
            if (sb == null) {
                // Output stops at the first element that has no row buffer.
                ExpressionDataFileServiceImpl.log.warn("Unable to find element " + cs.getId() + " in map");
                break;
            }
            buf.append(sb);
            buf.append("\n");

        }
    }

    /**
     * Adds the columns of one result set to a table that is being assembled row-by-row: appends the two column
     * headers (QValue and PValue, suffixed with the factor names) to {@code buf}, and appends the corrected and
     * uncorrected p-values of every result to that element's row buffer in {@code probe2String}. A row buffer is
     * created, starting with the element name and annotation fields, the first time an element is seen.
     *
     * @param  ears                       the result set to add
     * @param  geneAnnotations            annotation fields keyed by element id; fields 1, 2 and (when present) 4
     *                                    are written
     * @param  buf                        buffer receiving the column headers
     * @param  probe2String               row buffers keyed by element id; updated in place
     * @param  sortedFirstColumnOfResults the sorted results of the first result set, or null if this is the first
     *                                    result set to be processed
     * @return                            {@code sortedFirstColumnOfResults} if it was non-null, otherwise the sorted
     *                                    results of {@code ears}
     */
    @Override
    public List<DifferentialExpressionAnalysisResult> analysisResultSetToString(ExpressionAnalysisResultSet ears,
            Map<Long, String[]> geneAnnotations, StringBuilder buf, Map<Long, StringBuilder> probe2String,
            List<DifferentialExpressionAnalysisResult> sortedFirstColumnOfResults) {

        if (sortedFirstColumnOfResults == null) { // Sort P values in ears (because 1st column)
            sortedFirstColumnOfResults = new ArrayList<>(ears.getResults());
            Collections.sort(sortedFirstColumnOfResults,
                    DifferentialExpressionAnalysisResultComparator.Factory.newInstance());
        }

        // Generate a description of the factors involved "factor1_factor2", trying to be R-friendly
        StringBuilder factorColumnName = new StringBuilder();
        for (ExperimentalFactor ef : ears.getExperimentalFactors()) {
            factorColumnName.append(ef.getName().replaceAll("\\s+", "_")).append("_");
        }
        factorColumnName = new StringBuilder(
                StringUtil.makeValidForR(StringUtils.removeEnd(factorColumnName.toString(), "_")));

        // Generate headers
        buf.append("\tQValue_").append(factorColumnName);
        buf.append("\tPValue_").append(factorColumnName);

        // Generate probe details
        for (DifferentialExpressionAnalysisResult dear : ears.getResults()) {
            CompositeSequence cs = dear.getProbe();

            // Rows are cached per probe until all result sets (columns) have been added.
            Long csid = cs.getId();
            StringBuilder probeBuffer = probe2String.get(csid);
            if (probeBuffer == null) { // no entry for probe yet
                probeBuffer = new StringBuilder();
                probeBuffer.append(cs.getName());

                String[] annotationStrings = geneAnnotations.get(csid);
                if (annotationStrings != null) {
                    /*
                     * Fields:
                     *
                     * 1: gene symbols
                     * 2: gene name
                     * 4: ncbi ID
                     */
                    probeBuffer.append("\t").append(annotationStrings[1]).append("\t").append(annotationStrings[2])
                            .append("\t");
                    // The NCBI id is not always present; keep the (empty) column rather than failing with an
                    // ArrayIndexOutOfBoundsException, as addGeneAnnotationsToLine does.
                    if (annotationStrings.length > 4) {
                        probeBuffer.append(annotationStrings[4]);
                    }
                } else {
                    probeBuffer.append("\t\t\t");
                }

                probe2String.put(csid, probeBuffer);
            }

            Double correctedPvalue = dear.getCorrectedPvalue();
            Double pvalue = dear.getPvalue();

            // Missing values are written as empty cells.
            String formattedCP = correctedPvalue == null ? ""
                    : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, correctedPvalue);
            String formattedP = pvalue == null ? ""
                    : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, pvalue);
            probeBuffer.append("\t").append(formattedCP).append("\t").append(formattedP);

        }
        return sortedFirstColumnOfResults;

    }

    /**
     * Deletes the cached files of an experiment: the filtered and unfiltered data files, the archive of each of its
     * differential expression analyses, the coexpression file and the design file.
     *
     * @param ee the experiment whose files are removed; it is thawed (lite) first
     */
    @Override
    public void deleteAllFiles(ExpressionExperiment ee) {
        ee = this.expressionExperimentService.thawLite(ee);

        // expression data: filtered first, then unfiltered
        for (boolean filtered : new boolean[] { true, false }) {
            File dataFile = this.getOutputFile(ee, filtered);
            this.deleteAndLog(dataFile);
        }

        // one archive per differential expression analysis
        for (DifferentialExpressionAnalysis diffEx : this.differentialExpressionAnalysisService.getAnalyses(ee)) {
            this.deleteDiffExArchiveFile(diffEx);
        }

        // coexpression
        String coexpressionName = this.getCoexpressionDataFilename(ee);
        this.deleteAndLog(this.getOutputFile(coexpressionName));

        // experimental design
        String designName = this.getDesignFileName(ee);
        this.deleteAndLog(this.getOutputFile(designName));
    }

    /**
     * Deletes the archive file belonging to a differential expression analysis, if it can be deleted.
     *
     * @param analysis the analysis whose archive is removed
     */
    @Override
    public void deleteDiffExArchiveFile(DifferentialExpressionAnalysis analysis) {
        File archive = this.getOutputFile(this.getDiffExArchiveFileName(analysis));
        this.deleteAndLog(archive);
    }

    /**
     * Loads the analysis with the given id and returns its archive file, creating the file when needed.
     *
     * @param  analysisId  id of the differential expression analysis
     * @param  forceCreate whether to regenerate the archive even if it already exists
     * @return             the archive file
     */
    @Override
    public File getDiffExpressionAnalysisArchiveFile(Long analysisId, boolean forceCreate) {
        return getDiffExpressionAnalysisArchiveFile(this.differentialExpressionAnalysisService.load(analysisId),
                forceCreate);
    }

    /**
     * Locates the compressed, non-temporary data file of an experiment.
     *
     * @param  ee       the experiment
     * @param  filtered whether the file for filtered data is wanted
     * @return          the file; see {@link #getOutputFile(ExpressionExperiment, boolean, boolean, boolean)}
     */
    @Override
    public File getOutputFile(ExpressionExperiment ee, boolean filtered) {
        final boolean compressed = true;
        final boolean temporary = false;
        return this.getOutputFile(ee, filtered, compressed, temporary);
    }

    /**
     * Locates the data file of an experiment.
     *
     * @param  ee         the experiment
     * @param  filtered   whether the file holds filtered data; unfiltered files carry an extra ".unfilt" part
     * @param  compressed whether the compressed or the plain file suffix is used
     * @param  temporary  whether the file lives in the temporary directory; temporary names get a random prefix
     * @return            the file
     */
    @Override
    public File getOutputFile(ExpressionExperiment ee, boolean filtered, boolean compressed, boolean temporary) {
        String filteredAdd = filtered ? "" : ".unfilt";
        String suffix = compressed ? ExpressionDataFileService.DATA_FILE_SUFFIX_COMPRESSED
                : ExpressionDataFileService.DATA_FILE_SUFFIX;

        String filename = this.getDataFileName(ee, filteredAdd, suffix);
        if (!temporary) {
            return this.getOutputFile(filename, false);
        }

        // randomize file name if temporary in case of access by more than one user at once
        String randomizedName = RandomStringUtils.randomAlphabetic(6) + filename;
        return this.getOutputFile(randomizedName, true);
    }

    /**
     * Locates a non-temporary output file by name.
     *
     * @param  filename file name, without directory
     * @return          the file; see {@link #getOutputFile(String, boolean)}
     */
    @Override
    public File getOutputFile(String filename) {
        final boolean temporary = false;
        return this.getOutputFile(filename, temporary);
    }

    /**
     * Resolves a file name against the temporary or the regular data directory. When the file does not exist yet,
     * its parent directories are created.
     *
     * @param  filename  file name, without directory
     * @param  temporary whether to resolve against the temporary data directory
     * @return           the file, which may or may not exist
     */
    @Override
    public File getOutputFile(String filename, boolean temporary) {
        String directory = temporary ? ExpressionDataFileService.TMP_DATA_DIR : ExpressionDataFileService.DATA_DIR;
        File target = new File(directory + filename);

        if (!target.exists()) {
            EntityUtils.mkdirs(target.getParentFile());
        }
        return target;
    }

    /**
     * Writes the data of an experiment to the file with the given path.
     *
     * @param  ee          the experiment
     * @param  filtered    whether filtered data is written
     * @param  fileName    path of the file to write
     * @param  compress    whether the output is compressed
     * @return             the result of the {@code File}-based overload this delegates to
     * @throws IOException if writing fails
     */
    @Override
    public File writeDataFile(ExpressionExperiment ee, boolean filtered, String fileName, boolean compress)
            throws IOException {
        return this.writeDataFile(ee, filtered, new File(fileName), compress);
    }

    /**
     * Writes the zip archive for a differential expression analysis. The archive contains
     * {@code analysis.results.txt} with the top-level results, plus one {@code resultset_*.data.txt} entry with
     * contrast details for every result set that involves a single factor (interactions are omitted). All text is
     * encoded as UTF-8.
     *
     * @param  experimentAnalyzed the experiment (or subset) that was analyzed; used to find the platforms
     * @param  analysis           the analysis to write; might be transient if using -nodb from the CLI
     * @param  config             passed through to the methods that build the file headers
     * @throws IOException        if the archive cannot be written
     */
    @Override
    public void writeDiffExArchiveFile(BioAssaySet experimentAnalyzed, DifferentialExpressionAnalysis analysis,
            DifferentialExpressionAnalysisConfig config) throws IOException {
        Collection<ArrayDesign> arrayDesigns = this.expressionExperimentService
                .getArrayDesignsUsed(experimentAnalyzed);
        Map<Long, String[]> geneAnnotations = this.getGeneAnnotationsAsStrings(arrayDesigns);
        String filename = this.getDiffExArchiveFileName(analysis);
        File f = this.getOutputFile(filename);

        ExpressionDataFileServiceImpl.log
                .info("Creating differential expression analysis archive file: " + f.getName());
        try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(f))) {

            // top-level analysis results - ANOVA-style
            zipOut.putNextEntry(new ZipEntry("analysis.results.txt"));
            String analysisData = this.convertDiffExpressionAnalysisData(analysis, geneAnnotations, config);
            // Explicit charset: the no-arg getBytes() depends on the platform default encoding.
            zipOut.write(analysisData.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            zipOut.closeEntry();

            if (analysis.getId() != null) { // might be transient if using -nodb from CLI
                differentialExpressionAnalysisService.thaw(analysis);
            }

            // Add a file for each result set with contrasts information.
            int i = 0;
            for (ExpressionAnalysisResultSet resultSet : analysis.getResultSets()) {
                if (resultSet.getExperimentalFactors().size() > 1) {
                    // Skip interactions.
                    ExpressionDataFileServiceImpl.log.info("Result file for interaction is omitted"); // Why?
                    continue;
                }

                String resultSetData = this.convertDiffExpressionResultSetData(resultSet, geneAnnotations, config);

                if (resultSet.getId() == null) { // -nodb option on analysis
                    // "NofM" naming, to make it clearer this is not an ID
                    zipOut.putNextEntry(new ZipEntry(
                            "resultset_" + ++i + "of" + analysis.getResultSets().size() + ".data.txt"));
                } else {
                    zipOut.putNextEntry(new ZipEntry("resultset_ID" + resultSet.getId() + ".data.txt"));
                }

                zipOut.write(resultSetData.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                zipOut.closeEntry();
            }
        }
    }

    /**
     * Returns the coexpression data file of an experiment, writing it first unless a readable copy exists and
     * {@code forceWrite} is false.
     *
     * @param  ee               the experiment; it is thawed (lite) first
     * @param  forceWrite       whether to rewrite the file even if it exists
     * @return                  the file
     * @throws RuntimeException wrapping any {@link IOException} raised while writing
     */
    @Override
    public File writeOrLocateCoexpressionDataFile(ExpressionExperiment ee, boolean forceWrite) {

        ee = expressionExperimentService.thawLite(ee);

        try {
            File coexpressionFile = this.getOutputFile(this.getCoexpressionDataFilename(ee));
            if (forceWrite || !coexpressionFile.canRead()) {
                this.writeCoexpressionData(coexpressionFile, ee);
            } else {
                ExpressionDataFileServiceImpl.log.info(coexpressionFile + " exists, not regenerating");
            }
            return coexpressionFile;
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }

    }

    /**
     * Returns the compressed data file of an experiment, regenerating it unless
     * {@code checkFileOkToReturn} accepts the existing file given the date of the last array design update.
     *
     * @param  ee               the experiment
     * @param  forceWrite       whether to rewrite the file even if it exists
     * @param  filtered         whether the file for filtered data is wanted
     * @return                  the existing or newly written file
     * @throws RuntimeException wrapping any {@link IOException} raised while writing
     */
    @Override
    public File writeOrLocateDataFile(ExpressionExperiment ee, boolean forceWrite, boolean filtered) {
        try {
            File dataFile = this.getOutputFile(ee, filtered);
            Date lastPlatformUpdate = expressionExperimentService.getLastArrayDesignUpdate(ee);

            boolean reusable = this.checkFileOkToReturn(forceWrite, dataFile, lastPlatformUpdate);
            if (!reusable) {
                return this.writeDataFile(ee, filtered, dataFile, true);
            }
            return dataFile;

        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /**
     * Returns the data file for a quantitation type, writing it first unless a readable copy exists and
     * {@code forceWrite} is false.
     *
     * @param  type             the quantitation type whose vectors are written
     * @param  forceWrite       whether to rewrite the file even if it exists
     * @return                  the file, or null if there are no vectors for the quantitation type
     * @throws RuntimeException wrapping any {@link IOException} raised while writing
     */
    @Override
    public File writeOrLocateDataFile(QuantitationType type, boolean forceWrite) {

        try {
            File f = this.getOutputFile(type);
            if (!forceWrite && f.canRead()) {
                ExpressionDataFileServiceImpl.log.info(f + " exists, not regenerating");
                return f;
            }

            ExpressionDataFileServiceImpl.log
                    .info("Creating new quantitation type expression data file: " + f.getName());

            Collection<DesignElementDataVector> vectors = rawExpressionDataVectorService.findRawAndProcessed(type);

            // Bail out before loading platforms and gene annotations, which are only needed for writing.
            if (vectors.isEmpty()) {
                ExpressionDataFileServiceImpl.log.warn("No vectors for " + type);
                return null;
            }

            Collection<ArrayDesign> arrayDesigns = this.getArrayDesigns(vectors);
            Map<CompositeSequence, String[]> geneAnnotations = this
                    .getGeneAnnotationsAsStringsByProbe(arrayDesigns);

            this.writeVectors(f, vectors, geneAnnotations);
            return f;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the experimental design file of an experiment, regenerating it unless
     * {@code checkFileOkToReturn} accepts the existing file given the experiment's last curation update.
     *
     * @param  ee               the experiment; it is thawed (lite) first
     * @param  forceWrite       whether to rewrite the file even if it exists
     * @return                  the existing file, or the result of writing the design matrix
     * @throws RuntimeException wrapping any {@link IOException} raised while writing
     */
    @Override
    public File writeOrLocateDesignFile(ExpressionExperiment ee, boolean forceWrite) {
        ee = expressionExperimentService.thawLite(ee);
        try {
            File designFile = this.getOutputFile(this.getDesignFileName(ee));
            Date lastCurationUpdate = ee.getCurationDetails().getLastUpdated();

            boolean reusable = this.checkFileOkToReturn(forceWrite, designFile, lastCurationUpdate);
            if (!reusable) {
                return this.writeDesignMatrix(designFile, ee);
            }
            return designFile;
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }

    }

    /**
     * Returns the archive files of all differential expression analyses of an experiment, creating them as needed.
     *
     * @param  ee         the experiment; it is thawed (lite) first
     * @param  forceWrite whether to regenerate archives that already exist
     * @return            one archive file per analysis, as a set
     */
    @Override
    public Collection<File> writeOrLocateDiffExpressionDataFiles(ExpressionExperiment ee, boolean forceWrite) {

        ee = this.expressionExperimentService.thawLite(ee);

        Collection<DifferentialExpressionAnalysis> diffExAnalyses = this.differentialExpressionAnalysisService
                .getAnalyses(ee);

        Collection<File> archives = new HashSet<>();
        for (DifferentialExpressionAnalysis diffEx : diffExAnalyses) {
            File archive = this.getDiffExpressionAnalysisArchiveFile(diffEx, forceWrite);
            archives.add(archive);
        }
        return archives;

    }

    /**
     * Returns the JSON data file of an experiment, writing it first unless a readable copy exists and
     * {@code forceWrite} is false.
     * <p>
     * NOTE(review): the target is resolved with {@code getOutputFile(ee, filtered)}, the same call that
     * {@code writeOrLocateDataFile(ee, forceWrite, filtered)} uses for the tab-delimited file, so both appear to
     * share one path - confirm whether that is intended.
     *
     * @param  ee               the experiment
     * @param  forceWrite       whether to rewrite the file even if it exists
     * @param  filtered         whether filtered data is written
     * @return                  the file
     * @throws RuntimeException wrapping any {@link IOException} raised while writing
     */
    @Override
    public File writeOrLocateJSONDataFile(ExpressionExperiment ee, boolean forceWrite, boolean filtered) {

        try {
            File jsonFile = this.getOutputFile(ee, filtered);
            if (forceWrite || !jsonFile.canRead()) {
                ExpressionDataFileServiceImpl.log
                        .info("Creating new JSON expression data file: " + jsonFile.getName());
                ExpressionDataDoubleMatrix dataMatrix = this.getDataMatrix(ee, filtered);

                this.writeJson(jsonFile, dataMatrix);
            } else {
                ExpressionDataFileServiceImpl.log.info(jsonFile + " exists, not regenerating");
            }
            return jsonFile;
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /**
     * Returns the JSON data file for a quantitation type, writing it first unless a readable copy exists and
     * {@code forceWrite} is false.
     *
     * @param  type             the quantitation type whose vectors are written
     * @param  forceWrite       whether to rewrite the file even if it exists
     * @return                  the file, or null if there are no vectors for the quantitation type
     * @throws RuntimeException wrapping any {@link IOException} raised while writing
     */
    @Override
    public File writeOrLocateJSONDataFile(QuantitationType type, boolean forceWrite) {

        try {
            File jsonFile = this.getJSONOutputFile(type);
            boolean reuseExisting = !forceWrite && jsonFile.canRead();
            if (reuseExisting) {
                ExpressionDataFileServiceImpl.log.info(jsonFile + " exists, not regenerating");
                return jsonFile;
            }

            ExpressionDataFileServiceImpl.log
                    .info("Creating new quantitation type  JSON data file: " + jsonFile.getName());

            Collection<DesignElementDataVector> dataVectors = rawExpressionDataVectorService
                    .findRawAndProcessed(type);

            if (dataVectors.size() != 0) {
                this.writeJson(jsonFile, dataVectors);
                return jsonFile;
            }

            ExpressionDataFileServiceImpl.log.warn("No vectors for " + type);
            return null;
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /**
     * Starts a row with the element name followed by three annotation cells (fields 1, 2 and 4 of the element's
     * annotation array). The cells are left empty when the element has no annotations; the last cell is left empty
     * when the array has no field 4.
     *
     * @param rowBuffer       the row being built
     * @param dear            the result whose element (probe) is described
     * @param geneAnnotations annotation fields keyed by element id
     */
    private void addGeneAnnotationsToLine(StringBuilder rowBuffer, DifferentialExpressionAnalysisResult dear,
            Map<Long, String[]> geneAnnotations) {
        CompositeSequence probe = dear.getProbe();
        Long probeId = probe.getId();
        rowBuffer.append(probe.getName());

        if (!geneAnnotations.containsKey(probeId)) {
            rowBuffer.append("\t\t\t");
            return;
        }

        String[] fields = geneAnnotations.get(probeId);
        rowBuffer.append("\t");
        rowBuffer.append(fields[1]);
        rowBuffer.append("\t");
        rowBuffer.append(fields[2]);
        rowBuffer.append("\t");

        // fields[3] (Gemma ID) is deliberately left out; fields[4] is the ncbi id, if we have it.
        boolean hasNcbiId = fields.length > 4;
        if (hasNcbiId) {
            rowBuffer.append(fields[4]);
        }
    }

    /**
     * Renders the contrast details of a result set as tab-delimited text: a line of column headers followed by one
     * line per result. Only the first experimental factor of the result set is considered.
     * <ul>
     * <li>Continuous factor: columns are the coefficient and p-value of each result's first contrast.</li>
     * <li>Otherwise: for every factor value that is neither the baseline nor unused by the contrasts, the columns
     * are fold change, t-statistic and p-value. A result lacking a contrast for one of those factor values gets
     * three empty cells there, so the remaining columns stay aligned with the header.</li>
     * </ul>
     * Missing numeric values are written as empty cells.
     *
     * @param  resultSet       the result set to render
     * @param  geneAnnotations annotation fields keyed by element id
     * @return                 the header and data lines
     */
    private String analysisResultSetWithContrastsToString(ExpressionAnalysisResultSet resultSet,
            Map<Long, String[]> geneAnnotations) {
        StringBuilder buf = new StringBuilder();

        ExperimentalFactor ef = resultSet.getExperimentalFactors().iterator().next();

        if (ef.getType().equals(FactorType.CONTINUOUS)) {

            String factorName = StringUtil.makeValidForR(ef.getName());
            buf.append("\tCoefficient_").append(factorName).append("\tPValue_").append(factorName).append("\n");

            for (DifferentialExpressionAnalysisResult dear : resultSet.getResults()) {
                StringBuilder rowBuffer = new StringBuilder();

                if (geneAnnotations.isEmpty()) {
                    rowBuffer.append(dear.getProbe().getName());
                } else {
                    this.addGeneAnnotationsToLine(rowBuffer, dear, geneAnnotations);
                }

                assert dear.getContrasts().size() == 1;

                ContrastResult contrast = dear.getContrasts().iterator().next();

                Double coefficient = contrast.getCoefficient();
                Double pValue = contrast.getPvalue();
                String formattedPvalue = pValue == null ? ""
                        : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, pValue);
                String formattedCoefficient = coefficient == null ? ""
                        : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, coefficient);

                rowBuffer.append("\t").append(formattedCoefficient).append("\t").append(formattedPvalue);

                buf.append(rowBuffer).append('\n');
            }

        } else {

            // Placeholder for a contrast that is absent from a row: one empty cell per header column
            // (FoldChange, Tstat, PValue).
            final String missingContrastData = "\t\t\t";

            Long baselineId = resultSet.getBaselineGroup().getId();
            List<Long> factorValueIdOrder = new ArrayList<>();

            /*
             * First find out what factor values are relevant in case this is a subsetted analysis. Only the first
             * result has to be inspected.
             */
            Collection<Long> usedFactorValueIds = new HashSet<>();
            Iterator<DifferentialExpressionAnalysisResult> firstResult = resultSet.getResults().iterator();
            if (firstResult.hasNext()) {
                for (ContrastResult contrast : firstResult.next().getContrasts()) {
                    usedFactorValueIds.add(contrast.getFactorValue().getId());
                }
            }

            for (FactorValue factorValue : ef.getFactorValues()) {

                /*
                 * deal correctly with subset situations - only use factor values relevant to the subset
                 */
                if (Objects.equals(factorValue.getId(), baselineId)
                        || !usedFactorValueIds.contains(factorValue.getId())) {
                    continue;
                }
                factorValueIdOrder.add(factorValue.getId());
                // Generate column headers, try to be R-friendly
                String factorValueName = this.getFactorValueString(factorValue);
                buf.append("\tFoldChange_").append(factorValueName);
                buf.append("\tTstat_").append(factorValueName);
                buf.append("\tPValue_").append(factorValueName);
            }

            buf.append('\n');

            // Generate element details
            for (DifferentialExpressionAnalysisResult dear : resultSet.getResults()) {
                StringBuilder rowBuffer = new StringBuilder();

                this.addGeneAnnotationsToLine(rowBuffer, dear, geneAnnotations);

                // Contrasts are not expected to arrive in header order, so index them by factor value first.
                Map<Long, String> factorValueIdToData = new HashMap<>();
                for (ContrastResult contrast : dear.getContrasts()) {
                    Double foldChange = contrast.getLogFoldChange();
                    Double pValue = contrast.getPvalue();
                    Double tStat = contrast.getTstat();
                    String formattedPvalue = pValue == null ? ""
                            : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, pValue);
                    String formattedFoldChange = foldChange == null ? ""
                            : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, foldChange);
                    String formattedTStat = tStat == null ? ""
                            : String.format(ExpressionDataFileServiceImpl.DECIMAL_FORMAT, tStat);
                    String contrastData = "\t" + formattedFoldChange + "\t" + formattedTStat + "\t"
                            + formattedPvalue;
                    assert contrast.getFactorValue() != null;

                    factorValueIdToData.put(contrast.getFactorValue().getId(), contrastData);
                }

                // Emit them in header order, padding absent contrasts so the columns do not shift.
                for (Long factorValueId : factorValueIdOrder) {
                    String contrastData = factorValueIdToData.get(factorValueId);
                    rowBuffer.append(contrastData == null ? missingContrastData : contrastData);
                }

                buf.append(rowBuffer).append('\n');

            } // resultSet.getResults() loop
        }
        return buf.toString();
    }

    /**
     * Checks whether the given file is ok to return, or whether it should be regenerated. A file is ok to return
     * when it is readable, regeneration is not forced, and it was not last modified before {@code check}.
     *
     * @param  forceWrite whether the file should be overridden even if found.
     * @param  f          the file to check.
     * @param  check      date of the latest change to the underlying data; a file last modified before this date is
     *                    considered outdated. If null, the file is never considered outdated.
     * @return            true, if the given file is ok to be returned, false if it should be regenerated.
     */
    private boolean checkFileOkToReturn(boolean forceWrite, File f, Date check) {
        if (!f.canRead()) {
            ExpressionDataFileServiceImpl.log
                    .info(String.format(ExpressionDataFileServiceImpl.MSG_FILE_NOT_EXISTS, f.getPath()));
            return false;
        }

        if (forceWrite) {
            ExpressionDataFileServiceImpl.log
                    .info(String.format(ExpressionDataFileServiceImpl.MSG_FILE_FORCED, f.getPath()));
            return false;
        }

        // The file is stale when the data changed after the file was written, i.e. the file's modification
        // time lies BEFORE the check date (the previous test was inverted and also failed on a null date).
        Date modified = new Date(f.lastModified());
        if (check != null && modified.before(check)) {
            ExpressionDataFileServiceImpl.log
                    .info(String.format(ExpressionDataFileServiceImpl.MSG_FILE_OUTDATED, f.getPath()));
            return false;
        }

        ExpressionDataFileServiceImpl.log
                .info(String.format(ExpressionDataFileServiceImpl.MSG_FILE_EXISTS, f.getPath()));
        return true;
    }

    /**
     * Produces the header plus the tab-delimited result table for all result sets of an analysis.
     *
     * @param  analysis        the analysis (might not be persistent); it is fully thawed when it has an id
     * @param  geneAnnotations annotation fields keyed by element id
     * @param  config          passed through to the header generation
     * @return                 the file content, or an empty string when the analysis has no result sets
     */
    private String convertDiffExpressionAnalysisData(DifferentialExpressionAnalysis analysis,
            Map<Long, String[]> geneAnnotations, DifferentialExpressionAnalysisConfig config) {
        boolean persistent = analysis.getId() != null;
        if (persistent) {
            analysis = differentialExpressionAnalysisService.thawFully(analysis);
        }

        Collection<ExpressionAnalysisResultSet> resultSets = analysis.getResultSets();
        boolean nothingToWrite = resultSets == null || resultSets.isEmpty();
        if (nothingToWrite) {
            ExpressionDataFileServiceImpl.log.warn("No differential expression results found for " + analysis);
            return "";
        }

        StringBuilder content = new StringBuilder();
        content.append(
                this.makeDiffExpressionFileHeader(analysis, analysis.getResultSets(), geneAnnotations, config));
        this.analysisResultSetsToString(resultSets, geneAnnotations, content);
        return content.toString();
    }

    /**
     * Produces the header plus the tab-delimited contrast table (see
     * {@code analysisResultSetWithContrastsToString}) for a single result set.
     *
     * @param  resultSet       the result set to convert
     * @param  geneAnnotations annotation fields keyed by element id
     * @param  config          passed through to the header generation
     * @return                 the header immediately followed by the contrast data
     */
    private String convertDiffExpressionResultSetData(ExpressionAnalysisResultSet resultSet,
            Map<Long, String[]> geneAnnotations, DifferentialExpressionAnalysisConfig config) {
        StringBuilder content = new StringBuilder();
        // header first ...
        content.append(this.makeDiffExpressionResultSetFileHeader(resultSet, geneAnnotations, config));
        // ... then the contrasts data
        content.append(this.analysisResultSetWithContrastsToString(resultSet, geneAnnotations));
        return content.toString();
    }

    /**
     * Deletes a file if it is writable, logging the deletion when it succeeds. Nothing is logged otherwise.
     *
     * @param f1 the file to delete
     */
    private void deleteAndLog(File f1) {
        if (!f1.canWrite()) {
            return;
        }
        boolean deleted = f1.delete();
        if (deleted) {
            ExpressionDataFileServiceImpl.log.info("Deleted: " + f1);
        }
    }

    /**
     * Collects the distinct array designs of the design elements of the given vectors.
     *
     * @param  vectors the vectors to inspect
     * @return         the array designs, as a set
     */
    private Collection<ArrayDesign> getArrayDesigns(Collection<? extends DesignElementDataVector> vectors) {
        Collection<ArrayDesign> platforms = new HashSet<>();
        Iterator<? extends DesignElementDataVector> it = vectors.iterator();
        while (it.hasNext()) {
            DesignElementDataVector vector = it.next();
            platforms.add(vector.getDesignElement().getArrayDesign());
        }
        return platforms;
    }

    /**
     * @return name, without full path, of the coexpression file: experiment id, cleaned short name, "_coExp" and
     *         the compressed data file suffix.
     */
    private String getCoexpressionDataFilename(ExpressionExperiment ee) {
        StringBuilder name = new StringBuilder();
        name.append(ee.getId()).append("_");
        name.append(FileTools.cleanForFileName(ee.getShortName())).append("_coExp");
        name.append(ExpressionDataFileService.DATA_FILE_SUFFIX_COMPRESSED);
        return name.toString();
    }

    /**
     * Builds the data file name: experiment id, cleaned short name, "_expmat", then the two given parts.
     *
     * @param  ee          the experiment
     * @param  filteredAdd extra name part distinguishing filtered from unfiltered files (may be empty)
     * @param  suffix      file suffix
     * @return             Name, without full path.
     */
    private String getDataFileName(ExpressionExperiment ee, String filteredAdd, String suffix) {
        StringBuilder name = new StringBuilder();
        name.append(ee.getId()).append("_");
        name.append(FileTools.cleanForFileName(ee.getShortName())).append("_expmat");
        name.append(filteredAdd).append(suffix);
        return name.toString();
    }

    /**
     * Fetches the data matrix of an experiment.
     *
     * @param  ee       the experiment; it is thawed (lite) first
     * @param  filtered whether to return the filtered matrix (with the minimum sample and minimum row thresholds
     *                  ignored) rather than the processed matrix
     * @return          the matrix
     */
    private ExpressionDataDoubleMatrix getDataMatrix(ExpressionExperiment ee, boolean filtered) {
        ee = expressionExperimentService.thawLite(ee);

        if (!filtered) {
            return expressionDataMatrixService.getProcessedExpressionDataMatrix(ee);
        }

        FilterConfig lenientFilter = new FilterConfig();
        lenientFilter.setIgnoreMinimumSampleThreshold(true);
        lenientFilter.setIgnoreMinimumRowsThreshold(true);
        return expressionDataMatrixService.getFilteredMatrix(ee, lenientFilter);
    }

    /**
     * @return name, without full path, of the design file: experiment id, cleaned short name, "_expdesign" and the
     *         compressed data file suffix.
     */
    private String getDesignFileName(ExpressionExperiment ee) {
        StringBuilder name = new StringBuilder();
        name.append(ee.getId()).append("_");
        name.append(FileTools.cleanForFileName(ee.getShortName())).append("_expdesign");
        name.append(ExpressionDataFileService.DATA_FILE_SUFFIX_COMPRESSED);
        return name.toString();
    }

    /**
     * Builds the archive file name of an analysis: id of the analyzed experiment or subset, cleaned short name of
     * the (source) experiment, "_diffExpAnalysis", the analysis id when there is one, and the archive suffix.
     *
     * @param  diff                          the analysis
     * @return                               the file name, without path
     * @throws UnsupportedOperationException if the analyzed set is neither an experiment nor a subset
     */
    private String getDiffExArchiveFileName(DifferentialExpressionAnalysis diff) {
        BioAssaySet analyzed = diff.getExperimentAnalyzed();

        // The short name always comes from the full experiment, even when a subset was analyzed.
        ExpressionExperiment sourceExperiment;
        if (analyzed instanceof ExpressionExperiment) {
            sourceExperiment = (ExpressionExperiment) analyzed;
        } else if (analyzed instanceof ExpressionExperimentSubSet) {
            sourceExperiment = ((ExpressionExperimentSubSet) analyzed).getSourceExperiment();
        } else {
            throw new UnsupportedOperationException("Don't know about " + analyzed.getClass().getName());
        }

        StringBuilder name = new StringBuilder();
        name.append(analyzed.getId()).append("_");
        name.append(FileTools.cleanForFileName(sourceExperiment.getShortName())).append("_diffExpAnalysis");
        if (diff.getId() != null) {
            name.append("_").append(diff.getId());
        }
        name.append(ExpressionDataFileService.DATA_ARCHIVE_FILE_SUFFIX);
        return name.toString();
    }

    /**
     * Returns the archive file for a differential expression analysis, writing it first when needed.
     * <p>
     * An existing, readable archive is reused unless {@code forceCreate} is set or the file was last modified more
     * than one year ago.
     *
     * @param analysis    the analysis whose archive is wanted
     * @param forceCreate if true, the archive is always (re-)written
     * @return the archive file
     * @throws RuntimeException wrapping any {@link IOException} raised while writing the archive
     */
    private File getDiffExpressionAnalysisArchiveFile(DifferentialExpressionAnalysis analysis,
            boolean forceCreate) {
        File archive = this.getOutputFile(this.getDiffExArchiveFileName(analysis));

        // An archive older than one year is treated as stale and regenerated even without an explicit request.
        boolean regenerate = forceCreate;
        if (!regenerate && archive.canRead()) {
            Date lastWritten = new Date(archive.lastModified());
            Calendar oneYearAgo = Calendar.getInstance();
            oneYearAgo.add(Calendar.YEAR, -1);
            regenerate = lastWritten.before(oneYearAgo.getTime());
        }

        // Nothing forces regeneration and the file can be read: hand back what is already there.
        if (!regenerate && archive.canRead()) {
            ExpressionDataFileServiceImpl.log.info(archive + " exists, not regenerating");
            return archive;
        }

        // (Re-)create the file from the fully thawed analysis.
        DifferentialExpressionAnalysis thawed = this.differentialExpressionAnalysisService.thawFully(analysis);
        BioAssaySet analyzedSet = thawed.getExperimentAnalyzed();
        try {
            this.writeDiffExArchiveFile(analyzedSet, thawed, null);
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }

        return archive;
    }

    /**
     * Renders a factor value as a short token usable as a name suffix.
     * <p>
     * The text is taken from the first available source, in this order: the characteristic values joined with "_",
     * the measurement value, the plain value. Every run of non-word characters in it is then replaced by a single
     * ".".
     *
     * @param fv factor value, may be null
     * @return the rendered token; the literal "null" for a null argument, or "no_data" when none of the sources is
     *         available
     */
    private String getFactorValueString(FactorValue fv) {
        if (fv == null) {
            return "null";
        }

        String raw;
        if (fv.getCharacteristics() != null && !fv.getCharacteristics().isEmpty()) {
            StringBuilder joined = new StringBuilder();
            boolean first = true;
            for (Characteristic characteristic : fv.getCharacteristics()) {
                if (!first) {
                    joined.append("_");
                }
                joined.append(characteristic.getValue());
                first = false;
            }
            raw = joined.toString();
        } else if (fv.getMeasurement() != null) {
            raw = fv.getMeasurement().getValue();
        } else if (fv.getValue() != null && !fv.getValue().isEmpty()) {
            raw = fv.getValue();
        } else {
            return "no_data";
        }

        // R-friendly, but no need to add "X" to the beginning since this is a suffix.
        return raw.replaceAll("[\\W]+", ".");
    }

    /**
     * Reads the annotation file of every given platform and merges the results into one map. Each platform is
     * thawed before its annotations are read; on duplicate keys the platform read later wins.
     *
     * @param ads platforms to read annotations for
     * @return Map of composite sequence ids to an array of strings: [probe name, genes symbol(s), gene Name(s), gemma
     *         id(s), ncbi id(s)].
     */
    private Map<Long, String[]> getGeneAnnotationsAsStrings(Collection<ArrayDesign> ads) {
        Map<Long, String[]> merged = new HashMap<>();
        for (ArrayDesign platform : ads) {
            ArrayDesign thawed = arrayDesignService.thaw(platform);
            Map<Long, String[]> forPlatform = ArrayDesignAnnotationServiceImpl.readAnnotationFileAsString(thawed);
            merged.putAll(forPlatform);
        }
        return merged;
    }

    /**
     * Reads the annotation file of every given platform and keys the annotation rows by the platform's composite
     * sequence entities rather than by id. Rows whose id does not match any composite sequence of the (thawed)
     * platform are dropped.
     *
     * @param ads platforms to read annotations for
     * @return map of composite sequence to its annotation strings
     */
    private Map<CompositeSequence, String[]> getGeneAnnotationsAsStringsByProbe(Collection<ArrayDesign> ads) {
        Map<CompositeSequence, String[]> byProbe = new HashMap<>();

        for (ArrayDesign platform : ads) {
            ArrayDesign thawed = arrayDesignService.thaw(platform);
            Map<Long, CompositeSequence> probesById = EntityUtils.getIdMap(thawed.getCompositeSequences());
            Map<Long, String[]> rowsById = ArrayDesignAnnotationServiceImpl.readAnnotationFileAsString(thawed);

            for (Entry<Long, String[]> row : rowsById.entrySet()) {
                Long probeId = row.getKey();
                if (probesById.containsKey(probeId)) {
                    byProbe.put(probesById.get(probeId), row.getValue());
                }
            }
        }

        return byProbe;
    }

    /**
     * Prepares a fresh file for the JSON output of a quantitation type inside the data directory. A file already
     * present at that location is deleted (with a warning); the parent directories and the file itself are then
     * created.
     *
     * @param type quantitation type the file is named after
     * @return the newly created file
     * @throws IOException declared for the file operations delegated to the entity utilities
     */
    private File getJSONOutputFile(QuantitationType type) throws IOException {
        String name = this.getJSONOutputFilename(type);
        File target = new File(ExpressionDataFileService.DATA_DIR + name);

        if (target.exists()) {
            ExpressionDataFileServiceImpl.log.warn("Will overwrite existing file " + target);
            EntityUtils.deleteFile(target);
        }

        EntityUtils.mkdirs(target.getParentFile());
        EntityUtils.createFile(target);
        return target;
    }

    /**
     * Derives the JSON file name for a quantitation type from its cleaned name plus the JSON suffix.
     *
     * @param type quantitation type to name the file after
     * @return Name, without full path.
     */
    private String getJSONOutputFilename(QuantitationType type) {
        String cleanedName = FileTools.cleanForFileName(type.getName());
        return cleanedName + ExpressionDataFileService.JSON_FILE_SUFFIX;
    }

    /**
     * Resolves the output file for a quantitation type by name, delegating to the name-based overload.
     *
     * @param type quantitation type the file is named after
     * @return the file returned by the name-based overload for this type's output file name
     */
    private File getOutputFile(QuantitationType type) {
        return this.getOutputFile(this.getOutputFilename(type));
    }

    /**
     * Derives the data file name for a quantitation type: its id, an underscore, its cleaned name and the
     * compressed data file suffix.
     *
     * @param type quantitation type to name the file after
     * @return Name, without full path.
     */
    private String getOutputFilename(QuantitationType type) {
        String stem = type.getId() + "_" + FileTools.cleanForFileName(type.getName());
        return stem + ExpressionDataFileService.DATA_FILE_SUFFIX_COMPRESSED;
    }

    /**
     * Builds the comment header and the leading column names of the analysis-level differential expression summary
     * file.
     * <p>
     * The analysis settings written into the header come from the first available source: the supplied
     * {@code config}, the description of the analysis protocol, or (for backwards compatibility with stored analyses
     * lacking protocol information) a hand-assembled list of the subset and the factors of each result set.
     *
     * @param analysis        the analysis being written; thawed first if it has an id
     * @param resultSets      result sets of the analysis; only read in the fallback case, for their factors
     * @param geneAnnotations gene annotations; only checked for emptiness here, to add a notice to the header
     * @param config          configuration used for the analysis, may be null
     * @return the header text. It ends with the first four column names and deliberately has no trailing newline,
     *         because the caller still has to append the remaining column names.
     */
    private String makeDiffExpressionFileHeader(DifferentialExpressionAnalysis analysis,
            Collection<ExpressionAnalysisResultSet> resultSets, Map<Long, String[]> geneAnnotations,
            DifferentialExpressionAnalysisConfig config) {

        if (analysis.getId() != null) { // It might not be a persistent analysis: using -nodb
            differentialExpressionAnalysisService.thaw(analysis);
        }

        StringBuilder buf = new StringBuilder();

        BioAssaySet bas = analysis.getExperimentAnalyzed();

        ExpressionExperiment ee = ExpressionDataFileServiceImpl.experimentForBioAssaySet(bas);

        Date timestamp = new Date(System.currentTimeMillis());
        buf.append("# Differential expression analysis for:  ").append(ee.getShortName()).append(" : ")
                .append(ee.getName()).append(" (ID=").append(ee.getId()).append(")\n");

        buf.append(
                "# This file contains summary statistics for the factors included in the analysis (e.g. ANOVA effects); "
                        + "details of contrasts are in separate files.\n");

        // It might not be a persistent analysis.
        if (analysis.getId() != null) {
            buf.append("# Analysis ID = ").append(analysis.getId()).append("\n");
        } else {
            buf.append("# Analysis was not persisted to the database\n");
        }

        if (config != null) {
            buf.append(config.toString());
        } else if (analysis.getProtocol() != null
                && StringUtils.isNotBlank(analysis.getProtocol().getDescription())) {
            buf.append(analysis.getProtocol().getDescription());
        } else {
            // This can happen if we are re-writing files for a stored analysis that didn't get proper protocol
            // information saved. Basically this is here for backwards compatibility.
            ExpressionDataFileServiceImpl.log.warn(
                    "No configuration or protocol available, adding available analysis information to header");
            // Each header comment needs its own line; without the newline this notice ran into "# Factors:".
            buf.append("# Configuration information was not fully available\n");
            buf.append("# Factors:\n");

            if (analysis.getSubsetFactorValue() != null) {
                buf.append("# Subset ID=").append(bas.getId()).append("\n");
                buf.append("# Subset factor ").append(analysis.getSubsetFactorValue().getExperimentalFactor())
                        .append("\n");
                buf.append("# Subset is of samples with ").append(analysis.getSubsetFactorValue()).append("\n");
            }

            for (ExpressionAnalysisResultSet rs : resultSets) {
                String f = StringUtils.join(rs.getExperimentalFactors(), ":");
                buf.append("# ").append(f).append("\n");
            }
        }

        buf.append("# Generated by Gemma ").append(timestamp).append(" \n");

        buf.append(ExpressionDataFileService.DISCLAIMER);

        // Different Headers if Gene Annotations missing.
        if (geneAnnotations.isEmpty()) {
            buf.append("#\n# The gene annotations were not available\n");
            // but leave the blank columns there to make parsing easier.
        }
        buf.append("Element_Name\tGene_Symbol\tGene_Name\tNCBI_ID");// column information

        // Note we don't put a newline here, because the rest of the headers have to be added for the pvalue columns.

        return buf.toString();

    }

    /**
     * Builds the comment header and the leading column names of the file holding the contrasts of one result set.
     * <p>
     * The analysis settings come from the first available source: the supplied {@code config}, the description of
     * the analysis protocol, or a hand-assembled description of the subset (if the analysis has a subset factor
     * value). A warning line is added when the experiment service reports a batch confound for the experiment.
     *
     * @param resultSet       the result set being written
     * @param geneAnnotations gene annotations; only checked for emptiness here, to add a notice to the header
     * @param config          configuration used for the analysis, may be null
     * @return the header text. It ends with the first four column names and deliberately has no trailing newline,
     *         because the caller still has to append the remaining column names.
     */
    private String makeDiffExpressionResultSetFileHeader(ExpressionAnalysisResultSet resultSet,
            Map<Long, String[]> geneAnnotations, DifferentialExpressionAnalysisConfig config) {
        DifferentialExpressionAnalysis analysis = resultSet.getAnalysis();
        BioAssaySet analyzedSet = analysis.getExperimentAnalyzed();
        ExpressionExperiment experiment = ExpressionDataFileServiceImpl.experimentForBioAssaySet(analyzedSet);
        Date generatedAt = new Date();

        StringBuilder header = new StringBuilder();

        // What this file is about.
        header.append("# Differential expression result set for:  ").append(experiment.getShortName())
                .append(" : ").append(experiment.getName()).append(" (ID=").append(experiment.getId())
                .append(")\n");
        header.append("# This file contains contrasts for:")
                .append(StringUtils.join(resultSet.getExperimentalFactors(), " x ")).append("\n");

        // Identifiers, when the entities are persistent.
        if (analysis.getId() != null) {
            header.append("# Analysis ID = ").append(analysis.getId()).append("\n");
        } else {
            header.append("# Analysis is not stored in the database\n");
        }
        if (resultSet.getId() != null) {
            header.append("# ResultSet ID = ").append(resultSet.getId()).append("\n");
        }

        /*
         * Use the config if available; otherwise the stored protocol description; as a last resort describe the
         * subset "by hand".
         */
        header.append("# Analysis configuration:\n");
        boolean hasProtocolDescription = analysis.getProtocol() != null
                && StringUtils.isNotBlank(analysis.getProtocol().getDescription());
        if (config != null) {
            header.append(config);
        } else if (hasProtocolDescription) {
            header.append(analysis.getProtocol().getDescription());
        } else {
            ExpressionDataFileServiceImpl.log
                    .warn("Full configuration not available, adding available analysis information to header");
            if (analysis.getSubsetFactorValue() != null) {
                header.append("# This analysis is for subset ID=").append(analyzedSet.getId()).append("\n");
                header.append("# The subsetting factor was ")
                        .append(analysis.getSubsetFactorValue().getExperimentalFactor()).append("\n");
                header.append("# This subset is of samples with ").append(analysis.getSubsetFactorValue())
                        .append("\n");
            }
        }

        if (expressionExperimentService.getBatchConfound(experiment) != null) {
            header.append("# !!! Warning, this dataset has a batch confound with the factors analysed\n");
        }

        header.append("#\n# Generated by Gemma ").append(generatedAt).append(" \n");
        header.append(ExpressionDataFileService.DISCLAIMER).append("#\n");

        if (geneAnnotations.isEmpty()) {
            // Say so, but leave the blank columns there to make parsing easier.
            header.append(
                    "# The platform annotation file is missing for this Experiment, gene annotation information is omitted\n#\n");
        }

        // No newline after the column names: the caller adds the remaining (pvalue etc.) column headers.
        header.append("Element_Name\tGene_Symbol\tGene_Name\tNCBI_ID");
        return header.toString();
    }

    /**
     * Loads the gene-to-gene coexpression links for a given expression experiment and writes them to disk as a
     * gzipped, tab-delimited text file: a commented header, a column-name row, then one row per link.
     * <p>
     * If the experiment has no links, a warning is logged and no file is written.
     *
     * @param file destination file
     * @param ee   experiment whose coexpression links are written
     * @throws IOException if the file cannot be written
     */
    private void writeCoexpressionData(File file, ExpressionExperiment ee) throws IOException {

        Taxon tax = expressionExperimentService.getTaxon(ee);
        assert tax != null;

        Collection<CoexpressionValueObject> geneLinks = gene2geneCoexpressionService.getCoexpression(ee, true);

        if (geneLinks.isEmpty()) {
            ExpressionDataFileServiceImpl.log
                    .warn("No coexpression links for this experiment, file will not be created: " + ee);
            return;
        }

        ExpressionDataFileServiceImpl.log.info("Creating new coexpression data file: " + file.getAbsolutePath());

        Date timestamp = new Date(System.currentTimeMillis());
        StringBuilder buf = new StringBuilder();

        // Write header information
        buf.append("# Coexpression data for:  ").append(ee.getShortName()).append(" : ").append(ee.getName())
                .append(" \n");
        buf.append("# Generated On: ").append(timestamp).append(" \n");
        buf.append(
                "# Links are listed in an arbitrary order with an indication of positive or negative correlation\n");
        buf.append(ExpressionDataFileService.DISCLAIMER);
        buf.append("GeneSymbol1\tGeneSymbol2\tDirection\tSupport\n");

        // Data
        // NOTE(review): the column row above announces a "Support" column, but no support value is written per
        // link; confirm which accessor of CoexpressionValueObject should supply it.
        for (CoexpressionValueObject link : geneLinks) {

            buf.append(link.getQueryGeneSymbol()).append("\t").append(link.getCoexGeneSymbol()).append("\t");

            // The newline must be appended outside the conditional: "+" binds tighter than "?:", so
            // cond ? "+" : "-" + "\n" only terminated the rows of negatively correlated links.
            buf.append(link.isPositiveCorrelation() ? "+" : "-").append("\n");
        }

        // Write coexpression data to file (zipped of course)
        try (Writer writer = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))) {
            writer.write(buf.toString());
        }

    }

    /**
     * Writes the expression data matrix of an experiment, together with the gene annotations of the platforms it
     * uses, to the given file.
     *
     * @param ee       experiment to write data for
     * @param filtered if true, the filtered matrix is written instead of the full processed one
     * @param f        destination file
     * @param compress if true, file will be output in GZIP format.
     * @return the destination file {@code f}
     * @throws IOException if writing fails
     */
    private File writeDataFile(ExpressionExperiment ee, boolean filtered, File f, boolean compress)
            throws IOException {
        ExpressionDataFileServiceImpl.log.info("Creating new expression data file: " + f.getName());

        ExpressionDataDoubleMatrix dataMatrix = this.getDataMatrix(ee, filtered);
        Collection<ArrayDesign> platforms = expressionExperimentService.getArrayDesignsUsed(ee);
        Map<CompositeSequence, String[]> annotationsByProbe = this.getGeneAnnotationsAsStringsByProbe(platforms);

        this.writeMatrix(f, annotationsByProbe, dataMatrix, compress);
        return f;
    }

    /**
     * Writes out the experimental design for the given experiment, GZIP-compressed. The bioassays (col 0) match the
     * header row of the data matrix printed out by the {@link MatrixWriter}.
     *
     * @param file                 destination file
     * @param expressionExperiment experiment whose design is written
     * @return file that was written
     * @throws IOException if writing fails
     */
    private File writeDesignMatrix(File file, ExpressionExperiment expressionExperiment) throws IOException {
        try (Writer out = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))) {
            new ExperimentalDesignWriter().write(out, expressionExperiment, true);
        }
        return file;
    }

    /**
     * Thaws the given vectors, builds an expression data matrix from them and writes it as GZIP-compressed JSON.
     *
     * @param file    destination file
     * @param vectors data vectors to write
     * @throws IOException if writing fails
     */
    private void writeJson(File file, Collection<DesignElementDataVector> vectors) throws IOException {
        this.rawExpressionDataVectorService.thawRawAndProcessed(vectors);
        ExpressionDataMatrix<?> matrix = ExpressionDataMatrixBuilder.getMatrix(vectors);

        try (Writer out = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))) {
            new MatrixWriter().writeJSON(out, matrix);
        }
    }

    /**
     * Writes an expression data matrix to the given file as GZIP-compressed JSON.
     *
     * @param file                 destination file
     * @param expressionDataMatrix matrix to write
     * @throws IOException if writing fails
     */
    private void writeJson(File file, ExpressionDataMatrix<?> expressionDataMatrix) throws IOException {
        try (Writer out = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))) {
            new MatrixWriter().writeJSON(out, expressionDataMatrix);
        }
    }

    /**
     * Writes a matrix with gene annotations to the given file, GZIP-compressed. Convenience overload of the
     * four-argument variant with compression switched on.
     *
     * @param file                 destination file
     * @param geneAnnotations      annotation strings per composite sequence
     * @param expressionDataMatrix matrix to write
     * @throws IOException if writing fails
     */
    private void writeMatrix(File file, Map<CompositeSequence, String[]> geneAnnotations,
            ExpressionDataMatrix<?> expressionDataMatrix) throws IOException {
        final boolean gzipped = true;
        this.writeMatrix(file, geneAnnotations, expressionDataMatrix, gzipped);
    }

    /**
     * Writes a matrix with stringified gene annotations to the given file, optionally GZIP-compressed.
     *
     * @param file                 destination file
     * @param geneAnnotations      annotation strings per composite sequence
     * @param expressionDataMatrix matrix to write
     * @param gzipped              if true the output is GZIP-compressed, otherwise it is written as plain text
     * @throws IOException if writing fails
     */
    private void writeMatrix(File file, Map<CompositeSequence, String[]> geneAnnotations,
            ExpressionDataMatrix<?> expressionDataMatrix, boolean gzipped) throws IOException {
        MatrixWriter matrixWriter = new MatrixWriter();

        // Only the sink differs between the two modes; the write call itself is the same.
        OutputStream sink = gzipped
                ? new GZIPOutputStream(new FileOutputStream(file))
                : new FileOutputStream(file);

        try (Writer out = new OutputStreamWriter(sink)) {
            matrixWriter.writeWithStringifiedGeneAnnotations(out, expressionDataMatrix, geneAnnotations, true);
        }
    }

    /**
     * Thaws the given vectors, builds an expression data matrix from them and writes it, with gene annotations, to
     * the given file via the three-argument {@code writeMatrix} overload.
     *
     * @param file            destination file
     * @param vectors         data vectors to write
     * @param geneAnnotations annotation strings per composite sequence
     * @throws IOException if writing fails
     */
    private void writeVectors(File file, Collection<DesignElementDataVector> vectors,
            Map<CompositeSequence, String[]> geneAnnotations) throws IOException {
        this.rawExpressionDataVectorService.thawRawAndProcessed(vectors);
        ExpressionDataMatrix<?> matrix = ExpressionDataMatrixBuilder.getMatrix(vectors);
        this.writeMatrix(file, geneAnnotations, matrix);
    }

}