chibi.gemmaanalysis.ExperimentMetaDataExtractorCli.java Source code

Java tutorial

Introduction

Here is the source code for chibi.gemmaanalysis.ExperimentMetaDataExtractorCli.java

Source

/*
 * The GemmaAnalysis project
 * 
 * Copyright (c) 2014 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package chibi.gemmaanalysis;

import gemma.gsec.SecurityService;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;

import ubic.basecode.dataStructure.CountingMap;
import ubic.gemma.analysis.preprocess.OutlierDetails;
import ubic.gemma.analysis.preprocess.OutlierDetectionService;
import ubic.gemma.analysis.preprocess.batcheffects.BatchEffectDetails;
import ubic.gemma.analysis.preprocess.batcheffects.BatchInfoPopulationServiceImpl;
import ubic.gemma.analysis.preprocess.filter.FilterConfig;
import ubic.gemma.analysis.preprocess.svd.SVDService;
import ubic.gemma.analysis.preprocess.svd.SVDValueObject;
import ubic.gemma.analysis.service.ExpressionDataMatrixService;
import ubic.gemma.analysis.util.ExperimentalDesignUtils;
import ubic.gemma.apps.ExpressionExperimentManipulatingCLI;
import ubic.gemma.apps.GemmaCLI.CommandGroup;
import ubic.gemma.expression.experiment.service.ExperimentalDesignService;
import ubic.gemma.model.common.auditAndSecurity.AuditTrailService;
import ubic.gemma.model.common.auditAndSecurity.Status;
import ubic.gemma.model.common.auditAndSecurity.StatusService;
import ubic.gemma.model.common.description.BibliographicReference;
import ubic.gemma.model.common.quantitationtype.QuantitationType;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.experiment.BioAssaySet;
import ubic.gemma.model.expression.experiment.ExperimentalDesign;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet;
import ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject;
import ubic.gemma.util.FactorValueVector;
import ubic.gemma.util.Settings;

/**
 * Extracts expression experiment meta data such as experimental design, array design, outlier count, and publication
 * into a .txt.gz TSV file. See Bug 3968.
 * 
 * @author paul
 * @version $Id$
 */
public class ExperimentMetaDataExtractorCli extends ExpressionExperimentManipulatingCLI {

    private static final String EXPERIMENT_META_DATA_BASENAME = "ExperimentMetaData";
    private static final String VIEW_FILE_SUFFIX = ".txt.gz";
    public static final String DEFAULT_VIEW_FILE = Settings.getString("gemma.appdata.home") + File.separatorChar
            + "dataFiles" + File.separatorChar + EXPERIMENT_META_DATA_BASENAME + VIEW_FILE_SUFFIX;
    private static final String NA = "";
    private OutlierDetectionService outlierDetectionService;
    private StatusService statusService;
    private ExperimentalDesignService edService;
    private SecurityService securityService;
    private String viewFile = DEFAULT_VIEW_FILE;
    private SVDService svdService;
    private ExpressionDataMatrixService expressionDataMatrixService;

    @Override
    public CommandGroup getCommandGroup() {
        return CommandGroup.METADATA;
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.util.AbstractCLI#doWork(java.lang.String[])
     */
    @Override
    protected Exception doWork(String[] args) {
        super.processCommandLine(args);
        auditTrailService = getBean(AuditTrailService.class);
        outlierDetectionService = getBean(OutlierDetectionService.class);
        statusService = getBean(StatusService.class);
        edService = getBean(ExperimentalDesignService.class);
        securityService = getBean(SecurityService.class);
        svdService = getBean(SVDService.class);
        expressionDataMatrixService = getBean(ExpressionDataMatrixService.class);

        process(super.expressionExperiments);

        return null;
    }

    @Override
    @SuppressWarnings("static-access")
    protected void buildOptions() {
        super.buildOptions();

        Option expOption = OptionBuilder.hasArg().withArgName("outfile").withDescription("GZipped output filename")
                .withLongOpt("outfile").create('o');

        addOption(expOption);

        // to keep troubled experiments
        this.addForceOption();
    }

    @Override
    protected void processOptions() {
        super.processOptions();

        if (hasOption('o')) {
            this.viewFile = getOptionValue('o');
            log.info("GZipped txt output will be written to " + viewFile);
        } else {
            this.viewFile = DEFAULT_VIEW_FILE;
        }

    }

    /**
     * @param bas
     */
    private void process(Collection<BioAssaySet> expressionExperiments) {
        try {
            generateExperimentMetaData(expressionExperiments);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public File getOutputFile(String filename) {
        String fullFilePath = filename;
        File f = new File(fullFilePath);

        if (f.exists()) {
            return f;
        }

        File parentDir = f.getParentFile();
        if (!parentDir.exists())
            parentDir.mkdirs();
        return f;
    }

    public Collection<BatchEffectDetails> getBatchEffect(ExpressionExperiment ee, int maxcomp) {
        Collection<BatchEffectDetails> ret = new ArrayList<>();

        for (ExperimentalFactor ef : ee.getExperimentalDesign().getExperimentalFactors()) {
            if (BatchInfoPopulationServiceImpl.isBatchFactor(ef)) {
                SVDValueObject svd = svdService.getSvdFactorAnalysis(ee.getId());
                if (svd == null)
                    break;

                for (Integer component : svd.getFactorPvals().keySet()) {
                    if (component.intValue() >= maxcomp) {
                        break;
                    }
                    Map<Long, Double> cmpEffects = svd.getFactorPvals().get(component);

                    Double pval = cmpEffects.get(ef.getId());
                    if (pval != null) {
                        BatchEffectDetails details = new BatchEffectDetails();
                        details.setPvalue(pval.doubleValue());
                        details.setComponent(new Integer(component.intValue() + 1));
                        details.setComponentVarianceProportion(
                                svd.getVariances()[component.intValue()].doubleValue());
                        details.setHasBatchInformation(true);
                        ret.add(details);
                    }

                }
            }
        }
        return ret;
    }

    public void generateExperimentMetaData(Collection<BioAssaySet> expressionExperiments) throws IOException {

        File file = getOutputFile(this.viewFile);

        try (Writer writer = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)));) {

            String[] colNames = { "ShortName", "Taxon", "DateUpload", "IsPublic", "NumPlatform", "Platform",
                    "Channel", "IsExonArray", "QtIsRatio", "QtIsNormalized", "QtScale", "NumProfiles",
                    "NumFilteredProfiles", "NumSamples", "NumConditions", "NumReplicatesPerCondition",
                    "PossibleOutliers", "CuratedOutlier", "IsTroubled", "PubTroubled", "PubYear", "PubJournal",
                    "Batch.PC1.Var", "Batch.PC2.Var", "Batch.PC3.Var", "Batch.PC1.Pval", "Batch.PC2.Pval",
                    "Batch.PC3.Pval", "NumFactors", "FactorNames", "FactorCategories", "NumFactorValues" };
            // log.info( StringUtils.join( colNames, "\t" ) + "\n" );
            writer.write(StringUtils.join(colNames, "\t") + "\n");

            int i = 0;
            Collection<String> failedEEs = new ArrayList<>();

            StopWatch timer = new StopWatch();
            timer.start();

            for (BioAssaySet bas : expressionExperiments) {
                /*
                 * Skip subsets
                 */
                if (bas instanceof ExpressionExperimentSubSet)
                    return;

                try {

                    ExpressionExperiment ee = (ExpressionExperiment) bas;
                    ee = eeService.thawLite(ee);
                    ExpressionExperimentValueObject vo = eeService.loadValueObject(ee.getId());
                    vo.setIsPublic(!securityService.isPrivate(ee));
                    log.info("Processing (" + ++i + "/" + expressionExperiments.size() + ") : " + ee);

                    BibliographicReference primaryPublication = ee.getPrimaryPublication();
                    Status pubStatus = primaryPublication != null ? statusService.getStatus(primaryPublication)
                            : null;
                    Collection<ArrayDesign> arrayDesignsUsed = eeService.getArrayDesignsUsed(ee);

                    Collection<String> arrayDesignIsExon = new ArrayList<>();
                    Collection<String> arrayDesignTechTypes = new ArrayList<>();
                    Collection<String> arrayDesignShortNames = new ArrayList<>();

                    // for multiple platforms e.g. GSE5949
                    for (ArrayDesign ad : arrayDesignsUsed) {
                        arrayDesignShortNames.add(ad.getShortName());
                        arrayDesignTechTypes.add(ad.getTechnologyType().getValue());
                        arrayDesignIsExon.add(ad.getName().toLowerCase().contains("exon") + "");
                    }

                    QuantitationType qt = null;
                    for (QuantitationType q : ee.getQuantitationTypes()) {
                        if (q.getIsPreferred().booleanValue()) {
                            qt = q;
                            break;
                        }
                    }

                    int manualOutlierCount = 0;
                    for (BioAssay ba : ee.getBioAssays()) {
                        if (ba.getIsOutlier().booleanValue()) {
                            manualOutlierCount++;
                        }
                    }

                    ExperimentalDesign experimentalDesign = edService.load(vo.getExperimentalDesign());

                    // Batch PCs
                    int maxcomp = 3;
                    BatchEffectDetails batchEffectPC1 = null;
                    BatchEffectDetails batchEffectPC2 = null;
                    BatchEffectDetails batchEffectPC3 = null;
                    Collection<BatchEffectDetails> batchEffects = getBatchEffect(ee, maxcomp);
                    Iterator<BatchEffectDetails> batchEffectsIterator;
                    if (batchEffects == null || batchEffects.size() == 0) {
                        log.warn("No batch effect info");
                    } else {
                        batchEffectsIterator = batchEffects.iterator();
                        if (batchEffectsIterator.hasNext()) {
                            batchEffectPC1 = batchEffectsIterator.next();
                        }
                        if (batchEffectsIterator.hasNext()) {
                            batchEffectPC2 = batchEffectsIterator.next();
                        }
                        if (batchEffectsIterator.hasNext()) {
                            batchEffectPC3 = batchEffectsIterator.next();
                        }
                    }

                    // eeService.getExperimentsWithOutliers();
                    StopWatch timerOutlier = new StopWatch();
                    timerOutlier.start();
                    // log.info( "Outlier detection service started " + timer.getTime() + "ms" );
                    Collection<OutlierDetails> possibleOutliers = outlierDetectionService.identifyOutliers(ee);
                    if (timerOutlier.getTime() > 10000) {
                        log.info("Automatic outlier detection took " + timerOutlier.getTime() + "ms");
                    }
                    // Collection<OutlierDetails> possibleOutliers = null;

                    // samples per condition
                    boolean removeBatchFactor = false;
                    Collection<String> samplesPerConditionCount = new ArrayList<>();
                    CountingMap<FactorValueVector> assayCount = ExperimentalDesignUtils.getDesignMatrix(ee,
                            removeBatchFactor);
                    List<FactorValueVector> keys = assayCount.sortedKeyList(true);
                    for (FactorValueVector key : keys) {
                        samplesPerConditionCount.add(Integer.toString(assayCount.get(key).intValue()));
                    }

                    // factor names
                    Collection<ExperimentalFactor> factors = ee.getExperimentalDesign().getExperimentalFactors();
                    Collection<String> factorNames = new ArrayList<>();
                    Collection<String> factorCategories = new ArrayList<>();
                    Collection<Integer> factorValues = new ArrayList<>();
                    for (ExperimentalFactor f : factors) {
                        factorNames.add(f.getName());
                        String cat = f.getCategory() != null ? f.getCategory().getCategory() : NA;
                        factorCategories.add(cat);
                        factorValues.add(Integer.valueOf(f.getFactorValues().size()));
                    }

                    int filteredProfilesCount = -1;

                    try {
                        FilterConfig filterConfig = new FilterConfig();
                        filterConfig.setIgnoreMinimumSampleThreshold(true);
                        filteredProfilesCount = expressionDataMatrixService.getFilteredMatrix(ee, filterConfig,
                                expressionDataMatrixService.getProcessedExpressionDataVectors(ee)).rows();
                    } catch (Exception e) {
                        log.error(e.getMessage(), e);
                    }

                    String val[] = { vo.getShortName(), vo.getTaxon(),
                            DateFormat.getDateInstance(DateFormat.MEDIUM).format(vo.getDateCreated()),
                            vo != null ? Boolean.toString(vo.getIsPublic()) : NA,
                            Integer.toString(arrayDesignsUsed.size()), StringUtils.join(arrayDesignShortNames, ','),
                            StringUtils.join(arrayDesignTechTypes, ','), // arrayDesign.getTechnologyType().getValue(),
                            // ONE-COLOR, TWO-COLOR, NONE (RNA-seq
                            // GSE37646), DUAL-MODE (one or two color)
                            StringUtils.join(arrayDesignIsExon, ','), // exon GSE28383
                            qt != null ? Boolean.toString(qt.getIsRatio().booleanValue()) : NA,
                            qt != null ? Boolean.toString(qt.getIsNormalized().booleanValue()) : NA,
                            qt != null ? qt.getScale().getValue() : NA,
                            Integer.toString(vo.getProcessedExpressionVectorCount().intValue()), // NumProfiles
                            Integer.toString(filteredProfilesCount), // NumFilteredProfiles
                            Integer.toString(vo.getBioAssayCount().intValue()), // NumSamples
                            Integer.toString(assayCount.size()), // NumConditions
                            StringUtils.join(samplesPerConditionCount, ","),
                            possibleOutliers != null ? Integer.toString(possibleOutliers.size()) : NA,
                            Integer.toString(manualOutlierCount), Boolean.toString(vo.getTroubled()),
                            pubStatus != null ? Boolean.toString(pubStatus.getTroubled().booleanValue()) : NA,
                            primaryPublication != null ? DateFormat.getDateInstance(DateFormat.MEDIUM)
                                    .format(primaryPublication.getPublicationDate()) : NA,
                            primaryPublication != null ? primaryPublication.getPublication() : NA,

                            batchEffectPC1 != null
                                    ? Double.toString(batchEffectPC1.getComponentVarianceProportion())
                                    : NA,
                            batchEffectPC2 != null
                                    ? Double.toString(batchEffectPC2.getComponentVarianceProportion())
                                    : NA,
                            batchEffectPC3 != null
                                    ? Double.toString(batchEffectPC3.getComponentVarianceProportion())
                                    : NA,

                            batchEffectPC1 != null ? Double.toString(batchEffectPC1.getPvalue()) : NA,
                            batchEffectPC2 != null ? Double.toString(batchEffectPC2.getPvalue()) : NA,
                            batchEffectPC3 != null ? Double.toString(batchEffectPC3.getPvalue()) : NA,

                            // factors
                            factors != null ? Integer.toString(factors.size()) : NA, // NumFactors
                            factorNames != null ? StringUtils.join(factorNames, ",") : NA,
                            factorCategories != null ? StringUtils.join(factorCategories, ",") : NA,
                            factorValues != null ? StringUtils.join(factorValues, ",") : NA,

                    };

                    // log.info( StringUtils.join( val, "\t" ) + "\n" );
                    writer.write(StringUtils.join(val, "\t") + "\n");

                } catch (Exception e) {
                    failedEEs.add(((ExpressionExperiment) bas).getShortName());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    log.error(sw.toString());
                }
            }

            log.info("Finished processing " + expressionExperiments.size() + " datasets in " + timer.getTime()
                    + " ms. ");
            log.info("Writen to " + file);
            log.info("Number of failed experiment metadata extraction(s): " + failedEEs.size() + " / "
                    + expressionExperiments.size());

            if (failedEEs.size() > 0) {
                log.info("Skipped experiments:");
                log.info(StringUtils.join(failedEEs, ","));
            }
        }
    }

    public static void main(String[] args) {
        ExperimentMetaDataExtractorCli s = new ExperimentMetaDataExtractorCli();
        try {
            Exception ex = s.doWork(args);
            if (ex != null) {
                ex.printStackTrace();
            }

            System.exit(0);
        } catch (Exception e) {
            // throw new RuntimeException( e );
            log.error(e.getMessage(), e);
            System.exit(1);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.util.AbstractCLI#getCommandName()
     */
    @Override
    public String getCommandName() {
        // TODO Auto-generated method stub
        return null;
    }

}