de.dkfz.roddy.core.Analysis.java Source code

Introduction

Here is the source code for de.dkfz.roddy.core.Analysis.java
Source

/*
 * Copyright (c) 2016 eilslabs.
 *
 * Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/Roddy/LICENSE.txt).
 */

package de.dkfz.roddy.core;

import de.dkfz.roddy.AvailableFeatureToggles;
import de.dkfz.roddy.Roddy;
import de.dkfz.roddy.execution.io.ExecutionService;
import de.dkfz.roddy.execution.io.fs.FileSystemAccessProvider;
import de.dkfz.roddy.execution.jobs.*;
import de.dkfz.roddy.tools.LoggerWrapper;
import de.dkfz.roddy.tools.RoddyIOHelperMethods;
import de.dkfz.roddy.config.*;
import org.apache.commons.io.filefilter.WildcardFileFilter;

import java.io.File;
import java.util.*;
import java.util.logging.Level;

/**
 * An analysis represents the combination of a project, a workflow implementation and a configuration.
 * An analysis object is valid for only one project. The analysis configuration can be used by different projects.
 * Each project can have several analyses, e.g. prostate can have whole_genome_analysis or exome_analysis.
 * Each input dataset found in the project can then be processed by all of the project's analyses.
 * <p>
 * TODO Enable an initialize (cache erase?) to make it possible to reset data sets and data set states for an analysis.
 */
public class Analysis {

    /**
     * Class-wide logger, obtained for the simple class name of {@link Analysis}.
     */
    private static final de.dkfz.roddy.tools.LoggerWrapper logger = de.dkfz.roddy.tools.LoggerWrapper
            .getLogger(Analysis.class.getSimpleName());

    /**
     * An analysis should have a unique name, e.g. whole_genome_processing or exome_analysis.
     */
    private final String name;

    /**
     * An analysis has a single link to a project, but the project can have multiple analysis objects.
     */
    private final Project project;

    /**
     * An analysis has a single link to a workflow, but a workflow can be used by multiple analyses.
     */
    private final Workflow workflow;

    /**
     * An analysis is directly linked to a configuration.
     * The constructor only accepts an AnalysisConfiguration for this field.
     */
    private final Configuration configuration;

    /**
     * The basic input directory for an analysis. This directory should then contain a list of datasets.
     * The list of datasets is created by the project's runtime service.
     * The value is resolved on the first call to getInputBaseDirectory() and kept afterwards.
     */
    private File inputBaseDirectory;

    /**
     * Runtime service is set in the analysis config. But the user can also set it for a project.
     * The project then goes first, afterwards the analysis (see getRuntimeService()).
     */
    private RuntimeService runtimeService;

    /**
     * Creates an analysis which binds a workflow and an analysis configuration to a single project.
     *
     * @param name           The name of the analysis
     * @param project        The project this analysis belongs to
     * @param workflow       The workflow used by this analysis
     * @param runtimeService The runtime service of the analysis; only used if the project does not
     *                       provide one (see getRuntimeService())
     * @param configuration  The configuration of the analysis
     */
    public Analysis(String name, Project project, Workflow workflow, RuntimeService runtimeService,
            AnalysisConfiguration configuration) {
        this.runtimeService = runtimeService;
        this.configuration = configuration;
        this.workflow = workflow;
        this.project = project;
        this.name = name;
    }

    /**
     * @return The name this analysis was created with.
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return The project this analysis is linked to.
     */
    public Project getProject() {
        return this.project;
    }

    /**
     * @return The workflow this analysis is linked to.
     */
    public Workflow getWorkflow() {
        return this.workflow;
    }

    /**
     * Asks the file system access provider for the current user name.
     *
     * @return The value of callWhoAmI() or "UNKNOWN" if the lookup threw any exception.
     */
    public String getUsername() {
        String user;
        try {
            user = FileSystemAccessProvider.getInstance().callWhoAmI();
        } catch (Exception ignored) {
            // Best effort only: every failure is mapped to the placeholder value.
            user = "UNKNOWN";
        }
        return user;
    }

    /**
     * Determines the user group for this analysis.
     * The group reported by the file system access provider is the default. If the configuration value
     * CVALUE_PROCESS_OPTIONS_SETUSERGROUP is true (which is also its fallback when unset), the value of
     * CFG_OUTPUT_FILE_GROUP is returned instead, with the default group as its fallback.
     *
     * @return The resolved group or "UNKNOWN" if any step threw an exception.
     */
    public String getUsergroup() {
        try {
            // The group reported by the file system is the fallback in every case.
            final String defaultGroup = FileSystemAccessProvider.getInstance().getMyGroup();

            boolean useConfiguredGroup = getConfiguration().getConfigurationValues()
                    .getBoolean(ConfigurationConstants.CVALUE_PROCESS_OPTIONS_SETUSERGROUP, true);
            if (!useConfiguredGroup) {
                return defaultGroup;
            }

            // The configured output file group wins, if there is one.
            return getConfiguration().getConfigurationValues()
                    .getString(ConfigurationConstants.CFG_OUTPUT_FILE_GROUP, defaultGroup);
        } catch (Exception ignored) {
            // Best effort only: every failure is mapped to the placeholder value.
            return "UNKNOWN";
        }
    }

    /**
     * Cache for the context configuration, filled by the first call to getConfiguration().
     */
    private ContextConfiguration _contextConfiguration = null;

    /**
     * Returns the context configuration which is built from the analysis configuration and the
     * configuration of the project. The object is created on the first call and reused afterwards.
     * The lazy creation is not synchronized.
     *
     * @return The cached context configuration
     */
    public Configuration getConfiguration() {
        if (_contextConfiguration != null) {
            return _contextConfiguration;
        }
        AnalysisConfiguration analysisPart = (AnalysisConfiguration) this.configuration;
        ProjectConfiguration projectPart = (ProjectConfiguration) this.project.getConfiguration();
        _contextConfiguration = new ContextConfiguration(analysisPart, projectPart);
        return _contextConfiguration;
    }

    /**
     * @return The name of the analysis.
     */
    @Override
    public String toString() {
        return this.name;
    }

    /**
     * Returns the base input directory for this analysis object.
     * The directory is requested from the runtime service on the first call and cached afterwards.
     *
     * @return The input folder the runtime service reports for this analysis
     */
    public File getInputBaseDirectory() {
        if (inputBaseDirectory != null) {
            return inputBaseDirectory;
        }
        inputBaseDirectory = getRuntimeService().getInputFolderForAnalysis(this);
        return inputBaseDirectory;
    }

    /**
     * Returns the base output directory for this analysis object.
     * The value is requested from the runtime service on every call; it is not cached.
     *
     * @return The output folder the runtime service reports for this analysis
     */
    public File getOutputBaseDirectory() {
        RuntimeService service = getRuntimeService();
        return service.getOutputFolderForAnalysis(this);
    }

    /**
     * Returns the output folder which the runtime service reports for this analysis.
     * NOTE(review): this currently performs the same call as getOutputBaseDirectory() - confirm that
     * both methods are meant to return the same folder.
     *
     * @return The output folder the runtime service reports for this analysis
     */
    public File getOutputAnalysisBaseDirectory() {
        RuntimeService service = getRuntimeService();
        return service.getOutputFolderForAnalysis(this);
    }

    /**
     * @return The list of possible data sets as reported by the runtime service for this analysis.
     */
    public List<DataSet> getListOfDataSets() {
        RuntimeService service = getRuntimeService();
        return service.getListOfPossibleDataSets(this);
    }

    /**
     * Looks up a data set by its id among the possible data sets of this analysis.
     *
     * @param dataSetID The id to look for
     * @return The first data set whose id equals dataSetID or null if there is none.
     */
    public DataSet getDataSet(String dataSetID) {
        // TODO: The avoidRecursion variable (second argument) is more or less a hack.
        for (DataSet candidate : getRuntimeService().getListOfPossibleDataSets(this, true)) {
            if (!candidate.getId().equals(dataSetID)) {
                continue;
            }
            return candidate;
        }
        return null;
    }

    /**
     * Selects the runtime service for this analysis: the one of the project goes first,
     * the one passed to this analysis is the fallback.
     *
     * @return The runtime service of the project or, if that is null, the one of this analysis.
     */
    public RuntimeService getRuntimeService() {
        RuntimeService projectService = project.getRuntimeService();
        return projectService != null ? projectService : runtimeService;
    }

    /**
     * Executes this analysis for the data sets selected by the passed filters.
     * The selection is delegated to the runtime service (loadDatasetsWithFilter). For each selected
     * data set a new execution context is created and executed synchronously. If the level is or was
     * allowed to submit jobs, data sets which fail the startability check are skipped.
     *
     * @param pidFilters A list of identifiers or wildcards
     * @param level      The level for the context execution
     * @return The contexts that were created and executed, in the order of the selected data sets
     */
    public List<ExecutionContext> run(List<String> pidFilters, ExecutionContextLevel level) {
        final List<DataSet> datasets = getRuntimeService().loadDatasetsWithFilter(this, pidFilters);
        final List<ExecutionContext> contexts = new LinkedList<>();

        // All contexts of one call share the same creation check point.
        final long creationCheckPoint = System.nanoTime();

        for (DataSet dataSet : datasets) {
            boolean mayStart = !level.isOrWasAllowedToSubmitJobs || checkJobStartability(dataSet);
            if (mayStart) {
                ExecutionContext context = new ExecutionContext(
                        FileSystemAccessProvider.getInstance().callWhoAmI(), this, dataSet, level,
                        dataSet.getOutputFolderForAnalysis(this), dataSet.getInputFolderForAnalysis(this),
                        null, creationCheckPoint);

                executeRun(context);
                contexts.add(context);
            }
        }
        return contexts;
    }

    /**
     * Reruns a list of execution context objects.
     * For every old context a new one is created with the old data set and the old input and output
     * directories, the files of the old run are added to it and it is then executed synchronously.
     * Unless test is set, data sets which fail the startability check are skipped.
     *
     * @param contexts The contexts to rerun
     * @param test     If true, the level TESTRERUN is used and no startability check is done;
     *                 otherwise the level is RERUN
     * @return The newly created and executed contexts
     */
    public List<ExecutionContext> rerun(List<ExecutionContext> contexts, boolean test) {
        final long creationCheckPoint = System.nanoTime();
        final ExecutionContextLevel rerunLevel = test ? ExecutionContextLevel.TESTRERUN
                : ExecutionContextLevel.RERUN;
        final LinkedList<ExecutionContext> rerunContexts = new LinkedList<>();

        for (ExecutionContext previous : contexts) {
            DataSet dataSet = previous.getDataSet();
            boolean mayStart = test || checkJobStartability(dataSet);
            if (mayStart) {
                ExecutionContext fresh = new ExecutionContext(
                        FileSystemAccessProvider.getInstance().callWhoAmI(), this, previous.getDataSet(),
                        rerunLevel, previous.getOutputDirectory(), previous.getInputDirectory(), null,
                        creationCheckPoint);

                // The new context starts with the files of the old run.
                fresh.getAllFilesInRun().addAll(previous.getAllFilesInRun());
                executeRun(fresh);
                rerunContexts.add(fresh);
            }
        }
        return rerunContexts;
    }

    /**
     * Decides whether jobs may be started for a data set.
     * A start is refused (and an info message is posted) only if the data set has running jobs
     * and the feature toggle ForbidSubmissionOnRunning is set.
     *
     * @param ds The data set to check
     * @return false if the data set must be skipped, true otherwise
     */
    private boolean checkJobStartability(DataSet ds) {
        final String datasetID = ds.getId();
        final boolean running = checkStatusForDataset(ds);

        boolean blocked = running
                && Roddy.getFeatureToggleValue(AvailableFeatureToggles.ForbidSubmissionOnRunning);
        if (!blocked) {
            return true;
        }
        logger.postAlwaysInfo("The pid " + datasetID + " is still running and will be skipped for the process.");
        return false;
    }

    /**
     * Checks if the latest valid processing information of a data set reports running jobs.
     *
     * @param ds The data set to check
     * @return true if there is processing information with detailed info which has running jobs;
     *         false if either object is missing or no jobs are running.
     */
    public boolean checkStatusForDataset(DataSet ds) {
        AnalysisProcessingInformation latest = ds.getLatestValidProcessingInformation(this);
        if (latest == null) {
            return false;
        }
        ExecutionContext details = latest.getDetailedProcessingInfo();
        if (details == null) {
            return false;
        }
        return details.hasRunningJobs();
    }

    /**
     * Same as checkStatus(pids, false).
     *
     * @param pids A list of identifiers or wildcards
     * @return The running state per selected data set
     */
    public Map<DataSet, Boolean> checkStatus(List<String> pids) {
        final boolean suppressInfo = false;
        return checkStatus(pids, suppressInfo);
    }

    /**
     * Queries the running state (see checkStatusForDataset) of all data sets selected by the filters.
     * The states are collected with a parallel stream and afterwards ordered by data set id.
     *
     * @param pids         A list of identifiers or wildcards
     * @param suppressInfo Passed on to the runtime service when the data sets are loaded
     * @return A map from data set to running state whose iteration order follows the data set ids
     */
    public Map<DataSet, Boolean> checkStatus(List<String> pids, boolean suppressInfo) {
        List<DataSet> selected = getRuntimeService().loadDatasetsWithFilter(this, pids, suppressInfo);

        // The status queries run in parallel, so every write to the shared map is guarded.
        final Map<DataSet, Boolean> unordered = new LinkedHashMap<>();
        selected.parallelStream().forEach(dataSet -> {
            boolean running = checkStatusForDataset(dataSet);
            synchronized (unordered) {
                unordered.put(dataSet, running);
            }
        });

        // Rebuild the map in the order of the data set ids.
        List<DataSet> byId = new LinkedList<>(unordered.keySet());
        byId.sort(Comparator.comparing((DataSet dataSet) -> dataSet.getId()));

        Map<DataSet, Boolean> ordered = new LinkedHashMap<>();
        byId.forEach(dataSet -> ordered.put(dataSet, unordered.get(dataSet)));
        return ordered;
    }

    /**
     * Convenience accessor to the runtime service.
     * The service is resolved with getRuntimeService(), so the runtime service of the project takes
     * precedence over the one of the analysis, as for every other data set accessor of this class.
     *
     * @return The list of possible data sets as reported by the runtime service for this analysis
     */
    public List<DataSet> getListOfPossibleDataSets() {
        return getRuntimeService().getListOfPossibleDataSets(this);
    }

    /**
     * Same as loadDatasetsWithFilter(pidFilters, false).
     *
     * @param pidFilters A list of identifiers or wildcards
     * @return The selected data sets
     */
    @Deprecated
    public List<DataSet> loadDatasetsWithFilter(List<String> pidFilters) {
        final boolean suppressInfo = false;
        return loadDatasetsWithFilter(pidFilters, suppressInfo);
    }

    /**
     * Delegates to loadDatasetsWithFilter of the runtime service.
     * The service is resolved with getRuntimeService(), so the runtime service of the project takes
     * precedence over the one of the analysis.
     *
     * @param pidFilters   A list of identifiers or wildcards
     * @param suppressInfo Passed on to the runtime service
     * @return The data sets selected by the runtime service
     */
    @Deprecated
    public List<DataSet> loadDatasetsWithFilter(List<String> pidFilters, boolean suppressInfo) {
        return getRuntimeService().loadDatasetsWithFilter(this, pidFilters, suppressInfo);
    }

    /**
     * Delegates to selectDatasetsFromPattern of the runtime service.
     * The service is resolved with getRuntimeService(), so the runtime service of the project takes
     * precedence over the one of the analysis.
     *
     * @param pidFilters     A list of identifiers or wildcards
     * @param listOfDataSets The data sets to select from
     * @param suppressInfo   Passed on to the runtime service
     * @return The data sets selected by the runtime service
     */
    @Deprecated
    public List<DataSet> selectDatasetsFromPattern(List<String> pidFilters, List<DataSet> listOfDataSets,
            boolean suppressInfo) {
        return getRuntimeService().selectDatasetsFromPattern(this, pidFilters, listOfDataSets, suppressInfo);
    }

    /**
     * Creates a single execution context and hands it over to runDeferredContext(ExecutionContext),
     * which executes it in a separate thread. This method is mainly for ui based / asynchronous
     * execution context generation.
     * The context is created for the first possible data set whose input folder is accepted by a
     * WildcardFileFilter built from pidFilter.
     *
     * @param pidFilter             An identifier or wildcard
     * @param executionContextLevel The level for the new context
     * @return The created context or null if no data set was accepted by the filter.
     */
    public ExecutionContext runDeferredContext(String pidFilter,
            final ExecutionContextLevel executionContextLevel) {
        final long creationCheckPoint = System.nanoTime();
        for (DataSet candidate : getRuntimeService().getListOfPossibleDataSets(this)) {
            WildcardFileFilter filter = new WildcardFileFilter(pidFilter);
            if (filter.accept(candidate.getInputFolderForAnalysis(this))) {
                // Only the first match is used.
                final ExecutionContext deferred = new ExecutionContext(
                        FileSystemAccessProvider.getInstance().callWhoAmI(), this, candidate,
                        executionContextLevel, candidate.getOutputFolderForAnalysis(this),
                        candidate.getInputFolderForAnalysis(this), null, creationCheckPoint);
                runDeferredContext(deferred);
                return deferred;
            }
        }
        return null;
    }

    /**
     * Creates a new context from an old one and executes it in a separate thread
     * (see runDeferredContext(ExecutionContext)).
     * User, analysis, data set and the input and output directories are taken from the old context,
     * and the files of the old run are added to the new context.
     *
     * @param oldContext            The context to rerun
     * @param executionContextLevel Not used by this method; the level is derived from test
     * @param creationCheckPoint    The creation check point for the new context
     * @param test                  If true, the level TESTRERUN is used, otherwise RERUN
     * @return The newly created context
     */
    public ExecutionContext rerunDeferredContext(ExecutionContext oldContext,
            final ExecutionContextLevel executionContextLevel, long creationCheckPoint, boolean test) {
        final ExecutionContextLevel rerunLevel;
        if (test) {
            rerunLevel = ExecutionContextLevel.TESTRERUN;
        } else {
            rerunLevel = ExecutionContextLevel.RERUN;
        }

        ExecutionContext fresh = new ExecutionContext(oldContext.getExecutingUser(), oldContext.getAnalysis(),
                oldContext.getDataSet(), rerunLevel, oldContext.getOutputDirectory(),
                oldContext.getInputDirectory(), null, creationCheckPoint);
        fresh.getAllFilesInRun().addAll(oldContext.getAllFilesInRun());
        runDeferredContext(fresh);
        return fresh;
    }

    /**
     * Runs the passed execution context in a separate thread. The context level is taken from the passed object.
     * The thread is named after the id of the data set of the context and started immediately;
     * this method does not wait for it.
     *
     * @param ec The execution context which will be executed in a separate thread.
     */
    public void runDeferredContext(final ExecutionContext ec) {
        final Runnable task = () -> executeRun(ec);
        final Thread worker = new Thread(task);
        final String pid = ec.getDataSet().getId();
        worker.setName(String.format("Deferred execution context execution for pid %s", pid));
        worker.start();
    }

    /**
     * Executes the context object.
     * <p>
     * The context is only executed if the execution service grants the context permissions and the
     * context reports itself as executable; otherwise only an info message is posted.
     * For an executable context the files for execution are written and the context is executed.
     * Afterwards, also if the execution threw an exception:
     * <ul>
     * <li>If the level is QUERY_STATUS, the validity of all files in the context is checked.</li>
     * <li>For every other level: if no jobs were started, the execution directory is removed (only if its
     * name contains the Roddy execution directory prefix, a RuntimeException is thrown otherwise).
     * If jobs were started, the additional files are written after execution.</li>
     * </ul>
     * Exceptions are not propagated. They are added to the context as an error entry and logged.
     * Finally, an error entry is added for every failed job and all context errors are posted.
     *
     * @param context The context to execute
     */
    protected void executeRun(ExecutionContext context) {
        // Post the level of the context.
        logger.postSometimesInfo("" + context.getExecutionContextLevel());
        boolean isExecutable;
        String datasetID = context.getDataSet().getId();
        // Keeps a caught exception so that it can be reported in the finally block.
        Exception eCopy = null;
        try {
            isExecutable = ExecutionService.getInstance().checkContextPermissions(context)
                    && context.checkExecutability();
            if (!isExecutable) {
                logger.postAlwaysInfo("The workflow does not seem to be executable for dataset " + datasetID);
            } else {
                try {
                    ExecutionService.getInstance().writeFilesForExecution(context);
                    context.execute();
                } finally {
                    // This block also runs if writing the files or the execution threw an exception.
                    if (context.getExecutionContextLevel() == ExecutionContextLevel.QUERY_STATUS) { //Clean up
                        //Query file validity of all files
                        FileSystemAccessProvider.getInstance().validateAllFilesInContext(context);
                    } else {

                        //First, check if there were any executed jobs. If not, we can safely delete the context directory.
                        if (context.getStartedJobs().size() == 0) {
                            logger.postAlwaysInfo(
                                    "There were no started jobs, the execution directory will be removed.");
                            // Safety check: only remove directories whose name carries the Roddy execution directory prefix.
                            if (context.getExecutionDirectory().getName()
                                    .contains(ConfigurationConstants.RODDY_EXEC_DIR_PREFIX))
                                FileSystemAccessProvider.getInstance()
                                        .removeDirectory(context.getExecutionDirectory());
                            else {
                                throw new RuntimeException("A wrong path would be deleted: "
                                        + context.getExecutionDirectory().getAbsolutePath());
                            }
                        } else {
                            ExecutionService.getInstance().writeAdditionalFilesAfterExecution(context);
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Remember the exception and register it as an error of the context; it is not rethrown.
            eCopy = e;
            context.addErrorEntry(ExecutionContextError.EXECUTION_UNCATCHEDERROR.expand(e));

        } finally {
            if (eCopy != null) {
                logger.postAlwaysInfo("An exception occurred: '" + eCopy.getLocalizedMessage() + "'");
                if (logger.isVerbosityMedium()) {
                    logger.log(Level.SEVERE, eCopy.toString());
                    logger.postAlwaysInfo(RoddyIOHelperMethods.getStackTraceAsString(eCopy));
                } else {
                    // NOTE(review): the hint names VERBOSITY_WARNING while the check above uses isVerbosityMedium() - confirm that both match.
                    logger.postAlwaysInfo("Set --verbositylevel >=" + LoggerWrapper.VERBOSITY_WARNING
                            + " or higher to see stack trace.");
                }
            }

            // Look up errors when jobs are executed directly and when there were any started jobs.
            // One error entry is added per failed job among the executed jobs.
            if (context.getStartedJobs().size() > 0) {
                for (Job job : context.getExecutedJobs()) {
                    if (job.getJobState() == JobState.FAILED)
                        context.addErrorEntry(
                                ExecutionContextError.EXECUTION_JOBFAILED.expand("A job execution failed "));
                }
            }

            // Print out context errors.
            if (context.getErrors().size() > 0) {
                StringBuilder messages = new StringBuilder();
                // The headline says "warnings" unless at least one entry has a level above WARNING.
                boolean warningsOnly = true;
                for (ExecutionContextError executionContextError : context.getErrors()) {
                    if (executionContextError.getErrorLevel().intValue() > Level.WARNING.intValue())
                        warningsOnly = false;
                }
                if (warningsOnly)
                    messages.append("\nThere were warnings for the execution context for dataset " + datasetID);
                else
                    messages.append("\nThere were errors for the execution context for dataset " + datasetID);
                for (ExecutionContextError executionContextError : context.getErrors()) {
                    messages.append("\n\t* ").append(executionContextError.toString());
                }
                logger.postAlwaysInfo(messages.toString());
            }

        }
    }

    /**
     * Calls a cleanup script and / or a workflow's cleanup method to clean up the directories of a workflow.
     * If the cleanup script is called, a new execution context (level CLEANUP) is created for each data set
     * and the files for execution are written for it. No such context is created if only the workflow's
     * cleanup method is called.
     * <p>
     * If neither a cleanup script nor a cleanup method is available, an info message is posted and the
     * method returns without loading any data sets.
     * Errors of the cleanup script do not abort the cleanup; they are reported through the logger and
     * the remaining data sets are still processed.
     *
     * @param pidList A list of identifiers or wildcards selecting the data sets to clean up
     */
    public void cleanup(List<String> pidList) {
        // Both settings do not depend on the data set, so they are resolved once.
        final AnalysisConfiguration analysisConfiguration = (AnalysisConfiguration) getConfiguration();
        final boolean hasCleanupScript = analysisConfiguration.hasCleanupScript();
        final boolean hasCleanupMethod = getWorkflow().hasCleanupMethod();

        if (!hasCleanupScript && !hasCleanupMethod) {
            logger.postAlwaysInfo(
                    "There is neither a configured cleanup script or a native workflow cleanup method available for this analysis.");
            // Nothing can be done for any data set, so there is no need to load them.
            return;
        }

        List<DataSet> dataSets = getRuntimeService().loadDatasetsWithFilter(this, pidList, true);
        for (DataSet ds : dataSets) {
            // Call a custom cleanup shell script.
            if (hasCleanupScript) {
                //TODO Think hard if this could be generified and simplified! This is also used in other places in a similar way right?
                ExecutionContext context = new ExecutionContext(FileSystemAccessProvider.getInstance().callWhoAmI(),
                        this, ds, ExecutionContextLevel.CLEANUP, ds.getOutputFolderForAnalysis(this),
                        ds.getInputFolderForAnalysis(this), null);
                Job cleanupJob = new Job(context, "cleanup", analysisConfiguration.getCleanupScript(), null);
                try {
                    ExecutionService.getInstance().writeFilesForExecution(context);
                    cleanupJob.run();
                } catch (Exception ex) {
                    // Cleanup stays best effort, but the failure is reported instead of being dropped silently.
                    logger.postSometimesInfo("The cleanup script failed for dataset " + ds.getId() + ": " + ex);
                } finally {
                    ExecutionService.getInstance().writeAdditionalFilesAfterExecution(context);
                }
            }

            // Call the workflow's cleanup java method.
            if (hasCleanupMethod) {
                getWorkflow().cleanup(ds);
            }
        }
    }

    /**
     * @return The readme file taken from the informational configuration content of the configuration
     *         of this analysis.
     */
    public File getReadmeFile() {
        final Configuration cfg = getConfiguration();
        return cfg.getInformationalConfigurationContent().getReadmeFile();
    }
}