Java tutorial
/* * Copyright (C) 2007-2010 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.bdval; import cern.jet.random.engine.MersenneTwister; import com.martiansoftware.jsap.*; import edu.cornell.med.icb.R.RConnectionPool; import edu.cornell.med.icb.R.RUtils; import edu.cornell.med.icb.cli.UseModality; import edu.cornell.med.icb.geo.*; import edu.cornell.med.icb.geo.tools.*; import edu.cornell.med.icb.identifier.DoubleIndexedIdentifier; import edu.cornell.med.icb.identifier.IndexedIdentifier; import edu.cornell.med.icb.learning.*; import edu.cornell.med.icb.learning.libsvm.LibSvmClassifier; import edu.mssm.crover.tables.*; import edu.mssm.crover.tables.readers.*; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntSet; import it.unimi.dsi.fastutil.objects.*; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.lang.MutableString; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.SystemUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.bdval.cache.TableCache; import org.bdval.pathways.AverageAcrossPathwayFeatureAggregator; import org.bdval.pathways.PCAFeatureAggregator; import org.bdval.pathways.PathwayFeatureAggregator; import org.bdval.pathways.PathwayInfo; import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; /** * Main class used to define and process modes for * {@link org.bdval.DiscoverAndValidate}. * New modes should override this class. See the wiki at * <a href="http://bdval.org">BDVal.org</a> for more details. * * @author Fabien Campagne Date: Oct 19, 2007 Time: 2:14:46 PM */ public class DAVMode extends UseModality<DAVOptions> { /** * Used to log debug and informational messages. */ private static final Log LOG = LogFactory.getLog(DAVMode.class); /** * Container for options common to {@link org.bdval.DiscoverAndValidate} * modes. */ private DAVOptions davOptions; /** * The split plan used for this run. */ protected SplitPlan splitPlan; /** * The type of split for this run. */ private String splitType; /** * The id being processed in this run. */ private int splitId; /** * The name of the file containing the split plan for this run. */ private String splitPlanFilename; /** * A cache for data used during processing. */ private TableCache tableCache; /** * Whether or not caching is enabled for this run. */ private boolean isTableCacheEnabled; /** * Used for aggregation of pathway information. */ private PathwayFeatureAggregator pathwayHelper; /** * Type of pathway aggregation in use (i.e., "PCA", "average"). */ private String aggregationType; /** * Synchronization object for pathway processing. * TODO: probably should not be a String */ private static final String SEMAPHORE = "global".intern(); /** * BDVal configuration properties, from the --properties argument. */ protected Properties configurationProperties; /** * Define basic command line options for this mode. Individual modes should override * making sure that options are reused or removed appropriately. Options cannot * be defined more than once in {@link com.martiansoftware.jsap.JSAP}. * * @param jsap the JSAP command line parser * @throws JSAPException if there is a problem building the options */ @Override public void defineOptions(final JSAP jsap) throws JSAPException { super.defineOptions(jsap); // help option is defined by the superclass call final Parameter inputFilenameOption = new FlaggedOption("input").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(true).setShortFlag('i').setLongFlag("input") .setHelp("Input filename. This file contains the measurement data used to" + " discover markers."); jsap.registerParameter(inputFilenameOption); final Parameter outputFlag = new FlaggedOption("output").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setShortFlag('o').setLongFlag("output") .setHelp("Name of the output file. Output is printed to the console when this " + "flag is absent or when the value \"-\" is given."); jsap.registerParameter(outputFlag); final Parameter propertiesFlag = new FlaggedOption("properties").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false) .setLongFlag("properties").setHelp( "Name of the properties file. A Java properties file with bdval specific configuration properties."); jsap.registerParameter(propertiesFlag); final Parameter outputOverwriteFlag = new FlaggedOption("overwrite-output") .setStringParser(JSAP.BOOLEAN_PARSER).setDefault("false").setRequired(true) .setLongFlag("overwrite-output") .setHelp("When true and -o is specified, the output file will be over-written."); jsap.registerParameter(outputOverwriteFlag); final Parameter modelIdFlag = new FlaggedOption("model-id").setStringParser(JSAP.STRING_PARSER) .setDefault("no_model_id").setRequired(false).setLongFlag("model-id") .setHelp("The model-id, created in ExecuteSplitsMode (a hash of the options)"); jsap.registerParameter(modelIdFlag); final Parameter tasksFlag = new FlaggedOption("task-list").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(true).setShortFlag('t').setLongFlag("task-list") .setHelp("Name of the file that describes the classification tasks. " + "This file is tab delimited, with one line per task. " + "First column is the input filename. " + "Second column is the name of the first condition. " + "Third column is the name of the second condition. " + "Fourth column is the number of samples in the first condition. " + "Fifth column is the number of samples in the second condition."); jsap.registerParameter(tasksFlag); final Parameter geneListsFlag = new FlaggedOption("gene-lists").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setShortFlag('g').setLongFlag("gene-lists") .setHelp("Name of the file that describes the gene lists. " + "This file is tab delimited, with one line per gene list. " + "First column is the name of the gene list. " + "Second column (optional) is the name of the file which describes " + "the gene list. " + "If the file has only one column, the name of the gene list must be " + "full (for the full array). If the name of the gene " + "list is random, the second field indicates how many random probesets " + "must be selected, and a third field indicates the random seed to use " + "for probeset selection."); jsap.registerParameter(geneListsFlag); final Parameter geneListFlag = new FlaggedOption("gene-list").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("gene-list") .setHelp("Argument of the form gene-list-name|filename. " + "The filename points to a single gene list file."); jsap.registerParameter(geneListFlag); final Parameter platformFlag = new FlaggedOption("platform-filenames").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(true).setShortFlag('p').setLongFlag("platform-filenames") .setHelp("Comma separated list of platform filenames."); jsap.registerParameter(platformFlag); final Parameter conditionFlag = new FlaggedOption("conditions").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(true).setShortFlag('c').setLongFlag("conditions") .setHelp("Specify the file with the mapping condition-name column-identifier " + "(tab delimited, with one mapping per line)."); jsap.registerParameter(conditionFlag); final Parameter seed = new FlaggedOption("seed").setStringParser(JSAP.INTEGER_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("seed") .setHelp("Seed to initialize random generator."); jsap.registerParameter(seed); final Parameter pathways = new FlaggedOption("pathways").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("pathways") .setHelp("Filename of the pathway description information. " + "The pathway description information is a file with one line per " + "pathway. Each line has two tab delimited field. The first field provides a " + "pathway identifier. The second field is space delimited. Each token of the " + " second field is an " + "(Ensembl) gene ids for gene that belong to the pathway. " + "When this option is provided, features are aggregated by pathway " + "and computations are performed in aggregated feature space. " + "Some aggregation algorithms may generate several aggregated " + "features per pathway. When this option is active, the " + "option --gene2probes must be provided on the command line."); jsap.registerParameter(pathways); final Parameter pathwayAggregationMethod = new FlaggedOption("pathway-aggregation-method") .setStringParser(JSAP.STRING_PARSER).setDefault("PCA").setRequired(false) .setLongFlag("pathway-aggregation-method") .setHelp("Indicate which method should be used to aggregate features for pathway " + "runs. Two methods are available: PCA or average. PCA performs a " + "principal component analysis for the probesets of each pathway. " + "Average uses a single feature for each pathway calculated as the " + "average of the probeset signal in each pathway. Default is PCA."); jsap.registerParameter(pathwayAggregationMethod); final Parameter gene2Probes = new FlaggedOption("gene2probes").setStringParser(JSAP.STRING_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("gene-to-probes") .setHelp("Filename of the gene to probe description information. " + "The pathway description information is a file with one line per gene. " + "Each line is tab delimited. The first field is an ensembl gene id. " + "The second field is a probe id which measures expression of a " + "transcript of the gene. Several lines may share the same gene " + "id, indicating that multiple probe ids exist for the gene. "); jsap.registerParameter(gene2Probes); final Parameter floorParam = new FlaggedOption("floor").setStringParser(JSAP.DOUBLE_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("floor") .setHelp("Specify a floor value for the signal. If a signal is lower " + "than the floor, it is set to the floor. If no floor is provided, " + "values are unchanged."); jsap.registerParameter(floorParam); final Parameter twoChannelArray = new FlaggedOption("two-channel-array") .setStringParser(JSAP.BOOLEAN_PARSER).setDefault(JSAP.NO_DEFAULT).setRequired(false) .setLongFlag("two-channel-array") .setHelp("Indicate that the data is for a two channel array. This flag affects " + "how the floor value is interpreted. " + "For two channel arrays, values on the array are set to 1.0" + "if (Math.abs(oldValue-1.0)+1)<=floorValue, whereas for one channel " + "array the condition becomes: oldValue<=floorValue."); jsap.registerParameter(twoChannelArray); final Parameter log10Array = new FlaggedOption("log-10-array").setStringParser(JSAP.BOOLEAN_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("log-10-array") .setHelp("See --logged-array"); jsap.registerParameter(log10Array); final Parameter loggedArray = new FlaggedOption("logged-array").setStringParser(JSAP.BOOLEAN_PARSER) .setDefault(JSAP.NO_DEFAULT).setRequired(false).setLongFlag("logged-array") .setHelp("Indicate that the data on this array has been logged. This option " + "affects flooring for two color aryays. When the option is specified, " + "the floor is applied around a center value of zero. When the option " + "is not specified, two color arrays are floored around a value of 1 " + "(no change)."); jsap.registerParameter(loggedArray); final Parameter scalingOption = new FlaggedOption("scale-features").setStringParser(JSAP.BOOLEAN_PARSER) .setDefault("true").setRequired(false).setLongFlag("scale-features") .setHelp("Indicate whether the features should be scaled to the range [-1 1]. " + "If false, no scaling occurs. If true (default), features are scaled."); jsap.registerParameter(scalingOption); final Parameter percentileScalingOption = new FlaggedOption("percentile-scaling") .setStringParser(JSAP.BOOLEAN_PARSER).setDefault("false").setRequired(false) .setLongFlag("percentile-scaling") .setHelp("Indicate whether feature scaling is done with percentile and median " + "or full range and average. When percentiles are used, the range " + "of each feature is determined as the range of the " + "20-80 percentile of the data and median is used instead of the mean."); jsap.registerParameter(percentileScalingOption); final Parameter scalerClassName = new FlaggedOption("scaler-class-name").setStringParser(JSAP.STRING_PARSER) .setRequired(false).setLongFlag("scaler-class-name") .setHelp("The classname of the scaler implementation. Overrides " + "--percentile-scaling if provided."); jsap.registerParameter(scalerClassName); final Parameter normalizeOption = new FlaggedOption("normalize-features") .setStringParser(JSAP.BOOLEAN_PARSER).setDefault("false").setRequired(false) .setLongFlag("normalize-features") .setHelp("Indicate whether the feature vectors should be normalized. If false, " + "no normalizing occurs. If true (default), features are normalized."); jsap.registerParameter(normalizeOption); final Parameter classifierClassName = new FlaggedOption("classifier").setStringParser(JSAP.CLASS_PARSER) .setDefault(LibSvmClassifier.class.getCanonicalName()).setRequired(false).setShortFlag('l') .setLongFlag("classifier").setHelp("Fully qualified class name of the classifier implementation."); jsap.registerParameter(classifierClassName); final Parameter classifierParameterArgument = new FlaggedOption("classifier-parameters") .setStringParser(JSAP.STRING_PARSER).setDefault(JSAP.NO_DEFAULT).setRequired(false) .setAllowMultipleDeclarations(true).setShortFlag('a').setLongFlag("classifier-parameters") .setHelp("Comma separated list of parameters that will be passed to the " + "classifier. Parameters vary from one classifier to the next. Check " + "the documentation of the classifier and the source code to see " + "which parameters can be set."); jsap.registerParameter(classifierParameterArgument); final Parameter geneFeaturesDirArgument = new FlaggedOption("gene-features-dir") .setStringParser(JSAP.STRING_PARSER).setDefault("." + SystemUtils.FILE_SEPARATOR).setRequired(false) .setLongFlag("gene-features-dir") .setHelp("The directory where gene features files will be read from" + " (when specified in a -gene-lists.txt file)."); jsap.registerParameter(geneFeaturesDirArgument); final Parameter datasetNameArgument = new FlaggedOption("dataset-name").setStringParser(JSAP.STRING_PARSER) .setRequired(false).setDefault("dataset-name").setLongFlag("dataset-name") .setHelp("The name of the dataset being run."); jsap.registerParameter(datasetNameArgument); // TODO: ?? Could we be more precise here. An example? final Parameter datasetRootArgument = new FlaggedOption("dataset-root").setStringParser(JSAP.STRING_PARSER) .setRequired(false).setDefault("ds-root").setLongFlag("dataset-root") .setHelp("The root directory where the dataset files exist."); jsap.registerParameter(datasetRootArgument); final Switch outputStatsFromGeneListSwitch = new Switch("output-stats-from-gene-list") .setLongFlag("output-stats-from-gene-list"); jsap.registerParameter(outputStatsFromGeneListSwitch); final Parameter rservePortFlag = new FlaggedOption("rserve-port").setStringParser(JSAP.INTEGER_PARSER) .setDefault("-1").setRequired(false).setLongFlag("rserve-port").setHelp("The Rserve port to use"); jsap.registerParameter(rservePortFlag); final Parameter splitIdParam = new FlaggedOption("process-split-id").setStringParser(JSAP.INTEGER_PARSER) .setRequired(false).setLongFlag("process-split-id") .setHelp("Restricts execution to a split id. A split execution plan must be " + "provided as well. The split id is used together with the split plan to " + "determine which samples should be processed. Typical usage would be " + "\"--process-split-id 2 --split-plan theplan.txt --split-type training\" " + "This would result in training samples being used that match split #2 " + "in theplan.txt."); jsap.registerParameter(splitIdParam); final Parameter splitPlanParam = new FlaggedOption("split-plan").setStringParser(JSAP.STRING_PARSER) .setRequired(false).setLongFlag("split-plan") .setHelp("Filename for the split plan definition. See process-split-id."); jsap.registerParameter(splitPlanParam); final Parameter splitTypeParam = new FlaggedOption("split-type").setStringParser(JSAP.STRING_PARSER) .setRequired(false).setLongFlag("split-type") .setHelp("Split type (i.e., training, test, feature-selection, must match " + "a type listed in the split plan). See process-split-id."); jsap.registerParameter(splitTypeParam); final Parameter cacheDirParam = new FlaggedOption("cache-dir").setStringParser(JSAP.STRING_PARSER) .setDefault("cache").setRequired(true).setLongFlag("cache-dir") .setHelp("Cache directory. Specify a directory when intermediate processed " + "tables will be saved for faster access."); jsap.registerParameter(cacheDirParam); final Parameter cacheEnabledParam = new Switch("cache-enabled").setLongFlag("enable-cache") .setHelp("Enables caching for faster access to processed tables."); jsap.registerParameter(cacheEnabledParam); final Parameter pathwayComponentDirParam = new FlaggedOption("pathway-components-dir") .setStringParser(JSAP.STRING_PARSER).setDefault("pathway-components").setRequired(false) .setLongFlag("pathway-components-dir") .setHelp("Directory where pathway components will be stored. "); jsap.registerParameter(pathwayComponentDirParam); } /** * Interpret the command line arguments. * * @param jsap the JSAP command line parser * @param result the results of command line parsing * @param options the options for this mode */ @Override public void interpretArguments(final JSAP jsap, final JSAPResult result, final DAVOptions options) { checkArgumentsSound(jsap, result, false); setupProperties(result); setupModelId(result, options); setupDatasetName(result, options); setupDatasetRoot(result, options); setupSplitPlan(result, options); setupPathwayOptions(result, options); setupTableCache(result, options); setupTaskAndConditions(result, options); setupInput(result, options); setupPlatforms(result, options); setupGeneFeaturesDir(result, options); setupGeneLists(result, options); setupRandomArguments(result, options); setupArrayAttributes(result, options); setupClassifier(result, options); setupRservePort(result, options); setupOutput(result, options); setupScalerOptions(result, options); } private void setupProperties(final JSAPResult result) { if (result.contains("properties")) { final String propsFilename = result.getString("properties"); System.out.println("Loading BDVal properties from file " + propsFilename); final Properties props = new Properties(); Reader propsReader = null; try { propsReader = new FileReader(propsFilename); props.load(propsReader); configurationProperties = props; } catch (IOException e) { LOG.error("Cannot load BDVal properties file " + propsFilename, e); System.exit(1); } finally { IOUtils.closeQuietly(propsReader); } } } /** * Initialize state of the cache and the cache itself if enabled. * * @param result the results of command line parsing * @param options the options for this mode */ protected void setupTableCache(final JSAPResult result, final DAVOptions options) { tableCache = null; isTableCacheEnabled = result.getBoolean("cache-enabled"); if (LOG.isInfoEnabled()) { LOG.info("Caching is" + (isTableCacheEnabled ? " " : " NOT ") + "enabled."); } if (isTableCacheEnabled) { String tableCacheDirectory = result.getString("cache-dir"); try { tableCacheDirectory += "/pathways=" + Boolean.toString(options.pathways != null); if (options.pathways != null && !"PCA".equalsIgnoreCase(aggregationType)) { // do not append aggregation= for PCA feature aggregation, so that we can use // the cache built in previous versions: tableCacheDirectory += "/aggregation=" + aggregationType; } final File cacheDir = new File(tableCacheDirectory); if (!cacheDir.exists()) { FileUtils.forceMkdir(cacheDir); } tableCache = new TableCache(cacheDir); } catch (IOException e) { LOG.error("Cannot setup table cache in directory" + tableCacheDirectory, e); System.exit(1); } } } /** * Determine if a sample id should be removed from consideration because it is not in the * split partition under consideration. * * @param sampleId The id to check * @return true if the split plan contains the specifid id or there is no split plan */ protected boolean splitPlanContainsSampleId(final String sampleId) { return splitPlan == null || splitPlan.getSampleIds(splitId, splitType).contains(sampleId); } public int getRepeatId() { return splitPlan == null ? 0 : splitPlan.getRepeatId(splitId); } protected void setupDatasetName(final JSAPResult result, final DAVOptions options) { options.datasetName = result.getString("dataset-name"); } protected void setupModelId(final JSAPResult result, final DAVOptions options) { if (result.contains("model-id")) { options.modelId = result.getString("model-id"); } } protected void setupDatasetRoot(final JSAPResult result, final DAVOptions options) { options.datasetRoot = result.getString("dataset-root"); } protected void setupSplitPlan(final JSAPResult result, final DAVOptions options) { final boolean splitPlanMaybe = result.contains("split-plan") || result.contains("split-type") || result.contains("process-split-id"); if (splitPlanMaybe) { final boolean splitPlanOk = result.contains("split-plan") && result.contains("split-type") && result.contains("process-split-id"); if (!splitPlanOk) { System.err.println("Split plan options are not consistent. You must provide " + "all options together or none at all. See --help for details."); System.exit(1); } else { splitPlan = new SplitPlan(); String filename = null; try { filename = result.getString("split-plan"); splitPlanFilename = filename; splitPlan.load(filename); if (splitPlan.getMaxSplitIndex() == 0) { System.err.println( "The split plan provided contains no split. " + "Nothing will be executed."); } } catch (IOException e) { System.err.println("Cannot load split plan from filename: " + filename); } splitType = result.getString("split-type"); splitId = result.getInt("process-split-id"); } } } protected void setupPathwayOptions(final DAVOptions options, final String pathwayAggregationMethod, final String pathwayInfoFilename, final String geneToProbesFilename, final String pathwayComponentsDirectory) { if (pathwayInfoFilename != null && geneToProbesFilename == null || pathwayInfoFilename == null && geneToProbesFilename != null) { System.err.println("--pathways, --gene2probes and optionally " + "--pathway-aggregation-method options must be used together."); } if (pathwayAggregationMethod != null) { options.pathwayAggregtionMethod = pathwayAggregationMethod; if ("PCA".equalsIgnoreCase(pathwayAggregationMethod)) { pathwayHelper = new PCAFeatureAggregator(pathwayComponentsDirectory); } else if ("average".equalsIgnoreCase(pathwayAggregationMethod)) { pathwayHelper = new AverageAcrossPathwayFeatureAggregator(); } else { System.err.println("--pathway-aggregation-method must be one of: 'PCA' or 'average'."); System.exit(1); } aggregationType = pathwayAggregationMethod; } if (pathwayInfoFilename != null) { loadGene2Probes(options, geneToProbesFilename); loadPathways(options, pathwayInfoFilename); options.pathwaysInfoFilename = pathwayInfoFilename; options.geneToProbeFilename = geneToProbesFilename; } } protected void setupPathwayOptions(final JSAPResult result, final DAVOptions options) { String pathwayComponentsDir = null; String pathwayAggregationMethod = null; if (result.contains("pathway-aggregation-method")) { pathwayAggregationMethod = result.getString("pathway-aggregation-method"); if (result.contains("pathway-components-dir")) { pathwayComponentsDir = result.getString("pathway-components-dir"); } } String pathwayFilename = null; String gene2ProbeFilename = null; if (result.contains("pathways")) { pathwayFilename = result.getString("pathways"); gene2ProbeFilename = result.getString("gene2probes"); } setupPathwayOptions(options, pathwayAggregationMethod, pathwayFilename, gene2ProbeFilename, pathwayComponentsDir); } public void loadPathways(final DAVOptions options, final String pathwayFilename) { LOG.info("Loading pathway information."); FastBufferedReader pathwayReader = null; try { pathwayReader = new FastBufferedReader(new FileReader(pathwayFilename)); final LineIterator it = new LineIterator(pathwayReader); final IndexedIdentifier pathwayIds = new IndexedIdentifier(); options.pathways = new ObjectArraySet<PathwayInfo>(); while (it.hasNext()) { final MutableString line = it.next(); final String[] tokens = line.toString().split("[\t]"); if (tokens.length <= 1) { continue; } final String[] geneIds = tokens[1].split("[ ]"); final MutableString pathwayId = new MutableString(tokens[0]).compact(); final int pathwayIndex = pathwayIds.registerIdentifier(pathwayId); final PathwayInfo pi = new PathwayInfo(); pi.pathwayId = pathwayId; pi.pathwayIndex = pathwayIndex; final ObjectSet<MutableString> probesetIds = new ObjectOpenHashSet<MutableString>(); for (final String sGeneId : geneIds) { final MutableString geneId = new MutableString(sGeneId).compact(); final IntSet set = options.gene2Probe.getProbesetIndices(geneId); if (set != null) { for (final int probesetIndex : set) { final MutableString probesetId = options.probeIndexMapping.getId(probesetIndex); probesetIds.add(probesetId); } } } pi.probesetIds = probesetIds; pi.probesetIndices = null; options.pathways.add(pi); } options.pathwayIndexMapping = new DoubleIndexedIdentifier(pathwayIds); } catch (FileNotFoundException e) { System.err.println("Cannot find pathway info filename " + pathwayFilename); System.exit(10); } finally { IOUtils.closeQuietly(pathwayReader); } } private void loadGene2Probes(final DAVOptions options, final String gene2ProbeFilename) { LOG.info("Loading gene-to-probe mapping information."); options.gene2Probe = new Gene2Probesets(); try { options.probeIndexMapping = options.gene2Probe.load(gene2ProbeFilename); } catch (FileNotFoundException e) { LOG.fatal("Cannot find gene2probe filename " + gene2ProbeFilename); System.exit(10); } } protected void setupClassifier(final JSAPResult result, final DAVOptions options) { options.classiferClass = result.getClass("classifier"); final String[] parameters = result.getStringArray("classifier-parameters"); // combine multiple occurences of --classifier-parameters on the command line into one parameter string, // separating occurences by a comma. final StringBuffer parameterCollected = new StringBuffer(); int i = 0; for (final String params : parameters) { parameterCollected.append(params); if (i != parameters.length - 1) { parameterCollected.append(","); } i++; } LOG.info("collected classifier parameters: " + parameterCollected.toString()); final String classifierParametersResult = StringUtils.defaultString(parameterCollected.toString()); options.classifierParameters = classifierParametersResult.split("[,]"); options.scaleFeatures = result.getBoolean("scale-features"); options.percentileScaling = result.getBoolean("percentile-scaling"); if (result.contains("scaler-class-name")) { options.scalerClassName = result.getString("scaler-class-name"); options.percentileScaling = false; } else { options.scalerClassName = null; } options.normalizeFeatures = result.getBoolean("normalize-features"); if (options.scaleFeatures) { LOG.info("Features will be scaled."); } else { LOG.info("Features will not be scaled."); } } protected synchronized void setupInput(final JSAPResult result, final DAVOptions options) { options.input = result.getString("input"); if (isTableCacheEnabled) { assert tableCache != null : "TableCache must be initialized."; if (tableCache.isTableCached(getSplitId(), getSplitType(), options.datasetName)) { // do not load the input file if we are going to work on a cached table.. return; } } try { options.inputTable = readInputFile(options.input); } catch (IOException e) { System.err.println("Cannot read input file \"" + options.input + "\""); LOG.fatal("Cannot read input file \"" + options.input + "\"", e); System.exit(10); } catch (UnsupportedFormatException e) { System.err.println("The format of the input file \"" + options.input + "\" is not supported."); LOG.fatal("The format of the input file \"" + options.input + "\" is not supported.", e); System.exit(10); } catch (SyntaxErrorException e) { System.err.println("A syntax error was found in the input file \"" + options.input + "\""); LOG.fatal("A syntax error was found in the input file \"" + options.input + "\"", e); System.exit(10); } } protected void setupRservePort(final JSAPResult result, final DAVOptions options) { options.rservePort = result.getInt("rserve-port"); if (options.rservePort > 0) { try { // Startup. LOG.info("++ This JVM will use the Rserve at port " + options.rservePort); RConnectionPool.getInstance(RUtils.makeConfiguration(options.rservePort)); } catch (ConfigurationException e) { LOG.error("Could not configure to connect to localhost Rserve at port" + options.rservePort, e); } } } protected void checkArgumentsSound(final JSAP jsap, final JSAPResult result, final boolean requiresGeneList) { if (!result.success()) { System.out.println("Error parsing command line."); printHelp(jsap); System.exit(1); } if (requiresGeneList) { if (result.getString("gene-lists") == null && result.getString("gene-list") == null) { System.err.println("One of the --gene-lists or --gene-list options must be provided."); printHelp(jsap); System.exit(1); } } } protected void setupRandomArguments(final JSAPResult result, final DAVOptions options) { final int seed = result.contains("seed") ? result.getInt("seed") : (int) new Date().getTime(); options.randomGenerator = new MersenneTwister(seed); options.randomSeed = seed; } protected void setupArrayAttributes(final JSAPResult result, final DAVOptions options) { options.adjustSignalToFloorValue = false; if (result.contains("floor")) { options.signalFloorValue = result.getDouble("floor"); options.adjustSignalToFloorValue = true; } options.oneChannelArray = !result.contains("two-channel-array"); options.loggedArray = !(result.contains("log-10-array") || result.contains("logged-array")); } protected void setupGeneFeaturesDir(final JSAPResult result, final DAVOptions options) { options.setGeneFeaturesDir(result.getString("gene-features-dir")); } protected void setupGeneLists(final JSAPResult result, final DAVOptions options) { String singleGeneListFilename = result.getString("gene-list"); if (singleGeneListFilename != null) { String[] tokens = { singleGeneListFilename, singleGeneListFilename }; final Vector<GeneList> list = new Vector<GeneList>(); try { if (singleGeneListFilename.split("[|]").length == 2) { // single gene list filename may be suffixed with name of gene list // for instance HM200|/path/to/HM200.txt tokens = singleGeneListFilename.split("[|]"); singleGeneListFilename = tokens[1]; } final GeneList geneList = GeneList.createList(tokens, ""); geneList.setPlatforms(options.platforms); list.add(geneList); } catch (IOException e) { LOG.error("Cannot read gene list from file " + singleGeneListFilename, e); } options.geneLists = list.toArray(new GeneList[list.size()]); } else { final String geneListFilename = result.getString("gene-lists"); if (geneListFilename != null) { readGeneLists(geneListFilename, options.platforms, options); } else { // use all the probesets if no gene list was specified: options.geneLists = new GeneList[1]; options.geneLists[0] = new FullGeneList("full"); } } } protected void setupPlatforms(final JSAPResult result, final DAVOptions options) { final String platformFilenames = result.getString("platform-filenames"); readPlatforms(platformFilenames, options); } protected void setupTaskAndConditions(final JSAPResult result, final DAVOptions options) { final String taskListFilename = result.getString("task-list"); final String conditionsFilename = result.getString("conditions"); LOG.info("Loading task-list: " + taskListFilename); LOG.info("Loading cids: " + conditionsFilename); try { options.classificationTasks = MicroarrayTrainEvaluate.readTasksAndConditions(taskListFilename, conditionsFilename); if (splitPlan != null) { options.classificationTasks = filterBySplitPlan(options.classificationTasks, splitPlan, splitId, splitType); } } catch (IOException e) { LOG.error("Unable to read task list or condition files.", e); } } private ClassificationTask[] filterBySplitPlan(final ClassificationTask[] classificationTasks, final SplitPlan splitPlan, final int splitId, final String splitType) { for (int i = 0; i < classificationTasks.length; i++) { classificationTasks[i] = filterBySplitPlan(classificationTasks[i], splitPlan, splitId, splitType); } return classificationTasks; } /** * Filter a task to keep only samples that match a split plan, split id and split type. * * @param task * @param splitPlan * @param splitId * @param splitType * @return */ private ClassificationTask filterBySplitPlan(final ClassificationTask task, final SplitPlan splitPlan, final int splitId, final String splitType) { ConditionIdentifiers conditionsIdentifiers = task.getConditionsIdentifiers(); final Set<String> samplesForClass0 = conditionsIdentifiers.getLabelGroup(task.getFirstConditionName()); ObjectSet<String> splitPlanSamples = splitPlan.getSampleIds(splitId, splitType); samplesForClass0.retainAll(splitPlanSamples); if (samplesForClass0.size() == 0) { throw new IllegalArgumentException( "Condition 0 (" + task.getFirstConditionName() + ") must have some samples."); } final ConditionIdentifiers cids = new ConditionIdentifiers(); for (final String negativeSample : samplesForClass0) { cids.addIdentifier(task.getFirstConditionName().intern(), negativeSample); } final Set<String> samplesForClass1 = conditionsIdentifiers.getLabelGroup(task.getSecondConditionName()); samplesForClass1.retainAll(splitPlanSamples); if (samplesForClass1.size() == 0) { throw new IllegalArgumentException( "Condition 1 (" + task.getSecondConditionName() + ") must have some samples."); } for (final String positiveSample : samplesForClass1) { cids.addIdentifier(task.getSecondConditionName().intern(), positiveSample); } task.setConditionsIdentifiers(cids); task.setNumberSamplesFirstCondition(samplesForClass0.size()); task.setNumberSamplesSecondCondition(samplesForClass1.size()); return task; } public static String makeStatsFileFromGeneListsFile(final GeneList[] geneLists, final String defaultVal) { if (geneLists == null) { System.out.println(">>> geneLists is null"); return defaultVal; } if (geneLists.length == 0) { System.out.println(">>> geneLists is empty"); return defaultVal; } if (!(geneLists[0] instanceof FileGeneList)) { System.out.println(">>> geneList[0] is not a FileGeneList"); return defaultVal; } final FileGeneList geneList = (FileGeneList) geneLists[0]; final String geneListFilename = FilenameUtils.getName(geneList.getFilename()); if (!geneListFilename.endsWith("-features.txt")) { System.out.println(">>> geneList[0] filename does not end in -features.txt, is " + geneListFilename); return defaultVal; } final String[] parts = StringUtils.split(geneListFilename, "-"); final StringBuilder statsFilename = new StringBuilder(); for (int i = 0; i < parts.length - 1; i++) { statsFilename.append(parts[i]); statsFilename.append("-"); } statsFilename.append("stats.txt"); return statsFilename.toString(); } public int getSplitId() { return splitId; } public String getSplitType() { return splitType; } public String getSplitPlanFilename() { return splitPlanFilename; } protected void setupOutput(final JSAPResult result, final DAVOptions options) { final String output; options.overwriteOutput = result.getBoolean("overwrite-output"); if (result.getBoolean("output-stats-from-gene-list")) { output = makeStatsFileFromGeneListsFile(options.geneLists, result.getString("output")); } else { output = result.getString("output"); } System.out.println(">>> DAVMode.Output (stats) file is " + output); if (output == null || output.equals("-")) { options.output = new PrintWriter(System.out); System.err.println("Output will be written to stdout"); } else { try { // Make sure the path of the output file exists. final String path = FilenameUtils.getFullPath(output); if (StringUtils.isNotBlank(path)) { FileUtils.forceMkdir(new File(path)); } synchronized (output.intern()) { final File outputFile = new File(output); options.outputFilePreexist = options.overwriteOutput ? false : outputFile.exists(); options.output = new PrintWriter(new FileWriter(output, !options.overwriteOutput)); System.out.println("Output will be written to file " + output); } } catch (IOException e) { System.err.println("Cannot create output file for filename " + output); // e.printStackTrace(); } } } private void readPlatforms(final String platformFilenames, final DAVOptions options) { // read platforms: options.platforms = new Vector<GEOPlatform>(); final String[] filenames = platformFilenames.split(","); for (final String pFilename : filenames) { GEOPlatform platform = new GEOPlatform(); if ("dummy".equals(pFilename)) { platform = new DummyPlatform(); options.platforms.add(platform); System.out.println("Will proceed with dummy platform."); return; } System.out.print("Reading platform " + pFilename + ".. "); System.out.flush(); try { platform.read(pFilename); System.out.println("done."); System.out.flush(); options.platforms.add(platform); } catch (IOException e) { LOG.error("Problem reading " + pFilename, e); System.exit(10); } catch (SyntaxErrorException e) { LOG.error("Syntax error reading " + pFilename, e); System.exit(10); } } } private void readGeneLists(final String geneListFilename, final Vector<GEOPlatform> platforms, final DAVOptions options) { BufferedReader geneListReader = null; try { // read gene list info: geneListReader = new BufferedReader(new FileReader(geneListFilename)); String line; final Vector<GeneList> list = new Vector<GeneList>(); while ((line = geneListReader.readLine()) != null) { if (line.startsWith("#")) { continue; } final String[] tokens = line.split("[\t]"); if (tokens.length < 1) { throw new IllegalArgumentException( "Gene list line must have at least 1 field." + " Line was : " + line); } final GeneList geneList = GeneList.createList(tokens, options.getGeneFeaturesDir()); geneList.setPlatforms(platforms); list.add(geneList); } options.geneLists = list.toArray(new GeneList[list.size()]); } catch (FileNotFoundException e) { LOG.fatal("Cannot find gene list file: " + geneListFilename, e); System.exit(1); } catch (IOException e) { LOG.error("Cannot read gene list file: " + geneListFilename, e); System.exit(2); } finally { IOUtils.closeQuietly(geneListReader); } } /** * Reads data from a file and stores it into a {@link edu.mssm.crover.tables.Table}. * * @param fileName Name of the file to read * @return A table that contains data read from the input file * @throws SyntaxErrorException if there is an error in the file * @throws IOException if the input file cannot be read * @throws UnsupportedFormatException if the file format is not recognized */ protected Table readInputFile(final String fileName) throws IOException, SyntaxErrorException, UnsupportedFormatException { System.out.print("Reading input " + fileName + "... "); final Table table; Reader reader = null; try { // get a reader object for the file and store the file extension final InputStream inputStream = FileUtils.openInputStream(new File(fileName)); String fileExtension = FilenameUtils.getExtension(fileName); if ("gz".equalsIgnoreCase(fileExtension)) { reader = new InputStreamReader(new GZIPInputStream(inputStream)); // strip the ".gz" to get the "real" extension fileExtension = FilenameUtils.getExtension(FilenameUtils.getBaseName(fileName)); } else { reader = new InputStreamReader(inputStream); } if ("soft".equalsIgnoreCase(fileExtension)) { if (fileName.contains("GDS")) { // GEO dataset final TableReader tableReader = new GeoDataSetReader(); table = tableReader.read(reader); } else if (fileName.contains("GSE")) { // GEO series table = readGeoSeries(reader); } else { throw new UnsupportedFormatException(fileName); } } else if ("res".equalsIgnoreCase(fileExtension)) { final TableReader tableReader = new WhiteheadResReader(); table = tableReader.read(reader); } else if ("tmm".equalsIgnoreCase(fileExtension)) { final TableReader tableReader = new ColumbiaTmmReader(); table = tableReader.read(reader); } else if ("iconix".equalsIgnoreCase(fileExtension)) { final TableReader tableReader = new IconixReader(); table = tableReader.read(reader); } else if ("cologne".equalsIgnoreCase(fileExtension)) { final TableReader tableReader = new CologneReader(); table = tableReader.read(reader); } else { throw new UnsupportedFormatException(fileName); } if (table == null) { System.err.println("The input file could not be read. "); System.exit(1); } System.out.println("done"); } finally { IOUtils.closeQuietly(reader); } return table; } private Table readGeoSeries(final Reader reader) { final ArrayTable result = new ArrayTable(); final GeoSoftFamilyParser parser = new GeoSoftFamilyParser(reader); if (parser.skipToDatabaseSection()) { final MutableString databaseName = parser.getSectionAttribute(); System.out.println("Database: " + databaseName); } else { System.out.println("No database section found."); } GEOPlatformIndexed platform = null; if (parser.skipToPlatformSection()) { final MutableString platformName = parser.getSectionAttribute(); System.out.println("Platform: " + platformName); platform = parser.parsePlatform(); platform.setName(platformName); System.out.println("Platform has " + platform.getNumProbeIds() + " probes."); } else { System.out.println("No database section found."); } result.setChunk(platform.getNumProbeIds()); result.setInitialSize(platform.getNumProbeIds()); result.addColumn("ID_REF", String.class); // append probeset ids: for (int probeIndex = 0; probeIndex < platform.getNumProbeIds(); probeIndex++) { final String probeId = platform.getProbesetIdentifier(probeIndex).toString(); result.appendObject(0, probeId); } final DefaultSignalAdapter formatAdapter = new DefaultSignalAdapter(true); while (parser.skipToSampleSection()) { // append signal value for each sample: final MutableString sampleName = parser.getSectionAttribute(); final int newColumnIndex = result.addColumn(sampleName.toString(), double.class); final SampleDataCallback callback = formatAdapter.getCallback(platform); parser.parseSampleData(platform, callback); final AffymetrixSampleData data = (AffymetrixSampleData) callback.getParsedData(); for (final float signal : data.signal) { try { result.appendDoubleValue(newColumnIndex, signal); // result.parseAppend(newColumnIndex, Float.toString(signal)); } catch (TypeMismatchException e) { throw new InternalError("Column must be of type double."); } } formatAdapter.analyzeSampleData(platform, callback, sampleName); } return result; } /** * Removes any column of the input table not required by the classification task. * * @param inputTable Input table. * @param labelValueGroups * @return filtered version of the input table. */ public Table filterInputTable(final Table inputTable, final List<Set<String>> labelValueGroups) { try { Set<String> reduction = getReduction(labelValueGroups); return MicroarrayTrainEvaluate.filterColumnsForTask(inputTable, labelValueGroups, MicroarrayTrainEvaluate.IDENTIFIER_COLUMN_NAME, reduction); } catch (TypeMismatchException e) { LOG.error(e); return null; } catch (InvalidColumnException e) { LOG.error(e); return null; } } private Set<String> getReduction(List<Set<String>> labelValueGroups) { if (splitPlan == null) { return MicroarrayTrainEvaluate.getAllSamples(labelValueGroups); } else { return splitPlan.getSampleIds(splitId, splitType); } } /** * Process with options. This method may exit the JVM at any time. * * @param options Interpreted options. */ @Override public void process(final DAVOptions options) { davOptions = options; } /** * Process an input data table for a specific gene list, options, and binary classification. * This method is used when training a model. * * @param geneList The gene list to exclude the data with * @param inputTable The entire data set before filtering * @param options DAVMode options for processing * @param labelValueGroups Sets of labels for each classification target. Labels of class 0 * appear first, followed by labels of class 1. * @return * @throws TypeMismatchException * @throws InvalidColumnException * @throws ColumnTypeException */ protected Table processTable(final GeneList geneList, final Table inputTable, final DAVOptions options, final List<Set<String>> labelValueGroups) throws TypeMismatchException, InvalidColumnException, ColumnTypeException { return processTable(geneList, inputTable, options, labelValueGroups, false); } /** * Process an input data table for a specific gene list, options, and binary classification. * * @param geneList The gene list to exclude the data with * @param inputTable The entire data set before filtering * @param options DAVMode options for processing * @param labelValueGroups Sets of labels for each classification target. Labels of class 0 appear first, followed by labels of class 1. * @param predictOnly When true, no filtering is done by labelValueGroups. Suitable when the table will be used for prediction with a previously trained model. * @return * @throws TypeMismatchException * @throws InvalidColumnException * @throws ColumnTypeException */ protected Table processTable(final GeneList geneList, final Table inputTable, final DAVOptions options, final List<Set<String>> labelValueGroups, final boolean predictOnly) throws InvalidColumnException, ColumnTypeException, TypeMismatchException { return processTable(geneList, inputTable, options, labelValueGroups, predictOnly, splitId, splitType); } /** * Process an input data table for a specific gene list, options, and binary classification. * * @param geneList The gene list to exclude the data with * @param inputTable The entire data set before filtering * @param options DAVMode options for processing * @param labelValueGroups Sets of labels for each classification target. Labels of class 0 appear first, followed by labels of class 1. * @param predictOnly When true, no filtering is done by labelValueGroups. Suitable when the table will be used for prediction with a previously trained model. * @return * @throws TypeMismatchException * @throws InvalidColumnException * @throws ColumnTypeException */ protected Table processTable(final GeneList geneList, final Table inputTable, final DAVOptions options, final List<Set<String>> labelValueGroups, final boolean predictOnly, final int splitId, final String splitType) throws TypeMismatchException, InvalidColumnException, ColumnTypeException { final Table table; if (isTableCacheEnabled && tableCache.isTableCached(splitId, splitType, options.datasetName)) { table = tableCache.getCachedTable(splitId, splitType, options.datasetName, geneList); rebuildTrainingPlatform(options, table); } else { int idColumnIndex = 0; final Table taskSpecificTable = predictOnly ? inputTable : filterInputTable(inputTable, labelValueGroups); try { idColumnIndex = taskSpecificTable.getColumnIndex(options.IDENTIFIER_COLUMN_NAME); } catch (InvalidColumnException e) { e.printStackTrace(); System.out.println("Column " + options.IDENTIFIER_COLUMN_NAME + " could not be found in input file. Unable to exclude by probeset."); System.exit(10); } final int[] columnSelection = MicroarrayTrainEvaluate.getDoubleColumnIndices(taskSpecificTable); normalize(options, taskSpecificTable); applyFloor(options, taskSpecificTable, columnSelection); // taskSpecificTable = considerFilterTableByGeneList(geneList, options, labelValueGroups, idColumnIndex, // taskSpecificTable); System.gc(); final Table transposedTable = transposeTable(options, taskSpecificTable, idColumnIndex); final Table aggregatedTable = doPathwayAggregation(options, transposedTable, splitId, splitType); rebuildTrainingPlatform(options, aggregatedTable); table = aggregatedTable; if (isTableCacheEnabled) { if (LOG.isDebugEnabled()) { LOG.debug("Saving " + options.datasetName + " to cache"); } tableCache.saveTableToCache(splitId, splitType, options.datasetName, table); } } final Table result = filterByGeneList(geneList, options, labelValueGroups, table); // scaleFeatures(options, predictOnly, aggregated); System.gc(); return result; } private Table filterByGeneList(final GeneList geneList, final DAVOptions options, final List<Set<String>> labelValueGroups, Table aggregated) { final ObjectSet<CharSequence> probeIdOnTable = new ObjectOpenHashSet<CharSequence>(); for (int colIndex = 1; colIndex < aggregated.getColumnNumber(); colIndex++) { probeIdOnTable.add(aggregated.getIdentifier(colIndex)); } final Set<String> validProbesetIds = geneList.calculateProbeSetSelection(probeIdOnTable); aggregated = filterTable(options, aggregated, geneList); // exclude by gene list now that pathway features are available. // System.out.println("aggregated: "+aggregated.toString(aggregated,false)); checkPostFiltering(options, labelValueGroups, aggregated); rebuildTrainingPlatform(options, aggregated); if (!(validProbesetIds != null && validProbesetIds.size() != 0)) { System.err .println("there is no overlap between this gene list and the chip. Stop this evaluation here."); System.exit(10); } return aggregated; } private Table doPathwayAggregation(final DAVOptions options, final Table transposed, final int splitId, final String splitType) { final Table aggregated; if (options.pathways != null) { // Map probesetIds in the pathwayInfo instances to probeset indices in the trainingPlatform: for (final PathwayInfo pi : options.pathways) { final IntList probeIndices = new IntArrayList(); final ObjectSet<MutableString> probesetIds = pi.probesetIds; // sort probesetIds by alphabetical order final ObjectList<MutableString> sortedProbesetIds = new ObjectArrayList<MutableString>(); sortedProbesetIds.addAll(pi.probesetIds); Collections.sort(sortedProbesetIds); // probeset indices will be added in the defined order of the probeset ids: for (final MutableString probeId : probesetIds) { final int anIndex = options.trainingPlatform.getProbeIds().getInt(probeId); // remove result produced when probeset not found on platform: if (anIndex != -1) { probeIndices.add(anIndex); } } pi.probesetIndices = probeIndices; } } final String pathwayComponentPrefix = options.datasetName; if (splitId == 0) { // generating final model from whole training set. We synchronize so that // parallel synchronization of models do not interact with each other // (i.e., partial writing and reading of compound file will not work). synchronized (SEMAPHORE) { aggregated = pathwayHelper.aggregateFeaturesForPathways(options.pathways, transposed, pathwayComponentPrefix, splitType, splitId); } } else { // independent splits, we can do them in parallel: aggregated = pathwayHelper.aggregateFeaturesForPathways(options.pathways, transposed, pathwayComponentPrefix, splitType, splitId); } return aggregated; } private Table transposeTable(final DAVOptions options, final Table taskSpecificTable, final int idColumnIndex) throws InvalidColumnException, TypeMismatchException, ColumnTypeException { // 3. Transpose source. final DefineColumnFromRow columnHelper = new DefineColumnFromRow(idColumnIndex); final Table transposed = taskSpecificTable.transpose(columnHelper); rebuildTrainingPlatform(options, transposed); return transposed; } private void normalize(final DAVOptions options, final Table taskSpecificTable) throws TypeMismatchException, InvalidColumnException { if (options.normalizeFeatures) { MicroarrayTrainEvaluate.normalizeAcrossConditions(taskSpecificTable); if (!options.quiet) { System.out.println("Features are normalized."); } } else { if (!options.quiet) { System.out.println("Features are assumed to have been normalized."); } } } private Table considerFilterTableByGeneList(final GeneList geneList, final DAVOptions options, final List<Set<String>> labelValueGroups, final int idColumnIndex, Table taskSpecificTable) throws TypeMismatchException, InvalidColumnException { final Set<String> validProbesetIds; if (options.pathways == null) { // no pathway processing, we can exclude the input table and keep only probesets in the gene list. validProbesetIds = geneList.calculateProbeSetSelection(taskSpecificTable, idColumnIndex); final RowFilter myFilter = new IdentifierSetRowFilter(validProbesetIds, idColumnIndex); final Table filteredNoPathway = taskSpecificTable.copy(myFilter); assert taskSpecificTable .getRowNumber() != 0 : "Table must have some rows after filtering with gene list."; if (!options.quiet) { System.out.println("Kept " + filteredNoPathway.getRowNumber() + " features after filtering."); } if (!options.quiet) { System.out.println("Kept " + filteredNoPathway.getColumnNumber() + " columns after filtering."); } if (filteredNoPathway.getColumnNumber() == 1) { System.out.println("Condition labels must match input column names for " + labelValueGroups); System.exit(10); } taskSpecificTable = filteredNoPathway; } return taskSpecificTable; } private void applyFloor(final DAVOptions options, final Table taskSpecificTable, final int[] columnSelection) throws TypeMismatchException, InvalidColumnException { if (options.adjustSignalToFloorValue) { // Set a floor on signal at signalFloorValue: final RowFloorAdjustmentCalculator floorAdjust = new RowFloorAdjustmentCalculator(columnSelection, options.signalFloorValue, options.oneChannelArray, options.loggedArray); taskSpecificTable.processRows(floorAdjust); } } protected void scaleFeatures(final DAVOptions options, final boolean predictOnly, final Table aggregated) throws TypeMismatchException, InvalidColumnException { if (options.scaleFeatures) { // 5. Scale values per column (one column=one probeset). LOG.debug("Scaling features"); FeatureTableScaler scaler = null; try { scaler = (FeatureTableScaler) options.scalerClass.newInstance(); } catch (InstantiationException e) { LOG.error(e); } catch (IllegalAccessException e) { LOG.error(e); } assert scaler != null : "scaler must be initialized."; if (!predictOnly) { options.probesetScaleMeanMap = new Object2DoubleLinkedOpenHashMap<MutableString>(); options.probesetScaleRangeMap = new Object2DoubleLinkedOpenHashMap<MutableString>(); scaler.setTrainingMode(options.probesetScaleMeanMap, options.probesetScaleRangeMap); } else { assert options.probesetScaleMeanMap != null : "probesetScaleMeanMap must have been initialized"; assert options.probesetScaleRangeMap != null : "probesetScaleRangeMap must have been initialized"; scaler.setTestSetMode(options.probesetScaleMeanMap, options.probesetScaleRangeMap); } scaler.processTable(aggregated, MicroarrayTrainEvaluate.getDoubleColumnIndices(aggregated)); } else { LOG.info("Features are not scaled."); } } private void setupScalerOptions(final JSAPResult result, final DAVOptions options) { if (options.scaleFeatures) { // 5. Scale values per column (one column=one probeset). LOG.debug("Scaling features"); if (options.scalerClassName == null) { if (!options.percentileScaling) { LOG.debug("Using mean and min/max estimators for scaling features."); options.scalerClass = MinMaxScalingRowProcessor.class; } else { // new code LOG.debug("Using percentile estimators for scaling features."); options.scalerClass = PercentileScalingRowProcessor.class; } } else { try { final Class<?> aClass = Class.forName(options.scalerClassName); options.scalerClass = (Class<? extends FeatureScaler>) aClass; } catch (ClassNotFoundException e) { LOG.error("Scaler class cannot be found (classname:" + options.scalerClassName + ").", e); System.exit(1); } } } else { LOG.info("Features are not scaled."); options.scalerClass = FeatureScaler.class; } } private boolean exceptionOnCheckPostFilteringFail; public void setExceptionOnCheckPostFilteringFail(final boolean exceptionOnCheckPostFilteringFail) { this.exceptionOnCheckPostFilteringFail = exceptionOnCheckPostFilteringFail; } public boolean getExceptionOnCheckPostFilteringFail() { return exceptionOnCheckPostFilteringFail; } private void checkPostFiltering(final DAVOptions options, final List<Set<String>> labelValueGroups, final Table filtered) { assert filtered.getRowNumber() != 0 : "Table must have some rows after filtering with gene list."; if (!options.quiet) { System.out.println("Kept " + filtered.getRowNumber() + " samples after filtering."); } if (!options.quiet) { System.out.println("Kept " + (filtered.getColumnNumber() - 1) + " features after filtering."); } if (filtered.getColumnNumber() == 1) { System.out.println("Condition labels must match input column names for " + labelValueGroups); if (exceptionOnCheckPostFilteringFail) { throw new IllegalArgumentException( "Condition labels must match input column names for " + labelValueGroups); } else { System.exit(10); } } } protected Table filterTable(final DAVOptions options, final Table source, final GeneList geneList) { try { final Set<String> columnsToKeep = new ObjectOpenHashSet<String>(); columnsToKeep.add("ID_REF"); for (int colIndex = 1; colIndex < source.getColumnNumber(); colIndex++) { final String colId = source.getIdentifier(colIndex); if (geneList.isProbesetInList(colId)) { columnsToKeep.add(colId); } } final Table result = source.copy(new AcceptAllRowFilter(), new KeepSubSetColumnFilter(columnsToKeep)); /* final IntList columnIndicesToRemove = new IntArrayList(); final ObjectList<String> columnIdsToRemove = new ObjectArrayList<String>(); int index=0; for (final int toRemoveIndex : columnIndicesToRemove) { result.removeColumn(toRemoveIndex, columnIdsToRemove.get(index++)); } */ rebuildTrainingPlatform(options, result); return result; } catch (TypeMismatchException e) { throw new InternalError("Should not happen."); } catch (InvalidColumnException e) { throw new InternalError("Should not happen."); } } private void rebuildTrainingPlatform(final DAVOptions options, final Table finalTable) { // associate features with probeset through the trainingPlatform instance: options.resetTrainingPlatform(); int featureIndex = 0; for (int columnIndex = 0; columnIndex < finalTable.getColumnNumber(); columnIndex++) { final ArrayTable.ColumnDescription column = finalTable.getColumnValues(columnIndex); if (!column.identifier.equals("ID_REF")) { final int registeredProbesetIndex = options.registerProbeset(column.identifier); if (LOG.isTraceEnabled()) { LOG.trace(String.format("registering probeset %s index %d", column.identifier, registeredProbesetIndex)); } if (featureIndex != registeredProbesetIndex) { throw new IllegalArgumentException("Feature index (" + featureIndex + ") must match probeset index (" + registeredProbesetIndex + ") in training platform"); } featureIndex++; } } } /** * Obtain a classifier for training with known labels. * * @param processedTable * @param labelValueGroups * @return * @throws TypeMismatchException * @throws InvalidColumnException */ protected ClassificationHelper getClassifier(final Table processedTable, final List<Set<String>> labelValueGroups) throws TypeMismatchException, InvalidColumnException { final ClassificationHelper helper = new ClassificationHelper(); // libSVM: // Calculate SumOfSquares sum over all samples x.x: final SumOfSquaresCalculatorRowProcessor calculator = new SumOfSquaresCalculatorRowProcessor(processedTable, davOptions.IDENTIFIER_COLUMN_NAME); processedTable.processRows(calculator); // use the svmLight default C value, so that results are comparable: final double C = processedTable.getRowNumber() / calculator.getSumOfSquares(); final double gamma = 1d / processedTable.getColumnNumber(); // 1/<number of features> default for libSVM try { final Classifier classifier = (Classifier) davOptions.classiferClass.newInstance(); final LoadClassificationProblem loader = new LoadClassificationProblem(); final ClassificationProblem problem = classifier.newProblem(0); loader.load(problem, processedTable, "ID_REF", labelValueGroups); helper.problem = problem; if (classifier instanceof LibSvmClassifier) { // set default value of C classifier.getParameters().setParameter("C", C); classifier.getParameters().setParameter("gamma", gamma); } if (davOptions.classifierParameters.length == 1 && davOptions.classifierParameters[0].length() == 0) { davOptions.classifierParameters = ArrayUtils.EMPTY_STRING_ARRAY; } helper.parseParameters(classifier, davOptions.classifierParameters); helper.classifier = classifier; helper.parameters = classifier.getParameters(); return helper; } catch (IllegalAccessException e) { LOG.error("Cannot instantiate classifier.", e); } catch (InstantiationException e) { LOG.error("Cannot instantiate classifier.", e); } assert false : "Could not instantiate classifier"; return null; } /** * Filter processed table to keep only a subset of samples. * * @param processedTable * @param keepSampleIds Those sample ids to keep in the filtered table. * @return filtered table. */ protected Table filterSamples(final Table processedTable, final ObjectSet<String> keepSampleIds) { final int idColumnIndex = 0; final RowFilter myFilter = new IdentifierSetRowFilter(keepSampleIds, idColumnIndex); try { return processedTable.copy(myFilter); } catch (TypeMismatchException e) { throw new InternalError("Must not happen."); } catch (InvalidColumnException e) { throw new InternalError("Must not happen."); } } public void removeFromCache(final int splitId, final String splitType, final String datasetName) { if (isTableCacheEnabled) { tableCache.clearFromCache(splitId, splitType, datasetName); } } }