Java tutorial
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Resample.java
 *    Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.unsupervised.instance;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;
import weka.gui.ProgrammaticProperty;

/**
 * <!-- globalinfo-start --> Produces a random subsample of a dataset using
 * either sampling with replacement or without replacement. The original dataset
 * must fit entirely in memory. The number of instances in the generated dataset
 * may be specified. When used in batch mode, subsequent batches are NOT
 * resampled.
 * <p/>
 * <!-- globalinfo-end -->
 *
 * <!-- options-start --> Valid options are:
 * <p/>
 *
 * <pre>
 * -S &lt;num&gt;
 *  Specify the random number seed (default 1)
 * </pre>
 *
 * <pre>
 * -Z &lt;num&gt;
 *  The size of the output dataset, as a percentage of
 *  the input dataset (default 100)
 * </pre>
 *
 * <pre>
 * -no-replacement
 *  Disables replacement of instances
 *  (default: with replacement)
 * </pre>
 *
 * <pre>
 * -V
 *  Inverts the selection - only available with '-no-replacement'.
 * </pre>
 *
 * <!-- options-end -->
 *
 * @author Len Trigg (len@reeltwo.com)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @author Eibe Frank
 * @version $Revision$
 */
public class Resample extends Filter implements UnsupervisedFilter,
  OptionHandler, Randomizable, WeightedAttributesHandler {

  /** for serialization */
  static final long serialVersionUID = 3119607037607101160L;

  /** The subsample size, percent of original set, default 100% */
  protected double m_SampleSizePercent = 100;

  /** The random number generator seed */
  protected int m_RandomSeed = 1;

  /** Whether to perform sampling with replacement or without */
  protected boolean m_NoReplacement = false;

  /**
   * Whether to invert the selection (only if instances are drawn WITHOUT
   * replacement)
   *
   * @see #m_NoReplacement
   */
  protected boolean m_InvertSelection = false;

  /**
   * Returns a string describing this filter
   *
   * @return a description of the filter suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Produces a random subsample of a dataset using either sampling with "
      + "replacement or without replacement. The original dataset must fit "
      + "entirely in memory. The number of instances in the generated dataset "
      + "may be specified. When used in batch mode, subsequent batches are "
      + "NOT resampled.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration<Option> listOptions() {

    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option("\tSpecify the random number seed (default 1)",
      "S", 1, "-S <num>"));

    result.addElement(new Option(
      "\tThe size of the output dataset, as a percentage of\n"
        + "\tthe input dataset (default 100)", "Z", 1, "-Z <num>"));

    result.addElement(new Option("\tDisables replacement of instances\n"
      + "\t(default: with replacement)", "no-replacement", 0,
      "-no-replacement"));

    result.addElement(new Option(
      "\tInverts the selection - only available with '-no-replacement'.", "V",
      0, "-V"));

    return result.elements();
  }

  /**
   * Parses a given list of options. Every option is set or reset to its
   * default by this call, so no state from an earlier call is retained.
   * <p/>
   *
   * <!-- options-start --> Valid options are:
   * <p/>
   *
   * <pre>
   * -S &lt;num&gt;
   *  Specify the random number seed (default 1)
   * </pre>
   *
   * <pre>
   * -Z &lt;num&gt;
   *  The size of the output dataset, as a percentage of
   *  the input dataset (default 100)
   * </pre>
   *
   * <pre>
   * -no-replacement
   *  Disables replacement of instances
   *  (default: with replacement)
   * </pre>
   *
   * <pre>
   * -V
   *  Inverts the selection - only available with '-no-replacement'.
   * </pre>
   *
   * <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {

    String tmpStr = Utils.getOption('S', options);
    if (tmpStr.length() != 0) {
      setRandomSeed(Integer.parseInt(tmpStr));
    } else {
      setRandomSeed(1);
    }

    tmpStr = Utils.getOption('Z', options);
    if (tmpStr.length() != 0) {
      setSampleSizePercent(Double.parseDouble(tmpStr));
    } else {
      setSampleSizePercent(100);
    }

    setNoReplacement(Utils.getFlag("no-replacement", options));

    // Always assign the inversion flag so that a value left over from an
    // earlier call is reset. The short-circuit keeps '-V' unconsumed when
    // '-no-replacement' is absent, so it is still reported as an illegal
    // option by checkForRemainingOptions() below.
    setInvertSelection(getNoReplacement() && Utils.getFlag('V', options));

    // Re-apply the input format so the new settings are validated against it.
    if (getInputFormat() != null) {
      setInputFormat(getInputFormat());
    }

    Utils.checkForRemainingOptions(options);
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {

    Vector<String> result = new Vector<String>();

    result.add("-S");
    result.add("" + getRandomSeed());

    result.add("-Z");
    result.add("" + getSampleSizePercent());

    if (getNoReplacement()) {
      result.add("-no-replacement");
      // '-V' is only meaningful (and only accepted) without replacement
      if (getInvertSelection()) {
        result.add("-V");
      }
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String randomSeedTipText() {
    return "The seed used for random sampling.";
  }

  /**
   * Gets the random number seed.
   *
   * @return the random number seed.
   */
  public int getRandomSeed() {
    return m_RandomSeed;
  }

  /**
   * Sets the random number seed.
   *
   * @param newSeed the new random number seed.
   */
  public void setRandomSeed(int newSeed) {
    m_RandomSeed = newSeed;
  }

  /**
   * Sets the random number seed; delegates to {@link #setRandomSeed(int)}.
   *
   * @param seed the new random number seed.
   */
  @ProgrammaticProperty
  public void setSeed(int seed) {
    setRandomSeed(seed);
  }

  /**
   * Gets the random number seed; delegates to {@link #getRandomSeed()}.
   *
   * @return the random number seed.
   */
  @ProgrammaticProperty
  public int getSeed() {
    return getRandomSeed();
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String sampleSizePercentTipText() {
    return "Size of the subsample as a percentage of the original dataset.";
  }

  /**
   * Gets the subsample size as a percentage of the original set.
   *
   * @return the subsample size
   */
  public double getSampleSizePercent() {
    return m_SampleSizePercent;
  }

  /**
   * Sets the size of the subsample, as a percentage of the original set. The
   * value is stored without validation; values above 100 are rejected later
   * (by {@link #setInputFormat(Instances)} and when the subsample is created)
   * if sampling without replacement is enabled.
   *
   * @param newSampleSizePercent the subsample set size as a percentage of the
   *          original set.
   */
  public void setSampleSizePercent(double newSampleSizePercent) {
    m_SampleSizePercent = newSampleSizePercent;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String noReplacementTipText() {
    return "Disables the replacement of instances.";
  }

  /**
   * Gets whether instances are drawn with or without replacement.
   *
   * @return true if the replacement is disabled
   */
  public boolean getNoReplacement() {
    return m_NoReplacement;
  }

  /**
   * Sets whether instances are drawn with or without replacement.
   *
   * @param value if true then the replacement of instances is disabled
   */
  public void setNoReplacement(boolean value) {
    m_NoReplacement = value;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return "Inverts the selection (only if instances are drawn WITHOUT replacement).";
  }

  /**
   * Gets whether selection is inverted (only if instances are drawn WITHOUT
   * replacement).
   *
   * @return true if the selection is inverted
   * @see #m_NoReplacement
   */
  public boolean getInvertSelection() {
    return m_InvertSelection;
  }

  /**
   * Sets whether the selection is inverted (only if instances are drawn
   * WITHOUT replacement).
   *
   * @param value if true then selection is inverted
   */
  public void setInvertSelection(boolean value) {
    m_InvertSelection = value;
  }

  /**
   * Returns the Capabilities of this filter.
   *
   * @return the capabilities of this object
   * @see Capabilities
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enableAllAttributes();
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enableAllClasses();
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);

    return result;
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input instance
   *          structure (any instances contained in the object are ignored -
   *          only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws Exception if the input format can't be set successfully, or if
   *           more than 100% is requested while sampling without replacement
   */
  @Override
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if (getNoReplacement() && getSampleSizePercent() > 100) {
      throw new WekaException(
        "Sample size cannot exceed 100% for sampling *without* replacement!");
    }

    super.setInputFormat(instanceInfo);
    // The filter only selects instances, so the structure is passed through.
    setOutputFormat(instanceInfo);
    return true;
  }

  /**
   * Input an instance for filtering. Filter requires all training instances be
   * read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be collected with output().
   * @throws IllegalStateException if no input structure has been defined
   */
  @Override
  public boolean input(Instance instance) {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }
    if (isFirstBatchDone()) {
      // Later batches are not resampled: pass the instance straight through.
      push(instance);
      return true;
    } else {
      // First batch: hold the instance until batchFinished() is called.
      bufferInput(instance);
      return false;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished. If the filter
   * requires all instances prior to filtering, output() may now be called to
   * retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @throws IllegalStateException if no input structure has been defined
   * @throws IllegalArgumentException if more instances are requested than are
   *           available while sampling without replacement
   */
  @Override
  public boolean batchFinished() {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (!isFirstBatchDone()) {
      // Do the subsample, and clear the input instances.
      createSubsample();
    }
    flushInput();

    m_NewBatch = true;
    m_FirstBatchDone = true;
    return (numPendingOutput() != 0);
  }

  /**
   * Creates a subsample of the current set of input instances. The output
   * instances are pushed onto the output queue for collection.
   * <p/>
   * The sample size is the number of input instances multiplied by the
   * configured percentage, truncated towards zero. Without replacement, the
   * chosen instances (or, if the selection is inverted, the remaining ones)
   * are output; with replacement, instances are drawn independently.
   *
   * @throws IllegalArgumentException if sampling without replacement and the
   *           sample size exceeds the number of available instances
   */
  protected void createSubsample() {

    // Instances are read from the object returned by getInputFormat()
    Instances data = getInputFormat();

    int numEligible = data.numInstances();
    int sampleSize = (int) (numEligible * m_SampleSizePercent / 100);
    Random random = new Random(m_RandomSeed);

    if (getNoReplacement()) {

      // The percentage can be changed after setInputFormat() validated it.
      // Fail up front with a descriptive message instead of letting
      // Random.nextInt(0) throw part-way through the selection loop.
      if (sampleSize > numEligible) {
        throw new IllegalArgumentException(
          "Sample size cannot exceed 100% for sampling *without* replacement!");
      }

      // Set up array of indices
      int[] selected = new int[numEligible];
      for (int j = 0; j < numEligible; j++) {
        selected[j] = j;
      }

      // Sampling without replacement: each draw swaps the chosen index to the
      // end of the still-eligible prefix, so chosen indices accumulate at the
      // tail of the array and the prefix holds the ones not yet chosen.
      for (int i = 0; i < sampleSize; i++) {
        int chosenLocation = random.nextInt(numEligible);
        int chosen = selected[chosenLocation];
        numEligible--;
        selected[chosenLocation] = selected[numEligible];
        selected[numEligible] = chosen;
      }

      // Do we need to invert the selection?
      if (getInvertSelection()) {

        // Take the first numEligible instances
        // because they have not been selected
        for (int j = 0; j < numEligible; j++) {
          push(data.instance(selected[j]), false); // No need to copy instance
        }
      } else {

        // Take the elements that have been selected
        for (int j = numEligible; j < data.numInstances(); j++) {
          push(data.instance(selected[j]), false); // No need to copy instance
        }
      }
    } else {

      // Sampling with replacement
      for (int i = 0; i < sampleSize; i++) {
        push(data.instance(random.nextInt(numEligible)), false); // No need to copy instance
      }
    }
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision$");
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter: use -h for help
   */
  public static void main(String[] argv) {
    runFilter(new Resample(), argv);
  }
}