Java tutorial
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * WekaBootstrapping.java
 * Copyright (C) 2017 University of Waikato, Hamilton, NZ
 */

package adams.flow.transformer;

import adams.core.QuickInfoHelper;
import adams.core.Randomizable;
import adams.core.base.BaseDouble;
import adams.data.spreadsheet.DefaultSpreadSheet;
import adams.data.spreadsheet.Row;
import adams.data.spreadsheet.SpreadSheet;
import adams.data.statistics.Percentile;
import adams.data.weka.WekaLabelIndex;
import adams.flow.container.WekaEvaluationContainer;
import adams.flow.core.EvaluationHelper;
import adams.flow.core.EvaluationStatistic;
import adams.flow.core.Token;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import weka.classifiers.Evaluation;
import weka.classifiers.evaluation.NominalPrediction;
import weka.classifiers.evaluation.Prediction;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.logging.Level;

/**
 <!-- globalinfo-start -->
 * Performs bootstrapping on the incoming evaluation and outputs a spreadsheet where each row represents the results from bootstrapping sub-sample.
 * <br><br>
 <!-- globalinfo-end -->
 *
 <!-- flow-summary-start -->
 * Input/output:<br>
 * - accepts:<br>
 * &nbsp;&nbsp;&nbsp;weka.classifiers.Evaluation<br>
 * &nbsp;&nbsp;&nbsp;adams.flow.container.WekaEvaluationContainer<br>
 * - generates:<br>
 * &nbsp;&nbsp;&nbsp;adams.data.spreadsheet.SpreadSheet<br>
 * <br><br>
 * Container information:<br>
 * - adams.flow.container.WekaEvaluationContainer: Evaluation, Model, Prediction output, Original indices
 * <br><br>
 <!-- flow-summary-end -->
 *
 <!-- options-start -->
 * <pre>-logging-level &lt;OFF|SEVERE|WARNING|INFO|CONFIG|FINE|FINER|FINEST&gt; (property: loggingLevel)
 * &nbsp;&nbsp;&nbsp;The logging level for outputting errors and debugging output.
 * &nbsp;&nbsp;&nbsp;default: WARNING
 * </pre>
 *
 * <pre>-name &lt;java.lang.String&gt; (property: name)
 * &nbsp;&nbsp;&nbsp;The name of the actor.
 * &nbsp;&nbsp;&nbsp;default: WekaBootstrapping
 * </pre>
 *
 * <pre>-annotation &lt;adams.core.base.BaseAnnotation&gt; (property: annotations)
 * &nbsp;&nbsp;&nbsp;The annotations to attach to this actor.
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 * <pre>-skip &lt;boolean&gt; (property: skip)
 * &nbsp;&nbsp;&nbsp;If set to true, transformation is skipped and the input token is just forwarded
 * &nbsp;&nbsp;&nbsp;as it is.
 * &nbsp;&nbsp;&nbsp;default: false
 * </pre>
 *
 * <pre>-stop-flow-on-error &lt;boolean&gt; (property: stopFlowOnError)
 * &nbsp;&nbsp;&nbsp;If set to true, the flow execution at this level gets stopped in case this
 * &nbsp;&nbsp;&nbsp;actor encounters an error; the error gets propagated; useful for critical
 * &nbsp;&nbsp;&nbsp;actors.
 * &nbsp;&nbsp;&nbsp;default: false
 * </pre>
 *
 * <pre>-silent &lt;boolean&gt; (property: silent)
 * &nbsp;&nbsp;&nbsp;If enabled, then no errors are output in the console; Note: the enclosing
 * &nbsp;&nbsp;&nbsp;actor handler must have this enabled as well.
 * &nbsp;&nbsp;&nbsp;default: false
 * </pre>
 *
 * <pre>-seed &lt;long&gt; (property: seed)
 * &nbsp;&nbsp;&nbsp;The seed for generating the random sub-samples.
 * &nbsp;&nbsp;&nbsp;default: 1
 * </pre>
 *
 * <pre>-num-subsamples &lt;int&gt; (property: numSubSamples)
 * &nbsp;&nbsp;&nbsp;The number of random sub-samples to generate.
 * &nbsp;&nbsp;&nbsp;default: 10
 * &nbsp;&nbsp;&nbsp;minimum: 1
 * </pre>
 *
 * <pre>-percentage &lt;double&gt; (property: percentage)
 * &nbsp;&nbsp;&nbsp;The percentage of the sub-sample size (between 0 and 1).
 * &nbsp;&nbsp;&nbsp;default: 0.66
 * &nbsp;&nbsp;&nbsp;minimum: 1.0E-4
 * &nbsp;&nbsp;&nbsp;maximum: 1.0
 * </pre>
 *
 * <pre>-statistic &lt;Number correct|Number incorrect|Number unclassified|Percent correct|Percent incorrect|Percent unclassified|Kappa statistic|Mean absolute error|Root mean squared error|Relative absolute error|Root relative squared error|Correlation coefficient|SF prior entropy|SF scheme entropy|SF entropy gain|SF mean prior entropy|SF mean scheme entropy|SF mean entropy gain|KB information|KB mean information|KB relative information|True positive rate|Num true positives|False positive rate|Num false positives|True negative rate|Num true negatives|False negative rate|Num false negatives|IR precision|IR recall|F measure|Matthews correlation coefficient|Area under ROC|Area under PRC|Weighted true positive rate|Weighted false positive rate|Weighted true negative rate|Weighted false negative rate|Weighted IR precision|Weighted IR recall|Weighted F measure|Weighted Matthews correlation coefficient|Weighted area under ROC|Weighted area under PRC|Unweighted Macro F measure|Unweighted Micro F measure|Bias|R^2&gt; [-statistic ...] (property: statisticValues)
 * &nbsp;&nbsp;&nbsp;The evaluation values to extract and turn into a spreadsheet.
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 * <pre>-class-index &lt;adams.data.weka.WekaLabelIndex&gt; (property: classIndex)
 * &nbsp;&nbsp;&nbsp;The index of class label (eg used for AUC).
 * &nbsp;&nbsp;&nbsp;default: first
 * &nbsp;&nbsp;&nbsp;example: An index is a number starting with 1; apart from label names (case-sensitive), the following placeholders can be used as well: first, second, third, last_2, last_1, last; numeric indices can be enforced by preceding them with '#' (eg '#12'); label names can be surrounded by double quotes.
 * </pre>
 *
 * <pre>-percentile &lt;adams.core.base.BaseDouble&gt; [-percentile ...] (property: percentiles)
 * &nbsp;&nbsp;&nbsp;The percentiles to calculate for the errors (0-1; 0.95 is 95th percentile
 * &nbsp;&nbsp;&nbsp;).
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 * <pre>-error-calculation &lt;ACTUAL_MINUS_PREDICTED|PREDICTED_MINUS_ACTUAL|BOTH|ABSOLUTE&gt; (property: errorCalculation)
 * &nbsp;&nbsp;&nbsp;Determines how to calculate the error.
 * &nbsp;&nbsp;&nbsp;default: ACTUAL_MINUS_PREDICTED
 * </pre>
 *
 * <pre>-with-replacement &lt;boolean&gt; (property: withReplacement)
 * &nbsp;&nbsp;&nbsp;If enabled, predictions are drawn using with replacement (i.e., duplicates
 * &nbsp;&nbsp;&nbsp;are possible).
 * &nbsp;&nbsp;&nbsp;default: true
 * </pre>
 *
 <!-- options-end -->
 *
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision$
 */
public class WekaBootstrapping
  extends AbstractTransformer
  implements Randomizable {

  private static final long serialVersionUID = 2599800854948082354L;

  /** how to calculate the error. */
  public enum ErrorCalculation {
    ACTUAL_MINUS_PREDICTED,
    PREDICTED_MINUS_ACTUAL,
    BOTH,
    ABSOLUTE,
  }

  /** the random number seed. */
  protected long m_Seed;

  /** the number of random sub-samples to generate. */
  protected int m_NumSubSamples;

  /** the size for the sub-samples (0-1). */
  protected double m_Percentage;

  /** the comparison fields. */
  protected EvaluationStatistic[] m_StatisticValues;

  /** the index of the class label. */
  protected WekaLabelIndex m_ClassIndex;

  /** the percentiles to output (0-1). */
  protected BaseDouble[] m_Percentiles;

  /** the error calculation. */
  protected ErrorCalculation m_ErrorCalculation;

  /** whether to use with replacement or not. */
  protected boolean m_WithReplacement;

  /**
   * Returns a string describing the object.
   *
   * @return 			a description suitable for displaying in the gui
   */
  @Override
  public String globalInfo() {
    return
      "Performs bootstrapping on the incoming evaluation and outputs a "
        + "spreadsheet where each row represents the results from "
        + "bootstrapping sub-sample.";
  }

  /**
   * Adds options to the internal list of options.
   */
  @Override
  public void defineOptions() {
    super.defineOptions();

    m_OptionManager.add(
      "seed", "seed",
      1L);

    m_OptionManager.add(
      "num-subsamples", "numSubSamples",
      10, 1, null);

    m_OptionManager.add(
      "percentage", "percentage",
      0.66, 0.0001, 1.0);

    m_OptionManager.add(
      "statistic", "statisticValues",
      new EvaluationStatistic[0]);

    m_OptionManager.add(
      "class-index", "classIndex",
      new WekaLabelIndex(WekaLabelIndex.FIRST));

    m_OptionManager.add(
      "percentile", "percentiles",
      new BaseDouble[0]);

    m_OptionManager.add(
      "error-calculation", "errorCalculation",
      ErrorCalculation.ACTUAL_MINUS_PREDICTED);

    m_OptionManager.add(
      "with-replacement", "withReplacement",
      true);
  }

  /**
   * Sets the seed value.
   *
   * @param value	the seed
   */
  @Override
  public void setSeed(long value) {
    m_Seed = value;
    reset();
  }

  /**
   * Returns the seed value.
   *
   * @return  		the seed
   */
  @Override
  public long getSeed() {
    return m_Seed;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  @Override
  public String seedTipText() {
    return "The seed for generating the random sub-samples.";
  }

  /**
   * Sets the number sub-samples to generate. Values rejected by the option
   * manager's validity check are ignored.
   *
   * @param value	the number of sub-samples
   */
  public void setNumSubSamples(int value) {
    if (getOptionManager().isValid("numSubSamples", value)) {
      m_NumSubSamples = value;
      reset();
    }
  }

  /**
   * Returns the number of sub-samples to generate.
   *
   * @return  		the number of sub-samples
   */
  public int getNumSubSamples() {
    return m_NumSubSamples;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String numSubSamplesTipText() {
    return "The number of random sub-samples to generate.";
  }

  /**
   * Sets the percentage (0-1). Values rejected by the option manager's
   * validity check are ignored.
   *
   * @param value	the percentage
   */
  public void setPercentage(double value) {
    if (getOptionManager().isValid("percentage", value)) {
      m_Percentage = value;
      reset();
    }
  }

  /**
   * Returns the percentage (0-1).
   *
   * @return  		the percentage
   */
  public double getPercentage() {
    return m_Percentage;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String percentageTipText() {
    return "The percentage of the sub-sample size (between 0 and 1).";
  }

  /**
   * Sets the values to extract.
   *
   * @param value	the value
   */
  public void setStatisticValues(EvaluationStatistic[] value) {
    m_StatisticValues = value;
    reset();
  }

  /**
   * Returns the values to extract.
   *
   * @return		the value
   */
  public EvaluationStatistic[] getStatisticValues() {
    return m_StatisticValues;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String statisticValuesTipText() {
    return "The evaluation values to extract and turn into a spreadsheet.";
  }

  /**
   * Sets the index of class label index (1-based).
   *
   * @param value	the label index
   */
  public void setClassIndex(WekaLabelIndex value) {
    m_ClassIndex = value;
    reset();
  }

  /**
   * Returns the current index of class label (1-based).
   *
   * @return		the label index
   */
  public WekaLabelIndex getClassIndex() {
    return m_ClassIndex;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String classIndexTipText() {
    return "The index of class label (eg used for AUC).";
  }

  /**
   * Sets the percentiles to calculate for the errors.
   *
   * @param value	the percentiles (0-1; 0.95 is 95th percentile)
   */
  public void setPercentiles(BaseDouble[] value) {
    m_Percentiles = value;
    reset();
  }

  /**
   * Returns the percentiles to calculate for the errors.
   *
   * @return		the percentiles (0-1; 0.95 is 95th percentile)
   */
  public BaseDouble[] getPercentiles() {
    return m_Percentiles;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String percentilesTipText() {
    return "The percentiles to calculate for the errors (0-1; 0.95 is 95th percentile).";
  }

  /**
   * Sets how to calculate the errors for the percentiles.
   *
   * @param value	the type
   */
  public void setErrorCalculation(ErrorCalculation value) {
    m_ErrorCalculation = value;
    reset();
  }

  /**
   * Returns how to calculate the errors for the percentiles.
   *
   * @return		the type
   */
  public ErrorCalculation getErrorCalculation() {
    return m_ErrorCalculation;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String errorCalculationTipText() {
    return "Determines how to calculate the error.";
  }

  /**
   * Sets whether to draw predictions using replacement.
   *
   * @param value	true if with replacement
   */
  public void setWithReplacement(boolean value) {
    m_WithReplacement = value;
    reset();
  }

  /**
   * Returns whether to draw predictions using replacement.
   *
   * @return		true if with replacement
   */
  public boolean getWithReplacement() {
    return m_WithReplacement;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String withReplacementTipText() {
    return "If enabled, predictions are drawn using with replacement (i.e., duplicates are possible).";
  }

  /**
   * Returns a quick info about the actor, which will be displayed in the GUI.
   *
   * @return		null if no info available, otherwise short string
   */
  @Override
  public String getQuickInfo() {
    String	result;

    result  = QuickInfoHelper.toString(this, "seed", m_Seed, "seed: ");
    result += QuickInfoHelper.toString(this, "numSubSamples", m_NumSubSamples, ", # sub: ");
    result += QuickInfoHelper.toString(this, "percentage", m_Percentage, ", percentage: ");
    result += QuickInfoHelper.toString(this, "statisticValues", m_StatisticValues.length + " statistic" + (m_StatisticValues.length != 1 ? "s" : ""), ", ");
    result += QuickInfoHelper.toString(this, "classIndex", m_ClassIndex, ", class label: ");
    result += QuickInfoHelper.toString(this, "percentiles", m_Percentiles.length + " percentile" + (m_Percentiles.length != 1 ? "s" : ""), ", ");
    result += QuickInfoHelper.toString(this, "errorCalculation", m_ErrorCalculation, ", errors: ");

    return result;
  }

  /**
   * Returns the class that the consumer accepts.
   *
   * @return		the Class of objects that can be processed
   */
  @Override
  public Class[] accepts() {
    return new Class[]{Evaluation.class, WekaEvaluationContainer.class};
  }

  /**
   * Returns the class of objects that it generates.
   *
   * @return		the Class of the generated tokens
   */
  @Override
  public Class[] generates() {
    return new Class[]{SpreadSheet.class};
  }

  /**
   * Creates the output spreadsheet and fills in its header row: the
   * sub-sample column, one column per configured statistic and the
   * percentile column(s) that the configured error calculation requires.
   *
   * @return		the spreadsheet with only the header row set up
   */
  private SpreadSheet createBootstrapSheet() {
    SpreadSheet	sheet;
    Row		row;
    int		i;

    sheet = new DefaultSpreadSheet();
    row   = sheet.getHeaderRow();
    row.addCell("S").setContentAsString("Subsample");
    for (EvaluationStatistic s: m_StatisticValues)
      row.addCell(s.toString()).setContentAsString(s.toString());
    for (i = 0; i < m_Percentiles.length; i++) {
      switch (m_ErrorCalculation) {
	case ACTUAL_MINUS_PREDICTED:
	  row.addCell("perc-AmP-" + i).setContentAsString("Percentile-AmP-" + m_Percentiles[i]);
	  break;
	case PREDICTED_MINUS_ACTUAL:
	  row.addCell("perc-PmA-" + i).setContentAsString("Percentile-PmA-" + m_Percentiles[i]);
	  break;
	case ABSOLUTE:
	  row.addCell("perc-Abs-" + i).setContentAsString("Percentile-Abs-" + m_Percentiles[i]);
	  break;
	case BOTH:
	  row.addCell("perc-AmP-" + i).setContentAsString("Percentile-AmP-" + m_Percentiles[i]);
	  row.addCell("perc-PmA-" + i).setContentAsString("Percentile-PmA-" + m_Percentiles[i]);
	  break;
	default:
	  throw new IllegalStateException("Unhandled error calculation: " + m_ErrorCalculation);
      }
    }

    return sheet;
  }

  /**
   * Draws the prediction indices for one sub-sample into {@code subset},
   * replacing its previous content.
   *
   * @param indices	the indices of all predictions; gets shuffled in place
   * 			when drawing without replacement
   * @param subset	the list receiving the drawn indices
   * @param size	the number of indices to draw
   * @param random	the random number generator to use
   */
  private void drawBootstrapSubset(TIntList indices, TIntList subset, int size, Random random) {
    int		i;

    subset.clear();
    if (m_WithReplacement) {
      for (i = 0; i < size; i++)
	subset.add(indices.get(random.nextInt(indices.size())));
    }
    else {
      indices.shuffle(random);
      for (i = 0; i < size; i++)
	subset.add(indices.get(i));
    }
  }

  /**
   * Builds a "fake" evaluation by replaying the recorded predictions of the
   * sub-sample against the dataset holding the corresponding actual values.
   *
   * @param preds	all predictions of the original evaluation
   * @param subset	the indices of the predictions in this sub-sample
   * @param data	the dataset with the actual values; row i belongs to
   * 			subset element i
   * @param numeric	whether the class attribute is numeric
   * @return		the generated evaluation
   * @throws Exception	if setting up or feeding the evaluation fails
   */
  private Evaluation evaluateBootstrapSubset(List<Prediction> preds, TIntList subset, Instances data, boolean numeric) throws Exception {
    Evaluation	eval;
    Prediction	pred;
    int		i;

    eval = new Evaluation(data);
    for (i = 0; i < subset.size(); i++) {
      pred = preds.get(subset.get(i));
      if (numeric)
	eval.evaluateModelOnceAndRecordPrediction(
	  new double[]{pred.predicted()}, data.instance(i));
      else
	eval.evaluateModelOnceAndRecordPrediction(
	  ((NominalPrediction) pred).distribution().clone(), data.instance(i));
    }

    return eval;
  }

  /**
   * Adds the percentile cells for one sub-sample to the row. The percentile
   * collections are built once per sub-sample (not once per requested
   * percentile) and only for the error arrays that the configured error
   * calculation actually fills.
   *
   * @param row		the row to add the cells to
   * @param errors	the "actual - predicted" (or absolute) errors
   * @param errorsRev	the "predicted - actual" errors
   */
  private void addBootstrapPercentiles(Row row, Double[] errors, Double[] errorsRev) {
    Percentile<Double>	perc;
    Percentile<Double>	percRev;
    int			i;

    if (m_Percentiles.length == 0)
      return;

    perc    = null;
    percRev = null;
    if (m_ErrorCalculation != ErrorCalculation.PREDICTED_MINUS_ACTUAL) {
      perc = new Percentile<>();
      perc.addAll(errors);
    }
    if ((m_ErrorCalculation == ErrorCalculation.PREDICTED_MINUS_ACTUAL)
      || (m_ErrorCalculation == ErrorCalculation.BOTH)) {
      percRev = new Percentile<>();
      percRev.addAll(errorsRev);
    }

    for (i = 0; i < m_Percentiles.length; i++) {
      switch (m_ErrorCalculation) {
	case ACTUAL_MINUS_PREDICTED:
	  row.addCell("perc-AmP-" + i).setContent(perc.getPercentile(m_Percentiles[i].doubleValue()));
	  break;
	case PREDICTED_MINUS_ACTUAL:
	  row.addCell("perc-PmA-" + i).setContent(percRev.getPercentile(m_Percentiles[i].doubleValue()));
	  break;
	case ABSOLUTE:
	  row.addCell("perc-Abs-" + i).setContent(perc.getPercentile(m_Percentiles[i].doubleValue()));
	  break;
	case BOTH:
	  row.addCell("perc-AmP-" + i).setContent(perc.getPercentile(m_Percentiles[i].doubleValue()));
	  row.addCell("perc-PmA-" + i).setContent(percRev.getPercentile(m_Percentiles[i].doubleValue()));
	  break;
	default:
	  throw new IllegalStateException("Unhandled error calculation: " + m_ErrorCalculation);
      }
    }
  }

  /**
   * Executes the flow item: draws the configured number of sub-samples from
   * the predictions of the incoming evaluation, builds a "fake" evaluation
   * for each of them and outputs a spreadsheet with one row per sub-sample
   * (statistics and error percentiles).
   *
   * @return		null if everything is fine, otherwise error message
   */
  @Override
  protected String doExecute() {
    String			result;
    SpreadSheet			sheet;
    Row				row;
    Evaluation			evalAll;
    Evaluation			eval;
    WekaEvaluationContainer	cont;
    TIntList			indices;
    TIntList			subset;
    Random			random;
    int				i;
    int				iteration;
    int				size;
    List<Prediction>		preds;
    Prediction			pred;
    Instances			header;
    Instances			data;
    ArrayList<Attribute>	atts;
    Instance			inst;
    boolean			numeric;
    int				classIndex;
    Double[]			errors;
    Double[]			errorsRev;

    result = null;

    if (m_InputToken.getPayload() instanceof Evaluation) {
      evalAll = (Evaluation) m_InputToken.getPayload();
    }
    else {
      cont    = (WekaEvaluationContainer) m_InputToken.getPayload();
      evalAll = (Evaluation) cont.getValue(WekaEvaluationContainer.VALUE_EVALUATION);
    }

    // a container without evaluation would otherwise fail with an NPE below
    if (evalAll == null)
      result = "No evaluation available!";
    else if ((evalAll.predictions() == null) || (evalAll.predictions().size() == 0))
      result = "No predictions available!";

    if (result == null) {
      // init spreadsheet
      sheet = createBootstrapSheet();

      // set up bootstrapping
      preds   = evalAll.predictions();
      random  = new Random(m_Seed);
      indices = new TIntArrayList();
      // at least one prediction per sub-sample: rounding can yield 0 for
      // small prediction sets, which would result in empty datasets
      size    = Math.max(1, (int) Math.round(preds.size() * m_Percentage));
      header  = evalAll.getHeader();
      numeric = header.classAttribute().isNumeric();
      m_ClassIndex.setData(header.classAttribute());
      if (numeric)
	classIndex = -1;
      else
	classIndex = m_ClassIndex.getIntIndex();
      for (i = 0; i < preds.size(); i++)
	indices.add(i);

      // create fake evaluations
      subset = new TIntArrayList();
      for (iteration = 0; iteration < m_NumSubSamples; iteration++) {
	if (isStopped()) {
	  sheet = null;
	  break;
	}

	// determine subset
	drawBootstrapSubset(indices, subset, size, random);

	// create dataset from predictions
	errors    = new Double[size];
	errorsRev = new Double[size];
	atts      = new ArrayList<>();
	atts.add(header.classAttribute().copy("Actual"));
	data = new Instances(header.relationName() + "-" + (iteration + 1), atts, size);
	data.setClassIndex(0);
	for (i = 0; i < subset.size(); i++) {
	  pred = preds.get(subset.get(i));
	  inst = new DenseInstance(pred.weight(), new double[]{pred.actual()});
	  data.add(inst);
	  switch (m_ErrorCalculation) {
	    case ACTUAL_MINUS_PREDICTED:
	      errors[i] = pred.actual() - pred.predicted();
	      break;
	    case PREDICTED_MINUS_ACTUAL:
	      errorsRev[i] = pred.predicted() - pred.actual();
	      break;
	    case ABSOLUTE:
	      errors[i] = Math.abs(pred.actual() - pred.predicted());
	      break;
	    case BOTH:
	      errors[i]    = pred.actual() - pred.predicted();
	      errorsRev[i] = pred.predicted() - pred.actual();
	      break;
	    default:
	      throw new IllegalStateException("Unhandled error calculation: " + m_ErrorCalculation);
	  }
	}

	// perform "fake" evaluation
	try {
	  eval = evaluateBootstrapSubset(preds, subset, data, numeric);
	}
	catch (Exception e) {
	  result = handleException(
	    "Failed to create 'fake' Evaluation object (iteration: " + (iteration + 1) + ")!", e);
	  break;
	}

	// add row
	row = sheet.addRow();
	row.addCell("S").setContent(iteration + 1);
	for (EvaluationStatistic s: m_StatisticValues) {
	  try {
	    row.addCell(s.toString()).setContent(EvaluationHelper.getValue(eval, s, classIndex));
	  }
	  catch (Exception e) {
	    // a statistic that cannot be computed is recorded as missing
	    // rather than aborting the whole bootstrapping run
	    getLogger().log(Level.SEVERE, "Failed to calculate statistic in iteration #" + (iteration + 1) + ": " + s, e);
	    row.addCell(s.toString()).setMissing();
	  }
	}
	addBootstrapPercentiles(row, errors, errorsRev);
      }

      // no output if stopped (sheet discarded) or an iteration failed
      if ((result == null) && (sheet != null))
	m_OutputToken = new Token(sheet);
    }

    return result;
  }
}