Java tutorial
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * PrincipalComponents.java
 * Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;

import no.uib.cipr.matrix.*;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

/**
 * <!-- globalinfo-start --> Performs a principal components analysis and
 * transformation of the data.<br/>
 * Dimensionality reduction is accomplished by choosing enough eigenvectors to
 * account for some percentage of the variance in the original data -- default
 * 0.95 (95%).<br/>
 * Based on code of the attribute selection scheme 'PrincipalComponents' by Mark
 * Hall and Gabi Schmidberger.
 * <p/>
 * <!-- globalinfo-end -->
 *
 * <!-- options-start --> Valid options are:
 * <p/>
 *
 * <pre>
 * -C
 *  Center (rather than standardize) the
 *  data and compute PCA using the covariance (rather
 *  than the correlation) matrix.
 * </pre>
 *
 * <pre>
 * -R &lt;num&gt;
 *  Retain enough PC attributes to account
 *  for this proportion of variance in the original data.
 *  (default: 0.95)
 * </pre>
 *
 * <pre>
 * -A &lt;num&gt;
 *  Maximum number of attributes to include in
 *  transformed attribute names.
 *  (-1 = include all, default: 5)
 * </pre>
 *
 * <pre>
 * -M &lt;num&gt;
 *  Maximum number of PC attributes to retain.
 *  (-1 = include all, default: -1)
 * </pre>
 *
 * <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz) -- attribute selection code
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- attribute selection code
 * @author fracpete (fracpete at waikato dot ac dot nz) -- filter code
 * @version $Revision$
 */
public class PrincipalComponents extends Filter
  implements OptionHandler, UnsupervisedFilter {

  /** for serialization. */
  private static final long serialVersionUID = -5649876869480249303L;

  /** The data to analyse/transform. */
  protected Instances m_TrainInstances;

  /** Keep a copy for the class attribute (if set). */
  protected Instances m_TrainCopy;

  /** The header for the transformed data format. */
  protected Instances m_TransformedFormat;

  /** Data has a class set. */
  protected boolean m_HasClass;

  /** Class index. */
  protected int m_ClassIndex;

  /** Number of attributes. */
  protected int m_NumAttribs;

  /** Number of instances. */
  protected int m_NumInstances;

  /** Correlation matrix for the original data. */
  protected UpperSymmDenseMatrix m_Correlation;

  /**
   * If true, center (rather than standardize) the data and compute PCA from
   * covariance (rather than correlation) matrix.
   */
  private boolean m_center = false;

  /**
   * Will hold the unordered linear transformations of the (normalized) original
   * data.
   */
  protected double[][] m_Eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors. */
  protected double[] m_Eigenvalues = null;

  /** Indices of the eigenvalues, as returned by sorting them. */
  protected int[] m_SortedEigens;

  /** sum of the eigenvalues. */
  protected double m_SumOfEigenValues = 0.0;

  /** Filters for replacing missing values. */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /** Filter for turning nominal values into numeric ones. */
  protected NominalToBinary m_NominalToBinaryFilter;

  /** Filter for removing class attribute, nominal attributes with 0 or 1 value. */
  protected Remove m_AttributeFilter;

  /** Filter for standardizing the data */
  protected Standardize m_standardizeFilter;

  /** Filter for centering the data */
  protected Center m_centerFilter;

  /** The number of attributes in the pc transformed data. */
  protected int m_OutputNumAtts = -1;

  /**
   * the amount of variance to cover in the original data when retaining the
   * best n PC's.
   */
  protected double m_CoverVariance = 0.95;

  /** maximum number of attributes in the transformed attribute name. */
  protected int m_MaxAttrsInName = 5;

  /** maximum number of attributes in the transformed data (-1 for all). */
  protected int m_MaxAttributes = -1;

  /**
   * Returns a string describing this filter.
   *
   * @return a description of the filter suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Performs a principal components analysis and transformation of "
      + "the data.\n"
      + "Dimensionality reduction is accomplished by choosing enough eigenvectors "
      + "to account for some percentage of the variance in the original data -- "
      + "default 0.95 (95%).\n"
      + "Based on code of the attribute selection scheme 'PrincipalComponents' "
      + "by Mark Hall and Gabi Schmidberger.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration<Option> listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option("\tCenter (rather than standardize) the"
      + "\n\tdata and compute PCA using the covariance (rather"
      + "\n\t than the correlation) matrix.", "C", 0, "-C"));

    result.addElement(new Option(
      "\tRetain enough PC attributes to account\n"
        + "\tfor this proportion of variance in the original data.\n"
        + "\t(default: 0.95)", "R", 1, "-R <num>"));

    result.addElement(new Option("\tMaximum number of attributes to include in \n"
      + "\ttransformed attribute names.\n"
      + "\t(-1 = include all, default: 5)", "A", 1, "-A <num>"));

    result.addElement(
      new Option("\tMaximum number of PC attributes to retain.\n"
        + "\t(-1 = include all, default: -1)", "M", 1, "-M <num>"));

    return result.elements();
  }

  /**
   * Parses a list of options for this object.
   * <p/>
   *
   * <!-- options-start --> Valid options are:
   * <p/>
   *
   * <pre>
   * -C
   *  Center (rather than standardize) the
   *  data and compute PCA using the covariance (rather
   *  than the correlation) matrix.
   * </pre>
   *
   * <pre>
   * -R &lt;num&gt;
   *  Retain enough PC attributes to account
   *  for this proportion of variance in the original data.
   *  (default: 0.95)
   * </pre>
   *
   * <pre>
   * -A &lt;num&gt;
   *  Maximum number of attributes to include in
   *  transformed attribute names.
   *  (-1 = include all, default: 5)
   * </pre>
   *
   * <pre>
   * -M &lt;num&gt;
   *  Maximum number of PC attributes to retain.
   *  (-1 = include all, default: -1)
   * </pre>
   *
   * <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    String tmpStr = Utils.getOption('R', options);
    if (tmpStr.length() != 0) {
      setVarianceCovered(Double.parseDouble(tmpStr));
    } else {
      setVarianceCovered(0.95);
    }

    tmpStr = Utils.getOption('A', options);
    if (tmpStr.length() != 0) {
      setMaximumAttributeNames(Integer.parseInt(tmpStr));
    } else {
      setMaximumAttributeNames(5);
    }

    tmpStr = Utils.getOption('M', options);
    if (tmpStr.length() != 0) {
      setMaximumAttributes(Integer.parseInt(tmpStr));
    } else {
      setMaximumAttributes(-1);
    }

    setCenterData(Utils.getFlag('C', options));

    Utils.checkForRemainingOptions(options);
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {
    ArrayList<String> result = new ArrayList<String>();

    result.add("-R");
    result.add("" + getVarianceCovered());

    result.add("-A");
    result.add("" + getMaximumAttributeNames());

    result.add("-M");
    result.add("" + getMaximumAttributes());

    if (getCenterData()) {
      result.add("-C");
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String centerDataTipText() {
    return "Center (rather than standardize) the data. PCA will "
      + "be computed from the covariance (rather than correlation) "
      + "matrix";
  }

  /**
   * Set whether to center (rather than standardize) the data. If set to true
   * then PCA is computed from the covariance rather than correlation matrix.
   *
   * @param center true if the data is to be centered rather than standardized
   */
  public void setCenterData(boolean center) {
    m_center = center;
  }

  /**
   * Get whether to center (rather than standardize) the data. If true then PCA
   * is computed from the covariance rather than correlation matrix.
   *
   * @return true if the data is to be centered rather than standardized.
   */
  public boolean getCenterData() {
    return m_center;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String varianceCoveredTipText() {
    return "Retain enough PC attributes to account for this proportion of variance.";
  }

  /**
   * Sets the amount of variance to account for when retaining principal
   * components.
   *
   * @param value the proportion of total variance to account for
   */
  public void setVarianceCovered(double value) {
    m_CoverVariance = value;
  }

  /**
   * Gets the proportion of total variance to account for when retaining
   * principal components.
   *
   * @return the proportion of variance to account for
   */
  public double getVarianceCovered() {
    return m_CoverVariance;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String maximumAttributeNamesTipText() {
    return "The maximum number of attributes to include in transformed attribute names.";
  }

  /**
   * Sets maximum number of attributes to include in transformed attribute
   * names.
   *
   * @param value the maximum number of attributes
   */
  public void setMaximumAttributeNames(int value) {
    m_MaxAttrsInName = value;
  }

  /**
   * Gets maximum number of attributes to include in transformed attribute
   * names.
   *
   * @return the maximum number of attributes
   */
  public int getMaximumAttributeNames() {
    return m_MaxAttrsInName;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String maximumAttributesTipText() {
    return "The maximum number of PC attributes to retain.";
  }

  /**
   * Sets maximum number of PC attributes to retain.
   *
   * @param value the maximum number of attributes
   */
  public void setMaximumAttributes(int value) {
    m_MaxAttributes = value;
  }

  /**
   * Gets maximum number of PC attributes to retain.
   *
   * @return the maximum number of attributes
   */
  public int getMaximumAttributes() {
    return m_MaxAttributes;
  }

  /**
   * Returns the capabilities of this filter.
   *
   * @return the capabilities of this filter
   * @see Capabilities
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.UNARY_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);

    return result;
  }

  /**
   * Computes the smallest position in {@link #m_SortedEigens} that may still
   * be turned into an output attribute, honouring the maximum number of PC
   * attributes to retain.
   *
   * @return 0 if no positive maximum is set, otherwise
   *         {@code m_NumAttribs - m_MaxAttributes} clamped to be non-negative
   */
  private int lowestRetainedIndex() {
    if (m_MaxAttributes > 0) {
      return Math.max(0, m_NumAttribs - m_MaxAttributes);
    }
    return 0;
  }

  /**
   * Determines the order in which the coefficients of one eigenvector are
   * listed in the name of the corresponding transformed attribute.
   *
   * @param eigenIndex the column of {@link #m_Eigenvectors} to order
   * @return the attribute indices, by decreasing coefficient magnitude if the
   *         name length is limited, otherwise in original attribute order
   */
  private int[] coefficientOrder(int eigenIndex) {
    if (m_MaxAttrsInName > 0) {
      // magnitudes are negated so that sorting them puts the largest first
      double[] negatedMagnitudes = new double[m_NumAttribs];
      for (int j = 0; j < m_NumAttribs; j++) {
        negatedMagnitudes[j] = -Math.abs(m_Eigenvectors[j][eigenIndex]);
      }
      return Utils.sort(negatedMagnitudes);
    }

    // no limit on the name: use all coefficients in original order
    int[] originalOrder = new int[m_NumAttribs];
    for (int j = 0; j < m_NumAttribs; j++) {
      originalOrder[j] = j;
    }
    return originalOrder;
  }

  /**
   * Determines the output format based on the input format and returns this.
   * If no eigenvalues have been computed yet, the input format is returned
   * unchanged. Otherwise one numeric attribute is created per retained
   * principal component, named after the coefficients of its eigenvector,
   * followed by a copy of the class attribute if the training data had one.
   * Components are added until the requested proportion of variance is
   * covered or the maximum number of attributes is reached.
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   * @see #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat)
    throws Exception {

    if (m_Eigenvalues == null) {
      return inputFormat;
    }

    int numAttsLowerBound = lowestRetainedIndex();

    // number of coefficients spelled out in each attribute name
    int numAttrsInName = (m_MaxAttrsInName > 0)
      ? Math.min(m_NumAttribs, m_MaxAttrsInName) : m_NumAttribs;

    double cumulative = 0.0;
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    for (int i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
      int eigenIndex = m_SortedEigens[i];
      int[] coeffInds = coefficientOrder(eigenIndex);

      // build final attName string
      StringBuilder attName = new StringBuilder();
      for (int j = 0; j < numAttrsInName; j++) {
        double coeffValue = m_Eigenvectors[coeffInds[j]][eigenIndex];
        if (j > 0 && coeffValue >= 0) {
          attName.append("+");
        }
        attName.append(Utils.doubleToString(coeffValue, 5, 3))
          .append(inputFormat.attribute(coeffInds[j]).name());
      }
      if (numAttrsInName < m_NumAttribs) {
        attName.append("...");
      }

      attributes.add(new Attribute(attName.toString()));
      cumulative += m_Eigenvalues[eigenIndex];

      // stop as soon as the requested share of the variance is covered
      if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    if (m_HasClass) {
      attributes.add((Attribute) m_TrainCopy.classAttribute().copy());
    }

    Instances outputFormat = new Instances(
      m_TrainCopy.relationName() + "_principal components", attributes, 0);

    // set the class to be the last attribute if necessary
    if (m_HasClass) {
      outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    m_OutputNumAtts = outputFormat.numAttributes();
    return outputFormat;
  }

  /**
   * Centers or standardizes the training data (depending on the center-data
   * setting) and fills {@link #m_Correlation} with the sum of products of
   * each attribute pair divided by the number of instances minus one.
   *
   * @throws Exception if the centering/standardizing filter fails
   */
  protected void fillCovariance() throws Exception {
    // just center the data or standardize it?
    if (m_center) {
      m_centerFilter = new Center();
      m_centerFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter);
    } else {
      m_standardizeFilter = new Standardize();
      m_standardizeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter);
    }

    // now compute the covariance matrix (upper triangle only, it is symmetric)
    m_Correlation = new UpperSymmDenseMatrix(m_NumAttribs);
    for (int i = 0; i < m_NumAttribs; i++) {
      for (int j = i; j < m_NumAttribs; j++) {
        double cov = 0;
        for (Instance inst : m_TrainInstances) {
          cov += inst.value(i) * inst.value(j);
        }
        cov /= m_TrainInstances.numInstances() - 1;
        m_Correlation.set(i, j, cov);
      }
    }
  }

  /**
   * Transform an instance in original (unnormalized) format. The instance is
   * passed through the same preprocessing filters that were built during
   * setup and is then projected onto the retained eigenvectors.
   *
   * @param instance an instance in the original (unnormalized) format
   * @return a transformed instance
   * @throws Exception if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    double[] newVals = new double[m_OutputNumAtts];
    Instance tempInst = (Instance) instance.copy();

    m_ReplaceMissingFilter.input(tempInst);
    m_ReplaceMissingFilter.batchFinished();
    tempInst = m_ReplaceMissingFilter.output();

    m_NominalToBinaryFilter.input(tempInst);
    m_NominalToBinaryFilter.batchFinished();
    tempInst = m_NominalToBinaryFilter.output();

    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(tempInst);
      m_AttributeFilter.batchFinished();
      tempInst = m_AttributeFilter.output();
    }

    if (!m_center) {
      m_standardizeFilter.input(tempInst);
      m_standardizeFilter.batchFinished();
      tempInst = m_standardizeFilter.output();
    } else {
      m_centerFilter.input(tempInst);
      m_centerFilter.batchFinished();
      tempInst = m_centerFilter.output();
    }

    // the class value is carried over untouched as the last output attribute
    if (m_HasClass) {
      newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex());
    }

    int numAttsLowerBound = lowestRetainedIndex();

    double cumulative = 0;
    for (int i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
      int eigenIndex = m_SortedEigens[i];
      double tempval = 0.0;
      for (int j = 0; j < m_NumAttribs; j++) {
        tempval += m_Eigenvectors[j][eigenIndex] * tempInst.value(j);
      }

      newVals[m_NumAttribs - i - 1] = tempval;
      cumulative += m_Eigenvalues[eigenIndex];

      // same stopping rule as used when building the output format
      if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // create instance
    Instance result;
    if (instance instanceof SparseInstance) {
      result = new SparseInstance(instance.weight(), newVals);
    } else {
      result = new DenseInstance(instance.weight(), newVals);
    }

    return result;
  }

  /**
   * Initializes the filter with the given input data: replaces missing
   * values, binarizes nominal attributes, removes the class attribute and
   * attributes with at most one distinct value, computes the covariance
   * matrix and its eigen decomposition, and sets the output format.
   *
   * @param instances the data to process
   * @throws Exception in case the processing goes wrong
   * @see #batchFinished()
   */
  protected void setup(Instances instances) throws Exception {
    m_TrainInstances = new Instances(instances);

    // make a copy of the training data so that we can get the class
    // column to append to the transformed data (if necessary)
    m_TrainCopy = new Instances(m_TrainInstances, 0);

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

    m_NominalToBinaryFilter = new NominalToBinary();
    m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

    // delete any attributes with only one distinct value or are all missing
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
      if (m_TrainInstances.numDistinctValues(i) <= 1) {
        deleteCols.addElement(i);
      }
    }

    // recomputed on every setup so that a filter previously used on data with
    // a class does not keep a stale flag when reused on data without one
    m_HasClass = m_TrainInstances.classIndex() >= 0;
    if (m_HasClass) {
      // get rid of the class column
      m_ClassIndex = m_TrainInstances.classIndex();
      deleteCols.addElement(m_ClassIndex);
    }

    // remove columns from the data if necessary
    if (deleteCols.size() > 0) {
      m_AttributeFilter = new Remove();
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < deleteCols.size(); i++) {
        todelete[i] = deleteCols.elementAt(i).intValue();
      }
      m_AttributeFilter.setAttributeIndicesArray(todelete);
      m_AttributeFilter.setInvertSelection(false);
      m_AttributeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
    }

    // can the filter handle the processed data ? e.g., enough attributes?
    getCapabilities().testWithFail(m_TrainInstances);

    m_NumInstances = m_TrainInstances.numInstances();
    m_NumAttribs = m_TrainInstances.numAttributes();

    fillCovariance();

    // get eigen vectors/values
    SymmDenseEVD evd = SymmDenseEVD.factorize(m_Correlation);
    m_Eigenvectors = Matrices.getArray(evd.getEigenvectors());
    m_Eigenvalues = evd.getEigenvalues();

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_Eigenvalues.length; i++) {
      if (m_Eigenvalues[i] < 0) {
        m_Eigenvalues[i] = 0.0;
      }
    }
    m_SortedEigens = Utils.sort(m_Eigenvalues);
    m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

    m_TransformedFormat = determineOutputFormat(m_TrainInstances);
    setOutputFormat(m_TransformedFormat);

    // the training data is no longer needed once the transformation is built
    m_TrainInstances = null;
  }

  /**
   * Sets the format of the input instances and clears the state computed from
   * any previous training data.
   *
   * @param instanceInfo an Instances object containing the input instance
   *          structure (any instances contained in the object are ignored -
   *          only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws Exception if the input format can't be set successfully
   */
  @Override
  public boolean setInputFormat(Instances instanceInfo) throws Exception {
    super.setInputFormat(instanceInfo);

    m_Eigenvalues = null;
    m_OutputNumAtts = -1;
    m_AttributeFilter = null;
    m_NominalToBinaryFilter = null;
    m_SumOfEigenValues = 0.0;

    // output format depends on the training data, so it is never immediate
    return false;
  }

  /**
   * Input an instance for filtering. Filter requires all training instances be
   * read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be collected with output().
   * @throws IllegalStateException if no input format has been set
   * @throws Exception if conversion fails
   */
  @Override
  public boolean input(Instance instance) throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (isNewBatch()) {
      resetQueue();
      m_NewBatch = false;
    }

    if (isFirstBatchDone()) {
      Instance inst = convertInstance(instance);
      inst.setDataset(getOutputFormat());
      push(inst, false); // No need to copy
      return true;
    } else {
      bufferInput(instance);
      return false;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished. On the first
   * batch the filter is set up from the buffered instances; the buffered
   * instances are then converted and pushed to the output queue.
   *
   * @return true if there are instances pending output
   * @throws NullPointerException if no input structure has been defined,
   * @throws Exception if there was a problem finishing the batch.
   */
  @Override
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new NullPointerException("No input instance format defined");
    }

    Instances insts = getInputFormat();

    if (!isFirstBatchDone()) {
      setup(insts);
    }

    for (int i = 0; i < insts.numInstances(); i++) {
      Instance inst = convertInstance(insts.instance(i));
      inst.setDataset(getOutputFormat());
      push(inst, false); // No need to copy
    }

    flushInput();
    m_NewBatch = true;
    m_FirstBatchDone = true;

    return (numPendingOutput() != 0);
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision$");
  }

  /**
   * Main method for running this filter.
   *
   * @param args should contain arguments to the filter: use -h for help
   */
  public static void main(String[] args) {
    runFilter(new PrincipalComponents(), args);
  }
}