Java tutorial: principal components analysis with Weka (PrincipalComponents.java)
/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * PrincipalComponents.java
 * Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 */

package HomeWork7;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;

import no.uib.cipr.matrix.*;

import weka.attributeSelection.AttributeTransformer;
import weka.attributeSelection.UnsupervisedAttributeEvaluator;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Center;
import weka.filters.unsupervised.attribute.NominalToBinary;
import weka.filters.unsupervised.attribute.Remove;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
import weka.filters.unsupervised.attribute.Standardize;

/**
 * <!-- globalinfo-start --> Performs a principal components analysis and
 * transformation of the data. Use in conjunction with a Ranker search.
 * Dimensionality reduction is accomplished by choosing enough eigenvectors to
 * account for some percentage of the variance in the original data---default
 * 0.95 (95%). Attribute noise can be filtered by transforming to the PC space,
 * eliminating some of the worst eigenvectors, and then transforming back to
 * the original space.
 * <p/>
 * <!-- globalinfo-end -->
 *
 * <!-- options-start --> Valid options are:
 * <p/>
 *
 * <pre>
 * -C
 *  Center (rather than standardize) the
 *  data and compute PCA using the covariance (rather
 *  than the correlation) matrix.
 * </pre>
 *
 * <pre>
 * -R
 *  Retain enough PC attributes to account
 *  for this proportion of variance in the original data.
 *  (default = 0.95)
 * </pre>
 *
 * <pre>
 * -O
 *  Transform through the PC space and
 *  back to the original space.
 * </pre>
 *
 * <pre>
 * -A
 *  Maximum number of attributes to include in
 *  transformed attribute names. (-1 = include all)
 * </pre>
 *
 * <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
 * @version $Revision: 12659 $
 */
public class PrincipalComponents extends UnsupervisedAttributeEvaluator
    implements AttributeTransformer, OptionHandler {

  /** for serialization */
  private static final long serialVersionUID = -3675307197777734007L;

  /** The data to analyse/transform */
  private Instances m_trainInstances;

  /** Keep a copy for the class attribute (if set) */
  private Instances m_trainHeader;

  /** The header for the transformed data format */
  private Instances m_transformedFormat;

  /** The header for data transformed back to the original space */
  private Instances m_originalSpaceFormat;

  /** Data has a class set */
  private boolean m_hasClass;

  /** Class index */
  private int m_classIndex;

  /** Number of attributes */
  private int m_numAttribs;

  /** Number of instances */
  private int m_numInstances;

  /** Correlation/covariance matrix for the original data */
  private UpperSymmDenseMatrix m_correlation;

  private double[] m_means;
  private double[] m_stdDevs;

  /**
   * If true, center (rather than standardize) the data and compute PCA from
   * the covariance (rather than correlation) matrix.
   */
  private boolean m_center = false;

  /**
   * Will hold the unordered linear transformations of the (normalized)
   * original data
   */
  private double[][] m_eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors */
  private double[] m_eigenvalues = null;

  /** Sorted eigenvalues */
  private int[] m_sortedEigens;

  /** sum of the eigenvalues */
  private double m_sumOfEigenValues = 0.0;

  /** Filters for original data */
  private ReplaceMissingValues m_replaceMissingFilter;
  private NominalToBinary m_nominalToBinFilter;
  private Remove m_attributeFilter;
  private Center m_centerFilter;
  private Standardize m_standardizeFilter;

  /** The number of attributes in the pc transformed data */
  private int m_outputNumAtts = -1;

  /**
   * the amount of variance to cover in the original data when retaining the
   * best n PC's (note: initialised to 1 here, not the documented 0.95;
   * resetOptions(), called from setOptions(), restores the 0.95 default)
   */
  private double m_coverVariance = 1;

  /**
   * transform the data through the pc space and back to the original space ?
   */
  private boolean m_transBackToOriginal = false;

  /** maximum number of attributes in the transformed attribute name */
  private int m_maxAttrsInName = 5;

  /**
   * holds the transposed eigenvectors for converting back to the original
   * space
   */
  private double[][] m_eTranspose;

  /**
   * Maximum number of principal components to produce when transforming an
   * instance; Integer.MAX_VALUE means no cap beyond the variance threshold.
   */
  private int m_maxNumAttr = Integer.MAX_VALUE;

  /** @return the maximum number of principal components to produce */
  public int getM_maxNumAttr() {
    return m_maxNumAttr;
  }

  /** @param maxNumAttr the maximum number of principal components to produce */
  public void setNumPrinComponents(int maxNumAttr) {
    this.m_maxNumAttr = maxNumAttr;
  }

  /**
   * Returns a string describing this attribute transformer
   *
   * @return a description of the evaluator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Performs a principal components analysis and transformation of "
      + "the data. Use in conjunction with a Ranker search. Dimensionality "
      + "reduction is accomplished by choosing enough eigenvectors to "
      + "account for some percentage of the variance in the original data---"
      + "default 0.95 (95%). Attribute noise can be filtered by transforming "
      + "to the PC space, eliminating some of the worst eigenvectors, and "
      + "then transforming back to the original space.";
  }
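  /*
   * A note on what the transformer computes (a summary added for this
   * tutorial, not part of the original Weka source): each output attribute k
   * is the projection of the standardized (or, with -C, merely centered)
   * input vector x onto the k-th eigenvector e_k of the correlation (or
   * covariance) matrix, i.e. y_k = e_k . x, with eigenvectors taken in order
   * of decreasing eigenvalue. Components are produced until their eigenvalues
   * sum to m_coverVariance of the total variance, or until m_maxNumAttr
   * components have been emitted, whichever comes first.
   */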
  /**
   * Returns an enumeration describing the available options.
   * <p>
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration<Option> listOptions() {
    Vector<Option> newVector = new Vector<Option>(4);

    newVector.addElement(new Option("\tCenter (rather than standardize) the"
      + "\n\tdata and compute PCA using the covariance (rather"
      + "\n\tthan the correlation) matrix.", "C", 0, "-C"));

    newVector.addElement(new Option("\tRetain enough PC attributes to account "
      + "\n\tfor this proportion of variance in the original data.\n"
      + "\t(default = 0.95)", "R", 1, "-R"));

    newVector.addElement(new Option("\tTransform through the PC space and "
      + "\n\tback to the original space.", "O", 0, "-O"));

    newVector.addElement(new Option(
      "\tMaximum number of attributes to include in "
        + "\n\ttransformed attribute names. (-1 = include all)", "A", 1,
      "-A"));

    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   * <p/>
   *
   * <!-- options-start --> Valid options are:
   * <p/>
   *
   * <pre>
   * -C
   *  Center (rather than standardize) the
   *  data and compute PCA using the covariance (rather
   *  than the correlation) matrix.
   * </pre>
   *
   * <pre>
   * -R
   *  Retain enough PC attributes to account
   *  for this proportion of variance in the original data.
   *  (default = 0.95)
   * </pre>
   *
   * <pre>
   * -O
   *  Transform through the PC space and
   *  back to the original space.
   * </pre>
   *
   * <pre>
   * -A
   *  Maximum number of attributes to include in
   *  transformed attribute names. (-1 = include all)
   * </pre>
   *
   * <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    resetOptions();
    String optionString;

    optionString = Utils.getOption('R', options);
    if (optionString.length() != 0) {
      setVarianceCovered(Double.parseDouble(optionString));
    }

    optionString = Utils.getOption('A', options);
    if (optionString.length() != 0) {
      setMaximumAttributeNames(Integer.parseInt(optionString));
    }

    setTransformBackToOriginal(Utils.getFlag('O', options));
    setCenterData(Utils.getFlag('C', options));
  }

  /**
   * Reset to defaults
   */
  private void resetOptions() {
    m_coverVariance = 0.95;
    m_sumOfEigenValues = 0.0;
    m_transBackToOriginal = false;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String centerDataTipText() {
    return "Center (rather than standardize) the data. PCA will "
      + "be computed from the covariance (rather than correlation) "
      + "matrix";
  }

  /**
   * Set whether to center (rather than standardize) the data. If set to true
   * then PCA is computed from the covariance rather than correlation matrix.
   *
   * @param center true if the data is to be centered rather than standardized
   */
  public void setCenterData(boolean center) {
    m_center = center;
  }

  /**
   * Get whether to center (rather than standardize) the data. If true then
   * PCA is computed from the covariance rather than correlation matrix.
   *
   * @return true if the data is to be centered rather than standardized.
   */
  public boolean getCenterData() {
    return m_center;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String varianceCoveredTipText() {
    return "Retain enough PC attributes to account for this proportion of "
      + "variance.";
  }

  /**
   * Sets the amount of variance to account for when retaining principal
   * components.
   *
   * @param vc the proportion of total variance to account for
   */
  public void setVarianceCovered(double vc) {
    m_coverVariance = vc;
  }

  /**
   * Gets the proportion of total variance to account for when retaining
   * principal components.
   *
   * @return the proportion of variance to account for
   */
  public double getVarianceCovered() {
    return m_coverVariance;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String maximumAttributeNamesTipText() {
    return "The maximum number of attributes to include in transformed "
      + "attribute names.";
  }

  /**
   * Sets maximum number of attributes to include in transformed attribute
   * names.
   *
   * @param m the maximum number of attributes
   */
  public void setMaximumAttributeNames(int m) {
    m_maxAttrsInName = m;
  }

  /**
   * Gets maximum number of attributes to include in transformed attribute
   * names.
   *
   * @return the maximum number of attributes
   */
  public int getMaximumAttributeNames() {
    return m_maxAttrsInName;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String transformBackToOriginalTipText() {
    return "Transform through the PC space and back to the original space. "
      + "If only the best n PCs are retained (by setting varianceCovered < 1) "
      + "then this option will give a dataset in the original space but with "
      + "less attribute noise.";
  }

  /**
   * Sets whether the data should be transformed back to the original space.
   *
   * @param b true if the data should be transformed back to the original
   *          space
   */
  public void setTransformBackToOriginal(boolean b) {
    m_transBackToOriginal = b;
  }

  /**
   * Gets whether the data is to be transformed back to the original space.
   *
   * @return true if the data is to be transformed back to the original space
   */
  public boolean getTransformBackToOriginal() {
    return m_transBackToOriginal;
  }

  /**
   * Gets the current settings of PrincipalComponents.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  @Override
  public String[] getOptions() {
    Vector<String> options = new Vector<String>();

    if (getCenterData()) {
      options.add("-C");
    }

    options.add("-R");
    options.add("" + getVarianceCovered());
    options.add("-A");
    options.add("" + getMaximumAttributeNames());

    if (getTransformBackToOriginal()) {
      options.add("-O");
    }

    return options.toArray(new String[0]);
  }
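  /*
   * Example of configuring the transformer via the option interface defined
   * above (a minimal sketch added for this tutorial; the values are
   * illustrative, not from the original source):
   *
   *   PrincipalComponents pca = new PrincipalComponents();
   *   pca.setOptions(new String[] { "-R", "0.95", "-A", "5" });
   *   // equivalent to calling the setters directly:
   *   pca.setVarianceCovered(0.95);
   *   pca.setMaximumAttributeNames(5);
   */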
  /**
   * Returns the capabilities of this evaluator.
   *
   * @return the capabilities of this evaluator
   * @see Capabilities
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.UNARY_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);

    return result;
  }

  /**
   * Initializes principal components and performs the analysis.
   *
   * @param data the instances to analyse/transform
   * @throws Exception if analysis fails
   */
  @Override
  public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    buildAttributeConstructor(data);
  }

  private void buildAttributeConstructor(Instances data) throws Exception {
    m_eigenvalues = null;
    m_outputNumAtts = -1;
    m_attributeFilter = null;
    m_nominalToBinFilter = null;
    m_sumOfEigenValues = 0.0;
    m_trainInstances = new Instances(data);

    // make a copy of the training data so that we can get the class
    // column to append to the transformed data (if necessary)
    m_trainHeader = new Instances(m_trainInstances, 0);

    m_replaceMissingFilter = new ReplaceMissingValues();
    m_replaceMissingFilter.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances,
        m_replaceMissingFilter);

    m_nominalToBinFilter = new NominalToBinary();
    m_nominalToBinFilter.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances,
        m_nominalToBinFilter);

    // delete any attributes that have only one distinct value or that are
    // all missing
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_trainInstances.numAttributes(); i++) {
      if (m_trainInstances.numDistinctValues(i) <= 1) {
        deleteCols.addElement(Integer.valueOf(i));
      }
    }

    if (m_trainInstances.classIndex() >= 0) {
      // get rid of the class column
      m_hasClass = true;
      m_classIndex = m_trainInstances.classIndex();
      deleteCols.addElement(Integer.valueOf(m_classIndex));
    }

    // remove columns from the data if necessary
    if (deleteCols.size() > 0) {
      m_attributeFilter = new Remove();
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < deleteCols.size(); i++) {
        todelete[i] = (deleteCols.elementAt(i)).intValue();
      }
      m_attributeFilter.setAttributeIndicesArray(todelete);
      m_attributeFilter.setInvertSelection(false);
      m_attributeFilter.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter);
    }

    // can evaluator handle the processed data? e.g., enough attributes?
    getCapabilities().testWithFail(m_trainInstances);

    m_numInstances = m_trainInstances.numInstances();
    m_numAttribs = m_trainInstances.numAttributes();

    fillCovariance();

    SymmDenseEVD evd = SymmDenseEVD.factorize(m_correlation);
    m_eigenvectors = Matrices.getArray(evd.getEigenvectors());
    m_eigenvalues = evd.getEigenvalues();

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_eigenvalues.length; i++) {
      if (m_eigenvalues[i] < 0) {
        m_eigenvalues[i] = 0.0;
      }
    }
    m_sortedEigens = Utils.sort(m_eigenvalues);
    m_sumOfEigenValues = Utils.sum(m_eigenvalues);

    m_transformedFormat = setOutputFormat();
    if (m_transBackToOriginal) {
      m_originalSpaceFormat = setOutputFormatOriginal();

      // new ordered eigenvector matrix
      int numVectors = (m_transformedFormat.classIndex() < 0)
          ? m_transformedFormat.numAttributes()
          : m_transformedFormat.numAttributes() - 1;

      double[][] orderedVectors =
          new double[m_eigenvectors.length][numVectors + 1];

      // try converting back to the original space
      for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
        for (int j = 0; j < m_numAttribs; j++) {
          orderedVectors[j][m_numAttribs - i] =
              m_eigenvectors[j][m_sortedEigens[i]];
        }
      }

      // transpose the matrix
      int nr = orderedVectors.length;
      int nc = orderedVectors[0].length;
      m_eTranspose = new double[nc][nr];
      for (int i = 0; i < nc; i++) {
        for (int j = 0; j < nr; j++) {
          m_eTranspose[i][j] = orderedVectors[j][i];
        }
      }
    }
  }

  /**
   * Returns just the header for the transformed data (i.e. an empty set of
   * instances). This is so that AttributeSelection can determine the
   * structure of the transformed data without actually having to get all the
   * transformed data through transformedData().
   *
   * @return the header of the transformed data.
   * @throws Exception if the header of the transformed data can't be
   *           determined.
   */
  @Override
  public Instances transformedHeader() throws Exception {
    if (m_eigenvalues == null) {
      throw new Exception("Principal components hasn't been built yet");
    }

    if (m_transBackToOriginal) {
      return m_originalSpaceFormat;
    } else {
      return m_transformedFormat;
    }
  }

  /**
   * Return the header of the training data after all filtering - i.e.
   * missing value replacement and nominal to binary conversion.
   *
   * @return the header of the training data after all filtering.
   */
  public Instances getFilteredInputFormat() {
    return new Instances(m_trainInstances, 0);
  }

  /**
   * Return the correlation/covariance matrix.
   *
   * @return the correlation or covariance matrix
   */
  public double[][] getCorrelationMatrix() {
    return Matrices.getArray(m_correlation);
  }

  /**
   * Return the unsorted eigenvectors.
   *
   * @return the unsorted eigenvectors
   */
  public double[][] getUnsortedEigenVectors() {
    return m_eigenvectors;
  }

  /**
   * Return the eigenvalues corresponding to the eigenvectors.
   *
   * @return the eigenvalues
   */
  public double[] getEigenValues() {
    return m_eigenvalues;
  }
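  /*
   * Example of inspecting the analysis through the accessors above (a sketch
   * added for this tutorial; assumes `pca` has already been built on some
   * Instances object via buildEvaluator()):
   *
   *   double[] eig = pca.getEigenValues();
   *   double sum = weka.core.Utils.sum(eig);
   *   int[] order = weka.core.Utils.sort(eig); // indices, ascending
   *   // proportion of variance explained by the largest component:
   *   double top = eig[order[eig.length - 1]] / sum;
   */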
  /**
   * Gets the transformed training data.
   *
   * @return the transformed training data
   * @throws Exception if transformed data can't be returned
   */
  @Override
  public Instances transformedData(Instances data) throws Exception {
    if (m_eigenvalues == null) {
      throw new Exception("Principal components hasn't been built yet");
    }

    Instances output = null;
    if (m_transBackToOriginal) {
      output = new Instances(m_originalSpaceFormat);
    } else {
      output = new Instances(m_transformedFormat);
    }

    for (int i = 0; i < data.numInstances(); i++) {
      Instance converted = convertInstance(data.instance(i));
      output.add(converted);
    }

    return output;
  }

  /**
   * Evaluates the merit of a transformed attribute. This is defined to be 1
   * minus the cumulative variance explained. Merit can't be meaningfully
   * evaluated if the data is to be transformed back to the original space.
   *
   * @param att the attribute to be evaluated
   * @return the merit of a transformed attribute
   * @throws Exception if attribute can't be evaluated
   */
  @Override
  public double evaluateAttribute(int att) throws Exception {
    if (m_eigenvalues == null) {
      throw new Exception("Principal components hasn't been built yet!");
    }

    if (m_transBackToOriginal) {
      return 1.0; // can't evaluate back in the original space!
    }

    // return 1 - cumulative variance explained for this transformed att
    double cumulative = 0.0;
    for (int i = m_numAttribs - 1; i >= m_numAttribs - att - 1; i--) {
      cumulative += m_eigenvalues[m_sortedEigens[i]];
    }

    return 1.0 - cumulative / m_sumOfEigenValues;
  }

  private void fillCovariance() throws Exception {
    // first store the means and standard deviations
    m_means = new double[m_trainInstances.numAttributes()];
    m_stdDevs = new double[m_trainInstances.numAttributes()];
    for (int i = 0; i < m_trainInstances.numAttributes(); i++) {
      m_means[i] = m_trainInstances.meanOrMode(i);
      m_stdDevs[i] = Math
          .sqrt(Utils.variance(m_trainInstances.attributeToDoubleArray(i)));
    }

    // just center the data or standardize it?
    if (m_center) {
      m_centerFilter = new Center();
      m_centerFilter.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_centerFilter);
    } else {
      m_standardizeFilter = new Standardize();
      m_standardizeFilter.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances,
          m_standardizeFilter);
    }

    // now compute the covariance matrix of the filtered data (this is the
    // correlation matrix when the data has been standardized)
    m_correlation = new UpperSymmDenseMatrix(m_numAttribs);
    for (int i = 0; i < m_numAttribs; i++) {
      for (int j = i; j < m_numAttribs; j++) {
        double cov = 0;
        for (Instance inst : m_trainInstances) {
          cov += inst.value(i) * inst.value(j);
        }
        cov /= m_trainInstances.numInstances() - 1;
        m_correlation.set(i, j, cov);
      }
    }
  }

  /**
   * Return a summary of the analysis.
   *
   * @return a summary of the analysis.
   */
  private String principalComponentsSummary() {
    StringBuffer result = new StringBuffer();
    double cumulative = 0.0;
    Instances output = null;
    int numVectors = 0;

    try {
      output = setOutputFormat();
      numVectors = (output.classIndex() < 0) ? output.numAttributes()
          : output.numAttributes() - 1;
    } catch (Exception ex) {
      // ignored: summary will simply be empty
    }

    String corrCov = (m_center) ? "Covariance " : "Correlation ";
    result.append(corrCov + "matrix\n"
        + matrixToString(Matrices.getArray(m_correlation)) + "\n\n");
    result.append("eigenvalue\tproportion\tcumulative\n");

    for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
      cumulative += m_eigenvalues[m_sortedEigens[i]];
      result.append(
          Utils.doubleToString(m_eigenvalues[m_sortedEigens[i]], 9, 5) + "\t"
          + Utils.doubleToString(
              (m_eigenvalues[m_sortedEigens[i]] / m_sumOfEigenValues), 9, 5)
          + "\t"
          + Utils.doubleToString((cumulative / m_sumOfEigenValues), 9, 5)
          + "\t" + output.attribute(m_numAttribs - i - 1).name() + "\n");
    }

    result.append("\nEigenvectors\n");
    for (int j = 1; j <= numVectors; j++) {
      result.append(" V" + j + '\t');
    }
    result.append("\n");
    for (int j = 0; j < m_numAttribs; j++) {
      for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
        result.append(
            Utils.doubleToString(m_eigenvectors[j][m_sortedEigens[i]], 7, 4)
            + "\t");
      }
      result.append(m_trainInstances.attribute(j).name() + '\n');
    }

    if (m_transBackToOriginal) {
      result.append("\nPC space transformed back to original space.\n"
          + "(Note: can't evaluate attributes in the original space)\n");
    }

    return result.toString();
  }

  /**
   * Returns a description of this attribute transformer.
   *
   * @return a String describing this attribute transformer
   */
  @Override
  public String toString() {
    if (m_eigenvalues == null) {
      return "Principal components hasn't been built yet!";
    } else {
      return "\tPrincipal Components Attribute Transformer\n\n"
          + principalComponentsSummary();
    }
  }

  /**
   * Return a matrix as a String.
   *
   * @param matrix the matrix to describe as a string
   * @return a String describing the matrix
   */
  public static String matrixToString(double[][] matrix) {
    StringBuffer result = new StringBuffer();
    int last = matrix.length - 1;

    for (int i = 0; i <= last; i++) {
      for (int j = 0; j <= last; j++) {
        result.append(Utils.doubleToString(matrix[i][j], 6, 2) + " ");
        if (j == last) {
          result.append('\n');
        }
      }
    }
    return result.toString();
  }

  /**
   * Convert a pc transformed instance back to the original space.
   *
   * @param inst the instance to convert
   * @return the processed instance
   * @throws Exception if something goes wrong
   */
  private Instance convertInstanceToOriginal(Instance inst) throws Exception {
    double[] newVals = null;

    if (m_hasClass) {
      newVals = new double[m_numAttribs + 1];
    } else {
      newVals = new double[m_numAttribs];
    }

    if (m_hasClass) {
      // class is always appended as the last attribute
      newVals[m_numAttribs] = inst.value(inst.numAttributes() - 1);
    }

    for (int i = 0; i < m_eTranspose[0].length; i++) {
      double tempval = 0.0;
      for (int j = 1; j < m_eTranspose.length; j++) {
        tempval += (m_eTranspose[j][i] * inst.value(j - 1));
      }
      newVals[i] = tempval;
      if (!m_center) {
        newVals[i] *= m_stdDevs[i];
      }
      newVals[i] += m_means[i];
    }

    if (inst instanceof SparseInstance) {
      return new SparseInstance(inst.weight(), newVals);
    } else {
      return new DenseInstance(inst.weight(), newVals);
    }
  }
  /**
   * Transform an instance in the original (unnormalized) format. Convert
   * back to the original space if requested.
   *
   * @param instance an instance in the original (unnormalized) format
   * @return a transformed instance
   * @throws Exception if the instance can't be transformed
   */
  @Override
  public Instance convertInstance(Instance instance) throws Exception {
    if (m_eigenvalues == null) {
      throw new Exception("convertInstance: Principal components not "
          + "built yet");
    }

    double[] newVals = new double[m_outputNumAtts];
    Instance tempInst = (Instance) instance.copy();

    if (!instance.dataset().equalHeaders(m_trainHeader)) {
      throw new Exception("Can't convert instance: headers don't match: "
          + "PrincipalComponents\n"
          + instance.dataset().equalHeadersMsg(m_trainHeader));
    }

    m_replaceMissingFilter.input(tempInst);
    m_replaceMissingFilter.batchFinished();
    tempInst = m_replaceMissingFilter.output();

    m_nominalToBinFilter.input(tempInst);
    m_nominalToBinFilter.batchFinished();
    tempInst = m_nominalToBinFilter.output();

    if (m_attributeFilter != null) {
      m_attributeFilter.input(tempInst);
      m_attributeFilter.batchFinished();
      tempInst = m_attributeFilter.output();
    }

    if (!m_center) {
      m_standardizeFilter.input(tempInst);
      m_standardizeFilter.batchFinished();
      tempInst = m_standardizeFilter.output();
    } else {
      m_centerFilter.input(tempInst);
      m_centerFilter.batchFinished();
      tempInst = m_centerFilter.output();
    }

    if (m_hasClass) {
      newVals[m_outputNumAtts - 1] = instance.value(instance.classIndex());
    }

    double cumulative = 0;
    int numAttAdded = 0;
    for (int i = m_numAttribs - 1; i >= 0; i--) {
      double tempval = 0.0;
      for (int j = 0; j < m_numAttribs; j++) {
        tempval += (m_eigenvectors[j][m_sortedEigens[i]] * tempInst.value(j));
      }
      newVals[m_numAttribs - i - 1] = tempval;
      cumulative += m_eigenvalues[m_sortedEigens[i]];
      numAttAdded++;
      // stop once the requested proportion of variance is covered
      if ((cumulative / m_sumOfEigenValues) >= m_coverVariance) {
        break;
      }
      // stop once m_maxNumAttr principal components have been produced
      if (numAttAdded >= m_maxNumAttr) {
        break;
      }
    }

    if (!m_transBackToOriginal) {
      if (instance instanceof SparseInstance) {
        return new SparseInstance(instance.weight(), newVals);
      } else {
        return new DenseInstance(instance.weight(), newVals);
      }
    } else {
      if (instance instanceof SparseInstance) {
        return convertInstanceToOriginal(
            new SparseInstance(instance.weight(), newVals));
      } else {
        return convertInstanceToOriginal(
            new DenseInstance(instance.weight(), newVals));
      }
    }
  }

  /**
   * Set up the header for the PC->original space dataset.
   *
   * @return the output format
   * @throws Exception if something goes wrong
   */
  private Instances setOutputFormatOriginal() throws Exception {
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();

    for (int i = 0; i < m_numAttribs; i++) {
      String att = m_trainInstances.attribute(i).name();
      attributes.add(new Attribute(att));
    }

    if (m_hasClass) {
      attributes.add((Attribute) m_trainHeader.classAttribute().copy());
    }

    Instances outputFormat = new Instances(
        m_trainHeader.relationName() + "->PC->original space", attributes, 0);

    // set the class to be the last attribute if necessary
    if (m_hasClass) {
      outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    return outputFormat;
  }

  /**
   * Set the format for the transformed data.
   *
   * @return a set of empty Instances (header only) in the new format
   * @throws Exception if the output format can't be set
   */
  private Instances setOutputFormat() throws Exception {
    if (m_eigenvalues == null) {
      return null;
    }

    double cumulative = 0.0;
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    for (int i = m_numAttribs - 1; i >= 0; i--) {
      StringBuffer attName = new StringBuffer();

      // build array of negated coefficient magnitudes (Utils.sort sorts
      // ascending, so negation yields decreasing magnitude)
      double[] coeff_mags = new double[m_numAttribs];
      for (int j = 0; j < m_numAttribs; j++) {
        coeff_mags[j] = -Math.abs(m_eigenvectors[j][m_sortedEigens[i]]);
      }
      int num_attrs = (m_maxAttrsInName > 0)
          ? Math.min(m_numAttribs, m_maxAttrsInName) : m_numAttribs;

      // this array contains the sorted indices of the coefficients
      int[] coeff_inds;
      if (m_maxAttrsInName > 0) {
        // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude
        coeff_inds = Utils.sort(coeff_mags);
      } else {
        // if m_maxAttrsInName <= 0, use all coefficients in original order
        coeff_inds = new int[m_numAttribs];
        for (int j = 0; j < m_numAttribs; j++) {
          coeff_inds[j] = j;
        }
      }

      // build the final attName string
      for (int j = 0; j < num_attrs; j++) {
        double coeff_value = m_eigenvectors[coeff_inds[j]][m_sortedEigens[i]];
        if (j > 0 && coeff_value >= 0) {
          attName.append("+");
        }
        attName.append(Utils.doubleToString(coeff_value, 5, 3)
            + m_trainInstances.attribute(coeff_inds[j]).name());
      }
      if (num_attrs < m_numAttribs) {
        attName.append("...");
      }

      attributes.add(new Attribute(attName.toString()));
      cumulative += m_eigenvalues[m_sortedEigens[i]];

      if ((cumulative / m_sumOfEigenValues) >= m_coverVariance) {
        break;
      }
    }

    if (m_hasClass) {
      attributes.add((Attribute) m_trainHeader.classAttribute().copy());
    }

    Instances outputFormat = new Instances(
        m_trainInstances.relationName() + "_principal components", attributes,
        0);

    // set the class to be the last attribute if necessary
    if (m_hasClass) {
      outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    m_outputNumAtts = outputFormat.numAttributes();
    return outputFormat;
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 12659 $");
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain the command line arguments to the
   *          evaluator/transformer (see AttributeSelection)
   */
  public static void main(String[] argv) {
    runEvaluator(new PrincipalComponents(), argv);
  }
}
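The following is a minimal end-to-end usage sketch for this class. The file
name "data.arff", the choice of class index, and the demo class itself are
assumptions for illustration (the demo must live in the same HomeWork7
package, or import the class); any Weka-loadable dataset works the same way.

    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    public class PCADemo {
        public static void main(String[] args) throws Exception {
            // load a dataset (hypothetical file name)
            Instances data = DataSource.read("data.arff");
            data.setClassIndex(data.numAttributes() - 1);

            PrincipalComponents pca = new PrincipalComponents();
            pca.setVarianceCovered(1.0);  // retain all variance...
            pca.setNumPrinComponents(5);  // ...then cap at 5 components
            pca.buildEvaluator(data);     // run the analysis

            Instances transformed = pca.transformedData(data);
            System.out.println(pca);      // eigenvalues and eigenvectors
            System.out.println(transformed.numAttributes() + " attributes");
        }
    }

Setting varianceCovered to 1.0 makes the component cap, rather than the
variance threshold, determine how many principal components are filled in,
which mirrors this class's default field values.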