 * PrincipalComponents.java
 * Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand

package weka.filters.unsupervised.attribute;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;

import no.uib.cipr.matrix.*;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

 * <!-- globalinfo-start --> Performs a principal components analysis and
 * transformation of the data.<br/>
 * Dimensionality reduction is accomplished by choosing enough eigenvectors to
 * account for some percentage of the variance in the original data -- default
 * 0.95 (95%).<br/>
 * Based on code of the attribute selection scheme 'PrincipalComponents' by Mark
 * Hall and Gabi Schmidberger.
 * <p/>
 * <!-- globalinfo-end -->
 * <!-- options-start --> Valid options are:
 * <p/>
 * <pre>
 * -C
 *  Center (rather than standardize) the
 *  data and compute PCA using the covariance (rather
 *   than the correlation) matrix.
 * </pre>
 * <pre>
 * -R &lt;num&gt;
 *  Retain enough PC attributes to account
 *  for this proportion of variance in the original data.
 *  (default: 0.95)
 * </pre>
 * <pre>
 * -A &lt;num&gt;
 *  Maximum number of attributes to include in 
 *  transformed attribute names.
 *  (-1 = include all, default: 5)
 * </pre>
 * <pre>
 * -M &lt;num&gt;
 *  Maximum number of PC attributes to retain.
 *  (-1 = include all, default: -1)
 * </pre>
 * <!-- options-end -->
 * @author Mark Hall (mhall@cs.waikato.ac.nz) -- attribute selection code
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- attribute selection code
 * @author fracpete (fracpete at waikato dot ac dot nz) -- filter code
 * @version $Revision$
public class PrincipalComponents extends Filter implements OptionHandler, UnsupervisedFilter {

    /** Version identifier used for serialization. */
    private static final long serialVersionUID = -5649876869480249303L;

    /**
     * Working copy of the data to analyse/transform. It is created and
     * preprocessed in setup() and set back to null at the end of setup().
     */
    protected Instances m_TrainInstances;

    /**
     * Copy of the training data created with a capacity of 0 in setup(); its
     * class attribute and relation name are read when the output format is built.
     */
    protected Instances m_TrainCopy;

    /** The header for the transformed data format (result of determineOutputFormat() in setup()). */
    protected Instances m_TransformedFormat;

    /** Data has a class set; assigned true in setup() when the class index is >= 0. */
    protected boolean m_HasClass;

    /** Class index of the preprocessed training data, recorded in setup(). */
    protected int m_ClassIndex;

    /** Number of attributes left in the training data after preprocessing in setup(). */
    protected int m_NumAttribs;

    /** Number of instances in the training data after preprocessing in setup(). */
    protected int m_NumInstances;

    /**
     * Correlation matrix for the original data. fillCovariance() stores the
     * covariance of the centered or standardized data here; only entries with
     * column index >= row index are set explicitly.
     */
    protected UpperSymmDenseMatrix m_Correlation;

     * If true, center (rather than standardize) the data and compute PCA from
     * covariance (rather than correlation) matrix.
    // Written by setCenterData(); setOptions() drives it from the -C flag.
    // fillCovariance() and convertInstance() branch on it to pick the filter.
    private boolean m_center = false;

     * Will hold the unordered linear transformations of the (normalized) original
     * data.
    // Indexed as [attribute][component]: convertInstance() multiplies
    // m_Eigenvectors[j][k] with the j-th attribute value to project onto component k.
    protected double[][] m_Eigenvectors;

    /**
     * Eigenvalues for the corresponding eigenvectors. Null until setup() has
     * computed them; determineOutputFormat() treats null as "not yet available".
     * Negative values are replaced by 0 in setup().
     */
    protected double[] m_Eigenvalues = null;

    /**
     * Index array obtained from Utils.sort(m_Eigenvalues); it is used to index
     * m_Eigenvalues and the component dimension of m_Eigenvectors, and is walked
     * from its last entry backwards when components are selected.
     */
    protected int[] m_SortedEigens;

    /** Sum of the eigenvalues (taken after negative eigenvalues were set to 0). */
    protected double m_SumOfEigenValues = 0.0;

    /** Filter for replacing missing values; created in setup(). */
    protected ReplaceMissingValues m_ReplaceMissingFilter;

    /** Filter for turning nominal values into numeric ones; created in setup(). */
    protected NominalToBinary m_NominalToBinaryFilter;

    /**
     * Filter for removing the class attribute and attributes with at most one
     * distinct value. Stays null when setup() finds nothing to remove.
     */
    protected Remove m_AttributeFilter;

    /** Filter for standardizing the data; created in fillCovariance() when m_center is false. */
    protected Standardize m_standardizeFilter;

    /** Filter for centering the data; created in fillCovariance() when m_center is true. */
    protected Center m_centerFilter;

    /**
     * The number of attributes in the pc transformed data; -1 until
     * determineOutputFormat() has built the output header.
     */
    protected int m_OutputNumAtts = -1;

     * the amount of variance to cover in the original data when retaining the
     * best n PCs.
    // Compared against (cumulative eigenvalue sum / total eigenvalue sum) when
    // components are selected; default 0.95.
    protected double m_CoverVariance = 0.95;

    /**
     * Maximum number of attributes in the transformed attribute name. Values
     * <= 0 make determineOutputFormat() use all attributes.
     */
    protected int m_MaxAttrsInName = 5;

    /**
     * Maximum number of attributes in the transformed data (-1 for all). Only
     * values > 0 restrict the number of components considered.
     */
    protected int m_MaxAttributes = -1;

     * Returns a string describing this filter.
     * @return a description of the filter suitable for displaying in the
     *         explorer/experimenter gui
    /**
     * Returns the fixed, human-readable description of this filter.
     *
     * @return the description text (contains embedded newlines)
     */
    public String globalInfo() {
        return "Performs a principal components analysis and transformation of " + "the data.\n"
                + "Dimensionality reduction is accomplished by choosing enough eigenvectors "
                + "to account for some percentage of the variance in the original data -- "
                + "default 0.95 (95%).\n" + "Based on code of the attribute selection scheme 'PrincipalComponents' "
                + "by Mark Hall and Gabi Schmidberger.";

     * Returns an enumeration describing the available options.
     * @return an enumeration of all the available options.
    /**
     * Builds the descriptions of the command-line options of this filter:
     * -C (flag, no argument) and -R, -A, -M (one argument each).
     *
     * @return an enumeration over the collected option descriptions
     */
    public Enumeration<Option> listOptions() {

        Vector<Option> result = new Vector<Option>();

        // -C: center instead of standardize (flag, takes no argument)
        result.addElement(new Option("\tCenter (rather than standardize) the"
                + "\n\tdata and compute PCA using the covariance (rather" + "\n\t than the correlation) matrix.",
                "C", 0, "-C"));

        // -R <num>: proportion of variance to retain
        result.addElement(new Option(
                "\tRetain enough PC attributes to account\n"
                        + "\tfor this proportion of variance in the original data.\n" + "\t(default: 0.95)",
                "R", 1, "-R <num>"));

        // -A <num>: number of attributes used in generated attribute names
        result.addElement(new Option("\tMaximum number of attributes to include in \n"
                + "\ttransformed attribute names.\n" + "\t(-1 = include all, default: 5)", "A", 1, "-A <num>"));

        // -M <num>: maximum number of PC attributes to retain
        // NOTE(review): the enclosing "result.addElement(" for this option is not
        // visible in this view - confirm against the complete file.
                new Option("\tMaximum number of PC attributes to retain.\n" + "\t(-1 = include all, default: -1)",
                        "M", 1, "-M <num>"));

        return result.elements();

     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
    /**
     * Parses the options -R, -A, -M (each with a value) and the flag -C from
     * the given array.
     *
     * NOTE(review): the bodies of the if/else branches below are not visible in
     * this view; presumably they apply the parsed value or fall back to the
     * default - confirm against the complete file.
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {

        // -R: proportion of variance to cover; an empty string means "not given"
        String tmpStr = Utils.getOption('R', options);
        if (tmpStr.length() != 0) {
        } else {

        // -A: maximum number of attributes in generated attribute names
        tmpStr = Utils.getOption('A', options);
        if (tmpStr.length() != 0) {
        } else {

        // -M: maximum number of PC attributes to retain
        tmpStr = Utils.getOption('M', options);
        if (tmpStr.length() != 0) {
        } else {

        // -C: presence of the flag switches from standardizing to centering
        setCenterData(Utils.getFlag('C', options));


     * Gets the current settings of the filter.
     * @return an array of strings suitable for passing to setOptions
    /**
     * Collects the current settings as strings: variance covered, maximum
     * attribute names and maximum attributes, in that order.
     *
     * NOTE(review): the statements adding the option flags themselves (and the
     * body of the center-data branch) are not visible in this view - confirm
     * against the complete file.
     *
     * @return the collected strings as an array
     */
    public String[] getOptions() {

        Vector<String> result = new Vector<String>();

        // "" + value converts each numeric setting to its string form
        result.add("" + getVarianceCovered());

        result.add("" + getMaximumAttributeNames());

        result.add("" + getMaximumAttributes());

        // only reported when centering is enabled
        if (getCenterData()) {

        return result.toArray(new String[result.size()]);

     * Returns the tip text for this property
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
    /** Returns the fixed tip text describing the center-data property. */
    public String centerDataTipText() {
        return "Center (rather than standardize) the data. PCA will "
                + "be computed from the covariance (rather than correlation) " + "matrix";

     * Set whether to center (rather than standardize) the data. If set to true
     * then PCA is computed from the covariance rather than correlation matrix.
     * @param center true if the data is to be centered rather than standardized
    /** Stores the flag in m_center; no other state is touched here. */
    public void setCenterData(boolean center) {
        m_center = center;

     * Get whether to center (rather than standardize) the data. If true then PCA
     * is computed from the covariance rather than correlation matrix.
     * @return true if the data is to be centered rather than standardized.
    /** Returns the current value of m_center. */
    public boolean getCenterData() {
        return m_center;

     * Returns the tip text for this property.
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
    /** Returns the fixed tip text describing the variance-covered property. */
    public String varianceCoveredTipText() {
        return "Retain enough PC attributes to account for this proportion of variance.";

     * Sets the amount of variance to account for when retaining principal
     * components.
     * @param value the proportion of total variance to account for
    /** Stores the value in m_CoverVariance as given; no range check is performed here. */
    public void setVarianceCovered(double value) {
        m_CoverVariance = value;

     * Gets the proportion of total variance to account for when retaining
     * principal components.
     * @return the proportion of variance to account for
    /** Returns the current value of m_CoverVariance. */
    public double getVarianceCovered() {
        return m_CoverVariance;

     * Returns the tip text for this property.
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
    /** Returns the fixed tip text describing the maximum-attribute-names property. */
    public String maximumAttributeNamesTipText() {
        return "The maximum number of attributes to include in transformed attribute names.";

     * Sets maximum number of attributes to include in transformed attribute
     * names.
     * @param value the maximum number of attributes
    /** Stores the value in m_MaxAttrsInName as given; no validation is performed here. */
    public void setMaximumAttributeNames(int value) {
        m_MaxAttrsInName = value;

     * Gets maximum number of attributes to include in transformed attribute
     * names.
     * @return the maximum number of attributes
    /** Returns the current value of m_MaxAttrsInName. */
    public int getMaximumAttributeNames() {
        return m_MaxAttrsInName;

     * Returns the tip text for this property.
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
    /** Returns the fixed tip text describing the maximum-attributes property. */
    public String maximumAttributesTipText() {
        return "The maximum number of PC attributes to retain.";

     * Sets maximum number of PC attributes to retain.
     * @param value the maximum number of attributes
    /** Stores the value in m_MaxAttributes as given; no validation is performed here. */
    public void setMaximumAttributes(int value) {
        m_MaxAttributes = value;

     * Gets maximum number of PC attributes to retain.
     * @return the maximum number of attributes
    /** Returns the current value of m_MaxAttributes. */
    public int getMaximumAttributes() {
        return m_MaxAttributes;

     * Returns the capabilities of this filter.
     * @return the capabilities of this filter
     * @see Capabilities
    /**
     * Starts from the capabilities reported by the superclass and returns them.
     *
     * NOTE(review): the statements that configure attribute and class
     * capabilities are not visible in this view - confirm against the complete
     * file.
     *
     * @return the capabilities object obtained from super.getCapabilities()
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();

        // attributes

        // class

        return result;

     * Determines the output format based on the input format and returns this. In
     * case the output format cannot be returned immediately, i.e.,
     * immediateOutputFormat() returns false, then this method will be called from
     * batchFinished().
     * @param inputFormat the input format to base the output format on
     * @return the output format
     * @throws Exception in case the determination goes wrong
     * @see #batchFinished()
    /**
     * Builds the header of the transformed data: one attribute per retained
     * principal component (named from the component's coefficients and the
     * input attribute names), followed by a copy of the class attribute when
     * the training data had a class.
     *
     * Returns the input format unchanged while no eigenvalues are available.
     * As a side effect, m_OutputNumAtts is set to the attribute count of the
     * header that is returned.
     *
     * NOTE(review): several statements (closing braces, the appends to attName,
     * the body of the variance-cutoff check) are not visible in this view.
     *
     * @param inputFormat the input format to base the output format on
     * @return the output format, or inputFormat if m_Eigenvalues is null
     * @throws Exception in case the determination goes wrong
     */
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
        double cumulative;
        ArrayList<Attribute> attributes;
        int i;
        int j;
        StringBuffer attName;
        double[] coeff_mags;
        int num_attrs;
        int[] coeff_inds;
        double coeff_value;
        int numAttsLowerBound;

        // the transformation has not been computed yet
        if (m_Eigenvalues == null) {
            return inputFormat;

        // lowest index into m_SortedEigens that may still be visited; a positive
        // m_MaxAttributes limits the number of components, clamped at 0
        if (m_MaxAttributes > 0) {
            numAttsLowerBound = m_NumAttribs - m_MaxAttributes;
        } else {
            numAttsLowerBound = 0;
        if (numAttsLowerBound < 0) {
            numAttsLowerBound = 0;

        cumulative = 0.0;
        attributes = new ArrayList<Attribute>();
        // walk the sorted eigenvalue indices from the last entry backwards
        for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
            attName = new StringBuffer();
            // build array of coefficients
            // (negated magnitudes, so that sorting yields decreasing magnitude)
            coeff_mags = new double[m_NumAttribs];
            for (j = 0; j < m_NumAttribs; j++) {
                coeff_mags[j] = -Math.abs(m_Eigenvectors[j][m_SortedEigens[i]]);
            // number of coefficient terms that go into the attribute name
            num_attrs = (m_MaxAttrsInName > 0) ? Math.min(m_NumAttribs, m_MaxAttrsInName) : m_NumAttribs;

            // this array contains the sorted indices of the coefficients
            // NOTE(review): the condition tests m_NumAttribs, while the two
            // comments below refer to m_maxAttrsInName - confirm which is intended.
            if (m_NumAttribs > 0) {
                // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude
                coeff_inds = Utils.sort(coeff_mags);
            } else {
                // if m_maxAttrsInName <= 0, use all coeffs in original order
                coeff_inds = new int[m_NumAttribs];
                for (j = 0; j < m_NumAttribs; j++) {
                    coeff_inds[j] = j;
            // build final attName string
            for (j = 0; j < num_attrs; j++) {
                coeff_value = m_Eigenvectors[coeff_inds[j]][m_SortedEigens[i]];
                // non-negative terms after the first are treated specially
                // (presumably an explicit "+" separator - statement not visible)
                if (j > 0 && coeff_value >= 0) {
                        Utils.doubleToString(coeff_value, 5, 3) + inputFormat.attribute(coeff_inds[j]).name());
            // fewer terms than attributes were used for the name
            if (num_attrs < m_NumAttribs) {

            attributes.add(new Attribute(attName.toString()));
            cumulative += m_Eigenvalues[m_SortedEigens[i]];

            // enough of the total variance is covered by the components so far
            if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {

        // the class attribute is appended after the component attributes
        if (m_HasClass) {
            attributes.add((Attribute) m_TrainCopy.classAttribute().copy());

        Instances outputFormat = new Instances(m_TrainCopy.relationName() + "_principal components", attributes, 0);

        // set the class to be the last attribute if necessary
        if (m_HasClass) {
            outputFormat.setClassIndex(outputFormat.numAttributes() - 1);

        // remembered so that convertInstance() can size its value array
        m_OutputNumAtts = outputFormat.numAttributes();

        return outputFormat;

    /**
     * Centers or standardizes m_TrainInstances (depending on m_center) and then
     * fills m_Correlation with the pairwise sums of products of the attribute
     * values, divided by (number of instances - 1).
     *
     * Only entries with j >= i are computed and stored.
     *
     * NOTE(review): the divisor is 0 when the data holds exactly one instance -
     * confirm that callers rule this out.
     *
     * @throws Exception if applying the center/standardize filter fails
     */
    protected void fillCovariance() throws Exception {

        // just center the data or standardize it?
        if (m_center) {
            m_centerFilter = new Center();
            m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter);
        } else {
            m_standardizeFilter = new Standardize();
            m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter);

        // now compute the covariance matrix
        m_Correlation = new UpperSymmDenseMatrix(m_NumAttribs);

        for (int i = 0; i < m_NumAttribs; i++) {
            // start at j = i: only one triangle (including the diagonal) is filled
            for (int j = i; j < m_NumAttribs; j++) {

                double cov = 0;
                // no mean is subtracted here; the data went through the
                // center/standardize filter above
                for (Instance inst : m_TrainInstances) {
                    cov += inst.value(i) * inst.value(j);

                cov /= m_TrainInstances.numInstances() - 1;
                m_Correlation.set(i, j, cov);

     * Transform an instance in original (unnormalized) format.
     * @param instance an instance in the original (unnormalized) format
     * @return a transformed instance
     * @throws Exception if instance can't be transformed
    /**
     * Projects one instance onto the retained principal components. The
     * instance is taken from the outputs of the preprocessing filters
     * (missing-value replacement, nominal-to-binary, optional attribute
     * removal, then standardize or center). Each component value is the dot
     * product of the preprocessed attribute values with the matching
     * eigenvector column. The class value of the original instance, if any, is
     * stored in the last slot.
     *
     * NOTE(review): the input(...) calls that feed each filter before its
     * output() is read, and the body of the variance-cutoff check, are not
     * visible in this view.
     *
     * @param instance an instance in the original format
     * @return a new instance (sparse if the input was sparse, dense otherwise)
     *         carrying the weight of the input instance
     * @throws Exception if instance can't be transformed
     */
    protected Instance convertInstance(Instance instance) throws Exception {
        Instance result;
        double[] newVals;
        Instance tempInst;
        double cumulative;
        int i;
        int j;
        double tempval;
        int numAttsLowerBound;

        // sized from the output header built by determineOutputFormat()
        newVals = new double[m_OutputNumAtts];
        tempInst = (Instance) instance.copy();

        tempInst = m_ReplaceMissingFilter.output();

        tempInst = m_NominalToBinaryFilter.output();

        // null when setup() found no attributes to remove
        if (m_AttributeFilter != null) {
            tempInst = m_AttributeFilter.output();

        // same choice of scaling filter as in fillCovariance()
        if (!m_center) {
            tempInst = m_standardizeFilter.output();
        } else {
            tempInst = m_centerFilter.output();

        // the class value is read from the untouched original instance
        if (m_HasClass) {
            newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex());

        // same bound computation as in determineOutputFormat()
        if (m_MaxAttributes > 0) {
            numAttsLowerBound = m_NumAttribs - m_MaxAttributes;
        } else {
            numAttsLowerBound = 0;
        if (numAttsLowerBound < 0) {
            numAttsLowerBound = 0;

        cumulative = 0;
        for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
            tempval = 0.0;
            // dot product of the preprocessed instance with eigenvector m_SortedEigens[i]
            for (j = 0; j < m_NumAttribs; j++) {
                tempval += m_Eigenvectors[j][m_SortedEigens[i]] * tempInst.value(j);

            // the component visited first (i = m_NumAttribs - 1) goes to slot 0
            newVals[m_NumAttribs - i - 1] = tempval;
            cumulative += m_Eigenvalues[m_SortedEigens[i]];
            if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {

        // create instance
        if (instance instanceof SparseInstance) {
            result = new SparseInstance(instance.weight(), newVals);
        } else {
            result = new DenseInstance(instance.weight(), newVals);

        return result;

     * Initializes the filter with the given input data.
     * @param instances the data to process
     * @throws Exception in case the processing goes wrong
     * @see #batchFinished()
    /**
     * Derives the PCA transformation from the given data: copies the data,
     * replaces missing values, binarizes nominal attributes, removes the class
     * attribute and attributes with at most one distinct value, factorizes
     * m_Correlation into eigenvectors/eigenvalues, clamps negative eigenvalues
     * to 0, and builds the transformed header. m_TrainInstances is released
     * (set to null) at the end.
     *
     * NOTE(review): several statements are not visible in this view, among them
     * the recording of single-valued columns in deleteCols, the configuration
     * of the Remove filter, the capability check, and the call that fills
     * m_Correlation before it is factorized.
     *
     * @param instances the data to process
     * @throws Exception in case the processing goes wrong
     */
    protected void setup(Instances instances) throws Exception {
        int i;
        int j;
        Vector<Integer> deleteCols;
        int[] todelete;
        double[][] v;

        m_TrainInstances = new Instances(instances);

        // make a copy of the training data so that we can get the class
        // column to append to the transformed data (if necessary)
        m_TrainCopy = new Instances(m_TrainInstances, 0);

        m_ReplaceMissingFilter = new ReplaceMissingValues();
        m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

        m_NominalToBinaryFilter = new NominalToBinary();
        m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

        // delete any attributes with only one distinct value or are all missing
        deleteCols = new Vector<Integer>();
        for (i = 0; i < m_TrainInstances.numAttributes(); i++) {
            if (m_TrainInstances.numDistinctValues(i) <= 1) {

        if (m_TrainInstances.classIndex() >= 0) {
            // get rid of the class column
            m_HasClass = true;
            m_ClassIndex = m_TrainInstances.classIndex();
            deleteCols.addElement(new Integer(m_ClassIndex));

        // remove columns from the data if necessary
        if (deleteCols.size() > 0) {
            m_AttributeFilter = new Remove();
            // unbox the collected column indices into a plain int array
            todelete = new int[deleteCols.size()];
            for (i = 0; i < deleteCols.size(); i++) {
                todelete[i] = (deleteCols.elementAt(i)).intValue();
            m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);

        // can evaluator handle the processed data ? e.g., enough attributes?

        // dimensions of the data after all preprocessing steps above
        m_NumInstances = m_TrainInstances.numInstances();
        m_NumAttribs = m_TrainInstances.numAttributes();

        // fillCorrelation();

        // get eigen vectors/values

        SymmDenseEVD evd = SymmDenseEVD.factorize(m_Correlation);

        m_Eigenvectors = Matrices.getArray(evd.getEigenvectors());
        m_Eigenvalues = evd.getEigenvalues();

        // any eigenvalues less than 0 are not worth anything --- change to 0
        for (i = 0; i < m_Eigenvalues.length; i++) {
            if (m_Eigenvalues[i] < 0) {
                m_Eigenvalues[i] = 0.0;
        // index order and total are computed on the clamped eigenvalues
        m_SortedEigens = Utils.sort(m_Eigenvalues);
        m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

        m_TransformedFormat = determineOutputFormat(m_TrainInstances);

        // the training data is no longer needed once the transformation exists
        m_TrainInstances = null;

     * Sets the format of the input instances.
     * @param instanceInfo an Instances object containing the input instance
     *          structure (any instances contained in the object are ignored -
     *          only the structure is required).
     * @return true if the outputFormat may be collected immediately
     * @throws Exception if the input format can't be set successfully
    /**
     * Resets the state derived from a previous run (eigenvalues, output
     * attribute count, attribute-removal and nominal-to-binary filters, sum of
     * eigenvalues) and reports that the output format is not available
     * immediately.
     *
     * NOTE(review): no call that registers instanceInfo with the superclass is
     * visible in this view - confirm against the complete file.
     *
     * @param instanceInfo the structure of the input instances
     * @return always false
     * @throws Exception if the input format can't be set successfully
     */
    public boolean setInputFormat(Instances instanceInfo) throws Exception {

        // null eigenvalues make determineOutputFormat() return its input as-is
        m_Eigenvalues = null;
        m_OutputNumAtts = -1;
        m_AttributeFilter = null;
        m_NominalToBinaryFilter = null;
        m_SumOfEigenValues = 0.0;

        return false;

     * Input an instance for filtering. Filter requires all training instances be
     * read before producing output.
     * @param instance the input instance
     * @return true if the filtered instance may now be collected with output().
     * @throws IllegalStateException if no input format has been set
     * @throws Exception if conversion fails
    /**
     * Accepts one instance. Once the first batch has been finished, the
     * instance is converted and pushed to the output queue right away;
     * before that, false is returned.
     *
     * NOTE(review): in this view nothing stores the instance in the
     * first-batch branch and nothing else happens when a new batch starts -
     * statements may be missing; confirm against the complete file.
     *
     * @param instance the input instance
     * @return true if a converted instance was pushed, false otherwise
     * @throws IllegalStateException if no input format has been set
     * @throws Exception if conversion fails
     */
    public boolean input(Instance instance) throws Exception {
        Instance inst;

        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");

        // first instance of a batch: clear the new-batch marker
        if (isNewBatch()) {
            m_NewBatch = false;

        // the transformation exists only after the first batch has been finished
        if (isFirstBatchDone()) {
            inst = convertInstance(instance);
            push(inst, false); // No need to copy
            return true;
        } else {
            return false;

     * Signify that this batch of input to the filter is finished.
     * @return true if there are instances pending output
     * @throws NullPointerException if no input structure has been defined.
     * @throws Exception if there was a problem finishing the batch.
    /**
     * Finishes the current batch: converts every instance held by the input
     * format and pushes it to the output queue, then marks the filter as ready
     * for a new batch and the first batch as done.
     *
     * NOTE(review): the body of the first-batch check is not visible in this
     * view (presumably it triggers setup on the buffered data) - confirm
     * against the complete file.
     *
     * @return true if there are instances pending output
     * @throws NullPointerException if no input format has been set
     * @throws Exception if there was a problem finishing the batch
     */
    public boolean batchFinished() throws Exception {
        int i;
        Instances insts;
        Instance inst;

        if (getInputFormat() == null) {
            throw new NullPointerException("No input instance format defined");

        insts = getInputFormat();

        if (!isFirstBatchDone()) {

        // convert whatever instances the input format currently holds
        for (i = 0; i < insts.numInstances(); i++) {
            inst = convertInstance(insts.instance(i));
            push(inst, false); // No need to copy

        m_NewBatch = true;
        m_FirstBatchDone = true;

        return (numPendingOutput() != 0);

     * Returns the revision string.
     * @return the revision
    /** Returns the result of RevisionUtils.extract() applied to the "$Revision$" keyword string. */
    public String getRevision() {
        return RevisionUtils.extract("$Revision$");

     * Main method for running this filter.
     * @param args should contain arguments to the filter: use -h for help
    /** Command-line entry point: hands a new PrincipalComponents instance and the arguments to runFilter(). */
    public static void main(String[] args) {
        runFilter(new PrincipalComponents(), args);