Java tutorial
/* * The Gemma project * * Copyright (c) 2008 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.analysis.preprocess.svd; import java.util.ArrayList; import java.util.Collections; import java.util.List; import org.apache.commons.lang.StringUtils; import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix; import ubic.basecode.dataStructure.matrix.DoubleMatrix; import ubic.basecode.math.DescriptiveWithMissing; import ubic.basecode.math.MatrixStats; import ubic.basecode.math.SingularValueDecomposition; import ubic.gemma.analysis.preprocess.filter.AffyProbeNameFilter; import ubic.gemma.analysis.preprocess.filter.AffyProbeNameFilter.Pattern; import ubic.gemma.analysis.preprocess.filter.RowLevelFilter; import ubic.gemma.analysis.preprocess.filter.RowLevelFilter.Method; import ubic.gemma.analysis.preprocess.filter.RowMissingValueFilter; import ubic.gemma.datastructure.matrix.ExpressionDataDoubleMatrix; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.designElement.CompositeSequence; import cern.colt.list.DoubleArrayList; import cern.colt.matrix.DoubleMatrix1D; import cern.colt.matrix.DoubleMatrix2D; import cern.colt.matrix.impl.DenseDoubleMatrix1D; import cern.colt.matrix.impl.DenseDoubleMatrix2D; import cern.colt.matrix.linalg.Algebra; /** * Perform SVD on an expression data matrix, E = U S V'. The rows of the input matrix are probes (genes), following the * convention of Alter et al. 2000 (PNAS). Thus the U matrix columns are the <em>eigensamples</em> (eigenarrays) and the * V matrix columns are the <em>eigengenes</em>. See also http://genome-www.stanford.edu/SVD/. * <p> * Because SVD can't be done on a matrix with missing values, values are imputed. Rows with no variance are removed, and * rows with too many missing values are also removed (MIN_PRESENT_FRACTION_FOR_ROW) * <p> * FIXME this also includes SVD-based normalization algorithms which might best be refactored out. * * @author paul * @version $Id: ExpressionDataSVD.java,v 1.11 2013/04/19 21:16:58 paul Exp $ */ public class ExpressionDataSVD { private static final double MIN_PRESENT_FRACTION_FOR_ROW = 0.75; private ExpressionDataDoubleMatrix expressionData; private double norm1; private boolean normalized = false; DenseDoubleMatrix2D missingValueInfo; // fixme use booleans SingularValueDecomposition<CompositeSequence, BioMaterial> svd; /** * Does normalization. * * @param expressionData */ public ExpressionDataSVD(ExpressionDataDoubleMatrix expressionData) { this(expressionData, true); } /** * @param expressionData Note that this may be modified! * @param normalizeMatrix If true, the data matrix will be rescaled and centred to mean zero, variance one, for both * rows and columns ("double-standardized") */ public ExpressionDataSVD(ExpressionDataDoubleMatrix expressionData, boolean normalizeMatrix) { this.expressionData = expressionData; ArrayDesign arrayDesign = expressionData.getRowElement(0).getDesignElement().getArrayDesign(); if (StringUtils.isNotBlank(arrayDesign.getName()) && arrayDesign.getName().toUpperCase().contains("AFFYMETRIX")) { AffyProbeNameFilter affyProbeNameFilter = new AffyProbeNameFilter(new Pattern[] { Pattern.AFFX }); this.expressionData = affyProbeNameFilter.filter(this.expressionData); } // /* // * FIXME Remove any columns which have only missing data. We have to put in dummy values, otherwise things // will // * be quite messed up. // */ // ColumnMissingValueFilter columnMissingFilter = new ColumnMissingValueFilter(); // columnMissingFilter.setMinPresentFactrion( 10 ); // this.expressionData = columnMissingFilter.filter( this.expressionData ); /* * Now filter rows. */ RowMissingValueFilter rowMissingFilter = new RowMissingValueFilter(); rowMissingFilter.setMinPresentFraction(MIN_PRESENT_FRACTION_FOR_ROW); this.expressionData = rowMissingFilter.filter(this.expressionData); /* * Remove rows with no variance */ RowLevelFilter rlf = new RowLevelFilter(); rlf.setMethod(Method.VAR); rlf.setLowCut(0.01, false); // the colt SVD method fails to converge? I tried removing just Constant.SMALL but // it wasn't enough? this.expressionData = rlf.filter(this.expressionData); if (this.expressionData.rows() == 0) { throw new IllegalStateException("After filtering, matrix has no rows, SVD cannot be computed"); } if (this.expressionData.rows() < this.expressionData.columns()) { throw new IllegalStateException( "After filtering, this data set has more samples than rows; SVD not supported."); } this.normalized = normalizeMatrix; DoubleMatrix<CompositeSequence, BioMaterial> matrix = this.expressionData.getMatrix(); assert matrix.getRowNames().size() > 0; assert matrix.getColNames().size() > 0; imputeMissing(matrix); if (normalizeMatrix) { matrix = MatrixStats.doubleStandardize(matrix); } this.svd = new SingularValueDecomposition<CompositeSequence, BioMaterial>(matrix); } /** * Implements the method described in the SPELL paper, alternative interpretation as related by Q. Morris. Set all * components to have equal weight (set all singular values to 1) * * @return the reconstructed matrix; values that were missing before are re-masked. */ public ExpressionDataDoubleMatrix equalize() { DoubleMatrix<Integer, Integer> copy = svd.getS().copy(); for (int i = 0; i < copy.columns(); i++) { copy.set(i, i, 1.0); } double[][] rawU = svd.getU().getRawMatrix(); double[][] rawS = copy.getRawMatrix(); double[][] rawV = svd.getV().getRawMatrix(); DoubleMatrix2D u = new DenseDoubleMatrix2D(rawU); DoubleMatrix2D s = new DenseDoubleMatrix2D(rawS); DoubleMatrix2D v = new DenseDoubleMatrix2D(rawV); Algebra a = new Algebra(); DoubleMatrix<CompositeSequence, BioMaterial> reconstructed = new DenseDoubleMatrix<CompositeSequence, BioMaterial>( a.mult(a.mult(u, s), a.transpose(v)).toArray()); reconstructed.setRowNames(this.expressionData.getMatrix().getRowNames()); reconstructed.setColumnNames(this.expressionData.getMatrix().getColNames()); // remask the missing values. for (int i = 0; i < reconstructed.rows(); i++) { for (int j = 0; j < reconstructed.columns(); j++) { if (Double.isNaN(this.missingValueInfo.get(i, j))) { reconstructed.set(i, j, Double.NaN); } } } return new ExpressionDataDoubleMatrix(this.expressionData, reconstructed); } /** * @param i * @return the ith eigengene (column of V) */ public Double[] getEigenGene(int i) { return getV().getColObj(i); } /** * @param i * @return the ith eigensample (column of U) */ public Double[] getEigenSample(int i) { return getU().getColObj(i); } /** * @return the square roots of the singular values. */ public double[] getEigenvalues() { double[] singularValues = this.getSingularValues(); double[] eigenvalues = new double[singularValues.length]; for (int i = 0; i < singularValues.length; i++) { double d = Math.pow(singularValues[i], 2) / (this.getNumVariables() - 1); eigenvalues[i] = d; } return eigenvalues; } /** * @return how many rows the U matrix has. */ public int getNumVariables() { return this.svd.getU().rows(); } /** * @return the matrix of singular values, indexed by the eigenarray (row) and eigengene (column) numbers (starting * from 0). */ public DoubleMatrix<Integer, Integer> getS() { return svd.getS(); } /** * @return */ public double[] getSingularValues() { return this.svd.getSingularValues(); } /** * @return the left singular vectors. The column indices are of the eigenarrays (starting from 0). */ public DoubleMatrix<CompositeSequence, Integer> getU() { return svd.getU(); } /** * @return the right singular vectors. The column indices are of the eigengenes (starting from 0). The row indices * are of the original samples in the given ExpressionDataDoubleMatrix. */ public DoubleMatrix<Integer, BioMaterial> getV() { return svd.getV(); } /** * @return fractions of the variance for each singular vector. */ public Double[] getVarianceFractions() { double[] singularValues = svd.getSingularValues(); // d should be be square roots of the eigenvalues scaled by number of variables: check int numVariables = this.getNumVariables(); double sum = 0; for (double d : singularValues) { double d2 = d * d / (numVariables - 1); sum += d2; } Double[] answer = new Double[singularValues.length]; for (int i = 0; i < singularValues.length; i++) { answer[i] = singularValues[i] * singularValues[i] / sum; } return answer; } /** * Provide a reconstructed matrix removing the first N components (the most significant ones). If the matrix was * normalized first, removing the first component replicates the normalization approach taken by Nielsen et al. * (Lancet 359, 2002) and Alter et al. (PNAS 2000). Correction by ANOVA would yield similar results if the nuisance * variable is known. * * @param numComponentsToRemove The number of components to remove, starting from the largest eigenvalue. * @return the reconstructed matrix; values that were missing before are re-masked. */ public ExpressionDataDoubleMatrix removeHighestComponents(int numComponentsToRemove) { DoubleMatrix<Integer, Integer> copy = svd.getS().copy(); for (int i = 0; i < numComponentsToRemove; i++) { copy.set(i, i, 0.0); } double[][] rawU = svd.getU().getRawMatrix(); double[][] rawS = copy.getRawMatrix(); double[][] rawV = svd.getV().getRawMatrix(); DoubleMatrix2D u = new DenseDoubleMatrix2D(rawU); DoubleMatrix2D s = new DenseDoubleMatrix2D(rawS); DoubleMatrix2D v = new DenseDoubleMatrix2D(rawV); Algebra a = new Algebra(); DoubleMatrix<CompositeSequence, BioMaterial> reconstructed = new DenseDoubleMatrix<CompositeSequence, BioMaterial>( a.mult(a.mult(u, s), a.transpose(v)).toArray()); reconstructed.setRowNames(this.expressionData.getMatrix().getRowNames()); reconstructed.setColumnNames(this.expressionData.getMatrix().getColNames()); // remask the missing values. for (int i = 0; i < reconstructed.rows(); i++) { for (int j = 0; j < reconstructed.columns(); j++) { if (Double.isNaN(this.missingValueInfo.get(i, j))) { reconstructed.set(i, j, Double.NaN); } } } return new ExpressionDataDoubleMatrix(this.expressionData, reconstructed); } /** * Implements the method described in the SPELL paper. Note that this alters the U matrix of this. * <p> * We make two assumptions about the method that are not described in the paper: 1) The data are rescaled and * centered; 2) the absolute value of the U matrix is used. Note that unlike the original data, the transformed data * will have no missing values. * * @return */ public ExpressionDataDoubleMatrix uMatrixAsExpressionData() { /* * */ if (!normalized) { throw new IllegalStateException("You must do SVD on the normalized matrix"); } DoubleMatrix<CompositeSequence, Integer> rawUMatrix = svd.getU(); DoubleMatrix<CompositeSequence, BioMaterial> result = new DenseDoubleMatrix<CompositeSequence, BioMaterial>( rawUMatrix.rows(), rawUMatrix.columns()); // take the absolute value of the U matrix. for (int i = 0; i < rawUMatrix.rows(); i++) { for (int j = 0; j < rawUMatrix.columns(); j++) { result.set(i, j, Math.abs(rawUMatrix.get(i, j))); } } List<BioMaterial> colNames = svd.getV().getColNames(); result.setColumnNames(colNames); result.setRowNames(rawUMatrix.getRowNames()); // use that as the 'expression data' return new ExpressionDataDoubleMatrix(this.expressionData, result); } /** * Implements method described in Skillicorn et al., "Strategies for winnowing microarray data" (also section 3.5.5 * of his book) * * @param thresholdQuantile Enter 0.5 for median. Value must be > 0 and < 1. * @return a filtered matrix */ public ExpressionDataDoubleMatrix winnow(double thresholdQuantile) { if (thresholdQuantile <= 0 || thresholdQuantile >= 1) { throw new IllegalArgumentException("Threshold quantile should be a value between 0 and 1 exclusive"); } class NormCmp implements Comparable<NormCmp> { Double norm; int rowIndex; public NormCmp(int rowIndex, Double norm) { super(); this.rowIndex = rowIndex; this.norm = norm; } @Override public int compareTo(NormCmp o) { return this.norm.compareTo(o.norm); } /* * (non-Javadoc) * * @see java.lang.Object#equals(java.lang.Object) */ @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; NormCmp other = (NormCmp) obj; if (norm == null) { if (other.norm != null) return false; } else if (!norm.equals(other.norm)) return false; return true; } public int getRowIndex() { return rowIndex; } /* * (non-Javadoc) * * @see java.lang.Object#hashCode() */ @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((norm == null) ? 0 : norm.hashCode()); return result; } } // order rows by distance from the origin. This is proportional to the 1-norm. Algebra a = new Algebra(); List<NormCmp> os = new ArrayList<NormCmp>(); for (int i = 0; i < this.expressionData.rows(); i++) { double[] row = this.getU().getRow(i); DoubleMatrix1D rom = new DenseDoubleMatrix1D(row); norm1 = a.norm1(rom); os.add(new NormCmp(i, norm1)); } Collections.sort(os); int quantileLimit = (int) Math.floor(this.expressionData.rows() * thresholdQuantile); quantileLimit = Math.max(0, quantileLimit); List<CompositeSequence> keepers = new ArrayList<CompositeSequence>(); for (int i = 0; i < quantileLimit; i++) { NormCmp x = os.get(i); CompositeSequence d = this.expressionData.getDesignElementForRow(x.getRowIndex()); keepers.add(d); } // / remove genes which are near the origin in SVD space. FIXME: make sure the missing values are still masked. return new ExpressionDataDoubleMatrix(this.expressionData, keepers); } /** * Simple imputation method. Generally (but not always), missing values correspond to "low expression". Therefore * imputed values of zero are defensible. However, because at this point the matrix has probably already been * filtered, the row mean is better. * * @param matrix */ private void imputeMissing(DoubleMatrix<CompositeSequence, BioMaterial> matrix) { /* * keep track of the missing values so they can be re-masked later. */ missingValueInfo = new DenseDoubleMatrix2D(matrix.rows(), matrix.columns()); for (int i = 0; i < matrix.rows(); i++) { DoubleArrayList v = new DoubleArrayList(matrix.getRow(i)); double m = DescriptiveWithMissing.mean(v); for (int j = 0; j < matrix.columns(); j++) { double d = matrix.get(i, j); if (Double.isNaN(d)) { missingValueInfo.set(i, j, Double.NaN); matrix.set(i, j, m); } else { missingValueInfo.set(i, j, 1.0); } } } } }