List of usage examples for org.apache.mahout.math Vector nonZeroes
Iterable<Element> nonZeroes();
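nonZeroes() returns an Iterable over only the non-default (non-zero) elements of a Vector, so sparse data can be traversed without visiting its zero entries. A minimal sketch of the call itself (the demo class and values are ours, not taken from the examples below):

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

public class NonZeroesDemo {
  public static void main(String[] args) {
    Vector v = new RandomAccessSparseVector(10);
    v.setQuick(2, 3.5);
    v.setQuick(7, -1.0);
    // visits only indices 2 and 7, never the eight zero entries
    for (Element e : v.nonZeroes()) {
      System.out.println(e.index() + " -> " + e.get());
    }
  }
}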
From source file: org.gpfvic.mahout.cf.taste.hadoop.item.UserVectorSplitterMapper.java
License: Apache License
private float findSmallestLargeValue(Vector userVector) {
  // bounded min-heap (Lucene's PriorityQueue, hence lessThan/insertWithOverflow/top):
  // it retains the maxPrefsPerUserConsidered largest absolute values,
  // so top() is the smallest value still considered "large"
  PriorityQueue<Float> topPrefValues = new PriorityQueue<Float>(maxPrefsPerUserConsidered) {
    @Override
    protected boolean lessThan(Float f1, Float f2) {
      return f1 < f2;
    }
  };
  for (Element e : userVector.nonZeroes()) {
    float absValue = Math.abs((float) e.get());
    topPrefValues.insertWithOverflow(absValue);
  }
  return topPrefValues.top();
}
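The queue above is Lucene's bounded PriorityQueue, not java.util.PriorityQueue. The same "k-th largest absolute value" idea expressed with only the JDK might look like the following sketch (class and method names are ours):

import java.util.PriorityQueue;

import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class SmallestLargeValue {
  // min-heap of the k largest |values|; its head is then the k-th largest overall
  static float find(Vector userVector, int k) {
    PriorityQueue<Float> heap = new PriorityQueue<Float>(k);
    for (Element e : userVector.nonZeroes()) {
      float absValue = Math.abs((float) e.get());
      if (heap.size() < k) {
        heap.offer(absValue);
      } else if (absValue > heap.peek()) {
        heap.poll();
        heap.offer(absValue);
      }
    }
    return heap.peek(); // throws NullPointerException if the vector had no non-zeros
  }
}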
From source file: org.gpfvic.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper.java
License: Apache License
@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {
  Vector userRatings = vectorWritable.get();

  int column = TasteHadoopUtils.idToIndex(rowIndex.get());

  itemVectorWritable.setWritesLaxPrecision(true);

  Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
  for (Vector.Element elem : userRatings.nonZeroes()) {
    itemID.set(elem.index());
    itemVector.setQuick(column, elem.get());
    itemVectorWritable.set(itemVector);
    ctx.write(itemID, itemVectorWritable);
    // reset vector for reuse
    itemVector.setQuick(elem.index(), 0.0);
  }
}
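This mapper transposes one user row into per-item contributions: for every non-zero rating it emits (itemID, a sparse vector holding that rating at the user's column), reusing a single itemVector instance across writes to avoid allocation. The same transpose pattern without the Hadoop plumbing might be sketched like this (the in-memory types and names are ours):

import java.util.HashMap;
import java.util.Map;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class TransposeSketch {
  // rows: userIndex -> ratings over items; result: itemIndex -> ratings over users
  static Map<Integer, Vector> transpose(Map<Integer, Vector> rows, int numUsers) {
    Map<Integer, Vector> byItem = new HashMap<Integer, Vector>();
    for (Map.Entry<Integer, Vector> row : rows.entrySet()) {
      int userIndex = row.getKey();
      for (Element e : row.getValue().nonZeroes()) {
        Vector itemVector = byItem.get(e.index());
        if (itemVector == null) {
          itemVector = new RandomAccessSparseVector(numUsers, 1);
          byItem.put(e.index(), itemVector);
        }
        itemVector.setQuick(userIndex, e.get());
      }
    }
    return byItem;
  }
}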
From source file: org.hf.mls.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper.java
License: Apache License
@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {
  Vector userRatings = vectorWritable.get();

  int numElementsBeforeSampling = userRatings.getNumNondefaultElements();
  userRatings = Vectors.maybeSample(userRatings, sampleSize);
  int numElementsAfterSampling = userRatings.getNumNondefaultElements();

  int column = TasteHadoopUtils.idToIndex(rowIndex.get());

  itemVectorWritable.setWritesLaxPrecision(true);

  Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
  for (Vector.Element elem : userRatings.nonZeroes()) {
    itemID.set(elem.index());
    itemVector.setQuick(column, elem.get());
    itemVectorWritable.set(itemVector);
    ctx.write(itemID, itemVectorWritable);
    // reset vector for reuse
    itemVector.setQuick(elem.index(), 0.0);
  }

  ctx.getCounter(Elements.USER_RATINGS_USED).increment(numElementsAfterSampling);
  ctx.getCounter(Elements.USER_RATINGS_NEGLECTED)
      .increment(numElementsBeforeSampling - numElementsAfterSampling);
}
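This variant differs from the previous mapper only in capping very long user histories before the transpose: Vectors.maybeSample keeps at most sampleSize non-zero entries, and the two counters record how many ratings were used versus neglected. A hedged sketch of what such a cap can look like (this is our approximation, not the project's Vectors.maybeSample):

import java.util.Random;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class SamplingSketch {
  // keep each non-zero with probability sampleSize / numNonZeros, roughly capping the count
  static Vector maybeSample(Vector original, int sampleSize, Random random) {
    int nonZeros = original.getNumNondefaultElements();
    if (nonZeros <= sampleSize) {
      return original; // already small enough, nothing to do
    }
    double keepProbability = (double) sampleSize / nonZeros;
    Vector sampled = new RandomAccessSparseVector(original.size(), sampleSize);
    for (Element e : original.nonZeroes()) {
      if (random.nextDouble() < keepProbability) {
        sampled.setQuick(e.index(), e.get());
      }
    }
    return sampled;
  }
}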
From source file: org.qcri.pca.MeanAndSpanJob.java
/**
 * Replaces Vector.assign to allow an optimization for ZeroIndifferent
 * functions: such a function leaves its first argument unchanged when the
 * second is zero, so only the non-zero elements of the other vector need
 * to be visited.
 *
 * @param vector
 *          the vector to be updated
 * @param other
 *          the other vector
 * @param function
 *          the function that operates on elements of the two vectors
 * @return the modified vector
 */
static public Vector vectorAssign(Vector vector, Vector other, ZeroIndifferentFunc function) {
  if (vector.size() != other.size()) {
    throw new CardinalityException(vector.size(), other.size());
  }
  // special case: iterate only over the non-zero elements of the vector to add
  Iterator<Element> it = other.nonZeroes().iterator();
  Element e;
  while (it.hasNext() && (e = it.next()) != null) {
    double val = vector.getQuick(e.index());
    double newVal = function.apply(val, e.get());
    vector.setQuick(e.index(), newVal);
  }
  return vector;
}
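Addition is the textbook zero-indifferent case: x + 0 = x, so skipping the zeros of the other vector changes nothing. A minimal demonstration using Mahout's stock Functions.PLUS in place of the project's ZeroIndifferentFunc (the demo class is ours):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.function.Functions;

final class ZeroIndifferentDemo {
  static Vector addSparse(Vector vector, Vector other) {
    // since x + 0 = x, only the non-zeros of 'other' can change 'vector'
    for (Element e : other.nonZeroes()) {
      vector.setQuick(e.index(), Functions.PLUS.apply(vector.getQuick(e.index()), e.get()));
    }
    return vector;
  }

  public static void main(String[] args) {
    Vector dense = new DenseVector(new double[] {1, 1, 1, 1});
    Vector sparse = new RandomAccessSparseVector(4);
    sparse.setQuick(2, 5.0);
    addSparse(dense, sparse); // entry 2 becomes 6.0, all others stay 1.0
  }
}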
From source file: org.qcri.pca.Norm2Job.java
/**
 * To compute the norm2 of a sparse matrix, iterate over the sparse items and
 * sum the squares of the differences. After processing each row, add the sum
 * of the mean squares of the zero elements that were skipped in the sparse
 * iteration.
 *
 * @param sparseVector
 *          the sparse vector of data
 * @param meanVector
 *          the vector of means
 * @param meanSquareSum
 *          sum of the squares of all the means, including for zero and
 *          non-zero elements
 * @return the norm2 contribution of this row
 */
static double norm2OfUncentralizedSparseVector(Vector sparseVector, DenseVector meanVector,
    double meanSquareSum) {
  double norm2 = 0;
  double meanSquareSumOfZeroElements = meanSquareSum;
  Iterator<Vector.Element> iterator = sparseVector.nonZeroes().iterator();
  while (iterator.hasNext()) {
    Vector.Element element = iterator.next();
    double v = element.get();
    double mean = meanVector.get(element.index());
    double diff = v - mean;
    diff *= diff;
    // cancel the effect of the non-zero element in meanSquareSum
    meanSquareSumOfZeroElements -= mean * mean;
    norm2 += diff;
  }
  // for all zero items, the following adds the sum of their mean squares
  norm2 += meanSquareSumOfZeroElements;
  return norm2;
}
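The bookkeeping is easier to see as an identity (our notation): for a row x with mean vector \mu and meanSquareSum = \sum_i \mu_i^2,

\[
\|x-\mu\|^2
  = \sum_{i:\,x_i \neq 0} (x_i-\mu_i)^2 + \sum_{i:\,x_i = 0} \mu_i^2
  = \sum_{i:\,x_i \neq 0} (x_i-\mu_i)^2 + \Bigl( \sum_i \mu_i^2 - \sum_{i:\,x_i \neq 0} \mu_i^2 \Bigr)
\]

so each visited non-zero adds its squared difference and subtracts its \mu_i^2 from the running meanSquareSumOfZeroElements; the leftover is exactly the contribution of the skipped zeros.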
From source file: org.qcri.pca.NormalizeJob.java
static void sparseVectorAssign(Vector mainV, final Vector otherV, DoubleDoubleFunction function) {
  java.util.Vector<IndexValue> newZeroElements = new java.util.Vector<IndexValue>();
  Iterator<Vector.Element> nonZeroElements = mainV.nonZeroes().iterator();
  while (nonZeroElements.hasNext()) {
    Vector.Element e = nonZeroElements.next();
    double res = function.apply(e.get(), otherV.getQuick(e.index()));
    if (res != 0) {
      mainV.setQuick(e.index(), res);
    } else {
      // writing a zero would remove the entry and affect the iterator;
      // defer it until after the iteration
      newZeroElements.add(new IndexValue(e.index(), res));
    }
  }
  for (IndexValue iv : newZeroElements) {
    mainV.setQuick(iv.index, iv.value);
  }
}
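The deferred list exists because the nonZeroes() iterator is backed by the vector's own index map: writing a zero through setQuick can remove an entry and break the in-flight iteration, while overwriting an existing entry with a non-zero value is structurally safe. For illustration, this is the naive version the code above avoids (our sketch):

import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class UnsafeAssign {
  static void subtractInPlace(Vector mainV, Vector otherV) {
    for (Element e : mainV.nonZeroes()) {
      double res = e.get() - otherV.getQuick(e.index());
      // DANGER: when res == 0 this may delete the entry out from under the
      // iterator; collecting zero results and applying them after the loop
      // (as sparseVectorAssign does) sidesteps the problem
      mainV.setQuick(e.index(), res);
    }
  }
}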
From source file: org.qcri.pca.ReconstructionErrJob.java
static void denseVectorPlusAbsSparseVector(DenseVector denseVector, Vector sparseVector) {
  Iterator<Vector.Element> nonZeroElements = sparseVector.nonZeroes().iterator();
  while (nonZeroElements.hasNext()) {
    Vector.Element e = nonZeroElements.next();
    int index = e.index();
    double v = e.get();
    double prevV = denseVector.getQuick(index);
    denseVector.setQuick(index, prevV + Math.abs(v));
  }
}
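Called once per row, this accumulates column-wise sums of absolute values into denseVector. A small usage sketch, assuming the call site sits in the same package since the method is package-private (the vector contents are ours):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

final class AbsSumDemo {
  public static void main(String[] args) {
    DenseVector sums = new DenseVector(4); // starts as all zeros
    Vector row = new RandomAccessSparseVector(4);
    row.setQuick(1, -2.0);
    row.setQuick(3, 0.5);
    ReconstructionErrJob.denseVectorPlusAbsSparseVector(sums, row);
    // sums now holds {0.0, 2.0, 0.0, 0.5}
  }
}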
From source file: org.qcri.sparkpca.SparkPCA.java
/**
 * Compute principal component analysis where the input is a path for a Hadoop sequence file
 * <IntWritable key, VectorWritable value>.
 *
 * @param sc
 *          Spark context that contains the configuration parameters and represents the connection
 *          to the cluster (used to create RDDs, accumulators and broadcast variables on that cluster)
 * @param inputPath
 *          Path to the sequence file that represents the input matrix
 * @param outputPath
 *          Path where output is written
 * @param nRows
 *          Number of rows in the input matrix
 * @param nCols
 *          Number of columns in the input matrix
 * @param nPCs
 *          Number of desired principal components
 * @param errRate
 *          The sampling rate that is used for computing the reconstruction error
 * @param maxIterations
 *          Maximum number of iterations before terminating
 * @param computeProjectedMatrix
 *          Flag controlling whether the projected matrix is also computed
 * @return Matrix of size nCols x nPCs holding the desired principal components
 */
public static org.apache.spark.mllib.linalg.Matrix computePrincipalComponents(JavaSparkContext sc,
    String inputPath, String outputPath, final int nRows, final int nCols, final int nPCs,
    final double errRate, final int maxIterations, final int computeProjectedMatrix) {

  // read from sequence file
  JavaPairRDD<IntWritable, VectorWritable> seqVectors = sc.sequenceFile(inputPath, IntWritable.class,
      VectorWritable.class);

  // convert the sequence file to an RDD<org.apache.spark.mllib.linalg.Vector> of Vectors
  JavaRDD<org.apache.spark.mllib.linalg.Vector> vectors = seqVectors
      .map(new Function<Tuple2<IntWritable, VectorWritable>, org.apache.spark.mllib.linalg.Vector>() {

        public org.apache.spark.mllib.linalg.Vector call(Tuple2<IntWritable, VectorWritable> arg0)
            throws Exception {
          org.apache.mahout.math.Vector mahoutVector = arg0._2.get();
          Iterator<Element> elements = mahoutVector.nonZeroes().iterator();
          ArrayList<Tuple2<Integer, Double>> tupleList = new ArrayList<Tuple2<Integer, Double>>();
          while (elements.hasNext()) {
            Element e = elements.next();
            if (e.index() >= nCols || e.get() == 0) {
              continue;
            }
            Tuple2<Integer, Double> tuple = new Tuple2<Integer, Double>(e.index(), e.get());
            tupleList.add(tuple);
          }
          org.apache.spark.mllib.linalg.Vector sparkVector = Vectors.sparse(nCols, tupleList);
          return sparkVector;
        }
      }).persist(StorageLevel.MEMORY_ONLY_SER());

  return computePrincipalComponents(sc, vectors, outputPath, nRows, nCols, nPCs, errRate, maxIterations,
      computeProjectedMatrix);
}
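The core of the map function is converting a Mahout sparse vector into an MLlib sparse vector through nonZeroes(); factored out of Spark, that step alone might look like this (the helper class and method name are ours):

import java.util.ArrayList;

import org.apache.mahout.math.Vector.Element;
import org.apache.spark.mllib.linalg.Vectors;

import scala.Tuple2;

final class VectorConversion {
  static org.apache.spark.mllib.linalg.Vector toSparkSparse(
      org.apache.mahout.math.Vector mahoutVector, int nCols) {
    ArrayList<Tuple2<Integer, Double>> tuples = new ArrayList<Tuple2<Integer, Double>>();
    for (Element e : mahoutVector.nonZeroes()) {
      // drop indices beyond the declared width as well as explicitly stored zeros
      if (e.index() < nCols && e.get() != 0) {
        tuples.add(new Tuple2<Integer, Double>(e.index(), e.get()));
      }
    }
    return Vectors.sparse(nCols, tuples);
  }
}

The persist(StorageLevel.MEMORY_ONLY_SER()) in the original keeps the converted RDD cached in serialized form, which matters because the iterative PCA passes reuse it.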