List of usage examples for org.apache.mahout.math Vector nonZeroes
Iterable<Element> nonZeroes();
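nonZeroes() returns an Iterable over only the non-default (non-zero) elements of a Vector, so sparse data can be traversed without visiting its zero entries. A minimal sketch of the call itself (the demo class and values are ours, not taken from the examples below):

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

public class NonZeroesDemo {
  public static void main(String[] args) {
    Vector v = new RandomAccessSparseVector(10);
    v.setQuick(2, 3.5);
    v.setQuick(7, -1.0);
    // visits only indices 2 and 7, never the eight zero entries
    for (Element e : v.nonZeroes()) {
      System.out.println(e.index() + " -> " + e.get());
    }
  }
}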
From source file: org.gpfvic.mahout.cf.taste.hadoop.item.UserVectorSplitterMapper.java
License: Apache License
private float findSmallestLargeValue(Vector userVector) {
  // bounded min-heap (Lucene's PriorityQueue, hence lessThan/insertWithOverflow/top):
  // it retains the maxPrefsPerUserConsidered largest absolute values,
  // so top() is the smallest value still considered "large"
  PriorityQueue<Float> topPrefValues = new PriorityQueue<Float>(maxPrefsPerUserConsidered) {
    @Override
    protected boolean lessThan(Float f1, Float f2) {
      return f1 < f2;
    }
  };
  for (Element e : userVector.nonZeroes()) {
    float absValue = Math.abs((float) e.get());
    topPrefValues.insertWithOverflow(absValue);
  }
  return topPrefValues.top();
}
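The queue above is Lucene's bounded PriorityQueue, not java.util.PriorityQueue. The same "k-th largest absolute value" idea expressed with only the JDK might look like the following sketch (class and method names are ours):

import java.util.PriorityQueue;

import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class SmallestLargeValue {
  // min-heap of the k largest |values|; its head is then the k-th largest overall
  static float find(Vector userVector, int k) {
    PriorityQueue<Float> heap = new PriorityQueue<Float>(k);
    for (Element e : userVector.nonZeroes()) {
      float absValue = Math.abs((float) e.get());
      if (heap.size() < k) {
        heap.offer(absValue);
      } else if (absValue > heap.peek()) {
        heap.poll();
        heap.offer(absValue);
      }
    }
    return heap.peek(); // throws NullPointerException if the vector had no non-zeros
  }
}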
From source file: org.gpfvic.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper.java
License: Apache License
@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {
  Vector userRatings = vectorWritable.get();

  int column = TasteHadoopUtils.idToIndex(rowIndex.get());

  itemVectorWritable.setWritesLaxPrecision(true);

  Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
  for (Vector.Element elem : userRatings.nonZeroes()) {
    itemID.set(elem.index());
    itemVector.setQuick(column, elem.get());
    itemVectorWritable.set(itemVector);
    ctx.write(itemID, itemVectorWritable);
    // reset vector for reuse
    itemVector.setQuick(elem.index(), 0.0);
  }
}
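This mapper transposes one user row into per-item contributions: for every non-zero rating it emits (itemID, a sparse vector holding that rating at the user's column), reusing a single itemVector instance across writes to avoid allocation. The same transpose pattern without the Hadoop plumbing might be sketched like this (the in-memory types and names are ours):

import java.util.HashMap;
import java.util.Map;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class TransposeSketch {
  // rows: userIndex -> ratings over items; result: itemIndex -> ratings over users
  static Map<Integer, Vector> transpose(Map<Integer, Vector> rows, int numUsers) {
    Map<Integer, Vector> byItem = new HashMap<Integer, Vector>();
    for (Map.Entry<Integer, Vector> row : rows.entrySet()) {
      int userIndex = row.getKey();
      for (Element e : row.getValue().nonZeroes()) {
        Vector itemVector = byItem.get(e.index());
        if (itemVector == null) {
          itemVector = new RandomAccessSparseVector(numUsers, 1);
          byItem.put(e.index(), itemVector);
        }
        itemVector.setQuick(userIndex, e.get());
      }
    }
    return byItem;
  }
}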
From source file: org.hf.mls.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper.java
License: Apache License
@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {
  Vector userRatings = vectorWritable.get();

  int numElementsBeforeSampling = userRatings.getNumNondefaultElements();
  userRatings = Vectors.maybeSample(userRatings, sampleSize);
  int numElementsAfterSampling = userRatings.getNumNondefaultElements();

  int column = TasteHadoopUtils.idToIndex(rowIndex.get());

  itemVectorWritable.setWritesLaxPrecision(true);

  Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
  for (Vector.Element elem : userRatings.nonZeroes()) {
    itemID.set(elem.index());
    itemVector.setQuick(column, elem.get());
    itemVectorWritable.set(itemVector);
    ctx.write(itemID, itemVectorWritable);
    // reset vector for reuse
    itemVector.setQuick(elem.index(), 0.0);
  }

  ctx.getCounter(Elements.USER_RATINGS_USED).increment(numElementsAfterSampling);
  ctx.getCounter(Elements.USER_RATINGS_NEGLECTED)
      .increment(numElementsBeforeSampling - numElementsAfterSampling);
}
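This variant differs from the previous mapper only in capping very long user histories before the transpose: Vectors.maybeSample keeps at most sampleSize non-zero entries, and the two counters record how many ratings were used versus neglected. A hedged sketch of what such a cap can look like (this is our approximation, not the project's Vectors.maybeSample):

import java.util.Random;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class SamplingSketch {
  // keep each non-zero with probability sampleSize / numNonZeros, roughly capping the count
  static Vector maybeSample(Vector original, int sampleSize, Random random) {
    int nonZeros = original.getNumNondefaultElements();
    if (nonZeros <= sampleSize) {
      return original; // already small enough, nothing to do
    }
    double keepProbability = (double) sampleSize / nonZeros;
    Vector sampled = new RandomAccessSparseVector(original.size(), sampleSize);
    for (Element e : original.nonZeroes()) {
      if (random.nextDouble() < keepProbability) {
        sampled.setQuick(e.index(), e.get());
      }
    }
    return sampled;
  }
}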
From source file: org.qcri.pca.MeanAndSpanJob.java
/**
 * Replaces Vector.assign to allow an optimization for ZeroIndifferent
 * functions: such a function leaves its first argument unchanged when the
 * second is zero, so only the non-zero elements of the other vector need
 * to be visited.
 *
 * @param vector
 *          the vector to be updated
 * @param other
 *          the other vector
 * @param function
 *          the function that operates on elements of the two vectors
 * @return the modified vector
 */
static public Vector vectorAssign(Vector vector, Vector other, ZeroIndifferentFunc function) {
  if (vector.size() != other.size()) {
    throw new CardinalityException(vector.size(), other.size());
  }
  // special case: iterate only over the non-zero elements of the vector to add
  Iterator<Element> it = other.nonZeroes().iterator();
  Element e;
  while (it.hasNext() && (e = it.next()) != null) {
    double val = vector.getQuick(e.index());
    double newVal = function.apply(val, e.get());
    vector.setQuick(e.index(), newVal);
  }
  return vector;
}
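Addition is the textbook zero-indifferent case: x + 0 = x, so skipping the zeros of the other vector changes nothing. A minimal demonstration using Mahout's stock Functions.PLUS in place of the project's ZeroIndifferentFunc (the demo class is ours):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.function.Functions;

final class ZeroIndifferentDemo {
  static Vector addSparse(Vector vector, Vector other) {
    // since x + 0 = x, only the non-zeros of 'other' can change 'vector'
    for (Element e : other.nonZeroes()) {
      vector.setQuick(e.index(), Functions.PLUS.apply(vector.getQuick(e.index()), e.get()));
    }
    return vector;
  }

  public static void main(String[] args) {
    Vector dense = new DenseVector(new double[] {1, 1, 1, 1});
    Vector sparse = new RandomAccessSparseVector(4);
    sparse.setQuick(2, 5.0);
    addSparse(dense, sparse); // entry 2 becomes 6.0, all others stay 1.0
  }
}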
From source file: org.qcri.pca.Norm2Job.java
/**
 * To compute the norm2 of a sparse matrix, iterate over the sparse items and
 * sum the squares of the differences. After processing each row, add the sum
 * of the mean squares of the zero elements that were skipped in the sparse
 * iteration.
 *
 * @param sparseVector
 *          the sparse vector of data
 * @param meanVector
 *          the vector of means
 * @param meanSquareSum
 *          sum of the squares of all the means, including for zero and
 *          non-zero elements
 * @return the norm2 contribution of this row
 */
static double norm2OfUncentralizedSparseVector(Vector sparseVector, DenseVector meanVector,
    double meanSquareSum) {
  double norm2 = 0;
  double meanSquareSumOfZeroElements = meanSquareSum;
  Iterator<Vector.Element> iterator = sparseVector.nonZeroes().iterator();
  while (iterator.hasNext()) {
    Vector.Element element = iterator.next();
    double v = element.get();
    double mean = meanVector.get(element.index());
    double diff = v - mean;
    diff *= diff;
    // cancel the effect of the non-zero element in meanSquareSum
    meanSquareSumOfZeroElements -= mean * mean;
    norm2 += diff;
  }
  // for all zero items, the following adds the sum of their mean squares
  norm2 += meanSquareSumOfZeroElements;
  return norm2;
}
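The bookkeeping is easier to see as an identity (our notation): for a row x with mean vector \mu and meanSquareSum = \sum_i \mu_i^2,

\[
\|x-\mu\|^2
  = \sum_{i:\,x_i \neq 0} (x_i-\mu_i)^2 + \sum_{i:\,x_i = 0} \mu_i^2
  = \sum_{i:\,x_i \neq 0} (x_i-\mu_i)^2 + \Bigl( \sum_i \mu_i^2 - \sum_{i:\,x_i \neq 0} \mu_i^2 \Bigr)
\]

so each visited non-zero adds its squared difference and subtracts its \mu_i^2 from the running meanSquareSumOfZeroElements; the leftover is exactly the contribution of the skipped zeros.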
From source file: org.qcri.pca.NormalizeJob.java
static void sparseVectorAssign(Vector mainV, final Vector otherV, DoubleDoubleFunction function) {
  java.util.Vector<IndexValue> newZeroElements = new java.util.Vector<IndexValue>();
  Iterator<Vector.Element> nonZeroElements = mainV.nonZeroes().iterator();
  while (nonZeroElements.hasNext()) {
    Vector.Element e = nonZeroElements.next();
    double res = function.apply(e.get(), otherV.getQuick(e.index()));
    if (res != 0) {
      mainV.setQuick(e.index(), res);
    } else {
      // writing a zero would remove the entry and affect the iterator;
      // defer it until after the iteration
      newZeroElements.add(new IndexValue(e.index(), res));
    }
  }
  for (IndexValue iv : newZeroElements) {
    mainV.setQuick(iv.index, iv.value);
  }
}
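The deferred list exists because the nonZeroes() iterator is backed by the vector's own index map: writing a zero through setQuick can remove an entry and break the in-flight iteration, while overwriting an existing entry with a non-zero value is structurally safe. For illustration, this is the naive version the code above avoids (our sketch):

import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

final class UnsafeAssign {
  static void subtractInPlace(Vector mainV, Vector otherV) {
    for (Element e : mainV.nonZeroes()) {
      double res = e.get() - otherV.getQuick(e.index());
      // DANGER: when res == 0 this may delete the entry out from under the
      // iterator; collecting zero results and applying them after the loop
      // (as sparseVectorAssign does) sidesteps the problem
      mainV.setQuick(e.index(), res);
    }
  }
}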
From source file: org.qcri.pca.ReconstructionErrJob.java
static void denseVectorPlusAbsSparseVector(DenseVector denseVector, Vector sparseVector) {
  Iterator<Vector.Element> nonZeroElements = sparseVector.nonZeroes().iterator();
  while (nonZeroElements.hasNext()) {
    Vector.Element e = nonZeroElements.next();
    int index = e.index();
    double v = e.get();
    double prevV = denseVector.getQuick(index);
    denseVector.setQuick(index, prevV + Math.abs(v));
  }
}
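Called once per row, this accumulates column-wise sums of absolute values into denseVector. A small usage sketch, assuming the call site sits in the same package since the method is package-private (the vector contents are ours):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

final class AbsSumDemo {
  public static void main(String[] args) {
    DenseVector sums = new DenseVector(4); // starts as all zeros
    Vector row = new RandomAccessSparseVector(4);
    row.setQuick(1, -2.0);
    row.setQuick(3, 0.5);
    ReconstructionErrJob.denseVectorPlusAbsSparseVector(sums, row);
    // sums now holds {0.0, 2.0, 0.0, 0.5}
  }
}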
From source file: org.qcri.sparkpca.SparkPCA.java
/**
 * Compute principal component analysis where the input is a path for a Hadoop sequence file
 * <IntWritable key, VectorWritable value>.
 *
 * @param sc
 *          Spark context that contains the configuration parameters and represents the connection
 *          to the cluster (used to create RDDs, accumulators and broadcast variables on that cluster)
 * @param inputPath
 *          Path to the sequence file that represents the input matrix
 * @param outputPath
 *          Path where output is written
 * @param nRows
 *          Number of rows in the input matrix
 * @param nCols
 *          Number of columns in the input matrix
 * @param nPCs
 *          Number of desired principal components
 * @param errRate
 *          The sampling rate that is used for computing the reconstruction error
 * @param maxIterations
 *          Maximum number of iterations before terminating
 * @param computeProjectedMatrix
 *          Flag controlling whether the projected matrix is also computed
 * @return Matrix of size nCols x nPCs holding the desired principal components
 */
public static org.apache.spark.mllib.linalg.Matrix computePrincipalComponents(JavaSparkContext sc,
    String inputPath, String outputPath, final int nRows, final int nCols, final int nPCs,
    final double errRate, final int maxIterations, final int computeProjectedMatrix) {

  // read from sequence file
  JavaPairRDD<IntWritable, VectorWritable> seqVectors = sc.sequenceFile(inputPath, IntWritable.class,
      VectorWritable.class);

  // convert the sequence file to an RDD<org.apache.spark.mllib.linalg.Vector> of Vectors
  JavaRDD<org.apache.spark.mllib.linalg.Vector> vectors = seqVectors
      .map(new Function<Tuple2<IntWritable, VectorWritable>, org.apache.spark.mllib.linalg.Vector>() {

        public org.apache.spark.mllib.linalg.Vector call(Tuple2<IntWritable, VectorWritable> arg0)
            throws Exception {
          org.apache.mahout.math.Vector mahoutVector = arg0._2.get();
          Iterator<Element> elements = mahoutVector.nonZeroes().iterator();
          ArrayList<Tuple2<Integer, Double>> tupleList = new ArrayList<Tuple2<Integer, Double>>();
          while (elements.hasNext()) {
            Element e = elements.next();
            if (e.index() >= nCols || e.get() == 0) {
              continue;
            }
            Tuple2<Integer, Double> tuple = new Tuple2<Integer, Double>(e.index(), e.get());
            tupleList.add(tuple);
          }
          org.apache.spark.mllib.linalg.Vector sparkVector = Vectors.sparse(nCols, tupleList);
          return sparkVector;
        }
      }).persist(StorageLevel.MEMORY_ONLY_SER());

  return computePrincipalComponents(sc, vectors, outputPath, nRows, nCols, nPCs, errRate, maxIterations,
      computeProjectedMatrix);
}
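The core of the map function is converting a Mahout sparse vector into an MLlib sparse vector through nonZeroes(); factored out of Spark, that step alone might look like this (the helper class and method name are ours):

import java.util.ArrayList;

import org.apache.mahout.math.Vector.Element;
import org.apache.spark.mllib.linalg.Vectors;

import scala.Tuple2;

final class VectorConversion {
  static org.apache.spark.mllib.linalg.Vector toSparkSparse(
      org.apache.mahout.math.Vector mahoutVector, int nCols) {
    ArrayList<Tuple2<Integer, Double>> tuples = new ArrayList<Tuple2<Integer, Double>>();
    for (Element e : mahoutVector.nonZeroes()) {
      // drop indices beyond the declared width as well as explicitly stored zeros
      if (e.index() < nCols && e.get() != 0) {
        tuples.add(new Tuple2<Integer, Double>(e.index(), e.get()));
      }
    }
    return Vectors.sparse(nCols, tuples);
  }
}

The persist(StorageLevel.MEMORY_ONLY_SER()) in the original keeps the converted RDD cached in serialized form, which matters because the iterative PCA passes reuse it.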