Example usage for org.apache.mahout.math Vector getNumNondefaultElements

List of usage examples for org.apache.mahout.math Vector getNumNondefaultElements

Introduction

On this page you can find example usages of org.apache.mahout.math Vector getNumNondefaultElements.

Prototype

int getNumNondefaultElements();

Document

Return the number of values in the recipient which are not the default value.
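
For a quick illustration, here is a minimal, self-contained sketch (the class name NumNondefaultElementsSketch is made up for this page) showing how the count differs between a sparse and a dense vector:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class NumNondefaultElementsSketch {
    public static void main(String[] args) {
        // Sparse vector of cardinality 10: only explicitly stored entries count.
        Vector sparse = new RandomAccessSparseVector(10);
        sparse.setQuick(2, 1.0);
        sparse.setQuick(7, 3.5);
        System.out.println(sparse.getNumNondefaultElements()); // prints 2

        // Dense vector: every slot is stored, so the count reflects all positions
        // (in most Mahout versions this is simply size()).
        Vector dense = new DenseVector(new double[] { 0.0, 1.0, 2.0 });
        System.out.println(dense.getNumNondefaultElements());
    }
}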

Usage

From source file:de.isabeldrostfromm.sof.naive.VectoriserTest.java

License:Open Source License

@Test
public void testBodyVectorisation2Terms() {
    Vectoriser vectorise = new Vectoriser();
    Document doc = Document.of("first second", "", "", 0.0, new HashSet<String>());
    Vector vec = vectorise.vectorise(doc);
    assertEquals("Adding one term should result in two dimensions set to one.", 4,
            vec.getNumNondefaultElements());

}

From source file:de.isabeldrostfromm.sof.util.VectorsTest.java

License:Open Source License

@Test
@Repeat(iterations = 10)
public void testCreation() {
    Vector vec = randomVector();
    double[] entries = new double[vec.getNumNondefaultElements()];
    int index = 0;
    for (Vector.Element e : vec) {
        entries[index] = e.get();
        index++;
    }
    Vector result = Vectors.newSequentialAccessSparseVector(entries);
    assertEquals("Original vector should have same length as the one created from its entries.", vec.norm(2),
            result.norm(2), 0.0001);
}

From source file:de.tuberlin.dima.cuttlefish.TrainingDataReader.java

License:Open Source License

public static void main(String[] args) {

    //-----------------------------------------------------------------------------

    String documentVectorsFile = "/home/ssc/Desktop/cuttlefish/output/vectors/documentVectors.seq";

    //-----------------------------------------------------------------------------

    Configuration conf = new Configuration();

    int n = 0;

    for (Pair<IDAndCodes, VectorWritable> labeledArticle : new SequenceFileIterable<IDAndCodes, VectorWritable>(
            new Path(documentVectorsFile), conf)) {

        System.out.println("ID: " + labeledArticle.getFirst().id());

        Vector features = labeledArticle.getSecond().get();

        System.out.println("Features: " + features.getNumNondefaultElements() + " of " + features.size());

        Multimap<String, String> codes = labeledArticle.getFirst().codes();
        for (Map.Entry<String, String> codeEntry : codes.entries()) {
            System.out.println("\t" + codeEntry.getKey() + "=" + codeEntry.getValue());
        }

        if (n++ == 10) {
            break;
        }
    }

}

From source file:edu.rosehulman.mahout.classifier.naivebayes.NaiveBayesModel.java

License:Apache License

public NaiveBayesModel(Matrix weightMatrix, Vector weightsPerFeature, Vector weightsPerLabel,
        Vector thetaNormalizer, float alphaI) {
    this.weightsPerLabelAndFeature = weightMatrix;
    this.weightsPerFeature = weightsPerFeature;
    this.weightsPerLabel = weightsPerLabel;
    this.perlabelThetaNormalizer = thetaNormalizer;
    this.numFeatures = weightsPerFeature.getNumNondefaultElements();
    this.totalWeightSum = weightsPerLabel.zSum();
    this.alphaI = alphaI;
    //    this.minThetaNormalizer = thetaNormalizer.maxValue();
}

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());

            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:mlbench.bayes.train.WeightSummer.java

License:Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
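        // O-side (map-like) tasks: read document vectors from the assigned HDFS splits
        // and accumulate per-feature and per-label weight sums before sending them on.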

        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, rank);
        Vector weightsPerFeature = null;
        Vector weightsPerLabel = new DenseVector(labNum);

        for (int i = 0; i < inputs.length; i++) {
            FileSplit fsplit = inputs[i];
            SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                    fsplit);
            IntWritable index = kvrr.createKey();
            VectorWritable value = kvrr.createValue();
            while (kvrr.next(index, value)) {
                Vector instance = value.get();
                if (weightsPerFeature == null) {
                    weightsPerFeature = new RandomAccessSparseVector(instance.size(),
                            instance.getNumNondefaultElements());
                }

                int label = index.get();
                weightsPerFeature.assign(instance, Functions.PLUS);
                weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
            }
        }
        if (weightsPerFeature != null) {
            MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
            MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
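        // A-side (reduce-like) tasks: receive the partial sums, merge vectors that
        // share a key, and write the merged weights to a SequenceFile for the model build.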
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDirW);
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(Text.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);

        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<Text, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }

        Text key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (Text) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }

            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }

        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }

        MPI_D.COMM_BIPARTITE_A.Barrier();
        if (rank == 0) {
            Path resOut = new Path(outDir);
            NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config);
            naiveBayesModel.serialize(resOut, config);
        }
    }

    MPI_D.Finalize();
}

From source file:nl.gridline.zieook.inx.movielens.UserVectorSplitterMapper.java

License:Apache License

private Vector maybePruneUserVector(Vector userVector) {
    if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {
        return userVector;
    }

    float smallestLargeValue = findSmallestLargeValue(userVector);

    // "Blank out" small-sized prefs to reduce the amount of partial products
    // generated later. They're not zeroed, but NaN-ed, so they come through
    // and can be used to exclude these items from prefs.
    Iterator<Vector.Element> it = userVector.iterateNonZero();
    while (it.hasNext()) {
        Vector.Element e = it.next();
        float absValue = Math.abs((float) e.get());
        if (absValue < smallestLargeValue) {
            e.set(Float.NaN);
        }
    }

    return userVector;
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ALS.java

License:Apache License

public static Vector solveExplicit(VectorWritable ratingsWritable, OpenIntObjectHashMap<Vector> uOrM,
        double lambda, int numFeatures) {
    Vector ratings = ratingsWritable.get();

    List<Vector> featureVectors = new ArrayList<>(ratings.getNumNondefaultElements());
    for (Vector.Element e : ratings.nonZeroes()) {
        int index = e.index();
        featureVectors.add(uOrM.get(index));
    }

    return AlternatingLeastSquaresSolver.solve(featureVectors, ratings, lambda, numFeatures);
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));

    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
    * compute the factorization A = U M'
    *
    * where A (users x items) is the matrix of known ratings
    *           U (users x features) is the representation of users in the feature space
    *           M (items x features) is the representation of items in the feature space
    */

    if (usesLongIDs) {
        Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
        mapUsers.waitForCompletion(true);

        Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
        mapItems.waitForCompletion(true);
    }

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    int numItems = averageRatings.getNumNondefaultElements();
    int numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                currentIteration, "U", numItems);
        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration), currentIteration,
                "M", numUsers);
    }

    return 0;
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.als.PredictionMapper.java

License:Apache License

@Override
protected void map(IntWritable userIndexWritable, VectorWritable ratingsWritable, Context ctx)
        throws IOException, InterruptedException {

    Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> uAndM = getSharedInstance();
    OpenIntObjectHashMap<Vector> U = uAndM.getFirst();
    OpenIntObjectHashMap<Vector> M = uAndM.getSecond();

    Vector ratings = ratingsWritable.get();
    int userIndex = userIndexWritable.get();
    final OpenIntHashSet alreadyRatedItems = new OpenIntHashSet(ratings.getNumNondefaultElements());

    for (Vector.Element e : ratings.nonZeroes()) {
        alreadyRatedItems.add(e.index());
    }

    final TopItemsQueue topItemsQueue = new TopItemsQueue(recommendationsPerUser);
    final Vector userFeatures = U.get(userIndex);

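    // Score every item the user has not rated yet with the dot product of the
    // user and item feature vectors, keeping only the top-N predictions.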
    M.forEachPair(new IntObjectProcedure<Vector>() {
        @Override
        public boolean apply(int itemID, Vector itemFeatures) {
            if (!alreadyRatedItems.contains(itemID)) {
                double predictedRating = userFeatures.dot(itemFeatures);

                MutableRecommendedItem top = topItemsQueue.top();
                if (predictedRating > top.getValue()) {
                    top.set(itemID, (float) predictedRating);
                    topItemsQueue.updateTop();
                }
            }
            return true;
        }
    });

    List<RecommendedItem> recommendedItems = topItemsQueue.getTopItems();

    if (!recommendedItems.isEmpty()) {

        // cap predictions to maxRating
        for (RecommendedItem topItem : recommendedItems) {
            ((MutableRecommendedItem) topItem).capToMaxValue(maxRating);
        }

        if (usesLongIDs) {
            long userID = userIDIndex.get(userIndex);
            userIDWritable.set(userID);

            for (RecommendedItem topItem : recommendedItems) {
                // remap item IDs
                long itemID = itemIDIndex.get((int) topItem.getItemID());
                ((MutableRecommendedItem) topItem).setItemID(itemID);
            }

        } else {
            userIDWritable.set(userIndex);
        }

        recommendations.set(recommendedItems);
        ctx.write(userIDWritable, recommendations);
    }
}