List of usage examples for org.apache.mahout.math.Vector#getNumNondefaultElements()
int getNumNondefaultElements();
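getNumNondefaultElements() returns the number of entries whose value differs from the vector's default (zero for Mahout's dense and sparse implementations), so it is a cheap way to check how densely a vector is populated before sizing dependent structures. A minimal sketch of that usage, assuming only that the Mahout math module is on the classpath (the class name NonDefaultCountExample is illustrative):

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class NonDefaultCountExample {
    public static void main(String[] args) {
        // A sparse vector with cardinality 100; only two cells are explicitly set.
        Vector vec = new RandomAccessSparseVector(100);
        vec.set(3, 1.5);
        vec.set(42, -2.0);

        // Prints "2 of 100": two stored (non-default) cells out of 100 dimensions.
        System.out.println(vec.getNumNondefaultElements() + " of " + vec.size());
    }
}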
From source file:de.isabeldrostfromm.sof.naive.VectoriserTest.java
License:Open Source License
@Test
public void testBodyVectorisation2Terms() {
    Vectoriser vectorise = new Vectoriser();
    Document doc = Document.of("first second", "", "", 0.0, new HashSet<String>());
    Vector vec = vectorise.vectorise(doc);
    assertEquals("Adding one term should result in two dimensions set to one.", 4,
            vec.getNumNondefaultElements());
}
From source file:de.isabeldrostfromm.sof.util.VectorsTest.java
License:Open Source License
@Test
@Repeat(iterations = 10)
public void testCreation() {
    Vector vec = randomVector();
    double[] entries = new double[vec.getNumNondefaultElements()];
    int index = 0;
    for (Vector.Element e : vec) {
        entries[index] = e.get();
        index++;
    }
    Vector result = Vectors.newSequentialAccessSparseVector(entries);
    assertEquals("Original vector should have same length as the one created from its entries.",
            vec.norm(2), result.norm(2), 0.0001);
}
From source file:de.tuberlin.dima.cuttlefish.TrainingDataReader.java
License:Open Source License
public static void main(String[] args) {
    //-----------------------------------------------------------------------------
    String documentVectorsFile = "/home/ssc/Desktop/cuttlefish/output/vectors/documentVectors.seq";
    //-----------------------------------------------------------------------------

    Configuration conf = new Configuration();
    int n = 0;
    for (Pair<IDAndCodes, VectorWritable> labeledArticle :
            new SequenceFileIterable<IDAndCodes, VectorWritable>(new Path(documentVectorsFile), conf)) {
        System.out.println("ID: " + labeledArticle.getFirst().id());
        Vector features = labeledArticle.getSecond().get();
        System.out.println("Features: " + features.getNumNondefaultElements() + " of " + features.size());
        Multimap<String, String> codes = labeledArticle.getFirst().codes();
        for (Map.Entry<String, String> codeEntry : codes.entries()) {
            System.out.println("\t" + codeEntry.getKey() + "=" + codeEntry.getValue());
        }
        if (n++ == 10) {
            break;
        }
    }
}
From source file:edu.rosehulman.mahout.classifier.naivebayes.NaiveBayesModel.java
License:Apache License
public NaiveBayesModel(Matrix weightMatrix, Vector weightsPerFeature, Vector weightsPerLabel,
        Vector thetaNormalizer, float alphaI) {
    this.weightsPerLabelAndFeature = weightMatrix;
    this.weightsPerFeature = weightsPerFeature;
    this.weightsPerLabel = weightsPerLabel;
    this.perlabelThetaNormalizer = thetaNormalizer;
    this.numFeatures = weightsPerFeature.getNumNondefaultElements();
    this.totalWeightSum = weightsPerLabel.zSum();
    this.alphaI = alphaI;
    // this.minThetaNormalizer = thetaNormalizer.maxValue();
}
From source file:edu.rosehulman.TFPartialVectorReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }

    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file:mlbench.bayes.train.WeightSummer.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);/*from w w w . j a v a2 s. c om*/ HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); Vector weightsPerFeature = null; Vector weightsPerLabel = new DenseVector(labNum); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); IntWritable index = kvrr.createKey(); VectorWritable value = kvrr.createValue(); while (kvrr.next(index, value)) { Vector instance = value.get(); if (weightsPerFeature == null) { weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements()); } int label = index.get(); weightsPerFeature.assign(instance, Functions.PLUS); weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); } } if (weightsPerFeature != null) { MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature)); MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel)); } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDirW); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(Text.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<Text, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } Text key = null, newKey = null; VectorWritable point = null, newPoint = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (Text) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { } else if (!key.equals(newKey)) { outrw.write(key, new VectorWritable(vector)); vector = null; } if (vector == null) { vector = newPoint.get(); } else { vector.assign(newPoint.get(), Functions.PLUS); } key = newKey; point = newPoint; vals = MPI_D.Recv(); } if (newKey != null && newPoint != null) { outrw.write(key, new VectorWritable(vector)); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } MPI_D.COMM_BIPARTITE_A.Barrier(); if (rank == 0) { Path resOut = new Path(outDir); NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config); naiveBayesModel.serialize(resOut, config); } } MPI_D.Finalize(); }
From source file:nl.gridline.zieook.inx.movielens.UserVectorSplitterMapper.java
License:Apache License
private Vector maybePruneUserVector(Vector userVector) {
    if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {
        return userVector;
    }
    float smallestLargeValue = findSmallestLargeValue(userVector);

    // "Blank out" small-sized prefs to reduce the amount of partial products
    // generated later. They're not zeroed, but NaN-ed, so they come through
    // and can be used to exclude these items from prefs.
    Iterator<Vector.Element> it = userVector.iterateNonZero();
    while (it.hasNext()) {
        Vector.Element e = it.next();
        float absValue = Math.abs((float) e.get());
        if (absValue < smallestLargeValue) {
            e.set(Float.NaN);
        }
    }
    return userVector;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ALS.java
License:Apache License
public static Vector solveExplicit(VectorWritable ratingsWritable, OpenIntObjectHashMap<Vector> uOrM,
        double lambda, int numFeatures) {
    Vector ratings = ratingsWritable.get();

    List<Vector> featureVectors = new ArrayList<>(ratings.getNumNondefaultElements());
    for (Vector.Element e : ratings.nonZeroes()) {
        int index = e.index();
        featureVectors.add(uOrM.get(index));
    }

    return AlternatingLeastSquaresSolver.solve(featureVectors, ratings, lambda, numFeatures);
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));
    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
     * compute the factorization A = U M'
     *
     * where A (users x items) is the matrix of known ratings
     *       U (users x features) is the representation of users in the feature space
     *       M (items x features) is the representation of items in the feature space
     */

    if (usesLongIDs) {
        Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
        mapUsers.waitForCompletion(true);

        Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
        mapItems.waitForCompletion(true);
    }

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    int numItems = averageRatings.getNumNondefaultElements();
    int numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                currentIteration, "U", numItems);
        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                currentIteration, "M", numUsers);
    }

    return 0;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.PredictionMapper.java
License:Apache License
@Override
protected void map(IntWritable userIndexWritable, VectorWritable ratingsWritable, Context ctx)
        throws IOException, InterruptedException {

    Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> uAndM = getSharedInstance();
    OpenIntObjectHashMap<Vector> U = uAndM.getFirst();
    OpenIntObjectHashMap<Vector> M = uAndM.getSecond();

    Vector ratings = ratingsWritable.get();
    int userIndex = userIndexWritable.get();
    final OpenIntHashSet alreadyRatedItems = new OpenIntHashSet(ratings.getNumNondefaultElements());

    for (Vector.Element e : ratings.nonZeroes()) {
        alreadyRatedItems.add(e.index());
    }

    final TopItemsQueue topItemsQueue = new TopItemsQueue(recommendationsPerUser);
    final Vector userFeatures = U.get(userIndex);

    M.forEachPair(new IntObjectProcedure<Vector>() {
        @Override
        public boolean apply(int itemID, Vector itemFeatures) {
            if (!alreadyRatedItems.contains(itemID)) {
                double predictedRating = userFeatures.dot(itemFeatures);

                MutableRecommendedItem top = topItemsQueue.top();
                if (predictedRating > top.getValue()) {
                    top.set(itemID, (float) predictedRating);
                    topItemsQueue.updateTop();
                }
            }
            return true;
        }
    });

    List<RecommendedItem> recommendedItems = topItemsQueue.getTopItems();

    if (!recommendedItems.isEmpty()) {

        // cap predictions to maxRating
        for (RecommendedItem topItem : recommendedItems) {
            ((MutableRecommendedItem) topItem).capToMaxValue(maxRating);
        }

        if (usesLongIDs) {
            long userID = userIDIndex.get(userIndex);
            userIDWritable.set(userID);

            for (RecommendedItem topItem : recommendedItems) {
                // remap item IDs
                long itemID = itemIDIndex.get((int) topItem.getItemID());
                ((MutableRecommendedItem) topItem).setItemID(itemID);
            }
        } else {
            userIDWritable.set(userIndex);
        }

        recommendations.set(recommendedItems);
        ctx.write(userIDWritable, recommendations);
    }
}