List of usage examples for org.apache.mahout.math.Vector#getNumNondefaultElements()
int getNumNondefaultElements();
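getNumNondefaultElements() returns the number of entries whose value differs from the vector's default (zero for Mahout's dense and sparse implementations), so it is a cheap way to check how densely a vector is populated before sizing dependent structures. A minimal sketch of that usage, assuming only that the Mahout math module is on the classpath (the class name NonDefaultCountExample is illustrative):

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class NonDefaultCountExample {
    public static void main(String[] args) {
        // A sparse vector with cardinality 100; only two cells are explicitly set.
        Vector vec = new RandomAccessSparseVector(100);
        vec.set(3, 1.5);
        vec.set(42, -2.0);

        // Prints "2 of 100": two stored (non-default) cells out of 100 dimensions.
        System.out.println(vec.getNumNondefaultElements() + " of " + vec.size());
    }
}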
From source file:de.isabeldrostfromm.sof.naive.VectoriserTest.java
License:Open Source License
@Test
public void testBodyVectorisation2Terms() {
    Vectoriser vectorise = new Vectoriser();
    Document doc = Document.of("first second", "", "", 0.0, new HashSet<String>());
    Vector vec = vectorise.vectorise(doc);
    assertEquals("Adding one term should result in two dimensions set to one.", 4,
            vec.getNumNondefaultElements());
}
From source file:de.isabeldrostfromm.sof.util.VectorsTest.java
License:Open Source License
@Test
@Repeat(iterations = 10)
public void testCreation() {
    Vector vec = randomVector();
    double[] entries = new double[vec.getNumNondefaultElements()];
    int index = 0;
    for (Vector.Element e : vec) {
        entries[index] = e.get();
        index++;
    }
    Vector result = Vectors.newSequentialAccessSparseVector(entries);
    assertEquals("Original vector should have same length as the one created from its entries.",
            vec.norm(2), result.norm(2), 0.0001);
}
From source file:de.tuberlin.dima.cuttlefish.TrainingDataReader.java
License:Open Source License
public static void main(String[] args) {
    //-----------------------------------------------------------------------------
    String documentVectorsFile = "/home/ssc/Desktop/cuttlefish/output/vectors/documentVectors.seq";
    //-----------------------------------------------------------------------------

    Configuration conf = new Configuration();
    int n = 0;
    for (Pair<IDAndCodes, VectorWritable> labeledArticle :
            new SequenceFileIterable<IDAndCodes, VectorWritable>(new Path(documentVectorsFile), conf)) {
        System.out.println("ID: " + labeledArticle.getFirst().id());
        Vector features = labeledArticle.getSecond().get();
        System.out.println("Features: " + features.getNumNondefaultElements() + " of " + features.size());
        Multimap<String, String> codes = labeledArticle.getFirst().codes();
        for (Map.Entry<String, String> codeEntry : codes.entries()) {
            System.out.println("\t" + codeEntry.getKey() + "=" + codeEntry.getValue());
        }
        if (n++ == 10) {
            break;
        }
    }
}
From source file:edu.rosehulman.mahout.classifier.naivebayes.NaiveBayesModel.java
License:Apache License
public NaiveBayesModel(Matrix weightMatrix, Vector weightsPerFeature, Vector weightsPerLabel,
        Vector thetaNormalizer, float alphaI) {
    this.weightsPerLabelAndFeature = weightMatrix;
    this.weightsPerFeature = weightsPerFeature;
    this.weightsPerLabel = weightsPerLabel;
    this.perlabelThetaNormalizer = thetaNormalizer;
    this.numFeatures = weightsPerFeature.getNumNondefaultElements();
    this.totalWeightSum = weightsPerLabel.zSum();
    this.alphaI = alphaI;
    // this.minThetaNormalizer = thetaNormalizer.maxValue();
}
From source file:edu.rosehulman.TFPartialVectorReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }

    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file:mlbench.bayes.train.WeightSummer.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);/*from w w w . j a v a2 s. c om*/ HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); Vector weightsPerFeature = null; Vector weightsPerLabel = new DenseVector(labNum); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); IntWritable index = kvrr.createKey(); VectorWritable value = kvrr.createValue(); while (kvrr.next(index, value)) { Vector instance = value.get(); if (weightsPerFeature == null) { weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements()); } int label = index.get(); weightsPerFeature.assign(instance, Functions.PLUS); weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); } } if (weightsPerFeature != null) { MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature)); MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel)); } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDirW); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(Text.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<Text, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } Text key = null, newKey = null; VectorWritable point = null, newPoint = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (Text) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { } else if (!key.equals(newKey)) { outrw.write(key, new VectorWritable(vector)); vector = null; } if (vector == null) { vector = newPoint.get(); } else { vector.assign(newPoint.get(), Functions.PLUS); } key = newKey; point = newPoint; vals = MPI_D.Recv(); } if (newKey != null && newPoint != null) { outrw.write(key, new VectorWritable(vector)); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } MPI_D.COMM_BIPARTITE_A.Barrier(); if (rank == 0) { Path resOut = new Path(outDir); NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config); naiveBayesModel.serialize(resOut, config); } } MPI_D.Finalize(); }
From source file:nl.gridline.zieook.inx.movielens.UserVectorSplitterMapper.java
License:Apache License
private Vector maybePruneUserVector(Vector userVector) {
    if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {
        return userVector;
    }
    float smallestLargeValue = findSmallestLargeValue(userVector);

    // "Blank out" small-sized prefs to reduce the amount of partial products
    // generated later. They're not zeroed, but NaN-ed, so they come through
    // and can be used to exclude these items from prefs.
    Iterator<Vector.Element> it = userVector.iterateNonZero();
    while (it.hasNext()) {
        Vector.Element e = it.next();
        float absValue = Math.abs((float) e.get());
        if (absValue < smallestLargeValue) {
            e.set(Float.NaN);
        }
    }
    return userVector;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ALS.java
License:Apache License
public static Vector solveExplicit(VectorWritable ratingsWritable, OpenIntObjectHashMap<Vector> uOrM,
        double lambda, int numFeatures) {
    Vector ratings = ratingsWritable.get();

    List<Vector> featureVectors = new ArrayList<>(ratings.getNumNondefaultElements());
    for (Vector.Element e : ratings.nonZeroes()) {
        int index = e.index();
        featureVectors.add(uOrM.get(index));
    }

    return AlternatingLeastSquaresSolver.solve(featureVectors, ratings, lambda, numFeatures);
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));
    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
     * compute the factorization A = U M'
     *
     * where A (users x items) is the matrix of known ratings
     *       U (users x features) is the representation of users in the feature space
     *       M (items x features) is the representation of items in the feature space
     */

    if (usesLongIDs) {
        Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
        mapUsers.waitForCompletion(true);

        Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
        mapItems.waitForCompletion(true);
    }

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    int numItems = averageRatings.getNumNondefaultElements();
    int numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                currentIteration, "U", numItems);
        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                currentIteration, "M", numUsers);
    }

    return 0;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.PredictionMapper.java
License:Apache License
@Override
protected void map(IntWritable userIndexWritable, VectorWritable ratingsWritable, Context ctx)
        throws IOException, InterruptedException {

    Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> uAndM = getSharedInstance();
    OpenIntObjectHashMap<Vector> U = uAndM.getFirst();
    OpenIntObjectHashMap<Vector> M = uAndM.getSecond();

    Vector ratings = ratingsWritable.get();
    int userIndex = userIndexWritable.get();
    final OpenIntHashSet alreadyRatedItems = new OpenIntHashSet(ratings.getNumNondefaultElements());

    for (Vector.Element e : ratings.nonZeroes()) {
        alreadyRatedItems.add(e.index());
    }

    final TopItemsQueue topItemsQueue = new TopItemsQueue(recommendationsPerUser);
    final Vector userFeatures = U.get(userIndex);

    M.forEachPair(new IntObjectProcedure<Vector>() {
        @Override
        public boolean apply(int itemID, Vector itemFeatures) {
            if (!alreadyRatedItems.contains(itemID)) {
                double predictedRating = userFeatures.dot(itemFeatures);

                MutableRecommendedItem top = topItemsQueue.top();
                if (predictedRating > top.getValue()) {
                    top.set(itemID, (float) predictedRating);
                    topItemsQueue.updateTop();
                }
            }
            return true;
        }
    });

    List<RecommendedItem> recommendedItems = topItemsQueue.getTopItems();

    if (!recommendedItems.isEmpty()) {

        // cap predictions to maxRating
        for (RecommendedItem topItem : recommendedItems) {
            ((MutableRecommendedItem) topItem).capToMaxValue(maxRating);
        }

        if (usesLongIDs) {
            long userID = userIDIndex.get(userIndex);
            userIDWritable.set(userID);

            for (RecommendedItem topItem : recommendedItems) {
                // remap item IDs
                long itemID = itemIDIndex.get((int) topItem.getItemID());
                ((MutableRecommendedItem) topItem).setItemID(itemID);
            }
        } else {
            userIDWritable.set(userIndex);
        }

        recommendations.set(recommendedItems);
        ctx.write(userIDWritable, recommendations);
    }
}