List of usage examples for org.apache.mahout.math Vector getQuick
double getQuick(int index);
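Unlike get(int), getQuick reads the element at the given index without bounds checking, which is why it shows up in the tight loops throughout the examples below; the caller must guarantee 0 <= index < size(). A minimal sketch of the pattern (the driver class GetQuickDemo is invented for illustration):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class GetQuickDemo {
  public static void main(String[] args) {
    Vector v = new DenseVector(new double[] { 1.0, 2.0, 3.0 });
    double sum = 0.0;
    // The loop bound comes from v.size(), so the unchecked access is safe.
    for (int i = 0; i < v.size(); i++) {
      sum += v.getQuick(i);
    }
    System.out.println(sum); // 6.0
  }
}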
From source file:com.cloudera.science.ml.core.vectors.Vectors.java
License:Open Source License
/**
 * Converts the given {@code Vector} into a {@code double[]}.
 *
 * @param v The vector to convert
 * @return The resulting array of values
 */
public static double[] toArray(Vector v) {
  double[] ret = new double[v.size()];
  for (int i = 0; i < ret.length; i++) {
    ret[i] = v.getQuick(i);
  }
  return ret;
}
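A short usage sketch for the helper above (the driver class ToArrayDemo is invented for illustration):

import com.cloudera.science.ml.core.vectors.Vectors;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class ToArrayDemo {
  public static void main(String[] args) {
    Vector dense = new DenseVector(new double[] { 0.5, 1.5, 2.5 });
    double[] values = Vectors.toArray(dense);
    System.out.println(java.util.Arrays.toString(values)); // [0.5, 1.5, 2.5]
  }
}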
From source file:com.cloudera.science.ml.kmeans.parallel.CentersIndex.java
License:Open Source License
private BitSet index(Vector vec) {
  double[] prod = new double[projectionBits];
  if (vec.isDense()) {
    for (int i = 0; i < vec.size(); i++) {
      double v = vec.getQuick(i);
      if (v != 0.0) {
        for (int j = 0; j < projectionBits; j++) {
          prod[j] += v * projection[i + j * dimensions];
        }
      }
    }
  } else {
    Iterator<Vector.Element> iter = vec.iterateNonZero();
    while (iter.hasNext()) {
      Vector.Element e = iter.next();
      for (int j = 0; j < projectionBits; j++) {
        // Accumulate across all non-zero elements, mirroring the dense branch;
        // plain assignment here would overwrite earlier contributions.
        prod[j] += e.get() * projection[e.index() + j * dimensions];
      }
    }
  }
  BitSet bitset = new BitSet(projectionBits);
  for (int i = 0; i < projectionBits; i++) {
    if (prod[i] > 0.0) {
      bitset.set(i);
    }
  }
  return bitset;
}
From source file:com.cloudera.science.ml.kmeans.parallel.CentersIndex.java
License:Open Source License
private static double dot(Vector vec, double[] p) {
  double dot = 0;
  if (vec.isDense()) {
    for (int i = 0; i < p.length; i++) {
      dot += vec.getQuick(i) * p[i];
    }
  } else {
    Iterator<Vector.Element> iter = vec.iterateNonZero();
    while (iter.hasNext()) {
      Vector.Element e = iter.next();
      dot += e.get() * p[e.index()];
    }
  }
  return dot;
}
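For reference, the loop above computes the same value as vec.dot(new DenseVector(p)) from Mahout's own Vector API; the hand-rolled version simply avoids allocating the DenseVector wrapper on every call.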
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) {
  // First calculate p(topic|term,document) for all terms in original and all topics,
  // using p(term|topic) and p(topic|doc).
  pTopicGivenTerm(original, topics, docTopicModel);
  normalizeByTopic(docTopicModel);
  // Now multiply, term-by-term, by the document, to get the weighted distribution of
  // term-topic pairs from this document.
  Iterator<Vector.Element> it = original.iterateNonZero();
  while (it.hasNext()) {
    Vector.Element e = it.next();
    for (int x = 0; x < numTopics; x++) {
      Vector docTopicModelRow = docTopicModel.viewRow(x);
      docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get());
    }
  }
  // Now recalculate p(topic|doc) by summing contributions from all of pTopicGivenTerm.
  topics.assign(0.0);
  for (int x = 0; x < numTopics; x++) {
    topics.set(x, docTopicModel.viewRow(x).norm(1));
  }
  // Now renormalize so that sum_x(p(x|doc)) = 1.
  topics.assign(Functions.mult(1 / topics.norm(1)));
}
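In equation form, the steps the comments describe are roughly the following (notation inferred from the comments; n_{w,d} is the count of term w in the document):

p(x \mid w, d) \propto p(w \mid x)\, p(x \mid d)

p(x \mid d) \leftarrow \frac{\sum_{w} n_{w,d}\, p(x \mid w, d)}{\sum_{x'} \sum_{w} n_{w,d}\, p(x' \mid w, d)}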
From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
    throws IOException, InterruptedException {
  Iterator<StringTuple> it = values.iterator();
  if (!it.hasNext()) {
    return;
  }
  StringTuple value = it.next();
  Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
  if (maxNGramSize >= 2) {
    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
    try {
      do {
        String term = sf.getAttribute(CharTermAttribute.class).toString();
        if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());
      sf.end();
    } finally {
      Closeables.closeQuietly(sf);
    }
  } else {
    for (String term : value.getEntries()) {
      if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
        int termId = dictionary.get(term);
        vector.setQuick(termId, vector.getQuick(termId) + 1);
      }
    }
  }
  if (sequentialAccess) {
    vector = new SequentialAccessSparseVector(vector);
  }
  if (namedVector) {
    vector = new NamedVector(vector, key.toString());
  }
  // If the vector has no non-zero entries (nothing in the dictionary),
  // don't waste space sending it to disk.
  if (vector.getNumNondefaultElements() > 0) {
    VectorWritable vectorWritable = new VectorWritable(vector);
    context.write(key, vectorWritable);
  } else {
    context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
  }
}
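The setQuick(getQuick(...) + 1) pair above is the usual read-modify-write increment for sparse term counting. Newer Mahout releases also expose incrementQuick, which collapses the two calls into one (a sketch, assuming a Mahout version that provides it):

vector.incrementQuick(termId, 1); // same effect as setQuick(termId, getQuick(termId) + 1)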
From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java
License:Open Source License
private static BasicDBList listFromMahoutVector(Vector vec, String prefix, BasicDBObject element) {
  if (vec instanceof NamedVector) {
    element.put(prefix + "Name", ((NamedVector) vec).getName());
  }
  BasicDBList dbl2 = new BasicDBList();
  if (vec.isDense()) {
    int nSize = vec.size();
    dbl2.ensureCapacity(nSize);
    for (int i = 0; i < nSize; ++i) {
      dbl2.add(vec.getQuick(i));
    }
  } else { // sparse, write as a set in the format [{int:double}]
    Iterator<org.apache.mahout.math.Vector.Element> elIt = vec.iterateNonZero();
    while (elIt.hasNext()) {
      BasicDBObject el2 = new BasicDBObject();
      org.apache.mahout.math.Vector.Element el = elIt.next();
      el2.put("k", el.index());
      el2.put("v", el.get());
      dbl2.add(el2);
    }
  }
  return dbl2;
}
From source file:com.innometrics.integration.app.recommender.ml.als.AlternatingLeastSquaresSolver.java
License:Apache License
static Matrix createMiIi(Iterable<Vector> featureVectors, int numFeatures) {
  double[][] MiIi = new double[numFeatures][Iterables.size(featureVectors)];
  int n = 0;
  for (Vector featureVector : featureVectors) {
    for (int m = 0; m < numFeatures; m++) {
      MiIi[m][n] = featureVector.getQuick(m);
    }
    n++;
  }
  return new DenseMatrix(MiIi, true);
}
From source file:com.innometrics.integration.app.recommender.ml.als.ImplicitFeedbackAlternatingLeastSquaresSolver.java
License:Apache License
public Matrix getYtransposeY(final OpenIntObjectHashMap<Vector> Y) {
  ExecutorService queue = Executors.newFixedThreadPool(numTrainingThreads);
  if (log.isInfoEnabled()) {
    log.info("Starting the computation of Y'Y");
  }
  long startTime = System.nanoTime();
  final IntArrayList indexes = Y.keys();
  final int numIndexes = indexes.size();
  final double[][] YtY = new double[numFeatures][numFeatures];

  // Compute Y'Y by dot products between the 'columns' of Y
  for (int i = 0; i < numFeatures; i++) {
    for (int j = i; j < numFeatures; j++) {
      final int ii = i;
      final int jj = j;
      queue.execute(new Runnable() {
        @Override
        public void run() {
          double dot = 0;
          for (int k = 0; k < numIndexes; k++) {
            Vector row = Y.get(indexes.getQuick(k));
            dot += row.getQuick(ii) * row.getQuick(jj);
          }
          YtY[ii][jj] = dot;
          if (ii != jj) {
            YtY[jj][ii] = dot;
          }
        }
      });
    }
  }
  queue.shutdown();
  try {
    queue.awaitTermination(1, TimeUnit.DAYS);
  } catch (InterruptedException e) {
    log.error("Error during Y'Y queue shutdown", e);
    throw new RuntimeException("Error during Y'Y queue shutdown");
  }
  if (log.isInfoEnabled()) {
    log.info("Computed Y'Y in " + (System.nanoTime() - startTime) / 1000000.0 + " ms");
  }
  return new DenseMatrix(YtY, true);
}
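Each scheduled task computes one entry of the Gram matrix, (Y'Y)_{ij} = sum_k Y_{ki} * Y_{kj}, as a dot product between columns i and j of Y. Only the upper triangle (j >= i) is submitted to the pool; the mirror entry YtY[jj][ii] is filled in by symmetry, roughly halving the work.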
From source file:com.scaleunlimited.classify.model.HashedFeaturesLibLinearModel.java
License:Apache License
/**
 * Given a map from term to count, generate a feature array using
 * _maxFeatureIndex as the max index, based on the hash of the term.
 *
 * @param terms
 * @return array of LibLinear features
 */
private Feature[] getFeatures(Map<String, Integer> terms) {
  // First create the vector, where each term's index is the hash
  // of the term, and the value is the term count.
  Map<Integer, Integer> collisionCount = new HashMap<>();
  Vector v = new RandomAccessSparseVector(_maxFeatureIndex);
  for (String term : terms.keySet()) {
    int index = calcHashJoaat(term, _maxFeatureIndex);
    double curValue = v.getQuick(index);
    if (_averageCollisions && (curValue != 0.0)) {
      Integer curCollisionCount = collisionCount.get(index);
      if (curCollisionCount == null) {
        collisionCount.put(index, 2); // number of values we'll need to divide by
      } else {
        collisionCount.put(index, curCollisionCount + 1);
      }
      v.setQuick(index, curValue + terms.get(term));
    } else {
      v.setQuick(index, terms.get(term));
    }
  }

  // Now adjust the vector for collisions, if needed.
  if (_averageCollisions && !collisionCount.isEmpty()) {
    for (Integer index : collisionCount.keySet()) {
      double curValue = v.getQuick(index);
      v.setQuick(index, curValue / collisionCount.get(index));
    }
  }

  // Apply the term vector normalizer.
  getNormalizer().normalize(v);

  List<FeatureNode> features = new ArrayList<FeatureNode>(terms.size());
  for (Element e : v.nonZeroes()) {
    features.add(new FeatureNode(e.index() + 1, e.get()));
  }

  // We need to sort by increasing index.
  Collections.sort(features, new Comparator<FeatureNode>() {
    @Override
    public int compare(FeatureNode o1, FeatureNode o2) {
      return o1.index - o2.index;
    }
  });
  return features.toArray(new FeatureNode[features.size()]);
}
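As a concrete example of the averaging branch: with _averageCollisions set, if two terms with counts 3 and 5 hash to the same index, the vector briefly holds 3 + 5 = 8 there, and the adjustment pass divides by the recorded collision count of 2, leaving (3 + 5) / 2 = 4 at that index before normalization.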
From source file:com.scaleunlimited.classify.model.RawFeaturesLibLinearModel.java
License:Apache License
private FeatureNode[] vectorToFeatureNodes(Vector vector) {
  int featureCount = vector.getNumNondefaultElements();
  FeatureNode[] x = new FeatureNode[featureCount];
  int arrayIndex = 0;
  int cardinality = vector.size();
  for (int i = 0; i < cardinality; i++) {
    double value = vector.getQuick(i);
    if (value != 0.0) {
      // (At least) Linear.train assumes that FeatureNode.index is 1-based,
      // and we don't really have to map back to our term indexes, so just add one. YUCK!
      x[arrayIndex++] = new FeatureNode(i + 1, value);
    }
  }
  // Note: this assumes the vector stores no explicit zeros; if it does,
  // getNumNondefaultElements overcounts and the trailing slots of x stay null.
  return x;
}