List of usage examples for org.apache.mahout.math Vector size
int size();
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
/**
 * Computes sum_x sum_a (c_ai * log(p(x|i) * p(a|x))) over the document's
 * non-zero terms and returns its negation.
 *
 * @param document  term-count vector for one document (only non-zero entries matter)
 * @param docTopics current per-topic weights for this document
 * @return the (negated) log-likelihood-style perplexity contribution
 */
public double perplexity(Vector document, Vector docTopics) {
    // Smoothed normalizer for the document/topic distribution.
    double norm = docTopics.norm(1) + (docTopics.size() * alpha);
    double perplexity = 0;
    for (Iterator<Vector.Element> it = document.iterateNonZero(); it.hasNext();) {
        Vector.Element e = it.next();
        int term = e.index();
        double prob = 0;
        for (int topic = 0; topic < numTopics; topic++) {
            // p(topic | doc), smoothed by alpha.
            double topicWeight = (docTopics.get(topic) + alpha) / norm;
            // p(term | topic), smoothed by eta.
            double termWeight = (topicTermCounts.viewRow(topic).get(term) + eta)
                    / (topicSums.get(topic) + eta * numTerms);
            prob += topicWeight * termWeight;
        }
        perplexity += e.get() * Math.log(prob);
    }
    return -perplexity;
}
From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java
License:Open Source License
private static BasicDBList listFromMahoutVector(Vector vec, String prefix, BasicDBObject element) { if (vec instanceof NamedVector) { element.put(prefix + "Name", ((NamedVector) vec).getName()); }// w w w. java2s . co m BasicDBList dbl2 = new BasicDBList(); if (vec.isDense()) { int nSize = vec.size(); dbl2.ensureCapacity(nSize); for (int i = 0; i < nSize; ++i) { dbl2.add(vec.getQuick(i)); } } else { // sparse, write as a set in the format [{int:double}] Iterator<org.apache.mahout.math.Vector.Element> elIt = vec.iterateNonZero(); while (elIt.hasNext()) { BasicDBObject el2 = new BasicDBObject(); org.apache.mahout.math.Vector.Element el = elIt.next(); el2.put("k", el.index()); el2.put("v", el.get()); dbl2.add(el2); } } return dbl2; }
From source file:com.netease.news.classifier.naivebayes.NaiveBayesModel.java
License:Apache License
/**
 * Reads a serialized naive Bayes model from {@code output/naiveBayesModel.bin}
 * on the given filesystem and reconstructs it in memory.
 *
 * @param output directory containing the serialized model file
 * @param conf   Hadoop configuration used to resolve the filesystem
 * @return the validated, materialized model
 * @throws IOException if the model file cannot be read
 */
public static NaiveBayesModel materialize(Path output, Configuration conf) throws IOException {
    FileSystem fs = output.getFileSystem(conf);

    float alphaI;
    Vector weightsPerFeature;
    Vector weightsPerLabel;
    Vector perLabelThetaNormalizer;
    Matrix weightsPerLabelAndFeature;

    FSDataInputStream in = fs.open(new Path(output, "naiveBayesModel.bin"));
    try {
        alphaI = in.readFloat();
        weightsPerFeature = VectorWritable.readVector(in);
        weightsPerLabel = new DenseVector(VectorWritable.readVector(in));
        perLabelThetaNormalizer = new DenseVector(VectorWritable.readVector(in));
        // One row of label/feature weights per label, in label order.
        weightsPerLabelAndFeature = new SparseRowMatrix(weightsPerLabel.size(), weightsPerFeature.size());
        for (int label = 0; label < weightsPerLabelAndFeature.numRows(); label++) {
            weightsPerLabelAndFeature.assignRow(label, VectorWritable.readVector(in));
        }
    } finally {
        Closeables.close(in, true);
    }

    NaiveBayesModel model = new NaiveBayesModel(weightsPerLabelAndFeature, weightsPerFeature,
            weightsPerLabel, perLabelThetaNormalizer, alphaI);
    model.validate();
    return model;
}
From source file:com.netease.news.classifier.naivebayes.NaiveBayesModel.java
License:Apache License
/**
 * Reads a serialized naive Bayes model from a classpath resource and
 * reconstructs it in memory.
 *
 * @param modelfile classpath-relative path of the serialized model resource
 * @return the validated, materialized model
 * @throws IOException if the resource is missing or cannot be read
 */
public static NaiveBayesModel materializeLocal(String modelfile) throws IOException {
    ClassLoader loader = NaiveBayesModel.class.getClassLoader();
    InputStream sin = loader.getResourceAsStream(modelfile);
    // getResourceAsStream returns null for a missing resource; fail with a
    // clear message instead of an NPE on the first read.
    if (sin == null) {
        throw new IOException("Model resource not found on classpath: " + modelfile);
    }

    float alphaI;
    Vector weightsPerFeature;
    Vector weightsPerLabel;
    Vector perLabelThetaNormalizer;
    Matrix weightsPerLabelAndFeature;

    // try-with-resources replaces the manual try/finally close.
    try (DataInputStream in = new DataInputStream(sin)) {
        alphaI = in.readFloat();
        weightsPerFeature = VectorWritable.readVector(in);
        weightsPerLabel = new DenseVector(VectorWritable.readVector(in));
        perLabelThetaNormalizer = new DenseVector(VectorWritable.readVector(in));
        // One row of label/feature weights per label, in label order.
        weightsPerLabelAndFeature = new SparseRowMatrix(weightsPerLabel.size(), weightsPerFeature.size());
        for (int label = 0; label < weightsPerLabelAndFeature.numRows(); label++) {
            weightsPerLabelAndFeature.assignRow(label, VectorWritable.readVector(in));
        }
    }

    NaiveBayesModel model = new NaiveBayesModel(weightsPerLabelAndFeature, weightsPerFeature,
            weightsPerLabel, perLabelThetaNormalizer, alphaI);
    model.validate();
    return model;
}
From source file:com.netease.news.classifier.naivebayes.WeightsMapper.java
License:Apache License
@Override protected void map(IntWritable index, VectorWritable value, Context ctx) throws IOException, InterruptedException { Vector instance = value.get(); if (weightsPerFeature == null) { weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements()); }//from w ww .j a v a 2 s .c o m int label = index.get(); weightsPerFeature.assign(instance, Functions.PLUS); weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); }
From source file:com.scaleunlimited.classify.model.RawFeaturesLibLinearModel.java
License:Apache License
private FeatureNode[] vectorToFeatureNodes(Vector vector) { int featureCount = vector.getNumNondefaultElements(); FeatureNode[] x = new FeatureNode[featureCount]; int arrayIndex = 0; int cardinality = vector.size(); for (int i = 0; i < cardinality; i++) { double value = vector.getQuick(i); if (value != 0.0) { // (At least) Linear.train assumes that FeatureNode.index // is 1-based, and we don't really have to map back to our // term indexes, so just add one. YUCK! x[arrayIndex++] = new FeatureNode(i + 1, value); }/*from ww w . j av a 2 s .c o m*/ } return x; }
From source file:com.scaleunlimited.classify.vectors.BaseNormalizer.java
License:Apache License
public static void dumpTopTerms(final Vector docFrequencies, List<String> uniqueTerms) { int cardinality = docFrequencies.size(); List<Integer> sortedDocFrequencyIndexes = new ArrayList<Integer>(cardinality); for (int i = 0; i < cardinality; i++) { sortedDocFrequencyIndexes.add(i); }//from w w w.java 2 s . c o m Collections.sort(sortedDocFrequencyIndexes, new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return (int) (docFrequencies.getQuick(o2) - docFrequencies.getQuick(o1)); } }); double maxFrequency = docFrequencies.getQuick(docFrequencies.maxValueIndex()); StringBuffer topTermsReport = new StringBuffer(); for (int i = 0; i < cardinality; i++) { int index = sortedDocFrequencyIndexes.get(i); double frequency = docFrequencies.getQuick(index); if ((frequency / maxFrequency) > MIN_FREQUENCY_REPORT_RATIO) { topTermsReport.append(String.format("'%s'=%d, ", uniqueTerms.get(index), (int) frequency)); } } LOGGER.debug(topTermsReport.toString()); }
From source file:com.scaleunlimited.classify.vectors.UnitNormalizer.java
License:Apache License
@Override public void normalize(Vector vector) { double length = Math.sqrt(vector.getLengthSquared()); // Divide each vector coordinate by length, so we wind up with a unit vector. int cardinality = vector.size(); for (int j = 0; j < cardinality; j++) { double curValue = vector.getQuick(j); if (curValue > 0.0) { vector.setQuick(j, curValue / length); }/*from w ww .j av a 2 s. co m*/ } }
From source file:com.scaleunlimited.classify.vectors.VectorUtils.java
License:Apache License
/**
 * Concatenates two vectors: the result has baseVector's entries first,
 * followed by extraVector's entries shifted by baseVector's size. Only
 * non-zero values are copied into the sparse result.
 *
 * @param baseVector  vector occupying indexes [0, baseVector.size())
 * @param extraVector vector occupying the indexes after baseVector
 * @return a new sparse vector of combined cardinality
 */
public static Vector appendVectors(Vector baseVector, Vector extraVector) {
    int baseSize = baseVector.size();
    Vector result = new RandomAccessSparseVector(baseSize + extraVector.size());
    copyNonZeroInto(baseVector, result, 0);
    copyNonZeroInto(extraVector, result, baseSize);
    return result;
}

/** Copies src's non-zero entries into dest, offsetting each index. */
private static void copyNonZeroInto(Vector src, Vector dest, int offset) {
    int n = src.size();
    for (int i = 0; i < n; i++) {
        double value = src.getQuick(i);
        if (value != 0.0) {
            dest.setQuick(offset + i, value);
        }
    }
}
From source file:com.scaleunlimited.classify.vectors.VectorUtils.java
License:Apache License
public static Vector extendVector(Vector v, int extraSize) { if (extraSize == 0) { return v; }//w w w . j a v a 2 s .c o m int baseSize = v.size(); Vector result = new RandomAccessSparseVector(baseSize + extraSize); for (int i = 0; i < baseSize; i++) { double value = v.getQuick(i); if (value != 0.0) { result.setQuick(i, value); } } return result; }