List of usage examples for org.apache.mahout.math.Matrix.viewRow
/**
 * Signature of {@code Matrix.viewRow}: takes a row index and yields a {@code Vector}.
 *
 * <p>NOTE(review): the name and the usages listed on this page suggest the result is a live
 * view, so that writes through it modify the matrix. Confirm against the Mahout Javadoc.
 *
 * @param row index of the requested row
 * @return the vector for that row
 */
Vector viewRow(int row);
From source file:ca.uwaterloo.cpami.mahout.matrix.utils.GramSchmidt.java
License:Apache License
public static void orthonormalizeColumns(Matrix mx) { //int n = mx.numCols(); int n = mx.numRows(); for (int c = 0; c < n; c++) { System.out.println("col: " + c); Vector col = mx.viewRow(c); for (int c1 = 0; c1 < c; c1++) { Vector viewC1 = mx.viewRow(c1); col.assign(col.minus(viewC1.times(viewC1.dot(col)))); }//ww w .j a v a2 s . c o m final double norm2 = col.norm(2); if (norm2 == 0) { System.out.println("zero"); } col.assign(new DoubleFunction() { @Override public double apply(double x) { return x / norm2; } }); } }
From source file:ca.uwaterloo.cpami.mahout.matrix.utils.GramSchmidt.java
License:Apache License
public static void main(String[] args) throws IOException { //final Configuration conf = new Configuration(); //final FileSystem fs = FileSystem.get(conf); //final SequenceFile.Reader reader = new SequenceFile.Reader(fs, // new Path("R1.dat"), conf); //IntWritable key = new IntWritable(); //VectorWritable vec = new VectorWritable(); Matrix mat = new SparseMatrix(1500, 100); //SparseRealMatrix mat2 = new OpenMapRealMatrix(12419,1500 ); BufferedReader reader = new BufferedReader(new FileReader("R.3.csv")); String line = null;/* w ww. j a v a2 s. co m*/ while ((line = reader.readLine()) != null) { String[] parts = line.split(","); mat.set(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2])); /* Vector v = vec.get(); int i=0; Iterator<Vector.Element> itr = v.iterateNonZero(); while(itr.hasNext()){ double elem = itr.next().get(); if(elem !=0) mat2.setEntry(i, key.get(), elem); i++; } */ } //mat = mat.transpose(); System.out.println(mat.viewColumn(0).isDense()); System.out.println(mat.viewRow(0).isDense()); mat = mat.transpose(); GramSchmidt.orthonormalizeColumns(mat); /* System.out.println("started QR"); System.out.println(Runtime.getRuntime().maxMemory()); System.out.println(Runtime.getRuntime().maxMemory()-Runtime.getRuntime().freeMemory()); QRDecomposition qr = new QRDecomposition(mat2); System.out.println(qr.getQ().getColumnDimension()); System.out.println(qr.getQ().getRowDimension()); */ //mat = mat.transpose(); //storeSparseColumns(mat); //for (int i = 0; i < 10; i++) { // System.out.println(mat.viewRow(i).getNumNondefaultElements()); //} }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha, String[] dictionary, int numThreads, double modelWeight) { this.dictionary = dictionary; this.topicTermCounts = topicTermCounts; this.topicSums = topicSums; this.numTopics = topicSums.size(); this.numTerms = topicTermCounts.numCols(); this.eta = eta; this.alpha = alpha; this.sampler = new Sampler(RandomUtils.getRandom()); this.numThreads = numThreads; if (modelWeight != 1) { topicSums.assign(Functions.mult(modelWeight)); for (int x = 0; x < numTopics; x++) { topicTermCounts.viewRow(x).assign(Functions.mult(modelWeight)); }//from ww w . j a v a 2 s . com } initializeThreadPool(); }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
private static Pair<Matrix, Vector> randomMatrix(int numTopics, int numTerms, Random random) { Matrix topicTermCounts = new DenseMatrix(numTopics, numTerms); Vector topicSums = new DenseVector(numTopics); if (random != null) { for (int x = 0; x < numTopics; x++) { for (int term = 0; term < numTerms; term++) { topicTermCounts.viewRow(x).set(term, random.nextDouble()); }//from www. jav a 2s . co m } } for (int x = 0; x < numTopics; x++) { topicSums.set(x, random == null ? 1.0 : topicTermCounts.viewRow(x).norm(1)); } return Pair.of(topicTermCounts, topicSums); }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths) throws IOException { int numTopics = -1; int numTerms = -1; List<Pair<Integer, Vector>> rows = Lists.newArrayList(); for (Path modelPath : modelPaths) { for (Pair<Text, VectorWritable> row : new SequenceFileIterable<Text, VectorWritable>(modelPath, true, conf)) {/*from w ww . ja va 2 s .c om*/ rows.add(Pair.of(Integer.parseInt(row.getFirst().toString()), row.getSecond().get()));//keytext numTopics = Math.max(numTopics, Integer.parseInt(row.getFirst().toString()));//keytext if (numTerms < 0) { numTerms = row.getSecond().get().size(); } } } if (rows.isEmpty()) { throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it"); } numTopics++; Matrix model = new DenseMatrix(numTopics, numTerms); Vector topicSums = new DenseVector(numTopics); for (Pair<Integer, Vector> pair : rows) { model.viewRow(pair.getFirst()).assign(pair.getSecond()); topicSums.set(pair.getFirst(), pair.getSecond().norm(1)); } return Pair.of(model, topicSums); }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) { // first calculate p(topic|term,document) for all terms in original, and all topics, // using p(term|topic) and p(topic|doc) pTopicGivenTerm(original, topics, docTopicModel); normalizeByTopic(docTopicModel);// w w w .j a va 2s. c om // now multiply, term-by-term, by the document, to get the weighted distribution of // term-topic pairs from this document. Iterator<Vector.Element> it = original.iterateNonZero(); while (it.hasNext()) { Vector.Element e = it.next(); for (int x = 0; x < numTopics; x++) { Vector docTopicModelRow = docTopicModel.viewRow(x); docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get()); } } // now recalculate p(topic|doc) by summing contributions from all of pTopicGivenTerm topics.assign(0.0); for (int x = 0; x < numTopics; x++) { topics.set(x, docTopicModel.viewRow(x).norm(1)); } // now renormalize so that sum_x(p(x|doc)) = 1 topics.assign(Functions.mult(1 / topics.norm(1))); }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public void update(Matrix docTopicCounts) { for (int x = 0; x < numTopics; x++) { updaters[x % updaters.length].update(x, docTopicCounts.viewRow(x)); }//from w w w . j a va 2 s . c o m }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
/** * Computes {@code p(topic x|term a, document i)} distributions given input document {@code i}. * {@code pTGT[x][a]} is the (un-normalized) {@code p(x|a,i)}, or if docTopics is {@code null}, * {@code p(a|x)} (also un-normalized).//from ww w. j av a 2 s. com * * @param document doc-term vector encoding {@code w(term a|document i)}. * @param docTopics {@code docTopics[x]} is the overall weight of topic {@code x} in given * document. If {@code null}, a topic weight of {@code 1.0} is used for all topics. * @param termTopicDist storage for output {@code p(x|a,i)} distributions. */ private void pTopicGivenTerm(Vector document, Vector docTopics, Matrix termTopicDist) { // for each topic x for (int x = 0; x < numTopics; x++) { // get p(topic x | document i), or 1.0 if docTopics is null double topicWeight = docTopics == null ? 1.0 : docTopics.get(x); // get w(term a | topic x) Vector topicTermRow = topicTermCounts.viewRow(x); // get \sum_a w(term a | topic x) double topicSum = topicSums.get(x); // get p(topic x | term a) distribution to update Vector termTopicRow = termTopicDist.viewRow(x); // for each term a in document i with non-zero weight Iterator<Vector.Element> it = document.iterateNonZero(); while (it.hasNext()) { Vector.Element e = it.next(); int termIndex = e.index(); // calc un-normalized p(topic x | term a, document i) double termTopicLikelihood = (topicTermRow.get(termIndex) + eta) * (topicWeight + alpha) / (topicSum + eta * numTerms); termTopicRow.set(termIndex, termTopicLikelihood); } } }
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
private void normalizeByTopic(Matrix perTopicSparseDistributions) { Iterator<Vector.Element> it = perTopicSparseDistributions.viewRow(0).iterateNonZero(); // then make sure that each of these is properly normalized by topic: sum_x(p(x|t,d)) = 1 while (it.hasNext()) { Vector.Element e = it.next(); int a = e.index(); double sum = 0; for (int x = 0; x < numTopics; x++) { sum += perTopicSparseDistributions.viewRow(x).get(a); }/*from ww w . ja va 2 s . c o m*/ for (int x = 0; x < numTopics; x++) { perTopicSparseDistributions.viewRow(x).set(a, perTopicSparseDistributions.viewRow(x).get(a) / sum); } } }
From source file:com.elex.dmp.lda.TopicModel.java
License:Apache License
public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths) throws IOException { int numTopics = -1; int numTerms = -1; List<Pair<Integer, Vector>> rows = Lists.newArrayList(); for (Path modelPath : modelPaths) { for (Pair<Text, VectorWritable> row : new SequenceFileIterable<Text, VectorWritable>(modelPath, true, conf)) {/*from w ww . ja va2 s . co m*/ rows.add(Pair.of(Integer.parseInt(row.getFirst().toString()), row.getSecond().get()));//keytext numTopics = Math.max(numTopics, Integer.parseInt(row.getFirst().toString()));//keytext if (numTerms < 0) { numTerms = row.getSecond().get().size(); } } } if (rows.isEmpty()) { throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it"); } numTopics++; Matrix model = new DenseMatrix(numTopics, numTerms); Vector topicSums = new DenseVector(numTopics); for (Pair<Integer, Vector> pair : rows) { model.viewRow(pair.getFirst()).assign(pair.getSecond()); topicSums.set(pair.getFirst(), pair.getSecond().norm(1)); } return Pair.of(model, topicSums); }