List of usage examples for org.apache.mahout.math Vector setQuick
/**
 * Sets the element at {@code index} to {@code value}.
 *
 * <p>NOTE(review): the "Quick" suffix presumably means the index is not range-checked;
 * confirm against the Mahout {@code Vector} Javadoc before relying on it.
 *
 * @param index position of the element to set
 * @param value value to store at that position
 */
void setQuick(int index, double value);
From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java
License:Apache License
private void initializeM(Vector averageRatings) throws IOException { Random random = RandomUtils.getRandom(); FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf()); SequenceFile.Writer writer = null; try {// w w w . j av a 2s .c o m writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"), IntWritable.class, VectorWritable.class); Iterator<Vector.Element> averages = averageRatings.iterateNonZero(); while (averages.hasNext()) { Vector.Element e = averages.next(); Vector row = new DenseVector(numFeatures); row.setQuick(0, e.get()); for (int m = 1; m < numFeatures; m++) { row.setQuick(m, random.nextDouble()); } writer.append(new IntWritable(e.index()), new VectorWritable(row)); } } finally { Closeables.closeQuietly(writer); } }
From source file:com.skp.experiment.common.MathHelper.java
License:Apache License
/** * write a two-dimensional double array to an SequenceFile<IntWritable,VectorWritable> */// w w w .j a v a2 s .c o m public static void writeDistributedRowMatrix(double[][] entries, FileSystem fs, Configuration conf, Path path) throws IOException { SequenceFile.Writer writer = null; try { writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class); for (int n = 0; n < entries.length; n++) { Vector v = new RandomAccessSparseVector(entries[n].length); for (int m = 0; m < entries[n].length; m++) { v.setQuick(m, entries[n][m]); } writer.append(new IntWritable(n), new VectorWritable(v)); } } finally { Closeables.closeQuietly(writer); } }
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println("Mahout Naive Bayesian Classifier"); System.out.println(/* www . j a va 2 s .co m*/ "Classifies input text document into a class given a model, dictionary, document frequency and input file"); System.out.println( "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]"); return; } String modelPath = args[0]; String labelIndexPath = args[1]; String dictionaryPath = args[2]; String documentFrequencyPath = args[3]; String inputFilePath = args[4]; Configuration configuration = new Configuration(); // model is a matrix (wordId, labelId) => probability score NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model); // labels is a map label => classId Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath)); Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from input file Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue(); System.out.println("Number of labels: " + labelCount); System.out.println("Number of documents in training set: " + documentCount); BufferedReader reader = new BufferedReader(new FileReader(inputFilePath)); StringBuilder stringBuilder = new StringBuilder(); String lineSeparator = System.getProperty("line.separator"); String line = null; while ((line = reader.readLine()) != null) { stringBuilder.append(line); stringBuilder.append(lineSeparator); } // Close the reader I/O reader.close(); Multiset<String> words = ConcurrentHashMultiset.create(); // extract words from input file TokenStream ts = 
analyzer.tokenStream("text", new StringReader(stringBuilder.toString())); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); // if the word is not in the dictionary, skip it if (wordId != null) { words.add(word); wordCount++; } } } // Fixed error : close ts:TokenStream ts.end(); ts.close(); // create vector wordId => weight using tfidf Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry<String> entry : words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } // With the classifier, we get one score for each label // The label with the highest score is the one the email is more likely // to // be associated to double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; Vector resultVector = classifier.classifyFull(vector); for (Element element : resultVector) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } } System.out.println(" Class Labe: => " + labels.get(bestCategoryId)); System.out.println(" Score: => " + bestScore); analyzer.close(); }
From source file:de.isabeldrostfromm.sof.util.Vectors.java
License:Open Source License
/**
 * Concatenates the given vectors, in argument order, into one sparse vector.
 *
 * <p>The result's size is the sum of the input sizes. Each input is copied element by
 * element at an offset equal to the combined size of the inputs before it.
 *
 * @param vectors vectors to concatenate
 * @return a new {@link SequentialAccessSparseVector} holding all inputs back to back
 */
public static Vector append(Vector... vectors) {
  int combinedSize = 0;
  for (int i = 0; i < vectors.length; i++) {
    combinedSize += vectors[i].size();
  }

  Vector combined = new SequentialAccessSparseVector(combinedSize);
  combined.assign(0);

  int offset = 0;
  for (int i = 0; i < vectors.length; i++) {
    Vector current = vectors[i];
    for (Element cell : current) {
      int target = offset + cell.index();
      combined.setQuick(target, cell.get());
    }
    offset += current.size();
  }
  return combined;
}
From source file:edu.indiana.d2i.htrc.io.index.lucene.LuceneClient.java
License:Apache License
public Vector getTFVector(String volumeId) throws IOException { Vector result = new RandomAccessSparseVector(dictionary.size()); logger.info("Get TF vector for " + volumeId); TermQuery termquery = new TermQuery(new Term("id", volumeId)); TopDocs hits = indexSearcher.search(termquery, indexSearcher.maxDoc()); ScoreDoc[] docs = hits.scoreDocs;//from w ww. j a v a2 s. com int docId = docs[0].doc; // only one hit!!! TermPositionVector vector = (TermPositionVector) indexReader.getTermFreqVector(docId, "ocr"); long t0 = System.nanoTime(); String[] terms = vector.getTerms(); int[] freq = vector.getTermFrequencies(); for (int j = 0; j < terms.length; j++) { // if (dictionary.containsKey(terms[j])) { // result.setQuick(dictionary.get(terms[j]), freq[j]); // } if (filter.accept(terms[j], freq[j])) { result.setQuick(dictionary.get(terms[j]), freq[j]); } } long t1 = System.nanoTime(); elapsedTime += t1 - t0; return result; }
From source file:edu.indiana.d2i.htrc.io.index.solr.SolrClient.java
License:Apache License
private Vector createVector(XMLStreamReader parser) throws XMLStreamException { Vector vector = new RandomAccessSparseVector(dictionary.size()); while (parser.hasNext()) { int event = parser.next(); if (event == XMLStreamConstants.START_ELEMENT) { String attributeValue = parser.getAttributeValue(null, "name"); if (attributeValue != null) { // if (dictionary.containsKey(attributeValue)) { // parser.next(); // int tf = Integer.valueOf(parser.getElementText()); // vector.setQuick(dictionary.get(attributeValue), tf); // } parser.next();//w w w . ja v a 2 s. c o m int freq = Integer.valueOf(parser.getElementText()); if (filter.accept(attributeValue, freq)) { vector.setQuick(dictionary.get(attributeValue), freq); } } } } return vector; }
From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License:Apache License
private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter, Dictionary dictionary) throws IOException { Vector result = new RandomAccessSparseVector(dictionary.size()); TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString())); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset();//from w w w . j av a 2s . co m while (stream.incrementToken()) { // String term = new String(termAtt.buffer(), 0, // termAtt.length()); String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase(); if (filter.accept(term, 0)) { int index = dictionary.get(term); result.setQuick(index, result.get(index) + 1); } } return result; }
From source file:edu.indiana.d2i.htrc.io.SparseVectorUtil.java
License:Apache License
public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter, Dictionary dictionary) throws IOException { Vector result = new RandomAccessSparseVector(dictionary.size()); TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString())); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset();//from w w w . j a va 2 s . co m while (stream.incrementToken()) { // String term = new String(termAtt.buffer(), 0, // termAtt.length()); String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase(); if (filter.accept(term, 0)) { int index = dictionary.get(term); result.setQuick(index, result.get(index) + 1); } } return result; }
From source file:edu.rosehulman.mahout.math.VectorWritable.java
License:Apache License
@Override public void readFields(DataInput in) throws IOException { int flags = in.readByte(); //Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2)); boolean dense = (flags & FLAG_DENSE) != 0; boolean sequential = (flags & FLAG_SEQUENTIAL) != 0; boolean named = (flags & FLAG_NAMED) != 0; boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0; int size = Varint.readUnsignedVarInt(in); Vector v; if (dense) {/*from w ww. j a v a 2 s . c om*/ double[] values = new double[size]; for (int i = 0; i < size; i++) { values[i] = laxPrecision ? in.readFloat() : in.readDouble(); } v = new DenseVector(values); } else { int numNonDefaultElements = Varint.readUnsignedVarInt(in); v = sequential ? new SequentialAccessSparseVector(size, numNonDefaultElements) : new RandomAccessSparseVector(size, numNonDefaultElements); if (sequential) { int lastIndex = 0; for (int i = 0; i < numNonDefaultElements; i++) { int delta = Varint.readUnsignedVarInt(in); int index = lastIndex + delta; lastIndex = index; double value = laxPrecision ? in.readFloat() : in.readDouble(); v.setQuick(index, value); } } else { for (int i = 0; i < numNonDefaultElements; i++) { int index = Varint.readUnsignedVarInt(in); double value = laxPrecision ? in.readFloat() : in.readDouble(); v.setQuick(index, value); } } } if (named) { String name = in.readUTF(); v = new NamedVector(v, name); } vector = v; }
From source file:edu.rosehulman.mahout.math.VectorWritable.java
License:Apache License
public static Vector mergeToVector(Iterator<VectorWritable> vectors) { Vector accumulator = vectors.next().get(); while (vectors.hasNext()) { VectorWritable v = vectors.next(); if (v != null) { for (Element nonZeroElement : v.get().nonZeroes()) { accumulator.setQuick(nonZeroElement.index(), nonZeroElement.get()); }//from w w w. j a va 2s . c o m } } return accumulator; }