Example usage for org.apache.mahout.math Vector setQuick

Introduction

This page collects example usages of the setQuick method of org.apache.mahout.math.Vector.

Prototype

void setQuick(int index, double value);

Source Link

Document

Set the value at the given index, without checking bounds

Usage

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java

License:Apache License

/**
 * Writes the starting rows of M to the file "part-m-00000" under pathToM(-1).
 *
 * For every non-zero entry of the given vector one dense row of numFeatures values is appended,
 * keyed by that entry's index: position 0 holds the entry's value, every following position holds
 * a fresh random.nextDouble() draw.
 *
 * @param averageRatings vector whose non-zero entries each produce one output row
 * @throws IOException if the file system or the sequence file writer fails
 */
private void initializeM(Vector averageRatings) throws IOException {
    Random rng = RandomUtils.getRandom();

    FileSystem fileSystem = FileSystem.get(pathToM(-1).toUri(), getConf());
    SequenceFile.Writer out = null;
    try {
        out = new SequenceFile.Writer(fileSystem, getConf(), new Path(pathToM(-1), "part-m-00000"),
                IntWritable.class, VectorWritable.class);

        for (Iterator<Vector.Element> it = averageRatings.iterateNonZero(); it.hasNext();) {
            Vector.Element average = it.next();

            Vector features = new DenseVector(numFeatures);
            features.setQuick(0, average.get());
            int slot = 1;
            while (slot < numFeatures) {
                features.setQuick(slot, rng.nextDouble());
                slot++;
            }

            out.append(new IntWritable(average.index()), new VectorWritable(features));
        }
    } finally {
        // closeQuietly accepts null, which covers a failure before the writer was created.
        Closeables.closeQuietly(out);
    }
}

From source file:com.skp.experiment.common.MathHelper.java

License:Apache License

/**
 * Writes a two-dimensional double array to a SequenceFile&lt;IntWritable,VectorWritable&gt;.
 *
 * Each inner array becomes one record: the key is the row's position in {@code entries}, the
 * value is a RandomAccessSparseVector of the row's length filled with the row's values.
 *
 * @param entries the rows to write
 * @param fs      file system the sequence file is created on
 * @param conf    configuration handed to the sequence file writer
 * @param path    location of the sequence file
 * @throws IOException if creating the writer or appending a record fails
 */
public static void writeDistributedRowMatrix(double[][] entries, FileSystem fs, Configuration conf, Path path)
        throws IOException {
    SequenceFile.Writer out = null;
    try {
        out = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);

        int rowIndex = 0;
        for (double[] row : entries) {
            Vector rowVector = new RandomAccessSparseVector(row.length);
            for (int column = 0; column < row.length; column++) {
                rowVector.setQuick(column, row[column]);
            }
            out.append(new IntWritable(rowIndex), new VectorWritable(rowVector));
            rowIndex++;
        }
    } finally {
        // closeQuietly accepts null, which covers a failure before the writer was created.
        Closeables.closeQuietly(out);
    }
}

From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java

License:Apache License

/**
 * Command-line entry point: classifies one text file with a previously trained naive Bayes model
 * and prints the best-scoring label together with its score.
 *
 * <p>Required arguments, in order: model path, label index path, dictionary path,
 * document-frequency path, input text file. With fewer than five arguments a usage message is
 * printed and the method returns.
 *
 * <p>Words of the input that are missing from the dictionary are ignored. Words that are in the
 * dictionary but have no document-frequency entry are left out of the TF-IDF vector.
 *
 * @param args command-line arguments as described above
 * @throws IllegalStateException if the document-frequency data has no entry for key -1, which this
 *         method reads as the number of training documents
 * @throws Exception on any I/O or model-loading failure
 */
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels maps classId => label; dictionary maps word => wordId;
    // documentFrequency maps wordId => number of documents containing the word.
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    int labelCount = labels.size();
    // Key -1 is read as the total number of training documents; fail with a clear message
    // instead of a bare NullPointerException when it is absent.
    Long totalDocuments = documentFrequency.get(-1);
    if (totalDocuments == null) {
        throw new IllegalStateException(
                "No entry for key -1 (document count) in document frequency data: " + documentFrequencyPath);
    }
    int documentCount = totalDocuments.intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    // Read the whole input file; the reader is closed even if reading fails.
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            stringBuilder.append(line);
            stringBuilder.append(lineSeparator);
        }
    } finally {
        reader.close();
    }

    // Tokenize the input and count every token that is known to the dictionary.
    Multiset<String> words = ConcurrentHashMultiset.create();
    int wordCount = 0;
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = termAtt.toString();
                    // if the word is not in the dictionary, skip it
                    if (dictionary.get(word) != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
        } finally {
            ts.close();
        }
    } finally {
        analyzer.close();
    }

    // create vector wordId => weight using tfidf
    // NOTE(review): the cardinality 10000 is hard-coded; confirm it matches the model's feature count.
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        if (freq == null) {
            // No document frequency known for this word, so no TF-IDF weight can be computed.
            continue;
        }
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }

    // The classifier yields one score per label; the label with the highest score wins.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    // Output text is kept verbatim (including its spelling) in case callers parse it.
    System.out.println(" Class Labe: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);
}

From source file:de.isabeldrostfromm.sof.util.Vectors.java

License:Open Source License

/**
 * Concatenates the given vectors, in order, into one new sparse vector.
 *
 * <p>The result's size is the sum of the input sizes. Every element yielded by iterating an input
 * is written at its own index shifted by the combined size of all preceding inputs; positions
 * that are never written stay zero.
 *
 * @param vectors the vectors to place one after another
 * @return a new SequentialAccessSparseVector holding the concatenation
 */
public static Vector append(Vector... vectors) {
    int totalSize = 0;
    for (Vector vec : vectors) {
        totalSize += vec.size();
    }

    // A freshly constructed sparse vector is already all zeros, so no explicit zero-fill
    // (which would touch every index) is needed.
    Vector result = new SequentialAccessSparseVector(totalSize);

    int offset = 0;
    for (Vector vector : vectors) {
        for (Element elem : vector) {
            result.setQuick(offset + elem.index(), elem.get());
        }
        offset += vector.size();
    }
    return result;
}

From source file:edu.indiana.d2i.htrc.io.index.lucene.LuceneClient.java

License:Apache License

/**
 * Builds a term-frequency vector for the document whose "id" field equals the given volume id.
 *
 * The term vector of the "ocr" field of the first search hit is read; every term accepted by the
 * filter is stored at its dictionary index with its frequency. The time spent filling the vector
 * is added to elapsedTime.
 *
 * NOTE(review): the first hit is used without checking that the search returned any result or
 * that a term vector exists for it - confirm callers only pass indexed volume ids.
 *
 * @param volumeId value of the "id" field to look up
 * @return sparse vector sized to the dictionary
 * @throws IOException if the index cannot be read
 */
public Vector getTFVector(String volumeId) throws IOException {
    Vector termFrequencies = new RandomAccessSparseVector(dictionary.size());

    logger.info("Get TF vector for " + volumeId);

    TopDocs hits = indexSearcher.search(new TermQuery(new Term("id", volumeId)), indexSearcher.maxDoc());
    int docId = hits.scoreDocs[0].doc;
    TermPositionVector termVector = (TermPositionVector) indexReader.getTermFreqVector(docId, "ocr");

    long start = System.nanoTime();
    String[] terms = termVector.getTerms();
    int[] counts = termVector.getTermFrequencies();
    int position = 0;
    for (String term : terms) {
        int count = counts[position];
        position++;
        if (filter.accept(term, count)) {
            termFrequencies.setQuick(dictionary.get(term), count);
        }
    }
    elapsedTime += System.nanoTime() - start;

    return termFrequencies;
}

From source file:edu.indiana.d2i.htrc.io.index.solr.SolrClient.java

License:Apache License

/**
 * Builds a sparse vector from the XML events of the given parser.
 *
 * For each start element that carries a "name" attribute, the parser is advanced once and the
 * element text is parsed as an integer. If the filter accepts the (name, integer) pair, the
 * integer is stored at the dictionary index of the name.
 *
 * @param parser XML stream positioned before the elements to read
 * @return sparse vector sized to the dictionary
 * @throws XMLStreamException if the XML stream cannot be read
 */
private Vector createVector(XMLStreamReader parser) throws XMLStreamException {
    Vector termFrequencies = new RandomAccessSparseVector(dictionary.size());
    while (parser.hasNext()) {
        if (parser.next() != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        String term = parser.getAttributeValue(null, "name");
        if (term == null) {
            continue;
        }

        parser.next();
        int count = Integer.parseInt(parser.getElementText());
        if (filter.accept(term, count)) {
            termFrequencies.setQuick(dictionary.get(term), count);
        }
    }
    return termFrequencies;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

/**
 * Tokenizes the text with the given analyzer and counts, per dictionary index, how often each
 * accepted term occurs.
 *
 * <p>Each token is lower-cased before it is offered to the filter; accepted terms increment the
 * vector cell at their dictionary index by one. The token stream is ended and closed before the
 * method returns, also when tokenizing fails.
 *
 * @param text       the text to tokenize
 * @param field      field name handed to the analyzer
 * @param analyzer   analyzer that produces the token stream
 * @param filter     decides which terms are counted
 * @param dictionary maps a term to its vector index and defines the vector size
 * @return sparse vector of term counts sized to the dictionary
 * @throws IOException if the token stream fails
 */
private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text));
    try {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // NOTE(review): toLowerCase() uses the default locale - confirm this matches how
            // the dictionary terms were produced.
            String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
            if (filter.accept(term, 0)) {
                int index = dictionary.get(term);
                result.setQuick(index, result.get(index) + 1);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }

    return result;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorUtil.java

License:Apache License

/**
 * Tokenizes the text with the given analyzer and counts, per dictionary index, how often each
 * accepted term occurs.
 *
 * <p>Each token is lower-cased before it is offered to the filter; accepted terms increment the
 * vector cell at their dictionary index by one. The token stream is ended and closed before the
 * method returns, also when tokenizing fails.
 *
 * @param text       the text to tokenize
 * @param field      field name handed to the analyzer
 * @param analyzer   analyzer that produces the token stream
 * @param filter     decides which terms are counted
 * @param dictionary maps a term to its vector index and defines the vector size
 * @return sparse vector of term counts sized to the dictionary
 * @throws IOException if the token stream fails
 */
public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text));
    try {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // NOTE(review): toLowerCase() uses the default locale - confirm this matches how
            // the dictionary terms were produced.
            String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
            if (filter.accept(term, 0)) {
                int index = dictionary.get(term);
                result.setQuick(index, result.get(index) + 1);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }

    return result;
}

From source file:edu.rosehulman.mahout.math.VectorWritable.java

License:Apache License

/**
 * Reads a vector from the given input and stores it in the vector field.
 *
 * Layout as consumed here: one flag byte, the size as an unsigned varint, then either
 * size values (dense) or a count of stored elements followed by (index, value) pairs (sparse).
 * For sequential sparse vectors each stored index is a delta to the previous index. Values are
 * read as float when the lax-precision flag is set, otherwise as double. When the named flag is
 * set, a UTF name follows and the result is wrapped in a NamedVector.
 *
 * Unknown flag bits are not validated.
 *
 * @param in source of the serialized vector
 * @throws IOException if reading from the input fails
 */
@Override
public void readFields(DataInput in) throws IOException {
    int header = in.readByte();
    boolean isDense = (header & FLAG_DENSE) != 0;
    boolean isSequential = (header & FLAG_SEQUENTIAL) != 0;
    boolean isNamed = (header & FLAG_NAMED) != 0;
    boolean lax = (header & FLAG_LAX_PRECISION) != 0;

    int cardinality = Varint.readUnsignedVarInt(in);
    Vector decoded;
    if (isDense) {
        double[] cells = new double[cardinality];
        for (int k = 0; k < cells.length; k++) {
            if (lax) {
                cells[k] = in.readFloat();
            } else {
                cells[k] = in.readDouble();
            }
        }
        decoded = new DenseVector(cells);
    } else {
        int storedCount = Varint.readUnsignedVarInt(in);
        if (isSequential) {
            decoded = new SequentialAccessSparseVector(cardinality, storedCount);
        } else {
            decoded = new RandomAccessSparseVector(cardinality, storedCount);
        }

        // Sequential vectors store index deltas, random-access vectors store absolute indices;
        // in both cases the varint is read first, then the value.
        int previousIndex = 0;
        for (int k = 0; k < storedCount; k++) {
            int encoded = Varint.readUnsignedVarInt(in);
            int index = isSequential ? previousIndex + encoded : encoded;
            previousIndex = index;
            double value;
            if (lax) {
                value = in.readFloat();
            } else {
                value = in.readDouble();
            }
            decoded.setQuick(index, value);
        }
    }

    if (isNamed) {
        String vectorName = in.readUTF();
        decoded = new NamedVector(decoded, vectorName);
    }
    vector = decoded;
}

From source file:edu.rosehulman.mahout.math.VectorWritable.java

License:Apache License

/**
 * Merges the vectors delivered by the iterator into the first one.
 *
 * The vector of the first element serves as the accumulator and is modified in place: every
 * non-zero element of each following non-null writable is written to the accumulator at the
 * same index, replacing whatever value was there.
 *
 * @param vectors iterator over the writables to merge; its first element is consumed
 *                unconditionally
 * @return the vector of the first element, after merging
 */
public static Vector mergeToVector(Iterator<VectorWritable> vectors) {
    Vector merged = vectors.next().get();
    while (vectors.hasNext()) {
        VectorWritable candidate = vectors.next();
        if (candidate == null) {
            continue;
        }
        for (Element entry : candidate.get().nonZeroes()) {
            merged.setQuick(entry.index(), entry.get());
        }
    }
    return merged;
}