Example usage for org.apache.mahout.math Vector setQuick

List of usage examples for org.apache.mahout.math Vector setQuick

Introduction

On this page you can find usage examples for the setQuick method of org.apache.mahout.math.Vector.

Prototype

void setQuick(int index, double value);

Source Link

Document

Set the value at the given index, without checking bounds

Usage

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java

License:Apache License

private void initializeM(Vector averageRatings) throws IOException {
    Random random = RandomUtils.getRandom();

    FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
    SequenceFile.Writer writer = null;
    try {// w  w  w  .  j av  a 2s .c o m
        writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"),
                IntWritable.class, VectorWritable.class);

        Iterator<Vector.Element> averages = averageRatings.iterateNonZero();
        while (averages.hasNext()) {
            Vector.Element e = averages.next();
            Vector row = new DenseVector(numFeatures);
            row.setQuick(0, e.get());
            for (int m = 1; m < numFeatures; m++) {
                row.setQuick(m, random.nextDouble());
            }
            writer.append(new IntWritable(e.index()), new VectorWritable(row));
        }
    } finally {
        Closeables.closeQuietly(writer);
    }
}

From source file:com.skp.experiment.common.MathHelper.java

License:Apache License

/**
 * write a two-dimensional double array to an SequenceFile<IntWritable,VectorWritable>
 */// w w  w  .j  a v a2 s  .c  o  m
public static void writeDistributedRowMatrix(double[][] entries, FileSystem fs, Configuration conf, Path path)
        throws IOException {
    SequenceFile.Writer writer = null;
    try {
        writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
        for (int n = 0; n < entries.length; n++) {
            Vector v = new RandomAccessSparseVector(entries[n].length);
            for (int m = 0; m < entries[n].length; m++) {
                v.setQuick(m, entries[n][m]);
            }
            writer.append(new IntWritable(n), new VectorWritable(v));
        }
    } finally {
        Closeables.closeQuietly(writer);
    }
}

From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(/* www .  j a va  2  s .co  m*/
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // Close the reader I/O
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // Fixed error : close ts:TokenStream
    ts.end();
    ts.close();
    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // With the classifier, we get one score for each label
    // The label with the highest score is the one the email is more likely
    // to
    // be associated to

    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }

    }
    System.out.println(" Class Labe: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();

}

From source file:de.isabeldrostfromm.sof.util.Vectors.java

License:Open Source License

/**
 * Appends two vectors directly after one another, leaving all non set elements zero.
 * *//*from   w w  w .j av a  2 s . co m*/
public static Vector append(Vector... vectors) {
    int totalSize = 0;
    for (Vector vec : vectors) {
        totalSize += vec.size();
    }

    Vector result = new SequentialAccessSparseVector(totalSize);
    result.assign(0);

    int lastIndex = 0;
    for (Vector vector : vectors) {
        for (Element elem : vector) {
            result.setQuick(lastIndex + elem.index(), elem.get());
        }
        lastIndex += vector.size();
    }
    return result;
}

From source file:edu.indiana.d2i.htrc.io.index.lucene.LuceneClient.java

License:Apache License

public Vector getTFVector(String volumeId) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    logger.info("Get TF vector for " + volumeId);

    TermQuery termquery = new TermQuery(new Term("id", volumeId));
    TopDocs hits = indexSearcher.search(termquery, indexSearcher.maxDoc());
    ScoreDoc[] docs = hits.scoreDocs;//from  w  ww. j a v  a2  s. com
    int docId = docs[0].doc; // only one hit!!!
    TermPositionVector vector = (TermPositionVector) indexReader.getTermFreqVector(docId, "ocr");

    long t0 = System.nanoTime();
    String[] terms = vector.getTerms();
    int[] freq = vector.getTermFrequencies();
    for (int j = 0; j < terms.length; j++) {
        // if (dictionary.containsKey(terms[j])) {
        // result.setQuick(dictionary.get(terms[j]), freq[j]);
        // }

        if (filter.accept(terms[j], freq[j])) {
            result.setQuick(dictionary.get(terms[j]), freq[j]);
        }
    }
    long t1 = System.nanoTime();
    elapsedTime += t1 - t0;

    return result;
}

From source file:edu.indiana.d2i.htrc.io.index.solr.SolrClient.java

License:Apache License

private Vector createVector(XMLStreamReader parser) throws XMLStreamException {
    Vector vector = new RandomAccessSparseVector(dictionary.size());
    while (parser.hasNext()) {
        int event = parser.next();
        if (event == XMLStreamConstants.START_ELEMENT) {
            String attributeValue = parser.getAttributeValue(null, "name");
            if (attributeValue != null) {
                //               if (dictionary.containsKey(attributeValue)) {
                //                  parser.next();
                //                  int tf = Integer.valueOf(parser.getElementText());
                //                  vector.setQuick(dictionary.get(attributeValue), tf);
                //               }

                parser.next();//w  w  w . ja v a  2 s. c o  m
                int freq = Integer.valueOf(parser.getElementText());
                if (filter.accept(attributeValue, freq)) {
                    vector.setQuick(dictionary.get(attributeValue), freq);
                }
            }
        }
    }
    return vector;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();//from  w  w w  . j av  a  2s .  co  m
    while (stream.incrementToken()) {
        // String term = new String(termAtt.buffer(), 0,
        // termAtt.length());
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }

    return result;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorUtil.java

License:Apache License

public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();//from w w  w . j  a va 2 s .  co  m
    while (stream.incrementToken()) {
        // String term = new String(termAtt.buffer(), 0,
        // termAtt.length());
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }

    return result;
}

From source file:edu.rosehulman.mahout.math.VectorWritable.java

License:Apache License

@Override
public void readFields(DataInput in) throws IOException {
    int flags = in.readByte();
    //Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2));
    boolean dense = (flags & FLAG_DENSE) != 0;
    boolean sequential = (flags & FLAG_SEQUENTIAL) != 0;
    boolean named = (flags & FLAG_NAMED) != 0;
    boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;

    int size = Varint.readUnsignedVarInt(in);
    Vector v;
    if (dense) {/*from   w  ww.  j a  v  a 2 s . c  om*/
        double[] values = new double[size];
        for (int i = 0; i < size; i++) {
            values[i] = laxPrecision ? in.readFloat() : in.readDouble();
        }
        v = new DenseVector(values);
    } else {
        int numNonDefaultElements = Varint.readUnsignedVarInt(in);
        v = sequential ? new SequentialAccessSparseVector(size, numNonDefaultElements)
                : new RandomAccessSparseVector(size, numNonDefaultElements);
        if (sequential) {
            int lastIndex = 0;
            for (int i = 0; i < numNonDefaultElements; i++) {
                int delta = Varint.readUnsignedVarInt(in);
                int index = lastIndex + delta;
                lastIndex = index;
                double value = laxPrecision ? in.readFloat() : in.readDouble();
                v.setQuick(index, value);
            }
        } else {
            for (int i = 0; i < numNonDefaultElements; i++) {
                int index = Varint.readUnsignedVarInt(in);
                double value = laxPrecision ? in.readFloat() : in.readDouble();
                v.setQuick(index, value);
            }
        }
    }
    if (named) {
        String name = in.readUTF();
        v = new NamedVector(v, name);
    }
    vector = v;
}

From source file:edu.rosehulman.mahout.math.VectorWritable.java

License:Apache License

public static Vector mergeToVector(Iterator<VectorWritable> vectors) {
    Vector accumulator = vectors.next().get();
    while (vectors.hasNext()) {
        VectorWritable v = vectors.next();
        if (v != null) {
            for (Element nonZeroElement : v.get().nonZeroes()) {
                accumulator.setQuick(nonZeroElement.index(), nonZeroElement.get());
            }//from  w  w  w.  j a va 2s  .  c o  m
        }
    }
    return accumulator;
}