Example usage for org.apache.mahout.math Vector setQuick

Introduction

This page collects example usages of org.apache.mahout.math Vector setQuick from several open-source projects.

Prototype

void setQuick(int index, double value);

Document

Set the value at the given index, without checking bounds.
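
Before the full examples, a minimal, self-contained sketch (not taken from any of the source files below) shows the read-modify-write pattern most of them build on. Because setQuick skips bounds checking, keeping the index valid for the vector's cardinality is the caller's responsibility.

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class SetQuickSketch {
    public static void main(String[] args) {
        Vector v = new RandomAccessSparseVector(100); // cardinality 100
        // setQuick skips the bounds check that set(int, double) performs,
        // so 0 <= index < v.size() must hold by construction
        v.setQuick(3, 1.5);
        // the increment idiom used repeatedly in the examples below
        v.setQuick(3, v.getQuick(3) + 1.0);
        System.out.println(v.getQuick(3)); // prints 2.5
    }
}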

Usage

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());

            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:edu.stanford.rad.naivebayes.ClassifyLines.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      if (args.length < 5) {
    //         System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
    //         return;
    //      }
    //      String modelPath = args[0];
    //      String labelIndexPath = args[1];
    //      String dictionaryPath = args[2];
    //      String documentFrequencyPath = args[3];
    //      String tweetsPath = args[4];

    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels maps classId => label
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // close the TokenStream to avoid a resource leak
        ts.end();
        ts.close();
        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // The classifier returns one score per label; the label with the
        // highest score is the one the tweet is most likely associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
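
For intuition about the TFIDF weight used above: the sketch below is a generic TF-IDF calculation with a hypothetical helper (tfIdf is not part of Mahout, and Mahout's TFIDF class delegates to a Lucene similarity, so its exact numbers differ). It only illustrates why calculate takes the term count, the document frequency, and the corpus size.

// Generic TF-IDF sketch; tfIdf is a hypothetical helper, NOT Mahout's exact formula.
static double tfIdf(int termFreq, int docFreq, int numDocs) {
    double tf = Math.sqrt(termFreq);                               // dampened term frequency
    double idf = Math.log((double) numDocs / (docFreq + 1)) + 1.0; // rarer terms weigh more
    return tf * idf;
}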

From source file:edu.utsa.sifter.som.SelfOrganizingMap.java

License:Apache License

public void updateCell(final int id, final double alpha, final IntArrayWritable doc) {
    // Scalable SOM updating, per Roussinov

    final double rate = 1 - alpha;
    final double f = CellFactors[id];
    final double nextF = rate * f; // Rule 5

    final double adjustment = alpha / (rate * CellFactors[id]); // Rule 6
    double sumSqrOld = 0.0;
    double sumSqrNew = 0.0;
    // Kahan summation algorithm to account for floating-point error,
    // cf. http://en.wikipedia.org/wiki/Kahan_summation_algorithm
    double c1 = 0.0, c2 = 0.0, y, t;

    final Vector weights = getCell(id);

    double weight;
    double trueWeight;
    int idx;
    final int[] terms = doc.getInts();
    final int numTerms = doc.getLength();
    for (int i = 0; i < numTerms; ++i) {
        idx = terms[i];
        weight = weights.getQuick(idx);

        trueWeight = weight * f;
        y = (trueWeight * trueWeight) - c1;
        t = sumSqrOld + y; // S'(t+1) component
        c1 = (t - sumSqrOld) - y;
        sumSqrOld = t;
        // sumSqrOld += trueWeight * trueWeight;

        weight += adjustment; // adjust weight

        trueWeight = weight * nextF;

        y = (trueWeight * trueWeight) - c2;
        t = sumSqrNew + y;
        c2 = (t - sumSqrNew) - y;
        sumSqrNew = t;
        // sumSqrNew += trueWeight * trueWeight; // S_2'(t+1) component

        weights.setQuick(idx, weight);
    }
    CellFactors[id] = nextF;
    S2[id] = sumSqrNew + (rate * rate) * (S2[id] - sumSqrOld); // new S2 component
}
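
The c1/c2 bookkeeping above interleaves two compensated sums; in isolation, the same Kahan algorithm looks like the sketch below (kahanSum and its variable names are illustrative, not part of the SOM code).

// Sums values while tracking the low-order bits lost to rounding at each step.
static double kahanSum(double[] values) {
    double sum = 0.0;
    double c = 0.0;           // running compensation for lost low-order bits
    for (double v : values) {
        double y = v - c;     // remove the error carried over from the previous step
        double t = sum + y;   // low-order bits of y may be lost in this addition
        c = (t - sum) - y;    // (t - sum) recovers y's high part; subtracting y isolates the loss
        sum = t;
    }
    return sum;
}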

From source file:hadoop.api.AggregateAndRecommendReducer.java

License:Apache License

private void reduceNonBooleanData(VarLongWritable userID, Iterable<PrefAndSimilarityColumnWritable> values,
        Context context) throws IOException, InterruptedException {
    /* each entry here is the sum in the numerator of the prediction formula */
    Vector numerators = null;
    /* each entry here is the sum in the denominator of the prediction formula */
    Vector denominators = null;
    /* each entry here is the number of similar items used in the prediction formula */
    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
        Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
        float prefValue = prefAndSimilarityColumn.getPrefValue();
        /* count the number of items used for each prediction */
        for (Element e : simColumn.nonZeroes()) {
            int itemIDIndex = e.index();
            numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
        }

        if (denominators == null) {
            denominators = simColumn.clone();
        } else {
            denominators.assign(simColumn, Functions.PLUS_ABS);
        }

        if (numerators == null) {
            numerators = simColumn.clone();
            if (prefValue != BOOLEAN_PREF_VALUE) {
                numerators.assign(Functions.MULT, prefValue);
            }
        } else {
            if (prefValue != BOOLEAN_PREF_VALUE) {
                simColumn.assign(Functions.MULT, prefValue);
            }
            numerators.assign(simColumn, Functions.PLUS);
        }

    }

    if (numerators == null) {
        return;
    }

    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (Element element : numerators.nonZeroes()) {
        int itemIDIndex = element.index();
        /* preference estimations must be based on at least 2 datapoints */
        if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
            /* compute normalized prediction */
            double prediction = element.get() / denominators.getQuick(itemIDIndex);
            recommendationVector.setQuick(itemIDIndex, prediction);
        }
    }
    writeRecommendedItems(userID, recommendationVector, context);
}
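
The accumulation above implements the usual item-based prediction: an item's estimate is the preference-weighted sum of similarities divided by the sum of their absolute values. A toy illustration with invented numbers:

// Illustrative only; sims and prefs are made-up values, not data from the job.
static double predict(double[] sims, double[] prefs) {
    double numerator = 0.0;
    double denominator = 0.0;
    for (int j = 0; j < sims.length; j++) {
        numerator   += sims[j] * prefs[j]; // Functions.MULT then Functions.PLUS, as in the reducer
        denominator += Math.abs(sims[j]);  // Functions.PLUS_ABS
    }
    return numerator / denominator;
}
// predict(new double[]{0.9, -0.2, 0.5}, new double[]{4.0, 2.0, 5.0}) == 5.7 / 1.6 ≈ 3.56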

From source file:mahout.classifier.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels maps classId => label
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // close the TokenStream to avoid a resource leak
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // The classifier returns one score per label; the label with the
        // highest score is the one the tweet is most likely associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}

From source file:nl.gridline.zieook.inx.movielens.AggregateAndRecommendReducer.java

License:Apache License

private void reduceNonBooleanData(VarLongWritable userID, Iterable<PrefAndSimilarityColumnWritable> values,
        Context context) throws IOException, InterruptedException {
    /* each entry here is the sum in the numerator of the prediction formula */
    Vector numerators = null;
    /* each entry here is the sum in the denominator of the prediction formula */
    Vector denominators = null;
    /* each entry here is the number of similar items used in the prediction formula */
    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
        Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
        float prefValue = prefAndSimilarityColumn.getPrefValue();
        /* count the number of items used for each prediction */
        Iterator<Vector.Element> usedItemsIterator = simColumn.iterateNonZero();
        while (usedItemsIterator.hasNext()) {
            int itemIDIndex = usedItemsIterator.next().index();
            numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
        }

        numerators = numerators == null
                ? prefValue == BOOLEAN_PREF_VALUE ? simColumn.clone() : simColumn.times(prefValue)
                : numerators.plus(prefValue == BOOLEAN_PREF_VALUE ? simColumn : simColumn.times(prefValue));

        simColumn.assign(ABSOLUTE_VALUES);
        denominators = denominators == null ? simColumn : denominators.plus(simColumn);
    }

    if (numerators == null) {
        return;
    }

    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    Iterator<Vector.Element> iterator = numerators.iterateNonZero();
    while (iterator.hasNext()) {
        Vector.Element element = iterator.next();
        int itemIDIndex = element.index();
        /* preference estimations must be based on at least 2 datapoints */
        if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
            /* compute normalized prediction */
            double prediction = element.get() / denominators.getQuick(itemIDIndex);
            recommendationVector.setQuick(itemIDIndex, prediction);
        }
    }
    writeRecommendedItems(userID, recommendationVector, context);
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java

License:Apache License

private void initializeM(Vector averageRatings) throws IOException {
    Random random = RandomUtils.getRandom();

    FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
    try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(),
            new Path(pathToM(-1), "part-m-00000"), IntWritable.class, VectorWritable.class)) {
        IntWritable index = new IntWritable();
        VectorWritable featureVector = new VectorWritable();

        for (Vector.Element e : averageRatings.nonZeroes()) {
            Vector row = new DenseVector(numFeatures);
            row.setQuick(0, e.get());
            for (int m = 1; m < numFeatures; m++) {
                row.setQuick(m, random.nextDouble());
            }
            index.set(e.index());
            featureVector.set(row);
            writer.append(index, featureVector);
        }
    }
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper.java

License:Apache License

@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
        throws IOException, InterruptedException {
    Vector userRatings = vectorWritable.get();

    int column = TasteHadoopUtils.idToIndex(rowIndex.get());

    itemVectorWritable.setWritesLaxPrecision(true);

    Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
    for (Vector.Element elem : userRatings.nonZeroes()) {
        itemID.set(elem.index());
        itemVector.setQuick(column, elem.get());
        itemVectorWritable.set(itemVector);
        ctx.write(itemID, itemVectorWritable);
        // reset vector for reuse
        itemVector.setQuick(elem.index(), 0.0);
    }
}

From source file:org.hf.mls.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper.java

License:Apache License

@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
        throws IOException, InterruptedException {
    Vector userRatings = vectorWritable.get();

    int numElementsBeforeSampling = userRatings.getNumNondefaultElements();
    userRatings = Vectors.maybeSample(userRatings, sampleSize);
    int numElementsAfterSampling = userRatings.getNumNondefaultElements();

    int column = TasteHadoopUtils.idToIndex(rowIndex.get());

    itemVectorWritable.setWritesLaxPrecision(true);

    Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
    for (Vector.Element elem : userRatings.nonZeroes()) {
        itemID.set(elem.index());
        itemVector.setQuick(column, elem.get());
        itemVectorWritable.set(itemVector);
        ctx.write(itemID, itemVectorWritable);
        // reset vector for reuse
        itemVector.setQuick(elem.index(), 0.0);
    }

    ctx.getCounter(Elements.USER_RATINGS_USED).increment(numElementsAfterSampling);
    ctx.getCounter(Elements.USER_RATINGS_NEGLECTED)
            .increment(numElementsBeforeSampling - numElementsAfterSampling);
}

From source file:org.qcri.pca.MeanAndSpanJob.java

/**
 * This method replaces Vector.assign to allow optimization for zero-indifferent
 * functions (functions where f(x, 0) == x), visiting only the non-zero elements
 * of the other vector.
 * 
 * @param vector
 *          the vector to be updated
 * @param other
 *          the other vector
 * @param function
 *          the function that operates on elements of the two vectors
 * @return the modified vector
 */
static public Vector vectorAssign(Vector vector, Vector other, ZeroIndifferentFunc function) {
    if (vector.size() != other.size()) {
        throw new CardinalityException(vector.size(), other.size());
    }
    // special case: iterate only over the non-zero elements of the vector to add
    Iterator<Element> it = other.nonZeroes().iterator();
    Element e;
    while (it.hasNext() && (e = it.next()) != null) {
        double val = vector.getQuick(e.index());
        double newVal = function.apply(val, e.get());
        vector.setQuick(e.index(), newVal);
    }
    return vector;
}
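
A hypothetical usage sketch follows. The only contract visible above is that ZeroIndifferentFunc exposes apply(double, double) and that the function must satisfy f(x, 0) == x (otherwise skipping the zero entries of other would change the result); the anonymous implementation below, and the assumption that ZeroIndifferentFunc can be subclassed this way, are illustrative guesses. DenseVector and RandomAccessSparseVector come from org.apache.mahout.math.

// Hypothetical: assumes ZeroIndifferentFunc can be implemented via apply(double, double).
Vector vector = new DenseVector(new double[] { 1.0, 2.0, 3.0 });
Vector other = new RandomAccessSparseVector(3);
other.setQuick(1, 10.0);                        // single non-zero entry
Vector result = vectorAssign(vector, other, new ZeroIndifferentFunc() {
    @Override
    public double apply(double a, double b) {
        return a + b;                           // addition is zero-indifferent: a + 0 == a
    }
});
// result is (1.0, 12.0, 3.0); only index 1, the sole non-zero of `other`, was visited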