List of usage examples for the org.apache.mahout.math.SequentialAccessSparseVector constructor
public SequentialAccessSparseVector(SequentialAccessSparseVector other)
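Before the project-specific examples below, here is a minimal standalone sketch of how this constructor is typically reached: a vector is built with random access and then copied into a SequentialAccessSparseVector (or another SequentialAccessSparseVector) for fast sequential iteration. This sketch is not taken from the sources below; it only uses the standard Mahout math API.

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;

public class SequentialAccessSparseVectorExample {
    public static void main(String[] args) {
        // Build the vector with random writes first.
        Vector scratch = new RandomAccessSparseVector(10);
        scratch.setQuick(2, 1.5);
        scratch.setQuick(7, 3.0);

        // Copy into a sequential-access representation: the same non-zero
        // entries, stored in index order for fast sequential reads.
        SequentialAccessSparseVector sequential = new SequentialAccessSparseVector(scratch);

        // The constructor documented above: copy an existing
        // SequentialAccessSparseVector.
        SequentialAccessSparseVector copy = new SequentialAccessSparseVector(sequential);

        System.out.println(copy);
    }
}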
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);

            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);

            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);

        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
From source file:edu.rosehulman.TFPartialVectorReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }

    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file:guipart.view.GUIOverviewController.java
@FXML
void handleClassifyModel(ActionEvent event) throws IOException {

    if (pathModel != null && pathCSV != null) {

        Auc collector = new Auc();
        LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(pathModel));

        CsvRecordFactory csv = lmp.getCsvRecordFactory();
        OnlineLogisticRegression lr = lmp.createRegression();

        BufferedReader in = Utils.open(pathCSV);

        String line = in.readLine();
        csv.firstLine(line);
        line = in.readLine();

        int correct = 0;
        int wrong = 0;
        Boolean booltemp;
        String gender;

        while (line != null) {

            Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
            int target = csv.processLine(line, v);
            String[] split = line.split(",");

            double score = lr.classifyFull(v).maxValueIndex();

            if (score == target)
                correct++;
            else
                wrong++;

            System.out.println("Target is: " + target + " Score: " + score);

            booltemp = score != 0;

            if (split[1].contentEquals("1"))
                gender = "male";
            else
                gender = "female";

            Person temp = new Person(Integer.parseInt(split[0]), Integer.parseInt(split[4]),
                    Integer.parseInt(split[7]), booltemp, gender, Integer.parseInt(split[5]),
                    Integer.parseInt(split[6]), Integer.parseInt(split[3]));

            guiPart.addPerson(temp);

            line = in.readLine();
            collector.add(target, score);
        }

        double posto = ((double) wrong / (double) (correct + wrong)) * 100;

        System.out.println("Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: " + wrong
                + " Wrong pct: " + posto + "%");

        Matrix m = collector.confusion();
        System.out.println("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t " + m.get(0, 1) + " "
                + m.get(1, 1) + " ");

        textAnalyze2.setText("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t \t " + m.get(0, 1) + " "
                + m.get(1, 1) + "\n" + "Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: "
                + wrong + " Wrong pct: " + posto + "%");

    } else {
        Dialogs.create().owner(guiPart.getPrimaryStage()).title("Error Dialog")
                .masthead("Look, an Error Dialog").message("One or more files aren't selected").showError();
    }
}
From source file:guipart.view.GUIOverviewController.java
@FXML
void singlClassify(ActionEvent e) throws IOException {

    LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(pathModel));
    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    OnlineLogisticRegression lr = lmp.createRegression();

    csv.firstLine("custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk");

    String line;
    line = scID.getText();
    line = line.concat("," + scGender.getText());
    line = line.concat("," + scState.getText());
    line = line.concat("," + scCardholders.getText());
    line = line.concat("," + scBalance.getText());
    line = line.concat("," + scTrans.getText());
    line = line.concat("," + scIntlTrans.getText());
    line = line.concat("," + scCreditLine.getText());
    line = line.concat(",0 \n");

    Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
    int target = csv.processLine(line, v);
    String[] split = line.split(",");

    double score = lr.classifyFull(v).maxValueIndex();
    boolean booltemp = score != 0;
    String gender;

    if (split[1].contentEquals("1"))
        gender = "male";
    else
        gender = "female";

    Person temp = new Person(Integer.parseInt(split[0]), Integer.parseInt(split[4]), Integer.parseInt(split[7]),
            booltemp, gender, Integer.parseInt(split[5]), Integer.parseInt(split[6]),
            Integer.parseInt(split[3]));

    guiPart.addPerson(temp);
}
From source file:haflow.component.mahout.logistic.RunLogistic.java
License:Apache License
static void mainToOutput(String[] args) throws Exception {
    if (parseArgs(args)) {
        if (!showAuc && !showConfusion && !showScores) {
            showAuc = true;
            showConfusion = true;
        }

        PrintWriter output = new PrintWriter(HdfsUtil.writeHdfs(outputFile), true);
        PrintWriter acc_output = new PrintWriter(HdfsUtil.writeHdfs(accurateFile), true);
        Auc collector = new Auc();
        LogisticModelParameters lmp = LogisticModelParameters.loadFrom(HdfsUtil.open(modelFile));

        CsvRecordFactory csv = lmp.getCsvRecordFactory();
        OnlineLogisticRegression lr = lmp.createRegression();
        BufferedReader in = new BufferedReader(new InputStreamReader(HdfsUtil.open(inputFile)));

        String line = in.readLine();
        csv.firstLine(line);
        line = in.readLine();
        if (showScores) {
            output.println("\"target\",\"model-output\",\"log-likelihood\"");
        }
        while (line != null) {
            Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
            int target = csv.processLine(line, v);

            double score = lr.classifyScalar(v);
            if (showScores) {
                output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
            }
            collector.add(target, score);
            line = in.readLine();
        }

        if (showAuc) {
            acc_output.printf(Locale.ENGLISH, "AUC , %.2f%n", collector.auc());
        }
        if (showConfusion) {
            Matrix m = collector.confusion();
            acc_output.printf(Locale.ENGLISH, "confusion, [[%.1f %.1f], [%.1f %.1f]]%n", m.get(0, 0),
                    m.get(1, 0), m.get(0, 1), m.get(1, 1));

            m = collector.entropy();
            acc_output.printf(Locale.ENGLISH, "entropy, [[%.1f %.1f], [%.1f %.1f]]%n", m.get(0, 0),
                    m.get(1, 0), m.get(0, 1), m.get(1, 1));
        }
        output.close();
        acc_output.close();
    }
}
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
public static void matrix2Vector(Configuration conf, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = null;
    // read each row of the SequenceFile so it can be rewritten as a NamedVector
    reader = new SequenceFile.Reader(fs, path, conf);
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable val = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

    Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class,
                CompressionType.BLOCK);
        final IntWritable key1 = new IntWritable();
        final VectorWritable value = new VectorWritable();

        int lineNum = 0;
        Vector vector = null;
        while (reader.next(key, val)) {
            int index = 0;
            StringTokenizer st = new StringTokenizer(val.toString());
            // wrap a SequentialAccessSparseVector in a NamedVector, named after the line number
            vector = new NamedVector(new SequentialAccessSparseVector(Cardinality), lineNum + "");
            while (st.hasMoreTokens()) {
                if (Integer.parseInt(st.nextToken()) == 1) {
                    vector.set(index, 1);
                }
                index++;
            }
            key1.set(lineNum++);
            value.set(vector);
            writer.append(key1, value); // write the line number as the key
        }
    } finally {
        writer.close();
        reader.close();
    }
}
From source file:javaapplication3.RunLogistic.java
public static void main(String[] args) throws IOException {

    Auc collector = new Auc();
    LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));

    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    OnlineLogisticRegression lr = lmp.createRegression();
    BufferedReader in = open(inputFile);

    String line = in.readLine();
    csv.firstLine(line);
    line = in.readLine();

    int correct = 0;
    int wrong = 0;

    while (line != null) {

        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
        int target = csv.processLine(line, v);
        System.out.println(line);
        String[] split = line.split(",");

        double score = lr.classifyFull(v).maxValueIndex();

        if (score == target)
            correct++;
        else
            wrong++;

        System.out.println("Target is: " + target + " Score: " + score);

        line = in.readLine();
        collector.add(target, score);
    }

    double posto = ((double) wrong / (double) (correct + wrong)) * 100;

    System.out.println("Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: " + wrong
            + " Wrong pct: " + posto + "%");

    Matrix m = collector.confusion();
    System.out.println("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t " + m.get(0, 1) + " "
            + m.get(1, 1) + " ");
}
From source file:net.aprendizajengrande.ontocluster.RedisToVectors.java
License:Open Source License
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.err.println("Usage: <hdfs folder for input>");
        System.exit(1);
    }

    Configuration conf = new Configuration();

    System.out.println("Input: " + args[0]);

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    String inputName = args[0] + "/input";
    String relsInputName = args[0] + "/rels";
    String instancesInputName = args[0] + "/instances";

    Path input = new Path(inputName);
    Path relsInput = new Path(relsInputName);
    Path instancesInput = new Path(instancesInputName);

    // see http://stackoverflow.com/questions/14993644/configure-jedis-timeout
    Jedis jedis = new Jedis("localhost", 6379, 18000);

    // create the relations and instances first, so we know what to expect
    Set<String> rels = jedis.keys("rel-nom-*");

    Map<Integer, String> relIdToName = new HashMap<>();

    FSDataOutputStream fsdos = relsInput.getFileSystem(conf).create(relsInput);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(fsdos));
    int relNum = 0;
    for (String rel : rels) {
        String relName = rel.replaceAll("^rel-nom-", "");
        int relId = Integer.parseInt(jedis.get(rel));
        relIdToName.put(relId, relName);
        if (relId > relNum)
            relNum = relId;
    }
    relNum++;
    for (int i = 0; i < relNum; i++)
        pw.println(i + "\t" + relIdToName.get(i));
    pw.close();
    rels.clear();

    Set<String> instances = jedis.keys("res-nom-*");
    fsdos = instancesInput.getFileSystem(conf).create(instancesInput);
    pw = new PrintWriter(new OutputStreamWriter(fsdos));
    for (String instance : instances) {
        int instanceId = Integer.parseInt(instance.replaceAll("^res-nom-", ""));
        String instanceName = jedis.get(instance);
        pw.println(instanceId + "\t" + instanceName);
    }
    pw.close();
    instances.clear();

    Set<String> keys = jedis.keys("r-*");

    SequenceFile.Writer writer = SequenceFile.createWriter(conf, Writer.file(input),
            Writer.keyClass(Text.class), Writer.valueClass(VectorWritable.class));

    for (String key : keys) {
        Set<String> theseRels = jedis.smembers(key);

        Vector s = new SequentialAccessSparseVector(relNum);
        for (String relId : theseRels)
            s.set(Integer.parseInt(relId), 1.0);

        VectorWritable v = new VectorWritable(s);
        writer.append(new Text(key), v);
    }

    writer.close();
    jedis.close();
}
From source file:org.qcri.pca.FileFormat.java
public static void convertFromDenseToSeq(String inputPath, int cardinality, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        int lineNumber = 0;
        String thisLine;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) { // if it is a file
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            Vector vector = null;
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) {
                if (thisLine.isEmpty())
                    continue;
                String[] splitted = thisLine.split("\\s+");
                vector = new SequentialAccessSparseVector(splitted.length);
                for (int i = 0; i < splitted.length; i++) {
                    vector.set(i, Double.parseDouble(splitted[i]));
                }
                key.set(lineNumber);
                value.set(vector);
                writer.append(key, value); // write this row
                lineNumber++;
            }
            writer.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:org.qcri.pca.FileFormat.java
public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = null;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();
        Vector vector = null;

        String thisLine;
        int prevRowID = -1;
        boolean first = true;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) { // if it is a file
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) {
                String[] splitted = thisLine.split(",");
                int rowID = Integer.parseInt(splitted[0]);
                int colID = Integer.parseInt(splitted[1]);
                double element = Double.parseDouble(splitted[2]);
                if (first) {
                    first = false;
                    vector = new SequentialAccessSparseVector(cardinality);
                } else if (rowID != prevRowID) {
                    key.set(prevRowID);
                    value.set(vector);
                    writer.append(key, value); // write the completed previous row
                    vector = new SequentialAccessSparseVector(cardinality);
                }
                prevRowID = rowID;
                vector.set(colID - base, element);
            }
            // the last vector of each file is not written here; rows are assumed to
            // continue into the next file, and the final row is appended below
        }
        if (writer != null) { // append the last vector of the last file
            key.set(prevRowID);
            value.set(vector);
            writer.append(key, value); // write last row
            writer.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}