Example usage for org.apache.mahout.math SequentialAccessSparseVector SequentialAccessSparseVector

Introduction

On this page you can find example usage of the org.apache.mahout.math SequentialAccessSparseVector copy constructor, SequentialAccessSparseVector(SequentialAccessSparseVector other).

Prototype

public SequentialAccessSparseVector(SequentialAccessSparseVector other) 
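
A minimal sketch of the copy constructor in isolation (the cardinality and values here are hypothetical): it produces an independent copy with the same cardinality and non-zero entries. Note that several of the examples below also use the sibling constructors SequentialAccessSparseVector(int cardinality) and SequentialAccessSparseVector(Vector other).

SequentialAccessSparseVector original = new SequentialAccessSparseVector(100);
original.set(3, 1.0);
original.set(42, 2.5);

// Copy constructor: the copy shares no state with the original.
SequentialAccessSparseVector copy = new SequentialAccessSparseVector(original);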

Usage

From source file: de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java

License: Open Source License

public void vectorize(File luceneIndexDir, File outputDir) throws Exception {

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;

    FeatureDictionary dict = new FeatureDictionary();

    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));

        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {

                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        int numDocsVectorized = 0;

        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);

            int itemID = doc.getField("itemID").numericValue().intValue();

            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {

                String fieldName = field.name();

                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {

                    Terms termFreqVector = reader.getTermVector(docID, fieldName);

                    if (termFreqVector != null) {

                        int maxTermFrequency = maxTermFrequency(termFreqVector);

                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;

                        while ((term = te.next()) != null) {

                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();

                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();

                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);

                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }

                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);

            weighting.normalize(featureVector);

            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);

            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }

        log.info("Vectorized {} documents", numDocsVectorized);

        dict.writeToFile(new File(outputDir, "features.txt"));

        log.info("Wrote feature dictionary");

    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }

}
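
The pattern above is worth noting: weights are accumulated in a RandomAccessSparseVector, which is cheap to write in arbitrary order, and copied into a SequentialAccessSparseVector only once the vector is final, since the sequential layout is compact and iterates non-zeros in index order. A minimal sketch of that conversion (numFeatures and the index/value are hypothetical):

RandomAccessSparseVector buffer = new RandomAccessSparseVector(numFeatures);
buffer.setQuick(7, 0.5); // random-order writes are cheap here

Vector sequential = new SequentialAccessSparseVector(buffer); // compact copy
for (Vector.Element e : sequential.nonZeroes()) { // fast in-order reads
    // e.index(), e.get(); older Mahout versions expose iterateNonZero() instead
}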

From source file: edu.rosehulman.TFPartialVectorReducer.java

License: Apache License

@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());

            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
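
Here the conversion is optional and driven by the sequentialAccess flag, mirroring Mahout's seq2sparse options. The wrapping order matters: the vector is repacked into sequential form first and only then wrapped in a NamedVector, so the name survives into the written VectorWritable. A condensed sketch (the name string is hypothetical):

Vector out = new RandomAccessSparseVector(dimension);
// ... accumulate term counts into out ...
out = new SequentialAccessSparseVector(out); // only if sequentialAccess is set
out = new NamedVector(out, "docId-42");      // only if namedVector is set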

From source file: guipart.view.GUIOverviewController.java

@FXML
void handleClassifyModel(ActionEvent event) throws IOException {

    if (pathModel != null && pathCSV != null) {

        Auc collector = new Auc();
        LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(pathModel));

        CsvRecordFactory csv = lmp.getCsvRecordFactory();
        OnlineLogisticRegression lr = lmp.createRegression();

        BufferedReader in = Utils.open(pathCSV);

        String line = in.readLine();
        csv.firstLine(line);
        line = in.readLine();

        int correct = 0;
        int wrong = 0;
        Boolean booltemp;
        String gender;

        while (line != null) {

            Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
            int target = csv.processLine(line, v);
            String[] split = line.split(",");

            double score = lr.classifyFull(v).maxValueIndex();
            if (score == target)
                correct++;
            else
                wrong++;

            System.out.println("Target is: " + target + " Score: " + score);

            booltemp = score != 0;

            if (split[1].contentEquals("1"))
                gender = "male";
            else
                gender = "female";

            Person temp = new Person(Integer.parseInt(split[0]), Integer.parseInt(split[4]),
                    Integer.parseInt(split[7]), booltemp, gender, Integer.parseInt(split[5]),
                    Integer.parseInt(split[6]), Integer.parseInt(split[3]));

            guiPart.addPerson(temp);

            line = in.readLine();
            collector.add(target, score);

        }
        double posto = ((double) wrong / (double) (correct + wrong)) * 100;
        System.out.println("Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: " + wrong
                + " Wrong pct: " + posto + "%");
        //PrintWriter output = null;
        Matrix m = collector.confusion();
        //output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n",m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
        System.out.println("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t   " + m.get(0, 1) + " "
                + m.get(1, 1) + " ");
        //        m = collector.entropy();
        //output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n",m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
        textAnalyze2.setText("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t \t   " + m.get(0, 1) + " "
                + m.get(1, 1) + "\n" + "Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: "
                + wrong + " Wrong pct: " + posto + "%");
    } else {

        Dialogs.create().owner(guiPart.getPrimaryStage()).title("Error Dialog")
                .masthead("Look, an Error Dialog").message("One or more files aren't selected").showError();

    }
}

From source file: guipart.view.GUIOverviewController.java

@FXML
void singlClassify(ActionEvent e) throws IOException {

    LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(pathModel));

    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    OnlineLogisticRegression lr = lmp.createRegression();
    csv.firstLine("custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk");

    String line;

    line = scID.getText();
    line = line.concat("," + scGender.getText());
    line = line.concat("," + scState.getText());
    line = line.concat("," + scCardholders.getText());
    line = line.concat("," + scBalance.getText());
    line = line.concat("," + scTrans.getText());
    line = line.concat("," + scIntlTrans.getText());
    line = line.concat("," + scCreditLine.getText());
    line = line.concat(",0 \n");

    Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
    int target = csv.processLine(line, v);
    String[] split = line.split(",");

    double score = lr.classifyFull(v).maxValueIndex();
    boolean booltemp = score != 0;

    String gender;

    if (split[1].contentEquals("1"))
        gender = "male";
    else
        gender = "female";

    Person temp = new Person(Integer.parseInt(split[0]), Integer.parseInt(split[4]), Integer.parseInt(split[7]),
            booltemp, gender, Integer.parseInt(split[5]), Integer.parseInt(split[6]),
            Integer.parseInt(split[3]));

    guiPart.addPerson(temp);
}
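
In both GUI handlers, classifyFull returns a vector with one score per category, and maxValueIndex() picks the index of the highest-scoring category, which for a binary model is 0 or 1. Condensed from the code above (v and lr as in the example):

Vector scores = lr.classifyFull(v);     // one score per category
int predicted = scores.maxValueIndex(); // argmax = predicted class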

From source file: haflow.component.mahout.logistic.RunLogistic.java

License: Apache License

static void mainToOutput(String[] args) throws Exception {
    if (parseArgs(args)) {
        if (!showAuc && !showConfusion && !showScores) {
            showAuc = true;
            showConfusion = true;
        }

        //PrintWriter output=new PrintWriter(new FileOutputStream(outputFile),true);

        PrintWriter output = new PrintWriter(HdfsUtil.writeHdfs(outputFile), true);
        PrintWriter acc_output = new PrintWriter(HdfsUtil.writeHdfs(accurateFile), true);
        Auc collector = new Auc();
        LogisticModelParameters lmp = LogisticModelParameters.loadFrom(HdfsUtil.open(modelFile));

        CsvRecordFactory csv = lmp.getCsvRecordFactory();
        OnlineLogisticRegression lr = lmp.createRegression();
        BufferedReader in = new BufferedReader(new InputStreamReader(HdfsUtil.open(inputFile)));
        String line = in.readLine();
        csv.firstLine(line);
        line = in.readLine();
        if (showScores) {
            output.println("\"target\",\"model-output\",\"log-likelihood\"");
        }
        while (line != null) {
            Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
            int target = csv.processLine(line, v);

            double score = lr.classifyScalar(v);
            if (showScores) {
                output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
            }
            collector.add(target, score);
            line = in.readLine();
        }

        if (showAuc) {
            acc_output.printf(Locale.ENGLISH, "AUC , %.2f%n", collector.auc());
        }
        if (showConfusion) {
            Matrix m = collector.confusion();
            acc_output.printf(Locale.ENGLISH, "confusion, [[%.1f  %.1f], [%.1f  %.1f]]%n", m.get(0, 0),
                    m.get(1, 0), m.get(0, 1), m.get(1, 1));
            m = collector.entropy();
            acc_output.printf(Locale.ENGLISH, "entropy, [[%.1f  %.1f], [%.1f  %.1f]]%n", m.get(0, 0),
                    m.get(1, 0), m.get(0, 1), m.get(1, 1));
        }
        output.close();
        acc_output.close();
    }
}
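
This class shows the usual Mahout SGD scoring idiom: allocate a fresh sparse vector sized to the model's feature count, let the CsvRecordFactory encode one CSV line into it, then score it. Condensed, with the same names as the example:

Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
int target = csv.processLine(line, v); // fills v, returns the true label
double score = lr.classifyScalar(v);   // score for category 1 of the binary model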

From source file: hk.newsRecommender.MatrixAndCluster.java

License: Open Source License

public static void matrix2Vector(Configuration conf, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader = null;
    // Open the SequenceFile and instantiate its key/value types via reflection
    reader = new SequenceFile.Reader(fs, path, conf);
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable val = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class,
                CompressionType.BLOCK);
        final IntWritable key1 = new IntWritable();
        final VectorWritable value = new VectorWritable();
        int lineNum = 0;
        Vector vector = null;
        while (reader.next(key, val)) {
            int index = 0;
            StringTokenizer st = new StringTokenizer(val.toString());
            // Wrap the SequentialAccessSparseVector in a NamedVector keyed by line number
            vector = new NamedVector(new SequentialAccessSparseVector(Cardinality), lineNum + "");
            while (st.hasMoreTokens()) {
                if (Integer.parseInt(st.nextToken()) == 1) {
                    vector.set(index, 1);
                }
                index++;
            }
            key1.set(lineNum++);
            value.set(vector);
            writer.append(key1, value); // append with the IntWritable key the writer was created for
        }
    } finally {
        writer.close();
        reader.close();
    }
}

From source file: javaapplication3.RunLogistic.java

public static void main(String[] args) throws IOException {
    // TODO code application logic here
    Auc collector = new Auc();

    LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));

    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    OnlineLogisticRegression lr = lmp.createRegression();

    BufferedReader in = open(inputFile);

    String line = in.readLine();
    csv.firstLine(line);
    line = in.readLine();
    int correct = 0;
    int wrong = 0;
    while (line != null) {
        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
        int target = csv.processLine(line, v);

        System.out.println(line);
        String[] split = line.split(",");

        double score = lr.classifyFull(v).maxValueIndex();
        if (score == target)
            correct++;
        else
            wrong++;

        System.out.println("Target is: " + target + " Score: " + score);
        line = in.readLine();
        collector.add(target, score);

    }
    double posto = ((double) wrong / (double) (correct + wrong)) * 100;
    System.out.println("Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: " + wrong
            + " Wrong pct: " + posto + "%");
    //PrintWriter output = null;
    Matrix m = collector.confusion();
    //output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n",m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
    System.out.println("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t   " + m.get(0, 1) + " "
            + m.get(1, 1) + " ");
    //        m = collector.entropy();
    //output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n",m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));

}

From source file: net.aprendizajengrande.ontocluster.RedisToVectors.java

License: Open Source License

public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.err.println("Usage: <hdfs folder for input>");
        System.exit(1);
    }

    Configuration conf = new Configuration();

    System.out.println("Input: " + args[0]);

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    String inputName = args[0] + "/input";
    String relsInputName = args[0] + "/rels";
    String instancesInputName = args[0] + "/instances";

    Path input = new Path(inputName);
    Path relsInput = new Path(relsInputName);
    Path instancesInput = new Path(instancesInputName);

    // see http://stackoverflow.com/questions/14993644/configure-jedis-timeout
    Jedis jedis = new Jedis("localhost", 6379, 18000);

    // create the relations and instances first, so we know what to expect
    Set<String> rels = jedis.keys("rel-nom-*");

    Map<Integer, String> relIdToName = new HashMap<>();

    FSDataOutputStream fsdos = relsInput.getFileSystem(conf).create(relsInput);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(fsdos));

    int relNum = 0;
    for (String rel : rels) {
        String relName = rel.replaceAll("^rel-nom-", "");
        int relId = Integer.parseInt(jedis.get(rel));
        relIdToName.put(relId, relName);
        if (relId > relNum)
            relNum = relId;
    }
    relNum++;
    for (int i = 0; i < relNum; i++)
        pw.println(i + "\t" + relIdToName.get(i));
    pw.close();
    rels.clear();

    Set<String> instances = jedis.keys("res-nom-*");

    fsdos = instancesInput.getFileSystem(conf).create(instancesInput);
    pw = new PrintWriter(new OutputStreamWriter(fsdos));

    for (String instance : instances) {
        int instanceId = Integer.parseInt(instance.replaceAll("^res-nom-", ""));
        String instanceName = jedis.get(instance);
        pw.println(instanceId + "\t" + instanceName);
    }
    pw.close();
    instances.clear();

    Set<String> keys = jedis.keys("r-*");

    SequenceFile.Writer writer = SequenceFile.createWriter(conf, Writer.file(input),
            Writer.keyClass(Text.class), Writer.valueClass(VectorWritable.class));

    for (String key : keys) {
        Set<String> theseRels = jedis.smembers(key);

        Vector s = new SequentialAccessSparseVector(relNum);
        for (String relId : theseRels)
            s.set(Integer.parseInt(relId), 1.0);
        VectorWritable v = new VectorWritable(s);
        writer.append(new Text(key), v);
    }
    writer.close();

    jedis.close();
}

From source file: org.qcri.pca.FileFormat.java

public static void convertFromDenseToSeq(String inputPath, int cardinality, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        int lineNumber = 0;
        String thisLine;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) // if it is a file
        {
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            Vector vector = null;
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) { // while loop begins here
                if (thisLine.isEmpty())
                    continue;
                String[] splitted = thisLine.split("\\s+");
                vector = new SequentialAccessSparseVector(splitted.length);
                for (int i = 0; i < splitted.length; i++) {
                    vector.set(i, Double.parseDouble(splitted[i]));
                }
                key.set(lineNumber);
                value.set(vector);
                //System.out.println(vector);
                writer.append(key, value);//write last row
                lineNumber++;
            }
            writer.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

}
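
A caveat on the dense conversion above: every position of each row is written, so the sparse representation saves no space; it works efficiently only because set() is called in strictly increasing index order, the one write pattern SequentialAccessSparseVector handles well. For genuinely dense rows, a DenseVector is arguably the more natural fit, e.g.:

// Hypothetical alternative for fully dense rows; the surrounding code is unchanged.
Vector vector = new DenseVector(splitted.length);
for (int i = 0; i < splitted.length; i++) {
    vector.set(i, Double.parseDouble(splitted[i]));
}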

From source file: org.qcri.pca.FileFormat.java

public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = null;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        Vector vector = null;

        String thisLine;
        int prevRowID = -1;
        boolean first = true;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) // if it is a file
        {
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) { // while loop begins here            
                String[] splitted = thisLine.split(",");
                int rowID = Integer.parseInt(splitted[0]);
                int colID = Integer.parseInt(splitted[1]);
                double element = Double.parseDouble(splitted[2]);
                if (first) {
                    first = false;
                    vector = new SequentialAccessSparseVector(cardinality);
                } else if (rowID != prevRowID) {
                    key.set(prevRowID);
                    value.set(vector);
                    //System.out.println(vector);
                    writer.append(key, value);//write last row
                    vector = new SequentialAccessSparseVector(cardinality);
                }
                prevRowID = rowID;
                vector.set(colID - base, element);
            }
            /*//here we append the last vector in each file (assuming that we will start a new row in the next file
            key.set(prevRowID);
            value.set(vector);
            //System.out.println("last vector");
            //System.out.println(vector);
            writer.append(key,value);//write last row
            writer.close();
            */
        }
        if (writer != null) //append last vector in last file
        {
            key.set(prevRowID);
            value.set(vector);
            //System.out.println("last vector");
            //System.out.println(vector);
            writer.append(key, value);//write last row
            writer.close();
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
}