Example usage for org.apache.mahout.math Vector size

List of usage examples for org.apache.mahout.math Vector size

Introduction

On this page you can find example usages of org.apache.mahout.math Vector size.

Prototype

int size();

Source Link

Document

Return the cardinality of the recipient (the maximum number of values)

Usage

From source file:com.scaleunlimited.classify.vectors.VectorUtils.java

License:Apache License

/**
 * Renders the non-zero components of a vector as a single line of text.
 *
 * <p>The result starts with the header {@code Vector '<unknown>': } and is followed by one
 * {@code index => value, } entry for every index in {@code [0, v.size())} whose component
 * is not exactly {@code 0.0}. Entries appear in ascending index order and each one,
 * including the last, is terminated by {@code ", "}.
 *
 * @param v the vector to dump
 * @return the textual dump of {@code v}
 */
public static String dumpVector(Vector v) {
    // The accumulator never leaves this method, so the unsynchronized StringBuilder is
    // sufficient; StringBuffer would only add locking overhead.
    StringBuilder result = new StringBuilder();
    result.append(String.format("Vector '%s': ", "<unknown>"));
    int baseSize = v.size();
    for (int i = 0; i < baseSize; i++) {
        double component = v.getQuick(i);
        if (component != 0.0) {
            result.append(String.format("%d => %f, ", i, component));
        }
    }
    return result.toString();
}

From source file:com.scaleunlimited.classify.vectors.VectorUtilsTest.java

License:Apache License

/**
 * Checks {@code VectorUtils.makeExtraVector} with unique terms {@code [a, b]} and document
 * terms {@code {a=1, c=5}}: the returned vector must have size 1 and its only component
 * must truncate to 5.
 *
 * <p>NOTE(review): presumably the "extra" vector holds the counts of document terms that
 * are not in the unique-term list (here only "c") - confirm against VectorUtils.
 */
@Test
public void testMakeExtraVector() {
    List<String> uniqueTerms = new ArrayList<String>(2);
    uniqueTerms.add("a");
    uniqueTerms.add("b");

    Map<String, Integer> docTerms = new HashMap<String, Integer>();
    docTerms.put("a", 1);
    docTerms.put("c", 5);

    Vector v = VectorUtils.makeExtraVector(uniqueTerms, docTerms);
    Assert.assertEquals(1, v.size());
    // A primitive narrowing cast truncates exactly like Double.intValue(), without going
    // through the deprecated Double(double) constructor.
    Assert.assertEquals(5, (int) v.get(0));
}

From source file:com.scaleunlimited.classify.vectors.VectorUtilsTest.java

License:Apache License

/**
 * Checks {@code VectorUtils.appendVectors}: appending a 2-element vector {@code [0, 1]} and
 * a 3-element vector {@code [2, 3, 4]} must give a vector of size 5 whose component at
 * index {@code i} truncates to {@code i}.
 */
@Test
public void testAppend() {
    Vector v1 = new RandomAccessSparseVector(2);
    v1.setQuick(0, 0);
    v1.setQuick(1, 1);

    Vector v2 = new RandomAccessSparseVector(3);
    v2.setQuick(0, 2);
    v2.setQuick(1, 3);
    v2.setQuick(2, 4);

    Vector v3 = VectorUtils.appendVectors(v1, v2);

    Assert.assertEquals(5, v3.size());
    // The size was just asserted to be 5, so iterating up to v3.size() checks the same
    // indices as before while keeping the bound tied to the vector itself.
    for (int i = 0; i < v3.size(); i++) {
        // Primitive cast replaces the deprecated new Double(...).intValue(); both truncate.
        Assert.assertEquals(i, (int) v3.getQuick(i));
    }
}

From source file:com.scaleunlimited.classify.vectors.WritableComparableVectorTest.java

License:Apache License

/**
 * Asserts that two vectors have the same size and the same component at every index.
 *
 * @param vector1 the expected vector
 * @param vector2 the actual vector
 */
private void compareVectors(Vector vector1, Vector vector2) {
    int size = vector1.size();
    Assert.assertEquals(size, vector2.size());
    for (int i = 0; i < size; i++) {
        // Use the delta overload: the two-argument double overload of JUnit 4's
        // Assert.assertEquals is deprecated and fails unconditionally. A delta of 0.0
        // still demands exact component equality.
        Assert.assertEquals(vector1.getQuick(i), vector2.getQuick(i), 0.0);
    }
}

From source file:com.tamingtext.mahout.VectorExamplesTest.java

License:Apache License

/**
 * Book-listing test that (1) builds a dense and two sparse vectors programmatically and
 * checks their equality relations, and (2) writes the two sparse vectors to a Hadoop
 * sequence file created under the JVM temp directory.
 *
 * <p>The {@code <start/>}, {@code <end/>}, {@code <co/>} and {@code <calloutlist>} comments
 * are listing markers and are kept next to the statements they annotate.
 *
 * @throws Exception if the temp file cannot be created or the sequence file cannot be written
 */
@Test
public void testProgrammatic() throws Exception {
    //<start id="vec.examples.programmatic"/>
    double[] vals = new double[] { 0.3, 1.8, 200.228 };
    Vector dense = new DenseVector(vals);//<co id="vec.exam.dense"/>
    // assertEquals reports the actual size on failure, unlike assertTrue(a == b).
    assertEquals(3, dense.size());
    Vector sparseSame = new SequentialAccessSparseVector(3);//<co id="vec.exam.sparse.same"/>
    Vector sparse = new SequentialAccessSparseVector(3000);//<co id="vec.exam.sparse"/>
    for (int i = 0; i < vals.length; i++) {//<co id="vec.exam.assign.sparse"/>
        sparseSame.set(i, vals[i]);
        sparse.set(i, vals[i]);
    }
    assertFalse(dense.equals(sparse));//<co id="vec.exam.notequals.d.s"/>
    assertEquals(dense, sparseSame);//<co id="vec.exam.equals.d.s"/>
    assertFalse(sparse.equals(sparseSame));
    /*
    <calloutlist>
    <callout arearefs="vec.exam.dense"><para>Create a <classname>DenseVector</classname> from 3 values.  The cardinality of this vector is 3 </para></callout>
    <callout arearefs="vec.exam.sparse.same"><para>Create a <classname>SequentialAccessSparseVector</classname> that has cardinality of 3</para></callout>
            
    <callout arearefs="vec.exam.sparse"><para>Create a <classname>SequentialAccessSparseVector</classname> with a cardinality of 3000.</para></callout>
    <callout arearefs="vec.exam.assign.sparse"><para>Set the values to the first 3 items in the sparse vectors.</para></callout>
    <callout arearefs="vec.exam.notequals.d.s"><para>The dense and the sparse <classname>Vector</classname>s are not equal because they have different cardinality.</para></callout>
    <callout arearefs="vec.exam.equals.d.s"><para>The dense and sparseSame <classname>Vector</classname>s are equal because they have the same values and cardinality</para></callout>
            
    </calloutlist>
    */
    //<end id="vec.examples.programmatic"/>
    //<start id="vec.examples.seq.file"/>
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File tmpLoc = new File(tmpDir, "sfvwt");
    tmpLoc.mkdirs();
    File tmpFile = File.createTempFile("sfvwt", ".dat", tmpLoc);

    Path path = new Path(tmpFile.getAbsolutePath());
    Configuration conf = new Configuration();//<co id="vec.examples.seq.conf"/>
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
            VectorWritable.class);//<co id="vec.examples.seq.writer"/>
    VectorWriter vecWriter = new SequenceFileVectorWriter(seqWriter);//<co id="vec.examples.seq.vecwriter"/>
    try {
        List<Vector> vectors = new ArrayList<Vector>();
        vectors.add(sparse);
        vectors.add(sparseSame);
        vecWriter.write(vectors);//<co id="vec.examples.seq.write"/>
    } finally {
        // Close even when write() throws so the underlying file handle is not leaked.
        vecWriter.close();
    }
    /*
    <calloutlist>
    <callout arearefs="vec.examples.seq.conf"><para>Create a <classname>Configuration</classname> for Hadoop</para></callout>
    <callout arearefs="vec.examples.seq.writer"><para>Create a Hadoop <classname>SequenceFile.Writer</classname> to handle the job of physically writing out the vectors to a file on the configured Hadoop <classname>FileSystem</classname></para></callout>
    <callout arearefs="vec.examples.seq.vecwriter"><para>A <classname>VectorWriter</classname> processes the <classname>Vector</classname>s and invokes the underlying write methods on the <classname>SequenceFile.Writer</classname></para></callout>
    <callout arearefs="vec.examples.seq.write"><para>Do the work of writing out the files</para></callout>
            
    </calloutlist>
    */
    //<end id="vec.examples.seq.file"/>
}

From source file:de.isabeldrostfromm.sof.util.Vectors.java

License:Open Source License

/**
 * Concatenates any number of vectors into one sparse vector.
 *
 * <p>The result has a cardinality equal to the sum of the input cardinalities. The
 * components of the first vector occupy the lowest indices, followed directly by those of
 * the second vector, and so on. Components that are zero in the inputs are not stored
 * explicitly and therefore stay zero in the result.
 *
 * @param vectors the vectors to concatenate, in order; may be empty
 * @return a new {@link SequentialAccessSparseVector} holding the concatenation
 */
public static Vector append(Vector... vectors) {
    int totalSize = 0;
    for (Vector vec : vectors) {
        totalSize += vec.size();
    }

    // A newly constructed sparse vector has no stored entries, i.e. it is already all
    // zeros, so no explicit assign(0) pass is needed.
    Vector result = new SequentialAccessSparseVector(totalSize);

    // Index in the result at which the current input vector starts.
    int offset = 0;
    for (Vector vector : vectors) {
        for (Element elem : vector) {
            double value = elem.get();
            // Skip zeros so the sparse result does not accumulate explicit zero entries.
            if (value != 0.0) {
                result.setQuick(offset + elem.index(), value);
            }
        }
        offset += vector.size();
    }
    return result;
}

From source file:de.rwth.i9.palm.analytics.algorithm.lda.CustomVectorDumper.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    /**//from w  w w. j  a v a  2  s  .  c o m
     * Option seqOpt =
     * obuilder.withLongName("seqFile").withRequired(false).withArgument(
     * abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
     * withDescription(
     * "The Sequence File containing the Vectors").withShortName
     * ("s").create(); Option dirOpt =
     * obuilder.withLongName("seqDirectory").
     * withRequired(false).withArgument(
     * abuilder.withName("seqDirectory").withMinimum
     * (1).withMaximum(1).create()) .withDescription(
     * "The directory containing Sequence File of Vectors")
     * .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            // TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    // we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:de.tuberlin.dima.cuttlefish.TrainingDataReader.java

License:Open Source License

/**
 * Prints the id, feature statistics and codes of the first records of a sequence file of
 * labeled document vectors.
 *
 * <p>For each record the id, the number of non-default elements together with the vector
 * cardinality, and every code key/value pair are written to standard output. Reading stops
 * after the record with zero-based position 10, i.e. at most 11 records are printed.
 *
 * @param args optional; {@code args[0]}, when present, is the path of the sequence file to
 *        read. Without arguments the original hard-coded default path is used.
 */
public static void main(String[] args) {

    //-----------------------------------------------------------------------------

    // Default kept for backward compatibility; it is specific to the original author's machine.
    String documentVectorsFile = "/home/ssc/Desktop/cuttlefish/output/vectors/documentVectors.seq";
    if (args != null && args.length > 0) {
        documentVectorsFile = args[0];
    }

    //-----------------------------------------------------------------------------

    Configuration conf = new Configuration();

    // Zero-based position of the record currently being printed.
    int n = 0;

    for (Pair<IDAndCodes, VectorWritable> labeledArticle : new SequenceFileIterable<IDAndCodes, VectorWritable>(
            new Path(documentVectorsFile), conf)) {

        System.out.println("ID: " + labeledArticle.getFirst().id());

        Vector features = labeledArticle.getSecond().get();

        System.out.println("Features: " + features.getNumNondefaultElements() + " of " + features.size());

        Multimap<String, String> codes = labeledArticle.getFirst().codes();
        for (Map.Entry<String, String> codeEntry : codes.entries()) {
            System.out.println("\t" + codeEntry.getKey() + "=" + codeEntry.getValue());
        }

        // Post-increment: the break fires after the record at position 10 has been printed.
        if (n++ == 10) {
            break;
        }
    }

}

From source file:edu.indiana.d2i.htrc.vecproj.TestRandomProjection.java

License:Apache License

/**
 * Projects 100 randomly filled dense vectors of dimension {@code originalDim} and asserts
 * that every projection has {@code reducedDim} components; the total wall-clock time in
 * seconds is printed at the end.
 */
@Test
public void testProject() {
    final long startNanos = System.nanoTime();
    random = Functions.random();
    int round = 0;
    while (round < 100) { // 100 independent trials
        DenseVector input = new DenseVector(originalDim);
        input.assign(random);
        Vector reduced = projector.project(input);
        Assert.assertEquals(reducedDim, reduced.size());
        round++;
    }
    final long endNanos = System.nanoTime();
    System.out.println("elapsed " + ((double) (endNanos - startNanos)) / 1e9);
}

From source file:edu.rosehulman.mahout.math.VectorWritable.java

License:Apache License

/**
 * Serializes a vector to {@code out}.
 *
 * <p>Layout, in write order:
 * <ol>
 *   <li>one flag byte combining {@code FLAG_DENSE}, {@code FLAG_SEQUENTIAL},
 *       {@code FLAG_NAMED} and {@code FLAG_LAX_PRECISION} as applicable;</li>
 *   <li>the vector size as an unsigned varint;</li>
 *   <li>dense vectors: every element value; other vectors: the non-zero element count as
 *       an unsigned varint, then an (index, value) pair per non-zero element, where the
 *       index is written as the difference to the previously written index for
 *       sequential-access vectors and as the absolute index otherwise;</li>
 *   <li>for a {@link NamedVector}: its name as modified UTF-8, with {@code null} written
 *       as the empty string.</li>
 * </ol>
 * Values are written as {@code float} when {@code laxPrecision} is set, else as {@code double}.
 *
 * @param out          the sink to write to
 * @param vector       the vector to serialize
 * @param laxPrecision whether element values are narrowed to {@code float}
 * @throws IOException if writing to {@code out} fails
 */
public static void writeVector(DataOutput out, Vector vector, boolean laxPrecision) throws IOException {
    final boolean isDense = vector.isDense();
    final boolean isSequential = vector.isSequentialAccess();
    final boolean isNamed = vector instanceof NamedVector;

    int flags = 0;
    if (isDense) {
        flags |= FLAG_DENSE;
    }
    if (isSequential) {
        flags |= FLAG_SEQUENTIAL;
    }
    if (isNamed) {
        flags |= FLAG_NAMED;
    }
    if (laxPrecision) {
        flags |= FLAG_LAX_PRECISION;
    }
    out.writeByte(flags);

    Varint.writeUnsignedVarInt(vector.size(), out);
    if (isDense) {
        for (Vector.Element entry : vector.all()) {
            double value = entry.get();
            if (laxPrecision) {
                out.writeFloat((float) value);
            } else {
                out.writeDouble(value);
            }
        }
    } else {
        Varint.writeUnsignedVarInt(vector.getNumNonZeroElements(), out);
        // Only consulted for sequential-access vectors, whose indices are delta-coded.
        int previousIndex = 0;
        for (Vector.Element entry : vector.nonZeroes()) {
            double value = entry.get();
            if (value == 0) {
                // TODO(robinanil): the non-zero iterator can still yield zero elements; skip them.
                continue;
            }
            int currentIndex = entry.index();
            if (isSequential) {
                // Delta-code indices:
                Varint.writeUnsignedVarInt(currentIndex - previousIndex, out);
                previousIndex = currentIndex;
            } else {
                Varint.writeUnsignedVarInt(currentIndex, out);
            }
            if (laxPrecision) {
                out.writeFloat((float) value);
            } else {
                out.writeDouble(value);
            }
        }
    }
    if (isNamed) {
        String vectorName = ((NamedVector) vector).getName();
        if (vectorName == null) {
            out.writeUTF("");
        } else {
            out.writeUTF(vectorName);
        }
    }
}