Example usage for the org.apache.mahout.math.VectorWritable constructor VectorWritable()

List of usage examples for the org.apache.mahout.math.VectorWritable constructor VectorWritable()

Introduction

On this page you can find example usages of the no-argument constructor VectorWritable() of the class org.apache.mahout.math.VectorWritable.

Prototype

public VectorWritable() 

Source Link

Usage

From source file:org.qcri.pca.PCACommon.java

/**
 * Writes a single vector to a sequence file named {@code "Vector-" + label} inside
 * {@code outputDir}. If a file already exists at that path it is deleted first (a warning
 * is logged).
 *
 * <p>The vector is stored as one record whose key is {@code IntWritable(0)}.
 *
 * @param vector    the vector to persist
 * @param outputDir directory in which the file is created; also used to resolve the file system
 * @param label     suffix appended to {@code "Vector-"} to form the file name
 * @param conf      Hadoop configuration used to obtain the file system and create the writer
 * @return the path of the written sequence file
 * @throws IOException if accessing the file system or writing the record fails
 */
static Path toDistributedVector(Vector vector, Path outputDir, String label, Configuration conf)
        throws IOException {
    Path outputFile = new Path(outputDir, "Vector-" + label);
    FileSystem fs = FileSystem.get(outputDir.toUri(), conf);
    if (fs.exists(outputFile)) {
        log.warn("----------- OVERWRITE " + outputFile + " already exists");
        fs.delete(outputFile, false);
    }
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class,
            VectorWritable.class);
    try {
        VectorWritable vectorWritable = new VectorWritable();
        vectorWritable.set(vector);
        writer.append(new IntWritable(0), vectorWritable);
    } finally {
        // Close even when append fails so the writer is not leaked.
        writer.close();
    }
    return outputFile;
}

From source file:org.qcri.pca.PCACommon.java

/**
 * Convert an in-memory representation of a matrix to a distributed version It
 * then can be used in distributed jobs/*from   w  w w.  j  a  v a 2  s. c o  m*/
 * 
 * @param oriMatrix
 * @return path that contains the matrix files
 * @throws IOException
 */
static DistributedRowMatrix toDistributedRowMatrix(Matrix origMatrix, Path outPath, Path tmpPath, String label)
        throws IOException {
    Configuration conf = new Configuration();
    Path outputDir = new Path(outPath, label + origMatrix.numRows() + "x" + origMatrix.numCols());
    FileSystem fs = FileSystem.get(outputDir.toUri(), conf);
    if (!fs.exists(outputDir)) {
        Path outputFile = new Path(outputDir, "singleSliceMatrix");
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class,
                VectorWritable.class);
        VectorWritable vectorWritable = new VectorWritable();
        try {
            for (int r = 0; r < origMatrix.numRows(); r++) {
                Vector vector = origMatrix.viewRow(r);
                vectorWritable.set(vector);
                writer.append(new IntWritable(r), vectorWritable);
            }
        } finally {
            writer.close();
        }
    } else {
        log.warn("----------- Skip matrix " + outputDir + " - already exists");
    }
    DistributedRowMatrix dMatrix = new DistributedRowMatrix(outputDir, tmpPath, origMatrix.numRows(),
            origMatrix.numCols());
    dMatrix.setConf(conf);
    return dMatrix;
}

From source file:org.qcri.pca.PrepareInput.java

/**
 * Converts a text file of space-separated numbers into a sequence file of dense vectors.
 *
 * <p>Each input line becomes one {@code DenseVector} record keyed by its zero-based line
 * index. A single leading space on a line (which yields an empty first token) is skipped.
 * The output is written next to the input as {@code <input name>.formatted}.
 *
 * @param inputStr path of the text file to convert
 * @throws IOException if the input cannot be read or the output cannot be written
 * @throws NumberFormatException if a token cannot be parsed as a double
 */
private static void textToSequnceFile(String inputStr) throws IOException {
    BufferedReader inputReader = new BufferedReader(new FileReader(inputStr));
    Path outputPath;
    try {
        Configuration conf = new Configuration();
        Path inputPath = new Path(inputStr);
        outputPath = new Path(inputPath.getParent(), inputPath.getName() + ".formatted");
        FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputPath, IntWritable.class,
                VectorWritable.class);
        try {
            VectorWritable vectorWritable = new VectorWritable();
            String line;
            int index = 0;
            while ((line = inputReader.readLine()) != null) {
                String[] columns = line.split(" ");
                // A line starting with a space produces an empty first token; skip it.
                int shift = 0;
                if (columns[0].isEmpty()) {
                    shift++;
                }
                double[] columnsDouble = new double[columns.length - shift];
                for (int i = 0; i < columnsDouble.length; i++) {
                    columnsDouble[i] = Double.parseDouble(columns[i + shift]);
                }
                Vector vector = new DenseVector(columnsDouble, true);
                vectorWritable.set(vector);
                writer.append(new IntWritable(index), vectorWritable);
                index++;
            }
        } finally {
            writer.close();
        }
    } finally {
        // Previously the reader leaked whenever writer creation or the conversion loop threw.
        inputReader.close();
    }
    System.out.println("Finish writing to " + outputPath);
}

From source file:org.qcri.pca.SPCADriver.java

/**
 * Writes the rows of an in-memory matrix to a sequence file, unless the target directory
 * already exists.
 *
 * <p>The target directory is {@code outPath/<label><rows>x<cols>}; the rows go into a file
 * called {@code singleSliceMatrix} inside it, each keyed by its row index. If the directory
 * is already present, a warning is logged and nothing is written.
 *
 * @param origMatrix the matrix whose rows are written
 * @param outPath    parent path under which the matrix directory is placed
 * @param tmpPath    not used by this method
 * @param label      prefix of the matrix directory name
 * @throws IOException if the file system cannot be accessed or a row cannot be written
 */
static void writeMatrix(Matrix origMatrix, Path outPath, Path tmpPath, String label) throws IOException {
    final Configuration hadoopConf = new Configuration();
    final String dirName = label + origMatrix.numRows() + "x" + origMatrix.numCols();
    final Path targetDir = new Path(outPath, dirName);
    final FileSystem fileSystem = FileSystem.get(targetDir.toUri(), hadoopConf);

    if (fileSystem.exists(targetDir)) {
        log.warn("----------- Skip matrix " + targetDir + " - already exists");
        return;
    }

    final Path sliceFile = new Path(targetDir, "singleSliceMatrix");
    final SequenceFile.Writer out = new SequenceFile.Writer(fileSystem, hadoopConf, sliceFile,
            IntWritable.class, VectorWritable.class);
    // The same holder is reused for every row; only the wrapped vector changes.
    final VectorWritable rowValue = new VectorWritable();
    try {
        int rowIndex = 0;
        while (rowIndex < origMatrix.numRows()) {
            rowValue.set(origMatrix.viewRow(rowIndex));
            out.append(new IntWritable(rowIndex), rowValue);
            rowIndex++;
        }
    } finally {
        out.close();
    }
}

From source file:org.qcri.pca.TestSequenceFile.java

/**
 * Prints a one-line summary for every record of an {@code IntWritable -> VectorWritable}
 * sequence file, followed by the full contents of one selected row.
 *
 * <p>For each record the key, the number of non-zero elements and the element sum are
 * printed. After the scan, the vector whose key equals {@code printRow} is printed; if no
 * such key was seen, the key and value holders are printed as left by the last read.
 *
 * @param inputStr path of the sequence file to read
 * @param printRow key of the row to print in full
 * @throws IOException if the file cannot be opened or read
 */
private static void printSequenceFile(String inputStr, int printRow) throws IOException {
    Configuration conf = new Configuration();
    Path finalNumberFile = new Path(inputStr);
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), finalNumberFile, conf);
    IntWritable key = new IntWritable();
    VectorWritable value = new VectorWritable();
    Vector printVector = null;
    try {
        while (reader.next(key, value)) {
            if (key.get() == printRow) {
                printVector = value.get();
            }
            // Count the non-zero entries by walking the non-zero iterator.
            int cnt = 0;
            Iterator<Element> iter = value.get().nonZeroes().iterator();
            while (iter.hasNext()) {
                iter.next();
                cnt++;
            }
            System.out.println("# " + key + " " + cnt + " " + value.get().zSum());
        }
    } finally {
        // Close even when reading or summarising a record fails.
        reader.close();
    }
    if (printVector != null) {
        System.out.println("##### " + printRow + " " + printVector);
    } else {
        System.out.println("##### " + key + " " + value.get());
    }
}

From source file:org.qcri.sparkpca.FileFormat.java

/**
 * Converts coordinate-format text ({@code row,col,value} per line) into sequence files of
 * {@code IntWritable} row id to {@code VectorWritable} sparse row.
 *
 * <p>{@code inputPath} may be a single file or a directory; for a directory every entry
 * returned by {@code File.listFiles()} is processed. Each input file produces
 * {@code <outputFolderPath>/<file name>.seq}, block-compressed. Consecutive lines with the
 * same row id are accumulated into one {@code SequentialAccessSparseVector}; a row is
 * written when a line with a different row id is encountered, and the final row is written
 * after the last file.
 *
 * <p>Any exception is logged rather than propagated.
 *
 * @param inputPath        file or directory containing the COO text
 * @param cardinality      size of every row vector
 * @param base             index of the first column in the input (subtracted from each
 *                         column id to obtain the vector index)
 * @param outputFolderPath directory that receives the {@code .seq} files
 */
public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
    SequenceFile.Writer writer = null;
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        Vector vector = null;
        String thisLine;
        int prevRowID = -1;
        boolean first = true;

        File[] filePathList;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) {
            filePathList = new File[] { inputFile };
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }

        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            try {
                String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
                // Close the previous file's writer before replacing it; otherwise its
                // output is never flushed or released.
                if (writer != null) {
                    SequenceFile.Writer finished = writer;
                    writer = null;
                    finished.close();
                }
                writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                        VectorWritable.class, CompressionType.BLOCK);
                // NOTE(review): the pending row is carried across file boundaries, so the
                // last row of one input file is written to the next file's output.
                // This matches the original behavior -- confirm it is intended.
                while ((thisLine = br.readLine()) != null) {
                    String[] splitted = thisLine.split(",");
                    int rowID = Integer.parseInt(splitted[0]);
                    int colID = Integer.parseInt(splitted[1]);
                    double element = Double.parseDouble(splitted[2]);
                    if (first) {
                        first = false;
                        vector = new SequentialAccessSparseVector(cardinality);
                    } else if (rowID != prevRowID) {
                        // Row id changed: flush the completed row and start a new one.
                        key.set(prevRowID);
                        value.set(vector);
                        writer.append(key, value);
                        vector = new SequentialAccessSparseVector(cardinality);
                    }
                    prevRowID = rowID;
                    vector.set(colID - base, element);
                }
            } finally {
                br.close();
            }
        }

        // Append the last pending row; skipped when no input line was ever read.
        if (writer != null && vector != null) {
            key.set(prevRowID);
            value.set(vector);
            writer.append(key, value);
        }
    } catch (Exception e) {
        log.error("Failed to convert " + inputPath + " to sequence files", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception closeFailure) {
                log.error("Failed to close sequence file writer", closeFailure);
            }
        }
    }
}

From source file:root.input.images.FormatImagesJob.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * <p>Reads every file directly under {@code inputDirectory}, parses each line as three
 * comma-separated doubles (R, G, B) and writes two sequence files:
 * {@code vectorDirectory/part-r-00000} mapping {@code "/<counter>"} to the 3-element vector,
 * and {@code fileDictDirectory/vectorName2docId} mapping {@code "{R:..,G:..,B:..}"} to the
 * counter.
 *
 * @param args command-line arguments handed to {@code parseArguments}
 * @return {@code -1} if argument parsing yields {@code null}, otherwise {@code 0}
 * @throws Exception if a file whose name starts with {@code "."} is found in the input
 *                   directory, or if reading, parsing or writing fails
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    // Nested try/finally so both writers are closed on every exit path, including the
    // "Hidden Files" exception and any parse failure.
    SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class,
            Text.class);
    try {
        SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class,
                VectorWritable.class);
        try {
            FileStatus[] files = inputFS.listStatus(in);

            int counter = 0;

            for (FileStatus f : files) {
                Path curr = f.getPath();
                if (curr.getName().startsWith(".")) {
                    throw new Exception("Bad Data: Hidden Files Exist");
                }

                Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
                try {
                    while (sc.hasNext()) {
                        String line = sc.nextLine();
                        String[] split = line.split(",");

                        double r = Double.parseDouble(split[0]);
                        double g = Double.parseDouble(split[1]);
                        double b = Double.parseDouble(split[2]);

                        DenseVector vector = new DenseVector(3);
                        vector.setQuick(0, r);
                        vector.setQuick(1, g);
                        vector.setQuick(2, b);

                        String nextName = counter + "";
                        String nextFileName = "/" + counter;
                        counter++;

                        VectorWritable vec = new VectorWritable();
                        vec.set(vector);
                        vectorWriter.append(new Text(nextFileName), vec);

                        String point = "{R:" + r + ",G:" + g + ",B:" + b + "}";

                        metadataWriter.append(new Text(point), new Text(nextName));
                    }
                } finally {
                    sc.close();
                }
            }
        } finally {
            vectorWriter.close();
        }
    } finally {
        metadataWriter.close();
    }

    return 0;

}

From source file:root.input.lyrl2004.FormatVectorsJob.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * <p>Reads every file directly under {@code inputDirectory}. Each record consists of a
 * leading key token followed, on the same line, by whitespace-separated
 * {@code index:value} pairs. Two sequence files are written:
 * {@code vectorDirectory/part-r-00000} mapping {@code "/<counter>"} to a sparse vector of
 * cardinality 10000 holding the pairs, and {@code fileDictDirectory/vectorName2docId}
 * mapping the key token to the counter.
 *
 * @param args command-line arguments handed to {@code parseArguments}
 * @return {@code -1} if argument parsing yields {@code null}, otherwise {@code 0}
 * @throws Exception if a file whose name starts with {@code "."} is found in the input
 *                   directory, or if reading, parsing or writing fails
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    // Nested try/finally so both writers are closed on every exit path, including the
    // "Hidden Files" exception and any parse failure.
    SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class,
            Text.class);
    try {
        SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class,
                VectorWritable.class);
        try {
            FileStatus[] files = inputFS.listStatus(in);

            int counter = 0;

            for (FileStatus f : files) {
                Path curr = f.getPath();
                if (curr.getName().startsWith(".")) {
                    throw new Exception("Bad Data: Hidden Files Exist");
                }

                Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
                try {
                    while (sc.hasNext()) {
                        // First token of the record is its key; the rest of the line
                        // holds the index:value pairs.
                        String key = sc.next();

                        RandomAccessSparseVector vector = new RandomAccessSparseVector(10000);

                        String line = sc.nextLine().trim();
                        Scanner lineScanner = new Scanner(line);
                        try {
                            while (lineScanner.hasNext()) {
                                // Split once instead of once per component.
                                String[] pair = lineScanner.next().split(":");

                                int k = Integer.parseInt(pair[0]);
                                double v = Double.parseDouble(pair[1]);

                                vector.setQuick(k, v);
                            }
                        } finally {
                            lineScanner.close();
                        }

                        String nextName = counter + "";
                        String nextFileName = "/" + counter;
                        counter++;

                        VectorWritable vec = new VectorWritable();
                        vec.set(vector);
                        vectorWriter.append(new Text(nextFileName), vec);

                        metadataWriter.append(new Text(key), new Text(nextName));
                    }
                } finally {
                    sc.close();
                }
            }
        } finally {
            vectorWriter.close();
        }
    } finally {
        metadataWriter.close();
    }

    return 0;

}

From source file:root.input.points.FormatPointsJob.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * <p>Reads every file directly under {@code inputDirectory}, parses the first two
 * comma-separated fields of each line as doubles (x, y) and writes two sequence files:
 * {@code vectorDirectory/part-r-00000} mapping {@code "/<counter>"} to a 3-element dense
 * vector whose first two entries are x and y (the third is never set here), and
 * {@code fileDictDirectory/vectorName2docId} mapping the point formatted as
 * {@code "%.2f-%.2f"} to the counter.
 *
 * @param args command-line arguments handed to {@code parseArguments}
 * @return {@code -1} if argument parsing yields {@code null}, otherwise {@code 0}
 * @throws Exception if a file whose name starts with {@code "."} is found in the input
 *                   directory, or if reading, parsing or writing fails
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    // Nested try/finally so both writers are closed on every exit path, including the
    // "Hidden Files" exception and any parse failure.
    SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class,
            Text.class);
    try {
        SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class,
                VectorWritable.class);
        try {
            FileStatus[] files = inputFS.listStatus(in);

            int counter = 0;

            for (FileStatus f : files) {
                Path curr = f.getPath();
                if (curr.getName().startsWith(".")) {
                    throw new Exception("Bad Data: Hidden Files Exist");
                }

                Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
                try {
                    while (sc.hasNext()) {
                        String line = sc.nextLine();
                        String[] split = line.split(",");

                        double x = Double.parseDouble(split[0]);
                        double y = Double.parseDouble(split[1]);

                        DenseVector vector = new DenseVector(3);
                        vector.setQuick(0, x);
                        vector.setQuick(1, y);

                        String nextName = counter + "";
                        String nextFileName = "/" + counter;
                        counter++;

                        VectorWritable vec = new VectorWritable();
                        vec.set(vector);
                        vectorWriter.append(new Text(nextFileName), vec);

                        String point = String.format("%.2f-%.2f", x, y);

                        metadataWriter.append(new Text(point), new Text(nextName));
                    }
                } finally {
                    sc.close();
                }
            }
        } finally {
            vectorWriter.close();
        }
    } finally {
        metadataWriter.close();
    }

    return 0;

}

From source file:root.input.points.FormatVectorsJob.java

License:Apache License

/**
 * Runs the job with the given arguments so it can be launched through a
 * {@link ToolRunner} and interface with the Driver.
 *
 * <p>Reads every file directly under {@code inputDirectory}, parses each line as
 * {@code double,double,int} (x, y, cluster) and writes two sequence files:
 * {@code vectorDirectory/part-r-00000} mapping {@code "/<counter>"} to a sparse vector of
 * cardinality 10000 with x at index 0 and y at index 1, and
 * {@code fileDictDirectory/vectorName2docId} mapping
 * {@code "{x:..,y:..,cluster:..}"} to the counter.
 *
 * @param args Configuration arguments
 * @return Exit status: {@code -1} if argument parsing yields {@code null}, otherwise {@code 0}
 * @throws Exception if a file whose name starts with {@code "."} is found in the input
 *                   directory, or if reading, parsing or writing fails
 * @see ToolRunner
 */
@Override
public int run(String[] args) throws Exception {

    addArguments();

    if (parseArguments(args) == null) {
        return -1;
    }

    initArguments();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    // Nested try/finally so both writers are closed on every exit path, including the
    // "Hidden Files" exception and any parse failure.
    SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class,
            Text.class);
    try {
        SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class,
                VectorWritable.class);
        try {
            FileStatus[] files = inputFS.listStatus(in);

            int counter = 0;

            for (FileStatus f : files) {
                Path curr = f.getPath();
                if (curr.getName().startsWith(".")) {
                    throw new Exception("Bad Data: Hidden Files Exist");
                }

                Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
                try {
                    while (sc.hasNext()) {
                        String line = sc.nextLine();
                        String[] split = line.split(",");

                        double val1 = Double.parseDouble(split[0]);
                        double val2 = Double.parseDouble(split[1]);
                        // The cluster id is only recorded in the metadata string, not in the vector.
                        int val3 = Integer.parseInt(split[2]);

                        RandomAccessSparseVector vector = new RandomAccessSparseVector(10000);
                        vector.setQuick(0, val1);
                        vector.setQuick(1, val2);

                        String nextName = counter + "";
                        String nextFileName = "/" + counter;
                        counter++;

                        VectorWritable vec = new VectorWritable();
                        vec.set(vector);
                        vectorWriter.append(new Text(nextFileName), vec);

                        String point = "{x:" + val1 + ",y:" + val2 + ",cluster:" + val3 + "}";

                        metadataWriter.append(new Text(point), new Text(nextName));
                    }
                } finally {
                    sc.close();
                }
            }
        } finally {
            vectorWriter.close();
        }
    } finally {
        metadataWriter.close();
    }

    return 0;

}