List of usage examples for the org.apache.mahout.math.VectorWritable constructor VectorWritable()
public VectorWritable()
From source file:org.qcri.pca.PCACommon.java
/**
 * Writes a single vector to a sequence file named {@code "Vector-" + label}
 * inside {@code outputDir}. If a file already exists at that path it is
 * deleted first (a warning is logged).
 *
 * @param vector    the vector to store; it is appended under the key {@code 0}
 * @param outputDir directory that receives the output file
 * @param label     suffix appended to {@code "Vector-"} to form the file name
 * @param conf      configuration used to resolve the file system and create the writer
 * @return the path of the file that was written
 * @throws IOException if the file system cannot be accessed or the write fails
 */
static Path toDistributedVector(Vector vector, Path outputDir, String label, Configuration conf)
        throws IOException {
    Path outputFile = new Path(outputDir, "Vector-" + label);
    FileSystem fs = FileSystem.get(outputDir.toUri(), conf);
    if (fs.exists(outputFile)) {
        log.warn("----------- OVERWRITE " + outputFile + " already exists");
        fs.delete(outputFile, false);
    }
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class,
            VectorWritable.class);
    try {
        VectorWritable vectorWritable = new VectorWritable();
        vectorWritable.set(vector);
        writer.append(new IntWritable(0), vectorWritable);
    } finally {
        // Close even when append fails so the underlying output stream is not leaked.
        writer.close();
    }
    return outputFile;
}
From source file:org.qcri.pca.PCACommon.java
/** * Convert an in-memory representation of a matrix to a distributed version It * then can be used in distributed jobs/*from w w w. j a v a 2 s. c o m*/ * * @param oriMatrix * @return path that contains the matrix files * @throws IOException */ static DistributedRowMatrix toDistributedRowMatrix(Matrix origMatrix, Path outPath, Path tmpPath, String label) throws IOException { Configuration conf = new Configuration(); Path outputDir = new Path(outPath, label + origMatrix.numRows() + "x" + origMatrix.numCols()); FileSystem fs = FileSystem.get(outputDir.toUri(), conf); if (!fs.exists(outputDir)) { Path outputFile = new Path(outputDir, "singleSliceMatrix"); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class, VectorWritable.class); VectorWritable vectorWritable = new VectorWritable(); try { for (int r = 0; r < origMatrix.numRows(); r++) { Vector vector = origMatrix.viewRow(r); vectorWritable.set(vector); writer.append(new IntWritable(r), vectorWritable); } } finally { writer.close(); } } else { log.warn("----------- Skip matrix " + outputDir + " - already exists"); } DistributedRowMatrix dMatrix = new DistributedRowMatrix(outputDir, tmpPath, origMatrix.numRows(), origMatrix.numCols()); dMatrix.setConf(conf); return dMatrix; }
From source file:org.qcri.pca.PrepareInput.java
private static void textToSequnceFile(String inputStr) throws IOException { BufferedReader inputReader = new BufferedReader(new FileReader(inputStr)); Configuration conf = new Configuration(); Path inputPath = new Path(inputStr); Path outputPath = new Path(inputPath.getParent(), inputPath.getName() + ".formatted"); FileSystem fs = FileSystem.get(inputPath.toUri(), conf); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputPath, IntWritable.class, VectorWritable.class); VectorWritable vectorWritable = new VectorWritable(); String line;// ww w. j a va2 s .c o m int index = 0; try { while ((line = inputReader.readLine()) != null) { String[] columns = line.split(" "); int shift = 0; if (columns[0].isEmpty()) shift++; double[] columnsDouble = new double[columns.length - shift]; for (int i = 0; i < columnsDouble.length; i++) { columnsDouble[i] = Double.valueOf(columns[i + shift]); } Vector vector = new DenseVector(columnsDouble, true); vectorWritable.set(vector); writer.append(new IntWritable(index), vectorWritable); index++; } } finally { writer.close(); } inputReader.close(); System.out.println("Finish writing to " + outputPath); }
From source file:org.qcri.pca.SPCADriver.java
/**
 * Writes an in-memory matrix to a sequence file, one entry per row.
 *
 * <p>The target directory is {@code outPath/<label><rows>x<cols>}. When it
 * does not exist yet, each row is appended to the file
 * {@code singleSliceMatrix} inside it, keyed by row index. When it already
 * exists, nothing is written and a warning is logged.
 *
 * @param origMatrix the matrix to store
 * @param outPath    parent directory of the matrix directory
 * @param tmpPath    not used by this method
 * @param label      prefix of the matrix directory name
 * @throws IOException if the file system cannot be accessed or writing fails
 */
static void writeMatrix(Matrix origMatrix, Path outPath, Path tmpPath, String label) throws IOException {
    final int rowCount = origMatrix.numRows();
    Configuration conf = new Configuration();
    Path outputDir = new Path(outPath, label + rowCount + "x" + origMatrix.numCols());
    FileSystem fs = FileSystem.get(outputDir.toUri(), conf);

    if (fs.exists(outputDir)) {
        log.warn("----------- Skip matrix " + outputDir + " - already exists");
        return;
    }

    Path sliceFile = new Path(outputDir, "singleSliceMatrix");
    SequenceFile.Writer sliceWriter = new SequenceFile.Writer(fs, conf, sliceFile, IntWritable.class,
            VectorWritable.class);
    VectorWritable rowHolder = new VectorWritable();
    try {
        int row = 0;
        while (row < rowCount) {
            rowHolder.set(origMatrix.viewRow(row));
            sliceWriter.append(new IntWritable(row), rowHolder);
            row++;
        }
    } finally {
        sliceWriter.close();
    }
}
From source file:org.qcri.pca.TestSequenceFile.java
/**
 * Prints a summary of every entry in a sequence file of
 * (IntWritable, VectorWritable) pairs.
 *
 * <p>For each entry one line is printed with the key, the number of non-zero
 * elements and the element sum. After the last entry, the vector whose key
 * equals {@code printRow} is printed in full; if no such key was seen, the
 * last key and value held by the reader objects are printed instead.
 *
 * @param inputStr path of the sequence file to read
 * @param printRow key of the row to print in full
 * @throws IOException if the file cannot be opened or read
 */
private static void printSequenceFile(String inputStr, int printRow) throws IOException {
    Configuration conf = new Configuration();
    Path finalNumberFile = new Path(inputStr);
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), finalNumberFile, conf);
    IntWritable key = new IntWritable();
    VectorWritable value = new VectorWritable();
    Vector printVector = null;
    try {
        while (reader.next(key, value)) {
            if (key.get() == printRow) {
                printVector = value.get();
            }
            int cnt = 0;
            Iterator<Element> iter = value.get().nonZeroes().iterator();
            for (; iter.hasNext(); iter.next()) {
                cnt++;
            }
            System.out.println("# " + key + " " + cnt + " " + value.get().zSum());
        }
    } finally {
        // Release the file handle even when reading or deserialization fails.
        reader.close();
    }
    if (printVector != null) {
        System.out.println("##### " + printRow + " " + printVector);
    } else {
        System.out.println("##### " + key + " " + value.get());
    }
}
From source file:org.qcri.sparkpca.FileFormat.java
public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) { try {/* ww w .j ava 2 s .co m*/ final Configuration conf = new Configuration(); final FileSystem fs = FileSystem.get(conf); SequenceFile.Writer writer = null; final IntWritable key = new IntWritable(); final VectorWritable value = new VectorWritable(); Vector vector = null; String thisLine; int lineNumber = 0; int prevRowID = -1; boolean first = true; File[] filePathList = null; File inputFile = new File(inputPath); if (inputFile.isFile()) // if it is a file { filePathList = new File[1]; filePathList[0] = inputFile; } else { filePathList = inputFile.listFiles(); } if (filePathList == null) { log.error("The path " + inputPath + " does not exist"); return; } for (File file : filePathList) { BufferedReader br = new BufferedReader(new FileReader(file)); String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq"; writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class, VectorWritable.class, CompressionType.BLOCK); while ((thisLine = br.readLine()) != null) { // while loop begins here String[] splitted = thisLine.split(","); int rowID = Integer.parseInt(splitted[0]); int colID = Integer.parseInt(splitted[1]); double element = Double.parseDouble(splitted[2]); if (first) { first = false; vector = new SequentialAccessSparseVector(cardinality); } else if (rowID != prevRowID) { key.set(prevRowID); value.set(vector); //System.out.println(vector); writer.append(key, value);//write last row vector = new SequentialAccessSparseVector(cardinality); } prevRowID = rowID; vector.set(colID - base, element); } } if (writer != null) //append last vector in last file { key.set(prevRowID); value.set(vector); writer.append(key, value);//write last row writer.close(); } } catch (Exception e) { e.printStackTrace(); } }
From source file:root.input.images.FormatImagesJob.java
License:Apache License
/**
 * {@inheritDoc}
 *
 * <p>Reads every file in the input directory as comma-separated
 * {@code r,g,b} lines and writes two sequence files: one mapping
 * {@code "/" + counter} to a three-element vector, and one mapping the
 * textual point {@code {R:..,G:..,B:..}} to the counter. The counter runs
 * across all input files.
 *
 * @param args command-line arguments handed to {@code parseArguments}
 * @return {@code -1} if the arguments could not be parsed, {@code 0} otherwise
 * @throws Exception if a file whose name starts with {@code "."} is found in
 *         the input directory, or if reading, parsing or writing fails
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();
    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);
    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    SequenceFile.Writer metadataWriter = null;
    SequenceFile.Writer vectorWriter = null;
    try {
        metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class);
        vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class);

        FileStatus[] files = inputFS.listStatus(in);
        int counter = 0;
        for (FileStatus f : files) {
            Path curr = f.getPath();
            if (curr.getName().startsWith(".")) {
                throw new Exception("Bad Data: Hidden Files Exist");
            }
            Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
            try {
                while (sc.hasNext()) {
                    String line = sc.nextLine();
                    String[] split = line.split(",");
                    double r = Double.parseDouble(split[0]);
                    double g = Double.parseDouble(split[1]);
                    double b = Double.parseDouble(split[2]);

                    DenseVector vector = new DenseVector(3);
                    vector.setQuick(0, r);
                    vector.setQuick(1, g);
                    vector.setQuick(2, b);

                    String nextName = counter + "";
                    String nextFileName = "/" + counter;
                    counter++;

                    VectorWritable vec = new VectorWritable();
                    vec.set(vector);
                    vectorWriter.append(new Text(nextFileName), vec);

                    String point = "{R:" + r + ",G:" + g + ",B:" + b + "}";
                    metadataWriter.append(new Text(point), new Text(nextName));
                }
            } finally {
                sc.close();
            }
        }
    } finally {
        // Close both writers on every exit path, including the hidden-file
        // and parse failures above; previously they leaked on any exception.
        try {
            if (metadataWriter != null) {
                metadataWriter.close();
            }
        } finally {
            if (vectorWriter != null) {
                vectorWriter.close();
            }
        }
    }
    return 0;
}
From source file:root.input.lyrl2004.FormatVectorsJob.java
License:Apache License
/**
 * {@inheritDoc}
 *
 * <p>Reads every file in the input directory, where each line consists of a
 * key token followed by whitespace-separated {@code index:value} pairs, and
 * writes two sequence files: one mapping {@code "/" + counter} to a sparse
 * vector built from the pairs, and one mapping the key token to the counter.
 * The counter runs across all input files.
 *
 * @param args command-line arguments handed to {@code parseArguments}
 * @return {@code -1} if the arguments could not be parsed, {@code 0} otherwise
 * @throws Exception if a file whose name starts with {@code "."} is found in
 *         the input directory, or if reading, parsing or writing fails
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();
    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);
    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    SequenceFile.Writer metadataWriter = null;
    SequenceFile.Writer vectorWriter = null;
    try {
        metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class);
        vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class);

        FileStatus[] files = inputFS.listStatus(in);
        int counter = 0;
        for (FileStatus f : files) {
            Path curr = f.getPath();
            if (curr.getName().startsWith(".")) {
                throw new Exception("Bad Data: Hidden Files Exist");
            }
            Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
            try {
                while (sc.hasNext()) {
                    String key = sc.next();
                    RandomAccessSparseVector vector = new RandomAccessSparseVector(10000);
                    String line = sc.nextLine().trim();
                    Scanner lineScanner = new Scanner(line);
                    try {
                        while (lineScanner.hasNext()) {
                            // Split each "index:value" pair once instead of twice.
                            String[] pair = lineScanner.next().split(":");
                            int k = Integer.parseInt(pair[0]);
                            double v = Double.parseDouble(pair[1]);
                            vector.setQuick(k, v);
                        }
                    } finally {
                        lineScanner.close();
                    }

                    String nextName = counter + "";
                    String nextFileName = "/" + counter;
                    counter++;

                    VectorWritable vec = new VectorWritable();
                    vec.set(vector);
                    vectorWriter.append(new Text(nextFileName), vec);
                    metadataWriter.append(new Text(key), new Text(nextName));
                }
            } finally {
                sc.close();
            }
        }
    } finally {
        // Close both writers on every exit path; previously they leaked on any exception.
        try {
            if (metadataWriter != null) {
                metadataWriter.close();
            }
        } finally {
            if (vectorWriter != null) {
                vectorWriter.close();
            }
        }
    }
    return 0;
}
From source file:root.input.points.FormatPointsJob.java
License:Apache License
/** * {@inheritDoc}// w w w .j a v a 2 s .c om */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); Path in = new Path(inputDirectory); Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId"); Path vectorFile = new Path(vectorDirectory + "/part-r-00000"); @SuppressWarnings("resource") SequenceFile.Writer metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class); @SuppressWarnings("resource") SequenceFile.Writer vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class); FileStatus[] files = inputFS.listStatus(in); int counter = 0; for (FileStatus f : files) { Path curr = f.getPath(); if (curr.getName().startsWith(".")) { throw new Exception("Bad Data: Hidden Files Exist"); } Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr)))); while (sc.hasNext()) { String line = sc.nextLine(); DenseVector vector = new DenseVector(3); String[] split = line.split(","); double x = Double.valueOf(split[0]); double y = Double.valueOf(split[1]); // int cluster = Integer.valueOf(split[2]); vector.setQuick(0, x); vector.setQuick(1, y); String nextName = counter + ""; String nextFileName = "/" + counter; counter++; VectorWritable vec = new VectorWritable(); vec.set(vector); vectorWriter.append(new Text(nextFileName), vec); String point = String.format("%.2f-%.2f", x, y); metadataWriter.append(new Text(point), new Text(nextName)); } sc.close(); } metadataWriter.close(); vectorWriter.close(); return 0; }
From source file:root.input.points.FormatVectorsJob.java
License:Apache License
/**
 * Entry point invoked with the job's command-line arguments.
 *
 * <p>Reads every file in the input directory as comma-separated lines of the
 * form {@code val1,val2,cluster} and writes two sequence files: one mapping
 * {@code "/" + counter} to a sparse vector holding the first two values, and
 * one mapping the textual point {@code {x:..,y:..,cluster:..}} to the
 * counter. The counter runs across all input files.
 *
 * @param args Configuration arguments
 * @return Exit status: {@code -1} if the arguments could not be parsed,
 *         {@code 0} otherwise
 * @throws Exception if a file whose name starts with {@code "."} is found in
 *         the input directory, or if reading, parsing or writing fails
 * @see ToolRunner
 */
@Override
public int run(String[] args) throws Exception {
    addArguments();
    if (parseArguments(args) == null) {
        return -1;
    }
    initArguments();

    Configuration conf = getConf();
    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);
    Path in = new Path(inputDirectory);
    Path docIdFile = new Path(fileDictDirectory + "/vectorName2docId");
    Path vectorFile = new Path(vectorDirectory + "/part-r-00000");

    SequenceFile.Writer metadataWriter = null;
    SequenceFile.Writer vectorWriter = null;
    try {
        metadataWriter = new SequenceFile.Writer(workingFS, conf, docIdFile, Text.class, Text.class);
        vectorWriter = new SequenceFile.Writer(workingFS, conf, vectorFile, Text.class, VectorWritable.class);

        FileStatus[] files = inputFS.listStatus(in);
        int counter = 0;
        for (FileStatus f : files) {
            Path curr = f.getPath();
            if (curr.getName().startsWith(".")) {
                throw new Exception("Bad Data: Hidden Files Exist");
            }
            Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(inputFS.open(curr))));
            try {
                while (sc.hasNext()) {
                    String line = sc.nextLine();
                    String[] split = line.split(",");
                    double val1 = Double.parseDouble(split[0]);
                    double val2 = Double.parseDouble(split[1]);
                    int val3 = Integer.parseInt(split[2]);

                    // Only the first two values go into the vector; the third
                    // appears only in the metadata key below.
                    RandomAccessSparseVector vector = new RandomAccessSparseVector(10000);
                    vector.setQuick(0, val1);
                    vector.setQuick(1, val2);

                    String nextName = counter + "";
                    String nextFileName = "/" + counter;
                    counter++;

                    VectorWritable vec = new VectorWritable();
                    vec.set(vector);
                    vectorWriter.append(new Text(nextFileName), vec);

                    String point = "{x:" + val1 + ",y:" + val2 + ",cluster:" + val3 + "}";
                    metadataWriter.append(new Text(point), new Text(nextName));
                }
            } finally {
                sc.close();
            }
        }
    } finally {
        // Close both writers on every exit path; previously they leaked on any exception.
        try {
            if (metadataWriter != null) {
                metadataWriter.close();
            }
        } finally {
            if (vectorWriter != null) {
                vectorWriter.close();
            }
        }
    }
    return 0;
}