List of usage examples for org.apache.mahout.math.Vector.get(int)
double get(int index);
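For reference, a minimal standalone sketch of calling Vector.get(int) on a Mahout DenseVector; the class name and vector contents below are illustrative only and are not taken from the source files listed further down:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class VectorGetSketch {
    public static void main(String[] args) {
        // get(i) returns the double stored at index i of the vector.
        Vector v = new DenseVector(new double[] { 1.0, 2.5, 0.0 });
        System.out.println("v.get(1) = " + v.get(1)); // prints 2.5

        // Iterating by index with get(i) is the pattern most of the examples below use.
        double sum = 0.0;
        for (int i = 0; i < v.size(); i++) {
            sum += v.get(i);
        }
        System.out.println("sum of entries = " + sum);
    }
}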
From source file:edu.snu.dolphin.bsp.examples.ml.sub.VectorListCodec.java
License:Apache License
@Override
public byte[] encode(final List<Vector> list) {
    /*
     * This codec assumes that vectors have the same length.
     */
    int length = 0;
    for (final Vector vector : list) {
        length = vector.size();
    }

    final ByteArrayOutputStream baos = new ByteArrayOutputStream(
            Integer.SIZE + Integer.SIZE + Double.SIZE * length * list.size());
    try (final DataOutputStream daos = new DataOutputStream(baos)) {
        daos.writeInt(list.size());
        daos.writeInt(length);

        for (final Vector vector : list) {
            for (int i = 0; i < length; i++) {
                daos.writeDouble(vector.get(i));
            }
        }
    } catch (final IOException e) {
        throw new RuntimeException(e.getCause());
    }

    return baos.toByteArray();
}
From source file:edu.utsa.sifter.som.MainSOM.java
License:Apache License
void somStats(final SifterConfig conf, final SelfOrganizingMap som, final ArrayList<ArrayList<Long>> clusters,
        final Writer somJS) throws IOException {
    somJS.write("{\"width\":" + som.width() + ", \"height\":" + som.height() + ", \n\"cells\":[\n");
    int numZero = 0;
    int numWith = 0;
    int totalWith = 0;
    long totalSSD = 0;
    int maxNum = 0;
    double maxSSD = 0;
    double maxStd = 0;
    for (int i = 0; i < som.numCells(); ++i) {
        final ArrayList<Long> cluster = clusters.get(i);
        if (cluster.size() == 0) {
            ++numZero;
        } else {
            ++numWith;
            totalWith += cluster.size();
        }
        totalSSD += som.getStats(i).sumSqrDistance();
        maxNum = Math.max(maxNum, cluster.size());
        maxSSD = Math.max(maxSSD, som.getStats(i).sumSqrDistance());
        maxStd = Math.max(maxStd, som.getStats(i).stdDev());

        somJS.write("{\"topTerms\":[");
        final java.util.Vector<String> topTerms = som.getStats(i).getTopTerms();
        for (int j = 0; j < Conf.NUM_TOP_CELL_TERMS; ++j) {
            if (j != 0) {
                somJS.write(", ");
            }
            somJS.write("\"");
            somJS.write(StringEscapeUtils.escapeEcmaScript(topTerms.get(j)));
            somJS.write("\"");
        }
        somJS.write("], ");
        somJS.write("\"num\":" + cluster.size() + ", \"stdDev\":" + som.getStats(i).stdDev() + ", \"ssd\":"
                + som.getStats(i).sumSqrDistance());
        somJS.write(", \"region\":" + som.getStats(i).getRegion());
        if (i + 1 == som.numCells()) {
            somJS.write("}\n");
        } else {
            somJS.write("},\n");
        }
    }
    somJS.write("], \"numZero\":" + numZero + ", \"numWith\":" + numWith);
    somJS.write(", \"totalWith\":" + totalWith + ", \"avgNum\":"
            + (numWith == 0 ? 0 : (double) totalWith / numWith));
    somJS.write(", \"numOutliers\":" + getNumOutliers());
    somJS.write(", \"ssd\":" + totalSSD + ", \"numRegions\":" + som.getNumRegions());
    somJS.write(", \"maxCellNum\":" + maxNum + ", \"maxCellSSD\":" + maxSSD + ", \"maxCellStd\":" + maxStd
            + ",\n\"regionColors\":[");
    for (int i = 0; i < som.getNumRegions(); ++i) {
        if (i > 0) {
            somJS.write(", ");
        }
        somJS.write(Integer.toString(som.getRegionColor(i)));
    }
    somJS.write("],\n\"regionMap\":[");
    final ArrayList<Set<Integer>> regionMap = som.getRegionMap();
    for (int i = 0; i < regionMap.size(); ++i) {
        if (i > 0) {
            somJS.write(", ");
        }
        somJS.write("[");
        final Set<Integer> adjMap = regionMap.get(i);
        int j = 0;
        for (Integer adj : adjMap) {
            if (j > 0) {
                somJS.write(", ");
            }
            somJS.write(Integer.toString(adj));
            ++j;
        }
        somJS.write("]");
    }
    somJS.write("],\n");
    somJS.write("\"cellTermDiffs\":[\n");
    for (int i = 0; i < som.numCells(); ++i) {
        final HashMap<Integer, Integer> diffs = som.getCellTermDiffs(i);
        if (i != 0) {
            somJS.write(",\n");
        }
        somJS.write("{");
        int j = 0;
        for (Map.Entry<Integer, Integer> pair : diffs.entrySet()) {
            if (j != 0) {
                somJS.write(", ");
            }
            ++j;
            somJS.write("\"");
            somJS.write(Integer.toString(pair.getKey()));
            somJS.write("\": \"");
            int val = pair.getValue();
            if (val < 0) {
                somJS.write("-");
                val = -1 * val;
            }
            somJS.write(Terms.get(val));
            somJS.write("\"");
        }
        somJS.write("}");
    }
    somJS.write("]\n");
    somJS.write("}\n");
}
From source file:edu.utsa.sifter.som.SelfOrganizingMap.java
License:Apache License
void assignTopTerms(final int numTopTerms, final java.util.Vector<String> terms) {
    final PriorityQueue<TermPair> topWeights = new PriorityQueue<TermPair>(numTopTerms,
            new TermPair.TermPairComparator());
    for (int i = 0; i < numCells(); ++i) {
        final java.util.Vector<String> topTerms = new java.util.Vector<String>(numTopTerms);
        topTerms.setSize(numTopTerms);
        final Vector cell = getCell(i);
        final double f = getFactor(i);
        topWeights.clear();
        for (Vector.Element w : cell) {
            int val = (int) (1000 * f * w.get());
            if (topWeights.size() < numTopTerms) {
                topWeights.add(new TermPair(terms.get(w.index()), val));
            } else if (topWeights.peek().DocCount < val) {
                topWeights.remove();
                topWeights.add(new TermPair(terms.get(w.index()), val));
            }
        }
        final int numTopWeights = topWeights.size();
        for (int j = numTopWeights - 1; j > -1; --j) {
            topTerms.set(j, topWeights.remove().Term);
        }
        getStats(i).setTopTerms(topTerms);
    }
}
From source file:mlbench.bayes.train.WeightSummer.java
License:Apache License
@SuppressWarnings("deprecation")
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, rank);
        Vector weightsPerFeature = null;
        Vector weightsPerLabel = new DenseVector(labNum);

        for (int i = 0; i < inputs.length; i++) {
            FileSplit fsplit = inputs[i];
            SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                    fsplit);

            IntWritable index = kvrr.createKey();
            VectorWritable value = kvrr.createValue();
            while (kvrr.next(index, value)) {
                Vector instance = value.get();
                if (weightsPerFeature == null) {
                    weightsPerFeature = new RandomAccessSparseVector(instance.size(),
                            instance.getNumNondefaultElements());
                }

                int label = index.get();
                weightsPerFeature.assign(instance, Functions.PLUS);
                weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
            }
        }
        if (weightsPerFeature != null) {
            MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
            MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDirW);
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString());
        ((JobConf) config).setOutputKeyClass(Text.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);

        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<Text, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }

        Text key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (Text) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }

            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }
        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }
        MPI_D.COMM_BIPARTITE_A.Barrier();
        if (rank == 0) {
            Path resOut = new Path(outDir);
            NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config);
            naiveBayesModel.serialize(resOut, config);
        }
    }
    MPI_D.Finalize();
}
From source file:mlbench.kmeans.KmeansUtils.java
License:Apache License
static void accumulate(double[] sum, Vector vector) throws MPI_D_Exception {
    if (sum.length != vector.size()) {
        throw new MPI_D_Exception("Array length is incorrect!");
    }
    for (int i = 0; i < sum.length; i++) {
        sum[i] += vector.get(i);
    }
}
From source file:net.aprendizajengrande.ontocluster.Clusterer.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }
    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));
    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
From source file:net.aprendizajengrande.ontocluster.ClusterExtractor.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with rels> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }

    // delete the _SUCCESS file as it is problematic
    // see
    // http://stackoverflow.com/questions/10752708/eofexception-at-org-apache-hadoop-io-sequencefilereader-initsequencefile-java
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));
    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
From source file:opennlp.addons.mahout.VectorClassifierModel.java
License:Apache License
public double[] eval(String[] features) {
    Vector vector = new RandomAccessSparseVector(predMap.size());
    for (String feature : features) {
        Integer featureId = predMap.get(feature);
        if (featureId != null) {
            vector.set(featureId, vector.get(featureId) + 1);
        }
    }

    Vector resultVector = classifier.classifyFull(vector);

    double outcomes[] = new double[classifier.numCategories()];
    for (int i = 0; i < outcomes.length; i++) {
        outcomes[i] = resultVector.get(i);
    }

    return outcomes;
}
From source file:org.apache.crunch.examples.Recommender.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println();
        System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }
    Pipeline pipeline = new MRPipeline(Recommender.class, getConf());

    /*
     * input node
     */
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    /*
     * S0 + GBK
     */
    PGroupedTable<Long, Long> userWithPrefs = lines.parallelDo(new MapFn<String, Pair<Long, Long>>() {
        @Override
        public Pair<Long, Long> map(String input) {
            String[] split = input.split("[,\\s]");
            long userID = Long.parseLong(split[0]);
            long itemID = Long.parseLong(split[1]);
            return Pair.of(userID, itemID);
        }
    }, Writables.tableOf(Writables.longs(), Writables.longs())).groupByKey();

    /*
     * S1
     */
    PTable<Long, Vector> userVector = userWithPrefs
            .parallelDo(new MapFn<Pair<Long, Iterable<Long>>, Pair<Long, Vector>>() {
                @Override
                public Pair<Long, Vector> map(Pair<Long, Iterable<Long>> input) {
                    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (long itemPref : input.second()) {
                        userVector.set((int) itemPref, 1.0f);
                    }
                    return Pair.of(input.first(), userVector);
                }
            }, Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S2 + GBK
     */
    PGroupedTable<Integer, Integer> coOccurencePairs = userVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, Integer>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, Integer>> emitter) {
                    Iterator<Vector.Element> it = input.second().iterateNonZero();
                    while (it.hasNext()) {
                        int index1 = it.next().index();
                        Iterator<Vector.Element> it2 = input.second().iterateNonZero();
                        while (it2.hasNext()) {
                            int index2 = it2.next().index();
                            emitter.emit(Pair.of(index1, index2));
                        }
                    }
                }
            }, Writables.tableOf(Writables.ints(), Writables.ints())).groupByKey();

    /*
     * S3
     */
    PTable<Integer, Vector> coOccurenceVector = coOccurencePairs
            .parallelDo(new MapFn<Pair<Integer, Iterable<Integer>>, Pair<Integer, Vector>>() {
                @Override
                public Pair<Integer, Vector> map(Pair<Integer, Iterable<Integer>> input) {
                    Vector cooccurrenceRow = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (int itemIndex2 : input.second()) {
                        cooccurrenceRow.set(itemIndex2, cooccurrenceRow.get(itemIndex2) + 1.0);
                    }
                    return Pair.of(input.first(), cooccurrenceRow);
                }
            }, Writables.tableOf(Writables.ints(), Writables.vectors()));

    /*
     * asText
     */
    pipeline.writeTextFile(coOccurenceVector, args[1]);
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
}
From source file:org.qcri.algebra.MultiplicationTest.java
License:Apache License
void verifySquareSum(Path sumPath) throws IOException {
    Vector sumVec = AlgebraCommon.mapDirToSparseVector(sumPath, 1, colsA, conf);
    double[][] vectorsA = inputVectorsA;
    for (int r = 0; r < vectorsA.length; r++) {
        double sum = 0;
        for (int c = 0; c < vectorsA[0].length; c++)
            sum += vectorsA[r][c] * vectorsA[r][c];
        Assert.assertEquals("The sum of a[" + r + "][*] is incorrect: ", sum, sumVec.get(r), EPSILON);
    }
}