List of usage examples for the org.apache.mahout.math.SequentialAccessSparseVector constructor
public SequentialAccessSparseVector(SequentialAccessSparseVector other)
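Before the project-specific examples below, here is a minimal standalone sketch of how this constructor is typically reached: a vector is built with random access and then copied into a SequentialAccessSparseVector (or another SequentialAccessSparseVector) for fast sequential iteration. This sketch is not taken from the sources below; it only uses the standard Mahout math API.

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;

public class SequentialAccessSparseVectorExample {
    public static void main(String[] args) {
        // Build the vector with random writes first.
        Vector scratch = new RandomAccessSparseVector(10);
        scratch.setQuick(2, 1.5);
        scratch.setQuick(7, 3.0);

        // Copy into a sequential-access representation: the same non-zero
        // entries, stored in index order for fast sequential reads.
        SequentialAccessSparseVector sequential = new SequentialAccessSparseVector(scratch);

        // The constructor documented above: copy an existing
        // SequentialAccessSparseVector.
        SequentialAccessSparseVector copy = new SequentialAccessSparseVector(sequential);

        System.out.println(copy);
    }
}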
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);

            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);

            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);

        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
From source file:edu.rosehulman.TFPartialVectorReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }

    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file:guipart.view.GUIOverviewController.java
@FXML
void handleClassifyModel(ActionEvent event) throws IOException {

    if (pathModel != null && pathCSV != null) {

        Auc collector = new Auc();
        LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(pathModel));

        CsvRecordFactory csv = lmp.getCsvRecordFactory();
        OnlineLogisticRegression lr = lmp.createRegression();

        BufferedReader in = Utils.open(pathCSV);

        String line = in.readLine();
        csv.firstLine(line);
        line = in.readLine();

        int correct = 0;
        int wrong = 0;
        Boolean booltemp;
        String gender;

        while (line != null) {

            Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
            int target = csv.processLine(line, v);
            String[] split = line.split(",");

            double score = lr.classifyFull(v).maxValueIndex();

            if (score == target)
                correct++;
            else
                wrong++;

            System.out.println("Target is: " + target + " Score: " + score);

            booltemp = score != 0;

            if (split[1].contentEquals("1"))
                gender = "male";
            else
                gender = "female";

            Person temp = new Person(Integer.parseInt(split[0]), Integer.parseInt(split[4]),
                    Integer.parseInt(split[7]), booltemp, gender, Integer.parseInt(split[5]),
                    Integer.parseInt(split[6]), Integer.parseInt(split[3]));

            guiPart.addPerson(temp);

            line = in.readLine();
            collector.add(target, score);
        }

        double posto = ((double) wrong / (double) (correct + wrong)) * 100;

        System.out.println("Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: " + wrong
                + " Wrong pct: " + posto + "%");

        Matrix m = collector.confusion();
        System.out.println("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t " + m.get(0, 1) + " "
                + m.get(1, 1) + " ");

        textAnalyze2.setText("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t \t " + m.get(0, 1) + " "
                + m.get(1, 1) + "\n" + "Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: "
                + wrong + " Wrong pct: " + posto + "%");

    } else {
        Dialogs.create().owner(guiPart.getPrimaryStage()).title("Error Dialog")
                .masthead("Look, an Error Dialog").message("One or more files aren't selected").showError();
    }
}
From source file:guipart.view.GUIOverviewController.java
@FXML
void singlClassify(ActionEvent e) throws IOException {

    LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(pathModel));
    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    OnlineLogisticRegression lr = lmp.createRegression();

    csv.firstLine("custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk");

    String line;
    line = scID.getText();
    line = line.concat("," + scGender.getText());
    line = line.concat("," + scState.getText());
    line = line.concat("," + scCardholders.getText());
    line = line.concat("," + scBalance.getText());
    line = line.concat("," + scTrans.getText());
    line = line.concat("," + scIntlTrans.getText());
    line = line.concat("," + scCreditLine.getText());
    line = line.concat(",0 \n");

    Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
    int target = csv.processLine(line, v);
    String[] split = line.split(",");

    double score = lr.classifyFull(v).maxValueIndex();
    boolean booltemp = score != 0;
    String gender;

    if (split[1].contentEquals("1"))
        gender = "male";
    else
        gender = "female";

    Person temp = new Person(Integer.parseInt(split[0]), Integer.parseInt(split[4]), Integer.parseInt(split[7]),
            booltemp, gender, Integer.parseInt(split[5]), Integer.parseInt(split[6]),
            Integer.parseInt(split[3]));

    guiPart.addPerson(temp);
}
From source file:haflow.component.mahout.logistic.RunLogistic.java
License:Apache License
static void mainToOutput(String[] args) throws Exception {
    if (parseArgs(args)) {
        if (!showAuc && !showConfusion && !showScores) {
            showAuc = true;
            showConfusion = true;
        }

        PrintWriter output = new PrintWriter(HdfsUtil.writeHdfs(outputFile), true);
        PrintWriter acc_output = new PrintWriter(HdfsUtil.writeHdfs(accurateFile), true);
        Auc collector = new Auc();
        LogisticModelParameters lmp = LogisticModelParameters.loadFrom(HdfsUtil.open(modelFile));

        CsvRecordFactory csv = lmp.getCsvRecordFactory();
        OnlineLogisticRegression lr = lmp.createRegression();
        BufferedReader in = new BufferedReader(new InputStreamReader(HdfsUtil.open(inputFile)));

        String line = in.readLine();
        csv.firstLine(line);
        line = in.readLine();
        if (showScores) {
            output.println("\"target\",\"model-output\",\"log-likelihood\"");
        }
        while (line != null) {
            Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
            int target = csv.processLine(line, v);

            double score = lr.classifyScalar(v);
            if (showScores) {
                output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
            }
            collector.add(target, score);
            line = in.readLine();
        }

        if (showAuc) {
            acc_output.printf(Locale.ENGLISH, "AUC , %.2f%n", collector.auc());
        }
        if (showConfusion) {
            Matrix m = collector.confusion();
            acc_output.printf(Locale.ENGLISH, "confusion, [[%.1f %.1f], [%.1f %.1f]]%n", m.get(0, 0),
                    m.get(1, 0), m.get(0, 1), m.get(1, 1));

            m = collector.entropy();
            acc_output.printf(Locale.ENGLISH, "entropy, [[%.1f %.1f], [%.1f %.1f]]%n", m.get(0, 0),
                    m.get(1, 0), m.get(0, 1), m.get(1, 1));
        }
        output.close();
        acc_output.close();
    }
}
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
public static void matrix2Vector(Configuration conf, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = null;
    // read each row of the SequenceFile so it can be rewritten as a NamedVector
    reader = new SequenceFile.Reader(fs, path, conf);
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable val = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

    Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class,
                CompressionType.BLOCK);
        final IntWritable key1 = new IntWritable();
        final VectorWritable value = new VectorWritable();

        int lineNum = 0;
        Vector vector = null;
        while (reader.next(key, val)) {
            int index = 0;
            StringTokenizer st = new StringTokenizer(val.toString());
            // wrap a SequentialAccessSparseVector in a NamedVector, named after the line number
            vector = new NamedVector(new SequentialAccessSparseVector(Cardinality), lineNum + "");
            while (st.hasMoreTokens()) {
                if (Integer.parseInt(st.nextToken()) == 1) {
                    vector.set(index, 1);
                }
                index++;
            }
            key1.set(lineNum++);
            value.set(vector);
            writer.append(key1, value); // write the line number as the key
        }
    } finally {
        writer.close();
        reader.close();
    }
}
From source file:javaapplication3.RunLogistic.java
public static void main(String[] args) throws IOException {

    Auc collector = new Auc();
    LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));

    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    OnlineLogisticRegression lr = lmp.createRegression();
    BufferedReader in = open(inputFile);

    String line = in.readLine();
    csv.firstLine(line);
    line = in.readLine();

    int correct = 0;
    int wrong = 0;

    while (line != null) {

        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
        int target = csv.processLine(line, v);
        System.out.println(line);
        String[] split = line.split(",");

        double score = lr.classifyFull(v).maxValueIndex();

        if (score == target)
            correct++;
        else
            wrong++;

        System.out.println("Target is: " + target + " Score: " + score);

        line = in.readLine();
        collector.add(target, score);
    }

    double posto = ((double) wrong / (double) (correct + wrong)) * 100;

    System.out.println("Total: " + (correct + wrong) + " Correct: " + correct + " Wrong: " + wrong
            + " Wrong pct: " + posto + "%");

    Matrix m = collector.confusion();
    System.out.println("Confusion:" + m.get(0, 0) + " " + m.get(1, 0) + "\n \t " + m.get(0, 1) + " "
            + m.get(1, 1) + " ");
}
From source file:net.aprendizajengrande.ontocluster.RedisToVectors.java
License:Open Source License
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.err.println("Usage: <hdfs folder for input>");
        System.exit(1);
    }

    Configuration conf = new Configuration();

    System.out.println("Input: " + args[0]);

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    String inputName = args[0] + "/input";
    String relsInputName = args[0] + "/rels";
    String instancesInputName = args[0] + "/instances";

    Path input = new Path(inputName);
    Path relsInput = new Path(relsInputName);
    Path instancesInput = new Path(instancesInputName);

    // see http://stackoverflow.com/questions/14993644/configure-jedis-timeout
    Jedis jedis = new Jedis("localhost", 6379, 18000);

    // create the relations and instances first, so we know what to expect
    Set<String> rels = jedis.keys("rel-nom-*");

    Map<Integer, String> relIdToName = new HashMap<>();

    FSDataOutputStream fsdos = relsInput.getFileSystem(conf).create(relsInput);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(fsdos));
    int relNum = 0;
    for (String rel : rels) {
        String relName = rel.replaceAll("^rel-nom-", "");
        int relId = Integer.parseInt(jedis.get(rel));
        relIdToName.put(relId, relName);
        if (relId > relNum)
            relNum = relId;
    }
    relNum++;
    for (int i = 0; i < relNum; i++)
        pw.println(i + "\t" + relIdToName.get(i));
    pw.close();
    rels.clear();

    Set<String> instances = jedis.keys("res-nom-*");
    fsdos = instancesInput.getFileSystem(conf).create(instancesInput);
    pw = new PrintWriter(new OutputStreamWriter(fsdos));
    for (String instance : instances) {
        int instanceId = Integer.parseInt(instance.replaceAll("^res-nom-", ""));
        String instanceName = jedis.get(instance);
        pw.println(instanceId + "\t" + instanceName);
    }
    pw.close();
    instances.clear();

    Set<String> keys = jedis.keys("r-*");

    SequenceFile.Writer writer = SequenceFile.createWriter(conf, Writer.file(input),
            Writer.keyClass(Text.class), Writer.valueClass(VectorWritable.class));

    for (String key : keys) {
        Set<String> theseRels = jedis.smembers(key);

        Vector s = new SequentialAccessSparseVector(relNum);
        for (String relId : theseRels)
            s.set(Integer.parseInt(relId), 1.0);

        VectorWritable v = new VectorWritable(s);
        writer.append(new Text(key), v);
    }

    writer.close();
    jedis.close();
}
From source file:org.qcri.pca.FileFormat.java
public static void convertFromDenseToSeq(String inputPath, int cardinality, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        int lineNumber = 0;
        String thisLine;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) { // if it is a file
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            Vector vector = null;
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) {
                if (thisLine.isEmpty())
                    continue;
                String[] splitted = thisLine.split("\\s+");
                vector = new SequentialAccessSparseVector(splitted.length);
                for (int i = 0; i < splitted.length; i++) {
                    vector.set(i, Double.parseDouble(splitted[i]));
                }
                key.set(lineNumber);
                value.set(vector);
                writer.append(key, value); // write this row
                lineNumber++;
            }
            writer.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:org.qcri.pca.FileFormat.java
public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = null;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();
        Vector vector = null;

        String thisLine;
        int prevRowID = -1;
        boolean first = true;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) { // if it is a file
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) {
                String[] splitted = thisLine.split(",");
                int rowID = Integer.parseInt(splitted[0]);
                int colID = Integer.parseInt(splitted[1]);
                double element = Double.parseDouble(splitted[2]);
                if (first) {
                    first = false;
                    vector = new SequentialAccessSparseVector(cardinality);
                } else if (rowID != prevRowID) {
                    key.set(prevRowID);
                    value.set(vector);
                    writer.append(key, value); // write the completed previous row
                    vector = new SequentialAccessSparseVector(cardinality);
                }
                prevRowID = rowID;
                vector.set(colID - base, element);
            }
            // the last vector of each file is not written here; rows are assumed to
            // continue into the next file, and the final row is appended below
        }
        if (writer != null) { // append the last vector of the last file
            key.set(prevRowID);
            value.set(vector);
            writer.append(key, value); // write last row
            writer.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}