List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
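Text stores its contents as UTF-8-encoded bytes, and toString() decodes that buffer back into a java.lang.String. Before the harvested examples below, a minimal standalone sketch of the call (the class name is illustrative; it assumes only hadoop-common on the classpath):

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        Text text = new Text("hello hadoop");
        // toString() decodes the Text's UTF-8 byte buffer into a String
        System.out.println(text.toString()); // hello hadoop

        // Writables like Text are typically reused across records;
        // toString() always reflects the current contents
        text.set("next record");
        System.out.println(text.toString()); // next record
    }
}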
From source file:com.cloudera.knittingboar.metrics.POLRModelTester.java
License:Apache License
/**
 * Runs the next training batch to prep the gamma buffer to send to the
 * mstr_node.
 *
 * TODO: need to provide stats, group measurements into struct
 *
 * @throws IOException
 * @throws Exception
 */
public void RunThroughTestRecords() throws IOException, Exception {
    Text value = new Text();
    long batch_vec_factory_time = 0;
    k = 0;
    int num_correct = 0;

    for (int x = 0; x < this.BatchSize; x++) {
        if (this.input_split.next(value)) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(this.FeatureVectorSize);
            int actual = this.VectorFactory.processLine(value.toString(), v);
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            String ng = this.VectorFactory.GetClassnameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = this.polr.logLikelihood(actual, v);
            // skip NaN log-likelihoods so they don't poison the running average
            if (!Double.isNaN(ll)) {
                metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                        + (ll - metrics.AvgLogLikelihood) / mu;
            }

            Vector p = new DenseVector(20);
            this.polr.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            if (estimated == actual) {
                num_correct++;
            }
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;
            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
            if (k % (bump * scale) == 0) {
                step += 0.25;
                System.out.printf(
                        "Worker %s:\t Trained Recs: %10d, numCorrect: %d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        this.internalID, k, num_correct, metrics.AvgLogLikelihood,
                        metrics.AvgCorrect * 100, batch_vec_factory_time);
            }
            this.polr.close();
        } else {
            // nothing else to process in split!
            break;
        }
    } // for the number of passes in the run
}
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java
License:Apache License
@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test

    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1())
            .alpha(1).stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    LOG.info("---- debug splits --------- ");
    rec_factory.Debug();

    int total_read = 0;
    for (int x = 0; x < splits.length; x++) {
        LOG.info("> Split [" + x + "]: " + splits[x].getLength());
        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);
            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;

            learningAlgorithm.train(actual, v);
            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }
            learningAlgorithm.close();
            count++;
        }
        LOG.info("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java
License:Apache License
public void testRecordFactoryOnDatasetShard() throws Exception {
    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");
    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    System.out.println("---- debug splits --------- ");
    rec_factory.Debug();

    int total_read = 0;
    long ts_start = System.currentTimeMillis();
    for (int x = 0; x < splits.length; x++) {
        System.out.println("> Split [" + x + "]: " + splits[x].getLength());
        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);
            count++;
        }
        System.out.println("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    long ts_total = System.currentTimeMillis() - ts_start;
    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);
    System.out.println("total recs read across all splits: " + total_read);
    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    // JUnit's assertEquals takes (expected, actual)
    assertEquals(11314, total_read);

    rec_factory.Debug();
}
From source file:com.cloudera.knittingboar.sgd.iterativereduce.POLRWorkerNode.java
License:Apache License
/**
 * The IR::Compute method - this is where we do the next batch of records for SGD.
 */
@Override
public ParameterVectorGradientUpdatable compute() {
    Text value = new Text();
    long batch_vec_factory_time = 0;
    boolean result = true;

    while (this.lineParser.hasMoreRecords()) {
        try {
            result = this.lineParser.next(value);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        if (result) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(this.FeatureVectorSize);
            int actual = -1;
            try {
                actual = this.VectorFactory.processLine(value.toString(), v);
            } catch (Exception e) {
                e.printStackTrace();
            }
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = this.polr.logLikelihood(actual, v);
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                    + (ll - metrics.AvgLogLikelihood) / mu;
            if (Double.isNaN(metrics.AvgLogLikelihood)) {
                metrics.AvgLogLikelihood = 0;
            }

            Vector p = new DenseVector(this.num_categories);
            this.polr.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;

            this.polr.train(actual, v);
            k++;
            metrics.TotalRecordsProcessed = k;

            this.polr.close();
        } else {
            // nothing else to process in split!
        }
    } // while the parser has records

    System.err.printf(
            "Worker %s:\t Iteration: %s, Trained Recs: %10d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
            this.internalID, this.CurrentIteration, k, metrics.AvgLogLikelihood,
            metrics.AvgCorrect * 100, batch_vec_factory_time);

    return new ParameterVectorGradientUpdatable(this.GenerateUpdate());
}
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java
License:Apache License
public void testResults() throws Exception {
    OnlineLogisticRegression classifier = ModelSerializer
            .readBinary(new FileInputStream(model20News.toString()), OnlineLogisticRegression.class);

    Text value = new Text();
    long batch_vec_factory_time = 0;
    int k = 0;
    int num_correct = 0;

    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(testData20News, job);
    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    TwentyNewsgroupsRecordFactory VectorFactory = new TwentyNewsgroupsRecordFactory("\t");

    for (int x = 0; x < 8000; x++) {
        if (custom_reader_0.next(value)) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(FEATURES);
            int actual = VectorFactory.processLine(value.toString(), v);
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            String ng = VectorFactory.GetClassnameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = classifier.logLikelihood(actual, v);
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                    + (ll - metrics.AvgLogLikelihood) / mu;

            Vector p = new DenseVector(20);
            classifier.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            if (estimated == actual) {
                num_correct++;
            }
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;
            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
            if (k % (bump * scale) == 0) {
                step += 0.25;
                System.out.printf(
                        "Worker %s:\t Tested Recs: %10d, numCorrect: %d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        "OLR-standard-test", k, num_correct, metrics.AvgLogLikelihood,
                        metrics.AvgCorrect * 100, batch_vec_factory_time);
            }
            classifier.close();
        } else {
            // nothing else to process in split!
            break;
        }
    } // for the number of passes in the run
}
From source file:com.cloudera.knittingboar.sgd.POLRWorkerDriver.java
License:Apache License
/**
 * Runs the next training batch to prep the gamma buffer to send to the
 * mstr_node.
 *
 * TODO: need to provide stats, group measurements into struct
 *
 * @throws IOException
 * @throws Exception
 */
public boolean RunNextTrainingBatch() throws IOException, Exception {
    Text value = new Text();
    long batch_vec_factory_time = 0;

    if (this.LocalPassCount > this.GlobalPassCount) {
        // we need to sit this one out
        System.out.println("Worker " + this.internalID + " is ahead of global pass count ["
                + this.LocalPassCount + ":" + this.GlobalPassCount + "] ");
        return true;
    }

    if (this.LocalPassCount >= this.NumberPasses) {
        // learning is done, terminate
        System.out.println("Worker " + this.internalID + " is done [" + this.LocalPassCount + ":"
                + this.GlobalPassCount + "] ");
        return false;
    }

    for (int x = 0; x < this.BatchSize; x++) {
        if (this.input_split.next(value)) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(this.FeatureVectorSize);
            int actual = this.VectorFactory.processLine(value.toString(), v);
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = this.polr.logLikelihood(actual, v);
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                    + (ll - metrics.AvgLogLikelihood) / mu;

            Vector p = new DenseVector(20);
            this.polr.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;

            this.polr.train(actual, v);
            k++;

            if (x == this.BatchSize - 1) {
                System.out.printf(
                        "Worker %s:\t Trained Recs: %10d, loglikelihood: %10.3f, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        this.internalID, k, ll, metrics.AvgLogLikelihood,
                        metrics.AvgCorrect * 100, batch_vec_factory_time);
            }
            this.polr.close();
        } else {
            this.LocalPassCount++;
            this.input_split.ResetToStartOfSplit();
            // nothing else to process in split!
            break;
        }
    } // for the batch size
    return true;
}
From source file:com.cloudera.knittingboar.sgd.TestRunRCV1Subset.java
License:Apache License
public void testSplits() throws IOException {
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(inputDir, job);
    System.out.println("split count: " + splits.length);
    assertEquals(10, splits.length);

    InputSplit[] splits_full = generateDebugSplits(fullRCV1Dir, job);
    System.out.println("full rcv1 split count: " + splits_full.length);

    Text value = new Text();
    for (int x = 0; x < splits_full.length; x++) {
        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits_full[x]);

        // print the first three records of each split
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString());
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString());
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString() + "\n");
    }
}
From source file:com.cloudera.recordbreaker.analyzer.AvroSequenceFileSchemaDescriptor.java
License:Open Source License
void computeSchema() throws IOException {
    try {
        AvroSequenceFile.Reader.Options options = new AvroSequenceFile.Reader.Options()
                .withFileSystem(FSAnalyzer.getInstance().getFS()).withInputPath(dd.getFilename())
                .withConfiguration(new Configuration());
        AvroSequenceFile.Reader in = new AvroSequenceFile.Reader(options);
        try {
            //
            // Look for the Avro metadata keys in the SequenceFile. These encode the Avro schemas.
            //
            SequenceFile.Metadata seqFileMetadata = in.getMetadata();
            TreeMap<Text, Text> kvs = seqFileMetadata.getMetadata();
            Text keySchemaStr = kvs.get(AvroSequenceFile.METADATA_FIELD_KEY_SCHEMA);
            Text valSchemaStr = kvs.get(AvroSequenceFile.METADATA_FIELD_VALUE_SCHEMA);
            Schema keySchema = Schema.parse(keySchemaStr.toString());
            Schema valSchema = Schema.parse(valSchemaStr.toString());

            //
            // Build a "pair record" with "key" and "value" fields to hold the subschemas.
            //
            List<Schema.Field> fieldList = new ArrayList<Schema.Field>();
            fieldList.add(new Schema.Field("key", keySchema, "", null));
            fieldList.add(new Schema.Field("val", valSchema, "", null));
            this.schema = Schema.createRecord(fieldList);
        } finally {
            in.close();
        }
    } catch (IOException iex) {
        // note: parse/read errors are swallowed here, leaving this.schema null on failure
    }
}
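The Schema.parse(String) calls above are deprecated as of Avro 1.5. A minimal sketch of the replacement on the same decoded metadata strings, assuming a newer Avro release is in use:

// Schema.Parser replaces the deprecated static Schema.parse(String).
// A fresh Parser per schema avoids named-type collisions if the key
// and value schemas happen to define types with the same name.
Schema keySchema = new Schema.Parser().parse(keySchemaStr.toString());
Schema valSchema = new Schema.Parser().parse(valSchemaStr.toString());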
From source file:com.cloudera.sa.hbasebulkload.HBASEBulkLoadKeyValueMapper.java
@Override
protected void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
    outputFields = csvParser.parseLine(line.toString());

    hKey.set(outputFields[0].getBytes());
    hPut = new Put(outputFields[0].getBytes());

    for (int i = 1; i < noOfColumns; i++) {
        if (hbaseColumns.length > i && outputFields.length > i && outputFields[i] != null
                && !outputFields[i].trim().isEmpty()) {
            hPut = hPut.addColumn(hbaseColumnFamily.getBytes(), hbaseColumns[i].getBytes(),
                    outputFields[i].getBytes());
        }
    }
    context.write(hKey, hPut);
}
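One caveat in this example: line.toString() always decodes UTF-8, but the String.getBytes() calls re-encode with the JVM's default charset, which can corrupt non-ASCII fields on some platforms. A minimal sketch of a charset-safe variant using HBase's org.apache.hadoop.hbase.util.Bytes helper, whose Bytes.toBytes(String) always encodes UTF-8:

import org.apache.hadoop.hbase.util.Bytes;

// row key and Put, encoded as UTF-8 to match Text's decoding
hKey.set(Bytes.toBytes(outputFields[0]));
hPut = new Put(Bytes.toBytes(outputFields[0]));
// inside the column loop:
hPut.addColumn(Bytes.toBytes(hbaseColumnFamily), Bytes.toBytes(hbaseColumns[i]),
        Bytes.toBytes(outputFields[i]));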
From source file:com.cloudera.sa.securewordcount.TokenizerMapper.java
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    strTokenizer = new StringTokenizer(value.toString(), tokenDelim, true);
    while (strTokenizer.hasMoreElements()) {
        word.set(strTokenizer.nextToken());
        context.write(word, one);
    }
    tokenizerMapperLines.increment(1);
}