List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
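Text stores its contents as UTF-8-encoded bytes, and toString() decodes that buffer back into a java.lang.String. Before the harvested examples below, a minimal standalone sketch of the call (the class name is illustrative; it assumes only hadoop-common on the classpath):

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        Text text = new Text("hello hadoop");
        // toString() decodes the Text's UTF-8 byte buffer into a String
        System.out.println(text.toString()); // hello hadoop

        // Writables like Text are typically reused across records;
        // toString() always reflects the current contents
        text.set("next record");
        System.out.println(text.toString()); // next record
    }
}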
From source file:com.cloudera.knittingboar.metrics.POLRModelTester.java
License:Apache License
/**
 * Runs the next training batch to prep the gamma buffer to send to the
 * mstr_node.
 *
 * TODO: need to provide stats, group measurements into struct
 *
 * @throws IOException
 * @throws Exception
 */
public void RunThroughTestRecords() throws IOException, Exception {
    Text value = new Text();
    long batch_vec_factory_time = 0;
    k = 0;
    int num_correct = 0;

    for (int x = 0; x < this.BatchSize; x++) {
        if (this.input_split.next(value)) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(this.FeatureVectorSize);
            int actual = this.VectorFactory.processLine(value.toString(), v);
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            String ng = this.VectorFactory.GetClassnameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = this.polr.logLikelihood(actual, v);
            // skip NaN log-likelihoods so they don't poison the running average
            if (!Double.isNaN(ll)) {
                metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                        + (ll - metrics.AvgLogLikelihood) / mu;
            }

            Vector p = new DenseVector(20);
            this.polr.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            if (estimated == actual) {
                num_correct++;
            }
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;
            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
            if (k % (bump * scale) == 0) {
                step += 0.25;
                System.out.printf(
                        "Worker %s:\t Trained Recs: %10d, numCorrect: %d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        this.internalID, k, num_correct, metrics.AvgLogLikelihood,
                        metrics.AvgCorrect * 100, batch_vec_factory_time);
            }
            this.polr.close();
        } else {
            // nothing else to process in split!
            break;
        }
    } // for the number of passes in the run
}
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java
License:Apache License
@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test

    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1())
            .alpha(1).stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    LOG.info("---- debug splits --------- ");
    rec_factory.Debug();

    int total_read = 0;
    for (int x = 0; x < splits.length; x++) {
        LOG.info("> Split [" + x + "]: " + splits[x].getLength());
        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);
            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;

            learningAlgorithm.train(actual, v);
            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }
            learningAlgorithm.close();
            count++;
        }
        LOG.info("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java
License:Apache License
public void testRecordFactoryOnDatasetShard() throws Exception {
    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");
    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    System.out.println("---- debug splits --------- ");
    rec_factory.Debug();

    int total_read = 0;
    long ts_start = System.currentTimeMillis();
    for (int x = 0; x < splits.length; x++) {
        System.out.println("> Split [" + x + "]: " + splits[x].getLength());
        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);
            count++;
        }
        System.out.println("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    long ts_total = System.currentTimeMillis() - ts_start;
    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);
    System.out.println("total recs read across all splits: " + total_read);
    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    // JUnit's assertEquals takes (expected, actual)
    assertEquals(11314, total_read);

    rec_factory.Debug();
}
From source file:com.cloudera.knittingboar.sgd.iterativereduce.POLRWorkerNode.java
License:Apache License
/**
 * The IR::Compute method - this is where we do the next batch of records for SGD.
 */
@Override
public ParameterVectorGradientUpdatable compute() {
    Text value = new Text();
    long batch_vec_factory_time = 0;
    boolean result = true;

    while (this.lineParser.hasMoreRecords()) {
        try {
            result = this.lineParser.next(value);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        if (result) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(this.FeatureVectorSize);
            int actual = -1;
            try {
                actual = this.VectorFactory.processLine(value.toString(), v);
            } catch (Exception e) {
                e.printStackTrace();
            }
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = this.polr.logLikelihood(actual, v);
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                    + (ll - metrics.AvgLogLikelihood) / mu;
            if (Double.isNaN(metrics.AvgLogLikelihood)) {
                metrics.AvgLogLikelihood = 0;
            }

            Vector p = new DenseVector(this.num_categories);
            this.polr.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;

            this.polr.train(actual, v);
            k++;
            metrics.TotalRecordsProcessed = k;

            this.polr.close();
        } else {
            // nothing else to process in split!
        }
    } // while the parser has records

    System.err.printf(
            "Worker %s:\t Iteration: %s, Trained Recs: %10d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
            this.internalID, this.CurrentIteration, k, metrics.AvgLogLikelihood,
            metrics.AvgCorrect * 100, batch_vec_factory_time);

    return new ParameterVectorGradientUpdatable(this.GenerateUpdate());
}
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java
License:Apache License
public void testResults() throws Exception {
    OnlineLogisticRegression classifier = ModelSerializer
            .readBinary(new FileInputStream(model20News.toString()), OnlineLogisticRegression.class);

    Text value = new Text();
    long batch_vec_factory_time = 0;
    int k = 0;
    int num_correct = 0;

    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(testData20News, job);
    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    TwentyNewsgroupsRecordFactory VectorFactory = new TwentyNewsgroupsRecordFactory("\t");

    for (int x = 0; x < 8000; x++) {
        if (custom_reader_0.next(value)) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(FEATURES);
            int actual = VectorFactory.processLine(value.toString(), v);
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            String ng = VectorFactory.GetClassnameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = classifier.logLikelihood(actual, v);
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                    + (ll - metrics.AvgLogLikelihood) / mu;

            Vector p = new DenseVector(20);
            classifier.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            if (estimated == actual) {
                num_correct++;
            }
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;
            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));
            if (k % (bump * scale) == 0) {
                step += 0.25;
                System.out.printf(
                        "Worker %s:\t Tested Recs: %10d, numCorrect: %d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        "OLR-standard-test", k, num_correct, metrics.AvgLogLikelihood,
                        metrics.AvgCorrect * 100, batch_vec_factory_time);
            }
            classifier.close();
        } else {
            // nothing else to process in split!
            break;
        }
    } // for the number of passes in the run
}
From source file:com.cloudera.knittingboar.sgd.POLRWorkerDriver.java
License:Apache License
/**
 * Runs the next training batch to prep the gamma buffer to send to the
 * mstr_node.
 *
 * TODO: need to provide stats, group measurements into struct
 *
 * @throws IOException
 * @throws Exception
 */
public boolean RunNextTrainingBatch() throws IOException, Exception {
    Text value = new Text();
    long batch_vec_factory_time = 0;

    if (this.LocalPassCount > this.GlobalPassCount) {
        // we need to sit this one out
        System.out.println("Worker " + this.internalID + " is ahead of global pass count ["
                + this.LocalPassCount + ":" + this.GlobalPassCount + "] ");
        return true;
    }

    if (this.LocalPassCount >= this.NumberPasses) {
        // learning is done, terminate
        System.out.println("Worker " + this.internalID + " is done [" + this.LocalPassCount + ":"
                + this.GlobalPassCount + "] ");
        return false;
    }

    for (int x = 0; x < this.BatchSize; x++) {
        if (this.input_split.next(value)) {
            long startTime = System.currentTimeMillis();
            Vector v = new RandomAccessSparseVector(this.FeatureVectorSize);
            int actual = this.VectorFactory.processLine(value.toString(), v);
            long endTime = System.currentTimeMillis();
            batch_vec_factory_time += (endTime - startTime);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = this.polr.logLikelihood(actual, v);
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood
                    + (ll - metrics.AvgLogLikelihood) / mu;

            Vector p = new DenseVector(20);
            this.polr.classifyFull(p, v);
            int estimated = p.maxValueIndex();
            int correct = (estimated == actual ? 1 : 0);
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;

            this.polr.train(actual, v);
            k++;

            if (x == this.BatchSize - 1) {
                System.out.printf(
                        "Worker %s:\t Trained Recs: %10d, loglikelihood: %10.3f, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        this.internalID, k, ll, metrics.AvgLogLikelihood,
                        metrics.AvgCorrect * 100, batch_vec_factory_time);
            }
            this.polr.close();
        } else {
            this.LocalPassCount++;
            this.input_split.ResetToStartOfSplit();
            // nothing else to process in split!
            break;
        }
    } // for the batch size
    return true;
}
From source file:com.cloudera.knittingboar.sgd.TestRunRCV1Subset.java
License:Apache License
public void testSplits() throws IOException {
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(inputDir, job);
    System.out.println("split count: " + splits.length);
    assertEquals(10, splits.length);

    InputSplit[] splits_full = generateDebugSplits(fullRCV1Dir, job);
    System.out.println("full rcv1 split count: " + splits_full.length);

    Text value = new Text();
    for (int x = 0; x < splits_full.length; x++) {
        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits_full[x]);

        // print the first three records of each split
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString());
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString());
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString() + "\n");
    }
}
From source file:com.cloudera.recordbreaker.analyzer.AvroSequenceFileSchemaDescriptor.java
License:Open Source License
void computeSchema() throws IOException {
    try {
        AvroSequenceFile.Reader.Options options = new AvroSequenceFile.Reader.Options()
                .withFileSystem(FSAnalyzer.getInstance().getFS()).withInputPath(dd.getFilename())
                .withConfiguration(new Configuration());
        AvroSequenceFile.Reader in = new AvroSequenceFile.Reader(options);
        try {
            //
            // Look for the Avro metadata keys in the SequenceFile. These encode the Avro schemas.
            //
            SequenceFile.Metadata seqFileMetadata = in.getMetadata();
            TreeMap<Text, Text> kvs = seqFileMetadata.getMetadata();
            Text keySchemaStr = kvs.get(AvroSequenceFile.METADATA_FIELD_KEY_SCHEMA);
            Text valSchemaStr = kvs.get(AvroSequenceFile.METADATA_FIELD_VALUE_SCHEMA);
            Schema keySchema = Schema.parse(keySchemaStr.toString());
            Schema valSchema = Schema.parse(valSchemaStr.toString());

            //
            // Build a "pair record" with "key" and "value" fields to hold the subschemas.
            //
            List<Schema.Field> fieldList = new ArrayList<Schema.Field>();
            fieldList.add(new Schema.Field("key", keySchema, "", null));
            fieldList.add(new Schema.Field("val", valSchema, "", null));
            this.schema = Schema.createRecord(fieldList);
        } finally {
            in.close();
        }
    } catch (IOException iex) {
        // note: parse/read errors are swallowed here, leaving this.schema null on failure
    }
}
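The Schema.parse(String) calls above are deprecated as of Avro 1.5. A minimal sketch of the replacement on the same decoded metadata strings, assuming a newer Avro release is in use:

// Schema.Parser replaces the deprecated static Schema.parse(String).
// A fresh Parser per schema avoids named-type collisions if the key
// and value schemas happen to define types with the same name.
Schema keySchema = new Schema.Parser().parse(keySchemaStr.toString());
Schema valSchema = new Schema.Parser().parse(valSchemaStr.toString());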
From source file:com.cloudera.sa.hbasebulkload.HBASEBulkLoadKeyValueMapper.java
@Override
protected void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
    outputFields = csvParser.parseLine(line.toString());

    hKey.set(outputFields[0].getBytes());
    hPut = new Put(outputFields[0].getBytes());

    for (int i = 1; i < noOfColumns; i++) {
        if (hbaseColumns.length > i && outputFields.length > i && outputFields[i] != null
                && !outputFields[i].trim().isEmpty()) {
            hPut = hPut.addColumn(hbaseColumnFamily.getBytes(), hbaseColumns[i].getBytes(),
                    outputFields[i].getBytes());
        }
    }
    context.write(hKey, hPut);
}
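One caveat in this example: line.toString() always decodes UTF-8, but the String.getBytes() calls re-encode with the JVM's default charset, which can corrupt non-ASCII fields on some platforms. A minimal sketch of a charset-safe variant using HBase's org.apache.hadoop.hbase.util.Bytes helper, whose Bytes.toBytes(String) always encodes UTF-8:

import org.apache.hadoop.hbase.util.Bytes;

// row key and Put, encoded as UTF-8 to match Text's decoding
hKey.set(Bytes.toBytes(outputFields[0]));
hPut = new Put(Bytes.toBytes(outputFields[0]));
// inside the column loop:
hPut.addColumn(Bytes.toBytes(hbaseColumnFamily), Bytes.toBytes(hbaseColumns[i]),
        Bytes.toBytes(outputFields[i]));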
From source file:com.cloudera.sa.securewordcount.TokenizerMapper.java
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    strTokenizer = new StringTokenizer(value.toString(), tokenDelim, true);
    while (strTokenizer.hasMoreElements()) {
        word.set(strTokenizer.nextToken());
        context.write(word, one);
    }
    tokenizerMapperLines.increment(1);
}