Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.FileInputFormat#setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
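
Before the project-specific examples below, here is a minimal sketch (not taken from any of the quoted sources) of how the method is typically called against the old mapred API; the class name SetInputPathsSketch and the /data/... paths are placeholders. Note that setInputPaths replaces any input paths already configured on the JobConf, whereas FileInputFormat.addInputPath appends to them.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsSketch {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();

        // Varargs overload: one or more Paths; any input paths already set
        // on this JobConf are replaced, not appended.
        FileInputFormat.setInputPaths(job, new Path("/data/part-0.txt"), new Path("/data/part-1.txt"));

        // An input format configured from the same JobConf then computes
        // its splits over exactly those paths.
        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        InputSplit[] splits = format.getSplits(job, 1);

        System.out.println("got " + splits.length + " split(s)");
    }
}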

Usage

From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException 
 * 
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }

                count++;
            }

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            reader.close();
        }

    } // for each split

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java

License:Apache License

@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test
    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);

    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1)
            .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);
    LOG.info("---- debug splits --------- ");
    rec_factory.Debug();
    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        LOG.info("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);

            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------

            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
            learningAlgorithm.train(actual, v);
            k++;
            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }

            learningAlgorithm.close();
            count++;
        }

        LOG.info("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java

License:Apache License

public void testRecordFactoryOnDatasetShard() throws Exception {

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    //rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    rec_factory.Debug();

    int total_read = 0;

    long ts_start = System.currentTimeMillis();

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);

            count++;
            //break;

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    long ts_total = System.currentTimeMillis() - ts_start;

    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);

    System.out.println("total recs read across all splits: " + total_read);

    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    assertEquals(11314, total_read);

    rec_factory.Debug();

}

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    //FileInputFormat.setInputPaths(job, workDir);
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    //LongWritable key = new LongWritable();
    //Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.sgd.TestRunPOLRMasterAndSingleWorker.java

License:Apache License

@Test
public void testRunSingleWorkerSingleMaster() throws Exception {
    // TODO a test with assertions is not a test
    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(configuration);
    // now load the conf stuff into locally used vars
    master.LoadConfigVarsLocally();
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    POLRWorkerDriver worker_model_builder_0 = new POLRWorkerDriver();

    // simulates the conf stuff
    worker_model_builder_0.setConf(configuration);

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");
    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);
    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    InputSplit[] splits = format.getSplits(job, 1);

    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);

    // TODO: set this up to run through the conf pathways
    worker_model_builder_0.setupInputSplit(custom_reader);

    worker_model_builder_0.LoadConfigVarsLocally();

    worker_model_builder_0.Setup();

    LOG.info("> Feature Size: " + worker_model_builder_0.FeatureVectorSize);
    LOG.info("> Category Size: " + worker_model_builder_0.num_categories);

    for (int x = 0; x < 25; x++) {

        worker_model_builder_0.RunNextTrainingBatch();

        GradientUpdateMessage msg = worker_model_builder_0.GenerateUpdateMessage();
        master.AddIncomingGradientMessageToQueue(msg);
        master.RecvGradientMessage(); // process msg
        master.GenerateGlobalUpdateVector();
        GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
        worker_model_builder_0.ProcessIncomingParameterVectorMessage(returned_msg);
        LOG.info("---------- cycle " + x + " done ------------- ");
    } // for

    worker_model_builder_0.Debug();
}

From source file:com.cloudera.knittingboar.sgd.TestRunPOLRMasterAndTwoWorkers.java

License:Apache License

@Test
public void testRunMasterAndTwoWorkers() throws Exception {
    // TODO a test with assertions is not a test
    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(configuration);
    // now load the conf stuff into locally used vars
    master.LoadConfigVarsLocally();
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------    

    POLRWorkerDriver worker_model_builder_0 = new POLRWorkerDriver();
    worker_model_builder_0.internalID = "0";
    // simulates the conf stuff
    worker_model_builder_0.setConf(configuration);

    POLRWorkerDriver worker_model_builder_1 = new POLRWorkerDriver();
    worker_model_builder_1.internalID = "1";
    // simulates the conf stuff
    worker_model_builder_1.setConf(configuration);

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");
    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);
    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    InputSplit[] splits = format.getSplits(job, 2);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    InputRecordsSplit custom_reader_1 = new InputRecordsSplit(job, splits[1]);

    // TODO: set this up to run through the conf pathways
    worker_model_builder_0.setupInputSplit(custom_reader_0);
    worker_model_builder_0.LoadConfigVarsLocally();
    worker_model_builder_0.Setup();

    worker_model_builder_1.setupInputSplit(custom_reader_1);
    worker_model_builder_1.LoadConfigVarsLocally();
    worker_model_builder_1.Setup();

    LOG.info("> Feature Size: " + worker_model_builder_0.FeatureVectorSize);
    LOG.info("> Category Size: " + worker_model_builder_0.num_categories);

    for (int x = 0; x < 30; x++) {

        // run batch 0
        worker_model_builder_0.RunNextTrainingBatch();
        GradientUpdateMessage msg0 = worker_model_builder_0.GenerateUpdateMessage();

        worker_model_builder_1.RunNextTrainingBatch();
        GradientUpdateMessage msg1 = worker_model_builder_1.GenerateUpdateMessage();

        master.AddIncomingGradientMessageToQueue(msg0);
        master.AddIncomingGradientMessageToQueue(msg1);
        master.RecvGradientMessage(); // process msg
        master.RecvGradientMessage(); // process msg

        master.GenerateGlobalUpdateVector();

        GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
        worker_model_builder_0.ProcessIncomingParameterVectorMessage(returned_msg);

        worker_model_builder_1.ProcessIncomingParameterVectorMessage(returned_msg);

        LOG.info("---------- cycle " + x + " done ------------- ");

    } // for

}

From source file:com.cloudera.knittingboar.sgd.TestRunPOLRWorkerSingleBatch.java

License:Apache License

public InputSplit[] generateDebugSplits(String input_file, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}