List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
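Every example below funnels through this same call: point the JobConf at one or more input paths before asking an InputFormat for splits. As a minimal, self-contained sketch of just that call (the /tmp/in-* paths and the class name are placeholders, not taken from the examples below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // The varargs overload replaces any previously configured
        // input paths with exactly the ones given here.
        FileInputFormat.setInputPaths(job, new Path("/tmp/in-a"), new Path("/tmp/in-b"));

        // addInputPath appends instead of replacing, the usual choice
        // when paths are collected incrementally.
        FileInputFormat.addInputPath(job, new Path("/tmp/in-c"));

        job.setInputFormat(TextInputFormat.class);

        // The old mapred API stores the result under mapred.input.dir.
        System.out.println("mapred.input.dir = " + job.get("mapred.input.dir"));
    }
}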
From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java
License:Apache License
/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    // localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString()
                + ", " + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {
                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }
                count++;
            }
            System.out.println("last: " + value.toString());
            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
        } finally {
            reader.close();
        }
    } // for each split
}
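Why does this test assert two splits when only one was requested? numSplits is only a hint to getSplits(): FileInputFormat caps each split at the filesystem block size, and the generated file (200,000 lines of roughly 300 bytes each, around 60 MB) spans more than one local-filesystem block, so two splits come back. The exact count depends on the local block size printed at the start of the test.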
From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;
}
From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;
}
From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;
}
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java
License:Apache License
@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test

    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;

    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1)
            .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    LOG.info("---- debug splits --------- ");

    rec_factory.Debug();

    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        LOG.info("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);

            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------
            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;

            learningAlgorithm.train(actual, v);

            k++;

            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }

            learningAlgorithm.close();

            count++;
        }

        LOG.info("read: " + count + " records for split " + x);

        total_read += count;
    } // for each split

    LOG.info("total read across all splits: " + total_read);

    rec_factory.Debug();
}
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java
License:Apache License
public void testRecordFactoryOnDatasetShard() throws Exception {

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);

    System.out.println("---- debug splits --------- ");

    rec_factory.Debug();

    int total_read = 0;

    long ts_start = System.currentTimeMillis();

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);
            count++;
            // break;
        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;
    } // for each split

    long ts_total = System.currentTimeMillis() - ts_start;
    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);
    System.out.println("total recs read across all splits: " + total_read);
    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    assertEquals(total_read, 11314);

    rec_factory.Debug();
}
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    // FileInputFormat.setInputPaths(job, workDir);
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    // LongWritable key = new LongWritable();
    // Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;
}
From source file:com.cloudera.knittingboar.sgd.TestRunPOLRMasterAndSingleWorker.java
License:Apache License
@Test
public void testRunSingleWorkerSingleMaster() throws Exception {
    // TODO a test with assertions is not a test

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(configuration);
    // now load the conf stuff into locally used vars
    master.LoadConfigVarsLocally();
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    POLRWorkerDriver worker_model_builder_0 = new POLRWorkerDriver();

    // simulates the conf stuff
    worker_model_builder_0.setConf(configuration);

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    InputSplit[] splits = format.getSplits(job, 1);

    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);

    // TODO: set this up to run through the conf pathways
    worker_model_builder_0.setupInputSplit(custom_reader);

    worker_model_builder_0.LoadConfigVarsLocally();

    worker_model_builder_0.Setup();

    LOG.info("> Feature Size: " + worker_model_builder_0.FeatureVectorSize);
    LOG.info("> Category Size: " + worker_model_builder_0.num_categories);

    for (int x = 0; x < 25; x++) {

        worker_model_builder_0.RunNextTrainingBatch();
        GradientUpdateMessage msg = worker_model_builder_0.GenerateUpdateMessage();

        master.AddIncomingGradientMessageToQueue(msg);
        master.RecvGradientMessage(); // process msg

        master.GenerateGlobalUpdateVector();
        GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
        worker_model_builder_0.ProcessIncomingParameterVectorMessage(returned_msg);

        LOG.info("---------- cycle " + x + " done ------------- ");
    } // for

    worker_model_builder_0.Debug();
}
From source file:com.cloudera.knittingboar.sgd.TestRunPOLRMasterAndTwoWorkers.java
License:Apache License
@Test
public void testRunMasterAndTwoWorkers() throws Exception {
    // TODO a test with assertions is not a test

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(configuration);
    // now load the conf stuff into locally used vars
    master.LoadConfigVarsLocally();
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    POLRWorkerDriver worker_model_builder_0 = new POLRWorkerDriver();
    worker_model_builder_0.internalID = "0";
    // simulates the conf stuff
    worker_model_builder_0.setConf(configuration);

    POLRWorkerDriver worker_model_builder_1 = new POLRWorkerDriver();
    worker_model_builder_1.internalID = "1";
    // simulates the conf stuff
    worker_model_builder_1.setConf(configuration);

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    InputSplit[] splits = format.getSplits(job, 2);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    InputRecordsSplit custom_reader_1 = new InputRecordsSplit(job, splits[1]);

    // TODO: set this up to run through the conf pathways
    worker_model_builder_0.setupInputSplit(custom_reader_0);
    worker_model_builder_0.LoadConfigVarsLocally();
    worker_model_builder_0.Setup();

    worker_model_builder_1.setupInputSplit(custom_reader_1);
    worker_model_builder_1.LoadConfigVarsLocally();
    worker_model_builder_1.Setup();

    LOG.info("> Feature Size: " + worker_model_builder_0.FeatureVectorSize);
    LOG.info("> Category Size: " + worker_model_builder_0.num_categories);

    for (int x = 0; x < 30; x++) {

        // run batch 0
        worker_model_builder_0.RunNextTrainingBatch();
        GradientUpdateMessage msg0 = worker_model_builder_0.GenerateUpdateMessage();

        worker_model_builder_1.RunNextTrainingBatch();
        GradientUpdateMessage msg1 = worker_model_builder_1.GenerateUpdateMessage();

        master.AddIncomingGradientMessageToQueue(msg0);
        master.AddIncomingGradientMessageToQueue(msg1);
        master.RecvGradientMessage(); // process msg
        master.RecvGradientMessage(); // process msg

        master.GenerateGlobalUpdateVector();
        GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
        worker_model_builder_0.ProcessIncomingParameterVectorMessage(returned_msg);
        worker_model_builder_1.ProcessIncomingParameterVectorMessage(returned_msg);

        LOG.info("---------- cycle " + x + " done ------------- ");
    } // for
}
From source file:com.cloudera.knittingboar.sgd.TestRunPOLRWorkerSingleBatch.java
License:Apache License
public InputSplit[] generateDebugSplits(String input_file, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;
}