Usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
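Before the collected examples, here is a minimal sketch of the common call pattern: configure a JobConf, point it at one or more input directories with the varargs overload, and submit. The class name and paths below are placeholders, not taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetInputPathsSketch.class);
        job.setJobName("set-input-paths-sketch");

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // The varargs overload replaces any previously configured inputs
        // with exactly the paths passed here.
        FileInputFormat.setInputPaths(job, new Path("/data/in1"), new Path("/data/in2"));
        FileOutputFormat.setOutputPath(job, new Path("/data/out"));

        JobClient.runJob(job);
    }
}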
From source file:org.apache.whirr.service.yarn.integration.AbstractHadoopServiceTest.java
License:Apache License
@Test
public void test() throws Exception {
    Configuration conf = controller.getConfiguration();
    JobConf job = new JobConf(conf, AbstractHadoopServiceTest.class);
    FileSystem fs = FileSystem.get(conf);

    OutputStream os = fs.create(new Path("input"));
    Writer wr = new OutputStreamWriter(os);
    wr.write("b a\n");
    wr.close();

    job.setMapperClass(TokenCountMapper.class);
    job.setReducerClass(LongSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, new Path("input"));
    FileOutputFormat.setOutputPath(job, new Path("output"));

    JobClient.runJob(job);

    FSDataInputStream in = fs.open(new Path("output/part-00000"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    assertEquals("a\t1", reader.readLine());
    assertEquals("b\t1", reader.readLine());
    assertNull(reader.readLine());
    reader.close();
}
From source file:org.archive.wayback.hadoop.CDXSort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             when there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);
    jobConf.setJobName("cdxsort");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--funky-input".equals(args[i])) {
                funkyInput = true;
            } else if ("--dereference-inputs".equals(args[i])) {
                dereferenceInputs = true;
            } else if ("--canonicalize".equals(args[i])) {
                canonicalize = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }
    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    FileInputStream fis = new FileInputStream(localSplitFile);
    InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
    BufferedReader bis = new BufferedReader(isr);
    // try {
    //     partitioner.loadBoundaries(bis);
    // } catch (IOException except) {
    //     System.err.println("ERROR: Problem loading file " + splitPath);
    //     return printUsage(); // exits
    // }
    // jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //
    // // copy the split file into the FS, add to the DistributedCache:
    // // AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    // AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    // System.err.println("uploaded split file to FS and DistributedCache");
    //
    // // Set job configs:
    // jobConf.setInputFormat(TextInputFormat.class);
    // jobConf.setOutputFormat(TextOutputFormat.class);
    // if (canonicalize) {
    //     jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    // } else {
    //     jobConf.setMapperClass(CDXMapClass.class);
    // }
    // jobConf.setOutputKeyClass(Text.class);
    // jobConf.setOutputValueClass(Text.class);
    // jobConf.set("mapred.textoutputformat.separator", " ");
    // jobConf.setPartitionerClass(AlphaPartitioner.class);

    int inputCount = 0;
    // Set job input:
    if (dereferenceInputs) {
        // SO SLOW... can't add one at a time...
        // FileReader is2 = new FileReader(new File(inputPath));
        // BufferedReader bis2 = new BufferedReader(is2);
        // while (true) {
        //     String line = bis2.readLine();
        //     if (line == null) {
        //         break;
        //     }
        //     FileInputFormat.addInputPath(jobConf, new Path(line));
        //     inputCount++;
        //     System.err.println("Added path(" + inputCount + "): " + line);
        // }

        // PASS 2:
        // FileReader is2 = new FileReader(new File(inputPath));
        // BufferedReader bis2 = new BufferedReader(is2);
        // ArrayList<String> list = new ArrayList<String>();
        //
        // while (true) {
        //     String line = bis2.readLine();
        //     if (line == null) {
        //         break;
        //     }
        //     list.add(line);
        //     inputCount++;
        // }
        // Path arr[] = new Path[list.size()];
        // for (int i = 0; i < list.size(); i++) {
        //     arr[i] = new Path(list.get(i));
        // }
        // FileInputFormat.setInputPaths(jobConf, arr);

        // PASS 3:
        if (funkyInput) {
            jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
        } else {
            jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
        }
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    } else {
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    }

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    }

    // System.out.println("Running on " + cluster.getTaskTrackers()
    //         + " nodes, processing " + inputCount + " files/directories"
    //         + " into " + outputPath + " with "
    //         + partitioner.getNumPartitions() + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
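The commented-out "PASS 2" block above collects paths from a manifest file into a Path[] and hands them to setInputPaths in a single call, instead of calling addInputPath once per line (which the author flags as slow). A minimal standalone sketch of that pattern, assuming a local manifest with one HDFS path per line; the class and method names here are illustrative, not from the source:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class ManifestInputs {
    /** Reads one path per line from a local manifest and sets them all at once. */
    public static void setInputsFromManifest(JobConf job, String manifestFile) throws IOException {
        List<Path> paths = new ArrayList<Path>();
        BufferedReader reader = new BufferedReader(new FileReader(manifestFile));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    paths.add(new Path(line.trim()));
                }
            }
        } finally {
            reader.close();
        }
        // One setInputPaths call replaces any previously configured inputs,
        // avoiding repeated per-path configuration updates.
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
    }
}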
From source file:org.asayler.WikiTitleCount.java
License:Apache License
/**
 * The main driver for the wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException when there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    int num_reducers = 1;

    conf.setJobName("wikititlecount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set Default Reducers */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}
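Note that this example passes a plain String rather than a Path, which selects the setInputPaths(JobConf, String) overload; that variant treats its argument as a comma-separated list of paths. A minimal sketch contrasting the two overloads (the path names are illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputPathOverloads {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // String overload: several inputs in one comma-separated argument.
        FileInputFormat.setInputPaths(conf, "/data/wiki/part-a,/data/wiki/part-b");

        // Equivalent varargs form with explicit Path objects.
        FileInputFormat.setInputPaths(conf,
                new Path("/data/wiki/part-a"), new Path("/data/wiki/part-b"));

        // Inspect what ended up configured on the job.
        for (Path p : FileInputFormat.getInputPaths(conf)) {
            System.out.println(p);
        }
    }
}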
From source file:org.asayler.WikiTitleSort.java
License:Apache License
/**
 * The main driver for the wikititlesort map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException when there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleSort.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    final int num_reducers = 1;

    conf.setJobName("wikititlesort");

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
@Test
public void TestArcInputFormat() throws IOException, InterruptedException {
    for (int i = 0; i < NUM_ITERATIONS; ++i) {
        JobConf job = new JobConf();
        FileSystem fs = LocalFileSystem.get(job);
        Path path = new Path("/tmp/" + File.createTempFile("ARCInputFormat", "test").getName());
        fs.mkdirs(path);

        List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

        FileInputFormat.setInputPaths(job, path);

        ARCFileInputFormat inputFormat = new ARCFileInputFormat();
        InputSplit splits[] = inputFormat.getSplits(job, 0);

        for (InputSplit split : splits) {
            RecordReader<Text, BytesWritable> reader = inputFormat.getRecordReader(split, job, null);
            validateSplit(fs, split, fileList, reader);
        }
        Assert.assertTrue(fileList.size() == 0);

        fs.delete(path, true);
    }
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
@Test
public void TestArcItemInputFormat() throws IOException, InterruptedException {
    for (int i = 0; i < NUM_ITERATIONS; ++i) {
        JobConf job = new JobConf();
        FileSystem fs = LocalFileSystem.get(job);
        Path path = new Path("/tmp/" + File.createTempFile("ARCInputFormat", "test").getName());
        fs.mkdirs(path);

        List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

        FileInputFormat.setInputPaths(job, path);

        ARCFileItemInputFormat inputFormat = new ARCFileItemInputFormat();
        InputSplit splits[] = inputFormat.getSplits(job, 0);

        for (InputSplit split : splits) {
            RecordReader<Text, ArcFileItem> reader = inputFormat.getRecordReader(split, job, null);
            validateArcFileItemSplit(fs, split, fileList, reader);
        }
        Assert.assertTrue(fileList.size() == 0);

        fs.delete(path, true);
    }
}
From source file:org.datavec.hadoop.records.reader.TestBasicHDFS_Integration.java
License:Apache License
/**
 * Generate splits for this run.
 *
 * @param input_path
 * @param job
 * @return
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;
    InputSplit[] splits = null;
    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return splits;
}
From source file:org.deeplearning4j.iterativereduce.irunit.IRUnitDriver.java
License:Apache License
/**
 * Generate splits for this run.
 *
 * @param inputPath
 * @param job
 * @return array of {@link InputSplit}
 */
private InputSplit[] generateDebugSplits(Path inputPath, JobConf job) {
    long block_size = localFs.getDefaultBlockSize(inputPath);
    log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, inputPath);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;
    InputSplit[] splits = null;
    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        log.error("Error loading properties ", e);
    }
    return splits;
}
From source file:org.deeplearning4j.iterativereduce.runtime.irunit.IRUnitDriver.java
License:Apache License
/**
 * Generate splits for this run.
 *
 * @param input_path
 * @param job
 * @return
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
    long block_size = localFs.getDefaultBlockSize();
    log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;
    InputSplit[] splits = null;
    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        log.error("Error with splits", e);
    }
    return splits;
}
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
private Set<ConfigurationTuple> getConfigurationTuples() throws IOException {
    if (confTuples != null)
        return confTuples;

    Path inputPath = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
    FileSystem fs = FileSystem.get(conf);
    FileStatus f = fs.getFileStatus(inputPath);
    //BlockLocation[] bl = fs.getFileBlockLocations(p, 0, f.getLen());
    Set<ConfigurationTuple> configTuples = new HashSet<>();
    int workerId = 0;

    JobConf job = new JobConf(new Configuration());

    job.setInputFormat((Class<? extends InputFormat>) this.inputFormatClass); //TextInputFormat.class);

    FileInputFormat.setInputPaths(job, inputPath);

    InputSplit[] splits = job.getInputFormat().getSplits(job, job.getNumMapTasks());

    for (InputSplit split : splits) {

        FileSplit convertedToMetronomeSplit = new FileSplit();

        org.apache.hadoop.mapred.FileSplit hadoopFileSplit = (org.apache.hadoop.mapred.FileSplit) split;

        if (hadoopFileSplit.getLength() - hadoopFileSplit.getStart() > 0) {
            convertedToMetronomeSplit.setLength(hadoopFileSplit.getLength());
            convertedToMetronomeSplit.setOffset(hadoopFileSplit.getStart());
            convertedToMetronomeSplit.setPath(hadoopFileSplit.getPath().toString());

            StartupConfiguration config = StartupConfiguration.newBuilder().setBatchSize(batchSize)
                    .setIterations(iterationCount).setOther(appConfig).setSplit(convertedToMetronomeSplit)
                    .build();

            String wid = "worker-" + workerId;
            ConfigurationTuple tuple = new ConfigurationTuple(split.getLocations()[0], wid, config);

            configTuples.add(tuple);
            workerId++;

            LOG.info("IR_AM_worker: " + wid + " added split: " + convertedToMetronomeSplit.toString());

        } else {
            LOG.info("IR_AM: Culled out 0 length Split: " + convertedToMetronomeSplit.toString());
        }

    }

    LOG.info("Total Splits/Workers: " + configTuples.size());

    confTuples = configTuples;

    return configTuples;
}