List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
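Before the project examples below, here is a minimal, self-contained sketch of how setInputPaths is typically called with the classic org.apache.hadoop.mapred API. The job name, paths, and class name are hypothetical placeholders (not taken from the examples that follow), and the mapper/reducer are left at Hadoop's identity defaults so the sketch stays short.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("set-input-paths-sketch"); // hypothetical job name

        // Varargs overload: one or more Path objects (hypothetical paths).
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // String overload: a single comma-separated list of paths.
        // Each call to setInputPaths replaces any previously configured input paths.
        FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2");

        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        // No mapper/reducer set: the identity classes are used by default.

        JobClient.runJob(conf);
    }
}

Both overloads appear in the examples below: most pass Path objects, while LocalWordCount and WebLogs pass a comma-separated String taken from the command line.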
From source file: org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java
License: Apache License
public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {
    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class.getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();
    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass(GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);
    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}
From source file: org.pentaho.hadoop.mapreduce.test.TestSubmitMapReduceJob.java
License: Open Source License
@Test
public void submitJob() throws Exception {
    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./test-res/pentaho-mapreduce-sample.jar");
    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Map"));
    conf.setCombinerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));
    conf.setReducerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJarByClass(loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount"));
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName());
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals("Bye\t1\nGoodbye\t1\nHadoop\t2\nHello\t2\nWorld\t2\n", output);
}
From source file: org.pentaho.hadoop.mapreduce.test.TransMapReduceJobTestFIXME.java
License: Open Source License
@Test
public void submitJob() throws Exception {
    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = new TransMeta("./test-res/wordcount-mapper.ktr");
    TransConfiguration transConfig = new TransConfiguration(transMeta, transExecConfig);
    conf.set("transformation-map-xml", transConfig.getXML());

    transMeta = new TransMeta("./test-res/wordcount-reducer.ktr");
    transConfig = new TransConfiguration(transMeta, transExecConfig);
    conf.set("transformation-reduce-xml", transConfig.getXML());

    conf.set("transformation-map-input-stepname", "Injector");
    conf.set("transformation-map-output-stepname", "Output");
    conf.set("transformation-reduce-input-stepname", "Injector");
    conf.set("transformation-reduce-output-stepname", "Output");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");
    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransMap"));
    conf.setCombinerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));
    conf.setReducerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName());
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals(
            "Bye\t4\nGood\t2\nGoodbye\t1\nHadoop\t2\nHello\t5\nThis\t1\nWorld\t5\nand\t1\ncounting\t1\nextra\t1\nfor\t1\nis\t1\nsome\t1\ntext\t1\nwords\t1\n",
            output);
}
From source file: org.pentaho.hadoop.sample.wordcount.WordCount.java
License: Apache License
public static void main(String[] args) throws Exception {
    String hdfsHost = "localhost:9000";
    String jobTrackerHost = "localhost:9001";
    String fsPrefix = "hdfs";
    String dirInput = "/wordcount/input";
    String dirOutput = "/wordcount/output";

    if (args.length == 1 && (args[0].equals("--help") || args[0].equals("-h") || args[0].equals("/?"))) {
        System.out.println("Usage: WordCount <options>");
        System.out.println();
        System.out.println("Options:");
        System.out.println();
        System.out.println("--input=DIR The directory containing the input files for the");
        System.out.println(" WordCount Hadoop job");
        System.out.println("--output=DIR The directory where the results of the WordCount");
        System.out.println(" Hadoop job will be stored");
        System.out.println("--hdfsHost=HOST The host<:port> of the HDFS service");
        System.out.println(" e.g.- localhost:9000");
        System.out.println("--jobTrackerHost=HOST The host<:port> of the job tracker service");
        System.out.println(" e.g.- localhost:9001");
        System.out.println("--fsPrefix=PREFIX The prefix to use for the filesystem");
        System.out.println(" e.g.- hdfs");
        System.out.println();
        System.out.println();
        System.out.println("If an option is not provided through the command prompt the following defaults");
        System.out.println("will be used:");
        System.out.println("--input='/wordcount/input'");
        System.out.println("--output='/wordcount/output'");
        System.out.println("--hdfsHost=localhost:9000");
        System.out.println("--jobTrackerHost=localhost:9001");
        System.out.println("--fsPrefix=hdfs");
    } else {
        if (args.length > 0) {
            for (String arg : args) {
                if (arg.startsWith("--input=")) {
                    dirInput = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--output=")) {
                    dirOutput = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--hdfsHost=")) {
                    hdfsHost = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--jobTrackerHost=")) {
                    jobTrackerHost = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--fsPrefix=")) {
                    fsPrefix = WordCount.getArgValue(arg);
                }
            }
        }

        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("WordCount");

        String hdfsBaseUrl = fsPrefix + "://" + hdfsHost;
        conf.set("fs.default.name", hdfsBaseUrl + "/");
        if (jobTrackerHost != null && jobTrackerHost.length() > 0) {
            conf.set("mapred.job.tracker", jobTrackerHost);
        }

        FileInputFormat.setInputPaths(conf, new Path[] { new Path(hdfsBaseUrl + dirInput) });
        FileOutputFormat.setOutputPath(conf, new Path(hdfsBaseUrl + dirOutput));

        conf.setMapperClass(WordCountMapper.class);
        conf.setReducerClass(WordCountReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        JobClient.runJob(conf);
    }
}
From source file: org.pentaho.hadoop.shim.common.ConfigurationProxy.java
License: Apache License
@Override
public void setInputPaths(org.pentaho.hadoop.shim.api.fs.Path... paths) {
    if (paths == null) {
        return;
    }
    Path[] actualPaths = new Path[paths.length];
    for (int i = 0; i < paths.length; i++) {
        actualPaths[i] = ShimUtils.asPath(paths[i]);
    }
    FileInputFormat.setInputPaths(this, actualPaths);
}
From source file: org.pentaho.weblogs.WebLogs.java
License: Apache License
/**
 * The main driver for the word count map/reduce program. Invoke this method to submit the map/reduce job.
 *
 * @throws IOException
 *           When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WebLogs.class);
    conf.setJobName("wordcount");

    conf.set("debug", "true");
    conf.setWorkingDirectory(new Path("./"));

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // these are set so the job is run in the same
    // JVM as the debugger - we are not submitting
    // to MR Node.
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "local");

    // The mapper, reducer and combiner classes.
    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");
    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });
    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransMap"));
    // conf.setCombinerClass((Class<? extends Reducer>)
    //     loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));
    conf.setReducerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta mapperTransMeta = new TransMeta("./samples/jobs/hadoop/weblogs-mapper.ktr");
    TransConfiguration mapperTransConfig = new TransConfiguration(mapperTransMeta, transExecConfig);
    conf.set("transformation-map-xml", mapperTransConfig.getXML());

    TransMeta reducerTransMeta = new TransMeta("./samples/jobs/hadoop/weblogs-reducer.ktr");
    TransConfiguration reducerTransConfig = new TransConfiguration(reducerTransMeta, transExecConfig);
    conf.set("transformation-reduce-xml", reducerTransConfig.getXML());

    // transformation data interface
    conf.set("transformation-map-input-stepname", "Injector");
    conf.set("transformation-map-output-stepname", "Output");
    conf.set("transformation-reduce-input-stepname", "Injector");
    conf.set("transformation-reduce-output-stepname", "Output");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }

    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file: org.pentaho.wordcount.LocalWordCount.java
License: Open Source License
/**
 * The main driver for the word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 *
 * @throws IOException When there are communication problems with the
 *           job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), LocalWordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (longs)
    conf.setOutputValueClass(LongWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(GenericTransReduce.class);
    conf.setReducerClass(GenericTransReduce.class);

    // these are set so the job is run in the same
    // JVM as the debugger - we are not submitting
    // to MR Node.
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "local");

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }

    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file: org.pooledtimeseries.cartesian.CartesianInputFormat.java
License: Apache License
private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath, int numSplits)
        throws ClassNotFoundException, IOException {
    // Create a new instance of the input format
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass),
            conf);

    // Set the input path for the left data set
    inputFormat.setInputPaths(conf, inputPath);

    // Get the left input splits
    return inputFormat.getSplits(conf, numSplits);
}
From source file: org.sf.xrime.algorithms.BC.BCBackwardStep.java
License: Apache License
@Override
public void execute() throws ProcessorExecutionException {
    try {
        context.setParameter("distance", Integer.toString(dist));

        jobConf = new JobConf(context, BCBackwardStep.class);
        jobConf.setJobName("BC");

        jobConf.setMapperClass(BCBackwardMapper.class);
        jobConf.setReducerClass(BCBackwardReducer.class);

        jobConf.setMapOutputValueClass(LabeledAdjBiSetVertex.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(LabeledAdjBiSetVertex.class);

        // jobConf.setNumMapTasks(getMapperNum());
        jobConf.setNumMapTasks(1);
        // jobConf.setNumReduceTasks(getReducerNum());
        jobConf.setNumReduceTasks(1);

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(jobConf, context.getSource().getPath());
        FileOutputFormat.setOutputPath(jobConf, context.getDestination().getPath());

        this.runningJob = JobClient.runJob(jobConf);

        if (dist > 0) {
            end = false;
        } else {
            end = true;
        }
    } catch (IOException e) {
        throw new ProcessorExecutionException(e);
    } catch (IllegalAccessException e) {
        e.printStackTrace();
    }
}
From source file: org.sf.xrime.algorithms.BC.BCForwardStep.java
License: Apache License
@Override
public void execute() throws ProcessorExecutionException {
    try {
        jobConf = new JobConf(context, BCForwardStep.class);
        jobConf.setJobName("BC");

        jobConf.setMapperClass(BCForwardMapper.class);
        jobConf.setReducerClass(BCForwardReducer.class);

        // jobConf.setNumMapTasks(getMapperNum());
        jobConf.setNumMapTasks(1);
        // jobConf.setNumReduceTasks(getReducerNum());
        jobConf.setNumReduceTasks(1);

        jobConf.setMapOutputValueClass(LabeledAdjBiSetVertex.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(LabeledAdjBiSetVertex.class);

        FileInputFormat.setInputPaths(jobConf, context.getSource().getPath());
        FileOutputFormat.setOutputPath(jobConf, context.getDestination().getPath());

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        jobConf.set(maxDistance, "0");
        jobConf.set(continueFileKey, continueFlagFile());

        this.runningJob = JobClient.runJob(jobConf);

        if (client == null) {
            client = FileSystem.get(jobConf);
        }
        if (client.exists(new Path(continueFlagFile()))) {
            end = false;
            client.delete(new Path(continueFlagFile()), true);
        } else {
            end = true;
        }
    } catch (IOException e) {
        throw new ProcessorExecutionException(e);
    } catch (IllegalAccessException e) {
        e.printStackTrace();
    }
}