Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.FileInputFormat.setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
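
Before the project-specific examples below, here is a minimal, self-contained sketch of a job configuration built around this call. The class name and paths (SetInputPathsSketch, /data/input-a, /data/input-b, /data/output) are illustrative placeholders, not taken from any of the examples.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("set-input-paths-sketch");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // Varargs overload: each Path argument becomes one input of the job.
        FileInputFormat.setInputPaths(conf, new Path("/data/input-a"), new Path("/data/input-b"));

        // String overload used by some of the examples below: a comma-separated list of paths.
        // FileInputFormat.setInputPaths(conf, "/data/input-a,/data/input-b");

        FileOutputFormat.setOutputPath(conf, new Path("/data/output"));

        // Mapper/reducer classes and JobClient.runJob(conf) would follow in a real job.
    }
}

Note that setInputPaths replaces any previously configured input paths, while FileInputFormat.addInputPath appends to them.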

Usage

From source file: org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java

License: Apache License

public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass(GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}

From source file: org.pentaho.hadoop.mapreduce.test.TestSubmitMapReduceJob.java

License: Open Source License

@Test
public void submitJob() throws Exception {

    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./test-res/pentaho-mapreduce-sample.jar");

    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Map"));
    conf.setCombinerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));
    conf.setReducerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJarByClass(loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount"));
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName() + "");
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals("Bye\t1\nGoodbye\t1\nHadoop\t2\nHello\t2\nWorld\t2\n", output);
}

From source file: org.pentaho.hadoop.mapreduce.test.TransMapReduceJobTestFIXME.java

License: Open Source License

@Test
public void submitJob() throws Exception {

    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();
    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();
    TransMeta transMeta = new TransMeta("./test-res/wordcount-mapper.ktr");
    TransConfiguration transConfig = new TransConfiguration(transMeta, transExecConfig);
    conf.set("transformation-map-xml", transConfig.getXML());

    transMeta = new TransMeta("./test-res/wordcount-reducer.ktr");
    transConfig = new TransConfiguration(transMeta, transExecConfig);
    conf.set("transformation-reduce-xml", transConfig.getXML());

    conf.set("transformation-map-input-stepname", "Injector");
    conf.set("transformation-map-output-stepname", "Output");

    conf.set("transformation-reduce-input-stepname", "Injector");
    conf.set("transformation-reduce-output-stepname", "Output");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransMap"));
    conf.setCombinerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));
    conf.setReducerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName() + "");
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals(
            "Bye\t4\nGood\t2\nGoodbye\t1\nHadoop\t2\nHello\t5\nThis\t1\nWorld\t5\nand\t1\ncounting\t1\nextra\t1\nfor\t1\nis\t1\nsome\t1\ntext\t1\nwords\t1\n",
            output);
}

From source file: org.pentaho.hadoop.sample.wordcount.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    String hdfsHost = "localhost:9000";
    String jobTrackerHost = "localhost:9001";
    String fsPrefix = "hdfs";

    String dirInput = "/wordcount/input";
    String dirOutput = "/wordcount/output";

    if (args.length == 1 && (args[0].equals("--help") || args[0].equals("-h") || args[0].equals("/?"))) {
        System.out.println("Usage: WordCount <options>");
        System.out.println();
        System.out.println("Options:");
        System.out.println();
        System.out.println("--input=DIR                   The directory containing the input files for the");
        System.out.println("                              WordCount Hadoop job");
        System.out.println("--output=DIR                  The directory where the results of the WordCount");
        System.out.println("                              Hadoop job will be stored");
        System.out.println("--hdfsHost=HOST               The host<:port> of the HDFS service");
        System.out.println("                              e.g.- localhost:9000");
        System.out.println("--jobTrackerHost=HOST         The host<:port> of the job tracker service");
        System.out.println("                              e.g.- localhost:9001");
        System.out.println("--fsPrefix=PREFIX             The prefix to use for for the filesystem");
        System.out.println("                              e.g.- hdfs");
        System.out.println();
        System.out.println();
        System.out.println("If an option is not provided through the command prompt the following defaults");
        System.out.println("will be used:");
        System.out.println("--input='/wordcount/input'");
        System.out.println("--output='/wordcount/output'");
        System.out.println("--hdfsHost=localhost:9000");
        System.out.println("--jobTrackerHost=localhost:9001");
        System.out.println("--fsPrefix=hdfs");

    } else {
        if (args.length > 0) {
            for (String arg : args) {
                if (arg.startsWith("--input=")) {
                    dirInput = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--output=")) {
                    dirOutput = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--hdfsHost=")) {
                    hdfsHost = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--jobTrackerHost=")) {
                    jobTrackerHost = WordCount.getArgValue(arg);
                } else if (arg.startsWith("--fsPrefix=")) {
                    fsPrefix = WordCount.getArgValue(arg);
                }
            }
        }

        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("WordCount");

        String hdfsBaseUrl = fsPrefix + "://" + hdfsHost;
        conf.set("fs.default.name", hdfsBaseUrl + "/");
        if (jobTrackerHost != null && jobTrackerHost.length() > 0) {
            conf.set("mapred.job.tracker", jobTrackerHost);
        }

        FileInputFormat.setInputPaths(conf, new Path[] { new Path(hdfsBaseUrl + dirInput) });
        FileOutputFormat.setOutputPath(conf, new Path(hdfsBaseUrl + dirOutput));

        conf.setMapperClass(WordCountMapper.class);
        conf.setReducerClass(WordCountReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        JobClient.runJob(conf);
    }
}

From source file: org.pentaho.hadoop.shim.common.ConfigurationProxy.java

License: Apache License

@Override
public void setInputPaths(org.pentaho.hadoop.shim.api.fs.Path... paths) {
    if (paths == null) {
        return;
    }
    Path[] actualPaths = new Path[paths.length];
    for (int i = 0; i < paths.length; i++) {
        actualPaths[i] = ShimUtils.asPath(paths[i]);
    }
    FileInputFormat.setInputPaths(this, actualPaths);
}

From source file: org.pentaho.weblogs.WebLogs.java

License: Apache License

/**
 * The main driver for word count map/reduce program. Invoke this method to submit the map/reduce job.
 *
 * @throws IOException
 *           When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf conf = new JobConf(getConf(), WebLogs.class);
    conf.setJobName("wordcount");
    conf.set("debug", "true");
    conf.setWorkingDirectory(new Path("./"));
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // These are set so the job runs in the same
    // JVM as the debugger - we are not submitting
    // to a remote MapReduce cluster.
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "local");

    // The mapper, reducer and combiner classes.
    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");
    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });
    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransMap"));
    // conf.setCombinerClass((Class<? extends Reducer>)
    // loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));
    conf.setReducerClass(
            (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta mapperTransMeta = new TransMeta("./samples/jobs/hadoop/weblogs-mapper.ktr");
    TransConfiguration mapperTransConfig = new TransConfiguration(mapperTransMeta, transExecConfig);
    conf.set("transformation-map-xml", mapperTransConfig.getXML());

    TransMeta reducerTransMeta = new TransMeta("./samples/jobs/hadoop/weblogs-reducer.ktr");
    TransConfiguration reducerTransConfig = new TransConfiguration(reducerTransMeta, transExecConfig);
    conf.set("transformation-reduce-xml", reducerTransConfig.getXML());

    // transformation data interface
    conf.set("transformation-map-input-stepname", "Injector");
    conf.set("transformation-map-output-stepname", "Output");
    conf.set("transformation-reduce-input-stepname", "Injector");
    conf.set("transformation-reduce-output-stepname", "Output");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file: org.pentaho.wordcount.LocalWordCount.java

License: Open Source License

/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), LocalWordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(LongWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(GenericTransReduce.class);
    conf.setReducerClass(GenericTransReduce.class);

    //  These are set so the job runs in the same
    //  JVM as the debugger - we are not submitting
    //  to a remote MapReduce cluster.
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "local");

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file: org.pooledtimeseries.cartesian.CartesianInputFormat.java

License: Apache License

private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath, int numSplits)
        throws ClassNotFoundException, IOException {
    // Create a new instance of the input format
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass),
            conf);

    // Set the input path for this data set
    inputFormat.setInputPaths(conf, inputPath);

    // Get the input splits for this data set
    return inputFormat.getSplits(conf, numSplits);
}

From source file: org.sf.xrime.algorithms.BC.BCBackwardStep.java

License: Apache License

@Override
public void execute() throws ProcessorExecutionException {
    try {
        context.setParameter("distance", Integer.toString(dist));

        jobConf = new JobConf(context, BCBackwardStep.class);
        jobConf.setJobName("BC");

        jobConf.setMapperClass(BCBackwardMapper.class);
        jobConf.setReducerClass(BCBackwardReducer.class);

        jobConf.setMapOutputValueClass(LabeledAdjBiSetVertex.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(LabeledAdjBiSetVertex.class);

        //jobConf.setNumMapTasks(getMapperNum());       
        jobConf.setNumMapTasks(1);
        //jobConf.setNumReduceTasks(getReducerNum());
        jobConf.setNumReduceTasks(1);

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(jobConf, context.getSource().getPath());
        FileOutputFormat.setOutputPath(jobConf, context.getDestination().getPath());

        this.runningJob = JobClient.runJob(jobConf);

        if (dist > 0) {
            end = false;
        } else {
            end = true;
        }
    } catch (IOException e) {
        throw new ProcessorExecutionException(e);
    } catch (IllegalAccessException e) {
        e.printStackTrace();
    }
}

From source file: org.sf.xrime.algorithms.BC.BCForwardStep.java

License: Apache License

@Override
public void execute() throws ProcessorExecutionException {
    try {
        jobConf = new JobConf(context, BCForwardStep.class);
        jobConf.setJobName("BC");

        jobConf.setMapperClass(BCForwardMapper.class);
        jobConf.setReducerClass(BCForwardReducer.class);

        //jobConf.setNumMapTasks(getMapperNum());       
        jobConf.setNumMapTasks(1);
        //jobConf.setNumReduceTasks(getReducerNum());
        jobConf.setNumReduceTasks(1);

        jobConf.setMapOutputValueClass(LabeledAdjBiSetVertex.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(LabeledAdjBiSetVertex.class);

        FileInputFormat.setInputPaths(jobConf, context.getSource().getPath());
        FileOutputFormat.setOutputPath(jobConf, context.getDestination().getPath());

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        jobConf.set(maxDistance, "0");
        jobConf.set(continueFileKey, continueFlagFile());

        this.runningJob = JobClient.runJob(jobConf);

        if (client == null) {
            client = FileSystem.get(jobConf);
        }
        if (client.exists(new Path(continueFlagFile()))) {
            end = false;
            client.delete(new Path(continueFlagFile()), true);
        } else {
            end = true;
        }
    } catch (IOException e) {
        throw new ProcessorExecutionException(e);
    } catch (IllegalAccessException e) {
        e.printStackTrace();
    }
}