Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

This page collects usage examples of org.apache.hadoop.mapred.FileInputFormat.setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
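
All of the examples on this page follow the same basic pattern: build a JobConf, configure the input/output formats and job classes, then hand one or more Path objects to setInputPaths before submitting the job. The minimal sketch below shows only that call pattern; the class name, the input and output paths, and the map-only identity configuration are illustrative assumptions rather than code taken from any of the source files listed under Usage.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("set-input-paths-sketch");

        // TextInputFormat produces LongWritable byte offsets and Text lines.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Map-only identity job keeps the sketch minimal (no mapper/reducer classes set).
        conf.setNumReduceTasks(0);

        // setInputPaths is varargs: it replaces any previously configured inputs
        // with exactly the paths given here (the paths are hypothetical).
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        JobClient.runJob(conf);
    }
}

Note that setInputPaths replaces whatever inputs are already configured, while FileInputFormat.addInputPath appends to them; the DkproHadoopDriver example below uses addInputPath when it is given a comma-separated list of inputs and setInputPaths for the single-path case.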

Usage

From source file: org.dkpro.bigdata.hadoop.DkproHadoopDriver.java

License: Apache License

/**
 * Runs the UIMA pipeline.
 * 
 * @return 0 if Hadoop job succeeded, 1 if job failed, 2 if it was killed, otherwise 3
 * 
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println(
                "Usage: " + this.getClass().getSimpleName() + " [hadoop-params] input output [job-params]");
        System.exit(1);
    }
    this.job = new JobConf(getConf(), DkproHadoopDriver.class);
    final FileSystem fs = FileSystem.get(this.job);
    // set the factory class name
    this.job.set("dkpro.uima.factory", this.getClass().getName());
    Path inputPath;
    if (args[0].contains(",")) {
        String[] inputPaths = args[0].split(",");
        inputPath = new Path(inputPaths[0]);
        for (String path : inputPaths) {
            FileInputFormat.addInputPath(job, new Path(path));
        }
    } else {
        inputPath = new Path(args[0]); // input
        FileInputFormat.setInputPaths(this.job, inputPath);

    }
    String outDir = args[1];
    if (!getConf().getBoolean("dkpro.output.overwrite", true)) {
        outDir = getUniqueDirectoryName(outDir, fs);
    }
    final Path outputPath = new Path(outDir);// output
    final CollectionReader reader = buildCollectionReader();
    // if a collection reader was defined, import data into hdfs
    // try {
    // final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
    // FileOutputFormat.setOutputCompressorClass(this.job,
    // (Class<? extends CompressionCodec>) c);
    // }
    // catch (final Exception e) {
    //
    // }
    if (reader != null) {
        final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                CASWritableSequenceFileWriter.PARAM_COMPRESS, true, CASWritableSequenceFileWriter.PARAM_FS,
                job.get(("fs.default.name"), "file:/"));
        runPipeline(reader, xcasWriter);
    }
    // cleanup previous output
    fs.delete(outputPath, true);
    // this is a sensible default for the UKP cluster
    //        int numMappers = 256;
    // if (args.length > 2) {
    // numMappers = Integer.parseInt(args[2]);
    // }

    FileOutputFormat.setOutputPath(this.job, outputPath);
    // SequenceFileOutputFormat.setCompressOutput(this.job, true);

    if (this.job.get("mapred.output.compress") == null) {
        this.job.setBoolean("mapred.output.compress", true);
    }
    // Just in case compression is on
    this.job.set("mapred.output.compression.type", "BLOCK");

    if (this.job.getBoolean("dkpro.output.writecas", true)) {
        if (this.job.getBoolean("dkpro.output.plaintext", false)) {
            this.job.setOutputFormat(TextOutputFormat.class);
        } else {
            this.job.setOutputFormat(SequenceFileOutputFormat.class);
        }
    } else {
        job.setOutputFormat(NullOutputFormat.class);
    }
    // this.job.set("mapred.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // use compression
    // setup some sensible defaults
    this.job.setMapperClass(this.mapperClass);
    this.job.setReducerClass(this.reducerClass);
    if (getInputFormatClass() != null) {
        this.job.setInputFormat(getInputFormatClass());
    } else {
        this.job.setInputFormat(SequenceFileInputFormat.class);
    }
    // this.job.setOutputFormat(TextOutputFormat.class);
    this.job.setMapOutputKeyClass(Text.class);
    this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setOutputKeyClass(Text.class);
    this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setJobName(this.getClass().getSimpleName());
    // this.job.set("mapred.child.java.opts", "-Xmx1g");
    //        this.job.setInt("mapred.job.map.memory.mb", 1280);
    //        this.job.setInt("mapred.job.reduce.memory.mb", 1280);
    //        this.job.setNumMapTasks(numMappers);
    this.job.setNumReduceTasks(0);
    configure(this.job);

    // create symlinks for distributed resources
    DistributedCache.createSymlink(this.job);
    // sLogger.info("Running job "+job.getJobName());

    RunningJob runningJob = JobClient.runJob(this.job);
    runningJob.waitForCompletion();
    int status = runningJob.getJobState();
    if (status == JobStatus.SUCCEEDED) {
        return 0;
    } else if (status == JobStatus.FAILED) {
        return 1;
    } else if (status == JobStatus.KILLED) {
        return 2;
    } else {
        return 3;
    }

}

From source file: org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java

License: Apache License

@Parameters
public static Collection<Object[]> configs() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.gibberishDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.gibberishJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}

From source file: org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java

License: Apache License

private JobConf createReadJobConf() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(EsInputFormat.class);
    conf.setOutputFormat(PrintStreamOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);
    conf.setOutputValueClass(MapWritable.class); // mapType computed above is not used in this method
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumReduceTasks(0);

    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(random.nextBoolean()));
    conf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, "true");

    FileInputFormat.setInputPaths(conf, new Path(TestUtils.gibberishDat(conf)));
    return conf;
}

From source file: org.elasticsearch.hadoop.integration.mr.AbstractMROldApiSaveTest.java

License: Apache License

@Parameters
public static Collection<Object[]> configs() {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.sampleArtistsDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.sampleArtistsJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}

From source file: org.elasticsearch.hadoop.integration.mr.AbstractMROldApiSearchTest.java

License: Apache License

private JobConf createJobConf() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(EsInputFormat.class);
    conf.setOutputFormat(PrintStreamOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);
    conf.setOutputValueClass(mapType);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.set(ConfigurationOptions.ES_QUERY, query);
    conf.setNumReduceTasks(0);

    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(readMetadata));
    conf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, String.valueOf(readAsJson));

    QueryTestParams.provisionQueries(conf);
    FileInputFormat.setInputPaths(conf, new Path(TestUtils.sampleArtistsDat()));
    return conf;
}

From source file: org.elasticsearch.hadoop.integration.mr.MROldApiSaveTest.java

License: Apache License

@Test
public void testBasicSave() throws Exception {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(ESOutputFormat.class);
    conf.setMapOutputValueClass(MapWritable.class);
    conf.setMapperClass(JsonMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    FileInputFormat.setInputPaths(conf, new Path("src/test/resources/artists.dat"));
    conf.set("es.resource", "mroldapi/save");

    JobClient.runJob(conf);
}

From source file: org.elasticsearch.hadoop.integration.mr.MROldApiSaveTest.java

License: Apache License

@Test(expected = IllegalArgumentException.class)
public void testIndexAutoCreateDisabled() throws Exception {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(ESOutputFormat.class);
    conf.setMapOutputValueClass(MapWritable.class);
    conf.setMapperClass(JsonMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    FileInputFormat.setInputPaths(conf, new Path("src/test/resources/artists.dat"));
    conf.set(ConfigurationOptions.ES_RESOURCE, "mroldapi/non-existing");
    conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, "no");

    JobClient.runJob(conf);
}

From source file: org.gitreduce.GitReduce.java

License: Apache License

public static void main(String args[]) throws IOException {
    JobConf conf = new JobConf(GitReduce.class);
    conf.setJobName("GitReduce");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    // conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(GitInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
}

From source file: org.pentaho.di.job.entries.hadoopjobexecutor.JobEntryHadoopJobExecutor.java

License: Apache License

public Result execute(Result result, int arg1) throws KettleException {
    result.setNrErrors(0);

    Log4jFileAppender appender = null;
    String logFileName = "pdi-" + this.getName(); //$NON-NLS-1$

    String hadoopDistro = System.getProperty("hadoop.distribution.name", hadoopDistribution);
    hadoopDistro = environmentSubstitute(hadoopDistro);
    if (Const.isEmpty(hadoopDistro)) {
        hadoopDistro = "generic";
    }

    try {
        appender = LogWriter.createFileAppender(logFileName, true, false);
        LogWriter.getInstance().addAppender(appender);
        log.setLogLevel(parentJob.getLogLevel());
    } catch (Exception e) {
        logError(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.FailedToOpenLogFile", logFileName, //$NON-NLS-1$
                e.toString()));
        logError(Const.getStackTracker(e));
    }

    try {
        URL resolvedJarUrl = null;
        String jarUrlS = environmentSubstitute(jarUrl);
        if (jarUrlS.indexOf("://") == -1) {
            // default to file://
            File jarFile = new File(jarUrlS);
            resolvedJarUrl = jarFile.toURI().toURL();
        } else {
            resolvedJarUrl = new URL(jarUrlS);
        }

        final String cmdLineArgsS = environmentSubstitute(cmdLineArgs);

        if (log.isDetailed())
            logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.ResolvedJar",
                    resolvedJarUrl.toExternalForm()));

        if (isSimple) {
            /*      final AtomicInteger taskCount = new AtomicInteger(0);
                  final AtomicInteger successCount = new AtomicInteger(0);
                  final AtomicInteger failedCount = new AtomicInteger(0); */

            if (log.isDetailed())
                logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.SimpleMode"));
            List<Class<?>> classesWithMains = JarUtility
                    .getClassesInJarWithMain(resolvedJarUrl.toExternalForm(), getClass().getClassLoader());
            for (final Class<?> clazz : classesWithMains) {
                Runnable r = new Runnable() {
                    public void run() {
                        try {
                            final ClassLoader cl = Thread.currentThread().getContextClassLoader();
                            try {
                                //                  taskCount.incrementAndGet();
                                Thread.currentThread().setContextClassLoader(clazz.getClassLoader());
                                Method mainMethod = clazz.getMethod("main", new Class[] { String[].class });
                                Object[] args = (cmdLineArgsS != null)
                                        ? new Object[] { cmdLineArgsS.split(" ") }
                                        : new Object[0];
                                mainMethod.invoke(null, args);
                            } finally {
                                Thread.currentThread().setContextClassLoader(cl);
                                //                  successCount.incrementAndGet();
                                //                  taskCount.decrementAndGet();
                            }
                        } catch (Throwable ignored) {
                            // skip, try the next one
                            //                logError(ignored.getMessage());
                            //                failedCount.incrementAndGet();
                            ignored.printStackTrace();
                        }
                    }
                };
                Thread t = new Thread(r);
                t.start();
            }

            // uncomment to implement blocking
            /* if (blocking) {
              while (taskCount.get() > 0 && !parentJob.isStopped()) {
                Thread.sleep(1000);
              }
                    
              if (!parentJob.isStopped()) {
                result.setResult(successCount.get() > 0);
                result.setNrErrors((successCount.get() > 0) ? 0 : 1);
              } else {
                // we can't really know at this stage if 
                // the hadoop job will finish successfully 
                // because we have to stop now
                result.setResult(true); // look on the bright side of life :-)...
                result.setNrErrors(0);
              }
            } else { */
            // non-blocking - just set success equal to no failures arising
            // from invocation
            //          result.setResult(failedCount.get() == 0);
            //          result.setNrErrors(failedCount.get());
            result.setResult(true);
            result.setNrErrors(0);
            /* } */
        } else {
            if (log.isDetailed())
                logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.AdvancedMode"));

            URL[] urls = new URL[] { resolvedJarUrl };
            URLClassLoader loader = new URLClassLoader(urls, getClass().getClassLoader());

            JobConf conf = new JobConf();
            String hadoopJobNameS = environmentSubstitute(hadoopJobName);
            conf.setJobName(hadoopJobNameS);

            String outputKeyClassS = environmentSubstitute(outputKeyClass);
            conf.setOutputKeyClass(loader.loadClass(outputKeyClassS));
            String outputValueClassS = environmentSubstitute(outputValueClass);
            conf.setOutputValueClass(loader.loadClass(outputValueClassS));

            if (mapperClass != null) {
                String mapperClassS = environmentSubstitute(mapperClass);
                Class<? extends Mapper> mapper = (Class<? extends Mapper>) loader.loadClass(mapperClassS);
                conf.setMapperClass(mapper);
            }
            if (combinerClass != null) {
                String combinerClassS = environmentSubstitute(combinerClass);
                Class<? extends Reducer> combiner = (Class<? extends Reducer>) loader.loadClass(combinerClassS);
                conf.setCombinerClass(combiner);
            }
            if (reducerClass != null) {
                String reducerClassS = environmentSubstitute(reducerClass);
                Class<? extends Reducer> reducer = (Class<? extends Reducer>) loader.loadClass(reducerClassS);
                conf.setReducerClass(reducer);
            }

            if (inputFormatClass != null) {
                String inputFormatClassS = environmentSubstitute(inputFormatClass);
                Class<? extends InputFormat> inputFormat = (Class<? extends InputFormat>) loader
                        .loadClass(inputFormatClassS);
                conf.setInputFormat(inputFormat);
            }
            if (outputFormatClass != null) {
                String outputFormatClassS = environmentSubstitute(outputFormatClass);
                Class<? extends OutputFormat> outputFormat = (Class<? extends OutputFormat>) loader
                        .loadClass(outputFormatClassS);
                conf.setOutputFormat(outputFormat);
            }

            String hdfsHostnameS = environmentSubstitute(hdfsHostname);
            String hdfsPortS = environmentSubstitute(hdfsPort);
            String jobTrackerHostnameS = environmentSubstitute(jobTrackerHostname);
            String jobTrackerPortS = environmentSubstitute(jobTrackerPort);

            // See if we can auto detect the distribution first
            HadoopConfigurer configurer = HadoopConfigurerFactory.locateConfigurer();

            if (configurer == null) {
                // go with what has been selected by the user
                configurer = HadoopConfigurerFactory.getConfigurer(hadoopDistro);

                // if the user-specified distribution is detectable, make sure it is still
                // the current distribution!
                if (configurer != null && configurer.isDetectable()) {
                    if (!configurer.isAvailable()) {
                        throw new KettleException(BaseMessages.getString(PKG,
                                "JobEntryHadoopJobExecutor.Error.DistroNoLongerPresent",
                                configurer.distributionName()));
                    }
                }
            }
            if (configurer == null) {
                throw new KettleException(BaseMessages.getString(PKG,
                        "JobEntryHadoopJobExecutor.Error.UnknownHadoopDistribution", hadoopDistro));
            }
            logBasic(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.Message.DistroConfigMessage",
                    configurer.distributionName()));

            List<String> configMessages = new ArrayList<String>();
            configurer.configure(hdfsHostnameS, hdfsPortS, jobTrackerHostnameS, jobTrackerPortS, conf,
                    configMessages);
            for (String m : configMessages) {
                logBasic(m);
            }

            String inputPathS = environmentSubstitute(inputPath);
            String[] inputPathParts = inputPathS.split(",");
            List<Path> paths = new ArrayList<Path>();
            for (String path : inputPathParts) {
                paths.add(new Path(configurer.getFilesystemURL() + path));
            }
            Path[] finalPaths = paths.toArray(new Path[paths.size()]);

            //FileInputFormat.setInputPaths(conf, new Path(configurer.getFilesystemURL() + inputPathS));
            FileInputFormat.setInputPaths(conf, finalPaths);
            String outputPathS = environmentSubstitute(outputPath);
            FileOutputFormat.setOutputPath(conf, new Path(configurer.getFilesystemURL() + outputPathS));

            // process user defined values
            for (UserDefinedItem item : userDefined) {
                if (item.getName() != null && !"".equals(item.getName()) && item.getValue() != null
                        && !"".equals(item.getValue())) {
                    String nameS = environmentSubstitute(item.getName());
                    String valueS = environmentSubstitute(item.getValue());
                    conf.set(nameS, valueS);
                }
            }

            String workingDirectoryS = environmentSubstitute(workingDirectory);
            conf.setWorkingDirectory(new Path(configurer.getFilesystemURL() + workingDirectoryS));
            conf.setJar(jarUrl);

            String numMapTasksS = environmentSubstitute(numMapTasks);
            String numReduceTasksS = environmentSubstitute(numReduceTasks);
            int numM = 1;
            try {
                numM = Integer.parseInt(numMapTasksS);
            } catch (NumberFormatException e) {
                logError("Can't parse number of map tasks '" + numMapTasksS + "'. Setting num"
                        + "map tasks to 1");
            }
            int numR = 1;
            try {
                numR = Integer.parseInt(numReduceTasksS);
            } catch (NumberFormatException e) {
                logError("Can't parse number of reduce tasks '" + numReduceTasksS + "'. Setting num"
                        + "reduce tasks to 1");
            }

            conf.setNumMapTasks(numM);
            conf.setNumReduceTasks(numR);

            JobClient jobClient = new JobClient(conf);
            RunningJob runningJob = jobClient.submitJob(conf);

            String loggingIntervalS = environmentSubstitute(loggingInterval);
            int logIntv = 60;
            try {
                logIntv = Integer.parseInt(loggingIntervalS);
            } catch (NumberFormatException e) {
                logError("Can't parse logging interval '" + loggingIntervalS + "'. Setting "
                        + "logging interval to 60");
            }
            if (blocking) {
                try {
                    int taskCompletionEventIndex = 0;
                    while (!parentJob.isStopped() && !runningJob.isComplete()) {
                        if (logIntv >= 1) {
                            printJobStatus(runningJob);
                            taskCompletionEventIndex = logTaskMessages(runningJob, taskCompletionEventIndex);
                            Thread.sleep(logIntv * 1000);
                        } else {
                            Thread.sleep(60000);
                        }
                    }

                    if (parentJob.isStopped() && !runningJob.isComplete()) {
                        // We must stop the job running on Hadoop
                        runningJob.killJob();
                        // Indicate this job entry did not complete
                        result.setResult(false);
                    }

                    printJobStatus(runningJob);
                    // Log any messages we may have missed while polling
                    logTaskMessages(runningJob, taskCompletionEventIndex);
                } catch (InterruptedException ie) {
                    logError(ie.getMessage(), ie);
                }

                // Entry is successful if the MR job is successful overall
                result.setResult(runningJob.isSuccessful());
            }

        }
    } catch (Throwable t) {
        t.printStackTrace();
        result.setStopped(true);
        result.setNrErrors(1);
        result.setResult(false);
        logError(t.getMessage(), t);
    }

    if (appender != null) {
        LogWriter.getInstance().removeAppender(appender);
        appender.close();

        ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(),
                parentJob.getJobname(), getName());
        result.getResultFiles().put(resultFile.getFile().toString(), resultFile);
    }

    return result;
}

From source file: org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java

License: Open Source License

public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass((Class<? extends Reducer>) GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}