List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
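Before the full jobs below, a minimal standalone sketch of the call itself (the job class and input directories are hypothetical, chosen only to illustrate the varargs overload): setInputPaths replaces any input paths already configured on the JobConf, whereas addInputPath appends one.

// Minimal sketch (hypothetical class and paths) of the varargs overload.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        // Configure several input directories in one call; this replaces any paths set before.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        // Appending a further path afterwards would use addInputPath instead:
        // FileInputFormat.addInputPath(conf, new Path("/data/in3"));
    }
}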
From source file: de.tudarmstadt.lt.n2n.hadoop.FilterByVocabularyJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), FilterByVocabularyJob.class);
    conf.setJobName(FilterByVocabularyJob.class.getSimpleName());

    conf.setMapperClass(FilterByVocabularyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    String word_list_file = conf.get(SHARED_CONSTANTS.PARAM_WORD_LIST);
    if (word_list_file == null)
        throw new MissingArgumentException(
                "Please specify word list with '-Dnlkg.filterbywordsfile=<path-to-file-in-hdfs>'.");
    DistributedCache.addFileToClassPath(new Path(word_list_file), conf);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.FlipJoBims.java
License: Apache License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(FlipJoBims.class);

    /* begin necessary for UKP cluster */
    conf.setMemoryForMapTask(1000L); // 1 GB, necessary for UKP cdh3
    conf.setMemoryForReduceTask(1000L); // 1 GB, necessary for UKP cdh3
    FileOutputFormat.setCompressOutput(conf, true); // compress output
    FileOutputFormat.setOutputCompressorClass(conf,
            org.apache.hadoop.io.compress.BZip2Codec.class); // use the bzip2 codec for compression
    conf.setCompressMapOutput(true); // compress mapper output
    /* end necessary for UKP cluster */

    conf.setJobName(FlipJoBims.class.getSimpleName());
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(FlipJoBims.Map.class);
    conf.setNumReduceTasks(0);
    // conf.setReducerClass(IdentityReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file: de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob.class);
    conf.setJobName(GoogleSyntacticsJob.class.getSimpleName());

    conf.setMapperClass(GoogleSyntacticsJob3Mapper.class);
    conf.setReducerClass(GoogleSyntacticsJob3Reducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(JoBimFormat.class);
    conf.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob2.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob2.class);
    conf.setJobName(GoogleSyntacticsJob2.class.getSimpleName());

    conf.setMapperClass(GoogleSyntacticsJob2Mapper.class);
    conf.setReducerClass(GoogleSyntacticsJob2Reducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // conf.setMapOutputKeyClass(Text.class);
    // conf.setMapOutputValueClass(NullWritable.class);
    conf.setOutputKeyClass(JoBimFormat.class);
    conf.setOutputValueClass(IntWritable.class);

    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob4.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob4.class);
    conf.setJobName(GoogleSyntacticsJob4.class.getSimpleName());

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    conf.setMapperClass(GoogleSyntacticsJob4Mapper.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setMapOutputKeyClass(NullWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(0);
    conf.setCombinerClass(IdentityReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.RelationToOneHoleTransformerJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), RelationToOneHoleTransformerJob.class);
    conf.setJobName(RelationToOneHoleTransformerJob.class.getSimpleName());

    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(RelationToOneHoleTransformerJob.Map.class);
    conf.setNumReduceTasks(0);
    // conf.setReducerClass(IdentityReducer.class);
    // sort or no sort? - that is here the question

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.RemoveExactDuplicatesJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), RemoveExactDuplicatesJob.class);
    conf.setJobName(RemoveExactDuplicatesJob.class.getSimpleName());

    conf.setMapperClass(LineMapper.class);
    conf.setReducerClass(KeyReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.nlkg.ConvertInvertSVO.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ConvertInvertSVO.class);
    conf.setJobName(ConvertInvertSVO.class.getSimpleName());

    conf.setMapperClass(ConversionMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ConvertedWritable.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.nlkg.ConvertSVO.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ConvertSVO.class);
    conf.setJobName(ConvertSVO.class.getSimpleName());

    conf.setMapperClass(ConversionMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ConvertedWritable.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver.java
License: Apache License

/**
 * Runs the UIMA pipeline.
 *
 * @return 0 if the Hadoop job succeeded, 1 if the job failed, 2 if it was killed, otherwise 3
 *
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println(
                "Usage: " + this.getClass().getSimpleName() + " [hadoop-params] input output [job-params]");
        System.exit(1);
    }

    this.job = new JobConf(getConf(), DkproHadoopDriver.class);
    final FileSystem fs = FileSystem.get(this.job);

    // set the factory class name
    this.job.set("dkpro.uima.factory", this.getClass().getName());

    Path inputPath;
    if (args[0].contains(",")) {
        String[] inputPaths = args[0].split(",");
        inputPath = new Path(inputPaths[0]);
        for (String path : inputPaths)
            FileInputFormat.addInputPath(job, new Path(path));
    } else {
        inputPath = new Path(args[0]); // input
        FileInputFormat.setInputPaths(this.job, inputPath);
    }
    final Path outputPath = new Path(args[1]); // output

    final CollectionReader reader = buildCollectionReader();
    // if a collection reader was defined, import data into hdfs
    // try {
    //     final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
    //     FileOutputFormat.setOutputCompressorClass(this.job, (Class<? extends CompressionCodec>) c);
    // }
    // catch (final Exception e) {
    // }
    if (reader != null) {
        final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                CASWritableSequenceFileWriter.PARAM_COMPRESS, true,
                CASWritableSequenceFileWriter.PARAM_FS, job.get("fs.default.name", "file:/"));
        runPipeline(reader, xcasWriter);
    }

    // cleanup previous output
    fs.delete(outputPath, true);

    // this is a sensible default for the UKP cluster
    int numMappers = 256;
    // if (args.length > 2) {
    //     numMappers = Integer.parseInt(args[2]);
    // }

    FileOutputFormat.setOutputPath(this.job, outputPath);
    // SequenceFileOutputFormat.setCompressOutput(this.job, true);

    if (this.job.get("mapred.output.compress") == null) {
        this.job.setBoolean("mapred.output.compress", true);
    }
    // Just in case compression is on
    this.job.set("mapred.output.compression.type", "BLOCK");

    if (this.job.getBoolean("dkpro.output.plaintext", false)) {
        this.job.setOutputFormat(TextOutputFormat.class);
    } else {
        this.job.setOutputFormat(SequenceFileOutputFormat.class);
    }
    // this.job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); // use compression

    // setup some sensible defaults
    this.job.setMapperClass(this.mapperClass);
    this.job.setReducerClass(this.reducerClass);
    if (getInputFormatClass() != null) {
        this.job.setInputFormat(getInputFormatClass());
    } else {
        this.job.setInputFormat(SequenceFileInputFormat.class);
    }
    // this.job.setOutputFormat(TextOutputFormat.class);
    this.job.setMapOutputKeyClass(Text.class);
    this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setOutputKeyClass(Text.class);
    this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);

    this.job.setJobName(this.getClass().getSimpleName());
    // this.job.set("mapred.child.java.opts", "-Xmx1g");
    this.job.setInt("mapred.job.map.memory.mb", 1280);
    this.job.setInt("mapred.job.reduce.memory.mb", 1280);
    this.job.setNumMapTasks(numMappers);
    this.job.setNumReduceTasks(0);
    configure(this.job);

    // create symlinks for distributed resources
    DistributedCache.createSymlink(this.job);
    // sLogger.info("Running job " + job.getJobName());

    RunningJob runningJob = JobClient.runJob(this.job);
    runningJob.waitForCompletion();
    int status = runningJob.getJobState();
    if (status == JobStatus.SUCCEEDED) {
        return 0;
    } else if (status == JobStatus.FAILED) {
        return 1;
    } else if (status == JobStatus.KILLED) {
        return 2;
    } else {
        return 3;
    }
}
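The last example splits a comma-separated input argument by hand and calls addInputPath in a loop. The old mapred FileInputFormat also offers a String overload, setInputPaths(JobConf, String commaSeparatedPaths), which accepts such a list directly; a minimal sketch follows (the class name and paths are hypothetical, for illustration only).

// Minimal sketch (hypothetical class and paths): passing a comma-separated list of
// input directories via the String overload of setInputPaths instead of splitting by hand.
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CommaSeparatedInputSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf(CommaSeparatedInputSketch.class);
        // Equivalent to splitting on ',' and adding each entry as an input path.
        FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2,/data/in3");
    }
}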