List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
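Before the full jobs below, a minimal standalone sketch of the call itself (the job class and input directories are hypothetical, chosen only to illustrate the varargs overload): setInputPaths replaces any input paths already configured on the JobConf, whereas addInputPath appends one.

// Minimal sketch (hypothetical class and paths) of the varargs overload.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        // Configure several input directories in one call; this replaces any paths set before.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        // Appending a further path afterwards would use addInputPath instead:
        // FileInputFormat.addInputPath(conf, new Path("/data/in3"));
    }
}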
From source file: de.tudarmstadt.lt.n2n.hadoop.FilterByVocabularyJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), FilterByVocabularyJob.class);
    conf.setJobName(FilterByVocabularyJob.class.getSimpleName());

    conf.setMapperClass(FilterByVocabularyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    String word_list_file = conf.get(SHARED_CONSTANTS.PARAM_WORD_LIST);
    if (word_list_file == null)
        throw new MissingArgumentException(
                "Please specify word list with '-Dnlkg.filterbywordsfile=<path-to-file-in-hdfs>'.");
    DistributedCache.addFileToClassPath(new Path(word_list_file), conf);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.FlipJoBims.java
License: Apache License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(FlipJoBims.class);

    /* begin necessary for UKP cluster */
    conf.setMemoryForMapTask(1000L); // 1 GB, necessary for UKP cdh3
    conf.setMemoryForReduceTask(1000L); // 1 GB, necessary for UKP cdh3
    FileOutputFormat.setCompressOutput(conf, true); // compress output
    FileOutputFormat.setOutputCompressorClass(conf,
            org.apache.hadoop.io.compress.BZip2Codec.class); // use the bzip2 codec for compression
    conf.setCompressMapOutput(true); // compress mapper output
    /* end necessary for UKP cluster */

    conf.setJobName(FlipJoBims.class.getSimpleName());
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(FlipJoBims.Map.class);
    conf.setNumReduceTasks(0);
    // conf.setReducerClass(IdentityReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file: de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob.class);
    conf.setJobName(GoogleSyntacticsJob.class.getSimpleName());

    conf.setMapperClass(GoogleSyntacticsJob3Mapper.class);
    conf.setReducerClass(GoogleSyntacticsJob3Reducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(JoBimFormat.class);
    conf.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob2.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob2.class);
    conf.setJobName(GoogleSyntacticsJob2.class.getSimpleName());

    conf.setMapperClass(GoogleSyntacticsJob2Mapper.class);
    conf.setReducerClass(GoogleSyntacticsJob2Reducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // conf.setMapOutputKeyClass(Text.class);
    // conf.setMapOutputValueClass(NullWritable.class);
    conf.setOutputKeyClass(JoBimFormat.class);
    conf.setOutputValueClass(IntWritable.class);

    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob4.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob4.class);
    conf.setJobName(GoogleSyntacticsJob4.class.getSimpleName());

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    conf.setMapperClass(GoogleSyntacticsJob4Mapper.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setMapOutputKeyClass(NullWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(0);
    conf.setCombinerClass(IdentityReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.RelationToOneHoleTransformerJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), RelationToOneHoleTransformerJob.class);
    conf.setJobName(RelationToOneHoleTransformerJob.class.getSimpleName());

    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(RelationToOneHoleTransformerJob.Map.class);
    conf.setNumReduceTasks(0);
    // conf.setReducerClass(IdentityReducer.class);
    // sort or no sort? - that is here the question

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.n2n.hadoop.RemoveExactDuplicatesJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), RemoveExactDuplicatesJob.class);
    conf.setJobName(RemoveExactDuplicatesJob.class.getSimpleName());

    conf.setMapperClass(LineMapper.class);
    conf.setReducerClass(KeyReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.nlkg.ConvertInvertSVO.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ConvertInvertSVO.class);
    conf.setJobName(ConvertInvertSVO.class.getSimpleName());

    conf.setMapperClass(ConversionMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ConvertedWritable.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.lt.nlkg.ConvertSVO.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ConvertSVO.class);
    conf.setJobName(ConvertSVO.class.getSimpleName());

    conf.setMapperClass(ConversionMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ConvertedWritable.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file: de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver.java
License: Apache License

/**
 * Runs the UIMA pipeline.
 *
 * @return 0 if the Hadoop job succeeded, 1 if the job failed, 2 if it was killed, otherwise 3
 *
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println(
                "Usage: " + this.getClass().getSimpleName() + " [hadoop-params] input output [job-params]");
        System.exit(1);
    }

    this.job = new JobConf(getConf(), DkproHadoopDriver.class);
    final FileSystem fs = FileSystem.get(this.job);

    // set the factory class name
    this.job.set("dkpro.uima.factory", this.getClass().getName());

    Path inputPath;
    if (args[0].contains(",")) {
        String[] inputPaths = args[0].split(",");
        inputPath = new Path(inputPaths[0]);
        for (String path : inputPaths)
            FileInputFormat.addInputPath(job, new Path(path));
    } else {
        inputPath = new Path(args[0]); // input
        FileInputFormat.setInputPaths(this.job, inputPath);
    }
    final Path outputPath = new Path(args[1]); // output

    final CollectionReader reader = buildCollectionReader();
    // if a collection reader was defined, import data into hdfs
    // try {
    //     final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
    //     FileOutputFormat.setOutputCompressorClass(this.job, (Class<? extends CompressionCodec>) c);
    // }
    // catch (final Exception e) {
    // }
    if (reader != null) {
        final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                CASWritableSequenceFileWriter.PARAM_COMPRESS, true,
                CASWritableSequenceFileWriter.PARAM_FS, job.get("fs.default.name", "file:/"));
        runPipeline(reader, xcasWriter);
    }

    // cleanup previous output
    fs.delete(outputPath, true);

    // this is a sensible default for the UKP cluster
    int numMappers = 256;
    // if (args.length > 2) {
    //     numMappers = Integer.parseInt(args[2]);
    // }

    FileOutputFormat.setOutputPath(this.job, outputPath);
    // SequenceFileOutputFormat.setCompressOutput(this.job, true);

    if (this.job.get("mapred.output.compress") == null) {
        this.job.setBoolean("mapred.output.compress", true);
    }
    // Just in case compression is on
    this.job.set("mapred.output.compression.type", "BLOCK");

    if (this.job.getBoolean("dkpro.output.plaintext", false)) {
        this.job.setOutputFormat(TextOutputFormat.class);
    } else {
        this.job.setOutputFormat(SequenceFileOutputFormat.class);
    }
    // this.job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); // use compression

    // setup some sensible defaults
    this.job.setMapperClass(this.mapperClass);
    this.job.setReducerClass(this.reducerClass);
    if (getInputFormatClass() != null) {
        this.job.setInputFormat(getInputFormatClass());
    } else {
        this.job.setInputFormat(SequenceFileInputFormat.class);
    }
    // this.job.setOutputFormat(TextOutputFormat.class);
    this.job.setMapOutputKeyClass(Text.class);
    this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setOutputKeyClass(Text.class);
    this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);

    this.job.setJobName(this.getClass().getSimpleName());
    // this.job.set("mapred.child.java.opts", "-Xmx1g");
    this.job.setInt("mapred.job.map.memory.mb", 1280);
    this.job.setInt("mapred.job.reduce.memory.mb", 1280);
    this.job.setNumMapTasks(numMappers);
    this.job.setNumReduceTasks(0);
    configure(this.job);

    // create symlinks for distributed resources
    DistributedCache.createSymlink(this.job);
    // sLogger.info("Running job " + job.getJobName());

    RunningJob runningJob = JobClient.runJob(this.job);
    runningJob.waitForCompletion();
    int status = runningJob.getJobState();
    if (status == JobStatus.SUCCEEDED) {
        return 0;
    } else if (status == JobStatus.FAILED) {
        return 1;
    } else if (status == JobStatus.KILLED) {
        return 2;
    } else {
        return 3;
    }
}
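The last example splits a comma-separated input argument by hand and calls addInputPath in a loop. The old mapred FileInputFormat also offers a String overload, setInputPaths(JobConf, String commaSeparatedPaths), which accepts such a list directly; a minimal sketch follows (the class name and paths are hypothetical, for illustration only).

// Minimal sketch (hypothetical class and paths): passing a comma-separated list of
// input directories via the String overload of setInputPaths instead of splitting by hand.
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CommaSeparatedInputSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf(CommaSeparatedInputSketch.class);
        // Equivalent to splitting on ',' and adding each entry as an input path.
        FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2,/data/in3");
    }
}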