Usage examples for org.apache.hadoop.mapreduce.Job#setJobName
public void setJobName(String name) throws IllegalStateException
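A minimal sketch of the basic call, assuming only a standard Hadoop client classpath; the class name JobNameExample and the job name string are illustrative, not taken from the examples below. setJobName may only be called while the job is still being defined; on a submitted job it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class JobNameExample {
    public static void main(String[] args) throws Exception {
        // create a job with a fresh configuration (illustrative setup)
        Job job = Job.getInstance(new Configuration());
        // legal only while the job is in the DEFINE state;
        // after submission this call throws IllegalStateException
        job.setJobName("job-name-example");
        System.out.println(job.getJobName());
    }
}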
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    // mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    // reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step3NearDupTuplesCreation.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step3NearDupTuplesCreation.class);
    job.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());

    // mapper
    job.setMapperClass(CreateTuplesMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TreeSet.class);

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // must be set or the mapper won't be called
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step4LocalDeDuplication.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step4LocalDeDuplication.class);
    job.setJobName(Phase3Step4LocalDeDuplication.class.getName());

    // paths
    String inputPath = args[0];
    // text files of ids to be deleted
    String outputPath = args[1];

    // input: reading max N lines for each mapper
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, new Path(inputPath));
    job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", LINES);

    // mapper
    job.setMapperClass(LocalGreedyDeDuplicationMapper.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // reducer
    job.setReducerClass(IDCollectorReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase4RemoveDuplicatesUsingReduceSideJoins.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text files of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    // second input: the look-up text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class, JoinTextMapper.class);
    // first input: the data set (check comma-separated availability)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class,
            JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.ContentTypeAndSizeDistribution.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(ContentTypeAndSizeDistribution.class);
    job.setJobName(ContentTypeAndSizeDistribution.class.getName());

    // mapper
    job.setMapperClass(ContentAndSizeMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // reducer
    // job.setReducerClass(DistributionReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.TopDomainCounter.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // pass the parsed configuration on to the job
    Job job = Job.getInstance(conf);
    job.setJarByClass(TopDomainCounter.class);
    job.setJobName(TopDomainCounter.class.getName());

    // mapper
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.vocabulary.WordDistributionStatisticsCollector.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());

    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // reducer
    job.setReducerClass(SumReducer.class);

    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.WARCRecordCounter.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    System.out.println("Other args: " + Arrays.toString(otherArgs));

    Job job = Job.getInstance(conf);
    job.setJarByClass(WARCRecordCounter.class);
    job.setJobName(WARCRecordCounter.class.getName());

    // mapper
    job.setMapperClass(ResponseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // combiner + reducer
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.PagesByURLExtractor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }

    job.setJarByClass(PagesByURLExtractor.class);
    job.setJobName(PagesByURLExtractor.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // input
    job.setInputFormatClass(WARCInputFormat.class);

    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    // load IDs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_URLS, loadURLs(args[2]));

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.TextToSentencesSplitter.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf);
    job.setJarByClass(TextToSentencesSplitter.class);
    job.setJobName(TextToSentencesSplitter.class.getName());

    // mapper
    job.setMapperClass(TextToSentencesSplitter.MapperClass.class);
    job.setInputFormatClass(WARCInputFormat.class);

    // reducer
    job.setReducerClass(ReducerClass.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}