List of usage examples for org.apache.hadoop.mapreduce Job waitForCompletion
public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException
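waitForCompletion submits the job to the cluster if it has not been submitted yet, optionally prints progress to the client while the job runs, and returns true only if the job completed successfully. Before the examples from real projects below, here is a minimal sketch of a typical driver ending in waitForCompletion; WordCountMapper and WordCountReducer are assumed placeholder classes, and the rest uses only the standard Hadoop MapReduce API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Minimal driver sketch: configure a job, then block on waitForCompletion.
// WordCountMapper and WordCountReducer are hypothetical classes used for illustration.
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // true enables progress reporting on the client; the call blocks until the job finishes.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}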
From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java
License:Apache License
public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: urlresolution <url-map> <url-register> <out>");
        System.exit(2);
    }
    JobConf job = new JobConf(conf);
    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[2]), true);

    MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class);
    MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class);

    job.setJarByClass(HadoopUrlResolution.class);
    job.setPartitionerClass(KeyPartitioner.class);
    job.setOutputValueGroupingComparator(GroupingComparator.class);
    job.setMapOutputKeyClass(UrlRegJoinUrlMap.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    Job j = new Job(job);
    j.setReducerClass(Reduce.class);
    j.waitForCompletion(true);
}
From source file:com.datasalt.pangool.benchmark.wordcount.HadoopWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    //conf.setBoolean("hadoop.security.authorization", false);
    //conf.set("hadoop.security.authentication","simple");
    Job job = new Job(conf, "word count");
    job.setJarByClass(HadoopWordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    HadoopUtils.deleteIfExists(FileSystem.get(conf), new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    job.waitForCompletion(true);
}
From source file:com.datasalt.pangool.examples.avro.AvroTweetsJoin.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    delete(outputPath.toString());

    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("retweet", new OrderBy().add("username", Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
        NullWritable.class);
    mr.setTupleReducer(new Red());

    try {
        Job job = mr.createJob();
        job.waitForCompletion(true);
    } finally {
        mr.cleanUpInstanceFiles();
    }
    return 0;
}
From source file:com.datasalt.pangool.examples.Grep.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    String regex = args[0];
    String input = args[1];
    String output = args[2];
    delete(output);

    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new GrepHandler(regex));

    Job job = b.createJob();
    try {
        job.waitForCompletion(true);
    } finally {
        b.cleanUpInstanceFiles();
    }
    return 0;
}
From source file:com.datasalt.pangool.flow.PangoolJob.java
License:Apache License
/**
 * Convenience method that can be used by Jobs for executing Pangool's {@link CoGrouper} instances.
 */
public int executeCoGrouper(TupleMRBuilder coGrouper)
        throws IOException, TupleMRException, InterruptedException, ClassNotFoundException {
    Job job = coGrouper.createJob();
    if (job.waitForCompletion(true)) {
        return 1;
    }
    return -1;
}
From source file:com.datasalt.pangool.flow.Step.java
License:Apache License
/**
 * Convenience method that can be used by Jobs for executing Pangool's {@link CoGrouper} instances.
 */
public int executeCoGrouper(TupleMRBuilder coGrouper)
        throws IOException, TupleMRException, InterruptedException, ClassNotFoundException {
    Job job = coGrouper.createJob();
    try {
        if (nReducers > 0) {
            job.getConfiguration().setInt("mapred.reduce.tasks", nReducers);
        }
        if (job.waitForCompletion(true)) {
            return 1;
        }
        return -1;
    } finally {
        coGrouper.cleanUpInstanceFiles();
    }
}
From source file:com.datasalt.pangool.solr.TupleSolrOutputFormatExample.java
License:Apache License
public int run(String input, String output, Configuration conf) throws Exception {
    // Define the intermediate schema: It must match SOLR's schema.xml!
    final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("user_id");

    // Define the input and its associated mapper.
    // We'll just have a Mapper, reducer will be Identity
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

            Tuple tuple = new Tuple(schema);

            @Override
            public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                    throws IOException, InterruptedException {
                String[] fields = value.toString().split("\t");
                String language = fields[1];
                tuple.set("user_id", fields[0]);
                tuple.set("message", fields[2]);
                if (language.equals("en")) {
                    // English -> write to main output
                    collector.write(tuple);
                } else if (language.equals("fr")) {
                    // French -> write to french index
                    collector.getNamedOutput("FR").write(tuple, NullWritable.get());
                } else if (language.equals("es")) {
                    // Spanish -> write to spanish index
                    collector.getNamedOutput("ES").write(tuple, NullWritable.get());
                }
            }
        });

    // Add multi-output: French index
    job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
        ITuple.class, NullWritable.class);
    // Add multi-output: Spanish index
    job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
        ITuple.class, NullWritable.class);

    job.setTupleReducer(new IdentityTupleReducer());

    // Add multi-output: English index
    job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"), conf),
        ITuple.class, NullWritable.class);

    Job hadoopJob = job.createJob();
    try {
        hadoopJob.waitForCompletion(true);
        if (!hadoopJob.isSuccessful()) {
            throw new PangoolRuntimeException("Job was not successful");
        }
    } finally {
        job.cleanUpInstanceFiles();
    }
    return 0;
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java
License:Apache License
@Test
public void test() throws TupleMRException, IOException, InterruptedException, ClassNotFoundException {
    CommonUtils.writeTXT("foo1 bar1\nbar2 foo2", new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    Path outPathText = new Path(OUT_TEXT);
    HadoopUtils.deleteIfExists(fS, outPath);
    HadoopUtils.deleteIfExists(fS, outPathText);

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("title", Type.STRING));
    fields.add(Field.create("content", Type.STRING));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, schema); // setTupleOutput method
    builder.addInput(inPath, new HadoopInputFormat(TextInputFormat.class), new MyInputProcessor());

    Job job = builder.createJob();
    try {
        job.waitForCompletion(true);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    // Use output as input of new TupleMRBuilder
    builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(outPathText, new HadoopOutputFormat(TextOutputFormat.class), Text.class, Text.class);
    builder.addTupleInput(outPath, new IdentityTupleMapper()); // addTupleInput method

    job = builder.createJob();
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals("title\tbar2 foo2\ntitle\tfoo1 bar1",
        Files.toString(new File(OUT_TEXT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
    HadoopUtils.deleteIfExists(fS, outPathText);
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleTextInputOutputFormat.java
License:Apache License
@Test
public void testSplits() throws Exception {
    BufferedWriter writer = new BufferedWriter(new FileWriter(IN));
    for (int i = 0; i < 10000; i++) {
        writer.write("str1" + " " + "str2" + " " + "30" + " " + "4000" + "\n");
    }
    writer.close();

    Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);

    Configuration conf = getConf();
    conf.setLong("mapred.min.split.size", 10 * 1024);
    conf.setLong("dfs.block.size", 10 * 1024);
    conf.setLong("mapred.max.split.size", 10 * 1024);

    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path(IN), inputFormat,
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

            protected void map(ITuple key, NullWritable value, Context context)
                    throws IOException, InterruptedException {
                Assert.assertEquals("str1", key.get("a").toString());
                Assert.assertEquals("str2", key.get("b").toString());
                Assert.assertEquals((Integer) 30, (Integer) key.get("c"));
                Assert.assertEquals((Long) 4000l, (Long) key.get("d"));
                context.getCounter("stats", "nlines").increment(1);
            };
        });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);

    Job job = mapOnly.createJob();
    try {
        assertTrue(job.waitForCompletion(true));
    } finally {
        mapOnly.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, new Path(IN));
    assertEquals(10000, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleTextInputOutputFormat.java
License:Apache License
@Test
public void testInputCompression() throws Exception {
    Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path("src/test/resources/*.gz"), inputFormat,
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

            protected void map(ITuple key, NullWritable value, Context context)
                    throws IOException, InterruptedException {
                Assert.assertNotNull(key.get("a").toString());
                Assert.assertNotNull(key.get("b").toString());
                Assert.assertTrue((Integer) key.get("c") > 0);
                Assert.assertTrue((Long) key.get("d") > 0);
                context.getCounter("stats", "nlines").increment(1);
            };
        });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);

    Job job = mapOnly.createJob();
    try {
        assertTrue(job.waitForCompletion(true));
    } finally {
        mapOnly.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, new Path(IN));
    assertEquals(100, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
}