Example usage for org.apache.hadoop.mapreduce Job waitForCompletion

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job waitForCompletion.

Prototype

public boolean waitForCompletion(boolean verbose)
        throws IOException, InterruptedException, ClassNotFoundException 

Document

Submit the job to the cluster and wait for it to finish. When verbose is true, progress is printed to the user; the method returns true only if the job succeeded.
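
For orientation, here is a minimal, self-contained sketch of the typical call pattern. The class name and the argument-driven input/output paths are illustrative assumptions, not taken from any of the projects below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalWaitForCompletion {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job.getInstance is the non-deprecated factory in Hadoop 2.x+;
        // several examples below use the older new Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "minimal waitForCompletion example");
        job.setJarByClass(MinimalWaitForCompletion.class);
        // No mapper or reducer is set, so Hadoop uses the identity Mapper and
        // Reducer, which pass (LongWritable offset, Text line) records through.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submits the job, blocks until it finishes and, with verbose=true,
        // prints progress; returns true only if the job succeeded.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Passing false instead submits the job and waits without printing progress; in either case the boolean result, not an exception, signals whether the job succeeded.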

Usage

From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java

License:Apache License

public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: urlresolution <url-map> <url-register> <out>");
        System.exit(2);
    }
    JobConf job = new JobConf(conf);
    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[2]), true);

    MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class);
    MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class);

    job.setJarByClass(HadoopUrlResolution.class);

    job.setPartitionerClass(KeyPartitioner.class);
    job.setOutputValueGroupingComparator(GroupingComparator.class);

    job.setMapOutputKeyClass(UrlRegJoinUrlMap.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    // Wrap the old-API JobConf in a new-API Job so waitForCompletion can be used.
    Job j = new Job(job);
    j.setReducerClass(Reduce.class);
    j.waitForCompletion(true);
}

From source file:com.datasalt.pangool.benchmark.wordcount.HadoopWordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    //conf.setBoolean("hadoop.security.authorization", false);
    //conf.set("hadoop.security.authentication","simple");
    Job job = new Job(conf, "word count");
    job.setJarByClass(HadoopWordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    HadoopUtils.deleteIfExists(FileSystem.get(conf), new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file:com.datasalt.pangool.examples.avro.AvroTweetsJoin.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    delete(outputPath.toString());

    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("retweet", new OrderBy().add("username", Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
            NullWritable.class);

    mr.setTupleReducer(new Red());

    // Clean up Pangool's serialized instance files even if the job fails.
    try {
        Job job = mr.createJob();
        job.waitForCompletion(true);
    } finally {
        mr.cleanUpInstanceFiles();
    }

    return 0;
}

From source file:com.datasalt.pangool.examples.Grep.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrond number of arguments");
        return -1;
    }//from  ww  w  . j ava2s .  c  o  m

    String regex = args[0];
    String input = args[1];
    String output = args[2];

    delete(output);

    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
            NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new GrepHandler(regex));
    Job job = b.createJob();
    try {
        job.waitForCompletion(true);
    } finally {
        b.cleanUpInstanceFiles();
    }

    return 0;
}

From source file:com.datasalt.pangool.flow.PangoolJob.java

License:Apache License

/**
 * Convenience method that can be used by Jobs for executing Pangool's {@link CoGrouper} instances.
 */
public int executeCoGrouper(TupleMRBuilder coGrouper)
        throws IOException, TupleMRException, InterruptedException, ClassNotFoundException {

    Job job = coGrouper.createJob();
    if (job.waitForCompletion(true)) {
        return 1;
    }
    return -1;
}

From source file:com.datasalt.pangool.flow.Step.java

License:Apache License

/**
 * Convenience method that can be used by Jobs for executing Pangool's {@link CoGrouper} instances.
 */
public int executeCoGrouper(TupleMRBuilder coGrouper)
        throws IOException, TupleMRException, InterruptedException, ClassNotFoundException {

    Job job = coGrouper.createJob();
    try {
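        // "mapred.reduce.tasks" is the pre-YARN name of the reducer-count property.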
        if (nReducers > 0) {
            job.getConfiguration().setInt("mapred.reduce.tasks", nReducers);
        }
        if (job.waitForCompletion(true)) {
            return 1;
        }
        return -1;
    } finally {
        coGrouper.cleanUpInstanceFiles();
    }
}

From source file:com.datasalt.pangool.solr.TupleSolrOutputFormatExample.java

License:Apache License

public int run(String input, String output, Configuration conf) throws Exception {
    // Define the intermediate schema: It must match SOLR's schema.xml!
    final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("user_id");
    // Define the input and its associated mapper.
    // We'll just have a Mapper, reducer will be Identity
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
            new TupleMapper<LongWritable, Text>() {

                Tuple tuple = new Tuple(schema);

                @Override
                public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                        throws IOException, InterruptedException {
                    String[] fields = value.toString().split("\t");
                    String language = fields[1];
                    tuple.set("user_id", fields[0]);
                    tuple.set("message", fields[2]);
                    if (language.equals("en")) {
                        // English -> write to main output
                        collector.write(tuple);
                    } else if (language.equals("fr")) {
                        // French -> write to french index
                        collector.getNamedOutput("FR").write(tuple, NullWritable.get());
                    } else if (language.equals("es")) {
                        // Spanish -> write to spanish index
                        collector.getNamedOutput("ES").write(tuple, NullWritable.get());
                    }
                }
            });
    // Add multi-output: French index
    job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
            ITuple.class, NullWritable.class);
    // Add multi-output: Spanish index
    job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
            ITuple.class, NullWritable.class);
    job.setTupleReducer(new IdentityTupleReducer());
    // Add multi-output: English index
    job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"), conf),
            ITuple.class, NullWritable.class);
    Job hadoopJob = job.createJob();
    try {
        hadoopJob.waitForCompletion(true);
        if (!hadoopJob.isSuccessful()) {
            throw new PangoolRuntimeException("Job was not successful");
        }
    } finally {
        job.cleanUpInstanceFiles();
    }
    return 0;
}

From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java

License:Apache License

@Test
public void test() throws TupleMRException, IOException, InterruptedException, ClassNotFoundException {

    CommonUtils.writeTXT("foo1 bar1\nbar2 foo2", new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    Path outPathText = new Path(OUT_TEXT);
    HadoopUtils.deleteIfExists(fS, outPath);
    HadoopUtils.deleteIfExists(fS, outPathText);

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("title", Type.STRING));
    fields.add(Field.create("content", Type.STRING));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));

    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, schema); // setTupleOutput method
    builder.addInput(inPath, new HadoopInputFormat(TextInputFormat.class), new MyInputProcessor());

    Job job = builder.createJob();
    try {
        job.waitForCompletion(true);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    // Use output as input of new TupleMRBuilder

    builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(outPathText, new HadoopOutputFormat(TextOutputFormat.class), Text.class, Text.class);
    builder.addTupleInput(outPath, new IdentityTupleMapper()); // addTupleInput method

    job = builder.createJob();
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals("title\tbar2 foo2\ntitle\tfoo1 bar1",
            Files.toString(new File(OUT_TEXT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
    HadoopUtils.deleteIfExists(fS, outPathText);
}

From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleTextInputOutputFormat.java

License:Apache License

@Test
public void testSplits() throws Exception {

    BufferedWriter writer = new BufferedWriter(new FileWriter(IN));
    for (int i = 0; i < 10000; i++) {
        writer.write("str1" + " " + "str2" + " " + "30" + " " + "4000" + "\n");
    }
    writer.close();

    Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
            TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
            FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);

    Configuration conf = getConf();
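    // Force tiny (10 KB) splits so the single input file is read through many splits.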
    conf.setLong("mapred.min.split.size", 10 * 1024);
    conf.setLong("dfs.block.size", 10 * 1024);
    conf.setLong("mapred.max.split.size", 10 * 1024);

    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path(IN), inputFormat,
            new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

                protected void map(ITuple key, NullWritable value, Context context)
                        throws IOException, InterruptedException {
                    Assert.assertEquals("str1", key.get("a").toString());
                    Assert.assertEquals("str2", key.get("b").toString());
                    Assert.assertEquals((Integer) 30, (Integer) key.get("c"));
                    Assert.assertEquals((Long) 4000L, (Long) key.get("d"));
                    context.getCounter("stats", "nlines").increment(1);
                };
            });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
            NullWritable.class);
    Job job = mapOnly.createJob();
    try {
        assertTrue(job.waitForCompletion(true));
    } finally {
        mapOnly.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, new Path(IN));

    assertEquals(10000, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
}

From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleTextInputOutputFormat.java

License:Apache License

@Test
public void testInputCompression() throws Exception {
    Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
            TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
            FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path("src/test/resources/*.gz"), inputFormat,
            new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

                protected void map(ITuple key, NullWritable value, Context context)
                        throws IOException, InterruptedException {
                    Assert.assertNotNull(key.get("a").toString());
                    Assert.assertNotNull(key.get("b").toString());
                    Assert.assertTrue((Integer) key.get("c") > 0);
                    Assert.assertTrue((Long) key.get("d") > 0);
                    context.getCounter("stats", "nlines").increment(1);
                };
            });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
            NullWritable.class);
    Job job = mapOnly.createJob();
    try {
        assertTrue(job.waitForCompletion(true));
    } finally {
        mapOnly.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, new Path(IN));

    assertEquals(100, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
}