List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf, String jobName) throws IOException
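All of the examples below follow the same basic pattern: build a Configuration, pass it to the Job constructor together with a job name, configure the mapper, reducer, and output classes, set the input and output paths, and submit the job. The minimal sketch below illustrates that pattern only; the class names MyJob, MyMapper, and MyReducer are hypothetical placeholders and do not appear in any of the source files listed on this page.

// Minimal sketch of the Job(Configuration, String) constructor pattern used
// throughout the examples below. MyJob, MyMapper, and MyReducer are
// hypothetical placeholder classes.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "my job");          // construct the job with a name
        job.setJarByClass(MyJob.class);             // locate the jar to ship to the cluster
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}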
From source file:AvgScore.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: AvgScore <in> [<in>...] <out>"); System.exit(2);//from www .ja v a 2 s .co m } Job job = new Job(conf, "AvgScore"); job.setJarByClass(AvgScore.class); job.setMapperClass(Map.class); //job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:CountJob.java
License:Apache License
public static void doJob(String param, String args[], String msgs)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    conf.set(TokenizerMapper.PATTERN, args[2]);
    FileSystem hdfs = FileSystem.get(conf);
    Path tempOutput1 = new Path("/data/output/temp/" + param + "1");
    Path tempOutput2 = new Path("/data/output/temp/" + param + "2");
    if (hdfs.exists(tempOutput1) || hdfs.exists(tempOutput2)) {
        hdfs.delete(tempOutput1, true);
        hdfs.delete(tempOutput2, true);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(CountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, tempOutput1);
    job.waitForCompletion(true);

    Job sortJob1 = new Job(conf);
    sortJob1.setJobName("grep-sort");
    FileInputFormat.setInputPaths(sortJob1, tempOutput1);
    sortJob1.setInputFormatClass(SequenceFileInputFormat.class);
    sortJob1.setMapperClass(InverseMapper.class);
    sortJob1.setNumReduceTasks(1); // write a single file
    FileOutputFormat.setOutputPath(sortJob1, tempOutput2);
    sortJob1.setSortComparatorClass( // sort by decreasing freq
            LongWritable.DecreasingComparator.class);
    sortJob1.waitForCompletion(true);
    hdfs.delete(tempOutput1, true);
}
From source file:Egg.java
License:Open Source License
/**
 * Creates a Hadoop job with a default configuration of TextInputFormat and
 * TextOutputFormat. If invoked with no parameters, uses the initially created
 * job as the parent to spawn a new job. The name of the parent job is used as
 * the name of the child job. The object is used as the 'this' object of the
 * eggshell function.
 * @param o The Hadoop Job
 */
@JSConstructor
public Egg(Object o) throws IOException {
    Configuration cf = conf; // new Configuration(conf);
    job = new Job(cf, name);
    job.setJarByClass(this.getClass()); // set jar file
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Tuple.class);   // K2
    job.setOutputValueClass(Tuple.class); // V2
    job.setMapperClass(Payload.TextMap.class);
    job.setReducerClass(Reducer.class);
    job.setCombinerClass(Reducer.class);
}
From source file:ClassAverage.java
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = new Job(conf, "Class Average"); job.setJarByClass(ClassAverage.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(DoubleCalcReducer.class); job.setReducerClass(DoubleCalcReducer.class); job.setOutputKeyClass(Text.class); }
From source file:DateExample_Year.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2);/*w w w .j a v a 2s .co m*/ } Job job = new Job(conf, "word count fs"); job.setJarByClass(DateExample_Year.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setInputFormatClass(IsValidKeyFormat.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:DataHBase.java
License:Open Source License
public void run(HashMap<String, String> config) throws Exception {
    // clean the former output if it exists
    Path p = new Path(config.get("hdfs_output_dir"));
    FileSystem fs = FileSystem.get(new Configuration());
    if (fs.exists(p)) {
        fs.delete(p, true);
    }
    String junction = config.get("what_to_find"); // the name of the junction
    String date1 = config.get("date1");
    String date2 = config.get("date2");
    // date1 and date2 can be of a format YYYY-MM-DD
    if (date1.length() == 10)
        date1 = date1 + " 00:00:00";
    if (date2.length() == 10)
        date2 = date2 + " 23:59:59";
    System.out.println("Looking for data of " + junction + ": " + date1 + " - " + date2);
    // create timestamps (considering time zone!) to limit data
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    sdf.setTimeZone(TimeZone.getDefault());
    Long time1 = sdf.parse(date1).getTime();
    Long time2 = sdf.parse(date2).getTime();
    // run a job
    Configuration conf = HBaseConfiguration.create();
    conf.set("mapreduce.output.textoutputformat.separator", ","); // set comma as a delimiter
    Job job = new Job(conf, "Retrieve data from hbase");
    job.setJarByClass(DataHBase.class);
    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.setMaxVersions(1);
    scan.setTimeRange(time1, time2); // take a day we are interested in
    // set a filter for a junction name
    if (!junction.equals("")) {
        SingleColumnValueFilter filter = new SingleColumnValueFilter(Bytes.toBytes("data"),
                Bytes.toBytes("location_name"), CompareOp.EQUAL, Bytes.toBytes(junction));
        scan.setFilter(filter);
    }
    // add the specific columns to the output to limit the amount of data
    scan.addFamily(Bytes.toBytes("data"));
    TableMapReduceUtil.initTableMapperJob(config.get("hbase_table"), // input HBase table name
            scan,        // Scan instance to control CF and attribute selection
            TableMap.class, // mapper
            Text.class,  // mapper output key
            Text.class,  // mapper output value
            job);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(config.get("hdfs_output_dir")));
    job.waitForCompletion(true);
}
From source file:TweetCategorizer.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); // conf.addResource(new Path("../../env_vars")); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: TweetCategorizer <in> <out>"); System.exit(2);// ww w. j a v a2 s . c o m } // ---------------------------------------------------------- // READ FILTER FILE // ---------------------------------------------------------- // Path pt=new Path("hdfs://pathTofile"); //Path pt = new Path("../www/hfilters.json"); String l; String line = ""; //FileSystem fs = FileSystem.get(conf); BufferedReader br = new BufferedReader(new FileReader("../www/json/filters.json")); try { //BufferedReader br = new BufferedReader(new FileReader(fs.open(pt))); while ((l = br.readLine()) != null) { line += l; //System.out.println(line); } } finally { // you should close out the BufferedReader br.close(); } // ---------------------------------------------------------- // PARSE JSON //http://stackoverflow.com/questions/6697147/json-iterate-through-jsonarray //http://juliusdavies.ca/json-simple-1.1.1-javadocs/org/json/simple/JSONObject.html // ---------------------------------------------------------- JSONParser parser = new JSONParser(); JSONObject jsonObject = (JSONObject) parser.parse(line); Set<String> filters = jsonObject.keySet(); // inside each object there is a "name" field, get value and add to keyword_list for (String i : filters) { JSONObject objects = (JSONObject) jsonObject.get(i); String keyword = ((String) objects.get("name")).toLowerCase(); TokenizerMapper.keyname_list.add(i); TokenizerMapper.keyword_list.add(keyword); } // ---------------------------------------------------------- Job job = new Job(conf, "categorize tweets"); job.setJarByClass(TweetCategorizer.class); job.setMapperClass(TokenizerMapper.class); // job.setCombinerClass(IntSumReducer.class); // job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:WordLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 3) {
        System.err.println("Usage: wordlines <in> [<in>...] <SearchTerm> <out>");
        System.exit(2);
    }
    conf.set("searchWord", otherArgs[otherArgs.length - 2]);
    Job job = new Job(conf, "word lines");
    job.setJarByClass(WordLines.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    for (int i = 0; i < otherArgs.length - 2; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:WordCountSplitTest.java
License:Apache License
private final static void test(boolean use_shards, boolean use_chunks, Boolean slaveok) throws Exception {
    did_start = false;
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost:30000/test.lines");
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_SHARDS, use_shards);
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_CHUNKS, use_chunks);
    String output_table = null;
    if (use_chunks) {
        if (use_shards)
            output_table = "with_shards_and_chunks";
        else
            output_table = "with_chunks";
    } else {
        if (use_shards)
            output_table = "with_shards";
        else
            output_table = "no_splits";
    }
    if (slaveok != null) {
        output_table += "_" + slaveok;
    }
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:30000/test." + output_table);
    System.out.println("Conf: " + conf);
    final Job job = new Job(conf, "word count " + output_table);
    job.setJarByClass(WordCountSplitTest.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    final long start = System.currentTimeMillis();
    System.out.println(" ----------------------- running test " + output_table + " --------------------");
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.out.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");
    com.mongodb.Mongo m = new com.mongodb.Mongo(
            new com.mongodb.MongoURI("mongodb://localhost:30000/?slaveok=true"));
    com.mongodb.DB db = m.getDB("test");
    com.mongodb.DBCollection coll = db.getCollection(output_table);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of \'the\'");
    else
        System.out.println("'the' count: " + cur.next());
    // if (! result)
    //     System.exit( 1 );
}
From source file:LoadClue.java
License:Apache License
/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(ClueWebInputFormat.class);
    job.setMapperClass(Uploader.class);
    LoadClue.setTableName(tableName);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    TableMapReduceUtil.addDependencyJars(conf, TableOutputFormat.class);
    job.setNumReduceTasks(0);
    return job;
}
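Unlike the other examples on this page, configureJob only builds the Job and returns it without submitting it. A minimal sketch of a driver that could call it is shown below; this main method is an assumption for illustration and is not part of the original LoadClue source.

// Hypothetical driver for configureJob (assumed, not from the LoadClue source).
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: LoadClue <input path> <table name>");
        System.exit(2);
    }
    Configuration conf = HBaseConfiguration.create();
    Job job = configureJob(conf, args);
    // submit the configured job and wait for it to finish
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}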