Example usage for org.apache.hadoop.mapreduce Job Job

List of usage examples for org.apache.hadoop.mapreduce Job Job

Introduction

On this page you can find example usages of the org.apache.hadoop.mapreduce.Job constructor.

Prototype

Job(Configuration conf, String jobName) throws IOException
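
A minimal, self-contained sketch of driving a job created with this constructor (illustrative only, not taken from the usages below; the class name MinimalJobExample and the command-line paths are placeholders):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalJobExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job(Configuration, String) is deprecated in Hadoop 2+; Job.getInstance(conf, name) is the preferred replacement.
        Job job = new Job(conf, "minimal example");
        job.setJarByClass(MinimalJobExample.class);
        // No mapper, reducer, or formats are set, so the identity Mapper/Reducer and the
        // default text formats apply: each input line is written back out keyed by its byte offset.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}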

Usage

From source file:com.main.MRSearchMain.java

public void searchHBase(int numOfDays) throws IOException, InterruptedException, ClassNotFoundException {
    long startTime;
    long endTime;

    String path = "/home/hadoop/app/hadoop-2.0.0-cdh4.3.0/etc/hadoop/";
    Configuration conf = HBaseConfiguration.create();
    //      conf.set("hbase.zookeeper.quorum", "streamslab.localdomain");
    //      conf.set("fs.default.name", "hdfs://streamslab.localdomain:8020");
    //      conf.set("mapred.job.tracker", "hdfs://streamslab.localdomain:50300");
    //      conf.set("fs.hdfs.impl",
    //            org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    // Load the cluster's site configuration below so the correct FileSystem implementation is resolved.
    conf.addResource(new Path(path + "core-site.xml"));
    conf.addResource(new Path(path + "hdfs-site.xml"));
    conf.addResource(new Path(path + "mapred-site.xml"));
    /*
     * Search parameters passed to the mappers through the job configuration.
     */
    conf.set("search.license", "C87310");
    conf.set("search.color", "10");
    conf.set("search.direction", "2");

    Job job = new Job(conf, "MRSearchHBase");
    System.out.println("search.license: " + conf.get("search.license"));
    job.setNumReduceTasks(0);
    job.setJarByClass(MRSearchMain.class);
    Scan scan = new Scan();
    scan.addFamily(FAMILY_NAME);
    byte[] startRow = Bytes.toBytes("2011010100000");
    byte[] stopRow;
    switch (numOfDays) {
    case 1:
        stopRow = Bytes.toBytes("2011010200000");
        break;
    case 10:
        stopRow = Bytes.toBytes("2011011100000");
        break;
    case 30:
        stopRow = Bytes.toBytes("2011020100000");
        break;
    case 365:
        stopRow = Bytes.toBytes("2012010100000");
        break;
    default:
        stopRow = Bytes.toBytes("2011010101000");
    }
    // Bound the scan by start and stop row keys.
    scan.setStartRow(startRow);
    scan.setStopRow(stopRow);

    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, SearchMapper.class, ImmutableBytesWritable.class,
            Text.class, job);
    Path outPath = new Path("searchresult");
    LOG.info("outPath:" + outPath.toString());

    // Delete the output directory on HDFS if it already exists.
    FileSystem file = null;
    try {
        file = FileSystem.get(conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
    //      HDFS_File file = new HDFS_File();
    //      file.DelFile(conf, outPath.getName(), true);
    //"hdfs://streamslab.localdomain:8020/
    if (file.exists(outPath)) {
        file.delete(outPath, true);
        LOG.info("=====delPath " + outPath.toString() + "=====");
    }
    FileOutputFormat.setOutputPath(job, outPath);

    startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    endTime = System.currentTimeMillis();
    LOG.info("Time used: " + (endTime - startTime));
    LOG.info("startRow:" + Text.decode(startRow));
    LOG.info("stopRow: " + Text.decode(stopRow));
}

From source file:com.mb.saas.bi.job.WordCountJob.java

License:Apache License

public static boolean runHadoopMapReduceJob() throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");

    File jarFile = UploadResource.createTempJar("bin");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();

    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    boolean isMapReduceJarSetted = false;
    String hadoopMapReduceJar = "F:/henry_projects/mbHiveAnalyzer/t.jar";
    File file = new File(hadoopMapReduceJar);
    if (file.exists()) {
        ((JobConf) job.getConfiguration()).setJar(hadoopMapReduceJar);
        isMapReduceJarSetted = true;
    }

    if (!isMapReduceJarSetted && jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001"));
    return job.waitForCompletion(true);
}

From source file:com.mb.saas.bi.job.WordCountJob.java

License:Apache License

public static void main(String[] args) throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");

    File jarFile = UploadResource.createTempJar("bin");
    System.setProperty("hadoop.home.dir", "F:/hadoop");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();
    //   conf.set("fs.defaultFS", "hdfs://slave1:4001");
    //   conf.set("mapreduce.framework.name", "yarn");
    //   conf.set("yarn.resourcemanager.address", "master:8032");
    //   conf.set("yarn.resourcemanager.scheduler.address", "master:8030");

    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

    conf.set("mapred.remote.os", "Linux");
    System.out.println(conf.get("mapred.remote.os"));

    //   conf.set("mapreduce.job.reduces", "2");
    //   conf.set("mapreduce.tasktracker.map.tasks.maximum", "8");
    //   conf.set("mapreduce.input.fileinputformat.split.maxsize","123");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    //    job.setMaxMapAttempts(2);
    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    //    FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001002"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.metamx.druid.indexer.DeterminePartitionsJob.java

License:Open Source License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}

From source file:com.metamx.druid.indexer.HadoopDruidIndexerJob.java

License:Open Source License

private void ensurePaths() {
    // config.addInputPaths() can have side-effects ( boo! :( ), so this stuff needs to be done before anything else
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-determine_partitions-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.19");
        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        config.addInputPaths(job);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}

From source file:com.metamx.druid.indexer.IndexGeneratorJob.java

License:Open Source License

public boolean run() {
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(Text.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        job.setNumReduceTasks(Iterables.size(config.getAllBuckets()));
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        job.setReducerClass(IndexGeneratorReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);
        config.intoConfiguration(job);

        job.setJarByClass(IndexGeneratorJob.class);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce.java

public static void main(String[] args) throws Exception {
    int iteration = 0, num_of_iteration = 30;
    int feature_size = 2;
    FileSystem fs;
    int number_of_clusters = 2;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "K_meansClusteringMapReduce");
        job.setJarByClass(K_meansClusteringMapReduce.class);

        conf = job.getConfiguration(); // Mandatory: the Job constructor copies the Configuration, so later settings must go on the job's own copy.

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatArrayWritable.class);

        job.setMapperClass(K_meansClusteringMap.class);
        job.setReducerClass(K_meansClusteringReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set number of reducers to one.

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);

        FileOutputFormat.setOutputPath(job, out);
        number_of_clusters = Integer.parseInt(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);

        conf.setInt("number_of_clusters", number_of_clusters);
        conf.setInt("feature_size", feature_size);
        conf.setInt("current_iteration_num", iteration);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);

}

From source file:com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMapReduce.java

public static void main(String[] args) throws Exception {
    String[] theta;
    int iteration = 0, num_of_iteration = 1;
    int feature_size = 0, input_data_size = 0;
    FileSystem fs;
    Float alpha = 0.1f;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "LinearRegressionMapReduce");
        job.setJarByClass(MultipleLinearRegressionMapReduce.class);

        // the job's own Configuration must be used below so that "theta" is propagated to the tasks
        conf = job.getConfiguration();

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatWritable.class);

        job.setMapperClass(MultipleLinearRegressionMap.class);
        job.setReducerClass(MultipleLinearRegressionReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set mapred.reduce.tasks = 1 (only one reducer)

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);

        FileOutputFormat.setOutputPath(job, out);
        alpha = Float.parseFloat(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);
        input_data_size = Integer.parseInt(args[5]);
        conf.setFloat("alpha", alpha);
        conf.setInt("feature_size", feature_size);
        conf.setInt("input_data_size", input_data_size);
        conf.setInt("iteration", iteration);

        theta = new String[feature_size];

        if (iteration == 0) { // first iteration
            for (int i = 0; i < theta.length; i++)
                theta[i] = "0.0";
            conf.setStrings("theta", theta);
        } else {
            try {
                String uri = "/user/hduser/theta.txt";
                fs = FileSystem.get(conf);
                //FSDataInputStream in = fs.open(new Path(uri));
                BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(uri))));
                theta = br.readLine().split(",");
            } catch (Exception e) {
                // If theta.txt cannot be read, report it; theta entries remain unset.
                e.printStackTrace();
            }
            conf.setStrings("theta", theta);
        }

        for (int i = 0; i < theta.length; i++)
            System.out.println("In MapRedce main function: theta[ " + i + " ]" + theta[i]);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);

}

From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java

/**
 * @param args
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    int number_of_classes = 1;
    int number_of_features = 1;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, "NaiveBayesClassifierMapReduce_Continuous_Features");
    job.setJarByClass(NaiveBayesClassifierMapReduce_Continuous_Features.class);

    conf = job.getConfiguration(); // Mandatory: the Job constructor copies the Configuration, so later settings must go on the job's own copy.

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(MapArrayWritable.class);

    job.setMapperClass(NaiveBayesClassifierMap_Continuous_Features.class);
    job.setReducerClass(NaiveBayesClassifierReduce_Continuous_Features.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path out = new Path(args[1]);
    if (fs.exists(out))
        fs.delete(out, true);
    FileOutputFormat.setOutputPath(job, out);
    number_of_classes = Integer.parseInt(args[2]);
    number_of_features = Integer.parseInt(args[3]);
    conf.setInt("number_of_classes", number_of_classes);
    conf.setInt("number_of_features", number_of_features);

    try {
        job.waitForCompletion(true);

    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:com.mongodb.hadoop.examples.snmp.SnmpStatisticMongoTool.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");
    final String outputCollectionName = outputUri.getCollection();
    final Job job = new Job(conf, "snmp analysis " + outputCollectionName);
    job.setJarByClass(SnmpStatisticMongoTool.class);
    job.setMapperClass(MapHostUploadEachAPEachDay.class);
    job.setReducerClass(ReduceHostUploadEachAPEachDay.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    boolean result = job.waitForCompletion(true);
    return (result ? 0 : 1);
}