Example usage for org.apache.hadoop.mapreduce Job getInstance

List of usage examples for org.apache.hadoop.mapreduce Job getInstance

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job getInstance.

Prototype

@Deprecated
public static Job getInstance(Cluster ignored) throws IOException 

Document

Creates a new Job with no particular Cluster. This overload is deprecated; new code should use Job.getInstance() or Job.getInstance(Configuration) instead.
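
For orientation, here is a minimal, self-contained driver sketch (the class name GetInstanceExample is illustrative, not taken from the sources below) showing the Configuration-based factory methods that the examples on this page actually call: Job.getInstance(Configuration) and Job.getInstance(Configuration, String). It uses Hadoop's identity Mapper and Reducer, so it simply copies its input to its output.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class GetInstanceExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Preferred overloads: pass the Configuration, optionally with a job name.
        // The Cluster-based overload shown in the prototype above is deprecated.
        Job job = Job.getInstance(conf, "get-instance-example");
        job.setJarByClass(GetInstanceExample.class);

        // Identity Mapper/Reducer: (LongWritable offset, Text line) pairs pass straight through.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}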

Usage

From source file:com.ifeng.vdn.iparea.parser.IPAreaLocalDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(IPAreaMapper.class);
    job.setReducerClass(IPAreaReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:com.ifeng.vdn.loggroup.mapper.VideologGroupDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = Job.getInstance(super.getConf());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogGroupMapper.class);
    job.setReducerClass(VideologGroupReducer.class);
    job.setCombinerClass(VideologGroupReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.logparser.mapper.VideoLogDriver.java

License:Apache License

@Override
public int run(String[] paths) throws Exception {
    Job job = Job.getInstance(super.getConf());
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(paths[0]));
    FileOutputFormat.setOutputPath(job, new Path(paths[1]));

    job.setMapperClass(VideoLogMapper.class);
    job.setReducerClass(VideoLogReducer.class);
    job.setCombinerClass(VideoLogReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.parser.VideoLogParseDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogParseMapper.class);
    job.setReducerClass(VideoLogParseReducer.class);
    job.setCombinerClass(VideoLogParseReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.parser.VideoLogParseLocalDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogParseMapper.class);
    job.setReducerClass(VideoLogParseReducer.class);
    job.setCombinerClass(VideoLogParseReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:com.ifeng.vdn.videolog.sort.SortGroupResultPreprocessor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = Job.getInstance(getConf());
    job.setMapperClass(SortGroupResultMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Sort data by total number:
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ikanow.aleph2.analytics.hadoop.services.BeJobLauncher.java

License:Open Source License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    try {
        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext()
                            .getInputPaths(Optional.of(bucket), _batchEnrichmentContext.getJob(), input);
                    final Job inputJob = Job.getInstance(config);
                    inputJob.setInputFormatClass(BeFileInputFormat.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                    inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        Job job = Job.getInstance(config, jobName);
        job.setJarByClass(BatchEnrichmentJob.class);

        // Set the classpath

        cacheJars(job, bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);

        //TODO: ALEPH-12 handle reducer scenarios
        job.setNumReduceTasks(0);
        //job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job);

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.setOutputFormatClass(BeFileOutputFormat.class);

        launch(job);
        return Validation.success(job);

    } catch (Throwable t) {
        logger.error("Caught Exception", t);
        return Validation.fail(ErrorUtils.getLongForm("{0}", t));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java

License:Apache License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:

        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator = new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream().map(
                                b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR", b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) { // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(
                    ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t, bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request, //(if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);

                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {

                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                                bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else { // not easily available in HDFS directory format, try getting from the context

                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));

                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });

                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }

                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);
        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class); //(avoid deser of json node for intermediate things)

        // Set the classpath

        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override()).orElse(Collections.emptyMap()),
                                    HadoopTechnologyOverrideBean.class)
                            .get();

                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });

        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        //            job.get().waitForCompletion(true);

        return Validation.success(job.get());

    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? (null != t.getCause()) ? t.getCause() : t : t;

        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else { // General error : Dump the config params to string         
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java

License:Apache License

/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways 
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {

                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job,
                        input_with_test_settings);

                if (!paths.isEmpty()) {

                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                            paths.stream().collect(Collectors.joining(";"))));

                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                    paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));

                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));   

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else { // not easily available in HDFS directory format, try getting from the context

                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));

                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));

                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(),input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}

From source file:com.jbw.jobcontrol.Patent.java

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    Job job1 = Job.getInstance(conf);
    job1.setJobName("test");
    job1.setJarByClass(Patent.class);

    ChainMapper.addMapper(job1, InverseMapper.class, LongWritable.class, Text.class, Text.class, Text.class,
            conf);
    ChainMapper.addMapper(job1, CountMapper.class, Text.class, Text.class, Text.class, IntWritable.class, conf);

    job1.setReducerClass(IntSumReducer.class);

    Job job2 = Job.getInstance();
    ControlledJob cjob1 = new ControlledJob(job1.getConfiguration());
    ControlledJob cjob2 = new ControlledJob(job2.getConfiguration());
    cjob2.addDependingJob(cjob1);
    JobControl jc = new JobControl("process job");
    jc.addJob(cjob1);
    jc.addJob(cjob2);
    Thread t = new Thread(jc);
    t.start();
    // Wait for the controlled jobs to finish, then shut down the JobControl thread.
    while (!jc.allFinished()) {
        Thread.sleep(1000);
    }
    jc.stop();
    return 0;
}