Usage examples for org.apache.hadoop.mapreduce.Job#getInstance
@Deprecated public static Job getInstance(Cluster ignored) throws IOException
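The Cluster-based overload shown above is deprecated; every example on this page uses the Configuration-based factory methods instead. For orientation, here is a minimal, self-contained driver sketch using the non-deprecated overload Job.getInstance(Configuration, String); the class name and paths are illustrative, not taken from the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalGetInstanceExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Job.getInstance(Configuration) creates a Job backed by a copy of the configuration;
        // the two-argument overload additionally sets the job name.
        Job job = Job.getInstance(conf, "minimal-getinstance-example");
        job.setJarByClass(MinimalGetInstanceExample.class);

        // Map-only identity pass: TextInputFormat records (offset, line) are written straight out.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}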
From source file:com.ifeng.vdn.iparea.parser.IPAreaLocalDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(IPAreaMapper.class);
    job.setReducerClass(IPAreaReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
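This run() method (like most of the drivers below) builds its Job from getConf(), which implies the class extends Configured and implements Tool. The listing omits the class declaration and entry point; a typical companion skeleton, assumed rather than taken from the source file, looks like this:

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class IPAreaLocalDriver extends Configured implements Tool {

    // ... run(String[]) as shown above ...

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic options (-D, -files, -libjars, ...) into the
        // Configuration that getConf() returns inside run().
        System.exit(ToolRunner.run(new IPAreaLocalDriver(), args));
    }
}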
From source file:com.ifeng.vdn.loggroup.mapper.VideologGroupDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = Job.getInstance(super.getConf());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogGroupMapper.class);
    job.setReducerClass(VideologGroupReducer.class);
    job.setCombinerClass(VideologGroupReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.ifeng.vdn.logparser.mapper.VideoLogDriver.java
License:Apache License
@Override
public int run(String[] paths) throws Exception {
    Job job = Job.getInstance(super.getConf());
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(paths[0]));
    FileOutputFormat.setOutputPath(job, new Path(paths[1]));

    job.setMapperClass(VideoLogMapper.class);
    job.setReducerClass(VideoLogReducer.class);
    job.setCombinerClass(VideoLogReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.ifeng.vdn.parser.VideoLogParseDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogParseMapper.class);
    job.setReducerClass(VideoLogParseReducer.class);
    job.setCombinerClass(VideoLogParseReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.ifeng.vdn.parser.VideoLogParseLocalDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogParseMapper.class);
    job.setReducerClass(VideoLogParseReducer.class);
    job.setCombinerClass(VideoLogParseReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.ifeng.vdn.videolog.sort.SortGroupResultPreprocessor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = Job.getInstance(getConf());

    job.setMapperClass(SortGroupResultMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Sort data by total number:
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
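Because this job is map-only and writes through SequenceFileOutputFormat, its IntWritable/Text pairs can be read back directly by a downstream job (the "Sort data by total number" comment suggests one). A hedged sketch of that downstream wiring, assuming the same Configured/Tool skeleton and imports as above plus SequenceFileInputFormat and Mapper; the job name and the "_sorted" output path are purely illustrative.

@Override
public int run(String[] args) throws Exception {
    Job sortJob = Job.getInstance(getConf(), "sort-group-results"); // hypothetical job name
    sortJob.setJarByClass(getClass());

    // The preprocessor's output directory becomes this job's input; records
    // arrive already typed as IntWritable/Text, no parsing required.
    sortJob.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(sortJob, new Path(args[1]));

    // Identity map-only pass; a real sort job would plug in its own mapper/reducer here.
    sortJob.setMapperClass(Mapper.class);
    sortJob.setNumReduceTasks(0);
    sortJob.setOutputKeyClass(IntWritable.class);
    sortJob.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(sortJob, new Path(args[1] + "_sorted"));

    return sortJob.waitForCompletion(true) ? 0 : 1;
}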
From source file:com.ikanow.aleph2.analytics.hadoop.services.BeJobLauncher.java
License:Open Source License
@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {
    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    // (not currently used, but has proven useful in the past)

    try {
        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        // then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Create a separate InputFormat for every input (makes testing life easier)
        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext()
                            .getInputPaths(Optional.of(bucket), _batchEnrichmentContext.getJob(), input);
                    final Job inputJob = Job.getInstance(config);
                    inputJob.setInputFormatClass(BeFileInputFormat.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                    inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                }));
        // (ALEPH-12): other input format types

        // Now do everything else

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        Job job = Job.getInstance(config, jobName);
        job.setJarByClass(BatchEnrichmentJob.class);

        // Set the classpath
        cacheJars(job, bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);

        //TODO: ALEPH-12 handle reducer scenarios
        job.setNumReduceTasks(0);
        //job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job);

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.setOutputFormatClass(BeFileOutputFormat.class);

        launch(job);
        return Validation.success(job);

    } catch (Throwable t) {
        logger.error("Caught Exception", t);
        return Validation.fail(ErrorUtils.getLongForm("{0}", t));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }
}
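The Aleph2MultiInputFormatBuilder above creates one throwaway Job per input purely to hold that input's paths and InputFormat before merging everything into the real job. For the simpler, stock-MapReduce case of a single job reading several paths with different input formats or mappers, Hadoop's own MultipleInputs covers the same ground; a minimal sketch with hypothetical paths and placeholder mappers (output wiring omitted):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultiInputJobSketch {

    // Placeholder mappers, one per input flavour; real jobs would substitute their own logic.
    public static class TextLogMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(new Text("text"), value);
        }
    }

    public static class SeqLogMapper extends Mapper<Text, Text, Text, Text> {
        @Override
        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    static Job buildMultiInputJob(Configuration config) throws IOException {
        Job job = Job.getInstance(config, "multi-input-example");
        job.setJarByClass(MultiInputJobSketch.class);

        // Each path is bound to its own InputFormat and Mapper class within the same job.
        MultipleInputs.addInputPath(job, new Path("/data/logs/text"),
                TextInputFormat.class, TextLogMapper.class);
        MultipleInputs.addInputPath(job, new Path("/data/logs/seq"),
                SequenceFileInputFormat.class, SeqLogMapper.class);
        return job;
    }
}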
From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java
License:Apache License
@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {
    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    // (not currently used, but has proven useful in the past)

    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        // then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:
        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator =
                    new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream()
                                .map(b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR", b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) {
            // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(
                    ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t, bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)
        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request,
                                            // (if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);

                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {
                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                paths.stream().collect(Collectors.joining(";"))));

                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else {
                        // not easily available in HDFS directory format, try getting from the context
                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));

                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });
                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }
                }));
        // (ALEPH-12): other input format types

        // Now do everything else

        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);
        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class);
        // (avoid deser of json node for intermediate things)

        // Set the classpath
        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override()).orElse(Collections.emptyMap()),
                                    HadoopTechnologyOverrideBean.class)
                            .get();

                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);
                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });
        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        // job.get().waitForCompletion(true);

        return Validation.success(job.get());

    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? (null != t.getCause()) ? t.getCause() : t : t;

        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else { // General error : Dump the config params to string
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }
}
From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java
License:Apache License
/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {

                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job, input_with_test_settings);

                if (!paths.isEmpty()) {

                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                            paths.stream().collect(Collectors.joining(";"))));

                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                    paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));

                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else {
                    // not easily available in HDFS directory format, try getting from the context
                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));

                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));

                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(),input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}
From source file:com.jbw.jobcontrol.Patent.java
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();

    Job job1 = Job.getInstance(conf);
    job1.setJobName("test");
    job1.setJarByClass(Patent.class);

    // Chain two map stages in front of the reducer
    ChainMapper.addMapper(job1, InverseMapper.class, LongWritable.class, Text.class, Text.class, Text.class,
            conf);
    ChainMapper.addMapper(job1, CountMapper.class, Text.class, Text.class, Text.class, IntWritable.class, conf);
    job1.setReducerClass(IntSumReducer.class);
    // (input/output paths and output key/value classes are not set in this excerpt)

    Job job2 = Job.getInstance();

    // Wrap both jobs as ControlledJobs so job2 only runs after job1 succeeds
    ControlledJob cjob1 = new ControlledJob(job1.getConfiguration());
    ControlledJob cjob2 = new ControlledJob(job2.getConfiguration());
    cjob2.addDependingJob(cjob1);

    JobControl jc = new JobControl("process job");
    jc.addJob(cjob1);
    jc.addJob(cjob2);

    // JobControl runs on its own thread; poll until every controlled job has
    // finished, then stop the monitoring thread.
    Thread t = new Thread(jc);
    t.start();
    while (!jc.allFinished()) {
        Thread.sleep(1000);
    }
    jc.stop();

    return 0;
}