List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
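Before the project examples, here is a minimal, self-contained sketch (not taken from any of the source files below) showing where setMapOutputValueClass fits in a typical job setup. The class and path names are illustrative only. The call declares the value type emitted by the mapper and must be made before the job is submitted; calling it on a running job throws IllegalStateException.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOutputValueClassExample {

    // Emits (word, 1) for every token in an input line.
    public static class TokenMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Sums the counts for each word.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "map output value class example");
        job.setJarByClass(MapOutputValueClassExample.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // Declare the intermediate (map output) key/value types. These calls
        // are required whenever the map output types differ from the job's
        // final output types, and must happen before job submission.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final output types written by the reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

In the examples that follow, the same call appears with different value types (IntWritable, Text, NullWritable) depending on what each mapper emits.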
From source file:com.marklogic.mapreduce.examples.LinkCountCooccurrences.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: LinkCountCooccurrences configFile outputDir"); System.exit(2);// w w w.ja v a2s. co m } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count cooccurrences"); job.setJarByClass(LinkCountCooccurrences.class); job.setInputFormatClass(KeyValueInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); conf.setClass(MarkLogicConstants.INPUT_KEY_CLASS, Text.class, Writable.class); conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class); conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS, HrefTitleMap.class, ElemAttrValueCooccurrences.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.LinkCountHDFS.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: LinkCountHDFS inputDir outputDir"); System.exit(2);/*from w ww .j av a2 s . c om*/ } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count hdfs"); job.setJarByClass(LinkCountHDFS.class); job.setInputFormatClass(HDFSInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); HDFSInputFormat.setInputPaths(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.LinkCountInDoc.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 1) { System.err.println("Usage: LinkCountInDoc configFile"); System.exit(2);//from ww w . j a v a 2s . co m } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count in doc"); job.setJarByClass(LinkCountInDoc.class); job.setInputFormatClass(NodeInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(NodeOutputFormat.class); job.setOutputKeyClass(NodePath.class); job.setOutputValueClass(MarkLogicNode.class); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.LinkCountInProperty.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 1) { System.err.println("Usage: LinkCountInProperty configFile"); System.exit(2);/*ww w . j a v a2 s.c om*/ } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count in property"); job.setJarByClass(LinkCountInProperty.class); job.setInputFormatClass(ValueInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(PropertyOutputFormat.class); job.setOutputKeyClass(DocumentURI.class); job.setOutputValueClass(MarkLogicNode.class); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.LinkCountValue.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: LinkCountValue configFile outputDir"); System.exit(2);/* ww w . j a v a2 s . c o m*/ } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count value"); job.setJarByClass(LinkCountValue.class); job.setInputFormatClass(ValueInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.WikiLoader.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: WikiLoader configFile inputDir"); System.exit(2);/*from ww w .ja v a 2 s . c o m*/ } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "wiki loader"); job.setJarByClass(WikiLoader.class); job.setInputFormatClass(WikiInputFormat.class); job.setMapperClass(ArticleMapper.class); job.setMapOutputKeyClass(DocumentURI.class); job.setMapOutputValueClass(Text.class); job.setOutputFormatClass(ContentOutputFormat.class); ContentInputFormat.setInputPaths(job, new Path(otherArgs[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.ZipContentLoader.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: ZipContentLoader configFile inputDir"); System.exit(2);/*from w w w .j a va2s . com*/ } Job job = Job.getInstance(conf, "zip content loader"); job.setJarByClass(ZipContentLoader.class); job.setInputFormatClass(ZipContentInputFormat.class); job.setMapperClass(ZipContentMapper.class); job.setMapOutputKeyClass(DocumentURI.class); job.setMapOutputValueClass(Text.class); job.setOutputFormatClass(ContentOutputFormat.class); ZipContentInputFormat.setInputPaths(job, new Path(otherArgs[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.test.CustomQuery.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 1) { System.err.println("Usage: CustomQuery configFile"); System.exit(2);//from w w w . j a va 2 s . com } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "custom query"); job.setJarByClass(CustomQuery.class); job.setInputFormatClass(NodeInputFormat.class); job.setMapperClass(QueryMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); job.setReducerClass(QueryReducer.class); job.setOutputFormatClass(KeyValueOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.metamx.druid.indexer.DeterminePartitionsJob.java
License:Open Source License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file:com.metamx.druid.indexer.IndexGeneratorJob.java
License:Open Source License
public boolean run() { try {/*from w w w. ja v a 2 s . co m*/ Job job = new Job(new Configuration(), String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals())); job.getConfiguration().set("io.sort.record.percent", "0.23"); for (String propName : System.getProperties().stringPropertyNames()) { Configuration conf = job.getConfiguration(); if (propName.startsWith("hadoop.")) { conf.set(propName.substring("hadoop.".length()), System.getProperty(propName)); } } job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(IndexGeneratorMapper.class); job.setMapOutputValueClass(Text.class); SortableBytes.useSortableBytesAsMapOutputKey(job); job.setNumReduceTasks(Iterables.size(config.getAllBuckets())); job.setPartitionerClass(IndexGeneratorPartitioner.class); job.setReducerClass(IndexGeneratorReducer.class); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(IndexGeneratorOutputFormat.class); FileOutputFormat.setOutputPath(job, config.makeIntermediatePath()); config.addInputPaths(job); config.intoConfiguration(job); job.setJarByClass(IndexGeneratorJob.class); job.submit(); log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL()); boolean success = job.waitForCompletion(true); Counter invalidRowCount = job.getCounters() .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER); jobStats.setInvalidRowCount(invalidRowCount.getValue()); return success; } catch (Exception e) { throw new RuntimeException(e); } }