List of usage examples for org.apache.hadoop.mapreduce.Job.setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
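Before the per-project examples below, here is a minimal, self-contained driver sketch; all class names in it (WordLengthDriver, LengthMapper, MaxLengthReducer) are hypothetical and not taken from the source files that follow. It shows the typical place of setMapOutputValueClass: declaring the mapper's output value type when it differs from the value type the reducer finally emits. The call throws IllegalStateException if the job has already been submitted.

// Hypothetical sketch: the mapper emits (Text, IntWritable) while the reducer
// emits (Text, Text), so the intermediate value class must be declared explicitly.
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordLengthDriver {

    // Mapper emits (word, length of word).
    public static class LengthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final Text word = new Text();
        private final IntWritable length = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                String token = tokens.nextToken();
                word.set(token);
                length.set(token.length());
                context.write(word, length);
            }
        }
    }

    // Reducer emits (word, maximum length seen) as Text, so the final value
    // type differs from the map output value type.
    public static class MaxLengthReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int max = 0;
            for (IntWritable v : values) {
                max = Math.max(max, v.get());
            }
            context.write(key, new Text(Integer.toString(max)));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word length");
        job.setJarByClass(WordLengthDriver.class);

        job.setMapperClass(LengthMapper.class);
        job.setReducerClass(MaxLengthReducer.class);

        // Intermediate (map output) types; must be set before the job is submitted.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reducer output) types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

When the map output key/value classes match the final output classes, the two setMapOutput* calls can be omitted: Hadoop falls back to the classes set with setOutputKeyClass and setOutputValueClass.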
From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java
License:Apache License
/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
public int run(String[] args) throws Exception {
    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {
            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }
    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");
    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, is this really required? If we just
    // point to a file in S3, that should solve our problem.
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and the user
    // wants to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    /*
     * // Defines additional single text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     *
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     */

    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates a zero-sized default output, e.g. part-r-00000.
     * To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // job.setPartitionerClass(GoogleAdsCounterPartitioner.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // job.setNumReduceTasks(4);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // Set combiner.
    // job.setCombinerClass(GoogleAdsCounterReducer.class);

    // Set job name.
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {
        // Report the result already returned above; calling waitForCompletion()
        // a second time on a submitted job would throw IllegalStateException.
        LOG.info("Job completion status : " + job.isSuccessful());
        long endTime = System.currentTimeMillis();
        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());
        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());
        return 0;
    } else {
        return 1;
    }
}
From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.talis.labs.pagerank.mapreduce.CheckConvergence.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: CheckConvergence <input path> <output path>"); return -1; }/*from w w w .j av a2s .c o m*/ FileSystem.get(getConf()).delete(new Path(args[1]), true); Job job = new Job(getConf(), "CheckConvergence"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(CheckConvergenceMapper.class); job.setCombinerClass(CheckConvergenceReducer.class); job.setReducerClass(CheckConvergenceReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); job.setNumReduceTasks(1); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.talis.labs.pagerank.mapreduce.CheckingData.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: CheckingData <input path> <output path>"); return -1; }//from w ww. ja v a 2 s .c o m FileSystem.get(getConf()).delete(new Path(args[1]), true); Job job = new Job(getConf(), "CheckingData"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(CheckingDataMapper.class); job.setReducerClass(CheckingDataReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.talis.labs.pagerank.mapreduce.CountPages.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: CountPages <input path> <output path>"); return -1; }/*w w w .j ava 2 s.co m*/ FileSystem.get(getConf()).delete(new Path(args[1]), true); Job job = new Job(getConf(), "CountPages"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(CountPagesMapper.class); job.setCombinerClass(CountPagesReducer.class); job.setReducerClass(CountPagesReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setNumReduceTasks(1); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.talis.labs.pagerank.mapreduce.DanglingPages.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: DanglingPages <input path> <output path>"); return -1; }/*from ww w.jav a 2s . c om*/ FileSystem.get(getConf()).delete(new Path(args[1]), true); Job job = new Job(getConf(), "DanglingPages"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(DanglingPagesMapper.class); job.setCombinerClass(DanglingPagesReducer.class); job.setReducerClass(DanglingPagesReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); job.setNumReduceTasks(1); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.talis.labs.pagerank.mapreduce.InitializePageRanks.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { System.err.println("Usage: InitializePageRanks <input path> <output path> <number of pages>"); return -1; }/*from w w w . j a va 2s. c o m*/ Configuration conf = getConf(); conf.set("pagerank.count", args[2]); FileSystem.get(conf).delete(new Path(args[1]), true); Job job = new Job(conf, "InitializePageRanks"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(InitializePageRanksMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.talis.labs.pagerank.mapreduce.SortPageRanks.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: SortPageRanks <input path> <output path>"); return -1; }//from w w w . j av a 2s. co m FileSystem.get(getConf()).delete(new Path(args[1]), true); Job job = new Job(getConf(), "SortPageRanks"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(SortPageRanksMapper.class); job.setReducerClass(Reducer.class); // i.e. identity reducer job.setSortComparatorClass(DoubleWritableDecreasingComparator.class); job.setMapOutputKeyClass(DoubleWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); // TODO: inefficient, use InputSampler with v0.20.x return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.talis.labs.pagerank.mapreduce.UpdatePageRanks.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println(
                "Usage: UpdatePageRanks <input path> <output path> <number of pages> <dangling pages contribution>");
        return -1;
    }

    Configuration conf = getConf();
    conf.set("pagerank.count", args[2]);
    conf.set("pagerank.dangling", args[3]);

    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = new Job(conf, "UpdatePageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(UpdatePageRanksMapper.class);
    job.setReducerClass(UpdatePageRanksReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.tdunning.plume.local.lazy.MapRedExecutor.java
License:Apache License
/**
 * This method returns a Job instance out of a {@link MSCR} entity. It puts the Class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path stored in the job configuration
 * @param outputPath The output path of the MapRed job
 * @return A hadoop-executable MapRed Job
 *
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath)
        throws IOException {

    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);

    Job job = new Job(conf, "MSCR"); // TODO deprecation
    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);
    job.setJarByClass(MapRedExecutor.class);

    /**
     * Define multiple inputs
     */
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().length() < 1) {
                throw new IllegalArgumentException(
                        "Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }

    /**
     * Define multiple outputs
     */
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output)
                .getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class,
                    getHadoopType(rType.elementType()));
        }
    }

    /**
     * Define Reducer & Combiner
     */
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);
    job.setNumReduceTasks(1);

    return job;
}