List of usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
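setMapOutputValueClass declares the value class of the intermediate records the mappers emit to the shuffle. It only needs to be called when that class differs from the job's final output value class (set with setOutputValueClass), and it throws IllegalStateException if the job has already been submitted. Before the project-specific examples, here is a minimal, self-contained sketch of that situation; the class names (DistinctValueCount, PairMapper, DistinctReducer) are illustrative, not taken from any of the sources below.

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class DistinctValueCount {

  // Emits (field[0], field[1]) for each tab-separated input line.
  public static class PairMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] fields = value.toString().split("\t", 2);
      if (fields.length == 2) {
        context.write(new Text(fields[0]), new Text(fields[1]));
      }
    }
  }

  // Counts the distinct values seen for each key. The reduce output value type
  // (IntWritable) differs from the map output value type (Text), which is why
  // the driver must call setMapOutputValueClass explicitly.
  public static class DistinctReducer extends Reducer<Text, Text, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      Set<String> distinct = new HashSet<>();
      for (Text v : values) {
        distinct.add(v.toString());
      }
      context.write(key, new IntWritable(distinct.size()));
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "distinct-value-count");
    job.setJarByClass(DistinctValueCount.class);

    job.setMapperClass(PairMapper.class);
    job.setReducerClass(DistinctReducer.class);

    // Intermediate (shuffle) types: Text keys and Text values.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Final output types written by the reducer.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}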
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the ids
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 *          dimension of the output vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
    Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
    int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setInt(PartialVectorMerger.DIMENSION, dimension);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
  conf.setInt(MAX_NGRAMS, maxNGramSize);
  DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

  Job job = new Job(conf);
  job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
      + ", dictionary-file: " + dictionaryFilePath);
  job.setJarByClass(DictionaryVectorizer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(StringTuple.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(numReducers);

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
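Note the design choice here: the identity Mapper emits StringTuple values while TFPartialVectorReducer writes VectorWritable values, so the intermediate value class has to be declared explicitly with setMapOutputValueClass; setOutputKeyClass/setOutputValueClass describe only what the reducer writes.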
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the ids
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 *          dimension of the output vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
    Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
    int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setInt(PartialVectorMerger.DIMENSION, dimension);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
  conf.setInt(MAX_NGRAMS, maxNGramSize);
  DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

  Job job = new Job(conf);
  job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
      + ", dictionary-file: " + dictionaryFilePath);
  job.setJarByClass(FixDictionaryVectorizer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(StringTuple.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(numReducers);

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:com.ery.hadoop.mrddx.hbase.HbaseInputFormat.java
License:Apache License
@Override
public void handle(Job conf) throws Exception {
  // validate the HBase input configuration
  HbaseConfiguration hconf = new HbaseConfiguration(conf.getConfiguration(),
      HbaseConfiguration.FLAG_HBASE_INPUT);
  String tableName = hconf.getInputTableName();
  if (null == tableName || tableName.trim().length() <= 0) {
    String meg = "[MR ERROR]HBase input table <" + HbaseConfiguration.INPUT_TABLE + "> is not set.";
    MRLog.error(LOG, meg);
    throw new Exception(meg);
  }

  // validate the input field names
  String inputFieldName[] = hconf.getInputFieldNames();
  this.vParamSrcTargetFieldNames(hconf, inputFieldName);

  if (hconf.getInputIsCombiner()) {
    conf.setCombinerClass(DBGroupReducer.class);
  }

  // validate the TIMERANGE query parameter
  String timerange[] = hconf.getInputHBaseQueryTimerange();
  this.vParamQueryTimeRange(timerange);

  // validate the startrow query parameter
  String startrow = hconf.getInputHBaseQueryStartRow();
  if (null == startrow || startrow.trim().length() <= 0) {
    MRLog.warn(LOG, "[MR WARN]startrow <" + HbaseConfiguration.INPUT_QUERY_STARTROW + "> is not set.");
  }

  // validate the stoprow query parameter
  String stoprow = hconf.getInputHBaseQueryStopRow();
  if (null == stoprow || stoprow.trim().length() <= 0) {
    MRLog.warn(LOG, "[MR WARN]stoprow <" + HbaseConfiguration.INPUT_QUERY_STOPROW + "> is not set.");
  }

  // validate the timestamp query parameter
  long timestamp = hconf.getInputHBaseQueryTimestamp();
  if (timestamp <= -1) {
    MRLog.warn(LOG, "[MR WARN]timestamp <" + HbaseConfiguration.INPUT_QUERY_TIMESTAMP + "> is not set.");
  }

  // validate the filters query parameter
  String filters = hconf.getInputHBaseQueryFilters();
  if (null == filters || filters.length() <= 0) {
    MRLog.warn(LOG, "[MR WARN]filters <" + HbaseConfiguration.INPUT_QUERY_FILTER + "> are not set.");
  }

  // validate the familyColumns query parameter
  String familyColumns[] = hconf.getInputHBaseQueryFamilyColumns();
  if (null == familyColumns || familyColumns.length <= 0) {
    MRLog.warn(LOG, "[MR WARN]familyColumns <" + HbaseConfiguration.INPUT_QUERY_FAMILYCOLUMNS + "> are not set.");
  }
  if (null != familyColumns) {
    for (String tmp : familyColumns) {
      if (tmp.split(":").length != 2) {
        String meg = "[MR ERROR]familyColumns <" + HbaseConfiguration.INPUT_QUERY_FAMILYCOLUMNS
            + "> must be in family:column format.";
        MRLog.error(LOG, meg);
        throw new Exception(meg);
      }
    }
  }

  // validate the familys query parameter
  String familys[] = hconf.getInputHBaseQueryFamilys();
  if (null == familys || familys.length <= 0) {
    MRLog.warn(LOG, "[MR WARN]familys <" + HbaseConfiguration.INPUT_QUERY_FAMILYS + "> are not set.");
  }

  conf.setInputFormatClass(HbaseInputFormat.class);
  hconf.setInputClass(DBRecord.class);

  // derive the number of map/reduce tasks from the table's HRegion count
  int taskNumber = HbaseInputFormat.getTableHRegionInfoCount(conf.getConfiguration(), startrow, stoprow);
  int reduceTasks = taskNumber;
  if (hconf.getInputMapEnd()) {
    reduceTasks = 0;
  }

  // hconf.setNumMapTasks(taskNumber);
  hconf.setNumReduceTasks(reduceTasks);
  hconf.setInputClass(DBRecord.class);
  conf.setMapperClass(DBMapper.class);
  conf.setMapOutputKeyClass(DBRecord.class);
  conf.setMapOutputValueClass(DBRecord.class);
  if (hconf.getInputIsCombiner()) {
    conf.setCombinerClass(DBGroupReducer.class);
  }
}
From source file:com.example.bigtable.sample.CellCounter.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
  String tableName = args[0];
  Path outputDir = new Path(args[1]);
  String reportSeparatorString = (args.length > 2) ? args[2] : ":";
  conf.set("ReportSeparator", reportSeparatorString);
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJarByClass(CellCounter.class);
  Scan scan = getConfiguredScanForJob(conf, args);
  TableMapReduceUtil.initTableMapperJob(tableName, scan, CellCounterMapper.class,
      ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(1);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setReducerClass(IntSumReducer.class);
  return job;
}
From source file:com.example.bigtable.sample.WordCountHBase.java
License:Apache License
public static void main(String[] args) throws Exception {
  Configuration conf = HBaseConfiguration.create();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length < 2) {
    System.err.println("Usage: wordcount-hbase <in> [<in>...] <table-name>");
    System.exit(2);
  }

  Job job = Job.getInstance(conf, "word count");

  for (int i = 0; i < otherArgs.length - 1; ++i) {
    FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
  }

  TableName tableName = TableName.valueOf(otherArgs[otherArgs.length - 1]);
  try {
    CreateTable.createTable(tableName, conf, Collections.singletonList(Bytes.toString(COLUMN_FAMILY)));
  } catch (Exception e) {
    LOG.error("Could not create the table.", e);
  }

  job.setJarByClass(WordCountHBase.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setMapOutputValueClass(IntWritable.class);

  TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), MyTableReducer.class, job);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.examples.ch03.ParseWeblogs_Ex_1.java
public int run(String[] args) throws Exception {
  Path inputPath = new Path("apache_clf.txt");
  Path outputPath = new Path("output");
  Configuration conf = getConf();
  Job weblogJob = Job.getInstance(conf);
  weblogJob.setJobName("Weblog Transformer");
  weblogJob.setJarByClass(getClass());
  weblogJob.setNumReduceTasks(0);
  weblogJob.setMapperClass(CLFMapper_Ex_1.class);
  weblogJob.setMapOutputKeyClass(Text.class);
  weblogJob.setMapOutputValueClass(Text.class);
  weblogJob.setOutputKeyClass(Text.class);
  weblogJob.setOutputValueClass(Text.class);
  weblogJob.setInputFormatClass(TextInputFormat.class);
  weblogJob.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(weblogJob, inputPath);
  FileOutputFormat.setOutputPath(weblogJob, outputPath);

  if (weblogJob.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
From source file:com.facebook.hiveio.mapreduce.output.WritingTool.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  handleCommandLine(args, conf);
  HadoopUtils.setMapAttempts(conf, 1);
  adjustConfigurationForHive(conf);
  HiveTools.setupJob(conf);

  Job job = new Job(conf, "hive-io-writing");
  if (job.getJar() == null) {
    job.setJarByClass(getClass());
  }
  job.setMapperClass(SampleMapper.class);
  job.setInputFormatClass(SampleInputFormat.class);
  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(HiveWritableRecord.class);
  job.setOutputFormatClass(SampleOutputFormat.class);

  job.setNumReduceTasks(0);

  job.submit();
  return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.fanlehai.hadoop.serialize.avro.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
  }

  FileSystem.get(new Configuration()).delete(new Path(args[1]), true);

  Job job = Job.getInstance(super.getConf(), "AvroWordCount");
  job.setJarByClass(MapReduceAvroWordCount.class);
  job.setJobName("AvroWordCount");

  // We call setOutputSchema first so we can override the configuration
  // parameters it sets
  AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
  job.setOutputValueClass(NullWritable.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setSortComparatorClass(Text.Comparator.class);

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  return job.waitForCompletion(true) ? 1 : 0;
}
From source file:com.fanlehai.hadoop.serialize.avro.MapReduceColorCount.java
License:Apache License
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
  }

  FileSystem.get(new Configuration()).delete(new Path(args[1]), true);

  Job job = Job.getInstance(super.getConf(), "MapReduceAvroWordCount");
  job.setJarByClass(MapReduceColorCount.class);
  job.setJobName("Color Count");

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapperClass(ColorCountMapper.class);
  AvroJob.setInputKeySchema(job, User.getClassSchema());
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);

  job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
  job.setReducerClass(ColorCountReducer.class);
  AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
  AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

  return job.waitForCompletion(true) ? 1 : 0;
}
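Note that even in these Avro examples the shuffle still moves plain Writables: the Avro input and output schemas are configured through AvroJob, while the intermediate key/value types exchanged between ColorCountMapper and ColorCountReducer are declared with setMapOutputKeyClass(Text.class) and setMapOutputValueClass(IntWritable.class).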
From source file:com.flipkart.fdp.migration.distcp.core.MirrorDistCPDriver.java
License:Apache License
private Job createJob(Configuration configuration) throws Exception {
  System.out.println("Initializing BlueShift v 2.0...");
  System.out.println("Configuration: " + dcmConfig.toString());

  Job job = Job.getInstance(configuration, "BlueShift v 2.0 - " + dcmConfig.getBatchName());

  job.setJarByClass(MirrorDistCPDriver.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setMapperClass(MirrorMapper.class);
  job.setReducerClass(MirrorReducer.class);

  job.setInputFormatClass(MirrorFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileOutputFormat.setOutputPath(job, stateManager.getReportPath());

  job.setNumReduceTasks(configuration.getInt("mapreduce.reduce.tasks", 1));

  System.out.println("Job Initialization Complete, The status of the Mirror job will be written to: "
      + stateManager.getReportPath());
  return job;
}