List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
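A minimal, self-contained driver sketch of the pattern shared by the examples below (the WordCountDriver class and its nested mapper and reducer are illustrative placeholders, not taken from any of the listed projects): setMapOutputValueClass is needed whenever the mapper's output value type differs from the job's final output value type, and it must be called before the job is submitted, since the method throws IllegalStateException once the job has left its setup state.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    // Mapper emits <Text, IntWritable> pairs.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Reducer emits <Text, Text>, i.e. a different value type than the mapper.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new Text(Long.toString(sum)));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // The mapper's value type (IntWritable) differs from the job's final
        // output value type (Text), so it must be declared explicitly.
        // Both calls have to happen before submission; once the job is
        // running, setMapOutputValueClass throws IllegalStateException.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

If the map output types match the final output types, the call can be omitted; Hadoop then falls back to the types set with setOutputKeyClass and setOutputValueClass.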
From source file: diamondmapreduce.DiamondMapReduce.java
License: Apache License
int launchHamondAWS(String[] arguments) throws Exception {
    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make local Hamond dir
    awshamondsidefunctions.MakeHamondDir.make();

    //copy DIAMOND, query, reference from S3 to master local
    awshamondsidefunctions.CopyFromS3.copyFromS3(diamond, query, dataBase);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB("/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(dataBase).getName());

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, "/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(query).getName(),
            "/mnt/Hamond/" + new Path(dataBase).getName(), userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase);
    conf.set(OUTPUT, outPut);
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);
    job.setReducerClass(AWSDiamondReducer.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: distributed.hadoop.MapReduceJobConfig.java
License: Open Source License
/**
 * Apply the settings encapsulated in this config and return a Job object
 * ready for execution.
 *
 * @param jobName the name of the job
 * @param conf the Configuration object that will be wrapped in the Job
 * @param env environment variables
 * @return a configured Job object
 * @throws IOException if a problem occurs
 * @throws ClassNotFoundException if various classes are not found
 */
public Job configureForHadoop(String jobName, Configuration conf, Environment env)
        throws IOException, ClassNotFoundException {

    String jobTrackerPort = getJobTrackerPort();
    if (DistributedJobConfig.isEmpty(jobTrackerPort)) {
        jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN
                : AbstractHadoopJobConfig.DEFAULT_PORT;
    }
    String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort;
    if (DistributedJobConfig.isEmpty(jobTracker)) {
        System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ")
                + "set - running locally...");
    } else {
        jobTracker = environmentSubstitute(jobTracker, env);
        if (AbstractHadoopJobConfig.isHadoop2()) {
            conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker);
            conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS,
                    environmentSubstitute(getJobTrackerHost(), env) + ":8030");
        } else {
            conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker);
        }
    }
    System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ")
            + jobTracker);

    if (AbstractHadoopJobConfig.isHadoop2()) {
        // a few other properties needed to run against Yarn
        conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");
        conf.set("mapreduce.framework.name", "yarn");
    }

    if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
        conf.set(AbstractHadoopJobConfig.isHadoop2() ? HADOOP2_MAPRED_MAX_SPLIT_SIZE
                : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize());
    }

    // Do any user supplied properties here before creating the Job
    for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    m_hdfsConfig.configureForHadoop(conf, env);

    Job job = new Job(conf, jobName);

    String numMappers = getNumberOfMaps();
    if (!DistributedJobConfig.isEmpty(numMappers)) {
        numMappers = environmentSubstitute(numMappers, env);
        ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers));
    }

    // The number of map tasks that will be run simultaneously by a task tracker
    String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum();
    if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) {
        ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks);
    }

    String numReducers = getNumberOfReducers();
    if (!DistributedJobConfig.isEmpty(numReducers)) {
        numReducers = environmentSubstitute(numReducers, env);
        job.setNumReduceTasks(Integer.parseInt(numReducers));

        if (Integer.parseInt(numReducers) == 0) {
            System.err.println("Warning - no reducer class set. Configuring for a map only job");
        }
    } else {
        job.setNumReduceTasks(1);
    }

    String mapperClass = getMapperClass();
    if (DistributedJobConfig.isEmpty(mapperClass)) {
        throw new IOException("No mapper class specified!");
    }
    mapperClass = environmentSubstitute(mapperClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass);
    job.setMapperClass(mc);

    String reducerClass = getReducerClass();
    if (DistributedJobConfig.isEmpty(reducerClass) && Integer.parseInt(numReducers) > 0) {
        throw new IOException("No reducer class specified!");
    } else if (job.getNumReduceTasks() > 0) {
        reducerClass = environmentSubstitute(reducerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> rc = (Class<? extends Reducer>) Class.forName(reducerClass);
        job.setReducerClass(rc);
    }

    String combinerClass = getCombinerClass();
    if (!DistributedJobConfig.isEmpty(combinerClass)) {
        combinerClass = environmentSubstitute(combinerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass);
        job.setCombinerClass(cc);
    }

    String inputFormatClass = getInputFormatClass();
    if (DistributedJobConfig.isEmpty(inputFormatClass)) {
        throw new IOException("No input format class specified");
    }
    inputFormatClass = environmentSubstitute(inputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass);
    job.setInputFormatClass(ifc);

    String outputFormatClass = getOutputFormatClass();
    if (DistributedJobConfig.isEmpty(outputFormatClass)) {
        throw new IOException("No output format class specified");
    }
    outputFormatClass = environmentSubstitute(outputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends OutputFormat> ofc = (Class<? extends OutputFormat>) Class.forName(outputFormatClass);
    job.setOutputFormatClass(ofc);

    String mapOutputKeyClass = getMapOutputKeyClass();
    if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) {
        throw new IOException("No map output key class defined");
    }
    mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env);
    Class mokc = Class.forName(mapOutputKeyClass);
    job.setMapOutputKeyClass(mokc);

    String mapOutputValueClass = getMapOutputValueClass();
    if (DistributedJobConfig.isEmpty(mapOutputValueClass)) {
        throw new IOException("No map output value class defined");
    }
    mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env);
    Class movc = Class.forName(mapOutputValueClass);
    job.setMapOutputValueClass(movc);

    String outputKeyClass = getOutputKeyClass();
    if (DistributedJobConfig.isEmpty(outputKeyClass)) {
        throw new IOException("No output key class defined");
    }
    outputKeyClass = environmentSubstitute(outputKeyClass, env);
    Class okc = Class.forName(outputKeyClass);
    job.setOutputKeyClass(okc);

    String outputValueClass = getOutputValueClass();
    if (DistributedJobConfig.isEmpty(outputValueClass)) {
        throw new IOException("No output value class defined");
    }
    outputValueClass = environmentSubstitute(outputValueClass, env);
    Class ovc = Class.forName(outputValueClass);
    job.setOutputValueClass(ovc);

    String inputPaths = getInputPaths();
    // don't complain if there aren't any as inputs such as HBASE
    // require other properties to be set
    if (!DistributedJobConfig.isEmpty(inputPaths)) {
        inputPaths = environmentSubstitute(inputPaths, env);
        FileInputFormat.setInputPaths(job, inputPaths);
    }

    String outputPath = getOutputPath();
    if (DistributedJobConfig.isEmpty(outputPath)) {
        throw new IOException("No output path specified");
    }
    outputPath = environmentSubstitute(outputPath, env);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}
From source file: dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java
License: Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    Job job = new Job(configuration, "ARC Header Extractor");
    job.setJarByClass(ARCHeaderExtractorMR.class);

    job.setMapperClass(ARCHeaderExtractorMapper.class);
    job.setCombinerClass(ARCHeaderExtractorReducer.class);
    job.setReducerClass(ARCHeaderExtractorReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    int n = args.length;
    if (n == 0 || n > 2) {
        System.err.println(
                "Not enough arguments. input dir and output dir mandatory. Only " + n + " were supplied.");
        System.exit(0);
    }

    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file: edu.berkeley.chukwa_xtrace.XtrExtract.java
License: Apache License
@Override
public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());
    extractor.setMapperClass(MapClass.class);
    extractor.setReducerClass(Reduce.class);
    extractor.setJobName("x-trace reconstructor");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(Text.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);
    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));

    System.out.println("looks OK. Submitting.");
    extractor.submit();
    // extractor.waitForCompletion(false);
    return 0;
}
From source file: edu.berkeley.chukwa_xtrace.XtrIndex.java
License: Apache License
@Override
public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());
    extractor.setMapperClass(MapClass.class);
    //no reduce, just identity
    extractor.setJobName("x-trace indexer");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(TextArrayWritable.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);
    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));

    System.out.println("looks OK. Submitting.");
    extractor.submit();
    // extractor.waitForCompletion(false);
    return 0;
}
From source file: edu.bigdata.training.serialization.UserHistory.java
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "postHistory");
    job.setJarByClass(UserHistory.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    MultipleInputs.addInputPath(job, new Path("input/posts/user_info.txt"), TextInputFormat.class,
            UserCityMapper.class);
    MultipleInputs.addInputPath(job, new Path("input/posts/user_posts.txt"), TextInputFormat.class,
            UserPostsMapper.class);

    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, UserPostSummary.getClassSchema());
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);

    Path outPath = new Path("output/user/posts");
    FileOutputFormat.setOutputPath(job, outPath);
    job.setReducerClass(UserPostHistory.class);
    //outPath.getFileSystem(job.getConfiguration()).delete(outPath, true);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file: edu.columbia.hs2807.Sentiment.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "sentiment");

    job.setJarByClass(Sentiment.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LongArrayWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: edu.gslis.ts.hadoop.ThriftBulkLoader.java
License: Apache License
public int run(String[] args) throws Exception {
    String tableName = args[0];
    String inputPath = args[1];
    String outputPath = args[2];
    Path topicsFile = new Path(args[3]);
    Path vocabFile = new Path(args[4]);
    Path dateBinFile = new Path(args[5]);

    Configuration config = getConf();
    config.set("hbase.table.name", tableName);
    HBaseConfiguration.addHbaseResources(config);

    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
    job.setJobName("Bulk Loading HBase Table::" + tableName);
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapperClass(ThriftFilterMapper.class);

    Path output = new Path(outputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapOutputValueClass(Put.class);

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
    job.addCacheFile(dateBinFile.toUri());

    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().setClass("mapred.map.output.compression.codec",
            org.apache.hadoop.io.compress.SnappyCodec.class,
            org.apache.hadoop.io.compress.CompressionCodec.class);
    job.getConfiguration().set("hfile.compression", Compression.Algorithm.SNAPPY.getName());

    //RegionLocator regionLocator = conn.getRegionLocator(tableName);
    //HFileOutputFormat2.configureIncrementalLoad(job, new HTable(config,tableName));

    Connection con = ConnectionFactory.createConnection(config);
    TableName htableName = TableName.valueOf(tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, con.getTable(htableName), con.getRegionLocator(htableName));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        // Couldn't find a better way to do this. The LoadIncrementalHFiles
        // seems to want 777 permissions on the output directory.
        try {
            Runtime rt = Runtime.getRuntime();
            rt.exec("hadoop fs -chmod -R 777 " + output);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /*
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
        HTable htable = new HTable(config, tableName);
        loader.doBulkLoad(new Path(outputPath), htable);
        */
    } else {
        throw new IOException("error with job");
    }

    return 0;

    // -
    /*
    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    job.setInputFormatClass(ThriftFileInputFormat.class);

    //HFileOutputFormat2.configureIncrementalLoad(job, htable);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());

    job.setMapperClass(ThriftFilterMapper.class);

    boolean b = job.waitForCompletion(true);
    if (!b) {
        throw new IOException("error with job");
    }

    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
    loader.doBulkLoad(new Path(outputPath), htable);

    return 0;
    */
}
From source file: edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();

    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();

    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
From source file: edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    // FileSystem fs = FileSystem.get(getConf());
    // Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    // BufferedWriter writer = new BufferedWriter(
    //     new OutputStreamWriter(fs.create(dictionaryPath, true)));
    // BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    // String line = null;
    // while ((line = reader.readLine()) != null) {
    //     writer.write(line + "\n");
    // }
    // writer.close();
    //
    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage parallely.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    // Path dictionaryPath = new Path(dictionaryFile);
    // DistributedCache.setCacheFiles(new URI[] {dictionaryPath.toUri()}, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}