List of usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
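setMapOutputValueClass sets the value class for the job's intermediate (map output) data; it is needed when the map output value type differs from the final output value type configured with setOutputValueClass, and it throws IllegalStateException if the job has already been submitted. Before the real-world examples, here is a minimal, self-contained sketch of the typical pattern; the class name, input/output paths, and the use of the stock TokenCounterMapper and IntSumReducer helpers are illustrative assumptions, not taken from the source files listed below.

// Minimal word-count style sketch; names and paths are illustrative only.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class MapOutputValueClassExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputValueClass example");
        job.setJarByClass(MapOutputValueClassExample.class);

        // Stock library mapper/reducer: the mapper emits (Text, IntWritable) pairs.
        job.setMapperClass(TokenCounterMapper.class);
        job.setReducerClass(IntSumReducer.class);

        // Intermediate (map output) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reduce output) key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The examples that follow use the same pattern, usually with Text or VectorWritable as the map output value class.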
From source file:edu.indiana.d2i.htrc.io.ParallelDataCopyJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.valueOf(args[4]);

    logger.info("ParallelDataCopyJob ");
    logger.info(" - input: " + inputPath); // id list
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);

    Job job = new Job(getConf(), "Copy data from HTRC data storage parallely.");
    job.setJarByClass(ParallelDataCopyJob.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("ParallelDataCopyJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromHDFSRawText.java
License:Apache License
private void createSparseVector(Path inputPath, Path outputPath) throws IOException, ClassNotFoundException, InterruptedException { Job job = new Job(getConf(), "Create sparse vector from plain text in HDFS."); job.setJarByClass(SparseVectorsFromHDFSRawText.class); job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit); // no speculation job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false); // maximum #id per split job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit); // dictionary and lucene job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VectorWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class); job.setNumReduceTasks(0);/*from w w w. j av a2 s. c o m*/ FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.waitForCompletion(true); }
From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromLucene.java
License:Apache License
private void createSparseVector(Path inputPath, Path outputPath) throws IOException, ClassNotFoundException, InterruptedException { Job job = new Job(getConf(), "Create sparse vector from Lucene."); job.setJarByClass(SparseVectorsFromLucene.class); job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit); // no speculation job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false); // maximum #id per split job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit); // dictionary and lucene // job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir); job.getConfiguration().set("htrc.solr.dictionary", dictDir); job.getConfiguration().set("htrc.lucene.index.path", indexLoc); job.setInputFormatClass(LuceneIDFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VectorWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class); job.setNumReduceTasks(0);/* w w w . j a va2 s . c o m*/ FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.waitForCompletion(true); }
From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromSolr.java
License:Apache License
private void createSparseVector(Path inputPath, Path outputPath) throws IOException, ClassNotFoundException, InterruptedException { Job job = new Job(getConf(), "Create sparse vector from Solr."); job.setJarByClass(SparseVectorsFromSolr.class); job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit); // no speculation job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false); // maximum #id per split job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit); // dictionary and solr // job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir); job.getConfiguration().set("htrc.solr.dictionary", dictDir); job.getConfiguration().set(HTRCConstants.SOLR_MAIN_URL, solrEPR); job.setInputFormatClass(SolrIDFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VectorWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class); job.setNumReduceTasks(0);//from w ww. java2 s . c o m FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.waitForCompletion(true); }
From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License:Apache License
private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SparseVectorsToMemcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(idListDir));

    job.waitForCompletion(true);
}
From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java
License:Apache License
private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SVFromHDFS2Memcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(vecDir));

    job.waitForCompletion(true);
}
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
/**
 * Run the job using supplied arguments
 *
 * @param input
 *            the directory pathname for input points
 * @param clustersIn
 *            the directory pathname for input clusters
 * @param clustersOut
 *            the directory pathname for output clusters
 * @param measureClass
 *            the classname of the DistanceMeasure
 * @param convergenceDelta
 *            the convergence delta value
 *
 * @return true if the iteration successfully runs
 */
private static boolean runIteration(Configuration conf, Path input, Path clustersOut, String measureClass,
        String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);

    Job job = new Job(conf, "KMeans Driver running runIteration ");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ClusterObservations.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Cluster.class);

    // job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(MemIDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapperClass(MemKMeansMapper.class);
    job.setCombinerClass(KMeansCombiner.class); // ??
    job.setReducerClass(MemKMeansReducer.class);

    FileInputFormat.addInputPath(job, input); // input is id list
    FileOutputFormat.setOutputPath(job, clustersOut);

    job.setJarByClass(MemCachedKMeansDriver.class);
    HadoopUtil.delete(conf, clustersOut);

    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("K-Means Iteration failed processing ");
    }

    return isConverged(conf);
}
From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
    }

    String input = args[0];
    String output = args[1];
    int maxCluster = Integer.valueOf(args[2]);

    logger.info("StreamingKmeansDriver ");
    logger.info(" - input: " + input);
    logger.info(" - output: " + output);
    logger.info(" - maxCluster: " + maxCluster);

    // set job
    Job job = new Job(getConf(), "Streaming KMeans");
    job.setJarByClass(StreamingKMeansDriver.class);
    StreamingKMeansConfigHelper(job.getConfiguration(), input, maxCluster);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamingKMeansCluster.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setMapperClass(StreamingKMeansMapper.class);
    job.setReducerClass(StreamingKMeansReducer.class);

    job.waitForCompletion(true);
    return 0;
}