Usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
From source file:edu.isi.mavuno.app.nlp.TratzParse.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf); // optional parameter that allows the parsed documents to be output in text format String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf); boolean textOutputFormat = false; if (textOutput != null && Boolean.parseBoolean(textOutput)) { textOutputFormat = true;// w ww. jav a 2s . c o m } sLogger.info("Tool name: TratzParse"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("TratzParse"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); // output format -- either plain text or sequencefile (default) if (textOutputFormat) { job.setOutputFormatClass(TextOutputFormat.class); } else { job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); } job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(TratzParsedDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(TratzParsedDocument.class); job.setMapperClass(MyMapper.class); job.setJarByClass(TratzParse.class); // no reducers needed job.setNumReduceTasks(0); // run job job.waitForCompletion(true); // print job statistics Counters counters = job.getCounters(); sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue()); sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue()); sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue()); sLogger.info(" - Total dropped sentences: " + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue()); sLogger.info( " - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue()); sLogger.info( " - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue()); sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue()); sLogger.info(" - Total named entity tagging time (ms): " + counters.findCounter(StatCounters.NETAG_TIME).getValue()); sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue()); return 0; }
From source file:edu.isi.mavuno.app.util.ExamplesToSequenceFile.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String contextPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.OutputPath", conf); sLogger.info("Tool name: ExamplesToSequenceFile"); sLogger.info(" - Context path: " + contextPath); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("ExamplesToSequenceFile"); FileInputFormat.addInputPath(job, new Path(contextPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);// ww w .ja v a2 s . com return 0; }
From source file:edu.isi.mavuno.app.util.SequenceFileToText.java
License:Apache License
/**
 * Configures and runs the map-only "SequenceFileToText" job, which reads a sequence
 * file and writes {@code Text}/{@code Text} pairs through {@code TextOutputFormat}.
 *
 * <p>Required parameters (read through {@code MavunoUtils.getRequiredParam}):
 * {@code Mavuno.SequenceFileToText.InputPath} and
 * {@code Mavuno.SequenceFileToText.OutputPath}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws IOException if job setup or execution fails with an I/O error
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.SequenceFileToText.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.SequenceFileToText.OutputPath", conf);

    sLogger.info("Tool name: SequenceFileToText");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("SequenceFileToText");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);

    // Map-only job: no reduce phase.
    job.setNumReduceTasks(0);

    // Report job failure to the caller instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.extract.CombineGlobalStats.java
License:Apache License
/**
 * Configures and runs the "CombineGlobalStats" job, which reads the numbered split
 * directories {@code <InputPath>/0 .. <InputPath>/(TotalSplits-1)} and writes a single
 * block-compressed sequence file of {@code ContextPatternWritable} keys and
 * {@code ContextPatternStatsWritable} values.
 *
 * <p>Required parameters (read through {@code MavunoUtils.getRequiredParam}):
 * {@code Mavuno.CombineGlobalStats.InputPath} and
 * {@code Mavuno.CombineGlobalStats.OutputPath}. The optional integer
 * {@code Mavuno.CombineGlobalStats.TotalSplits} defaults to 1.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws IOException if job setup or execution fails with an I/O error
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.OutputPath", conf);
    int numSplits = conf.getInt("Mavuno.CombineGlobalStats.TotalSplits", 1);

    sLogger.info("Tool name: CombineGlobalStats");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Number of splits: " + numSplits);

    Job job = new Job(conf);
    job.setJobName("CombineGlobalStats");

    // Each split lives in its own numbered sub-directory of the input path.
    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(inputPath + "/" + split));
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    // The reducer class is also installed as the combiner.
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.extract.CombineSplits.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf); String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf); String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase(); int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1); String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf); sLogger.info("Tool name: CombineSplits"); sLogger.info(" - Examples path: " + examplesPath); sLogger.info(" - Example stats path: " + exampleStatsPath); sLogger.info(" - Split key: " + splitKey); sLogger.info(" - Total splits: " + numSplits); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("CombineSplits"); for (int split = 0; split < numSplits; split++) { FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split)); }/*from w ww . ja v a 2 s.c o m*/ if (MavunoUtils.pathExists(conf, exampleStatsPath)) { FileInputFormat.addInputPath(job, new Path(exampleStatsPath)); } FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); if ("pattern".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else if ("context".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); } else if ("pattern+context".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else { throw new RuntimeException("Invalid SplitKey in CombineSplits! 
-- " + splitKey); } job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.extract.Extract.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf); String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase(); int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf)); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf); sLogger.info("Tool name: Extract"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor arguments: " + extractorArgs); sLogger.info(" - Extractor target: " + extractorTarget); sLogger.info(" - Min context matches: " + minContextMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("Extract"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); if ("pattern".equals(extractorTarget)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class); } else if ("context".equals(extractorTarget)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class); } else {// ww w . ja v a2s . c o m throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget); } job.setMapOutputKeyClass(ContextPatternWritable.class); job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.extract.ExtractGlobalStats.java
License:Apache License
/**
 * Runs the "ExtractGlobalStats" pipeline in three stages:
 * <ol>
 *   <li>runs {@code Split} on the input, writing to {@code <OutputPath>/../split};</li>
 *   <li>for every file in that directory whose name ends in {@code .examples}, runs one
 *       "ExtractGlobalStats" job over the corpus, writing to
 *       {@code <OutputPath>/../split/<n>};</li>
 *   <li>runs {@code CombineGlobalStats} over those numbered outputs into
 *       {@code OutputPath}, then removes the temporary split directory.</li>
 * </ol>
 *
 * <p>Required parameters (read through {@code MavunoUtils.getRequiredParam}), all
 * under the {@code Mavuno.ExtractGlobalStats.} prefix: {@code InputPath},
 * {@code CorpusPath}, {@code CorpusClass}, {@code ExtractorClass},
 * {@code ExtractorArgs}, {@code ExtractorTarget} (lower-cased) and {@code OutputPath}.
 * The method also sets {@code Mavuno.Split.*}, {@code Mavuno.ExtractGlobalStats.ExamplesPath}
 * and {@code Mavuno.CombineGlobalStats.*} entries on the shared configuration.
 *
 * <p>All stages are still executed in the original order even if an earlier one
 * reports failure; only the returned status reflects the failure.
 *
 * @return 0 if every stage reported success, 1 otherwise
 * @throws ClassNotFoundException if the corpus input format class cannot be loaded
 * @throws InterruptedException if waiting for a job is interrupted
 * @throws IOException if job setup or execution fails with an I/O error
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // Temporary working directory, a sibling of the final output path.
    String splitDir = outputPath + "/../split";

    // Stage 1: split examples.
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", splitDir);
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    boolean succeeded = new Split(conf).run() == 0;

    // Stage 2: one extraction job per ".examples" file produced by the split.
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, splitDir);

    int split = 0;
    for (FileStatus file : files) {
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        // Fixed label: this line logs the extractor arguments, not the class.
        sLogger.info(" - Extractor arguments: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(splitDir + "/" + split));

        // The input format is chosen at runtime from the configured class name.
        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);

        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Record the failure but keep going, matching the original control flow.
        succeeded &= job.waitForCompletion(true);

        split++;
    }

    // Stage 3: combine the per-split outputs, then drop the temporary directory.
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", splitDir + "/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    succeeded &= new CombineGlobalStats(conf).run() == 0;

    MavunoUtils.removeDirectory(conf, splitDir);

    return succeeded ? 0 : 1;
}
From source file:edu.isi.mavuno.extract.Split.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.Split.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Split.OutputPath", conf); String splitKey = MavunoUtils.getRequiredParam("Mavuno.Split.SplitKey", conf); sLogger.info("Tool name: Split"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Output path: " + outputPath); sLogger.info(" - Split key: " + splitKey); Job job = new Job(conf); job.setJobName("Split"); MavunoUtils.recursivelyAddInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1);//from ww w .jav a 2 s. c o m job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.score.CombineScores.java
License:Apache License
/**
 * Configures and runs the "CombineScores" job, which reads text input and writes
 * {@code Text}/{@code DoubleWritable} pairs through {@code TextOutputFormat}.
 *
 * <p>Required parameters (read through {@code MavunoUtils.getRequiredParam}):
 * {@code Mavuno.CombineScores.InputPath} and {@code Mavuno.CombineScores.OutputPath}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws IOException if job setup or execution fails with an I/O error
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineScores.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineScores.OutputPath", conf);

    sLogger.info("Tool name: CombineScores");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("CombineScores");

    MavunoUtils.recursivelyAddInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.GetTopResults.java
License:Apache License
/**
 * Configures and runs the "GetTopResults" job, which reads text input and writes
 * {@code ContextPatternWritable}/{@code DoubleWritable} pairs as either text
 * (the default) or a sequence file.
 *
 * <p>Required parameters (read through {@code MavunoUtils.getRequiredParam}):
 * {@code Mavuno.GetTopResults.InputPath}, {@code Mavuno.GetTopResults.OutputPath}
 * and {@code Mavuno.GetTopResults.NumResults} (parsed as an integer; in this method
 * it is only validated and logged). The optional boolean
 * {@code Mavuno.GetTopResults.SequenceFileOutputFormat} defaults to {@code false}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws NumberFormatException if {@code Mavuno.GetTopResults.NumResults} is not an integer
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws IOException if job setup or execution fails with an I/O error
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.OutputPath", conf);
    int numResults = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.GetTopResults.NumResults", conf));
    boolean sequenceFileOutputFormat = conf.getBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);

    sLogger.info("Tool name: GetTopResults");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Number of results: " + numResults);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("GetTopResults");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);

    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    // Output format -- sequence file on request, plain text otherwise.
    if (sequenceFileOutputFormat) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormatClass(TextOutputFormat.class);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}