Java tutorial
/** * Ivory: A Hadoop toolkit for Web-scale information retrieval * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package ivory.ptc.driver; import ivory.ptc.data.PseudoJudgments; import ivory.ptc.data.PseudoQuery; import java.io.IOException; import java.net.URI; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.lib.IdentityMapper; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Level; import org.apache.log4j.Logger; import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping; /** * Driver that formats the pseudo judgments and outputs an XML file * that can be used directly for training. * * @author Nima Asadi */ @SuppressWarnings("deprecation") public class XMLFormatJudgments extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(XMLFormatJudgments.class); static { LOG.setLevel(Level.INFO); } public static class MyReducer extends MapReduceBase implements Reducer<PseudoQuery, PseudoJudgments, Text, Text> { private static final Text keyOut = new Text(); private static final Text valueOut = new Text(""); private static final ClueWarcDocnoMapping mDocnoMapping = new ClueWarcDocnoMapping(); private static PseudoJudgments nextJudgeJudgments; private static int id = 1; @Override public void configure(JobConf job) { Path[] localFiles; try { localFiles = DistributedCache.getLocalCacheFiles(job); } catch (IOException e) { throw new RuntimeException("Local cache files not read properly.", e); } try { mDocnoMapping.loadMapping(localFiles[0], FileSystem.getLocal(job)); } catch (Exception e) { throw new RuntimeException("Error initializing DocnoMapping!", e); } } @Override public void reduce(PseudoQuery key, Iterator<PseudoJudgments> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { while (values.hasNext()) { nextJudgeJudgments = values.next(); for (int i = 0; i < nextJudgeJudgments.size(); i++) { keyOut.set(id + " 0 " + mDocnoMapping.getDocid(nextJudgeJudgments.getDocno(i)) + " 1"); output.collect(keyOut, valueOut); } } id++; } } private static int printUsage() { System.out.println("usage: [input-path] [output-path] [docno-mapping]"); ToolRunner.printGenericCommandUsage(System.out); return -1; } @Override public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } JobConf conf = new JobConf(getConf(), XMLFormatJudgments.class); // Command line arguments String inPath = args[0]; String outPath = args[1]; String docnoMapping = args[2]; Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = 1; int reduceTasks = 1; conf.setJobName("FormatPseudoJudgments"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); DistributedCache.addCacheFile(new URI(docnoMapping), conf); FileSystem.get(conf).delete(outputPath); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(PseudoQuery.class); conf.setMapOutputValueClass(PseudoJudgments.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new XMLFormatJudgments(), args); System.exit(res); } }