// Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package goraci;

import goraci.generated.CINode;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.query.Query;
import org.apache.gora.store.DataStore;
import org.apache.gora.store.DataStoreFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * A Map Reduce job that verifies that the linked list generated by {@link goraci.Generator} does not have any holes.
 *
 * <p>
 * The mapper emits one "defined" marker per row plus one "referenced by" record for the row named by each node's
 * {@code prev} field. The reducer then classifies every row key as referenced, unreferenced or undefined, and writes
 * debugging output for the undefined ones.
 *
 * <p>
 * Command line: {@code <output dir> <num reducers>}.
 */
public class Verify extends Configured implements Tool {

  /** Sentinel value the mapper emits for a row key to signal that the row itself exists. */
  private static final VLongWritable DEF = new VLongWritable(-1);

  /** Configuration key holding the comma separated list of serialization classes. */
  private static final String IO_SERIALIZATIONS = "io.serializations";

  /** Serialization class that {@link #run(String[])} makes sure is registered on the job. */
  private static final String JAVA_SERIALIZATION = "org.apache.hadoop.io.serializer.JavaSerialization";

  /**
   * Emits {@code (row, DEF)} for every row read, and {@code (prev, row)} when the node's {@code prev} value is
   * non-negative.
   */
  public static class VerifyMapper extends GoraMapper<Long, CINode, LongWritable, VLongWritable> {

    // Writables are reused across map() calls instead of being allocated per record.
    private final LongWritable row = new LongWritable();
    private final LongWritable ref = new LongWritable();
    private final VLongWritable vrow = new VLongWritable();

    /**
     * @param key
     *          row key of the node
     * @param node
     *          the node stored under {@code key}
     * @param context
     *          sink for the emitted pairs
     */
    @Override
    protected void map(Long key, CINode node, Context context) throws IOException, InterruptedException {
      // Mark this row as defined.
      row.set(key);
      context.write(row, DEF);

      // A negative prev is not treated as a reference; otherwise record that prev is referenced by this row.
      long prev = node.getPrev();
      if (prev >= 0) {
        ref.set(prev);
        vrow.set(key);
        context.write(ref, vrow);
      }
    }
  }

  /**
   * Job counters incremented by {@link VerifyReducer}, one increment per reduced key.
   * <ul>
   * <li>{@code UNREFERENCED}: the key was defined but nothing referenced it.</li>
   * <li>{@code UNDEFINED}: the key was referenced but never defined.</li>
   * <li>{@code REFERENCED}: every other key.</li>
   * <li>{@code CORRUPT}: not incremented anywhere in this class.</li>
   * </ul>
   */
  public enum Counts {
    UNREFERENCED, UNDEFINED, REFERENCED, CORRUPT
  }

  /**
   * Classifies each row key from the values the mapper emitted for it and bumps the matching {@link Counts} counter.
   * For undefined keys it also writes a line {@code <key> <referrers>} (all values as 16 digit hex, referrers comma
   * separated) to the job output.
   */
  public static class VerifyReducer extends Reducer<LongWritable, VLongWritable, Text, Text> {

    // Scratch list reused between reduce() calls; cleared at the start of each call.
    private final ArrayList<Long> refs = new ArrayList<Long>();

    /**
     * @param key
     *          row key being classified
     * @param values
     *          {@code DEF} markers and/or the keys of the rows that reference {@code key}
     * @param context
     *          sink for debugging output and counters
     */
    @Override
    public void reduce(LongWritable key, Iterable<VLongWritable> values, Context context) throws IOException, InterruptedException {
      int defCount = 0;
      refs.clear();

      // Split the values into "defined" markers and referrer keys.
      for (VLongWritable value : values) {
        long v = value.get();
        if (v == DEF.get()) {
          defCount++;
        } else {
          refs.add(v);
        }
      }

      // TODO check for more than one def, should not happen

      if (defCount == 0 && refs.size() > 0) {
        // this is bad, found a node that is referenced but not defined. It must have been lost, emit some info about
        // this node for debugging purposes.
        context.write(new Text(String.format("%016x", key.get())), new Text(joinAsHex(refs)));
        context.getCounter(Counts.UNDEFINED).increment(1);
      } else if (defCount > 0 && refs.size() == 0) {
        // node is defined but not referenced
        context.getCounter(Counts.UNREFERENCED).increment(1);
      } else {
        // node is defined and referenced
        context.getCounter(Counts.REFERENCED).increment(1);
      }
    }

    /** Renders each value as 16 digit zero padded hex and joins them with commas. */
    private static String joinAsHex(Iterable<Long> values) {
      StringBuilder sb = new StringBuilder();
      String comma = "";
      for (Long value : values) {
        sb.append(comma);
        comma = ",";
        sb.append(String.format("%016x", value));
      }
      return sb.toString();
    }
  }

  /**
   * Configures and runs the verification job, blocking until it finishes.
   *
   * @param args
   *          {@code args[0]} is the output directory, {@code args[1]} the number of reduce tasks
   * @return 0 if the job succeeded or if the usage message was printed, 1 if the job failed
   * @throws NumberFormatException
   *           if {@code args[1]} is not an integer
   */
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println("Usage : " + Verify.class.getSimpleName() + " <output dir> <num reducers>");
      // NOTE(review): a usage error exits with 0 here; kept as is so existing callers see the same exit code.
      return 0;
    }

    String outputDir = args[0];
    // Parse before the data store is opened so that a malformed argument cannot leave it open.
    int numReducers = Integer.parseInt(args[1]);

    // NOTE(review): the store is built from a fresh Configuration rather than getConf() -- confirm this is intended.
    DataStore<Long,CINode> store = DataStoreFactory.getDataStore(Long.class, CINode.class, new Configuration());
    try {
      Job job = new Job(getConf());
      Configuration jobConf = job.getConfiguration();

      // Append JavaSerialization to the configured serializations unless it is already listed.
      String serializations = jobConf.get(IO_SERIALIZATIONS);
      if (!serializations.contains(JAVA_SERIALIZATION)) {
        jobConf.set(IO_SERIALIZATIONS, serializations + "," + JAVA_SERIALIZATION);
      }

      job.setJobName("Link Verifier");
      job.setNumReduceTasks(numReducers);
      job.setJarByClass(getClass());

      // The mapper only reads the prev field, so that is the only field requested.
      Query<Long,CINode> query = store.newQuery();
      query.setFields("prev");

      GoraMapper.initMapperJob(job, query, store, LongWritable.class, VLongWritable.class, VerifyMapper.class, true);

      jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);

      job.setReducerClass(VerifyReducer.class);
      job.setOutputFormatClass(TextOutputFormat.class);
      TextOutputFormat.setOutputPath(job, new Path(outputDir));

      return job.waitForCompletion(true) ? 0 : 1;
    } finally {
      // Close the store on failure paths too, not only after a normal completion.
      store.close();
    }
  }

  /** Runs the tool through {@link ToolRunner} and exits the JVM with the code returned by {@link #run(String[])}. */
  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(new Verify(), args);
    System.exit(ret);
  }
}