Java tutorial: DumpSizeTrends, a Hadoop/HBase MapReduce job from Mozilla Socorro
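This walkthrough uses DumpSizeTrends, a complete MapReduce job from Mozilla Socorro (Mozilla's crash-report system). The job scans an HBase table of crash reports, filters rows by product, release, and crash-time window, and emits the byte sizes of each report's raw dump and processed JSON. The reducer then summarizes those sizes per day, product, and version as a sample count, a median, and a total.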
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Socorro.
 *
 * The Initial Developer of the Original Code is the Mozilla Foundation.
 * Portions created by the Initial Developer are Copyright (C) 2010
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 *   Xavier Stevens <xstevens@mozilla.com>, Mozilla Corporation (original author)
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
package com.mozilla.socorro.hadoop;

import static com.mozilla.socorro.hadoop.CrashReportJob.*;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;

import com.mozilla.util.DateUtil;

public class DumpSizeTrends implements Tool {

    private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(DumpSizeTrends.class);

    private static final String NAME = "DumpSizeTrends";
    private Configuration conf;

    private static final String KEY_DELIMITER = "\u0001";
    private static final String TAB_DELIMITER = "\t";

    public static class DumpSizeTrendsMapper extends TableMapper<Text, IntWritable> {

        public enum ReportStats {
            RAW_BYTES_NULL, PROCESSED_BYTES_NULL, JSON_PARSE_EXCEPTION, JSON_MAPPING_EXCEPTION,
            META_JSON_BYTES_NULL, PROCESSED_JSON_BYTES_NULL, PROCESSED,
            PRODUCT_FILTERED, RELEASE_FILTERED, TIME_FILTERED,
            CRASH_TIME_NULL, CRASH_TIME_PARSE_FAILED, OOM_ERROR
        }

        private Text outputKey;
        private IntWritable outputValue;
        private ObjectMapper jsonMapper;
        private SimpleDateFormat outputSdf;
        private String productFilter;
        private String releaseFilter;
        private long startTime;
        private long endTime;
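        // One HBase row is one crash report. setup() caches the product/release
        // filters and the day-resolution time window from the job configuration;
        // map() then parses the report's metadata JSON, applies the filters, and
        // emits dump sizes keyed by date, product, and version.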
        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        public void setup(Context context) {
            outputKey = new Text();
            outputValue = new IntWritable();
            jsonMapper = new ObjectMapper();
            outputSdf = new SimpleDateFormat("yyyyMMdd");

            Configuration conf = context.getConfiguration();
            productFilter = conf.get(PRODUCT_FILTER);
            releaseFilter = conf.get(RELEASE_FILTER);
            startTime = DateUtil.getTimeAtResolution(conf.getLong(START_TIME, 0), Calendar.DATE);
            endTime = DateUtil.getEndTimeAtResolution(conf.getLong(END_TIME, System.currentTimeMillis()), Calendar.DATE);
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        public void map(ImmutableBytesWritable key, Result result, Context context) throws InterruptedException, IOException {
            try {
                byte[] valueBytes = result.getValue(META_DATA_BYTES, JSON_BYTES);
                if (valueBytes == null) {
                    context.getCounter(ReportStats.META_JSON_BYTES_NULL).increment(1L);
                    return;
                }

                // This is an untyped parse so the caller is expected to know the types
                Map<String, Object> meta = jsonMapper.readValue(new String(valueBytes), new TypeReference<Map<String, Object>>() {});

                String product = null;
                String productVersion = null;
                if (meta.containsKey(META_JSON_PRODUCT_NAME)) {
                    product = (String) meta.get(META_JSON_PRODUCT_NAME);
                }
                if (meta.containsKey(META_JSON_PRODUCT_VERSION)) {
                    productVersion = (String) meta.get(META_JSON_PRODUCT_VERSION);
                }

                // Filter row if filter(s) are set and it doesn't match
                if (!StringUtils.isBlank(productFilter)) {
                    if (product == null || !product.equals(productFilter)) {
                        context.getCounter(ReportStats.PRODUCT_FILTERED).increment(1L);
                        return;
                    }
                }
                if (!StringUtils.isBlank(releaseFilter)) {
                    if (productVersion == null || !productVersion.equals(releaseFilter)) {
                        context.getCounter(ReportStats.RELEASE_FILTERED).increment(1L);
                        return;
                    }
                }

                String crashTimeStr = (String) meta.get(META_JSON_CRASH_TIME);
                if (crashTimeStr == null) {
                    context.getCounter(ReportStats.CRASH_TIME_NULL).increment(1L);
                    return;
                }
                // Crash times are stored as seconds since the epoch
                long crashTime = 0L;
                try {
                    crashTime = Long.parseLong(crashTimeStr) * 1000L;
                } catch (NumberFormatException e) {
                    context.getCounter(ReportStats.CRASH_TIME_PARSE_FAILED).increment(1L);
                    return;
                }

                // Filter if the crash time is not within our window
                if (crashTime < startTime || crashTime > endTime) {
                    context.getCounter(ReportStats.TIME_FILTERED).increment(1L);
                    return;
                }

                Calendar cal = Calendar.getInstance();
                cal.setTimeInMillis(crashTime);

                StringBuilder keyPrefix = new StringBuilder();
                keyPrefix.append(outputSdf.format(cal.getTime())).append(KEY_DELIMITER);
                keyPrefix.append(product).append(KEY_DELIMITER);
                keyPrefix.append(productVersion).append(KEY_DELIMITER);
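                // keyPrefix is now "<yyyyMMdd>" + KEY_DELIMITER + "<product>" +
                // KEY_DELIMITER + "<version>" + KEY_DELIMITER. Up to two records are
                // emitted per crash: the prefix plus "raw" with the raw dump size,
                // and the prefix plus "processed" with the processed JSON size.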
                valueBytes = result.getValue(RAW_DATA_BYTES, DUMP_BYTES);
                if (valueBytes == null) {
                    context.getCounter(ReportStats.RAW_BYTES_NULL).increment(1L);
                } else {
                    outputKey.set(keyPrefix.toString() + "raw");
                    outputValue.set(valueBytes.length);
                    context.write(outputKey, outputValue);
                }

                valueBytes = result.getValue(PROCESSED_DATA_BYTES, JSON_BYTES);
                if (valueBytes != null) {
                    outputKey.set(keyPrefix.toString() + "processed");
                    outputValue.set(valueBytes.length);
                    context.write(outputKey, outputValue);
                } else {
                    context.getCounter(ReportStats.PROCESSED_JSON_BYTES_NULL).increment(1L);
                }
            } catch (JsonParseException e) {
                context.getCounter(ReportStats.JSON_PARSE_EXCEPTION).increment(1L);
            } catch (JsonMappingException e) {
                context.getCounter(ReportStats.JSON_MAPPING_EXCEPTION).increment(1L);
            } catch (OutOfMemoryError e) {
                LOG.error("OutOfMemoryError on row: " + new String(result.getRow()), e);
                context.getCounter(ReportStats.OOM_ERROR).increment(1L);
            }
        }
    }

    public static class DumpSizeTrendsReducer extends Reducer<Text, IntWritable, Text, Text> {

        private Text outputKey = null;
        private Text outputValue = null;
        private Pattern keyPattern = null;

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        @Override
        public void setup(Context context) {
            keyPattern = Pattern.compile(KEY_DELIMITER);
            outputKey = new Text();
            outputValue = new Text();
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#cleanup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        @Override
        public void cleanup(Context context) {
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            Iterator<IntWritable> iter = values.iterator();
            DescriptiveStatistics stats = new DescriptiveStatistics();
            long sum = 0L;
            while (iter.hasNext()) {
                long curValue = iter.next().get();
                stats.addValue(curValue);
                sum += curValue;
            }

            // Rewrite the KEY_DELIMITER-separated key as a tab-separated output key
            Matcher m = keyPattern.matcher(key.toString());
            if (m.find()) {
                outputKey.set(m.replaceAll(TAB_DELIMITER));
            } else {
                outputKey.set(key.toString());
            }

            // Output the sample count and median along with the total size
            StringBuilder sb = new StringBuilder();
            sb.append(stats.getN()).append(TAB_DELIMITER);
            sb.append(stats.getPercentile(50.0d)).append(TAB_DELIMITER);
            sb.append(sum);
            outputValue.set(sb.toString());
            context.write(outputKey, outputValue);
        }
    }

    /**
     * @param args
     * @return
     * @throws IOException
     * @throws ParseException
     */
    public Job initJob(String[] args) throws IOException, ParseException {
        conf.set("mapred.child.java.opts", "-Xmx1024m");
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);

        Map<byte[], byte[]> columns = new HashMap<byte[], byte[]>();
        columns.put(RAW_DATA_BYTES, DUMP_BYTES);
        columns.put(META_DATA_BYTES, JSON_BYTES);
        columns.put(PROCESSED_DATA_BYTES, JSON_BYTES);
        Job job = CrashReportJob.initJob(NAME, getConf(), DumpSizeTrends.class, DumpSizeTrendsMapper.class, null,
                DumpSizeTrendsReducer.class, columns, Text.class, Text.class, new Path(args[0]));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        return job;
    }
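    /*
     * Invocation sketch (an assumption, not part of the original source): the
     * class runs through ToolRunner, so configuration properties can be passed
     * with generic -D options. The literal property keys are the CrashReportJob
     * constants printed by printUsage() below and are shown here symbolically;
     * the jar name and output path are placeholders:
     *
     *   hadoop jar socorro-analysis.jar com.mozilla.socorro.hadoop.DumpSizeTrends \
     *       -D <PRODUCT_FILTER>=Firefox \
     *       -D <START_DATE>=20100101 -D <END_DATE>=20100131 \
     *       /dump-size-trends-output
     */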
    /**
     * @return
     */
    private static int printUsage() {
        System.out.println("Usage: " + NAME + " [generic-options] <output-path>");
        System.out.println();
        System.out.println("Configurable Properties:");
        System.out.println(PRODUCT_FILTER + "=<product-name>");
        System.out.println(RELEASE_FILTER + "=<release-version>");
        System.out.println(START_DATE + "=<yyyyMMdd>");
        System.out.println(END_DATE + "=<yyyyMMdd>");
        System.out.println();
        GenericOptionsParser.printGenericCommandUsage(System.out);
        return -1;
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 1) {
            return printUsage();
        }

        int rc = -1;
        Job job = initJob(args);
        job.waitForCompletion(true);
        if (job.isSuccessful()) {
            rc = 0;
        }

        return rc;
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.conf.Configurable#getConf()
     */
    @Override
    public Configuration getConf() {
        return this.conf;
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new DumpSizeTrends(), args);
        System.exit(res);
    }
}
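To make the reducer's arithmetic concrete, here is a minimal, self-contained sketch (not part of the Socorro source; the class name and byte counts are made up) that summarizes one key's values the same way, using the same commons-math DescriptiveStatistics API:

import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;

public class DumpSizeSummarySketch {
    public static void main(String[] args) {
        // Hypothetical dump sizes in bytes, standing in for the IntWritable
        // values that arrive for one date/product/version "raw" key.
        long[] sizes = { 120000L, 340000L, 560000L, 780000L, 900000L };

        DescriptiveStatistics stats = new DescriptiveStatistics();
        long sum = 0L;
        for (long size : sizes) {
            stats.addValue(size);
            sum += size;
        }

        // Mirrors the reducer's tab-separated output: count, median, total.
        // For these values it prints: 5    560000.0    2700000
        System.out.println(stats.getN() + "\t" + stats.getPercentile(50.0d) + "\t" + sum);
    }
}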