Java tutorial: detecting added, updated, and deleted records across HDFS snapshots with Hadoop MapReduce

The example below reads snapshot files whose names encode a table signature, table name, and snapshot time, groups the records by key, and labels each key Add, Update, or Delete depending on how its value changes between the oldest snapshot and the newer ones.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jacky.hadoop.examples;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class ExtractData {

    private static final Log log = LogFactory.getLog(ExtractData.class);

    // Smallest snapshot time seen by any mapper. Note: a static field is only
    // visible to both phases when the job runs in local mode; on a real
    // cluster each task runs in its own JVM, so the reducer would not see the
    // value the mappers set here.
    private static Long primaryDataTime = Long.MAX_VALUE;

    private static enum DataStatus {
        Add, Update, Delete;
    }

    // private static final String inputPath =
    // "hdfs://192.168.2.248:9000/usr/input/out";
    // private static final String outPath =
    // "hdfs://192.168.2.248:9000/usr/outPut";

    private static URI uri;

    static {
        uri = URI.create("hdfs://192.168.50.70:9000");
    }

    public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input file name encodes the metadata:
            // tableSignature_tableName_dataTime.ext
            InputSplit inputSplit = context.getInputSplit();
            String fileName = ((FileSplit) inputSplit).getPath().getName();
            // Remove the file extension.
            fileName = fileName.substring(0, fileName.lastIndexOf("."));
            String[] params = fileName.split("[_]");
            if (params.length != 3)
                return;
            String tableSignature = params[0];
            String tableName = params[1]; // unused, kept for clarity
            String dataTime = params[2];
            if (!dataTime.matches("[0-9]+"))
                throw new IllegalArgumentException(dataTime + " is not numeric!");
            // Track the oldest snapshot time seen so far.
            if (primaryDataTime > Long.valueOf(dataTime))
                primaryDataTime = Long.valueOf(dataTime);

            // Append the snapshot time to the record so the reducer can order
            // the versions of each key.
            Text valueText = new Text();
            valueText.set(value.toString() + "|" + dataTime);
            // The third pipe-delimited field of the record is the row key.
            Text keyText = new Text();
            keyText.set(value.toString().split("[|]")[2]);

            // Caution: this creates one counter per distinct record value,
            // which can exceed Hadoop's counter limit on large inputs; it is
            // only useful when debugging small jobs.
            Counter count = context.getCounter("map value: ", value.toString());
            count.increment(1L);

            word.set(tableSignature + "_" + keyText);
            context.write(word, valueText);
        }
    }
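    // A worked example of the mapper's contract (hypothetical data): given an
    // input file named "SIG_customer_20140101.txt" containing the record
    //
    //     1001|Alice|K42
    //
    // the mapper strips ".txt", splits the file name into tableSignature=SIG,
    // tableName=customer, dataTime=20140101, and emits
    //
    //     key   = "SIG_K42"                  (tableSignature + "_" + third field)
    //     value = "1001|Alice|K42|20140101"  (record + "|" + dataTime)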
    public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Map each snapshot time to the record it carried for this key.
            Map<Long, String> dataMap = new HashMap<Long, String>();
            List<Long> timeList = new LinkedList<Long>();
            for (Text text : values) {
                String[] textTmp = text.toString().split("[|]");
                String dataTime = textTmp[textTmp.length - 1];
                if (!dataTime.matches("[0-9]+"))
                    throw new IllegalArgumentException(dataTime + " is not numeric!");
                timeList.add(Long.valueOf(dataTime));
                // Strip the trailing "|dataTime" to recover the original record.
                dataMap.put(Long.valueOf(dataTime),
                        text.toString().substring(0, text.toString().lastIndexOf("|")));
            }
            // Sort the snapshot times ascending; the first entry is the oldest
            // version of this key.
            List<Long> arrayDataList = new ArrayList<Long>(timeList);
            Collections.sort(arrayDataList);
            String primaryData = dataMap.get(arrayDataList.get(0));
            Long primaryTime = arrayDataList.get(0);
            Text keyText = new Text();
            keyText.set(primaryData);
            if (dataMap.size() == 1) {
                // The key appears in a single snapshot: if that snapshot is the
                // oldest one overall, the row was deleted later; otherwise it
                // was added in a newer snapshot.
                if (primaryTime <= primaryDataTime)
                    context.write(keyText, new Text(DataStatus.Delete.toString()));
                else
                    context.write(keyText, new Text(DataStatus.Add.toString()));
            } else {
                // The key appears in several snapshots: emit Update whenever a
                // later version differs from the oldest one.
                for (int i = 1; i < arrayDataList.size(); i++) {
                    String data = dataMap.get(arrayDataList.get(i));
                    if (!data.trim().equals(primaryData.trim()))
                        context.write(keyText, new Text(DataStatus.Update.toString()));
                }
            }
        }
    }

    /**
     * Delete a directory in HDFS, e.g. /tmp/testdir.
     *
     * @param dir the path to delete
     * @throws IOException
     */
    public static void deleteDir(String dir) throws IOException {
        log.info("delete path " + dir);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(uri, conf);
        // For delete(Path f, boolean recursive): a non-empty directory is only
        // deleted when recursive is true, otherwise an exception is thrown;
        // for a file, recursive may be either value.
        fs.delete(new Path(dir), true);
        fs.close();
    }

    public static void main(String[] args) throws Exception {
        long startTime = System.currentTimeMillis();
        Configuration conf = new Configuration();
        // conf.setQuietMode(false);
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: extract <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "extract data");
        job.setJarByClass(ExtractData.class);
        job.setMapperClass(TokenizerMapper.class);
        // job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        // Delete the output path first; the job fails if it already exists.
        deleteDir(otherArgs[1]);
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        if (job.waitForCompletion(true)) {
            long endTime = System.currentTimeMillis();
            log.info("useTime: " + formatLongToTimeStr(endTime - startTime));
            log.info("primary time: " + primaryDataTime);
            System.exit(0);
        }
        System.exit(1);
    }

    public static String formatLongToTimeStr(Long useTime) {
        int hour = 0;
        int minute = 0;
        int second = useTime.intValue() / 1000;
        if (second >= 60) {
            minute = second / 60;
            second = second % 60;
        }
        if (minute >= 60) {
            hour = minute / 60;
            minute = minute % 60;
        }
        return hour + "H " + minute + "M " + second + "S";
    }
}
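A minimal way to try the job, assuming the class is packaged into a jar named extract-data.jar (a hypothetical name) and that the input and output paths match the commented-out constants in the source (adjust them, and the namenode URI in the static initializer, to your cluster):

    hadoop jar extract-data.jar org.jacky.hadoop.examples.ExtractData /usr/input/out /usr/outPut

Each input file must be named tableSignature_tableName_dataTime plus an extension (for example SIG_customer_20140101.txt), and every record must be a pipe-delimited line whose third field is the row key; the reducer then labels each key Add, Update, or Delete by comparing its versions across the snapshot times.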