Java tutorial
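The class below implements an iterative PageRank computation over a Twitter retweet graph with Hadoop MapReduce. Tweets are read as JSON; for every retweet, the mapper emits the retweeted user together with the retweeter's id and retweet count, and the reducer combines these with the PageRank scores of the previous iteration (shipped via the distributed cache) to produce updated scores. The driver in main() runs the OutgoingLinks job once to count retweets and then chains a configurable number of PageRank iterations.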
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.utwente.bigdata;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.json.simple.parser.JSONParser;

public class PageRank {

    private static float defaultVal = 1.0f;
    private static float dampingFactor = 0.85f;

    public static final String TWEETS = "/user/alyr/worldcup/part*";
    public static final String USER = "/user/";
    public static final String OVERSLAAN = "overslaan"; // sentinel argument: "skip" in Dutch

    private static String rtPath = "output2";
    private static String prPath = "outputpagerank";

    public static class PageRankMapper extends Mapper<Object, Text, Text, MapWritable> {

        private Map<String, Object> tweet;
        private Text retweetUserScore = new Text();
        private Text tweetUser = new Text();
        private Text retweetUser = new Text();
        private Map<String, Object> retweetUserInfo;
        private Map<String, Object> tweetUserInfo;
        private MapWritable mw = new MapWritable();
        private HashMap<String, String> scores = new HashMap<String, String>();
        private JSONParser parser = new JSONParser();

        @Override
        public void setup(Context context) throws IOException {
            try {
                FileSystem fs = FileSystem.get(context.getConfiguration());
                BufferedReader br;
                // Load the per-user retweet counts from the distributed cache.
                for (URI file : context.getCacheFiles()) {
                    if (file.toString().toLowerCase().contains(rtPath)) {
                        Path path = new Path(file.toString());
                        br = new BufferedReader(new InputStreamReader(fs.open(path)));
                        String line = br.readLine();
                        while (line != null) {
                            // Each line is "userId<TAB>count".
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            int j = 0;
                            String[] arr = new String[2];
                            while (st.hasMoreTokens() && j < 2) {
                                arr[j] = st.nextToken();
                                j++;
                            }
                            scores.put(arr[0], arr[1]);
                            line = br.readLine();
                        }
                        br.close();
                    }
                }
            } finally {
                if (scores.size() < 1) {
                    throw new IOException("No scores loaded");
                }
            }
        }

        @Override
        @SuppressWarnings("unchecked")
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            try {
                tweet = (Map<String, Object>) parser.parse(value.toString());
            } catch (ClassCastException e) {
                return; // not a JSON object; skip (we might log this)
            } catch (org.json.simple.parser.ParseException e) {
                return; // malformed JSON; skip
            }

            // Determine whether this is a plain tweet or a retweet
            // (a null check on the user fields might be needed as well).
            Map<String, Object> retweetInfo = (Map<String, Object>) tweet.get("retweeted_status");
            if (retweetInfo != null) {
                // The retweeting user and their retweet count, if available.
                retweetUserInfo = (Map<String, Object>) tweet.get("user");
                retweetUser.set(String.valueOf(retweetUserInfo.get("id")));
                String score = scores.get(String.valueOf(retweetUserInfo.get("id")));
                score = (score != null) ? score : "1.0";
                retweetUserScore.set(score);

                // The id of the user whose tweet was retweeted.
                tweetUserInfo = (Map<String, Object>) retweetInfo.get("user");
                tweetUser.set(String.valueOf(tweetUserInfo.get("id")));

                mw.put(new IntWritable(1), retweetUser);
                mw.put(new IntWritable(2), retweetUserScore);
                context.write(tweetUser, mw);
            }
        }
    }
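A note on the setup above: the per-user retweet counts are not joined in a separate MapReduce step but shipped to every mapper through the Hadoop distributed cache and read into an in-memory HashMap. Cache files are matched by a substring test against rtPath, so the retweet-count folder and the PageRank folder need distinguishable names. Users without a cached entry fall back to a count of 1.0. The reducer, which continues the same class below, loads the previous iteration's PageRank scores in exactly the same way.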
    public static class PageRankReducer extends Reducer<Text, MapWritable, Text, Text> {

        private static final DecimalFormat df = new DecimalFormat("#.00000");
        private Text finalScore = new Text();
        private float totaalScore = 0.0f;
        private float tempScore = 0.0f;
        private HashMap<String, String> scores = new HashMap<String, String>();

        @Override
        public void setup(Context context) throws IOException {
            try {
                FileSystem fs = FileSystem.get(context.getConfiguration());
                BufferedReader br;
                // Load the PageRank scores of the previous iteration from the distributed cache.
                for (URI file : context.getCacheFiles()) {
                    if (file.toString().toLowerCase().contains(prPath)) {
                        Path path = new Path(file.toString());
                        br = new BufferedReader(new InputStreamReader(fs.open(path)));
                        String line = br.readLine();
                        while (line != null) {
                            // Each line is "userId<TAB>score".
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            int j = 0;
                            String[] arr = new String[2];
                            while (st.hasMoreTokens() && j < 2) {
                                arr[j] = st.nextToken();
                                j++;
                            }
                            scores.put(arr[0], arr[1]);
                            line = br.readLine();
                        }
                        br.close();
                    }
                }
            } finally {
                if (scores.isEmpty()) {
                    // No previous ranks yet: this happens in the first iteration.
                    System.out.println("Empty scores");
                }
            }
        }
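The reduce step below applies the standard PageRank update: with damping factor d = 0.85, user u receives PR(u) = (1 - d) + d * sum over PR(v) / C(v), taken over the users v who retweeted u, where C(v) is v's retweet count emitted by the mapper. Since defaultVal is 1.0, the term (defaultVal - dampingFactor) is exactly (1 - d) = 0.15, which is also the fallback rank for users without a score from the previous iteration.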
        @Override
        public void reduce(Text key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
            tempScore = 0.0f;
            for (MapWritable value : values) {
                // value holds the retweet count of retweeter X;
                // scores holds the PageRank of tweeter Y from the previous iteration.
                String retweeter = ((Text) value.get(new IntWritable(1))).toString();
                String retweeterCount = ((Text) value.get(new IntWritable(2))).toString();
                float rtCount = Float.parseFloat(retweeterCount);
                String score = scores.get(retweeter);
                score = (score != null) ? score : "0.15";
                float rtPagerank = Float.parseFloat(score);
                tempScore += rtPagerank / rtCount;
            }
            // PageRank update: PR = d * sum + (1 - d).
            totaalScore = tempScore * dampingFactor + (defaultVal - dampingFactor);
            finalScore.set(df.format(totaalScore));
            context.write(key, finalScore);
        }
    }

    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Process args
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        String[] otherArgs = parser.getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: pageRank <in> [<in>...] <out> <retweetpath> <pagerankpath>");
            System.exit(2);
        }
        conf.set("mapreduce.job.reduce.slowstart.completedmaps", "1");

        // Set up the job
        Job job = Job.getInstance(conf, "Twitter Reader");
        job.setJarByClass(PageRank.class);
        job.setMapperClass(PageRankMapper.class);
        job.setReducerClass(PageRankReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MapWritable.class);

        // Load input files
        for (int i = 0; i < otherArgs.length - 3; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }

        // Set the output folder
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 3]));

        // Ship the retweet counts via the distributed cache
        FileSystem fs = FileSystem.get(new Configuration());
        FileStatus[] status = fs.listStatus(new Path("hdfs:" + otherArgs[otherArgs.length - 2]));
        for (FileStatus s : status) {
            job.addCacheFile(s.getPath().toUri());
        }

        // Ship the previous PageRank results as well, unless skipped
        String str = otherArgs[otherArgs.length - 1];
        if (!str.equals(OVERSLAAN)) {
            FileStatus[] status2 = fs.listStatus(new Path("hdfs:" + otherArgs[otherArgs.length - 1]));
            for (FileStatus s2 : status2) {
                job.addCacheFile(s2.getPath().toUri());
            }
        }

        job.waitForCompletion(true);
    }

    public static String getPath(String studentnummer, String folder) {
        return USER + studentnummer + "/" + folder + "/";
    }

    public static void main(String[] args) throws Exception {
        int runs = 10;
        if (args.length < 2) {
            System.err.println("Usage: pageRank <studentnummer> <runs> [<rtPath>] [<prPath>]");
            System.exit(2);
        }
        try {
            runs = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("<runs> must be a number");
            System.exit(2);
        }

        // Optional: choose different output folders
        if (args.length == 4) {
            rtPath = args[2];
            prPath = args[3];
        }

        // User folder on the Hadoop file system
        String studentnummer = args[0];
        String[] argsol = { TWEETS, rtPath };
        String[] argspr = { TWEETS, prPath + "0", PageRank.getPath(studentnummer, rtPath), OVERSLAAN };

        // First count outgoing links (retweets), then chain the PageRank iterations,
        // feeding each run's output folder into the next run as its previous ranks.
        OutgoingLinks.main(argsol);
        for (int i = 0; i < runs; i++) {
            PageRank.run(argspr);
            argspr[3] = PageRank.getPath(studentnummer, argspr[1]);
            argspr[1] = prPath + (i + 1);
        }
        System.exit(0);
    }
}
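To see the reducer's arithmetic in isolation, here is a minimal, self-contained sketch of a single PageRank update step on toy in-memory data. The class name, user ids, counts, and ranks are made up for illustration and are not part of the job above; it only mirrors the update rule in PageRankReducer.reduce().

import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-alone demo of the reducer's update rule.
public class PageRankStepDemo {
    public static void main(String[] args) {
        float dampingFactor = 0.85f;

        // PageRank of each retweeter from the previous iteration (made-up values).
        Map<String, Float> prevRank = new HashMap<>();
        prevRank.put("1001", 1.30f);
        prevRank.put("1002", 0.15f);

        // Number of retweets issued by each retweeter (made-up values).
        Map<String, Float> retweetCount = new HashMap<>();
        retweetCount.put("1001", 2.0f);
        retweetCount.put("1002", 3.0f);

        // Both users above retweeted user "42"; sum their rank contributions.
        float sum = 0.0f;
        for (String retweeter : prevRank.keySet()) {
            sum += prevRank.get(retweeter) / retweetCount.get(retweeter);
        }

        // PR(42) = d * sum + (1 - d), as in PageRankReducer.reduce().
        float rank = dampingFactor * sum + (1.0f - dampingFactor);
        System.out.printf("PR(42) = %.5f%n", rank); // 0.85 * (0.65 + 0.05) + 0.15 = 0.745
    }
}

Running the demo prints PR(42) = 0.74500: user 1001 contributes 1.30 / 2 = 0.65 and user 1002 contributes 0.15 / 3 = 0.05, and the damped sum plus the base rank of 0.15 gives the new score.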