nl.utwente.bigdata.PageRank.java Source code

Introduction

Here is the source code for nl.utwente.bigdata.PageRank.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.utwente.bigdata;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.IntWritable;
import java.text.DecimalFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.json.simple.parser.JSONParser;

public class PageRank {

    // Base value of the rank formula: the reducer adds (defaultVal - dampingFactor) to every score.
    private static float defaultVal = 1.0f;
    // Weight the reducer applies to the summed contributions of all retweeters.
    private static float dampingFactor = 0.85f;

    // Input glob that main passes as the <in> argument of the jobs it starts.
    public static final String TWEETS = "/user/alyr/worldcup/part*";
    // Prefix used by getPath to build a folder path below a user's directory.
    public static final String USER = "/user/";
    // Sentinel ("overslaan" is Dutch for "skip"): run does not load previous PageRank
    // results when its last argument equals this value; main uses it for the first run.
    public static final String OVERSLAAN = "overslaan";
    // Folder name used to recognise the retweet-count cache files (mapper side);
    // main may overwrite it from the command line.
    private static String rtPath = "output2";
    // Folder-name prefix of the per-iteration PageRank output; also used by the reducer
    // to recognise cache files with previous scores. main may overwrite it.
    private static String prPath = "outputpagerank";

    public static class PageRankMapper extends Mapper<Object, Text, Text, MapWritable> {

        private Map<String, Object> tweet;
        private Text retweetUserScore = new Text();
        private Text tweetUser = new Text();
        private Text retweetUser = new Text();
        private Map<String, Object> retweetUserInfo;
        private Map<String, Object> tweetUserInfo;
        private URI[] files;
        private MapWritable mw = new MapWritable();
        private HashMap<String, String> scores = new HashMap<String, String>();
        private JSONParser parser = new JSONParser();

        public void setup(Context context) throws IOException {
            try {
                FileSystem fs = FileSystem.get(context.getConfiguration());
                BufferedReader br;
                for (URI file : context.getCacheFiles()) {
                    if (file.toString().toLowerCase().contains(rtPath)) {
                        Path path = new Path(file.toString());
                        br = new BufferedReader(new InputStreamReader(fs.open(path)));
                        String line = br.readLine();
                        while (line != null) {
                            StringTokenizer st = new StringTokenizer(line, "\\t");
                            int j = 0;
                            String[] arr = new String[2];
                            while (st.hasMoreTokens() && j < 2) {
                                j++;
                                arr[j] = st.nextToken();
                            }

                            scores.put(arr[0], arr[1]);
                            line = br.readLine();
                        }
                        br.close();
                    }
                }
            } finally {
                if (scores.size() < 1) {
                    throw new IOException("Geen scores");
                    //throw new IOException("File not loaded" + context.getCacheFiles().length + " files:" + files.length + ", path:" + context.getCacheFiles()[0].getPath() + ", string: " + context.getCacheFiles()[0].toString());
                }
            }
        }

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            try {
                tweet = (Map<String, Object>) parser.parse(value.toString());
            }

            catch (ClassCastException e) {
                return; // do nothing (we might log this)
            }

            catch (org.json.simple.parser.ParseException e) {
                return; // do nothing 
            }

            // Bepalen of het een tweet of een retweet is (misschien een check op nullpointerexception?)
            Map<String, Object> retweetInfo = (Map<String, Object>) tweet.get("retweeted_status");
            if (retweetInfo != null) {
                //Retweetscore mist beschikbaar
                retweetUserInfo = (Map<String, Object>) tweet.get("user");
                retweetUser.set((String) String.valueOf(retweetUserInfo.get("id")));
                String score = scores.get(String.valueOf(retweetUserInfo.get("id")));
                score = (score != null) ? score : "1.0";
                retweetUserScore.set((String) String.valueOf(score));

                //Tweetuserid
                tweetUserInfo = (Map<String, Object>) retweetInfo.get("user");
                tweetUser.set((String) String.valueOf(tweetUserInfo.get("id")));

                mw.put(new IntWritable(1), retweetUser);
                mw.put(new IntWritable(2), retweetUserScore);

                context.write(tweetUser, mw);
            }
        }
    }

    public static class PageRankReducer extends Reducer<Text, MapWritable, Text, Text> {

        private static final DecimalFormat df = new DecimalFormat("#.00000");
        private Text finalScore = new Text();
        private float totaalScore = 0.0f;
        private float tempScore = 0.0f;
        private HashMap<String, String> scores = new HashMap<String, String>();

        public void setup(Context context) throws IOException {
            try {
                FileSystem fs = FileSystem.get(context.getConfiguration());
                BufferedReader br;
                for (URI file : context.getCacheFiles()) {
                    if (file.toString().toLowerCase().contains(prPath)) {
                        Path path = new Path(file.toString());
                        br = new BufferedReader(new InputStreamReader(fs.open(path)));
                        String line = br.readLine();
                        while (line != null) {
                            StringTokenizer st = new StringTokenizer(line, "\\t");
                            int j = 0;
                            String[] arr = new String[2];
                            while (st.hasMoreTokens() && j < 2) {
                                j++;
                                arr[j] = st.nextToken();
                            }

                            scores.put(arr[0], arr[1]);
                            line = br.readLine();
                        }
                        br.close();
                    }
                }
            } finally {
                /*if(scores.size() < 1){
                   throw new IOException("Geen scores" + context.getCacheFiles()[0].toString();
                   //throw new IOException("File not loaded" + context.getCacheFiles().length + " files:" + files.length + ", path:" + context.getCacheFiles()[0].getPath() + ", string: " + context.getCacheFiles()[0].toString());
                }*/
                System.out.println("Empty scores");
            }
        }

        public void reduce(Text key, Iterable<MapWritable> values, Context context)
                throws IOException, InterruptedException {
            tempScore = 0.0f;
            for (MapWritable value : values) {
                // Value is het aantal retweets van retweeter X
                // In scores staat het pagerank van tweeter Y
                String retweeter = ((Text) value.get(new IntWritable(1))).toString();
                String retweeterCount = ((Text) value.get(new IntWritable(2))).toString();
                float rtCount = Float.parseFloat(retweeterCount);
                String score = scores.get(retweeter);
                score = (score != null) ? score : "0.15";
                float rtPagerank = Float.parseFloat(score);
                tempScore += rtPagerank / rtCount;
            }
            totaalScore = tempScore * dampingFactor + (defaultVal - dampingFactor);
            finalScore.set(df.format(totaalScore));
            context.write(key, finalScore);
        }
    }

    /**
     * Configures and runs one PageRank iteration as a MapReduce job.
     *
     * <p>After the generic Hadoop options have been stripped, the remaining arguments are
     * {@code <in> [<in>...] <out> <retweetpath> <pagerankpath>}: one or more input paths,
     * the output folder, the folder with the retweet counts and the folder with the
     * PageRank results of the previous iteration. Passing {@link #OVERSLAAN} as last
     * argument skips loading previous results. All files in the two folders are added to
     * the job's cache files.
     *
     * @param args command-line style arguments as described above
     * @throws IOException if the job finishes unsuccessfully
     * @throws Exception   for any failure while setting up or running the job
     */
    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Process args; four positional arguments are indexed below, so fewer is an error.
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        String[] otherArgs = parser.getRemainingArgs();
        if (otherArgs.length < 4) {
            System.err.println("Usage: pageRank <in> [<in>...] <out> <retweetpath> <pagerankpath>");
            System.exit(2);
        }
        conf.set("mapreduce.job.reduce.slowstart.completedmaps", "1");

        // Set up the job. Map output (Text -> MapWritable) differs from the final
        // output (Text -> Text), so both have to be declared explicitly.
        Job job = Job.getInstance(conf, "Twitter Reader");
        job.setJarByClass(PageRank.class);
        job.setMapperClass(PageRankMapper.class);
        job.setReducerClass(PageRankReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MapWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        int outIndex = otherArgs.length - 3;
        String retweetDir = otherArgs[otherArgs.length - 2];
        String pagerankDir = otherArgs[otherArgs.length - 1];

        // Input paths: everything before the last three arguments.
        for (int i = 0; i < outIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[outIndex]));

        // Previous results; use the parsed configuration so generic options are honoured.
        FileSystem fs = FileSystem.get(conf);
        addCacheFiles(job, fs, retweetDir);
        if (!OVERSLAAN.equals(pagerankDir)) {
            addCacheFiles(job, fs, pagerankDir);
        }

        // Fail loudly instead of letting the caller continue with missing output.
        if (!job.waitForCompletion(true)) {
            throw new IOException("PageRank job failed, output: " + otherArgs[outIndex]);
        }
    }

    /** Adds every entry of the HDFS folder {@code dir} to the cache files of {@code job}. */
    private static void addCacheFiles(Job job, FileSystem fs, String dir) throws IOException {
        for (FileStatus status : fs.listStatus(new Path("hdfs:" + dir))) {
            job.addCacheFile(status.getPath().toUri());
        }
    }

    /**
     * Builds the folder path {@code USER + studentnummer + "/" + folder + "/"}.
     *
     * @param studentnummer name of the user directory (Dutch for "student number")
     * @param folder        folder name below the user directory
     * @return the concatenated path, always ending in a slash
     */
    public static String getPath(String studentnummer, String folder) {
        StringBuilder path = new StringBuilder(USER);
        path.append(studentnummer).append("/");
        path.append(folder).append("/");
        return path.toString();
    }

    /**
     * Entry point: runs {@code OutgoingLinks} once and then {@code <runs>} PageRank
     * iterations, each one reading the output folder of the previous iteration.
     *
     * <p>Arguments: {@code <studentnummer> <runs> [<rtPath>] [<prPath>]}. The two optional
     * folder names are only honoured when both are given. Iteration {@code i} writes to
     * the folder {@code prPath + i}.
     *
     * @param args command-line arguments as described above
     * @throws Exception if one of the jobs fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: pageRank <studentnummer> <runs> [<rtPath>] [<prPath>]");
            System.exit(2);
            return;
        }

        int runs;
        try {
            runs = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("<Runs> must be a number");
            System.exit(2);
            return;
        }

        // Optional: choose different output folders.
        if (args.length == 4) {
            rtPath = args[2];
            prPath = args[3];
        }

        // User folder on the Hadoop file system.
        String studentnummer = args[0];

        String[] argsol = { TWEETS, rtPath };
        // Layout: { input, output folder, retweet folder, previous PageRank folder }.
        String[] argspr = { TWEETS, prPath + "0", PageRank.getPath(studentnummer, rtPath), OVERSLAAN };

        OutgoingLinks.main(argsol);

        for (int i = 0; i < runs; i++) {
            PageRank.run(argspr);
            // The folder just written (argspr[1]) becomes the "previous results" input of
            // the next iteration; indexing with i would pick unrelated arguments and run
            // out of bounds from the fifth iteration on.
            argspr[3] = PageRank.getPath(studentnummer, argspr[1]);
            argspr[1] = prPath + (i + 1);
        }

        System.exit(0);
    }
}