gws.PageRank.java Source code

Introduction

Here is the source code for gws.PageRank.java, a PageRank implementation written against the Apache Hama graph (BSP) API.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gws;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.cli.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.HashPartitioner;
import org.apache.hama.bsp.TextInputFormat;
import org.apache.hama.bsp.TextOutputFormat;
import org.apache.hama.graph.AverageAggregator;
import org.apache.hama.graph.Edge;
import org.apache.hama.graph.GraphJob;
import org.apache.hama.graph.Vertex;
import org.apache.hama.graph.VertexInputReader;
import org.json.simple.JSONArray;
import org.json.simple.parser.JSONParser;

/**
 * PageRank with dangling-node handling: every vertex is given a
 * self-referencing edge via the job configuration (hama.graph.self.ref),
 * so a dangling node's rank is not lost.
 */
public class PageRank {

    public static class PageRankVertex extends Vertex<Text, NullWritable, DoubleWritable> {

        static double DAMPING_FACTOR = 0.85;
        static double MAXIMUM_CONVERGENCE_ERROR = 0.001;

        @Override
        public void setup(HamaConfiguration conf) {
            String val = conf.get("hama.pagerank.alpha");
            if (val != null) {
                DAMPING_FACTOR = Double.parseDouble(val);
            }
            val = conf.get("hama.graph.max.convergence.error");
            if (val != null) {
                MAXIMUM_CONVERGENCE_ERROR = Double.parseDouble(val);
            }

            // initialize this vertex to 1 / count of global vertices in this graph
            setValue(new DoubleWritable(1.0 / getNumVertices()));
        }

        @Override
        public void compute(Iterable<DoubleWritable> messages) throws IOException {
            if (this.getSuperstepCount() >= 1) {
                double sum = 0;
                for (DoubleWritable msg : messages) {
                    sum += msg.get();
                }
                double alpha = (1.0d - DAMPING_FACTOR) / getNumVertices();
                setValue(new DoubleWritable(alpha + (sum * DAMPING_FACTOR)));
                aggregate(0, this.getValue());
            }

            // read the globally aggregated error and halt once it drops
            // below the convergence threshold
            DoubleWritable globalError = getAggregatedValue(0);

            if (globalError != null && this.getSuperstepCount() > 2
                    && MAXIMUM_CONVERGENCE_ERROR > globalError.get()) {
                voteToHalt();
            } else {
                // in each superstep we are going to send a new rank to our neighbours
                sendMessageToNeighbors(new DoubleWritable(this.getValue().get() / this.getEdges().size()));
            }
        }
    }
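
    // A worked example of the update rule in compute() above, with
    // illustrative numbers (not taken from any real input): for a graph of
    // N = 4 vertices, DAMPING_FACTOR = 0.85 and an incoming message sum of
    // 0.30, the new rank is
    //     (1 - 0.85) / 4 + 0.85 * 0.30 = 0.0375 + 0.255 = 0.2925
    // and a vertex with 3 outgoing edges then sends 0.2925 / 3 = 0.0975
    // to each neighbour in the next superstep.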

    public static class PagerankTextReader
            extends VertexInputReader<LongWritable, Text, Text, NullWritable, DoubleWritable> {

        @Override
        public boolean parseVertex(LongWritable key, Text value, Vertex<Text, NullWritable, DoubleWritable> vertex)
                throws Exception {

            String[] tokenArray = value.toString().split("\t");

            vertex.setVertexID(new Text(tokenArray[0].trim()));

            // a vertex may have no outgoing edges at all (dangling node)
            if (tokenArray.length > 1) {
                for (String v : tokenArray[1].trim().split(" ")) {
                    vertex.addEdge(new Edge<Text, NullWritable>(new Text(v), null));
                }
            }

            return true;
        }
    }
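
    // A sample input line the text reader above would accept (hypothetical
    // data, matching the splits on "\t" and " "):
    //
    //     1\t2 3 4
    //
    // i.e. vertex id "1", a tab, then a space-separated list of the ids of
    // its outgoing neighbours.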

    public static class PagerankJsonReader
            extends VertexInputReader<LongWritable, Text, Text, NullWritable, DoubleWritable> {

        @SuppressWarnings("unchecked")
        @Override
        public boolean parseVertex(LongWritable key, Text value, Vertex<Text, NullWritable, DoubleWritable> vertex)
                throws Exception {
            JSONArray jsonArray = (JSONArray) new JSONParser().parse(value.toString());

            vertex.setVertexID(new Text(jsonArray.get(0).toString()));

            Iterator<JSONArray> iter = ((JSONArray) jsonArray.get(2)).iterator();
            while (iter.hasNext()) {
                JSONArray edge = (JSONArray) iter.next();
                vertex.addEdge(new Edge<Text, NullWritable>(new Text(edge.get(0).toString()), null));
            }

            return true;
        }
    }
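
    // Based on the indices read above, a matching JSON input line might look
    // like this (hypothetical sample): element 0 is the vertex id, element 2
    // is the adjacency list, and element 1 is a placeholder this reader
    // never reads:
    //
    //     ["1", 0.0, [["2"], ["3"], ["4"]]]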

    public static GraphJob createJob(String[] args, HamaConfiguration conf, Options opts)
            throws IOException, ParseException {
        CommandLine cliParser = new GnuParser().parse(opts, args);

        if (!cliParser.hasOption("i") || !cliParser.hasOption("o")) {
            System.out.println("No input or output path specified for PageRank, exiting.");
            System.exit(-1);
        }

        GraphJob pageJob = new GraphJob(conf, PageRank.class);
        pageJob.setJobName("Pagerank");

        pageJob.setVertexClass(PageRankVertex.class);
        pageJob.setInputPath(new Path(cliParser.getOptionValue("i")));
        pageJob.setOutputPath(new Path(cliParser.getOptionValue("o")));

        // set the defaults
        pageJob.setMaxIteration(30);
        pageJob.set("hama.pagerank.alpha", "0.85");
        // give every vertex a self-referencing edge: compute() has no
        // explicit dangling-node contribution, so the self-edge keeps a
        // dangling node's rank from being lost
        pageJob.set("hama.graph.self.ref", "true");
        pageJob.set("hama.graph.max.convergence.error", "0.001");

        if (cliParser.hasOption("t")) {
            pageJob.setNumBspTask(Integer.parseInt(cliParser.getOptionValue("t")));
        }

        // the average aggregator computes the global convergence error
        pageJob.setAggregatorClass(AverageAggregator.class);

        // Choose the vertex input reader according to the input file type
        // (text or JSON).
        if (cliParser.hasOption("f")) {
            if (cliParser.getOptionValue("f").equals("text")) {
                pageJob.setVertexInputReaderClass(PagerankTextReader.class);
            } else if (cliParser.getOptionValue("f").equals("json")) {
                pageJob.setVertexInputReaderClass(PagerankJsonReader.class);
            } else {
                System.out.println(
                        "Unknown file type for Pagerank; falling back to the default text reader.");
                pageJob.setVertexInputReaderClass(PagerankTextReader.class);
            }
        } else {
            pageJob.setVertexInputReaderClass(PagerankTextReader.class);
        }

        pageJob.setVertexIDClass(Text.class);
        pageJob.setVertexValueClass(DoubleWritable.class);
        pageJob.setEdgeValueClass(NullWritable.class);

        pageJob.setInputFormat(TextInputFormat.class);
        pageJob.setInputKeyClass(LongWritable.class);
        pageJob.setInputValueClass(Text.class);

        pageJob.setPartitioner(HashPartitioner.class);
        pageJob.setOutputFormat(TextOutputFormat.class);
        pageJob.setOutputKeyClass(Text.class);
        pageJob.setOutputValueClass(DoubleWritable.class);
        return pageJob;
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException, ParseException {
        Options opts = new Options();
        opts.addOption("i", "input_path", true, "The Location of output path.");
        opts.addOption("o", "output_path", true, "The Location of input path.");
        opts.addOption("h", "help", false, "Print usage");
        opts.addOption("t", "task_num", true, "The number of tasks.");
        opts.addOption("f", "file_type", true, "The file type of input data. Input"
                + "file format which is \"text\" tab delimiter separated or \"json\"." + "Default value - Text");

        if (args.length < 2) {
            new HelpFormatter()
                    .printHelp("pagerank -i INPUT_PATH -o OUTPUT_PATH " + "[-t NUM_TASKS] [-f FILE_TYPE]", opts);
            System.exit(-1);
        }

        HamaConfiguration conf = new HamaConfiguration();
        GraphJob pageJob = createJob(args, conf, opts);

        long startTime = System.currentTimeMillis();
        if (pageJob.waitForCompletion(true)) {
            System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        }
    }
}
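
Usage

Assuming the class is packaged into a jar and submitted to a running Hama
cluster, an invocation might look like the following sketch (the jar name,
paths, and task count are placeholders, not taken from the source):

    hama jar pagerank-example.jar gws.PageRank -i /tmp/pagerank/input -o /tmp/pagerank/output -t 4 -f text

With -f omitted, the job falls back to the text reader, as createJob shows.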