junto.algorithm.parallel.LP_ZGL_Hadoop.java Source code

Introduction

Here is the source code for junto.algorithm.parallel.LP_ZGL_Hadoop.java
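
LP_ZGL_Hadoop runs label propagation in the style of Zhu, Ghahramani, and Lafferty (the "ZGL" in the class name) as a chain of Hadoop MapReduce jobs, one per iteration. In each iteration the mapper sends every node's current label distribution to its neighbors, and the reducer re-estimates each node's distribution from the messages it receives, clamping seed nodes back to their injected labels.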

Source

package junto.algorithm.parallel;

/**
 * Copyright 2011 Partha Pratim Talukdar
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import gnu.trove.map.hash.TObjectDoubleHashMap;
import gnu.trove.iterator.TObjectDoubleIterator;

import java.io.IOException;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;

import junto.config.*;
import junto.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class LP_ZGL_Hadoop {

    private static String _kDelim = "\t";

    public static class LP_ZGL_Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            /////
            // Constructing the vertex from the string representation
            /////
            String line = value.toString();

            // id gold_label injected_labels estimated_labels neighbors rw_probabilities 
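            // Hypothetical example line (the exact serialization of the
            // map-valued fields is whatever CollectionUtil.Map2String and
            // String2Map write and read):
            //   n1<TAB>L1<TAB>L1 1.0<TAB><TAB>n2 0.5 n3 0.5<TAB>...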
            String[] fields = line.split(_kDelim);
            TObjectDoubleHashMap neighbors = CollectionUtil.String2Map(fields[4]);

            boolean isSeedNode = fields[2].length() > 0;

            // If the current node is a seed node but it has no estimated
            // label information yet, transfer the seed labels to the
            // estimated label distribution. This typically happens only in
            // the map phase of the very first iteration.
            if (isSeedNode && fields[3].length() == 0) {
                fields[3] = fields[2];
            }

            // Send two types of messages:
            //   -- self messages which will store the injection labels and
            //        random walk probabilities.
            //   -- messages to neighbors about current estimated scores
            //        of the node.
            //
            // message to self
            output.collect(new Text(fields[0]), new Text(line));

            // message to neighbors
            TObjectDoubleIterator neighIterator = neighbors.iterator();
            while (neighIterator.hasNext()) {
                neighIterator.advance();

                // message: (neighbor_node, current_node + DELIM + curr_node_label_scores)
                output.collect(new Text((String) neighIterator.key()), new Text(fields[0] + _kDelim + fields[3]));
            }
        }
    }

    public static class LP_ZGL_Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        private static double mu1;
        private static double mu2;
        private static int keepTopKLabels;

        public void configure(JobConf conf) {
            mu1 = Double.parseDouble(conf.get("mu1"));
            mu2 = Double.parseDouble(conf.get("mu2"));
            keepTopKLabels = Integer.parseInt(conf.get("keepTopKLabels"));
        }

        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // new scores estimated for the current node
            TObjectDoubleHashMap newEstimatedScores = new TObjectDoubleHashMap();

            // set to true once the node's message to itself has been seen.
            boolean isSelfMessageFound = false;

            String vertexId = key.toString();
            String vertexString = "";

            TObjectDoubleHashMap neighbors = null;
            TObjectDoubleHashMap randWalkProbs = null;

            HashMap<String, String> neighScores = new HashMap<String, String>();

            int totalMessagesReceived = 0;
            boolean isSeedNode = false;

            // iterate over all the messages received at the node
            while (values.hasNext()) {
                ++totalMessagesReceived;

                String val = values.next().toString();
                String[] fields = val.split(_kDelim);

                // System.out.println("src: " + fields[0] + " dest: " + vertexId +
                //         "MESSAGE>>" + val + "<<");

                // self-message check
                if (vertexId.equals(fields[0])) {
                    isSelfMessageFound = true;
                    vertexString = val;

                    // System.out.println("Reduce: " + vertexId + " " + val + " " + fields.length);

                    TObjectDoubleHashMap injLabels = CollectionUtil.String2Map(fields[2]);
                    neighbors = CollectionUtil.String2Map(neighbors, fields[4]);
                    randWalkProbs = CollectionUtil.String2Map(fields[5]);

                    if (injLabels.size() > 0) {
                        isSeedNode = true;

                        // add injected labels to the estimated scores.
                        ProbUtil.AddScores(newEstimatedScores, mu1, injLabels);
                    }
                } else {
                    // a missing second field means the neighbor has no
                    // valid label assignment yet.
                    if (fields.length > 1) {
                        neighScores.put(fields[0], fields[1]);
                    }
                }
            }

            // terminate if message from self is not received.
            if (!isSelfMessageFound) {
                throw new RuntimeException("Self message not received for node " + vertexId);
            }

            // Add neighbor label scores to the current node's label estimates
            // only if the current node is not a seed node. Seed nodes are
            // clamped back to the injected label distribution, which was
            // already done above while processing the self message.
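            //
            // In effect the update is:
            //   seed node:     new_scores ∝ mu1 * injected_labels           (clamped)
            //   non-seed node: new_scores ∝ sum_u mu2 * w(u,v) * scores(u)  (normalized)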
            if (!isSeedNode) {
                // combine the neighbors' label distributions into a single
                // weighted label distribution
                TObjectDoubleHashMap weightedNeighLabelDist = new TObjectDoubleHashMap();
                Iterator<String> neighIter = neighScores.keySet().iterator();
                while (neighIter.hasNext()) {
                    String neighName = neighIter.next();
                    ProbUtil.AddScores(weightedNeighLabelDist,
                            mu2 * neighbors.get(neighName), CollectionUtil.String2Map(neighScores.get(neighName)));
                }
                ProbUtil.Normalize(weightedNeighLabelDist, keepTopKLabels);

                // now add the combined neighbor label distribution to the
                // estimate of the current node's labels.
                ProbUtil.AddScores(newEstimatedScores, 1.0, weightedNeighLabelDist);
            }

            // normalize the scores
            ProbUtil.Normalize(newEstimatedScores);

            // now reconstruct the vertex representation (with the new estimated
            // scores) so that the output of this reducer can be used as input
            // to the next iteration's mapper.
            String[] vertexFields = vertexString.split(_kDelim);

            // Drop the id field (it is re-emitted as the output key) and
            // replace the estimated scores with the new ones.
            String[] newVertexFields = new String[vertexFields.length - 1];
            for (int i = 1; i < vertexFields.length; ++i) {
                newVertexFields[i - 1] = vertexFields[i];
            }
            // after the shift, index 2 holds the estimated_labels field
            newVertexFields[2] = CollectionUtil.Map2String(newEstimatedScores);

            output.collect(key, new Text(CollectionUtil.Join(newVertexFields, _kDelim)));
        }
    }

    public static void main(String[] args) throws Exception {
        Hashtable config = ConfigReader.read_config(args);

        String baseInputFilePat = Defaults.GetValueOrDie(config, "hdfs_input_pattern");
        String baseOutputFilePat = Defaults.GetValueOrDie(config, "hdfs_output_base");
        int numIterations = Integer.parseInt(Defaults.GetValueOrDie(config, "iters"));

        String currInputFilePat = baseInputFilePat;
        String currOutputFilePat = "";
        for (int iter = 1; iter <= numIterations; ++iter) {
            JobConf conf = new JobConf(LP_ZGL_Hadoop.class);
            conf.setJobName("lp_zgl_hadoop");

            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(Text.class);

            conf.setMapperClass(LP_ZGL_Map.class);
            // No combiner: the reducer requires the self message and the
            // complete set of neighbor messages for a node, so partial
            // reduction would break it.
            // conf.setCombinerClass(LP_ZGL_Reduce.class);
            conf.setReducerClass(LP_ZGL_Reduce.class);

            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);

            // hyperparameters
            conf.set("mu1", Defaults.GetValueOrDie(config, "mu1"));
            conf.set("mu2", Defaults.GetValueOrDie(config, "mu2"));
            conf.set("keepTopKLabels", Defaults.GetValueOrDefault((String) config.get("keep_top_k_labels"),
                    Integer.toString(Integer.MAX_VALUE)));

            if (iter > 1) {
                // output from last iteration is the input for current iteration
                currInputFilePat = currOutputFilePat + "/*";
            }
            FileInputFormat.setInputPaths(conf, new Path(currInputFilePat));

            currOutputFilePat = baseOutputFilePat + "_" + iter;
            FileOutputFormat.setOutputPath(conf, new Path(currOutputFilePat));

            JobClient.runJob(conf);
        }
    }
}
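
Example

To make the per-node arithmetic concrete, below is a minimal single-machine sketch of the update the reducer performs. It is a sketch under stated assumptions, not part of Junto: it uses plain java.util maps in place of Trove, omits the keepTopKLabels truncation, and all names (ZglUpdateSketch, updateNode) are illustrative.

import java.util.HashMap;
import java.util.Map;

public class ZglUpdateSketch {

    /**
     * One ZGL-style update for a single node: seed nodes are clamped back to
     * their injected labels (weighted by mu1); non-seed nodes take the
     * mu2-weighted sum of their neighbors' label distributions.
     */
    static Map<String, Double> updateNode(
            Map<String, Double> injectedLabels,              // empty for non-seed nodes
            Map<String, Double> neighborWeights,             // neighbor id -> edge weight
            Map<String, Map<String, Double>> neighborScores, // neighbor id -> label scores
            double mu1, double mu2) {

        Map<String, Double> newScores = new HashMap<>();

        if (!injectedLabels.isEmpty()) {
            // seed node: clamp back the injected label distribution
            injectedLabels.forEach((label, s) -> newScores.merge(label, mu1 * s, Double::sum));
        } else {
            // non-seed node: aggregate the neighbors' current estimates
            for (Map.Entry<String, Map<String, Double>> e : neighborScores.entrySet()) {
                double w = mu2 * neighborWeights.getOrDefault(e.getKey(), 0.0);
                e.getValue().forEach((label, s) -> newScores.merge(label, w * s, Double::sum));
            }
        }

        // normalize to a probability distribution
        double total = newScores.values().stream().mapToDouble(Double::doubleValue).sum();
        if (total > 0) {
            newScores.replaceAll((label, s) -> s / total);
        }
        return newScores;
    }

    public static void main(String[] args) {
        // toy graph: n2 is unlabeled and hears from seed n1 (label L1) and n3 (label L2)
        Map<String, Map<String, Double>> received = new HashMap<>();
        received.put("n1", Map.of("L1", 1.0));
        received.put("n3", Map.of("L2", 1.0));
        Map<String, Double> weights = Map.of("n1", 0.8, "n3", 0.2);

        // prints {L1=0.8, L2=0.2} (up to floating point and map ordering)
        System.out.println(updateNode(new HashMap<>(), weights, received, 1.0, 0.01));
    }
}

Clamping the seed nodes on every iteration is what characterizes this family of label propagation algorithms: labeled nodes never drift, and their labels diffuse outward through the edge weights over successive iterations.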