Java tutorial
package graphcreator;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import SimilarityMeasure.EuclideanSquared;
import SimilarityMeasure.MaximumsNotSetException;
import SimilarityMeasure.SimilarityMeasure;

/* USAGE WARNING:
 *
 * "java.lang.OutOfMemoryError: Java heap space" can frequently occur in this code
 * in (at least) two circumstances:
 *
 * 1) Too few mappers. This causes the sort and shuffle phases to exceed available JVM heap space.
 * 2) Too many mappers. This causes internal fragmentation among HDFS blocks.
 *
 * The Hadoop configurations below have scaled well up to 25,000 records. Behavior beyond
 * 25,000 records is unknown.
 */

/**
 * This MapReduce Creator class can create both complete graphs and k-nearest-neighbor
 * graphs (kNNGs).
 * @author mcneill
 */
public class Creator {

    /**
     * The base path to use for intermediate HDFS storage.
     */
    private static final String INTERMEDIATE_PATH = "/intermediate";

    public static class EdgeCreate extends Mapper<LongWritable, Text, Text, Text> {

        /**
         * An ArrayList that holds all the lines of the distributed cache.
         */
        ArrayList<String> lines = new ArrayList<String>();

        /**
         * Populate lines with the contents of the distributed cache.
         */
        @Override
        public void setup(Context context) throws IOException {
            if (context.getCacheFiles() != null && context.getCacheFiles().length > 0) {
                URI graphUri = context.getCacheFiles()[0];
                if (graphUri != null) {
                    Path pt = new Path(graphUri.toString());
                    FileSystem fs = FileSystem.get(new Configuration());
                    InputStreamReader isr = new InputStreamReader(fs.open(pt));
                    BufferedReader br = new BufferedReader(isr);
                    String line;
                    while ((line = br.readLine()) != null) {
                        lines.add(line);
                    }
                    br.close();
                    isr.close();
                }
            }
        }

        /**
         * Use the input and the distributed cache to pair vertices.
         */
        @Override
        public void map(LongWritable key, Text val, Context context)
                throws IOException, InterruptedException {
            int number = Integer.parseInt(val.toString().split(" |\t")[0]);
            // The initial value of j prohibits duplicate pairs: vertex i is only paired
            // with vertices numbered i + 1 and above. This assumes line i of the cache
            // file holds the record for vertex i.
            for (int j = number + 1; j < lines.size(); j++) {
                context.write(val, new Text(lines.get(j)));
            }
        }
    }
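    /* Illustration (hypothetical data, not part of the original tutorial): if the
     * distributed-cache file contains the records
     *
     *     0 1.0,2.0
     *     1 1.5,0.5
     *     2 3.0,3.0
     *
     * then the map call that receives record "1 1.5,0.5" emits only the single pair
     * ("1 1.5,0.5", "2 3.0,3.0"). The pair (0, 1) is produced by the call that maps
     * record 0, so each unordered vertex pair is emitted exactly once.
     */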
    /**
     * Calculate edge weights for each vertex pair.
     * @author mcneill
     */
    public static class DistanceCalc extends Reducer<Text, Text, IntWritable, Text> {

        /**
         * The similarity measure to be used for distance calculations.
         */
        SimilarityMeasure sm = new EuclideanSquared();

        /**
         * This setup method reads the "maximums" property and passes the maximums to the
         * similarity measure. This code was usually not used (if the "maximums" property
         * is not set it has no effect); maximums were instead typically hard-coded as
         * literals in the relevant similarity measures.
         */
        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            String maximumsString = context.getConfiguration().get("maximums");
            if (maximumsString != null) {
                double a[] = new double[maximumsString.split(",").length];
                // Strip the enclosing brackets from the property before splitting on commas.
                String maxSplit[] =
                        maximumsString.substring(1, maximumsString.length() - 1).split(",");
                for (int i = 0; i < a.length; i++) {
                    a[i] = Double.parseDouble(maxSplit[i]);
                }
                sm.setMaximums(a);
            }
        }

        /**
         * This reducer inputs vertex pairs (binned by the lesser vertex) with their
         * associated data vectors and outputs the vertex pair with the associated
         * edge weight.
         */
        @Override
        public void reduce(Text key, Iterable<Text> vals, Context context)
                throws IOException, InterruptedException {
            StringTokenizer splitA1 = new StringTokenizer(key.toString());
            int a = Integer.parseInt(splitA1.nextToken());
            String[] stringA = splitA1.nextToken().split(" |,");
            for (Text val : vals) {
                StringTokenizer splitB1 = new StringTokenizer(val.toString());
                int b = Integer.parseInt(splitB1.nextToken());
                String[] stringB = splitB1.nextToken().split(" |,");
                try {
                    double distance = sm.getDistance(stringA, stringB);
                    // Identical vectors receive the maximum weight; otherwise the weight
                    // decays exponentially with distance.
                    double weight = distance == 0 ? sm.maxDistance() : Math.exp(-distance);
                    context.write(new IntWritable(a),
                            new Text(Integer.toString(b) + " " + Double.toString(weight)));
                } catch (MaximumsNotSetException e) {
                    e.printStackTrace();
                }
            }
        }
    }
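    /* Worked example (hypothetical numbers, not from the original, and assuming
     * EuclideanSquared computes the squared Euclidean distance, as its name suggests):
     * for the records "0 1.0,2.0" and "1 1.5,0.5", the distance is
     * (1.0 - 1.5)^2 + (2.0 - 0.5)^2 = 0.25 + 2.25 = 2.5, so the emitted weight is
     * exp(-2.5), approximately 0.0821. Closer vectors therefore receive weights
     * nearer to 1, and identical vectors receive sm.maxDistance().
     */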
    /**
     * This mapper bins each weighted edge under both of its vertices.
     * @author mcneill
     */
    public static class BinByVertex extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text val, Context context)
                throws IOException, InterruptedException {
            String values[] = val.toString().split(" |\t");
            context.write(new Text(values[0]), new Text(values[1] + " " + values[2]));
            context.write(new Text(values[1]), new Text(values[0] + " " + values[2]));
        }
    }

    /**
     * This reducer takes edges binned by vertex and emits only the k highest-weighted
     * edges, where k is the value of the property "kVal".
     * @author mcneill
     */
    public static class kNNFilter extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> vals, Context context)
                throws IOException, InterruptedException {
            int kVal = Integer.parseInt(context.getConfiguration().get("kVal"));
            int kEdgesV[] = new int[kVal];
            double kEdgesW[] = new double[kVal];
            int minPos = 0;
            double minWeight = 0;
            for (Text val : vals) {
                String split[] = val.toString().split(" ");
                int v = Integer.parseInt(split[0]);
                double w = Double.parseDouble(split[1]);
                // Replace the lightest stored edge whenever a heavier one arrives, then
                // rescan the arrays to find the new lightest edge. This assumes each
                // vertex has at least kVal candidate edges; otherwise the zero-initialized
                // slots are emitted as spurious edges below.
                if (w > minWeight) {
                    kEdgesV[minPos] = v;
                    kEdgesW[minPos] = w;
                    minWeight = Double.MAX_VALUE;
                    for (int i = 0; i < kVal; i++) {
                        if (kEdgesW[i] < minWeight) {
                            minPos = i;
                            minWeight = kEdgesW[i];
                        }
                    }
                }
            }
            // Emit the surviving edges with their vertices in sorted order so that
            // duplicates can be detected downstream.
            for (int i = 0; i < kVal; i++) {
                if (Integer.parseInt(key.toString()) < kEdgesV[i]) {
                    context.write(
                            new Text(key.toString() + " " + Integer.toString(kEdgesV[i])),
                            new Text(Double.toString(kEdgesW[i])));
                } else {
                    context.write(
                            new Text(Integer.toString(kEdgesV[i]) + " " + key.toString()),
                            new Text(Double.toString(kEdgesW[i])));
                }
            }
        }
    }

    /**
     * The identity map: it splits each line into a key and a value on the tab character
     * and passes them through otherwise unchanged.
     * @author mcneill
     */
    public static class IdentityMap extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text val, Context context)
                throws IOException, InterruptedException {
            String values[] = val.toString().split("\t");
            context.write(new Text(values[0]), new Text(values[1]));
        }
    }

    /**
     * This reducer takes weighted edges, binned by the sorted vertex pair of the edge,
     * and removes duplicates.
     * @author mcneill
     */
    public static class RemoveDuplicateEdges extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> vals, Context context)
                throws IOException, InterruptedException {
            // Emit only the first value, since all values for a given key are identical.
            context.write(key, vals.iterator().next());
        }
    }
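    /* Why duplicates arise (explanatory note, not part of the original): kNNFilter runs
     * once per vertex, so an edge (u, v) that is among the k strongest for both u and v
     * is emitted twice, both times under the same sorted key "u v". The IdentityMap /
     * RemoveDuplicateEdges job collapses such pairs, keeping one copy of each edge.
     */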
    public static void main(String[] args) throws Exception {
        /*
         * args[0] is the HDFS input path
         * args[1] is the HDFS output path
         * args[2] is the location of the single input file
         * args[3] is the k value for kNNG creation, or -1 if a complete graph is desired
         */

        /*
         * conf1 gives a smaller amount of input to each MapReduce job, making it optimal
         * for MR jobs where output or intermediate data volume is significantly larger
         * than input data volume.
         */
        Configuration conf1 = new Configuration();
        conf1.set("mapreduce.input.fileinputformat.split.maxsize", "5000");
        conf1.set("mapreduce.job.split.metainfo.maxsize", "-1");
        conf1.set("mapreduce.job.reduces", "100");

        /*
         * conf2 is optimal for MapReduce jobs where data volume is roughly consistent
         * throughout the MR job.
         */
        Configuration conf2 = new Configuration();
        conf2.set("mapreduce.input.fileinputformat.split.maxsize", "5000000");
        conf2.set("mapreduce.job.split.metainfo.maxsize", "-1");
        conf2.set("mapreduce.job.reduces", "100");
        conf2.set("kVal", args[3]);

        /* GRAPH CREATION */
        Job job1 = Job.getInstance(conf1, "edge creation");
        job1.setJarByClass(Creator.class);
        job1.setMapperClass(EdgeCreate.class);
        job1.setReducerClass(DistanceCalc.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setOutputKeyClass(IntWritable.class);
        job1.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        job1.addCacheFile(new URI("hdfs://localhost:9000/user/hduser/" + args[2]));
        if (!args[3].equals("-1")) {
            FileOutputFormat.setOutputPath(job1, new Path(args[1] + INTERMEDIATE_PATH + "1"));
        } else {
            FileOutputFormat.setOutputPath(job1, new Path(args[1] + "/output"));
        }
        job1.waitForCompletion(true);

        /*
         * Only run the kNNG trimming MapReduce jobs if a kNNG is desired, as opposed to
         * a complete graph.
         */
        if (!args[3].equals("-1")) {
            Job job2 = Job.getInstance(conf2, "trimming 1");
            job2.setJarByClass(Creator.class);
            job2.setMapperClass(BinByVertex.class);
            job2.setReducerClass(kNNFilter.class);
            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(Text.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job2, new Path(args[1] + INTERMEDIATE_PATH + "1"));
            FileOutputFormat.setOutputPath(job2, new Path(args[1] + INTERMEDIATE_PATH + "2"));
            job2.waitForCompletion(true);

            Job job3 = Job.getInstance(conf2, "trimming 2");
            job3.setJarByClass(Creator.class);
            job3.setMapperClass(IdentityMap.class);
            job3.setReducerClass(RemoveDuplicateEdges.class);
            job3.setMapOutputKeyClass(Text.class);
            job3.setMapOutputValueClass(Text.class);
            job3.setOutputKeyClass(Text.class);
            job3.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job3, new Path(args[1] + INTERMEDIATE_PATH + "2"));
            FileOutputFormat.setOutputPath(job3, new Path(args[1] + "/output"));
            job3.waitForCompletion(true);
        }
    }
}
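The SimilarityMeasure package imported at the top of the file is not shown in this tutorial. The sketch below is a hypothetical reconstruction inferred only from the calls made in DistanceCalc (setMaximums, getDistance, maxDistance); the real package may differ, and in practice each public type would live in its own source file under a SimilarityMeasure/ directory.

    package SimilarityMeasure;

    /* Hypothetical sketch, inferred from usage in Creator.DistanceCalc.
     * Not the actual library source; condensed into one listing for brevity. */
    public interface SimilarityMeasure {
        /** Per-dimension maximums, for measures that normalize their features. */
        void setMaximums(double[] maximums);

        /** Distance between two feature vectors given as string tokens. */
        double getDistance(String[] a, String[] b) throws MaximumsNotSetException;

        /** Weight assigned to identical vectors. Creator computes exp(-distance),
         *  which is below 1 for any positive distance, so returning 1.0 keeps
         *  identical vectors the heaviest edges; the real value may differ. */
        double maxDistance();
    }

    public class MaximumsNotSetException extends Exception { }

    /** Squared Euclidean distance over numeric feature vectors (sketch). */
    public class EuclideanSquared implements SimilarityMeasure {
        private double[] maximums; // unused by this measure, kept for the interface

        @Override
        public void setMaximums(double[] maximums) {
            this.maximums = maximums;
        }

        @Override
        public double getDistance(String[] a, String[] b) {
            double sum = 0;
            for (int i = 0; i < a.length; i++) {
                double d = Double.parseDouble(a[i]) - Double.parseDouble(b[i]);
                sum += d * d; // squared Euclidean: no square root taken
            }
            return sum;
        }

        @Override
        public double maxDistance() {
            return 1.0;
        }
    }

Assuming the compiled classes are packaged into a jar named graphcreator.jar (a hypothetical name), a typical invocation for a 10-nearest-neighbor graph might look like:

    hadoop jar graphcreator.jar graphcreator.Creator /user/hduser/input /user/hduser/graph data.txt 10

Note that job1 hard-codes the cache-file prefix hdfs://localhost:9000/user/hduser/, so args[2] is resolved relative to that directory.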