pad.InitializationDriver.java Source code

Introduction

Here is the source code for pad.InitializationDriver.java
Source

/**
 *   @file InitializationDriver.java
 *   @brief Driver of the Job responsible for transforming the adjacency list or cliques list into a list of edges <nodeID, neighborID>.
 *  @author Federico Conte (draxent)
 *  
 *   Copyright 2015 Federico Conte
 *   https://github.com/Draxent/ConnectedComponents
 * 
 *   Licensed under the Apache License, Version 2.0 (the "License"); 
 *   you may not use this file except in compliance with the License. 
 *   You may obtain a copy of the License at 
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0 
 *  
 *   Unless required by applicable law or agreed to in writing, software 
 *   distributed under the License is distributed on an "AS IS" BASIS, 
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 *   See the License for the specific language governing permissions and 
 *   limitations under the License. 
 */

package pad;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

/** Driver of the Job responsible for transforming the adjacency list or cliques list into an edge list <nodeID, neighborID>. */
public class InitializationDriver extends Configured implements Tool {
    /** The input file can be formatted as an adjacency list or as a cliques list. */
    public enum InputType {
        ADJACENCY_LIST, CLIQUES_LIST
    }

    /** Directory name for multiple output */
    public static final String MOS_OUTPUT_NAME = "result";
    /** Base output path for multiple output */
    public static final String MOS_BASEOUTPUTPATH = MOS_OUTPUT_NAME + "/part";

    private final Path input, output;
    private final boolean verbose;
    private final InputType type;
    private long numCliques, numInitialNodes;

    /**
    * Initializes a new instance of the InitializationDriver class.
    *
    * The input file is scanned line by line until one line allows to classify it:
    * a line containing a tab marks an adjacency list, a line without tabs but with
    * a space marks a cliques list. Lines made of a single token are skipped.
    *
    * @param input      path of the input graph stored on hdfs.
    * @param output      path of the output folder.
    * @param verbose   if <c>true</c> shows on screen the messages of the Job execution.
    * @throws IOException   if the input cannot be read, or if the end of the file is reached
    *                       without finding a line that reveals the format.
    */
    public InitializationDriver(Path input, Path output, boolean verbose) throws IOException {
        this.input = input;
        this.output = output;
        this.verbose = verbose;

        // Resolve the file system from the path itself, so that inputs that do not
        // live on the default file system can be opened too.
        FileSystem fs = this.input.getFileSystem(new Configuration());
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(this.input), "UTF-8"));

        InputType detected;
        try {
            detected = detectInputType(br);
        } finally {
            // Close the file even when reading fails.
            br.close();
        }

        if (detected == null)
            throw new IOException("Unable to determine the format of the input file '" + input
                    + "': no line contains a <TAB> or a space separator.");
        this.type = detected;
    }

    /**
     * Classifies the input format looking at the first line that is not ambiguous.
     * @param reader   reader positioned at the beginning of the input file.
     * @return         the detected format, or <c>null</c> if the end of the file is reached first.
     * @throws IOException   if the reader fails.
     */
    private static InputType detectInputType(BufferedReader reader) throws IOException {
        String line;
        // readLine() returns null at the end of the stream: stop instead of dereferencing it.
        while ((line = reader.readLine()) != null) {
            // Split the line on the tab character.
            if (line.split("\t").length != 1)
                return InputType.ADJACENCY_LIST;

            // <TAB> not found: either the file is a cliques list or the node is alone.
            if (line.split(" ").length > 1)
                return InputType.CLIQUES_LIST;

            // The node is alone: this line says nothing about the format, try the next one.
        }
        return null;
    }

    /**
     * Execute the InitializationDriver Job.
     * 
     * If the input file format is adjacency list, then we can easily determinate the initial number of nodes
     * that is equal to the number of rows of the input file while the number of cliques is zero.
     * In order to obtain a list of arcs from the adjacency list, we use the \see InitializationMapperAdjacency
     * as Mapper and zero Reducer.
     * 
     * If the input file format is cliques list, then we can easily determinate the number of cliques
     * that is equal to the number of rows of the input file.
     * In order to obtain a edges list from the cliques list, we use the \see InitializationMapperClique
     * as Mapper. We store this result into a special folder \see MOS_OUTPUT_NAME.
     * Into the regular folder, this Mapper emits all the encountered nodes.
     * We use \see InitializationReducerNumNodes as Reducer in order to count the initial number of nodes
     * counting all the distinct nodes found. The combiner (\see InitializationCombinerNumNodes) reduce locally
     * the number of duplicated nodes.
     * Obtained the value of the NUM_INITIAL_NODES counter ( \see UtilCounters ), we delete the empty files
     * produced by the Reducer and we move the real results into the main/regular folder.
     * 
     * @param args      array of external arguments, only forwarded to the GenericOptionsParser
     * @return          <c>1</c> if the InitializationDriver Job failed its execution; <c>0</c> if everything is ok. 
     * @throws Exception   if the Job cannot be submitted or if the results cannot be moved into the output folder.
     */
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // GenericOptionsParser invocation in order to suppress the hadoop warning.
        new GenericOptionsParser(conf, args);
        Job job = new Job(conf, "InitializationDriver");
        job.setJarByClass(InitializationDriver.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.addInputPath(job, this.input);
        FileOutputFormat.setOutputPath(job, this.output);

        if (this.type == InputType.ADJACENCY_LIST) {
            // In order to obtain the arcs list from the adjacency list, we need only a Mapper task.
            job.setMapperClass(InitializationMapperAdjacency.class);
            job.setNumReduceTasks(0);
        } else {
            // Set up the special folder.
            MultipleOutputs.addNamedOutput(job, MOS_OUTPUT_NAME, SequenceFileOutputFormat.class, IntWritable.class,
                    IntWritable.class);
            MultipleOutputs.setCountersEnabled(job, true);
            // In order to obtain the edges list from the cliques list, we need only a Mapper task
            // and we save the result into the special folder.
            // Then, we need a Reducer task in order to count the initial number of nodes
            job.setMapperClass(InitializationMapperClique.class);
            job.setCombinerClass(InitializationCombinerNumNodes.class);
            job.setReducerClass(InitializationReducerNumNodes.class);
        }

        if (!job.waitForCompletion(verbose))
            return 1;

        // Set up the private variables looking to the counters value
        this.numCliques = job.getCounters().findCounter(UtilCounters.NUM_CLIQUES).getValue();
        this.numInitialNodes = job.getCounters().findCounter(UtilCounters.NUM_INITIAL_NODES).getValue();

        if (this.type == InputType.CLIQUES_LIST)
            promoteNamedOutputs(this.output.getFileSystem(conf));

        return 0;
    }

    /**
     * Replaces the regular outputs of the Job with the files stored into the special folder \see MOS_OUTPUT_NAME.
     * @param fs   file system that hosts the output folder.
     * @throws IOException   if a file cannot be listed, deleted or moved.
     */
    private void promoteNamedOutputs(FileSystem fs) throws IOException {
        // Delete the empty outputs of the Job
        for (FileStatus fileStatus : fs.listStatus(this.output))
            if (fileStatus.getPath().getName().contains("part"))
                fs.delete(fileStatus.getPath(), false);

        // The special folder may be missing if nothing has been written into the named output
        // (NOTE(review): presumably the case when the Mapper emits no edge): nothing to move then.
        Path specialFolder = this.output.suffix("/" + MOS_OUTPUT_NAME);
        if (!fs.exists(specialFolder))
            return;

        // Move the real outputs into the parent folder
        for (FileStatus fileStatus : fs.listStatus(specialFolder)) {
            Path source = fileStatus.getPath();
            Path target = this.output.suffix("/" + source.getName());
            // rename() signals a failure through its return value: stop here, otherwise the
            // recursive delete below would destroy the results that have not been moved.
            if (!fs.rename(source, target))
                throw new IOException("Unable to move '" + source + "' to '" + target + "'.");
        }

        // Delete empty special folder
        fs.delete(specialFolder, true);
    }

    /**
     * Return the type of format of the input file.
     * @return    the type of format of the input file.
     */
    public InputType getInputType() {
        return this.type;
    }

    /**
     * Returns the number of cliques found in the input file.
     * The value is read from the Job counters, so it is meaningful only after a successful \see run.
     * @return    number of cliques.
     */
    public long getNumCliques() {
        return this.numCliques;
    }

    /**
     * Returns the number of initial nodes found in the input file.
     * The value is read from the Job counters, so it is meaningful only after a successful \see run.
     * @return    number of initial nodes.
     */
    public long getNumInitialNodes() {
        return this.numInitialNodes;
    }

    /**
     * Main of the \see InitializationDriver class.
     * If the Job fails, the output folder is deleted and the process exits with status <c>1</c>.
     * @param args   array of external arguments: the input path and the output path.
     * @throws Exception   if the input cannot be analyzed or the Job execution throws.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("Usage: InitializationDriver <input> <output>");
            System.exit(1);
        }

        Path input = new Path(args[0]);
        Path output = new Path(args[1]);
        System.out.println("Start InitializationDriver. ");
        InitializationDriver init = new InitializationDriver(input, output, true);
        if (init.run(null) != 0) {
            // Remove the partial results from the file system that hosts the output folder.
            output.getFileSystem(new Configuration()).delete(output, true);
            System.exit(1);
        }
        System.out.println("End InitializationDriver.");

        System.exit(0);
    }
}