Java tutorial
/* Licensed to the DARPA XDATA project. DARPA XDATA licenses this file to you
   under the Apache License, Version 2.0 (the "License"); you may not use this
   file except in compliance with the License. You may obtain a copy of the
   License at http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   License for the specific language governing permissions and limitations
   under the License. */

package smile.wide.facebook;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import smile.Network;
import smile.wide.AttributeValueHistogram;
import smile.wide.PerInstanceInferenceMapper;
import smile.wide.hadoop.io.DoubleArrayWritable;

/** The experimental driver class for the facebook data test
 *
 * This class should drive more generalized inference mapper
 * and reducer classes. In general, experiment drivers should
 * - run a preparation step in which the necessary information
 *   is collected about the datasets,
 * - build up the network using this information,
 * - save it in a file,
 * - run inference or learning as needed on the saved network.
 *
 * This class is also scheduled to become obsolete as it is
 * replaced by the general Network class. Ideally, the user
 * should not have to write long drivers like this.
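 *
 * In this driver, prepare() implements the preparation pass, inference()
 * submits the per-instance inference job, and run() executes them as
 * pass 1 and pass 2.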
 *
 * @author tomas.singliar@boeing.com
 */
public class ExperimentDriver extends Configured implements Tool {

    // =======================================================================
    // Parameters

    // parameterizing the Hive connection
    private static boolean useHiveJDBC_ = false;
    // TODO: make this HiveJDBC configuration a collection of command-line parameters
    private static String jdbcDriverName = "org.apache.hadoop.hive.jdbc.HiveDriver";
    private static String jdbcConnectURL = "jdbc:hive://centoshead:10000/default";
    private static String tableName = "facebook";

    // HDFS directory where the native lib lives - will be taken from there by distributed cache
    // TODO: make this a configurable parameter
    private String libHDFSPath_ = "/user/tsingliar/lib/linux64";

    // TODO: make this a command-line configurable parameter
    String[] columns_ = new String[] {
            "FirstName", "MiddleName", "Sex", "IsAppUser", "FriendCount", "LikesCount" };

    // =======================================================================
    // Logging

    private static final Logger s_logger;
    static {
        s_logger = Logger.getLogger(ExperimentDriver.class);
        s_logger.setLevel(Level.DEBUG);
    }

    // =======================================================================
    // Parameters obtained from command line or execution environment

    private String inPath_ = null;
    private String outPath_ = null;
    private URI jobHDFSPath_ = null;

    // file which will have the Bayesian network which
    // will be processed by the inference phase
    private String networkFile_ = null;
    private String modifiedNetwork_ = null;

    private Configuration conf_ = getConf();

    // =======================================================================
    // Stuff from data

    // stuff that comes from the preparation stage to the inference stage
    private Map<String, Map<String, Integer>> attValueCounts_;

    // ================================================================================
    // Methods

    /** Populates attValueCounts_ by executing a Hive query and parsing the result
     *
     * @return 0 if success
     */
    private int collectAttValuesFromHive() throws SQLException, FileNotFoundException {

        attValueCounts_ = new HashMap<String, Map<String, Integer>>();

        try {
            Class.forName(jdbcDriverName);
        } catch (ClassNotFoundException e) {
            s_logger.error("Can't initialize the HiveJDBC driver");
            e.printStackTrace();
            return 1;
        }

        Connection con = DriverManager.getConnection(jdbcConnectURL, "", "");
        Statement stmt = con.createStatement();

        // show tables
        String sql = "use facebook";
        stmt.executeQuery(sql);

        sql = "describe " + tableName;
        System.out.println("Table description: " + sql);
        ResultSet res = stmt.executeQuery(sql);
        while (res.next()) {
            System.out.println(res.getString(1) + "\t" + res.getString(2));
        }

        // for each column, record unique values
        // suboptimal to do it one by one, but for testing Hive connectivity it will do
        for (String column : columns_) {
            Map<String, Integer> valueCounts = new HashMap<String, Integer>();
            sql = "select " + column + ", count(*) from facebook group by " + column;
            res = stmt.executeQuery(sql);
            while (res.next()) {
                String value = res.getString(1);
                System.out.println("Attribute " + column);
                System.out.println("\t " + value + "\t " + res.getInt(2) + " occurrences");
                valueCounts.put(value, res.getInt(2));
            }
            attValueCounts_.put(column, valueCounts);
        }

        return 0;
    }

    /** Populates attValueCounts_ by running an MR Job.
     * Populates jobHDFSPath_ with someplace we can put temp files.
     *
     * @return 0 if success
     */
    private int collectAttValuesByMRJob() {

        // pathInfo();

        // go through the dataset and collect all values of all columns
        AttributeValueHistogram coll = new AttributeValueHistogram();

        // let's pass this through the args interface, just as an exercise
        // in doing it this way because AttributeValueHistogram could be a
        // standalone job, too, so it needs to be able to parse happily
        // conf_.set("xdata.bayesnets.datasetreader.class", FacebookCSVReader.class.getName());
        // conf_.set("xdata.bayesnets.datasetreader.filter", "3,5,7,10,11,12" );

        String[] args = new String[] {
                inPath_,
                "xdata.bayesnets.datasetreader.class" + "=" + FacebookCSVReader.class.getName(),
                "xdata.bayesnets.datasetreader.filter" + "=" + "3,5,7,10,11,12",
                "xdata.bayesnets.datasetreader.instid" + "=" + "1",
                "xdata.bayesnets.datasetreader.variablenames" + "=" + concat(columns_, ",") };

        try {
            ToolRunner.run(conf_, coll, args);
            jobHDFSPath_ = coll.getWorkingDir();
            attValueCounts_ = coll.AttributeValues();
        } catch (Exception e) {
            s_logger.error("Exception caught in AttributeValueHistogram: " + e.getMessage());
            e.printStackTrace();
            return 2;
        }

        return 0;
    }

    /**
     * Runs the preparation step for the experiment:
     * collects attribute values and prepares the Bayes network.
     *
     * @return error code from execution
     * @throws SQLException
     * @throws FileNotFoundException
     */
    private int prepare() {

        // collect the values contained in the dataset
        if (useHiveJDBC_) {
            try {
                collectAttValuesFromHive();
            } catch (FileNotFoundException e) {
                s_logger.error("Hive attribute/value collection failed on file error");
                e.printStackTrace();
            } catch (SQLException e) {
                s_logger.error("Hive attribute/value collection failed on SQL problem.");
                e.printStackTrace();
            }
        } else {
            collectAttValuesByMRJob();
        }

        // load the network file
        Network theNet = new Network();
        theNet.readFile(networkFile_);

        // modify it using the collected attributes
        // update the possible outcomes for every node
        for (String var : new String[] { "FirstName", "MiddleName", "Sex", "IsAppUser" }) {

            String[] origOutcomes = theNet.getOutcomeIds(var);
            s_logger.debug("In the data, " + attValueCounts_.get(var).size()
                    + " outcomes for " + var);

            HashSet<String> normalizedValues = new HashSet<String>();
            for (String attval : attValueCounts_.get(var).keySet()) {
                String trimmed = attval.replace(".", "").trim().toLowerCase();
                if (trimmed.matches("[a-zA-Z_]\\w*")) {
                    normalizedValues.add(trimmed);
                }
            }
            s_logger.debug("Normalized " + attValueCounts_.get(var).size() + " outcomes for "
                    + var + " to " + normalizedValues.size());

            int i = 0;
            for (String av : normalizedValues) {
                theNet.addOutcome(var, av);
                ++i;
                if (i % 10000 == 0) {
                    s_logger.info("Adding attribute # " + i + " - " + av);
                }
            }

            s_logger.debug("Deleting the original outcomes from " + var);
            for (String out : origOutcomes) {
                if (!normalizedValues.contains(out) && (theNet.getOutcomeCount(var) > 2)) {
                    theNet.deleteOutcome(var, out);
                }
            }

            s_logger.debug("Setting non-zero CPT for " + var);
            // make all outcomes possible
            double[] def = theNet.getNodeDefinition(var);
            int outcomes = theNet.getOutcomeCount(var);
            int parentconf = def.length / outcomes; // how many parent configurations
            Random r = new Random();
            for (int p = 0; p < parentconf; ++p) {
                double sum = 0.0;
                double[] par = new double[outcomes];
                for (int j = 0; j < outcomes; ++j) {
                    par[j] = r.nextDouble() + 0.01;
                    sum += par[j];
                }
                for (int j = 0; j < outcomes; ++j) {
                    def[p * outcomes + j] = par[j] / sum;
                }
            }
            theNet.setNodeDefinition(var, def);
        }

        // save a modified copy
        modifiedNetwork_ = basename(networkFile_);
        modifiedNetwork_ = modifiedNetwork_.substring(0, modifiedNetwork_.lastIndexOf("."))
                + ".mod.xdsl";

        try {
            theNet.writeFile(modifiedNetwork_);
            FileSystem fs = FileSystem.get(conf_);
            fs.mkdirs(new Path(jobHDFSPath_ + "/tmp/"));
            fs.moveFromLocalFile(new Path(modifiedNetwork_),
                    new Path(jobHDFSPath_ + "/tmp/" + modifiedNetwork_));
        } catch (IOException e) {
            s_logger.error("I/O Error recording the modified Bayes network " + modifiedNetwork_
                    + " to " + jobHDFSPath_ + "/tmp/" + modifiedNetwork_);
            e.printStackTrace();
        }

        return 0;
    }

    /**
     * Used for debugging path issues.
     */
    @SuppressWarnings("unused")
    private void pathInfo() {
        // ============== Debug
        String libpath = System.getProperty("java.library.path");
        String clspath = System.getProperty("java.class.path");
        String currentdir = System.getProperty("user.dir");

        s_logger.debug("Classpath is:");
        // gotta print to err to see the output
        s_logger.debug(">>>>>>" + clspath + "<<<<<<");
        s_logger.debug("Libpath is:");
        s_logger.debug(">>>>>>" + libpath + "<<<<<<");
        s_logger.debug("Current directory is: " + currentdir);

        // list the files in the current directory
        s_logger.debug("== Directory listing: =============================");
        File dir = new File(currentdir);
        String[] chld = dir.list();
        if (chld == null) {
            s_logger.fatal("Specified directory '" + currentdir
                    + "' does not exist or is not a directory.");
        } else {
            for (int i = 0; i < chld.length; i++) {
                String fileName = chld[i];
                s_logger.debug(fileName);
            }
        }
        s_logger.debug("== Directory listing: =============================");
    }

    /** Concatenate strings using the given delimiter
     *
     * @param strings
     * @param delim
     * @return
     */
    private String concat(String[] strings, String delim) {
        String result = "";
        for (int i = 0; i < strings.length - 1; ++i) {
            result += strings[i] + delim;
        }
        if (strings.length > 0) {
            result += strings[strings.length - 1];
        }
        return result;
    }

    private int inference() {

        try {
            DistributedCache.createSymlink(conf_);
            try {
                DistributedCache.addCacheFile(
                        new URI(libHDFSPath_ + "/smile.jar#smile.jar"), conf_);
                DistributedCache.addCacheFile(
                        new URI(libHDFSPath_ + "/libjsmile.so#libjsmile.so"), conf_);
                DistributedCache.addCacheFile(
                        new URI(jobHDFSPath_ + "/tmp/" + modifiedNetwork_
                                + "#" + basename(modifiedNetwork_)), conf_);
            } catch (URISyntaxException e) {
                s_logger.fatal("Bad URL for modified network file.");
                return -12;
            }

            // the principle for whether to use string column names or integer column indexes:
            // - when talking about variables in the BN, use string names
            // - when talking about data munging, use column indexes.
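            // For example, "Age" and the variablenames list below are BN variable
            // names, while the filter "3,5,7,10,11,12" lists the corresponding CSV
            // column indexes.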
            // configure the inference task
            conf_.set("xdata.bayesnets.networkfile", basename(modifiedNetwork_));
            conf_.set("xdata.bayesnets.datasetreader.class", FacebookCSVReader.class.getName());
            conf_.set("xdata.bayesnets.datasetreader.filter", "3,5,7,10,11,12");
            conf_.set("xdata.bayesnets.datasetreader.variablenames",
                    "FirstName,MiddleName,Sex,IsAppUser,LikesCount,FriendCount");
            conf_.set("xdata.bayesnets.datasetreader.instid", "1");
            conf_.set("xdata.bayesnets.queryvariable", "Age");

            Job job = new Job(conf_);
            job.setJarByClass(ExperimentDriver.class); // use this jar
            job.setJobName("Facebook Inference Performance Test");

            FileInputFormat.addInputPath(job, new Path(inPath_));
            FileOutputFormat.setOutputPath(job, new Path(outPath_));

            job.setMapperClass(PerInstanceInferenceMapper.class);
            // there need not be any reducer
            // job.setReducerClass(PerInstanceInferenceReducer.class);

            // set both the map and reduce in/out classes
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(DoubleWritable.class);
            // but redefine them for the mapper
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(DoubleArrayWritable.class);

            s_logger.info("Job working directory is " + job.getWorkingDirectory());

            return job.waitForCompletion(true) ? 0 : 1;

        } catch (IOException e) {
            System.err.println("Something went badly wrong in IO.");
            System.exit(2);
        } catch (InterruptedException e) {
            System.err.println("Job interrupted.");
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            System.err.println("ClassNotFound exception.");
            e.printStackTrace();
        }

        return 2;
    }

    private String basename(String fName) {
        return fName.substring(fName.lastIndexOf('/') + 1);
    }

    @Override
    public int run(String[] filteredargs) throws Exception {

        if (filteredargs.length != 3) {
            System.err.println("Usage: ExperimentDriver <network.xdsl> <dataset.csv> <output-dir>");
            String yousaid = "";
            for (String s : filteredargs) {
                yousaid += s + " ";
            }
            System.err.println("You said to the driver: " + yousaid);
            System.err.println("Are those generic arguments supposed to be there?");
            System.exit(1);
        }

        networkFile_ = filteredargs[filteredargs.length - 3];
        inPath_ = filteredargs[filteredargs.length - 2];
        outPath_ = filteredargs[filteredargs.length - 1];

        conf_ = getConf();
        conf_.set("keep.failed.task.files", "true");
        conf_.set("keep.failed.task.pattern", "*");

        // ===================== Pass 1
        int presult = prepare();
        if (presult != 0) {
            s_logger.error("Prepare step failed");
            System.exit(4);
        }
        s_logger.info("Preparations succeeded.");

        // retrieve results of prepare
        /* for (String a : attValueCounts_.keySet()) {
            s_logger.info(a + "= ");
            for (String v : attValueCounts_.get(a).keySet()) {
                s_logger.info("\t" + v + ": " + attValueCounts_.get(a).get(v));
            }
        } */

        // ===================== Pass 2
        int iresult = inference();
        if (iresult != 0) {
            s_logger.error("Inference step failed");
            System.exit(8);
        }

        // retrieve results of inference from outPath_
        return iresult;
    }

    public static void main(String args[]) throws Exception {
        ToolRunner.run(new Configuration(), new ExperimentDriver(), args);
    }
}
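
For reference, a minimal sketch of how the driver can be launched programmatically. The launcher class name, the network file name, and the HDFS paths below are placeholders rather than part of the project; the three arguments correspond to the usage string printed by run(). In practice the same three arguments are usually passed on the command line via hadoop jar, with smile.wide.facebook.ExperimentDriver as the main class.

package smile.wide.facebook;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

/** Hypothetical launcher sketch; the network file and HDFS paths are placeholders. */
public class ExperimentDriverLauncher {
    public static void main(String[] args) throws Exception {
        String[] driverArgs = new String[] {
                "facebook.xdsl",              // <network.xdsl> - Bayes network to adapt and query
                "/user/hadoop/facebook.csv",  // <dataset.csv>  - input dataset on HDFS
                "/user/hadoop/facebook-out"   // <output-dir>   - job output directory
        };
        int rc = ToolRunner.run(new Configuration(), new ExperimentDriver(), driverArgs);
        System.exit(rc);
    }
}

Because ExperimentDriver extends Configured and implements Tool, ToolRunner parses generic Hadoop options (such as -libjars) before the three positional arguments reach run(), which is why run() warns about unexpected "generic arguments" when the count is wrong.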