weka.distributed.hadoop.HadoopJob.java Source code

Introduction

Here is the source code for weka.distributed.hadoop.HadoopJob.java
Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    HadoopJob
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.distributed.hadoop;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.mapreduce.Job;

import weka.core.ClassloaderUtil;
import weka.core.Environment;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.WekaPackageManager;
import weka.distributed.DistributedWekaException;
import distributed.core.DistributedJob;
import distributed.core.DistributedJobConfig;
import distributed.hadoop.HDFSUtils;
import distributed.hadoop.MapReduceJobConfig;

/**
 * Abstract base class for Hadoop jobs. Contains routines for installing Weka
 * libraries in HDFS, running jobs and getting status information on running
 * jobs.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public abstract class HadoopJob extends DistributedJob implements OptionHandler {

    /** For serialization */
    private static final long serialVersionUID = -9026086203818342364L;

    /**
     * Local path to distributedWekaHadoop.jar, i.e. the jar of that name in
     * the "distributedWekaHadoop" directory under the Weka packages directory
     */
    public static final String DISTRIBUTED_WEKA_HADOOP_JAR = WekaPackageManager.PACKAGES_DIR.toString()
            + File.separator + "distributedWekaHadoop" + File.separator + "distributedWekaHadoop.jar";

    /**
     * Local path to distributedWekaBase.jar, i.e. the jar of that name in the
     * "distributedWekaBase" directory under the Weka packages directory
     */
    public static final String DISTRIBUTED_WEKA_BASE_JAR = WekaPackageManager.PACKAGES_DIR.toString()
            + File.separator + "distributedWekaBase" + File.separator + "distributedWekaBase.jar";

    /** Local path to opencsv-2.3.jar in the lib directory of distributedWekaBase */
    public static final String OPEN_CSV_JAR = WekaPackageManager.PACKAGES_DIR.toString() + File.separator
            + "distributedWekaBase" + File.separator + "lib" + File.separator + "opencsv-2.3.jar";

    /** Local path to jfreechart-1.0.13.jar in the lib directory of distributedWekaBase */
    public static final String JFREECHART_JAR = WekaPackageManager.PACKAGES_DIR.toString() + File.separator
            + "distributedWekaBase" + File.separator + "lib" + File.separator + "jfreechart-1.0.13.jar";

    /** Local path to jcommon-1.0.16.jar in the lib directory of distributedWekaBase */
    public static final String JCOMMON_JAR = WekaPackageManager.PACKAGES_DIR.toString() + File.separator
            + "distributedWekaBase" + File.separator + "lib" + File.separator + "jcommon-1.0.16.jar";

    /** Local path to colt-1.2.0.jar in the lib directory of distributedWekaBase */
    public static final String COLT_JAR = WekaPackageManager.PACKAGES_DIR.toString() + File.separator
            + "distributedWekaBase" + File.separator + "lib" + File.separator + "colt-1.2.0.jar";

    /** Local path to la4j-0.4.5.jar in the lib directory of distributedWekaBase */
    public static final String LA4J_JAR = WekaPackageManager.PACKAGES_DIR.toString() + File.separator
            + "distributedWekaBase" + File.separator + "lib" + File.separator + "la4j-0.4.5.jar";

    /**
     * A default path to a weka.jar file. Starts out as {@code weka.jar} in the
     * user's home directory. If the classpath contains a weka.jar file (rather
     * than a directory of weka classes) when Weka is started then the static
     * initializer below replaces it with the location of that jar.
     */
    protected static String DEFAULT_WEKA_JAR_PATH = System.getProperty("user.home") + File.separator + "weka.jar";

    // Attempt to locate the weka.jar in the classpath and set the default
    // path to it. This is strictly best effort: when nothing suitable is found
    // the home-directory default above is kept.
    static {
        try {
            ClassLoader cl = ClassloaderUtil.class.getClassLoader();
            if (cl instanceof URLClassLoader) {
                for (URL u : ((URLClassLoader) cl).getURLs()) {
                    if (!u.toString().endsWith("weka.jar")) {
                        continue;
                    }

                    try {
                        DEFAULT_WEKA_JAR_PATH = new File(u.toURI()).toString();
                    } catch (Exception ignored) {
                        // This URL cannot be turned into a local file (e.g. it is
                        // not a file: URL or is malformed). Skip it and keep
                        // scanning rather than abandoning the whole search.
                    }
                }
            }
        } catch (Exception ignored) {
            // Deliberately ignored: failing to inspect the class loader must not
            // prevent the class from loading; the default path simply stays as is.
        }
    }

    /** Holds the path to the weka.jar; initialised from DEFAULT_WEKA_JAR_PATH */
    protected String m_pathToWekaJar = DEFAULT_WEKA_JAR_PATH;

    /** The main configuration object for this job */
    protected MapReduceJobConfig m_mrConfig = new MapReduceJobConfig();

    /**
     * Interval (seconds) between status updates for the running job. Held as a
     * string and parsed to an int when the job is run.
     */
    protected String m_loggingInterval = "10";

    /** Output debugging info */
    protected boolean m_debug;

    /** Hadoop logging; every message passed to logMessage() is also sent here */
    protected Log m_hadoopLog = LogFactory.getLog(HadoopJob.class);

    /**
     * Lists the command-line options understood by this job: the base options
     * (weka-jar, weka-packages, logging-interval, debug) followed by the
     * options of the underlying map-reduce configuration object.
     * 
     * @return an enumeration of all the available options
     */
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> options = new Vector<Option>();

        options.addElement(
                new Option("\tPath to the weka.jar file", "weka-jar", 1, "-weka-jar <path to weka.jar>"));
        options.addElement(new Option("\tAdditional Weka packages to use.", "weka-packages", 1,
                "-weka-packages <comma-separated list of package names>"));
        // The documented default must match the initial value of
        // m_loggingInterval and the fallback applied in runJob(), both "10".
        options.addElement(new Option("\tLogging interval in seconds (default = 10).", "logging-interval", 1,
                "-logging-interval <seconds>"));
        options.addElement(new Option("\tOutput debug info.", "debug", 0, "-debug"));

        // Append the options of the configuration object after the base options
        Enumeration<Option> confOpts = m_mrConfig.listOptions();
        while (confOpts.hasMoreElements()) {
            options.addElement(confOpts.nextElement());
        }

        return options.elements();
    }

    /**
     * Configures this job from an array of command-line options. The option
     * array is handed to the map-reduce configuration object first and the
     * base options are read from it afterwards.
     * 
     * @param options the options to parse
     * @throws Exception if a problem occurs while parsing
     */
    @Override
    public void setOptions(String[] options) throws Exception {
        m_mrConfig.setOptions(options);

        // Only an explicitly supplied path replaces the current weka.jar path
        String wekaJar = Utils.getOption("weka-jar", options);
        if (!DistributedJobConfig.isEmpty(wekaJar)) {
            setPathToWekaJar(wekaJar);
        }

        // These two are always applied, whatever value was read for them
        setAdditionalWekaPackages(Utils.getOption("weka-packages", options));
        setLoggingInterval(Utils.getOption("logging-interval", options));

        boolean debugRequested = Utils.getFlag("debug", options);
        setDebug(debugRequested);
    }

    /**
     * Return the base options only (not the subclasses options or the options
     * specific to the configuration). A valued option is only included when its
     * current value is non-empty; -debug is only included when debug is on.
     * 
     * @return just the base options
     */
    public String[] getBaseOptionsOnly() {
        List<String> opts = new ArrayList<String>();

        String wekaJar = getPathToWekaJar();
        if (!DistributedJobConfig.isEmpty(wekaJar)) {
            opts.add("-weka-jar");
            opts.add(wekaJar);
        }

        String packages = getAdditionalWekaPackages();
        if (!DistributedJobConfig.isEmpty(packages)) {
            opts.add("-weka-packages");
            opts.add(packages);
        }

        String interval = getLoggingInterval();
        if (!DistributedJobConfig.isEmpty(interval)) {
            opts.add("-logging-interval");
            opts.add(interval);
        }

        if (getDebug()) {
            opts.add("-debug");
        }

        String[] result = new String[opts.size()];
        return opts.toArray(result);
    }

    /**
     * Gets the current option settings: the base options followed by the
     * options of the map-reduce configuration object.
     * 
     * @return the current options as an array of strings
     */
    @Override
    public String[] getOptions() {
        String[] base = getBaseOptionsOnly();
        String[] config = m_mrConfig.getOptions();

        // Concatenate the two arrays, base options first
        String[] all = new String[base.length + config.length];
        System.arraycopy(base, 0, all, 0, base.length);
        System.arraycopy(config, 0, all, base.length, config.length);

        return all;
    }

    /**
     * Constructor for a HadoopJob. Both arguments are passed straight through
     * to the superclass constructor.
     * 
     * @param jobName the name of the job
     * @param jobDescription a short description of the job
     */
    public HadoopJob(String jobName, String jobDescription) {
        super(jobName, jobDescription);
    }

    /**
     * Set the main configuration to use with this job. The supplied object
     * replaces the one currently held (no copy is made).
     * 
     * @param conf the main configuration to use with this job
     */
    public void setMapReduceJobConfig(MapReduceJobConfig conf) {
        m_mrConfig = conf;
    }

    /**
     * Get the main configuration to use with this job. The object held by this
     * job is returned directly (no copy is made).
     * 
     * @return the main configuration to use with this job
     */
    public MapReduceJobConfig getMapReduceJobConfig() {
        return m_mrConfig;
    }

    /**
     * Tip text for the debug property. The method name follows the same
     * {@code <property>TipText} pattern as the other tip text methods in this
     * class.
     * 
     * @return the tip text for this property
     */
    public String debugTipText() {
        return "Output debugging info to the log";
    }

    /**
     * Misspelled name of {@link #debugTipText()}, retained so that existing
     * callers continue to work.
     * 
     * @return the tip text for this property
     * @deprecated use {@link #debugTipText()} instead
     */
    @Deprecated
    public String deubgTipText() {
        return debugTipText();
    }

    /**
     * Set whether to output debug info. Some jobs may output more info to the log
     * if this is turned on; in this class it controls whether logDebug()
     * forwards its messages to the log.
     * 
     * @param debug true if debug info is to be output
     */
    public void setDebug(boolean debug) {
        m_debug = debug;
    }

    /**
     * Get whether to output debug info. Some jobs may output more info to the log
     * if this is turned on; in this class it controls whether logDebug()
     * forwards its messages to the log.
     * 
     * @return true if debug info is to be output
     */
    public boolean getDebug() {
        return m_debug;
    }

    /**
     * Tip text for the pathToWekaJar property
     * 
     * @return the tip text for this property
     */
    public String pathToWekaJarTipText() {
        // Note the trailing space on the first literal: without it the two
        // halves run together as "inHDFS".
        return "The path to the weka jar file. This will get installed in "
                + "HDFS and placed into the classpath for map and reduce tasks";
    }

    /**
     * Set the path to the weka.jar file. Will be populated automatically if the
     * classpath contains a weka.jar. The weka.jar is installed in HDFS and used
     * in the classpath for map and reduce tasks. The value is stored as given;
     * environment variables in it are substituted when the libraries are
     * installed in HDFS.
     * 
     * @param path the path to the weka.jar.
     */
    public void setPathToWekaJar(String path) {
        m_pathToWekaJar = path;
    }

    /**
     * Get the path to the weka.jar file. Will be populated automatically if the
     * classpath contains a weka.jar. The weka.jar is installed in HDFS and used
     * in the classpath for map and reduce tasks. The value is returned exactly
     * as stored, without environment variable substitution.
     * 
     * @return the path to the weka.jar.
     */
    public String getPathToWekaJar() {
        return m_pathToWekaJar;
    }

    /**
     * Tip text for the additionalWekaPackages property.
     * 
     * @return the tip text for this property.
     */
    public String additionalWekaPackagesTipText() {
        StringBuilder tip = new StringBuilder();

        tip.append("A list of comma separated weka package names to use with the job. ");
        tip.append("Any jar files in the main package directory and the lib directory ");
        tip.append("of each package will get installed in HDFS and placed in the ");
        tip.append("classpath of map and reduce tasks.");

        return tip.toString();
    }

    /**
     * Set a comma separated list of the names of additional weka packages to use
     * with the job. Any jar files in the main package directory and the lib
     * directory of the package will get installed in HDFS and placed in the
     * classpath of map and reduce tasks. The list is not held in a field of this
     * class; it is stored as a user-supplied property of the map-reduce
     * configuration object under DistributedJob.WEKA_ADDITIONAL_PACKAGES_KEY.
     * 
     * @param packages a comma separated list of weka packages to use with the job
     */
    public void setAdditionalWekaPackages(String packages) {
        m_mrConfig.setUserSuppliedProperty(DistributedJob.WEKA_ADDITIONAL_PACKAGES_KEY, packages);
    }

    /**
     * Get a comma separated list of the names of additional weka packages to use
     * with the job. Any jar files in the main package directory and the lib
     * directory of the package will get installed in HDFS and placed in the
     * classpath of map and reduce tasks. The list is read back from the
     * user-supplied property of the map-reduce configuration object stored under
     * DistributedJob.WEKA_ADDITIONAL_PACKAGES_KEY.
     * 
     * @return a comma separated list of weka packages to use with the job
     */
    public String getAdditionalWekaPackages() {
        return m_mrConfig.getUserSuppliedProperty(DistributedJob.WEKA_ADDITIONAL_PACKAGES_KEY);
    }

    /**
     * Tip text for the loggingInterval property
     * 
     * @return tip text for this property
     */
    public String loggingIntervalTipText() {
        String tip = "The interval (in seconds) between output of logging information from running jobs";
        return tip;
    }

    /**
     * Set the interval between output of logging information from running jobs.
     * The value is stored as a string without validation; it is parsed as an
     * integer when a job is run, and an empty value is replaced by "10" at that
     * point.
     * 
     * @param li the interval (in seconds) between output of logging information
     */
    public void setLoggingInterval(String li) {
        m_loggingInterval = li;
    }

    /**
     * Get the interval between output of logging information from running jobs.
     * The value is returned as the string currently stored, which may be empty.
     * 
     * @return the interval (in seconds) between output of logging information
     */
    public String getLoggingInterval() {
        return m_loggingInterval;
    }

    /**
     * Installs the core weka library and the distributed weka libraries in HDFS.
     * Also adds the libraries to the classpath for map and reduce tasks by
     * populating the appropriate properties in the supplied Hadoop Configuration
     * object. Finally, the jar files of any additional weka packages named in
     * the job configuration are installed and added to the classpath too.
     * 
     * @param conf the Configuration object to populate
     * @throws IOException if no path to the weka.jar has been set or a problem
     *           occurs while copying
     */
    protected void installWekaLibrariesInHDFS(Configuration conf) throws IOException {
        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        if (m_pathToWekaJar == null || DistributedJobConfig.isEmpty(m_pathToWekaJar)) {
            throw new IOException("No path to weka.jar file provided. We need to install the "
                    + "weka.jar in HDFS so that it is available to running Jobs");
        }

        statusMessage("Installing libraries in HDFS...");

        // The weka.jar path is user supplied and may contain environment
        // variables; the remaining paths are fixed locations in the packages dir.
        String[] libraries = { environmentSubstitute(m_pathToWekaJar), DISTRIBUTED_WEKA_BASE_JAR,
                DISTRIBUTED_WEKA_HADOOP_JAR, OPEN_CSV_JAR, JFREECHART_JAR, JCOMMON_JAR, COLT_JAR, LA4J_JAR };

        List<String> installLibraries = new ArrayList<String>(libraries.length);
        for (String library : libraries) {
            logMessage("Copying " + library + " to HDFS");
            installLibraries.add(library);
        }

        HDFSUtils.copyFilesToWekaHDFSInstallationDirectory(installLibraries, m_mrConfig.getHDFSConfig(), m_env,
                true);

        addWekaLibrariesToClasspath(conf);

        installWekaPackageLibrariesInHDFS(getAdditionalWekaPackageNames(m_mrConfig), conf);
    }

    /**
     * Install the jar files for a list of named weka packages in HDFS and add
     * them to the classpath for map and reduce tasks. For each package, the jar
     * files directly inside the package directory and inside its "lib"
     * sub-directory are used. Packages that are not installed locally are
     * reported in the log and skipped.
     * 
     * @param packageNames a list of weka packages to install the jar files for
     * @param conf the Hadoop configuration to set the classpath for map and
     *          reduce tasks
     * @throws IOException if a problem occurs
     */
    private void installWekaPackageLibrariesInHDFS(List<String> packageNames, Configuration conf)
            throws IOException {
        if (packageNames == null || packageNames.size() == 0) {
            return;
        }

        File packagesDir = WekaPackageManager.PACKAGES_DIR;
        List<String> installLibraries = new ArrayList<String>();
        for (String packageName : packageNames) {
            File packageDir = new File(packagesDir.toString() + File.separator + packageName);
            if (!packageDir.isDirectory()) {
                logMessage("Package '" + packageName + "' was not found in " + packagesDir + " - skipping");
                continue;
            }

            collectPackageJars(packageDir, packageName, installLibraries);
            collectPackageJars(new File(packageDir.toString() + File.separator + "lib"), packageName,
                    installLibraries);
        }

        HDFSUtils.copyFilesToWekaHDFSInstallationDirectory(installLibraries, m_mrConfig.getHDFSConfig(), m_env,
                true);

        addWekaPackageLibrariesToClasspath(installLibraries, conf);
    }

    /**
     * Appends the paths of all jar files found directly inside a directory to
     * the supplied list, logging each one. Does nothing if the directory does
     * not exist or cannot be listed.
     * 
     * @param dir the directory to scan (not recursive)
     * @param packageName the name of the owning package, used in log messages
     * @param jars the list to append the jar file paths to
     */
    private void collectPackageJars(File dir, String packageName, List<String> jars) {
        // listFiles() returns null for a non-directory or on an I/O error
        File[] contents = dir.isDirectory() ? dir.listFiles() : null;
        if (contents == null) {
            return;
        }

        for (File f : contents) {
            if (f.isFile() && f.toString().toLowerCase().endsWith(".jar")) {
                logMessage("Copying package '" + packageName + "': " + f.getName() + " to HDFS");
                jars.add(f.toString());
            }
        }
    }

    /**
     * Adds the core weka and distributed weka jar files to the classpath for map
     * and reduce tasks. Only the file names of the jars are passed on; the
     * weka.jar name is taken from the environment-substituted path so that it
     * matches the file that installWekaLibrariesInHDFS() copies.
     * 
     * @param conf the Configuration object to populate
     * @throws IOException if no path to the weka.jar has been set or a problem
     *           occurs
     */
    protected void addWekaLibrariesToClasspath(Configuration conf) throws IOException {
        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        // Same precondition as installWekaLibrariesInHDFS(); without it a null
        // path would surface as a NullPointerException from new File(...).
        if (m_pathToWekaJar == null || DistributedJobConfig.isEmpty(m_pathToWekaJar)) {
            throw new IOException("No path to weka.jar file provided. We need the "
                    + "weka.jar in the classpath of running Jobs");
        }

        statusMessage("Adding Weka libraries to the distributed cache and classpath " + "for the job");

        String[] localJars = { environmentSubstitute(m_pathToWekaJar), DISTRIBUTED_WEKA_BASE_JAR,
                DISTRIBUTED_WEKA_HADOOP_JAR, OPEN_CSV_JAR, JFREECHART_JAR, JCOMMON_JAR, COLT_JAR, LA4J_JAR };

        List<String> cacheFiles = new ArrayList<String>(localJars.length);
        for (String jar : localJars) {
            cacheFiles.add(new File(jar).getName());
        }

        HDFSUtils.addWekaInstalledFilesToClasspath(m_mrConfig.getHDFSConfig(), conf, cacheFiles, m_env);
    }

    /**
     * Adds the jar files of additional weka packages to the classpath for map
     * and reduce tasks. Only the file name of each jar is passed on. Does
     * nothing when the list is null or empty.
     * 
     * @param packageJars a list of paths to jar files from packages to add to the
     *          classpath
     * @param conf the Hadoop Configuration to populate
     * @throws IOException if a problem occurs
     */
    private void addWekaPackageLibrariesToClasspath(List<String> packageJars, Configuration conf)
            throws IOException {

        boolean nothingToAdd = packageJars == null || packageJars.isEmpty();
        if (nothingToAdd) {
            return;
        }

        statusMessage("Adding Weka package libraries to the distributed cache and classpath");

        List<String> jarNames = new ArrayList<String>();
        for (String jarPath : packageJars) {
            File jarFile = new File(jarPath);
            jarNames.add(jarFile.getName());
        }

        HDFSUtils.addWekaInstalledFilesToClasspath(m_mrConfig.getHDFSConfig(), conf, jarNames, m_env);
    }

    /**
     * Deletes the output directory for a job. The deletion itself is delegated
     * to the map-reduce configuration object.
     * 
     * @param job the Job object to delete the output directory for
     * @throws IOException if no output path has been configured or a problem
     *           occurs
     */
    public void cleanOutputDirectory(Job job) throws IOException {
        if (!DistributedJobConfig.isEmpty(m_mrConfig.getOutputPath())) {
            m_mrConfig.deleteOutputDirectory(job, m_env);
            return;
        }

        throw new IOException("No output directory set!");
    }

    /**
     * Runs the supplied job. The job is submitted and then polled until it
     * completes or a stop is requested. While polling, status and task
     * completion messages are logged every logging-interval seconds; if the
     * interval is less than 1 nothing is logged and the job is checked once a
     * minute. An interrupt of the calling thread is treated as a stop request.
     * A job that is still running when a stop is requested is killed.
     * 
     * @param job the job to run
     * @return true if the job was successful
     * @throws DistributedWekaException if a problem occurs
     */
    protected boolean runJob(Job job) throws DistributedWekaException {
        boolean interrupted = false;
        try {
            m_stopRunningJob = false;
            if (DistributedJobConfig.isEmpty(getLoggingInterval())) {
                m_loggingInterval = "10";
            }
            int logInterval = Integer.parseInt(m_loggingInterval.trim());
            logMessage("Setting logging interval to " + logInterval);
            job.submit();

            try {
                int taskCompletionEventIndex = 0;
                while (!m_stopRunningJob && !job.isComplete()) {
                    if (logInterval >= 1) {
                        printJobStatus(job);
                        taskCompletionEventIndex += logTaskMessages(job, taskCompletionEventIndex);

                        // long arithmetic: seconds * 1000 can overflow an int
                        Thread.sleep(logInterval * 1000L);
                    } else {
                        Thread.sleep(60000);
                    }
                }
            } catch (InterruptedException ie) {
                interrupted = true;
                logMessage(ie.getMessage());
                m_stopRunningJob = true;
            }

            if (m_stopRunningJob && !job.isComplete()) {
                job.killJob();
            }
            m_stopRunningJob = false;

            return job.isSuccessful();
        } catch (Exception ex) {
            throw new DistributedWekaException(ex);
        } finally {
            // Restore the interrupt status for the caller. This is done last so
            // that the calls made on the job above run with the flag cleared.
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Print status information for the supplied (running) job. The setup, map
     * and reduce progress are reported as percentages, and the same line is
     * sent to both the status area and the log.
     * 
     * @param job the job to print status info for
     * @throws IOException if a problem occurs
     */
    protected void printJobStatus(Job job) throws IOException {
        float setup = job.setupProgress() * 100f;
        float map = job.mapProgress() * 100f;
        float reduce = job.reduceProgress() * 100f;

        StringBuilder line = new StringBuilder();
        line.append(getJobName());
        line.append(" Setup: ").append(setup);
        line.append(" Map: ").append(map);
        line.append(" Reduce: ").append(reduce);

        String info = line.toString();
        statusMessage(info);
        logMessage(info);
    }

    /**
     * Output task messages for the currently running job. Each task completion
     * event from startIndex onwards is written to the log on its own line.
     * 
     * @param job the job to output messages for
     * @param startIndex the index to start outputting messages from
     * @return the number of events that were logged; the caller adds this to
     *         its running index to obtain the next start index
     * @throws IOException if a problem occurs
     */
    protected int logTaskMessages(Job job, int startIndex) throws IOException {
        TaskCompletionEvent[] events = job.getTaskCompletionEvents(startIndex);
        int eventCount = events.length;

        for (int i = 0; i < eventCount; i++) {
            logMessage(events[i].toString());
        }

        return eventCount;
    }

    /**
     * Extract the number of a map/reduce task from the supplied taskID string.
     * The number is the run of characters that follows the first occurrence of
     * the prefix, up to the next underscore or, if there is none, to the end of
     * the string.
     * 
     * @param taskID the taskID string
     * @param prefix the prefix identifying the type of task (i.e. mapper or
     *          reducer)
     * @return the task number, or -1 if the taskID is null, does not contain the
     *         prefix, or the text following the prefix is not a number
     */
    public static int getMapReduceNumber(String taskID, String prefix) {
        if (taskID == null || prefix == null) {
            return -1;
        }

        int prefixStart = taskID.indexOf(prefix);
        if (prefixStart < 0) {
            return -1; // not what was expected
        }

        String lastPart = taskID.substring(prefixStart + prefix.length());

        // An ID without a trailing "_<attempt>" part has no further underscore;
        // in that case the number runs to the end of the string.
        int numberEnd = lastPart.indexOf('_');
        String theNumber = numberEnd < 0 ? lastPart : lastPart.substring(0, numberEnd);

        try {
            return Integer.parseInt(theNumber);
        } catch (NumberFormatException ex) {
            return -1; // not what was expected
        }
    }

    /**
     * Get the number of the map task from the supplied task ID string, i.e. the
     * number that directly follows "_m_" in the ID
     * 
     * @param taskID the task ID string
     * @return the number following "_m_", or -1 if the ID does not contain
     *         "_m_"
     */
    public static int getMapNumber(String taskID) {
        return getMapReduceNumber(taskID, "_m_");
    }

    /**
     * Get the number of the reduce task from the supplied task ID string, i.e.
     * the number that directly follows "_r_" in the ID
     * 
     * @param taskID the task ID string
     * @return the number following "_r_", or -1 if the ID does not contain
     *         "_r_"
     */
    public static int getReduceNumber(String taskID) {
        // The prefix must include the trailing underscore (as "_m_" does for
        // map tasks); with just "_r" the extracted number is always empty.
        return getMapReduceNumber(taskID, "_r_");
    }

    /**
     * Log a debug message. The message is discarded unless debug output has
     * been turned on.
     * 
     * @param message the message to log
     */
    protected void logDebug(String message) {
        if (!getDebug()) {
            return;
        }

        logMessage(message);
    }

    /**
     * Log a message. If a log has been set, the message is sent to it with the
     * status message prefix prepended. The message is also always sent to the
     * Hadoop log, without the prefix.
     * 
     * @param message the message to log
     */
    @Override
    protected void logMessage(String message) {
        if (m_log != null) {
            String prefixed = m_statusMessagePrefix + message;
            m_log.logMessage(prefixed);
        }

        m_hadoopLog.info(message);
    }

    /**
     * Send a message to the status. The status message prefix is prepended.
     * Nothing happens if no log has been set.
     * 
     * @param message the message to status
     */
    @Override
    protected void statusMessage(String message) {
        if (m_log == null) {
            return;
        }

        String prefixed = m_statusMessagePrefix + message;
        m_log.statusMessage(prefixed);
    }
}