org.terrier.utility.io.HadoopPlugin.java Source code

Introduction

Here is the source code for org.terrier.utility.io.HadoopPlugin.java

Source

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is HadoopPlugin.java.
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   
 */
package org.terrier.utility.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URI;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;
import org.terrier.utility.KillHandler;
import org.terrier.utility.Files.FSCapability;
import org.terrier.utility.KillHandler.Killable;

/** This class provides the main glue between Terrier and Hadoop. It has two main roles:<ol>
  * <li>Configure Terrier such that the Hadoop file systems can be accessed by Terrier.</li>
  * <li>Provide a means to access the Hadoop map-reduce cluster, using <a href="http://hadoop.apache.org/core/docs/current/hod.html">Hadoop on Demand (HOD)</a> if necessary.</li>
  * </ol>
  * <p><h3>Configuring Terrier to access HDFS</h3>
  * Terrier can access a Hadoop Distributed File System (HDFS), allowing collections and indices to be placed there.
  * To do so, ensure that your Hadoop <tt>conf/</tt> is on your CLASSPATH, and that the Hadoop plugin is loaded by Terrier,
  * by setting <tt>terrier.plugins=org.terrier.utility.io.HadoopPlugin</tt> in your <tt>terrier.properties</tt> file.
  * </p>
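  * <p>For example, a minimal (illustrative) <tt>terrier.properties</tt> snippet might look like the following;
  * the namenode address and index location shown are hypothetical:
  * <pre>
  * terrier.plugins=org.terrier.utility.io.HadoopPlugin
  * terrier.index.path=hdfs://namenode:8020/user/terrier/index
  * </pre>
  * </p>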
  * <p><h3>Configuring Terrier to access an existing Hadoop MapReduce cluster</h3>
  * Terrier can access an existing MapReduce cluster, as long as the <tt>conf/</tt> folder for Hadoop is on your CLASSPATH.
  * If you do not already have an existing Hadoop cluster, Terrier can be configured to use HOD, to build a temporary
  * Hadoop cluster from a PBS (Torque) cluster. To configure HOD itself, the reader is referred to the
  * <a href="http://hadoop.apache.org/core/docs/current/hod.html">HOD documentation</a>. To use HOD from Terrier,
  * set the following properties:
  * <ul>
  * <li><tt>plugin.hadoop.hod</tt> - path to the hod binary, normally $HADOOP_HOME/contrib/hod/bin. If unset, then HOD is presumed
  * to be unconfigured.</li>
  * <li><tt>plugin.hadoop.hod.nodes</tt> - the number of nodes/CPUs that you want to request from the PBS Torque cluster. Defaults to 6.</li>
  * <li><tt>plugin.hadoop.hod.params</tt> - any additional options you want to set on the HOD command line. See the 
  * <a href="http://hadoop.apache.org/core/docs/current/hod_user_guide.html#Command+Line">HOD User guide</a> for examples.</li>
  * </ul>
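  * For example (illustrative values; the hod path and node count depend on your cluster):
  * <pre>
  * plugin.hadoop.hod=/opt/hadoop/contrib/hod/bin/hod
  * plugin.hadoop.hod.nodes=12
  * </pre>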
  * </p><p><h3>Using Hadoop MapReduce from Terrier</h3>
  * You should use the JobFactory provided by this class when creating a MapReduce job from Terrier. The JobFactory
  * creates a HOD session should one be required, and also configures jobs such that the Terrier environment can
  * be recreated on the execution nodes.
  * <pre>
  * HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
  * if (jf == null)
  *    throw new Exception("Could not get JobFactory from HadoopPlugin");
  * JobConf conf = jf.newJob();
  * ....
  * jf.close(); //closing the JobFactory will ensure that the HOD session ends
  * </pre>
  * When using your own code in Terrier MapReduce jobs, ensure that you configure the Terrier application before
  * anything else:
  * <pre>
  * public void configure(JobConf jc)
  * {
  *    try{
  *       HadoopUtility.loadTerrierJob(jc);
  *    } catch (Exception e) {
  *       throw new Error("Cannot load ApplicationSetup", e);
  *    }
  * }
  * </pre>
  * </p>
  * @since 2.2
  * @author Craig Macdonald
  */
@SuppressWarnings("deprecation")
public class HadoopPlugin implements ApplicationSetup.TerrierApplicationPlugin {
    /** instance of this class - it is a singleton */
    protected static HadoopPlugin singletonHadoopPlugin;
    /** main configuration object to use for Hadoop access */
    protected static Configuration singletonConfiguration;
    /** The logger used */
    protected static final Logger logger = Logger.getLogger(HadoopPlugin.class);

    /** a Job Factory is responsible for creating Terrier MapReduce jobs.
      * This should be used when requesting a Terrier MapReduce job, as it
      * adequately initialises the job, such that Terrier can run correctly.
      */
    public static abstract class JobFactory {
        /** Make a new job */
        public abstract JobConf newJob() throws Exception;

        /** Add additional information to a MapReduce job about the Terrier configuration */
        protected static void makeTerrierJob(JobConf jc) throws IOException {
            HadoopUtility.makeTerrierJob(jc);
        }

        /** Finish with this job factory. If the JobFactory was created using HOD, then
          * the HOD job will also be ended */
        public abstract void close();
    }

    /** JobFactory that doesn't resort to HadoopOnDemand, and directly wraps a Configuration object */
    static class DirectJobFactory extends JobFactory {
        protected Configuration c;

        DirectJobFactory() {
            c = null;
        }

        DirectJobFactory(Configuration _c) {
            c = _c;
        }

        public JobConf newJob() throws Exception {
            JobConf rtr = c != null ? new JobConf(c) : new JobConf();
            makeTerrierJob(rtr);
            return rtr;
        }

        public void close() {
        }
    }

    private static final Random random = new Random();

    /** JobFactory that uses HadoopOnDemand */
    static class HODJobFactory extends JobFactory implements Killable {
        protected String hodConfLocation = null;
        protected String hodBinaryLocation = null;
        protected boolean connected = false;

        HODJobFactory(String _hodJobName, String _hodBinaryLocation, String[] hodParams, int HodNumNodes)
                throws Exception {
            hodBinaryLocation = _hodBinaryLocation;
            KillHandler.addKillhandler(this);
            doHod(_hodJobName, hodParams, HodNumNodes);
        }

        protected void doHod(String jobName, String[] hodParams, int NumNodes) throws Exception {
            if (jobName == null || jobName.length() == 0)
                jobName = "terrierHOD";
            logger.info("Processing HOD for " + jobName + " at " + hodBinaryLocation + " request for " + NumNodes
                    + " nodes");

            File hodDir = null;
            while (hodDir == null) {
                hodDir = new File(System.getProperty("java.io.tmpdir", "/tmp") + "/hod" + random.nextInt());
                if (hodDir.exists())
                    hodDir = null;
            }
            if (!hodDir.mkdir()) {
                throw new IOException("Could not create new HOD tmp dir at " + hodDir);
            }

            //build the HOD command
            String[] command = new String[8 + hodParams.length];
            command[0] = hodBinaryLocation;
            command[1] = "allocate";
            command[2] = "-d";
            command[3] = hodDir.getAbsolutePath();
            command[4] = "-n";
            command[5] = "" + NumNodes;
            command[6] = "-N";
            command[7] = jobName;
            int offset = 8;
            for (String param : hodParams)
                command[offset++] = param;

            //execute the command
            ProcessBuilder pb = new ProcessBuilder();
            pb.command(command);
            Process p = pb.start();

            //log all output from HOD
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(p.getErrorStream()));
                String line = null;
                while ((line = br.readLine()) != null)
                    logger.info(line);
                br.close();
            } catch (IOException ioe) {
                logger.warn("Problem reading error stream of HOD", ioe);
            }
            p.waitFor();

            //check that the HOD allocation succeeded
            File hodConf = new File(hodDir, "hadoop-site.xml");
            if (!Files.exists(hodConf.toString()))
                throw new IOException("HOD did not produce a hadoop-site.xml");
            final int exitValue = p.exitValue();
            if (exitValue != 0) {
                throw new Exception("HOD allocation did not succeed (exit value was " + exitValue + ")");
            }
            hodConfLocation = hodDir.getAbsolutePath();
            connected = true;
        }

        protected void disconnectHod() throws Exception {
            logger.info("Processing HOD disconnect");
            ProcessBuilder pb = new ProcessBuilder();
            pb.command(new String[] { hodBinaryLocation, "deallocate", "-d", hodConfLocation });
            Process p = pb.start();
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(p.getErrorStream()));
                String line = null;
                while ((line = br.readLine()) != null)
                    logger.info(line);
                br.close();
            } catch (IOException ioe) {
                logger.warn("Problem reading error stream of HOD", ioe);
            }
            p.waitFor();
            final int exitValue = p.exitValue();
            if (exitValue != 0) {
                logger.warn("HOD deallocate might not have succeeded (exit value was " + exitValue + ")");
            }
            connected = false;
        }

        public JobConf newJob() throws Exception {
            JobConf rtr = new JobConf(hodConfLocation + "/hadoop-site.xml");
            makeTerrierJob(rtr);
            return rtr;
        }

        protected void finalize() {
            close();
        }

        public void close() {
            if (connected)
                try {
                    disconnectHod();
                } catch (Exception e) {
                    logger.warn("Encoutered exception while closing HOD. A PBS job may need to be deleted.", e);
                } finally {
                    KillHandler.removeKillhandler(this);
                }
        }

        public void kill() {
            close();
        }
    }

    /** Get a JobFactory with the specified session name. This method tries three approaches, in order:
      * <ol>
      * <li>If the current/default Hadoop configuration has a real Hadoop cluster Job Tracker configured, then
      * that will be used. This requires that the <tt>mapred.job.tracker</tt> property in the hadoop-site.xml
      * be configured.</li>
      * <li>Next, it will attempt to use HOD to build a Hadoop MapReduce cluster. This requires the Terrier property
      * relating to HOD be configured to point to the location of the HOD binary - <tt>plugin.hadoop.hod</tt></li>
      * <li>As a last resort, Terrier will use the local job tracker that Hadoop provides on the localhost. This is
      * useful for unit testing; however, it does not support multiple reducers.</li>
      * </ol>
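      * <p>A typical (illustrative) usage pattern, closing the factory so that any HOD session is released:
      * <pre>
      * JobFactory jf = HadoopPlugin.getJobFactory("my-session");
      * if (jf == null)
      *     throw new Exception("Could not get JobFactory from HadoopPlugin");
      * try {
      *     JobConf jc = jf.newJob();
      *     //configure and submit the job here
      * } finally {
      *     jf.close();
      * }
      * </pre></p>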
      */
    public static JobFactory getJobFactory(String sessionName) {
        return getJobFactory(sessionName, false);
    }

    /** implements the obtaining of job factories */
    protected static JobFactory getJobFactory(String sessionName, boolean persistent) {
        if (persistent)//TODO
            throw new Error("Persistent JobFactory not yet supported, sorry");
        Configuration globalConf = getGlobalConfiguration();

        try {
            //JobConf jc_sampleConf = new JobConf();
            //see if the current hadoop configuration has a real job tracker configured
            String jt = globalConf.get("mapred.job.tracker");
            if (jt == null) {
                jt = new JobConf().get("mapred.job.tracker");
            }
            if (jt != null && !jt.equals("local")) {
                if (logger.isDebugEnabled())
                    logger.debug(
                            "Default configuration has job tracker set to " + globalConf.get("mapred.job.tracker"));
                return new DirectJobFactory(/*globalConf*/);
            }
            // if not, try HOD   
            String hod = ApplicationSetup.getProperty("plugin.hadoop.hod", null);
            String[] hodParams = ApplicationSetup.getProperty("plugin.hadoop.hod.params", "").split(" ");
            if (hod != null && hod.length() > 0) {
                int HodNodes = Integer.parseInt(ApplicationSetup.getProperty("plugin.hadoop.hod.nodes", "" + 6));
                return new HODJobFactory(sessionName, hod, hodParams, HodNodes);
            }
            //as a last resort, use the local Hadoop job tracker
            logger.warn("No remote job tracker or HOD configuration found, using local job tracker");
            return new DirectJobFactory(globalConf);
        } catch (Exception e) {
            logger.warn("Exception occurred while creating JobFactory", e);
            return null;
        }
    }

    /** Update the global Hadoop configuration in use by the plugin */
    public static void setGlobalConfiguration(Configuration _config) {
        singletonConfiguration = _config;
    }

    /** Obtain the global Hadoop configuration in use by the plugin */
    public static Configuration getGlobalConfiguration() {
        if (singletonConfiguration == null) {
            singletonConfiguration = new Configuration();
        }
        return singletonConfiguration;
    }

    static HadoopPlugin getHadoopPlugin() {
        return singletonHadoopPlugin;
    }

    /** configuration used by this plugin */
    protected Configuration config = null;
    /** distributed file system used by this plugin */
    protected org.apache.hadoop.fs.FileSystem hadoopFS = null;

    /** Constructs a new plugin */
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "ST_WRITE_TO_STATIC_FROM_INSTANCE_METHOD", justification = "It's a singleton")
    public HadoopPlugin() {
        singletonHadoopPlugin = this;
    }

    /** Returns the String prefix of the default file system according to Hadoop */
    public static String getDefaultFileSystemPrefix() {
        return org.apache.hadoop.fs.FileSystem.getDefaultUri(singletonConfiguration).toString();
    }

    /** Returns the URI of the default file system according to Hadoop */
    public static URI getDefaultFileSystemURI() {
        return org.apache.hadoop.fs.FileSystem.getDefaultUri(singletonConfiguration);
    }

    /** Returns the default file system according to Hadoop
     * @throws IOException */
    public static org.apache.hadoop.fs.FileSystem getDefaultFileSystem() throws IOException {
        return org.apache.hadoop.fs.FileSystem.get(singletonConfiguration);
    }

    /** Wrapper around FSDataInputStream which implements RandomDataInput. */
    static class HadoopFSRandomAccessFile implements RandomDataInput
    //static class HadoopFSRandomAccessFile extends RandomAccessFile implements RandomDataInput
    {
        FSDataInputStream in;
        org.apache.hadoop.fs.FileSystem fs;
        String filename;

        public HadoopFSRandomAccessFile(org.apache.hadoop.fs.FileSystem _fs, String _filename) throws IOException {
            this.fs = _fs;
            this.filename = _filename;
            this.in = _fs.open(new Path(_filename));
        }

        public int read() throws IOException {
            return in.read();
        }

        public int read(byte b[], int off, int len) throws IOException {
            return in.read(in.getPos(), b, off, len);
        }

        public int readBytes(byte b[], int off, int len) throws IOException {
            return in.read(in.getPos(), b, off, len);
        }

        public void seek(long pos) throws IOException {
            in.seek(pos);
        }

        public long length() throws IOException {
            return fs.getFileStatus(new Path(filename)).getLen();
        }

        public void close() throws IOException {
            in.close();
        }

        // implementation from RandomAccessFile
        public final double readDouble() throws IOException {
            return in.readDouble();
        }

        public final int readUnsignedShort() throws IOException {
            return in.readUnsignedShort();
        }

        public final short readShort() throws IOException {
            return in.readShort();
        }

        public final int readUnsignedByte() throws IOException {
            return in.readUnsignedByte();
        }

        public final byte readByte() throws IOException {
            return in.readByte();
        }

        public final boolean readBoolean() throws IOException {
            return in.readBoolean();
        }

        public final int readInt() throws IOException {
            return in.readInt();
        }

        public final long readLong() throws IOException {
            return in.readLong();
        }

        public final float readFloat() throws IOException {
            return in.readFloat();
        }

        public final void readFully(byte b[]) throws IOException {
            in.readFully(b);
        }

        public final void readFully(byte b[], int off, int len) throws IOException {
            in.readFully(b, off, len);
        }

        public int skipBytes(int n) throws IOException {
            return in.skipBytes(n);
        }

        public long getFilePointer() throws IOException {
            return in.getPos();
        }

        public final char readChar() throws IOException {
            return in.readChar();
        }

        public final String readUTF() throws IOException {
            return in.readUTF();
        }

        public final String readLine() throws IOException {
            return in.readLine();
        }
    }

    /** Initialises the plugin by connecting to the distributed file system */
    public void initialise() throws Exception {
        config = getGlobalConfiguration();

        final org.apache.hadoop.fs.FileSystem DFS = hadoopFS = org.apache.hadoop.fs.FileSystem.get(config);

        FileSystem terrierDFS = new FileSystem() {
            public String name() {
                return "hdfs";
            }

            /** capabilities of the filesystem */
            public byte capabilities() {
                return FSCapability.READ | FSCapability.WRITE | FSCapability.RANDOM_READ | FSCapability.STAT
                        | FSCapability.DEL_ON_EXIT | FSCapability.LS_DIR;
            }

            public String[] schemes() {
                return new String[] { "dfs", "hdfs" };
            }

            /** returns true if the path exists */
            public boolean exists(String filename) throws IOException {
                if (logger.isDebugEnabled())
                    logger.debug("Checking that " + filename + " exists answer=" + DFS.exists(new Path(filename)));
                return DFS.exists(new Path(filename));
            }

            /** open a file of given filename for reading */
            public InputStream openFileStream(String filename) throws IOException {
                if (logger.isDebugEnabled())
                    logger.debug("Opening " + filename);
                return DFS.open(new Path(filename));
            }

            /** open a file of given filename for writing */
            public OutputStream writeFileStream(String filename) throws IOException {
                if (logger.isDebugEnabled())
                    logger.debug("Creating " + filename);
                return DFS.create(new Path(filename));
            }

            public boolean mkdir(String filename) throws IOException {
                return DFS.mkdirs(new Path(filename));
            }

            public RandomDataOutput writeFileRandom(String filename) throws IOException {
                throw new IOException("HDFS does not support random writing");
            }

            public RandomDataInput openFileRandom(String filename) throws IOException {
                return new HadoopFSRandomAccessFile(DFS, filename);
            }

            public boolean delete(String filename) throws IOException {
                return DFS.delete(new Path(filename), true);
            }

            public boolean deleteOnExit(String filename) throws IOException {
                return DFS.deleteOnExit(new Path(filename));
            }

            public String[] list(String path) throws IOException {
                final FileStatus[] contents = DFS.listStatus(new Path(path));
                if (contents == null)
                    throw new FileNotFoundException("Cannot list path " + path);
                final String[] names = new String[contents.length];
                for (int i = 0; i < contents.length; i++) {
                    names[i] = contents[i].getPath().getName();
                }
                return names;
            }

            public String getParent(String path) throws IOException {
                return new Path(path).getParent().getName();
            }

            public boolean rename(String source, String destination) throws IOException {
                return DFS.rename(new Path(source), new Path(destination));
            }

            public boolean isDirectory(String path) throws IOException {
                return DFS.getFileStatus(new Path(path)).isDir();
            }

            public long length(String path) throws IOException {
                return DFS.getFileStatus(new Path(path)).getLen();
            }

            public boolean canWrite(String path) throws IOException {
                return DFS.getFileStatus(new Path(path)).getPermission().getUserAction().implies(FsAction.WRITE);
            }

            public boolean canRead(String path) throws IOException {
                return DFS.getFileStatus(new Path(path)).getPermission().getUserAction().implies(FsAction.READ);
            }
        };
        Files.addFileSystemCapability(terrierDFS);
    }

    /** Returns the Hadoop configuration underlying this plugin instance */
    public Configuration getConfiguration() {
        return config;
    }
}
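
Example usage

The following is a minimal, illustrative sketch of reading a file from HDFS once this plugin has been loaded. It assumes that the Hadoop conf/ directory is on the CLASSPATH, that terrier.plugins=org.terrier.utility.io.HadoopPlugin is set in terrier.properties, and that the namenode address and file path (which are purely hypothetical) point at an existing file. It relies on the FileSystem registered by initialise(), which maps the "hdfs" scheme onto Terrier's Files abstraction.

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.terrier.utility.Files;

public class HDFSReadExample {
    public static void main(String[] args) throws Exception {
        //Files.openFileStream() resolves the hdfs:// scheme via the
        //FileSystem registered by HadoopPlugin.initialise()
        BufferedReader br = new BufferedReader(new InputStreamReader(
                Files.openFileStream("hdfs://namenode:8020/user/terrier/collection.spec")));
        String line = null;
        while ((line = br.readLine()) != null)
            System.out.println(line);
        br.close();
    }
}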