Java tutorial
/*
 * Terrier - Terabyte Retriever
 * Webpage: http://ir.dcs.gla.ac.uk/terrier
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - Department of Computing Science
 * http://www.gla.ac.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is HadoopPlugin.java.
 *
 * The Original Code is Copyright (C) 2004-2009 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 */
package org.dutir.lucene.io;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.Random;

import org.apache.log4j.Logger;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.FileStatus;

import org.dutir.lucene.util.ApplicationSetup;
import org.dutir.lucene.util.Files;
import org.dutir.lucene.util.Files.FSCapability;

/** This class provides the main glue between Terrier and Hadoop. It has several main roles:<ol>
 * <li>Configure Terrier such that the Hadoop file systems can be accessed by Terrier.</li>
 * <li>Provide a means to access the Hadoop map-reduce cluster, using
 * <a href="http://hadoop.apache.org/core/docs/current/hod.html">Hadoop on Demand (HOD)</a> if necessary.</li>
 * </ol>
 * <p><h3>Configuring Terrier to access HDFS</h3>
 * Terrier can access a Hadoop Distributed File System (HDFS), allowing collections and indices to be placed there.
 * To do so, ensure that your Hadoop <tt>conf/</tt> folder is on your CLASSPATH, and that the Hadoop plugin is loaded
 * by Terrier, by setting <tt>terrier.plugins=uk.ac.gla.terrier.utility.io.HadoopPlugin</tt> in your
 * <tt>terrier.properties</tt> file.
 * </p>
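 * <p>For example, the relevant line of <tt>terrier.properties</tt> is simply the following
 * (a minimal sketch; only the plugin property itself is shown):
 * <pre>
 * terrier.plugins=uk.ac.gla.terrier.utility.io.HadoopPlugin
 * </pre></p>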
 * <p><h3>Configuring Terrier to access an existing Hadoop MapReduce cluster</h3>
 * Terrier can access an existing MapReduce cluster, as long as the <tt>conf/</tt> folder for Hadoop is on your CLASSPATH.
 * If you do not already have an existing Hadoop cluster, Terrier can be configured to use HOD, to build a temporary
 * Hadoop cluster from a PBS (Torque) cluster. To configure HOD itself, the reader is referred to the
 * <a href="http://hadoop.apache.org/core/docs/current/hod.html">HOD documentation</a>. To use HOD from Terrier,
 * set the following properties (an illustrative configuration follows this list):
 * <ul>
 * <li><tt>plugin.hadoop.hod</tt> - path to the hod binary, normally $HADOOP_HOME/contrib/hod/bin. If unset, then HOD is presumed
 * to be unconfigured.</li>
 * <li><tt>plugin.hadoop.hod.nodes</tt> - the number of nodes/CPUs that you want to request from the PBS Torque cluster. Defaults to 6.</li>
 * <li><tt>plugin.hadoop.hod.params</tt> - any additional options you want to set on the HOD command line. See the
 * <a href="http://hadoop.apache.org/core/docs/current/hod_user_guide.html#Command+Line">HOD User guide</a> for examples.</li>
 * </ul>
 * </p>
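 * <p>For example, an illustrative HOD configuration might look as follows. The path and node count
 * are placeholders, not recommended values; any extra HOD command-line options would go in
 * <tt>plugin.hadoop.hod.params</tt>:
 * <pre>
 * plugin.hadoop.hod=/path/to/hadoop/contrib/hod/bin/hod
 * plugin.hadoop.hod.nodes=12
 * </pre></p>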
 * <p><h3>Using Hadoop MapReduce from Terrier</h3>
 * You should use the JobFactory provided by this class when creating a MapReduce job from Terrier. The JobFactory
 * creates a HOD session should one be required, and also configures jobs such that the Terrier environment can
 * be recreated on the execution nodes.
 * <pre>
 * HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
 * if (jf == null)
 *     throw new Exception("Could not get JobFactory from HadoopPlugin");
 * JobConf conf = jf.newJob();
 * ....
 * jf.close(); //closing the JobFactory will ensure that the HOD session ends
 * </pre>
 * When using your own code in Terrier MapReduce jobs, ensure that you configure the Terrier application before
 * anything else:
 * <pre>
 * public void configure(JobConf jc)
 * {
 *     try{
 *         HadoopUtility.loadTerrierJob(jc);
 *     } catch (Exception e) {
 *         throw new Error("Cannot load ApplicationSetup", e);
 *     }
 * }
 * </pre>
 * </p>
 * @since 2.2
 * @author Craig Macdonald
 * @version $Revision: 1.4 $
 */
public class HadoopPlugin implements ApplicationSetup.TerrierApplicationPlugin
{
    /** instance of this class - it is a singleton */
    protected static HadoopPlugin singletonHadoopPlugin;
    /** main configuration object to use for Hadoop access */
    protected static Configuration singletonConfiguration;
    /** The logger used */
    protected static final Logger logger = Logger.getLogger(HadoopPlugin.class);

    /** A JobFactory is responsible for creating Terrier MapReduce jobs.
     * This should be used when requesting a Terrier map reduce job, as it
     * adequately initialises the job, such that Terrier can run correctly. */
    public static abstract class JobFactory
    {
        /** Make a new job */
        public abstract JobConf newJob() throws Exception;

        protected static void makeTerrierJob(JobConf jc) throws IOException
        {
            HadoopUtility.makeTerrierJob(jc);
        }

        /** Finish with this job factory. If the JobFactory was created using HOD, then
         * the HOD job will also be ended */
        public abstract void close();
    }

    static class DirectJobFactory extends JobFactory
    {
        protected Configuration c;

        DirectJobFactory(Configuration _c)
        {
            c = _c;
        }

        public JobConf newJob() throws Exception
        {
            JobConf rtr = new JobConf(c);
            makeTerrierJob(rtr);
            return rtr;
        }

        public void close() { }
    }

    private static final Random random = new Random();

    static class HODJobFactory extends JobFactory
    {
        protected String hodConfLocation = null;
        protected String hodBinaryLocation = null;
        protected boolean connected = false;

        HODJobFactory(String _hodJobName, String _hodBinaryLocation, String[] hodParams, int HodNumNodes) throws Exception
        {
            hodBinaryLocation = _hodBinaryLocation;
            doHod(_hodJobName, hodParams, HodNumNodes);
        }

        protected void doHod(String jobName, String[] hodParams, int NumNodes) throws Exception
        {
            if (jobName == null || jobName.length() == 0)
                jobName = "terrierHOD";
            logger.info("Processing HOD for " + jobName + " at " + hodBinaryLocation + " request for " + NumNodes + " nodes");
            //find an unused temporary directory to hold the HOD cluster configuration
            File hodDir = null;
            while (hodDir == null)
            {
                hodDir = new File(System.getProperty("java.io.tmpdir", "/tmp") + "/hod" + random.nextInt());
                if (hodDir.exists())
                    hodDir = null;
            }
            if (!hodDir.mkdir())
            {
                throw new IOException("Could not create new HOD tmp dir at " + hodDir);
            }
            //build the HOD command
            String[] command = new String[8 + hodParams.length];
            command[0] = hodBinaryLocation;
            command[1] = "allocate";
            command[2] = "-d"; command[3] = hodDir.getAbsolutePath();
            command[4] = "-n"; command[5] = "" + NumNodes;
            command[6] = "-N"; command[7] = jobName;
            int offset = 8;
            for (String param : hodParams)
                command[offset++] = param;
            //execute the command
            ProcessBuilder pb = new ProcessBuilder();
            pb.command(command);
            Process p = pb.start();
            //log all output from HOD
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(p.getErrorStream()));
                String line = null;
                while ((line = br.readLine()) != null)
                    logger.info(line);
                br.close();
            } catch (IOException ioe) { }
            p.waitFor();
            //check for a successful HOD allocation
            File hodConf = new File(hodDir, "hadoop-site.xml");
            if (!Files.exists((hodConf.toString())))
                throw new IOException("HOD did not produce a hadoop-site.xml");
            final int exitValue = p.exitValue();
            if (exitValue != 0)
            {
                throw new Exception("HOD allocation did not succeed (exit value was " + exitValue + ")");
            }
            hodConfLocation = hodDir.getAbsolutePath();
            connected = true;
        }

        protected void disconnectHod() throws Exception
        {
            logger.info("Processing HOD disconnect");
            ProcessBuilder pb = new ProcessBuilder();
            pb.command(new String[] { hodBinaryLocation, "deallocate", "-d", hodConfLocation });
            Process p = pb.start();
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(p.getErrorStream()));
                String line = null;
                while ((line = br.readLine()) != null)
                    logger.info(line);
                br.close();
            } catch (IOException ioe) { }
            p.waitFor();
            final int exitValue = p.exitValue();
            if (exitValue != 0)
            {
                logger.warn("HOD deallocate might not have succeeded (exit value was " + exitValue + ")");
            }
            connected = false;
        }

        public JobConf newJob() throws Exception
        {
            JobConf rtr = new JobConf(hodConfLocation + "/hadoop-site.xml");
            makeTerrierJob(rtr);
            return rtr;
        }

        protected void finalize()
        {
            close();
        }

        public void close()
        {
            if (connected)
                try {
                    disconnectHod();
                } catch (Exception e) {
                    logger.warn("Encountered exception while closing HOD. A PBS job may need to be deleted.", e);
                }
        }
    }
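    /*
     * For reference, HODJobFactory above shells out to HOD roughly as follows
     * (the temporary directory name is generated at random):
     *
     *   hod allocate -d /tmp/hod<random> -n <nodes> -N <sessionName> [extra params]
     *   hod deallocate -d /tmp/hod<random>
     *
     * The hadoop-site.xml that "hod allocate" writes into that directory is then
     * used to build each JobConf returned by newJob().
     */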
    /** Get a JobFactory with the specified session name.
     * This method attempts three processes, in order:
     * <ol>
     * <li>If the current/default Hadoop configuration has a real Hadoop cluster Job Tracker configured, then
     * that will be used. This requires that the <tt>mapred.job.tracker</tt> property in the hadoop-site.xml
     * be configured.</li>
     * <li>Next, it will attempt to use HOD to build a Hadoop MapReduce cluster. This requires the Terrier property
     * relating to HOD be configured to point to the location of the HOD binary - <tt>plugin.hadoop.hod</tt></li>
     * <li>As a last resort, Terrier will use the local job tracker that Hadoop provides on the localhost.</li>
     * </ol> */
    public static JobFactory getJobFactory(String sessionName)
    {
        return getJobFactory(sessionName, false);
    }

    protected static JobFactory getJobFactory(String sessionName, boolean persistent)
    {
        if (persistent) //TODO
            throw new Error("Persistent JobFactory not yet supported, sorry");
        Configuration globalConf = getGlobalConfiguration();
        try {
            //see if the current hadoop configuration has a real job tracker configured
            if (!globalConf.get("mapred.job.tracker").equals("local"))
            {
                logger.debug("Default configuration has job tracker set");
                return new DirectJobFactory(globalConf);
            }
            //if not, try HOD
            String hod = ApplicationSetup.getProperty("plugin.hadoop.hod", null);
            String[] hodParams = ApplicationSetup.getProperty("plugin.hadoop.hod.params", "").split(" ");
            if (hod != null)
            {
                int HodNodes = Integer.parseInt(ApplicationSetup.getProperty("plugin.hadoop.hod.nodes", "" + 6));
                return new HODJobFactory(sessionName, hod, hodParams, HodNodes);
            }
            //as a last resort, use the local Hadoop job tracker
            logger.warn("No remote job tracker or HOD configuration found, using local job tracker");
            return new DirectJobFactory(globalConf);
        } catch (Exception e) {
            logger.warn("Exception occurred while creating JobFactory", e);
            return null;
        }
    }

    /** Set the Hadoop Configuration used by this plugin */
    public static void setGlobalConfiguration(Configuration _config)
    {
        singletonConfiguration = _config;
    }

    /** Get the Hadoop Configuration used by this plugin, creating a default one if none has been set */
    public static Configuration getGlobalConfiguration()
    {
        if (singletonConfiguration == null)
        {
            singletonConfiguration = new Configuration();
        }
        return singletonConfiguration;
    }

    static HadoopPlugin getHadoopPlugin()
    {
        return singletonHadoopPlugin;
    }
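    /*
     * Usage sketch (hypothetical caller code, not part of this class): a pre-built
     * Hadoop Configuration can be supplied before any JobFactory is requested, for
     * example when embedding Terrier in another Hadoop application. The
     * fs.default.name value below is a placeholder for your own NameNode URI.
     *
     *   Configuration myConf = new Configuration();
     *   myConf.set("fs.default.name", "hdfs://namenode:9000");
     *   HadoopPlugin.setGlobalConfiguration(myConf);
     *   HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("my-session");
     *   JobConf jc = jf.newJob();
     */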
    protected Configuration config = null;
    protected org.apache.hadoop.fs.FileSystem hadoopFS = null;

    public HadoopPlugin()
    {
        singletonHadoopPlugin = this;
    }

    static class HadoopFSRandomAccessFile implements RandomDataInput
    //static class HadoopFSRandomAccessFile extends RandomAccessFile implements RandomDataInput
    {
        FSDataInputStream in;
        org.apache.hadoop.fs.FileSystem fs;
        String filename;

        public HadoopFSRandomAccessFile(org.apache.hadoop.fs.FileSystem fs, String filename) throws IOException
        {
            this.fs = fs;
            this.filename = filename;
            this.in = fs.open(new Path(filename));
        }

        public int read() throws IOException
        {
            return in.read();
        }

        public int read(byte b[], int off, int len) throws IOException
        {
            return in.read(in.getPos(), b, off, len);
        }

        public int readBytes(byte b[], int off, int len) throws IOException
        {
            return in.read(in.getPos(), b, off, len);
        }

        public void seek(long pos) throws IOException
        {
            in.seek(pos);
        }

        public long length() throws IOException
        {
            return fs.getFileStatus(new Path(filename)).getLen();
        }

        public void close() throws IOException
        {
            in.close();
        }

        /* implementation from RandomAccessFile */
        public final double readDouble() throws IOException
        {
            return Double.longBitsToDouble(readLong());
        }

        public final int readUnsignedShort() throws IOException
        {
            int ch1 = this.read();
            int ch2 = this.read();
            if ((ch1 | ch2) < 0)
                throw new EOFException();
            return (ch1 << 8) + (ch2 << 0);
        }

        public final short readShort() throws IOException
        {
            int ch1 = this.read();
            int ch2 = this.read();
            if ((ch1 | ch2) < 0)
                throw new EOFException();
            return (short) ((ch1 << 8) + (ch2 << 0));
        }

        public final int readUnsignedByte() throws IOException
        {
            int ch = this.read();
            if (ch < 0)
                throw new EOFException();
            return ch;
        }

        public final byte readByte() throws IOException
        {
            int ch = this.read();
            if (ch < 0)
                throw new EOFException();
            return (byte) (ch);
        }

        public final boolean readBoolean() throws IOException
        {
            int ch = this.read();
            if (ch < 0)
                throw new EOFException();
            return (ch != 0);
        }

        public final int readInt() throws IOException
        {
            int ch1 = this.read();
            int ch2 = this.read();
            int ch3 = this.read();
            int ch4 = this.read();
            if ((ch1 | ch2 | ch3 | ch4) < 0)
                throw new EOFException();
            return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
        }

        public final long readLong() throws IOException
        {
            return ((long) (readInt()) << 32) + (readInt() & 0xFFFFFFFFL);
        }

        public final float readFloat() throws IOException
        {
            return Float.intBitsToFloat(readInt());
        }

        public final void readFully(byte b[]) throws IOException
        {
            readFully(b, 0, b.length);
        }

        public final void readFully(byte b[], int off, int len) throws IOException
        {
            int n = 0;
            do {
                int count = this.read(b, off + n, len - n);
                if (count < 0)
                    throw new EOFException();
                n += count;
            } while (n < len);
        }

        public int skipBytes(int n) throws IOException
        {
            long pos;
            long len;
            long newpos;
            if (n <= 0)
            {
                return 0;
            }
            pos = getFilePointer();
            len = length();
            newpos = pos + n;
            if (newpos > len)
            {
                newpos = len;
            }
            seek(newpos);
            /* return the actual number of bytes skipped */
            return (int) (newpos - pos);
        }

        public long getFilePointer() throws IOException
        {
            return in.getPos();
        }

        public final char readChar() throws IOException
        {
            int ch1 = this.read();
            int ch2 = this.read();
            if ((ch1 | ch2) < 0)
                throw new EOFException();
            return (char) ((ch1 << 8) + (ch2 << 0));
        }

        public final String readUTF() throws IOException
        {
            return DataInputStream.readUTF(this);
        }

        public final String readLine() throws IOException
        {
            StringBuffer input = new StringBuffer();
            int c = -1;
            boolean eol = false;
            while (!eol)
            {
                switch (c = read())
                {
                    case -1:
                    case '\n':
                        eol = true;
                        break;
                    case '\r':
                        eol = true;
                        long cur = getFilePointer();
                        if ((read()) != '\n')
                        {
                            seek(cur);
                        }
                        break;
                    default:
                        input.append((char) c);
                        break;
                }
            }
            if ((c == -1) && (input.length() == 0))
            {
                return null;
            }
            return input.toString();
        }
    }
    public void initialise() throws Exception
    {
        config = getGlobalConfiguration();
        final org.apache.hadoop.fs.FileSystem DFS = hadoopFS = org.apache.hadoop.fs.FileSystem.get(config);

        FileSystem terrierDFS = new FileSystem()
        {
            public String name()
            {
                return "hdfs";
            }

            /** capabilities of the filesystem */
            public byte capabilities()
            {
                return FSCapability.READ | FSCapability.WRITE | FSCapability.RANDOM_READ
                    | FSCapability.STAT | FSCapability.DEL_ON_EXIT | FSCapability.LS_DIR;
            }

            public String[] schemes()
            {
                return new String[] { "dfs", "hdfs" };
            }

            /** returns true if the path exists */
            public boolean exists(String filename) throws IOException
            {
                if (logger.isDebugEnabled())
                    logger.debug("Checking that " + filename + " exists answer=" + DFS.exists(new Path(filename)));
                return DFS.exists(new Path(filename));
            }

            /** open a file of given filename for reading */
            public InputStream openFileStream(String filename) throws IOException
            {
                if (logger.isDebugEnabled())
                    logger.debug("Opening " + filename);
                return DFS.open(new Path(filename));
            }

            /** open a file of given filename for writing */
            public OutputStream writeFileStream(String filename) throws IOException
            {
                if (logger.isDebugEnabled())
                    logger.debug("Creating " + filename);
                return DFS.create(new Path(filename));
            }

            public boolean mkdir(String filename) throws IOException
            {
                return DFS.mkdirs(new Path(filename));
            }

            public RandomDataOutput writeFileRandom(String filename) throws IOException
            {
                throw new IOException("HDFS does not support random writing");
            }

            public RandomDataInput openFileRandom(String filename) throws IOException
            {
                return new HadoopFSRandomAccessFile(DFS, filename);
            }

            public boolean delete(String filename) throws IOException
            {
                return DFS.delete(new Path(filename), true);
            }

            public boolean deleteOnExit(String filename) throws IOException
            {
                return DFS.deleteOnExit(new Path(filename));
            }

            public String[] list(String path) throws IOException
            {
                final FileStatus[] contents = DFS.listStatus(new Path(path));
                final String[] names = new String[contents.length];
                for (int i = 0; i < contents.length; i++)
                {
                    names[i] = contents[i].getPath().getName();
                }
                return names;
            }

            public String getParent(String path) throws IOException
            {
                return new Path(path).getParent().getName();
            }

            public boolean rename(String source, String destination) throws IOException
            {
                return DFS.rename(new Path(source), new Path(destination));
            }

            public boolean isDirectory(String path) throws IOException
            {
                return DFS.getFileStatus(new Path(path)).isDir();
            }

            public long length(String path) throws IOException
            {
                return DFS.getFileStatus(new Path(path)).getLen();
            }

            public boolean canWrite(String path) throws IOException
            {
                return DFS.getFileStatus(new Path(path)).getPermission().getUserAction().implies(FsAction.WRITE);
            }

            public boolean canRead(String path) throws IOException
            {
                return DFS.getFileStatus(new Path(path)).getPermission().getUserAction().implies(FsAction.READ);
            }
        };
        Files.addFileSystemCapability(terrierDFS);
    }

    public Configuration getConfiguration()
    {
        return config;
    }
}
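Once the plugin has been loaded (via the terrier.plugins property) and initialise() has run, code elsewhere in Terrier can address HDFS paths through the Files utility, since the anonymous FileSystem above is registered for the "dfs" and "hdfs" schemes. A minimal sketch follows; it assumes that org.dutir.lucene.util.Files exposes static helpers such as openFileStream mirroring the FileSystem interface registered above (Files.exists is used by this class itself), and the NameNode URI is a placeholder:

    // hypothetical caller code, not part of HadoopPlugin itself
    String path = "hdfs://namenode:9000/collections/sample.txt"; // placeholder URI
    if (Files.exists(path)) {
        java.io.InputStream in = Files.openFileStream(path); // dispatched to the HDFS FileSystem registered above
        // ... read collection or index data ...
        in.close();
    }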