datafu.hourglass.jobs.AbstractJob.java Source code

Java tutorial

Introduction

Here is the source code for datafu.hourglass.jobs.AbstractJob.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.hourglass.jobs;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Base class for Hadoop jobs.
 * 
 * <p>
 * This class defines a set of common methods and configuration shared by Hadoop jobs.
 * Jobs can be configured either by providing properties or by calling setters.
 * Each property has a corresponding setter.
 * </p>
 * 
 * <p>
 * This class recognizes the following properties:
 * </p>
 * 
 * <ul>
 *   <li><em>input.path</em> - Input path job will read from</li>
 *   <li><em>output.path</em> - Output path job will write to</li>
 *   <li><em>temp.path</em> - Temporary path under which intermediate files are stored</li>
 *   <li><em>retention.count</em> - Number of days to retain in output directory</li>
 *   <li><em>num.reducers</em> - Number of reducers to use</li>
 *   <li><em>use.combiner</em> - Whether to use a combiner or not</li>
 *   <li><em>counters.path</em> - Path to store job counters in</li>
 * </ul>
 * 
 * <p>
 * The <em>input.path</em> property may be a comma-separated list of paths.  When there is more
 * than one it implies a join is to be performed.  Alternatively the paths may be listed separately.
 * For example, <em>input.path.first</em> and <em>input.path.second</em> define two separate input
 * paths.  Separately listed paths are only consulted when <em>input.path</em> itself is not set,
 * and they are added in the sorted order of their property names.
 * </p>
 * 
 * <p>
 * The <em>num.reducers</em> fixes the number of reducers.  When not set the number of reducers
 * is computed based on the input size.
 * </p>
 * 
 * <p>
 * The <em>temp.path</em> property defines the parent directory for temporary paths, not the
 * temporary path itself.  Temporary paths are created under this directory with an <em>hourglass-</em>
 * prefix followed by a GUID.
 * </p>
 * 
 * <p> 
 * The input and output paths are the only required parameters.  The rest are optional.
 * </p>
 * 
 * <p>
 * Hadoop configuration may be provided by setting a property with the prefix <em>hadoop-conf.</em>.
 * For example, <em>mapred.min.split.size</em> can be configured by setting property
 * <em>hadoop-conf.mapred.min.split.size</em> to the desired value.  The prefix is matched
 * without regard to case.
 * </p>
 * 
 * <p>
 * To let unit tests inject configuration, a <em>test.conf</em> property holding Base64-encoded
 * data readable by {@link Configuration#readFields(java.io.DataInput)} is also recognized.
 * When it is present the <em>hadoop-conf.</em> properties are not applied.
 * </p>
 * 
 * @author "Matthew Hayes"
 *
 */
public abstract class AbstractJob extends Configured {
    /** Prefix that marks a property as a Hadoop configuration entry. */
    private static final String HADOOP_PREFIX = "hadoop-conf.";

    /** Prefix of the properties that each name one separate input path. */
    private static final String INPUT_PATH_PREFIX = "input.path.";

    private Properties _props;
    private String _name;
    private boolean _useCombiner;
    private Path _countersParentPath;
    private Integer _numReducers;
    private Integer _retentionCount;
    private List<Path> _inputPaths;
    private Path _outputPath;
    private Path _tempPath = new Path("/tmp");
    private FileSystem _fs;

    /**
     * Initializes the job with a fresh Hadoop {@link Configuration}.
     */
    public AbstractJob() {
        setConf(new Configuration());
    }

    /**
     * Initializes the job with a job name and properties.
     * 
     * <p>
     * The name and properties are applied through {@link #setName(String)} and
     * {@link #setProperties(Properties)}.  Both are overridable, so a subclass override
     * runs before that subclass's own field initializers have executed.
     * </p>
     * 
     * @param name Job name
     * @param props Configuration properties, must not be null
     */
    public AbstractJob(String name, Properties props) {
        this();
        setName(name);
        setProperties(props);
    }

    /**
     * Gets the job name
     * 
     * @return Job name
     */
    public String getName() {
        return _name;
    }

    /**
     * Sets the job name
     * 
     * @param name Job name
     */
    public void setName(String name) {
        _name = name;
    }

    /**
     * Gets the configuration properties.
     * 
     * @return Configuration properties, or null if none have been set
     */
    public Properties getProperties() {
        return _props;
    }

    /**
     * Sets the configuration properties and applies every recognized property through
     * the corresponding setter.
     * 
     * <p>
     * Values are looked up with {@link Properties#getProperty(String)}, so defaults attached
     * to the given properties are honored.  Properties that are absent leave the current
     * setting untouched.  Numeric and boolean values are trimmed before being parsed.
     * </p>
     * 
     * @param props Properties, must not be null
     * @throws NullPointerException if props is null
     * @throws NumberFormatException if <em>retention.count</em> or <em>num.reducers</em> is not an integer
     * @throws RuntimeException if <em>input.path</em> is set but yields no path, or if
     *         <em>test.conf</em> cannot be read
     */
    public void setProperties(Properties props) {
        if (props == null) {
            // Fail before touching any state rather than half-way through.
            throw new NullPointerException("props must not be null");
        }

        _props = props;
        updateConfigurationFromProps(props);

        List<Path> inputPaths = readInputPaths(props);
        if (!inputPaths.isEmpty()) {
            setInputPaths(inputPaths);
        }

        String outputPath = props.getProperty("output.path");
        if (outputPath != null) {
            setOutputPath(new Path(outputPath));
        }

        String tempPath = props.getProperty("temp.path");
        if (tempPath != null) {
            setTempPath(new Path(tempPath));
        }

        String retentionCount = props.getProperty("retention.count");
        if (retentionCount != null) {
            setRetentionCount(Integer.parseInt(retentionCount.trim()));
        }

        String numReducers = props.getProperty("num.reducers");
        if (numReducers != null) {
            setNumReducers(Integer.parseInt(numReducers.trim()));
        }

        String useCombiner = props.getProperty("use.combiner");
        if (useCombiner != null) {
            // Untrimmed, a value such as "true " would silently parse as false.
            setUseCombiner(Boolean.parseBoolean(useCombiner.trim()));
        }

        String countersPath = props.getProperty("counters.path");
        if (countersPath != null) {
            setCountersParentPath(new Path(countersPath));
        }
    }

    /**
     * Extracts the input paths from either <em>input.path</em> or the <em>input.path.*</em> properties.
     * 
     * @param props Properties to read from
     * @return Input paths; empty when neither form of the property is present
     * @throws RuntimeException if <em>input.path</em> is set but contains no non-blank path
     */
    private static List<Path> readInputPaths(Properties props) {
        List<Path> paths = new ArrayList<Path>();

        String combined = props.getProperty("input.path");
        if (combined != null) {
            for (String piece : combined.split(",")) {
                String trimmed = piece.trim();
                if (trimmed.length() > 0) {
                    paths.add(new Path(trimmed));
                }
            }
            if (paths.isEmpty()) {
                throw new RuntimeException("Could not extract input paths from: " + combined);
            }
            return paths;
        }

        // Sorted so the resulting order does not depend on hash table iteration order.
        for (String name : new TreeSet<String>(props.stringPropertyNames())) {
            if (name.startsWith(INPUT_PATH_PREFIX)) {
                paths.add(new Path(props.getProperty(name)));
            }
        }
        return paths;
    }

    /**
     * Hook for providing custom configuration before the job starts.
     * The default implementation does nothing.
     * 
     * @param conf Configuration that may be customized
     */
    public void config(Configuration conf) {
    }

    /**
     * Gets the number of reducers to use.
     * 
     * @return Number of reducers, or null if not set
     */
    public Integer getNumReducers() {
        return _numReducers;
    }

    /**
     * Sets the number of reducers to use.  Can also be set with <em>num.reducers</em> property.
     * 
     * @param numReducers Number of reducers to use
     */
    public void setNumReducers(Integer numReducers) {
        this._numReducers = numReducers;
    }

    /**
     * Gets whether the combiner should be used.
     * 
     * @return True if combiner should be used, otherwise false.
     */
    public boolean isUseCombiner() {
        return _useCombiner;
    }

    /**
     * Sets whether the combiner should be used.  Can also be set with <em>use.combiner</em>.
     * 
     * @param useCombiner True if a combiner should be used, otherwise false.
     */
    public void setUseCombiner(boolean useCombiner) {
        this._useCombiner = useCombiner;
    }

    /**
     * Gets the path where counters will be stored.
     * 
     * @return Counters path, or null if not set
     */
    public Path getCountersParentPath() {
        return _countersParentPath;
    }

    /**
     * Sets the path where counters will be stored.  Can also be set with <em>counters.path</em>.
     * 
     * @param countersParentPath Counters path
     */
    public void setCountersParentPath(Path countersParentPath) {
        this._countersParentPath = countersParentPath;
    }

    /**
     * Gets the number of days of data which will be retained in the output path.
     * Only the latest will be kept.  Older paths will be removed.
     * 
     * @return retention count, or null if not set
     */
    public Integer getRetentionCount() {
        return _retentionCount;
    }

    /**
     * Sets the number of days of data which will be retained in the output path.
     * Only the latest will be kept.  Older paths will be removed.
     * Can also be set with <em>retention.count</em>.
     * 
     * @param retentionCount Number of days to retain
     */
    public void setRetentionCount(Integer retentionCount) {
        this._retentionCount = retentionCount;
    }

    /**
     * Gets the input paths.  Multiple input paths imply a join is to be performed.
     * 
     * @return input paths, or null if not set
     */
    public List<Path> getInputPaths() {
        return _inputPaths;
    }

    /**
     * Sets the input paths.  Multiple input paths imply a join is to be performed.
     * Can also be set with <em>input.path</em> or several properties starting with
     * <em>input.path.</em>.
     * 
     * @param inputPaths input paths
     */
    public void setInputPaths(List<Path> inputPaths) {
        this._inputPaths = inputPaths;
    }

    /**
     * Gets the output path.
     * 
     * @return output path, or null if not set
     */
    public Path getOutputPath() {
        return _outputPath;
    }

    /**
     * Sets the output path.  Can also be set with <em>output.path</em>.
     * 
     * @param outputPath output path
     */
    public void setOutputPath(Path outputPath) {
        this._outputPath = outputPath;
    }

    /**
     * Gets the temporary path under which intermediate files will be stored.  Defaults to /tmp.
     * 
     * @return Temporary path
     */
    public Path getTempPath() {
        return _tempPath;
    }

    /**
     * Sets the temporary path under which intermediate files will be stored.  Defaults to /tmp.
     * Can also be set with <em>temp.path</em>.
     * 
     * @param tempPath Temporary path
     */
    public void setTempPath(Path tempPath) {
        this._tempPath = tempPath;
    }

    /**
     * Gets the file system for the job's configuration.  The file system is obtained on
     * first use and the same instance is returned afterwards.
     * 
     * @return File system
     * @throws RuntimeException wrapping the IOException if the file system cannot be obtained
     */
    protected FileSystem getFileSystem() {
        if (_fs == null) {
            try {
                _fs = FileSystem.get(getConf());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        return _fs;
    }

    /**
     * Generates a random temporary path within the file system.  This does not create the path.
     * The path is a child of the temporary path named <em>hourglass-</em> followed by a random UUID.
     * 
     * @return Random temporary path
     */
    protected Path randomTempPath() {
        return new Path(_tempPath, String.format("hourglass-%s", UUID.randomUUID()));
    }

    /**
     * Creates a random temporary path within the file system.
     * 
     * @return Random temporary path
     * @throws IOException if the path cannot be checked or created
     */
    protected Path createRandomTempPath() throws IOException {
        return ensurePath(randomTempPath());
    }

    /**
     * Creates a path, if it does not already exist.
     * 
     * @param path Path to create
     * @return The same path that was provided
     * @throws IOException if the path cannot be checked or created
     */
    protected Path ensurePath(Path path) throws IOException {
        FileSystem fs = getFileSystem();
        if (!fs.exists(path)) {
            fs.mkdirs(path);
        }
        return path;
    }

    /**
     * Validation required before running job.
     * 
     * @throws IllegalArgumentException if no input path or no output path has been set
     */
    protected void validate() {
        if (_inputPaths == null || _inputPaths.isEmpty()) {
            throw new IllegalArgumentException("Input path is not specified.");
        }

        if (_outputPath == null) {
            throw new IllegalArgumentException("Output path is not specified.");
        }
    }

    /**
     * Initialization required before running job.
     * The default implementation does nothing.
     */
    protected void initialize() {
    }

    /**
     * Run the job.
     * 
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    public abstract void run() throws IOException, InterruptedException, ClassNotFoundException;

    /**
     * Updates the Hadoop configuration of this job using the provided properties.
     * 
     * <p>
     * If <em>test.conf</em> is present its decoded content is read into the configuration and
     * nothing else is applied.  Otherwise every property whose name starts with
     * <em>hadoop-conf.</em> (ignoring case) is copied into the configuration with the prefix removed.
     * </p>
     * 
     * @param props Properties to read from
     * @throws RuntimeException wrapping the IOException if <em>test.conf</em> cannot be read
     */
    private void updateConfigurationFromProps(Properties props) {
        Configuration config = getConf();

        if (config == null) {
            // Install the new configuration; otherwise everything set below would be
            // written to an object that is discarded when this method returns.
            config = new Configuration();
            setConf(config);
        }

        // to enable unit tests to inject configuration
        String testConf = props.getProperty("test.conf");
        if (testConf != null) {
            try {
                byte[] decoded = Base64.decodeBase64(testConf);
                config.readFields(new DataInputStream(new ByteArrayInputStream(decoded)));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return;
        }

        int prefixLength = HADOOP_PREFIX.length();
        for (String key : props.stringPropertyNames()) {
            // regionMatches compares case-insensitively without depending on the default locale.
            if (key.regionMatches(true, 0, HADOOP_PREFIX, 0, prefixLength)) {
                config.set(key.substring(prefixLength), props.getProperty(key));
            }
        }
    }
}