Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package nl.gridline.zieook.runners;

import java.io.File;
import java.io.IOException;
import java.util.Map.Entry;

import nl.gridline.zieook.configuration.Config;
import nl.gridline.zieook.mapreduce.TaskConfig;
import nl.gridline.zieook.tasks.ZieOokTask;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
 * [purpose]
 * <p />
 * Project zieook-runner<br />
 * ZieOokRunnerTool.java created 18 feb. 2011
 * <p />
 * Copyright, all rights reserved 2011 GridLine Amsterdam
 * @author <a href="mailto:job@gridline.nl">Job</a>
 * @version $Revision$, $Date$
 */
public abstract class ZieOokRunnerTool extends Configured implements Tool
{
	private static final Logger LOG = LoggerFactory.getLogger(ZieOokRunnerTool.class);

	/**
	 * The task takes care of the communication with the world.
	 */
	protected final ZieOokTask task;

	/**
	 * Sets the basics - see {@link #setBasics()}.
	 */
	public ZieOokRunnerTool(ZieOokTask task)
	{
		// setBasics();
		super(new Configuration());
		Preconditions.checkNotNull(task);
		this.task = task;
	}

	public TaskConfig getTaskConfiguration()
	{
		return task.getConfig();
	}

	/**
	 * Set an item in the Hadoop configuration.
	 * @param key
	 * @param value
	 */
	public void set(String key, String value)
	{
		getConf().set(key, value);
	}
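	// A hypothetical usage sketch (not part of the original source): callers
	// configure a concrete subclass before running it, for example:
	//
	// ZieOokRunnerTool tool = ...; // some concrete subclass
	// tool.setJar("/usr/share/zieook/zieook-mapred.jar"); // hypothetical jar location
	// tool.set("zieook.example.key", "value"); // any extra Hadoop setting
	// tool.execute();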
	/**
	 * Set the basic properties that are always the same in the ZieOok context. The items are read directly from the
	 * ZieOok configuration files.<br />
	 * The following MapReduce-related keys will be set:<br />
	 * <ul>
	 * <li><tt>mapred.job.tracker</tt> read from <tt>mapred.job.tracker</tt></li>
	 * <li><tt>hbase.zookeeper.quorum</tt> read from <tt>zieook.zookeeper.host</tt></li>
	 * <li><tt>hbase.cluster.distributed</tt> read from <tt>hbase.cluster.distributed</tt></li>
	 * <li><tt>hbase.rootdir</tt> read from <tt>hbase.rootdir</tt></li>
	 * </ul>
	 */
	protected void setBasics()
	{
		try
		{
			// set zieook configuration on the Job configuration:
			Config zieook = Config.getInstance();
			Configuration conf = getConf();

			String tracker = zieook.get(Config.ZIEOOK_JOBTRACKER);
			conf.set("mapred.job.tracker", tracker); // MapReduceSettings.JOB_TRACKER_HOST_NAME_PORT

			// settings from hbase-site.xml:
			conf.set("hbase.zookeeper.quorum", zieook.getZooKeeperHost());
			conf.set("hbase.zookeeper.property.clientPort", zieook.getZooKeeperPort());
			conf.set("hbase.rootdir", zieook.get("hbase.rootdir"));
			conf.set("hbase.cluster.distributed", zieook.get("hbase.cluster.distributed"));
			conf.set("fs.default.name", zieook.get(Config.ZIEOOK_HDFS_SERVER));
			// conf.set("user.name", "mapred");
		}
		catch (Exception e)
		{
			LOG.error("Failed to set the basic ZieOok configuration", e);
		}
		LOG.info("SETTING BASICS: DONE");
	}

	public void setJar(String jar) throws IOException
	{
		File file = new File(jar.trim()).getCanonicalFile();
		// check the location of the jar file:
		LOG.info("map reduce jar: {}", file);
		if (!file.exists())
		{
			LOG.error("map reduce jar: does not exist '{}'", file);
			throw new IOException("map reduce jar: does not exist '" + file.toString() + "'");
		}
		if (!file.canRead())
		{
			LOG.error("map reduce jar: cannot be read '{}'", file);
			throw new IOException("map reduce jar: cannot be read '" + file.toString() + "'");
		}
		getConf().set("mapred.jar", file.toString());
	}

	public String getJar()
	{
		return getConf().get("mapred.jar");
	}

	public void setOutputTable(String table)
	{
		getConf().set(TableOutputFormat.OUTPUT_TABLE, table);
	}

	public String getOutputTable()
	{
		return getConf().get(TableOutputFormat.OUTPUT_TABLE);
	}

	public void setInputTable(String table)
	{
		getConf().set(TableInputFormat.INPUT_TABLE, table);
	}

	public String getInputTable()
	{
		return getConf().get(TableInputFormat.INPUT_TABLE);
	}

	public abstract boolean execute() throws Exception;

	// public abstract ZieOokRunnerTool configure(String input, String output) throws Exception;

	/**
	 * A convenience method that prints the current configuration to the debug log.
	 */
	public void printConfiguration()
	{
		Configuration conf = getConf();
		for (Entry<String, String> entry : conf)
		{
			LOG.debug("{}={}", entry.getKey(), entry.getValue());
		}
	}

	/**
	 * All options passed through here are ignored for now.
	 */
	@Override
	public int run(String[] arg0) throws Exception
	{
		return 0;
	}

	/**
	 * @param inputPath input path
	 * @param outputPath output path
	 * @param inputFormat input format
	 * @param mapper mapper class
	 * @param mapperKey mapper key
	 * @param mapperValue mapper value
	 * @param reducer reducer class
	 * @param reducerKey reducer key
	 * @param reducerValue reducer value
	 * @param outputFormat output format
	 * @return a ready-to-run Job
	 * @throws IOException
	 */
	@SuppressWarnings("rawtypes")
	public Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
			Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue,
			Class<? extends Reducer> reducer, Class<? extends Writable> reducerKey,
			Class<? extends Writable> reducerValue, Class<? extends OutputFormat> outputFormat) throws IOException
	{
		Job job = new Job(new Configuration(getConf()));
		Configuration jobConf = job.getConfiguration();

		// This never really worked for me: job.setJarByClass... in any case, we already know the jar
		// if (reducer.equals(Reducer.class))
		// {
		// if (mapper.equals(Mapper.class))
		// {
		// throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
		// }
		// job.setJarByClass(mapper);
		// }
		// else
		// {
		// job.setJarByClass(reducer);
		// }

		job.setInputFormatClass(inputFormat);
		if (inputPath != null)
		{
			jobConf.set("mapred.input.dir", inputPath.toString());
		}

		job.setMapperClass(mapper);
		job.setMapOutputKeyClass(mapperKey);
		job.setMapOutputValueClass(mapperValue);
		jobConf.setBoolean("mapred.compress.map.output", true);

		job.setReducerClass(reducer);
		job.setOutputKeyClass(reducerKey);
		job.setOutputValueClass(reducerValue);

		final String name = getCustomJobName(job, mapper, reducer);
		job.setJobName(name);

		job.setOutputFormatClass(outputFormat);
		jobConf.set("mapred.output.dir", outputPath.toString());

		LOG.debug("job setup for: {}", name);

		return job;
	}
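	// A hypothetical usage sketch (not part of the original source): a subclass's
	// execute() could wire up a plain HDFS-to-HDFS job through prepareJob like
	// this, assuming MyMapper and MyReducer exist on the job jar's classpath:
	//
	// Job job = prepareJob(new Path("/zieook/in"), new Path("/zieook/out"),
	// 		SequenceFileInputFormat.class, MyMapper.class, Text.class, LongWritable.class,
	// 		MyReducer.class, Text.class, LongWritable.class, SequenceFileOutputFormat.class);
	// return job.waitForCompletion(true);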
	/**
	 * @param inputTable input table
	 * @param outputPath output path
	 * @param scanner table row scanner object
	 * @param mapper mapper class
	 * @param mapperKey table mapper key
	 * @param mapperValue table mapper value
	 * @param reducer reducer class
	 * @param reducerKey reducer key
	 * @param reducerValue reducer value
	 * @param outputFormat output (file) format
	 * @return a ready-to-run job, unless you need to assign more job-specific settings
	 * @throws IOException
	 */
	@SuppressWarnings("rawtypes")
	public Job prepareTableMapper(String inputTable, Path outputPath, Scan scanner,
			Class<? extends TableMapper> mapper, Class<? extends WritableComparable> mapperKey,
			Class<? extends WritableComparable> mapperValue, Class<? extends Reducer> reducer,
			Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
			Class<? extends OutputFormat> outputFormat) throws IOException
	{
		setInputTable(inputTable);

		// configure the job:
		Configuration conf = getConf();
		Job job = new Job(conf);
		job.setJobName(getCustomJobName(job, mapper, reducer));

		// mapper:
		TableMapReduceUtil.initTableMapperJob(getInputTable(), scanner, mapper, mapperKey, mapperValue, job);

		job.setReducerClass(reducer);
		job.setOutputKeyClass(reducerKey);
		job.setOutputValueClass(reducerValue);
		job.setOutputFormatClass(outputFormat);
		FileOutputFormat.setOutputPath(job, outputPath);

		return job;
	}

	/**
	 * @param inputPath input path
	 * @param outputTable output table
	 * @param inputFormat input format
	 * @param mapper mapper class
	 * @param mapperKey mapper class key
	 * @param mapperValue mapper class value
	 * @param reducer table reducer
	 * @return a ready-to-execute job, unless you need to set specific job settings for the mapper / reducer
	 * @throws IOException
	 */
	@SuppressWarnings("rawtypes")
	public Job prepareTableReducer(Path inputPath, String outputTable, Class<? extends InputFormat> inputFormat,
			Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue,
			Class<? extends TableReducer> reducer) throws IOException
	{
		setOutputTable(outputTable);

		Configuration conf = getConf();
		conf.set("mapred.input.dir", inputPath.toString());

		Job job = new Job(conf);
		job.setJobName(getCustomJobName(job, mapper, reducer));
		job.setInputFormatClass(inputFormat);
		job.setMapperClass(mapper);
		job.setMapOutputKeyClass(mapperKey);
		job.setMapOutputValueClass(mapperValue);

		TableMapReduceUtil.initTableReducerJob(getOutputTable(), reducer, job);

		return job;
	}
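	// A hypothetical sketch (not in the original source): exporting an HBase table
	// to HDFS via prepareTableMapper, assuming a RatingsMapper extending
	// TableMapper<LongWritable, Text>; the identity Reducer passes pairs through:
	//
	// Scan scan = new Scan();
	// scan.setCaching(500); // larger scanner batches for MR throughput
	// Job job = prepareTableMapper("zieook_ratings", new Path("/zieook/ratings"), scan,
	// 		RatingsMapper.class, LongWritable.class, Text.class,
	// 		Reducer.class, LongWritable.class, Text.class, SequenceFileOutputFormat.class);
	// job.waitForCompletion(true);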
	/**
	 * Generates a descriptive job name: the existing job name (or this class' simple name when none is set), followed
	 * by the mapper and reducer simple names, separated by dashes.
	 * @param job
	 * @param mapper
	 * @param reducer
	 * @return the generated job name
	 */
	@SuppressWarnings("rawtypes")
	protected String getCustomJobName(JobContext job, Class<? extends Mapper> mapper, Class<? extends Reducer> reducer)
	{
		StringBuilder name = new StringBuilder(100);
		String customJobName = job.getJobName();
		if (customJobName == null || customJobName.trim().length() == 0)
		{
			name.append(getClass().getSimpleName());
		}
		else
		{
			name.append(customJobName);
		}
		name.append('-').append(mapper.getSimpleName());
		name.append('-').append(reducer.getSimpleName());
		return name.toString();
	}

	/**
	 * Delete data from HDFS, recursively.
	 * @param path
	 * @throws IOException
	 */
	protected void cleanup(Path path) throws IOException
	{
		Configuration hdfsConfig = new Configuration();
		hdfsConfig.set("fs.default.name", Config.getInstance().get("fs.default.name"));
		FileSystem hdfs = FileSystem.get(hdfsConfig);
		hdfs.delete(path, true);
		LOG.debug("cleanup <{}>", path);
	}
}
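To show how the pieces fit together, here is a minimal sketch of a concrete runner. It is not part of the original source: the LineCountTool name, its mapper and reducer, and the line-counting job itself are hypothetical stand-ins; only the ZieOokRunnerTool calls (setBasics, cleanup, prepareJob) come from the class above.

package nl.gridline.zieook.runners;

import java.io.IOException;

import nl.gridline.zieook.tasks.ZieOokTask;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * A minimal, hypothetical subclass: counts the lines of an HDFS input and writes the total back to HDFS.
 */
public class LineCountTool extends ZieOokRunnerTool
{
	private final Path input;
	private final Path output;

	public LineCountTool(ZieOokTask task, Path input, Path output)
	{
		super(task);
		this.input = input;
		this.output = output;
	}

	public static class LineCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>
	{
		private static final Text KEY = new Text("lines");
		private static final LongWritable ONE = new LongWritable(1);

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
		{
			// emit one count per input line:
			context.write(KEY, ONE);
		}
	}

	public static class LineCountReducer extends Reducer<Text, LongWritable, Text, LongWritable>
	{
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException,
				InterruptedException
		{
			long sum = 0;
			for (LongWritable value : values)
			{
				sum += value.get();
			}
			context.write(key, new LongWritable(sum));
		}
	}

	@Override
	public boolean execute() throws Exception
	{
		setBasics(); // push the ZieOok defaults into the configuration
		cleanup(output); // remove a previous run's output, if any

		Job job = prepareJob(input, output, TextInputFormat.class, LineCountMapper.class, Text.class,
				LongWritable.class, LineCountReducer.class, Text.class, LongWritable.class, TextOutputFormat.class);
		return job.waitForCompletion(true);
	}
}

Note that prepareJob only configures the job; the subclass decides when to submit it, which is why execute() ends with waitForCompletion.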