org.commoncrawl.util.JobBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.util.JobBuilder.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunner;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

/**
 * Fluent helper for assembling a Hadoop (old {@code mapred} API) {@link JobConf}.
 *
 * <p>Every configuration method mutates the single underlying {@link JobConf} and
 * returns {@code this} so calls can be chained; {@link #build()} hands back that
 * same instance. The {@code throws IOException} clauses are kept on all methods
 * so existing callers that catch {@link IOException} keep compiling.
 *
 * @author rana
 */
public class JobBuilder {

    /** The job configuration being assembled; returned as-is by {@link #build()}. */
    JobConf _jobConf;

    /**
     * Create a builder whose job configuration is seeded from {@code conf}.
     *
     * @param jobName name assigned to the job
     * @param conf    base configuration copied into the new {@link JobConf}
     */
    public JobBuilder(String jobName, Configuration conf) {
        _jobConf = new JobConf(conf);
        // legacy Nutch resources, still layered on top of the supplied configuration
        _jobConf.addResource("nutch-default.xml");
        _jobConf.addResource("nutch-site.xml");
        // defaults: JVM reuse count (see reuseJVM to override) and the job name
        _jobConf.setNumTasksToExecutePerJvm(1000);
        _jobConf.setJobName(jobName);
    }

    /**
     * Add several input paths to the job config.
     *
     * @param inputs paths to add, in iteration order
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder inputs(List<Path> inputs) throws IOException {
        for (Path input : inputs) {
            FileInputFormat.addInputPath(_jobConf, input);
        }
        return this;
    }

    /**
     * Add a single input path to the job config.
     *
     * @param input path to add
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder input(Path input) throws IOException {
        FileInputFormat.addInputPath(_jobConf, input);
        return this;
    }

    /**
     * Set the job's output path.
     *
     * @param outputPath output location
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder output(Path outputPath) throws IOException {
        FileOutputFormat.setOutputPath(_jobConf, outputPath);
        return this;
    }

    /**
     * Set the input format.
     *
     * @param inputFormat input format class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder inputFormat(Class<? extends InputFormat> inputFormat) throws IOException {
        _jobConf.setInputFormat(inputFormat);
        return this;
    }

    /**
     * Shorthand for {@code inputFormat(SequenceFileInputFormat.class)}.
     *
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder inputIsSeqFile() throws IOException {
        _jobConf.setInputFormat(SequenceFileInputFormat.class);
        return this;
    }

    /**
     * Set the output format.
     *
     * @param outputFormat output format class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder outputFormat(Class<? extends OutputFormat> outputFormat) throws IOException {
        _jobConf.setOutputFormat(outputFormat);
        return this;
    }

    /**
     * Shorthand for {@code outputFormat(SequenceFileOutputFormat.class)}.
     *
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder outputIsSeqFile() throws IOException {
        _jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        return this;
    }

    /**
     * Select the job jar as the jar that contains {@code theClass}.
     *
     * @param theClass class used to locate the jar
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder jarByClass(Class<?> theClass) throws IOException {
        _jobConf.setJarByClass(theClass);
        return this;
    }

    /**
     * Set the mapper class and use it to locate the job jar.
     *
     * <p>{@link IdentityMapper} is not written into the configuration as the mapper
     * class. It is also only used to locate the job jar when no jar has been chosen
     * yet, so it never replaces a jar previously selected from a user class.
     *
     * @param mapper mapper implementation
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder mapper(Class<? extends Mapper> mapper) throws IOException {
        boolean builtIn = (mapper == IdentityMapper.class);
        if (!builtIn) {
            _jobConf.setMapperClass(mapper);
        }
        adoptJarOf(mapper, builtIn);
        return this;
    }

    /**
     * Set the map runner class and use it to locate the job jar.
     *
     * @param mapRunner map runner implementation
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder mapRunner(Class<? extends MapRunner> mapRunner) throws IOException {
        _jobConf.setMapRunnerClass(mapRunner);
        _jobConf.setJarByClass(mapRunner);
        return this;
    }

    /**
     * Set the map output key/value classes.
     *
     * <p>This also sets the job output key/value classes to the same types, exactly
     * like {@link #keyValue(Class, Class)}; call {@link #outputKeyValue(Class, Class)}
     * afterwards when the final output types differ.
     *
     * @param key   key class
     * @param value value class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder mapperKeyValue(Class<? extends WritableComparable> key, Class<? extends Writable> value)
            throws IOException {
        return keyValue(key, value);
    }

    /**
     * Set the reducer class (optionally also as combiner) and use it to locate the job jar.
     *
     * <p>{@link IdentityReducer} is not written into the configuration as the reducer
     * class. It is also only used to locate the job jar when no jar has been chosen
     * yet, so it never replaces a jar previously selected from a user class.
     *
     * @param reducer     reducer implementation
     * @param hasCombiner when {@code true}, {@code reducer} is also set as the combiner
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder reducer(Class<? extends Reducer> reducer, boolean hasCombiner) throws IOException {
        boolean builtIn = (reducer == IdentityReducer.class);
        if (!builtIn) {
            _jobConf.setReducerClass(reducer);
        }
        if (hasCombiner) {
            _jobConf.setCombinerClass(reducer);
        }
        adoptJarOf(reducer, builtIn);
        return this;
    }

    /**
     * Set only the job (final) output key/value classes.
     *
     * @param key   output key class
     * @param value output value class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder outputKeyValue(Class<? extends WritableComparable> key, Class<? extends Writable> value)
            throws IOException {
        _jobConf.setOutputKeyClass(key);
        _jobConf.setOutputValueClass(value);
        return this;
    }

    /**
     * Set both the map output and the job output key/value classes to the same types.
     *
     * @param key   key class
     * @param value value class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder keyValue(Class<? extends WritableComparable> key, Class<? extends Writable> value)
            throws IOException {
        _jobConf.setMapOutputKeyClass(key);
        _jobConf.setMapOutputValueClass(value);
        _jobConf.setOutputKeyClass(key);
        _jobConf.setOutputValueClass(value);
        return this;
    }

    /**
     * Set the requested number of map tasks.
     *
     * @param mappers number of map tasks
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder numMappers(int mappers) throws IOException {
        _jobConf.setNumMapTasks(mappers);
        return this;
    }

    /**
     * Set the number of reduce tasks.
     *
     * @param reducers number of reduce tasks
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder numReducers(int reducers) throws IOException {
        _jobConf.setNumReduceTasks(reducers);
        return this;
    }

    /**
     * Enable or disable compression of map output.
     *
     * @param compress whether map output is compressed
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder compressMapOutput(boolean compress) throws IOException {
        _jobConf.setCompressMapOutput(compress);
        return this;
    }

    /**
     * Turn on job output compression with the given type and codec.
     *
     * @param type  value stored under {@code mapred.output.compression.type}
     * @param codec codec class stored under {@code mapred.output.compression.codec}
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder compressor(CompressionType type, Class<? extends CompressionCodec> codec) throws IOException {
        _jobConf.setBoolean("mapred.output.compress", true);
        _jobConf.set("mapred.output.compression.type", type.toString());
        _jobConf.setClass("mapred.output.compression.codec", codec, CompressionCodec.class);
        return this;
    }

    /**
     * Set the job output compression type; {@link CompressionType#NONE} switches
     * output compression off, any other type switches it on.
     *
     * @param type compression type
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder compressType(CompressionType type) throws IOException {
        _jobConf.setBoolean("mapred.output.compress", type != CompressionType.NONE);
        _jobConf.set("mapred.output.compression.type", type.toString());
        return this;
    }

    /**
     * Set the comparator used to sort map output keys.
     *
     * @param comparator key comparator class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder sort(Class<? extends RawComparator> comparator) throws IOException {
        _jobConf.setOutputKeyComparatorClass(comparator);
        return this;
    }

    /**
     * Set the comparator used to group keys for a single reduce call.
     *
     * @param comparator grouping comparator class
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder group(Class<? extends RawComparator> comparator) throws IOException {
        _jobConf.setOutputValueGroupingComparator(comparator);
        return this;
    }

    /**
     * Set the partitioner class.
     *
     * @param partitioner partitioner implementation
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder partition(Class<? extends Partitioner> partitioner) throws IOException {
        _jobConf.setPartitionerClass(partitioner);
        return this;
    }

    /**
     * Enable or disable speculative execution for the job.
     *
     * @param enabled whether speculative execution is on
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder speculativeExecution(boolean enabled) throws IOException {
        _jobConf.setSpeculativeExecution(enabled);
        return this;
    }

    /**
     * Enable speculative execution for map tasks.
     *
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder speculativeMapExecution() throws IOException {
        _jobConf.setMapSpeculativeExecution(true);
        return this;
    }

    /**
     * Enable speculative execution for reduce tasks.
     *
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder speculativeReducerExecution() throws IOException {
        _jobConf.setReduceSpeculativeExecution(true);
        return this;
    }

    /**
     * Set the maximum number of attempts per map task.
     *
     * @param maxAttempts attempt limit
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder maxMapAttempts(int maxAttempts) throws IOException {
        _jobConf.setMaxMapAttempts(maxAttempts);
        return this;
    }

    /**
     * Set the maximum number of attempts per reduce task.
     *
     * @param maxAttempts attempt limit
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder maxReduceAttempts(int maxAttempts) throws IOException {
        _jobConf.setMaxReduceAttempts(maxAttempts);
        return this;
    }

    /**
     * Store {@code pctMappersComplete} under {@code mapred.reduce.slowstart.completed.maps}.
     *
     * <p>NOTE(review): despite the "pct" name, Hadoop appears to read this setting as a
     * fraction between 0 and 1 rather than a 0-100 percentage; confirm against callers.
     *
     * @param pctMappersComplete value written to the slow-start setting
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder delayReducersUntil(float pctMappersComplete) throws IOException {
        _jobConf.setFloat("mapred.reduce.slowstart.completed.maps", pctMappersComplete);
        return this;
    }

    /**
     * Set the tolerated percentage of failed map tasks.
     *
     * @param percent failure percentage passed to {@link JobConf#setMaxMapTaskFailuresPercent(int)}
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder maxMapTaskFailures(int percent) throws IOException {
        _jobConf.setMaxMapTaskFailuresPercent(percent);
        return this;
    }

    /**
     * Build a node affinity mask from {@code affinityPath} and store it in the job config.
     * Does nothing when {@code affinityPath} is {@code null}.
     *
     * @param affinityPath path handed to {@code NodeAffinityMaskBuilder}; may be {@code null}
     * @return this builder
     * @throws IOException if the file system cannot be obtained or the mask cannot be built
     */
    public JobBuilder setAffinity(Path affinityPath) throws IOException {
        if (affinityPath != null) {
            // set node affinity ...
            String affinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask(FileSystem.get(_jobConf),
                    affinityPath, null);

            NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);
        }

        return this;
    }

    /**
     * Set how many tasks a single task JVM may execute (overrides the constructor default).
     *
     * @param numberOfTimes value passed to {@link JobConf#setNumTasksToExecutePerJvm(int)}
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder reuseJVM(int numberOfTimes) throws IOException {
        _jobConf.setNumTasksToExecutePerJvm(numberOfTimes);
        return this;
    }

    /**
     * Build a node affinity mask from {@code affinityPath}, passing along an exclusion
     * set, and store it in the job config. Does nothing when {@code affinityPath} is
     * {@code null}, matching {@link #setAffinity(Path)}.
     *
     * @param affinityPath path handed to {@code NodeAffinityMaskBuilder}; may be {@code null}
     * @param exclusionSet exclusion set forwarded to {@code NodeAffinityMaskBuilder}
     * @return this builder
     * @throws IOException if the file system cannot be obtained or the mask cannot be built
     */
    public JobBuilder setAffinity(Path affinityPath, Set<String> exclusionSet) throws IOException {
        if (affinityPath != null) {
            // set node affinity ...
            String affinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask(FileSystem.get(_jobConf),
                    affinityPath, null, exclusionSet);

            NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);
        }

        return this;
    }

    /**
     * Variant of {@link #setAffinity(Path, Set)} that additionally forwards the configured
     * {@code mapred.tasktracker.reduce.tasks.maximum} (or -1 when unset) and a {@code true}
     * flag to {@code NodeAffinityMaskBuilder}. Does nothing when {@code affinityPath} is
     * {@code null}, matching {@link #setAffinity(Path)}.
     *
     * @param affinityPath path handed to {@code NodeAffinityMaskBuilder}; may be {@code null}
     * @param exclusionSet exclusion set forwarded to {@code NodeAffinityMaskBuilder}
     * @return this builder
     * @throws IOException if the file system cannot be obtained or the mask cannot be built
     */
    public JobBuilder setAffinityNoBalancing(Path affinityPath, Set<String> exclusionSet) throws IOException {
        if (affinityPath != null) {
            // set node affinity ...
            String affinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask(FileSystem.get(_jobConf),
                    affinityPath, null, exclusionSet,
                    _jobConf.getInt("mapred.tasktracker.reduce.tasks.maximum", -1), true);

            NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);
        }

        return this;
    }

    /**
     * Set {@code mapred.min.split.size}.
     *
     * @param minSplitSize value to store
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder minSplitSize(long minSplitSize) throws IOException {
        _jobConf.setLong("mapred.min.split.size", minSplitSize);
        return this;
    }

    /**
     * Return the assembled job configuration.
     *
     * @return the builder's own {@link JobConf} instance (not a copy)
     * @throws IOException declared for API compatibility
     */
    public JobConf build() throws IOException {
        return _jobConf;
    }

    /**
     * Compose a timestamped temp path of the form
     * {@code <mapred.temp.dir>/<baseName>-<currentTimeMillis>}, falling back to
     * {@code "."} when {@code mapred.temp.dir} is not configured. The path is only
     * constructed here; nothing is created on any file system.
     *
     * @param conf     configuration consulted for {@code mapred.temp.dir}
     * @param baseName prefix of the final path component
     * @return the composed path
     * @throws IOException declared for API compatibility
     */
    public static Path tempDir(Configuration conf, String baseName) throws IOException {
        String parent = conf.get("mapred.temp.dir", ".");
        return new Path(parent + "/" + baseName + "-" + System.currentTimeMillis());
    }

    /**
     * Set an arbitrary string property on the job config.
     *
     * @param key   property name
     * @param value property value
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder set(String key, String value) throws IOException {
        _jobConf.set(key, value);
        return this;
    }

    /**
     * Set an arbitrary long property on the job config.
     *
     * @param key   property name
     * @param value property value
     * @return this builder
     * @throws IOException declared for API compatibility
     */
    public JobBuilder set(String key, long value) throws IOException {
        _jobConf.setLong(key, value);
        return this;
    }

    /**
     * Use {@code cls} to locate the job jar. A built-in (identity) class is only
     * consulted when no job jar has been set yet, so that it cannot overwrite a jar
     * already selected from a user-supplied class.
     */
    private void adoptJarOf(Class<?> cls, boolean builtIn) {
        if (!builtIn || _jobConf.getJar() == null) {
            _jobConf.setJarByClass(cls);
        }
    }

}