Java tutorial: Pangool Flow's Step base class (com.datasalt.pangool.flow)
/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.flow;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;

import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;

/**
 * This is the base low-level class that executable jobs must extend in order to be used with {@link BaseFlow}.
 * <p>
 * Each job must declare its input paths, named outputs and input parameters in its constructor with {@link Inputs},
 * {@link NamedOutputs} and {@link Params}.
 * <p>
 * Each job must have a unique name, which is also specified in the constructor. The name is used for locating the
 * input / output resources of a job. For instance, the output of a job is located as "name.output". These unique
 * strings can then be bound in {@link BaseFlow} to make connections between resources.
 * <p>
 * Jobs implement their logic in {@link #run(Path, Map, Map)}. Parameters and input / output paths are parsed
 * automatically and passed as arguments to that method. Help and parameter error handling are handled transparently.
 */
@SuppressWarnings("serial")
public abstract class Step implements Configurable, Tool, Serializable {

  protected transient Output output;
  protected transient String help; // extra explanation on what this Job does
  protected transient List<Input> inputs;
  protected transient List<String> namedOutputs;
  protected transient Params parameters;
  protected transient Configuration hadoopConf;

  // Specifies the number of reducers to use in this Job
  protected transient int nReducers = -1;

  public Step(String name, Inputs inputs) {
    this(new Output(name), inputs, Params.NONE, NamedOutputs.NONE, null);
  }

  public Step(String name, Inputs inputs, Params parameters) {
    this(new Output(name), inputs, parameters, NamedOutputs.NONE, null);
  }

  public Step(String name, Inputs inputs, Params parameters, NamedOutputs namedOutputs) {
    this(new Output(name), inputs, parameters, namedOutputs, null);
  }

  public Step(String name, Inputs inputs, Params parameters, NamedOutputs namedOutputs, String help) {
    this(new Output(name), inputs, parameters, namedOutputs, help);
  }

  public Step(Output output, Inputs inputs, Params parameters, NamedOutputs namedOutputs, String help) {
    this.output = output;
    this.help = help;
    this.inputs = inputs;
    this.namedOutputs = namedOutputs;
    this.parameters = parameters;
  }

  private String hadoopConfName(Param param) {
    // Same convention as input / output paths: name.resource
    return getName() + "." + param.getName();
  }

  private List<String> help() {
    List<String> helpLines = new ArrayList<String>();
    helpLines.add(output.name + (help == null ? "" : "(" + help + ")") + " - Usage:");
    helpLines.add("");
    helpLines.add("Parameters:");
    helpLines.add("");
    for(Param param : parameters) {
      helpLines.add("\t -D " + hadoopConfName(param) + " : " + param);
    }
    helpLines.add("");
    helpLines.add("Input paths:");
    helpLines.add("");
    for(Input input : inputs) {
      helpLines.add("\t--" + input.name);
    }
    helpLines.add("");
    return helpLines;
  }

  @Override
  public int run(String[] args) throws Exception {
    Map<String, Path> parsedInputs = new HashMap<String, Path>();
    Map<String, Object> parsedParameters = new HashMap<String, Object>();
    Path outputPath = null;

    // Parse the declared input paths and the output path from the command-line arguments
    for(int i = 0; i < args.length; i++) {
      for(Input input : inputs) {
        if(args[i].equals("-" + input.name) || args[i].equals("--" + input.name)) {
          parsedInputs.put(input.name, new Path(args[++i]));
          continue;
        }
      }
      if(args[i].equals("-output") || args[i].equals("--output")) {
        outputPath = new Path(args[++i]);
      }
    }

    List<String> errors = new ArrayList<String>();
    if(outputPath == null) {
      errors.add("Output path is mandatory. Use -output or --output for it.");
    }
    for(Input input : inputs) {
      if(!parsedInputs.containsKey(input.name)) {
        errors.add("Missing required input path: [" + input.name + "]. Please add it with -" + input.name
            + " or --" + input.name);
      }
    }

    // Parse the declared parameters from the Hadoop Configuration, keyed as "name.parameter"
    for(Param parameter : parameters) {
      String hadoopConfName = hadoopConfName(parameter);
      Object val = hadoopConf.get(hadoopConfName);
      if(val == null) {
        continue;
      }
      try {
        if(parameter.getClazz().equals(Integer.class)) {
          val = Integer.parseInt(val + "");
        } else if(parameter.getClazz().equals(Float.class)) {
          val = Float.parseFloat(val + "");
        } else if(parameter.getClazz().equals(Double.class)) {
          val = Double.parseDouble(val + "");
        } else if(parameter.getClazz().equals(String[].class)) {
          // support for list of strings
          val = hadoopConf.getStrings(hadoopConfName);
        } else if(parameter.getClazz().equals(Long.class)) {
          val = Long.parseLong(val + "");
        } else if(parameter.getClazz().equals(Boolean.class)) {
          val = Boolean.parseBoolean(val + "");
        }
        parsedParameters.put(parameter.getName(), val);
      } catch(Throwable e) {
        e.printStackTrace();
        errors.add("Error when parsing parameter: " + parameter + " : " + e);
      }
    }
    for(Param param : parameters) {
      if(!parsedParameters.containsKey(param.getName())) {
        errors.add("Missing required parameter: [" + param + "]. Please add it with -D " + hadoopConfName(param));
      }
    }

    // If anything is missing or malformed, report the errors and print the usage help
    if(errors.size() > 0) {
      for(String error : errors) {
        System.err.println(error);
      }
      for(String help : help()) {
        System.out.println(help);
      }
      return -1;
    }

    return run(outputPath, parsedInputs, parsedParameters);
  }

  /**
   * Method that Job implementations must implement to execute the job's business logic. It receives the output path,
   * input paths and parameters already parsed.
   */
  public abstract int run(Path outputPath, Map<String, Path> parsedInputs, Map<String, Object> parsedParameters)
      throws Exception;

  /**
   * Convenience method that can be used by Jobs for executing Pangool's {@link TupleMRBuilder} instances.
   */
  public int executeCoGrouper(TupleMRBuilder coGrouper) throws IOException, TupleMRException, InterruptedException,
      ClassNotFoundException {
    Job job = coGrouper.createJob();
    try {
      if(nReducers > 0) {
        job.getConfiguration().setInt("mapred.reduce.tasks", nReducers);
      }
      if(job.waitForCompletion(true)) {
        return 1;
      }
      return -1;
    } finally {
      coGrouper.cleanUpInstanceFiles();
    }
  }

  public String getOutputName() {
    return output.name + "." + NamedOutputs.OUTPUT;
  }

  public String getNamedOutputName(String namedOutput) {
    return output.name + "." + NamedOutputs.OUTPUT + "." + namedOutput;
  }

  public String getName() {
    return output.name;
  }

  public List<Input> getInputs() {
    return inputs;
  }

  public List<String> getNamedOutputs() {
    return namedOutputs;
  }

  public Params getParameters() {
    return parameters;
  }

  @Override
  public void setConf(Configuration hadoopConf) {
    this.hadoopConf = hadoopConf;
  }

  @Override
  public Configuration getConf() {
    return hadoopConf;
  }

  public String toString() {
    return output.name;
  }

  public void setNReducers(int nReducers) {
    this.nReducers = nReducers;
  }
}
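From the listing above, a concrete job only has to declare its resources in the constructor and implement run(Path, Map, Map); argument parsing, validation and help output are inherited from Step. Below is a minimal sketch of what such a subclass could look like. The class name WordCountStep, the input name "text", the parameter name "minCount", and the Inputs / Params / Param constructors used here are illustrative assumptions (only their getters appear in the listing), so the exact signatures should be checked against the Pangool flow sources.

// Minimal sketch of a Step subclass. Hypothetical names and constructors are marked below.
package com.datasalt.pangool.flow.examples;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;

import com.datasalt.pangool.flow.*;

public class WordCountStep extends Step {

  public WordCountStep() {
    // Declares one input named "text" and one Integer parameter named "minCount".
    // The Inputs / Params / Param constructors are assumptions based on the getters used in Step.
    // The step's unique name is "wordcount", so its output resource is "wordcount.output"
    // and the parameter is read from the Hadoop conf key "wordcount.minCount".
    super("wordcount", new Inputs("text"), new Params(new Param("minCount", Integer.class)));
  }

  @Override
  public int run(Path outputPath, Map<String, Path> parsedInputs, Map<String, Object> parsedParameters)
      throws Exception {
    Path textPath = parsedInputs.get("text");                        // already validated by Step.run(String[])
    Integer minCount = (Integer) parsedParameters.get("minCount");   // already parsed to its declared type

    // A real job would build a TupleMRBuilder from textPath / outputPath / minCount and hand it to
    // executeCoGrouper(builder), which also applies the setNReducers() setting.
    // return executeCoGrouper(builder);
    return 1; // in this codebase, executeCoGrouper() returns 1 on success
  }

  public static void main(String[] args) throws Exception {
    // ToolRunner parses -D flags into the Configuration before Step.run(String[]) reads them.
    System.exit(ToolRunner.run(new Configuration(), new WordCountStep(), args));
  }
}

Because Step implements Tool, the sketch could then be launched roughly as "hadoop jar myflow.jar WordCountStep -D wordcount.minCount=5 --text /data/text --output /data/wordcount" (jar name and paths are placeholders), matching the --input / --output flags and the "name.parameter" configuration convention that Step.run(String[]) parses above.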