com.ebay.erl.mobius.core.MobiusJob.java Source code

Introduction

Here is the source code for com.ebay.erl.mobius.core.MobiusJob.java

Source

package com.ebay.erl.mobius.core;

import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.util.Tool;

import com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder;
import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.builder.DatasetBuildersFactory;
import com.ebay.erl.mobius.core.mapred.ConfigurableJob;
import com.ebay.erl.mobius.core.model.Column;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.core.sort.Sorter;

/**
 * Main class of the Mobius API. Extend this class 
 * to create a Mobius data processing flow.
 * <p>
 * 
 * This product is licensed under the Apache License, Version 2.0, 
 * available at http://www.apache.org/licenses/LICENSE-2.0.
 * 
 * This product contains portions derived from Apache Hadoop, which is 
 * licensed under the Apache License, Version 2.0, available at 
 * http://hadoop.apache.org.
 * 
 * © 2007 - 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
 */
@SuppressWarnings({ "deprecation", "unchecked" })
public abstract class MobiusJob extends Configured implements Tool, Serializable {
    private static final long serialVersionUID = -9070202196576655916L;

    private static final Log LOGGER = LogFactory.getLog(MobiusJob.class);

    transient Map<URI/*output*/, Job> jobTopology = new HashMap<URI, Job>();

    transient Set<String> inputPaths = new HashSet<String>();

    transient List<Path> tempFiles = new LinkedList<Path>();

    private transient FileSystem fs;

    /**
     * Returns a copy of the Hadoop job configuration.
     * <p>
     * Note that this method creates a new {@link Configuration}
     * from the default one every time, so changes made 
     * to the returned {@link Configuration} won't affect the conf 
     * returned by the next call of {@link #getConf()}.
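     * <p>
     * A minimal sketch of this behavior (the property name
     * <code>my.example.key</code> is just an illustration):
     * <pre>
     * <code>
     * Configuration c1 = this.getConf();
     * c1.set("my.example.key", "foo");
     * 
     * // c2 is a fresh copy, so it does not see the change made to c1.
     * Configuration c2 = this.getConf();
     * c2.get("my.example.key"); // returns null
     * </code>
     * </pre>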
     */
    @Override
    public Configuration getConf() {
        Configuration conf = super.getConf() == null ? new Configuration() : super.getConf();
        Configuration clone = new Configuration();
        Iterator<Entry<String, String>> it = conf.iterator();
        while (it.hasNext()) {
            Entry<String, String> entry = it.next();
            clone.set(entry.getKey(), entry.getValue());
        }
        return clone;
    }

    /**
     * Tests whether the given <code>input</code> is the output of another job.
     * 
     * @param input input path of a job.
     * @return <code>true</code> if the <code>input</code> is the output
     * path of another job, <code>false</code> otherwise.
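     * <p>
     * A minimal sketch (assuming <code>/tmp/job_a_out</code> was registered
     * earlier as another job's output via {@link #addToExecQueue(Configuration)}):
     * <pre>
     * <code>
     * this.isOutputOfAnotherJob(new Path("/tmp/job_a_out")); // true
     * this.isOutputOfAnotherJob(new Path("/data/raw"));      // false
     * </code>
     * </pre>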
     */
    public boolean isOutputOfAnotherJob(Path input) {
        // normalize the input first, in case it doesn't 
        // contain a scheme (e.g., hdfs:// or file://).
        Path p = this.getFS().makeQualified(input);
        LOGGER.info("Current Path Key:" + this.jobTopology.keySet());
        LOGGER.info(p.toUri() + " is the output of another job? " + this.jobTopology.containsKey(p.toUri()));

        return this.jobTopology.containsKey(p.toUri());
    }

    /**
     * Tests whether the given <code>input</code> is the output of another job.
     * 
     * @param input input path of a job.
     * @return <code>true</code> if the <code>input</code> is the output
     * path of another job, <code>false</code> otherwise.
     */
    public boolean isOutputOfAnotherJob(String input) {
        return this.isOutputOfAnotherJob(new Path(input));
    }

    /**
     * Selects the <code>columns</code> from the <code>dataset</code> and stores
     * the result in <code>outputFolder</code> with the given <code>outputFormat</code>.
     * <p>
     * 
     * Here is an example:
     * <pre>
     * <code>
     * public class MyJob extends MobiusJob
     * {
     *    public void run(String[] args) throws Exception
     *    {
     *       Dataset students = ...;
     *       
     *       // save the result to $OUTPUT in SequenceFileOutputFormat,
     *       // the key will be NullWritable, and the value is a Tuple 
     *       // which contains 3 columns, id, f_name and l_name.
     *       this.list(students,
     *          new Path("$OUTPUT"),
     *          SequenceFileOutputFormat.class,
     *          new Column(students, "id"),
     *          new Column(students, "f_name"),
     *          new Column(students, "l_name")
     *       ); 
     *    }
     *    
     *    public static void main(String[] args) throws Exception
     *    {
     *       System.exit(MobiusJobRunner.run(new MyJob(), args));
     *    }
     * }
     * </code>
     * </pre>
     */
    public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat,
            Column... columns) throws IOException {
        byte datasetID = 0; // set to 0 as there is only one dataset involved.

        JobConf job = dataset.createJobConf(datasetID);

        job.setJarByClass(this.getClass());
        job.setNumReduceTasks(0); // list is a map-only job
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Tuple.class);
        // setJobName also sets the "mapred.job.name" property.
        job.setJobName("List " + dataset.getName());

        JobSetup.validateColumns(dataset, columns);
        JobSetup.setupInputs(job, dataset, datasetID);
        JobSetup.setupProjections(job, dataset, datasetID, columns);
        JobSetup.setupOutputs(job, outputFolder, outputFormat);

        this.addToExecQueue(job);

        AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat,
                "Dataset_" + outputFolder.getName());
        return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns));
    }

    /**
     * Selects the <code>columns</code> from the <code>dataset</code> and stores
     * the result in <code>outputFolder</code>.
     * <p>
     * The output format is {@link TextOutputFormat}.
     * <p>
     * 
     * Here is an example:
     * <pre>
     * <code>
     * public class MyJob extends MobiusJob
     * {
     *    public void run(String[] args) throws Exception
     *    {
     *       Dataset students = ...;
     *       
     *       // save the result to $OUTPUT in TextOutputFormat,
     *       // output will be tab delimited files with 3 columns,
     *       // id, f_name and l_name.
     *       //
     *       // To change the delimiter, pass -Dmobius.tuple.tostring.delimiter=YOUR_DELIMITER
     *       // on the command line when submitting the job. 
     *       this.list(students,
     *          new Path("$OUTPUT"),          
     *          new Column(students, "id"),
     *          new Column(students, "f_name"),
     *          new Column(students, "l_name")
     *       ); 
     *    }
     *    
     *    public static void main(String[] args) throws Exception
     *    {
     *       System.exit(MobiusJobRunner.run(new MyJob(), args));
     *    }
     * }
     * </code>
     * </pre>
     * 
     */
    public Dataset list(Dataset dataset, Path outputFolder, Column... columns) throws IOException {
        return this.list(dataset, outputFolder, TextOutputFormat.class, columns);
    }

    /**
     * Selects the <code>columns</code> from the <code>dataset</code>.
     * <p>
     * 
     * The output path is a temporary path under <code>hadoop.tmp.dir</code>, and the output
     * format is {@link SequenceFileOutputFormat}.
     * <p>
     * 
     * Here is an example:
     * <pre>
     * <code>
     * public class MyJob extends MobiusJob
     * {
     *    public void run(String[] args) throws Exception
     *    {
     *       Dataset students = ...;
     *       
     *       this.list(students, 
     *          new Column(students, "id"),
     *          new Column(students, "f_name"),
     *          new Column(students, "l_name")
     *       ); 
     *    }
     *    
     *    public static void main(String[] args) throws Exception
     *    {
     *       System.exit(MobiusJobRunner.run(new MyJob(), args));
     *    }
     * }
     * </code>
     * </pre>
     */
    public Dataset list(Dataset dataset, Column... columns) throws IOException {
        return this.list(dataset, this.newTempPath(), SequenceFileOutputFormat.class, columns);
    }

    /**
     * Performs a "Left Outer Join"; the result contains all the records of
     * the left {@linkplain Dataset} (the 1st argument), with
     * or without a match in the right {@linkplain Dataset}.
     * <p>
     * 
     * If, in a join group, there are no records from the right {@linkplain Dataset} 
     * (the 2nd argument), then by default <code>null</code> (if the output format is 
     * {@link SequenceFileOutputFormat}) or an empty string (if the output format is 
     * {@link TextOutputFormat}) is written for the selected columns from 
     * the right {@linkplain Dataset}.
     * <p>
     * 
     * If <code>nullReplacement</code> is not null, it is used as 
     * the value for the columns from the right dataset when there is no match in a 
     * join group.
     * <p>
     * 
     * Composing a <code>leftOuterJoin</code> is almost the same as composing
     * an {@link MobiusJob#innerJoin(Dataset...)} job; instead of calling
     * <code>innerJoin</code>, simply call 
     * <code>leftOuterJoin(Dataset, Dataset, Object)</code>. 
     * <p>
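     * A sketch of a left-outer-join flow (the <code>employees</code> and
     * <code>departments</code> datasets, their columns, and the <code>"N/A"</code>
     * replacement are illustrative assumptions):
     * <pre>
     * <code>
     * this
     * .leftOuterJoin(employees, departments, "N/A")
     * .on( new EQ(new Column(employees, "dept_id"), new Column(departments, "dept_id")) )
     * .save(this, new Path("$OUTPUT"),
     *    new Column(employees, "emp_id"),
     *    new Column(departments, "dept_name") // "N/A" when there is no matching department
     * );
     * </code>
     * </pre>
     * <p>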
     * 
     * @param left left-hand side {@link Dataset}
     * @param right right-hand side {@link Dataset} 
     * @param nullReplacement the value to be used for null columns; 
     * it can only be a type supported by {@link Tuple}
     * 
     */
    public JoinOnConfigure leftOuterJoin(Dataset left, Dataset right, Object nullReplacement) throws IOException {
        Configuration conf = this.getConf();
        conf.setBoolean(ConfigureConstants.IS_OUTER_JOIN, true);
        return new JoinOnConfigure(nullReplacement, conf, left, right);
    }

    /**
     * Performs a "Left Outer Join"; the result contains all the records of
     * the left {@linkplain Dataset} (the 1st argument), with
     * or without a match in the right {@linkplain Dataset}.
     * <p>
     * 
     * If, in a join group, there are no records from the right {@linkplain Dataset} 
     * (the 2nd argument), then by default <code>null</code> (if the output format is 
     * {@link SequenceFileOutputFormat}) or an empty string (if the output format is 
     * {@link TextOutputFormat}) is written for the selected columns from 
     * the right {@linkplain Dataset}.
     * <p>
     * 
     * Composing a <code>leftOuterJoin</code> is almost the same as composing
     * an {@link MobiusJob#innerJoin(Dataset...)} job; instead of calling
     * <code>innerJoin</code>, simply call 
     * <code>leftOuterJoin(Dataset, Dataset)</code>. 
     * <p>
     * 
     * @param left left-hand side {@link Dataset}
     * @param right right-hand side {@link Dataset}
     * 
     */
    public JoinOnConfigure leftOuterJoin(Dataset left, Dataset right) throws IOException {
        return this.leftOuterJoin(left, right, null);
    }

    /**
     * Performs a "Right Outer Join"; the result contains all the records of
     * the right {@linkplain Dataset} (the 2nd argument), with or without a match 
     * in the left {@linkplain Dataset}.
     * <p>
     * 
     * If, in a join group, there are no records from the left {@linkplain Dataset} 
     * (the 1st argument), then by default <code>null</code> (if the output format is 
     * {@link SequenceFileOutputFormat}) or an empty string (if the output format is 
     * {@link TextOutputFormat}) is written for the selected columns from 
     * the left {@linkplain Dataset}.
     * <p>
     * 
     * If <code>nullReplacement</code> is not null, it is used as 
     * the value for the columns from the left dataset when there is no match in a 
     * join group.
     * <p>
     * 
     * Composing a <code>rightOuterJoin</code> is almost the same as composing
     * an {@link MobiusJob#innerJoin(Dataset...)} job; instead of calling
     * <code>innerJoin</code>, simply call 
     * <code>rightOuterJoin(Dataset, Dataset, Object)</code>. 
     * <p>
     * 
     * @param left left-hand side {@link Dataset}
     * @param right right-hand side {@link Dataset}
     * @param nullReplacement the value to be used for null columns; 
     * it can only be a type supported by {@link Tuple}
     */
    public JoinOnConfigure rightOuterJoin(Dataset left, Dataset right, Object nullReplacement) throws IOException {
        // leverage the leftOuterJoin by exchanging the position
        // of left and right dataset.
        return leftOuterJoin(right, left, nullReplacement);
    }

    /**
     * Performs a "Right Outer Join"; the result contains all the records of
     * the right {@linkplain Dataset} (the 2nd argument), with or without a match 
     * in the left {@linkplain Dataset}.
     * <p>
     * 
     * If, in a join group, there are no records from the left {@linkplain Dataset} 
     * (the 1st argument), then by default <code>null</code> (if the output format is 
     * {@link SequenceFileOutputFormat}) or an empty string (if the output format is 
     * {@link TextOutputFormat}) is written for the selected columns from 
     * the left {@linkplain Dataset}.
     * <p>
     * 
     * Composing a <code>rightOuterJoin</code> is almost the same as composing
     * an {@link MobiusJob#innerJoin(Dataset...)} job; instead of calling
     * <code>innerJoin</code>, simply call 
     * <code>rightOuterJoin(Dataset, Dataset)</code>. 
     * <p>
     * 
     * @param left left-hand side {@link Dataset}
     * @param right right-hand side {@link Dataset}
     */
    public JoinOnConfigure rightOuterJoin(Dataset left, Dataset right) throws IOException {
        return this.rightOuterJoin(left, right, null);
    }

    /**
     * Performs an inner join on the given <code>datasets</code>.
     * <p>
     * 
     * The number of <code>datasets</code> must be >= 2.
     * One can join <b>more than two {@link Dataset}s at once</b>
     * only if the datasets share a join key, i.e., they have
     * columns with the same meaning; the names of
     * the columns don't have to be the same, but the content 
     * (values) of the columns must be.
     * <p>
     * 
     * From a performance perspective, the <b>biggest dataset
     * </b> should be placed at the <b>rightmost position</b>.  Size 
     * here is measured by the number of values per join 
     * key, <b>NOT</b> by the total number of records in a dataset.
     * <p>
     * 
     * Here is an example of how to create an inner join job:
     * <pre>
     * <code>
     * public class MyJob extends MobiusJob
     * {
     *    public void run(String[] args) throws Exception
     *    {
     *      Dataset students = ...;
     *      Dataset courses = ...;
     * 
     *      this
     *      .innerJoin(students, courses)
     *      .on( new EQ(new Column(students, "student_id"), new Column(courses, "student_id")) )
     *      .save(this, new Path("$OUTPUT"),
     *         new Column(students, "student_id"), 
     *         new Column(students, "f_name"),
     *         new Column(students, "l_name"),
     *         new Column(courses, "c_title")
     *      );
     *    }
     *    
     *    public static void main(String[] args) throws Exception
     *    {
     *       System.exit(MobiusJobRunner.run(new MyJob(), args));
     *    }
     * }
     * </code>
     * </pre>
     */
    public JoinOnConfigure innerJoin(Dataset... datasets) {
        return new JoinOnConfigure(this.getConf(), datasets);
    }

    /**
     * Start a group-by job.
     * <p>
     * 
     * Groups the given <code>aDataset</code> by
     * certain column(s) (to be specified in the returned
     * {@link GroupByConfigure}).
     * <p>
     * 
     * Here is an example of a group-by job:
     * <pre>
     * <code>
     * public class MyJob extends MobiusJob
     * {
     *    public void run(String[] args) throws Exception
     *    {
     *       .....
     *       this
     *       .group(order)
     *       .by(new Column(order, "order_person_id"))
     *       .save(this,
     *          new Path("$OUTPUT_PATH"),
     *          new Column(order, "order_person_id"),
     *          new Max(new Column(order, "order_id")));
     *    }
     * 
     *    public static void main(String[] args) throws Exception
     *    {
     *       System.exit(MobiusJobRunner.run(new MyJob(), args));
     *    }
     * }
     * </code>
     * </pre>
     */
    public GroupByConfigure group(Dataset aDataset) {
        return new GroupByConfigure(this.getConf(), aDataset);
    }

    /**
     * Performs a total sort on <code>aDataset</code>.
     * <p>
     * 
     * After the job has finished, if the output files are 
     * concatenated together, the values in the files 
     * are sorted according to the given {@link Sorter}s.
     * <p>
     * 
     * Here is an example of how to start a <code>sort</code>
     * job:
     * 
     * <pre>
     * <code>
     * public class MyJob extends MobiusJob
     * {
     *    public void run(String[] args) throws Exception
     *    {
     *       .....
     *      this
     *      .sort(person)
     *      .select(
     *         new Column(person, "age"),
     *         new Column(person, "gender"),
     *         new Column(person, "fname"),
     *         new Column(person, "lname"))
     *      .orderBy(
     *         new Sorter(new Column(person, "age"), Ordering.ASC, true),
     *         new Sorter(new Column(person, "gender"), Ordering.DESC, true))
     *      .save(
     *         this,
     *         new Path("$OUTPUT")
     *      );
     *    }
     * 
     *    public static void main(String[] args) throws Exception
     *    {
     *       System.exit(MobiusJobRunner.run(new MyJob(), args));
     *    }
     * }
     * </code>
     * </pre>
     */
    public SortProjectionConfigure sort(Dataset aDataset) throws IOException {
        return new SortProjectionConfigure(this.getConf(), aDataset);
    }

    protected FileSystem getFS() {
        if (this.fs == null) {
            try {
                this.fs = FileSystem.get(this.getConf());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        return this.fs;
    }

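    /**
     * Deletes all the temporary folders created by {@link #newTempPath()};
     * intended to be invoked after the whole flow has completed.
     */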
    void deleteTempFiles() throws IOException {
        LOGGER.info("Cleanning temporal files...");

        for (Path aTempFile : this.tempFiles) {
            if (!this.getFS().delete(aTempFile, true)) {
                LOGGER.warn("Cannot delete temp file:" + aTempFile.toString());
            } else {
                LOGGER.info(aTempFile.toString() + " deleted.");
            }
        }
        LOGGER.info("All temporal files are deleted.");
    }

    /**
     * Creates an empty folder under <code>hadoop.tmp.dir</code>, named by the current timestamp.
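     * <p>
     * A sketch (the <code>hadoop.tmp.dir</code> value and the timestamp are
     * illustrative):
     * <pre>
     * <code>
     * // with hadoop.tmp.dir=/tmp/hadoop this returns something like
     * // /tmp/hadoop/1346731230000; the folder is remembered and deleted
     * // once the whole flow has completed.
     * Path tmp = this.newTempPath();
     * </code>
     * </pre>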
     */
    public Path newTempPath() throws IOException {
        Path tmp = new Path(this.getConf().get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis()));
        while (this.getFS().exists(tmp)) {
            tmp = new Path(this.getConf().get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis()));
        }

        if (!this.getFS().mkdirs(tmp)) {
            throw new IOException("Cannot create temp folder:" + tmp.toString() + ".");
        }

        // remember the temp folder so it can be deleted after
        // this job has completed.
        this.tempFiles.add(tmp);

        return tmp;
    }

    /**
     * Adds a job, represented by the <code>aNewJobConf</code> object, to the execution queue.
     * <p>
     * 
     * Users can use this method to add one or more job configurations to the queue; the Mobius 
     * engine analyzes the queued jobs to work out the dependencies between them.  
     * For example, if job B's input is the output of job A, then job B won't be submitted 
     * until A has completed successfully.  If A fails, B will not be submitted.
     * <p>
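     * A sketch of the dependency detection (the paths below are
     * illustrative assumptions):
     * <pre>
     * <code>
     * JobConf jobA = ...; // with mapred.output.dir = /tmp/a_out
     * JobConf jobB = ...; // with mapred.input.dir  = /tmp/a_out
     * 
     * this.addToExecQueue(jobA);
     * this.addToExecQueue(jobB); // jobB reads jobA's output, so the
     *                            // engine marks jobB as depending on jobA.
     * </code>
     * </pre>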
     *  
     * 
     * @param aNewJobConf a {@link Configuration} object that represents a Hadoop job. 
     * @throws IOException
     */
    protected void addToExecQueue(Configuration aNewJobConf) throws IOException {
        // Add the new job into the execution engine and resolve
        // its dependencies, if any.
        //
        // To resolve the dependencies, we need to analyze the input
        // path(s) of this new job.
        // 
        // The inputs of a job could be:
        // 1) if aNewJob is not a derived job (i.e., its input is not the
        // result of another MR job), the inputs can be retrieved from
        // "mapred.input.dir", or from {@link MultipleInputs} (e.g., when
        // joining different types of datasets).
        // 2) if aNewJob is a derived job, its input is the output of one
        // or more previous MR jobs.

        String inputFolders = aNewJobConf.get("mapred.input.dir", "");
        if (inputFolders.length() == 0) {
            // the value of "mapred.input.dir" is empty, assuming the inputs of this job 
            // are coming from {@link MultipleInputs}.

            String multipleInputs = aNewJobConf
                    .get("mapred.input.dir.mappers"/* for using old MultipleInputs, v0.20.X */, aNewJobConf.get(
                            "mapreduce.input.multipleinputs.dir.formats"/* for new MultipleInputs, v0.23.X */, ""));

            if (multipleInputs.length() > 0) {
                // the input paths of this job come from MultipleInputs; extract them.
                // The format from {@link MultipleInputs} is:
                // hadoop_path1;corresponding_mapper1,hadoop_path2;corresponding_mapper2,...
                String[] pathAndMapperPairs = multipleInputs.split(",");
                for (String aPair : pathAndMapperPairs) {
                    // each pair is "path;mapper"; only the path part matters here.
                    String path = aPair.split(";")[0];

                    if (inputFolders.length() == 0) {
                        inputFolders = getPathOnly(path);
                    } else {
                        inputFolders = inputFolders + "," + getPathOnly(path);
                    }
                }
            } else {
                throw new IllegalArgumentException("Cannot find the input path(s) of job: ["
                        + aNewJobConf.get("mapred.job.name") + "] in any of the following attributes: "
                        + "mapred.input.dir, mapred.input.dir.mappers, or mapreduce.input.multipleinputs.dir.formats. "
                        + "Please specify the input path(s) of this job.");
            }
        } else {
            // the input path of this job is specified in mapred.input.dir
            inputFolders = getPathOnly(inputFolders);
        }

        ////////////////////////////////////////////////////////////
        // validate output path of this job, to ensure it doesn't
        // use the same folder of another job's output.
        ////////////////////////////////////////////////////////////
        String outputPath = aNewJobConf.get("mapred.output.dir", "");
        if (outputPath.isEmpty())
            throw new IllegalStateException(
                    "Please specify the output directory of job:" + aNewJobConf.get("mapred.job.name"));

        if (this.isOutputOfAnotherJob(outputPath)) {
            // look up the conflicting job using the same qualified URI key
            // that was used when it was registered in jobTopology.
            URI qualifiedOutput = this.getFS().makeQualified(new Path(outputPath)).toUri();
            throw new IllegalArgumentException("Job [" + aNewJobConf.get("mapred.job.name") + "]'s output ["
                    + outputPath + "] is the output of job[" + jobTopology.get(qualifiedOutput).getJobName() + "], "
                    + "please make sure to use a different output folder for each job.");
        }

        //////////////////////////////////////////////////////////////////
        // pass all the validation, start to build the dependencies.
        //////////////////////////////////////////////////////////////////
        Job newJob = new ConfigurableJob(new JobConf(aNewJobConf, this.getClass()));

        newJob.setJobName(aNewJobConf.get("mapred.job.name", aNewJobConf.get("mapreduce.job.name", "Mobius Job")));
        for (String anInputOfNewJob : inputFolders.split(",")) {
            // Added to track inputs for local PC sampling
            inputPaths.add(anInputOfNewJob);

            Job dependsOn = jobTopology.get(this.getFS().makeQualified(new Path(anInputOfNewJob)).toUri());
            if (dependsOn != null) {
                List<Job> dependingJobs = newJob.getDependingJobs();

                boolean alreadyInDependency = dependingJobs != null && dependingJobs.contains(dependsOn);
                if (!alreadyInDependency) {
                    LOGGER.info(newJob.getJobName() + " depends on " + dependsOn.getJobName());
                    newJob.addDependingJob(dependsOn);
                } // else: already in the dependency list, do nothing.
            }
        }

        // put the output of this <code>newJob</code> into the job topology
        // so that if a later job reads this <code>newJob</code>'s output
        // as its input, the system can detect the dependency.

        URI outputPathURI = this.getFS().makeQualified(new Path(outputPath)).toUri();
        LOGGER.info("Adding Job:" + newJob.getJobName() + "\tOutput:[" + outputPath + "]");
        jobTopology.put(outputPathURI, newJob);
    }

    /**
     * Returns only the "path" part of the input URI.
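     * <p>
     * For example (the namenode host and folder are illustrative):
     * <pre>
     * <code>
     * getPathOnly("hdfs://namenode:9000/user/joe/data"); // returns "/user/joe/data"
     * </code>
     * </pre>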
     */
    protected String getPathOnly(String uriStr) {
        try {
            URI uri = new URI(uriStr);
            return uri.getPath();
        } catch (URISyntaxException e) {
            LOGGER.error(e);
            throw new IllegalArgumentException(e);
        }
    }

}