com.ebay.erl.mobius.core.Persistable.java Source code

Java tutorial

Introduction

Here is the source code for com.ebay.erl.mobius.core.Persistable.java

Source

package com.ebay.erl.mobius.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder;
import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.builder.DatasetBuildersFactory;
import com.ebay.erl.mobius.core.criterion.TupleCriterion;
import com.ebay.erl.mobius.core.datajoin.DataJoinKey;
import com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner;
import com.ebay.erl.mobius.core.datajoin.DataJoinValue;
import com.ebay.erl.mobius.core.function.base.GroupFunction;
import com.ebay.erl.mobius.core.function.base.Projectable;
import com.ebay.erl.mobius.core.mapred.DefaultMobiusCombiner;
import com.ebay.erl.mobius.core.mapred.DefaultMobiusReducer;
import com.ebay.erl.mobius.core.model.Column;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;

/**
 * Sets the projections (the columns to be saved on disk)
 * for join or group-by jobs.
 * <p>
 *
 * Users cannot create an instance of this class
 * directly (the constructor is package-private).  To get
 * an instance of this class, use {@link JoinOnConfigure}
 * for join type jobs, or {@link GroupByConfigure} for
 * group-by jobs.
 * <p>
 *
 * See {@link MobiusJob#innerJoin(Dataset...)} or
 * {@link MobiusJob#group(Dataset)} for information
 * on creating a join or group-by job.
 * <p>
 *
 * This product is licensed under the Apache License,  Version 2.0,
 * available at http://www.apache.org/licenses/LICENSE-2.0.
 * <p>
 *
 * This product contains portions derived from Apache hadoop which is
 * licensed under the Apache License, Version 2.0, available at
 * http://hadoop.apache.org.
 * <p>
 *
 * &copy; 2007 - 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
 */
@SuppressWarnings({ "deprecation", "unchecked" })
public class Persistable {
    private static final Log LOGGER = LogFactory.getLog(Persistable.class);

    /**
     * Properties supplied through {@link #setConf(String, String)}.  Created
     * lazily and merged into {@link #jobConf} when the job is saved.
     */
    private Configuration userDefinedConf;

    /** The configuration of the Hadoop job being assembled. */
    private JobConf jobConf;

    /** The datasets participating in this join/group-by job. */
    private final Dataset[] datasets;

    /**
     * Creates a new instance working on its own copy of the given
     * configuration.
     *
     * @param jobConf the base configuration, it is copied into a new {@link JobConf}.
     * @param datasets the datasets participating in the join/group-by job.
     */
    Persistable(Configuration jobConf, Dataset... datasets) {
        this.jobConf = new JobConf(jobConf);
        this.datasets = datasets;
    }

    /**
     * Set a configuration property to this job's
     * configuration.
     * <p>
     *
     * The property is kept aside and merged into the job
     * configuration when one of the <code>save</code> or
     * <code>build</code> methods is called.
     *
     * @param name a property name in a Hadoop job configuration.
     * @param value the value for the property name in a Hadoop
     * job configuration.
     * @return this instance, to allow call chaining.
     */
    public Persistable setConf(String name, String value) {
        if (this.userDefinedConf == null) {
            this.userDefinedConf = new Configuration(false);
        }
        this.userDefinedConf.set(name, value);
        return this;
    }

    /**
     * Specify the name of this job (stored under <code>mapred.job.name</code>).
     *
     * @param newJobName the job name.
     * @return this instance, to allow call chaining.
     */
    public Persistable setJobName(String newJobName) {
        this.jobConf.set("mapred.job.name", newJobName);
        return this;
    }

    /**
     * Specify the number of reducers of this job (stored under
     * <code>mapred.reduce.tasks</code>).
     *
     * @param reducerNumber the number of reducers, must be greater than zero.
     * @return this instance, to allow call chaining.
     * @throws IllegalArgumentException if <code>reducerNumber</code> is zero or negative.
     */
    public Persistable setReducersNumber(int reducerNumber) {
        if (reducerNumber <= 0) {
            throw new IllegalArgumentException("number of reducer must grater than 0.");
        }

        this.jobConf.setInt("mapred.reduce.tasks", reducerNumber);
        return this;
    }

    /**
     * Build the dataset and store the <code>projections</code>
     * into a temporary path (obtained from <code>job.newTempPath()</code>)
     * in the format of {@link SequenceFileOutputFormat}.
     */
    public Dataset build(MobiusJob job, Projectable... projections) throws IOException {
        return this.build(job, SequenceFileOutputFormat.class, projections);
    }

    /**
     * Build the dataset and store the <code>projections</code>
     * into a temporary path (obtained from <code>job.newTempPath()</code>)
     * in the format of {@link SequenceFileOutputFormat}.
     * <p>
     *
     * Only the rows that meet the <code>criteria</code> will be
     * stored.  The <code>criteria</code> can only evaluate the
     * columns specified in the <code>projections</code>.
     */
    public Dataset build(MobiusJob job, TupleCriterion criteria, Projectable... projections) throws IOException {
        return this.build(job, SequenceFileOutputFormat.class, criteria, projections);
    }

    /**
     * Build the dataset and store the <code>projections</code>
     * into a temporary path (obtained from <code>job.newTempPath()</code>)
     * in the format of the given <code>outputFormat</code>.
     */
    public Dataset build(MobiusJob job, Class<? extends FileOutputFormat> outputFormat, Projectable... projections)
            throws IOException {
        return this.build(job, outputFormat, null, projections);
    }

    /**
     * Build the dataset and store the <code>projections</code>
     * into a temporary path (obtained from <code>job.newTempPath()</code>)
     * in the format of the given <code>outputFormat</code>.
     * <p>
     *
     * Only the rows that meet the <code>criteria</code> will be
     * stored.  The <code>criteria</code> can only evaluate the
     * columns specified in the <code>projections</code>.
     *
     * @param job the job the new Hadoop job is queued on.
     * @param outputFormat the format used to write the output.
     * @param criteria if specified (not null), only rows that satisfy the given <code>criteria</code>
     * will be saved.  Note that <code>criteria</code> is applied just before the persistent step, so
     * it can only operate on the columns in the output schema of this job.
     * @param projections the columns to be saved in the returned {@link Dataset}.
     * @return a {@link Dataset} with the specified columns.
     * @throws IOException
     */
    public Dataset build(MobiusJob job, Class<? extends FileOutputFormat> outputFormat, TupleCriterion criteria,
            Projectable... projections) throws IOException {
        return this.save(job, job.newTempPath(), outputFormat, criteria, projections);
    }

    /**
     * Save the dataset and store the <code>projections</code>
     * into the specified <code>output</code> path in the
     * format of {@link TextOutputFormat}.
     * <p>
     *
     * <code>output</code> will be deleted before the job gets started.
     */
    public Dataset save(MobiusJob job, Path output, Projectable... projections) throws IOException {
        return this.save(job, output, TextOutputFormat.class, null, projections);
    }

    /**
     * Save the dataset and store the <code>projections</code>
     * into the specified <code>output</code> path in the
     * format of {@link TextOutputFormat}.
     * <p>
     *
     * Only the rows that meet the <code>criteria</code> will be
     * stored.  The <code>criteria</code> can only evaluate the
     * columns specified in the <code>projections</code>.
     * <p>
     *
     * <code>output</code> will be deleted before the job gets started.
     */
    public Dataset save(MobiusJob job, Path output, TupleCriterion criteria, Projectable... projections)
            throws IOException {
        return this.save(job, output, TextOutputFormat.class, criteria, projections);
    }

    /**
     * Save the dataset and store the <code>projections</code>
     * into the specified <code>output</code> path in the
     * format of the given <code>outputFormat</code>.
     * <p>
     *
     * <code>output</code> will be deleted before the job gets started.
     */
    public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
            Projectable... projections) throws IOException {
        return this.save(job, output, outputFormat, null, projections);
    }

    /**
     * Save the dataset and store the <code>projections</code>
     * into the specified <code>output</code> path in the
     * format of the given <code>outputFormat</code>.
     * <p>
     *
     * Only the rows that meet the <code>criteria</code> will be
     * stored.  The <code>criteria</code> can only evaluate the
     * columns specified in the <code>projections</code>.
     * <p>
     *
     * <code>output</code> will be deleted before the job gets started
     * (output handling is delegated to <code>JobSetup.setupOutputs</code>).
     * <p>
     *
     * This method validates its arguments, configures the Hadoop job,
     * adds it to the execution queue of <code>job</code> and returns
     * a {@link Dataset} describing the output of that job.
     *
     * @param job the job the new Hadoop job is queued on; its class is also used to locate the job jar.
     * @param output the output path.
     * @param outputFormat the format used to write the output.
     * @param criteria if not null, it is validated against the output column names and
     * stored in the job configuration.
     * @param projections the columns to be saved, at least one is required.
     * @return a {@link Dataset} built from the configured job, with the output columns of
     * the <code>projections</code> as its schema.
     * @throws IllegalArgumentException if no projection is given, if two output columns
     * share the same (case-insensitive) name, if a projection reads a column from a dataset
     * that is not part of this job, if some dataset of this job has no selected column, or
     * if there are more datasets than can be numbered with a <code>byte</code>.
     * @throws IOException
     */
    public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
            TupleCriterion criteria, Projectable... projections) throws IOException {
        if (projections == null || projections.length == 0) {
            throw new IllegalArgumentException("Please specify the output columns.");
        }

        // - VALIDATION - make sure there are no ambiguous column names.
        Set<String> columnNames = collectOutputColumnNames(projections);

        // - VALIDATION - if <code>criteria</code> is not null, need to make
        // sure the columns used in the criteria are in the output columns.
        if (criteria != null) {
            TupleCriterion.validate(columnNames, criteria);
            this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
        }

        // - VALIDATION - every projection must read from the selected datasets
        // only, and every selected dataset must contribute at least one column.
        Map<Dataset, List<Column>> datasetToColumns = this.groupInputColumnsByDataset(projections);
        if (datasetToColumns.size() != this.datasets.length) {
            throw new IllegalArgumentException(
                    "Please select at least one column from each dataset in the join/group-by job.");
        }

        // - VALIDATION - datasets are numbered with a byte counter below; fail
        // with a clear message instead of letting the counter wrap to a negative
        // array index.
        if (this.datasets.length > Byte.MAX_VALUE) {
            throw new IllegalArgumentException("At most " + Byte.MAX_VALUE
                    + " datasets can participate in a single join/group-by job, but got " + this.datasets.length
                    + ".");
        }

        // SETUP JOB
        if (this.userDefinedConf != null) {
            this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
        }
        this.jobConf.setJarByClass(job.getClass());
        this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
        this.jobConf.setMapOutputValueClass(DataJoinValue.class);
        this.jobConf.setPartitionerClass(DataJoinKeyPartitioner.class);
        this.jobConf.setOutputValueGroupingComparator(DataJoinKey.Comparator.class);
        this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
        this.jobConf.setReducerClass(DefaultMobiusReducer.class);
        this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));

        JobSetup.setupOutputs(this.jobConf, output, outputFormat);

        // setup input paths and projection columns for each dataset, the
        // position of a dataset in <code>datasets</code> is passed along as its ID.
        for (byte assignedDatasetID = 0; assignedDatasetID < this.datasets.length; assignedDatasetID++) {
            Dataset aDataset = this.datasets[assignedDatasetID];

            // setup input for each dataset
            JobSetup.setupInputs(this.jobConf, aDataset, assignedDatasetID);

            // setup projection for each dataset
            JobSetup.setupProjections(this.jobConf, aDataset, assignedDatasetID,
                    datasetToColumns.get(aDataset).toArray(new Column[0]));
        }

        // setup all dataset IDs, as a comma separated list appended to
        // whatever value the configuration already holds.
        for (Dataset aDataset : this.datasets) {
            Byte id = aDataset.getID();
            String existingIds = this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "");
            if (!existingIds.isEmpty()) {
                this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, existingIds + "," + id);
            } else {
                this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
            }
        }

        boolean isCombinable = this.canUseCombiner(projections);
        LOGGER.info("Using Combiner? " + isCombinable);
        if (isCombinable) {
            this.jobConf.setCombinerClass(DefaultMobiusCombiner.class);
        }

        job.addToExecQueue(this.jobConf);

        AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
                "Dataset_" + output.getName());

        return builder.buildFromPreviousJob(this.jobConf, outputFormat, flattenOutputSchema(projections));
    }

    /**
     * Collects the output column names of all the <code>projections</code>,
     * compared case-insensitively, and rejects duplicated names.
     * <p>
     *
     * The names are used as the output schema of the returned dataset, so
     * two output columns must not share the same name.
     *
     * @throws IllegalArgumentException if an output column name appears more than once.
     */
    private static Set<String> collectOutputColumnNames(Projectable[] projections) {
        Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
        for (Projectable aProjection : projections) {
            for (String anOutput : aProjection.getOutputSchema()) {
                // Set#add returns false when the name is already present.
                if (!columnNames.add(anOutput)) {
                    throw new IllegalArgumentException(anOutput + " from " + aProjection.toString()
                            + " is ambiguous, it has the same name "
                            + "as another selected projection in a different dataset, "
                            + "please use Column#setNewName(String) to change it.");
                }
            }
        }
        return columnNames;
    }

    /**
     * Groups the input columns required by the <code>projections</code>
     * by the dataset they come from, without duplicates within a dataset.
     *
     * @throws IllegalArgumentException if a column comes from a dataset which is not
     * one of the datasets of this job.
     */
    private Map<Dataset, List<Column>> groupInputColumnsByDataset(Projectable[] projections) {
        Map<Dataset, List<Column>> datasetToColumns = new HashMap<Dataset, List<Column>>();

        for (Projectable aFunc : projections) {
            for (Column aColumn : aFunc.getInputColumns()) {
                Dataset aDataset = aColumn.getDataset();

                if (!this.isSelectedDataset(aDataset)) {
                    // user select a column from a dataset that is not
                    // in the selected datasets in this join/group by job.
                    throw new IllegalArgumentException(aColumn.toString()
                            + " does not within the selected datasets "
                            + "in this join/group task, please select columns only from the selected datasets.");
                }

                List<Column> columnsInADataset = datasetToColumns.get(aDataset);
                if (columnsInADataset == null) {
                    columnsInADataset = new LinkedList<Column>();
                    datasetToColumns.put(aDataset, columnsInADataset);
                }

                if (!columnsInADataset.contains(aColumn)) {
                    columnsInADataset.add(aColumn);
                }
            }
        }
        return datasetToColumns;
    }

    /**
     * Tells whether <code>aDataset</code> equals one of the datasets of this job.
     */
    private boolean isSelectedDataset(Dataset aDataset) {
        for (Dataset aSelectedDataset : this.datasets) {
            if (aSelectedDataset.equals(aDataset)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Hands the job configuration to the <code>projections</code> and decides
     * whether a combiner can be used: it cannot as soon as one projection is
     * not combinable, or is a {@link GroupFunction} using the group key only.
     * <p>
     *
     * NOTE(review): as in the original implementation, the scan stops at the
     * first projection that disables the combiner, so the projections after
     * it do not receive the configuration here - confirm this is intended.
     */
    private boolean canUseCombiner(Projectable[] projections) {
        for (Projectable aFunc : projections) {
            aFunc.setConf(this.jobConf);

            if (!aFunc.isCombinable()) {
                LOGGER.info(aFunc.toString() + " is not combinable, #isCombinable() return false.");
                return false;
            }
            if (aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly()) {
                LOGGER.info(aFunc.toString()
                        + " is a group function and use group key as its input only, disable combiner.");
                return false;
            }
        }
        return true;
    }

    /**
     * Forms the output columns from the <code>projections</code>: the output
     * schemas of all the projections, concatenated in order.
     */
    private static String[] flattenOutputSchema(Projectable[] projections) {
        List<String> outputColumns = new ArrayList<String>();
        for (Projectable aProjection : projections) {
            for (String anOutputName : aProjection.getOutputSchema()) {
                outputColumns.add(anOutputName);
            }
        }
        return outputColumns.toArray(new String[0]);
    }
}