Java tutorial
package com.ebay.erl.mobius.core; import java.io.IOException; import java.io.Serializable; import java.net.URI; import java.net.URISyntaxException; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.jobcontrol.Job; import org.apache.hadoop.util.Tool; import com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder; import com.ebay.erl.mobius.core.builder.Dataset; import com.ebay.erl.mobius.core.builder.DatasetBuildersFactory; import com.ebay.erl.mobius.core.mapred.ConfigurableJob; import com.ebay.erl.mobius.core.model.Column; import com.ebay.erl.mobius.core.model.Tuple; import com.ebay.erl.mobius.core.sort.Sorter; /** * Main class of the Mobius API. Extends this class * to create a Mobius data processing flow. * * * * This product is licensed under the Apache License, Version 2.0, * available at http://www.apache.org/licenses/LICENSE-2.0. * * This product contains portions derived from Apache hadoop which is * licensed under the Apache License, Version 2.0, available at * http://hadoop.apache.org. * * 2007 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan */ @SuppressWarnings({ "deprecation", "unchecked" }) public abstract class MobiusJob extends Configured implements Tool, Serializable { private static final long serialVersionUID = -9070202196576655916L; private static final Log LOGGER = LogFactory.getLog(MobiusJob.class); transient Map<URI/*output*/, Job> jobTopology = new HashMap<URI, Job>(); transient Set<String> inputPaths = new HashSet<String>(); transient List<Path> tempFiles = new LinkedList<Path>(); private transient FileSystem fs; /** * Return the Hadoop job configuration. * <p> * Note that, this method creates a new {@link Configuration} * from the default one every time, so changes that are made * to the returned {@link Configuration} won't affect the conf * returned by the next call of {@link #getConf()}. */ @Override public Configuration getConf() { Configuration conf = super.getConf() == null ? new Configuration() : super.getConf(); Configuration clone = new Configuration(); Iterator<Entry<String, String>> it = conf.iterator(); while (it.hasNext()) { Entry<String, String> entry = it.next(); clone.set(entry.getKey(), entry.getValue()); } return clone; } /** * Test if the given <code>input</code> is the output of another job or not * * @param input input path of a job. * @return <code>true</code> if the <code>input</code> is the output * path of another job, <code>false</code> otherwise. */ public boolean isOutputOfAnotherJob(Path input) { // normalize the input first, in case of it doesn't // contain schema (hdfs://, or file:// for example.) Path p = this.getFS().makeQualified(input); LOGGER.info("Current Path Key:" + this.jobTopology.keySet()); LOGGER.info(p.toUri() + " is the output of another job? " + this.jobTopology.containsKey(p.toUri())); return this.jobTopology.containsKey(p.toUri()); } /** * Test if the given <code>input</code> is the output of another job or not * * @param input input path of a job * @return <code>true</code> if the <code>input</code> is the output * path of another job, <code>false</code> otherwise. */ public boolean isOutputOfAnotherJob(String input) { return this.isOutputOfAnotherJob(new Path(input)); } /** * Select the <code>columns</code> from the <code>dataset</code>, store * it into <code>outputFolder</code> with the given <code>outputFormat</code> * <p> * * Here is an example: * <pre> * <code> * public MyJob extends MobiusJob * { * public void run(String[] args) * { * Dataset students = ...; * * // save the result to $OUTPUT in SequenceFileOutputFormat, * // the key will be NullWritable, and the value is a Tuple * // which contains 3 columns, id, f_name and l_name. * this.list(students, * new Path("$OUTPUT"), * SequenceFileOutputFormat.class, * new Column(students, "id"), * new Column(students, "f_name"), * new Column(students, "l_name") * ); * } * * public static void main(String[] args) throw Exception * { * System.exit(MobiusJobRunner.run(new MyJob(), args)); * } * } * </code> * </pre> */ public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat, Column... columns) throws IOException { byte datasetID = 0;// set to 0 as there is only one dataset to be operated. JobConf job = dataset.createJobConf(datasetID); job.set("mapred.job.name", "Listing " + dataset.getName()); job.setJarByClass(this.getClass()); job.setNumReduceTasks(0); // list is map only job job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Tuple.class); job.setJobName("List " + dataset.getName()); JobSetup.validateColumns(dataset, columns); JobSetup.setupInputs(job, dataset, datasetID); JobSetup.setupProjections(job, dataset, datasetID, columns); JobSetup.setupOutputs(job, outputFolder, outputFormat); this.addToExecQueue(job); AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat, "Dataset_" + outputFolder.getName()); return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns)); } /** * Select the <code>columns</code> from the <code>dataset</code> and store * it into <code>outputFolder</code>. * <p> * The output format is {@link TextOutputFormat}. * <p> * * Here is an example: * <pre> * <code> * public MyJob extends MobiusJob * { * public void run(String[] args) * { * Dataset students = ...; * * // save the result to $OUTPUT in TextOutputFormat, * // output will be tab delimited files with 3 columns, * // id, f_name and l_name. * // * // To change the delimiter, put -Dmobius.tuple.tostring.delimiter=YOUR_DELIMITER * // when submitting a job in command line. * this.list(students, * new Path("$OUTPUT"), * new Column(students, "id"), * new Column(students, "f_name"), * new Column(students, "l_name") * ); * } * * public static void main(String[] args) throw Exception * { * System.exit(MobiusJobRunner.run(new MyJob(), args)); * } * } * </code> * </pre> * */ public Dataset list(Dataset dataset, Path outputFolder, Column... columns) throws IOException { return this.list(dataset, outputFolder, TextOutputFormat.class, columns); } /** * Select the <code>columns</code> from the <code>dataset</code>. * <p> * * The output path is a temporal path under hadoop.tmp.dir, and the output * format is {@link SequenceFileOutputFormat}. * <p> * * Here is an example: * <pre> * <code> * public MyJob extends MobiusJob * { * public void run(String[] args) * { * Dataset students = ...; * * this.list(students, * new Column(students, "id"), * new Column(students, "f_name"), * new Column(students, "l_name") * ); * } * * public static void main(String[] args) throw Exception * { * System.exit(MobiusJobRunner.run(new MyJob(), args)); * } * } * </code> * </pre> */ public Dataset list(Dataset dataset, Column... columns) throws IOException { return this.list(dataset, this.newTempPath(), SequenceFileOutputFormat.class, columns); } /** * Performing "Left Outer Join", the result contains all the records of * the left {@linkplain Dataset} (the 1st {@linkplain Dataset}) with * or without match to the right {@linkplain Dataset}. * <p> * * If in a join group, there is no records from the right {@linkplain Dataset} * (the 2nd argument), by default, <code>null</code>(if the output format is * SequenceFileOutputFormat) or empty string (if the output format is * {@link TextOutputFormat}) is written for the selected columns from * the right {@linkplain Dataset}. * <p> * * If <code>nullReplacement</code> is not null, then it will be used as * the value for the columns from the right dataset when no match in a * join group. * <p> * * To compose a <code>leftOuterJoin</code> is almost the same as composing * a {@link MobiusJob#innerJoin(Dataset...)} job except that instead of calling * <code>innerJoin</code>, simply change it to * <code>leftOuterJoin(Dataset, Dataset, Object)</code>. * <p> * * @param left left-hand side {@link Dataset} * @param right right-hand side {@link Dataset} * @param nullReplacement the value to be used as the value for null columns, * it can be only the type supported by {@link Tuple} * */ public JoinOnConfigure leftOuterJoin(Dataset left, Dataset right, Object nullReplacement) throws IOException { Configuration conf = this.getConf(); conf.setBoolean(ConfigureConstants.IS_OUTER_JOIN, true); return new JoinOnConfigure(nullReplacement, conf, left, right); } /** * Performing "Left Outer Join", the result contains all the records of * the left {@linkplain Dataset} (the 1st {@linkplain Dataset}) with * or without match to the right {@linkplain Dataset}. * <p> * * If in a join group, there is no records from the right {@linkplain Dataset} * (the 2nd argument), by default, <code>null</code>(if the output format is * SequenceFileOutputFormat) or empty string (if the output format is * {@link TextOutputFormat}) is written for the selected columns from * the right {@linkplain Dataset}. * <p> * * To compose a <code>leftOuterJoin</code> is almost the same as composing * a {@link MobiusJob#innerJoin(Dataset...)} job except that instead of calling * <code>innerJoin</code>, simply change it to * <code>leftOuterJoin(Dataset, Dataset)</code>. * <p> * * @param left left-hand side {@link Dataset} * @param right right-hand side {@link Dataset} * */ public JoinOnConfigure leftOuterJoin(Dataset left, Dataset right) throws IOException { return this.leftOuterJoin(left, right, null); } /** * Performing "Right Outer Join", the result contains all the records of * the right {@linkplain Dataset} (the 2nd argument) with or without match * to the left {@linkplain Dataset}. * <p> * * If in a join group, there is no records from the right {@linkplain Dataset} * (the 2nd argument), by default, <code>null</code>(if the output format is * SequenceFileOutputFormat) or empty string (if the output format is * {@link TextOutputFormat}) is written for the selected columns from * the left {@linkplain Dataset} * <p> * * If <code>nullReplacement</code> is not null, then it will be used as * the value for the columns from the left dataset when no match in a * join group. * <p> * * To compose a <code>rightOuterJoin</code> is almost the same as composing * a {@link MobiusJob#innerJoin(Dataset...)} job except that instead of calling * <code>innerJoin</code>, simply change it to * <code>rightOuterJoin(Dataset, Dataset, Object)</code>. * <p> * * @param left left-hand side {@link Dataset} * @param right right-hand side {@link Dataset} * @param nullReplacement the value to be used as the value for null columns, * it can be only the type supported by {@link Tuple} */ public JoinOnConfigure rightOuterJoin(Dataset left, Dataset right, Object nullReplacement) throws IOException { // leverage the leftOuterJoin by exchanging the position // of left and right dataset. return leftOuterJoin(right, left, nullReplacement); } /** * Performing "Right Outer Join", the result contains all the records of * the right {@linkplain Dataset} (the 2nd argument) with or without match * to the left {@linkplain Dataset}. * <p> * * If in a join group, there is no records from the right {@linkplain Dataset} * (the 2nd argument), by default, <code>null</code>(if the output format is * SequenceFileOutputFormat) or empty string (if the output format is * {@link TextOutputFormat}) is written for the selected columns from * the left {@linkplain Dataset} * <p> * * To compose a <code>rightOuterJoin</code> is almost the same as composing * a {@link MobiusJob#innerJoin(Dataset...)} job except that instead of calling * <code>innerJoin</code>, simply change it to * <code>rightOuterJoin(Dataset, Dataset)</code>. * <p> * * @param left left-hand side {@link Dataset} * @param right right-hand side {@link Dataset} * @param nullReplacement the value to be used as the value for null columns, * it can be only the type supported by {@link Tuple} */ public JoinOnConfigure rightOuterJoin(Dataset left, Dataset right) throws IOException { return this.rightOuterJoin(left, right, null); } /** * Perform inner join on the given <code>datasets</code>. * <p> * * The number of <code>datasets</code> must >= 2. * One can join <b>more than two {@link Dataset} at once</b> * only if the datasets have a shared key, i.e., they have * columns that share the same meaning, the name of * the columns don't have to be the same, but the content * (value) of the columns need to be the same. * <p> * * Form the performance perspective, the <b>biggest dataset * </b> should be placed in the <b>right most side</b>. The * <b>bigness</b> is measured in terms of values in a join * key, <b>NOT</b> by the total number of records of a dataset. * <p> * * Here is an example of how to create a inner join job: * <pre> * <code> * public class MyJob extends MobiusJob * { * public void run(String[] args) throws Exception * { * Dataset students = ...; * Dataset courses = ...; * * this * .innerJoin(students, courses) * .on( new EQ(new Column(students, "student_id"), new Column(courses, "student_id")) ) * .save(this, new Path("$OUTPUT"), * new Column(students, "student_id"), * new Column(students, "f_name"), * new Column(students, "l_name"), * new Column(courses, "c_title") * ); * } * * public static void main(String[] args) throws Exception * { * System.exit(MobiusJobRunner.run(new MyJob(), args)); * } * } * </code> * </pre> */ public JoinOnConfigure innerJoin(Dataset... datasets) { return new JoinOnConfigure(this.getConf(), datasets); } /** * Start a group-by job. * <p> * * Group-by the given <code>aDataset</code> by * certain column(s) (to be specified in the returned * {@link GroupByConfigure}). * <p> * * Here is an example of group-by job: * <pre> * <code> * public class MyJob extends MobiusJob * { * public void run(String[] args) throws Exception * { * ..... * this * .group(order) * .by(new Column(order, "order_person_id")) * .save(this, * new Path("$OUTPUT_PATH"), * new Column(order, "order_person_id"), * new Max(new Column(order, "order_id"))); * } * * public static void main(String[] args) throws Exception * { * System.exit(MobiusJobRunner.run(new MyJob(), args)); * } * } * </code> * </pre> */ public GroupByConfigure group(Dataset aDataset) { return new GroupByConfigure(this.getConf(), aDataset); } /** * Performing a total sort on the aDataset. * <p> * * After the job has finished, concatenating * the out files together, the values in the files * are sorted according to the given {@link Sorter}. * <p> * * Here is an example of how to start a <code>sort</code> * job: * * <pre> * <code> * public MyJob extends MobiusJob * { * public void run(String[] args) throws Exception * { * ..... * this * .sort(person) * .select( * new Column(ds, "age"), * new Column(ds, "gender"), * new Column(ds, "fname"), * new Column(ds, "lname")) * .orderBy( * new Sorter(new Column(ds, "age"), Ordering.ASC, true), * new Sorter(new Column(ds, "gender"), Ordering.DESC, true)) * .save( * this, * new Path("$OUTPUT") * ); * } * * public static void main(String[] args) throws Exception * { * System.exit(MobiusJobRunner.run(new MyJob(), args)); * } * } * </code> * </pre> */ public SortProjectionConfigure sort(Dataset aDataset) throws IOException { return new SortProjectionConfigure(this.getConf(), aDataset); } protected FileSystem getFS() { if (this.fs == null) { try { this.fs = FileSystem.get(this.getConf()); } catch (IOException e) { throw new RuntimeException(e); } } return this.fs; } void deleteTempFiles() throws IOException { LOGGER.info("Cleanning temporal files..."); for (Path aTempFile : this.tempFiles) { if (!this.getFS().delete(aTempFile, true)) { LOGGER.warn("Cannot delete temp file:" + aTempFile.toString()); } else { LOGGER.info(aTempFile.toString() + " deleted."); } } LOGGER.info("All temporal files are deleted."); } /** * create an empty folder under hadoop.tmp.dir. */ public Path newTempPath() throws IOException { Path tmp = new Path(this.getConf().get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis())); while (this.getFS().exists(tmp)) { tmp = new Path(this.getConf().get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis())); } if (!this.getFS().mkdirs(tmp)) { throw new IOException("Cannot create temp file:" + tmp.toString() + "."); } // remember the temp file so it can be deleted after // this job has completed. this.tempFiles.add(tmp); return tmp; } /** * Add a job, represented by the <code>aNewJob</code> object, into the execution queue. * <p> * * Users can use this method to add one or more jobs' configuration into the job queue, and Mobius engine * will analyze the <code>aNewJob</code> objects within the queue to understand the dependence of jobs. * For example, if job B's input is from job A, then job B won't be submitted until A is completed * successfully. If A failed, the B will not be submitted. * <p> * * * @param aNewJobConf a {@link Configuration} object represents a Hadoop job. * @throws IOException */ protected void addToExecQueue(Configuration aNewJobConf) throws IOException { // Add the new job into execution engine and realize // its dependency, if any. // // To realize the job dependency, we need to analyze the input // path of this new job. // // The inputs of a job could be: // 1) if aNewJob is not a derived job (ex: result of another MR job), // then the inputs of the job can be retrieved from "mapred.input.dir", // or from {@link MultipleInputs} (ex, joining different type of dataset)/ // 2) if aNewJob is a derived job, the input is from the output of previous // MR job. String inputFolders = aNewJobConf.get("mapred.input.dir", ""); if (inputFolders.length() == 0) { // the value of "mapred.input.dir" is empty, assuming the inputs of this job // are coming from {@link MultipleInputs}. String multipleInputs = aNewJobConf .get("mapred.input.dir.mappers"/* for using old MultipleInputs, v0.20.X */, aNewJobConf.get( "mapreduce.input.multipleinputs.dir.formats"/* for new MultipleInputs, v0.23.X */, "")); if (multipleInputs.length() > 0) { // the input paths of this job is coming from MultipleInputs, extract the input paths. // The format from {@link MultipleInputs} is like: hadoop_path1;corresponding_mapper1,hadoop_path2;corresponding_mapper2... String[] pathAndMapperPairs = multipleInputs.split(","); for (String aPair : pathAndMapperPairs) { String[] pathToMapper = aPair.split(";"); String path = pathToMapper[0]; String mapper = pathToMapper[1]; if (inputFolders.length() == 0) { inputFolders = getPathOnly(path); } else { inputFolders = inputFolders + "," + getPathOnly(path); } } } else { throw new IllegalArgumentException("Cannot find input path(s) of job: [" + aNewJobConf.get("mapred.job.name") + "] from the following attributes: " + "mapred.input.dir, mapred.input.dir.mappers, nor mapreduce.input.multipleinputs.dir.formats. " + "Please specify the input path(s) of this job."); } } else { // the input path of this job is specified in mapred.input.dir inputFolders = getPathOnly(inputFolders); } //////////////////////////////////////////////////////////// // validate output path of this job, to ensure it doesn't // use the same folder of another job's output. //////////////////////////////////////////////////////////// String outputPath = aNewJobConf.get("mapred.output.dir", ""); if (outputPath.isEmpty()) throw new IllegalStateException( "Please specify the output directory of job:" + aNewJobConf.get("mapred.job.name")); if (this.isOutputOfAnotherJob(outputPath)) { throw new IllegalArgumentException("Job [" + aNewJobConf.get("mapred.job.name") + "]'s output [" + outputPath + "] is " + "the output of job[" + jobTopology.get(outputPath).getJobName() + "], " + "please make sure to use different output folder for each job."); } ////////////////////////////////////////////////////////////////// // pass all the validation, start to build the dependencies. ////////////////////////////////////////////////////////////////// Job newJob = new ConfigurableJob(new JobConf(aNewJobConf, this.getClass())); newJob.setJobName(aNewJobConf.get("mapred.job.name", aNewJobConf.get("mapreduce.job.name", "Mobius Job"))); for (String anInputOfNewJob : inputFolders.split(",")) { // Added to track inputs for local PC sampling inputPaths.add(anInputOfNewJob); Job dependsOn = jobTopology.get(this.getFS().makeQualified(new Path(anInputOfNewJob)).toUri()); if (dependsOn != null) { List<Job> dependingJobs = newJob.getDependingJobs(); boolean alreadyInDependency = dependingJobs != null && dependingJobs.contains(dependsOn); if (alreadyInDependency) { // already added, do nothing. } else { LOGGER.info(newJob.getJobName() + " depends on " + dependsOn.getJobName()); newJob.addDependingJob(dependsOn); } } } // put the output of this <code>newJob</code> into job topology // so that later if a job read this <code>newJob</code>'s output // as its input, then the system can detect the dependency. URI outputPathURI = this.getFS().makeQualified(new Path(outputPath)).toUri(); LOGGER.info("Adding Job:" + newJob.getJobName() + "\tOutput:[" + outputPath.toString() + "]"); jobTopology.put(outputPathURI, newJob); } /** * returning only the "path" part of the input URI. */ protected String getPathOnly(String uriStr) { try { URI uri = new URI(uriStr); return uri.getPath(); } catch (URISyntaxException e) { LOGGER.error(e); throw new IllegalArgumentException(e); } } }