Java tutorial
package com.splout.db.benchmark;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat.FieldSelector;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleTextOutputFormat;
import com.datasalt.pangool.utils.HadoopUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * This Job implements an identity Map/Reduce in two ways: one using the plain Hadoop API and the other one using the
 * Pangool API for parsing CSV files. With this Job we can measure 1) the overhead of using the Splout store generator
 * tools against a plain identity Hadoop Job and 2) which part of this overhead is only due to parsing CSV files with
 * Pangool.
 */
public class IdentityJob implements Tool {

  @Parameter(required = true, names = { "-i", "--inputpath" }, description = "The input path for the identity Job. Must be textual files.")
  private String inputPath;

  @Parameter(required = true, names = { "-o", "--outputpath" }, description = "The output path for the identity Job.")
  private String outputPath;

  @Parameter(required = false, names = { "-ps", "--pangoolSchema" }, description = "Provide a Pangool schema and Pangool will be used for parsing the input text file into a Tuple. Using this option one can measure the overhead of using Pangool's textual input format.")
  private String pangoolSchema = null;

  @Parameter(required = false, names = { "-gb", "--groupBy" }, description = "If pangoolSchema is provided, a groupBy clause must be provided too. Use a field in your schema that spreads the data as evenly across reducers as possible.")
  private String groupBy = null;

  // Basic CSV parsing parameters, optionally used if pangoolSchema != null, can be overridden
  // --------------------------------//
  @Parameter(names = { "-sep", "--separator" }, description = "The separator character of your text input file, defaults to a space.")
  private String separator = " ";

  @Parameter(names = { "-quo", "--quotes" }, description = "The quotes character of your input file, defaults to none.")
  private String quotes = TupleTextInputFormat.NO_QUOTE_CHARACTER + "";

  @Parameter(names = { "-esc", "--escape" }, description = "The escape character of your input file, defaults to none.")
  private String escape = TupleTextInputFormat.NO_ESCAPE_CHARACTER + "";

  @Parameter(names = { "-sh", "--skipheading" }, description = "Specify this flag for skipping the header line of your text file.")
  private boolean skipHeading = false;
  // --------------------------------//

  private Configuration conf;

  @Override
  public Configuration getConf() {
    // Return the Configuration injected by ToolRunner via setConf()
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
      jComm.parse(params);
    } catch(ParameterException e) {
      System.err.println(e.getMessage());
      jComm.usage();
      System.exit(-1);
    }

    // Clean the output path before launching either flavor of the job
    Path outP = new Path(outputPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    if(pangoolSchema == null) {
      // Use plain Hadoop API: an identity Job with the default Mapper and Reducer
      Job job = new Job(conf);
      job.setInputFormatClass(TextInputFormat.class);
      FileInputFormat.setInputPaths(job, inputPath);
      FileOutputFormat.setOutputPath(job, outP);

      job.waitForCompletion(true);
    } else {
      if(groupBy == null) {
        System.err.println("If pangoolSchema is used, groupBy must also be used.");
        jComm.usage();
        System.exit(-1);
      }

      Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
      Path inputP = new Path(inputPath);

      // Use Pangool API - parse CSV, etc
      TupleMRBuilder builder = new TupleMRBuilder(conf);
      TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
          separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
      TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
          quotes.charAt(0), escape.charAt(0));

      builder.addIntermediateSchema(schema);
      builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
      builder.setGroupByFields(groupBy);
      builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
      builder.setTupleReducer(new IdentityTupleReducer());
      builder.setJarByClass(this.getClass());

      builder.createJob().waitForCompletion(true);
    }

    return 0;
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new IdentityJob(), args);
  }
}
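For reference, both benchmark modes can also be driven programmatically through ToolRunner instead of from the command line. The sketch below is a minimal, hypothetical driver: the class name, input/output paths, separator, schema string and the "user" group-by field are example values, not part of the original job.

package com.splout.db.benchmark;

import org.apache.hadoop.util.ToolRunner;

public class IdentityJobExample {

  public static void main(String[] args) throws Exception {
    // 1) Plain Hadoop identity pass: no Pangool schema, so no CSV parsing overhead.
    ToolRunner.run(new IdentityJob(),
        new String[] { "-i", "input/logs", "-o", "output/plain" });

    // 2) Pangool pass: each line is parsed into a Tuple according to the schema
    //    and grouped by a field that should spread the data evenly across reducers.
    ToolRunner.run(new IdentityJob(),
        new String[] { "-i", "input/logs", "-o", "output/pangool",
            "-ps", "user:string,ts:long,value:double",
            "-gb", "user",
            "-sep", "," });
  }
}

Comparing the wall-clock time of the two runs on the same input gives the overhead of the Pangool/Splout path; the second run only differs in that lines are parsed into Tuples and shuffled by the group-by field.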