Java tutorial: PageCountsExample (Splout SQL with the Wikipedia pagecounts dataset)
package com.splout.db.examples;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat;
import com.datasalt.pangool.utils.HadoopUtils;
import com.splout.db.common.PartitionMap;
import com.splout.db.common.SploutHadoopConfiguration;
import com.splout.db.hadoop.*;
import com.splout.db.hadoop.TupleSampler.SamplingOptions;
import com.splout.db.hadoop.TupleSampler.SamplingType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.mortbay.log.Log;

import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;

/**
 * An advanced Splout example with the Wikipedia pagecounts dataset:
 * http://dom.as/2007/12/10/wikipedia-page-counters/
 */
@SuppressWarnings("serial")
public class PageCountsExample implements Tool, Serializable {

  @Parameter(required = true, names = { "-i", "--inputpath" }, description = "The input path that contains the pagecounts file tree.")
  private String inputPath;

  @Parameter(required = true, names = { "-np", "--npartitions" }, description = "The number of partitions to create.")
  private Integer nPartitions;

  @Parameter(required = true, names = { "-o", "--outputpath" }, description = "The output path.")
  private String outputPath;

  @Parameter(names = { "-d", "--deploy" }, description = "Will deploy the generated dataset.")
  private boolean deploy = false;

  @Parameter(names = { "-q", "--qnode" }, description = "If -d is used, this qnode will be used for deploying the dataset.")
  private String qnode = null;

  @Parameter(names = { "-ng", "--nogenerate" }, description = "If used, the dataset will not be generated; instead, it is expected to be found in the output (-o) path. Use this in conjunction with -d for deploying a previously generated dataset.")
  private boolean noGenerate = false;

  @Parameter(names = { "-r", "--repfactor" }, description = "The replication factor to use when deploying.")
  private Integer repFactor = 1;

  @Parameter(names = { "-m", "--memoryForIndexing" }, description = "The amount of memory to use in each Reducer for indexing, in bytes.")
  private Long memoryForIndexing = 268435456L; // 256 MB

  @Parameter(names = { "-gtf", "--generateTupleFiles" }, description = "This boolean parameter is made for benchmarking purposes. When enabled, no SQLite files will be generated, just plain binary TupleFiles. This allows us to compare the performance of the SQLite output format against a plain binary output format. This option is incompatible with -d (deploy).")
  private boolean generateTupleFiles = false;

  private transient Configuration conf;

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public int run(String[] args) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Page Counts example");
    try {
      jComm.parse(args);
    } catch (ParameterException e) {
      System.err.println(e.getMessage());
      jComm.usage();
      System.exit(-1);
    }

    boolean generate = !noGenerate; // just for clarifying

    if (generateTupleFiles && deploy) {
      System.err.println("Can't run a 'dry' TupleFile generation and deploy it.");
      jComm.usage();
      System.exit(-1);
    }

    Path outPath = new Path(outputPath);
    FileSystem outFs = outPath.getFileSystem(getConf());

    if (!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
      File nativeLibs = new File("native");
      if (nativeLibs.exists()) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
      }
    }

    if (generate) {
      Path inputPath = new Path(this.inputPath);
      FileSystem inputFileSystem = inputPath.getFileSystem(conf);
      FileStatus[] fileStatuses = inputFileSystem.listStatus(inputPath);

      // define the schema that the resultant table will have: date, hour, pagename, pageviews
      final Schema tableSchema = new Schema("pagecounts", Fields.parse("date:string, hour:string, pagename:string, pageviews:int"));
      // define the schema of the input files: projectcode, pagename, pageviews, bytes
      Schema fileSchema = new Schema("pagecountsfile", Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

      // instantiate a TableBuilder
      TableBuilder tableBuilder = new TableBuilder(tableSchema);

      // for every input file...
      for (FileStatus fileStatus : fileStatuses) {
        String fileName = fileStatus.getPath().getName().toString();
        // strip the date and the hour from the file name
        String fileDate = fileName.split("-")[1];
        String fileHour = fileName.split("-")[2].substring(0, 2);
        // instantiate a custom RecordProcessor to process the records of this file
        PageCountsRecordProcessor recordProcessor = new PageCountsRecordProcessor(tableSchema, fileDate, fileHour);
        // use the tableBuilder method for adding each of the files to the mix
        tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ', TupleTextInputFormat.NO_QUOTE_CHARACTER,
            TupleTextInputFormat.NO_ESCAPE_CHARACTER, false, false, TupleTextInputFormat.NO_NULL_STRING, fileSchema,
            recordProcessor);
      }

      // partition the dataset by pagename - which should give a fairly even distribution.
      tableBuilder.partitionBy("pagename");
      // create a compound index on (pagename, date) so that typical queries for the dataset will be fast
      tableBuilder.createIndex("pagename", "date");

      long nonExactPageSize = memoryForIndexing / 32000; // number of pages
      int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
      Log.info("Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing + "] and there are 32000 pages.");

      tableBuilder.initialSQL("pragma page_size=" + pageSize);
      // insertion order is very important for optimizing query speed because it makes data be co-located on disk
      tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));

      // instantiate a TablespaceBuilder
      TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();

      // we will partition this dataset in as many partitions as:
      tablespaceBuilder.setNPartitions(nPartitions);
      tablespaceBuilder.add(tableBuilder.build());
      // we turn a specific SQLite pragma on for making autocomplete queries fast
      tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");

      HadoopUtils.deleteIfExists(outFs, outPath);

      // finally, instantiate a TablespaceGenerator and execute it
      TablespaceGenerator tablespaceViewBuilder;

      if (generateTupleFiles) {
        // we subclass TablespaceGenerator to be able to run the generation without outputting the SQLite stores, for
        // benchmark comparisons.
        // In the future this feature may be useful in general for debugging store creation.
        tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath, this.getClass()) {

          @Override
          public void generateView(Configuration conf, SamplingType samplingType, SamplingOptions samplingOptions)
              throws Exception {

            prepareOutput(conf);

            final int nPartitions = tablespace.getnPartitions();
            if (nPartitions > 1) {
              partitionMap = sample(nPartitions, conf, samplingType, samplingOptions);
            } else {
              partitionMap = PartitionMap.oneShardOpenedMap();
            }

            writeOutputMetadata(conf);

            TupleMRBuilder builder = createMRBuilder(nPartitions, conf);
            // Set a TupleOutput here instead of SQLiteOutput
            builder.setOutput(new Path(outputPath, OUT_STORE), new TupleOutputFormat(tableSchema), ITuple.class,
                NullWritable.class);
            executeViewGeneration(builder);
          }
        };
      } else {
        // ... otherwise a standard TablespaceGenerator is used.
        tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath, this.getClass());
      }

      tablespaceViewBuilder.generateView(getConf(), SamplingType.FULL_SCAN, new TupleSampler.FullScanSamplingOptions());
    }

    if (deploy) {
      // use StoreDeployerTool for deploying the already generated dataset
      StoreDeployerTool deployer = new StoreDeployerTool(qnode, getConf());
      ArrayList<TablespaceDepSpec> deployments = new ArrayList<TablespaceDepSpec>();
      deployments.add(new TablespaceDepSpec("pagecounts", outPath.toString(), repFactor, null));
      deployer.deploy(deployments);
    }

    return 1;
  }

  public static final void main(String[] args) throws Exception {
    ToolRunner.run(new PageCountsExample(), args);
  }
}
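The listing relies on PageCountsRecordProcessor, a helper class that is not shown here. Conceptually, it turns each parsed input record (projectcode, pagename, pageviews, bytes) into a row of the target table (date, hour, pagename, pageviews), injecting the date and hour that run() strips from each file name. The snippet below is only a minimal sketch of that per-record transformation using Pangool tuples; it is not the real class (which plugs into Splout's RecordProcessor mechanism), and the class and method names are invented for illustration.

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;

/** Sketch only (hypothetical class): builds the tuple that would go into the "pagecounts" table. */
public class PageCountsTransformSketch {

  private final Schema tableSchema; // date, hour, pagename, pageviews
  private final String date;
  private final String hour;

  public PageCountsTransformSketch(Schema tableSchema, String date, String hour) {
    this.tableSchema = tableSchema;
    this.date = date;
    this.hour = hour;
  }

  /** Copies pagename/pageviews from the parsed file record and adds the date and hour from the file name. */
  public ITuple transform(ITuple fileRecord) {
    Tuple row = new Tuple(tableSchema);
    row.set("date", date);
    row.set("hour", hour);
    row.set("pagename", fileRecord.get("pagename"));
    row.set("pageviews", fileRecord.get("pageviews"));
    return row;
  }
}

To run the example, package it with its dependencies and launch it through Hadoop, passing the flags declared above. The jar name, paths and QNode address below are placeholders, not part of the example:

hadoop jar splout-hadoop-examples.jar com.splout.db.examples.PageCountsExample \
    -i /data/pagecounts -o /out/pagecounts-store -np 4 \
    -d -q http://qnode-host:4412

The -d and -q flags are only needed when deploying the generated store to a running Splout SQL QNode; -r sets the replication factor used for that deployment.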