Java tutorial: PageCountsExample (Splout SQL with the Wikipedia pagecounts dataset)
package com.splout.db.examples;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat;
import com.datasalt.pangool.utils.HadoopUtils;
import com.splout.db.common.PartitionMap;
import com.splout.db.common.SploutHadoopConfiguration;
import com.splout.db.hadoop.*;
import com.splout.db.hadoop.TupleSampler.SamplingOptions;
import com.splout.db.hadoop.TupleSampler.SamplingType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.mortbay.log.Log;

import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;

/**
 * An advanced Splout example with the Wikipedia pagecounts dataset:
 * http://dom.as/2007/12/10/wikipedia-page-counters/
 */
@SuppressWarnings("serial")
public class PageCountsExample implements Tool, Serializable {

  @Parameter(required = true, names = { "-i", "--inputpath" }, description = "The input path that contains the pagecounts file tree.")
  private String inputPath;

  @Parameter(required = true, names = { "-np", "--npartitions" }, description = "The number of partitions to create.")
  private Integer nPartitions;

  @Parameter(required = true, names = { "-o", "--outputpath" }, description = "The output path.")
  private String outputPath;

  @Parameter(names = { "-d", "--deploy" }, description = "Will deploy the generated dataset.")
  private boolean deploy = false;

  @Parameter(names = { "-q", "--qnode" }, description = "If -d is used, this qnode will be used for deploying the dataset.")
  private String qnode = null;

  @Parameter(names = { "-ng", "--nogenerate" }, description = "If used, the dataset will not be generated; instead, it is expected to be found in the output (-o) path. Use this in conjunction with -d for deploying a previously generated dataset.")
  private boolean noGenerate = false;

  @Parameter(names = { "-r", "--repfactor" }, description = "The replication factor to use when deploying.")
  private Integer repFactor = 1;

  @Parameter(names = { "-m", "--memoryForIndexing" }, description = "The amount of memory to use in each Reducer for indexing, in bytes.")
  private Long memoryForIndexing = 268435456L; // 256 MB

  @Parameter(names = { "-gtf", "--generateTupleFiles" }, description = "This boolean parameter is made for benchmarking purposes. When enabled, no SQLite files will be generated, just plain binary TupleFiles. This allows us to compare the performance of the SQLite output format against a plain binary output format. This option is incompatible with -d (deploy).")
  private boolean generateTupleFiles = false;

  private transient Configuration conf;

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public int run(String[] args) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Page Counts example");
    try {
      jComm.parse(args);
    } catch (ParameterException e) {
      System.err.println(e.getMessage());
      jComm.usage();
      System.exit(-1);
    }

    boolean generate = !noGenerate; // just for clarifying

    if (generateTupleFiles && deploy) {
      System.err.println("Can't run a 'dry' TupleFile generation and deploy it.");
      jComm.usage();
      System.exit(-1);
    }

    Path outPath = new Path(outputPath);
    FileSystem outFs = outPath.getFileSystem(getConf());

    if (!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
      File nativeLibs = new File("native");
      if (nativeLibs.exists()) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
      }
    }

    if (generate) {
      Path inputPath = new Path(this.inputPath);
      FileSystem inputFileSystem = inputPath.getFileSystem(conf);
      FileStatus[] fileStatuses = inputFileSystem.listStatus(inputPath);

      // define the schema that the resultant table will have: date, hour, pagename, pageviews
      final Schema tableSchema = new Schema("pagecounts", Fields.parse("date:string, hour:string, pagename:string, pageviews:int"));
      // define the schema of the input files: projectcode, pagename, pageviews, bytes
      Schema fileSchema = new Schema("pagecountsfile", Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

      // instantiate a TableBuilder
      TableBuilder tableBuilder = new TableBuilder(tableSchema);

      // for every input file...
      for (FileStatus fileStatus : fileStatuses) {
        String fileName = fileStatus.getPath().getName().toString();
        // strip the date and the hour from the file name
        String fileDate = fileName.split("-")[1];
        String fileHour = fileName.split("-")[2].substring(0, 2);
        // instantiate a custom RecordProcessor to process the records of this file
        PageCountsRecordProcessor recordProcessor = new PageCountsRecordProcessor(tableSchema, fileDate, fileHour);
        // use the tableBuilder method for adding each of the files to the mix
        tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ', TupleTextInputFormat.NO_QUOTE_CHARACTER,
            TupleTextInputFormat.NO_ESCAPE_CHARACTER, false, false, TupleTextInputFormat.NO_NULL_STRING, fileSchema,
            recordProcessor);
      }

      // partition the dataset by pagename - which should give a fairly even distribution.
      tableBuilder.partitionBy("pagename");
      // create a compound index on (pagename, date) so that typical queries for the dataset will be fast
      tableBuilder.createIndex("pagename", "date");

      long nonExactPageSize = memoryForIndexing / 32000; // number of pages
      int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
      Log.info("Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing + "] and there are 32000 pages.");

      tableBuilder.initialSQL("pragma page_size=" + pageSize);
      // insertion order is very important for optimizing query speed because it makes data be co-located on disk
      tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));

      // instantiate a TablespaceBuilder
      TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();

      // we will partition this dataset in as many partitions as:
      tablespaceBuilder.setNPartitions(nPartitions);
      tablespaceBuilder.add(tableBuilder.build());
      // we turn a specific SQLite pragma on for making autocomplete queries fast
      tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");

      HadoopUtils.deleteIfExists(outFs, outPath);

      // finally, instantiate a TablespaceGenerator and execute it
      TablespaceGenerator tablespaceViewBuilder;

      if (generateTupleFiles) {
        // we subclass TablespaceGenerator to be able to run the generation without outputting the SQLite stores, for
        // benchmark comparisons.
        // In the future this feature may be useful in general for debugging store creation.
        tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath, this.getClass()) {

          @Override
          public void generateView(Configuration conf, SamplingType samplingType, SamplingOptions samplingOptions)
              throws Exception {

            prepareOutput(conf);

            final int nPartitions = tablespace.getnPartitions();
            if (nPartitions > 1) {
              partitionMap = sample(nPartitions, conf, samplingType, samplingOptions);
            } else {
              partitionMap = PartitionMap.oneShardOpenedMap();
            }

            writeOutputMetadata(conf);

            TupleMRBuilder builder = createMRBuilder(nPartitions, conf);
            // Set a TupleOutput here instead of SQLiteOutput
            builder.setOutput(new Path(outputPath, OUT_STORE), new TupleOutputFormat(tableSchema), ITuple.class,
                NullWritable.class);
            executeViewGeneration(builder);
          }
        };
      } else {
        // ... otherwise a standard TablespaceGenerator is used.
        tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath, this.getClass());
      }

      tablespaceViewBuilder.generateView(getConf(), SamplingType.FULL_SCAN, new TupleSampler.FullScanSamplingOptions());
    }

    if (deploy) {
      // use StoreDeployerTool for deploying the already generated dataset
      StoreDeployerTool deployer = new StoreDeployerTool(qnode, getConf());
      ArrayList<TablespaceDepSpec> deployments = new ArrayList<TablespaceDepSpec>();
      deployments.add(new TablespaceDepSpec("pagecounts", outPath.toString(), repFactor, null));
      deployer.deploy(deployments);
    }

    return 1;
  }

  public static final void main(String[] args) throws Exception {
    ToolRunner.run(new PageCountsExample(), args);
  }
}
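The listing relies on PageCountsRecordProcessor, a helper class that is not shown here. Conceptually, it turns each parsed input record (projectcode, pagename, pageviews, bytes) into a row of the target table (date, hour, pagename, pageviews), injecting the date and hour that run() strips from each file name. The snippet below is only a minimal sketch of that per-record transformation using Pangool tuples; it is not the real class (which plugs into Splout's RecordProcessor mechanism), and the class and method names are invented for illustration.

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;

/** Sketch only (hypothetical class): builds the tuple that would go into the "pagecounts" table. */
public class PageCountsTransformSketch {

  private final Schema tableSchema; // date, hour, pagename, pageviews
  private final String date;
  private final String hour;

  public PageCountsTransformSketch(Schema tableSchema, String date, String hour) {
    this.tableSchema = tableSchema;
    this.date = date;
    this.hour = hour;
  }

  /** Copies pagename/pageviews from the parsed file record and adds the date and hour from the file name. */
  public ITuple transform(ITuple fileRecord) {
    Tuple row = new Tuple(tableSchema);
    row.set("date", date);
    row.set("hour", hour);
    row.set("pagename", fileRecord.get("pagename"));
    row.set("pageviews", fileRecord.get("pageviews"));
    return row;
  }
}

To run the example, package it with its dependencies and launch it through Hadoop, passing the flags declared above. The jar name, paths and QNode address below are placeholders, not part of the example:

hadoop jar splout-hadoop-examples.jar com.splout.db.examples.PageCountsExample \
    -i /data/pagecounts -o /out/pagecounts-store -np 4 \
    -d -q http://qnode-host:4412

The -d and -q flags are only needed when deploying the generated store to a running Splout SQL QNode; -r sets the replication factor used for that deployment.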