com.pivotal.hawq.mapreduce.pt.HAWQInputFormatPerformanceTest_TPCH.java Source code

Introduction

Here is the source code for com.pivotal.hawq.mapreduce.pt.HAWQInputFormatPerformanceTest_TPCH.java
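
This class is a Hadoop Tool that measures how fast HAWQInputFormat can scan a TPC-H table stored in HAWQ in either AO or Parquet format. It can optionally load the TPC-H data first, then runs a map-only job that reads every record, and finally prints a small report with three timings: data loading, metadata extraction, and the MapReduce job itself.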

Source

package com.pivotal.hawq.mapreduce.pt;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import com.google.common.collect.Lists;
import com.pivotal.hawq.mapreduce.*;
import com.pivotal.hawq.mapreduce.metadata.HAWQTableFormat;
import com.pivotal.hawq.mapreduce.schema.HAWQSchema;
import com.pivotal.hawq.mapreduce.util.HAWQJdbcUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.sql.Connection;
import java.util.List;
import java.util.Map;

/**
 * MapReduce driver class for running manual performance tests on reading a TPC-H table.
 *
 * Usage:
 * 1. Load and query:
 *    HAWQInputFormatPerformanceTest_TPCH ao|parquet scale is_partition tableName [columns]
 * 2. Query only:
 *    HAWQInputFormatPerformanceTest_TPCH --query-only tableName [columns]
 *
 * Example:
 * $ hadoop com.pivotal.hawq.mapreduce.pt.HAWQInputFormatPerformanceTest_TPCH ao 5 false lineitem_ao_row L_PARTKEY,L_COMMENT
 */
public class HAWQInputFormatPerformanceTest_TPCH extends Configured implements Tool {

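    /**
     * Map-only task that converts each HAWQRecord into a pipe-delimited string
     * and emits it as the output key (the job runs with zero reducers).
     */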
    static class TPCHTableMapper extends Mapper<Void, HAWQRecord, Text, Void> {
        @Override
        protected void map(Void key, HAWQRecord value, Context context) throws IOException, InterruptedException {
            try {
                String recordString = toRecordString(value);
                context.write(new Text(recordString), null);

            } catch (HAWQException e) {
                throw new IOException(e);
            }
        }

        private String toRecordString(HAWQRecord record) throws HAWQException {
            HAWQSchema schema = record.getSchema();

            // read all columns if user didn't specify column list
            if (colNames == null) {
                StringBuilder buf = new StringBuilder(toFieldString(record, 1));
                for (int i = 2; i <= schema.getFieldCount(); i++) {
                    buf.append("|").append(toFieldString(record, i));
                }
                return buf.toString();
            }

            assert colNames.size() > 0;
            StringBuilder buf = new StringBuilder(toFieldString(record, schema.getFieldIndex(colNames.get(0))));
            for (int i = 1; i < colNames.size(); i++) {
                buf.append("|").append(toFieldString(record, schema.getFieldIndex(colNames.get(i))));
            }
            return buf.toString();
        }

        private String toFieldString(HAWQRecord record, int fieldIndex) throws HAWQException {
            Object val = record.getObject(fieldIndex);
            if (val == null)
                return "null";
            if (val instanceof byte[])
                return new String((byte[]) val);
            return val.toString();
        }
    }

    // command line arguments
    boolean queryOnly;
    HAWQTableFormat tableFormat;
    String scale;
    boolean isPartition;
    String tableName;
    static List<String> colNames; // used by TPCHTableMapper to determine which columns to read.
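    // NOTE: being static, the column list set by the driver is only visible to map
    // tasks running in the same JVM (e.g. LocalJobRunner); remote map tasks see a
    // null colNames and fall back to reading all columns.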

    // counters
    long dataLoadTime;
    long metadataExtractTime;
    long mapReduceTime;

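    /**
     * Parses the command line into the fields above; returns false on
     * malformed input so that run() can print the usage message.
     */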
    private boolean readArguments(String[] args) {
        if (args.length == 0)
            return false;

        int tableNameArgIndex;
        queryOnly = args[0].equalsIgnoreCase("--query-only");

        if (queryOnly) {
            if (args.length != 2 && args.length != 3)
                return false;
            tableNameArgIndex = 1;

        } else {
            if (args[0].equalsIgnoreCase("ao"))
                tableFormat = HAWQTableFormat.AO;
            else if (args[0].equalsIgnoreCase("parquet"))
                tableFormat = HAWQTableFormat.Parquet;
            else
                return false;

            if (args.length != 4 && args.length != 5)
                return false;

            scale = args[1];
            isPartition = Boolean.parseBoolean(args[2]);
            tableNameArgIndex = 3;
        }

        tableName = args[tableNameArgIndex];
        if (tableNameArgIndex + 1 < args.length) {
            colNames = Lists.newArrayList(args[tableNameArgIndex + 1].split(","));
        }
        return true;
    }

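    /**
     * Loads TPC-H data into HAWQ: queries gp_segment_configuration for the
     * number of segments, then runs the external load script for the
     * requested scale, table format and partition setting.
     */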
    private void loadTPCHData() throws Exception {
        // get number of segments
        int segnum;
        Connection conn = null;
        try {
            conn = MRFormatTestUtils.getTestDBConnection();
            Map<String, String> rs = HAWQJdbcUtils.executeSafeQueryForSingleRow(conn,
                    "SELECT COUNT(*) segnum FROM gp_segment_configuration WHERE content>=0;");
            segnum = Integer.parseInt(rs.get("segnum"));
        } finally {
            HAWQJdbcUtils.closeConnection(conn);
        }

        // run external script to load TPC-H data
        TPCHTester.HAWQTPCHSpec spec = new TPCHTester.HAWQTPCHSpec(scale, tableFormat, isPartition);
        MRFormatTestUtils.runShellCommand(spec.getLoadCmd(segnum));
    }

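    /**
     * Runs a map-only job that scans the table through HAWQInputFormat and
     * writes the records to /output; the HAWQInputFormat.setInput() call is
     * timed separately as the metadata extraction cost.
     */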
    private int runMapReduceJob() throws Exception {
        Path outputPath = new Path("/output");
        // delete previous output
        FileSystem fs = FileSystem.get(getConf());
        if (fs.exists(outputPath))
            fs.delete(outputPath, true);
        // note: the FileSystem is not closed here; FileSystem.get() returns a JVM-wide
        // cached instance that the job submission below still needs

        Job job = Job.getInstance(getConf());
        job.setJarByClass(HAWQInputFormatPerformanceTest_TPCH.class);

        job.setInputFormatClass(HAWQInputFormat.class);

        long startTime = System.currentTimeMillis();
        HAWQInputFormat.setInput(job.getConfiguration(), MRFormatConfiguration.TEST_DB_URL, null, null, tableName);
        metadataExtractTime = System.currentTimeMillis() - startTime;

        FileOutputFormat.setOutputPath(job, outputPath);

        job.setMapperClass(TPCHTableMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Void.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    @Override
    public int run(String[] args) throws Exception {
        if (!readArguments(args)) {
            System.err.printf(
                    "Usage: %s [generic options] <ao|parquet> <scale> <is_partition> <tableName> [<col1,col2>]\n",
                    getClass().getSimpleName());
            System.err.printf("       %s [generic options] --query-only <tableName> [<col1,col2>]\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        long startTime;

        if (!queryOnly) {
            startTime = System.currentTimeMillis();
            loadTPCHData();
            dataLoadTime = System.currentTimeMillis() - startTime;
        }

        startTime = System.currentTimeMillis();
        int res = runMapReduceJob();
        mapReduceTime = System.currentTimeMillis() - startTime;

        System.out.println("=====================================");
        System.out.println("========= Reports ===================");
        System.out.println("Table read : " + tableName);
        if (!queryOnly) {
            System.out.println("Data volume:        " + scale + "G");
            System.out.println("TPC-H Data Loading: " + dataLoadTime + "ms");
        }
        System.out.println("Metadata Extract:   " + metadataExtractTime + "ms");
        System.out.println("MapReduce Job:      " + mapReduceTime + "ms");
        return res;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new HAWQInputFormatPerformanceTest_TPCH(), args);
        System.exit(exitCode);
    }
}
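
Usage

The tool is normally launched through the hadoop command, as in the javadoc example above. It can also be driven programmatically via ToolRunner; the sketch below is illustrative: it assumes it is compiled in the com.pivotal.hawq.mapreduce.pt package, and the table and column names are taken from the example above.

package com.pivotal.hawq.mapreduce.pt;

import org.apache.hadoop.util.ToolRunner;

public class QueryOnlyRunner {
    public static void main(String[] args) throws Exception {
        // query-only mode: scan an already loaded table, reading two columns
        String[] toolArgs = {"--query-only", "lineitem_ao_row", "L_PARTKEY,L_COMMENT"};
        int exitCode = ToolRunner.run(new HAWQInputFormatPerformanceTest_TPCH(), toolArgs);
        System.exit(exitCode);
    }
}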