Java tutorial
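The listing below is a hybrid cluster integration test from LinkedIn's Pinot project. It stands up a full miniature deployment (ZooKeeper, Kafka, a Pinot controller, broker, and two servers), builds offline segments from the first eight months of a 2014 airline on-time dataset, pushes the last six months into Kafka as realtime data (a two-month overlap), and then verifies that Pinot's record count converges to an H2 baseline. The base class BaseClusterIntegrationTest (not shown here) supplies helpers such as ensureDirectoryExistsAndIsEmpty, setupH2AndInsertAvro, buildSegmentsFromAvro, and waitForRecordCountToStabilizeToExpectedCount.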
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.integration.tests;

import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.utils.FileUploadUtils;
import com.linkedin.pinot.common.utils.KafkaStarterUtils;
import com.linkedin.pinot.common.utils.TarGzCompressionUtils;
import com.linkedin.pinot.common.utils.ZkStarter;
import com.linkedin.pinot.util.TestUtils;
import java.io.File;
import java.io.FileInputStream;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import kafka.server.KafkaServerStartable;
import org.apache.commons.io.FileUtils;
import org.apache.helix.ExternalViewChangeListener;
import org.apache.helix.HelixManager;
import org.apache.helix.HelixManagerFactory;
import org.apache.helix.InstanceType;
import org.apache.helix.NotificationContext;
import org.apache.helix.model.ExternalView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;


/**
 * Hybrid cluster integration test that uploads 8 months of data as offline and 6 months of data as realtime (with a
 * two month overlap).
 */
public class HybridClusterIntegrationTest extends BaseClusterIntegrationTest {
  private static final Logger LOGGER = LoggerFactory.getLogger(HybridClusterIntegrationTest.class);
  private static final String TENANT_NAME = "TestTenant";
  protected final File _tmpDir = new File("/tmp/HybridClusterIntegrationTest");
  protected final File _segmentDir = new File("/tmp/HybridClusterIntegrationTest/segmentDir");
  protected final File _tarDir = new File("/tmp/HybridClusterIntegrationTest/tarDir");
  protected static final String KAFKA_TOPIC = "hybrid-integration-test";

  private int segmentCount = 12;
  private int offlineSegmentCount = 8;
  private int realtimeSegmentCount = 6;
  private Random random = new Random();
  private Schema schema;

  private KafkaServerStartable kafkaStarter;

  protected void setSegmentCount(int segmentCount) {
    this.segmentCount = segmentCount;
  }

  protected void setOfflineSegmentCount(int offlineSegmentCount) {
    this.offlineSegmentCount = offlineSegmentCount;
  }

  protected void setRealtimeSegmentCount(int realtimeSegmentCount) {
    this.realtimeSegmentCount = realtimeSegmentCount;
  }

  protected int getOfflineSegmentCount() {
    return offlineSegmentCount;
  }

  @BeforeClass
  public void setUp() throws Exception {
    // Clean up
    ensureDirectoryExistsAndIsEmpty(_tmpDir);
    ensureDirectoryExistsAndIsEmpty(_segmentDir);
    ensureDirectoryExistsAndIsEmpty(_tarDir);

    // Start Zk, Kafka and Pinot
    startHybridCluster();

    // Unpack the Avro files
    TarGzCompressionUtils.unTar(new File(TestUtils.getFileFromResourceUrl(OfflineClusterIntegrationTest.class
        .getClassLoader().getResource("On_Time_On_Time_Performance_2014_100k_subset_nonulls.tar.gz"))), _tmpDir);

    _tmpDir.mkdirs();

    final List<File> avroFiles = getAllAvroFiles();

    File schemaFile = getSchemaFile();
    schema = Schema.fromFile(schemaFile);
    addSchema(schemaFile, schema.getSchemaName());
    final List<String> invertedIndexColumns = makeInvertedIndexColumns();
    final String sortedColumn = makeSortedColumn();

    // Create Pinot table
    addHybridTable("mytable", "DaysSinceEpoch", "daysSinceEpoch", KafkaStarterUtils.DEFAULT_ZK_STR, KAFKA_TOPIC,
        schema.getSchemaName(), TENANT_NAME, TENANT_NAME, avroFiles.get(0), sortedColumn, invertedIndexColumns, null);
    LOGGER.info("Running with sorted column = " + sortedColumn + " and inverted index columns = "
        + invertedIndexColumns);

    // Create a subset of the first 8 segments (for offline) and the last 6 segments (for realtime)
    final List<File> offlineAvroFiles = getOfflineAvroFiles(avroFiles);
    final List<File> realtimeAvroFiles = getRealtimeAvroFiles(avroFiles);

    // Load data into H2
    ExecutorService executor = Executors.newCachedThreadPool();
    setupH2AndInsertAvro(avroFiles, executor);

    // Create segments from Avro data
    LOGGER.info("Creating offline segments from avro files " + offlineAvroFiles);
    buildSegmentsFromAvro(offlineAvroFiles, executor, 0, _segmentDir, _tarDir, "mytable", false, null);

    // Initialize query generator
    setupQueryGenerator(avroFiles, executor);

    executor.shutdown();
    executor.awaitTermination(10, TimeUnit.MINUTES);

    // Set up a Helix spectator to count the segments that are uploaded, and unlock the latch once all offline
    // segments are online
    final CountDownLatch latch = new CountDownLatch(1);
    HelixManager manager =
        HelixManagerFactory.getZKHelixManager(getHelixClusterName(), "test_instance", InstanceType.SPECTATOR,
            ZkStarter.DEFAULT_ZK_STR);
    manager.connect();
    manager.addExternalViewChangeListener(new ExternalViewChangeListener() {
      @Override
      public void onExternalViewChange(List<ExternalView> externalViewList,
          NotificationContext changeContext) {
        for (ExternalView externalView : externalViewList) {
          if (externalView.getId().contains("mytable")) {
            Set<String> partitionSet = externalView.getPartitionSet();
            if (partitionSet.size() == offlineSegmentCount) {
              int onlinePartitionCount = 0;

              for (String partitionId : partitionSet) {
                Map<String, String> partitionStateMap = externalView.getStateMap(partitionId);
                if (partitionStateMap.containsValue("ONLINE")) {
                  onlinePartitionCount++;
                }
              }

              if (onlinePartitionCount == offlineSegmentCount) {
                System.out.println("Got " + offlineSegmentCount + " online segments, unlatching the main thread");
                latch.countDown();
              }
            }
          }
        }
      }
    });

    // Upload the segments
    int i = 0;
    for (String segmentName : _tarDir.list()) {
      System.out.println("Uploading segment " + (i++) + " : " + segmentName);
      File file = new File(_tarDir, segmentName);
      FileUploadUtils.sendSegmentFile("localhost", "8998", segmentName, new FileInputStream(file), file.length());
    }

    // Wait for all offline segments to be online
    latch.await();

    // Load realtime data into Kafka
    LOGGER.info("Pushing data from realtime avro files " + realtimeAvroFiles);
    pushAvroIntoKafka(realtimeAvroFiles, KafkaStarterUtils.DEFAULT_KAFKA_BROKER, KAFKA_TOPIC);

    // Wait until the Pinot record count matches the number of records in the Avro files, using H2 as the baseline
    long timeInFiveMinutes = System.currentTimeMillis() + 5 * 60 * 1000L;

    Statement statement = _connection.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
    statement.execute("select count(*) from mytable");
    ResultSet rs = statement.getResultSet();
    rs.first();
    int h2RecordCount = rs.getInt(1);
    rs.close();

    waitForRecordCountToStabilizeToExpectedCount(h2RecordCount, timeInFiveMinutes);
  }

  /**
   * Pick one column at random (or null) out of the dimensions and return it as the sorted column.
   *
   * @note: Change this method to return a specific sorted column (or null) to debug failed tests.
   *
   * @return the sorted column name, or null if none is to be used for this run
   */
  private String makeSortedColumn() {
    List<String> dimensions = schema.getDimensionNames();
    final int nDimensions = dimensions.size();
    int ntries = nDimensions;
    int rand = random.nextInt();
    if (rand % 5 == 0) {
      // Return no sorted column 20% of the time
      return null;
    }

    while (ntries-- > 0) {
      int dimPos = random.nextInt(dimensions.size() + 1);
      if (dimPos == nDimensions) {
        continue;
      }
      // Only single-value columns can be used as the sorted column
      String sortedColumn = dimensions.get(dimPos);
      FieldSpec fieldSpec = schema.getFieldSpecFor(sortedColumn);
      if (fieldSpec.isSingleValueField()) {
        return sortedColumn;
      }
    }
    return null;
  }

  /**
   * Pick one or two inverted index columns from the list of dimension columns, and return them as a list of
   * inverted index columns.
   *
   * @note: Change this method to return a specific list of columns (or null) as needed to debug a test case.
   *
   * @return the list of inverted index columns, or null if no inverted index is to be used for this run
   */
  private List<String> makeInvertedIndexColumns() {
    List<String> dimensions = schema.getDimensionNames();
    final int nDimensions = dimensions.size();
    int dimPos = random.nextInt(dimensions.size() + 1);
    List<String> invIndexColumns = new ArrayList<String>(2);

    if (dimPos == nDimensions) {
      return null;
    }
    invIndexColumns.add(dimensions.get(dimPos));

    dimPos = random.nextInt(dimensions.size() + 1);
    if (dimPos == nDimensions || dimensions.get(dimPos).equals(invIndexColumns.get(0))) {
      return invIndexColumns;
    }
    invIndexColumns.add(dimensions.get(dimPos));
    return invIndexColumns;
  }

  protected List<File> getAllAvroFiles() {
    final List<File> avroFiles = new ArrayList<File>(segmentCount);
    for (int segmentNumber = 1; segmentNumber <= segmentCount; ++segmentNumber) {
      avroFiles.add(new File(_tmpDir.getPath() + "/On_Time_On_Time_Performance_2014_" + segmentNumber + ".avro"));
    }
    return avroFiles;
  }

  protected List<File> getRealtimeAvroFiles(List<File> avroFiles) {
    final List<File> realtimeAvroFiles = new ArrayList<File>(realtimeSegmentCount);
    for (int i = segmentCount - realtimeSegmentCount; i < segmentCount; i++) {
      realtimeAvroFiles.add(avroFiles.get(i));
    }
    return realtimeAvroFiles;
  }

  protected List<File> getOfflineAvroFiles(List<File> avroFiles) {
    final List<File> offlineAvroFiles = new ArrayList<File>(offlineSegmentCount);
    for (int i = 0; i < offlineSegmentCount; i++) {
      offlineAvroFiles.add(avroFiles.get(i));
    }
    return offlineAvroFiles;
  }

  protected void startHybridCluster() throws Exception {
    // Start Zk and Kafka
    startZk();
    kafkaStarter =
        KafkaStarterUtils.startServer(KafkaStarterUtils.DEFAULT_KAFKA_PORT, KafkaStarterUtils.DEFAULT_BROKER_ID,
            KafkaStarterUtils.DEFAULT_ZK_STR, KafkaStarterUtils.getDefaultKafkaConfiguration());

    // Create Kafka topic
    KafkaStarterUtils.createTopic(KAFKA_TOPIC, KafkaStarterUtils.DEFAULT_ZK_STR, 10);

    // Start the Pinot cluster
    startController(true);
    startBroker();
    startServers(2);

    // Create tenants
    createBrokerTenant(TENANT_NAME, 1);
    createServerTenant(TENANT_NAME, 1, 1);
  }

  @AfterClass
  public void tearDown() throws Exception {
    stopBroker();
    stopController();
    stopServer();
    KafkaStarterUtils.stopServer(kafkaStarter);
    try {
      stopZk();
    } catch (Exception e) {
      // Swallow ZK exceptions
    }
    cleanup();
  }

  protected void cleanup() throws Exception {
    FileUtils.deleteDirectory(_tmpDir);
  }
}
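The most subtle part of setUp() is the coordination between the Helix spectator and the main thread: the ExternalViewChangeListener fires on a Helix callback thread whenever the external view changes, while the main thread blocks on latch.await() until the listener has counted offlineSegmentCount ONLINE segments. Stripped of the Helix specifics, the pattern reduces to the minimal, self-contained sketch below; the background thread and sleep stand in for the spectator callback, and the class name is made up for illustration.

import java.util.concurrent.CountDownLatch;

public class LatchPatternDemo {
  public static void main(String[] args) throws InterruptedException {
    final CountDownLatch latch = new CountDownLatch(1);

    // Stands in for the Helix spectator callback observing external view changes
    new Thread(new Runnable() {
      @Override
      public void run() {
        try {
          Thread.sleep(1000L); // Simulate the offline segments coming online
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        }
        latch.countDown(); // Condition met: release the main thread
      }
    }).start();

    latch.await(); // Block here instead of busy-polling the cluster state
    System.out.println("All offline segments online, continuing with the test");
  }
}

A one-shot CountDownLatch(1) is enough here because the test only cares about a single transition (all offline segments reaching ONLINE), and blocking on await() wakes the main thread as soon as the callback reports the condition, with no polling loop.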