gobblin.compaction.hive.CompactionRunner.java Source code

Java tutorial


Here is the source code for gobblin.compaction.hive.CompactionRunner.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.compaction.hive;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.TimeUnit;

import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationConverter;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import gobblin.compaction.CliOptions;
import gobblin.compaction.mapreduce.MRCompactionRunner;

/**
 * Run Hive compaction based on config files.
 */
public class CompactionRunner {

    private static final Logger LOG = LoggerFactory.getLogger(CompactionRunner.class);

    private static final String COMPACTION_CONFIG_DIR = "compaction.config.dir";
    private static final String TIMING_FILE = "timing.file";
    private static final String TIMING_FILE_DEFAULT = "time.txt";
    private static final String SNAPSHOT = "snapshot";
    private static final String DELTA = "delta";
    private static final String NAME = ".name";
    private static final String PKEY = ".pkey";
    private static final String DATALOCATION = ".datalocation";
    private static final String SCHEMALOCATION = ".schemalocation";
    private static final String COPYDATA = ".copydata";
    private static final String COPYDATA_DEFAULT = "false";
    private static final String DATAFORMAT_EXTENSION_NAME = ".dataformat.extension.name";
    private static final String OUTPUT = "output";

    static Properties properties = new Properties();
    static Properties jobProperties = new Properties();

    public static void main(String[] args) throws IOException, ConfigurationException {

        properties = CliOptions.parseArgs(MRCompactionRunner.class, args);

        File compactionConfigDir = new File(properties.getProperty(COMPACTION_CONFIG_DIR));
        File[] listOfFiles = compactionConfigDir.listFiles();
        if (listOfFiles == null || listOfFiles.length == 0) {
            System.err.println("No compaction configuration files found under " + compactionConfigDir);

        int numOfJobs = 0;
        for (File file : listOfFiles) {
            if (file.isFile() && !file.getName().startsWith(".")) {
        LOG.info("Found " + numOfJobs + " compaction tasks.");
        try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(properties.getProperty(TIMING_FILE, TIMING_FILE_DEFAULT)),
                Charset.forName("UTF-8")))) {

            for (File file : listOfFiles) {
                if (file.isFile() && !file.getName().startsWith(".")) {
                    Configuration jobConfig = new PropertiesConfiguration(file.getAbsolutePath());
                    jobProperties = ConfigurationConverter.getProperties(jobConfig);
                    long startTime = System.nanoTime();
                    long endTime = System.nanoTime();
                    long elapsedTime = endTime - startTime;
                    double seconds = TimeUnit.NANOSECONDS.toSeconds(elapsedTime);
                    pw.printf("%s: %f%n", file.getAbsolutePath(), seconds);

    private static void compact() throws IOException {

        SerialCompactor sc = new SerialCompactor.Builder().withSnapshot(buildSnapshotTable())
                .withDeltas(buildDeltaTables()).withOutputTableName(jobProperties.getProperty(OUTPUT + NAME))
                .withOutputDataLocationInHdfs(jobProperties.getProperty(OUTPUT + DATALOCATION)).build();

    private static AvroExternalTable buildSnapshotTable() throws IOException {
        return buildAvroExternalTable(SNAPSHOT);

    private static List<AvroExternalTable> buildDeltaTables() throws IOException {
        List<AvroExternalTable> deltas = new ArrayList<>();

        for (int i = 1;; i++) {
            String deltai = DELTA + "." + i;
            if (jobProperties.getProperty(deltai + DATALOCATION) == null) {

        return deltas;

    private static AvroExternalTable buildAvroExternalTable(String tableType) throws IOException {
        AvroExternalTable.Builder builder = new AvroExternalTable.Builder()
                .withName(jobProperties.getProperty(tableType + NAME, ""))
                .withPrimaryKeys(jobProperties.getProperty(tableType + PKEY))
                .withSchemaLocation(jobProperties.getProperty(tableType + SCHEMALOCATION, ""))
                .withDataLocation(jobProperties.getProperty(tableType + DATALOCATION));

        if (Boolean.parseBoolean(jobProperties.getProperty(tableType + COPYDATA, COPYDATA_DEFAULT))) {
            builder = builder
                    .withMoveDataToTmpHdfsDir(jobProperties.getProperty(tableType + DATAFORMAT_EXTENSION_NAME, ""));

        return builder.build();