org.apache.falcon.hive.HiveDRTool.java Source code

Introduction

Here is the source code for org.apache.falcon.hive.HiveDRTool.java.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.falcon.hive;

import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.falcon.hive.exception.HiveReplicationException;
import org.apache.falcon.hive.mapreduce.CopyMapper;
import org.apache.falcon.hive.mapreduce.CopyReducer;
import org.apache.falcon.hive.util.DRStatusStore;
import org.apache.falcon.hive.util.DelimiterUtils;
import org.apache.falcon.hive.util.EventSourcerUtils;
import org.apache.falcon.hive.util.FileUtils;
import org.apache.falcon.hive.util.HiveDRStatusStore;
import org.apache.falcon.hive.util.HiveDRUtils;
import org.apache.falcon.hive.util.HiveMetastoreUtils;
import org.apache.falcon.job.JobCounters;
import org.apache.falcon.job.JobCountersHandler;
import org.apache.falcon.job.JobType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.api.HCatClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;

/**
 * Driver for the Hive disaster recovery (DR) replication tool. Runs as a Hadoop
 * {@link Tool} and executes one of three workflow stages (LASTEVENTS, EXPORT, IMPORT),
 * submitting a MapReduce copy job when there are Hive metastore events to replicate
 * between the source and target clusters.
 */
public class HiveDRTool extends Configured implements Tool {

    private static final String META_PATH_FILE_SUFFIX = ".metapath";

    private FileSystem jobFS;
    private FileSystem sourceClusterFS;
    private FileSystem targetClusterFS;

    private HiveDROptions inputOptions;
    private DRStatusStore drStore;
    private String eventsMetaFile;
    private EventSourcerUtils eventSoucerUtil;
    private Configuration jobConf;
    private String executionStage;

    public static final FsPermission STAGING_DIR_PERMISSION = new FsPermission(FsAction.ALL, FsAction.ALL,
            FsAction.ALL);

    private static final Logger LOG = LoggerFactory.getLogger(HiveDRTool.class);

    public HiveDRTool() {
    }

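    /**
     * Entry point invoked by {@link ToolRunner}. Parses the arguments, runs the requested
     * replication stage and, after a successful EXPORT stage, writes the job's replication
     * counters to counter.txt under the configured counterLogDir. Staging directories are
     * cleaned up after the IMPORT stage or on failure.
     */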
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            usage();
            return -1;
        }

        try {
            init(args);
        } catch (Throwable e) {
            LOG.error("Invalid arguments: ", e);
            System.err.println("Invalid arguments: " + e.getMessage());
            usage();
            return -1;
        }

        try {
            Job job = execute();
            if ((job != null) && (inputOptions.getExecutionStage()
                    .equalsIgnoreCase(HiveDRUtils.ExecutionStage.EXPORT.name()))) {
                if ((job.getStatus().getState() == JobStatus.State.SUCCEEDED)
                        && (job.getConfiguration().get("counterLogDir") != null)) {
                    LOG.info("Obtaining job replication counters for Hive DR job");
                    Path counterFile = new Path(job.getConfiguration().get("counterLogDir"), "counter.txt");
                    JobCounters hiveReplicationCounters = JobCountersHandler
                            .getCountersType(JobType.HIVEREPLICATION.name());
                    hiveReplicationCounters.obtainJobCounters(job.getConfiguration(), job, false);
                    hiveReplicationCounters.storeJobCounters(job.getConfiguration(), counterFile);
                }
            }
        } catch (Exception e) {
            System.err.println("Exception encountered " + e.getMessage());
            e.printStackTrace();
            LOG.error("Exception encountered, cleaning up staging dirs", e);
            cleanup();
            return -1;
        }

        if (inputOptions.getExecutionStage().equalsIgnoreCase(HiveDRUtils.ExecutionStage.IMPORT.name())) {
            cleanup();
        }

        return 0;
    }

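    /**
     * Parses the command-line options and builds {@link FileSystem} handles for the source,
     * target and job clusters, initializes the DR status store on the target cluster, and
     * resolves the source and target staging paths.
     */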
    private void init(String[] args) throws Exception {
        LOG.info("Initializing HiveDR");
        inputOptions = parseOptions(args);
        LOG.info("Input Options: {}", inputOptions);

        Configuration sourceConf = FileUtils.getConfiguration(getConf(), inputOptions.getSourceWriteEP(),
                inputOptions.getSourceNNKerberosPrincipal());
        sourceClusterFS = FileSystem.get(sourceConf);
        Configuration targetConf = FileUtils.getConfiguration(getConf(), inputOptions.getTargetWriteEP(),
                inputOptions.getTargetNNKerberosPrincipal());
        targetClusterFS = FileSystem.get(targetConf);
        jobConf = FileUtils.getConfiguration(getConf(), inputOptions.getJobClusterWriteEP(),
                inputOptions.getJobClusterNNPrincipal());
        jobFS = FileSystem.get(jobConf);

        // init DR status store
        drStore = new HiveDRStatusStore(targetClusterFS);

        // Update the source staging path after initing DR status store
        inputOptions.setSourceStagingPath();
        inputOptions.setTargetStagingPath();

        LOG.info("srcStaginPath: {}", inputOptions.getSourceStagingPath());
        LOG.info("tgtStaginPath: {}", inputOptions.getTargetStagingPath());

        eventSoucerUtil = new EventSourcerUtils(jobConf, inputOptions.shouldKeepHistory(),
                inputOptions.getJobName());
    }

    private HiveDROptions parseOptions(String[] args) throws ParseException {
        return HiveDROptions.create(args);
    }

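    /**
     * Runs the requested execution stage. LASTEVENTS only records the last replicated event
     * IDs, and EXPORT with no new events short-circuits, so both may return null without
     * submitting a MapReduce job; otherwise the copy job is created, submitted and,
     * depending on the options, waited on for completion.
     */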
    public Job execute() throws Exception {
        assert inputOptions != null;
        assert getConf() != null;
        executionStage = inputOptions.getExecutionStage();
        LOG.info("Executing Workflow stage : {}", executionStage);
        if (executionStage.equalsIgnoreCase(HiveDRUtils.ExecutionStage.LASTEVENTS.name())) {
            String lastEventsIdFile = getLastEvents(jobConf);
            LOG.info("Last successfully replicated Event file : {}", lastEventsIdFile);
            return null;
        } else if (executionStage.equalsIgnoreCase(HiveDRUtils.ExecutionStage.EXPORT.name())) {
            createStagingDirectory();
            eventsMetaFile = sourceEvents();
            LOG.info("Sourced Events meta file : {}", eventsMetaFile);
            if (StringUtils.isEmpty(eventsMetaFile)) {
                LOG.info("No events to process");
                return null;
            } else {
                /*
                 * eventsMetaFile contains the events to be processed by HiveDr. This file should be available
                 * for the import action as well. Persist the file at a location common to both export and import.
                 */
                persistEventsMetafileLocation(eventsMetaFile);
            }
        } else if (executionStage.equalsIgnoreCase(HiveDRUtils.ExecutionStage.IMPORT.name())) {
            // read the location of eventsMetaFile from hdfs
            eventsMetaFile = getEventsMetaFileLocation();
            if (StringUtils.isEmpty(eventsMetaFile)) {
                LOG.info("No events to process");
                return null;
            }
        } else {
            throw new HiveReplicationException("Invalid Execution stage : " + inputOptions.getExecutionStage());
        }

        Job job = createJob();
        job.submit();

        String jobID = job.getJobID().toString();
        job.getConfiguration().set("HIVEDR_JOB_ID", jobID);

        LOG.info("HiveDR job-id: {}", jobID);
        if (inputOptions.shouldBlock() && !job.waitForCompletion(true)) {
            throw new IOException(
                    "HiveDR failure: Job " + jobID + " has failed: " + job.getStatus().getFailureInfo());
        }

        return job;
    }

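    /**
     * Builds the replication MapReduce job: NLineInputFormat over the events meta file,
     * {@link CopyMapper}/{@link CopyReducer}, no job output, speculative map execution
     * disabled, and every {@link HiveDRArgs} value copied into the job configuration.
     */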
    private Job createJob() throws Exception {
        String jobName = "hive-dr" + executionStage;
        String userChosenName = getConf().get(JobContext.JOB_NAME);
        if (userChosenName != null) {
            jobName += ": " + userChosenName;
        }
        Job job = Job.getInstance(getConf());
        job.setJobName(jobName);
        job.setJarByClass(CopyMapper.class);
        job.setMapperClass(CopyMapper.class);
        job.setReducerClass(CopyReducer.class);
        job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.class);

        job.setOutputFormatClass(NullOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false");
        job.getConfiguration().set(JobContext.NUM_MAPS, String.valueOf(inputOptions.getReplicationMaxMaps()));

        for (HiveDRArgs args : HiveDRArgs.values()) {
            if (inputOptions.getValue(args) != null) {
                job.getConfiguration().set(args.getName(), inputOptions.getValue(args));
            } else {
                job.getConfiguration().set(args.getName(), "null");
            }
        }
        job.getConfiguration().set(FileInputFormat.INPUT_DIR, eventsMetaFile);

        return job;
    }

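    /** Creates the source and target staging directories with world-writable (777) permissions. */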
    private void createStagingDirectory() throws IOException, HiveReplicationException {
        Path sourceStagingPath = new Path(inputOptions.getSourceStagingPath());
        Path targetStagingPath = new Path(inputOptions.getTargetStagingPath());
        LOG.info("Source staging path: {}", sourceStagingPath);
        if (!FileSystem.mkdirs(sourceClusterFS, sourceStagingPath, STAGING_DIR_PERMISSION)) {
            throw new IOException("mkdir failed for " + sourceStagingPath);
        }

        LOG.info("Target staging path: {}", targetStagingPath);
        if (!FileSystem.mkdirs(targetClusterFS, targetStagingPath, STAGING_DIR_PERMISSION)) {
            throw new IOException("mkdir failed for " + targetStagingPath);
        }
    }

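    /** Deletes the source and target staging directories; failures are logged but not rethrown. */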
    private void cleanStagingDirectory() throws HiveReplicationException {
        LOG.info("Cleaning staging directories");
        Path sourceStagingPath = new Path(inputOptions.getSourceStagingPath());
        Path targetStagingPath = new Path(inputOptions.getTargetStagingPath());
        try {
            if (sourceClusterFS.exists(sourceStagingPath)) {
                sourceClusterFS.delete(sourceStagingPath, true);
            }

            if (targetClusterFS.exists(targetStagingPath)) {
                targetClusterFS.delete(targetStagingPath, true);
            }
        } catch (IOException e) {
            LOG.error("Unable to cleanup staging dir:", e);
        }
    }

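    /**
     * Reads the last replicated event IDs from the job's event store, connects to the source
     * metastore and sources the newer events into an input file consumed by the copy job.
     * Returns the path of that file, which may be empty or null when there is nothing to replicate.
     */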
    private String sourceEvents() throws Exception {
        MetaStoreEventSourcer defaultSourcer = null;
        String inputFilename = null;
        String lastEventsIdFile = FileUtils.DEFAULT_EVENT_STORE_PATH + File.separator + inputOptions.getJobName()
                + File.separator + inputOptions.getJobName() + ".id";
        Map<String, Long> lastEventsIdMap = getLastDBTableEvents(new Path(lastEventsIdFile));
        try {
            HCatClient sourceMetastoreClient = HiveMetastoreUtils.initializeHiveMetaStoreClient(
                    inputOptions.getSourceMetastoreUri(), inputOptions.getSourceMetastoreKerberosPrincipal(),
                    inputOptions.getSourceHive2KerberosPrincipal());
            defaultSourcer = new MetaStoreEventSourcer(sourceMetastoreClient,
                    new DefaultPartitioner(drStore, eventSoucerUtil), eventSoucerUtil, lastEventsIdMap);
            inputFilename = defaultSourcer.sourceEvents(inputOptions);
        } finally {
            if (defaultSourcer != null) {
                defaultSourcer.cleanUp();
            }
        }

        return inputFilename;
    }

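    /** Looks up the last events replicated to the target and writes them to an event-ID file (LASTEVENTS stage). */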
    private String getLastEvents(Configuration conf) throws Exception {
        LastReplicatedEvents lastEvents = new LastReplicatedEvents(conf, inputOptions.getTargetMetastoreUri(),
                inputOptions.getTargetMetastoreKerberosPrincipal(), inputOptions.getTargetHive2KerberosPrincipal(),
                drStore, inputOptions);
        String eventIdFile = lastEvents.getLastEvents(inputOptions);
        lastEvents.cleanUp();
        return eventIdFile;
    }

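    /** Parses the TAB-delimited event-ID file into a map of database/table identifier to last replicated event ID. */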
    private Map<String, Long> getLastDBTableEvents(Path lastEventIdFile) throws Exception {
        Map<String, Long> lastEventsIdMap = new HashMap<>();
        BufferedReader in = new BufferedReader(new InputStreamReader(jobFS.open(lastEventIdFile)));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                String[] field = line.trim().split(DelimiterUtils.TAB_DELIM, -1);
                lastEventsIdMap.put(field[0], Long.parseLong(field[1]));
            }
        } catch (Exception e) {
            throw new IOException(e);
        } finally {
            IOUtils.closeQuietly(in);
        }

        return lastEventsIdMap;
    }

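    /** Command-line entry point; delegates to {@link ToolRunner} and exits with the tool's return code. */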
    public static void main(String[] args) {
        int exitCode;
        try {
            HiveDRTool hiveDRTool = new HiveDRTool();
            exitCode = ToolRunner.run(HiveDRUtils.getDefaultConf(), hiveDRTool, args);
        } catch (Exception e) {
            LOG.error("Couldn't complete HiveDR operation: ", e);
            exitCode = -1;
        }

        System.exit(exitCode);
    }

    private void cleanInputDir() {
        eventSoucerUtil.cleanUpEventInputDir();
    }

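    /** Cleans up the staging directories, the event input directory and the per-job temp files. */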
    private synchronized void cleanup() throws HiveReplicationException {
        cleanStagingDirectory();
        cleanInputDir();
        cleanTempFiles();
    }

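    /** Removes the per-job .metapath and .id files from the event store directory on the job cluster. */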
    private void cleanTempFiles() {
        Path eventsDirPath = new Path(FileUtils.DEFAULT_EVENT_STORE_PATH, inputOptions.getJobName());
        Path metaFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + META_PATH_FILE_SUFFIX);
        Path eventsFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + ".id");

        try {
            if (jobFS.exists(metaFilePath)) {
                jobFS.delete(metaFilePath, true);
            }
            if (jobFS.exists(eventsFilePath)) {
                jobFS.delete(eventsFilePath, true);
            }
        } catch (IOException e) {
            LOG.error("Deleting Temp files failed", e);
        }
    }

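    /**
     * Writes the location of the events meta file to a .metapath file under the event store,
     * so that the subsequent IMPORT stage can find the events produced by EXPORT.
     */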
    public void persistEventsMetafileLocation(final String eventMetaFilePath) throws IOException {
        Path eventsDirPath = new Path(FileUtils.DEFAULT_EVENT_STORE_PATH, inputOptions.getJobName());
        Path metaFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + META_PATH_FILE_SUFFIX);

        OutputStream out = null;
        try {
            out = FileSystem.create(jobFS, metaFilePath, FileUtils.FS_PERMISSION_700);
            out.write(eventMetaFilePath.getBytes());
            out.flush();
        } finally {
            IOUtils.closeQuietly(out);
        }
    }

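    /** Reads back the events meta file location persisted by the EXPORT stage, or returns null if it does not exist. */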
    private String getEventsMetaFileLocation() throws IOException {
        Path eventsDirPath = new Path(FileUtils.DEFAULT_EVENT_STORE_PATH, inputOptions.getJobName());
        Path metaFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + META_PATH_FILE_SUFFIX);
        String line = null;
        if (jobFS.exists(metaFilePath)) {
            BufferedReader in = new BufferedReader(new InputStreamReader(jobFS.open(metaFilePath)));
            line = in.readLine();
            in.close();
        }
        return line;
    }

    public static void usage() {
        System.out.println("Usage: hivedrtool -option value ....");
    }
}