gobblin.data.management.copy.hive.HiveCopyEntityHelper.java Source code

Introduction

Here is the source code for gobblin.data.management.copy.hive.HiveCopyEntityHelper.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.data.management.copy.hive;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.thrift.TException;

import com.google.common.annotations.VisibleForTesting;
import com.typesafe.config.ConfigFactory;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closer;
import com.google.gson.Gson;
import com.typesafe.config.Config;

import gobblin.commit.CommitStep;
import gobblin.configuration.State;
import gobblin.util.ClassAliasResolver;
import gobblin.data.management.copy.CopyConfiguration;
import gobblin.data.management.copy.CopyEntity;
import gobblin.data.management.copy.CopyableFile;
import gobblin.data.management.copy.OwnerAndPermission;
import gobblin.data.management.copy.entities.PostPublishStep;
import gobblin.data.management.copy.hive.avro.HiveAvroCopyEntityHelper;
import gobblin.data.management.partition.FileSet;
import gobblin.hive.HiveMetastoreClientPool;
import gobblin.hive.HiveRegProps;
import gobblin.hive.HiveRegisterStep;
import gobblin.hive.PartitionDeregisterStep;
import gobblin.hive.TableDeregisterStep;
import gobblin.hive.metastore.HiveMetaStoreUtils;
import gobblin.hive.spec.HiveSpec;
import gobblin.hive.spec.SimpleHiveSpec;
import gobblin.metrics.event.EventSubmitter;
import gobblin.metrics.event.MultiTimingEvent;
import gobblin.util.PathUtils;
import gobblin.util.commit.DeleteFileCommitStep;
import gobblin.util.reflection.GobblinConstructorUtils;
import gobblin.util.request_allocation.PushDownRequestor;

import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.Singular;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;

/**
 * Creates {@link CopyEntity}s for copying a Hive table.
 */
@Slf4j
@Getter
public class HiveCopyEntityHelper {

    public static final String EXISTING_ENTITY_POLICY_KEY = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".existing.entity.conflict.policy";
    public static final String DEFAULT_EXISTING_ENTITY_POLICY = ExistingEntityPolicy.ABORT.name();

    public static final String UNMANAGED_DATA_POLICY_KEY = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".unmanaged.data.conflict.policy";
    public static final String DEFAULT_UNMANAGED_DATA_POLICY = UnmanagedDataPolicy.ABORT.name();

    /** Target metastore URI */
    public static final String TARGET_METASTORE_URI_KEY = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.target.metastore.uri";
    /** Target database name */
    public static final String TARGET_DATABASE_KEY = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.target.database";

    /** A filter to select partitions to copy */
    public static final String COPY_PARTITIONS_FILTER_CONSTANT = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.partition.filter.constant";
    /** Use an implementation of {@link PartitionFilterGenerator} to dynamically create a partition filter. The value
     * should be the fully qualified class name of the implementation to use. */
    public static final String COPY_PARTITION_FILTER_GENERATOR = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.partition.filter.generator";
    /** A predicate applied to each partition before any file listing.
     * If the predicate returns true, the partition will be skipped. */
    public static final String FAST_PARTITION_SKIP_PREDICATE = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.fast.partition.skip.predicate";

    /** A predicate applied to a non-partitioned table before any file listing.
     * If the predicate returns true, the table will be skipped. */
    public static final String FAST_TABLE_SKIP_PREDICATE = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.fast.table.skip.predicate";

    /** Method for deleting files on deregister. One of {@link DeregisterFileDeleteMethod}. */
    public static final String DELETE_FILES_ON_DEREGISTER = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".copy.deregister.fileDeleteMethod";
    public static final DeregisterFileDeleteMethod DEFAULT_DEREGISTER_DELETE_METHOD = DeregisterFileDeleteMethod.NO_DELETE;

    /**
     * Config key to specify an extended partition filter, used when {@link IMetaStoreClient}'s
     * {@link IMetaStoreClient#listPartitionsByFilter} is not sufficient for filtering out specific partitions.
     * For example, if the filter type is "Path" and the filtering condition is "Hourly",
     * only partitions whose path contains '/Hourly/' will be kept.
     */
    public static final String HIVE_PARTITION_EXTENDED_FILTER_TYPE = HiveDatasetFinder.HIVE_DATASET_PREFIX
            + ".extendedFilterType";

    static final Gson gson = new Gson();

    private static final String source_client = "source_client";
    private static final String target_client = "target_client";
    public static final String GOBBLIN_DISTCP = "gobblin-distcp";

    public static class Stages {
        public static final String EXISTING_PARTITION = "ExistingPartition";
        public static final String PARTITION_SKIP_PREDICATE = "PartitionSkipPredicate";
        public static final String CREATE_LOCATIONS = "CreateLocations";
        public static final String FULL_PATH_DIFF = "FullPathDiff";
        public static final String CREATE_DELETE_UNITS = "CreateDeleteUnits";
        public static final String CREATE_COPY_UNITS = "CreateCopyUnits";
        public static final String SOURCE_PATH_LISTING = "SourcePathListing";
        public static final String TARGET_EXISTING_PATH_LISTING = "TargetExistingPathListing";
        public static final String DESIRED_PATHS_LISTING = "DesiredPathsListing";
        public static final String PATH_DIFF = "PathDiff";
        public static final String COMPUTE_DELETE_PATHS = "ComputeDeletePaths";
        public static final String GET_TABLES = "GetTables";
        public static final String COMPUTE_TARGETS = "ComputeTargets";
    }

    private final long startTime;

    private final HiveDataset dataset;
    private final CopyConfiguration configuration;
    private final FileSystem targetFs;

    private final HiveMetastoreClientPool targetClientPool;
    private final String targetDatabase;
    private final HiveRegProps hiveRegProps;
    private Optional<Table> existingTargetTable;
    private final Table targetTable;
    private final Optional<String> targetURI;
    private final ExistingEntityPolicy existingEntityPolicy;
    private final UnmanagedDataPolicy unmanagedDataPolicy;
    private final Optional<String> partitionFilter;
    private Optional<? extends HivePartitionExtendedFilter> hivePartitionExtendedFilter;
    private final Optional<Predicate<HivePartitionFileSet>> fastPartitionSkip;
    private final Optional<Predicate<HiveCopyEntityHelper>> fastTableSkip;

    private final DeregisterFileDeleteMethod deleteMethod;

    private final Optional<CommitStep> tableRegistrationStep;
    private final Map<List<String>, Partition> sourcePartitions;
    private final Map<List<String>, Partition> targetPartitions;

    private final EventSubmitter eventSubmitter;
    @Getter
    protected final HiveTargetPathHelper targetPathHelper;

    /**
     * Defines what should be done for tables or partitions that exist in the target but are not compatible with the source.
     */
    public enum ExistingEntityPolicy {
        /** Deregister target partition, delete its files, and create a new partition with correct values. */
        REPLACE_PARTITIONS,
        /** Deregister target table, do NOT delete its files, and create a new table with correct values. */
        REPLACE_TABLE,
        /** Keep the target table as registered while updating the file location */
        UPDATE_TABLE,
        /** Abort copying of conflict table. */
        ABORT
    }

    /**
     * Defines what should be done for data that is not managed by the existing target table / partition.
     */
    public enum UnmanagedDataPolicy {
        /** Delete any data that is not managed by the existing target table / partition. */
        DELETE_UNMANAGED_DATA,
        /** Abort copying of conflict table / partition. */
        ABORT
    }

    public enum DeregisterFileDeleteMethod {
        /** Delete the files pointed at by the input format. */
        INPUT_FORMAT,
        /** Delete all files at the partition location recursively. */
        RECURSIVE,
        /** Don't delete files, just deregister partition. */
        NO_DELETE
    }

    /**
     * A container for the differences between desired and existing files.
     */
    @Builder
    @ToString
    protected static class DiffPathSet {
        /** Desired files that don't exist on target */
        @Singular(value = "copyFile")
        Collection<FileStatus> filesToCopy;
        /** Files in target that are not desired */
        @Singular(value = "deleteFile")
        Collection<Path> pathsToDelete;
    }
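
    // Illustrative sketch (not part of the original source): the Lombok-generated builder collects
    // individual files via the @Singular methods, as used in fullPathDiff() below, e.g.
    //
    //   DiffPathSet diff = DiffPathSet.builder()
    //       .copyFile(someSourceFileStatus)   // hypothetical FileStatus to copy
    //       .deleteFile(someStalePath)        // hypothetical Path to delete on the target
    //       .build();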

    /**
     * Represents a source {@link FileStatus} and a {@link Path} destination.
     */
    @Data
    private static class SourceAndDestination {
        private final FileStatus source;
        private final Path destination;
    }

    HiveCopyEntityHelper(HiveDataset dataset, CopyConfiguration configuration, FileSystem targetFs)
            throws IOException {

        try (Closer closer = Closer.create()) {
            log.info("Finding copy entities for table " + dataset.table.getCompleteName());

            this.eventSubmitter = new EventSubmitter.Builder(dataset.getMetricContext(), "hive.dataset.copy")
                    .build();
            MultiTimingEvent multiTimer = closer
                    .register(new MultiTimingEvent(this.eventSubmitter, "HiveCopySetup", true));

            this.startTime = System.currentTimeMillis();

            this.dataset = dataset;
            this.configuration = configuration;
            this.targetFs = targetFs;

            this.targetPathHelper = new HiveTargetPathHelper(this.dataset);
            this.hiveRegProps = new HiveRegProps(new State(this.dataset.getProperties()));
            this.targetURI = Optional
                    .fromNullable(this.dataset.getProperties().getProperty(TARGET_METASTORE_URI_KEY));
            this.targetClientPool = HiveMetastoreClientPool.get(this.dataset.getProperties(), this.targetURI);
            this.targetDatabase = Optional
                    .fromNullable(this.dataset.getProperties().getProperty(TARGET_DATABASE_KEY))
                    .or(this.dataset.table.getDbName());
            this.existingEntityPolicy = ExistingEntityPolicy.valueOf(this.dataset.getProperties()
                    .getProperty(EXISTING_ENTITY_POLICY_KEY, DEFAULT_EXISTING_ENTITY_POLICY).toUpperCase());
            this.unmanagedDataPolicy = UnmanagedDataPolicy.valueOf(this.dataset.getProperties()
                    .getProperty(UNMANAGED_DATA_POLICY_KEY, DEFAULT_UNMANAGED_DATA_POLICY).toUpperCase());

            this.deleteMethod = this.dataset.getProperties().containsKey(DELETE_FILES_ON_DEREGISTER)
                    ? DeregisterFileDeleteMethod.valueOf(
                            this.dataset.getProperties().getProperty(DELETE_FILES_ON_DEREGISTER).toUpperCase())
                    : DEFAULT_DEREGISTER_DELETE_METHOD;

            if (this.dataset.getProperties().containsKey(COPY_PARTITION_FILTER_GENERATOR)) {
                try {
                    PartitionFilterGenerator generator = GobblinConstructorUtils.invokeFirstConstructor(
                            (Class<PartitionFilterGenerator>) Class.forName(
                                    this.dataset.getProperties().getProperty(COPY_PARTITION_FILTER_GENERATOR)),
                            Lists.<Object>newArrayList(this.dataset.getProperties()), Lists.newArrayList());
                    this.partitionFilter = Optional.of(generator.getFilter(this.dataset));
                    log.info(String.format("Dynamic partition filter for table %s: %s.",
                            this.dataset.table.getCompleteName(), this.partitionFilter.get()));
                } catch (ReflectiveOperationException roe) {
                    throw new IOException(roe);
                }
            } else {
                this.partitionFilter = Optional
                        .fromNullable(this.dataset.getProperties().getProperty(COPY_PARTITIONS_FILTER_CONSTANT));
            }
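
            // Illustrative sketch (not part of the original source): a minimal PartitionFilterGenerator
            // implementation as used by the reflective call above. A constructor taking Properties is
            // tried first, falling back to a no-arg constructor; getFilter(HiveDataset) returns the
            // metastore filter expression. The class name and filter value below are hypothetical:
            //
            //   public class DatePartitionFilterGenerator implements PartitionFilterGenerator {
            //     public DatePartitionFilterGenerator(Properties props) {}
            //     @Override
            //     public String getFilter(HiveDataset dataset) {
            //       return "datepartition >= \"2017-01-01\"";   // hypothetical filter expression
            //     }
            //   }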

            // Initialize extended partition filter
            if (this.dataset.getProperties().containsKey(HIVE_PARTITION_EXTENDED_FILTER_TYPE)) {
                String filterType = dataset.getProperties().getProperty(HIVE_PARTITION_EXTENDED_FILTER_TYPE);
                try {
                    Config config = ConfigFactory.parseProperties(this.dataset.getProperties());
                    this.hivePartitionExtendedFilter = Optional
                            .of(new ClassAliasResolver<>(HivePartitionExtendedFilterFactory.class)
                                    .resolveClass(filterType).newInstance().createFilter(config));
                } catch (ReflectiveOperationException roe) {
                    log.error("Error: Could not find filter with alias " + filterType);
                    closer.close();
                    throw new IOException(roe);
                }
            } else {
                this.hivePartitionExtendedFilter = Optional.absent();
            }

            try {
                this.fastPartitionSkip = this.dataset.getProperties().containsKey(FAST_PARTITION_SKIP_PREDICATE)
                        ? Optional.of(GobblinConstructorUtils.invokeFirstConstructor(
                                (Class<Predicate<HivePartitionFileSet>>) Class.forName(
                                        this.dataset.getProperties().getProperty(FAST_PARTITION_SKIP_PREDICATE)),
                                Lists.<Object>newArrayList(this), Lists.newArrayList()))
                        : Optional.<Predicate<HivePartitionFileSet>>absent();

                this.fastTableSkip = this.dataset.getProperties().containsKey(FAST_TABLE_SKIP_PREDICATE)
                        ? Optional.of(GobblinConstructorUtils.invokeFirstConstructor(
                                (Class<Predicate<HiveCopyEntityHelper>>) Class.forName(
                                        this.dataset.getProperties().getProperty(FAST_TABLE_SKIP_PREDICATE)),
                                Lists.newArrayList()))
                        : Optional.<Predicate<HiveCopyEntityHelper>>absent();

            } catch (ReflectiveOperationException roe) {
                closer.close();
                throw new IOException(roe);
            }
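
            // Illustrative sketch (not part of the original source): a fast partition skip predicate is a
            // Guava Predicate<HivePartitionFileSet>; returning true skips the partition before any file
            // listing. For FAST_PARTITION_SKIP_PREDICATE a constructor taking this helper is tried first
            // (see the reflective call above). The class and condition below are hypothetical:
            //
            //   public class NeverSkipPartitions implements Predicate<HivePartitionFileSet> {
            //     public NeverSkipPartitions(HiveCopyEntityHelper helper) {}
            //     @Override
            //     public boolean apply(HivePartitionFileSet fileSet) {
            //       return false;   // never skip; a real predicate would inspect the partition
            //     }
            //   }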

            Map<String, HiveMetastoreClientPool> namedPools = ImmutableMap.of(source_client,
                    this.dataset.clientPool, target_client, this.targetClientPool);

            multiTimer.nextStage(Stages.GET_TABLES);
            try (HiveMetastoreClientPool.MultiClient multiClient = HiveMetastoreClientPool
                    .safeGetClients(namedPools)) {

                if (multiClient.getClient(target_client).tableExists(this.targetDatabase,
                        this.dataset.table.getTableName())) {
                    this.existingTargetTable = Optional.of(new Table(multiClient.getClient(target_client)
                            .getTable(this.targetDatabase, this.dataset.table.getTableName())));
                } else {
                    this.existingTargetTable = Optional.absent();
                }

                // Constructing CommitStep object for table registration
                Path targetPath = getTargetLocation(dataset.fs, this.targetFs, dataset.table.getDataLocation(),
                        Optional.<Partition>absent());
                this.targetTable = getTargetTable(this.dataset.table, targetPath);
                HiveSpec tableHiveSpec = new SimpleHiveSpec.Builder<>(targetPath)
                        .withTable(HiveMetaStoreUtils.getHiveTable(this.targetTable.getTTable())).build();

                CommitStep tableRegistrationStep = new HiveRegisterStep(this.targetURI, tableHiveSpec,
                        this.hiveRegProps);
                this.tableRegistrationStep = Optional.of(tableRegistrationStep);

                if (this.existingTargetTable.isPresent() && this.existingTargetTable.get().isPartitioned()) {
                    checkPartitionedTableCompatibility(this.targetTable, this.existingTargetTable.get());
                }
                if (HiveUtils.isPartitioned(this.dataset.table)) {
                    this.sourcePartitions = HiveUtils.getPartitionsMap(multiClient.getClient(source_client),
                            this.dataset.table, this.partitionFilter, this.hivePartitionExtendedFilter);
                    // Note: this must be mutable, so we copy the map
                    this.targetPartitions = this.existingTargetTable.isPresent()
                            ? Maps.newHashMap(HiveUtils.getPartitionsMap(multiClient.getClient(target_client),
                                    this.existingTargetTable.get(), this.partitionFilter,
                                    this.hivePartitionExtendedFilter))
                            : Maps.<List<String>, Partition>newHashMap();
                } else {
                    this.sourcePartitions = Maps.newHashMap();
                    this.targetPartitions = Maps.newHashMap();
                }

            } catch (TException te) {
                closer.close();
                throw new IOException("Failed to generate work units for table " + dataset.table.getCompleteName(),
                        te);
            }
        }
    }

    /**
     * See {@link #getCopyEntities(CopyConfiguration, Comparator, PushDownRequestor)}. This overload does not push down
     * any prioritizer or requestor.
     */
    Iterator<FileSet<CopyEntity>> getCopyEntities(CopyConfiguration configuration) throws IOException {
        return getCopyEntities(configuration, null, null);
    }

    /**
     * Finds all files read by the table and generates {@link CopyEntity}s for duplicating the table. The semantics are as follows:
     * 1. Find all valid {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}. If the table is partitioned, the
     *    {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of the base
     *    table will be ignored, and we will instead process the {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of each partition.
     * 2. For each {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} find all files referred by it.
     * 3. Generate a {@link CopyableFile} for each file referred by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
     * 4. If the table is partitioned, create a file set for each partition.
     * 5. Create work units for registering, deregistering partitions / tables, and deleting unnecessary files in the target.
     *
     * For computation of target locations see {@link HiveTargetPathHelper#getTargetPath}
     */
    Iterator<FileSet<CopyEntity>> getCopyEntities(CopyConfiguration configuration,
            Comparator<FileSet<CopyEntity>> prioritizer, PushDownRequestor<FileSet<CopyEntity>> requestor)
            throws IOException {
        if (HiveUtils.isPartitioned(this.dataset.table)) {
            return new PartitionIterator(this.sourcePartitions, configuration, prioritizer, requestor);
        } else {
            FileSet<CopyEntity> fileSet = new UnpartitionedTableFileSet(this.dataset.table.getCompleteName(),
                    this.dataset, this);
            return Iterators.singletonIterator(fileSet);
        }
    }
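
    // Illustrative usage sketch (not part of the original source): a caller would typically drain the
    // returned iterator, letting each FileSet list its files lazily when consumed:
    //
    //   Iterator<FileSet<CopyEntity>> fileSets = helper.getCopyEntities(copyConfiguration);
    //   while (fileSets.hasNext()) {
    //     FileSet<CopyEntity> fileSet = fileSets.next();
    //     // ... materialize the file set's CopyEntity instances and emit work units (caller-specific)
    //   }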

    /**
     * An iterator producing a {@link FileSet} of {@link CopyEntity} for each partition in this table. The files
     * are not scanned and the {@link FileSet} is not materialized until {@link #next} is called.
     */
    private class PartitionIterator implements Iterator<FileSet<CopyEntity>> {

        static final String DEREGISTER_FILE_SET = "deregister";

        private final List<FileSet<CopyEntity>> allFileSets;
        private final Iterator<FileSet<CopyEntity>> fileSetIterator;

        public PartitionIterator(Map<List<String>, Partition> partitionMap, CopyConfiguration configuration,
                Comparator<FileSet<CopyEntity>> prioritizer, PushDownRequestor<FileSet<CopyEntity>> requestor) {
            this.allFileSets = generateAllFileSets(partitionMap);
            for (FileSet<CopyEntity> fileSet : this.allFileSets) {
                fileSet.setRequestor(requestor);
            }
            if (prioritizer != null) {
                Collections.sort(this.allFileSets, prioritizer);
            }
            this.fileSetIterator = this.allFileSets.iterator();
        }

        @Override
        public boolean hasNext() {
            return this.fileSetIterator.hasNext();
        }

        @Override
        public FileSet<CopyEntity> next() {
            return this.fileSetIterator.next();
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        private List<FileSet<CopyEntity>> generateAllFileSets(Map<List<String>, Partition> partitionMap) {
            List<FileSet<CopyEntity>> fileSets = Lists.newArrayList();
            for (Map.Entry<List<String>, Partition> partition : partitionMap.entrySet()) {
                fileSets.add(fileSetForPartition(partition.getValue()));
                HiveCopyEntityHelper.this.targetPartitions.remove(partition.getKey());
            }
            if (!HiveCopyEntityHelper.this.targetPartitions.isEmpty()) {
                fileSets.add(
                        new HivePartitionsDeregisterFileSet(
                                HiveCopyEntityHelper.this.dataset.getTable().getCompleteName()
                                        + DEREGISTER_FILE_SET,
                                HiveCopyEntityHelper.this.dataset,
                                HiveCopyEntityHelper.this.targetPartitions.values(), HiveCopyEntityHelper.this));
            }
            return fileSets;
        }

        private FileSet<CopyEntity> fileSetForPartition(final Partition partition) {
            return new HivePartitionFileSet(HiveCopyEntityHelper.this, partition,
                    HiveCopyEntityHelper.this.dataset.getProperties());
        }
    }

    private Table getTargetTable(Table originTable, Path targetLocation) throws IOException {
        try {
            Table targetTable = originTable.copy();

            targetTable.setDbName(this.targetDatabase);
            targetTable.setDataLocation(targetLocation);
            /*
             * Need to set the table owner as the flow executor
             */
            targetTable.setOwner(UserGroupInformation.getCurrentUser().getShortUserName());
            targetTable.getTTable().putToParameters(HiveDataset.REGISTERER, GOBBLIN_DISTCP);
            targetTable.getTTable().putToParameters(HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS,
                    Long.toString(this.startTime));
            targetTable.getTTable().unsetCreateTime();

            HiveAvroCopyEntityHelper.updateTableAttributesIfAvro(targetTable, this);

            return targetTable;
        } catch (HiveException he) {
            throw new IOException(he);
        }
    }

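    /**
     * Adds {@link PostPublishStep}s that deregister the given partition in the target metastore and,
     * depending on the configured {@link DeregisterFileDeleteMethod}, delete the partition's files on
     * the target file system.
     * @return the next available step priority after the added steps.
     */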
    int addPartitionDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table,
            Partition partition) throws IOException {

        int stepPriority = initialPriority;
        Collection<Path> partitionPaths = Lists.newArrayList();

        if (this.deleteMethod == DeregisterFileDeleteMethod.RECURSIVE) {
            partitionPaths = Lists.newArrayList(partition.getDataLocation());
        } else if (this.deleteMethod == DeregisterFileDeleteMethod.INPUT_FORMAT) {
            InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());

            HiveLocationDescriptor targetLocation = new HiveLocationDescriptor(partition.getDataLocation(),
                    inputFormat, this.targetFs, this.dataset.getProperties());

            partitionPaths = targetLocation.getPaths().keySet();
        } else if (this.deleteMethod == DeregisterFileDeleteMethod.NO_DELETE) {
            partitionPaths = Lists.newArrayList();
        }

        if (!partitionPaths.isEmpty()) {
            DeleteFileCommitStep deletePaths = DeleteFileCommitStep.fromPaths(this.targetFs, partitionPaths,
                    this.dataset.getProperties(), table.getDataLocation());
            copyEntities.add(
                    new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deletePaths, stepPriority++));
        }

        PartitionDeregisterStep deregister = new PartitionDeregisterStep(table.getTTable(),
                partition.getTPartition(), this.targetURI, this.hiveRegProps);
        copyEntities
                .add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deregister, stepPriority++));
        return stepPriority;
    }

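    /**
     * Adds {@link PostPublishStep}s that deregister the given table in the target metastore and,
     * depending on the configured {@link DeregisterFileDeleteMethod}, delete the table's files on
     * the target file system.
     * @return the next available step priority after the added steps.
     */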
    @VisibleForTesting
    protected int addTableDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority,
            Table table) throws IOException {

        int stepPriority = initialPriority;
        Collection<Path> tablePaths = Lists.newArrayList();

        switch (this.getDeleteMethod()) {
        case RECURSIVE:
            tablePaths = Lists.newArrayList(table.getDataLocation());
            break;
        case INPUT_FORMAT:
            InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(table.getSd());

            HiveLocationDescriptor targetLocation = new HiveLocationDescriptor(table.getDataLocation(), inputFormat,
                    this.getTargetFs(), this.getDataset().getProperties());

            tablePaths = targetLocation.getPaths().keySet();
            break;
        case NO_DELETE:
            tablePaths = Lists.newArrayList();
            break;
        default:
            tablePaths = Lists.newArrayList();
        }

        if (!tablePaths.isEmpty()) {
            DeleteFileCommitStep deletePaths = DeleteFileCommitStep.fromPaths(this.getTargetFs(), tablePaths,
                    this.getDataset().getProperties(), table.getDataLocation());
            copyEntities.add(
                    new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deletePaths, stepPriority++));
        }

        TableDeregisterStep deregister = new TableDeregisterStep(table.getTTable(), this.getTargetURI(),
                this.getHiveRegProps());
        copyEntities
                .add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deregister, stepPriority++));
        return stepPriority;
    }

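    /**
     * Adds steps shared by all file sets of this table, currently only the table registration step
     * (if present), as a {@link PostPublishStep}.
     * @return the next available step priority after the added steps.
     */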
    int addSharedSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority) {
        int priority = initialPriority;
        if (this.tableRegistrationStep.isPresent()) {
            copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(),
                    this.tableRegistrationStep.get(), priority++));
        }
        return priority;
    }

    /**
     * Compares three entities to figure out which files should be copied and which files should be deleted in the target
     * file system.
     * @param sourceLocation Represents the source table or partition.
     * @param desiredTargetLocation Represents the new desired table or partition.
     * @param currentTargetLocation Represents the corresponding existing table or partition in the target hcat if it exists.
     * @param partition If present, contains partition information.
     * @return A {@link DiffPathSet} with data on files to copy and delete.
     * @throws IOException if the copy of this table / partition should be aborted.
     */
    @VisibleForTesting
    protected static DiffPathSet fullPathDiff(HiveLocationDescriptor sourceLocation,
            HiveLocationDescriptor desiredTargetLocation, Optional<HiveLocationDescriptor> currentTargetLocation,
            Optional<Partition> partition, MultiTimingEvent multiTimer, HiveCopyEntityHelper helper)
            throws IOException {

        DiffPathSet.DiffPathSetBuilder builder = DiffPathSet.builder();

        multiTimer.nextStage(Stages.SOURCE_PATH_LISTING);
        // These are the paths at the source
        Map<Path, FileStatus> sourcePaths = sourceLocation.getPaths();

        multiTimer.nextStage(Stages.TARGET_EXISTING_PATH_LISTING);
        // These are the paths that the existing target table / partition uses now
        Map<Path, FileStatus> targetExistingPaths = currentTargetLocation.isPresent()
                ? currentTargetLocation.get().getPaths()
                : Maps.<Path, FileStatus>newHashMap();

        multiTimer.nextStage(Stages.DESIRED_PATHS_LISTING);
        // These are the paths that exist at the destination and the new table / partition would pick up
        Map<Path, FileStatus> desiredTargetExistingPaths;
        try {
            desiredTargetExistingPaths = desiredTargetLocation.getPaths();
        } catch (InvalidInputException ioe) {
            // Thrown if inputFormat cannot find location in target. Since location doesn't exist, this set is empty.
            desiredTargetExistingPaths = Maps.newHashMap();
        }

        multiTimer.nextStage(Stages.PATH_DIFF);
        for (FileStatus sourcePath : sourcePaths.values()) {
            // For each source path
            Path newPath = helper.getTargetPathHelper().getTargetPath(sourcePath.getPath(),
                    desiredTargetLocation.getFileSystem(), partition, true);
            boolean shouldCopy = true;
            if (desiredTargetExistingPaths.containsKey(newPath)) {
                // If the file exists at the destination, check whether it should be replaced, if not, no need to copy
                FileStatus existingTargetStatus = desiredTargetExistingPaths.get(newPath);
                if (!helper.shouldReplaceFile(existingTargetStatus, sourcePath)) {
                    shouldCopy = false;
                }
            }
            if (shouldCopy) {
                builder.copyFile(sourcePath);
            } else {
                // if not copying, we want to keep the file in the target
                // at the end of this loop, all files in targetExistingPaths will be marked for deletion, so remove this file
                targetExistingPaths.remove(newPath);
                desiredTargetExistingPaths.remove(newPath);
            }
        }

        multiTimer.nextStage(Stages.COMPUTE_DELETE_PATHS);
        // At this point, targetExistingPaths contains paths that are managed by the existing target partition / table
        // but are no longer desired, so mark them for deletion
        for (Path delete : targetExistingPaths.keySet()) {
            builder.deleteFile(delete);
            desiredTargetExistingPaths.remove(delete);
        }

        // Now desiredTargetExistingPaths contains paths that we don't want, but which are not managed by the existing
        // table / partition.
        // Ideally, we shouldn't delete them (they're not managed by Hive), and we don't want to pick
        // them up in the new table / partition, so if there are any leftover files, we should abort copying
        // this table / partition.
        if (desiredTargetExistingPaths.size() > 0
                && helper.getUnmanagedDataPolicy() != UnmanagedDataPolicy.DELETE_UNMANAGED_DATA) {
            throw new IOException(String.format(
                    "New table / partition would pick up existing, undesired files in target file system. "
                            + "%s, files %s.",
                    partition.isPresent() ? partition.get().getCompleteName()
                            : helper.getDataset().getTable().getCompleteName(),
                    Arrays.toString(desiredTargetExistingPaths.keySet().toArray())));
        }
        // Unless the policy requires us to delete such un-managed files, in which case we add the leftover files
        // to the deletion list.
        else if (desiredTargetExistingPaths.size() > 0) {
            for (Path delete : desiredTargetExistingPaths.keySet()) {
                builder.deleteFile(delete);
            }
            log.warn(String.format(
                    "Un-managed files detected in target file system, however deleting them "
                            + "because of the policy: %s Files to be deleted are: %s",
                    UnmanagedDataPolicy.DELETE_UNMANAGED_DATA,
                    StringUtils.join(desiredTargetExistingPaths.keySet(), ",")));
        }

        return builder.build();
    }

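    /**
     * A file on the target should be replaced if its length differs from the source file's
     * or if the source file has a newer modification time.
     */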
    private static boolean shouldReplaceFile(FileStatus referencePath, FileStatus replacementFile) {
        return replacementFile.getLen() != referencePath.getLen()
                || referencePath.getModificationTime() < replacementFile.getModificationTime();
    }

    private void checkPartitionedTableCompatibility(Table desiredTargetTable, Table existingTargetTable)
            throws IOException {
        if (!desiredTargetTable.getDataLocation().equals(existingTargetTable.getDataLocation())) {
            throw new HiveTableLocationNotMatchException(desiredTargetTable.getDataLocation(),
                    existingTargetTable.getDataLocation());
        }

        if (HiveUtils.isPartitioned(desiredTargetTable) != HiveUtils.isPartitioned(existingTargetTable)) {
            throw new IOException(String.format(
                    "%s: Desired target table %s partitioned, existing target table %s partitioned. Tables are incompatible.",
                    this.dataset.tableIdentifier, HiveUtils.isPartitioned(desiredTargetTable) ? "is" : "is not",
                    HiveUtils.isPartitioned(existingTargetTable) ? "is" : "is not"));
        }
        if (desiredTargetTable.isPartitioned()
                && !desiredTargetTable.getPartitionKeys().equals(existingTargetTable.getPartitionKeys())) {
            throw new IOException(String.format(
                    "%s: Desired target table has partition keys %s, existing target table has partition  keys %s. "
                            + "Tables are incompatible.",
                    this.dataset.tableIdentifier, gson.toJson(desiredTargetTable.getPartitionKeys()),
                    gson.toJson(existingTargetTable.getPartitionKeys())));
        }
    }

    /**
     * Get builders for a {@link CopyableFile} for each file referred to by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
     */
    List<CopyableFile.Builder> getCopyableFilesFromPaths(Collection<FileStatus> paths,
            CopyConfiguration configuration, Optional<Partition> partition) throws IOException {
        List<CopyableFile.Builder> builders = Lists.newArrayList();
        List<SourceAndDestination> dataFiles = Lists.newArrayList();

        Configuration hadoopConfiguration = new Configuration();
        FileSystem actualSourceFs = null;
        String referenceScheme = null;
        String referenceAuthority = null;

        for (FileStatus status : paths) {
            dataFiles.add(new SourceAndDestination(status,
                    getTargetPathHelper().getTargetPath(status.getPath(), this.targetFs, partition, true)));
        }

        for (SourceAndDestination sourceAndDestination : dataFiles) {

            URI uri = sourceAndDestination.getSource().getPath().toUri();
            if (actualSourceFs == null || !StringUtils.equals(referenceScheme, uri.getScheme())
                    || !StringUtils.equals(referenceAuthority, uri.getAuthority())) {
                actualSourceFs = sourceAndDestination.getSource().getPath().getFileSystem(hadoopConfiguration);
                referenceScheme = uri.getScheme();
                referenceAuthority = uri.getAuthority();
            }

            if (!this.dataset.getTableRootPath().isPresent()) {
                // The logic for computing ancestor owner and permissions for hive copies depends on tables having a non-glob
                // location. Currently, this restriction is also imposed by Hive, so this is not a problem. If this ever changes
                // on the Hive side, and we try to copy a table with a glob location, this logic will have to change.
                throw new IOException(String.format("Table %s does not have a concrete table root path.",
                        this.dataset.getTable().getCompleteName()));
            }
            List<OwnerAndPermission> ancestorOwnerAndPermission = CopyableFile
                    .resolveReplicatedOwnerAndPermissionsRecursively(actualSourceFs,
                            sourceAndDestination.getSource().getPath().getParent(),
                            this.dataset.getTableRootPath().get().getParent(), configuration);

            builders.add(CopyableFile
                    .fromOriginAndDestination(actualSourceFs, sourceAndDestination.getSource(),
                            sourceAndDestination.getDestination(), configuration)
                    .ancestorsOwnerAndPermission(ancestorOwnerAndPermission));
        }

        return builders;
    }

    /**
     * Compute the target location for a Hive location.
     * @param sourceFs Source {@link FileSystem}.
     * @param targetFs Target {@link FileSystem}.
     * @param path Source {@link Path} of the Hive location.
     * @param partition partition these paths correspond to.
     * @return transformed location in the target.
     * @throws IOException if a single target location cannot be generated.
     */
    Path getTargetLocation(FileSystem sourceFs, FileSystem targetFs, Path path, Optional<Partition> partition)
            throws IOException {
        return getTargetPathHelper().getTargetPath(path, targetFs, partition, false);
    }

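    /**
     * Replaces a prefix of {@code sourcePath} (ignoring scheme and authority) with a new prefix.
     * For example (illustrative paths only), replacing the prefix /data/source with /data/target turns
     * hdfs://cluster/data/source/db/table/file into /data/target/db/table/file.
     */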
    protected static Path replacedPrefix(Path sourcePath, Path prefixTobeReplaced, Path prefixReplacement) {
        Path sourcePathWithoutSchemeAndAuthority = PathUtils.getPathWithoutSchemeAndAuthority(sourcePath);
        Preconditions.checkArgument(PathUtils.isAncestor(prefixTobeReplaced, sourcePathWithoutSchemeAndAuthority),
                "When replacing prefix, all locations must be descendants of the prefix. "
                        + "The prefix: %s, file location: %s.",
                prefixTobeReplaced, sourcePathWithoutSchemeAndAuthority);
        Path relativePath = PathUtils.relativizePath(sourcePathWithoutSchemeAndAuthority, prefixTobeReplaced);
        Path result = new Path(prefixReplacement, relativePath);
        return result;
    }

    public FileSystem getTargetFileSystem() {
        return this.targetFs;
    }
}