org.apache.rya.accumulo.mr.merge.CopyTool.java Source code

Introduction

Here is the source code for org.apache.rya.accumulo.mr.merge.CopyTool.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.rya.accumulo.mr.merge;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.ClientConfiguration;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Instance;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.client.mapreduce.AbstractInputFormat;
import org.apache.accumulo.core.client.mapreduce.AccumuloFileOutputFormat;
import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapreduce.AccumuloMultiTableInputFormat;
import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
import org.apache.accumulo.core.client.mapreduce.InputFormatBase;
import org.apache.accumulo.core.client.mapreduce.InputTableConfig;
import org.apache.accumulo.core.client.mapreduce.lib.partition.KeyRangePartitioner;
import org.apache.accumulo.core.client.mock.MockInstance;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.file.rfile.bcfile.Compression.Algorithm;
import org.apache.accumulo.core.iterators.user.AgeOffFilter;
import org.apache.accumulo.core.iterators.user.TimestampFilter;
import org.apache.accumulo.core.security.TablePermission;
import org.apache.accumulo.core.util.TextUtil;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.apache.rya.accumulo.AccumuloRdfConfiguration;
import org.apache.rya.accumulo.mr.AccumuloHDFSFileInputFormat;
import org.apache.rya.accumulo.mr.MRUtils;
import org.apache.rya.accumulo.mr.merge.common.InstanceType;
import org.apache.rya.accumulo.mr.merge.mappers.AccumuloCopyToolMapper;
import org.apache.rya.accumulo.mr.merge.mappers.AccumuloRyaRuleMapper;
import org.apache.rya.accumulo.mr.merge.mappers.FileCopyToolMapper;
import org.apache.rya.accumulo.mr.merge.mappers.MergeToolMapper;
import org.apache.rya.accumulo.mr.merge.mappers.RowRuleMapper;
import org.apache.rya.accumulo.mr.merge.reducers.MultipleFileReducer;
import org.apache.rya.accumulo.mr.merge.util.AccumuloInstanceDriver;
import org.apache.rya.accumulo.mr.merge.util.AccumuloQueryRuleset;
import org.apache.rya.accumulo.mr.merge.util.AccumuloRyaUtils;
import org.apache.rya.accumulo.mr.merge.util.GroupedRow;
import org.apache.rya.accumulo.mr.merge.util.TimeUtils;
import org.apache.rya.accumulo.mr.merge.util.ToolConfigUtils;
import org.apache.rya.api.RdfCloudTripleStoreConstants;
import org.apache.rya.api.RdfCloudTripleStoreUtils;
import org.apache.rya.api.layout.TablePrefixLayoutStrategy;
import org.apache.rya.indexing.accumulo.ConfigUtils;

import com.google.common.base.Joiner;

/**
 * Handles copying data from a parent instance into a child instance.
 */
public class CopyTool extends AbstractDualInstanceAccumuloMRTool {
    private static final Logger log = Logger.getLogger(CopyTool.class);

    /**
     * Use this property to set the tables that are going to be copied.  The list should
     * be a comma-separated string containing the full table names.  If not set, then all
     * tables will be copied.
     */
    public static final String COPY_TABLE_LIST_PROP = "copy.table.list";

    /**
     * Indicates the type of child instance to create.  {@code null} or empty to not create an
     * instance indicating that it already was created and exists.
     */
    public static final String CREATE_CHILD_INSTANCE_TYPE_PROP = "create.child.instance.type";

    /**
     * The time difference between the parent machine and the time server.
     */
    public static final String PARENT_TIME_OFFSET_PROP = "time.offset";

    /**
     * The time difference between the child machine and the time server.
     */
    public static final String CHILD_TIME_OFFSET_PROP = "time.offset.child";

    /**
     * The host name of the time server to use.
     */
    public static final String NTP_SERVER_HOST_PROP = "ntp.server.host";

    /**
     * The URL of the Apache Tomcat server web page running on the parent machine.
     */
    public static final String PARENT_TOMCAT_URL_PROP = "tomcat.url";

    /**
     * The URL of the Apache Tomcat server web page running on the child machine.
     */
    public static final String CHILD_TOMCAT_URL_PROP = "tomcat.url.child";

    /**
     * The run time of the copy process.
     */
    public static final String COPY_RUN_TIME_PROP = "copy.run.time";

    /**
     * "true" to use the NTP server to handle time synchronization.
     * "false" (or any other value) to not use the NTP server.
     */
    public static final String USE_NTP_SERVER_PROP = "use.ntp.server";

    /**
     * "true" to use file output. "false" to use Accumulo output.
     */
    public static final String USE_COPY_FILE_OUTPUT = "use.copy.file.output";

    /**
     * The file path to output the child data to.
     */
    public static final String COPY_FILE_OUTPUT_PATH = "copy.file.output.path";

    /**
     * The compression type to use for file output.  One of "none", "gz", "lzo", or "snappy".
     */
    public static final String COPY_FILE_OUTPUT_COMPRESSION_TYPE = "copy.file.output.compression.type";

    /**
     * "true" to clear the file output directory before copying. "false" to leave the output directory alone.
     */
    public static final String USE_COPY_FILE_OUTPUT_DIRECTORY_CLEAR = "use.copy.file.output.directory.clear";

    /**
     * The input directory for importing files into accumulo tables.
     */
    public static final String COPY_FILE_IMPORT_DIRECTORY = "copy.file.import.directory";

    /**
     * "true" to read from the input directory. "false" otherwise.
     */
    public static final String USE_COPY_FILE_IMPORT = "use.copy.file.import";

    /**
     * "true" to extract a set of rules from a SPARQL query, and only copy statements relevant to those rules. "false" otherwise.
     * If set, either the query itself or a query file should also be provided.
     */
    public static final String USE_COPY_QUERY_SPARQL = "use.copy.query.sparql";

    /**
     * The text of the query that defines which statements to copy.
     */
    public static final String QUERY_STRING_PROP = "ac.copy.query";

    /**
     * The path to a file containing the query that defines which statements to copy.
     */
    public static final String QUERY_FILE_PROP = "ac.copy.queryfile";

    /**
     *  startTime is the time of the data to copy. Only parent data AFTER the selected time will be copied to the child.
     */
    private String startTime = null;
    private boolean useCopyFileOutput = false;
    private String baseOutputDir = null;
    private String localBaseOutputDir = null;
    private String compressionType = null;
    private boolean useCopyFileOutputDirectoryClear = false;
    private String tempDir = null;
    private boolean useCopyFileImport = false;
    private boolean useQuery = false;
    private String localCopyFileImportDir = null;
    private String baseImportDir = null;

    private final List<String> tables = new ArrayList<>();

    private AccumuloInstanceDriver childAccumuloInstanceDriver = null;

    /**
     * Sets up and initializes the copy tool's configuration.
     * @throws Exception
     */
    public void setup() throws Exception {
        super.init();

        tempDir = conf.get("hadoop.tmp.dir", null);
        if (tempDir == null) {
            throw new Exception(
                    "Invalid hadoop temp directory. \"hadoop.tmp.dir\" could not be found in the configuration.");
        }

        useCopyFileOutput = conf.getBoolean(USE_COPY_FILE_OUTPUT, false);
        baseOutputDir = tempDir + "/copy_tool_file_output/";
        localBaseOutputDir = conf.get(COPY_FILE_OUTPUT_PATH, null);
        compressionType = conf.get(COPY_FILE_OUTPUT_COMPRESSION_TYPE, null);
        useCopyFileOutputDirectoryClear = conf.getBoolean(USE_COPY_FILE_OUTPUT_DIRECTORY_CLEAR, false);
        localCopyFileImportDir = conf.get(COPY_FILE_IMPORT_DIRECTORY, null);
        baseImportDir = tempDir + "/copy_tool_import/";

        startTime = conf.get(MergeTool.START_TIME_PROP, null);

        if (!useCopyFileImport) {
            if (startTime != null) {
                try {
                    final Date date = MergeTool.START_TIME_FORMATTER.parse(startTime);
                    log.info("Will copy all data after " + date);
                } catch (final ParseException e) {
                    throw new Exception("Unable to parse the provided start time: " + startTime, e);
                }
            }

            Date copyRunTime = new Date();
            final boolean useTimeSync = conf.getBoolean(USE_NTP_SERVER_PROP, false);
            if (useTimeSync) {
                final String tomcatUrl = conf.get(PARENT_TOMCAT_URL_PROP, null);
                final String ntpServerHost = conf.get(NTP_SERVER_HOST_PROP, null);
                Long timeOffset = null;
                Date ntpDate = null;
                try {
                    log.info("Comparing parent machine's time to NTP server time...");
                    ntpDate = TimeUtils.getNtpServerDate(ntpServerHost);
                    final Date parentMachineDate = TimeUtils.getMachineDate(tomcatUrl);
                    final boolean isMachineLocal = TimeUtils.isUrlLocalMachine(tomcatUrl);
                    timeOffset = TimeUtils.getTimeDifference(ntpDate, parentMachineDate, isMachineLocal);
                } catch (IOException | ParseException e) {
                    throw new Exception("Unable to get time difference between machine and NTP server.", e);
                }
                if (timeOffset != null) {
                    conf.set(PARENT_TIME_OFFSET_PROP, "" + timeOffset);
                }
                copyRunTime = ntpDate;
            }
            final String copyRunTimeString = MergeTool.START_TIME_FORMATTER.format(copyRunTime);
            if (copyRunTime != null) {
                conf.set(COPY_RUN_TIME_PROP, copyRunTimeString);
            }
        }

        MergeTool.setDuplicateKeys(conf);

        final String copyTableListProperty = conf.get(COPY_TABLE_LIST_PROP);
        if (StringUtils.isNotBlank(copyTableListProperty)) {
            // Copy the tables specified in the config
            final String[] split = copyTableListProperty.split(",");
            tables.addAll(Arrays.asList(split));
        } else if (useCopyFileImport) {
            final File importDir = new File(localCopyFileImportDir);
            final String[] files = importDir.list();
            tables.addAll(Arrays.asList(files));
        } else {
            // By default copy all tables
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_OSP_SUFFIX);
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_PO_SUFFIX);
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_NS_SUFFIX);
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_EVAL_SUFFIX);
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_STATS_SUFFIX);
            tables.add(tablePrefix + RdfCloudTripleStoreConstants.TBL_SEL_SUFFIX);
            /* TODO: SEE RYA-160
            tables.add(ConfigUtils.getFreeTextDocTablename(conf));
            tables.add(ConfigUtils.getFreeTextTermTablename(conf));
            tables.add(ConfigUtils.getGeoTablename(conf));
            tables.add(ConfigUtils.getTemporalTableName(conf));
            tables.add(ConfigUtils.getEntityTableName(conf));
            */
        }
        if (tables.isEmpty()) {
            log.warn("No list of tables to copy was provided.");
        } else {
            final String tablesToCopy = Joiner.on("\r\n\t").join(tables);
            log.info("Will attempt to copy the following tables/indices from the parent:\r\n\t" + tablesToCopy);
        }
    }

    @Override
    public int run(final String[] strings) throws Exception {
        useCopyFileImport = conf.getBoolean(USE_COPY_FILE_IMPORT, false);
        useQuery = conf.getBoolean(USE_COPY_QUERY_SPARQL, false);

        if (useCopyFileImport) {
            return runImport();
        } else if (useQuery) {
            return runQueryCopy();
        } else {
            return runCopy();
        }
    }

    private int runCopy() throws Exception {
        log.info("Setting up Copy Tool...");

        setup();

        if (!useCopyFileOutput) {
            createChildInstance(conf);
        }

        final AccumuloRdfConfiguration parentAccumuloRdfConfiguration = new AccumuloRdfConfiguration(conf);
        parentAccumuloRdfConfiguration.setTablePrefix(tablePrefix);
        final Connector parentConnector = AccumuloRyaUtils.setupConnector(parentAccumuloRdfConfiguration);
        final TableOperations parentTableOperations = parentConnector.tableOperations();

        for (final String table : tables) {
            // Check if the parent table exists before creating a job on it
            if (parentTableOperations.exists(table)) {
                final String childTable = table.replaceFirst(tablePrefix, childTablePrefix);
                final String jobName = "Copy Tool, copying Parent Table: " + table + ", into Child Table: "
                        + childTable + ", " + System.currentTimeMillis();
                log.info("Initializing job: " + jobName);
                conf.set(MRUtils.JOB_NAME_PROP, jobName);
                conf.set(MergeTool.TABLE_NAME_PROP, table);

                final Job job = Job.getInstance(conf);
                job.setJarByClass(CopyTool.class);

                setupAccumuloInput(job);

                InputFormatBase.setInputTableName(job, table);

                // Set input output of the particular job
                if (useCopyFileOutput) {
                    job.setMapOutputKeyClass(Key.class);
                    job.setMapOutputValueClass(Value.class);
                    job.setOutputKeyClass(Key.class);
                    job.setOutputValueClass(Value.class);
                } else {
                    job.setMapOutputKeyClass(Text.class);
                    job.setMapOutputValueClass(Mutation.class);
                    job.setOutputKeyClass(Text.class);
                    job.setOutputValueClass(Mutation.class);
                }

                setupAccumuloOutput(job, childTable);

                // Set mapper and reducer classes
                if (useCopyFileOutput) {
                    setupSplitsFile(job, parentTableOperations, table, childTable);
                    job.setMapperClass(FileCopyToolMapper.class);
                } else {
                    job.setMapperClass(AccumuloCopyToolMapper.class);
                }
                job.setReducerClass(Reducer.class);

                // Submit the job
                final Date beginTime = new Date();
                log.info("Job for table \"" + table + "\" started: " + beginTime);
                final int exitCode = job.waitForCompletion(true) ? 0 : 1;

                if (exitCode == 0) {
                    if (useCopyFileOutput) {
                        log.info("Moving data from HDFS to the local file system for the table: " + childTable);
                        final Path hdfsPath = getPath(baseOutputDir, childTable);
                        final Path localPath = getPath(localBaseOutputDir, childTable);
                        log.info("HDFS directory: " + hdfsPath.toString());
                        log.info("Local directory: " + localPath.toString());
                        copyHdfsToLocal(hdfsPath, localPath);
                    }

                    final Date endTime = new Date();
                    log.info("Job for table \"" + table + "\" finished: " + endTime);
                    log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
                } else {
                    log.error("Job for table \"" + table + "\" Failed!!!");
                    return exitCode;
                }
            } else {
                log.warn("The table \"" + table + "\" was NOT found in the parent instance and cannot be copied.");
            }
        }

        return 0;
    }

    private int runImport() throws Exception {
        log.info("Setting up Copy Tool for importing...");

        setup();

        createChildInstance(conf);

        for (final String childTable : tables) {
            final String jobName = "Copy Tool, importing Exported Parent Table files from: "
                    + getPath(localCopyFileImportDir, childTable).toString() + ", into Child Table: " + childTable
                    + ", " + System.currentTimeMillis();
            log.info("Initializing job: " + jobName);
            conf.set(MRUtils.JOB_NAME_PROP, jobName);

            // Submit the job
            final Date beginTime = new Date();
            log.info("Job for table \"" + childTable + "\" started: " + beginTime);

            createTableIfNeeded(childTable);
            importFilesToChildTable(childTable);

            final Date endTime = new Date();
            log.info("Job for table \"" + childTable + "\" finished: " + endTime);
            log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
        }

        return 0;
    }

    private int runQueryCopy() throws Exception {
        log.info("Setting up Copy Tool with a query-based ruleset...");
        setup();
        if (!useCopyFileOutput) {
            createChildInstance(conf);
        }

        // Set up the configuration
        final AccumuloRdfConfiguration aconf = new AccumuloRdfConfiguration(conf);
        aconf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
        aconf.setTablePrefix(tablePrefix);
        aconf.setFlush(false);
        ConfigUtils.setIndexers(aconf);

        // Since we're copying at the statement-level, ignore any given list of tables and determine
        // which tables we might need to create based on which indexers are desired.
        final TablePrefixLayoutStrategy prefixStrategy = new TablePrefixLayoutStrategy(tablePrefix);
        tables.clear();
        // Always include core tables
        tables.add(prefixStrategy.getSpo());
        tables.add(prefixStrategy.getOsp());
        tables.add(prefixStrategy.getPo());
        // Copy namespaces if they exist
        tables.add(prefixStrategy.getNs());
        // Add tables associated with any configured indexers
        /* TODO: SEE RYA-160
        if (aconf.getBoolean(ConfigUtils.USE_FREETEXT, false)) {
        tables.add(ConfigUtils.getFreeTextDocTablename(conf));
        tables.add(ConfigUtils.getFreeTextTermTablename(conf));
        }
        if (aconf.getBoolean(ConfigUtils.USE_GEO, false)) {
        tables.add(ConfigUtils.getGeoTablename(conf));
        }
        if (aconf.getBoolean(ConfigUtils.USE_TEMPORAL, false)) {
        tables.add(ConfigUtils.getTemporalTableName(conf));
        }
        if (aconf.getBoolean(ConfigUtils.USE_ENTITY, false)) {
        tables.add(ConfigUtils.getEntityTableName(conf));
        }
        */
        // Ignore anything else, e.g. statistics -- must be recalculated for the child if desired

        // Extract the ruleset, and copy the namespace table directly
        final AccumuloQueryRuleset ruleset = new AccumuloQueryRuleset(aconf);
        ruleset.addTable(prefixStrategy.getNs());
        for (final String line : ruleset.toString().split("\n")) {
            log.info(line);
        }

        // Create a Job and configure its input and output
        final Job job = Job.getInstance(aconf);
        job.setJarByClass(this.getClass());
        setupMultiTableInputFormat(job, ruleset);
        setupAccumuloOutput(job, "");

        if (useCopyFileOutput) {
            // Configure job for file output
            job.setJobName("Ruleset-based export to file: " + tablePrefix + " -> " + localBaseOutputDir);
            // Map (row) to (table+key, key+value)
            job.setMapperClass(RowRuleMapper.class);
            job.setMapOutputKeyClass(GroupedRow.class);
            job.setMapOutputValueClass(GroupedRow.class);
            // Group according to table and and sort according to key
            job.setGroupingComparatorClass(GroupedRow.GroupComparator.class);
            job.setSortComparatorClass(GroupedRow.SortComparator.class);
            // Reduce ([table+row], rows): output each row to the file for that table, in sorted order
            job.setReducerClass(MultipleFileReducer.class);
            job.setOutputKeyClass(Key.class);
            job.setOutputValueClass(Value.class);
        } else {
            // Configure job for table output
            job.setJobName("Ruleset-based copy: " + tablePrefix + " -> " + childTablePrefix);
            // Map (row): convert to statement, insert to child (for namespace table, output row directly)
            job.setMapperClass(AccumuloRyaRuleMapper.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Mutation.class);
            job.setNumReduceTasks(0);
            // Create the child tables, so mappers don't try to do this in parallel
            for (final String parentTable : tables) {
                final String childTable = parentTable.replaceFirst(tablePrefix, childTablePrefix);
                createTableIfNeeded(childTable);
            }
        }

        // Run the job and copy files to local filesystem if needed
        final Date beginTime = new Date();
        log.info("Job started: " + beginTime);
        final boolean success = job.waitForCompletion(true);
        if (success) {
            if (useCopyFileOutput) {
                log.info("Moving data from HDFS to the local file system");
                final Path baseOutputPath = new Path(baseOutputDir);
                for (final FileStatus status : FileSystem.get(conf).listStatus(baseOutputPath)) {
                    if (status.isDirectory()) {
                        final String tableName = status.getPath().getName();
                        final Path hdfsPath = getPath(baseOutputDir, tableName);
                        final Path localPath = getPath(localBaseOutputDir, tableName);
                        log.info("HDFS directory: " + hdfsPath.toString());
                        log.info("Local directory: " + localPath.toString());
                        copyHdfsToLocal(hdfsPath, localPath);
                    }
                }
            }
            final Date endTime = new Date();
            log.info("Job finished: " + endTime);
            log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
            return 0;
        } else {
            log.error("Job failed!!!");
            return 1;
        }
    }

    /**
     * Creates the child table if it doesn't already exist.
     * @param childTableName the name of the child table.
     * @throws IOException
     */
    public void createTableIfNeeded(final String childTableName) throws IOException {
        try {
            final Configuration childConfig = MergeToolMapper.getChildConfig(conf);
            final AccumuloRdfConfiguration childAccumuloRdfConfiguration = new AccumuloRdfConfiguration(
                    childConfig);
            childAccumuloRdfConfiguration.setTablePrefix(childTablePrefix);
            final Connector childConnector = AccumuloRyaUtils.setupConnector(childAccumuloRdfConfiguration);
            if (!childConnector.tableOperations().exists(childTableName)) {
                log.info("Creating table: " + childTableName);
                childConnector.tableOperations().create(childTableName);
                log.info("Created table: " + childTableName);
                log.info("Granting authorizations to table: " + childTableName);
                childConnector.securityOperations().grantTablePermission(childUserName, childTableName,
                        TablePermission.WRITE);
                log.info("Granted authorizations to table: " + childTableName);
            }
        } catch (TableExistsException | AccumuloException | AccumuloSecurityException e) {
            throw new IOException(e);
        }
    }

    private void setupSplitsFile(final Job job, final TableOperations parentTableOperations,
            final String parentTableName, final String childTableName) throws Exception {
        final FileSystem fs = FileSystem.get(conf);
        fs.setPermission(getPath(baseOutputDir, childTableName),
                new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
        final Path splitsPath = getPath(baseOutputDir, childTableName, "splits.txt");
        final Collection<Text> splits = parentTableOperations.listSplits(parentTableName, 100);
        log.info("Creating splits file at: " + splitsPath);
        try (PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsPath)), false,
                StandardCharsets.UTF_8.name())) {
            for (final Text split : splits) {
                final String encoded = new String(Base64.encodeBase64(TextUtil.getBytes(split)),
                        StandardCharsets.UTF_8);
                out.println(encoded);
            }
        }
        fs.setPermission(splitsPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

        final String userDir = System.getProperty("user.dir");
        // The splits file has a symlink created in the user directory for some reason.
        // It might be better to copy the entire file for Windows but it doesn't seem to matter if
        // the user directory symlink is broken.
        java.nio.file.Files.deleteIfExists(new File(userDir, "splits.txt").toPath());
        //Files.copy(new File(splitsPath.toString()), new File(userDir, "splits.txt"));
        job.setPartitionerClass(KeyRangePartitioner.class);
        KeyRangePartitioner.setSplitFile(job, splitsPath.toString());
        job.setNumReduceTasks(splits.size() + 1);
    }

    /**
     * Converts a path string, or a sequence of strings that when joined form a path string,
     * to a {@link org.apache.hadoop.fs.Path}.
     * @param first The path string or initial part of the path string.
     * @param more Additional strings to be joined to form the path string.
     * @return the resulting {@link org.apache.hadoop.fs.Path}.
     */
    public static Path getPath(final String first, final String... more) {
        final java.nio.file.Path path = Paths.get(first, more);
        final String stringPath = FilenameUtils.separatorsToUnix(path.toAbsolutePath().toString());
        final Path hadoopPath = new Path(stringPath);
        return hadoopPath;
    }

    /**
     * Imports the files that hold the table data into the child instance.
     * @param childTableName the name of the child table to import.
     * @throws Exception
     */
    public void importFilesToChildTable(final String childTableName) throws Exception {
        final Configuration childConfig = MergeToolMapper.getChildConfig(conf);
        final AccumuloRdfConfiguration childAccumuloRdfConfiguration = new AccumuloRdfConfiguration(childConfig);
        childAccumuloRdfConfiguration.setTablePrefix(childTablePrefix);
        final Connector childConnector = AccumuloRyaUtils.setupConnector(childAccumuloRdfConfiguration);
        final TableOperations childTableOperations = childConnector.tableOperations();

        final Path localWorkDir = getPath(localCopyFileImportDir, childTableName);
        final Path hdfsBaseWorkDir = getPath(baseImportDir, childTableName);

        final FileSystem fs = FileSystem.get(conf);
        if (fs.exists(hdfsBaseWorkDir)) {
            fs.delete(hdfsBaseWorkDir, true);
        }

        log.info("Importing from the local directory: " + localWorkDir);
        log.info("Importing to the HDFS directory: " + hdfsBaseWorkDir);
        copyLocalToHdfs(localWorkDir, hdfsBaseWorkDir);

        final Path files = getPath(hdfsBaseWorkDir.toString(), "files");
        final Path failures = getPath(hdfsBaseWorkDir.toString(), "failures");

        // With HDFS permissions on, we need to make sure the Accumulo user can read/move the files
        final FsShell shell = new FsShell(conf);
        shell.run(new String[] { "-chmod", "777", hdfsBaseWorkDir.toString() });
        if (fs.exists(failures)) {
            fs.delete(failures, true);
        }
        fs.mkdirs(failures);

        childTableOperations.importDirectory(childTableName, files.toString(), failures.toString(), false);
    }

    /**
     * Copies the file from the local file system into the HDFS.
     * @param localInputPath the local system input {@link Path}.
     * @param hdfsOutputPath the HDFS output {@link Path}.
     * @throws IOException
     */
    public void copyLocalToHdfs(final Path localInputPath, final Path hdfsOutputPath) throws IOException {
        copyLocalToHdfs(localInputPath, hdfsOutputPath, conf);
    }

    /**
     * Copies the file from the local file system into the HDFS.
     * @param localInputPath the local system input {@link Path}.
     * @param hdfsOutputPath the HDFS output {@link Path}.
     * @param configuration the {@link Configuration} to use.
     * @throws IOException
     */
    public static void copyLocalToHdfs(final Path localInputPath, final Path hdfsOutputPath,
            final Configuration configuration) throws IOException {
        final FileSystem fs = FileSystem.get(configuration);
        fs.copyFromLocalFile(localInputPath, hdfsOutputPath);
    }

    /**
     * Copies the file from HDFS into the local file system.
     * @param hdfsInputPath the HDFS input {@link Path}.
     * @param localOutputPath the local system output {@link Path}.
     * @throws IOException
     */
    public void copyHdfsToLocal(final Path hdfsInputPath, final Path localOutputPath) throws IOException {
        copyHdfsToLocal(hdfsInputPath, localOutputPath, conf);
    }

    /**
     * Copies the file from HDFS into the local file system.
     * @param hdfsInputPath the HDFS input {@link Path}.
     * @param localOutputPath the local system output {@link Path}.
     * @param configuration the {@link Configuration} to use.
     * @throws IOException
     */
    public static void copyHdfsToLocal(final Path hdfsInputPath, final Path localOutputPath,
            final Configuration configuration) throws IOException {
        final FileSystem fs = FileSystem.get(configuration);
        fs.copyToLocalFile(hdfsInputPath, localOutputPath);
    }

    @Override
    protected void setupAccumuloInput(final Job job) throws AccumuloSecurityException {
        if (useCopyFileImport) {
            try {
                FileInputFormat.setInputPaths(job, localCopyFileImportDir);
            } catch (final IOException e) {
                log.error("Failed to set copy file import directory", e);
            }
        } else {
            // set up accumulo input
            if (!hdfsInput) {
                job.setInputFormatClass(AccumuloInputFormat.class);
            } else {
                job.setInputFormatClass(AccumuloHDFSFileInputFormat.class);
            }
            AbstractInputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd));
            InputFormatBase.setInputTableName(job,
                    RdfCloudTripleStoreUtils.layoutPrefixToTable(rdfTableLayout, tablePrefix));
            AbstractInputFormat.setScanAuthorizations(job, authorizations);
            if (!mock) {
                AbstractInputFormat.setZooKeeperInstance(job,
                        new ClientConfiguration().withInstance(instance).withZkHosts(zk));
            } else {
                AbstractInputFormat.setMockInstance(job, instance);
            }
            if (ttl != null) {
                final IteratorSetting setting = new IteratorSetting(1, "fi", AgeOffFilter.class);
                AgeOffFilter.setTTL(setting, Long.valueOf(ttl));
                InputFormatBase.addIterator(job, setting);
            }
            if (startTime != null) {
                final IteratorSetting setting = getStartTimeSetting(startTime);
                InputFormatBase.addIterator(job, setting);
            }
            for (final IteratorSetting iteratorSetting : AccumuloRyaUtils.COMMON_REG_EX_FILTER_SETTINGS) {
                InputFormatBase.addIterator(job, iteratorSetting);
            }
        }
    }

    /**
     * Set up job to use AccumuloMultiTableInput format, using the tables/ranges given by a ruleset.
     * @param job The Job to configure
     * @param rules The ruleset mapping a query to the appropriate tables and ranges
     */
    protected void setupMultiTableInputFormat(final Job job, final AccumuloQueryRuleset rules)
            throws AccumuloSecurityException {
        AbstractInputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd));
        AbstractInputFormat.setScanAuthorizations(job, authorizations);
        if (!mock) {
            AbstractInputFormat.setZooKeeperInstance(job,
                    new ClientConfiguration().withInstance(instance).withZkHosts(zk));
        } else {
            AbstractInputFormat.setMockInstance(job, instance);
        }
        final Map<String, InputTableConfig> configs = rules.getInputConfigs();
        // Add any relevant iterator settings
        final List<IteratorSetting> additionalSettings = new LinkedList<>(
                AccumuloRyaUtils.COMMON_REG_EX_FILTER_SETTINGS);
        if (ttl != null) {
            final IteratorSetting ttlSetting = new IteratorSetting(1, "fi", AgeOffFilter.class);
            AgeOffFilter.setTTL(ttlSetting, Long.valueOf(ttl));
            additionalSettings.add(ttlSetting);
        }
        if (startTime != null) {
            final IteratorSetting startTimeSetting = getStartTimeSetting(startTime);
            additionalSettings.add(startTimeSetting);
        }
        for (final Map.Entry<String, InputTableConfig> entry : configs.entrySet()) {
            final List<IteratorSetting> iterators = entry.getValue().getIterators();
            iterators.addAll(additionalSettings);
            entry.getValue().setIterators(iterators);
        }
        // Set the input format
        AccumuloMultiTableInputFormat.setInputTableConfigs(job, configs);
        job.setInputFormatClass(AccumuloMultiTableInputFormat.class);
    }

    @Override
    protected void setupAccumuloOutput(final Job job, final String outputTable) throws AccumuloSecurityException {
        AccumuloOutputFormat.setConnectorInfo(job, childUserName, new PasswordToken(childPwd));
        AccumuloOutputFormat.setCreateTables(job, true);
        AccumuloOutputFormat.setDefaultTableName(job, outputTable);
        if (!childMock) {
            AccumuloOutputFormat.setZooKeeperInstance(job,
                    new ClientConfiguration().withInstance(childInstance).withZkHosts(childZk));
        } else {
            AccumuloOutputFormat.setMockInstance(job, childInstance);
        }
        if (useCopyFileOutput) {
            log.info("Using file output format mode.");
            if (StringUtils.isNotBlank(baseOutputDir)) {
                Path baseOutputPath;
                Path filesOutputPath;
                if (StringUtils.isNotBlank(outputTable)) {
                    filesOutputPath = getPath(baseOutputDir, outputTable, "files");
                    baseOutputPath = filesOutputPath.getParent();
                    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
                } else {
                    // If table name is not given, configure output for one level higher:
                    // it's up to the job to handle subdirectories. Make sure the parent
                    // exists.
                    filesOutputPath = getPath(baseOutputDir);
                    baseOutputPath = filesOutputPath;
                    LazyOutputFormat.setOutputFormatClass(job, AccumuloFileOutputFormat.class);
                    MultipleOutputs.setCountersEnabled(job, true);
                }
                log.info("File output destination: " + filesOutputPath);
                if (useCopyFileOutputDirectoryClear) {
                    try {
                        clearOutputDir(baseOutputPath);
                    } catch (final IOException e) {
                        log.error("Error clearing out output path.", e);
                    }
                }
                try {
                    final FileSystem fs = FileSystem.get(conf);
                    fs.mkdirs(filesOutputPath.getParent());
                    fs.setPermission(filesOutputPath.getParent(),
                            new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
                } catch (final IOException e) {
                    log.error("Failed to set permission for output path.", e);
                }
                FileOutputFormat.setOutputPath(job, filesOutputPath);

                if (StringUtils.isNotBlank(compressionType)) {
                    if (isValidCompressionType(compressionType)) {
                        log.info("File compression type: " + compressionType);
                        AccumuloFileOutputFormat.setCompressionType(job, compressionType);
                    } else {
                        log.warn("Invalid compression type: " + compressionType);
                    }
                }
            }
        } else {
            log.info("Using accumulo output format mode.");
            job.setOutputFormatClass(AccumuloOutputFormat.class);
        }
    }

    /**
     * Sets up and runs the copy tool with the provided args.
     * @param args the arguments list.
     * @return the execution result.
     */
    public int setupAndRun(final String[] args) {
        int returnCode = -1;
        try {
            final Configuration conf = new Configuration();
            final Set<String> toolArgs = ToolConfigUtils.getUserArguments(conf, args);
            if (!toolArgs.isEmpty()) {
                final String parameters = Joiner.on("\r\n\t").join(toolArgs);
                log.info("Running Copy Tool with the following parameters...\r\n\t" + parameters);
            }

            returnCode = ToolRunner.run(conf, this, args);
        } catch (final Exception e) {
            log.error("Error running copy tool", e);
        }
        return returnCode;
    }

    public static void main(final String[] args) {
        final String log4jConfiguration = System.getProperties().getProperty("log4j.configuration");
        if (StringUtils.isNotBlank(log4jConfiguration)) {
            final String parsedConfiguration = StringUtils.removeStart(log4jConfiguration, "file:");
            final File configFile = new File(parsedConfiguration);
            if (configFile.exists()) {
                DOMConfigurator.configure(parsedConfiguration);
            } else {
                BasicConfigurator.configure();
            }
        }
        log.info("Starting Copy Tool");

        Thread.setDefaultUncaughtExceptionHandler(
                (thread, throwable) -> log.error("Uncaught exception in " + thread.getName(), throwable));

        final CopyTool copyTool = new CopyTool();
        final int returnCode = copyTool.setupAndRun(args);

        log.info("Finished running Copy Tool");

        System.exit(returnCode);
    }

    /**
     * Creates an {@link IteratorSetting} with a time stamp filter that starts with the specified data.
     * @param startTimeString the start time of the filter.
     * @return the {@link IteratorSetting}.
     */
    public static IteratorSetting getStartTimeSetting(final String startTimeString) {
        Date date = null;
        try {
            date = MergeTool.START_TIME_FORMATTER.parse(startTimeString);
        } catch (final ParseException e) {
            throw new IllegalArgumentException("Couldn't parse " + startTimeString, e);
        }
        return getStartTimeSetting(date);
    }

    /**
     * Creates an {@link IteratorSetting} with a time stamp filter that starts with the specified data.
     * @param date the start {@link Date} of the filter.
     * @return the {@link IteratorSetting}.
     */
    public static IteratorSetting getStartTimeSetting(final Date date) {
        return getStartTimeSetting(date.getTime());
    }

    /**
     * Creates an {@link IteratorSetting} with a time stamp filter that starts with the specified data.
     * @param time the start time of the filter.
     * @return the {@link IteratorSetting}.
     */
    public static IteratorSetting getStartTimeSetting(final long time) {
        final IteratorSetting setting = new IteratorSetting(1, "startTimeIterator", TimestampFilter.class);
        TimestampFilter.setStart(setting, time, true);
        TimestampFilter.setEnd(setting, Long.MAX_VALUE, true);
        return setting;
    }

    /**
     * Checks to see if the specified compression type is valid. The compression must be defined in
     * {@link Algorithm} to be valid.
     * @param compressionType the compression type to check.
     * @return {@code true} if the compression type is one of "none", "gz", "lzo", or "snappy".
     * {@code false} otherwise.
     */
    private static boolean isValidCompressionType(final String compressionType) {
        for (final Algorithm algorithm : Algorithm.values()) {
            if (algorithm.getName().equals(compressionType)) {
                return true;
            }
        }
        return false;
    }

    private void clearOutputDir(final Path path) throws IOException {
        final FileSystem fs = FileSystem.get(conf);
        fs.delete(path, true);
    }

    private Instance createChildInstance(final Configuration config) throws Exception {
        Instance instance = null;
        String instanceTypeProp = config.get(CREATE_CHILD_INSTANCE_TYPE_PROP);
        final String childAuth = config.get(MRUtils.AC_AUTH_PROP + MergeTool.CHILD_SUFFIX);

        // Default to distribution cluster if not specified
        if (StringUtils.isBlank(instanceTypeProp)) {
            instanceTypeProp = InstanceType.DISTRIBUTION.toString();
        }

        final InstanceType instanceType = InstanceType.fromName(instanceTypeProp);
        switch (instanceType) {
        case DISTRIBUTION:
            if (childInstance == null) {
                throw new IllegalArgumentException("Must specify instance name for distributed mode");
            } else if (childZk == null) {
                throw new IllegalArgumentException("Must specify ZooKeeper hosts for distributed mode");
            }
            instance = new ZooKeeperInstance(childInstance, childZk);
            break;

        case MINI:
            childAccumuloInstanceDriver = new AccumuloInstanceDriver("Child", false, true, false, false,
                    childUserName, childPwd, childInstance, childTablePrefix, childAuth);
            childAccumuloInstanceDriver.setUpInstance();
            childAccumuloInstanceDriver.setUpTables();
            childZk = childAccumuloInstanceDriver.getZooKeepers();
            MergeTool.setDuplicateKeysForProperty(config, MRUtils.AC_ZK_PROP + MergeTool.CHILD_SUFFIX, childZk);
            instance = new ZooKeeperInstance(childInstance, childZk);
            break;

        case MOCK:
            instance = new MockInstance(childInstance);
            break;

        default:
            throw new AccumuloException("Unexpected instance type: " + instanceType);
        }

        return instance;
    }

    /**
     * @return the child {@link AccumuloInstanceDriver} or {@code null}.
     */
    public AccumuloInstanceDriver getChildAccumuloInstanceDriver() {
        return childAccumuloInstanceDriver;
    }

    /**
     * Shuts down the child {@link AccumuloInstanceDriver} in the {@link CopyTool} if it exists.
     * @throws Exception
     */
    public void shutdown() throws Exception {
        if (childAccumuloInstanceDriver != null) {
            childAccumuloInstanceDriver.tearDown();
        }
    }
}