org.apache.sqoop.util.AppendUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.sqoop.util.AppendUtils.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.sqoop.util;

import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.cloudera.sqoop.manager.ImportJobContext;
import com.cloudera.sqoop.SqoopOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Utilities used when appending imported files to an existing dir.
 */
public class AppendUtils {

    /** Logger shared by all instances of this class. */
    public static final Log LOG = LogFactory.getLog(AppendUtils.class.getName());

    /**
     * Formats the time-based component of temporary directory names:
     * day of month, hour (0-23), minute, second, then milliseconds
     * zero-padded to nine digits.
     *
     * NOTE: SimpleDateFormat instances are not safe for concurrent use
     * without external synchronization.
     */
    private static final SimpleDateFormat DATE_FORM = new SimpleDateFormat("ddHHmmssSSSSSSSSS");
    /**
     * Root directory under which temporary import directories are created.
     * Read from the "sqoop.test.import.rootDir" system property, defaulting
     * to "_sqoop".
     */
    private static final String TEMP_IMPORT_ROOT = System.getProperty("sqoop.test.import.rootDir", "_sqoop");

    /** Number of digits in the partition number of a data file name. */
    private static final int PARTITION_DIGITS = 5;
    /** Separator that precedes the partition number in a data file name. */
    private static final String FILEPART_SEPARATOR = "-";
    /** Separator that precedes the extension in a file name. */
    private static final String FILEEXT_SEPARATOR = ".";

    /**
     * Matches data file names: "part", any characters, a "-", then
     * PARTITION_DIGITS digits (captured as group 1), then any characters.
     */
    private static final Pattern DATA_PART_PATTERN = Pattern.compile("part.*-([0-9]{" + PARTITION_DIGITS + "}+).*");

    /** Context of the import job whose output is being appended. */
    private ImportJobContext context = null;

    /**
     * Creates an appender for the given import job.
     *
     * @param context the import job context; append() reads the options,
     *     destination and table name from it.
     */
    public AppendUtils(ImportJobContext context) {
        this.context = context;
    }

    /**
     * Moves the imported files from the job's temporary directory to the
     * user's destination directory, renumbering data partitions so that they
     * follow any data files already present there.
     *
     * <p>The destination is chosen in this order: the configured target-dir;
     * otherwise a directory named after the table inside the configured
     * warehouse-dir; otherwise a relative path named after the table. If the
     * temporary directory does not exist, a warning is logged and nothing is
     * moved. The temporary directory is deleted after the move.</p>
     *
     * @throws IOException if the destination directory cannot be created or
     *     a filesystem operation fails.
     */
    public void append() throws IOException {

        SqoopOptions options = context.getOptions();
        Path tempDir = context.getDestination();

        // Try in this order: target-dir or warehouse-dir
        Path userDestDir;
        if (options.getTargetDir() != null) {
            userDestDir = new Path(options.getTargetDir());
        } else if (options.getWarehouseDir() != null) {
            userDestDir = new Path(options.getWarehouseDir(), context.getTableName());
        } else {
            userDestDir = new Path(context.getTableName());
        }
        FileSystem fs = userDestDir.getFileSystem(options.getConf());

        if (!fs.exists(tempDir)) {
            // This occurs if there was no source (tmp) dir. This might happen
            // if the import was an HBase-target import, but the user specified
            // --append anyway. This is a warning, not an error.
            LOG.warn("Cannot append files to target dir; no such directory: " + tempDir);
            return;
        }

        // A freshly created target directory starts numbering at zero.
        int nextPartition = 0;

        if (!fs.exists(userDestDir)) {
            LOG.info("Creating missing output directory - " + userDestDir.getName());
            // Fail fast: without a target directory no file can be moved.
            // The second exists() check tolerates a concurrent creation.
            if (!fs.mkdirs(userDestDir) && !fs.exists(userDestDir)) {
                throw new IOException("Could not create output directory " + userDestDir);
            }
        } else {
            LOG.info("Appending to directory " + userDestDir.getName());
            // Get the right next partition for the imported files
            nextPartition = getNextPartition(fs, userDestDir);
        }

        // move files
        moveFiles(fs, tempDir, userDestDir, nextPartition);

        // delete temporary path; the data is already in place, so a failed
        // cleanup is reported but does not fail the append
        LOG.debug("Deleting temporary folder " + tempDir.getName());
        if (!fs.delete(tempDir, true)) {
            LOG.warn("Could not delete temporary folder " + tempDir);
        }
    }

    /**
     * Scans targetDir for existing data files and works out the partition
     * number at which newly appended files should start.
     *
     * @param fs filesystem used to list targetDir
     * @param targetDir directory whose top-level files are inspected
     * @return one more than the highest partition number found among the
     *     files whose names match the data part pattern, or 0 if none match
     * @throws IOException if the directory listing fails
     */
    private int getNextPartition(FileSystem fs, Path targetDir) throws IOException {

        int candidate = 0;

        FileStatus[] entries = fs.listStatus(targetDir);
        if (entries != null) {
            for (int i = 0; i < entries.length; i++) {
                FileStatus entry = entries[i];
                if (entry.isDir()) {
                    // only plain files are considered
                    continue;
                }

                Matcher partMatcher = DATA_PART_PATTERN.matcher(entry.getPath().getName());
                if (!partMatcher.matches()) {
                    continue;
                }

                // group 1 holds the partition digits
                int found = Integer.parseInt(partMatcher.group(1));
                candidate = Math.max(candidate, found + 1);
            }
        }

        if (candidate > 0) {
            LOG.info("Using found partition " + candidate);
        }

        return candidate;
    }

    /**
     * Moves the top-level entries of sourceDir into targetDir.
     *
     * <ul>
     * <li>Directories are moved without restriction. If the plain move is
     *     refused, a "-nnnnn" serial number is appended to the name; this
     *     serial number bears no relation to the file partition
     *     numbering.</li>
     * <li>Files whose names match the data part pattern are moved with their
     *     partition number rewritten, counting up from partitionStart.</li>
     * <li>All other files are ignored and left in sourceDir.</li>
     * </ul>
     *
     * @param fs filesystem holding both directories
     * @param sourceDir directory whose entries are moved
     * @param targetDir existing directory that receives the entries
     * @param partitionStart first partition number to assign to data files
     * @throws IOException if sourceDir cannot be listed, or if an entry
     *     cannot be moved for a reason other than a name collision
     */
    private void moveFiles(FileSystem fs, Path sourceDir, Path targetDir, int partitionStart) throws IOException {

        /* list files in the source dir and check for errors */

        FileStatus[] sourceFiles = fs.listStatus(sourceDir);

        if (null == sourceFiles) {
            // If we've already checked that the dir exists, and now it can't be
            // listed, this is a genuine error (permissions, fs integrity, or other).
            throw new IOException("Could not list files from " + sourceDir);
        }

        /* state used throughout the entire move operation */

        // Pad the partition number with zeros. The locale is pinned so that
        // the digits are always ASCII and keep matching DATA_PART_PATTERN.
        NumberFormat partFormat = NumberFormat.getInstance(Locale.ROOT);
        partFormat.setMinimumIntegerDigits(PARTITION_DIGITS);
        partFormat.setGroupingUsed(false);

        // where the data partitioning is currently at
        int dataPart = partitionStart;

        /* loop through all top-level entries and move matching ones */

        for (FileStatus fileStatus : sourceFiles) {
            Path sourcePath = fileStatus.getPath();
            String sourceFilename = sourcePath.getName();

            if (fileStatus.isDir()) { // move all subdirectories
                Path destPath = moveDirectory(fs, sourcePath, targetDir, partFormat);
                LOG.debug("Directory: " + sourceFilename + " renamed to: " + destPath.getName());
            } else if (DATA_PART_PATTERN.matcher(sourceFilename).matches()) { // move only matching top-level files
                // Both parts of the name are loop-invariant.
                String prefix = getFilename(sourceFilename);
                String extension = getFileExtension(sourceFilename);

                String destFilename;
                boolean renamed;
                do {
                    // name-nnnnn
                    StringBuilder candidate = new StringBuilder(prefix).append(partFormat.format(dataPart++));

                    // .ext?
                    if (extension != null) {
                        candidate.append(extension);
                    }
                    destFilename = candidate.toString();

                    Path destPath = new Path(targetDir, destFilename);
                    renamed = fs.rename(sourcePath, destPath);
                    if (!renamed && !fs.exists(destPath)) {
                        // Not a name collision, so trying the next partition
                        // number cannot help; retrying would loop forever.
                        throw new IOException("Could not move " + sourcePath + " to " + destPath);
                    }
                } while (!renamed);

                LOG.debug("Filename: " + sourceFilename + " repartitioned to: " + destFilename);
            } else { // ignore everything else
                LOG.debug("Filename: " + sourceFilename + " ignored");
            }
        }
    }

    /**
     * Moves one directory into targetDir, appending a "-nnnnn" serial number
     * to its name if the plain move is refused.
     *
     * @param fs filesystem holding both paths
     * @param sourcePath directory to move
     * @param targetDir existing directory that receives it
     * @param partFormat formatter used to pad the serial number
     * @return the path the directory now has inside targetDir
     * @throws IOException if the move fails although the chosen destination
     *     name is free
     */
    private Path moveDirectory(FileSystem fs, Path sourcePath, Path targetDir, NumberFormat partFormat)
            throws IOException {

        String sourceFilename = sourcePath.getName();

        // pass target dir as initial dest to prevent nesting inside preexisting dir
        if (fs.rename(sourcePath, targetDir)) {
            return new Path(targetDir, sourceFilename);
        }

        int dirNumber = 0;
        while (true) {
            // name-nnnnn
            Path destPath = new Path(targetDir, sourceFilename + "-" + partFormat.format(dirNumber++));

            if (fs.exists(destPath)) {
                // Name taken: skip the rename entirely, otherwise the source
                // could end up nested inside the existing directory.
                continue;
            }

            /*
             * There's still a race condition right here if an
             * identically-named directory is created concurrently.
             * It can be avoided by creating a parent dir for all
             * migrated dirs, or by an intermediate rename.
             */

            if (fs.rename(sourcePath, destPath)) {
                return destPath;
            }
            if (!fs.exists(destPath)) {
                // Not a name collision, so another serial number cannot help.
                throw new IOException("Could not move " + sourcePath + " to " + destPath);
            }
        }
    }

    /**
     * Extracts the leading part of a file name, to which a new partition
     * number can be appended.
     *
     * @param filename file name to split
     * @return everything up to and including the last FILEPART_SEPARATOR if
     *     the name contains one; otherwise everything before the last
     *     FILEEXT_SEPARATOR; otherwise the whole name
     */
    private String getFilename(String filename) {
        int partSeparatorAt = filename.lastIndexOf(FILEPART_SEPARATOR);
        if (partSeparatorAt >= 0) {
            // keep the separator itself
            return filename.substring(0, partSeparatorAt + 1);
        }

        int extSeparatorAt = filename.lastIndexOf(FILEEXT_SEPARATOR);
        return (extSeparatorAt >= 0) ? filename.substring(0, extSeparatorAt) : filename;
    }

    /**
     * Extracts the extension of a file name.
     *
     * @param filename file name to inspect
     * @return the text from the last FILEEXT_SEPARATOR (inclusive) to the
     *     end of the name, or null if the name contains no separator
     */
    private String getFileExtension(String filename) {
        int extSeparatorAt = filename.lastIndexOf(FILEEXT_SEPARATOR);
        return (extSeparatorAt == -1) ? null : filename.substring(extSeparatorAt);
    }

    /**
     * Builds a path inside the sqoop temporary directory from the current
     * time, the name of the running JVM and the given salt.
     *
     * @param salt Salt that will be appended at the end of the generated directory.
     *             Can be arbitrary string, for example table name or query checksum.
     * @return a path pointing to the temporary directory
     */
    public static Path getTempAppendDir(String salt) {
        String timeId;
        // SimpleDateFormat keeps mutable state while formatting, and this
        // static method can be called from several threads at once.
        synchronized (DATE_FORM) {
            timeId = DATE_FORM.format(new Date());
        }

        // Replace the '@' in the runtime name with '_'.
        String jvmName = ManagementFactory.getRuntimeMXBean().getName().replace('@', '_');
        String tempDir = TEMP_IMPORT_ROOT + Path.SEPARATOR + timeId + "_" + jvmName + "_" + salt;
        return new Path(tempDir);
    }

}