com.cloudera.sqoop.util.AppendUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.sqoop.util.AppendUtils.java

Source

/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.util;

import java.io.IOException;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.cloudera.sqoop.manager.ImportJobContext;
import com.cloudera.sqoop.SqoopOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Utilities used when appending imported files to an existing dir.
 *
 * <p>An instance wraps one {@link ImportJobContext}; {@link #append()} moves
 * the files found in the context's destination (the temporary import
 * directory) into the user's target directory, renumbering the partition
 * suffix of each data file so that it does not collide with files that are
 * already there.
 */
public class AppendUtils {

    public static final Log LOG = LogFactory.getLog(AppendUtils.class.getName());

    /**
     * Timestamp pattern used to build temporary directory names. Kept as a
     * pattern string rather than a shared SimpleDateFormat instance because
     * SimpleDateFormat is not safe for concurrent use.
     */
    private static final String DATE_FORM_PATTERN = "ddHHmmssSSS";
    private static final String TEMP_IMPORT_ROOT = "_sqoop";

    private static final int PARTITION_DIGITS = 5;
    private static final String FILEPART_SEPARATOR = "-";
    private static final String FILEEXT_SEPARATOR = ".";

    /**
     * Matches data file names and captures the five-digit partition number
     * in group 1.
     */
    private static final Pattern DATA_FILE_PATTERN =
            Pattern.compile("part.*-([0-9][0-9][0-9][0-9][0-9]).*");

    private final ImportJobContext context;

    /**
     * @param context the import job whose output is to be appended
     */
    public AppendUtils(ImportJobContext context) {
        this.context = context;
    }

    /**
     * Moves the imported files from temporary directory to specified target-dir,
     * renaming partition number if appending file exists.
     *
     * <p>The temporary directory is deleted only after every entry has been
     * moved successfully; if a move fails an exception is thrown and the
     * temporary directory is left in place.
     *
     * @throws IOException if the target directory cannot be created, the
     *         temporary directory cannot be listed, or an entry cannot be moved
     */
    public void append() throws IOException {

        SqoopOptions options = context.getOptions();
        FileSystem fs = FileSystem.get(options.getConf());
        Path tempDir = context.getDestination();

        // Try in this order: target-dir or warehouse-dir
        Path userDestDir;
        if (options.getTargetDir() != null) {
            userDestDir = new Path(options.getTargetDir());
        } else if (options.getWarehouseDir() != null) {
            userDestDir = new Path(options.getWarehouseDir(), context.getTableName());
        } else {
            userDestDir = new Path(context.getTableName());
        }

        if (!fs.exists(tempDir)) {
            // This occurs if there was no source (tmp) dir. This might happen
            // if the import was an HBase-target import, but the user specified
            // --append anyway. This is a warning, not an error.
            LOG.warn("Cannot append files to target dir; no such directory: " + tempDir);
            return;
        }

        int nextPartition = 0;

        // Create target directory.
        if (!fs.exists(userDestDir)) {
            LOG.info("Creating missing output directory - " + userDestDir.getName());
            if (!fs.mkdirs(userDestDir)) {
                // Without a target directory every subsequent move would fail.
                throw new IOException("Could not create output directory " + userDestDir);
            }
        } else {
            LOG.info("Appending to directory " + userDestDir.getName());
            // Get the right next partition for the imported files
            nextPartition = getNextPartition(fs, userDestDir);
        }

        // move files
        moveFiles(fs, tempDir, userDestDir, nextPartition);

        // delete temporary path; only reached when all moves succeeded
        LOG.debug("Deleting temporary folder " + tempDir.getName());
        fs.delete(tempDir, true);
    }

    /**
     * Returns the first partition number available for appending, i.e. one
     * more than the greatest partition number found among the data files in
     * targetDir, or 0 when there are none.
     *
     * @param fs the file system holding targetDir
     * @param targetDir the directory to scan (sub-directories are ignored)
     * @return the next free partition number
     * @throws IOException if the directory cannot be listed
     */
    private int getNextPartition(FileSystem fs, Path targetDir) throws IOException {

        int nextPartition = 0;
        FileStatus[] existingFiles = fs.listStatus(targetDir);
        if (existingFiles != null) {
            for (FileStatus fileStat : existingFiles) {
                if (fileStat.isDir()) {
                    continue;
                }
                Matcher mat = DATA_FILE_PATTERN.matcher(fileStat.getPath().getName());
                if (mat.matches()) {
                    int thisPart = Integer.parseInt(mat.group(1));
                    if (thisPart >= nextPartition) {
                        nextPartition = thisPart + 1;
                    }
                }
            }
        }

        if (nextPartition > 0) {
            LOG.info("Using found partition " + nextPartition);
        }

        return nextPartition;
    }

    /**
     * Move files from source to target using a specified starting partition.
     *
     * <p>Data files (names matching the partition pattern) are renamed with
     * consecutive partition numbers beginning at partitionStart. Other plain
     * files are left where they are. Directories are moved under their own
     * name, with a numeric suffix appended if that name is already taken.
     *
     * @param fs the file system holding both directories
     * @param sourceDir the temporary import directory
     * @param targetDir the directory to move entries into
     * @param partitionStart partition number to give the first data file
     * @throws IOException if sourceDir cannot be listed or an entry cannot be moved
     */
    private void moveFiles(FileSystem fs, Path sourceDir, Path targetDir, int partitionStart) throws IOException {

        // Use a fixed locale so partition numbers are always written with
        // ASCII digits, which is what the partition pattern looks for.
        NumberFormat numpart = NumberFormat.getInstance(Locale.ROOT);
        numpart.setMinimumIntegerDigits(PARTITION_DIGITS);
        numpart.setGroupingUsed(false);
        FileStatus[] tempFiles = fs.listStatus(sourceDir);

        if (null == tempFiles) {
            // If we've already checked that the dir exists, and now it can't be
            // listed, this is a genuine error (permissions, fs integrity, or other).
            throw new IOException("Could not list files from " + sourceDir);
        }

        // Move and rename files & directories from temporary to target-dir thus
        // appending file's next partition
        for (FileStatus fileStat : tempFiles) {
            if (!fileStat.isDir()) {
                // Move imported data files
                String filename = fileStat.getPath().getName();
                Matcher mat = DATA_FILE_PATTERN.matcher(filename);
                if (mat.matches()) {
                    String name = getFilename(filename);
                    String fileToMove = name.concat(numpart.format(partitionStart++));
                    String extension = getFileExtension(filename);
                    if (extension != null) {
                        fileToMove = fileToMove.concat(extension);
                    }
                    LOG.debug("Filename: " + filename + " repartitioned to: " + fileToMove);
                    renameOrFail(fs, fileStat.getPath(), new Path(targetDir, fileToMove));
                }
            } else {
                // Move directories (_logs & any other)
                String dirName = fileStat.getPath().getName();
                Path path = new Path(targetDir, dirName);
                int dirNumber = 0;
                while (fs.exists(path)) {
                    path = new Path(targetDir, dirName.concat("-").concat(numpart.format(dirNumber++)));
                }
                LOG.debug("Directory: " + dirName + " renamed to: " + path.getName());
                renameOrFail(fs, fileStat.getPath(), path);
            }
        }
    }

    /**
     * Renames source to target, converting an unsuccessful rename (reported
     * by the file system as a false return value) into an exception so the
     * caller does not go on to delete data that was never moved.
     */
    private void renameOrFail(FileSystem fs, Path source, Path target) throws IOException {
        if (!fs.rename(source, target)) {
            throw new IOException("Could not move " + source + " to " + target);
        }
    }

    /**
     * Returns the name component of a file: everything up to and including
     * the last "-" if there is one, otherwise everything before the last ".",
     * otherwise the whole name.
     */
    private String getFilename(String filename) {
        int pos = filename.lastIndexOf(FILEPART_SEPARATOR);
        if (pos != -1) {
            return filename.substring(0, pos + 1);
        }
        pos = filename.lastIndexOf(FILEEXT_SEPARATOR);
        if (pos != -1) {
            return filename.substring(0, pos);
        }
        return filename;
    }

    /**
     * Returns the extension component of a filename, including the leading
     * ".", or null if the name contains no ".".
     */
    private String getFileExtension(String filename) {
        int pos = filename.lastIndexOf(FILEEXT_SEPARATOR);
        if (pos != -1) {
            return filename.substring(pos);
        } else {
            return null;
        }
    }

    /**
     * Creates a unique path object inside the sqoop temporary directory.
     *
     * <p>The path is built from the current time (day-of-month through
     * milliseconds) followed by the table name.
     *
     * @param tableName name of the table being imported; appended to the timestamp
     * @return a path pointing to the temporary directory
     */
    public static Path getTempAppendDir(String tableName) {
        // A fresh formatter per call: SimpleDateFormat instances must not be
        // shared between threads without synchronization.
        String timeId = new SimpleDateFormat(DATE_FORM_PATTERN, Locale.ROOT).format(new Date());
        String tempDir = TEMP_IMPORT_ROOT + Path.SEPARATOR + timeId + tableName;
        return new Path(tempDir);
    }

}