// Source: org.apache.falcon.regression.core.util.HadoopUtil (Apache Falcon regression test utilities)

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.falcon.regression.core.util;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.falcon.regression.core.helpers.ColoHelper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Util methods related to hadoop.
 */
/**
 * Util methods related to hadoop.
 */
public final class HadoopUtil {

    public static final String SOMETHING_RANDOM = "somethingRandom";
    private static final Logger LOGGER = Logger.getLogger(HadoopUtil.class);
    /** Matches the ':port/' separator of a URI such as 'hdfs://host:8020/path'. */
    private static final Pattern PROTOCOL = Pattern.compile(":[\\d]+/");

    private HadoopUtil() {
        throw new AssertionError("Instantiating utility class...");
    }

    /**
     * Removes the 'hdfs(hftp)://server:port' prefix from a path, keeping a leading slash.
     * Paths without a ':port/' component (including null/empty) are returned unchanged.
     * @param path path to strip, may be null or empty
     * @return path with the scheme/authority prefix removed
     */
    public static String cutProtocol(String path) {
        if (StringUtils.isNotEmpty(path)) {
            final Matcher matcher = PROTOCOL.matcher(path);
            if (matcher.find()) {
                // Keep everything after the FIRST ':port/' occurrence. Using substring
                // instead of split() avoids an ArrayIndexOutOfBoundsException when the
                // path ends exactly at ':port/' and avoids truncating paths that happen
                // to contain another ':digits/' sequence further on.
                return '/' + path.substring(matcher.end());
            }
        }
        return path;
    }

    /**
     * Joins path parts with '/' while avoiding duplicate separators at the seams.
     * @param basePath base path; a single trailing '/' is dropped
     * @param restParts remaining parts; a single leading and trailing '/' is dropped from each
     * @return the joined path
     */
    public static String joinPath(String basePath, String... restParts) {
        final String separator = "/";
        List<String> cleanParts = new ArrayList<>();
        String cleanBasePath = basePath.replaceFirst(separator + "$", "");
        cleanParts.add(cleanBasePath);
        for (String onePart : restParts) {
            final String cleanPart = onePart.replaceFirst("^" + separator, "").replaceFirst(separator + "$", "");
            cleanParts.add(cleanPart);
        }
        return StringUtils.join(cleanParts, separator);
    }

    /**
     * Retrieves all file names contained in a given directory (non-recursive).
     * @param fs filesystem
     * @param location given directory
     * @return list of file names; empty if the directory does not exist
     * @throws IOException on filesystem errors
     */
    public static List<String> getAllFilesHDFS(FileSystem fs, Path location) throws IOException {
        List<String> files = new ArrayList<>();
        if (!fs.exists(location)) {
            return files;
        }
        FileStatus[] stats = fs.listStatus(location);
        for (FileStatus stat : stats) {
            if (!isDir(stat)) {
                files.add(stat.getPath().toString());
            }
        }
        return files;
    }

    /**
     * Retrieves all directories within a given depth starting from a specific dir.
     * Depth 0 returns only the immediate subdirectories of {@code location}.
     * @param fs filesystem
     * @param location given dir
     * @param depth how many additional levels below the immediate children to descend
     * @return all matching directories
     * @throws IOException on filesystem errors
     */
    public static List<Path> getAllDirsRecursivelyHDFS(FileSystem fs, Path location, int depth) throws IOException {
        List<Path> returnList = new ArrayList<>();
        FileStatus[] stats = fs.listStatus(location);
        for (FileStatus stat : stats) {
            if (isDir(stat)) {
                returnList.add(stat.getPath());
                if (depth > 0) {
                    returnList.addAll(getAllDirsRecursivelyHDFS(fs, stat.getPath(), depth - 1));
                }
            }
        }
        return returnList;
    }

    /**
     * Recursively retrieves all data file names from a given location,
     * skipping any path containing '_SUCCESS' (availability markers, not data).
     * @param fs filesystem
     * @param location given location
     * @return list of all data files; empty if the location does not exist
     * @throws IOException on filesystem errors
     */
    public static List<Path> getAllFilesRecursivelyHDFS(FileSystem fs, Path location) throws IOException {
        List<Path> returnList = new ArrayList<>();
        RemoteIterator<LocatedFileStatus> remoteIterator;
        try {
            remoteIterator = fs.listFiles(location, true);
        } catch (FileNotFoundException e) {
            // A missing path is a normal condition for the tests; report and return empty.
            LOGGER.info("Path '" + location + "' is not found on " + fs.getUri());
            return returnList;
        }
        while (remoteIterator.hasNext()) {
            Path path = remoteIterator.next().getPath();
            if (!path.toUri().toString().contains("_SUCCESS")) {
                returnList.add(path);
            }
        }
        return returnList;
    }

    /**
     * Checks a given location for the presence of an availability flag file.
     * If availabilityFlag is null or empty it looks for the default '_SUCCESS' file.
     * @param fs filesystem
     * @param location given location
     * @param availabilityFlag value of availability flag set in entity; null/empty means '_SUCCESS'
     * @return true if the flag file is present directly under {@code location}
     * @throws IOException on filesystem errors
     */
    public static boolean getSuccessFolder(FileSystem fs, Path location, String availabilityFlag)
            throws IOException {
        LOGGER.info("location : " + location);
        // Fall back to the default '_SUCCESS' marker when no explicit flag is
        // configured. StringUtils.isEmpty is null-safe, matching the documented
        // "null means _SUCCESS" contract (a bare isEmpty() call would NPE).
        final String flagName = StringUtils.isEmpty(availabilityFlag) ? "_SUCCESS" : availabilityFlag;
        for (FileStatus stat : fs.listStatus(location)) {
            if (stat.getPath().getName().equals(flagName)) {
                return true;
            }
        }
        return false;
    }

    // Centralizes the deprecated FileStatus.isDir() call so the suppression
    // stays in one place.
    @SuppressWarnings("deprecation")
    private static boolean isDir(FileStatus stat) {
        return stat.isDir();
    }

    /**
     * Copies file from local place to hdfs location.
     * @param fs target filesystem
     * @param dstHdfsDir destination; any 'scheme://host:port' prefix is stripped first
     * @param srcFileLocation source location
     * @throws IOException on filesystem errors
     */
    public static void copyDataToFolder(final FileSystem fs, String dstHdfsDir, final String srcFileLocation)
            throws IOException {
        LOGGER.info(String.format("Copying local dir %s to hdfs location %s on %s", srcFileLocation, dstHdfsDir,
                fs.getUri()));
        fs.copyFromLocalFile(new Path(srcFileLocation), new Path(cutProtocol(dstHdfsDir)));
    }

    /**
     * Copies a whole directory to hdfs, removing any pre-existing destination first.
     * @param fs target filesystem
     * @param dstHdfsDir destination dir
     * @param localLocation source location
     * @throws IOException on filesystem errors
     */
    public static void uploadDir(final FileSystem fs, final String dstHdfsDir, final String localLocation)
            throws IOException {
        LOGGER.info(String.format("Uploading local dir %s to hdfs location %s", localLocation, dstHdfsDir));
        HadoopUtil.deleteDirIfExists(dstHdfsDir, fs);
        HadoopUtil.copyDataToFolder(fs, dstHdfsDir, localLocation);
    }

    /**
     * Writes given character data into a file under an hdfs directory.
     * The data is staged through a local temp file which is removed afterwards.
     * @param fs target filesystem
     * @param dstHdfsDir destination dir
     * @param data content to write
     * @param overwrite whether to wipe the destination dir first
     * @throws IOException on filesystem errors
     */
    public static void writeDataForHive(final FileSystem fs, final String dstHdfsDir, final CharSequence data,
            boolean overwrite) throws IOException {
        LOGGER.info(String.format("Writing data %s to hdfs location %s", data, dstHdfsDir));
        final File tempFile = File.createTempFile(UUID.randomUUID().toString().split("-")[0], ".dat");
        FileUtils.write(tempFile, data);
        if (overwrite) {
            HadoopUtil.deleteDirIfExists(dstHdfsDir, fs);
        }
        try {
            fs.mkdirs(new Path(dstHdfsDir));
        } catch (Exception ignored) {
            // Best-effort: the directory may already exist; the subsequent
            // setPermission/copy will surface any real problem.
        }
        fs.setPermission(new Path(dstHdfsDir), FsPermission.getDirDefault());
        HadoopUtil.copyDataToFolder(fs, dstHdfsDir, tempFile.getAbsolutePath());
        if (!tempFile.delete()) {
            LOGGER.warn("Deletion of " + tempFile + " failed.");
        }
    }

    /**
     * Lists names of given directory subfolders (non-recursive).
     * @param fs filesystem
     * @param baseDir given directory
     * @return list of subfolder names
     * @throws IOException on filesystem errors
     */
    public static List<String> getHDFSSubFoldersName(FileSystem fs, String baseDir) throws IOException {
        List<String> returnList = new ArrayList<>();
        FileStatus[] stats = fs.listStatus(new Path(baseDir));
        for (FileStatus stat : stats) {
            if (isDir(stat)) {
                returnList.add(stat.getPath().getName());
            }
        }
        return returnList;
    }

    /**
     * Checks if file is present in given directory. Matching is by substring,
     * so a partial file name also matches.
     * @param fs filesystem
     * @param hdfsPath path to a given directory
     * @param fileToCheckFor file name (or fragment) to look for
     * @return whether a matching file is present
     * @throws IOException on filesystem errors
     */
    public static boolean isFilePresentHDFS(FileSystem fs, String hdfsPath, String fileToCheckFor)
            throws IOException {
        LOGGER.info("getting file from folder: " + hdfsPath);
        List<String> fileNames = getAllFileNamesFromHDFS(fs, hdfsPath);
        for (String filePath : fileNames) {
            if (filePath.contains(fileToCheckFor)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Lists all file paths for a given directory (non-recursive, files only).
     * @param fs filesystem
     * @param hdfsPath path to a given directory
     * @return list of file paths which given directory contains
     * @throws IOException on filesystem errors
     */
    private static List<String> getAllFileNamesFromHDFS(FileSystem fs, String hdfsPath) throws IOException {
        List<String> returnList = new ArrayList<>();
        LOGGER.info("getting file from folder: " + hdfsPath);
        FileStatus[] stats = fs.listStatus(hdfsPath == null ? null : new Path(hdfsPath));
        for (FileStatus stat : stats) {
            String currentPath = stat.getPath().toUri().getPath(); // scheme-less path
            if (!isDir(stat)) {
                returnList.add(currentPath);
            }
        }
        return returnList;
    }

    /**
     * Removes directory with a given name and creates empty one with the same name.
     * @param fs filesystem
     * @param path path to a directory
     * @throws IOException on filesystem errors
     */
    public static void recreateDir(FileSystem fs, String path) throws IOException {
        deleteDirIfExists(path, fs);
        LOGGER.info("creating hdfs dir: " + path + " on " + fs.getConf().get("fs.default.name"));
        fs.mkdirs(new Path(path));
    }

    /**
     * Recreates dirs for a list of filesystems.
     * @param fileSystems list of filesystems
     * @param path path to a directory
     * @throws IOException on filesystem errors
     */
    public static void recreateDir(List<FileSystem> fileSystems, String path) throws IOException {
        for (FileSystem fs : fileSystems) {
            recreateDir(fs, path);
        }
    }

    /**
     * Removes given directory (recursively) from a filesystem, if it exists.
     * @param hdfsPath path to a given directory
     * @param fs filesystem
     * @throws IOException on filesystem errors
     */
    public static void deleteDirIfExists(String hdfsPath, FileSystem fs) throws IOException {
        Path path = new Path(hdfsPath);
        if (fs.exists(path)) {
            LOGGER.info(String.format("Deleting HDFS path: %s on %s", path, fs.getUri()));
            fs.delete(path, true);
        } else {
            LOGGER.info(String.format("Not deleting non-existing HDFS path: %s on %s", path, fs.getUri()));
        }
    }

    /**
     * Copies data in folders without prefix.
     * @param fs filesystem
     * @param inputPath source location
     * @param remoteLocations destination locations
     * @throws IOException on filesystem errors
     */
    public static void flattenAndPutDataInFolder(FileSystem fs, String inputPath, List<String> remoteLocations)
            throws IOException {
        flattenAndPutDataInFolder(fs, inputPath, "", remoteLocations);
    }

    /**
     * Copies files (flattened, non-recursive) from a source directory to target
     * directories on hdfs. Any pre-existing {@code remotePathPrefix} dir is removed first.
     * @param fs target filesystem
     * @param inputPath source location: a directory whose direct files are copied, or a single file
     * @param remotePathPrefix prefix for target directories
     * @param remoteLocations target directories
     * @return list of exact locations where data was copied
     * @throws IOException on filesystem errors
     */
    public static List<String> flattenAndPutDataInFolder(FileSystem fs, String inputPath, String remotePathPrefix,
            List<String> remoteLocations) throws IOException {
        if (StringUtils.isNotEmpty(remotePathPrefix)) {
            deleteDirIfExists(remotePathPrefix, fs);
        }
        LOGGER.info("Creating data in folders: \n" + remoteLocations);
        File input = new File(inputPath);
        File[] files = input.isDirectory() ? input.listFiles() : new File[] { input };
        List<Path> filePaths = new ArrayList<>();
        assert files != null;
        for (final File file : files) {
            if (!file.isDirectory()) {
                final Path filePath = new Path(file.getAbsolutePath());
                filePaths.add(filePath);
            }
        }
        // Ensure exactly one '/' between prefix and each remote dir. Guard against
        // an empty remoteLocations list before peeking at its first element.
        if (!remoteLocations.isEmpty()
                && !remotePathPrefix.endsWith("/") && !remoteLocations.get(0).startsWith("/")) {
            remotePathPrefix += "/";
        }
        List<String> locations = new ArrayList<>();
        for (String remoteDir : remoteLocations) {
            String remoteLocation = remotePathPrefix + remoteDir;
            remoteLocation = cutProtocol(remoteLocation);
            locations.add(remoteLocation);
            LOGGER.info(String.format("copying to: %s files: %s", fs.getUri() + remoteLocation,
                    Arrays.toString(files)));
            if (!fs.exists(new Path(remoteLocation))) {
                fs.mkdirs(new Path(remoteLocation));
            }
            fs.copyFromLocalFile(false, true, filePaths.toArray(new Path[0]),
                    new Path(remoteLocation));
        }
        return locations;
    }

    /**
     * Copies data from local sources to remote directories. Also drops a local
     * marker file named after each folder into it before copying the sources.
     * @param fs target filesystem
     * @param folderPrefix prefix for remote directories
     * @param folderList remote directories
     * @param fileLocations sources
     * @throws IOException on filesystem errors
     */
    public static void copyDataToFolders(FileSystem fs, final String folderPrefix, List<String> folderList,
            String... fileLocations) throws IOException {
        for (final String folder : folderList) {
            // Create a per-folder marker file locally, push it, then remove it.
            String folderSpace = folder.replaceAll("/", "_");
            File file = new File(OSUtil.NORMAL_INPUT + folderSpace + ".txt");
            FileUtils.writeStringToFile(file, "folder", true);
            fs.copyFromLocalFile(new Path(file.getAbsolutePath()), new Path(folderPrefix + folder));
            if (!file.delete()) {
                LOGGER.info("delete was not successful for file: " + file);
            }
            Path[] srcPaths = new Path[fileLocations.length];
            for (int i = 0; i < srcPaths.length; ++i) {
                srcPaths[i] = new Path(fileLocations[i]);
            }
            LOGGER.info(String.format("copying  %s to %s%s on %s", Arrays.toString(srcPaths), folderPrefix, folder,
                    fs.getUri()));
            fs.copyFromLocalFile(false, true, srcPaths, new Path(folderPrefix + folder));
        }
    }

    /**
     * Uploads data to remote directories with names within date ranges, plus one
     * extra {@link #SOMETHING_RANDOM} directory.
     * @param fs target filesystem
     * @param interval dates ranges before and after current date
     * @param minuteSkip time to skip within a range to get intermediate directories
     * @param folderPrefix prefix for remote directories
     * @throws IOException on filesystem errors
     */
    public static void lateDataReplenish(FileSystem fs, int interval, int minuteSkip, String folderPrefix)
            throws IOException {
        List<String> folderData = TimeUtil.getMinuteDatesOnEitherSide(interval, minuteSkip);
        folderData.add(SOMETHING_RANDOM);
        flattenAndPutDataInFolder(fs, OSUtil.NORMAL_INPUT, folderPrefix, folderData);
    }

    /**
     * Creates list of folders on remote filesystem.
     * @param fs remote filesystem
     * @param folderPrefix prefix for remote directories
     * @param folderList list of folders
     * @throws IOException on filesystem errors
     */
    public static void createFolders(FileSystem fs, final String folderPrefix, List<String> folderList)
            throws IOException {
        for (final String folder : folderList) {
            final String pathString = cutProtocol(folderPrefix + folder);
            LOGGER.info("Creating " + fs.getUri() + "/" + pathString);
            fs.mkdirs(new Path(pathString));
        }
    }

    /**
     * Creates folders in remote location according to current time (epoch seconds)
     * and copies files there.
     * @param fs target filesystem
     * @param remoteLocation remote location
     * @param localLocation source directory
     * @throws IOException on filesystem errors
     */
    public static void injectMoreData(FileSystem fs, final String remoteLocation, String localLocation)
            throws IOException {
        File[] files = new File(localLocation).listFiles();
        assert files != null;
        for (final File file : files) {
            if (!file.isDirectory()) {
                // Timestamp is re-read per file, so files may land in different
                // second-granularity folders — preserved intentionally.
                String path = remoteLocation + "/" + System.currentTimeMillis() / 1000 + "/";
                LOGGER.info("inserting data@ " + path);
                fs.copyFromLocalFile(new Path(file.getAbsolutePath()), new Path(path));
            }
        }

    }

    /**
     * Uploads either _SUCCESS or dataFile4.txt file to remote directories with names within date
     * ranges.
     * @param fs target filesystem
     * @param interval dates ranges before and after current date
     * @param minuteSkip time to skip within a range to get intermediate directories
     * @param folderPrefix prefix for remote directories
     * @param fileToBePut what file to copy to remote locations; any value other
     *                    than "_SUCCESS" results in dataFile4.txt being copied
     * @throws IOException on filesystem errors
     */
    public static void putFileInFolderHDFS(FileSystem fs, int interval, int minuteSkip, String folderPrefix,
            String fileToBePut) throws IOException {
        List<String> folderPaths = TimeUtil.getMinuteDatesOnEitherSide(interval, minuteSkip);
        LOGGER.info("folderData: " + folderPaths.toString());
        createFolders(fs, folderPrefix, folderPaths);
        final String fileName = fileToBePut.equals("_SUCCESS") ? "_SUCCESS" : "dataFile4.txt";
        copyDataToFolders(fs, folderPrefix, folderPaths, OSUtil.concat(OSUtil.NORMAL_INPUT, fileName));
    }

    /**
     * Uploads dataFile4.txt file to remote directories with names within date ranges.
     * @param fs target filesystem
     * @param interval dates ranges before and after current date
     * @param minuteSkip time to skip within a range to get intermediate directories
     * @param folderPrefix prefix for remote directories
     * @param postFix postfix for remote locations, may be null for none
     * @throws IOException on filesystem errors
     */
    public static void lateDataReplenishWithoutSuccess(FileSystem fs, int interval, int minuteSkip,
            String folderPrefix, String postFix) throws IOException {
        List<String> folderPaths = TimeUtil.getMinuteDatesOnEitherSide(interval, minuteSkip);
        LOGGER.info("folderData: " + folderPaths.toString());
        if (postFix != null) {
            for (int i = 0; i < folderPaths.size(); i++) {
                folderPaths.set(i, folderPaths.get(i) + postFix);
            }
        }
        createFolders(fs, folderPrefix, folderPaths);
        copyDataToFolders(fs, folderPrefix, folderPaths, OSUtil.concat(OSUtil.NORMAL_INPUT, "dataFile4.txt"));
    }

    /**
     * Uploads both dataFile4.txt and _SUCCESS files to remote directories with names within date
     * ranges.
     * @param fs target filesystem
     * @param interval dates ranges before and after current date
     * @param minuteSkip time to skip within a range to get intermediate directories
     * @param folderPrefix prefix for remote directories
     * @param postFix postfix for remote locations, may be null for none
     * @throws IOException on filesystem errors
     */
    public static void lateDataReplenish(FileSystem fs, int interval, int minuteSkip, String folderPrefix,
            String postFix) throws IOException {
        List<String> folderPaths = TimeUtil.getMinuteDatesOnEitherSide(interval, minuteSkip);
        LOGGER.info("folderData: " + folderPaths.toString());
        if (postFix != null) {
            for (int i = 0; i < folderPaths.size(); i++) {
                folderPaths.set(i, folderPaths.get(i) + postFix);
            }
        }
        createFolders(fs, folderPrefix, folderPaths);
        copyDataToFolders(fs, folderPrefix, folderPaths, OSUtil.concat(OSUtil.NORMAL_INPUT, "_SUCCESS"),
                OSUtil.concat(OSUtil.NORMAL_INPUT, "dataFile4.txt"));
    }

    /**
     * Creates empty folders in hdfs, skipping null/empty folder names.
     * @param helper target
     * @param folderList list of folders
     * @throws IOException on filesystem errors
     * @deprecated method creates filesystem object by itself. We should pass existing FileSystem
     * object to such methods.
     */
    @Deprecated
    public static void createHDFSFolders(ColoHelper helper, List<String> folderList) throws IOException {
        LOGGER.info("creating folders.....");
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://" + helper.getFeedHelper().getHadoopURL());
        final FileSystem fs = FileSystem.get(conf);
        for (final String folder : folderList) {
            if (StringUtils.isNotEmpty(folder)) {
                fs.mkdirs(new Path(cutProtocol(folder)));
            }
        }
        LOGGER.info("created folders.....");
    }
}