com.linkedin.cubert.utils.FileSystemUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.cubert.utils.FileSystemUtils.java

Source

/* (c) 2014 LinkedIn Corp. All rights reserved.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.utils;

import static com.linkedin.cubert.utils.JsonUtils.getText;

import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.ArrayNode;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

/**
 * Utility methods to enumerate paths in the file system.
 * <p>
 * Inputs may be given as a plain path string (optionally containing {@code *} globs or
 * the {@code #LATEST} token), as an array of inputs, or as an object with {@code root},
 * {@code startDate} and {@code endDate} fields that is expanded into dated
 * {@code daily/yyyy/MM/dd} or {@code hourly/yyyy/MM/dd/HH} directories.
 * 
 * @author Maneesh Varshney
 * 
 */
public class FileSystemUtils {

    /** Path token that is resolved to the lexicographically greatest matching path. */
    private static final String LATEST_TOKEN = "#LATEST";

    /** Name of the directory holding per-day partitions. */
    private static final String DAILY_DIR = "daily";

    /** Name of the directory holding per-hour partitions. */
    private static final String HOURLY_DIR = "hourly";

    /** Parser for 8-character daily durations. */
    private static final DateTimeFormatter DAILY_FORMAT = DateTimeFormat.forPattern("yyyyMMdd");

    /** Parser for 10-character hourly durations. */
    private static final DateTimeFormatter HOURLY_FORMAT = DateTimeFormat.forPattern("yyyyMMddHH");

    /**
     * Enumerates the input paths described by {@code json}; equivalent to calling
     * {@link #getPaths(FileSystem, JsonNode, boolean, JsonNode)} with
     * {@code schemaOnly == false}.
     * 
     * @param fs the file system to query
     * @param json the input specification (array, text, or dated object)
     * @param params job parameters; see the four-argument overload
     * @return the resolved paths
     * @throws IOException if the file system cannot be queried
     */
    public static List<Path> getPaths(FileSystem fs, JsonNode json, JsonNode params) throws IOException {
        return getPaths(fs, json, false, params);
    }

    /**
     * Enumerates the input paths described by {@code json}.
     * <ul>
     * <li>An array is expanded element by element and the results are concatenated.</li>
     * <li>A textual node is treated as a path and resolved with
     * {@link #getPaths(FileSystem, Path)}.</li>
     * <li>Any other node must carry {@code root}, {@code startDate} and {@code endDate};
     * the dates are either both 8 characters (daily) or both 10 characters (hourly).</li>
     * </ul>
     * 
     * @param fs the file system to query
     * @param json the input specification
     * @param schemaOnly when true, {@code origStartDate} (if present) replaces
     *        {@code startDate}, and an {@link IOException} is raised if a dated input
     *        resolves to no paths
     * @param params job parameters; {@code errorOnMissing} and
     *        {@code useHourlyForMissingDaily} are read from it (both default to false).
     *        May be null, in which case both flags are false.
     * @return the resolved paths
     * @throws IOException if the file system cannot be queried, or if
     *         {@code schemaOnly} is set and no paths were found
     * @throws IllegalArgumentException if the dates are missing or malformed
     */
    public static List<Path> getPaths(FileSystem fs, JsonNode json, boolean schemaOnly, JsonNode params)
            throws IOException {
        if (json.isArray()) {
            // If the specified input is array, recursively get paths for each item in
            // the array.
            List<Path> paths = new ArrayList<Path>();
            ArrayNode anode = (ArrayNode) json;
            for (int i = 0; i < anode.size(); i++) {
                // NOTE(review): the elements are resolved with schemaOnly == false, so
                // origStartDate and the "No input files" check are not applied to array
                // elements. Kept as-is because callers may rely on it -- confirm intent.
                paths.addAll(getPaths(fs, anode.get(i), params));
            }
            return paths;
        }

        if (json.isTextual()) {
            return getPaths(fs, new Path(json.getTextValue()));
        }

        Path root = new Path(getText(json, "root"));

        JsonNode startDateJson = json.get("startDate");
        if (schemaOnly && json.get("origStartDate") != null) {
            startDateJson = json.get("origStartDate");
        }
        JsonNode endDateJson = json.get("endDate");
        if (startDateJson == null || endDateJson == null) {
            throw new IllegalArgumentException("StartDate and endDate need to be specified");
        }

        // Each date is read according to its own node type, so a textual start combined
        // with a numeric end (or the reverse) no longer yields a null or quoted string.
        String startDuration = durationText(startDateJson);
        String endDuration = durationText(endDateJson);

        boolean errorOnMissing = readBooleanParam(params, "errorOnMissing");
        boolean useHourlyForMissingDaily = readBooleanParam(params, "useHourlyForMissingDaily");

        boolean isDaily;
        if (startDuration.length() == 8) {
            isDaily = true;
        } else if (startDuration.length() == 10) {
            isDaily = false;
        } else {
            throw new IllegalArgumentException(
                    "Cannot parse StartDate " + startDuration + " as daily or hourly duration");
        }
        if (endDuration.length() != startDuration.length()) {
            throw new IllegalArgumentException(
                    "EndDate " + endDuration + " is not consistent with StartDate " + startDuration);
        }

        DateTimeFormatter format = isDaily ? DAILY_FORMAT : HOURLY_FORMAT;
        DateTime startDate = format.parseDateTime(startDuration);
        DateTime endDate = format.parseDateTime(endDuration);
        int hourStep = isDaily ? 24 : 1;

        List<Path> paths = new ArrayList<Path>();
        Path basePath = root;
        for (Path path : getPaths(fs, root)) {
            // Work on per-root copies: a fallback to hourly data for one root must not
            // change how the remaining roots are enumerated.
            boolean rootIsDaily = isDaily;
            int rootHourStep = hourStep;
            DateTime rootEndDate = endDate;

            String granularityDir = rootIsDaily ? DAILY_DIR : HOURLY_DIR;
            basePath = path.getName().equals(granularityDir) ? path : new Path(path, granularityDir);

            // If the daily folder itself doesn't exist, optionally read the sibling
            // hourly folder instead.
            if (rootIsDaily && useHourlyForMissingDaily && !fs.exists(basePath)) {
                Path hourlyPath = new Path(basePath.getParent(), HOURLY_DIR);
                if (fs.exists(hourlyPath)) {
                    basePath = hourlyPath;
                    // The daily end date parses to hour 0; extend it to hour 23 so the
                    // whole end day is covered.
                    rootEndDate = rootEndDate.plusHours(23);
                    rootIsDaily = false;
                    rootHourStep = 1;
                }
            }

            paths.addAll(getDurationPaths(fs, basePath, startDate, rootEndDate, rootIsDaily, rootHourStep,
                    errorOnMissing, useHourlyForMissingDaily));
        }

        if (paths.isEmpty() && schemaOnly) {
            throw new IOException(String.format("No input files at %s from %s to %s", basePath.toString(),
                    startDuration, endDuration));
        }
        return paths;
    }

    /**
     * Returns the date string held by a JSON node: the text value for textual nodes,
     * otherwise the node's {@code toString()} form.
     */
    private static String durationText(JsonNode node) {
        return node.isTextual() ? node.getTextValue() : node.toString();
    }

    /**
     * Reads a boolean flag from {@code params}. Accepts both JSON booleans and textual
     * values; returns false when {@code params} is null or the flag is absent.
     */
    private static boolean readBooleanParam(JsonNode params, String name) {
        if (params == null) {
            return false;
        }
        JsonNode node = params.get(name);
        if (node == null) {
            return false;
        }
        if (node.isBoolean()) {
            return node.getBooleanValue();
        }
        return Boolean.parseBoolean(node.getTextValue());
    }

    /** Builds {@code base/yyyy/MM/dd}. */
    private static Path generateDatedPath(Path base, int year, int month, int day) {
        return generateDatedPath(base, year, month, day, -1);
    }

    /**
     * Builds {@code base/yyyy/MM/dd}, with a trailing {@code /HH} component unless
     * {@code hour} is -1.
     */
    private static Path generateDatedPath(Path base, int year, int month, int day, int hour) {
        StringBuilder relative = new StringBuilder();
        relative.append(pad2(year)).append('/').append(pad2(month)).append('/').append(pad2(day));
        if (hour != -1) {
            relative.append('/').append(pad2(hour));
        }
        return new Path(base, relative.toString());
    }

    /**
     * Formats a value with at least two ASCII digits. Unlike a default-locale
     * {@code DecimalFormat}, the digits do not depend on the JVM locale.
     */
    private static String pad2(int value) {
        String text = Integer.toString(value);
        return (value >= 0 && value < 10) ? "0" + text : text;
    }

    /**
     * Resolves a single path specification. If the path contains {@code *}, it is
     * expanded as a glob and {@link #getLatestPath(FileSystem, Path)} is applied to every
     * match; otherwise {@code getLatestPath} is applied to the path itself.
     * 
     * @param fs the file system to query
     * @param path the path, possibly containing {@code *} or {@code #LATEST}
     * @return the resolved paths
     * @throws IOException if the paths cannot be determined
     */
    public static List<Path> getPaths(FileSystem fs, Path path) throws IOException {
        List<Path> paths = new ArrayList<Path>();

        if (path.toString().contains("*")) {
            for (Path p : getGlobPaths(fs, path)) {
                paths.add(getLatestPath(fs, p));
            }
        } else {
            paths.add(getLatestPath(fs, path));
        }

        return paths;
    }

    /**
     * Expands a glob pattern into the paths it matches.
     * 
     * @param fs the file system to query
     * @param path the glob pattern
     * @return the matching paths, in the order reported by the file system
     * @throws IOException if the file system reports no result (null) for the pattern
     */
    public static List<Path> getGlobPaths(FileSystem fs, Path path) throws IOException {
        FileStatus[] fileStatus = fs.globStatus(path);

        if (fileStatus == null) {
            throw new IOException("Cannot determine paths at " + path.toString());
        }

        List<Path> paths = new ArrayList<Path>(fileStatus.length);
        for (FileStatus status : fileStatus) {
            paths.add(status.getPath());
        }

        return paths;
    }

    /**
     * Resolves the {@code #LATEST} token in a path. Every occurrence of the token is
     * replaced by {@code *}, the resulting glob is expanded, and the match whose string
     * form compares greatest is returned. A path without the token is returned as is.
     * 
     * @param fs the file system to query
     * @param path the path, possibly containing {@code #LATEST}
     * @return the resolved path
     * @throws IOException if the token is present but nothing matches
     */
    public static Path getLatestPath(FileSystem fs, Path path) throws IOException {
        String pathStr = path.toString();

        // Return the same path, if there is no "#LATEST" within it
        if (!pathStr.contains(LATEST_TOKEN)) {
            return path;
        }

        // Literal (non-regex) replacement of every #LATEST with the glob "*"
        String globStr = pathStr.replace(LATEST_TOKEN, "*");

        FileStatus[] fileStatus = fs.globStatus(new Path(globStr));

        if (fileStatus == null || fileStatus.length == 0) {
            throw new IOException("Cannot determine paths at " + globStr);
        }

        String latestPath = null;
        for (FileStatus status : fileStatus) {
            String thisPath = status.getPath().toString();
            if (latestPath == null || thisPath.compareTo(latestPath) > 0) {
                latestPath = thisPath;
            }
        }
        return new Path(latestPath);
    }

    /**
     * Enumerates the dated directories under {@code root} between {@code startDate} and
     * {@code endDate} (both inclusive), walking backwards from {@code endDate}. A
     * directory is included when it exists and contains at least one entry matching
     * {@code *avro}.
     * <p>
     * When a daily directory does not qualify and {@code useHourlyForMissingDaily} is
     * set, the entries of the sibling {@code hourly/yyyy/MM/dd} directory are added
     * instead, if that directory exists.
     * 
     * @param fs the file system to query
     * @param root the {@code daily} or {@code hourly} base directory
     * @param startDate first instant of the range
     * @param endDate last instant of the range
     * @param isDaily true to generate {@code yyyy/MM/dd} paths, false to generate
     *        {@code yyyy/MM/dd/HH} paths
     * @param hourStep step between generated paths in hours; 24 steps by calendar day
     * @param errorOnMissing when true, a directory that does not qualify (and has no
     *        usable hourly fallback) raises a {@link RuntimeException}
     * @param useHourlyForMissingDaily enables the hourly fallback for daily ranges
     * @return the qualifying paths, most recent first
     * @throws IOException if the file system cannot be queried
     */
    public static List<Path> getDurationPaths(FileSystem fs, Path root, DateTime startDate, DateTime endDate,
            boolean isDaily, int hourStep, boolean errorOnMissing, boolean useHourlyForMissingDaily)
            throws IOException {
        List<Path> paths = new ArrayList<Path>();
        DateTime cursor = endDate;
        while (cursor.compareTo(startDate) >= 0) {
            Path loc;
            if (isDaily) {
                loc = generateDatedPath(root, cursor.getYear(), cursor.getMonthOfYear(), cursor.getDayOfMonth());
            } else {
                loc = generateDatedPath(root, cursor.getYear(), cursor.getMonthOfYear(), cursor.getDayOfMonth(),
                        cursor.getHourOfDay());
            }

            if (containsAvroFiles(fs, loc)) {
                paths.add(loc);
            } else {
                // The fallback location is only built when the fallback applies, so a
                // root without a parent is not dereferenced needlessly.
                Path hourlyDay = null;
                if (isDaily && useHourlyForMissingDaily) {
                    hourlyDay = generateDatedPath(new Path(root.getParent(), HOURLY_DIR), cursor.getYear(),
                            cursor.getMonthOfYear(), cursor.getDayOfMonth());
                }

                if (hourlyDay != null && fs.exists(hourlyDay)) {
                    for (FileStatus hour : fs.listStatus(hourlyDay)) {
                        paths.add(hour.getPath());
                    }
                } else if (errorOnMissing) {
                    // Report the directory that was actually requested, not the
                    // hourly fallback location.
                    throw new RuntimeException("Missing directory " + loc.toString());
                }
            }

            if (hourStep == 24) {
                cursor = cursor.minusDays(1);
            } else {
                cursor = cursor.minusHours(hourStep);
            }
        }
        return paths;
    }

    /**
     * Checks that a directory exists and contains at least one entry matching
     * {@code *avro}. A null glob result is treated as "no matches".
     */
    private static boolean containsAvroFiles(FileSystem fs, Path dir) throws IOException {
        if (!fs.exists(dir)) {
            return false;
        }
        FileStatus[] matches = fs.globStatus(new Path(dir, "*avro"));
        return matches != null && matches.length > 0;
    }

}