com.netflix.aegisthus.tools.StorageHelper.java Source code

Java tutorial

Introduction

Here is the source code for com.netflix.aegisthus.tools.StorageHelper.java

Source

/**
 * Copyright 2014 Netflix, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.netflix.aegisthus.tools;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * Helper for staging task output under a temporary base directory and for
 * tracking which files have been committed.
 *
 * <p>
 * Layout derived from the base temp directory ({@link #CFG_BASE_TEMP_DIR}):
 * <ul>
 * <li>{@code <base>/<taskId>-<attemptId>} - per task-attempt staging area</li>
 * <li>{@code <base>/commits/commit-<taskId>} - per task commit log, one
 * committed file name per line</li>
 * </ul>
 *
 * <p>
 * An instance created from a bare {@link Configuration} has no task context;
 * methods that need the task or attempt id throw {@link IOException} in that
 * case. Instances cache commit-log contents and are not synchronized.
 */
public class StorageHelper {
    // NOTE(review): these keys are never reassigned in this class and would
    // normally be final; left non-final so the public interface is unchanged.
    public static String CFG_BASE_TEMP_DIR = "storagehelper.base.temp.dir";
    public static String CFG_OUTPUT_DIR = "storagehelper.output.dir";
    public static String CFG_STORAGE_DEBUG = "storagehelper.debug";

    /** Encoding used for both writing and reading the commit logs. */
    private static final String COMMIT_LOG_CHARSET = "UTF-8";

    private static final Log LOG = LogFactory.getLog(StorageHelper.class);

    private final Configuration config;
    /** Null when the helper was built from a bare Configuration. */
    private final TaskAttemptContext ctx;
    private final boolean debug;

    /** Cached contents of every commit log of the job (lazy). */
    private Set<String> jobFiles = null;
    /** Cached contents of the commit log of task {@link #taskFilesTaskId} (lazy). */
    private Set<String> taskFiles = null;
    /** Task id that {@link #taskFiles} was loaded for; only meaningful when taskFiles != null. */
    private int taskFilesTaskId;

    /**
     * Creates a helper that is not bound to a task attempt.
     *
     * @param config
     *            job configuration to read settings from (and to store the
     *            generated base temp dir in, see {@link #getBaseTempLocation()})
     */
    public StorageHelper(Configuration config) {
        this.ctx = null;
        this.config = config;
        this.debug = config.getBoolean(CFG_STORAGE_DEBUG, false);
    }

    /**
     * Creates a helper bound to a task attempt; the configuration is taken
     * from the context.
     *
     * @param ctx
     *            the running task attempt
     */
    public StorageHelper(TaskAttemptContext ctx) {
        this.ctx = ctx;
        this.config = ctx.getConfiguration();
        this.debug = config.getBoolean(CFG_STORAGE_DEBUG, false);
    }

    /** Directory that holds all commit logs: {@code <base>/commits}. */
    private Path commitPath() {
        String base = getBaseTempLocation();
        return new Path(base, "commits");
    }

    /** Commit log of a single task: {@code <base>/commits/commit-<taskId>}. */
    private Path commitPath(int taskId) {
        Path commits = commitPath();
        return new Path(commits, String.format("commit-%d", taskId));
    }

    /**
     * Copies {@code file} into the {@code prefix} sub-directory of this task
     * attempt's temp location, keeping the file's own name.
     *
     * @param file
     *            source file path
     * @param prefix
     *            sub-directory below the task-attempt temp location
     * @param snappy
     *            passed through to {@code Utils.copy} (presumably selects
     *            snappy handling - confirm against Utils)
     * @throws IOException
     *             if not running in a task attempt or the copy fails
     */
    public void copyToTemp(String file, String prefix, boolean snappy) throws IOException {
        String target = getBaseTaskAttemptTempLocation();
        copyIntoDirectory(file, new Path(target, prefix), snappy);
    }

    /**
     * Copies {@code file} directly into this task attempt's temp location,
     * keeping the file's own name.
     *
     * @param file
     *            source file path
     * @param snappy
     *            passed through to {@code Utils.copy} (presumably selects
     *            snappy handling - confirm against Utils)
     * @throws IOException
     *             if not running in a task attempt or the copy fails
     */
    public void copyToTemp(String file, boolean snappy) throws IOException {
        String target = getBaseTaskAttemptTempLocation();
        copyIntoDirectory(file, new Path(target), snappy);
    }

    /**
     * Shared body of the copyToTemp overloads: logs, updates the task status
     * and copies {@code file} to {@code targetDir/<file name>}.
     */
    private void copyIntoDirectory(String file, Path targetDir, boolean snappy) throws IOException {
        Path filePath = new Path(file);
        Path fullPath = new Path(targetDir, filePath.getName());

        String status = String.format("copying %s to %s", file, fullPath.toUri().toString());
        LOG.info(status);
        ctx.setStatus(status);
        Utils.copy(filePath, fullPath, snappy, ctx);
    }

    /**
     * Recursively deletes the base temp location.
     *
     * @throws IOException
     *             if the filesystem cannot be obtained or the delete fails
     */
    public void deleteBaseTempLocation() throws IOException {
        String base = getBaseTempLocation();
        Path path = new Path(base);
        FileSystem fs = path.getFileSystem(config);
        LOG.info(String.format("deleting: %s", base));
        fs.delete(path, true);
    }

    /**
     * Deletes every file listed in the job's commit logs.
     *
     * @throws IOException
     *             if the commit logs cannot be read or a delete fails
     */
    public void deleteCommitted() throws IOException {
        for (String file : getCommittedFiles()) {
            LOG.info(String.format("deleting: %s", file));
            Utils.delete(config, new Path(file), false);
        }
    }

    /**
     * @return the numeric id of the current task attempt
     * @throws IOException
     *             if this helper has no task context
     */
    private int getAttemptId() throws IOException {
        if (ctx == null) {
            throw new IOException("Not running in a TaskAttemptContext");
        }
        return ctx.getTaskAttemptID().getId();
    }

    /**
     * @return {@code <base>/<taskId>-<attemptId>} for the current task attempt
     * @throws IOException
     *             if this helper has no task context
     */
    public String getBaseTaskAttemptTempLocation() throws IOException {
        int taskId = getTaskId();
        int attemptId = getAttemptId();
        String base = getBaseTempLocation();
        return String.format("%s/%d-%d", base, taskId, attemptId);
    }

    /**
     * Returns the base temp directory. This method has a side effect of
     * setting values in the config for the job: when
     * {@link #CFG_BASE_TEMP_DIR} is unset, a fresh {@code /tmp/<random UUID>}
     * location is generated and stored in the configuration.
     *
     * @return the configured or newly generated base temp location
     */
    public String getBaseTempLocation() {
        String base = config.get(CFG_BASE_TEMP_DIR);
        if (base == null) {
            base = String.format("%s/%s", "/tmp", UUID.randomUUID());
            config.set(CFG_BASE_TEMP_DIR, base);
        }
        return base;
    }

    /**
     * Returns the union of all entries in every commit log found under the
     * commit directory. The result is read once and cached for the lifetime
     * of this instance; the returned set is the cached instance itself.
     *
     * @return all committed file names of the job
     * @throws IOException
     *             if a commit log cannot be listed, opened or read
     */
    public Set<String> getCommittedFiles() throws IOException {
        if (jobFiles == null) {
            List<Path> logs = Lists.newArrayList(DirectoryWalker.with(config).add(commitPath()).paths());
            jobFiles = readCommitLogs(logs);
        }
        return jobFiles;
    }

    /**
     * Returns the entries of the commit log of the given task. The result is
     * cached per task id: asking for a different task id reloads the cache
     * instead of returning the previously requested task's files.
     *
     * @param taskId
     *            task whose commit log should be read
     * @return file names committed by that task
     * @throws IOException
     *             if the commit log cannot be opened or read
     */
    public Set<String> getCommittedFiles(int taskId) throws IOException {
        if (taskFiles == null || taskFilesTaskId != taskId) {
            List<Path> logs = Lists.newArrayList();
            logs.add(commitPath(taskId));
            // Assign only after a successful read so a failure leaves the
            // previous cache entry consistent with its task id.
            Set<String> loaded = readCommitLogs(logs);
            taskFiles = loaded;
            taskFilesTaskId = taskId;
        }
        return taskFiles;
    }

    /**
     * Returns the distinct parent folders of all committed files, obtained by
     * stripping the last {@code /name} segment from each entry.
     *
     * @return folder list in no particular order
     * @throws IOException
     *             if the commit logs cannot be read
     */
    public List<String> getCommittedFolderList() throws IOException {
        Set<String> folders = Sets.newHashSet();
        Set<String> files = getCommittedFiles();
        for (String file : files) {
            folders.add(file.replaceAll("/[^/]+$", ""));
        }
        return Lists.newArrayList(folders);
    }

    /**
     * @return the final output path configured under {@link #CFG_OUTPUT_DIR}
     * @throws IOException
     *             if the output directory has not been configured
     */
    public Path getFinalPath() throws IOException {
        String outputDir = config.get(CFG_OUTPUT_DIR);
        if (outputDir == null) {
            throw new IOException(String.format("%s cannot be null", CFG_OUTPUT_DIR));
        }
        return new Path(outputDir);
    }

    /**
     * @return the numeric id of the current task
     * @throws IOException
     *             if this helper has no task context
     */
    private int getTaskId() throws IOException {
        if (ctx == null) {
            throw new IOException("Not running in a TaskAttemptContext");
        }
        return ctx.getTaskAttemptID().getTaskID().getId();
    }

    /**
     * This method will check if this file was previously committed by this
     * task.
     *
     * @param taskId
     *            task whose commit log is consulted
     * @param file
     *            file name exactly as it was passed to {@link #logCommit(String)}
     * @return true if the task's commit log contains {@code file}
     * @throws IOException
     *             if the commit log cannot be opened or read
     */
    public boolean isFileMine(int taskId, String file) throws IOException {
        return getCommittedFiles(taskId).contains(file);
    }

    /**
     * Appends {@code file} as one line to the current task's commit log,
     * creating the log if it does not exist yet.
     *
     * @param file
     *            file name to record
     * @throws IOException
     *             if this helper has no task context or the write fails
     */
    public void logCommit(String file) throws IOException {
        Path log = commitPath(getTaskId());
        if (debug) {
            LOG.info(String.format("logging (%s) to commit log (%s)", file, log.toUri().toString()));
        }
        FileSystem fs = log.getFileSystem(config);
        DataOutputStream os = fs.exists(log) ? fs.append(log) : fs.create(log);
        try {
            // writeBytes() would drop the high byte of every char; encode
            // explicitly so non-ASCII names survive the round trip.
            os.write(file.getBytes(COMMIT_LOG_CHARSET));
            os.write('\n');
        } finally {
            os.close();
        }
    }

    /**
     * Transfers every non-hidden file found under this task attempt's temp
     * location to the final path, preserving relative paths, and reports
     * progress after each file. The transfer is delegated to
     * {@code Utils.copy}; whether the source is removed is not visible here.
     *
     * @throws IOException
     *             if not running in a task attempt, the final path is not
     *             configured, or a transfer fails
     */
    public void moveTaskOutputToFinal() throws IOException {
        String tempLocation = getBaseTaskAttemptTempLocation();
        Path path = new Path(tempLocation);
        List<String> relativePaths = Lists
                .newArrayList(DirectoryWalker.with(config).threaded().omitHidden().add(path).relativePathStrings());
        Path finalPath = getFinalPath();
        for (String relative : relativePaths) {
            LOG.info(String.format("moving (%s) from (%s) to (%s)", relative, path.toUri().toString(),
                    finalPath.toUri().toString()));
            Utils.copy(new Path(relative), path, finalPath, ctx);
            ctx.progress();
        }
    }

    /**
     * Reads the given commit logs and returns the set of all lines found.
     * When a task context is available, progress is reported each time the
     * number of distinct entries reaches a multiple of 1000.
     *
     * @param logs
     *            commit log files; all are expected on the same filesystem
     * @return distinct lines of all logs
     * @throws IOException
     *             if a log cannot be opened or read
     */
    private Set<String> readCommitLogs(List<Path> logs) throws IOException {
        Set<String> files = Sets.newHashSet();
        FileSystem fs = null;

        for (Path log : logs) {
            // all logs are on the same filesystem.
            if (fs == null) {
                fs = log.getFileSystem(config);
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(log), COMMIT_LOG_CHARSET));
            try {
                String file;
                while ((file = br.readLine()) != null) {
                    // Only count newly added entries so duplicates do not
                    // trigger repeated progress calls at the same size.
                    if (files.add(file) && ctx != null && files.size() % 1000 == 0) {
                        ctx.progress();
                    }
                }
            } finally {
                br.close();
            }
        }
        return files;
    }

    /**
     * Stores the final output directory in the configuration under
     * {@link #CFG_OUTPUT_DIR}.
     *
     * @param path
     *            final output directory
     */
    public void setFinalPath(String path) {
        config.set(CFG_OUTPUT_DIR, path);
    }
}