org.deeplearning4j.hadoop.util.HdfsUtils.java Source code

Introduction

Here is the source code for org.deeplearning4j.hadoop.util.HdfsUtils.java, a set of static helpers for basic HDFS operations used by the Deeplearning4j Hadoop module.
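
A minimal usage sketch, assuming a reachable HDFS cluster and a host.properties file with an hdfs.host entry on the classpath; the example.txt path is purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.deeplearning4j.hadoop.util.HdfsUtils;

public class HdfsUtilsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Resolve the name node from host.properties (falls back to localhost if unreachable)
        HdfsUtils.setHostForConf(conf);
        // Write a small text file under /user/hdfs/ and read it back
        HdfsUtils.writeText("hello hdfs", conf, HdfsUtils.prependUserPath("example.txt"));
        String contents = HdfsUtils.getContents("", conf, HdfsUtils.prependUserPath("example.txt"));
        System.out.println(contents);
        // Close the FileSystem cached for this configuration
        HdfsUtils.close(conf);
    }
}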

Source

/*
 *
 *  * Copyright 2015 Skymind,Inc.
 *  *
 *  *    Licensed under the Apache License, Version 2.0 (the "License");
 *  *    you may not use this file except in compliance with the License.
 *  *    You may obtain a copy of the License at
 *  *
 *  *        http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  *    Unless required by applicable law or agreed to in writing, software
 *  *    distributed under the License is distributed on an "AS IS" BASIS,
 *  *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  *    See the License for the specific language governing permissions and
 *  *    limitations under the License.
 *
 */

package org.deeplearning4j.hadoop.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A set of utilities for basic hdfs operations
 * @author Adam Gibson
 *
 */
public class HdfsUtils {

    private HdfsUtils() {
    }

    private static final Logger log = LoggerFactory.getLogger(HdfsUtils.class);
    private static Map<Configuration, FileSystem> systems = new HashMap<Configuration, FileSystem>();
    public final static String HDFS_HOST = "hdfs.host";

    public static void setRunLocal(Configuration conf) {
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.job.tracker", "local");
        conf.set("mapred.system.dir", "/tmp/mapred/system");
        conf.set("mapred.local.dir", "/tmp/mapred");
        conf.set("hadoop.tmp.dir", "/tmp");

    }

    public static void setJarFileFor(Configuration conf, Class<?> jarClass) {
        String jar = findJar(jarClass);
        conf.setClassLoader(Thread.currentThread().getContextClassLoader());
        conf.set("mapred.jar", jar);
    }

    public static String findJar(Class<?> my_class) {
        ClassLoader loader = my_class.getClassLoader();
        String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
        try {
            for (Enumeration<?> itr = loader.getResources(class_file); itr.hasMoreElements();) {
                URL url = (URL) itr.nextElement();
                if ("jar".equals(url.getProtocol())) {
                    String toReturn = url.getPath();
                    if (toReturn.startsWith("file:")) {
                        toReturn = toReturn.substring("file:".length());
                    }
                    //URLDecoder is a misnamed class, since it actually decodes
                    // x-www-form-urlencoded MIME type rather than actual
                    // URL encoding (which the file path has). Therefore it would
                    // decode +s to ' 's which is incorrect (spaces are actually
                    // either unencoded or encoded as "%20"). Replace +s first, so
                    // that they are kept sacred during the decoding process.
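                    // For example, a jar path segment such as "foo+bar%20baz.jar"
                    // should decode to "foo+bar baz.jar", not "foo bar baz.jar".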
                    toReturn = toReturn.replaceAll("\\+", "%2B");
                    toReturn = URLDecoder.decode(toReturn, "UTF-8");
                    return toReturn.replaceAll("!.*$", "");
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return null;
    }

    /**
     * Returns the hdfs host, resolving it via setHostForConf(conf) if it has not been set yet
     * @param conf the configuration to use
     * @return the value of hdfs.host
     */
    public static String getHost(Configuration conf) {
        if (conf.get("hdfs.host") == null) {
            try {
                HdfsUtils.setHostForConf(conf);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        return conf.get("hdfs.host");
    }

    public static void setHadoopUser(String userName) {
        System.setProperty("HADOOP_USER_NAME", userName);
    }

    /**
     * Returns the default file system used to reach hdfs
     * @param conf the configuration to use
     * @return conf.get("fs.defaultFS")
     */
    public static String getHdfs(Configuration conf) {
        return conf.get("fs.defaultFS");
    }

    /**
     * Set the name node and job tracker for a given host
     * based on an autodiscovered file: host.properties
     * @param conf the configuration to set
     * @throws IOException
     */
    public static void setHostForConf(Configuration conf) throws IOException {
        Properties prop = new Properties();
        InputStream in = HdfsUtils.class.getResourceAsStream("/host.properties");
        if (in == null) {
            throw new IOException("No host.properties found");
        }

        prop.load(in);
        in.close();
        String host = prop.getProperty("hdfs.host", "localhost");
        boolean reachable = InetAddress.getByName(host).isReachable(1000);
        if (!reachable) {
            log.warn("Host " + host + " was not reachable! Falling back to localhost");
            host = "localhost";
        }
        log.info("Using host  " + host);
        conf.set("hdfs.host", host);
        conf.set("fs.defaultFS", String.format("hdfs://%s:8020", host));
        conf.set("mapred.job.tracker", String.format("hdfs://%s:8021", host));
    }

    public static void cleanup(Configuration conf) throws Exception {
        if (conf.get(HDFS_HOST) != null) {
            String hdfs = getHost(conf);
            HdfsLock lock = new HdfsLock(hdfs, 2181);
            if (lock.isLocked()) {
                if (log.isDebugEnabled())
                    log.debug("Returning paths; already found host");
                List<Path> paths = lock.getPaths();
                FileSystem system = FileSystem.get(conf);
                for (Path path : paths)
                    system.delete(path, true);
                lock.delete();
            }
            lock.close();
        }
    }

    /**
     * Uploads the jars on the local classpath to hdfs and registers them on the
     * job's classpath and distributed cache, using an HdfsLock so that concurrent
     * jobs can reuse an existing upload.
     * Adapted from
     * http://terrier.org/docs/v3.5/javadoc/org/terrier/utility/io/HadoopUtility.html#saveClassPathToJob%28org.apache.hadoop.mapred.JobConf%29
     * @param jobConf the job configuration to populate
     * @return the hdfs paths of the uploaded jars
     * @throws Exception if the jars cannot be uploaded or the lock cannot be created
     */
    public static List<Path> saveClassPathToJob(JobConf jobConf) throws Exception {
        String hdfs = getHost(jobConf);

        HdfsLock lock = new HdfsLock(hdfs);
        String hdfs2 = getHdfs(jobConf);
        if (jobConf.get(HDFS_HOST) != null) {
            if (lock.isLocked()) {
                List<Path> ret = lock.getPaths();
                StringBuffer files = new StringBuffer();
                StringBuffer classPath = new StringBuffer();
                for (Path path : ret) {
                    files.append(hdfs2 + path.toString());
                    files.append(",");
                    classPath.append(hdfs2 + path.toString());
                    classPath.append(":");
                    jobConf.addResource(path.toUri().toURL());
                }
                String classPathToSet = classPath.toString().substring(0, classPath.lastIndexOf(":"));
                String filesToSet = files.toString().substring(0, files.lastIndexOf(","));
                log.info("Setting class path " + classPathToSet);
                log.info("Using files " + filesToSet);
                jobConf.set("mapred.cache.files", filesToSet);
                jobConf.set("mapred.job.classpath.files", classPathToSet);
                return ret;
            }
        }
        List<Path> paths = new ArrayList<Path>();
        log.info("Copying classpath to job");

        final String[] jars = findJarFiles(new String[] { System.getenv().get("CLASSPATH"),
                System.getProperty("java.class.path"), System.getProperty("surefire.test.class.path") });

        final FileSystem defFS = FileSystem.get(jobConf);
        int numFilesWritten = 0;
        for (String jarFile : jars) {
            //class path issues
            if (jarFile.contains("hadoop-client")) {
                log.info("Skipping hadoop-client");
                continue;
            } else if (jarFile.contains("mapreduce-run")) {
                log.info("Skipping map reduce run");
                continue;
            }

            Path srcJarFilePath = new Path("file:///" + jarFile);
            String filename = srcJarFilePath.getName();
            Path tmpJarFilePath = makeFile(jobConf, filename);
            log.info("Uploading " + jarFile + " to " + tmpJarFilePath.toString());
            try {
                defFS.copyFromLocalFile(srcJarFilePath, tmpJarFilePath);
                jobConf.addResource(tmpJarFilePath);
                paths.add(tmpJarFilePath);
                numFilesWritten++;
            } catch (Exception e) {
                for (Path path : paths) {
                    if (defFS.exists(path))
                        defFS.delete(path, true);
                }

                lock.close();
                log.error(String.format("Exception writing to hdfs; rolling back %d jar files ", numFilesWritten),
                        e);
                throw new IOException("Couldn't write jar file " + jarFile);
            }
        }
        try {
            lock.create(paths);
        } catch (KeeperException.SessionExpiredException e) {
            lock = new HdfsLock(hdfs);
            lock.create(paths);

        }

        lock.close();
        //drop any paths that do not actually exist on hdfs, so callers only see what was uploaded

        Set<Path> remove = new HashSet<Path>();
        for (Path path : paths) {
            boolean exists = false;
            try {
                exists = defFS.exists(path);
            } catch (IllegalArgumentException e) {
                exists = false;
            }
            if (!exists)
                remove.add(path);
        }
        paths.removeAll(remove);
        return paths;
    }

    protected static final String[] checkSystemProperties = { "file", "java", "line", "os", "path", "sun", "user" };
    protected static final Random random = new Random();

    public static Path makeTemporaryFile(Configuration jobConf, String filename) throws IOException {
        final int randomKey = jobConf.getInt("terrier.tempfile.id", random.nextInt());
        jobConf.setInt("terrier.tempfile.id", randomKey);
        FileSystem defFS = FileSystem.get(jobConf);
        final Path tempFile = new Path("/tmp/" + (randomKey) + "-" + filename);
        defFS.deleteOnExit(tempFile);
        return tempFile;
    }

    public static Path makeFile(Configuration jobConf, String filename) throws IOException {
        final int randomKey = jobConf.getInt("terrier.tempfile.id", random.nextInt());
        jobConf.setInt("terrier.tempfile.id", randomKey);
        final Path tempFile = new Path("/tmp/" + (randomKey) + "-" + filename);
        return tempFile;
    }

    protected static String[] findJarFiles(String[] classPathLines) {
        Set<String> jars = new HashSet<String>();
        for (String locationsLine : classPathLines) {
            if (locationsLine == null)
                continue;
            for (String CPentry : locationsLine.split(":")) {
                if (CPentry.endsWith(".jar"))
                    jars.add(new File(CPentry).getAbsoluteFile().toString());
            }
        }
        return jars.toArray(new String[0]);
    }

    public static void close(Configuration conf) throws Exception {
        FileSystem system = systems.get(conf);
        if (system != null) {
            system.close();
            systems.remove(conf);

        }
    }

    public static String getHdfsUri(Configuration conf) {
        return conf.get("fs.default.name", "127.0.0.1:8020");
    }

    private static FileSystem getFileSystem(Configuration conf) throws Exception {
        FileSystem ret = systems.get(conf);
        if (ret == null) {
            ret = FileSystem.get(conf);
            systems.put(conf, ret);
        }
        return ret;
    }

    public static void ensureUserDirExists(Configuration conf) throws Exception {
        FileSystem fs = getFileSystem(conf);
        if (!fs.exists(new Path(prependUserPath("")))) {
            boolean dirs = fs.mkdirs(new Path(prependUserPath("")));
            if (!dirs)
                throw new IllegalStateException("Couldn't make " + prependUserPath(""));
            FileUtil.chmod(prependUserPath(""), "777");
        }

    }

    public static void createFile(String path, Configuration conf) throws Exception {
        createFile(path, conf, true);

    }

    public static void ensureParentDirectoriesExist(String basePath, Configuration conf) throws Exception {
        File f = new File(basePath);
        if (!pathExists(conf, f.getParent()))
            mkdir(f.getParent(), conf, false);
    }

    public static void createFile(String path, Configuration conf, boolean prepend) throws Exception {
        FileSystem fs = getFileSystem(conf);
        ensureParentDirectoriesExist(path, conf);
        //returns an output stream, ensure it's closed
        if (!prepend && !pathExists(conf, path)) {
            OutputStream os = fs.create(new Path(path));
            IOUtils.closeQuietly(os);
        } else if (!pathExists(conf, prependUserPath(path))) {
            OutputStream os = fs.create(new Path(prependUserPath(path)));
            IOUtils.closeQuietly(os);
        }
    }

    public static boolean pathExists(Configuration conf, String path) throws Exception {
        FileSystem fs = getFileSystem(conf);
        boolean ret = fs.exists(new Path(path));
        return ret;
    }

    public static String getUser() {
        return "hdfs";
    }

    public static String getUserPath() {
        return "/user/" + getUser() + "/";
    }

    public static String prependUserPath(String path) {
        return "/user/" + getUser() + "/" + path;
    }

    public static void deleteUserDir(Configuration conf) throws Exception {
        FileSystem fs = getFileSystem(conf);
        if (fs.exists(new Path(prependUserPath("")))) {
            boolean delete = fs.delete(new Path(prependUserPath("")), true);
            if (!delete)
                throw new RuntimeException("Couldn't delete file " + prependUserPath(""));
        }
    }

    public static void writeText(String text, Configuration conf, String file) throws Exception {
        FileSystem fs = getFileSystem(conf);
        if (pathExists(conf, file))
            deletePath(file, conf, false);
        ensureParentDirectoriesExist(file, conf);

        FSDataOutputStream fdos = fs.create(new Path(file), true);
        fdos.writeBytes(text);
        fdos.flush();
        fdos.close();
    }

    public static String getContents(String text, Configuration conf, String file) throws Exception {
        FileSystem fs = getFileSystem(conf);
        if (!pathExists(conf, file))
            return "";

        InputStream is = fs.open(new Path(file));
        StringBuffer sb = new StringBuffer();
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
        String line = null;
        while ((line = reader.readLine()) != null)
            sb.append(line);
        reader.close();
        is.close();

        return sb.toString();
    }

    public static void deletePath(String path, Configuration conf, boolean prependUserPath) throws Exception {
        FileSystem fs = getFileSystem(conf);
        if (!prependUserPath && pathExists(conf, path))
            fs.delete(new Path(path), true);
        else if (pathExists(conf, prependUserPath(path)))
            fs.delete(new Path(prependUserPath(path)), true);
    }

    public static void mkdir(String path, Configuration conf, boolean prependUserPath) throws Exception {
        if (prependUserPath)
            ensureUserDirExists(conf);

        FileSystem fs = getFileSystem(conf);
        if (prependUserPath && !fs.exists(new Path(prependUserPath(path))))
            fs.mkdirs(new Path(prependUserPath(path)));
        if (!fs.exists(new Path(path)))
            fs.mkdirs(new Path(path));
    }

    public static void deleteDir(String path, Configuration conf) throws Exception {
        deletePath(path, conf, true);
    }

    public static void mkdir(String path, Configuration conf) throws Exception {
        mkdir(path, conf, true);
    }

}
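
A short sketch of how the job-setup helpers above might be combined when configuring a classic mapred job. It assumes ZooKeeper is running for HdfsLock and that host.properties is on the classpath; HdfsUtils.class is used as a stand-in for your own job class.

import org.apache.hadoop.mapred.JobConf;
import org.deeplearning4j.hadoop.util.HdfsUtils;

public class JobSetupExample {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // Name node and job tracker resolved from host.properties
        HdfsUtils.setHostForConf(jobConf);
        // Point mapred.jar at the jar containing the job class (normally your own class)
        HdfsUtils.setJarFileFor(jobConf, HdfsUtils.class);
        // Upload the local classpath jars to hdfs and register them with the job
        HdfsUtils.saveClassPathToJob(jobConf);
        // ... configure mappers/reducers and submit the job here ...
        // Afterwards, delete the uploaded jars and release the lock
        HdfsUtils.cleanup(jobConf);
    }
}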