org.cloudata.core.parallel.hadoop.CloudataMapReduceUtil.java Source code

Introduction

Here is the source code for org.cloudata.core.parallel.hadoop.CloudataMapReduceUtil.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cloudata.core.parallel.hadoop;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.cloudata.core.common.conf.CloudataConf;
import org.cloudata.core.common.util.FileUtil;
import org.cloudata.core.fs.CloudataFileSystem;
import org.cloudata.core.fs.GPath;

/**
 * Adds Cloudata's libraries to the classpath of a MapReduce job.
 * <p>
 * Typical usage: call {@link #initMapReduce(JobConf)} before submitting the job,
 * keep the returned directory name, and pass it to {@link #clearMapReduce(String)}
 * once the job has finished.
 */
public class CloudataMapReduceUtil {
    public static final Log LOG = LogFactory.getLog(CloudataMapReduceUtil.class.getName());

    /** Regular expression matched against file names to find the Cloudata jar. */
    static String CLOUDATA_JAR = "cloudata-[0-9]\\.[0-9](\\p{Alnum}|-|\\.)*\\.jar";

    /** Regular expression matched against file names to find the ZooKeeper jar. */
    static String ZOOKEEPER_JAR = "zookeeper-[0-9]\\.[0-9](\\p{Alnum}|-|\\.)*\\.jar";

    /** Exact (literal) file name of the Cloudata site configuration file. */
    static String CLOUDATA_CONF = "cloudata-site.xml";

    /** Configuration used to obtain the file system the libraries are uploaded to. */
    static Configuration conf = new Configuration();

    /**
     * Configures the classpath of a MapReduce job which uses Cloudata.<BR>
     * 1. Scans every entry of <code>java.class.path</code> whose path contains
     *    "cloudata", "lib" or "conf" (directories are searched recursively).<BR>
     * 2. Uploads each file whose name matches the Cloudata or ZooKeeper jar
     *    pattern to a temporary directory on the file system.<BR>
     * 3. Wraps each cloudata-site.xml found into a jar named
     *    cloudata-site.xml.jar and uploads it to the same directory.<BR>
     * 4. Calls DistributedCache.addArchiveToClassPath() for each uploaded file.<BR>
     * Must call clearMapReduce() after running job.
     * <p>
     * If anything fails after the temporary directory has been created, the
     * directory is removed again (best effort) before the exception propagates,
     * because the caller never receives its name in that case.
     *
     * @param jobConf job configuration the uploaded archives are registered with
     * @return name of the temporary library directory. When calling clearMapReduce(), use this value.
     * @throws IOException if the classpath is empty, no library file is found,
     *         or an upload fails
     */
    public static String initMapReduce(JobConf jobConf) throws IOException {
        String[] classpaths = System.getProperty("java.class.path", "").split(File.pathSeparator);

        if (classpaths == null || classpaths.length == 0) {
            throw new IOException("No classpath");
        }

        String jarPath = "Cloudata_Lib_" + System.currentTimeMillis();

        FileSystem fs = FileSystem.get(conf);

        Path parentPath = fs.makeQualified(new Path(jarPath));
        //upload jar to dfs
        fs.mkdirs(parentPath);

        boolean success = false;
        try {
            List<String> uploadedFiles = new ArrayList<String>();
            for (String eachPath : classpaths) {
                if (eachPath.indexOf("cloudata") >= 0 || eachPath.indexOf("lib") >= 0
                        || eachPath.indexOf("conf") >= 0) {
                    uploadFile(fs, parentPath, new File(eachPath), uploadedFiles);
                }
            }

            if (uploadedFiles.size() == 0) {
                throw new IOException("No lib files[cloudata-xxx-core.jar, zookeeper-xxx.jar] in classpath");
            }

            Path rootPath = new Path("/");
            String rootUri = fs.makeQualified(rootPath).toString();

            for (String eachPath : uploadedFiles) {
                Path path = fs.makeQualified(new Path(eachPath));
                String pathStr = path.toUri().toString();
                // Strip the file system's root URI so that only the absolute path remains.
                if (pathStr.indexOf(rootUri) >= 0) {
                    pathStr = pathStr.substring(pathStr.indexOf(rootUri) + rootUri.length());
                }
                if (!pathStr.startsWith("/")) {
                    pathStr = "/" + pathStr;
                }
                LOG.debug("DistributedCache.addArchiveToClassPath: " + pathStr);
                DistributedCache.addArchiveToClassPath(new Path(pathStr), jobConf);
            }
            success = true;
        } finally {
            if (!success) {
                // The caller cannot clean up because it never gets jarPath back.
                try {
                    fs.delete(parentPath, true);
                } catch (IOException e) {
                    LOG.warn("Can't delete temporary library directory: " + parentPath, e);
                }
            }
        }
        return jarPath;
    }

    /**
     * Uploads <code>file</code> to <code>parentPath</code> if it is one of the
     * files a Cloudata job needs; directories are traversed recursively.
     *
     * @param fs target file system
     * @param parentPath target directory on <code>fs</code>
     * @param file local file or directory to inspect
     * @param uploadedFiles receives the URI string of every file uploaded
     * @throws IOException if reading the local file or writing to <code>fs</code> fails
     */
    private static void uploadFile(FileSystem fs, Path parentPath, File file, List<String> uploadedFiles)
            throws IOException {
        if (file.isDirectory()) {
            File[] files = file.listFiles();
            if (files == null || files.length == 0) {
                return;
            }
            for (File eachFile : files) {
                uploadFile(fs, parentPath, eachFile, uploadedFiles);
            }
            return;
        }

        String fileName = file.getName();

        if (fileName.matches(CLOUDATA_JAR) || fileName.matches(ZOOKEEPER_JAR)) {
            Path uploadPath = new Path(parentPath, fileName);
            OutputStream out = fs.create(uploadPath);
            try {
                FileUtil.dumpFile(file.getPath(), out);
            } finally {
                // Close even when the copy fails so the stream is not leaked.
                out.close();
            }
            uploadedFiles.add(uploadPath.toUri().toString());
        } else if (CLOUDATA_CONF.equals(fileName)) {
            // CLOUDATA_CONF is a literal file name, not a pattern, so compare with equals().
            // The configuration file is wrapped in a jar before it is uploaded.
            Path uploadPath = makeJarToHDFS(fs, parentPath, file);
            uploadedFiles.add(uploadPath.toUri().toString());
        }
    }

    /**
     * Writes a jar containing the single entry <code>file</code> to
     * <code>parentPath</code>. The jar is named after the file with ".jar" appended.
     *
     * @param fs target file system
     * @param parentPath target directory on <code>fs</code>
     * @param file local file to wrap in a jar
     * @return path of the created jar
     * @throws IOException if the local file can't be read or the jar can't be written
     */
    private static Path makeJarToHDFS(FileSystem fs, Path parentPath, File file) throws IOException {
        Path path = new Path(parentPath, file.getName() + ".jar");

        // Open the local file first: if it can't be read, no remote file is created.
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
        try {
            JarOutputStream out = new JarOutputStream(fs.create(path));
            try {
                out.putNextEntry(new JarEntry(file.getName()));

                byte[] buf = new byte[1024];
                int readBytes;
                while ((readBytes = in.read(buf)) != -1) {
                    out.write(buf, 0, readBytes);
                }
                out.closeEntry();
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }

        return path;
    }

    /**
     * Clear temporary library directory.
     * <p>
     * NOTE(review): the directory is created through Hadoop's FileSystem in
     * initMapReduce() but deleted here through CloudataFileSystem; presumably both
     * resolve <code>jarPath</code> to the same location - confirm.
     *
     * @param jarPath directory name returned by initMapReduce()
     * @throws IOException if the delete fails
     */
    public static void clearMapReduce(String jarPath) throws IOException {
        CloudataFileSystem fs = CloudataFileSystem.get(new CloudataConf());
        GPath parentPath = new GPath(jarPath);
        fs.delete(parentPath, true);
    }

    /**
     * Extracts the task number from the job's <code>mapred.task.id</code>.
     * <p>
     * The id is expected to end with <code>_&lt;task number&gt;_&lt;attempt number&gt;</code>,
     * e.g. <code>attempt_200707121733_0003_m_000005_0</code> yields 5. The number is
     * taken as the run of digits directly before the last underscore, so it does
     * not depend on how many digits the task or attempt number has.
     *
     * @param jobConf configuration of the running task
     * @return the task number contained in the task id
     * @throws IllegalArgumentException if <code>mapred.task.id</code> is not set
     * @throws NumberFormatException if the task id does not have the expected form
     */
    public static int getMapNum(JobConf jobConf) {
        String taskId = jobConf.get("mapred.task.id");
        if (taskId == null) {
            throw new IllegalArgumentException("mapred.task.id is not set in JobConf");
        }

        // The last underscore separates the task number from the attempt number.
        int attemptSeparator = taskId.lastIndexOf('_');
        int start = attemptSeparator;
        while (start > 0 && Character.isDigit(taskId.charAt(start - 1))) {
            start--;
        }
        if (attemptSeparator <= 0 || start == attemptSeparator) {
            throw new NumberFormatException("Invalid mapred.task.id: " + taskId);
        }
        return Integer.parseInt(taskId.substring(start, attemptSeparator));
    }

    public static void main(String[] args) throws Exception {

    }
}