Java tutorial: Flink's MapR file system wrapper

The listing below is org.apache.flink.runtime.fs.maprfs.MapRFileSystem, Apache Flink's implementation of its FileSystem base class for the MapR file system. The MapR-specific parts are limited to loading MapR's Hadoop-compatible file system class via reflection and discovering the cluster's CLDB locations; everything else delegates to the existing HDFS wrapper classes.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.fs.maprfs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.flink.core.fs.BlockLocation;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.fs.hdfs.DistributedBlockLocation;
import org.apache.flink.runtime.fs.hdfs.DistributedDataInputStream;
import org.apache.flink.runtime.fs.hdfs.DistributedDataOutputStream;
import org.apache.flink.runtime.fs.hdfs.DistributedFileStatus;

/**
 * Concrete implementation of the {@link FileSystem} base class for the MapR
 * file system. The class contains MapR-specific code to initialize the
 * connection to the file system. Apart from that, the code mainly reuses the
 * existing HDFS wrapper code.
 */
public final class MapRFileSystem extends FileSystem {

    /**
     * The log object used for debugging.
     */
    private static final Log LOG = LogFactory.getLog(MapRFileSystem.class);

    /**
     * The name of MapR's class containing the implementation of the Hadoop
     * HDFS interface.
     */
    private static final String MAPR_FS_IMPL_CLASS = "com.mapr.fs.MapRFileSystem";

    /**
     * Name of the environment variable to determine the location of the MapR
     * installation.
     */
    private static final String MAPR_HOME_ENV = "MAPR_HOME";

    /**
     * The default location of the MapR installation.
     */
    private static final String DEFAULT_MAPR_HOME = "/opt/mapr/";

    /**
     * The path relative to the MAPR_HOME where MapR stores how to access the
     * configured clusters.
     */
    private static final String MAPR_CLUSTER_CONF_FILE = "/conf/mapr-clusters.conf";

    /**
     * A Hadoop configuration object used during the file system
     * initialization.
     */
    private final org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();

    /**
     * The MapR class containing the implementation of the Hadoop HDFS
     * interface.
     */
    private final Class<? extends org.apache.hadoop.fs.FileSystem> fsClass;

    /**
     * The MapR implementation of the Hadoop HDFS interface.
     */
    private org.apache.hadoop.fs.FileSystem fs;

    /**
     * Creates a new MapRFileSystem object to access the MapR file system.
     *
     * @throws IOException
     *         thrown if the required MapR classes cannot be found
     */
    @SuppressWarnings("unchecked")
    public MapRFileSystem() throws IOException {

        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Trying to load class %s to access the MapR file system",
                MAPR_FS_IMPL_CLASS));
        }

        try {
            this.fsClass = (Class<? extends org.apache.hadoop.fs.FileSystem>) Class
                .forName(MAPR_FS_IMPL_CLASS);
        } catch (Exception e) {
            throw new IOException(String.format(
                "Cannot find class %s, probably the runtime was not compiled against the MapR Hadoop libraries",
                MAPR_FS_IMPL_CLASS), e);
        }
    }

    @Override
    public Path getWorkingDirectory() {
        return new Path(this.fs.getWorkingDirectory().toUri());
    }

    @Override
    public URI getUri() {
        return this.fs.getUri();
    }

    @Override
    public void initialize(final URI path) throws IOException {

        if (LOG.isInfoEnabled()) {
            LOG.info(String.format("Initializing MapR file system for path %s", path.toString()));
        }

        final String authority = path.getAuthority();
        if (authority == null || authority.isEmpty()) {

            // Use the default constructor to instantiate the MapR file system object
            try {
                this.fs = this.fsClass.newInstance();
            } catch (Exception e) {
                throw new IOException(e);
            }
        } else {

            // We have an authority, check the MapR cluster configuration to
            // find the CLDB locations.
            final String[] cldbLocations = getCLDBLocations(authority);

            // Find the appropriate constructor
            final Constructor<? extends org.apache.hadoop.fs.FileSystem> constructor;
            try {
                constructor = this.fsClass.getConstructor(String.class, String[].class);
            } catch (NoSuchMethodException e) {
                throw new IOException(e);
            }

            // Instantiate the file system object
            try {
                this.fs = constructor.newInstance(authority, cldbLocations);
            } catch (Exception e) {
                throw new IOException(e);
            }
        }

        this.fs.initialize(path, this.conf);
    }

    /**
     * Retrieves the CLDB locations for the given MapR cluster name.
     *
     * @param authority
     *        the name of the MapR cluster
     * @return a list of CLDB locations
     * @throws IOException
     *         thrown if the CLDB locations for the given MapR cluster name
     *         cannot be determined
     */
    private static String[] getCLDBLocations(final String authority) throws IOException {

        // Determine the MapR home
        String maprHome = System.getenv(MAPR_HOME_ENV);
        if (maprHome == null) {
            maprHome = DEFAULT_MAPR_HOME;
        }

        final File maprClusterConf = new File(maprHome, MAPR_CLUSTER_CONF_FILE);

        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Trying to retrieve MapR cluster configuration from %s",
                maprClusterConf));
        }

        // Read the cluster configuration file, format is specified at
        // http://doc.mapr.com/display/MapR/mapr-clusters.conf
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(maprClusterConf));

            String line;
            while ((line = br.readLine()) != null) {

                // Normalize the string
                line = line.trim();
                line = line.replace('\t', ' ');

                final String[] fields = line.split(" ");
                if (fields.length < 1) {
                    continue;
                }

                final String clusterName = fields[0];
                if (!clusterName.equals(authority)) {
                    continue;
                }

                final List<String> cldbLocations = new ArrayList<String>();
                for (int i = 1; i < fields.length; ++i) {

                    // Make sure this is not a key-value pair MapR recently
                    // introduced in the file format along with their security
                    // features.
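                    // For example, in a line such as
                    //     demo.cluster secure=true node1.example.com:7222
                    // the token "secure=true" is an option rather than a CLDB
                    // host, so it is filtered out below. (Cluster name and
                    // host are made-up examples for illustration.)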
                    if (!fields[i].isEmpty() && !fields[i].contains("=")) {
                        cldbLocations.add(fields[i]);
                    }
                }

                if (cldbLocations.isEmpty()) {
                    throw new IOException(String.format(
                        "%s contains an entry for cluster %s but no CLDB locations.",
                        maprClusterConf, authority));
                }

                return cldbLocations.toArray(new String[0]);
            }

        } finally {
            if (br != null) {
                br.close();
            }
        }

        throw new IOException(String.format("Unable to find CLDB locations for cluster %s",
            authority));
    }

    @Override
    public FileStatus getFileStatus(final Path f) throws IOException {
        final org.apache.hadoop.fs.FileStatus status = this.fs
            .getFileStatus(new org.apache.hadoop.fs.Path(f.toString()));
        return new DistributedFileStatus(status);
    }

    @Override
    public BlockLocation[] getFileBlockLocations(final FileStatus file, final long start,
            final long len) throws IOException {

        if (!(file instanceof DistributedFileStatus)) {
            throw new IOException("file is not an instance of DistributedFileStatus");
        }

        final DistributedFileStatus f = (DistributedFileStatus) file;

        final org.apache.hadoop.fs.BlockLocation[] blkLocations = fs
            .getFileBlockLocations(f.getInternalFileStatus(), start, len);

        // Wrap up HDFS-specific block location objects
        final DistributedBlockLocation[] distBlkLocations = new DistributedBlockLocation[blkLocations.length];
        for (int i = 0; i < distBlkLocations.length; i++) {
            distBlkLocations[i] = new DistributedBlockLocation(blkLocations[i]);
        }

        return distBlkLocations;
    }

    @Override
    public FSDataInputStream open(final Path f, final int bufferSize) throws IOException {
        final org.apache.hadoop.fs.FSDataInputStream fdis = this.fs
            .open(new org.apache.hadoop.fs.Path(f.toString()), bufferSize);
        return new DistributedDataInputStream(fdis);
    }

    @Override
    public FSDataInputStream open(final Path f) throws IOException {
        final org.apache.hadoop.fs.FSDataInputStream fdis = this.fs
            .open(new org.apache.hadoop.fs.Path(f.toString()));
        return new DistributedDataInputStream(fdis);
    }

    @Override
    public FSDataOutputStream create(final Path f, final boolean overwrite, final int bufferSize,
            final short replication, final long blockSize) throws IOException {
        final org.apache.hadoop.fs.FSDataOutputStream fdos = this.fs
            .create(new org.apache.hadoop.fs.Path(f.toString()), overwrite, bufferSize,
                replication, blockSize);
        return new DistributedDataOutputStream(fdos);
    }

    @Override
    public FSDataOutputStream create(final Path f, final boolean overwrite) throws IOException {
        final org.apache.hadoop.fs.FSDataOutputStream fdos = this.fs
            .create(new org.apache.hadoop.fs.Path(f.toString()), overwrite);
        return new DistributedDataOutputStream(fdos);
    }

    @Override
    public boolean delete(final Path f, final boolean recursive) throws IOException {
        return this.fs.delete(new org.apache.hadoop.fs.Path(f.toString()), recursive);
    }

    @Override
    public FileStatus[] listStatus(final Path f) throws IOException {
        final org.apache.hadoop.fs.FileStatus[] hadoopFiles = this.fs
            .listStatus(new org.apache.hadoop.fs.Path(f.toString()));
        final FileStatus[] files = new FileStatus[hadoopFiles.length];

        // Convert types
        for (int i = 0; i < files.length; i++) {
            files[i] = new DistributedFileStatus(hadoopFiles[i]);
        }

        return files;
    }

    @Override
    public boolean mkdirs(final Path f) throws IOException {
        return this.fs.mkdirs(new org.apache.hadoop.fs.Path(f.toString()));
    }

    @Override
    public boolean rename(final Path src, final Path dst) throws IOException {
        return this.fs.rename(new org.apache.hadoop.fs.Path(src.toString()),
            new org.apache.hadoop.fs.Path(dst.toString()));
    }

    @SuppressWarnings("deprecation")
    @Override
    public long getDefaultBlockSize() {
        return this.fs.getDefaultBlockSize();
    }

    @Override
    public boolean isDistributedFS() {
        return true;
    }
}
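The only non-trivial logic above is getCLDBLocations(): it scans mapr-clusters.conf for a line whose first token matches the cluster name (the URI authority) and collects the remaining whitespace-separated host:port tokens, skipping key=value options. The following self-contained sketch exercises the same tokenization rules against an in-memory sample configuration; the class name, cluster names, and hosts are invented for illustration and are not part of the original Flink code.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

// Standalone sketch of the parsing performed by
// MapRFileSystem.getCLDBLocations(); names below are fabricated examples.
public class CldbParseSketch {

    public static void main(String[] args) throws IOException {
        // A fabricated mapr-clusters.conf: one classic entry and one entry
        // that mixes in a key=value option (newer MapR releases).
        final String sampleConf =
                "demo.cluster node1.example.com:7222 node2.example.com:7222\n"
                + "secure.cluster secure=true node3.example.com:7222\n";

        System.out.println(parse(sampleConf, "demo.cluster"));
        // -> [node1.example.com:7222, node2.example.com:7222]
        System.out.println(parse(sampleConf, "secure.cluster"));
        // -> [node3.example.com:7222]  ("secure=true" is skipped)
    }

    static List<String> parse(final String conf, final String clusterName) throws IOException {
        final BufferedReader br = new BufferedReader(new StringReader(conf));
        String line;
        while ((line = br.readLine()) != null) {
            // Same normalization as the original: trim, then map tabs to spaces
            final String[] fields = line.trim().replace('\t', ' ').split(" ");
            if (fields.length < 1 || !fields[0].equals(clusterName)) {
                continue;
            }
            final List<String> cldbLocations = new ArrayList<String>();
            for (int i = 1; i < fields.length; i++) {
                // Skip empty tokens and key=value options such as "secure=true"
                if (!fields[i].isEmpty() && !fields[i].contains("=")) {
                    cldbLocations.add(fields[i]);
                }
            }
            return cldbLocations;
        }
        throw new IOException("No entry for cluster " + clusterName);
    }
}

Assuming the MapR client jar is on the classpath so that com.mapr.fs.MapRFileSystem can be loaded, the wrapper itself would be driven through a maprfs:// URI. A hypothetical direct use (cluster name and path are made up) might look like:

    MapRFileSystem fs = new MapRFileSystem();
    fs.initialize(new URI("maprfs://my.cluster.com/"));
    FileStatus status = fs.getFileStatus(new Path("/user/flink/data"));

Here "my.cluster.com" must match a cluster entry in mapr-clusters.conf, since initialize() uses the URI authority to look up the CLDB locations.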