org.kiji.mapreduce.kvstore.FileKeyValueArrayStore.java Source code

Introduction

Here is the source code for org.kiji.mapreduce.kvstore.FileKeyValueArrayStore.java, an abstract base class for file-backed KeyValueStore implementations that map each key to a list of values.
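
Before the listing, here is a brief sketch of how a store like this is typically configured. FileKeyValueArrayStore is abstract, so the sketch assumes a hypothetical concrete subclass, MyArrayStore, and a pre-built FileKeyValueStore.Options instance named options; the setter calls are the ones defined in the listing below.

// Illustrative sketch only: MyArrayStore is a hypothetical concrete subclass.
FileKeyValueArrayStore<String, String> store = new MyArrayStore(options);
store.setInputPath(new Path("hdfs:///data/lookup"));        // replaces earlier paths
store.addInputPath(new Path("hdfs:///data/extra/part-*"));  // globs expand at read time
store.setEnableDistributedCache(true);  // ship input files to tasks via the DistributedCache
store.setMaxValues(10);                 // keep at most ten values per key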

Source

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.kvstore;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Formatter;
import java.util.List;
import java.util.Locale;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.mapreduce.KeyValueStore;
import org.kiji.mapreduce.KeyValueStoreConfiguration;
import org.kiji.mapreduce.util.Lists;

/**
 * Abstract base class that provides common functionality to file-backed
 * KeyValueStore implementations, such as the ability to use directory
 * globbing, the DistributedCache, etc.
 *
 * @param <K> the key type expected to be implemented by the keys to this store.
 * @param <V> the value type expected to be accessed by keys to this store.
 */
public abstract class FileKeyValueArrayStore<K, V> extends KeyValueStore<K, List<V>> implements Configurable {

    private static final Logger LOG = LoggerFactory.getLogger(FileKeyValueArrayStore.class.getName());

    /**
     * KeyValueStoreConfiguration variable specifying the prefix for DCache
     * target file names.  If empty, then the DCache is disabled.
     */
    public static final String CONF_DCACHE_PREFIX = "dcache.prefix";

    /**
     * KeyValueStoreConfiguration variable specifying the set of input paths,
     * as a comma-delimited list.
     */
    public static final String CONF_PATHS = "paths";

    /**
     * KeyValueStoreConfiguration variable specifying the
     * maximum number of values to associate with each key.
     */
    public static final String CONF_MAX_VALUES = "max.values";

    /** The Hadoop configuration. */
    private Configuration mConf;

    /** True if we should distribute the input files via the DistributedCache. */
    private boolean mUseDCache;

    /** Max number of values to read. */
    private long mMaxValues;

    /** Files stored in the distributed cache have this as their prefix. */
    private String mDCachePrefix;

    /** List of input paths to files to include in the store. */
    private List<Path> mInputPaths;

    /**
     * Constructor that accepts an Options argument to configure it.
     *
     * @param options the specifications for this FileKeyValueArrayStore.
     */
    public FileKeyValueArrayStore(FileKeyValueStore.Options<? extends FileKeyValueStore.Options<?>> options) {
        this(options, Long.MAX_VALUE);
    }

    /**
     * Constructor that accepts an Options argument and a maximum value count.
     *
     * @param options the specifications for this FileKeyValueArrayStore.
     * @param maxValues the maximum number of values to associate with each key.
     */
    public FileKeyValueArrayStore(FileKeyValueStore.Options<? extends FileKeyValueStore.Options<?>> options,
            Long maxValues) {
        setConf(options.getConfiguration());
        setEnableDistributedCache(options.getUseDistributedCache());
        setInputPaths(options.getInputPaths());
        setMaxValues(maxValues);
        mDCachePrefix = "";
    }

    /** {@inheritDoc} */
    @Override
    public void setConf(Configuration conf) {
        mConf = conf;
    }

    /** {@inheritDoc} */
    @Override
    public Configuration getConf() {
        return mConf;
    }

    /**
     * Gets the maximum number of values to associate with each key.
     *
     * @return The maximum number of values to associate with each key.
     */
    public long getMaxValues() {
        return mMaxValues;
    }

    /**
     * Sets the maximum number of values to associate with each key.
     *
     * @param maxValues The maximum number of values to associate with each key.
     */
    public void setMaxValues(long maxValues) {
        mMaxValues = maxValues;
    }

    /**
     * Adds an input path to the list of files which should be loaded into this
     * KeyValueStore.
     *
     * @param path the path to the file or directory to load.
     */
    public void addInputPath(Path path) {
        mInputPaths.add(path);
    }

    /**
     * Sets the input path which specifies the file or directory that should be
     * loaded as the input for this KeyValueStore. This method overwrites the
     * values set by any prior calls to setInputPath() or addInputPath().
     *
     * @param path the path to the file or directory to load.
     */
    public void setInputPath(Path path) {
        mInputPaths.clear();
        addInputPath(path);
    }

    /**
     * Sets the input paths which specify the files or directories that should
     * be loaded as the input for this KeyValueStore. This method overwrites the
     * values set by any prior calls to setInputPath() or addInputPath().
     *
     * @param paths the list of paths to files or directories to load.
     */
    public void setInputPaths(List<Path> paths) {
        mInputPaths = new ArrayList<Path>(paths);
    }

    /**
     * Returns the set of input path(s) to be loaded by this KeyValueStore.
     * If this is called within a MapReduce task, these may refer to local paths
     * on disk. The list may include directories, wildcards, and other user-entered data.
     *
     * @return an unmodifiable list of input paths, backed by the underlying collection
     *     within this KeyValueStore.
     */
    public List<Path> getInputPaths() {
        return Collections.unmodifiableList(mInputPaths);
    }

    /**
     * An aggregator for use with Lists.foldLeft() that expands a list of paths that
     * may include directories to include only files; directories are expanded to multiple
     * file entries that are the files in this directory.
     */
    private class DirExpandAggregator extends Lists.Aggregator<Path, List<Path>> {
        // NOTE: This class stands ready to be factored out to assist other places
        // in Kiji where we need to expand a list of files, dirs and globs into just a list
        // of files.

        /** Last exception encountered during file stat lookups for the input paths. */
        private IOException mLastExn;

        /**
         * For each input path, modify the 'outputs' list to include the path
         * itself (if it is a file), or all the files in the directory (if it
         * is a directory). Also expands globs with FileSystem.globStatus().
         *
         * @param inputPath the input path to expand.
         * @param outputs list of output paths being accumulated.
         * @return the 'outputs' list.
         */
        @Override
        public List<Path> eval(Path inputPath, List<Path> outputs) {
            try {
                FileSystem fs = inputPath.getFileSystem(getConf());
                FileStatus[] matches = fs.globStatus(inputPath);
                if (null == matches) {
                    mLastExn = new IOException("No such input path: " + inputPath);
                } else if (matches.length == 0) {
                    mLastExn = new IOException("Input pattern \"" + inputPath + "\" matches 0 files.");
                } else {
                    for (FileStatus match : matches) {
                        if (match.isDir()) {
                            // Match all the files in this dir, except the "bonus" files generated by
                            // MapReduce.
                            for (FileStatus subFile : fs.listStatus(match.getPath(),
                                    new org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter())) {
                                outputs.add(subFile.getPath());
                                LOG.debug("Added file: " + subFile.getPath());
                            }
                        } else {
                            // Just a file; add directly.
                            outputs.add(match.getPath());
                            LOG.debug("Added file: " + match.getPath());
                        }
                    }
                }
            } catch (IOException ioe) {
                mLastExn = ioe;
            }
            return outputs;
        }

        /**
         * Returns the last exception encountered during FS operation while expanding
         * directories.
         *
         * @return the last exception encountered, or null if none was encountered.
         */
        private IOException getLastException() {
            return mLastExn;
        }
    }

    /**
     * Returns the set of input path(s) that should be actually opened for read.
     * This set of paths may be on local disk (e.g., if the DistributedCache was used
     * to transmit the files), or in HDFS. This will not contain directory names nor
     * globs; it is expanded to the literal set of files to open.
     *
     * @return an unmodifiable list of input paths, backed by the underlying collection
     *     within this KeyValueStore.
     * @throws IOException if there is an error communicating with the underlying
     *     FileSystem while expanding paths and globs.
     */
    public List<Path> getExpandedInputPaths() throws IOException {
        // If we've read a bunch of files from the DistributedCache's local dir,
        // no further unglobbing is necessary. Just return the values.
        if (!mDCachePrefix.isEmpty()) {
            return Collections.unmodifiableList(mInputPaths);
        }

        // Otherwise, these are "raw" user inputs. Unglob and expand them.
        DirExpandAggregator expander = new DirExpandAggregator();

        // Fold each input path through the aggregator, expanding directories and
        // globs into concrete files, then drop any duplicate entries.
        List<Path> actualInputPaths = Lists.distinct(Lists.foldLeft(new ArrayList<Path>(), mInputPaths, expander));

        IOException savedException = expander.getLastException();
        if (null != savedException) {
            // Rethrow the saved exception from this context.
            throw savedException;
        }

        return Collections.unmodifiableList(actualInputPaths);
    }
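
    // Example (illustrative): if the configured input paths are [ hdfs:///data/in ]
    // and that directory holds part-00000 and part-00001, the expanded list is
    // [ hdfs:///data/in/part-00000, hdfs:///data/in/part-00001 ]. MapReduce "bonus"
    // outputs such as _SUCCESS and _logs are filtered out by the aggregator above.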

    /**
     * Sets a flag that enables or disables the use of the DistributedCache to manage
     * the distribution of the input files to the tasks that back this KeyValueStore.
     * This has no effect for KeyValueStores that are not used in MapReduce jobs.
     * <b>The DistributedCache is enabled by default.</b> Disable it if your files
     * are particularly large (the DistributedCache limit is 10 GB per job, by default),
     * but if you do, take care that too many mappers do not overwhelm the same HDFS nodes.
     *
     * <p>(Note that most file-backed KeyValueStore implementations read the entire
     * set of input files into memory; 10 GB or more is unlikely to fit in the heap.)</p>
     *
     * @param enable a boolean that indicates whether the use of the distributed cache
     *     should be enabled.
     */
    public void setEnableDistributedCache(boolean enable) {
        mUseDCache = enable;
    }

    /**
     * If the cache URI prefix is already set, return this value. Otherwise create
     * a new unique cache URI prefix.
     *
     * @return the DistributedCache URI prefix for files used by this store.
     */
    private String getCachePrefix() {
        if (mDCachePrefix.isEmpty()) {
            // We need to put the files for this KVStore into the distributed cache. They
            // should be given symlink names that do not conflict with the names associated
            // with other KeyValueStores. Pick a symlink prefix that is unique to this store.
            long prefixId = System.currentTimeMillis() ^ (((long) this.hashCode()) << 8);
            StringBuilder sb = new StringBuilder();
            Formatter formatter = new Formatter(sb, Locale.US);
            formatter.format("%08x", prefixId);
            String newPrefix = sb.toString();
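            // Example (illustrative): a prefixId of 0x12ab34cdL formats as
            // "12ab34cd"; cached files are later symlinked as "12ab34cd-0",
            // "12ab34cd-1", ... (see storeToConf() below).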
            LOG.debug("This KeyValueStore uses Distributed cache files in namespace: " + newPrefix);
            return newPrefix;
        } else {
            return mDCachePrefix; // Prefix is already set.
        }
    }

    /** {@inheritDoc} */
    @Override
    public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
        if (mInputPaths.isEmpty()) {
            throw new IOException("Required attribute not set: input path");
        }

        conf.setLong(CONF_MAX_VALUES, mMaxValues);

        if (mUseDCache && !"local".equals(conf.getDelegate().get("mapred.job.tracker", ""))) {
            // If we're scheduled to use the distributed cache, and we're not in the LocalJobRunner,
            // add these files to the DistributedCache.

            // TODO(WIBI-1653): This does not handle any sort of MapperTester, etc.
            // We need a separate flag that tells this to ignore mUseDCache if we're in a test
            // environment, and just use the original input file specs.
            final String dCachePrefix = getCachePrefix();

            // Associate this randomly chosen prefix id with this KVStore implementation.
            conf.set(CONF_DCACHE_PREFIX, dCachePrefix);

            // Add the input paths to the DistributedCache and translate path names.
            int uniqueId = 0;
            for (Path inputPath : getExpandedInputPaths()) {
                FileSystem fs = inputPath.getFileSystem(conf.getDelegate());
                Path absolutePath = inputPath.makeQualified(fs);

                String uriStr = absolutePath.toString() + "#" + dCachePrefix + "-" + uniqueId;
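                // The "#" fragment names the symlink that the DistributedCache will
                // create in the task's working directory, e.g. (illustrative values)
                // hdfs://namenode/user/foo/lookup.txt#12ab34cd-0.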
                LOG.debug("Adding to DistributedCache: " + uriStr);
                uniqueId++;
                try {
                    DistributedCache.addCacheFile(new URI(uriStr), conf.getDelegate());
                } catch (URISyntaxException use) {
                    throw new IOException("Could not construct URI for file: " + uriStr, use);
                }
            }

            // Ensure that symlinks are created for cached files.
            DistributedCache.createSymlink(conf.getDelegate());

            // Now save the cache prefix into the local state.  We couldn't set this earlier,
            // because we wanted getExpandedInputPaths() to actually unglob things. That
            // function will behave differently if mDCachePrefix is already initialized.
            mDCachePrefix = dCachePrefix;
        } else {
            // Just put the regular HDFS paths in the Configuration.
            conf.setStrings(CONF_PATHS,
                    Lists.toArray(Lists.map(mInputPaths, new Lists.ToStringFn<Path>()), String.class));
        }
    }

    /** {@inheritDoc} */
    @Override
    public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
        setConf(conf.getDelegate());
        mDCachePrefix = conf.get(CONF_DCACHE_PREFIX, "");
        LOG.debug("Input dCachePrefix: " + mDCachePrefix);
        mMaxValues = conf.getLong(CONF_MAX_VALUES, Long.MAX_VALUE);

        if (mDCachePrefix.isEmpty()) {
            // Read an ordinary list of files from the Configuration.
            // These may include directories and globs to expand.
            mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS, new String[0])),
                    new Lists.Func<String, Path>() {
                        @Override
                        public Path eval(String in) {
                            LOG.debug("File input: " + in);
                            return new Path(in);
                        }
                    });
        } else {
            // Use the dcache prefix to get the names of the files for this store.
            // The symlinks are already present in the working dir of the task.
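            // For example (illustrative), a stored prefix of "12ab34cd" produces the
            // glob "12ab34cd-*", matching symlinks "12ab34cd-0", "12ab34cd-1", ...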
            final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
            FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
            if (null == statuses || statuses.length == 0) {
                throw new IOException("No files associated with the job in the DistributedCache");
            }

            // Get the (absolute) input file paths to use.
            mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
                @Override
                public Path eval(FileStatus status) {
                    Path out = status.getPath().makeQualified(localFs);
                    LOG.debug("Loaded from DistributedCache: " + out);
                    return out;
                }
            });
        }
    }
}
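
The storeToConf()/initFromConf() pair above implements a two-phase DistributedCache handshake. The following minimal sketch shows the two halves side by side; it assumes a hypothetical concrete subclass MyArrayStore and a KeyValueStoreConfiguration named kvConf supplied by the framework, while storeToConf(), initFromConf(), and getExpandedInputPaths() are the methods from the listing above.

// At job-submission time (client side), illustrative:
FileKeyValueArrayStore<String, String> store = new MyArrayStore(options);
store.setInputPath(new Path("hdfs:///data/lookup"));
store.storeToConf(kvConf);  // expands globs, registers each file with the
                            // DistributedCache, and records the symlink prefix
                            // under CONF_DCACHE_PREFIX

// Inside a map or reduce task, illustrative:
FileKeyValueArrayStore<String, String> taskStore = new MyArrayStore(options);
taskStore.initFromConf(kvConf);  // recovers the local symlinks via the stored prefix
List<Path> localFiles = taskStore.getExpandedInputPaths();  // local, fully expanded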