Source code

Java tutorial


Here is the source code for org.pentaho.hadoop.shim.HadoopConfigurationLocator.java


/*
 * Pentaho Big Data
 * <p>
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 * <p>
 * ******************************************************************************
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */

package org.pentaho.hadoop.shim;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSelectInfo;
import org.apache.commons.vfs2.FileSelector;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileType;
import org.apache.commons.vfs2.impl.DefaultFileSystemManager;
import org.apache.log4j.Logger;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.StringUtil;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.hadoop.shim.api.ActiveHadoopConfigurationLocator;
import org.pentaho.hadoop.shim.api.Required;
import org.pentaho.hadoop.shim.api.ShimProperties;
import org.pentaho.hadoop.shim.spi.FormatShim;
import org.pentaho.hadoop.shim.spi.HadoopConfigurationProvider;
import org.pentaho.hadoop.shim.spi.HadoopShim;
import org.pentaho.hadoop.shim.spi.PentahoHadoopShim;
import org.pentaho.hadoop.shim.spi.PigShim;
import org.pentaho.hadoop.shim.spi.SnappyShim;
import org.pentaho.hadoop.shim.spi.SqoopShim;
import org.pentaho.hbase.shim.spi.HBaseShim;
import org.pentaho.oozie.shim.api.OozieClientFactory;

/**
 * A file-based Hadoop configuration provider that knows how to load Hadoop configurations from a VFS file system. This
 * class is not thread-safe.
 */
public class HadoopConfigurationLocator implements HadoopConfigurationProvider {
    private static final String JAR_EXTENSION = ".jar";

    private static final String CONFIG_PROPERTIES_FILE = "";

    private static final String CONFIG_PROPERTY_IGNORE_CLASSES = "ignore.classes";

    private static final String CONFIG_PROPERTY_EXCLUDE_JARS = "exclude.jars";

    private static final String SHIM_CLASSPATH_IGNORE = "classpath.ignore";

    private static final String CONFIG_PROPERTY_CLASSPATH = "classpath";

    private static final String CONFIG_PROPERTY_LIBRARY_PATH = "library.path";

    private static final String CONFIG_PROPERTY_NAME = "name";

    private static final String PMR_PROPERTIES = "";

    private static final URL[] EMPTY_URL_ARRAY = new URL[0];

    private static final Class<?> PKG = HadoopConfigurationLocator.class;

    private Logger logger = Logger.getLogger(getClass());

     * This is a set of shim classes to load from each Hadoop configuration. TODO Externalize this list so we may
     * configure it per installation
    private static final Class<? extends PentahoHadoopShim>[] SHIM_TYPES = new Class[] { HadoopShim.class,
            HBaseShim.class, PigShim.class, FormatShim.class, SnappyShim.class, SqoopShim.class,
            OozieClientFactory.class };

    private static final PentahoHadoopShim[] EMPTY_SHIM_ARRAY = new PentahoHadoopShim[0];

     * Currently known shim configurations
    private Map<String, HadoopConfiguration> configurations;

     * Flag indicating we've been initialized. We require initialization to know where to look for Hadoop configurations
     * on disk.
    private boolean initialized;

     * Used to determine the active Hadoop configuration at runtime
    private ActiveHadoopConfigurationLocator activeLocator;

     * The file system manager used to provide shims a way to register their {@link FileProvider} implementations.
    private HadoopConfigurationFileSystemManager fsm;

    private DefaultFileSystemManager defaultFsm;

     * Initialize this factory with a directory of where to look for cluster configurations.
     * @param baseDir       Directory to look for Hadoop configurations in
     * @param activeLocator A locator for resolving the current active Hadoop configuration
     * @param fsm           A file system manager to inject VFS file providers into from any loaded Hadoop configuration
    public void init(FileObject baseDir, ActiveHadoopConfigurationLocator activeLocator,
            DefaultFileSystemManager fsm) throws ConfigurationException {
        if (baseDir == null) {
            throw new NullPointerException(FileObject.class.getSimpleName() + " is required");
        if (activeLocator == null) {
            throw new NullPointerException(ActiveHadoopConfigurationLocator.class.getSimpleName() + " is required");
        if (fsm == null) {
            throw new NullPointerException(DefaultFileSystemManager.class.getSimpleName() + " is required");
        this.defaultFsm = fsm;
        this.fsm = new HadoopConfigurationFileSystemManager(this, fsm);
        findHadoopConfigurations(baseDir, activeLocator);
        this.activeLocator = activeLocator;
        initialized = true;

     * Attempt to find any Hadoop configuration as a direct descendant of the provided directory.
     * @param baseDir Directory to look for Hadoop configurations in
     * @throws ConfigurationException
    private void findHadoopConfigurations(FileObject baseDir, ActiveHadoopConfigurationLocator activeLocator)
            throws ConfigurationException {
        configurations = new HashMap<String, HadoopConfiguration>();
        try {
            if (!baseDir.exists()) {
                throw new ConfigurationException(BaseMessages.getString(PKG,
                        "Error.HadoopConfigurationDirectoryDoesNotExist", baseDir.getURL()));
            for (FileObject f : baseDir.findFiles(new FileSelector() {
                public boolean includeFile(FileSelectInfo info) throws Exception {
                    return info.getDepth() == 1 && FileType.FOLDER.equals(info.getFile().getType());

                public boolean traverseDescendents(FileSelectInfo info) throws Exception {
                    return info.getDepth() == 0;
            })) {
                // Only load the specified configuration (ID should match the basename, we allow case-insensitivity)
                if (f.getName().getBaseName().equalsIgnoreCase(activeLocator.getActiveConfigurationId())) {
                    HadoopConfiguration config = loadHadoopConfiguration(f);
                    if (config != null) {
                        configurations.put(config.getIdentifier(), config);
        } catch (FileSystemException ex) {
            throw new ConfigurationException(BaseMessages.getString(PKG, "Error.UnableToLoadConfigurations",
                    baseDir.getName().getFriendlyURI()), ex);

     * Exclude jars contained in exclude.jars property in file from the list of URLs
     * @param urls                 the list of all the URLs to add to the class loader
     * @param excludedJarsProperty exclude.jars property from a file
     * @return The rest of the jars in {@code urls} after excluding the jars listed in {@code excludedJarsProperty}.

    protected List<URL> filterJars(List<URL> urls, String excludedJarsProperty) {

        Pattern pattern;
        Matcher matcher;
        String[] excludedJars;

        if (!(excludedJarsProperty == null || excludedJarsProperty.trim().isEmpty())) {
            excludedJars = excludedJarsProperty.split(",");
            if (excludedJars != null) {
                for (String excludedJar : excludedJars) {
                    pattern = Pattern.compile(".*/" + excludedJar.toLowerCase() + "-.*\\.jar$");
                    matcher = pattern.matcher("");
                    Iterator<URL> iterator = urls.listIterator();
                    while (iterator.hasNext()) {
                        URL url =;
                        if (url.toString().toLowerCase().contains(excludedJar.toLowerCase())) {
                            if (excludedJar.endsWith(".jar")
                                    || url.toString().toLowerCase().contains(excludedJar.toLowerCase() + ".jar")) {
                            } else {
                                if (matcher.reset(url.toString().toLowerCase()).matches()) {
        return urls;

    private List<URL> findJarsIn(FileObject path, final int maxdepth, final Set<String> paths)
            throws FileSystemException {
        FileObject[] jars = path.findFiles(new FileSelector() {
            public boolean includeFile(FileSelectInfo info) throws Exception {
                for (String path : paths) {
                    if (info.getFile().getURL().toString().endsWith(path)) {
                        return false;
                return info.getFile().getName().getBaseName().endsWith(JAR_EXTENSION);

            public boolean traverseDescendents(FileSelectInfo info) throws Exception {
                for (String path : paths) {
                    if (info.getFile().getURL().toString().endsWith(path)) {
                        return false;
                return info.getDepth() <= maxdepth;

        List<URL> jarUrls = new ArrayList<URL>();
        for (FileObject jar : jars) {
        return jarUrls;

     * Find all jar files in the path provided.
     * @param path     Path to search for jar files within
     * @param maxdepth Maximum traversal depth (1-based)
     * @return All jars found within {@code path} in at most {@code maxdepth} subdirectories.
     * @throws FileSystemException

    private void checkInitialized() {
        if (!initialized) {
            throw new RuntimeException(BaseMessages.getString(PKG, "Error.LocatorNotInitialized"));

     * Locates an implementation of {@code service} using the {@link ServiceLoader}.
     * @param cl Class loader to look for implementations in
     * @return The first implementation found.
    protected <T> T locateServiceImpl(ClassLoader cl, Class<T> service) {
        ServiceLoader<T> loader = ServiceLoader.load(service, cl);
        Iterator<T> iter = loader.iterator();
        if (iter.hasNext()) {
        return null;

     * Create a ClassLoader to load resources for a {@code HadoopConfiguration}.
     * @param root           Configuration root directory
     * @param parent         Parent class loader to delegate to if resources cannot be found in the configuration's
     *                       directory or provided classpath
     * @param classpathUrls  Additional URLs to add to the class loader. These will be added before any internal
     *                       resources.
     * @param ignoredClasses Classes (or packages) that should not be loaded by the class loader
     * @return A class loader capable of loading a Hadoop configuration located at {@code root}.
     * @throws ConfigurationException Error creating a class loader for the Hadoop configuration located at {@code root}
    protected ClassLoader createConfigurationLoader(FileObject root, ClassLoader parent, List<URL> classpathUrls,
            ShimProperties configurationProperties, String... ignoredClasses) throws ConfigurationException {
        try {
            if (root == null || !FileType.FOLDER.equals(root.getType())) {
                throw new IllegalArgumentException("root must be a folder: " + root);

            // Find all jar files in the configuration, at most 2 folders deep
            List<URL> jars = findJarsIn(root, 3, configurationProperties.getConfigSet(SHIM_CLASSPATH_IGNORE));

            // Add the root of the configuration
            jars.add(0, new URL(root.getURL().toExternalForm() + "/"));
            // Inject any overriding URLs before all other paths
            if (classpathUrls != null) {
                jars.addAll(0, classpathUrls);
            //Exclude jars contained in exclude.jars property in file from the list of jars
            jars = filterJars(jars, configurationProperties.getProperty(CONFIG_PROPERTY_EXCLUDE_JARS));

            return new HadoopConfigurationClassLoader(jars.toArray(EMPTY_URL_ARRAY), parent, ignoredClasses);
        } catch (Exception ex) {
            throw new ConfigurationException(BaseMessages.getString(PKG, "Error.CreatingClassLoader"), ex);

    private Properties getPmrProperties() {
        InputStream pmrProperties = getClass().getClassLoader().getResourceAsStream(PMR_PROPERTIES);
        Properties properties = new Properties();
        if (pmrProperties != null) {
            try {
            } catch (IOException ioe) {
                // not available
            } finally {
                if (pmrProperties != null) {
                    try {
                    } catch (IOException e) {
                        // not available
        return properties;

    boolean isRunningOnCluster() {
        Properties pmrProperties = getPmrProperties();
        String isPmr = pmrProperties.getProperty("isPmr", "false");
        return ("true".equals(isPmr));

     * Parse a set of URLs from a comma-separated list of URLs. If the URL points to a directory all jar files within that
     * directory will be returned as well.
     * @param urlString Comma-separated list of URLs (relative or absolute)
     * @return List of URLs resolved from {@code urlString}
    protected List<URL> parseURLs(FileObject root, String urlString) {
        if (urlString == null || urlString.trim().isEmpty()) {
            return Collections.emptyList();
        String[] paths = urlString.split(",");
        List<URL> urls = new ArrayList<URL>();
        for (String path : paths) {
            try {
                FileObject file = root.resolveFile(path.trim());
                if (!file.exists()) {
                    file = defaultFsm.resolveFile(path.trim());
                if (FileType.FOLDER.equals(file.getType())) {
                    // Add directories with a trailing / so the URL ClassLoader interprets
                    // them as directories
                    urls.add(new URL(file.getURL().toExternalForm() + "/"));
                    // Also add all jars within this directory
                    urls.addAll(findJarsIn(file, 1, new HashSet<String>()));
                } else {
            } catch (Exception e) {
                // Log invalid path
                logger.error(BaseMessages.getString(PKG, "Error.InvalidClasspathEntry", path));
        return urls;

     * Attempt to discover a valid Hadoop configuration from the provided folder.
     * @param folder Folder that may represent a Hadoop configuration
     * @return A Hadoop configuration for the folder provided or null if none is found.
     * @throws ConfigurationException Error when loading the Hadoop configuration.
    protected HadoopConfiguration loadHadoopConfiguration(FileObject folder) throws ConfigurationException {
        ShimProperties configurationProperties = new ShimProperties();
        try {
            FileObject configFile = folder.getChild(CONFIG_PROPERTIES_FILE);
            if (configFile != null) {
        } catch (Exception ex) {
            throw new ConfigurationException(BaseMessages.getString(PKG,
                    "Error.UnableToLoadConfigurationProperties", CONFIG_PROPERTIES_FILE));

        for (Entry<String, String> entry : configurationProperties.getPrefixedProperties("java.system")
                .entrySet()) {
            System.setProperty(entry.getKey(), entry.getValue());

        try {
            List<URL> classpathElements = null;
            if (!isRunningOnCluster()) {
                // Parse all URLs from an optional classpath from the configuration file
                classpathElements = parseURLs(folder,

            // Allow external configuration of classes to ignore
            String ignoredClassesProperty = configurationProperties.getProperty(CONFIG_PROPERTY_IGNORE_CLASSES);
            String[] ignoredClasses = null;
            if (!StringUtil.isEmpty(ignoredClassesProperty)) {
                ignoredClasses = ignoredClassesProperty.split(",");

            // Pass our class loader in to the configurations' CL as its parent so it
            // can find the same
            // API classes we're using
            ClassLoader cl = createConfigurationLoader(folder, getClass().getClassLoader(), classpathElements,
                    configurationProperties, ignoredClasses);
            verifyClasses(cl, configurationProperties.getProperty("required.classes"),

            // Treat the Hadoop shim special. It is absolutely required for a Hadoop configuration.
            HadoopShim hadoopShim = null;
            List<PentahoHadoopShim> shims = new ArrayList<PentahoHadoopShim>();
            // Attempt to locate a shim within this folder
            for (Class<? extends PentahoHadoopShim> shimType : SHIM_TYPES) {
                PentahoHadoopShim s = locateServiceImpl(cl, shimType);
                if (s == null && shimType.getAnnotation(Required.class) != null) {
                    logger.warn(BaseMessages.getString(PKG, "Error.MissingRequiredShim", shimType.getSimpleName()));
                    // Do not continue to load the configuration if we are missing a required shim
                    return null;
                if (HadoopShim.class.isAssignableFrom(shimType)) {
                    hadoopShim = (HadoopShim) s;
                } else {
            String id = folder.getName().getBaseName();
            String name = configurationProperties.getProperty(CONFIG_PROPERTY_NAME, id);

            HadoopConfiguration config = new HadoopConfiguration(configurationProperties, folder, id, name,
                    hadoopShim, shims.toArray(EMPTY_SHIM_ARRAY));

            // Register native libraries after everything else has been loaded successfully

            hadoopShim.onLoad(config, fsm);
            return config;
        } catch (Throwable t) {
            throw new ConfigurationException(
                    BaseMessages.getString(PKG, "Error.LoadingConfiguration") + " " + t.toString(), t);

    protected void verifyClasses(ClassLoader classLoader, String requiredClasses, String shimName)
            throws ConfigurationException {
        if (!Const.isEmpty(requiredClasses)) {
            for (String className : requiredClasses.split(",")) {
                try {
                } catch (Throwable e) {
                    throw new ConfigurationException(
                            BaseMessages.getString(PKG, "Error.MissingRequiredClasses", className, shimName));

     * Register a comma-separated list of native library paths.
     * @param paths Comma-separated list of libraries
    protected void registerNativeLibraryPaths(String paths) {
        if (paths == null) {
        for (String path : paths.split(",")) {
            boolean successful = registerNativeLibraryPath(path);
            if (!successful) {
                logger.error(BaseMessages.getString(PKG, "Error.RegisteringLibraryPath", path));

     * Dynamically register a native library path. This relies on a specific implementation detail of ClassLoader: it's
     * usr_paths property.
     * @param path Library path to add
     * @return {@code true} if the library path could be added successfully
    protected boolean registerNativeLibraryPath(String path) {
        if (path == null) {
            throw new NullPointerException();
        path = path.trim();
        try {
            Field f = ClassLoader.class.getDeclaredField("usr_paths");
            boolean accessible = f.isAccessible();
            try {
                String[] paths = (String[]) f.get(null);

                // Make sure the path isn't already registered
                for (String p : paths) {
                    if (p.equals(path)) {
                        return true; // Success, it's already there!

                String[] newPaths = new String[paths.length + 1];
                System.arraycopy(paths, 0, newPaths, 0, paths.length);
                newPaths[paths.length] = path;
                f.set(null, newPaths);
                // Success!
                return true;
            } finally {
        } catch (Exception ex) {
            // Something went wrong, definitely not successful
            return false;

     * Load the properties file located at {@code file}
     * @param file Location of a properties file to load
     * @return Loaded properties file
     * @throws IOException         Error loading properties from file
     * @throws FileSystemException Error locating input stream for file
    protected Properties loadProperties(FileObject file) throws FileSystemException, IOException {
        Properties p = new Properties();
        return p;

    public List<HadoopConfiguration> getConfigurations() {
        return new ArrayList<HadoopConfiguration>(configurations.values());

    public boolean hasConfiguration(String id) {
        return configurations.containsKey(id);

    public HadoopConfiguration getConfiguration(String id) throws ConfigurationException {
        HadoopConfiguration config = configurations.get(id);
        if (config == null) {
            throw new ConfigurationException(BaseMessages.getString(PKG, "Error.UnknownHadoopConfiguration", id));
        return config;

    public HadoopConfiguration getActiveConfiguration() throws ConfigurationException {
        return getConfiguration(activeLocator.getActiveConfigurationId());