Java tutorial
/*******************************************************************************
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 * ******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 ******************************************************************************/
package org.pentaho.hadoop.shim;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSelectInfo;
import org.apache.commons.vfs2.FileSelector;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileType;
import org.apache.commons.vfs2.impl.DefaultFileSystemManager;
import org.apache.log4j.Logger;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.StringUtil;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.hadoop.shim.api.ActiveHadoopConfigurationLocator;
import org.pentaho.hadoop.shim.api.Required;
import org.pentaho.hadoop.shim.api.ShimProperties;
import org.pentaho.hadoop.shim.spi.FormatShim;
import org.pentaho.hadoop.shim.spi.HadoopConfigurationProvider;
import org.pentaho.hadoop.shim.spi.HadoopShim;
import org.pentaho.hadoop.shim.spi.PentahoHadoopShim;
import org.pentaho.hadoop.shim.spi.PigShim;
import org.pentaho.hadoop.shim.spi.SnappyShim;
import org.pentaho.hadoop.shim.spi.SqoopShim;
import org.pentaho.hbase.shim.spi.HBaseShim;
import org.pentaho.oozie.shim.api.OozieClientFactory;

/**
 * A file-based Hadoop configuration provider that knows how to load Hadoop configurations from a VFS file system. This
 * class is not thread-safe.
 */
public class HadoopConfigurationLocator implements HadoopConfigurationProvider {
  private static final String JAR_EXTENSION = ".jar";
  private static final String CONFIG_PROPERTIES_FILE = "config.properties";
  private static final String CONFIG_PROPERTY_IGNORE_CLASSES = "ignore.classes";
  private static final String CONFIG_PROPERTY_EXCLUDE_JARS = "exclude.jars";
  private static final String SHIM_CLASSPATH_IGNORE = "classpath.ignore";
  private static final String CONFIG_PROPERTY_CLASSPATH = "classpath";
  private static final String CONFIG_PROPERTY_LIBRARY_PATH = "library.path";
  private static final String CONFIG_PROPERTY_NAME = "name";
  private static final String PMR_PROPERTIES = "pmr.properties";

  private static final URL[] EMPTY_URL_ARRAY = new URL[0];

  private static final Class<?> PKG = HadoopConfigurationLocator.class;

  private Logger logger = Logger.getLogger(getClass());

  /**
   * This is a set of shim classes to load from each Hadoop configuration. TODO Externalize this list so we may
   * configure it per installation
   */
  @SuppressWarnings("unchecked")
  private static final Class<? extends PentahoHadoopShim>[] SHIM_TYPES = new Class[] {
    HadoopShim.class,
    HBaseShim.class,
    PigShim.class,
    FormatShim.class,
    SnappyShim.class,
    SqoopShim.class,
    OozieClientFactory.class
  };

  private static final PentahoHadoopShim[] EMPTY_SHIM_ARRAY = new PentahoHadoopShim[0];

  /**
   * Currently known shim configurations
   */
  private Map<String, HadoopConfiguration> configurations;

  /**
   * Flag indicating we've been initialized. We require initialization to know where to look for Hadoop configurations
   * on disk.
   */
  private boolean initialized;

  /**
   * Used to determine the active Hadoop configuration at runtime
   */
  private ActiveHadoopConfigurationLocator activeLocator;

  /**
   * The file system manager used to provide shims a way to register their {@link FileProvider} implementations.
   */
  private HadoopConfigurationFileSystemManager fsm;

  private DefaultFileSystemManager defaultFsm;

  /**
   * Initialize this factory with a directory of where to look for cluster configurations.
   *
   * @param baseDir       Directory to look for Hadoop configurations in
   * @param activeLocator A locator for resolving the current active Hadoop configuration
   * @param fsm           A file system manager to inject VFS file providers into from any loaded Hadoop configuration
   */
  public void init(FileObject baseDir, ActiveHadoopConfigurationLocator activeLocator,
                   DefaultFileSystemManager fsm) throws ConfigurationException {
    if (baseDir == null) {
      throw new NullPointerException(FileObject.class.getSimpleName() + " is required");
    }
    if (activeLocator == null) {
      throw new NullPointerException(ActiveHadoopConfigurationLocator.class.getSimpleName() + " is required");
    }
    if (fsm == null) {
      throw new NullPointerException(DefaultFileSystemManager.class.getSimpleName() + " is required");
    }
    this.defaultFsm = fsm;
    this.fsm = new HadoopConfigurationFileSystemManager(this, fsm);
    findHadoopConfigurations(baseDir, activeLocator);
    this.activeLocator = activeLocator;
    initialized = true;
  }

  /**
   * Attempt to find any Hadoop configuration as a direct descendant of the provided directory.
   *
   * @param baseDir Directory to look for Hadoop configurations in
   * @throws ConfigurationException
   */
  private void findHadoopConfigurations(FileObject baseDir, ActiveHadoopConfigurationLocator activeLocator)
    throws ConfigurationException {
    configurations = new HashMap<String, HadoopConfiguration>();
    try {
      if (!baseDir.exists()) {
        throw new ConfigurationException(BaseMessages.getString(PKG,
          "Error.HadoopConfigurationDirectoryDoesNotExist", baseDir.getURL()));
      }
      for (FileObject f : baseDir.findFiles(new FileSelector() {
        @Override
        public boolean includeFile(FileSelectInfo info) throws Exception {
          return info.getDepth() == 1 && FileType.FOLDER.equals(info.getFile().getType());
        }

        @Override
        public boolean traverseDescendents(FileSelectInfo info) throws Exception {
          return info.getDepth() == 0;
        }
      })) {
        // Only load the specified configuration (ID should match the basename, we allow case-insensitivity)
        if (f.getName().getBaseName().equalsIgnoreCase(activeLocator.getActiveConfigurationId())) {
          HadoopConfiguration config = loadHadoopConfiguration(f);
          if (config != null) {
            configurations.put(config.getIdentifier(), config);
          }
        }
      }
    } catch (FileSystemException ex) {
      throw new ConfigurationException(BaseMessages.getString(PKG, "Error.UnableToLoadConfigurations",
        baseDir.getName().getFriendlyURI()), ex);
    }
  }

  /**
   * Exclude jars contained in exclude.jars property in config.properties file from the list of URLs
   *
   * @param urls                 the list of all the URLs to add to the class loader
   * @param excludedJarsProperty exclude.jars property from a config.properties file
   * @return The rest of the jars in {@code urls} after excluding the jars listed in {@code excludedJarsProperty}.
   */
  protected List<URL> filterJars(List<URL> urls, String excludedJarsProperty) {
    Pattern pattern;
    Matcher matcher;
    String[] excludedJars;
    if (!(excludedJarsProperty == null || excludedJarsProperty.trim().isEmpty())) {
      excludedJars = excludedJarsProperty.split(",");
      if (excludedJars != null) {
        for (String excludedJar : excludedJars) {
          pattern = Pattern.compile(".*/" + excludedJar.toLowerCase() + "-.*\\.jar$");
          matcher = pattern.matcher("");
          Iterator<URL> iterator = urls.listIterator();
          while (iterator.hasNext()) {
            URL url = iterator.next();
            if (url.toString().toLowerCase().contains(excludedJar.toLowerCase())) {
              if (excludedJar.endsWith(".jar")
                || url.toString().toLowerCase().contains(excludedJar.toLowerCase() + ".jar")) {
                iterator.remove();
              } else {
                if (matcher.reset(url.toString().toLowerCase()).matches()) {
                  iterator.remove();
                }
              }
            }
          }
        }
      }
    }
    return urls;
  }

  /**
   * Find all jar files in the path provided.
   *
   * @param path     Path to search for jar files within
   * @param maxdepth Maximum traversal depth (1-based)
   * @return All jars found within {@code path} in at most {@code maxdepth} subdirectories.
   * @throws FileSystemException
   */
  private List<URL> findJarsIn(FileObject path, final int maxdepth, final Set<String> paths)
    throws FileSystemException {
    FileObject[] jars = path.findFiles(new FileSelector() {
      @Override
      public boolean includeFile(FileSelectInfo info) throws Exception {
        for (String path : paths) {
          if (info.getFile().getURL().toString().endsWith(path)) {
            return false;
          }
        }
        return info.getFile().getName().getBaseName().endsWith(JAR_EXTENSION);
      }

      @Override
      public boolean traverseDescendents(FileSelectInfo info) throws Exception {
        for (String path : paths) {
          if (info.getFile().getURL().toString().endsWith(path)) {
            return false;
          }
        }
        return info.getDepth() <= maxdepth;
      }
    });
    List<URL> jarUrls = new ArrayList<URL>();
    for (FileObject jar : jars) {
      jarUrls.add(jar.getURL());
    }
    return jarUrls;
  }
  private void checkInitialized() {
    if (!initialized) {
      throw new RuntimeException(BaseMessages.getString(PKG, "Error.LocatorNotInitialized"));
    }
  }

  /**
   * Locates an implementation of {@code service} using the {@link ServiceLoader}.
   *
   * @param cl Class loader to look for implementations in
   * @return The first implementation found.
   */
  protected <T> T locateServiceImpl(ClassLoader cl, Class<T> service) {
    ServiceLoader<T> loader = ServiceLoader.load(service, cl);
    Iterator<T> iter = loader.iterator();
    if (iter.hasNext()) {
      return iter.next();
    }
    return null;
  }

  /**
   * Create a ClassLoader to load resources for a {@code HadoopConfiguration}.
   *
   * @param root           Configuration root directory
   * @param parent         Parent class loader to delegate to if resources cannot be found in the configuration's
   *                       directory or provided classpath
   * @param classpathUrls  Additional URLs to add to the class loader. These will be added before any internal
   *                       resources.
   * @param ignoredClasses Classes (or packages) that should not be loaded by the class loader
   * @return A class loader capable of loading a Hadoop configuration located at {@code root}.
   * @throws ConfigurationException Error creating a class loader for the Hadoop configuration located at {@code root}
   */
  protected ClassLoader createConfigurationLoader(FileObject root, ClassLoader parent, List<URL> classpathUrls,
                                                  ShimProperties configurationProperties, String... ignoredClasses)
    throws ConfigurationException {
    try {
      if (root == null || !FileType.FOLDER.equals(root.getType())) {
        throw new IllegalArgumentException("root must be a folder: " + root);
      }

      // Find all jar files in the configuration, at most 2 folders deep
      List<URL> jars = findJarsIn(root, 3, configurationProperties.getConfigSet(SHIM_CLASSPATH_IGNORE));

      // Add the root of the configuration
      jars.add(0, new URL(root.getURL().toExternalForm() + "/"));

      // Inject any overriding URLs before all other paths
      if (classpathUrls != null) {
        jars.addAll(0, classpathUrls);
      }

      // Exclude jars contained in exclude.jars property in config.properties file from the list of jars
      jars = filterJars(jars, configurationProperties.getProperty(CONFIG_PROPERTY_EXCLUDE_JARS));

      return new HadoopConfigurationClassLoader(jars.toArray(EMPTY_URL_ARRAY), parent, ignoredClasses);
    } catch (Exception ex) {
      throw new ConfigurationException(BaseMessages.getString(PKG, "Error.CreatingClassLoader"), ex);
    }
  }

  private Properties getPmrProperties() {
    InputStream pmrProperties = getClass().getClassLoader().getResourceAsStream(PMR_PROPERTIES);
    Properties properties = new Properties();
    if (pmrProperties != null) {
      try {
        properties.load(pmrProperties);
      } catch (IOException ioe) {
        // pmr.properties not available
      } finally {
        if (pmrProperties != null) {
          try {
            pmrProperties.close();
          } catch (IOException e) {
            // pmr.properties not available
          }
        }
      }
    }
    return properties;
  }

  @VisibleForTesting
  boolean isRunningOnCluster() {
    Properties pmrProperties = getPmrProperties();
    String isPmr = pmrProperties.getProperty("isPmr", "false");
    return ("true".equals(isPmr));
  }

  /**
   * Parse a set of URLs from a comma-separated list of URLs. If the URL points to a directory all jar files within
   * that directory will be returned as well.
   *
   * @param urlString Comma-separated list of URLs (relative or absolute)
   * @return List of URLs resolved from {@code urlString}
   */
  protected List<URL> parseURLs(FileObject root, String urlString) {
    if (urlString == null || urlString.trim().isEmpty()) {
      return Collections.emptyList();
    }
    String[] paths = urlString.split(",");
    List<URL> urls = new ArrayList<URL>();
    for (String path : paths) {
      try {
        FileObject file = root.resolveFile(path.trim());
        if (!file.exists()) {
          file = defaultFsm.resolveFile(path.trim());
        }
        if (FileType.FOLDER.equals(file.getType())) {
          // Add directories with a trailing / so the URL ClassLoader interprets
          // them as directories
          urls.add(new URL(file.getURL().toExternalForm() + "/"));
          // Also add all jars within this directory
          urls.addAll(findJarsIn(file, 1, new HashSet<String>()));
        } else {
          urls.add(file.getURL());
        }
      } catch (Exception e) {
        // Log invalid path
        logger.error(BaseMessages.getString(PKG, "Error.InvalidClasspathEntry", path));
      }
    }
    return urls;
  }

  /**
   * Attempt to discover a valid Hadoop configuration from the provided folder.
   *
   * @param folder Folder that may represent a Hadoop configuration
   * @return A Hadoop configuration for the folder provided or null if none is found.
   * @throws ConfigurationException Error when loading the Hadoop configuration.
   */
  protected HadoopConfiguration loadHadoopConfiguration(FileObject folder) throws ConfigurationException {
    ShimProperties configurationProperties = new ShimProperties();
    try {
      FileObject configFile = folder.getChild(CONFIG_PROPERTIES_FILE);
      if (configFile != null) {
        configurationProperties.putAll(loadProperties(configFile));
      }
    } catch (Exception ex) {
      throw new ConfigurationException(BaseMessages.getString(PKG, "Error.UnableToLoadConfigurationProperties",
        CONFIG_PROPERTIES_FILE));
    }

    for (Entry<String, String> entry : configurationProperties.getPrefixedProperties("java.system").entrySet()) {
      System.setProperty(entry.getKey(), entry.getValue());
    }

    try {
      List<URL> classpathElements = null;
      if (!isRunningOnCluster()) {
        // Parse all URLs from an optional classpath from the configuration file
        classpathElements = parseURLs(folder, configurationProperties.getProperty(CONFIG_PROPERTY_CLASSPATH));
      }

      // Allow external configuration of classes to ignore
      String ignoredClassesProperty = configurationProperties.getProperty(CONFIG_PROPERTY_IGNORE_CLASSES);
      String[] ignoredClasses = null;
      if (!StringUtil.isEmpty(ignoredClassesProperty)) {
        ignoredClasses = ignoredClassesProperty.split(",");
      }

      // Pass our class loader in to the configurations' CL as its parent so it
      // can find the same API classes we're using
      ClassLoader cl = createConfigurationLoader(folder, getClass().getClassLoader(), classpathElements,
        configurationProperties, ignoredClasses);
      verifyClasses(cl, configurationProperties.getProperty("required.classes"),
        configurationProperties.getProperty("name"));

      // Treat the Hadoop shim special. It is absolutely required for a Hadoop configuration.
      HadoopShim hadoopShim = null;
      List<PentahoHadoopShim> shims = new ArrayList<PentahoHadoopShim>();

      // Attempt to locate a shim within this folder
      for (Class<? extends PentahoHadoopShim> shimType : SHIM_TYPES) {
        PentahoHadoopShim s = locateServiceImpl(cl, shimType);
        if (s == null && shimType.getAnnotation(Required.class) != null) {
          logger.warn(BaseMessages.getString(PKG, "Error.MissingRequiredShim", shimType.getSimpleName()));
          // Do not continue to load the configuration if we are missing a required shim
          return null;
        }
        if (HadoopShim.class.isAssignableFrom(shimType)) {
          hadoopShim = (HadoopShim) s;
        } else {
          shims.add(s);
        }
      }

      String id = folder.getName().getBaseName();
      String name = configurationProperties.getProperty(CONFIG_PROPERTY_NAME, id);
      HadoopConfiguration config = new HadoopConfiguration(configurationProperties, folder, id, name, hadoopShim,
        shims.toArray(EMPTY_SHIM_ARRAY));

      // Register native libraries after everything else has been loaded successfully
      registerNativeLibraryPaths(configurationProperties.getProperty(CONFIG_PROPERTY_LIBRARY_PATH));

      hadoopShim.onLoad(config, fsm);
      return config;
    } catch (Throwable t) {
      throw new ConfigurationException(
        BaseMessages.getString(PKG, "Error.LoadingConfiguration") + " " + t.toString(), t);
    }
  }

  protected void verifyClasses(ClassLoader classLoader, String requiredClasses, String shimName)
    throws ConfigurationException {
    if (!Const.isEmpty(requiredClasses)) {
      for (String className : requiredClasses.split(",")) {
        try {
          classLoader.loadClass(className);
        } catch (Throwable e) {
          throw new ConfigurationException(
            BaseMessages.getString(PKG, "Error.MissingRequiredClasses", className, shimName));
        }
      }
    }
  }

  /**
   * Register a comma-separated list of native library paths.
   *
   * @param paths Comma-separated list of libraries
   */
  protected void registerNativeLibraryPaths(String paths) {
    if (paths == null) {
      return;
    }
    for (String path : paths.split(",")) {
      boolean successful = registerNativeLibraryPath(path);
      if (!successful) {
        logger.error(BaseMessages.getString(PKG, "Error.RegisteringLibraryPath", path));
      }
    }
  }

  /**
   * Dynamically register a native library path. This relies on a specific implementation detail of ClassLoader: its
   * usr_paths property.
   *
   * @param path Library path to add
   * @return {@code true} if the library path could be added successfully
   */
  protected boolean registerNativeLibraryPath(String path) {
    if (path == null) {
      throw new NullPointerException();
    }
    path = path.trim();
    try {
      Field f = ClassLoader.class.getDeclaredField("usr_paths");
      boolean accessible = f.isAccessible();
      f.setAccessible(true);
      try {
        String[] paths = (String[]) f.get(null);

        // Make sure the path isn't already registered
        for (String p : paths) {
          if (p.equals(path)) {
            return true; // Success, it's already there!
          }
        }
        String[] newPaths = new String[paths.length + 1];
        System.arraycopy(paths, 0, newPaths, 0, paths.length);
        newPaths[paths.length] = path;
        f.set(null, newPaths);
        // Success!
        return true;
      } finally {
        f.setAccessible(accessible);
      }
    } catch (Exception ex) {
      // Something went wrong, definitely not successful
      return false;
    }
  }

  /**
   * Load the properties file located at {@code file}
   *
   * @param file Location of a properties file to load
   * @return Loaded properties file
   * @throws IOException         Error loading properties from file
   * @throws FileSystemException Error locating input stream for file
   */
  protected Properties loadProperties(FileObject file) throws FileSystemException, IOException {
    Properties p = new Properties();
    p.load(file.getContent().getInputStream());
    return p;
  }

  @Override
  public List<HadoopConfiguration> getConfigurations() {
    checkInitialized();
    return new ArrayList<HadoopConfiguration>(configurations.values());
  }

  @Override
  public boolean hasConfiguration(String id) {
    checkInitialized();
    return configurations.containsKey(id);
  }

  @Override
  public HadoopConfiguration getConfiguration(String id) throws ConfigurationException {
    checkInitialized();
    HadoopConfiguration config = configurations.get(id);
    if (config == null) {
      throw new ConfigurationException(BaseMessages.getString(PKG, "Error.UnknownHadoopConfiguration", id));
    }
    return config;
  }

  @Override
  public HadoopConfiguration getActiveConfiguration() throws ConfigurationException {
    return getConfiguration(activeLocator.getActiveConfigurationId());
  }
}
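For context, here is a minimal, hypothetical sketch of how a caller might wire this locator up using only the public API shown above (init, getActiveConfiguration, getIdentifier). The shim directory path, the "hdp26" configuration ID, and the example class name are illustrative assumptions, not values taken from this file; it also assumes ActiveHadoopConfigurationLocator is a single-method interface, as it is used here. In the real Big Data plugin the active configuration ID would typically come from the plugin's own configuration rather than a hard-coded constant.

import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.impl.StandardFileSystemManager;
import org.pentaho.hadoop.shim.HadoopConfiguration;
import org.pentaho.hadoop.shim.HadoopConfigurationLocator;
import org.pentaho.hadoop.shim.api.ActiveHadoopConfigurationLocator;

public class HadoopConfigurationLocatorExample {
  public static void main(String[] args) throws Exception {
    // StandardFileSystemManager extends DefaultFileSystemManager and registers the default VFS providers.
    StandardFileSystemManager fsm = new StandardFileSystemManager();
    fsm.init();

    // Hypothetical directory containing one subfolder per Hadoop configuration.
    FileObject baseDir = fsm.resolveFile("file:///opt/pentaho/hadoop-configurations");

    // Hypothetical active configuration ID; only the subfolder whose basename matches it
    // (case-insensitively) will be loaded by findHadoopConfigurations().
    ActiveHadoopConfigurationLocator activeLocator = new ActiveHadoopConfigurationLocator() {
      @Override
      public String getActiveConfigurationId() {
        return "hdp26";
      }
    };

    HadoopConfigurationLocator locator = new HadoopConfigurationLocator();
    locator.init(baseDir, activeLocator, fsm);

    // After init(), the locator knows at most one configuration: the active one.
    HadoopConfiguration active = locator.getActiveConfiguration();
    System.out.println("Loaded Hadoop configuration: " + active.getIdentifier());
  }
}

Because findHadoopConfigurations() only loads the folder matching the active configuration ID, getConfigurations() on an initialized locator will contain at most that single entry, and getConfiguration(id) for any other ID throws a ConfigurationException.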