Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.thinkbiganalytics.nifi.v2.hdfs; import com.thinkbiganalytics.nifi.security.ApplySecurityPolicy; import com.thinkbiganalytics.nifi.security.KerberosProperties; import com.thinkbiganalytics.nifi.security.SecurityUtil; import com.thinkbiganalytics.nifi.security.SpringSecurityContextLoader; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.UserGroupInformation; import org.apache.nifi.annotation.lifecycle.OnScheduled; import org.apache.nifi.annotation.lifecycle.OnStopped; import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.components.ValidationContext; import org.apache.nifi.components.ValidationResult; import org.apache.nifi.components.Validator; import org.apache.nifi.processor.ProcessContext; import org.apache.nifi.processor.ProcessorInitializationContext; import org.apache.nifi.processor.exception.ProcessException; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.net.Socket; import java.net.URI; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import javax.annotation.Nonnull; import javax.annotation.Nullable; import javax.net.SocketFactory; /** * This is a base class that is helpful when building processors interacting with HDFS. */ public abstract class AbstractHadoopProcessor extends AbstractHadoopNiFiVersionAwareProcessor { // properties public static final PropertyDescriptor HADOOP_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder() .name("Hadoop Configuration Resources") .description( "A file or comma separated list of files which contains the Hadoop file system configuration. Without this, Hadoop " + "will search the classpath for a 'core-site.xml' and 'hdfs-site.xml' file or will revert to a default configuration.") .required(false).addValidator(createMultipleFilesExistValidator()).build(); public static final String DIRECTORY_PROP_NAME = "Directory"; private static final Object RESOURCES_LOCK = new Object(); // variables shared by all threads of this processor // Hadoop Configuration, Filesystem, and UserGroupInformation (optional) private final AtomicReference<HdfsResources> hdfsResources = new AtomicReference<>(); /** * Property for Kerberos service keytab file */ protected PropertyDescriptor kerberosKeytab; /** * Property for Kerberos service principal */ protected PropertyDescriptor kerberosPrincipal; private long kerberosReloginThreshold; private long lastKerberosReloginTime; /** * List of properties */ private List<PropertyDescriptor> properties; private static Configuration getConfigurationFromResources(String configResources) throws IOException { boolean foundResources = false; final Configuration config = new Configuration(); if (null != configResources) { String[] resources = configResources.split(","); for (String resource : resources) { config.addResource(new Path(resource.trim())); foundResources = true; } } if (!foundResources) { // check that at least 1 non-default resource is available on the classpath String configStr = config.toString(); for (String resource : configStr.substring(configStr.indexOf(":") + 1).split(",")) { if (!resource.contains("default") && config.getResource(resource.trim()) != null) { foundResources = true; break; } } } if (!foundResources) { throw new IOException( "Could not find any of the " + HADOOP_CONFIGURATION_RESOURCES.getName() + " on the classpath"); } return config; } /** * Validates that one or more files exist, as specified in a single property. * * @return a validator instance that validates the files given */ public static Validator createMultipleFilesExistValidator() { return new Validator() { @Override public ValidationResult validate(String subject, String input, ValidationContext context) { final String[] files = input.split(","); for (String filename : files) { try { final File file = new File(filename.trim()); final boolean valid = file.exists() && file.isFile(); if (!valid) { final String message = "File " + file + " does not exist or is not a file"; return new ValidationResult.Builder().subject(subject).input(input).valid(false) .explanation(message).build(); } } catch (SecurityException e) { final String message = "Unable to access " + filename + " due to " + e.getMessage(); return new ValidationResult.Builder().subject(subject).input(input).valid(false) .explanation(message).build(); } } return new ValidationResult.Builder().subject(subject).input(input).valid(true).build(); } }; } @Override protected void init(@Nonnull final ProcessorInitializationContext context) { super.init(context); hdfsResources.set(new HdfsResources(null, null, null)); // Create Kerberos properties final SpringSecurityContextLoader securityContextLoader = SpringSecurityContextLoader.create(context); final KerberosProperties kerberosProperties = securityContextLoader.getKerberosProperties(); kerberosKeytab = kerberosProperties.createKerberosKeytabProperty(); kerberosPrincipal = kerberosProperties.createKerberosPrincipalProperty(); final PropertyDescriptor ADDITIONAL_CLASSPATH_RESOURCES = AdditionalProperties .getHdfsAdditionalClasspathResources(); // Create list of properties final List<PropertyDescriptor> props = new ArrayList<>(); props.add(HADOOP_CONFIGURATION_RESOURCES); if (ADDITIONAL_CLASSPATH_RESOURCES != null) { getLog().debug("ADDITIONAL_CLASSPATH_RESOURCES obtained"); props.add(ADDITIONAL_CLASSPATH_RESOURCES); } else { getLog().debug("ADDITIONAL_CLASSPATH_RESOURCES is null!"); } props.add(kerberosPrincipal); props.add(kerberosKeytab); props.add(KerberosProperties.KERBEROS_RELOGIN_PERIOD); properties = Collections.unmodifiableList(props); } @Override protected List<PropertyDescriptor> getSupportedPropertyDescriptors() { return properties; } /** * If your subclass also has an @OnScheduled annotated method and you need hdfsResources in that method, * then be sure to call super.abstractOnScheduled(context) * * @param context the context of the processor * @throws IOException if configuration cannot be set for the HDFS resource */ @OnScheduled public final void abstractOnScheduled(ProcessContext context) throws IOException { try { // This value will be null when called from ListHDFS, because it overrides all of the default // properties this processor sets. TODO: re-work ListHDFS to utilize Kerberos if (context.getProperty(KerberosProperties.KERBEROS_RELOGIN_PERIOD).getValue() != null) { kerberosReloginThreshold = context.getProperty(KerberosProperties.KERBEROS_RELOGIN_PERIOD) .asTimePeriod(TimeUnit.SECONDS); } HdfsResources resources = hdfsResources.get(); if (resources.getConfiguration() == null) { String configResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue(); String dir = context.getProperty(DIRECTORY_PROP_NAME).getValue(); dir = dir == null ? "/" : dir; resources = resetHDFSResources(configResources, dir, context); hdfsResources.set(resources); } } catch (IOException ex) { getLog().error("HDFS Configuration error - {}", new Object[] { ex }); hdfsResources.set(new HdfsResources(null, null, null)); throw ex; } } @OnStopped public final void abstractOnStopped() { HdfsResources hdfs = hdfsResources.get(); if (hdfs != null) { FileSystem fs = hdfsResources.get().getFileSystem(); if (fs != null) { try { getLog().info("Processor Stop in progress. Will release HDFS resources."); fs.close(); } catch (IOException e) { getLog().error("Received IOException when attempting to close HDFS FileSystem handle"); } } } hdfsResources.set(new HdfsResources(null, null, null)); } /** * Reset Hadoop Configuration and FileSystem based on the supplied configuration resources. * * @param configResources for configuration * @param dir the target directory * @param context for context, which gives access to the principal * @return An HdfsResources object * @throws IOException if unable to access HDFS */ HdfsResources resetHDFSResources(String configResources, String dir, ProcessContext context) throws IOException { // org.apache.hadoop.conf.Configuration saves its current thread context class loader to use for threads that it creates // later to do I/O. We need this class loader to be the NarClassLoader instead of the magical // NarThreadContextClassLoader. ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader()); try { Configuration config = getConfigurationFromResources(configResources); // first check for timeout on HDFS connection, because FileSystem has a hard coded 15 minute timeout checkHdfsUriForTimeout(config); // disable caching of Configuration and FileSystem objects, else we cannot reconfigure the processor without a complete // restart String disableCacheName = String.format("fs.%s.impl.disable.cache", FileSystem.getDefaultUri(config).getScheme()); config.set(disableCacheName, "true"); // If kerberos is enabled, create the file system as the kerberos principal // -- use RESOURCE_LOCK to guarantee UserGroupInformation is accessed by only a single thread at at time FileSystem fs = null; UserGroupInformation ugi = null; synchronized (RESOURCES_LOCK) { if (config.get("hadoop.security.authentication").equalsIgnoreCase("kerberos")) { String principal = context.getProperty(kerberosPrincipal).getValue(); String keyTab = context.getProperty(kerberosKeytab).getValue(); UserGroupInformation.setConfiguration(config); ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keyTab); modifyConfig(context, config); fs = getFileSystemAsUser(config, ugi); lastKerberosReloginTime = System.currentTimeMillis() / 1000; } else { config.set("ipc.client.fallback-to-simple-auth-allowed", "true"); config.set("hadoop.security.authentication", "simple"); modifyConfig(context, config); fs = getFileSystem(config); } } getLog().info( "Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}", new Object[] { fs.getWorkingDirectory(), fs.getDefaultBlockSize(new Path(dir)), fs.getDefaultReplication(new Path(dir)), config.toString() }); return new HdfsResources(config, fs, ugi); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } } // can be overridden by child classes to modify configuration before filesystem handle is obtained abstract void modifyConfig(ProcessContext context, Configuration config); /** * This exists in order to allow unit tests to override it so that they don't take several minutes waiting for UDP packets to be received * * @param config the configuration to use * @return the FileSystem that is created for the given Configuration * @throws IOException if unable to create the FileSystem */ protected FileSystem getFileSystem(final Configuration config) throws IOException { if (getLog().isDebugEnabled()) { String disableCacheName = String.format("fs.%s.impl.disable.cache", FileSystem.getDefaultUri(config).getScheme()); getLog().debug(String.format("'%s'='%s'", disableCacheName, config.get(disableCacheName))); } return FileSystem.get(config); } protected FileSystem getFileSystemAsUser(final Configuration config, UserGroupInformation ugi) throws IOException { try { return ugi.doAs(new PrivilegedExceptionAction<FileSystem>() { @Override public FileSystem run() throws Exception { return FileSystem.get(config); } }); } catch (InterruptedException e) { throw new IOException("Unable to create file system: " + e.getMessage()); } } /* * Drastically reduce the timeout of a socket connection from the default in FileSystem.get() */ protected void checkHdfsUriForTimeout(Configuration config) throws IOException { URI hdfsUri = FileSystem.getDefaultUri(config); String address = hdfsUri.getAuthority(); int port = hdfsUri.getPort(); if (address == null || address.isEmpty() || port < 0) { return; } InetSocketAddress namenode = NetUtils.createSocketAddr(address, port); SocketFactory socketFactory = NetUtils.getDefaultSocketFactory(config); Socket socket = null; try { socket = socketFactory.createSocket(); NetUtils.connect(socket, namenode, 1000); // 1 second timeout } finally { IOUtils.closeQuietly(socket); } } protected Configuration getConfiguration() { return hdfsResources.get().getConfiguration(); } protected FileSystem getFileSystem() { // if kerberos is enabled, check if the ticket should be renewed before returning the FS if (hdfsResources.get().getUserGroupInformation() != null && isTicketOld()) { tryKerberosRelogin(hdfsResources.get().getUserGroupInformation()); } return hdfsResources.get().getFileSystem(); } /** * Gets the Hadoop file system for the specified context. * * @param context the process context * @return the Hadoop file system, or {@code null} if an error occurred */ @Nullable protected FileSystem getFileSystem(@Nonnull final ProcessContext context) { // Get Hadoop configuration final Configuration configuration = getConfiguration(); if (configuration == null) { getLog().error("Missing Hadoop configuration."); return null; } // Validate user if security is enabled if (SecurityUtil.isSecurityEnabled(configuration)) { // Get properties String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue(); String keyTab = context.getProperty(kerberosKeytab).getValue(); String principal = context.getProperty(kerberosPrincipal).getValue(); if (keyTab.isEmpty() || principal.isEmpty()) { getLog().error("Kerberos keytab or principal information missing in Kerberos enabled cluster."); return null; } // Authenticate try { getLog().debug("User authentication initiated."); if (new ApplySecurityPolicy().validateUserWithKerberos(getLog(), hadoopConfigurationResources, principal, keyTab)) { getLog().debug("User authenticated successfully."); } else { getLog().error("User authentication failed."); return null; } } catch (Exception e) { getLog().error("Failed to authenticate:" + e, e); return null; } } // Get file system final FileSystem fileSystem = getFileSystem(); if (fileSystem != null) { return fileSystem; } else { getLog().error("Hadoop FileSystem not properly configured."); return null; } } protected void tryKerberosRelogin(UserGroupInformation ugi) { try { getLog().info( "Kerberos ticket age exceeds threshold [{} seconds] " + "attempting to renew ticket for user {}", new Object[] { kerberosReloginThreshold, ugi.getUserName() }); ugi.checkTGTAndReloginFromKeytab(); lastKerberosReloginTime = System.currentTimeMillis() / 1000; getLog().info("Kerberos relogin successful or ticket still valid"); } catch (IOException e) { // Most likely case of this happening is ticket is expired and error getting a new one, // meaning dfs operations would fail getLog().error("Kerberos relogin failed", e); throw new ProcessException("Unable to renew kerberos ticket", e); } } protected boolean isTicketOld() { return (System.currentTimeMillis() / 1000 - lastKerberosReloginTime) > kerberosReloginThreshold; } static protected class HdfsResources { private final Configuration configuration; private final FileSystem fileSystem; private final UserGroupInformation userGroupInformation; public HdfsResources(Configuration configuration, FileSystem fileSystem, UserGroupInformation userGroupInformation) { this.configuration = configuration; this.fileSystem = fileSystem; this.userGroupInformation = userGroupInformation; } public Configuration getConfiguration() { return configuration; } public FileSystem getFileSystem() { return fileSystem; } public UserGroupInformation getUserGroupInformation() { return userGroupInformation; } } }