/*
 * Copyright 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.explore.service;

import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data2.datafabric.dataset.service.DatasetService;
import co.cask.cdap.data2.util.hbase.HBaseTableUtilFactory;
import co.cask.cdap.explore.guice.ExploreRuntimeModule;
import co.cask.cdap.explore.service.hive.Hive12CDH5ExploreService;
import co.cask.cdap.explore.service.hive.Hive12ExploreService;
import co.cask.cdap.explore.service.hive.Hive13ExploreService;
import co.cask.cdap.explore.service.hive.Hive14ExploreService;
import co.cask.cdap.hive.ExploreUtils;
import co.cask.cdap.internal.app.runtime.spark.SparkUtils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.io.ByteStreams;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.twill.api.ClassAcceptor;
import org.apache.twill.internal.utils.Dependencies;
import org.objectweb.asm.ClassReader;
import org.objectweb.asm.ClassVisitor;
import org.objectweb.asm.ClassWriter;
import org.objectweb.asm.MethodVisitor;
import org.objectweb.asm.Opcodes;
import org.objectweb.asm.commons.GeneratorAdapter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.jar.JarOutputStream;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

/**
 * Utility class for the explore service.
 */
public class ExploreServiceUtils {
  private static final Logger LOG = LoggerFactory.getLogger(ExploreServiceUtils.class);

  private static final String HIVE_AUTHFACTORY_CLASS_NAME = "org.apache.hive.service.auth.HiveAuthFactory";
  /**
   * Hive support enum.
   */
  public enum HiveSupport {
    // The order of the enum values below is very important
    // CDH 5.0 to 5.1 uses Hive 0.12
    HIVE_CDH5_0(Pattern.compile("^.*cdh5.0\\..*$"), Hive12CDH5ExploreService.class),
    HIVE_CDH5_1(Pattern.compile("^.*cdh5.1\\..*$"), Hive12CDH5ExploreService.class),
    // CDH 5.2.x and 5.3.x use Hive 0.13
    HIVE_CDH5_2(Pattern.compile("^.*cdh5.2\\..*$"), Hive13ExploreService.class),
    HIVE_CDH5_3(Pattern.compile("^.*cdh5.3\\..*$"), Hive13ExploreService.class),
    // CDH > 5.3 uses Hive >= 1.1 (which Hive14ExploreService supports)
    HIVE_CDH5(Pattern.compile("^.*cdh5\\..*$"), Hive14ExploreService.class),

    HIVE_12(null, Hive12ExploreService.class),
    HIVE_13(null, Hive13ExploreService.class),
    HIVE_14(null, Hive14ExploreService.class),
    HIVE_1_0(null, Hive14ExploreService.class),
    HIVE_1_1(null, Hive14ExploreService.class),
    HIVE_1_2(null, Hive14ExploreService.class);

    private final Pattern hadoopVersionPattern;
    private final Class<? extends ExploreService> hiveExploreServiceClass;

    HiveSupport(Pattern hadoopVersionPattern, Class<? extends ExploreService> hiveExploreServiceClass) {
      this.hadoopVersionPattern = hadoopVersionPattern;
      this.hiveExploreServiceClass = hiveExploreServiceClass;
    }

    public Pattern getHadoopVersionPattern() {
      return hadoopVersionPattern;
    }

    public Class<? extends ExploreService> getHiveExploreServiceClass() {
      return hiveExploreServiceClass;
    }
  }

  // Caching the dependencies so that we don't trace them twice
  private static Set<File> exploreDependencies = null;

  private static final Pattern HIVE_SITE_FILE_PATTERN = Pattern.compile("^.*/hive-site\\.xml$");
  private static final Pattern YARN_SITE_FILE_PATTERN = Pattern.compile("^.*/yarn-site\\.xml$");
  private static final Pattern MAPRED_SITE_FILE_PATTERN = Pattern.compile("^.*/mapred-site\\.xml$");

  public static Class<? extends ExploreService> getHiveService() {
    HiveSupport hiveVersion = checkHiveSupport(null);
    return hiveVersion.getHiveExploreServiceClass();
  }

  public static HiveSupport checkHiveSupport() {
    return checkHiveSupport(ExploreUtils.getExploreClassloader());
  }

  public static String getHiveVersion() {
    return getHiveVersion(ExploreUtils.getExploreClassloader());
  }

  public static String getHiveVersion(@Nullable ClassLoader hiveClassLoader) {
    ClassLoader usingCL = hiveClassLoader;
    if (usingCL == null) {
      usingCL = ExploreServiceUtils.class.getClassLoader();
    }
    try {
      Class<?> hiveVersionInfoClass = usingCL.loadClass("org.apache.hive.common.util.HiveVersionInfo");
      return (String) hiveVersionInfoClass.getDeclaredMethod("getVersion").invoke(null);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }
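  /*
   * Illustrative sketch (not part of the original class) of how the version detection above is
   * typically used. Assuming a plain Apache Hadoop distribution (so no "cdh5" pattern matches the
   * Hadoop version string) and a Hive 1.2.x client on the explore classpath, resolution falls
   * through to the version-string checks in checkHiveSupport below:
   *
   *   HiveSupport support = ExploreServiceUtils.checkHiveSupport();                 // -> HiveSupport.HIVE_1_2
   *   Class<? extends ExploreService> cls = support.getHiveExploreServiceClass();   // -> Hive14ExploreService.class
   *
   * Note that the enum order matters: the CDH-specific patterns are tried first, so on CDH the
   * Hadoop version string picks the service class before the Hive version is consulted.
   */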
  /**
   * Check that Hive is in the class path - with the right version.
   */
  public static HiveSupport checkHiveSupport(@Nullable ClassLoader hiveClassLoader) {
    // First try to figure out which hive support is relevant based on the Hadoop distribution name
    String hadoopVersion = VersionInfo.getVersion();
    for (HiveSupport hiveSupport : HiveSupport.values()) {
      if (hiveSupport.getHadoopVersionPattern() != null &&
        hiveSupport.getHadoopVersionPattern().matcher(hadoopVersion).matches()) {
        return hiveSupport;
      }
    }

    String hiveVersion = getHiveVersion(hiveClassLoader);
    LOG.info("Client Hive version: {}", hiveVersion);
    if (hiveVersion.startsWith("0.12.")) {
      return HiveSupport.HIVE_12;
    } else if (hiveVersion.startsWith("0.13.")) {
      return HiveSupport.HIVE_13;
    } else if (hiveVersion.startsWith("0.14.") || hiveVersion.startsWith("1.0.")) {
      return HiveSupport.HIVE_14;
    } else if (hiveVersion.startsWith("1.1.")) {
      return HiveSupport.HIVE_1_1;
    } else if (hiveVersion.startsWith("1.2")) {
      return HiveSupport.HIVE_1_2;
    }

    throw new RuntimeException("Hive distribution not supported. Set the configuration '" +
                               Constants.Explore.EXPLORE_ENABLED +
                               "' to false to start up without Explore.");
  }

  /**
   * Return the set of absolute paths of the bootstrap classes.
   */
  public static Set<String> getBoostrapClasses() {
    ImmutableSet.Builder<String> builder = ImmutableSet.builder();
    for (String classpath : Splitter.on(File.pathSeparatorChar).split(System.getProperty("sun.boot.class.path"))) {
      File file = new File(classpath);
      builder.add(file.getAbsolutePath());
      try {
        builder.add(file.getCanonicalPath());
      } catch (IOException e) {
        LOG.warn("Could not add canonical path to aux class path for file {}", file.toString(), e);
      }
    }
    return builder.build();
  }

  /**
   * Trace the jar dependencies needed by the Explore container. Uses a separate class loader to load Hive classes,
   * built using the explore classpath passed as a system property to master.
   *
   * @return an ordered set of jar files.
   */
  public static Set<File> traceExploreDependencies(File tmpDir) throws IOException {
    if (exploreDependencies != null) {
      return exploreDependencies;
    }

    ClassLoader classLoader = ExploreUtils.getExploreClassloader();

    Set<File> additionalJars = new HashSet<>();
    if (isSparkAvailable()) {
      File sparkAssemblyJar = SparkUtils.locateSparkAssemblyJar();
      LOG.debug("Adding spark jar to explore dependency {}", sparkAssemblyJar);
      additionalJars.add(sparkAssemblyJar);
    }

    if (isTezAvailable()) {
      additionalJars.addAll(getTezJars());
    }
    return traceExploreDependencies(classLoader, tmpDir, additionalJars);
  }
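  /*
   * Illustrative usage sketch (assumption, not part of the original class): master would typically
   * call the method above when localizing resources for the Explore container, along these lines:
   *
   *   File tmpDir = Files.createTempDirectory("explore-deps").toFile();
   *   for (File jar : ExploreServiceUtils.traceExploreDependencies(tmpDir)) {
   *     // localize 'jar' into the container, preserving the iteration order
   *   }
   *
   * The returned set is cached in 'exploreDependencies', so repeated calls do not re-trace.
   */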
  /**
   * Trace the jar dependencies needed by the Explore container.
   *
   * @param classLoader class loader to use to trace the dependencies.
   *                    If it is null, use the class loader of this class.
   * @param tmpDir temporary directory for storing rewritten jar files.
   * @param additionalJars additional jars that will be added to the end of the returned set.
   * @return an ordered set of jar files.
   */
  private static Set<File> traceExploreDependencies(ClassLoader classLoader, File tmpDir, Set<File> additionalJars)
    throws IOException {
    if (exploreDependencies != null) {
      return exploreDependencies;
    }

    ClassLoader usingCL = classLoader;
    if (classLoader == null) {
      usingCL = ExploreRuntimeModule.class.getClassLoader();
    }
    final Set<String> bootstrapClassPaths = getBoostrapClasses();

    ClassAcceptor classAcceptor = new ClassAcceptor() {
      /* Exclude any class contained in the bootstrapClassPaths, as well as Kryo classes.
       * We need to remove the Kryo dependency in the Explore container. Spark introduced Kryo version 2.21,
       * which would normally be shipped to the Explore container. Yet, Hive requires Kryo 2.22,
       * and gets it from the Hive jars - hive-exec.jar to be precise.
       */
      @Override
      public boolean accept(String className, URL classUrl, URL classPathUrl) {
        return !(bootstrapClassPaths.contains(classPathUrl.getFile()) ||
          className.startsWith("com.esotericsoftware.kryo"));
      }
    };

    Set<File> hBaseTableDeps = traceDependencies(usingCL, classAcceptor, tmpDir,
                                                 HBaseTableUtilFactory.getHBaseTableUtilClass().getName());

    // Note the order of dependency jars is important so that HBase jars come first in the classpath order.
    // LinkedHashSet maintains insertion order while removing duplicate entries.
    Set<File> orderedDependencies = new LinkedHashSet<>();
    orderedDependencies.addAll(hBaseTableDeps);
    orderedDependencies.addAll(traceDependencies(usingCL, classAcceptor, tmpDir,
                                                 DatasetService.class.getName(),
                                                 // Referred to by string rather than Class.getName()
                                                 // because DatasetStorageHandler and StreamStorageHandler
                                                 // extend a Hive class, which isn't present in this class loader
                                                 "co.cask.cdap.hive.datasets.DatasetStorageHandler",
                                                 "co.cask.cdap.hive.stream.StreamStorageHandler",
                                                 "org.apache.hadoop.hive.ql.exec.mr.ExecDriver",
                                                 "org.apache.hive.service.cli.CLIService",
                                                 "org.apache.hadoop.mapred.YarnClientProtocolProvider",
                                                 // Needed for - at least - CDH 4.4 integration
                                                 "org.apache.hive.builtins.BuiltinUtils",
                                                 // Needed for - at least - CDH 5 integration
                                                 "org.apache.hadoop.hive.shims.Hadoop23Shims"));
    orderedDependencies.addAll(additionalJars);

    exploreDependencies = orderedDependencies;
    return orderedDependencies;
  }
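  /*
   * Illustrative note (assumption, not part of the original class): because a LinkedHashSet is used
   * above, the dependency set handed back to the caller is ordered roughly like
   *
   *   [hbase-*.jar ...] -> [traced CDAP/Hive jars ...] -> [spark-assembly.jar, tez-*.jar ...]
   *
   * i.e. HBase jars first, then the traced Hive/CDAP dependencies, then the additional Spark/Tez jars.
   */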
  /**
   * Trace the dependency files of the given classes, using the given class loader,
   * including only the classes that are accepted by the classAcceptor.
   *
   * Nothing is returned if the classLoader - or, if not provided, the ExploreRuntimeModule class loader -
   * does not contain the given classes.
   */
  public static Set<File> traceDependencies(@Nullable ClassLoader classLoader, final ClassAcceptor classAcceptor,
                                            File tmpDir, String... classNames) throws IOException {
    LOG.debug("Tracing dependencies for classes: {}", Arrays.toString(classNames));
    ClassLoader usingCL = classLoader;
    if (usingCL == null) {
      usingCL = ExploreRuntimeModule.class.getClassLoader();
    }
    final String rewritingClassName = HIVE_AUTHFACTORY_CLASS_NAME;
    final Set<File> rewritingFiles = Sets.newHashSet();
    final Set<File> jarFiles = Sets.newHashSet();

    Dependencies.findClassDependencies(usingCL, new ClassAcceptor() {
      @Override
      public boolean accept(String className, URL classUrl, URL classPathUrl) {
        if (!classAcceptor.accept(className, classUrl, classPathUrl)) {
          return false;
        }
        if (rewritingClassName.equals(className)) {
          rewritingFiles.add(new File(classPathUrl.getFile()));
        }
        jarFiles.add(new File(classPathUrl.getFile()));
        return true;
      }
    }, classNames);

    // Rewrite HiveAuthFactory.loginFromKeytab to be a no-op method.
    // This is needed because we don't want to use Hive's CLIService since
    // we're already using delegation tokens.
    for (File rewritingFile : rewritingFiles) {
      // TODO: this may cause lots of rewrites since we may rewrite the same jar multiple times
      File rewrittenJar = rewriteHiveAuthFactory(
        rewritingFile, new File(tmpDir, rewritingFile.getName() + "-" + System.currentTimeMillis() + ".jar"));
      jarFiles.add(rewrittenJar);
      LOG.debug("Rewrote {} to {}", rewritingFile.getAbsolutePath(), rewrittenJar.getAbsolutePath());
    }
    jarFiles.removeAll(rewritingFiles);

    if (LOG.isDebugEnabled()) {
      for (File jarFile : jarFiles) {
        LOG.debug("Added jar {}", jarFile.getAbsolutePath());
      }
    }

    return jarFiles;
  }

  @VisibleForTesting
  static File rewriteHiveAuthFactory(File sourceJar, File targetJar) throws IOException {
    try (
      JarFile input = new JarFile(sourceJar);
      JarOutputStream output = new JarOutputStream(new FileOutputStream(targetJar))
    ) {
      String hiveAuthFactoryPath = HIVE_AUTHFACTORY_CLASS_NAME.replace('.', '/') + ".class";

      Enumeration<JarEntry> sourceEntries = input.entries();
      while (sourceEntries.hasMoreElements()) {
        JarEntry entry = sourceEntries.nextElement();
        output.putNextEntry(new JarEntry(entry.getName()));

        try (InputStream entryInputStream = input.getInputStream(entry)) {
          if (!hiveAuthFactoryPath.equals(entry.getName())) {
            ByteStreams.copy(entryInputStream, output);
            continue;
          }

          try {
            // Rewrite the bytecode of the HiveAuthFactory.loginFromKeytab method to a no-op method
            ClassReader cr = new ClassReader(entryInputStream);
            ClassWriter cw = new ClassWriter(ClassWriter.COMPUTE_MAXS);
            cr.accept(new ClassVisitor(Opcodes.ASM5, cw) {
              @Override
              public MethodVisitor visitMethod(final int access, final String name, final String desc,
                                               String signature, String[] exceptions) {
                MethodVisitor methodVisitor = super.visitMethod(access, name, desc, signature, exceptions);
                if (!"loginFromKeytab".equals(name)) {
                  return methodVisitor;
                }
                GeneratorAdapter adapter = new GeneratorAdapter(methodVisitor, access, name, desc);
                adapter.returnValue();

                // VisitMaxs with 0 so that COMPUTE_MAXS from ClassWriter will compute the right values.
                adapter.visitMaxs(0, 0);
                return new MethodVisitor(Opcodes.ASM5) { };
              }
            }, 0);
            output.write(cw.toByteArray());
          } catch (Exception e) {
            throw new IOException("Unable to generate HiveAuthFactory class", e);
          }
        }
      }

      return targetJar;
    }
  }
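  /*
   * Illustrative sketch (assumption, not part of the original class): after rewriteHiveAuthFactory
   * runs, the loginFromKeytab method inside the rewritten copy of the Hive jar behaves as if it
   * had been written as a no-op, roughly equivalent to:
   *
   *   public static void loginFromKeytab(HiveConf hiveConf) {
   *     // intentionally empty - CDAP already handles delegation tokens
   *   }
   *
   * The exact signature depends on the Hive version; only the method name "loginFromKeytab" is matched.
   */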
  /**
   * Updates configuration properties in hive-site.xml, mapred-site.xml and yarn-site.xml for explore.
   * All other conf files are returned without any update.
   *
   * @param confFile conf file to update
   * @param tempDir temp dir to create files if necessary
   * @return the new conf file to use in place of confFile
   */
  public static File updateConfFileForExplore(File confFile, File tempDir) {
    if (HIVE_SITE_FILE_PATTERN.matcher(confFile.getAbsolutePath()).matches()) {
      return updateHiveConfFile(confFile, tempDir);
    } else if (YARN_SITE_FILE_PATTERN.matcher(confFile.getAbsolutePath()).matches()) {
      return updateYarnConfFile(confFile, tempDir);
    } else if (MAPRED_SITE_FILE_PATTERN.matcher(confFile.getAbsolutePath()).matches()) {
      return updateMapredConfFile(confFile, tempDir);
    } else {
      return confFile;
    }
  }

  /**
   * Change the yarn-site.xml file, and return a temp copy of it to which the
   * necessary options have been added.
   */
  private static File updateYarnConfFile(File confFile, File tempDir) {
    Configuration conf = new Configuration(false);
    try {
      conf.addResource(confFile.toURI().toURL());
    } catch (MalformedURLException e) {
      LOG.error("File {} is malformed.", confFile, e);
      throw Throwables.propagate(e);
    }

    String yarnAppClassPath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                                       Joiner.on(",").join(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH));

    // Add $PWD/* at the beginning of the classpath so that the user's jars take precedence.
    // Without this change, job.jar would be at the beginning of the classpath; since job.jar
    // contains old guava version classes, we want $PWD/* to come before it.
    yarnAppClassPath = "$PWD/*," + yarnAppClassPath;

    conf.set(YarnConfiguration.YARN_APPLICATION_CLASSPATH, yarnAppClassPath);

    File newYarnConfFile = new File(tempDir, "yarn-site.xml");
    try (FileOutputStream os = new FileOutputStream(newYarnConfFile)) {
      conf.writeXml(os);
    } catch (IOException e) {
      LOG.error("Problem creating and writing to temporary yarn-site.xml conf file at {}", newYarnConfFile, e);
      throw Throwables.propagate(e);
    }

    return newYarnConfFile;
  }

  /**
   * Change the mapred-site.xml file, and return a temp copy of it to which the
   * necessary options have been added.
   */
  private static File updateMapredConfFile(File confFile, File tempDir) {
    Configuration conf = new Configuration(false);
    try {
      conf.addResource(confFile.toURI().toURL());
    } catch (MalformedURLException e) {
      LOG.error("File {} is malformed.", confFile, e);
      throw Throwables.propagate(e);
    }

    String mrAppClassPath = conf.get(MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH,
                                     MRJobConfig.DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH);

    // Add $PWD/* at the beginning of the classpath. Without this change, old jars from the MR framework
    // classpath get into the classpath first.
    mrAppClassPath = "$PWD/*," + mrAppClassPath;

    conf.set(MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH, mrAppClassPath);

    File newMapredConfFile = new File(tempDir, "mapred-site.xml");
    try (FileOutputStream os = new FileOutputStream(newMapredConfFile)) {
      conf.writeXml(os);
    } catch (IOException e) {
      LOG.error("Problem creating and writing to temporary mapred-site.xml conf file at {}", newMapredConfFile, e);
      throw Throwables.propagate(e);
    }

    return newMapredConfFile;
  }
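  /*
   * Illustrative example (assumption, not part of the original class): given a yarn-site.xml with
   *
   *   yarn.application.classpath = $HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/share/hadoop/common/*
   *
   * the rewritten copy produced by updateYarnConfFile would contain
   *
   *   yarn.application.classpath = $PWD/*,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/share/hadoop/common/*
   *
   * so that jars localized into the container's working directory win over the framework jars.
   * updateMapredConfFile applies the same prepend to mapreduce.application.classpath.
   */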
  /**
   * Change the hive-site.xml file, and return a temp copy of it to which the
   * necessary options have been added.
   */
  private static File updateHiveConfFile(File confFile, File tempDir) {
    Configuration conf = new Configuration(false);
    try {
      conf.addResource(confFile.toURI().toURL());
    } catch (MalformedURLException e) {
      LOG.error("File {} is malformed.", confFile, e);
      throw Throwables.propagate(e);
    }

    // We prefer jars at the container's root directory before job.jar: we edit the
    // YARN_APPLICATION_CLASSPATH in yarn-site.xml using
    // co.cask.cdap.explore.service.ExploreServiceUtils.updateYarnConfFile, and
    // setting MAPREDUCE_JOB_CLASSLOADER and MAPREDUCE_JOB_USER_CLASSPATH_FIRST to false puts
    // YARN_APPLICATION_CLASSPATH before job.jar in the container's classpath.
    conf.setBoolean(Job.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, false);
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, false);

    String sparkHome = System.getenv(Constants.SPARK_HOME);
    if (sparkHome != null) {
      LOG.debug("Setting spark.home in hive conf to {}", sparkHome);
      conf.set("spark.home", sparkHome);
    }

    File newHiveConfFile = new File(tempDir, "hive-site.xml");

    try (FileOutputStream os = new FileOutputStream(newHiveConfFile)) {
      conf.writeXml(os);
    } catch (IOException e) {
      LOG.error("Problem creating temporary hive-site.xml conf file at {}", newHiveConfFile, e);
      throw Throwables.propagate(e);
    }
    return newHiveConfFile;
  }

  public static boolean isSparkAvailable() {
    try {
      // SparkUtils.locateSparkAssemblyJar() throws IllegalStateException if it is not able to locate the spark jar
      SparkUtils.locateSparkAssemblyJar();
      return true;
    } catch (IllegalStateException e) {
      LOG.debug("Got exception while determining spark availability", e);
      return false;
    }
  }

  public static boolean isSparkEngine(HiveConf hiveConf) {
    // We don't support setting the engine through session configuration for now
    String engine = hiveConf.get("hive.execution.engine");
    return "spark".equalsIgnoreCase(engine);
  }

  // This method determines whether Tez is enabled based on the TEZ_HOME environment variable.
  // Master prepares the explore container by adding the jars available in TEZ_HOME.
  // However, this environment variable is not available to the explore container itself (BaseHiveService);
  // there, the existence of Tez is checked using the hive.execution.engine config variable.
  private static boolean isTezAvailable() {
    return System.getenv(Constants.TEZ_HOME) != null;
  }

  private static Set<File> getTezJars() {
    String tezHome = System.getenv(Constants.TEZ_HOME);
    Path tezHomeDir = Paths.get(tezHome);
    final PathMatcher pathMatcher = tezHomeDir.getFileSystem().getPathMatcher("glob:*.jar");

    final Set<File> tezJars = new HashSet<>();
    try {
      Files.walkFileTree(tezHomeDir, new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
          if (attrs.isRegularFile() && pathMatcher.matches(file.getFileName())) {
            tezJars.add(file.toFile());
          }
          return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
          // Ignore error
          return FileVisitResult.CONTINUE;
        }
      });
    } catch (IOException e) {
      LOG.warn("Exception raised while inspecting {}", tezHomeDir, e);
    }
    return tezJars;
  }
}
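// Illustrative usage sketch (assumption, not part of the original class): the conf-file helpers are
// typically applied to each Hadoop/Hive site file before it is shipped to the Explore container, e.g.
//
//   File tempDir = Files.createTempDirectory("explore-conf").toFile();
//   File hiveSite = ExploreServiceUtils.updateConfFileForExplore(new File("/etc/hive/conf/hive-site.xml"), tempDir);
//   File yarnSite = ExploreServiceUtils.updateConfFileForExplore(new File("/etc/hadoop/conf/yarn-site.xml"), tempDir);
//   // any other conf file is returned unchanged by updateConfFileForExplore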