org.datacleaner.spark.ApplicationDriver.java Source code

Introduction

Here is the source code for org.datacleaner.spark.ApplicationDriver.java. The class submits a DataCleaner job to a YARN cluster through Spark's SparkLauncher API: it gathers the required JAR files from a directory on HDFS, generates a temporary Hadoop configuration directory from a bundled core-site.xml template, launches spark-submit, and streams the process output to the application log. It also provides a helper for copying local files to HDFS.

Source

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.metamodel.util.Action;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.HdfsResource;
import org.apache.metamodel.util.MutableRef;
import org.apache.spark.launcher.SparkLauncher;
import org.datacleaner.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Strings;

public class ApplicationDriver {

    private static final Logger logger = LoggerFactory.getLogger(ApplicationDriver.class);

    private static final String PRIMARY_JAR_FILENAME_PREFIX = "DataCleaner-spark";

    private final String _hostname;
    private final int _port;
    private final String _jarDirectoryPath;
    private final String _sparkHome;

    public ApplicationDriver(String hostname, int port, String jarDirectoryPath) {
        this(hostname, port, jarDirectoryPath, determineSparkHome());
    }

    private static String determineSparkHome() {
        String sparkHome = System.getProperty("SPARK_HOME");

        if (Strings.isNullOrEmpty(sparkHome)) {
            sparkHome = System.getenv("SPARK_HOME");
        }

        if (Strings.isNullOrEmpty(sparkHome)) {
            throw new IllegalStateException(
                    "Could not determine SPARK_HOME. Please set it as an environment variable or system property, or provide it as a "
                            + ApplicationDriver.class.getSimpleName() + " constructor argument");
        }

        return sparkHome;
    }

    public ApplicationDriver(String hostname, int port, String jarDirectoryPath, String sparkHome) {
        _hostname = hostname;
        _port = port;
        _jarDirectoryPath = jarDirectoryPath;
        _sparkHome = sparkHome;
    }

    /**
     * Launches and waits for the execution of a DataCleaner job on Spark.
     * 
     * @param configurationHdfsPath
     *            configuration file path (on HDFS)
     * @param jobHdfsPath
     *            job file path (on HDFS)
     * @return the exit code of the spark-submit process
     * @throws Exception
     *             if the process cannot be launched or if waiting for it is
     *             interrupted
     */
    public int launch(String configurationHdfsPath, String jobHdfsPath) throws Exception {
        // create hadoop configuration directory
        final File hadoopConfDir = createTemporaryHadoopConfDir();

        final SparkLauncher sparkLauncher = createSparkLauncher(hadoopConfDir, configurationHdfsPath, jobHdfsPath);

        return launch(sparkLauncher);
    }

    public int launch(SparkLauncher sparkLauncher) throws Exception {
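        // start the spark-submit child process; its stdout and stderr must be
        // drained continuously, or the process may block on full pipe buffers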
        final Process process = sparkLauncher.launch();

        final InputStream errorStream = process.getErrorStream();
        startLogger(errorStream);

        final InputStream inputStream = process.getInputStream();
        startLogger(inputStream);

        return process.waitFor();
    }

    private void startLogger(final InputStream stream) {
        // forward each line of the process output to the application log on a
        // background thread
        new Thread() {
            @Override
            public void run() {
                try (final BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
                    String line = br.readLine();
                    while (line != null) {
                        logger.info(line);
                        line = br.readLine();
                    }
                } catch (Exception e) {
                    logger.warn("Logger thread failure: " + e.getMessage(), e);
                }
            }
        }.start();
    }

    public HdfsResource createResource(String hdfsPath) {
        return new HdfsResource(_hostname, _port, hdfsPath);
    }

    public SparkLauncher createSparkLauncher(File hadoopConfDir, String configurationHdfsPath, String jobHdfsPath)
            throws Exception {
        // mimic the environment variables that spark-submit would otherwise
        // pick up from the shell; YARN_CONF_DIR points the launcher at the
        // generated Hadoop configuration directory
        final Map<String, String> env = new HashMap<>();
        env.put("YARN_CONF_DIR", hadoopConfDir.getAbsolutePath());

        final SparkLauncher sparkLauncher = new SparkLauncher(env);

        sparkLauncher.setSparkHome(_sparkHome);
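        // yarn-cluster mode: the Spark driver runs inside the YARN cluster
        // rather than on the machine invoking spark-submit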
        sparkLauncher.setMaster("yarn-cluster");
        sparkLauncher.setAppName("DataCleaner");

        final MutableRef<String> primaryJar = new MutableRef<>();
        final List<String> jars = buildJarFiles(primaryJar);
        logger.info("Using JAR files: {}", jars);

        for (final String jar : jars) {
            sparkLauncher.addJar(jar);
        }
        sparkLauncher.setMainClass(Main.class.getName());

        // the primary jar is always the first argument
        sparkLauncher.addAppArgs(primaryJar.get());

        sparkLauncher.addAppArgs(toHdfsPath(configurationHdfsPath));
        sparkLauncher.addAppArgs(toHdfsPath(jobHdfsPath));

        return sparkLauncher;
    }

    // qualify a bare path (e.g. "/jobs/job.xml") with the configured namenode;
    // paths that already carry the "hdfs://" scheme pass through unchanged
    private String toHdfsPath(String path) {
        if (path.startsWith("hdfs://")) {
            return path;
        }
        return "hdfs://" + _hostname + ":" + _port + path;
    }

    private List<String> buildJarFiles(MutableRef<String> primaryJarRef) throws IOException {
        final List<String> list = new ArrayList<>();

        final Configuration conf = new Configuration();
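        // point the Hadoop client at the namenode so that the JAR directory
        // path resolves against the remote HDFS file system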
        conf.set("fs.defaultFS", "hdfs://" + _hostname + ":" + _port);

        final FileSystem fs = FileSystem.newInstance(conf);
        try {
            final Path directoryPath = new Path(_jarDirectoryPath);
            final RemoteIterator<LocatedFileStatus> files = fs.listFiles(directoryPath, false);
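            // the jar whose name starts with "DataCleaner-spark" becomes the
            // primary application jar (passed as the first program argument);
            // all other jars in the directory are shipped as dependencies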
            while (files.hasNext()) {
                final LocatedFileStatus file = files.next();
                final Path path = file.getPath();
                final String filename = path.getName();
                if (filename.startsWith(PRIMARY_JAR_FILENAME_PREFIX)) {
                    primaryJarRef.set(path.toString());
                } else {
                    list.add(path.toString());
                }
            }
        } finally {
            FileHelper.safeClose(fs);
        }

        if (primaryJarRef.get() == null) {
            throw new IllegalArgumentException("Failed to find primary jar (starting with '"
                    + PRIMARY_JAR_FILENAME_PREFIX + "') in JAR file directory: " + _jarDirectoryPath);
        }

        return list;
    }

    public File createTemporaryHadoopConfDir() throws IOException {
        final File hadoopConfDir = new File(FileHelper.getTempDir(),
                "datacleaner_hadoop_conf_" + UUID.randomUUID().toString());
        if (!hadoopConfDir.mkdirs() && !hadoopConfDir.isDirectory()) {
            throw new IOException("Could not create temporary directory: " + hadoopConfDir);
        }

        // generate a core-site.xml from the bundled template, substituting the
        // placeholders with the actual namenode hostname and port. Note that
        // StringUtils.replaceAll performs a literal (non-regex) replacement.
        final File coreSiteFile = new File(hadoopConfDir, "core-site.xml");
        try (final InputStream inputStream = getClass().getResourceAsStream("core-site-template.xml");
                final Writer writer = FileHelper.getWriter(coreSiteFile)) {
            final BufferedReader reader = FileHelper.getBufferedReader(inputStream, FileHelper.UTF_8_ENCODING);
            String line = reader.readLine();
            while (line != null) {
                line = StringUtils.replaceAll(line, "${HDFS_HOSTNAME}", _hostname);
                line = StringUtils.replaceAll(line, "${HDFS_PORT}", _port + "");
                writer.write(line);
                writer.write('\n');

                line = reader.readLine();
            }
        }
        return hadoopConfDir;
    }

    public void copyFileToHdfs(File file, String hdfsPath) {
        copyFileToHdfs(file, hdfsPath, true);
    }

    public void copyFileToHdfs(final File file, final String hdfsPath, final boolean overwrite) {
        final HdfsResource hdfsResource = createResource(hdfsPath);
        if (!overwrite && hdfsResource.isExists()) {
            // no need to copy
            return;
        }
        hdfsResource.write(new Action<OutputStream>() {
            @Override
            public void run(OutputStream out) throws Exception {
                // try-with-resources ensures the local file is closed even if
                // the copy to HDFS fails
                try (final FileInputStream in = new FileInputStream(file)) {
                    FileHelper.copy(in, out);
                }
            }
        });
    }
}
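
Usage

A minimal sketch of how the driver might be invoked. ApplicationDriverExample is a hypothetical class written for illustration; the hostname, port and paths are placeholder values. It assumes that SPARK_HOME is set and that the HDFS JAR directory already contains the DataCleaner-spark jar and its dependency jars.

import java.io.File;

import org.datacleaner.spark.ApplicationDriver;

public class ApplicationDriverExample {

    public static void main(String[] args) throws Exception {
        // namenode hostname/port and the HDFS directory holding the jars
        // (placeholder values - adjust for the actual cluster)
        final ApplicationDriver driver = new ApplicationDriver("namenode.example.com", 9000, "/datacleaner/lib");

        // stage the configuration and job files on HDFS before launching
        driver.copyFileToHdfs(new File("conf.xml"), "/datacleaner/conf.xml");
        driver.copyFileToHdfs(new File("my-job.analysis.xml"), "/datacleaner/jobs/my-job.analysis.xml");

        // submit the job to YARN via spark-submit and wait for completion
        final int exitCode = driver.launch("/datacleaner/conf.xml", "/datacleaner/jobs/my-job.analysis.xml");
        System.exit(exitCode);
    }
}

The three-argument constructor falls back to the SPARK_HOME system property or environment variable; the four-argument constructor accepts the Spark installation directory explicitly.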