org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtilTest.java Source code

Introduction

Here is the source code for org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtilTest.java, the JUnit test class that exercises DistributedCacheUtil in the Pentaho Big Data plugin.
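
The class uses plain JUnit 4 (@Test annotations and org.junit.Assert), so it can be executed with any JUnit 4 runner. As a minimal sketch, assuming JUnit 4 and the Pentaho/Hadoop test dependencies are on the classpath and the relative test-res/ and bin/test/ paths referenced by the tests are available, the suite could be launched programmatically like this:

import org.junit.runner.JUnitCore;
import org.junit.runner.Result;
import org.junit.runner.notification.Failure;

public class RunDistributedCacheUtilTests {
    public static void main(String[] args) {
        // Run the whole test class with the stock JUnit 4 runner.
        Result result = JUnitCore.runClasses(
                org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtilTest.class);
        for (Failure failure : result.getFailures()) {
            System.err.println(failure);
        }
        System.out.println("Tests run: " + result.getRunCount()
                + ", failures: " + result.getFailureCount());
    }
}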

Source

/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2012 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.job.entries.hadooptransjobexecutor;

import org.apache.commons.vfs.AllFileSelector;
import org.apache.commons.vfs.FileObject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.junit.Test;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.hdfs.vfs.HDFSFileSystem;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import static org.junit.Assert.*;

/**
 * Test the DistributedCacheUtil
 */
public class DistributedCacheUtilTest {

    private FileObject createTestFolderWithContent() throws Exception {
        return createTestFolderWithContent("sample-folder");
    }

    private FileObject createTestFolderWithContent(String rootFolderName) throws Exception {
        String rootName = "bin/test/" + rootFolderName;
        FileObject root = KettleVFS.getFileObject(rootName);
        FileObject jar1 = KettleVFS.getFileObject(rootName + Const.FILE_SEPARATOR + "jar1.jar");
        FileObject jar2 = KettleVFS.getFileObject(rootName + Const.FILE_SEPARATOR + "jar2.jar");
        FileObject folder = KettleVFS.getFileObject(rootName + Const.FILE_SEPARATOR + "folder");
        FileObject file = KettleVFS
                .getFileObject(rootName + Const.FILE_SEPARATOR + "folder" + Const.FILE_SEPARATOR + "file.txt");

        root.createFolder();
        folder.createFolder();
        jar1.createFile();
        jar2.createFile();
        file.createFile();

        return root;
    }

    @Test
    public void deleteDirectory() throws Exception {
        FileObject test = KettleVFS.getFileObject("bin/test/deleteDirectoryTest");
        test.createFolder();

        DistributedCacheUtil ch = new DistributedCacheUtil();
        ch.deleteDirectory(test);
        try {
            assertFalse(test.exists());
        } finally {
            // Delete the directory with java.io.File if it wasn't removed
            File f = new File("bin/test/deleteDirectoryTest");
            if (f.exists() && !f.delete()) {
                throw new IOException("unable to delete test directory: " + f.getAbsolutePath());
            }
        }
    }

    @Test
    public void extract_invalid_archive() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        try {
            ch.extract(KettleVFS.getFileObject("bogus"), null);
            fail("expected exception");
        } catch (IllegalArgumentException ex) {
            assertTrue(ex.getMessage().startsWith("archive does not exist"));
        }
    }

    @Test
    public void extract_destination_exists() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        FileObject archive = KettleVFS.getFileObject("test-res/pentaho-mapreduce-sample.jar");

        try {
            ch.extract(archive, KettleVFS.getFileObject("."));
            fail("expected exception when the destination already exists");
        } catch (IllegalArgumentException ex) {
            assertEquals("destination already exists", ex.getMessage());
        }
    }

    @Test
    public void extractToTemp() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        FileObject archive = KettleVFS.getFileObject("test-res/pentaho-mapreduce-sample.jar");
        FileObject extracted = ch.extractToTemp(archive);

        assertNotNull(extracted);
        assertTrue(extracted.exists());
        try {
            // There should be 3 files and 5 directories inside the root folder (which is the 9th entry)
            assertEquals(9, extracted.findFiles(new AllFileSelector()).length);
        } finally {
            // clean up after ourselves
            ch.deleteDirectory(extracted);
        }
    }

    @Test
    public void extractToTemp_missing_archive() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        try {
            ch.extractToTemp(null);
            fail("Expected exception");
        } catch (NullPointerException ex) {
            assertEquals("archive is required", ex.getMessage());
        }
    }

    @Test
    public void findFiles_vfs() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        FileObject testFolder = createTestFolderWithContent();

        try {
            // Simply test we can find the jar files in our test folder
            List<String> jars = ch.findFiles(testFolder, "jar");
            assertEquals(2, jars.size());

            // Look for all files and folders
            List<String> all = ch.findFiles(testFolder, null);
            assertEquals(5, all.size());
        } finally {
            testFolder.delete(new AllFileSelector());
        }
    }

    @Test
    public void findFiles_vfs_hdfs() throws Exception {

        // Stage files then make sure we can find them in HDFS
        DistributedCacheUtil ch = new DistributedCacheUtil();
        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);
        HDFSFileSystem.setMockHDFSFileSystem(fs);

        // Must use absolute paths so the HDFS VFS FileSystem can resolve the URL properly (can't do relative paths when
        // using KettleVFS.getFileObject() within HDFS)
        Path root = new Path(KettleVFS.getFileObject(".").getURL().getPath() + "/bin/test/findFiles_hdfs");
        Path dest = new Path(root, "org/pentaho/mapreduce/");

        FileObject hdfsDest = KettleVFS.getFileObject("hdfs://localhost/" + dest.toString());

        // Copy the contents of test folder
        FileObject source = createTestFolderWithContent();

        try {
            try {
                ch.stageForCache(source, fs, dest, true);

                List<String> files = ch.findFiles(hdfsDest, null);
                assertEquals(5, files.size());
            } finally {
                fs.delete(root, true);
            }
        } finally {
            source.delete(new AllFileSelector());
        }
    }

    @Test
    public void findFiles_hdfs_native() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        // Copy the contents of test folder
        FileObject source = createTestFolderWithContent();
        Path root = new Path("bin/test/stageArchiveForCacheTest");
        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);
        Path dest = new Path(root, "org/pentaho/mapreduce/");
        try {
            try {
                ch.stageForCache(source, fs, dest, true);

                List<Path> files = ch.findFiles(fs, dest, null);
                assertEquals(3, files.size());

                files = ch.findFiles(fs, dest, Pattern.compile(".*jar$"));
                assertEquals(2, files.size());

                files = ch.findFiles(fs, dest, Pattern.compile(".*folder$"));
                assertEquals(1, files.size());
            } finally {
                fs.delete(root, true);
            }
        } finally {
            source.delete(new AllFileSelector());
        }
    }

    /**
     * Utility to attempt to stage a file to HDFS for use with Distributed Cache.
     *
     * @param ch                Distributed Cache Helper
     * @param source            File or directory to stage
     * @param fs                FileSystem to stage to
     * @param root              Root directory to clean up when this test is complete
     * @param dest              Destination path to stage to
     * @param expectedFileCount Expected number of files to exist in the destination once staged
     * @param expectedDirCount  Expected number of directories to exist in the destination once staged
     * @throws Exception
     */
    private void stageForCacheTester(DistributedCacheUtil ch, FileObject source, FileSystem fs, Path root,
            Path dest, int expectedFileCount, int expectedDirCount) throws Exception {
        try {
            ch.stageForCache(source, fs, dest, true);

            assertTrue(fs.exists(dest));
            ContentSummary cs = fs.getContentSummary(dest);
            assertEquals(expectedFileCount, cs.getFileCount());
            assertEquals(expectedDirCount, cs.getDirectoryCount());
            assertEquals(FsPermission.createImmutable((short) 0755), fs.getFileStatus(dest).getPermission());
        } finally {
            // Clean up after ourselves
            if (!fs.delete(root, true)) {
                System.err.println("error deleting FileSystem temp dir " + root);
            }
        }
    }

    @Test
    public void stageForCache() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        // Copy the contents of test folder
        FileObject source = createTestFolderWithContent();

        try {
            Path root = new Path("bin/test/stageArchiveForCacheTest");
            Path dest = new Path(root, "org/pentaho/mapreduce/");

            Configuration conf = new Configuration();
            org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

            stageForCacheTester(ch, source, fs, root, dest, 3, 2);
        } finally {
            source.delete(new AllFileSelector());
        }
    }

    @Test
    public void stageForCache_missing_source() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        Path dest = new Path("bin/test/bogus-destination");
        FileObject bogusSource = KettleVFS.getFileObject("bogus");
        try {
            ch.stageForCache(bogusSource, fs, dest, true);
            fail("expected exception when source does not exist");
        } catch (KettleFileException ex) {
            assertEquals(BaseMessages.getString(DistributedCacheUtil.class,
                    "DistributedCacheUtil.SourceDoesNotExist", bogusSource), ex.getMessage().trim());
        }
    }

    @Test
    public void stageForCache_destination_no_overwrite() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        FileObject source = createTestFolderWithContent();
        try {
            Path root = new Path("bin/test/stageForCache_destination_exists");
            Path dest = new Path(root, "dest");

            fs.mkdirs(dest);
            assertTrue(fs.exists(dest));
            assertTrue(fs.getFileStatus(dest).isDir());
            try {
                ch.stageForCache(source, fs, dest, false);
                fail("expected exception when destination exists and overwrite is not allowed");
            } catch (KettleFileException ex) {
                assertTrue(ex.getMessage().contains("Destination exists"));
            } finally {
                fs.delete(root, true);
            }
        } finally {
            source.delete(new AllFileSelector());
        }
    }

    @Test
    public void stageForCache_destination_exists() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        FileObject source = createTestFolderWithContent();
        try {
            Path root = new Path("bin/test/stageForCache_destination_exists");
            Path dest = new Path(root, "dest");

            fs.mkdirs(dest);
            assertTrue(fs.exists(dest));
            assertTrue(fs.getFileStatus(dest).isDir());

            stageForCacheTester(ch, source, fs, root, dest, 3, 2);
        } finally {
            source.delete(new AllFileSelector());
        }
    }

    @Test
    public void addCachedFilesToClasspath() throws IOException {
        DistributedCacheUtil ch = new DistributedCacheUtil();
        Configuration conf = new Configuration();

        List<Path> files = Arrays.asList(new Path("a"), new Path("b"), new Path("c"));

        ch.addCachedFilesToClasspath(files, conf);

        assertEquals("yes", conf.get("mapred.create.symlink"));

        for (Path file : files) {
            assertTrue(conf.get("mapred.cache.files").contains(file.toString()));
            assertTrue(conf.get("mapred.job.classpath.files").contains(file.toString()));
        }
    }

    @Test
    public void ispmrInstalledAt() throws IOException {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        Path root = new Path("bin/test/ispmrInstalledAt");
        Path lib = new Path(root, "lib");
        Path plugins = new Path(root, "plugins");
        Path bigDataPlugin = new Path(plugins, DistributedCacheUtil.PENTAHO_BIG_DATA_PLUGIN_FOLDER_NAME);

        Path lockFile = ch.getLockFileAt(root);
        try {
            // Create all directories (parent directories created automatically)
            fs.mkdirs(lib);
            fs.mkdirs(bigDataPlugin);

            assertTrue(ch.isKettleEnvironmentInstalledAt(fs, root));

            // If the lock file is present, pmr is not installed
            fs.create(lockFile);
            assertFalse(ch.isKettleEnvironmentInstalledAt(fs, root));

            // Try to create a file instead of a directory for the pentaho-big-data-plugin. This should be detected.
            fs.delete(bigDataPlugin, true);
            fs.create(bigDataPlugin);
            assertFalse(ch.isKettleEnvironmentInstalledAt(fs, root));
        } finally {
            fs.delete(root, true);
        }
    }

    @Test
    public void installKettleEnvironment_missing_arguments() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        try {
            ch.installKettleEnvironment(null, null, null, null, null);
            fail("Expected exception on missing archive");
        } catch (NullPointerException ex) {
            assertEquals("pmrArchive is required", ex.getMessage());
        }

        try {
            ch.installKettleEnvironment(KettleVFS.getFileObject("."), null, null, null, null);
            fail("Expected exception on missing archive");
        } catch (NullPointerException ex) {
            assertEquals("destination is required", ex.getMessage());
        }

        try {
            ch.installKettleEnvironment(KettleVFS.getFileObject("."), null, new Path("."), null, null);
            fail("Expected exception on missing archive");
        } catch (NullPointerException ex) {
            assertEquals("big data plugin required", ex.getMessage());
        }
    }

    @Test
    public void installKettleEnvironment() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        // This "empty pmr" contains a lib/ folder but with no content
        FileObject pmrArchive = KettleVFS.getFileObject("test-res/empty-pmr.zip");

        FileObject bigDataPluginDir = createTestFolderWithContent(
                DistributedCacheUtil.PENTAHO_BIG_DATA_PLUGIN_FOLDER_NAME);

        Path root = new Path("bin/test/installKettleEnvironment");
        try {
            ch.installKettleEnvironment(pmrArchive, fs, root, bigDataPluginDir, null);
            assertTrue(ch.isKettleEnvironmentInstalledAt(fs, root));
        } finally {
            bigDataPluginDir.delete(new AllFileSelector());
            fs.delete(root, true);
        }
    }

    @Test
    public void installKettleEnvironment_additional_plugins() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        // This "empty pmr" contains a lib/ folder but with no content
        FileObject pmrArchive = KettleVFS.getFileObject("test-res/empty-pmr.zip");

        FileObject bigDataPluginDir = createTestFolderWithContent(
                DistributedCacheUtil.PENTAHO_BIG_DATA_PLUGIN_FOLDER_NAME);
        FileObject samplePluginDir = createTestFolderWithContent("sample-plugin");

        Path root = new Path("bin/test/installKettleEnvironment");
        try {
            ch.installKettleEnvironment(pmrArchive, fs, root, bigDataPluginDir, Arrays.asList(samplePluginDir));
            assertTrue(ch.isKettleEnvironmentInstalledAt(fs, root));
            assertTrue(fs.exists(new Path(root, "plugins/sample-plugin")));
        } finally {
            bigDataPluginDir.delete(new AllFileSelector());
            samplePluginDir.delete(new AllFileSelector());
            fs.delete(root, true);
        }
    }

    @Test
    public void stagePluginsForCache() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        Path pluginsDir = new Path("bin/test/plugins-installation-dir");

        FileObject pluginDir = createTestFolderWithContent();

        try {
            ch.stagePluginsForCache(fs, pluginsDir, true, Arrays.asList(pluginDir));
            Path pluginInstallPath = new Path(pluginsDir, pluginDir.getURL().toURI().getPath());
            assertTrue(fs.exists(pluginInstallPath));
            ContentSummary summary = fs.getContentSummary(pluginInstallPath);
            assertEquals(3, summary.getFileCount());
            assertEquals(2, summary.getDirectoryCount());
        } finally {
            pluginDir.delete(new AllFileSelector());
            fs.delete(pluginsDir, true);
        }
    }

    @Test
    public void configureWithpmr() throws Exception {
        DistributedCacheUtil ch = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(conf);

        // This "empty pmr" contains a lib/ folder and some empty kettle-*.jar files but no actual content
        FileObject pmrArchive = KettleVFS.getFileObject("test-res/empty-pmr.zip");

        FileObject bigDataPluginDir = createTestFolderWithContent(
                DistributedCacheUtil.PENTAHO_BIG_DATA_PLUGIN_FOLDER_NAME);

        Path root = new Path("bin/test/installKettleEnvironment");
        try {
            ch.installKettleEnvironment(pmrArchive, fs, root, bigDataPluginDir, null);
            assertTrue(ch.isKettleEnvironmentInstalledAt(fs, root));

            ch.configureWithKettleEnvironment(conf, fs, root);

            // Make sure our libraries are on the classpath
            assertTrue(conf.get("mapred.cache.files").contains("lib/kettle-core.jar"));
            assertTrue(conf.get("mapred.cache.files").contains("lib/kettle-engine.jar"));
            assertTrue(conf.get("mapred.job.classpath.files").contains("lib/kettle-core.jar"));
            assertTrue(conf.get("mapred.job.classpath.files").contains("lib/kettle-engine.jar"));

            // Make sure our plugins folder is registered
            assertTrue(conf.get("mapred.cache.files").contains("#plugins"));

            // Make sure our libraries aren't included twice
            assertFalse(conf.get("mapred.cache.files").contains("#lib"));

            // We should not have individual files registered
            assertFalse(conf.get("mapred.cache.files").contains("pentaho-big-data-plugin/jar1.jar"));
            assertFalse(conf.get("mapred.cache.files").contains("pentaho-big-data-plugin/jar2.jar"));
            assertFalse(conf.get("mapred.cache.files").contains("pentaho-big-data-plugin/folder/file.txt"));

        } finally {
            bigDataPluginDir.delete(new AllFileSelector());
            fs.delete(root, true);
        }
    }

    @Test
    public void findPluginFolder() throws Exception {
        DistributedCacheUtil util = new DistributedCacheUtil();

        // Fake out the "plugins" directory for the project's root directory
        System.setProperty(Const.PLUGIN_BASE_FOLDERS_PROP, KettleVFS.getFileObject(".").getURL().toURI().getPath());

        assertNotNull("Should have found plugin dir: src/", util.findPluginFolder("src"));
        assertNotNull("Should be able to find nested plugin dir: src/org/", util.findPluginFolder("src/org"));

        assertNull("Should not have found plugin dir: src/org/", util.findPluginFolder("org"));
    }

    @Test
    public void addFilesToClassPath() throws IOException {
        DistributedCacheUtil util = new DistributedCacheUtil();
        Path p1 = new Path("/testing1");
        Path p2 = new Path("/testing2");
        Configuration conf = new Configuration();
        util.addFileToClassPath(p1, conf);
        util.addFileToClassPath(p2, conf);
        assertEquals("/testing1:/testing2", conf.get("mapred.job.classpath.files"));
    }

    @Test
    public void addFilesToClassPath_custom_path_separator() throws IOException {
        DistributedCacheUtil util = new DistributedCacheUtil();
        Path p1 = new Path("/testing1");
        Path p2 = new Path("/testing2");
        Configuration conf = new Configuration();

        System.setProperty("hadoop.cluster.path.separator", "J");

        util.addFileToClassPath(p1, conf);
        util.addFileToClassPath(p2, conf);
        assertEquals("/testing1J/testing2", conf.get("mapred.job.classpath.files"));

    }
}
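
Example

For context, the DistributedCacheUtil API exercised by these tests can be combined to stage a Kettle environment for a MapReduce job. The following sketch is illustrative only and relies solely on the method signatures visible in the tests above; the file system, archive name, plugin folder, and install path are hypothetical assumptions, not part of the original file.

import org.apache.commons.vfs.FileObject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtil;

public class StageKettleEnvironmentExample {
    public static void main(String[] args) throws Exception {
        DistributedCacheUtil util = new DistributedCacheUtil();

        Configuration conf = new Configuration();
        // Local file system for illustration; a real job would typically use FileSystem.get(conf).
        FileSystem fs = FileSystem.getLocal(conf);

        // Hypothetical locations for the pmr archive, the big data plugin folder, and the install root.
        FileObject pmrArchive = KettleVFS.getFileObject("pentaho-mapreduce.zip");
        FileObject bigDataPluginDir = KettleVFS.getFileObject("plugins/pentaho-big-data-plugin");
        Path root = new Path("/opt/pentaho/mapreduce");

        // Install the Kettle environment once, then point the job configuration at it.
        if (!util.isKettleEnvironmentInstalledAt(fs, root)) {
            util.installKettleEnvironment(pmrArchive, fs, root, bigDataPluginDir, null);
        }
        util.configureWithKettleEnvironment(conf, fs, root);
    }
}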