org.apache.pig.test.TestJobControlCompiler.java Source code

Introduction

Here is the source code for org.apache.pig.test.TestJobControlCompiler.java
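
The class is a JUnit test for Apache Pig's JobControlCompiler. It verifies that REGISTERed jars and streaming ship/cache files land on Hadoop's distributed cache exactly once, and that the number of reducers for a job is estimated from its total input size. Every test follows the same basic pattern: build a small MROperPlan, compile it, and inspect the JobConf of the resulting job. Here is a condensed sketch of that pattern, using only classes that appear in the listing below (the jar path is a hypothetical stand-in for a jar REGISTERed by a script):

PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
PigContext pigContext = pigServer.getPigContext();
pigContext.connect();
pigContext.addJar("/path/to/udf.jar");  // hypothetical jar path

JobControlCompiler compiler = new JobControlCompiler(pigContext, new Configuration());
MROperPlan plan = new MROperPlan();
plan.add(new MapReduceOper(new OperatorKey()));

JobControl jobControl = compiler.compile(plan, "test");
JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();
// the registered jar should now appear on the job's file classpath
Path[] classpath = DistributedCache.getFileClassPaths(jobConf);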

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.jar.JarOutputStream;
import java.util.zip.ZipEntry;

import javax.tools.JavaCompiler;
import javax.tools.JavaCompiler.CompilationTask;
import javax.tools.JavaFileObject;
import javax.tools.StandardJavaFileManager;
import javax.tools.ToolProvider;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.ExecType;
import org.apache.pig.FuncSpec;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.executionengine.JobCreationException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceOper;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROperPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.OperatorKey;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

public class TestJobControlCompiler {

    private static final Configuration CONF = new Configuration();

    @BeforeClass
    public static void setupClass() throws Exception {
        // create a hadoop-site.xml and make it visible to Pig, keeping it at the
        // same location as the other tests so that a conf left over from a
        // previous test is not picked up
        File conf_dir = new File("build/classes");
        File hadoopSite = new File(conf_dir, "hadoop-site.xml");
        hadoopSite.deleteOnExit();
        FileWriter fw = new FileWriter(hadoopSite);
        try {
            fw.write("<?xml version=\"1.0\"?>\n");
            fw.write("<?xml-stylesheet type=\"text/xsl\" href=\"nutch-conf.xsl\"?>\n");
            fw.write("<configuration>\n");
            fw.write("</configuration>\n");
        } finally {
            fw.close();
        }
        // make hadoop-site.xml visible to Pig, which requires one when running
        // in mapred mode
        Thread.currentThread().setContextClassLoader(new URLClassLoader(new URL[] { conf_dir.toURI().toURL() }));
    }

    /**
     * specifically tests that REGISTERED jars get added to distributed cache
     * @throws Exception
     */
    @Test
    public void testJarAddedToDistributedCache() throws Exception {

        // creating a jar with a UDF *not* in the current classloader
        File tmpFile = File.createTempFile("Some_", ".jar");
        tmpFile.deleteOnExit();
        String className = createTestJar(tmpFile);
        final String testUDFFileName = className + ".class";

        // JobControlCompiler setup
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
        PigContext pigContext = pigServer.getPigContext();
        pigContext.connect();
        pigContext.addJar(tmpFile.getAbsolutePath());
        JobControlCompiler jobControlCompiler = new JobControlCompiler(pigContext, CONF);
        MROperPlan plan = new MROperPlan();
        MapReduceOper mro = new MapReduceOper(new OperatorKey());
        mro.UDFs = new HashSet<String>();
        mro.UDFs.add(className + "()");
        plan.add(mro);

        // compiling the job
        JobControl jobControl = jobControlCompiler.compile(plan, "test");
        JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();

        // verifying the jar gets on distributed cache
        Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
        // guava jar is not shipped with Hadoop 2.x
        Assert.assertEquals("size for " + Arrays.toString(fileClassPaths), HadoopShims.isHadoopYARN() ? 5 : 6,
                fileClassPaths.length);
        Path distributedCachePath = fileClassPaths[0];
        Assert.assertEquals("ends with jar name: " + distributedCachePath, distributedCachePath.getName(),
                tmpFile.getName());
        // a hadoop bug requires the path to not contain an hdfs://hostname prefix
        Assert.assertTrue("starts with /: " + distributedCachePath,
                distributedCachePath.toString().startsWith("/"));
        Assert.assertTrue("jar pushed to distributed cache should contain testUDF",
                jarContainsFileNamed(new File(fileClassPaths[0].toUri().getPath()), testUDFFileName));
    }

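    /**
     * Creates one empty temp file per extension and returns them in order.
     */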
    private static List<File> createFiles(String... extensions) throws IOException {
        List<File> files = new ArrayList<File>();
        for (String extension : extensions) {
            File file = File.createTempFile("file", extension);
            file.deleteOnExit();
            files.add(file);
        }
        return files;
    }

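    /**
     * Asserts that exactly {@code size} entries are present and that the i-th
     * entry ends with the i-th extension.
     */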
    private static void assertFilesInDistributedCache(URI[] uris, int size, String... extensions) {
        Assert.assertEquals(size, uris.length);
        for (int i = 0; i < uris.length; i++) {
            Assert.assertTrue(uris[i].toString().endsWith(extensions[i]));
        }
    }

    @Test
    public void testAddArchiveToDistributedCache() throws IOException {
        final File textFile = File.createTempFile("file", ".txt");
        textFile.deleteOnExit();

        final List<File> zipArchives = createFiles(".zip");
        zipArchives.add(textFile);
        final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

        final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
        final PigContext pigContext = pigServer.getPigContext();
        pigContext.connect();
        pigContext.getProperties().put("pig.streaming.ship.files", StringUtils.join(zipArchives, ","));
        pigContext.getProperties().put("pig.streaming.cache.files", StringUtils.join(tarArchives, ","));

        final JobConf jobConf = compileTestJob(pigContext, CONF);

        URI[] uris = DistributedCache.getCacheFiles(jobConf);
        int sizeTxt = 0;
        for (int i = 0; i < uris.length; i++) {
            if (uris[i].toString().endsWith(".txt")) {
                sizeTxt++;
            }
        }
        Assert.assertEquals(1, sizeTxt);
        assertFilesInDistributedCache(DistributedCache.getCacheArchives(jobConf), 4, ".zip", ".tgz", ".tar.gz",
                ".tar");
    }

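    /**
     * Compiles a trivial single-operator MROperPlan with the given context and
     * configuration and returns the JobConf of the resulting waiting job.
     */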
    private JobConf compileTestJob(final PigContext pigContext, Configuration conf) throws JobCreationException {
        final JobControlCompiler jobControlCompiler = new JobControlCompiler(pigContext, conf);

        final MROperPlan plan = new MROperPlan();
        plan.add(new MapReduceOper(new OperatorKey()));

        final JobControl jobControl = jobControlCompiler.compile(plan, "test");
        final JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();
        return jobConf;
    }

    /**
     * Tests that no duplicate jars are added to the distributed cache (which might
     * cause conflicts), covering both symlinked and plain jar specifications
     */
    @Test
    public void testNoDuplicateJarsInDistributedCache() throws Exception {

        // JobControlCompiler setup
        final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
        PigContext pigContext = pigServer.getPigContext();
        pigContext.connect();

        Configuration conf = new Configuration();
        DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf-0.jar#udf.jar")), conf,
                FileSystem.get(conf));
        DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf1.jar#diffname.jar")), conf,
                FileSystem.get(conf));
        DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf2.jar")), conf, FileSystem.get(conf));
        createAndAddResource("udf.jar", pigContext);
        createAndAddResource("udf1.jar", pigContext);
        createAndAddResource("udf2.jar", pigContext);
        createAndAddResource("another.jar", pigContext);

        final JobConf jobConf = compileTestJob(pigContext, conf);

        // verifying the jar gets on distributed cache
        URI[] cacheURIs = DistributedCache.getCacheFiles(jobConf);
        Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
        // expected: 1. udf.jar#udf.jar, 2. udf1.jar#diffname.jar, 3. udf2.jar (the
        // same path added twice), 4. another.jar and 5. udf1.jar, but no duplicate udf.jar
        System.out.println("cache.files= " + Arrays.toString(cacheURIs));
        System.out.println("classpath.files= " + Arrays.toString(fileClassPaths));
        if (HadoopShims.isHadoopYARN()) {
            // Default jars - 4 (pig, antlr, joda-time, automaton)
            // Other jars - 5 (udf.jar#udf.jar, udf1.jar#diffname.jar, udf2.jar, udf1.jar, another.jar)
            Assert.assertEquals("size 9 for " + Arrays.toString(cacheURIs), 9,
                    Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
            Assert.assertEquals("size 9 for " + Arrays.toString(fileClassPaths), 9,
                    Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
        } else {
            // Default jars - 5 (guava in addition to the four above)
            // udf.jar and udf2.jar each appear twice, for 5 + 5 + 2 = 12 entries
            Assert.assertEquals("size 12 for " + Arrays.toString(cacheURIs), 12,
                    Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
            Assert.assertEquals("size 12 for " + Arrays.toString(fileClassPaths), 12,
                    Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
        }

        // Count occurrences of the resources
        Map<String, Integer> occurrences = new HashMap<String, Integer>();

        for (URI cacheURI : cacheURIs) {
            Integer val = occurrences.get(cacheURI.toString());
            val = (val == null) ? 1 : val + 1;
            occurrences.put(cacheURI.toString(), val);
        }
        if (HadoopShims.isHadoopYARN()) {
            Assert.assertEquals(9, occurrences.size());
        } else {
            Assert.assertEquals(10, occurrences.size()); //guava jar in addition
        }

        for (String file : occurrences.keySet()) {
            if (!HadoopShims.isHadoopYARN() && (file.endsWith("udf.jar") || file.endsWith("udf2.jar"))) {
                // The same path was added twice, which is fine; it should not be
                // shipped to an HDFS temp path. We assert the paths are identical by
                // checking the count
                Assert.assertEquals("Two occurrences for " + file, 2, (int) occurrences.get(file));
            } else {
                // check that there is only a single occurrence even though the jar was
                // added once to the dist cache (simulating Oozie) and a second time
                // through Pig's register jar when there is a symlink
                Assert.assertEquals("One occurrence for " + file, 1, (int) occurrences.get(file));
            }
        }
    }

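    /**
     * Creates an empty file with the given name in the working directory and
     * registers it with the PigContext as a jar.
     */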
    private File createAndAddResource(String name, PigContext pigContext) throws IOException {
        File f = new File(name);
        f.createNewFile();
        f.deleteOnExit();
        pigContext.addJar(name);
        return f;
    }

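    /*
     * The input sizes below straddle multiples of the bytes-per-reducer
     * threshold (assuming the default pig.exec.reducers.bytes.per.reducer of
     * 10^9 bytes), so the estimate behaves like ceil(inputSize / 10^9):
     *   1,998,000,000 bytes -> 2 reducers
     *   2,000,000,000 bytes -> 2 reducers
     *   2,002,000,000 bytes -> 3 reducers
     */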
    @Test
    public void testEstimateNumberOfReducers() throws Exception {
        Assert.assertEquals(2, JobControlCompiler.estimateNumberOfReducers(new Job(CONF),
                createMockPOLoadMapReduceOper(2L * 1000 * 1000 * 999)));

        Assert.assertEquals(2, JobControlCompiler.estimateNumberOfReducers(new Job(CONF),
                createMockPOLoadMapReduceOper(2L * 1000 * 1000 * 1000)));

        Assert.assertEquals(3, JobControlCompiler.estimateNumberOfReducers(new Job(CONF),
                createMockPOLoadMapReduceOper(2L * 1000 * 1000 * 1001)));
    }

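    /**
     * Builds a MapReduceOper whose map plan contains a single POLoad backed by
     * a temp file of the given size, so reducer estimation sees that many bytes.
     */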
    private static MapReduceOper createMockPOLoadMapReduceOper(long size) throws Exception {
        MapReduceOper mro = new MapReduceOper(new OperatorKey());
        mro.mapPlan.add(createPOLoadWithSize(size, new PigStorage()));
        return mro;
    }

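    /**
     * Creates a POLoad over a temp file padded to exactly {@code size} bytes
     * using the given load function.
     */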
    public static POLoad createPOLoadWithSize(long size, LoadFunc loadFunc) throws Exception {
        File file = File.createTempFile("tempFile", ".tmp");
        file.deleteOnExit();
        RandomAccessFile f = new RandomAccessFile(file, "rw");
        f.setLength(size);
        f.close();

        loadFunc.setLocation(file.getAbsolutePath(), new Job(CONF));
        FuncSpec funcSpec = new FuncSpec(loadFunc.getClass().getCanonicalName());
        POLoad poLoad = new POLoad(new OperatorKey(), loadFunc);
        poLoad.setLFile(new FileSpec(file.getAbsolutePath(), funcSpec));
        poLoad.setPc(new PigContext());
        poLoad.setUp();

        return poLoad;
    }

    /**
     * checks if the given file name is in the jar
     * @param jarFile the jar to check
     * @param name the name to find (full path in the jar)
     * @return true if the name was found
     * @throws IOException
     */
    private boolean jarContainsFileNamed(File jarFile, String name) throws IOException {
        // open the jar explicitly so it can be closed and does not leak a file handle
        JarFile jar = new JarFile(jarFile);
        try {
            Enumeration<JarEntry> entries = jar.entries();
            while (entries.hasMoreElements()) {
                JarEntry entry = entries.nextElement();
                if (entry.getName().equals(name)) {
                    return true;
                }
            }
            return false;
        } finally {
            jar.close();
        }
    }

    /**
     * creates a jar containing a UDF not in the current classloader
     * @param jarFile the jar to create
     * @return the name of the class created (in the default package)
     * @throws IOException
     * @throws FileNotFoundException
     */
    private String createTestJar(File jarFile) throws IOException, FileNotFoundException {

        // creating the source .java file
        File javaFile = File.createTempFile("TestUDF", ".java");
        javaFile.deleteOnExit();
        String className = javaFile.getName().substring(0, javaFile.getName().lastIndexOf('.'));
        FileWriter fw = new FileWriter(javaFile);
        try {
            fw.write("import org.apache.pig.EvalFunc;\n");
            fw.write("import org.apache.pig.data.Tuple;\n");
            fw.write("import java.io.IOException;\n");
            fw.write("public class " + className + " extends EvalFunc<String> {\n");
            fw.write("  public String exec(Tuple input) throws IOException {\n");
            fw.write("    return \"test\";\n");
            fw.write("  }\n");
            fw.write("}\n");
        } finally {
            fw.close();
        }

        // compiling it
        JavaCompiler compiler = ToolProvider.getSystemJavaCompiler();
        StandardJavaFileManager fileManager = compiler.getStandardFileManager(null, null, null);
        Iterable<? extends JavaFileObject> compilationUnits1 = fileManager.getJavaFileObjects(javaFile);
        CompilationTask task = compiler.getTask(null, fileManager, null, null, null, compilationUnits1);
        Boolean success = task.call();
        fileManager.close();
        Assert.assertTrue("compilation of " + javaFile + " failed", success);

        // here is the compiled file
        File classFile = new File(javaFile.getParentFile(), className + ".class");
        Assert.assertTrue(classFile.exists());

        // putting it in the jar
        JarOutputStream jos = new JarOutputStream(new FileOutputStream(jarFile));
        try {
            jos.putNextEntry(new ZipEntry(classFile.getName()));
            try {
                InputStream testClassContentIS = new FileInputStream(classFile);
                try {
                    byte[] buffer = new byte[64000];
                    int n;
                    while ((n = testClassContentIS.read(buffer)) != -1) {
                        jos.write(buffer, 0, n);
                    }
                } finally {
                    testClassContentIS.close();
                }
            } finally {
                jos.closeEntry();
            }
        } finally {
            jos.close();
        }

        return className;
    }
}