org.apache.solr.hadoop.MorphlineBasicMiniMRTest.java Source code

Introduction

Here is the source code for org.apache.solr.hadoop.MorphlineBasicMiniMRTest.java. The class is a Solr test case that starts a MiniDFSCluster and a MiniMRCluster, stages sample Avro files in HDFS, runs MapReduceIndexerTool as a MapReduce job, and verifies the job counters and the number of documents written per shard.
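
For orientation, the core of the test is a call to MapReduceIndexerTool through Hadoop's ToolRunner, roughly like the sketch below. The flag values shown here are illustrative placeholders; the real values are computed in mrRun() from the mini-cluster paths and the run counter.

JobConf jobConf = mrCluster.createJobConf();
String[] args = {
    "--morphline-file=/tmp/test-morphlines/solrCellDocumentTypes.conf", // placeholder path
    "--morphline-id=morphline1",
    "--solr-home-dir=/path/to/solr/minimr",                             // placeholder path
    "--output-dir=hdfs://localhost:8020/user/testing/output",           // placeholder URI
    "--shards=2",
    "--verbose",
    "--input-list=hdfs://localhost:8020/user/testing/input.txt"         // placeholder URI
};
int exitCode = ToolRunner.run(jobConf, new MapReduceIndexerTool(), args); // 0 indicates success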

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.reflect.Array;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.authorize.ProxyUsers;
import org.apache.hadoop.util.JarFinder;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.hack.MiniMRCluster;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.apache.solr.util.BadMrClusterThreadsFilter;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;

@ThreadLeakAction({ Action.WARN })
@ThreadLeakLingering(linger = 0)
@ThreadLeakZombies(Consequence.CONTINUE)
@ThreadLeakFilters(defaultFilters = true, filters = {
        BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s)
})
@Slow
@Nightly
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076")
public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 {

    private static final boolean ENABLE_LOCAL_JOB_RUNNER = false; // for debugging only
    private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
    private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
    private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr");

    private static String SEARCH_ARCHIVES_JAR;

    private static MiniDFSCluster dfsCluster = null;
    private static MiniMRCluster mrCluster = null;
    private static int numRuns = 0;

    private final String inputAvroFile;
    private final int count;

    private static String tempDir;

    private static File solrHomeDirectory;

    protected MapReduceIndexerTool createTool() {
        return new MapReduceIndexerTool();
    }

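    // Randomly picks one of three sample Avro status files as the test input, together with
    // the number of records it contains (checked later against the DOCUMENTS_WRITTEN counter).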
    public MorphlineBasicMiniMRTest() {
        int data = random().nextInt(3);
        switch (data) {
        case 0:
            this.inputAvroFile = "sample-statuses-20120906-141433.avro";
            this.count = 2;
            break;
        case 1:
            this.inputAvroFile = "sample-statuses-20120521-100919.avro";
            this.count = 20;
            break;
        case 2:
            this.inputAvroFile = "sample-statuses-20120906-141433-medium.avro";
            this.count = 2104;
            break;
        default:
            throw new RuntimeException("Test setup is broken");
        }

    }

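    // One-time setup: copies the minimr Solr config into a temporary Solr home, generates the
    // morphline configuration, and starts a two-datanode MiniDFSCluster plus a single-tracker
    // MiniMRCluster that are shared by all tests in this class.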
    @BeforeClass
    public static void setupClass() throws Exception {
        solrHomeDirectory = createTempDir().toFile();

        assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
                Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));

        assumeFalse("This test fails on Java 9 (https://issues.apache.org/jira/browse/SOLR-8876)",
                Constants.JRE_IS_MINIMUM_JAVA9);
        assumeFalse("FIXME: This test does not work with Windows because of native library requirements",
                Constants.WINDOWS);

        AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
        FileUtils.copyDirectory(MINIMR_CONF_DIR, solrHomeDirectory);
        File dataDir = createTempDir().toFile();
        tempDir = dataDir.getAbsolutePath();
        new File(tempDir).mkdirs();
        FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"),
                new File(tempDir + "/custom-mimetypes.xml"));

        AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);

        System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());

        int taskTrackers = 1;
        int dataNodes = 2;
        //    String proxyUser = System.getProperty("user.name");
        //    String proxyGroup = "g";
        //    StringBuilder sb = new StringBuilder();
        //    sb.append("127.0.0.1,localhost");
        //    for (InetAddress i : InetAddress.getAllByName(InetAddress.getLocalHost().getHostName())) {
        //      sb.append(",").append(i.getCanonicalHostName());
        //    }

        new File(dataDir, "nm-local-dirs").mkdirs();

        System.setProperty("solr.hdfs.blockcache.enabled", "false");

        System.setProperty("test.build.dir", dataDir + File.separator + "hdfs" + File.separator + "test-build-dir");
        System.setProperty("test.build.data", dataDir + File.separator + "hdfs" + File.separator + "build");
        System.setProperty("test.cache.data", dataDir + File.separator + "hdfs" + File.separator + "cache");

        // Initialize AFTER test.build.dir is set, JarFinder uses it.
        SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class);

        JobConf conf = new JobConf();
        conf.set("dfs.block.access.token.enable", "false");
        conf.set("dfs.permissions", "true");
        conf.set("hadoop.security.authentication", "simple");
        conf.set(YarnConfiguration.NM_LOCAL_DIRS, dataDir.getPath() + File.separator + "nm-local-dirs");
        conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, dataDir + File.separator + "nm-logs");
        conf.set("testWorkDir", dataDir.getPath() + File.separator + "testWorkDir");
        conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false");
        conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0");

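        // Start HDFS first and pre-create /tmp, /user and the mapred system directory with the
        // permissions the MapReduce cluster expects; the MR cluster below is pointed at this NameNode.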
        dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
        FileSystem fileSystem = dfsCluster.getFileSystem();
        fileSystem.mkdirs(new Path("/tmp"));
        fileSystem.mkdirs(new Path("/user"));
        fileSystem.mkdirs(new Path("/hadoop/mapred/system"));
        fileSystem.setPermission(new Path("/tmp"), FsPermission.valueOf("-rwxrwxrwx"));
        fileSystem.setPermission(new Path("/user"), FsPermission.valueOf("-rwxrwxrwx"));
        fileSystem.setPermission(new Path("/hadoop/mapred/system"), FsPermission.valueOf("-rwx------"));
        String nnURI = fileSystem.getUri().toString();
        int numDirs = 1;
        String[] racks = null;
        String[] hosts = null;

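        // Start the MapReduce mini cluster against the NameNode URI of the HDFS cluster started above.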
        mrCluster = new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks, hosts, null, conf);
        ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
    }

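    // Shuts down the mini clusters and clears the system properties set in setupClass().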
    @AfterClass
    public static void teardownClass() throws Exception {
        System.clearProperty("solr.hdfs.blockcache.enabled");
        System.clearProperty("test.build.dir");
        System.clearProperty("test.build.data");
        System.clearProperty("test.cache.data");

        if (mrCluster != null) {
            mrCluster.shutdown();
            mrCluster = null;
        }
        if (dfsCluster != null) {
            dfsCluster.shutdown();
            dfsCluster = null;
        }

        FileSystem.closeAll();
    }

    @After
    public void tearDown() throws Exception {
        System.clearProperty("hadoop.log.dir");
        System.clearProperty("solr.hdfs.blockcache.enabled");

        super.tearDown();
    }

    private JobConf getJobConf() {
        return mrCluster.createJobConf();
    }

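    // Exercises PathParts URL parsing: fully qualified hdfs:// URLs with and without an explicit
    // port, absolute paths, and relative paths resolved against the user's HDFS home directory.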
    @Test
    public void testPathParts() throws Exception { // see PathParts
        FileSystem fs = dfsCluster.getFileSystem();
        int dfsClusterPort = fs.getWorkingDirectory().toUri().getPort();
        assertTrue(dfsClusterPort > 0);
        JobConf jobConf = getJobConf();
        Configuration simpleConf = new Configuration();

        for (Configuration conf : Arrays.asList(jobConf, simpleConf)) {
            for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
                for (String up : Arrays.asList("", "../")) {
                    String down = up.length() == 0 ? "foo/" : "";
                    String uploadURL = "hdfs://localhost:12345/user/foo/" + up + "bar.txt" + queryAndFragment;
                    PathParts parts = new PathParts(uploadURL, conf);
                    assertEquals(uploadURL, parts.getUploadURL());
                    assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
                    assertEquals("bar.txt", parts.getName());
                    assertEquals("hdfs", parts.getScheme());
                    assertEquals("localhost", parts.getHost());
                    assertEquals(12345, parts.getPort());
                    assertEquals("hdfs://localhost:12345/user/" + down + "bar.txt", parts.getId());
                    assertEquals(parts.getId(), parts.getDownloadURL());
                    assertFileNotFound(parts);

                    uploadURL = "hdfs://localhost/user/foo/" + up + "bar.txt" + queryAndFragment;
                    parts = new PathParts(uploadURL, conf);
                    assertEquals(uploadURL, parts.getUploadURL());
                    assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
                    assertEquals("bar.txt", parts.getName());
                    assertEquals("hdfs", parts.getScheme());
                    assertEquals("localhost", parts.getHost());
                    assertEquals(8020, parts.getPort());
                    assertEquals("hdfs://localhost:8020/user/" + down + "bar.txt", parts.getId());
                    assertEquals(parts.getId(), parts.getDownloadURL());
                    assertFileNotFound(parts);
                }
            }
        }

        for (Configuration conf : Arrays.asList(jobConf)) {
            for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
                for (String up : Arrays.asList("", "../")) {
                    // verify using absolute path
                    String down = up.length() == 0 ? "foo/" : "";
                    String uploadURL = "/user/foo/" + up + "bar.txt" + queryAndFragment;
                    PathParts parts = new PathParts(uploadURL, conf);
                    assertEquals(uploadURL, parts.getUploadURL());
                    assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
                    assertEquals("bar.txt", parts.getName());
                    assertEquals("hdfs", parts.getScheme());
                    assertTrue(
                            "localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
                    assertEquals(dfsClusterPort, parts.getPort());
                    assertTrue(
                            parts.getId().equals("hdfs://localhost:" + dfsClusterPort + "/user/" + down + "bar.txt")
                                    || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort
                                            + "/user/" + down + "bar.txt"));
                    assertFileNotFound(parts);

                    // verify relative path is interpreted to be relative to user's home dir and resolved to an absolute path
                    uploadURL = "xuser/foo/" + up + "bar.txt" + queryAndFragment;
                    parts = new PathParts(uploadURL, conf);
                    assertEquals(uploadURL, parts.getUploadURL());
                    String homeDir = "/user/" + System.getProperty("user.name");
                    assertEquals(homeDir + "/xuser/" + down + "bar.txt", parts.getURIPath());
                    assertEquals("bar.txt", parts.getName());
                    assertEquals("hdfs", parts.getScheme());
                    assertTrue(
                            "localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
                    assertEquals(dfsClusterPort, parts.getPort());
                    assertTrue(parts.getId()
                            .equals("hdfs://localhost:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")
                            || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + homeDir
                                    + "/xuser/" + down + "bar.txt"));
                    assertFileNotFound(parts);
                }
            }
        }

        try {
            new PathParts("/user/foo/bar.txt", simpleConf);
            fail("host/port resolution requires minimr conf, not a simple conf");
        } catch (IllegalArgumentException e) {
            ; // expected
        }
    }

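    // The upload path is never created in HDFS, so getFileStatus() is expected to fail.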
    private void assertFileNotFound(PathParts parts) {
        try {
            parts.getFileSystem().getFileStatus(parts.getUploadPath());
            fail();
        } catch (IOException e) {
            ; // expected
        }
    }

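    // End-to-end run: stages an Avro input file in HDFS, runs MapReduceIndexerTool as a MapReduce
    // job via ToolRunner, verifies the job counters and the per-shard document counts in the output
    // directory, and finally repeats the same command with --dry-run.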
    @Test
    public void mrRun() throws Exception {
        FileSystem fs = dfsCluster.getFileSystem();
        Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
        fs.delete(inDir, true);
        String DATADIR = "/user/testing/testMapperReducer/data";
        Path dataDir = fs.makeQualified(new Path(DATADIR));
        fs.delete(dataDir, true);
        Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
        fs.delete(outDir, true);

        assertTrue(fs.mkdirs(inDir));
        Path INPATH = new Path(inDir, "input.txt");
        OutputStream os = fs.create(INPATH);
        Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
        wr.write(DATADIR + "/" + inputAvroFile);
        wr.close();

        assertTrue(fs.mkdirs(dataDir));
        fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir);

        JobConf jobConf = getJobConf();
        jobConf.set("jobclient.output.filter", "ALL");
        if (ENABLE_LOCAL_JOB_RUNNER) { // enable Hadoop LocalJobRunner; this makes it possible to run the job in a debugger and set breakpoints
            jobConf.set("mapred.job.tracker", "local");
        }
        jobConf.setMaxMapAttempts(1);
        jobConf.setMaxReduceAttempts(1);
        jobConf.setJar(SEARCH_ARCHIVES_JAR);

        int shards = 2;
        int maxReducers = Integer.MAX_VALUE;
        if (ENABLE_LOCAL_JOB_RUNNER) {
            // local job runner has a couple of limitations: only one reducer is supported and the DistributedCache doesn't work.
            // see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/
            maxReducers = 1;
            shards = 1;
        }

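        // Build the tool's command line; alternate between --input-list and a plain input directory,
        // and vary the reducer/fanout settings across runs to cover different merge code paths.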
        String[] args = new String[] {
                "--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
                "--morphline-id=morphline1", "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
                "--output-dir=" + outDir.toString(), "--shards=" + shards, "--verbose",
                numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
                numRuns % 3 == 0 ? "--reducers=" + shards
                        : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers)) };
        if (numRuns % 3 == 2) {
            args = concat(args, new String[] { "--fanout=2" });
        }
        if (numRuns == 0) {
            // force (slow) MapReduce based randomization to get coverage for that as well
            args = concat(new String[] { "-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1" },
                    args);
        }
        MapReduceIndexerTool tool = createTool();
        int res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        Job job = tool.job;
        assertTrue(job.isComplete());
        assertTrue(job.isSuccessful());

        if (numRuns % 3 != 2) {
            // Only run this check if mtree merge is disabled.
            // With mtree merge enabled the BatchWriter counters aren't available anymore because 
            // variable "job" now refers to the merge job rather than the indexing job
            assertEquals(
                    "Invalid counter " + SolrRecordWriter.class.getName() + "." + SolrCounters.DOCUMENTS_WRITTEN,
                    count,
                    job.getCounters()
                            .findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString())
                            .getValue());
        }

        // Check the output is as expected
        outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR);
        Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir));

        System.out.println("outputfiles:" + Arrays.toString(outputFiles));

        UtilsForTests.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards);

        // run again in --dry-run mode:
        tool = createTool();
        args = concat(args, new String[] { "--dry-run" });
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);

        numRuns++;
    }

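    // Concatenates the given arrays into a single new array of the same component type.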
    protected static <T> T[] concat(T[]... arrays) {
        if (arrays.length <= 0) {
            throw new IllegalArgumentException();
        }
        Class<?> clazz = null;
        int length = 0;
        for (T[] array : arrays) {
            clazz = array.getClass();
            length += array.length;
        }
        @SuppressWarnings("unchecked")
        T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length);
        int pos = 0;
        for (T[] array : arrays) {
            System.arraycopy(array, 0, result, pos, array.length);
            pos += array.length;
        }
        return result;
    }

}