Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.hadoop; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.lang.reflect.Array; import java.nio.charset.StandardCharsets; import java.util.Arrays; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.security.authorize.ProxyUsers; import org.apache.hadoop.util.JarFinder; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.lucene.util.Constants; import org.apache.lucene.util.LuceneTestCase.AwaitsFix; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.cloud.AbstractZkTestCase; import org.apache.solr.hadoop.hack.MiniMRCluster; import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase; import org.apache.solr.util.BadHdfsThreadsFilter; import org.apache.solr.util.BadMrClusterThreadsFilter; import org.junit.After; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import com.carrotsearch.randomizedtesting.annotations.Nightly; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence; @ThreadLeakAction({ Action.WARN }) @ThreadLeakLingering(linger = 0) @ThreadLeakZombies(Consequence.CONTINUE) @ThreadLeakFilters(defaultFilters = true, filters = { BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s) }) @Slow @Nightly @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076") public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 { private static final boolean ENABLE_LOCAL_JOB_RUNNER = false; // for debugging only private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent(); private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents"; private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr"); private static String SEARCH_ARCHIVES_JAR; private static MiniDFSCluster dfsCluster = null; private static MiniMRCluster mrCluster = null; private static int numRuns = 0; private final String inputAvroFile; private final int count; private static String tempDir; private static File solrHomeDirectory; protected MapReduceIndexerTool createTool() { return new MapReduceIndexerTool(); } public MorphlineBasicMiniMRTest() { int data = random().nextInt(3); switch (data) { case 0: this.inputAvroFile = "sample-statuses-20120906-141433.avro"; this.count = 2; break; case 1: this.inputAvroFile = "sample-statuses-20120521-100919.avro"; this.count = 20; break; case 2: this.inputAvroFile = "sample-statuses-20120906-141433-medium.avro"; this.count = 2104; break; default: throw new RuntimeException("Test setup is broken"); } } @BeforeClass public static void setupClass() throws Exception { solrHomeDirectory = createTempDir().toFile(); assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs", Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false"))); assumeFalse("This test fails on Java 9 (https://issues.apache.org/jira/browse/SOLR-8876)", Constants.JRE_IS_MINIMUM_JAVA9); assumeFalse("FIXME: This test does not work with Windows because of native library requirements", Constants.WINDOWS); AbstractZkTestCase.SOLRHOME = solrHomeDirectory; FileUtils.copyDirectory(MINIMR_CONF_DIR, solrHomeDirectory); File dataDir = createTempDir().toFile(); tempDir = dataDir.getAbsolutePath(); new File(tempDir).mkdirs(); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true); System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath()); int taskTrackers = 1; int dataNodes = 2; // String proxyUser = System.getProperty("user.name"); // String proxyGroup = "g"; // StringBuilder sb = new StringBuilder(); // sb.append("127.0.0.1,localhost"); // for (InetAddress i : InetAddress.getAllByName(InetAddress.getLocalHost().getHostName())) { // sb.append(",").append(i.getCanonicalHostName()); // } new File(dataDir, "nm-local-dirs").mkdirs(); System.setProperty("solr.hdfs.blockcache.enabled", "false"); System.setProperty("test.build.dir", dataDir + File.separator + "hdfs" + File.separator + "test-build-dir"); System.setProperty("test.build.data", dataDir + File.separator + "hdfs" + File.separator + "build"); System.setProperty("test.cache.data", dataDir + File.separator + "hdfs" + File.separator + "cache"); // Initialize AFTER test.build.dir is set, JarFinder uses it. SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class); JobConf conf = new JobConf(); conf.set("dfs.block.access.token.enable", "false"); conf.set("dfs.permissions", "true"); conf.set("hadoop.security.authentication", "simple"); conf.set(YarnConfiguration.NM_LOCAL_DIRS, dataDir.getPath() + File.separator + "nm-local-dirs"); conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, dataDir + File.separator + "nm-logs"); conf.set("testWorkDir", dataDir.getPath() + File.separator + "testWorkDir"); conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false"); conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0"); dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null); FileSystem fileSystem = dfsCluster.getFileSystem(); fileSystem.mkdirs(new Path("/tmp")); fileSystem.mkdirs(new Path("/user")); fileSystem.mkdirs(new Path("/hadoop/mapred/system")); fileSystem.setPermission(new Path("/tmp"), FsPermission.valueOf("-rwxrwxrwx")); fileSystem.setPermission(new Path("/user"), FsPermission.valueOf("-rwxrwxrwx")); fileSystem.setPermission(new Path("/hadoop/mapred/system"), FsPermission.valueOf("-rwx------")); String nnURI = fileSystem.getUri().toString(); int numDirs = 1; String[] racks = null; String[] hosts = null; mrCluster = new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks, hosts, null, conf); ProxyUsers.refreshSuperUserGroupsConfiguration(conf); } @AfterClass public static void teardownClass() throws Exception { System.clearProperty("solr.hdfs.blockcache.enabled"); System.clearProperty("test.build.dir"); System.clearProperty("test.build.data"); System.clearProperty("test.cache.data"); if (mrCluster != null) { mrCluster.shutdown(); mrCluster = null; } if (dfsCluster != null) { dfsCluster.shutdown(); dfsCluster = null; } FileSystem.closeAll(); } @After public void tearDown() throws Exception { System.clearProperty("hadoop.log.dir"); System.clearProperty("solr.hdfs.blockcache.enabled"); super.tearDown(); } private JobConf getJobConf() { return mrCluster.createJobConf(); } @Test public void testPathParts() throws Exception { // see PathParts FileSystem fs = dfsCluster.getFileSystem(); int dfsClusterPort = fs.getWorkingDirectory().toUri().getPort(); assertTrue(dfsClusterPort > 0); JobConf jobConf = getJobConf(); Configuration simpleConf = new Configuration(); for (Configuration conf : Arrays.asList(jobConf, simpleConf)) { for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) { for (String up : Arrays.asList("", "../")) { String down = up.length() == 0 ? "foo/" : ""; String uploadURL = "hdfs://localhost:12345/user/foo/" + up + "bar.txt" + queryAndFragment; PathParts parts = new PathParts(uploadURL, conf); assertEquals(uploadURL, parts.getUploadURL()); assertEquals("/user/" + down + "bar.txt", parts.getURIPath()); assertEquals("bar.txt", parts.getName()); assertEquals("hdfs", parts.getScheme()); assertEquals("localhost", parts.getHost()); assertEquals(12345, parts.getPort()); assertEquals("hdfs://localhost:12345/user/" + down + "bar.txt", parts.getId()); assertEquals(parts.getId(), parts.getDownloadURL()); assertFileNotFound(parts); uploadURL = "hdfs://localhost/user/foo/" + up + "bar.txt" + queryAndFragment; parts = new PathParts(uploadURL, conf); assertEquals(uploadURL, parts.getUploadURL()); assertEquals("/user/" + down + "bar.txt", parts.getURIPath()); assertEquals("bar.txt", parts.getName()); assertEquals("hdfs", parts.getScheme()); assertEquals("localhost", parts.getHost()); assertEquals(8020, parts.getPort()); assertEquals("hdfs://localhost:8020/user/" + down + "bar.txt", parts.getId()); assertEquals(parts.getId(), parts.getDownloadURL()); assertFileNotFound(parts); } } } for (Configuration conf : Arrays.asList(jobConf)) { for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) { for (String up : Arrays.asList("", "../")) { // verify using absolute path String down = up.length() == 0 ? "foo/" : ""; String uploadURL = "/user/foo/" + up + "bar.txt" + queryAndFragment; PathParts parts = new PathParts(uploadURL, conf); assertEquals(uploadURL, parts.getUploadURL()); assertEquals("/user/" + down + "bar.txt", parts.getURIPath()); assertEquals("bar.txt", parts.getName()); assertEquals("hdfs", parts.getScheme()); assertTrue( "localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost())); assertEquals(dfsClusterPort, parts.getPort()); assertTrue( parts.getId().equals("hdfs://localhost:" + dfsClusterPort + "/user/" + down + "bar.txt") || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + "/user/" + down + "bar.txt")); assertFileNotFound(parts); // verify relative path is interpreted to be relative to user's home dir and resolved to an absolute path uploadURL = "xuser/foo/" + up + "bar.txt" + queryAndFragment; parts = new PathParts(uploadURL, conf); assertEquals(uploadURL, parts.getUploadURL()); String homeDir = "/user/" + System.getProperty("user.name"); assertEquals(homeDir + "/xuser/" + down + "bar.txt", parts.getURIPath()); assertEquals("bar.txt", parts.getName()); assertEquals("hdfs", parts.getScheme()); assertTrue( "localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost())); assertEquals(dfsClusterPort, parts.getPort()); assertTrue(parts.getId() .equals("hdfs://localhost:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt") || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")); assertFileNotFound(parts); } } } try { new PathParts("/user/foo/bar.txt", simpleConf); fail("host/port resolution requires minimr conf, not a simple conf"); } catch (IllegalArgumentException e) { ; // expected } } private void assertFileNotFound(PathParts parts) { try { parts.getFileSystem().getFileStatus(parts.getUploadPath()); fail(); } catch (IOException e) { ; // expected } } @Test public void mrRun() throws Exception { FileSystem fs = dfsCluster.getFileSystem(); Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input")); fs.delete(inDir, true); String DATADIR = "/user/testing/testMapperReducer/data"; Path dataDir = fs.makeQualified(new Path(DATADIR)); fs.delete(dataDir, true); Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output")); fs.delete(outDir, true); assertTrue(fs.mkdirs(inDir)); Path INPATH = new Path(inDir, "input.txt"); OutputStream os = fs.create(INPATH); Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8); wr.write(DATADIR + "/" + inputAvroFile); wr.close(); assertTrue(fs.mkdirs(dataDir)); fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir); JobConf jobConf = getJobConf(); jobConf.set("jobclient.output.filter", "ALL"); if (ENABLE_LOCAL_JOB_RUNNER) { // enable Hadoop LocalJobRunner; this enables to run in debugger and set breakpoints jobConf.set("mapred.job.tracker", "local"); } jobConf.setMaxMapAttempts(1); jobConf.setMaxReduceAttempts(1); jobConf.setJar(SEARCH_ARCHIVES_JAR); int shards = 2; int maxReducers = Integer.MAX_VALUE; if (ENABLE_LOCAL_JOB_RUNNER) { // local job runner has a couple of limitations: only one reducer is supported and the DistributedCache doesn't work. // see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/ maxReducers = 1; shards = 1; } String[] args = new String[] { "--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf", "--morphline-id=morphline1", "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(), "--output-dir=" + outDir.toString(), "--shards=" + shards, "--verbose", numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(), numRuns % 3 == 0 ? "--reducers=" + shards : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers)) }; if (numRuns % 3 == 2) { args = concat(args, new String[] { "--fanout=2" }); } if (numRuns == 0) { // force (slow) MapReduce based randomization to get coverage for that as well args = concat(new String[] { "-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1" }, args); } MapReduceIndexerTool tool = createTool(); int res = ToolRunner.run(jobConf, tool, args); assertEquals(0, res); Job job = tool.job; assertTrue(job.isComplete()); assertTrue(job.isSuccessful()); if (numRuns % 3 != 2) { // Only run this check if mtree merge is disabled. // With mtree merge enabled the BatchWriter counters aren't available anymore because // variable "job" now refers to the merge job rather than the indexing job assertEquals( "Invalid counter " + SolrRecordWriter.class.getName() + "." + SolrCounters.DOCUMENTS_WRITTEN, count, job.getCounters() .findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString()) .getValue()); } // Check the output is as expected outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR); Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir)); System.out.println("outputfiles:" + Arrays.toString(outputFiles)); UtilsForTests.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards); // run again with --dryrun mode: tool = createTool(); args = concat(args, new String[] { "--dry-run" }); res = ToolRunner.run(jobConf, tool, args); assertEquals(0, res); numRuns++; } protected static <T> T[] concat(T[]... arrays) { if (arrays.length <= 0) { throw new IllegalArgumentException(); } Class clazz = null; int length = 0; for (T[] array : arrays) { clazz = array.getClass(); length += array.length; } T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length); int pos = 0; for (T[] array : arrays) { System.arraycopy(array, 0, result, pos, array.length); pos += array.length; } return result; } }