/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.hadoop;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.split.JobSplit;
import org.apache.hadoop.mapreduce.split.SplitMetaInfoReader;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.TaskLocationHint;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Verifies that {@link MRInputHelpers#configureMRInputWithLegacySplitGeneration}
 * writes the split and split-metainfo files to the requested directory, exposes
 * them as local resources, and produces matching task location hints, for both
 * the new (mapreduce) and old (mapred) InputFormat APIs.
 */
public class TestMRInputHelpers {

  protected static MiniDFSCluster dfsCluster;

  private static Configuration conf = new Configuration();
  private static FileSystem remoteFs;
  private static Path testFilePath;
  private static Path oldSplitsDir;
  private static Path newSplitsDir;

  private static String TEST_ROOT_DIR = "target" + Path.SEPARATOR
      + TestMRHelpers.class.getName() + "-tmpDir";

  @BeforeClass
  public static void setup() throws IOException {
    // Bring up a two-datanode MiniDFSCluster to back the split-generation tests.
    try {
      conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, TEST_ROOT_DIR);
      dfsCluster = new MiniDFSCluster.Builder(conf)
          .numDataNodes(2).format(true).racks(null).build();
      remoteFs = dfsCluster.getFileSystem();
    } catch (IOException io) {
      throw new RuntimeException("problem starting mini dfs cluster", io);
    }

    // Dump the cluster configuration to a local XML file, then copy it into
    // HDFS so the InputFormats have a non-empty file to split.
    Configuration testConf = new YarnConfiguration(dfsCluster.getFileSystem().getConf());
    File testConfFile = new File(TEST_ROOT_DIR, "test.xml");
    try {
      testConfFile.createNewFile();
      try (FileOutputStream out = new FileOutputStream(testConfFile)) {
        testConf.writeXml(out);
      }
      testConfFile.deleteOnExit();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    remoteFs.mkdirs(new Path("/tmp/input/"));
    remoteFs.mkdirs(new Path("/tmp/splitsDirNew/"));
    remoteFs.mkdirs(new Path("/tmp/splitsDirOld/"));
    testFilePath = remoteFs.makeQualified(new Path("/tmp/input/test.xml"));
    remoteFs.copyFromLocalFile(new Path(testConfFile.getAbsolutePath()), testFilePath);
    FileStatus fsStatus = remoteFs.getFileStatus(testFilePath);
    Assert.assertTrue(fsStatus.getLen() > 0);

    oldSplitsDir = remoteFs.makeQualified(new Path("/tmp/splitsDirOld/"));
    newSplitsDir = remoteFs.makeQualified(new Path("/tmp/splitsDirNew/"));
  }

  @Test(timeout = 5000)
  public void testNewSplitsGen() throws Exception {
    DataSourceDescriptor dataSource = generateDataSourceDescriptorMapReduce(newSplitsDir);

    Assert.assertTrue(dataSource.getAdditionalLocalFiles()
        .containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
    Assert.assertTrue(dataSource.getAdditionalLocalFiles()
        .containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));

    // The splits dir should contain exactly the split file and the
    // split meta-info file, both non-empty.
    RemoteIterator<LocatedFileStatus> files = remoteFs.listFiles(newSplitsDir, false);
    boolean foundSplitsFile = false;
    boolean foundMetaFile = false;
    int totalFilesFound = 0;
    while (files.hasNext()) {
      LocatedFileStatus status = files.next();
      String fName = status.getPath().getName();
      totalFilesFound++;
      if (fName.equals(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME)) {
        foundSplitsFile = true;
      } else if (fName.equals(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME)) {
        foundMetaFile = true;
      } else {
        Assert.fail("Found invalid file in splits dir, filename=" + fName);
      }
      Assert.assertTrue(status.getLen() > 0);
    }
    Assert.assertEquals(2, totalFilesFound);
    Assert.assertTrue(foundSplitsFile);
    Assert.assertTrue(foundMetaFile);

    verifyLocationHints(newSplitsDir, dataSource.getLocationHint().getTaskLocationHints());
  }

  @Test(timeout = 5000)
  public void testOldSplitsGen() throws Exception {
    DataSourceDescriptor dataSource = generateDataSourceDescriptorMapRed(oldSplitsDir);

    Assert.assertTrue(dataSource.getAdditionalLocalFiles()
        .containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
    Assert.assertTrue(dataSource.getAdditionalLocalFiles()
        .containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));

    // Same check as testNewSplitsGen, against the old-API splits dir.
    RemoteIterator<LocatedFileStatus> files = remoteFs.listFiles(oldSplitsDir, false);
    boolean foundSplitsFile = false;
    boolean foundMetaFile = false;
    int totalFilesFound = 0;
    while (files.hasNext()) {
      LocatedFileStatus status = files.next();
      String fName = status.getPath().getName();
      totalFilesFound++;
      if (fName.equals(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME)) {
        foundSplitsFile = true;
      } else if (fName.equals(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME)) {
        foundMetaFile = true;
      } else {
        Assert.fail("Found invalid file in splits dir, filename=" + fName);
      }
      Assert.assertTrue(status.getLen() > 0);
    }
    Assert.assertEquals(2, totalFilesFound);
    Assert.assertTrue(foundSplitsFile);
    Assert.assertTrue(foundMetaFile);

    verifyLocationHints(oldSplitsDir, dataSource.getLocationHint().getTaskLocationHints());
  }

  @Test(timeout = 5000)
  public void testInputSplitLocalResourceCreation() throws Exception {
    DataSourceDescriptor dataSource = generateDataSourceDescriptorMapRed(oldSplitsDir);

    Map<String, LocalResource> localResources = dataSource.getAdditionalLocalFiles();
    Assert.assertEquals(2, localResources.size());
    Assert.assertTrue(localResources.containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
    Assert.assertTrue(localResources.containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));
  }

  // Re-reads the generated split meta info and checks that the data source's
  // location hints match the split locations, element for element.
  private void verifyLocationHints(Path inputSplitsDir, List<TaskLocationHint> actual)
      throws Exception {
    JobID jobId = new JobID("dummy", 1);
    JobSplit.TaskSplitMetaInfo[] splitsInfo =
        SplitMetaInfoReader.readSplitMetaInfo(jobId, remoteFs, conf, inputSplitsDir);
    int splitsCount = splitsInfo.length;
    List<TaskLocationHint> locationHints = new ArrayList<TaskLocationHint>(splitsCount);
    for (int i = 0; i < splitsCount; ++i) {
      locationHints.add(TaskLocationHint.createTaskLocationHint(
          new HashSet<String>(Arrays.asList(splitsInfo[i].getLocations())), null));
    }
    Assert.assertEquals(locationHints, actual);
  }

  // Generates splits with the new (mapreduce) API into inputSplitsDir.
  private DataSourceDescriptor generateDataSourceDescriptorMapReduce(Path inputSplitsDir)
      throws Exception {
    JobConf jobConf = new JobConf(dfsCluster.getFileSystem().getConf());
    jobConf.setUseNewMapper(true);
    jobConf.setClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class, InputFormat.class);
    jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());
    return MRInputHelpers.configureMRInputWithLegacySplitGeneration(jobConf, inputSplitsDir, true);
  }

  // Generates splits with the old (mapred) API into inputSplitsDir.
  private DataSourceDescriptor generateDataSourceDescriptorMapRed(Path inputSplitsDir)
      throws Exception {
    JobConf jobConf = new JobConf(dfsCluster.getFileSystem().getConf());
    jobConf.setUseNewMapper(false);
    jobConf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());
    return MRInputHelpers.configureMRInputWithLegacySplitGeneration(jobConf, inputSplitsDir, true);
  }
}
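
// Usage sketch (illustrative, not part of the test): the DataSourceDescriptor
// returned by MRInputHelpers.configureMRInputWithLegacySplitGeneration(...) is
// meant to be attached to a Tez vertex as its data source. The DAG, Vertex, and
// ProcessorDescriptor calls below are the standard org.apache.tez.dag.api API;
// "com.example.MyProcessor" is a hypothetical processor class, and
// fully-qualified names are used so the import list above stays untouched.
class MRInputUsageSketch {
  static org.apache.tez.dag.api.DAG buildDag(DataSourceDescriptor dataSource) {
    // Parallelism -1 lets Tez derive the task count from the data source
    // (one task per generated split).
    org.apache.tez.dag.api.Vertex mapVertex = org.apache.tez.dag.api.Vertex.create(
        "map", org.apache.tez.dag.api.ProcessorDescriptor.create("com.example.MyProcessor"), -1);
    // "MRInput" is the logical input name the processor looks up at runtime.
    mapVertex.addDataSource("MRInput", dataSource);
    return org.apache.tez.dag.api.DAG.create("example-dag").addVertex(mapVertex);
  }
}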