Java tutorial: NameNode metrics test (TestNameNodeMetrics.java, Apache Hadoop HDFS)
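The listing below is TestNameNodeMetrics.java from Apache Hadoop HDFS (package org.apache.hadoop.hdfs.server.namenode.metrics). It starts a MiniDFSCluster with three datanodes and verifies the metrics the NameNode publishes under the NameNodeActivity and FSNamesystem sources: capacity, stale datanodes, file creation and deletion, corrupt, excess and missing blocks, renames, block-location lookups, and sync/block-report latencies.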
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode.metrics;

import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;

import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Options.Rename;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.test.MetricsAsserts;
import org.apache.hadoop.util.Time;
import org.apache.log4j.Level;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.DataInputStream;
import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
import static org.junit.Assert.assertTrue;

/**
 * Test for metrics published by the Namenode.
 */
public class TestNameNodeMetrics {
  private static final Configuration CONF = new HdfsConfiguration();
  private static final int DFS_REPLICATION_INTERVAL = 1;
  private static final Path TEST_ROOT_DIR_PATH =
      new Path("/testNameNodeMetrics");
  private static final String NN_METRICS = "NameNodeActivity";
  private static final String NS_METRICS = "FSNamesystem";

  // Number of datanodes in the cluster
  private static final int DATANODE_COUNT = 3;
  private static final int WAIT_GAUGE_VALUE_RETRIES = 20;

  // Rollover interval of percentile metrics (in seconds)
  private static final int PERCENTILES_INTERVAL = 1;

  static {
    CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
    CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
    CONF.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
        DFS_REPLICATION_INTERVAL);
    CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
        DFS_REPLICATION_INTERVAL);
    CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
        "" + PERCENTILES_INTERVAL);
    // Enable stale DataNodes checking
    CONF.setBoolean(
        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_KEY, true);
    ((Log4JLogger) LogFactory.getLog(MetricsAsserts.class))
        .getLogger().setLevel(Level.DEBUG);
  }

  private MiniDFSCluster cluster;
  private DistributedFileSystem fs;
  private Random rand = new Random();
  private FSNamesystem namesystem;
  private BlockManager bm;

  private static Path getTestPath(String fileName) {
    return new Path(TEST_ROOT_DIR_PATH, fileName);
  }

  @Before
  public void setUp() throws Exception {
    cluster = new MiniDFSCluster.Builder(CONF)
        .numDataNodes(DATANODE_COUNT).build();
    cluster.waitActive();
    Thread.sleep(10000);
    namesystem = cluster.getNamesystem();
    bm = namesystem.getBlockManager();
    fs = (DistributedFileSystem) cluster.getFileSystem();
  }

  @After
  public void tearDown() throws Exception {
    MetricsSource source =
        DefaultMetricsSystem.instance().getSource("UgiMetrics");
    if (source != null) {
      // Run only once since the UGI metrics are cleaned up during teardown
      MetricsRecordBuilder rb = getMetrics(source);
      assertQuantileGauges("GetGroups1s", rb);
    }
    cluster.shutdown();
  }

  /**
   * Create a file with a length of <code>fileLen</code>.
   */
  private void createFile(Path file, long fileLen, short replicas)
      throws IOException {
    DFSTestUtil.createFile(fs, file, fileLen, replicas, rand.nextLong());
  }

  private void updateMetrics() throws Exception {
    // Wait for the metrics update (corresponds to dfs.namenode.replication.interval
    // for some block-related metrics to get updated)
    Thread.sleep(1500 * DFS_REPLICATION_INTERVAL);
  }

  private void readFile(FileSystem fileSys, Path name) throws IOException {
    // Just read the file so that getNumBlockLocations is incremented
    DataInputStream stm = fileSys.open(name);
    byte[] buffer = new byte[4];
    stm.read(buffer, 0, 4);
    stm.close();
  }

  /**
   * Test that capacity metrics are exported and pass
   * basic sanity tests.
   */
  @Test(timeout = 1800)
  public void testCapacityMetrics() throws Exception {
    MetricsRecordBuilder rb = getMetrics(NS_METRICS);
    long capacityTotal = MetricsAsserts.getLongGauge("CapacityTotal", rb);
    assert (capacityTotal != 0);
    long capacityUsed = MetricsAsserts.getLongGauge("CapacityUsed", rb);
    long capacityRemaining =
        MetricsAsserts.getLongGauge("CapacityRemaining", rb);
    long capacityUsedNonDFS =
        MetricsAsserts.getLongGauge("CapacityUsedNonDFS", rb);
    assert (capacityUsed + capacityRemaining + capacityUsedNonDFS
        == capacityTotal);
  }

  /**
   * Test metrics indicating the number of stale DataNodes.
   */
  @Test
  public void testStaleNodes() throws Exception {
    // Set two datanodes as stale
    for (int i = 0; i < 2; i++) {
      DataNode dn = cluster.getDataNodes().get(i);
      DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true);
      long staleInterval = CONF.getLong(
          DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
          DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
      cluster.getNameNode().getNamesystem().getBlockManager()
          .getDatanodeManager().getDatanode(dn.getDatanodeId())
          .setLastUpdate(Time.now() - staleInterval - 1);
    }
    // Let the HeartbeatManager check heartbeats
    BlockManagerTestUtil.checkHeartbeat(
        cluster.getNameNode().getNamesystem().getBlockManager());
    assertGauge("StaleDataNodes", 2, getMetrics(NS_METRICS));

    // Reset the stale datanodes
    for (int i = 0; i < 2; i++) {
      DataNode dn = cluster.getDataNodes().get(i);
      DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, false);
      cluster.getNameNode().getNamesystem().getBlockManager()
          .getDatanodeManager().getDatanode(dn.getDatanodeId())
          .setLastUpdate(Time.now());
    }
    // Let the HeartbeatManager refresh
    BlockManagerTestUtil.checkHeartbeat(
        cluster.getNameNode().getNamesystem().getBlockManager());
    assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
  }

  /**
   * Test metrics associated with addition of a file.
   */
  @Test
  public void testFileAdd() throws Exception {
    // Add a 3200-byte file, which yields 32 blocks at the 100-byte block size
    final Path file = getTestPath("testFileAdd");
    createFile(file, 3200, (short) 3);
    final long blockCount = 32;
    int blockCapacity = namesystem.getBlockCapacity();
    updateMetrics();
    assertGauge("BlockCapacity", blockCapacity, getMetrics(NS_METRICS));

    MetricsRecordBuilder rb = getMetrics(NN_METRICS);
    // The number of file create operations is 1;
    // the number of files created is the depth of the <code>file</code> path
    assertCounter("CreateFileOps", 1L, rb);
    assertCounter("FilesCreated", (long) file.depth(), rb);

    updateMetrics();
    long filesTotal = file.depth() + 1; // Add 1 for the root
    rb = getMetrics(NS_METRICS);
    assertGauge("FilesTotal", filesTotal, rb);
    assertGauge("BlocksTotal", blockCount, rb);
    fs.delete(file, true);
    filesTotal--; // reduce the file count for the deleted file

    rb = waitForDnMetricValue(NS_METRICS, "FilesTotal", filesTotal);
    assertGauge("BlocksTotal", 0L, rb);
    assertGauge("PendingDeletionBlocks", 0L, rb);

    rb = getMetrics(NN_METRICS);
    // Delete file operations and the number of files deleted must both be 1
    assertCounter("DeleteFileOps", 1L, rb);
    assertCounter("FilesDeleted", 1L, rb);
  }

  /**
   * Corrupt a block and ensure the metrics reflect it.
   */
  @Test
  public void testCorruptBlock() throws Exception {
    // Create a file with a single block with two replicas
    final Path file = getTestPath("testCorruptBlock");
    createFile(file, 100, (short) 2);

    // Corrupt the first replica of the block
    LocatedBlock block = NameNodeAdapter.getBlockLocations(
        cluster.getNameNode(), file.toString(), 0, 1).get(0);
    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
        "STORAGE_ID", "TEST");
    updateMetrics();
    MetricsRecordBuilder rb
        = getMetrics(NS_METRICS);
    assertGauge("CorruptBlocks", 1L, rb);
    assertGauge("PendingReplicationBlocks", 1L, rb);
    assertGauge("ScheduledReplicationBlocks", 1L, rb);
    fs.delete(file, true);
    rb = waitForDnMetricValue(NS_METRICS, "CorruptBlocks", 0L);
    assertGauge("PendingReplicationBlocks", 0L, rb);
    assertGauge("ScheduledReplicationBlocks", 0L, rb);
  }

  /**
   * Create excess blocks by reducing the replication factor for
   * a file and ensure the metrics reflect it.
   */
  @Test
  public void testExcessBlocks() throws Exception {
    Path file = getTestPath("testExcessBlocks");
    createFile(file, 100, (short) 2);
    NameNodeAdapter.setReplication(namesystem, file.toString(), (short) 1);
    MetricsRecordBuilder rb = getMetrics(NS_METRICS);
    assertGauge("ExcessBlocks", 1L, rb);

    // Verify that the ExcessBlocks metric is decremented and
    // excessReplicateMap is cleared after deleting a file
    fs.delete(file, true);
    rb = getMetrics(NS_METRICS);
    assertGauge("ExcessBlocks", 0L, rb);
    assertTrue(bm.excessReplicateMap.isEmpty());
  }

  /**
   * Test to ensure the metrics reflect missing blocks.
   */
  @Test
  public void testMissingBlock() throws Exception {
    // Create a file with a single block with one replica
    Path file = getTestPath("testMissingBlocks");
    createFile(file, 100, (short) 1);

    // Corrupt the only replica of the block to result in a missing block
    LocatedBlock block = NameNodeAdapter.getBlockLocations(
        cluster.getNameNode(), file.toString(), 0, 1).get(0);
    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
        "STORAGE_ID", "TEST");
    updateMetrics();
    MetricsRecordBuilder rb = getMetrics(NS_METRICS);
    assertGauge("UnderReplicatedBlocks", 1L, rb);
    assertGauge("MissingBlocks", 1L, rb);
    assertGauge("MissingReplOneBlocks", 1L, rb);
    fs.delete(file, true);
    waitForDnMetricValue(NS_METRICS, "UnderReplicatedBlocks", 0L);
  }

  private void waitForDeletion() throws InterruptedException {
    // Wait for more than DATANODE_COUNT replication intervals to ensure all
    // the blocks pending deletion are sent for deletion to the datanodes.
    Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000);
  }

  /**
   * Wait for the named gauge value from the metrics source to reach the
   * desired value.
   * <p/>
   * There's an initial delay then a spin cycle of sleep and poll. Because
   * all the tests use a shared FS instance, these tests are not independent;
   * that's why the initial sleep is in there.
   *
   * @param source metrics source
   * @param name gauge name
   * @param expected expected value
   * @return the last metrics record polled
   * @throws Exception if something went wrong.
   */
  private MetricsRecordBuilder waitForDnMetricValue(String source,
      String name, long expected) throws Exception {
    MetricsRecordBuilder rb;
    long gauge;
    // Initial wait.
    waitForDeletion();
    // Lots of retries are allowed for slow systems; fast ones will still
    // exit early
    int retries = (DATANODE_COUNT + 1) * WAIT_GAUGE_VALUE_RETRIES;
    rb = getMetrics(source);
    gauge = MetricsAsserts.getLongGauge(name, rb);
    while (gauge != expected && (--retries > 0)) {
      Thread.sleep(DFS_REPLICATION_INTERVAL * 500);
      rb = getMetrics(source);
      gauge = MetricsAsserts.getLongGauge(name, rb);
    }
    // At this point the assertion is valid or the retry count ran out
    assertGauge(name, expected, rb);
    return rb;
  }

  @Test
  public void testRenameMetrics() throws Exception {
    Path src = getTestPath("src");
    createFile(src, 100, (short) 1);
    Path target = getTestPath("target");
    createFile(target, 100, (short) 1);
    fs.rename(src, target, Rename.OVERWRITE);
    updateMetrics();
    MetricsRecordBuilder rb = getMetrics(NN_METRICS);
    assertCounter("FilesRenamed", 1L, rb);
    assertCounter("FilesDeleted", 1L, rb);
  }

  /**
   * Test the GetBlockLocations metric.
   * <p/>
   * The test initiates and performs file operations (create, read, close,
   * open file) which result in metrics changes. These metrics changes are
   * updated and tested for correctness.
   * <p/>
   * A create-file operation does not increment GetBlockLocations;
   * one read-file operation increments GetBlockLocations by 1.
   *
   * @throws IOException in case of an error
   */
  @Test
  public void testGetBlockLocationMetric() throws Exception {
    Path file1_Path = new Path(TEST_ROOT_DIR_PATH, "file1.dat");

    // When the cluster starts for the first time there are no file
    // (read, create, open) operations, so the GetBlockLocations metric
    // should be 0.
    assertCounter("GetBlockLocations", 0L, getMetrics(NN_METRICS));

    // Perform a create file operation
    createFile(file1_Path, 100, (short) 2);
    updateMetrics();

    // Creating a file does not change the GetBlockLocations metric;
    // expect GetBlockLocations = 0 for the previous and current interval
    assertCounter("GetBlockLocations", 0L, getMetrics(NN_METRICS));

    // Open and read file operations increment GetBlockLocations.
    // Perform a read file operation on the earlier created file
    readFile(fs, file1_Path);
    updateMetrics();
    // Verify that the read file operation has incremented GetBlockLocations by 1
    assertCounter("GetBlockLocations", 1L, getMetrics(NN_METRICS));

    // Opening and reading the file twice more increments GetBlockLocations by 2
    readFile(fs, file1_Path);
    readFile(fs, file1_Path);
    updateMetrics();
    assertCounter("GetBlockLocations", 3L, getMetrics(NN_METRICS));
  }

  /**
   * Tests that the sync and block report metrics get updated on cluster
   * startup.
   */
  @Test
  public void testSyncAndBlockReportMetric() throws Exception {
    // Block reporting may take a few seconds
    Thread.sleep(5000);
    MetricsRecordBuilder rb = getMetrics(NN_METRICS);
    // Each datanode reports in when the cluster comes up
    updateMetrics();
    assertCounter("BlockReportNumOps",
        (long) DATANODE_COUNT * cluster.getStoragesPerDatanode(), rb);

    // Sleep for an interval plus slop to let the percentiles roll over
    Thread.sleep((PERCENTILES_INTERVAL + 1) * 1000);
    // Check that the percentiles were updated
    assertQuantileGauges("Syncs1s", rb);
    assertQuantileGauges("BlockReport1s", rb);
  }
}