Java tutorial: the Apache Hadoop RAID BlockFixer
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.raid;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.Random;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.channels.SocketChannel;

import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.FSDataset;
import org.apache.hadoop.hdfs.server.datanode.RaidBlockSender;
import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.BlockMissingException;
import org.apache.hadoop.hdfs.RaidDFSUtil;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.raid.RaidNode;
import org.apache.hadoop.raid.RaidUtils;
import org.apache.hadoop.raid.protocol.PolicyInfo.ErasureCodeType;

/**
 * Contains the core functionality of the block fixer.
 *
 * raid.blockfix.interval         - interval between checks for corrupt files
 * raid.blockfix.history.interval - interval before fixing the same file again
 * raid.blockfix.read.timeout     - read timeout
 * raid.blockfix.write.timeout    - write timeout
 */
public class BlockFixer extends Configured implements Runnable {
  public static final Log LOG =
    LogFactory.getLog("org.apache.hadoop.raid.BlockFixer");

  public static final String BLOCKFIX_INTERVAL = "raid.blockfix.interval";
  public static final String BLOCKFIX_HISTORY_INTERVAL =
    "raid.blockfix.history.interval";
  public static final String
    BLOCKFIX_READ_TIMEOUT = "raid.blockfix.read.timeout";
  public static final String BLOCKFIX_WRITE_TIMEOUT =
    "raid.blockfix.write.timeout";

  public static final long DEFAULT_BLOCKFIX_INTERVAL = 60 * 1000; // 1 min
  public static final long DEFAULT_BLOCKFIX_HISTORY_INTERVAL =
    60 * 60 * 1000; // 60 mins

  private java.util.HashMap<String, java.util.Date> history;
  private long numFilesFixed = 0;
  private String xorPrefix;
  private String rsPrefix;
  private Encoder xorEncoder;
  private Decoder xorDecoder;
  private Encoder rsEncoder;
  private Decoder rsDecoder;

  // interval between checks for corrupt files
  protected long blockFixInterval = DEFAULT_BLOCKFIX_INTERVAL;

  // interval before fixing same file again
  protected long historyInterval = DEFAULT_BLOCKFIX_HISTORY_INTERVAL;

  public volatile boolean running = true;

  public BlockFixer(Configuration conf) throws IOException {
    super(conf);
    history = new java.util.HashMap<String, java.util.Date>();
    blockFixInterval =
      getConf().getInt(BLOCKFIX_INTERVAL, (int) blockFixInterval);

    xorPrefix = RaidNode.xorDestinationPath(getConf()).toUri().getPath();
    if (!xorPrefix.endsWith(Path.SEPARATOR)) {
      xorPrefix += Path.SEPARATOR;
    }
    int stripeLength = RaidNode.getStripeLength(getConf());
    xorEncoder = new XOREncoder(getConf(), stripeLength);
    xorDecoder = new XORDecoder(getConf(), stripeLength);

    rsPrefix = RaidNode.rsDestinationPath(getConf()).toUri().getPath();
    if (!rsPrefix.endsWith(Path.SEPARATOR)) {
      rsPrefix += Path.SEPARATOR;
    }
    int parityLength = RaidNode.rsParityLength(getConf());
    rsEncoder = new ReedSolomonEncoder(getConf(), stripeLength, parityLength);
    rsDecoder = new ReedSolomonDecoder(getConf(), stripeLength, parityLength);
  }

  public void run() {
    while (running) {
      try {
        LOG.info("BlockFixer continuing to run...");
        doFix();
      } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
      } catch (Error err) {
        LOG.error("Exiting after encountering " +
                  StringUtils.stringifyException(err));
        throw err;
      }
    }
  }

  public long filesFixed() {
    return numFilesFixed;
  }

  void doFix() throws InterruptedException, IOException {
    while (running) {
      // Sleep before proceeding to fix files.
      Thread.sleep(blockFixInterval);

      // Purge history older than the history interval.
      purgeHistory();

      List<Path> corruptFiles = getCorruptFiles();
      if (corruptFiles.isEmpty()) {
        // If there are no corrupt files, retry after some time.
        continue;
      }
      LOG.info("Found " + corruptFiles.size() + " corrupt files.");

      sortCorruptFiles(corruptFiles);

      for (Path srcPath : corruptFiles) {
        if (!running) break;
        try {
          fixFile(srcPath);
        } catch (IOException ie) {
          LOG.error("Hit error while processing " + srcPath +
                    ": " + StringUtils.stringifyException(ie));
          // Do nothing, move on to the next file.
        }
      }
    }
  }

  void fixFile(Path srcPath) throws IOException {
    if (RaidNode.isParityHarPartFile(srcPath)) {
      processCorruptParityHarPartFile(srcPath);
      return;
    }

    // The corrupted file is an XOR parity file.
    if (isXorParityFile(srcPath)) {
      processCorruptParityFile(srcPath, xorEncoder);
      return;
    }

    // The corrupted file is a ReedSolomon parity file.
    if (isRsParityFile(srcPath)) {
      processCorruptParityFile(srcPath, rsEncoder);
      return;
    }

    // The corrupted file is a source file.
    RaidNode.ParityFilePair ppair =
      RaidNode.xorParityForSource(srcPath, getConf());
    Decoder decoder = null;
    if (ppair != null) {
      decoder = xorDecoder;
    } else {
      ppair = RaidNode.rsParityForSource(srcPath, getConf());
      if (ppair != null) {
        decoder = rsDecoder;
      }
    }

    // If we have a parity file, process the file and fix it.
    if (ppair != null) {
      processCorruptFile(srcPath, ppair, decoder);
    }
  }

  /**
   * We maintain history of fixed files because a fixed file may appear in
   * the list of corrupt files if we loop around too quickly.
   * This function removes the old items in the history so that we can
   * recognize files that have actually become corrupt since being fixed.
   */
  void purgeHistory() {
    // Default history interval is 1 hour.
    long historyInterval = getConf().getLong(BLOCKFIX_HISTORY_INTERVAL,
                                             3600 * 1000);
    java.util.Date cutOff =
      new java.util.Date(System.currentTimeMillis() - historyInterval);
    List<String> toRemove = new java.util.ArrayList<String>();

    for (String key : history.keySet()) {
      java.util.Date item = history.get(key);
      if (item.before(cutOff)) {
        toRemove.add(key);
      }
    }
    for (String key : toRemove) {
      LOG.info("Removing " + key + " from history");
      history.remove(key);
    }
  }

  /**
   * @return A list of corrupt files as obtained from the namenode
   */
  List<Path> getCorruptFiles() throws IOException {
    DistributedFileSystem dfs = getDFS(new Path("/"));

    String[] nnCorruptFiles = RaidDFSUtil.getCorruptFiles(getConf());
    List<Path> corruptFiles = new LinkedList<Path>();
    for (String file : nnCorruptFiles) {
      if (!history.containsKey(file)) {
        corruptFiles.add(new Path(file));
      }
    }
    RaidUtils.filterTrash(getConf(), corruptFiles);
    return corruptFiles;
  }

  /**
   * Sorts source files ahead of parity files.
   */
  void sortCorruptFiles(List<Path> files) {
    // TODO: We should first fix the files that lose more blocks
    Comparator<Path> comp = new Comparator<Path>() {
      public int compare(Path p1, Path p2) {
        if (isXorParityFile(p2) || isRsParityFile(p2)) {
          // If p2 is a parity file, p1 is smaller.
          return -1;
        }
        if (isXorParityFile(p1) || isRsParityFile(p1)) {
          // If p1 is a parity file, p2 is smaller.
          return 1;
        }
        // If both are source files, they are equal.
        return 0;
      }
    };
    Collections.sort(files, comp);
  }

  /**
   * Reads through a corrupt source file, fixing corrupt blocks on the way.
   * @param srcPath Path identifying the corrupt file.
   * @throws IOException
   */
  void processCorruptFile(Path srcPath, RaidNode.ParityFilePair parityPair,
                          Decoder decoder) throws IOException {
    LOG.info("Processing corrupt file " + srcPath);

    DistributedFileSystem srcFs = getDFS(srcPath);
    FileStatus srcStat = srcFs.getFileStatus(srcPath);
    long blockSize = srcStat.getBlockSize();
    long srcFileSize = srcStat.getLen();
    String uriPath = srcPath.toUri().getPath();

    int numBlocksFixed = 0;
    List<LocatedBlock> corrupt =
      RaidDFSUtil.corruptBlocksInFile(srcFs, uriPath, 0, srcFileSize);
    for (LocatedBlock lb : corrupt) {
      Block corruptBlock = lb.getBlock();
      long corruptOffset = lb.getStartOffset();
      LOG.info("Found corrupt block " + corruptBlock +
               ", offset " + corruptOffset);

      final long blockContentsSize =
        Math.min(blockSize, srcFileSize - corruptOffset);
      File localBlockFile =
        File.createTempFile(corruptBlock.getBlockName(), ".tmp");
      localBlockFile.deleteOnExit();

      try {
        decoder.recoverBlockToFile(srcFs, srcPath, parityPair.getFileSystem(),
                                   parityPair.getPath(), blockSize,
                                   corruptOffset, localBlockFile,
                                   blockContentsSize);

        // We have the contents of the block; send them.
        DatanodeInfo datanode = chooseDatanode(lb.getLocations());
        computeMetdataAndSendFixedBlock(datanode, localBlockFile, lb,
                                        blockContentsSize);
        numBlocksFixed++;

        LOG.info("Adding " + srcPath + " to history");
        history.put(srcPath.toString(), new java.util.Date());
      } finally {
        localBlockFile.delete();
      }
    }
    LOG.info("Fixed " + numBlocksFixed + " blocks in " + srcPath);
    numFilesFixed++;
  }

  /**
   * Checks whether a file is an XOR parity file.
   */
  boolean isXorParityFile(Path p) {
    String pathStr = p.toUri().getPath();
    if (pathStr.contains(RaidNode.HAR_SUFFIX)) {
      return false;
    }
    return pathStr.startsWith(xorPrefix);
  }

  /**
   * Checks whether a file is an RS parity file.
   */
  boolean isRsParityFile(Path p) {
    String pathStr = p.toUri().getPath();
    if (pathStr.contains(RaidNode.HAR_SUFFIX)) {
      return false;
    }
    return pathStr.startsWith(rsPrefix);
  }

  /**
   * Returns a DistributedFileSystem hosting the path supplied.
   */
  protected DistributedFileSystem getDFS(Path p) throws IOException {
    return (DistributedFileSystem) p.getFileSystem(getConf());
  }

  /**
   * Fixes corrupt blocks in a parity file.
   * This function uses the corresponding source file to regenerate parity
   * file blocks.
   */
  void processCorruptParityFile(Path parityPath, Encoder encoder)
    throws IOException {
    LOG.info("Processing corrupt file " + parityPath);
    Path srcPath = sourcePathFromParityPath(parityPath);
    if (srcPath == null) {
      LOG.warn("Unusable parity file " + parityPath);
      return;
    }

    DistributedFileSystem parityFs = getDFS(parityPath);
    FileStatus parityStat = parityFs.getFileStatus(parityPath);
    long blockSize = parityStat.getBlockSize();
    long parityFileSize = parityStat.getLen();
    FileStatus srcStat = getDFS(srcPath).getFileStatus(srcPath);
    long srcFileSize = srcStat.getLen();

    // Check timestamp.
    if (srcStat.getModificationTime() != parityStat.getModificationTime()) {
      LOG.info("Mismatching timestamp for " + srcPath + " and " + parityPath +
               ", moving on...");
      return;
    }

    String uriPath = parityPath.toUri().getPath();
    int numBlocksFixed = 0;
    List<LocatedBlock> corrupt =
      RaidDFSUtil.corruptBlocksInFile(parityFs, uriPath, 0, parityFileSize);
    for (LocatedBlock lb : corrupt) {
      Block corruptBlock = lb.getBlock();
      long corruptOffset = lb.getStartOffset();
      LOG.info("Found corrupt block " + corruptBlock +
               ", offset " + corruptOffset);

      File localBlockFile =
        File.createTempFile(corruptBlock.getBlockName(), ".tmp");
      localBlockFile.deleteOnExit();

      try {
        encoder.recoverParityBlockToFile(parityFs, srcPath, srcFileSize,
                                         blockSize, parityPath, corruptOffset,
                                         localBlockFile);

        // We have the contents of the block; send them.
        DatanodeInfo datanode = chooseDatanode(lb.getLocations());
        computeMetdataAndSendFixedBlock(datanode, localBlockFile, lb,
                                        blockSize);
        numBlocksFixed++;

        LOG.info("Adding " + parityPath + " to history");
        history.put(parityPath.toString(), new java.util.Date());
      } finally {
        localBlockFile.delete();
      }
    }
    LOG.info("Fixed " + numBlocksFixed + " blocks in " + parityPath);
    numFilesFixed++;
  }

  /**
   * Reads through a parity HAR part file, fixing corrupt blocks on the way.
   * A HAR block can contain many file blocks, as long as the HAR part file
   * block size is a multiple of the file block size.
   */
  void processCorruptParityHarPartFile(Path partFile) throws IOException {
    LOG.info("Processing corrupt file " + partFile);
    // Get some basic information.
    DistributedFileSystem dfs = getDFS(partFile);
    FileStatus partFileStat = dfs.getFileStatus(partFile);
    long partFileSize = partFileStat.getLen();
    long partFileBlockSize = partFileStat.getBlockSize();
    LOG.info(partFile + " has block size " + partFileBlockSize);

    // Find the path to the index file.
    // Parity file HARs are only one level deep, so the index file is at the
    // same level as the part file.
    String harDirectory = partFile.toUri().getPath(); // Temporarily.
    harDirectory =
      harDirectory.substring(0, harDirectory.lastIndexOf(Path.SEPARATOR));
    Path indexFile = new Path(harDirectory + "/" + HarIndex.indexFileName);
    FileStatus indexStat = dfs.getFileStatus(indexFile);

    // Parses through the HAR index file.
    HarIndex harIndex = new HarIndex(dfs.open(indexFile), indexStat.getLen());

    String uriPath = partFile.toUri().getPath();
    int numBlocksFixed = 0;
    List<LocatedBlock> corrupt =
      RaidDFSUtil.corruptBlocksInFile(dfs, uriPath, 0, partFileSize);
    for (LocatedBlock lb : corrupt) {
      Block corruptBlock = lb.getBlock();
      long corruptOffset = lb.getStartOffset();

      File localBlockFile =
        File.createTempFile(corruptBlock.getBlockName(), ".tmp");
      localBlockFile.deleteOnExit();
      processCorruptParityHarPartBlock(dfs, partFile, corruptBlock,
                                       corruptOffset, partFileStat, harIndex,
                                       localBlockFile);
      // Now we have recovered the part file block locally, send it.
      try {
        DatanodeInfo datanode = chooseDatanode(lb.getLocations());
        computeMetdataAndSendFixedBlock(datanode, localBlockFile, lb,
                                        localBlockFile.length());
        numBlocksFixed++;

        LOG.info("Adding " + partFile + " to history");
        history.put(partFile.toString(), new java.util.Date());
      } finally {
        localBlockFile.delete();
      }
    }
    LOG.info("Fixed " + numBlocksFixed + " blocks in " + partFile);
    numFilesFixed++;
  }

  /**
   * This fixes a single part file block by recovering in sequence each
   * parity block in the part file block.
   */
  private void processCorruptParityHarPartBlock(FileSystem dfs, Path partFile,
                                                Block corruptBlock,
                                                long corruptOffset,
                                                FileStatus partFileStat,
                                                HarIndex harIndex,
                                                File localBlockFile)
    throws IOException {
    String partName = partFile.toUri().getPath(); // Temporarily.
    partName = partName.substring(1 + partName.lastIndexOf(Path.SEPARATOR));

    OutputStream out = new FileOutputStream(localBlockFile);

    try {
      // A HAR part file block could map to several parity files. We need to
      // use all of them to recover this block.
      final long corruptEnd = Math.min(corruptOffset +
                                       partFileStat.getBlockSize(),
                                       partFileStat.getLen());
      for (long offset = corruptOffset; offset < corruptEnd;) {
        HarIndex.IndexEntry entry = harIndex.findEntry(partName, offset);
        if (entry == null) {
          String msg = "Corrupt index file has no matching index entry for " +
                       partName + ":" + offset;
          LOG.warn(msg);
          throw new IOException(msg);
        }
        Path parityFile = new Path(entry.fileName);
        Encoder encoder;
        if (isXorParityFile(parityFile)) {
          encoder = xorEncoder;
        } else if (isRsParityFile(parityFile)) {
          encoder = rsEncoder;
        } else {
          String msg = "Could not figure out parity file correctly";
          LOG.warn(msg);
          throw new IOException(msg);
        }
        Path srcFile = sourcePathFromParityPath(parityFile);
        FileStatus srcStat = dfs.getFileStatus(srcFile);
        if (srcStat.getModificationTime() != entry.mtime) {
          String msg = "Modification times of " + parityFile + " and " +
                       srcFile + " do not match.";
          LOG.warn(msg);
          throw new IOException(msg);
        }
        long corruptOffsetInParity = offset - entry.startOffset;
        LOG.info(partFile + ":" + offset + " maps to " +
                 parityFile + ":" + corruptOffsetInParity +
                 " and will be recovered from " + srcFile);
        encoder.recoverParityBlockToStream(dfs, srcFile, srcStat.getLen(),
                                           srcStat.getBlockSize(), parityFile,
                                           corruptOffsetInParity, out);
        // Finished recovery of one parity block. Since a parity block has the
        // same size as a source block, we can move offset by source block
        // size.
        offset += srcStat.getBlockSize();
        LOG.info("Recovered " + srcStat.getBlockSize() + " part file bytes ");
        if (offset > corruptEnd) {
          String msg =
            "Recovered block spills across part file blocks. Cannot continue...";
          throw new IOException(msg);
        }
      }
    } finally {
      out.close();
    }
  }

  /**
   * Chooses a datanode (hostname:portnumber). The datanode is chosen at
   * random from the live datanodes.
   * @param locationsToAvoid locations to avoid.
   * @return The chosen datanode.
   * @throws IOException
   */
  private DatanodeInfo chooseDatanode(DatanodeInfo[] locationsToAvoid)
    throws IOException {
    DistributedFileSystem dfs = getDFS(new Path("/"));
    DatanodeInfo[] live =
      dfs.getClient().datanodeReport(DatanodeReportType.LIVE);
    LOG.info("Choosing a datanode from " + live.length +
             " live nodes while avoiding " + locationsToAvoid.length);
    Random rand = new Random();
    DatanodeInfo chosen = null;
    int maxAttempts = 1000;
    for (int i = 0; i < maxAttempts && chosen == null; i++) {
      int idx = rand.nextInt(live.length);
      chosen = live[idx];
      for (DatanodeInfo avoid : locationsToAvoid) {
        if (chosen.name.equals(avoid.name)) {
          LOG.info("Avoiding " + avoid.name);
          chosen = null;
          break;
        }
      }
    }
    if (chosen == null) {
      throw new IOException("Could not choose datanode");
    }
    LOG.info("Choosing datanode " + chosen.name);
    return chosen;
  }

  /**
   * Reads data from the data stream provided and computes metadata.
   */
  static DataInputStream computeMetadata(Configuration conf,
                                         InputStream dataStream)
    throws IOException {
    ByteArrayOutputStream mdOutBase = new ByteArrayOutputStream(1024 * 1024);
    DataOutputStream mdOut = new DataOutputStream(mdOutBase);

    // First, write out the version.
    mdOut.writeShort(FSDataset.METADATA_VERSION);

    // Create a summer and write out its header.
    int bytesPerChecksum = conf.getInt("io.bytes.per.checksum", 512);
    DataChecksum sum =
      DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_CRC32,
                                   bytesPerChecksum);
    sum.writeHeader(mdOut);

    // Buffer to read in a chunk of data.
    byte[] buf = new byte[bytesPerChecksum];
    // Buffer to store the checksum bytes.
    byte[] chk = new byte[sum.getChecksumSize()];

    // Read data until we reach the end of the input stream.
    int bytesSinceFlush = 0;
    while (true) {
      // Read some bytes.
      int bytesRead = dataStream.read(buf, bytesSinceFlush,
                                      bytesPerChecksum - bytesSinceFlush);
      if (bytesRead == -1) {
        if (bytesSinceFlush > 0) {
          boolean reset = true;
          sum.writeValue(chk, 0, reset); // This also resets the sum.
          // Write the checksum to the stream.
          mdOut.write(chk, 0, chk.length);
          bytesSinceFlush = 0;
        }
        break;
      }
      // Update the checksum.
      sum.update(buf, bytesSinceFlush, bytesRead);
      bytesSinceFlush += bytesRead;

      // Flush the checksum if necessary.
      if (bytesSinceFlush == bytesPerChecksum) {
        boolean reset = true;
        sum.writeValue(chk, 0, reset); // This also resets the sum.
        // Write the checksum to the stream.
        mdOut.write(chk, 0, chk.length);
        bytesSinceFlush = 0;
      }
    }

    byte[] mdBytes = mdOutBase.toByteArray();
    return new DataInputStream(new ByteArrayInputStream(mdBytes));
  }

  private void computeMetdataAndSendFixedBlock(DatanodeInfo datanode,
                                               File localBlockFile,
                                               LocatedBlock block,
                                               long blockSize)
    throws IOException {
    LOG.info("Computing metadata");
    InputStream blockContents = null;
    DataInputStream blockMetadata = null;
    try {
      blockContents = new FileInputStream(localBlockFile);
      blockMetadata = computeMetadata(getConf(), blockContents);
      blockContents.close();
      // Reopen
      blockContents = new FileInputStream(localBlockFile);
      sendFixedBlock(datanode, blockContents, blockMetadata, block, blockSize);
    } finally {
      if (blockContents != null) {
        blockContents.close();
        blockContents = null;
      }
      if (blockMetadata != null) {
        blockMetadata.close();
        blockMetadata = null;
      }
    }
  }

  /**
   * Sends a generated block to a datanode.
   * @param datanode Chosen datanode name in host:port form.
   * @param blockContents Stream with the block contents.
   * @param block LocatedBlock identifying the block to be sent.
   * @param blockSize size of the block.
   * @throws IOException
   */
  private void sendFixedBlock(DatanodeInfo datanode,
                              final InputStream blockContents,
                              DataInputStream metadataIn,
                              LocatedBlock block, long blockSize)
    throws IOException {
    InetSocketAddress target = NetUtils.createSocketAddr(datanode.name);
    Socket sock = SocketChannel.open().socket();

    int readTimeout = getConf().getInt(BLOCKFIX_READ_TIMEOUT,
                                       HdfsConstants.READ_TIMEOUT);
    NetUtils.connect(sock, target, readTimeout);
    sock.setSoTimeout(readTimeout);

    int writeTimeout = getConf().getInt(BLOCKFIX_WRITE_TIMEOUT,
                                        HdfsConstants.WRITE_TIMEOUT);

    OutputStream baseStream = NetUtils.getOutputStream(sock, writeTimeout);
    DataOutputStream out =
      new DataOutputStream(new BufferedOutputStream(baseStream,
                                                    FSConstants.SMALL_BUFFER_SIZE));

    boolean corruptChecksumOk = false;
    boolean chunkOffsetOK = false;
    boolean verifyChecksum = true;
    boolean transferToAllowed = false;

    try {
      LOG.info("Sending block " + block.getBlock() +
               " from " + sock.getLocalSocketAddress().toString() +
               " to " + sock.getRemoteSocketAddress().toString() +
               " " + blockSize + " bytes");
      RaidBlockSender blockSender =
        new RaidBlockSender(block.getBlock(), blockSize, 0, blockSize,
                            corruptChecksumOk, chunkOffsetOK, verifyChecksum,
                            transferToAllowed, metadataIn,
                            new RaidBlockSender.InputStreamFactory() {
                              @Override
                              public InputStream createStream(long offset)
                                throws IOException {
                                // We are passing 0 as the offset above, so we
                                // can safely ignore the offset passed.
                                return blockContents;
                              }
                            });

      DatanodeInfo[] nodes = new DatanodeInfo[]{datanode};
      DataTransferProtocol.Sender.opWriteBlock(out, block.getBlock(), 1,
        DataTransferProtocol.BlockConstructionStage.PIPELINE_SETUP_CREATE,
        0, blockSize, 0, "", null, nodes, block.getBlockToken());
      blockSender.sendBlock(out, baseStream);

      LOG.info("Sent block " + block.getBlock() + " to " + datanode.name);
    } finally {
      out.close();
    }
  }

  /**
   * Returns the source file corresponding to a parity file.
   */
  Path sourcePathFromParityPath(Path parityPath) {
    String parityPathStr = parityPath.toUri().getPath();
    if (parityPathStr.startsWith(xorPrefix)) {
      // Remove the prefix to get the source file.
      String src = parityPathStr.replaceFirst(xorPrefix, "/");
      return new Path(src);
    } else if (parityPathStr.startsWith(rsPrefix)) {
      // Remove the prefix to get the source file.
      String src = parityPathStr.replaceFirst(rsPrefix, "/");
      return new Path(src);
    }
    return null;
  }
}
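As a minimal usage sketch (not part of the class above, and not how RaidNode itself wires it up), the fragment below shows how a BlockFixer might be configured and driven on its own thread. The configuration keys match the BLOCKFIX_* constants declared in the class; the interval values, the BlockFixerDriver class name, and the shutdown sequence are illustrative assumptions.

// Illustrative driver for BlockFixer. The property names mirror the
// BLOCKFIX_* constants above; values and thread handling are assumptions.
import org.apache.hadoop.conf.Configuration;

public class BlockFixerDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("raid.blockfix.interval", 60 * 1000);            // check every minute
    conf.setLong("raid.blockfix.history.interval", 3600 * 1000); // 1 hour history
    conf.setInt("raid.blockfix.read.timeout", 60 * 1000);
    conf.setInt("raid.blockfix.write.timeout", 60 * 1000);

    // Assumes the RAID destination paths and stripe settings the constructor
    // needs (via RaidNode) are present in the loaded configuration.
    BlockFixer fixer = new BlockFixer(conf);
    Thread fixerThread = new Thread(fixer, "BlockFixer");
    fixerThread.setDaemon(true);
    fixerThread.start();

    // Let the fixer run for a while, then ask it to stop.
    Thread.sleep(10 * 60 * 1000);
    fixer.running = false;   // run()/doFix() check this flag between passes
    fixerThread.interrupt(); // wake it from Thread.sleep in doFix()
    fixerThread.join();
    System.out.println("Files fixed: " + fixer.filesFixed());
  }
}

The only public control surface the class exposes is the volatile running flag and the filesFixed() counter, so the sketch stops the loop by clearing the flag and interrupting the sleeping thread rather than calling any dedicated shutdown method.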