Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.inmobi.conduit.distcp.tools.mapred;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.EnumSet;
import java.util.Map;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.Mapper;

import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.DistCpOptions.FileAttribute;
import com.inmobi.conduit.distcp.tools.util.DistCpUtils;
import com.inmobi.conduit.distcp.tools.util.HadoopCompat;
import com.inmobi.conduit.distcp.tools.util.RetriableCommand;
import com.inmobi.conduit.distcp.tools.util.ThrottledInputStream;
import com.inmobi.messaging.util.AuditUtil;

/**
 * This class extends RetriableCommand to implement the copy of files,
 * with retries on failure.
 */
public class RetriableFileCopyCommand extends RetriableCommand {

  private static final Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class);
  private static final int BUFFER_SIZE = 8 * 1024;
  private static final int WINDOW_SIZE = 60;
  private CompressionCodecFactory compressionCodecs = null;

  /**
   * Constructor, taking a description of the action.
   * @param description Verbose description of the copy operation.
   */
  public RetriableFileCopyCommand(String description) {
    super(description);
  }

  /**
   * Implementation of RetriableCommand::doExecute().
   * This is the actual copy-implementation.
   * @param arguments Argument-list to the command.
   * @return Number of bytes copied.
   * @throws Exception CopyReadException, if there are read-failures. All other
   *                   failures are IOExceptions.
   */
  @SuppressWarnings("unchecked")
  @Override
  protected Object doExecute(Object... arguments) throws Exception {
    assert arguments.length == 5 : "Unexpected argument list.";
    FileStatus source = (FileStatus) arguments[0];
    assert !source.isDir() : "Unexpected file-status. Expected file.";
    Path target = (Path) arguments[1];
    Mapper.Context context = (Mapper.Context) arguments[2];
    EnumSet<FileAttribute> fileAttributes = (EnumSet<FileAttribute>) arguments[3];
    Map<Long, Long> received = (Map<Long, Long>) arguments[4];
    return doCopy(source, target, context, fileAttributes, received);
  }

  private long doCopy(FileStatus sourceFileStatus, Path target,
                      Mapper.Context context, EnumSet<FileAttribute> fileAttributes,
                      Map<Long, Long> received) throws IOException {
    // Copy into a temporary file first; promote it to the target only after
    // the length and checksum have been verified.
    Path tmpTargetPath = getTmpFile(target, context);
    final Configuration configuration = HadoopCompat.getTaskConfiguration(context);
    FileSystem targetFS = target.getFileSystem(configuration);
    compressionCodecs = new CompressionCodecFactory(context.getConfiguration());

    try {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Copying " + sourceFileStatus.getPath() + " to " + target);
        LOG.debug("Tmp-file path: " + tmpTargetPath);
      }
      FileSystem sourceFS = sourceFileStatus.getPath().getFileSystem(configuration);
      long bytesRead = copyToTmpFile(tmpTargetPath, targetFS, sourceFileStatus,
          context, fileAttributes, received);

      compareFileLengths(sourceFileStatus, tmpTargetPath, configuration, bytesRead);
      if (bytesRead > 0) {
        compareCheckSums(sourceFS, sourceFileStatus.getPath(), targetFS, tmpTargetPath);
      }
      promoteTmpToTarget(tmpTargetPath, target, targetFS);
      return bytesRead;
    } finally {
      if (targetFS.exists(tmpTargetPath))
        targetFS.delete(tmpTargetPath, false);
    }
  }

  private long copyToTmpFile(Path tmpTargetPath, FileSystem targetFS,
                             FileStatus sourceFileStatus, Mapper.Context context,
                             EnumSet<FileAttribute> fileAttributes,
                             Map<Long, Long> received) throws IOException {
    OutputStream outStream = new BufferedOutputStream(targetFS.create(
        tmpTargetPath, true, BUFFER_SIZE,
        getReplicationFactor(fileAttributes, sourceFileStatus, targetFS),
        getBlockSize(fileAttributes, sourceFileStatus, targetFS), context));
    return copyBytes(sourceFileStatus, outStream, BUFFER_SIZE, context, received);
  }

  private void compareFileLengths(FileStatus sourceFileStatus, Path target,
                                  Configuration configuration, long bytesRead)
      throws IOException {
    final Path sourcePath = sourceFileStatus.getPath();
    FileSystem fs = sourcePath.getFileSystem(configuration);
    if (fs.getFileStatus(sourcePath).getLen() != bytesRead)
      throw new IOException("Mismatch in length of source:" + sourcePath
          + " and target:" + target);
  }

  private void compareCheckSums(FileSystem sourceFS, Path source,
                                FileSystem targetFS, Path target) throws IOException {
    if (!DistCpUtils.checksumsAreEqual(sourceFS, source, targetFS, target))
      throw new IOException("Check-sum mismatch between " + source + " and " + target);
  }

  // If target file exists and unable to delete target - fail
  // If target doesn't exist and unable to create parent folder - fail
  // If target is successfully deleted and parent exists, if rename fails - fail
  private void promoteTmpToTarget(Path tmpTarget, Path target, FileSystem fs)
      throws IOException {
    if ((fs.exists(target) && !fs.delete(target, false))
        || (!fs.exists(target.getParent()) && !fs.mkdirs(target.getParent()))
        || !fs.rename(tmpTarget, target)) {
      throw new IOException("Failed to promote tmp-file:" + tmpTarget
          + " to: " + target);
    }
  }

  private Path getTmpFile(Path target, Mapper.Context context) {
    Path targetWorkPath = new Path(HadoopCompat.getTaskConfiguration(context)
        .get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
    Path root = target.equals(targetWorkPath)
        ? targetWorkPath.getParent() : targetWorkPath;
    LOG.info("Creating temp file: "
        + new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString()));
    return new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString());
  }

  private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream,
                         int bufferSize, Mapper.Context context,
                         Map<Long, Long> received) throws IOException {
    Path source = sourceFileStatus.getPath();
    ThrottledInputStream inStream = null;
    final CompressionCodec codec = compressionCodecs.getCodec(source);
    InputStream compressedIn = null;
    OutputStream compressedOut = null;
    BufferedReader reader = null;
    long numberOfLinesRead = 0;

    try {
      inStream = getInputStream(source, HadoopCompat.getTaskConfiguration(context));
      compressedIn = codec.createInputStream(inStream);
      compressedOut = codec.createOutputStream(outStream);
      // LineReader reader = new LineReader(compressedIn,
      //     context.getConfiguration(), null);
      reader = new BufferedReader(new InputStreamReader(compressedIn));
      byte[] bytesRead = readLine(reader);
      while (bytesRead != null) {
        numberOfLinesRead++;
        compressedOut.write(bytesRead);
        compressedOut.write("\n".getBytes());
        updateContextStatus(inStream.getTotalBytesRead(), context,
            sourceFileStatus, numberOfLinesRead);
        if (received != null) {
          byte[] decodedMsg = Base64.decodeBase64(bytesRead);
          incrementReceived(decodedMsg, received);
        }
        bytesRead = readLine(reader);
      }
      HadoopCompat.incrementCounter(HadoopCompat.getCounter(context,
          CopyMapper.Counter.SLEEP_TIME_MS), inStream.getTotalSleepTime());
      LOG.info("STATS: " + inStream);
    } finally {
      IOUtils.cleanup(LOG, inStream, reader, compressedIn);
      try {
        if (compressedOut != null)
          compressedOut.close();
        outStream.close();
      } catch (IOException exception) {
        LOG.error("Could not close output-stream. ", exception);
        throw exception;
      }
    }
    return inStream.getTotalBytesRead();
  }

  private static Long getWindow(Long timestamp) {
    // Round the timestamp (in milliseconds) down to the start of its
    // WINDOW_SIZE-second window.
    Long window = timestamp - (timestamp % (WINDOW_SIZE * 1000));
    return window;
  }

  private static void incrementReceived(byte[] msg, Map<Long, Long> received) {
    long timestamp = AuditUtil.getTimestamp(msg);
    long window = getWindow(timestamp);
    if (timestamp != -1) {
      if (received.containsKey(window)) {
        received.put(window, received.get(window) + 1);
      } else {
        received.put(window, Long.valueOf(1));
      }
    }
  }

  private void updateContextStatus(long totalBytesRead, Mapper.Context context,
                                   FileStatus sourceFileStatus, long numberOfLinesRead) {
    StringBuilder message = new StringBuilder(DistCpUtils.getFormatter()
        .format(totalBytesRead * 100.0f / sourceFileStatus.getLen()));
    message.append("% ").append(description).append(" [")
        .append(DistCpUtils.getStringDescriptionFor(totalBytesRead)).append('/')
        .append(DistCpUtils.getStringDescriptionFor(sourceFileStatus.getLen())).append(']')
        .append(" number of lines read: ").append(String.valueOf(numberOfLinesRead));
    HadoopCompat.setStatus(context, message.toString());
  }

  private static int readBytes(InputStream inStream, byte buf[]) throws IOException {
    try {
      return inStream.read(buf);
    } catch (IOException e) {
      throw new CopyReadException(e);
    }
  }

  private static byte[] readLine(BufferedReader reader) throws IOException {
    String line = reader.readLine();
    if (line == null)
      return null;
    return line.getBytes();
  }

  private static ThrottledInputStream getInputStream(Path path, Configuration conf)
      throws IOException {
    try {
      FileSystem fs = path.getFileSystem(conf);
      long bandwidthKB = getAllowedBandwidth(conf);
      return new ThrottledInputStream(new BufferedInputStream(fs.open(path)),
          bandwidthKB * 1024);
    } catch (IOException e) {
      throw new CopyReadException(e);
    }
  }

  private static long getAllowedBandwidth(Configuration conf) {
    return (long) conf.getInt(DistCpConstants.CONF_LABEL_BANDWIDTH_KB,
        DistCpConstants.DEFAULT_BANDWIDTH_KB);
  }

  private static short getReplicationFactor(EnumSet<FileAttribute> fileAttributes,
                                            FileStatus sourceFile, FileSystem targetFS) {
    return fileAttributes.contains(FileAttribute.REPLICATION)
        ? sourceFile.getReplication() : targetFS.getDefaultReplication();
  }

  private static long getBlockSize(EnumSet<FileAttribute> fileAttributes,
                                   FileStatus sourceFile, FileSystem targetFS) {
    return fileAttributes.contains(FileAttribute.BLOCKSIZE)
        ? sourceFile.getBlockSize() : targetFS.getDefaultBlockSize();
  }

  /**
   * Special subclass of IOException. This is used to distinguish read-operation
   * failures from other kinds of IOExceptions.
   * The failure to read from source is dealt with specially, in the CopyMapper.
   * Such failures may be skipped if the DistCpOptions indicate so.
   * Write failures are intolerable, and amount to CopyMapper failure.
   */
  public static class CopyReadException extends IOException {
    public CopyReadException(Throwable rootCause) {
      super(rootCause);
    }
  }
}
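
For orientation, here is a minimal sketch of how a mapper such as CopyMapper might drive this command. It assumes that RetriableCommand exposes an execute(Object...) entry point that wraps doExecute with the retry loop, as in Apache Hadoop's DistCp, from which this code derives; sourceFileStatus, targetPath, and receivedCounts are illustrative local variables, not part of the class above.

    // Hypothetical usage inside a map task; only RetriableFileCopyCommand's own
    // argument contract (five positional arguments) is taken from the class above.
    long bytesCopied = (Long) new RetriableFileCopyCommand("Copying " + sourceFileStatus.getPath())
        .execute(sourceFileStatus,   // FileStatus of the source file (must not be a directory)
                 targetPath,         // destination Path under the target work directory
                 context,            // Mapper.Context of the running task
                 EnumSet.of(FileAttribute.REPLICATION, FileAttribute.BLOCKSIZE),
                 receivedCounts);    // Map<Long, Long>: 60-second window -> messages copied, may be null

If the copy fails with a CopyReadException, the surrounding CopyMapper can choose to skip the file (when the DistCpOptions allow it); any other IOException fails the task.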
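
The read side is throttled: getInputStream wraps the source in a ThrottledInputStream whose limit is getAllowedBandwidth (in KB, from CONF_LABEL_BANDWIDTH_KB) times 1024 bytes per second. A hedged configuration sketch follows; the concrete key string and default value live in DistCpConstants and the chosen limit of 2048 KB is purely illustrative.

    // Illustrative only: cap per-task read bandwidth at roughly 2 MB/s
    // before the job is submitted.
    Configuration conf = new Configuration();
    conf.setInt(DistCpConstants.CONF_LABEL_BANDWIDTH_KB, 2048);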