/*
 * Copyright 2008-2013 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package voldemort.server.protocol.hadoop;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicInteger;

import javax.management.ObjectName;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

import voldemort.VoldemortException;
import voldemort.annotations.jmx.JmxGetter;
import voldemort.server.VoldemortConfig;
import voldemort.server.protocol.admin.AsyncOperationStatus;
import voldemort.store.readonly.FileFetcher;
import voldemort.store.readonly.ReadOnlyStorageMetadata;
import voldemort.store.readonly.checksum.CheckSum;
import voldemort.store.readonly.checksum.CheckSum.CheckSumType;
import voldemort.utils.ByteUtils;
import voldemort.utils.DynamicEventThrottler;
import voldemort.utils.DynamicThrottleLimit;
import voldemort.utils.EventThrottler;
import voldemort.utils.JmxUtils;
import voldemort.utils.Time;
import voldemort.utils.Utils;

import com.linkedin.tusk.RestFSException;
import com.linkedin.tusk.RestFileStatus;
import com.linkedin.tusk.RestFileSystem;

/**
 * A {@link FileFetcher} implementation that copies read-only store files from
 * HDFS through the REST interface (RestHdfsClient), with optional bandwidth
 * throttling, retries and checksum verification of the copied files.
 */
public class RestHadoopFetcher implements FileFetcher {

    private static final Logger logger = Logger.getLogger(RestHadoopFetcher.class);
    private static final AtomicInteger copyCount = new AtomicInteger(0);

    private final Long maxBytesPerSecond, reportingIntervalBytes;
    private EventThrottler throttler = null;
    private DynamicThrottleLimit globalThrottleLimit = null;
    private final int bufferSize;
    private AsyncOperationStatus status;
    private long minBytesPerSecond = 0;
    private long retryDelayMs = 0;
    private int maxAttempts = 0;

    public RestHadoopFetcher(VoldemortConfig config) {
        this(null,
             null,
             config.getReadOnlyFetcherReportingIntervalBytes(),
             config.getFetcherBufferSize(),
             config.getReadOnlyFetcherMinBytesPerSecond(),
             config.getReadOnlyFetchRetryCount(),
             config.getReadOnlyFetchRetryDelayMs());
        logger.info("Created Rest-based hdfs fetcher with no dynamic throttler, buffer size "
                    + bufferSize + ", reporting interval bytes " + reportingIntervalBytes);
    }

    public RestHadoopFetcher(VoldemortConfig config, DynamicThrottleLimit dynThrottleLimit) {
        this(dynThrottleLimit,
             null,
             config.getReadOnlyFetcherReportingIntervalBytes(),
             config.getFetcherBufferSize(),
             config.getReadOnlyFetcherMinBytesPerSecond(),
             config.getReadOnlyFetchRetryCount(),
             config.getReadOnlyFetchRetryDelayMs());
        logger.info("Created Rest-based hdfs fetcher with throttle rate " + dynThrottleLimit.getRate()
                    + ", buffer size " + bufferSize + ", reporting interval bytes "
                    + reportingIntervalBytes);
    }

    // Test-only constructor
    public RestHadoopFetcher() {
        this((Long) null, VoldemortConfig.REPORTING_INTERVAL_BYTES,
             VoldemortConfig.DEFAULT_BUFFER_SIZE);
    }

    // Test-only constructor
    public RestHadoopFetcher(Long maxBytesPerSecond, Long reportingIntervalBytes, int bufferSize) {
        this(null, maxBytesPerSecond, reportingIntervalBytes, bufferSize, 0, 3, 1000);
    }

    public RestHadoopFetcher(DynamicThrottleLimit dynThrottleLimit,
                             Long maxBytesPerSecond,
                             Long reportingIntervalBytes,
                             int bufferSize,
                             long minBytesPerSecond,
                             int retryCount,
                             long retryDelayMs) {
        if (maxBytesPerSecond != null) {
            this.maxBytesPerSecond = maxBytesPerSecond;
            this.throttler = new EventThrottler(this.maxBytesPerSecond);
        } else if (dynThrottleLimit != null && dynThrottleLimit.getRate() != 0) {
            this.maxBytesPerSecond = dynThrottleLimit.getRate();
            this.throttler = new DynamicEventThrottler(dynThrottleLimit);
            this.globalThrottleLimit = dynThrottleLimit;
            logger.info("Initializing Dynamic Event throttler with rate : " + this.maxBytesPerSecond
                        + " bytes / sec");
        } else {
            this.maxBytesPerSecond = null;
        }

        this.reportingIntervalBytes = Utils.notNull(reportingIntervalBytes);
        this.bufferSize = bufferSize;
        this.status = null;
        this.minBytesPerSecond = minBytesPerSecond;
        this.maxAttempts = retryCount + 1;
        this.retryDelayMs = retryDelayMs;
    }

    @Override
    public File fetch(String sourceFileUrl, String destinationFile) throws IOException {
        // add this job to the throttler
        addThrottledJob();

        ObjectName jmxName = null;
        try {
            // authenticate against secured HDFS
            RestHadoopAuth.loginSecuredHdfs();

            // Instantiate the RestFS.
            // Replacing webhdfs with http here is a hack: the BnP job still
            // constructs URLs with the webhdfs scheme. This code has to be
            // released to all servers before BnP is modified; otherwise servers
            // would immediately start using http without going through the
            // RESTHdfsClient and the operations would fail.
            sourceFileUrl = sourceFileUrl.replace("webhdfs", "http");
            URL sourceUrl = new URL(sourceFileUrl);
            RestFileSystem rfs = new RestFileSystem(sourceUrl.getProtocol() + "://"
                                                    + sourceUrl.getHost() + ":"
                                                    + sourceUrl.getPort());
            String fullyQualifiedFileName = sourceUrl.getFile();
            CopyStats stats = new CopyStats(fullyQualifiedFileName,
                                            sizeOfPath(rfs, fullyQualifiedFileName));
            jmxName = JmxUtils.registerMbean("hdfs-copy-" + copyCount.getAndIncrement(), stats);

            File destination = new File(destinationFile);
            if (destination.exists()) {
                throw new VoldemortException("Version directory " + destination.getAbsolutePath()
                                             + " already exists");
            }

            logger.info("Starting fetch for : " + sourceFileUrl);
            boolean result = fetch(rfs, fullyQualifiedFileName, destination, stats);
            logger.info("Completed fetch : " + sourceFileUrl);

            if (result) {
                return destination;
            } else {
                return null;
            }
        } catch (RestFSException rfse) {
            logger.error("Encountered exception while accessing hadoop via RestHdfsClient : " + rfse, rfse);
            throw new VoldemortException("Error while accessing hadoop via RestHdfsClient : " + rfse);
        } catch (Throwable te) {
            logger.error("Error thrown while trying to get Hadoop filesystem", te);
            throw new VoldemortException("Error thrown while trying to get Hadoop filesystem : " + te);
        } finally {
            removeThrottledJob();
            if (jmxName != null)
                JmxUtils.unregisterMbean(jmxName);
        }
    }

    @Override
    public void setAsyncOperationStatus(AsyncOperationStatus status) {
        this.status = status;
    }

    private void addThrottledJob() throws VoldemortException {
        if (this.globalThrottleLimit != null) {
            if (this.globalThrottleLimit.getSpeculativeRate() < this.minBytesPerSecond)
                throw new VoldemortException("Too many push jobs.");
            this.globalThrottleLimit.incrementNumJobs();
        }
    }

    private void removeThrottledJob() {
        if (this.globalThrottleLimit != null) {
            this.globalThrottleLimit.decrementNumJobs();
        }
    }

    private long sizeOfPath(RestFileSystem rfs, String path) throws IOException, RestFSException {
        long size = 0;
        RestFileStatus[] statuses = rfs.listStatus(path).toArray();
        if (statuses != null) {
            for (RestFileStatus status : statuses) {
                if (status.isDir())
                    size += sizeOfPath(rfs, path + File.separator + status.getPathSuffix());
                else
                    size += status.getLength();
            }
        }
        return size;
    }

    private void sleepForRetryDelayMs() {
        if (retryDelayMs > 0) {
            try {
                Thread.sleep(retryDelayMs);
            } catch (InterruptedException ie) {
                logger.error("Fetcher interrupted while waiting to retry", ie);
            }
        }
    }

    /**
     * Function to copy a file from the given filesystem with a checksum of type
     * 'checkSumType' computed and returned. In case an error occurs during such
     * a copy, we retry for a maximum of maxAttempts attempts.
     *
     * @param rfs RestFileSystem used to copy the file
     * @param source Source path of the file to copy
     * @param dest Destination path of the file on the local machine
     * @param stats Stats for measuring the transfer progress
     * @param checkSumType Type of the Checksum to be computed for this file
     * @return A Checksum (generator) of type checkSumType which contains the
     *         computed checksum of the copied file
     * @throws Throwable
     */
    private CheckSum copyFileWithCheckSum(RestFileSystem rfs,
                                          String source,
                                          File dest,
                                          CopyStats stats,
                                          CheckSumType checkSumType) throws Throwable {
        CheckSum fileCheckSumGenerator = null;
        logger.info("Starting copy of " + source + " to " + dest);

        BufferedInputStream input = null;
        OutputStream output = null;
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            boolean success = true;
            long totalBytesRead = 0;
            boolean fsOpened = false;
            try {
                // Create a per file checksum generator
                if (checkSumType != null) {
                    fileCheckSumGenerator = CheckSum.getInstance(checkSumType);
                }

                logger.info("Attempt " + attempt + " at copy of " + source + " to " + dest);

                input = new BufferedInputStream(rfs.openFile(source).getInputStream());
                fsOpened = true;
                output = new BufferedOutputStream(new FileOutputStream(dest));
                byte[] buffer = new byte[bufferSize];

                while (true) {
                    int read = input.read(buffer);
                    if (read < 0) {
                        break;
                    } else {
                        output.write(buffer, 0, read);
                    }
                    totalBytesRead += read;

                    // Update the per file checksum
                    if (fileCheckSumGenerator != null) {
                        fileCheckSumGenerator.update(buffer, 0, read);
                    }

                    // Check if we need to throttle the fetch
                    if (throttler != null) {
                        throttler.maybeThrottle(read);
                    }

                    stats.recordBytes(read);
                    if (stats.getBytesSinceLastReport() > reportingIntervalBytes) {
                        NumberFormat format = NumberFormat.getNumberInstance();
                        format.setMaximumFractionDigits(2);
                        logger.info(stats.getTotalBytesCopied() / (1024 * 1024) + " MB copied at "
                                    + format.format(stats.getBytesPerSecond() / (1024 * 1024))
                                    + " MB/sec - " + format.format(stats.getPercentCopied())
                                    + " % complete, destination:" + dest);
                        if (this.status != null) {
                            this.status.setStatus(stats.getTotalBytesCopied() / (1024 * 1024)
                                                  + " MB copied at "
                                                  + format.format(stats.getBytesPerSecond() / (1024 * 1024))
                                                  + " MB/sec - "
                                                  + format.format(stats.getPercentCopied())
                                                  + " % complete, destination:" + dest);
                        }
                        stats.reset();
                    }
                }
                // at this point, we are done!
                logger.info("Completed copy of " + source + " to " + dest);
            } catch (Throwable te) {
                success = false;
                if (!fsOpened) {
                    logger.error("Error while opening the file stream to " + source, te);
                } else {
                    logger.error("Error while copying file " + source + " after " + totalBytesRead
                                 + " bytes.", te);
                }
                if (te.getCause() != null) {
                    logger.error("Cause of error ", te.getCause());
                }

                if (attempt < maxAttempts - 1) {
                    logger.info("Will retry copying after " + retryDelayMs + " ms");
                    sleepForRetryDelayMs();
                } else {
                    logger.info("Fetcher giving up copy after " + maxAttempts + " attempts");
                    throw te;
                }
            } finally {
                IOUtils.closeQuietly(output);
                IOUtils.closeQuietly(input);
                if (success) {
                    break;
                }
            }
        }
        return fileCheckSumGenerator;
    }

    private boolean fetch(RestFileSystem rfs, String source, File dest, CopyStats stats)
            throws Throwable, RestFSException {
        boolean fetchSucceed = false;
        if (rfs.fileStatus(source).isDir()) {
            Utils.mkdirs(dest);
            RestFileStatus[] statuses = rfs.listStatus(source).toArray();
            if (statuses != null && statuses.length > 0) {
                // sort the files so that index files come last. Maybe
                // this will help keep them cached until the swap
                Arrays.sort(statuses, new IndexFileLastComparator());
                byte[] origCheckSum = null;
                CheckSumType checkSumType = CheckSumType.NONE;

                // Do a checksum of checksum - Similar to HDFS
                CheckSum checkSumGenerator = null;
                CheckSum fileCheckSumGenerator = null;

                for (RestFileStatus status : statuses) {
                    String fileNameWithAbsolutePath = status.getAbsolutePath();
                    String shortFileName = status.getPathSuffix();

                    logger.info("fetching file: " + fileNameWithAbsolutePath);

                    // Kept for backwards compatibility
                    if (shortFileName.contains("checkSum.txt")) {
                        logger.warn("Found checksum file in old format: " + shortFileName);
                    } else if (shortFileName.contains(".metadata")) {
                        logger.debug("Reading .metadata");

                        // Read metadata into local file
                        File copyLocation = new File(dest, shortFileName);
                        copyFileWithCheckSum(rfs, fileNameWithAbsolutePath, copyLocation, stats, null);

                        // Open the local file to initialize checksum
                        ReadOnlyStorageMetadata metadata;
                        try {
                            metadata = new ReadOnlyStorageMetadata(copyLocation);
                        } catch (IOException e) {
                            logger.error("Error reading metadata file ", e);
                            throw new VoldemortException(e);
                        }

                        // Read checksum
                        String checkSumTypeString = (String) metadata.get(ReadOnlyStorageMetadata.CHECKSUM_TYPE);
                        String checkSumString = (String) metadata.get(ReadOnlyStorageMetadata.CHECKSUM);

                        if (checkSumTypeString != null && checkSumString != null) {
                            try {
                                origCheckSum = Hex.decodeHex(checkSumString.toCharArray());
                            } catch (DecoderException e) {
                                logger.error("Exception reading checksum file. Ignoring checksum ", e);
                                continue;
                            }

                            logger.debug("Checksum from .metadata "
                                         + new String(Hex.encodeHex(origCheckSum)));

                            // Define the Global checksum generator
                            checkSumType = CheckSum.fromString(checkSumTypeString);
                            checkSumGenerator = CheckSum.getInstance(checkSumType);
                        }
                    } else if (!shortFileName.startsWith(".")) {
                        // Read other (.data, .index) files
                        File copyLocation = new File(dest, shortFileName);
                        fileCheckSumGenerator = copyFileWithCheckSum(rfs,
                                                                     fileNameWithAbsolutePath,
                                                                     copyLocation,
                                                                     stats,
                                                                     checkSumType);

                        if (fileCheckSumGenerator != null && checkSumGenerator != null) {
                            byte[] checkSum = fileCheckSumGenerator.getCheckSum();
                            if (logger.isDebugEnabled()) {
                                logger.debug("Checksum for " + shortFileName + " - "
                                             + new String(Hex.encodeHex(checkSum)));
                            }
                            checkSumGenerator.update(checkSum);
                        }
                    }
                }

                logger.info("Completed reading all files from " + source + " to "
                            + dest.getAbsolutePath());

                // Check checksum
                if (checkSumType != CheckSumType.NONE) {
                    byte[] newCheckSum = checkSumGenerator.getCheckSum();
                    boolean checkSumComparison = (ByteUtils.compare(newCheckSum, origCheckSum) == 0);

                    logger.info("Checksum generated from streaming - "
                                + new String(Hex.encodeHex(newCheckSum)));
                    logger.info("Checksum on file - " + new String(Hex.encodeHex(origCheckSum)));
                    logger.info("Check-sum verification - " + checkSumComparison);

                    fetchSucceed = checkSumComparison;
                } else {
                    logger.info("No check-sum verification required");
                    fetchSucceed = true;
                }
            } else {
                logger.error("No files found under the source location: " + source);
            }
        } else {
            logger.error("Source " + source + " should be a directory");
        }
        return fetchSucceed;
    }

    public static class CopyStats {

        private final String fileName;
        private volatile long bytesSinceLastReport;
        private volatile long totalBytesCopied;
        private volatile long lastReportNs;
        private volatile long totalBytes;

        public CopyStats(String fileName, long totalBytes) {
            this.fileName = fileName;
            this.totalBytesCopied = 0L;
            this.bytesSinceLastReport = 0L;
            this.totalBytes = totalBytes;
            this.lastReportNs = System.nanoTime();
        }

        public void recordBytes(long bytes) {
            this.totalBytesCopied += bytes;
            this.bytesSinceLastReport += bytes;
        }

        public void reset() {
            this.bytesSinceLastReport = 0;
            this.lastReportNs = System.nanoTime();
        }

        public long getBytesSinceLastReport() {
            return bytesSinceLastReport;
        }

        public double getPercentCopied() {
            if (totalBytes == 0) {
                return 0.0;
            } else {
                return (double) (totalBytesCopied * 100) / (double) totalBytes;
            }
        }

        @JmxGetter(name = "totalBytesCopied", description = "The total number of bytes copied so far in this transfer.")
        public long getTotalBytesCopied() {
            return totalBytesCopied;
        }

        @JmxGetter(name = "bytesPerSecond", description = "The rate of the transfer in bytes/second.")
        public double getBytesPerSecond() {
            double elapsedSecs = (System.nanoTime() - lastReportNs) / (double) Time.NS_PER_SECOND;
            return bytesSinceLastReport / elapsedSecs;
        }

        @JmxGetter(name = "filename", description = "The file path being copied.")
        public String getFilename() {
            return this.fileName;
        }
    }

    /**
     * A comparator that sorts index files last. This is a heuristic for
     * retaining the index file in page cache until the swap occurs.
     */
    public static class IndexFileLastComparator implements Comparator<RestFileStatus> {

        @Override
        public int compare(RestFileStatus fs1, RestFileStatus fs2) {
            // directories before files
            if (fs1.isDir())
                return fs2.isDir() ? 0 : -1;
            if (fs2.isDir())
                return fs1.isDir() ? 0 : 1;
            String f1 = fs1.getPathSuffix(), f2 = fs2.getPathSuffix();

            // All metadata files given priority
            if (f1.endsWith("metadata"))
                return -1;
            if (f2.endsWith("metadata"))
                return 1;

            // if both same, lexicographically
            if ((f1.endsWith(".index") && f2.endsWith(".index"))
                || (f1.endsWith(".data") && f2.endsWith(".data"))) {
                return f1.compareToIgnoreCase(f2);
            }

            if (f1.endsWith(".index")) {
                return 1;
            } else {
                return -1;
            }
        }
    }

    /*
     * Main method for testing fetching
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 5)
            Utils.croak("USAGE: java " + RestHadoopFetcher.class.getName()
                        + " [url] [keytab location] [kerberos username] [kerberos realm] [kerberos kdc]");

        long MAX_BYTES_PER_SECOND = 1024 * 1024 * 1024;
        long REPORTING_INTERVAL_BYTES = VoldemortConfig.REPORTING_INTERVAL_BYTES;
        int BUFFER_SIZE = VoldemortConfig.DEFAULT_BUFFER_SIZE;

        String url = args[0];
        String keytabLocation = args[1];
        String kerberosUser = args[2];
        String realm = args[3];
        String kdc = args[4];

        // login
        RestHadoopAuth restAuth = new RestHadoopAuth(realm, kdc, kerberosUser, keytabLocation);
        restAuth.start();

        RestHadoopFetcher fetcher = new RestHadoopFetcher(null,
                                                          MAX_BYTES_PER_SECOND,
                                                          REPORTING_INTERVAL_BYTES,
                                                          BUFFER_SIZE,
                                                          0,
                                                          3,
                                                          1000);

        // start file fetching
        long start = System.currentTimeMillis();
        File location = fetcher.fetch(url, System.getProperty("user.home") + File.separator + start);

        // complete file fetching; print stats
        long size = location.length();
        double rate = size * Time.MS_PER_SECOND / (double) (System.currentTimeMillis() - start);
        NumberFormat nf = NumberFormat.getInstance();
        nf.setMaximumFractionDigits(2);
        System.out.println("Fetch to " + location + " completed: "
                           + nf.format(rate / (1024.0 * 1024.0)) + " MB/sec.");

        // logout
        restAuth.stop();
    }
}