Java tutorial: fetching read-only store files from HDFS with Voldemort's HdfsFetcher
/*
 * Copyright 2008-2009 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package voldemort.store.readonly.fetcher;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.security.PrivilegedExceptionAction;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicInteger;

import javax.management.ObjectName;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.log4j.Logger;

import voldemort.VoldemortException;
import voldemort.annotations.jmx.JmxGetter;
import voldemort.server.VoldemortConfig;
import voldemort.server.protocol.admin.AsyncOperationStatus;
import voldemort.store.readonly.FileFetcher;
import voldemort.store.readonly.ReadOnlyStorageMetadata;
import voldemort.store.readonly.checksum.CheckSum;
import voldemort.store.readonly.checksum.CheckSum.CheckSumType;
import voldemort.utils.ByteUtils;
import voldemort.utils.DynamicEventThrottler;
import voldemort.utils.DynamicThrottleLimit;
import voldemort.utils.EventThrottler;
import voldemort.utils.JmxUtils;
import voldemort.utils.Time;
import voldemort.utils.Utils;

/*
 * A fetcher that fetches the store files from HDFS
 */
public class HdfsFetcher implements FileFetcher {

    private static final Logger logger = Logger.getLogger(HdfsFetcher.class);

    private static String keytabPath = "";
    private static String kerberosPrincipal = VoldemortConfig.DEFAULT_KERBEROS_PRINCIPAL;

    private final Long maxBytesPerSecond, reportingIntervalBytes;
    private final int bufferSize;
    private static final AtomicInteger copyCount = new AtomicInteger(0);
    private AsyncOperationStatus status;
    private EventThrottler throttler = null;
    private long minBytesPerSecond = 0;
    private DynamicThrottleLimit globalThrottleLimit = null;
    private static final int NUM_RETRIES = 3;
    private VoldemortConfig voldemortConfig = null;

    public static final String FS_DEFAULT_NAME = "fs.default.name";

    /* Additional constructor invoked from ReadOnlyStoreManagementServlet */
    public HdfsFetcher(VoldemortConfig config) {
        this(null,
             null,
             config.getReportingIntervalBytes(),
             config.getFetcherBufferSize(),
             config.getMinBytesPerSecond(),
             config.getReadOnlyKeytabPath(),
             config.getReadOnlyKerberosUser());
        this.voldemortConfig = config;
        logger.info("Created hdfs fetcher with no dynamic throttler, buffer size " + bufferSize
                    + ", reporting interval bytes " + reportingIntervalBytes);
    }

    public HdfsFetcher(VoldemortConfig config, DynamicThrottleLimit dynThrottleLimit) {
        this(dynThrottleLimit,
             null,
             config.getReportingIntervalBytes(),
             config.getFetcherBufferSize(),
             config.getMinBytesPerSecond(),
             config.getReadOnlyKeytabPath(),
             config.getReadOnlyKerberosUser());
        this.voldemortConfig = config;
        logger.info("Created hdfs fetcher with throttle rate " + dynThrottleLimit.getRate()
                    + ", buffer size " + bufferSize + ", reporting interval bytes "
                    + reportingIntervalBytes);
    }

    public HdfsFetcher() {
        this((Long) null, VoldemortConfig.REPORTING_INTERVAL_BYTES, VoldemortConfig.DEFAULT_BUFFER_SIZE);
    }

    public HdfsFetcher(Long maxBytesPerSecond, Long reportingIntervalBytes, int bufferSize) {
        this(null, maxBytesPerSecond, reportingIntervalBytes, bufferSize, 0, "", "");
    }

    public HdfsFetcher(DynamicThrottleLimit dynThrottleLimit,
                       Long maxBytesPerSecond,
                       Long reportingIntervalBytes,
                       int bufferSize,
                       long minBytesPerSecond,
                       String keytabLocation,
                       String kerberosUser) {
        if (maxBytesPerSecond != null) {
            this.maxBytesPerSecond = maxBytesPerSecond;
            this.throttler = new EventThrottler(this.maxBytesPerSecond);
        } else if (dynThrottleLimit != null && dynThrottleLimit.getRate() != 0) {
            this.maxBytesPerSecond = dynThrottleLimit.getRate();
            this.throttler = new DynamicEventThrottler(dynThrottleLimit);
            this.globalThrottleLimit = dynThrottleLimit;
            logger.info("Initializing Dynamic Event throttler with rate : " + this.maxBytesPerSecond
                        + " bytes / sec");
        } else
            this.maxBytesPerSecond = null;
        this.reportingIntervalBytes = Utils.notNull(reportingIntervalBytes);
        this.bufferSize = bufferSize;
        this.status = null;
        this.minBytesPerSecond = minBytesPerSecond;
        HdfsFetcher.kerberosPrincipal = kerberosUser;
        HdfsFetcher.keytabPath = keytabLocation;
    }

    public File fetch(String sourceFileUrl, String destinationFile) throws IOException {
        String hadoopConfigPath = "";
        if (this.voldemortConfig != null) {
            hadoopConfigPath = this.voldemortConfig.getHadoopConfigPath();
        }
        return fetch(sourceFileUrl, destinationFile, hadoopConfigPath);
    }

    public File fetch(String sourceFileUrl, String destinationFile, String hadoopConfigPath)
            throws IOException {
        if (this.globalThrottleLimit != null) {
            if (this.globalThrottleLimit.getSpeculativeRate() < this.minBytesPerSecond)
                throw new VoldemortException("Too many push jobs.");
            this.globalThrottleLimit.incrementNumJobs();
        }

        ObjectName jmxName = null;
        try {
            final Configuration config = new Configuration();
            FileSystem fs = null;
            config.setInt("io.socket.receive.buffer", bufferSize);
            config.set("hadoop.rpc.socket.factory.class.ClientProtocol",
                       ConfigurableSocketFactory.class.getName());
            config.set("hadoop.security.group.mapping",
                       "org.apache.hadoop.security.ShellBasedUnixGroupsMapping");

            final Path path = new Path(sourceFileUrl);
            boolean isHftpBasedFetch = sourceFileUrl.length() > 4
                                       && sourceFileUrl.substring(0, 4).equals("hftp");
            logger.info("URL : " + sourceFileUrl + " and hftp protocol enabled = " + isHftpBasedFetch);
            logger.info("Hadoop path = " + hadoopConfigPath + " , keytab path = " + HdfsFetcher.keytabPath
                        + " , kerberos principal = " + HdfsFetcher.kerberosPrincipal);

            if (hadoopConfigPath.length() > 0 && !isHftpBasedFetch) {
                config.addResource(new Path(hadoopConfigPath + "/core-site.xml"));
                config.addResource(new Path(hadoopConfigPath + "/hdfs-site.xml"));

                String security = config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION);

                if (security == null || !security.equals("kerberos")) {
                    logger.error("Security isn't turned on in the conf: "
                                 + CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION + " = "
                                 + config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION));
                    logger.error("Please make sure that the Hadoop config directory path is valid.");
                    return null;
                } else {
                    logger.info("Security is turned on in the conf. Trying to authenticate ...");
                }
            }

            try {
                if (HdfsFetcher.keytabPath.length() > 0 && !isHftpBasedFetch) {
                    if (!new File(HdfsFetcher.keytabPath).exists()) {
                        logger.error("Invalid keytab file path. Please provide a valid keytab path");
                        return null;
                    }

                    // First login using the specified principal and keytab file
                    UserGroupInformation.setConfiguration(config);
                    UserGroupInformation.loginUserFromKeytab(HdfsFetcher.kerberosPrincipal,
                                                             HdfsFetcher.keytabPath);

                    /*
                     * If login is successful, get the filesystem object. NOTE:
                     * Ideally we do not need a doAs block for this. Consider
                     * removing it in the future once the Hadoop jars have the
                     * corresponding patch (tracked in the Hadoop Apache
                     * project: HDFS-3367)
                     */
                    try {
                        logger.info("I've logged in and am now Doasing as "
                                    + UserGroupInformation.getCurrentUser().getUserName());
                        fs = UserGroupInformation.getCurrentUser()
                                .doAs(new PrivilegedExceptionAction<FileSystem>() {

                                    public FileSystem run() throws Exception {
                                        FileSystem fs = path.getFileSystem(config);
                                        return fs;
                                    }
                                });
                    } catch (InterruptedException e) {
                        logger.error(e.getMessage());
                    } catch (Exception e) {
                        logger.error("Got an exception while getting the filesystem object: ");
                        logger.error("Exception class : " + e.getClass());
                        e.printStackTrace();
                        for (StackTraceElement et : e.getStackTrace()) {
                            logger.error(et.toString());
                        }
                    }
                } else {
                    fs = path.getFileSystem(config);
                }
            } catch (IOException e) {
                e.printStackTrace();
                logger.error("Error in authenticating or getting the Filesystem object !!!");
                return null;
            }

            CopyStats stats = new CopyStats(sourceFileUrl, sizeOfPath(fs, path));
            jmxName = JmxUtils.registerMbean("hdfs-copy-" + copyCount.getAndIncrement(), stats);

            File destination = new File(destinationFile);

            if (destination.exists()) {
                throw new VoldemortException("Version directory " + destination.getAbsolutePath()
                                             + " already exists");
            }

            logger.info("Starting fetch for : " + sourceFileUrl);
            boolean result = fetch(fs, path, destination, stats);
            logger.info("Completed fetch : " + sourceFileUrl);

            // Close the filesystem
            fs.close();

            if (result) {
                return destination;
            } else {
                return null;
            }
        } catch (IOException e) {
            logger.error("Error while getting Hadoop filesystem : " + e);
            return null;
        } finally {
            if (this.globalThrottleLimit != null) {
                this.globalThrottleLimit.decrementNumJobs();
            }
            if (jmxName != null)
                JmxUtils.unregisterMbean(jmxName);
        }
    }

    private boolean fetch(FileSystem fs, Path source, File dest, CopyStats stats) throws IOException {
        if (!fs.isFile(source)) {
            Utils.mkdirs(dest);
            FileStatus[] statuses = fs.listStatus(source);
            if (statuses != null) {
                // sort the files so that index files come last. Maybe
                // this will help keep them cached until the swap
                Arrays.sort(statuses, new IndexFileLastComparator());
                byte[] origCheckSum = null;
                CheckSumType checkSumType = CheckSumType.NONE;

                // Do a checksum of checksum - Similar to HDFS
                CheckSum checkSumGenerator = null;
                CheckSum fileCheckSumGenerator = null;

                for (FileStatus status : statuses) {

                    // Kept for backwards compatibility
                    if (status.getPath().getName().contains("checkSum.txt")) {
                        // Ignore old checksum files
                    } else if (status.getPath().getName().contains(".metadata")) {
                        logger.debug("Reading .metadata");
                        // Read metadata into local file
                        File copyLocation = new File(dest, status.getPath().getName());
                        copyFileWithCheckSum(fs, status.getPath(), copyLocation, stats, null);

                        // Open the local file to initialize checksum
                        ReadOnlyStorageMetadata metadata;
                        try {
                            metadata = new ReadOnlyStorageMetadata(copyLocation);
                        } catch (IOException e) {
                            logger.error("Error reading metadata file ", e);
                            throw new VoldemortException(e);
                        }

                        // Read checksum
                        String checkSumTypeString = (String) metadata.get(ReadOnlyStorageMetadata.CHECKSUM_TYPE);
                        String checkSumString = (String) metadata.get(ReadOnlyStorageMetadata.CHECKSUM);

                        if (checkSumTypeString != null && checkSumString != null) {
                            try {
                                origCheckSum = Hex.decodeHex(checkSumString.toCharArray());
                            } catch (DecoderException e) {
                                logger.error("Exception reading checksum file. Ignoring checksum ", e);
                                continue;
                            }

                            logger.debug("Checksum from .metadata "
                                         + new String(Hex.encodeHex(origCheckSum)));
                            checkSumType = CheckSum.fromString(checkSumTypeString);
                            checkSumGenerator = CheckSum.getInstance(checkSumType);
                            fileCheckSumGenerator = CheckSum.getInstance(checkSumType);
                        }
                    } else if (!status.getPath().getName().startsWith(".")) {
                        // Read other (.data , .index files)
                        File copyLocation = new File(dest, status.getPath().getName());
                        copyFileWithCheckSum(fs, status.getPath(), copyLocation, stats, fileCheckSumGenerator);

                        if (fileCheckSumGenerator != null && checkSumGenerator != null) {
                            byte[] checkSum = fileCheckSumGenerator.getCheckSum();
                            logger.debug("Checksum for " + status.getPath() + " - "
                                         + new String(Hex.encodeHex(checkSum)));
                            checkSumGenerator.update(checkSum);
                        }
                    }
                }

                logger.info("Completed reading all files from " + source.toString() + " to "
                            + dest.getAbsolutePath());

                // Check checksum
                if (checkSumType != CheckSumType.NONE) {
                    byte[] newCheckSum = checkSumGenerator.getCheckSum();
                    boolean checkSumComparison = (ByteUtils.compare(newCheckSum, origCheckSum) == 0);

                    logger.info("Checksum generated from streaming - "
                                + new String(Hex.encodeHex(newCheckSum)));
                    logger.info("Checksum on file - " + new String(Hex.encodeHex(origCheckSum)));
                    logger.info("Check-sum verification - " + checkSumComparison);

                    return checkSumComparison;
                } else {
                    logger.info("No check-sum verification required");
                    return true;
                }
            }
        }
        logger.error("Source " + source.toString() + " should be a directory");
        return false;
    }

    private void copyFileWithCheckSum(FileSystem fs,
                                      Path source,
                                      File dest,
                                      CopyStats stats,
                                      CheckSum fileCheckSumGenerator) throws IOException {
        logger.debug("Starting copy of " + source + " to " + dest);
        FSDataInputStream input = null;
        OutputStream output = null;

        for (int attempt = 0; attempt < NUM_RETRIES; attempt++) {
            boolean success = true;
            try {
                input = fs.open(source);
                output = new BufferedOutputStream(new FileOutputStream(dest));
                byte[] buffer = new byte[bufferSize];
                while (true) {
                    int read = input.read(buffer);
                    if (read < 0) {
                        break;
                    } else {
                        output.write(buffer, 0, read);
                    }

                    if (fileCheckSumGenerator != null)
                        fileCheckSumGenerator.update(buffer, 0, read);
                    if (throttler != null)
                        throttler.maybeThrottle(read);

                    stats.recordBytes(read);
                    if (stats.getBytesSinceLastReport() > reportingIntervalBytes) {
                        NumberFormat format = NumberFormat.getNumberInstance();
                        format.setMaximumFractionDigits(2);
                        logger.info(stats.getTotalBytesCopied() / (1024 * 1024) + " MB copied at "
                                    + format.format(stats.getBytesPerSecond() / (1024 * 1024))
                                    + " MB/sec - " + format.format(stats.getPercentCopied())
                                    + " % complete, destination:" + dest);
                        if (this.status != null) {
                            this.status.setStatus(stats.getTotalBytesCopied() / (1024 * 1024)
                                                  + " MB copied at "
                                                  + format.format(stats.getBytesPerSecond() / (1024 * 1024))
                                                  + " MB/sec - " + format.format(stats.getPercentCopied())
                                                  + " % complete, destination:" + dest);
                        }
                        stats.reset();
                    }
                }
                logger.info("Completed copy of " + source + " to " + dest);
            } catch (IOException ioe) {
                success = false;
                logger.error("Error during copying file ", ioe);
                ioe.printStackTrace();
                if (attempt < NUM_RETRIES - 1) {
                    logger.info("retrying copying");
                } else {
                    throw ioe;
                }
            } finally {
                IOUtils.closeQuietly(output);
                IOUtils.closeQuietly(input);
                if (success) {
                    break;
                }
            }
            logger.debug("Completed copy of " + source + " to " + dest);
        }
    }

    private long sizeOfPath(FileSystem fs, Path path) throws IOException {
        long size = 0;
        FileStatus[] statuses = fs.listStatus(path);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                if (status.isDir())
                    size += sizeOfPath(fs, status.getPath());
                else
                    size += status.getLen();
            }
        }
        return size;
    }

    public static class CopyStats {

        private final String fileName;
        private volatile long bytesSinceLastReport;
        private volatile long totalBytesCopied;
        private volatile long lastReportNs;
        private volatile long totalBytes;

        public CopyStats(String fileName, long totalBytes) {
            this.fileName = fileName;
            this.totalBytesCopied = 0L;
            this.bytesSinceLastReport = 0L;
            this.totalBytes = totalBytes;
            this.lastReportNs = System.nanoTime();
        }

        public void recordBytes(long bytes) {
            this.totalBytesCopied += bytes;
            this.bytesSinceLastReport += bytes;
        }

        public void reset() {
            this.bytesSinceLastReport = 0;
            this.lastReportNs = System.nanoTime();
        }

        public long getBytesSinceLastReport() {
            return bytesSinceLastReport;
        }

        public double getPercentCopied() {
            if (totalBytes == 0) {
                return 0.0;
            } else {
                return (double) (totalBytesCopied * 100) / (double) totalBytes;
            }
        }

        @JmxGetter(name = "totalBytesCopied", description = "The total number of bytes copied so far in this transfer.")
        public long getTotalBytesCopied() {
            return totalBytesCopied;
        }

        @JmxGetter(name = "bytesPerSecond", description = "The rate of the transfer in bytes/second.")
        public double getBytesPerSecond() {
            double ellapsedSecs = (System.nanoTime() - lastReportNs) / (double) Time.NS_PER_SECOND;
            return bytesSinceLastReport / ellapsedSecs;
        }

        @JmxGetter(name = "filename", description = "The file path being copied.")
        public String getFilename() {
            return this.fileName;
        }
    }

    /**
     * A comparator that sorts index files last. This is a heuristic for
     * retaining the index file in page cache until the swap occurs
     */
    public static class IndexFileLastComparator implements Comparator<FileStatus> {

        public int compare(FileStatus fs1, FileStatus fs2) {
            // directories before files
            if (fs1.isDir())
                return fs2.isDir() ? 0 : -1;
            if (fs2.isDir())
                return fs1.isDir() ? 0 : 1;

            String f1 = fs1.getPath().getName(), f2 = fs2.getPath().getName();

            // All metadata files given priority
            if (f1.endsWith("metadata"))
                return -1;
            if (f2.endsWith("metadata"))
                return 1;

            // if both same, lexicographically
            if ((f1.endsWith(".index") && f2.endsWith(".index"))
                || (f1.endsWith(".data") && f2.endsWith(".data"))) {
                return f1.compareToIgnoreCase(f2);
            }

            if (f1.endsWith(".index")) {
                return 1;
            } else {
                return -1;
            }
        }
    }

    public void setAsyncOperationStatus(AsyncOperationStatus status) {
        this.status = status;
    }

    /*
     * Main method for testing fetching
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1)
            Utils.croak("USAGE: java " + HdfsFetcher.class.getName()
                        + " url [keytab location] [kerberos username] [hadoop-config-path]");
        String url = args[0];

        String keytabLocation = "";
        String kerberosUser = "";
        String hadoopPath = "";
        if (args.length == 4) {
            keytabLocation = args[1];
            kerberosUser = args[2];
            hadoopPath = args[3];
        }

        long maxBytesPerSec = 1024 * 1024 * 1024;
        Path p = new Path(url);

        final Configuration config = new Configuration();
        final URI uri = new URI(url);
        config.setInt("io.file.buffer.size", VoldemortConfig.DEFAULT_BUFFER_SIZE);
        config.set("hadoop.rpc.socket.factory.class.ClientProtocol",
                   ConfigurableSocketFactory.class.getName());
        config.setInt("io.socket.receive.buffer", 1 * 1024 * 1024 - 10000);

        FileSystem fs = null;
        p = new Path(url);
        HdfsFetcher.keytabPath = keytabLocation;
        HdfsFetcher.kerberosPrincipal = kerberosUser;

        boolean isHftpBasedFetch = url.length() > 4 && url.substring(0, 4).equals("hftp");
        logger.info("URL : " + url + " and hftp protocol enabled = " + isHftpBasedFetch);

        if (hadoopPath.length() > 0 && !isHftpBasedFetch) {
            config.set("hadoop.security.group.mapping",
                       "org.apache.hadoop.security.ShellBasedUnixGroupsMapping");
            config.addResource(new Path(hadoopPath + "/core-site.xml"));
            config.addResource(new Path(hadoopPath + "/hdfs-site.xml"));

            String security = config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION);

            if (security == null || !security.equals("kerberos")) {
                logger.info("Security isn't turned on in the conf: "
                            + CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION + " = "
                            + config.get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION));
                logger.info("Fix that. Exiting.");
                return;
            } else {
                logger.info("Security is turned on in the conf. Trying to authenticate ...");
            }
        }

        try {
            // Get the filesystem object
            if (keytabLocation.length() > 0 && !isHftpBasedFetch) {
                UserGroupInformation.setConfiguration(config);
                UserGroupInformation.loginUserFromKeytab(kerberosUser, keytabLocation);

                final Path path = p;
                try {
                    logger.debug("I've logged in and am now Doasing as "
                                 + UserGroupInformation.getCurrentUser().getUserName());
                    fs = UserGroupInformation.getCurrentUser()
                            .doAs(new PrivilegedExceptionAction<FileSystem>() {

                                public FileSystem run() throws Exception {
                                    FileSystem fs = path.getFileSystem(config);
                                    return fs;
                                }
                            });
                } catch (InterruptedException e) {
                    logger.error(e.getMessage());
                } catch (Exception e) {
                    logger.error("Got an exception while getting the filesystem object: ");
                    logger.error("Exception class : " + e.getClass());
                    e.printStackTrace();
                    for (StackTraceElement et : e.getStackTrace()) {
                        logger.error(et.toString());
                    }
                }
            } else {
                fs = p.getFileSystem(config);
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Error in getting Hadoop filesystem object !!! Exiting !!!");
            System.exit(-1);
        }

        FileStatus status = fs.listStatus(p)[0];
        long size = status.getLen();
        HdfsFetcher fetcher = new HdfsFetcher(null,
                                              maxBytesPerSec,
                                              VoldemortConfig.REPORTING_INTERVAL_BYTES,
                                              VoldemortConfig.DEFAULT_BUFFER_SIZE,
                                              0,
                                              keytabLocation,
                                              kerberosUser);
        long start = System.currentTimeMillis();

        File location = fetcher.fetch(url,
                                      System.getProperty("java.io.tmpdir") + File.separator + start,
                                      hadoopPath);

        double rate = size * Time.MS_PER_SECOND / (double) (System.currentTimeMillis() - start);
        NumberFormat nf = NumberFormat.getInstance();
        nf.setMaximumFractionDigits(2);
        System.out.println("Fetch to " + location + " completed: "
                           + nf.format(rate / (1024.0 * 1024.0)) + " MB/sec.");
        fs.close();
    }
}