Java tutorial: HDFS utilities (com.twitter.elephanttwin.util.HdfsUtils)
/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.twitter.elephanttwin.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;

import javax.annotation.Nullable;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.GzipCodec;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.ByteStreams;
import com.google.common.io.InputSupplier;
import com.hadoop.compression.lzo.LzopCodec;
import com.twitter.common.base.MorePreconditions;

/**
 * Utilities for dealing with HDFS files.
 *
 * This class needs a lot of TLC to unjank historical accumulated cruft.
 */
public final class HdfsUtils {
  private static final Logger LOG = Logger.getLogger(HdfsUtils.class.getName());

  private static final String HDFS_SCHEME = "hdfs://";

  /**
   * Glob expression for part files.
   */
  public static final String PART_FILE_GLOB = "part-*";

  private HdfsUtils() {
    // utility
  }

  public static final PathFilter HIDDEN_FILE_FILTER = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      String name = p.getName();
      return !(name.startsWith(".") || name.startsWith("_"));
    }
  };

  /* return visible .lzo files only */
  public static final PathFilter lzoFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      String name = path.getName();
      return name.endsWith(".lzo") && !name.startsWith(".") && !name.startsWith("_");
    }
  };

  /* filter out hidden directories */
  public static final PathFilter hiddenDirectoryFilter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      String name = path.getName();
      return !name.startsWith(".") && !name.startsWith("_");
    }
  };
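  // Illustrative sketch (not part of the original class): given an already-open FileSystem fs,
  // the filters above can be passed to FileSystem.listStatus to skip hidden entries.
  // The directory path used here is hypothetical.
  //
  //   FileStatus[] visible = fs.listStatus(new Path("/logs/2012/03/01"),
  //       HdfsUtils.HIDDEN_FILE_FILTER);
  //   FileStatus[] lzoFiles = fs.listStatus(new Path("/logs/2012/03/01"),
  //       HdfsUtils.lzoFileFilter);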
  /**
   * Tests to see if an HDFS cluster is currently reachable.
   *
   * @param hdfsNamenode the namenode to test for reachability
   * @return {@code true} if the namenode is reachable.
   */
  public static boolean reachable(String hdfsNamenode) {
    Configuration config = new Configuration();
    config.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, hdfsNamenode);
    FileSystem fs;
    try {
      fs = FileSystem.get(config);
      if (fs != null) {
        fs.close();
        return true;
      }
      return false;
    } catch (IOException e) {
      return false;
    }
  }

  /**
   * Fetches {@code fileName} if it matches the HDFS naming convention (hdfs://<namenode>/<path>).
   *
   * @param fileName The name of the file to open.
   * @param localFileName The name of the local file to store to.
   * @return The local file, or null if no HDFS files matched.
   * @throws IOException
   */
  public static File getFile(String fileName, String localFileName) throws IOException {
    // TODO(Jimmy Lin): JohnS suggests something like:
    //   URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory(configuration))
    // And then just usual URL fetching? WTF uses this to get experiments from HDFS for example.
    // Need to investigate this.
    Preconditions.checkNotNull(localFileName);
    Preconditions.checkNotNull(fileName);

    @Nullable File file = null;
    if (HdfsUtils.isHdfsFile(fileName)) {
      // Separate the HDFS namenode from the rest of the file name.
      Iterator<String> iter = Splitter.on("/").split(fileName).iterator();
      String hdfsNameNode = iter.next() + "//" + iter.next() + iter.next();
      file = getHdfsFiles(hdfsNameNode, fileName, localFileName);
    }
    return file;
  }

  /**
   * Concatenates the content of all HDFS files matching {@code hdfsGlob} into a
   * file {@code localFilename}.
   *
   * @param hdfsNameNode The name of the Hadoop name node.
   * @param hdfsGlob Files matching this pattern will be fetched.
   * @param localFilename Name of the local file to store concatenated content.
   * @return The newly created file.
   * @throws IOException when the file cannot be created/written.
   */
  public static File getHdfsFiles(String hdfsNameNode, String hdfsGlob, String localFilename)
      throws IOException {
    Preconditions.checkNotNull(localFilename);
    Preconditions.checkNotNull(hdfsGlob);
    Preconditions.checkNotNull(hdfsNameNode);

    // Init the FS connection and the local file.
    Configuration config = new Configuration();
    config.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, hdfsNameNode);
    FileSystem dfs = FileSystem.get(config);
    File localFile = new File(localFilename);
    FileOutputStream localStream = new FileOutputStream(localFile);

    // Get the files that need downloading.
    FileStatus[] statuses = dfs.globStatus(new Path(hdfsGlob));
    LOG.info("Pattern " + hdfsGlob + " matched " + statuses.length + " HDFS files, "
        + "fetching to " + localFile.getCanonicalPath() + "...");

    // Append each file.
    int copiedChars = 0;
    FSDataInputStream remoteStream = null;
    try {
      for (FileStatus status : statuses) {
        Path src = status.getPath();
        try {
          remoteStream = dfs.open(src);
          copiedChars += IOUtils.copy(remoteStream, localStream);
        } catch (IOException e) {
          LOG.severe("Failed to open/copy " + src);
        } finally {
          IOUtils.closeQuietly(remoteStream);
        }
      }
    } finally {
      // Close the local file so the copied bytes are flushed to disk.
      IOUtils.closeQuietly(localStream);
    }
    LOG.info("Fetched " + copiedChars + " bytes to the local FS");
    return localFile;
  }
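  // Illustrative sketch (not part of the original class): concatenating the part files of a
  // job's output into one local file. The namenode address and paths are hypothetical.
  //
  //   File merged = HdfsUtils.getHdfsFiles(
  //       "hdfs://namenode:8020",
  //       "hdfs://namenode:8020/user/jobs/output/part-*",
  //       "/tmp/merged-output.txt");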
  // TODO(Jimmy Lin): This method is from common_internal.io.HdfsUtil.
  // Does basically the same thing as the ones above. Combine.
  public static File downloadFileFromHdfs(final FileSystem hdfs, final String hdfsFilePath,
      final String localDirName) throws IOException {
    Preconditions.checkNotNull(hdfs);
    MorePreconditions.checkNotBlank(hdfsFilePath);
    File localDir = new File(MorePreconditions.checkNotBlank(localDirName));
    Preconditions.checkArgument(localDir.exists(),
        "Local directory does not exist: " + localDirName);
    Preconditions.checkArgument(localDir.isDirectory(), "Not a directory: " + localDirName);

    Path path = new Path(hdfsFilePath);
    FSDataInputStream remoteStream = hdfs.open(path);
    String localFileName = localDirName.endsWith("/")
        ? localDirName + path.getName()
        : localDirName + "/" + path.getName();
    File localFile = new File(localFileName);
    FileOutputStream localStream = new FileOutputStream(localFile);
    try {
      IOUtils.copy(remoteStream, localStream);
    } finally {
      IOUtils.closeQuietly(remoteStream);
      IOUtils.closeQuietly(localStream);
    }
    return localFile;
  }

  // TODO(Jimmy Lin): This method is from mesos.util.HdfsUtil.
  // Does basically the same thing as the ones above. Combine.
  public static File downloadFileFromHdfs(Configuration conf, String fileUri,
      String localDirName, boolean overwrite) throws IOException {
    Path path = new Path(fileUri);
    FileSystem hdfs = path.getFileSystem(conf);
    FSDataInputStream remoteStream = hdfs.open(path);
    File localFile = new File(localDirName, path.getName());
    if (overwrite && localFile.exists()) {
      boolean success = localFile.delete();
      if (!success) {
        LOG.warning("Failed to delete file to be overwritten: " + localFile);
      }
    }
    FileOutputStream localStream = new FileOutputStream(localFile);
    try {
      IOUtils.copy(remoteStream, localStream);
    } finally {
      IOUtils.closeQuietly(remoteStream);
      IOUtils.closeQuietly(localStream);
    }
    return localFile;
  }

  /**
   * Tests whether a filename is a fully-qualified HDFS path name, i.e., hdfs://...
   *
   * @return true if {@code fileName} matches an HDFS file name.
   */
  public static boolean isHdfsFile(String fileName) {
    return fileName.startsWith(HDFS_SCHEME);
  }

  // TODO(Jimmy Lin): This method does basically the same thing as the above. Refactor to combine.
  public static boolean isValidFile(final FileSystem hdfs, final String path) throws IOException {
    FileStatus[] statuses = hdfs.listStatus(new Path(path));
    return (statuses.length == 1 && !statuses[0].isDir() && statuses[0].getBlockSize() > 0L);
  }

  public static FileSystem getHdfsFileSystem(final String hdfsConfigPath) throws IOException {
    return FileSystem.get(getHdfsConfiguration(hdfsConfigPath));
  }

  public static Configuration getHdfsConfiguration(String hdfsConfigPath) throws IOException {
    Configuration conf = new Configuration(true);
    conf.addResource(new Path(hdfsConfigPath));
    conf.reloadConfiguration();
    return conf;
  }

  /**
   * Recursively collects the statuses of files under {@code path} that pass the given filters.
   *
   * @param result collects the {@link FileStatus} instances that pass the filtering conditions
   * @param fs the file system to traverse
   * @param path the directory to start from
   * @param dirFilter filter applied to directories only
   * @param fileFilter filter applied to files only
   * @throws IOException
   */
  public static void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
      PathFilter dirFilter, PathFilter fileFilter) throws IOException {
    FileStatus[] stats = fs.listStatus(path);
    if (stats != null) {
      for (FileStatus stat : stats) {
        if (stat.isDir() && dirFilter.accept(stat.getPath())) {
          addInputPathRecursively(result, fs, stat.getPath(), dirFilter, fileFilter);
        } else {
          if (fileFilter.accept(stat.getPath())) {
            result.add(stat);
          }
        }
      }
    }
  }
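  // Illustrative sketch (not part of the original class): collecting all visible .lzo files
  // under a directory tree while skipping hidden subdirectories. The path is hypothetical.
  //
  //   List<FileStatus> lzoFiles = Lists.newArrayList();
  //   HdfsUtils.addInputPathRecursively(lzoFiles, fs, new Path("/logs/2012"),
  //       HdfsUtils.hiddenDirectoryFilter, HdfsUtils.lzoFileFilter);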
  /**
   * Returns {@link FileStatus} instances for all part files beneath the given parent URI.
   *
   * @param fs file system with which to retrieve part file status.
   * @param parent the parent URI within which part files should be globbed.
   * @return status of part files within the parent URI.
   * @throws IOException
   */
  public static List<FileStatus> partFileStatus(FileSystem fs, URI parent) throws IOException {
    return Lists.newArrayList(fs.globStatus(new Path(new Path(parent), PART_FILE_GLOB)));
  }

  public static Iterable<Path> getSubdirectories(final boolean recursive,
      final String baseDirectory, final FileSystem hdfs) throws IOException {
    FileStatus[] fileStat;
    Path basePath = new Path(baseDirectory);
    if (!hdfs.exists(basePath)) {
      throw new IOException(hdfs.getWorkingDirectory() + baseDirectory
          + " does not exist, cannot getSubdirectories");
    }
    FileStatus status = hdfs.getFileStatus(basePath);
    if (!status.isDir()) {
      LOG.warning("Tried to find subdirectories of " + status.getPath() + ", but it is a file");
      return Lists.newArrayList(status.getPath());
    }
    // Get the stat on all files in the source directory.
    fileStat = hdfs.listStatus(basePath);
    if (fileStat == null) {
      throw new IOException("FileSystem.listStatus(" + basePath
          + ") returned null, cannot getSubdirectories");
    }
    // Get paths to the files in the source directory.
    return Arrays.asList(FileUtil.stat2Paths(fileStat));
  }

  /**
   * Copies a directory on fs and its contents recursively to the given local path.
   *
   * @param fs FileSystem, can be HDFS or a local fs
   * @param srcPath The path of the directory whose contents you want to copy to local disk
   * @param localDestPath The local directory into which you want to copy the contents
   */
  public static void copyDirContentsToLocal(FileSystem fs, String srcPath, String localDestPath)
      throws IOException {
    Iterable<Path> indexFiles = HdfsUtils.getSubdirectories(true, srcPath, fs);
    localDestPath = localDestPath.endsWith("/") ? localDestPath : localDestPath + "/";
    for (Path indexFile : indexFiles) {
      LOG.info("Attempting to fetch from hdfs: " + indexFile.toString());
      HdfsUtils.downloadFileFromHdfs(fs, indexFile.toString(), localDestPath);
    }
  }

  /**
   * Returns the namenode from a fully-qualified HDFS path.
   *
   * @param hdfsPath fully-qualified HDFS path
   * @return the namenode, or {@code null} if the HDFS path is invalid
   */
  public static String getHdfsFileSystemName(@Nullable String hdfsPath) {
    if (hdfsPath == null) {
      return null;
    }
    if (!hdfsPath.startsWith("hdfs://")) {
      return null;
    }
    URI uri;
    try {
      uri = new URI(hdfsPath);
    } catch (URISyntaxException e) {
      return null;
    }
    if (uri.getPort() == -1) {
      return "hdfs://" + uri.getHost();
    }
    return "hdfs://" + uri.getHost() + ":" + uri.getPort();
  }

  // This is a helper method. Use openInputStream.
  private static DataInputStream openFile(String file) throws IOException {
    // Open a local file or a file on HDFS.
    if (HdfsUtils.isHdfsFile(file)) {
      // Find out what the namenode is, set up the proper configuration, and then
      // open the stream itself.
      Configuration config = new Configuration();
      config.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, getHdfsFileSystemName(file));
      FileSystem fs = FileSystem.get(config);
      // TODO(Jimmy Lin): above three lines are repeated a lot. Refactor to abstract.
      return fs.open(new Path(file));
    } else {
      // Open a local file.
      return new DataInputStream(new FileInputStream(new File(file)));
    }
  }
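  // Illustrative sketch (not part of the original class): extracting the namenode from a
  // fully-qualified path. The address is hypothetical.
  //
  //   String nameNode = HdfsUtils.getHdfsFileSystemName("hdfs://namenode:8020/logs/part-00000");
  //   // nameNode is "hdfs://namenode:8020"; a non-HDFS path such as "/tmp/foo" yields null.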
  /**
   * Opens an input stream to a file. Tries to do "the right thing": if the argument is a
   * fully-qualified URI (with the proper scheme) to a file on HDFS, it reads from HDFS;
   * otherwise it assumes a local file. It also tries to handle compression appropriately.
   *
   * @param dataFile file
   * @return input stream
   * @throws IOException
   */
  public static InputStream openInputStream(String dataFile) throws IOException {
    return getInputStreamSupplier(dataFile).getInput();
  }

  /**
   * Same as {@link #openInputStream(String)}, except with a supplier as an extra level of
   * indirection.
   *
   * @param dataFile file
   * @return InputSupplier that provides the input stream
   * @throws IOException
   */
  public static InputSupplier<InputStream> getInputStreamSupplier(String dataFile)
      throws IOException {
    Preconditions.checkNotNull(dataFile);
    final InputStream in;
    if (dataFile.endsWith(".lzo")) {
      // Properly handle compressed files.
      LzopCodec codec = new LzopCodec();
      codec.setConf(new Configuration());
      in = codec.createInputStream(openFile(dataFile));
    } else if (dataFile.endsWith(".gz")) {
      GzipCodec codec = new GzipCodec();
      codec.setConf(new Configuration());
      in = codec.createInputStream(openFile(dataFile));
    } else {
      in = openFile(dataFile);
    }

    return new InputSupplier<InputStream>() {
      @Override
      public InputStream getInput() throws IOException {
        // TODO(Jimmy Lin): JohnS noted that this breaks the contract of InputSupplier
        // on 2 counts---the open is not lazy/initiated by the caller of getInput and
        // getInput does not return a fresh stream on each call.
        return in;
      }
    };
  }

  /**
   * Returns an InputSupplier that returns a stream which is the concatenation of all source
   * streams. Note that this method may not make sense for certain binary-encoded streams, but
   * is convenient for processing multiple Hadoop part files in a line-delimited record format.
   *
   * @param streams source streams
   * @return InputSupplier supplying a concatenated stream
   * @throws IOException
   */
  public static InputSupplier<InputStream> getInputStreamSupplier(InputStream... streams)
      throws IOException {
    // TODO(Jimmy Lin): JohnS noted that this breaks the contract of InputSupplier
    // on 2 counts---the open is not lazy/initiated by the caller of getInput and
    // getInput does not return a fresh stream on each call.
    // Furthermore: This only makes sense with InputSupplier<InputStream> inputs and then
    // ByteStreams.join already has this covered.
    return ByteStreams.join(Iterables.transform(Lists.newArrayList(streams),
        new Function<InputStream, InputSupplier<InputStream>>() {
          @Override
          public InputSupplier<InputStream> apply(final InputStream stream) {
            return new InputSupplier<InputStream>() {
              @Override
              public InputStream getInput() throws IOException {
                return stream;
              }
            };
          }
        }));
  }

  /**
   * Returns an InputSupplier that returns a stream which is the concatenation of all source
   * files. Note that this method may not make sense for certain binary-encoded streams, but
   * is convenient for processing multiple Hadoop part files in a line-delimited record format.
   *
   * @param files source files
   * @return InputSupplier supplying a concatenated stream
   * @throws IOException
   */
  public static InputSupplier<InputStream> getInputStreamSupplier(List<String> files)
      throws IOException {
    return ByteStreams.join(Iterables.transform(files,
        new Function<String, InputSupplier<InputStream>>() {
          @Override
          public InputSupplier<InputStream> apply(final String file) {
            return new InputSupplier<InputStream>() {
              @Override
              public InputStream getInput() throws IOException {
                return openInputStream(file);
              }
            };
          }
        }));
  }
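  // Illustrative sketch (not part of the original class): reading several line-oriented part
  // files as a single concatenated stream. The paths are hypothetical.
  //
  //   InputSupplier<InputStream> supplier = HdfsUtils.getInputStreamSupplier(Lists.newArrayList(
  //       "hdfs://namenode:8020/output/part-00000.gz",
  //       "hdfs://namenode:8020/output/part-00001.gz"));
  //   BufferedReader reader =
  //       new BufferedReader(new InputStreamReader(supplier.getInput(), "UTF-8"));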
  /**
   * Gets the last modification date of an HDFS file.
   *
   * @param fs The file system.
   * @param fileUri URI of the file.
   * @return The last modification date of the file, in msecs since the epoch, or -1 if unknown.
   * @throws IOException
   */
  public static long getModificationTime(FileSystem fs, String fileUri) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(fileUri);
    FileStatus fileStatus = fs.getFileStatus(new Path(fileUri));
    return fileStatus == null ? -1 : fileStatus.getModificationTime();
  }

  /**
   * @param fs The file system.
   * @param fileUri URI of the file.
   * @return True if the file is an HDFS directory.
   * @throws IOException
   */
  public static boolean isDirectory(FileSystem fs, String fileUri) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(fileUri);
    FileStatus fileStatus = fs.getFileStatus(new Path(fileUri));
    return fileStatus != null && fileStatus.isDir();
  }

  /**
   * Reads UTF-8 lines of text directly from HDFS.
   */
  public static Collection<String> readLines(FileSystem fs, Path path) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    LOG.info("Reading from " + path.toUri());
    List<String> lines = Lists.newArrayList();
    try {
      if (!fs.exists(path)) {
        throw new IOException("File not found at " + path);
      }
      // TODO(Jimmy Lin): return CharStreams.readLines(new InputStreamReader(fs.open(path), "UTF-8"))
      // Note that this basically dups the functionality of HdfsFileTransfer.
      BufferedReader stream = new BufferedReader(new InputStreamReader(fs.open(path), "UTF-8"));
      String line;
      while ((line = stream.readLine()) != null) {
        lines.add(line);
      }
      LOG.info("Read " + lines.size() + " lines from " + path.toUri());
      return lines;
    } catch (IOException e) {
      LOG.warning("Failed to read " + path.toUri() + ": " + e.toString());
      throw e;
    }
  }

  /**
   * Writes newline-separated lines of text to HDFS as UTF-8.
   */
  public static void writeLines(FileSystem fs, Path path, Iterable<String> lines)
      throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    Preconditions.checkNotNull(lines);
    Writer stream = new BufferedWriter(new OutputStreamWriter(fs.create(path), "UTF-8"));
    try {
      for (String line : lines) {
        stream.write(line + "\n");
      }
    } finally {
      stream.close();
    }
  }
}
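To tie the pieces together, here is a minimal usage sketch. It is not part of the class above: the driver class name, the namenode address hdfs://namenode:8020, and the file path are all hypothetical, chosen only to illustrate how reachable, writeLines, and readLines might be combined.

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.twitter.elephanttwin.util.HdfsUtils;

/** Hypothetical driver exercising a few HdfsUtils methods; all addresses and paths are made up. */
public final class HdfsUtilsExample {
  public static void main(String[] args) throws IOException {
    String nameNode = "hdfs://namenode:8020";  // assumed namenode address

    // Bail out early if the cluster cannot be reached.
    if (!HdfsUtils.reachable(nameNode)) {
      System.err.println("Namenode not reachable: " + nameNode);
      return;
    }

    // Point a FileSystem at the namenode, the same way HdfsUtils does internally.
    Configuration conf = new Configuration();
    conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, nameNode);
    FileSystem fs = FileSystem.get(conf);
    try {
      // Write a few UTF-8 lines and read them back.
      Path path = new Path("/tmp/hdfs-utils-example.txt");
      HdfsUtils.writeLines(fs, path, Arrays.asList("first line", "second line"));
      Collection<String> lines = HdfsUtils.readLines(fs, path);
      System.out.println("Read back " + lines.size() + " lines");
    } finally {
      fs.close();
    }
  }
}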