List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
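Before the project examples below, a minimal sketch of the basic call pattern. The path and the plain Configuration are placeholders, not taken from any of the listed projects:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        // "/tmp/example.dat" is a placeholder path.
        Path path = new Path("/tmp/example.dat");
        FileSystem fs = path.getFileSystem(new Configuration());
        // getFileStatus throws FileNotFoundException if the path does not exist.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length=" + status.getLen()
                + " isDirectory=" + status.isDirectory()
                + " modificationTime=" + status.getModificationTime());
    }
}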
From source file:com.metamx.milano.pig.MilanoLoadFunc.java
License:Apache License
@Override
public void setLocation(String location, Job job) throws IOException {
    Path basePath = new Path(location);
    FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());
    Set<Path> paths = new TreeSet<Path>();

    if (fileSystem.getFileStatus(basePath).isDir()) {
        getPaths(basePath, paths, fileSystem);
    } else {
        paths.add(basePath);
    }

    log.info("Setting input to " + paths);
    FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
}
From source file:com.metamx.milano.pig.MilanoLoadFunc.java
License:Apache License
private void getPaths(Path baseDirectory, Set<Path> paths, FileSystem fileSystem) throws IOException {
    FileStatus[] files = fileSystem.listStatus(baseDirectory);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        if (fileStatus.isDir()) {
            getPaths(path, paths, fileSystem);
        } else {
            paths.add(baseDirectory);
        }
    }
}
From source file:com.metamx.milano.pig.MilanoLoadFunc.java
License:Apache License
/**
 * This builds a Pig ResourceSchema from the input file(s). This relies on the existence of TypeMetadata.
 * This is the method by which we pass the schema types and names directly to pig without having to specify them directly.
 *
 * @param location As passed to relativeToAbsolutePath
 * @param job      The job.
 *
 * @return Returns a ResourceSchema representing the incoming file(s) or null if TypeMetadata does not exist.
 *
 * @throws IOException Not thrown directly, but thrown from getMessageSchema where it indicates an unsupported type.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    Properties props = ConfigurationUtil.toProperties(conf);

    // HACK: Here we open the file directly to read the TypeMetadata.
    // HACK: There may be a better more direct way to do this, but it works for now.
    Path path = new Path(location);
    FileSystem fileSystem = path.getFileSystem(conf);
    FileStatus fileStatus = fileSystem.getFileStatus(path);
    if (fileStatus.isDir()) {
        log.debug(String.format("Path is a directory."));
        path = getFilePath(path, fileSystem);
        if (path == null) {
            return null;
        }
    } else if (!fileSystem.exists(path)) {
        return null;
    }

    MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
    typeMetadata = reader.getMetadata();
    reader.close();

    if (typeMetadata == null) {
        return null;
    }
    descriptor = MilanoTool.with(typeMetadata).getDescriptor();

    return new ResourceSchema(getMessageSchema(descriptor));
}
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile)
        throws NoSplitFileException, IOException {
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
    FileStatus splitFileStatus;
    try {
        splitFileStatus = fs.getFileStatus(splitFile);
        LOG.info("Found split file at : " + splitFileStatus);
    } catch (Exception e) {
        throw new NoSplitFileException();
    }
    FSDataInputStream fsDataStream = fs.open(splitFile); // throws IOException
    while (fsDataStream.getPos() < splitFileStatus.getLen()) {
        callback.reset();
        bsonDec.decode(fsDataStream, callback);
        BSONObject splitInfo = (BSONObject) callback.get();
        splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
    }
    splitsList = splits;
}
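The loop above bounds the read by the length reported by getFileStatus. A standalone sketch of that pattern, with a placeholder path and the per-record decoding simplified to consuming single bytes:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadToReportedLength {
    public static void main(String[] args) throws IOException {
        // Placeholder path; substitute a real file.
        Path file = new Path("/tmp/records.bson");
        FileSystem fs = file.getFileSystem(new Configuration());
        FileStatus status = fs.getFileStatus(file);

        long bytesRead = 0;
        try (FSDataInputStream in = fs.open(file)) {
            // Keep consuming until the stream position reaches the length reported
            // by getFileStatus, mirroring the loop in BSONSplitter above.
            while (in.getPos() < status.getLen()) {
                in.read(); // stand-in for decoding one record
                bytesRead++;
            }
        }
        System.out.println("bytes consumed: " + bytesRead);
    }
}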
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public void readSplits() throws IOException {
    splitsList = new ArrayList<FileSplit>();
    if (inputPath == null) {
        throw new IllegalStateException("Input path has not been set.");
    }
    FileSystem fs = inputPath.getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(inputPath);
    readSplitsForFile(file);
}
From source file:com.moz.fiji.mapreduce.input.impl.WholeFileRecordReader.java
License:Apache License
/**
 * Implementation detail: This constructor is built to be called via
 * reflection from within CombineFileRecordReader.
 *
 * @param fileSplit The CombineFileSplit that this will read from.
 * @param context The context for this task.
 * @param pathToProcess The path index from the CombineFileSplit to process in this record.
 */
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context, Integer pathToProcess) {
    mProcessed = false;
    mFileToRead = fileSplit.getPath(pathToProcess);
    mFileLength = fileSplit.getLength(pathToProcess);
    mConf = context.getConfiguration();

    assert 0 == fileSplit.getOffset(pathToProcess);
    if (LOG.isDebugEnabled()) {
        LOG.debug("FileToRead is: " + mFileToRead.toString());
        LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());

        try {
            final FileSystem fs = mFileToRead.getFileSystem(mConf);
            assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
        } catch (IOException ioe) {
            // oh well, I was just testing.
        }
    }

    mFileName = new Text();
    mFileText = new Text();
}
From source file:com.moz.fiji.mapreduce.tools.FijiBulkLoad.java
License:Apache License
/**
 * Recursively grant additional read and write permissions to all. There is no
 * built-in way in the Hadoop Java API to recursively set permissions on a directory,
 * so we implement it here.
 *
 * @param path The Path to the directory to chmod.
 * @throws IOException on IOException.
 */
private void recursiveGrantAllReadWritePermissions(Path path) throws IOException {
    FileSystem hdfs = path.getFileSystem(getConf());
    recursiveGrantAllReadWritePermissions(hdfs, hdfs.getFileStatus(path));
}
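The two-argument overload called above is not included in this listing. As a rough sketch only, a recursive helper like it could be written with standard FileSystem calls; the class name, method body, and the choice of granting full rwx permissions are assumptions, not FijiBulkLoad's actual implementation:

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

public final class RecursiveChmodSketch {
    private RecursiveChmodSketch() { }

    /** Sketch only: sets rwx for user/group/other on the given status and, for directories, all children. */
    static void recursiveGrantAllReadWritePermissions(FileSystem fs, FileStatus status) throws IOException {
        fs.setPermission(status.getPath(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
        if (status.isDirectory()) {
            for (FileStatus child : fs.listStatus(status.getPath())) {
                recursiveGrantAllReadWritePermissions(fs, child);
            }
        }
    }
}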
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override
public void restoreState(State<T> state) {
    this.state = state;

    FileSystem fs;
    try {
        fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration());
    } catch (IOException e) {
        LOG.error("Error while creating FileSystem in checkpoint restore.", e);
        throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
    }

    for (BucketState<T> bucketState : state.bucketStates.values()) {
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();

        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // be still in-progress or became a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length
            // file that specifies up to which length it is valid) and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug("In-progress file {} has been moved to pending after checkpoint, moving to final location.", partPath);
                    // has been moved to pending in the mean time, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.", bucketState.currentFile, partPath);
                } else {
                    LOG.debug("In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                            + "it was moved to final location by a previous snapshot restore", bucketState.currentFile);
                }

                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // someone else might still hold the lease from a previous try, we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath, bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }
                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // Now that we've restored the bucket to a valid state, reset the current file info
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring BucketingSink state.", e);
                throw new RuntimeException("Error while restoring BucketingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Could not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure
        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files are buckets that have been completed but are waiting to be renamed
            // to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);

                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug("(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}",
                            pendingPath, finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e);
                }
            }
        }
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }
    }

    // we need to get this here since open() has not yet been called
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();

    // delete pending files
    try {
        RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);
        while (bucketFiles.hasNext()) {
            LocatedFileStatus file = bucketFiles.next();
            if (file.getPath().toString().endsWith(pendingSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
            if (file.getPath().toString().endsWith(inProgressSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting old pending files: {}", e);
        throw new RuntimeException("Error while deleting old pending files.", e);
    }
}
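The getFileStatus calls above poll the reported file length until an asynchronous truncate settles. A standalone sketch of that polling pattern; the path, expected length, and timeout are placeholder values:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WaitForLengthSketch {
    /** Polls getFileStatus().getLen() until it matches expectedLength or timeoutMillis elapses. */
    static boolean waitForLength(FileSystem fs, Path path, long expectedLength, long timeoutMillis)
            throws IOException, InterruptedException {
        long deadline = System.currentTimeMillis() + timeoutMillis;
        while (fs.getFileStatus(path).getLen() != expectedLength) {
            if (System.currentTimeMillis() > deadline) {
                return false;
            }
            Thread.sleep(500);
        }
        return true;
    }

    public static void main(String[] args) throws Exception {
        // Placeholder values; substitute a real part file and the length expected after truncation.
        Path part = new Path("/tmp/part-0-0");
        FileSystem fs = part.getFileSystem(new Configuration());
        System.out.println("reached expected length: " + waitForLength(fs, part, 1024L, 60_000L));
    }
}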
From source file:com.mvdb.etl.actions.ActionUtils.java
License:Apache License
public static void copyLocalDirectoryToHdfsDirectory(String localDirectory, String hdfsDirectory)
        throws Throwable {
    String hdfsHome = getConfigurationValue(ConfigurationKeys.GLOBAL_CUSTOMER,
            ConfigurationKeys.GLOBAL_HADOOP_HOME);
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    conf.addResource(new Path(hdfsHome + "/conf/core-site.xml"));
    FileSystem hdfsFileSystem = FileSystem.get(conf);

    FileSystem localFileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration());

    Path localDirectoryPath = new Path(localDirectory);
    Path hdfsDirectoryPath = new Path(hdfsDirectory);

    if (hdfsFileSystem.exists(hdfsDirectoryPath)) {
        boolean deleteSuccess = hdfsFileSystem.delete(hdfsDirectoryPath, true);
        if (deleteSuccess == false) {
            throw new RuntimeException("Unable to delete " + hdfsDirectoryPath.toString());
        }
    }
    if (!localFileSystem.exists(localDirectoryPath)) {
        throw new RuntimeException("Input directory " + localDirectoryPath + " not found");
    }
    FileStatus fileStatus1 = localFileSystem.getFileStatus(localDirectoryPath);
    if (!fileStatus1.isDir()) {
        throw new RuntimeException("Input " + localDirectoryPath + " should be a directory");
    }
    if (hdfsFileSystem.exists(hdfsDirectoryPath)) {
        throw new RuntimeException("Output " + hdfsDirectoryPath + " already exists");
    }

    logger.info("Attempting Copy " + localDirectoryPath.toString() + " to " + hdfsDirectoryPath.toString());
    FileUtil.copy(localFileSystem, localDirectoryPath, hdfsFileSystem, hdfsDirectoryPath, false, conf);
    logger.info("-Completed Copy " + localDirectoryPath.toString() + " to " + hdfsDirectoryPath.toString());
}
From source file:com.mvdb.platform.action.VersionMerge.java
License:Apache License
private static void buildInputPathList(FileSystem fileSystem, Path topPath, List<Path> pathList,
        String lastMergedDirName, String lastcopiedDirName) throws IOException {
    FileStatus topPathStatus = fileSystem.getFileStatus(topPath);
    if (topPathStatus.isDir() == false) {
        String topPathFullName = topPath.toString();
        String[] tokens = topPathFullName.split("/");
        String fileName = tokens[tokens.length - 1];
        if (fileName.startsWith("data-") && fileName.endsWith(".dat")) {
            String timeStamp = tokens[tokens.length - 2];
            if (timeStamp.compareTo(lastMergedDirName) > 0 && timeStamp.compareTo(lastcopiedDirName) <= 0) {
                pathList.add(topPath);
            }
        }
        return; // This is a leaf
    }

    FileStatus[] fsArray = fileSystem.listStatus(topPath);
    for (FileStatus fileStatus : fsArray) {
        Path path = fileStatus.getPath();
        buildInputPathList(fileSystem, path, pathList, lastMergedDirName, lastcopiedDirName);
    }
}