List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
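Before the project examples below, a minimal sketch of the basic call pattern. The path and the plain Configuration are placeholders, not taken from any of the listed projects:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        // "/tmp/example.dat" is a placeholder path.
        Path path = new Path("/tmp/example.dat");
        FileSystem fs = path.getFileSystem(new Configuration());
        // getFileStatus throws FileNotFoundException if the path does not exist.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length=" + status.getLen()
                + " isDirectory=" + status.isDirectory()
                + " modificationTime=" + status.getModificationTime());
    }
}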
From source file:com.metamx.milano.pig.MilanoLoadFunc.java
License:Apache License
@Override
public void setLocation(String location, Job job) throws IOException {
    Path basePath = new Path(location);
    FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());
    Set<Path> paths = new TreeSet<Path>();

    if (fileSystem.getFileStatus(basePath).isDir()) {
        getPaths(basePath, paths, fileSystem);
    } else {
        paths.add(basePath);
    }

    log.info("Setting input to " + paths);
    FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
}
From source file:com.metamx.milano.pig.MilanoLoadFunc.java
License:Apache License
private void getPaths(Path baseDirectory, Set<Path> paths, FileSystem fileSystem) throws IOException {
    FileStatus[] files = fileSystem.listStatus(baseDirectory);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        if (fileStatus.isDir()) {
            getPaths(path, paths, fileSystem);
        } else {
            paths.add(baseDirectory);
        }
    }
}
From source file:com.metamx.milano.pig.MilanoLoadFunc.java
License:Apache License
/**
 * This builds a Pig ResourceSchema from the input file(s). This relies on the existence of TypeMetadata.
 * This is the method by which we pass the schema types and names directly to pig without having to specify them directly.
 *
 * @param location As passed to relativeToAbsolutePath
 * @param job      The job.
 *
 * @return Returns a ResourceSchema representing the incoming file(s) or null if TypeMetadata does not exist.
 *
 * @throws IOException Not thrown directly, but thrown from getMessageSchema where it indicates an unsupported type.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    Properties props = ConfigurationUtil.toProperties(conf);

    // HACK: Here we open the file directly to read the TypeMetadata.
    // HACK: There may be a better more direct way to do this, but it works for now.
    Path path = new Path(location);
    FileSystem fileSystem = path.getFileSystem(conf);
    FileStatus fileStatus = fileSystem.getFileStatus(path);
    if (fileStatus.isDir()) {
        log.debug(String.format("Path is a directory."));
        path = getFilePath(path, fileSystem);
        if (path == null) {
            return null;
        }
    } else if (!fileSystem.exists(path)) {
        return null;
    }

    MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
    typeMetadata = reader.getMetadata();
    reader.close();

    if (typeMetadata == null) {
        return null;
    }
    descriptor = MilanoTool.with(typeMetadata).getDescriptor();

    return new ResourceSchema(getMessageSchema(descriptor));
}
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile)
        throws NoSplitFileException, IOException {
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
    FileStatus splitFileStatus;
    try {
        splitFileStatus = fs.getFileStatus(splitFile);
        LOG.info("Found split file at : " + splitFileStatus);
    } catch (Exception e) {
        throw new NoSplitFileException();
    }
    FSDataInputStream fsDataStream = fs.open(splitFile); // throws IOException
    while (fsDataStream.getPos() < splitFileStatus.getLen()) {
        callback.reset();
        bsonDec.decode(fsDataStream, callback);
        BSONObject splitInfo = (BSONObject) callback.get();
        splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
    }
    splitsList = splits;
}
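The loop above bounds the read by the length reported by getFileStatus. A standalone sketch of that pattern, with a placeholder path and the per-record decoding simplified to consuming single bytes:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadToReportedLength {
    public static void main(String[] args) throws IOException {
        // Placeholder path; substitute a real file.
        Path file = new Path("/tmp/records.bson");
        FileSystem fs = file.getFileSystem(new Configuration());
        FileStatus status = fs.getFileStatus(file);

        long bytesRead = 0;
        try (FSDataInputStream in = fs.open(file)) {
            // Keep consuming until the stream position reaches the length reported
            // by getFileStatus, mirroring the loop in BSONSplitter above.
            while (in.getPos() < status.getLen()) {
                in.read(); // stand-in for decoding one record
                bytesRead++;
            }
        }
        System.out.println("bytes consumed: " + bytesRead);
    }
}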
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public void readSplits() throws IOException {
    splitsList = new ArrayList<FileSplit>();
    if (inputPath == null) {
        throw new IllegalStateException("Input path has not been set.");
    }
    FileSystem fs = inputPath.getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(inputPath);
    readSplitsForFile(file);
}
From source file:com.moz.fiji.mapreduce.input.impl.WholeFileRecordReader.java
License:Apache License
/**
 * Implementation detail: This constructor is built to be called via
 * reflection from within CombineFileRecordReader.
 *
 * @param fileSplit The CombineFileSplit that this will read from.
 * @param context The context for this task.
 * @param pathToProcess The path index from the CombineFileSplit to process in this record.
 */
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context, Integer pathToProcess) {
    mProcessed = false;
    mFileToRead = fileSplit.getPath(pathToProcess);
    mFileLength = fileSplit.getLength(pathToProcess);
    mConf = context.getConfiguration();

    assert 0 == fileSplit.getOffset(pathToProcess);
    if (LOG.isDebugEnabled()) {
        LOG.debug("FileToRead is: " + mFileToRead.toString());
        LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());

        try {
            final FileSystem fs = mFileToRead.getFileSystem(mConf);
            assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
        } catch (IOException ioe) {
            // oh well, I was just testing.
        }
    }

    mFileName = new Text();
    mFileText = new Text();
}
From source file:com.moz.fiji.mapreduce.tools.FijiBulkLoad.java
License:Apache License
/**
 * Recursively grant additional read and write permissions to all. There is no
 * built-in way in the Hadoop Java API to recursively set permissions on a directory,
 * so we implement it here.
 *
 * @param path The Path to the directory to chmod.
 * @throws IOException on IOException.
 */
private void recursiveGrantAllReadWritePermissions(Path path) throws IOException {
    FileSystem hdfs = path.getFileSystem(getConf());
    recursiveGrantAllReadWritePermissions(hdfs, hdfs.getFileStatus(path));
}
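The two-argument overload called above is not included in this listing. As a rough sketch only, a recursive helper like it could be written with standard FileSystem calls; the class name, method body, and the choice of granting full rwx permissions are assumptions, not FijiBulkLoad's actual implementation:

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

public final class RecursiveChmodSketch {
    private RecursiveChmodSketch() { }

    /** Sketch only: sets rwx for user/group/other on the given status and, for directories, all children. */
    static void recursiveGrantAllReadWritePermissions(FileSystem fs, FileStatus status) throws IOException {
        fs.setPermission(status.getPath(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
        if (status.isDirectory()) {
            for (FileStatus child : fs.listStatus(status.getPath())) {
                recursiveGrantAllReadWritePermissions(fs, child);
            }
        }
    }
}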
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override
public void restoreState(State<T> state) {
    this.state = state;

    FileSystem fs;
    try {
        fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration());
    } catch (IOException e) {
        LOG.error("Error while creating FileSystem in checkpoint restore.", e);
        throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
    }

    for (BucketState<T> bucketState : state.bucketStates.values()) {
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();

        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // be still in-progress or became a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length
            // file that specifies up to which length it is valid) and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug("In-progress file {} has been moved to pending after checkpoint, moving to final location.", partPath);
                    // has been moved to pending in the mean time, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.", bucketState.currentFile, partPath);
                } else {
                    LOG.debug("In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                            + "it was moved to final location by a previous snapshot restore", bucketState.currentFile);
                }

                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // someone else might still hold the lease from a previous try, we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath, bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }
                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // Now that we've restored the bucket to a valid state, reset the current file info
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring BucketingSink state.", e);
                throw new RuntimeException("Error while restoring BucketingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Could not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure
        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files are buckets that have been completed but are waiting to be renamed
            // to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);

                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug("(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}",
                            pendingPath, finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e);
                }
            }
        }
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }
    }

    // we need to get this here since open() has not yet been called
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();

    // delete pending files
    try {
        RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);
        while (bucketFiles.hasNext()) {
            LocatedFileStatus file = bucketFiles.next();
            if (file.getPath().toString().endsWith(pendingSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
            if (file.getPath().toString().endsWith(inProgressSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting old pending files: {}", e);
        throw new RuntimeException("Error while deleting old pending files.", e);
    }
}
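The getFileStatus calls above poll the reported file length until an asynchronous truncate settles. A standalone sketch of that polling pattern; the path, expected length, and timeout are placeholder values:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WaitForLengthSketch {
    /** Polls getFileStatus().getLen() until it matches expectedLength or timeoutMillis elapses. */
    static boolean waitForLength(FileSystem fs, Path path, long expectedLength, long timeoutMillis)
            throws IOException, InterruptedException {
        long deadline = System.currentTimeMillis() + timeoutMillis;
        while (fs.getFileStatus(path).getLen() != expectedLength) {
            if (System.currentTimeMillis() > deadline) {
                return false;
            }
            Thread.sleep(500);
        }
        return true;
    }

    public static void main(String[] args) throws Exception {
        // Placeholder values; substitute a real part file and the length expected after truncation.
        Path part = new Path("/tmp/part-0-0");
        FileSystem fs = part.getFileSystem(new Configuration());
        System.out.println("reached expected length: " + waitForLength(fs, part, 1024L, 60_000L));
    }
}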
From source file:com.mvdb.etl.actions.ActionUtils.java
License:Apache License
public static void copyLocalDirectoryToHdfsDirectory(String localDirectory, String hdfsDirectory)
        throws Throwable {
    String hdfsHome = getConfigurationValue(ConfigurationKeys.GLOBAL_CUSTOMER,
            ConfigurationKeys.GLOBAL_HADOOP_HOME);
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    conf.addResource(new Path(hdfsHome + "/conf/core-site.xml"));
    FileSystem hdfsFileSystem = FileSystem.get(conf);

    FileSystem localFileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration());

    Path localDirectoryPath = new Path(localDirectory);
    Path hdfsDirectoryPath = new Path(hdfsDirectory);

    if (hdfsFileSystem.exists(hdfsDirectoryPath)) {
        boolean deleteSuccess = hdfsFileSystem.delete(hdfsDirectoryPath, true);
        if (deleteSuccess == false) {
            throw new RuntimeException("Unable to delete " + hdfsDirectoryPath.toString());
        }
    }
    if (!localFileSystem.exists(localDirectoryPath)) {
        throw new RuntimeException("Input directory " + localDirectoryPath + " not found");
    }
    FileStatus fileStatus1 = localFileSystem.getFileStatus(localDirectoryPath);
    if (!fileStatus1.isDir()) {
        throw new RuntimeException("Input " + localDirectoryPath + " should be a directory");
    }
    if (hdfsFileSystem.exists(hdfsDirectoryPath)) {
        throw new RuntimeException("Output " + hdfsDirectoryPath + " already exists");
    }

    logger.info("Attempting Copy " + localDirectoryPath.toString() + " to " + hdfsDirectoryPath.toString());
    FileUtil.copy(localFileSystem, localDirectoryPath, hdfsFileSystem, hdfsDirectoryPath, false, conf);
    logger.info("-Completed Copy " + localDirectoryPath.toString() + " to " + hdfsDirectoryPath.toString());
}
From source file:com.mvdb.platform.action.VersionMerge.java
License:Apache License
private static void buildInputPathList(FileSystem fileSystem, Path topPath, List<Path> pathList,
        String lastMergedDirName, String lastcopiedDirName) throws IOException {
    FileStatus topPathStatus = fileSystem.getFileStatus(topPath);
    if (topPathStatus.isDir() == false) {
        String topPathFullName = topPath.toString();
        String[] tokens = topPathFullName.split("/");
        String fileName = tokens[tokens.length - 1];
        if (fileName.startsWith("data-") && fileName.endsWith(".dat")) {
            String timeStamp = tokens[tokens.length - 2];
            if (timeStamp.compareTo(lastMergedDirName) > 0 && timeStamp.compareTo(lastcopiedDirName) <= 0) {
                pathList.add(topPath);
            }
        }
        return; // This is a leaf
    }

    FileStatus[] fsArray = fileSystem.listStatus(topPath);
    for (FileStatus fileStatus : fsArray) {
        Path path = fileStatus.getPath();
        buildInputPathList(fileSystem, path, pathList, lastMergedDirName, lastcopiedDirName);
    }
}