Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
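
A minimal sketch of a typical call, assuming a default Configuration and a placeholder path (getFileStatus throws FileNotFoundException when the path does not exist):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        // Placeholder path, for illustration only.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(new Configuration());

        // Throws FileNotFoundException if the path does not exist.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("Length:    " + status.getLen());
        System.out.println("Directory: " + status.isDirectory());
        System.out.println("Modified:  " + status.getModificationTime());
    }
}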

Usage

From source file: com.metamx.milano.pig.MilanoLoadFunc.java

License: Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    Path basePath = new Path(location);
    FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());

    Set<Path> paths = new TreeSet<Path>();

    if (fileSystem.getFileStatus(basePath).isDir()) {
        getPaths(basePath, paths, fileSystem);
    } else {
        paths.add(basePath);
    }

    log.info("Setting input to " + paths);
    FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
}
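
Note: FileStatus.isDir(), used in this and several of the following examples, is deprecated in Hadoop 2.x and later in favor of FileStatus.isDirectory(); both report whether the status describes a directory.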

From source file: com.metamx.milano.pig.MilanoLoadFunc.java

License: Apache License

private void getPaths(Path baseDirectory, Set<Path> paths, FileSystem fileSystem) throws IOException {
    FileStatus[] files = fileSystem.listStatus(baseDirectory);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        if (fileStatus.isDir()) {
            getPaths(path, paths, fileSystem);
        } else {
            // Add the directory that directly contains this file; the caller's TreeSet de-duplicates.
            paths.add(baseDirectory);
        }
    }
}

From source file: com.metamx.milano.pig.MilanoLoadFunc.java

License: Apache License

/**
 * This builds a Pig ResourceSchema from the input file(s). This relies on the existence of TypeMetadata.
 * This is the method by which we pass the schema types and names directly to pig without having to specify them directly.
 *
 * @param location As passed to relativeToAbsolutePath
 * @param job      The job.
 *
 * @return Returns a ResourceSchema representing the incoming file(s) or null if TypeMetadata does not exist.
 *
 * @throws IOException Not thrown directly, but thrown from getMessageSchema where it indicates an unsupported type.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    Properties props = ConfigurationUtil.toProperties(conf);

    // HACK: Here we open the file directly to read the TypeMetadata.
    // HACK: There may be a better more direct way to do this, but it works for now.
    Path path = new Path(location);
    FileSystem fileSystem = path.getFileSystem(conf);

    FileStatus fileStatus = fileSystem.getFileStatus(path);
    if (fileStatus.isDir()) {
        log.debug("Path is a directory.");
        path = getFilePath(path, fileSystem);
        if (path == null) {
            return null;
        }
    } else if (!fileSystem.exists(path)) {
        return null;
    }

    MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
    typeMetadata = reader.getMetadata();
    reader.close();

    if (typeMetadata == null) {
        return null;
    }
    descriptor = MilanoTool.with(typeMetadata).getDescriptor();

    return new ResourceSchema(getMessageSchema(descriptor));
}

From source file: com.mongodb.hadoop.splitter.BSONSplitter.java

License: Apache License

public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile)
        throws NoSplitFileException, IOException {
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
    FileStatus splitFileStatus;
    try {
        splitFileStatus = fs.getFileStatus(splitFile);
        LOG.info("Found split file at: " + splitFileStatus);
    } catch (Exception e) {
        throw new NoSplitFileException();
    }
    FSDataInputStream fsDataStream = fs.open(splitFile); // throws IOException
    while (fsDataStream.getPos() < splitFileStatus.getLen()) {
        callback.reset();
        bsonDec.decode(fsDataStream, callback);
        BSONObject splitInfo = (BSONObject) callback.get();
        splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
    }
    splitsList = splits;
}

From source file: com.mongodb.hadoop.splitter.BSONSplitter.java

License: Apache License

public void readSplits() throws IOException {
    splitsList = new ArrayList<FileSplit>();
    if (inputPath == null) {
        throw new IllegalStateException("Input path has not been set.");
    }
    FileSystem fs = inputPath.getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(inputPath);
    readSplitsForFile(file);
}

From source file: com.moz.fiji.mapreduce.input.impl.WholeFileRecordReader.java

License: Apache License

/**
 * Implementation detail: This constructor is built to be called via
 * reflection from within CombineFileRecordReader.
 *
 * @param fileSplit The CombineFileSplit that this will read from.
 * @param context The context for this task.
 * @param pathToProcess The path index from the CombineFileSplit to process in this record.
 */
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context, Integer pathToProcess) {
    mProcessed = false;
    mFileToRead = fileSplit.getPath(pathToProcess);
    mFileLength = fileSplit.getLength(pathToProcess);
    mConf = context.getConfiguration();

    assert 0 == fileSplit.getOffset(pathToProcess);
    if (LOG.isDebugEnabled()) {
        LOG.debug("FileToRead is: " + mFileToRead.toString());
        LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());

        try {
            final FileSystem fs = mFileToRead.getFileSystem(mConf);
            assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
        } catch (IOException ioe) {
            // oh well, I was just testing.
        }
    }

    mFileName = new Text();
    mFileText = new Text();
}

From source file: com.moz.fiji.mapreduce.tools.FijiBulkLoad.java

License: Apache License

/**
 * Recursively grant additional read and write permissions to all. There is no
 * built-in way in the Hadoop Java API to recursively set permissions on a directory,
 * so we implement it here.
 *
 * @param path The Path to the directory to chmod.
 * @throws IOException on IOException.
 */
private void recursiveGrantAllReadWritePermissions(Path path) throws IOException {
    FileSystem hdfs = path.getFileSystem(getConf());
    recursiveGrantAllReadWritePermissions(hdfs, hdfs.getFileStatus(path));
}
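
The overload that performs the actual recursion is not part of this listing. A plausible sketch, assuming it grants full permissions to user, group, and other on each entry (execute is needed to traverse directories) and recurses into directories, with FsAction and FsPermission from org.apache.hadoop.fs.permission, might look like:

// Hypothetical companion overload, not taken from the original source.
private void recursiveGrantAllReadWritePermissions(FileSystem hdfs, FileStatus dirStatus) throws IOException {
    // Grant read/write/execute to user, group, and other on this entry.
    hdfs.setPermission(dirStatus.getPath(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    if (dirStatus.isDir()) {
        // Recurse into every child of the directory.
        for (FileStatus child : hdfs.listStatus(dirStatus.getPath())) {
            recursiveGrantAllReadWritePermissions(hdfs, child);
        }
    }
}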

From source file: com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java

License: Apache License

@Override
public void restoreState(State<T> state) {
    this.state = state;

    FileSystem fs;
    try {
        fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration());
    } catch (IOException e) {
        LOG.error("Error while creating FileSystem in checkpoint restore.", e);
        throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
    }

    for (BucketState<T> bucketState : state.bucketStates.values()) {
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();

        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // be still in-progress or became a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length
            // file that specifies up to which length it is valid) and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug(
                            "In-progress file {} has been moved to pending after checkpoint, moving to final location.",
                            partPath);
                    // has been moved to pending in the meantime, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.",
                            bucketState.currentFile, partPath);
                } else {
                    LOG.debug(
                            "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                                    + "it was moved to final location by a previous snapshot restore",
                            bucketState.currentFile);
                }

                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // someone else might still hold the lease from a previous try; we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath,
                            bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }

                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // Now that we've restored the bucket to a valid state, reset the current file info
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring BucketingSink state.", e);
                throw new RuntimeException("Error while restoring BucketingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Cound not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure

        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files are buckets that have been completed but are waiting to be renamed
            // to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);

                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug(
                                "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}",
                            pendingPath, finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath,
                            e);
                }
            }
        }

        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }
    }

    // we need to get this here since open() has not yet been called
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
    // delete pending files
    try {

        RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);

        while (bucketFiles.hasNext()) {
            LocatedFileStatus file = bucketFiles.next();
            if (file.getPath().toString().endsWith(pendingSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
            if (file.getPath().toString().endsWith(inProgressSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting old pending files: {}", e);
        throw new RuntimeException("Error while deleting old pending files.", e);
    }
}

From source file: com.mvdb.etl.actions.ActionUtils.java

License: Apache License

public static void copyLocalDirectoryToHdfsDirectory(String localDirectory, String hdfsDirectory)
        throws Throwable {
    String hdfsHome = getConfigurationValue(ConfigurationKeys.GLOBAL_CUSTOMER,
            ConfigurationKeys.GLOBAL_HADOOP_HOME);
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    conf.addResource(new Path(hdfsHome + "/conf/core-site.xml"));
    FileSystem hdfsFileSystem = FileSystem.get(conf);

    FileSystem localFileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration());

    Path localDirectoryPath = new Path(localDirectory);
    Path hdfsDirectoryPath = new Path(hdfsDirectory);

    if (hdfsFileSystem.exists(hdfsDirectoryPath)) {
        boolean deleteSuccess = hdfsFileSystem.delete(hdfsDirectoryPath, true);
        if (!deleteSuccess) {
            throw new RuntimeException("Unable to delete " + hdfsDirectoryPath.toString());
        }
    }
    if (!localFileSystem.exists(localDirectoryPath)) {
        throw new RuntimeException("Input directory " + localDirectoryPath + " not found");
    }
    FileStatus fileStatus1 = localFileSystem.getFileStatus(localDirectoryPath);
    if (!fileStatus1.isDir()) {
        throw new RuntimeException("Input " + localDirectoryPath + " should be a directory");
    }
    if (hdfsFileSystem.exists(hdfsDirectoryPath)) {
        throw new RuntimeException("Output " + hdfsDirectoryPath + "already exists");
    }

    logger.info("Attempting Copy " + localDirectoryPath.toString() + " to " + hdfsDirectoryPath.toString());
    FileUtil.copy(localFileSystem, localDirectoryPath, hdfsFileSystem, hdfsDirectoryPath, false, conf);
    logger.info("-Completed Copy " + localDirectoryPath.toString() + " to " + hdfsDirectoryPath.toString());

}

From source file: com.mvdb.platform.action.VersionMerge.java

License: Apache License

private static void buildInputPathList(FileSystem fileSystem, Path topPath, List<Path> pathList,
        String lastMergedDirName, String lastcopiedDirName) throws IOException {
    FileStatus topPathStatus = fileSystem.getFileStatus(topPath);
    if (!topPathStatus.isDir()) {
        String topPathFullName = topPath.toString();
        String[] tokens = topPathFullName.split("/");
        String fileName = tokens[tokens.length - 1];
        if (fileName.startsWith("data-") && fileName.endsWith(".dat")) {
            String timeStamp = tokens[tokens.length - 2];
            if (timeStamp.compareTo(lastMergedDirName) > 0 && timeStamp.compareTo(lastcopiedDirName) <= 0) {
                pathList.add(topPath);
            }
        }
        return; //This is a leaf
    }

    FileStatus[] fsArray = fileSystem.listStatus(topPath);
    for (FileStatus fileStatus : fsArray) {
        Path path = fileStatus.getPath();
        buildInputPathList(fileSystem, path, pathList, lastMergedDirName, lastcopiedDirName);
    }
}