Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs FileSystem getFileStatus from open source projects.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
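
A minimal, self-contained sketch of calling getFileStatus (the class name GetFileStatusDemo and the printed fields are illustrative, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. an hdfs:// or file:// URI
        // resolve the FileSystem implementation for this path's scheme
        FileSystem fs = path.getFileSystem(conf);
        // throws FileNotFoundException (an IOException) if the path does not exist
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length=" + status.getLen() + ", isDirectory=" + status.isDirectory()
                + ", modificationTime=" + status.getModificationTime());
    }
}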

Usage

From source file:com.google.mr4c.content.S3ContentFactory.java

License:Open Source License

public long getContentLength(URI uri) throws IOException {
    Path path = toPath(uri);
    FileSystem fs = getFileSystem(uri);
    FileStatus file = fs.getFileStatus(path);
    return file.getLen();
}

From source file:com.google.mr4c.sources.URIDataFileSource.java

License:Open Source License

@Override
public BlockLocation[] getBlockLocation() throws IOException {
    URI uri = ContentFactories.scrubURI(m_uri);
    FileSystem fs = FileSystem.get(uri, s_config);
    Path path = new Path(uri);
    FileStatus status = fs.getFileStatus(path);
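    // note: a length of getBlockSize() only covers the first block; status.getLen() would cover the whole file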
    return fs.getFileBlockLocations(status, 0, status.getBlockSize());
}

From source file:com.gruter.hadoop.customShell.CustomShell.java

License:Apache License

private InputStream forMagic(Path p, FileSystem srcFs) throws IOException {
    FSDataInputStream i = srcFs.open(p);
    switch (i.readShort()) {
    case 0x1f8b: // RFC 1952
        i.seek(0);
        return new GZIPInputStream(i);
    case 0x5345: // 'S' 'E'
        if (i.readByte() == 'Q') {
            i.close();
            return new TextRecordInputStream(srcFs.getFileStatus(p));
        }
        break;
    }
    i.seek(0);
    // check by filename extension whether this is a Snappy-compressed file
    if (isSnappy(p.getName())) {
        return getSnappyCodec().createInputStream(i);
    }
    return i;
}

From source file:com.gruter.hadoop.customShell.CustomShell.java

License:Apache License

int runCmdHandler(CmdHandler handler, String[] args, int startIndex, boolean recursive) throws IOException {
    int errors = 0;

    for (int i = startIndex; i < args.length; i++) {
        Path srcPath = new Path(args[i]);
        FileSystem srcFs = srcPath.getFileSystem(getConf());
        Path[] paths = FileUtil.stat2Paths(srcFs.globStatus(srcPath), srcPath);
        // if nothing matches the given glob pattern, increment the error count
        if (paths.length == 0) {
            System.err.println(handler.getName() + ": could not get status for '" + args[i] + "'");
            errors++;
        }
        for (Path path : paths) {
            try {
                FileStatus file = srcFs.getFileStatus(path);
                if (file == null) {
                    System.err.println(handler.getName() + ": could not get status for '" + path + "'");
                    errors++;
                } else {
                    errors += runCmdHandler(handler, file, srcFs, recursive);
                }
            } catch (IOException e) {
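                // prefer the exception's own message; otherwise fall back to the cause's message
                // (note this dereferences getCause(), which NPEs if the exception has no cause)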
                String msg = (e.getMessage() != null ? e.getLocalizedMessage()
                        : (e.getCause().getMessage() != null ? e.getCause().getLocalizedMessage() : "null"));
                System.err.println(
                        handler.getName() + ": could not get status for '" + path + "': " + msg.split("\n")[0]);
                errors++;
            }
        }
    }

    return (errors > 0 || handler.getErrorCode() != 0) ? 1 : 0;
}

From source file:com.hadoop.compression.fourmc.FourMcInputStream.java

License:BSD License

/**
 * Reads the block index at the tail of the file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMcBlockIndex readIndex(FileSystem fs, Path file) throws IOException {

    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMcBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
    4mc Footer:
     Footer size:        4 bytes
     Footer version:     4 byte (1)
     Block index:        4-byte delta offset for each stored block (the delta from the previous block's file position to the next block's)
     Footer size:        4 bytes (repeated to be able to read from end of file)
     MAGIC SIGNATURE:    4 bytes: "4MC\0"
     Footer checksum:    4 bytes (always in XXHASH32)
            
    */

    /*
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough
     * in most cases. Improvement: we could estimate a best-case compression factor of 10% and,
     * based on file size and block size, forecast whether it is better to read ahead more.
     */

    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
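    // the last 12 bytes of the file hold: footer size, magic signature, footer checksum (4 bytes each)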
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMcCodec.FOURMC_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: read head was not enough! seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }

    if (getInt(buf, startFooterOffset + 4) != FourMcCodec.FOURMC_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }

    if (checksum != Lz4Decompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

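    // the fixed footer fields (size, version, repeated size, magic, checksum) take 20 bytes;
    // the remainder is the 4-byte delta per block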
    int totalBlocks = (footerSize - 20) / 4;
    FourMcBlockIndex index = new FourMcBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }

    return index;
}

From source file:com.hadoop.compression.fourmc.FourMzInputStream.java

License:BSD License

/**
 * Reads the block index at the tail of the file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMzBlockIndex readIndex(FileSystem fs, Path file) throws IOException {

    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMzBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough
     * in most cases. Improvement: we could estimate a best-case compression factor of 10% and,
     * based on file size and block size, forecast whether it is better to read ahead more.
     */

    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
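    // the last 12 bytes of the file hold: footer size, magic signature, footer checksum (4 bytes each)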
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMzCodec.FOURMZ_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: read head was not enough! seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }

    if (getInt(buf, startFooterOffset + 4) != FourMzCodec.FOURMZ_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }

    if (checksum != ZstdDecompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

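    // the fixed footer fields (size, version, repeated size, magic, checksum) take 20 bytes;
    // the remainder is the 4-byte delta per block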
    int totalBlocks = (footerSize - 20) / 4;
    FourMzBlockIndex index = new FourMzBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }

    return index;
}

From source file:com.hadoop.compression.lzo.LzoIndexer.java

License:Open Source License

/**
 * LZO-indexes the given path, recursing to index directories when encountered.
 * Files are only indexed if they end in .lzo and have no existing .lzo.index file.
 *
 * @param lzoPath The base path to index.
 * @param nestingLevel For pretty printing, the nesting level.
 * @throws IOException
 */
private void indexInternal(Path lzoPath, int nestingLevel) throws IOException {
    FileSystem fs = FileSystem.get(URI.create(lzoPath.toString()), conf_);
    FileStatus fileStatus = fs.getFileStatus(lzoPath);

    // Recursively walk
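    // (FileStatus.isDir() is deprecated in newer Hadoop releases; isDirectory() is the replacement)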
    if (fileStatus.isDir()) {
        LOG.info(getNesting(nestingLevel) + "LZO Indexing directory " + lzoPath + "...");
        FileStatus[] statuses = fs.listStatus(lzoPath);
        for (FileStatus childStatus : statuses) {
            indexInternal(childStatus.getPath(), nestingLevel + 1);
        }
    } else if (lzoPath.toString().endsWith(LZO_EXTENSION)) {
        Path lzoIndexPath = new Path(lzoPath.toString() + LzoIndex.LZO_INDEX_SUFFIX);
        if (fs.exists(lzoIndexPath)) {
            LOG.info(getNesting(nestingLevel) + "[SKIP] LZO index file already exists for " + lzoPath + "\n");
        } else {
            long startTime = System.currentTimeMillis();
            long fileSize = fileStatus.getLen();

            LOG.info(getNesting(nestingLevel) + "[INDEX] LZO Indexing file " + lzoPath + ", size "
                    + df_.format(fileSize / (1024.0 * 1024.0 * 1024.0)) + " GB...");
            if (indexSingleFile(fs, lzoPath)) {
                long indexSize = fs.getFileStatus(lzoIndexPath).getLen();
                double elapsed = (System.currentTimeMillis() - startTime) / 1000.0;
                LOG.info(getNesting(nestingLevel) + "Completed LZO Indexing in " + df_.format(elapsed)
                        + " seconds (" + df_.format(fileSize / (1024.0 * 1024.0 * elapsed))
                        + " MB/s).  Index size is " + df_.format(indexSize / 1024.0) + " KB.\n");
            }
        }
    }
}

From source file:com.hadoop.mapred.DeprecatedLzoTextInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    FileSplit[] splits = (FileSplit[]) super.getSplits(conf, numSplits);
    // Find new starts/ends of the filesplit that align with the LZO blocks.

    List<FileSplit> result = new ArrayList<FileSplit>();

    for (FileSplit fileSplit : splits) {
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        if (!LzoInputFormatCommon.isLzoFile(file.toString())) {
            // non-LZO file, keep the input split as is.
            result.add(fileSplit);
            continue;
        }

        // LZO file, try to split if the .index file was found
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }
        if (index.isEmpty()) {
            // Empty index, keep it as is.
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long lzoStart = index.alignSliceStartToIndex(start, end);
        long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) {
            result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
        }
    }

    return result.toArray(new FileSplit[result.size()]);
}

From source file:com.hadoop.mapreduce.FourMcInputFormat.java

License:BSD License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);

    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMcBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMcBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMcBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }

    }

    return result;
}

From source file:com.hadoop.mapreduce.FourMzInputFormat.java

License:BSD License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);

    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMzBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMzBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMzBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMzBlockIndex.NOT_FOUND && fourMcEnd != FourMzBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mz split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }

    }

    return result;
}