Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs FileSystem getFileStatus from open source projects.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
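
A minimal, self-contained sketch of calling getFileStatus (the class name GetFileStatusDemo and the printed fields are illustrative, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. an hdfs:// or file:// URI
        // resolve the FileSystem implementation for this path's scheme
        FileSystem fs = path.getFileSystem(conf);
        // throws FileNotFoundException (an IOException) if the path does not exist
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length=" + status.getLen() + ", isDirectory=" + status.isDirectory()
                + ", modificationTime=" + status.getModificationTime());
    }
}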

Usage

From source file:com.google.mr4c.content.S3ContentFactory.java

License:Open Source License

public long getContentLength(URI uri) throws IOException {
    Path path = toPath(uri);
    FileSystem fs = getFileSystem(uri);
    FileStatus file = fs.getFileStatus(path);
    return file.getLen();
}

From source file:com.google.mr4c.sources.URIDataFileSource.java

License:Open Source License

@Override
public BlockLocation[] getBlockLocation() throws IOException {
    URI uri = ContentFactories.scrubURI(m_uri);
    FileSystem fs = FileSystem.get(uri, s_config);
    Path path = new Path(uri);
    FileStatus status = fs.getFileStatus(path);
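    // note: a length of getBlockSize() only covers the first block; status.getLen() would cover the whole file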
    return fs.getFileBlockLocations(status, 0, status.getBlockSize());
}

From source file:com.gruter.hadoop.customShell.CustomShell.java

License:Apache License

private InputStream forMagic(Path p, FileSystem srcFs) throws IOException {
    FSDataInputStream i = srcFs.open(p);
    switch (i.readShort()) {
    case 0x1f8b: // RFC 1952
        i.seek(0);
        return new GZIPInputStream(i);
    case 0x5345: // 'S' 'E'
        if (i.readByte() == 'Q') {
            i.close();
            return new TextRecordInputStream(srcFs.getFileStatus(p));
        }
        break;
    }
    i.seek(0);
    // check by filename extension whether this is a Snappy-compressed file
    if (isSnappy(p.getName())) {
        return getSnappyCodec().createInputStream(i);
    }
    return i;
}

From source file:com.gruter.hadoop.customShell.CustomShell.java

License:Apache License

int runCmdHandler(CmdHandler handler, String[] args, int startIndex, boolean recursive) throws IOException {
    int errors = 0;

    for (int i = startIndex; i < args.length; i++) {
        Path srcPath = new Path(args[i]);
        FileSystem srcFs = srcPath.getFileSystem(getConf());
        Path[] paths = FileUtil.stat2Paths(srcFs.globStatus(srcPath), srcPath);
        // if nothing matches the given glob pattern, increment the error count
        if (paths.length == 0) {
            System.err.println(handler.getName() + ": could not get status for '" + args[i] + "'");
            errors++;
        }
        for (Path path : paths) {
            try {
                FileStatus file = srcFs.getFileStatus(path);
                if (file == null) {
                    System.err.println(handler.getName() + ": could not get status for '" + path + "'");
                    errors++;
                } else {
                    errors += runCmdHandler(handler, file, srcFs, recursive);
                }
            } catch (IOException e) {
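                // prefer the exception's own message; otherwise fall back to the cause's message
                // (note this dereferences getCause(), which NPEs if the exception has no cause)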
                String msg = (e.getMessage() != null ? e.getLocalizedMessage()
                        : (e.getCause().getMessage() != null ? e.getCause().getLocalizedMessage() : "null"));
                System.err.println(
                        handler.getName() + ": could not get status for '" + path + "': " + msg.split("\n")[0]);
                errors++;
            }
        }
    }

    return (errors > 0 || handler.getErrorCode() != 0) ? 1 : 0;
}

From source file:com.hadoop.compression.fourmc.FourMcInputStream.java

License:BSD License

/**
 * Reads the block index at the tail of the file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMcBlockIndex readIndex(FileSystem fs, Path file) throws IOException {

    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMcBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
    4mc Footer:
     Footer size:        4 bytes
     Footer version:     4 byte (1)
     Block index:        4-byte delta offset for each stored block (the delta from the previous block's file position to the next block's)
     Footer size:        4 bytes (repeated to be able to read from end of file)
     MAGIC SIGNATURE:    4 bytes: "4MC\0"
     Footer checksum:    4 bytes (always in XXHASH32)
            
    */

    /*
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough
     * in most cases. Improvement: we could estimate a best-case compression factor of 10% and,
     * based on file size and block size, forecast whether it is better to read ahead more.
     */

    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
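    // the last 12 bytes of the file hold: footer size, magic signature, footer checksum (4 bytes each)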
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMcCodec.FOURMC_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: read head was not enough! seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }

    if (getInt(buf, startFooterOffset + 4) != FourMcCodec.FOURMC_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }

    if (checksum != Lz4Decompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

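    // the fixed footer fields (size, version, repeated size, magic, checksum) take 20 bytes;
    // the remainder is the 4-byte delta per block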
    int totalBlocks = (footerSize - 20) / 4;
    FourMcBlockIndex index = new FourMcBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }

    return index;
}

From source file:com.hadoop.compression.fourmc.FourMzInputStream.java

License:BSD License

/**
 * Reads the block index at the tail of the file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMzBlockIndex readIndex(FileSystem fs, Path file) throws IOException {

    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMzBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough
     * in most cases. Improvement: we could estimate a best-case compression factor of 10% and,
     * based on file size and block size, forecast whether it is better to read ahead more.
     */

    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
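    // the last 12 bytes of the file hold: footer size, magic signature, footer checksum (4 bytes each)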
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMzCodec.FOURMZ_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: read head was not enough! seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }

    if (getInt(buf, startFooterOffset + 4) != FourMzCodec.FOURMZ_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }

    if (checksum != ZstdDecompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

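    // the fixed footer fields (size, version, repeated size, magic, checksum) take 20 bytes;
    // the remainder is the 4-byte delta per block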
    int totalBlocks = (footerSize - 20) / 4;
    FourMzBlockIndex index = new FourMzBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }

    return index;
}

From source file:com.hadoop.compression.lzo.LzoIndexer.java

License:Open Source License

/**
 * LZO-indexes the given path, recursing to index directories when encountered.
 * Files are only indexed if they end in .lzo and have no existing .lzo.index file.
 *
 * @param lzoPath The base path to index.
 * @param nestingLevel For pretty printing, the nesting level.
 * @throws IOException
 */
private void indexInternal(Path lzoPath, int nestingLevel) throws IOException {
    FileSystem fs = FileSystem.get(URI.create(lzoPath.toString()), conf_);
    FileStatus fileStatus = fs.getFileStatus(lzoPath);

    // Recursively walk
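    // (FileStatus.isDir() is deprecated in newer Hadoop releases; isDirectory() is the replacement)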
    if (fileStatus.isDir()) {
        LOG.info(getNesting(nestingLevel) + "LZO Indexing directory " + lzoPath + "...");
        FileStatus[] statuses = fs.listStatus(lzoPath);
        for (FileStatus childStatus : statuses) {
            indexInternal(childStatus.getPath(), nestingLevel + 1);
        }
    } else if (lzoPath.toString().endsWith(LZO_EXTENSION)) {
        Path lzoIndexPath = new Path(lzoPath.toString() + LzoIndex.LZO_INDEX_SUFFIX);
        if (fs.exists(lzoIndexPath)) {
            LOG.info(getNesting(nestingLevel) + "[SKIP] LZO index file already exists for " + lzoPath + "\n");
        } else {
            long startTime = System.currentTimeMillis();
            long fileSize = fileStatus.getLen();

            LOG.info(getNesting(nestingLevel) + "[INDEX] LZO Indexing file " + lzoPath + ", size "
                    + df_.format(fileSize / (1024.0 * 1024.0 * 1024.0)) + " GB...");
            if (indexSingleFile(fs, lzoPath)) {
                long indexSize = fs.getFileStatus(lzoIndexPath).getLen();
                double elapsed = (System.currentTimeMillis() - startTime) / 1000.0;
                LOG.info(getNesting(nestingLevel) + "Completed LZO Indexing in " + df_.format(elapsed)
                        + " seconds (" + df_.format(fileSize / (1024.0 * 1024.0 * elapsed))
                        + " MB/s).  Index size is " + df_.format(indexSize / 1024.0) + " KB.\n");
            }
        }
    }
}

From source file:com.hadoop.mapred.DeprecatedLzoTextInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    FileSplit[] splits = (FileSplit[]) super.getSplits(conf, numSplits);
    // Find new starts/ends of the filesplit that align with the LZO blocks.

    List<FileSplit> result = new ArrayList<FileSplit>();

    for (FileSplit fileSplit : splits) {
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        if (!LzoInputFormatCommon.isLzoFile(file.toString())) {
            // non-LZO file, keep the input split as is.
            result.add(fileSplit);
            continue;
        }

        // LZO file, try to split if the .index file was found
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }
        if (index.isEmpty()) {
            // Empty index, keep it as is.
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long lzoStart = index.alignSliceStartToIndex(start, end);
        long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) {
            result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
        }
    }

    return result.toArray(new FileSplit[result.size()]);
}

From source file:com.hadoop.mapreduce.FourMcInputFormat.java

License:BSD License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);

    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMcBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMcBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMcBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }

    }

    return result;
}

From source file:com.hadoop.mapreduce.FourMzInputFormat.java

License:BSD License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);

    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMzBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMzBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMzBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMzBlockIndex.NOT_FOUND && fourMcEnd != FourMzBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mz split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }

    }

    return result;
}