List of usage examples for org.apache.hadoop.fs FileSystem getStatistics
@Deprecated public static synchronized Statistics getStatistics(final String scheme, Class<? extends FileSystem> cls)
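Before the project-specific examples below, here is a minimal sketch of the call pattern they all share: fetch the per-scheme Statistics object that Hadoop keeps for a FileSystem implementation class, then read its counters. This sketch is illustrative rather than taken from any of the listed projects; the local temp path is an assumption, and the null check simply mirrors the defensive style of the examples. Note that the method is deprecated; recent Hadoop releases favor FileSystem.getGlobalStorageStatistics() instead.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetStatisticsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Do a little I/O so the counters have something to show.
    Path path = new Path("/tmp/getstatistics-example.txt"); // illustrative path, not from the examples
    try (FSDataOutputStream out = fs.create(path, true)) {
      out.writeBytes("hello statistics\n");
    }

    // Look up the counters kept for this FileSystem class under its URI scheme.
    // Checked for null defensively, as the examples below do.
    FileSystem.Statistics stats = FileSystem.getStatistics(fs.getUri().getScheme(), fs.getClass());
    if (stats != null) {
      System.out.println("bytes read:     " + stats.getBytesRead());
      System.out.println("bytes written:  " + stats.getBytesWritten());
      System.out.println("read ops:       " + stats.getReadOps());
      System.out.println("large read ops: " + stats.getLargeReadOps());
      System.out.println("write ops:      " + stats.getWriteOps());
    }
  }
}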
From source file: com.google.cloud.hadoop.fs.gcs.HadoopFileSystemIntegrationHelper.java
License: Open Source License
/**
 * Helper that reads text from the given file at the given offset
 * and returns it. If checkOverflow is true, it will make sure that
 * no more than 'len' bytes were read.
 */
protected String readTextFile(Path hadoopPath, int offset, int len, boolean checkOverflow) throws IOException {
  String text = null;
  FSDataInputStream readStream = null;
  long fileSystemBytesRead = 0;
  FileSystem.Statistics stats =
      FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  if (stats != null) {
    // Let it be null in case no stats have been added for our scheme yet.
    fileSystemBytesRead = stats.getBytesRead();
  }
  try {
    int bufferSize = len;
    bufferSize += checkOverflow ? 1 : 0;
    byte[] readBuffer = new byte[bufferSize];
    readStream = ghfs.open(hadoopPath, GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT);
    int numBytesRead;
    if (offset > 0) {
      numBytesRead = readStream.read(offset, readBuffer, 0, bufferSize);
    } else {
      numBytesRead = readStream.read(readBuffer);
    }
    Assert.assertEquals(len, numBytesRead);
    text = new String(readBuffer, 0, numBytesRead, StandardCharsets.UTF_8);
  } finally {
    if (readStream != null) {
      readStream.close();
    }
  }

  // After the read, the stats better be non-null for our ghfs scheme.
  stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  Assert.assertNotNull(stats);
  long endFileSystemBytesRead = stats.getBytesRead();
  int bytesReadStats = (int) (endFileSystemBytesRead - fileSystemBytesRead);
  if (statistics == FileSystemStatistics.EXACT) {
    Assert.assertEquals(
        String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
        len, bytesReadStats);
  } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
    Assert.assertTrue(String.format("Expected %d <= %d", len, bytesReadStats), len <= bytesReadStats);
  } else if (statistics == FileSystemStatistics.NONE) {
    Assert.assertEquals("FS statistics expected to be 0", 0, fileSystemBytesRead);
    Assert.assertEquals("FS statistics expected to be 0", 0, endFileSystemBytesRead);
  } else if (statistics == FileSystemStatistics.IGNORE) {
    // NO-OP
  }
  return text;
}
From source file: com.google.cloud.hadoop.fs.gcs.HadoopFileSystemIntegrationHelper.java
License: Open Source License
/**
 * Writes a file with the given buffer repeated numWrites times.
 *
 * @param hadoopPath Path of the file to create.
 * @param buffer Data to write.
 * @param numWrites Number of times to repeat the data.
 * @param overwrite If true, overwrite any existing file.
 * @return Number of bytes written.
 */
public int writeFile(Path hadoopPath, ByteBuffer buffer, int numWrites, boolean overwrite) throws IOException {
  int numBytesWritten = -1;
  int totalBytesWritten = 0;
  long fileSystemBytesWritten = 0;
  FileSystem.Statistics stats =
      FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  if (stats != null) {
    // Let it be null in case no stats have been added for our scheme yet.
    fileSystemBytesWritten = stats.getBytesWritten();
  }

  FSDataOutputStream writeStream = null;
  boolean allWritesSucceeded = false;
  try {
    writeStream = ghfs.create(hadoopPath, FsPermission.getDefault(), overwrite,
        GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT,
        GoogleHadoopFileSystemBase.REPLICATION_FACTOR_DEFAULT,
        GoogleHadoopFileSystemBase.BLOCK_SIZE_DEFAULT,
        null); // progressable
    for (int i = 0; i < numWrites; i++) {
      buffer.clear();
      writeStream.write(buffer.array(), 0, buffer.capacity());
      numBytesWritten = buffer.capacity();
      totalBytesWritten += numBytesWritten;
    }
    allWritesSucceeded = true;
  } finally {
    if (writeStream != null) {
      try {
        writeStream.close();
      } catch (IOException e) {
        // Ignore IO exceptions while closing if a write failed; otherwise the
        // exception that caused the write to fail gets superseded.
        // On the other hand, if all writes succeeded then we need to know about
        // the exception that was thrown during closing.
        if (allWritesSucceeded) {
          throw e;
        }
      }
    }
  }

  // After the write, the stats better be non-null for our ghfs scheme.
  stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  Assert.assertNotNull(stats);
  long endFileSystemBytesWritten = stats.getBytesWritten();
  int bytesWrittenStats = (int) (endFileSystemBytesWritten - fileSystemBytesWritten);
  if (statistics == FileSystemStatistics.EXACT) {
    Assert.assertEquals(
        String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
        totalBytesWritten, bytesWrittenStats);
  } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
    Assert.assertTrue(String.format("Expected %d <= %d", totalBytesWritten, bytesWrittenStats),
        totalBytesWritten <= bytesWrittenStats);
  } else if (statistics == FileSystemStatistics.NONE) {
    // Do not perform any check because stats are either not maintained or are erratic.
  } else if (statistics == FileSystemStatistics.IGNORE) {
    // NO-OP
  }
  return totalBytesWritten;
}
From source file: com.twitter.hraven.etl.JobFilePartitioner.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  myConf = getConf();

  // Presume these are all HDFS paths, even when accessed as file://
  hdfs = FileSystem.get(myConf);

  // Grab input args and allow for -Dxyz style arguments
  String[] otherArgs = new GenericOptionsParser(myConf, args).getRemainingArgs();

  // Grab the arguments we're looking for.
  CommandLine commandLine = parseArgs(otherArgs);

  // Grab the input path argument
  input = commandLine.getOptionValue("i");
  LOG.info("input=" + input);

  // Grab the output path argument
  String output = commandLine.getOptionValue("o");
  LOG.info("output=" + output);

  skipExisting = commandLine.hasOption("s");
  LOG.info("skipExisting=" + skipExisting);

  moveFiles = commandLine.hasOption("m");
  LOG.info("moveFiles=" + moveFiles);

  if (skipExisting && moveFiles) {
    throw new IllegalArgumentException("Cannot use both options skipExisting and move simultaneously.");
  }

  if (commandLine.hasOption("x")) {
    try {
      maXretention = Integer.parseInt(commandLine.getOptionValue("x"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "maXretention option -x is not a valid number: " + commandLine.getOptionValue("x"), nfe);
    }
    // Additional check
    if (maXretention < 0) {
      throw new IllegalArgumentException(
          "Cannot retain less than 0 files. Specified maXretention option -x is: "
              + commandLine.getOptionValue("x"));
    }
    LOG.info("maXretention=" + maXretention);

    if (moveFiles) {
      throw new IllegalArgumentException("Cannot use both options maXretention and move simultaneously.");
    }
  } else {
    maXretention = Integer.MAX_VALUE;
  }

  outputPath = new Path(output);
  FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);
  if (!outputFileStatus.isDir()) {
    throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
  }

  Path inputPath = new Path(input);
  URI inputURI = inputPath.toUri();
  String inputScheme = inputURI.getScheme();
  LOG.info("input scheme is: " + inputScheme);

  // If the input directory is HDFS, then process as such. Assume a missing
  // scheme means HDFS.
  if ((inputScheme == null) || (hdfs.getUri().getScheme().equals(inputScheme))) {
    processHDFSSources(inputPath);
  } else if (inputScheme.equals("file")) {
    if (moveFiles) {
      throw new IllegalArgumentException(
          "Cannot move files that are not already in hdfs. Input is not HDFS: " + input);
    }
    processPlainFileSources(inputURI);
  } else {
    throw new IllegalArgumentException("Cannot process files from this URI scheme: " + inputScheme);
  }

  Statistics statistics = FileSystem.getStatistics(outputPath.toUri().getScheme(), hdfs.getClass());
  if (statistics != null) {
    LOG.info("HDFS bytes read: " + statistics.getBytesRead());
    LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
    LOG.info("HDFS read ops: " + statistics.getReadOps());
    LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
    LOG.info("HDFS write ops: " + statistics.getWriteOps());
  }
  return 0;
}
From source file: com.twitter.hraven.etl.JobFilePreprocessor.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  // When we started processing. This is also the upper limit of files we
  // accept; the next run will pick up the new incoming files.
  long processingStartMillis = System.currentTimeMillis();

  Configuration hbaseConf = HBaseConfiguration.create(getConf());

  // Grab input args and allow for -Dxyz style arguments
  String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

  // Grab the arguments we're looking for.
  CommandLine commandLine = parseArgs(otherArgs);

  // Output should be an hdfs path.
  FileSystem hdfs = FileSystem.get(hbaseConf);

  // Grab the output path argument
  String output = commandLine.getOptionValue("o");
  LOG.info(" output=" + output);
  Path outputPath = new Path(output);
  FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);
  if (!outputFileStatus.isDir()) {
    throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
  }

  // Grab the input path argument
  String input;
  if (commandLine.hasOption("i")) {
    input = commandLine.getOptionValue("i");
  } else {
    input = hbaseConf.get("mapred.job.tracker.history.completed.location");
  }
  LOG.info("input=" + input);

  // Grab the batch-size argument
  int batchSize;
  if (commandLine.hasOption("b")) {
    try {
      batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
    }
    // Additional check
    if (batchSize < 1) {
      throw new IllegalArgumentException(
          "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
              + commandLine.getOptionValue("b"));
    }
  } else {
    batchSize = DEFAULT_BATCH_SIZE;
  }

  boolean forceAllFiles = commandLine.hasOption("f");
  LOG.info("forceAllFiles: " + forceAllFiles);

  Path inputPath = new Path(input);
  FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);
  if (!inputFileStatus.isDir()) {
    throw new IOException("Input is not a directory: " + inputFileStatus.getPath().getName());
  }

  // Grab the cluster argument
  String cluster = commandLine.getOptionValue("c");
  LOG.info("cluster=" + cluster);

  /**
   * Grab the size of huge files to be moved argument. An hbase cell can't
   * store files bigger than maxFileSize, hence there is no need to consider
   * them for raw loading.
   * Reference: {@link https://github.com/twitter/hraven/issues/59}
   */
  String maxFileSizeStr = commandLine.getOptionValue("s");
  LOG.info("maxFileSize=" + maxFileSizeStr);
  long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
  try {
    maxFileSize = Long.parseLong(maxFileSizeStr);
  } catch (NumberFormatException nfe) {
    throw new ProcessingException(
        "Caught NumberFormatException during conversion of maxFileSize to long", nfe);
  }

  ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);
  boolean success = true;
  try {
    // Figure out where we last left off (if anywhere at all)
    ProcessRecord lastProcessRecord = null;
    if (!forceAllFiles) {
      lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
    }

    long minModificationTimeMillis = 0;
    if (lastProcessRecord != null) {
      // Start of this time period is the end of the last period.
      minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
    }

    // Do a sanity check. The end time of the last scan better not be later
    // than when we started processing.
    if (minModificationTimeMillis > processingStartMillis) {
      throw new RuntimeException(
          "The last processing record has maxModificationMillis later than now: " + lastProcessRecord);
    }

    // Accept only jobFiles, and only those that fall in the desired range of
    // modification time.
    JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter =
        new JobFileModifiedRangePathFilter(hbaseConf, minModificationTimeMillis);

    String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

    ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
    LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
        + " that are modified since " + timestamp);

    // Get the files in the done folder. Need to traverse dirs under done
    // recursively for versions that include MAPREDUCE-323:
    // on/after hadoop 0.20.203.0 and on/after cdh3u5.
    FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath,
        jobFileModifiedRangePathFilter);

    LOG.info("Sorting " + jobFileStatusses.length + " job files.");
    Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

    // Process these files in batches at a time.
    int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
    LOG.info("Batch count: " + batchCount);
    for (int b = 0; b < batchCount; b++) {
      processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
    }
  } finally {
    processRecordService.close();
  }

  Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
  if (statistics != null) {
    LOG.info("HDFS bytes read: " + statistics.getBytesRead());
    LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
    LOG.info("HDFS read ops: " + statistics.getReadOps());
    LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
    LOG.info("HDFS write ops: " + statistics.getWriteOps());
  }

  // Return the status
  return success ? 0 : 1;
}
From source file: org.apache.orc.bench.ColumnProjectionBenchmark.java
License: Apache License
@Benchmark
public void parquet(ExtraCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  if ("taxi".equals(dataset)) {
    conf.set("columns", "vendor_id,pickup_time");
    conf.set("columns.types", "int,timestamp");
  } else if ("sales".equals(dataset)) {
    conf.set("columns", "sales_id,customer_id");
    conf.set("columns.types", "bigint,bigint");
  } else if ("github".equals(dataset)) {
    conf.set("columns", "actor,created_at");
    conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string,"
        + "id:int,login:string,url:string>,timestamp");
  } else {
    throw new IllegalArgumentException("Unknown data set " + dataset);
  }
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.records += 1;
  }
  recordReader.close();
  counters.bytesRead += statistics.getBytesRead();
  counters.reads += statistics.getReadOps();
  counters.invocations += 1;
}
From source file: org.apache.orc.bench.FullReadBenchmark.java
License: Apache License
@Benchmark
public void avro(ExtraCounters counters) throws Exception {
  Configuration conf = new Configuration();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "avro", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  FsInput file = new FsInput(path, conf);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
  GenericRecord record = null;
  while (dataFileReader.hasNext()) {
    record = dataFileReader.next(record);
    counters.records += 1;
  }
  counters.bytesRead += statistics.getBytesRead();
  counters.reads += statistics.getReadOps();
  counters.invocations += 1;
}
From source file: org.apache.orc.bench.FullReadBenchmark.java
License: Apache License
@Benchmark
public void parquet(ExtraCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.records += 1;
  }
  recordReader.close();
  counters.bytesRead += statistics.getBytesRead();
  counters.reads += statistics.getReadOps();
  counters.invocations += 1;
}
From source file: org.apache.orc.bench.hive.ColumnProjectionBenchmark.java
License: Apache License
@Benchmark
public void parquet(ReadCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  if ("taxi".equals(dataset)) {
    conf.set("columns", "vendor_id,pickup_time");
    conf.set("columns.types", "int,timestamp");
  } else if ("sales".equals(dataset)) {
    conf.set("columns", "sales_id,customer_id");
    conf.set("columns.types", "bigint,bigint");
  } else if ("github".equals(dataset)) {
    conf.set("columns", "actor,created_at");
    conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string,"
        + "id:int,login:string,url:string>,timestamp");
  } else {
    throw new IllegalArgumentException("Unknown data set " + dataset);
  }
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.addRecords(1);
  }
  recordReader.close();
  counters.addBytes(statistics.getReadOps(), statistics.getBytesRead());
  counters.addInvocation();
}
From source file: org.apache.orc.bench.hive.FullReadBenchmark.java
License: Apache License
@Benchmark
public void avro(ReadCounters counters) throws Exception {
  Configuration conf = new Configuration();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "avro", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  FsInput file = new FsInput(path, conf);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
  GenericRecord record = null;
  while (dataFileReader.hasNext()) {
    record = dataFileReader.next(record);
    counters.addRecords(1);
  }
  counters.addBytes(statistics.getReadOps(), statistics.getBytesRead());
  counters.addInvocation();
}
From source file: org.apache.orc.bench.hive.FullReadBenchmark.java
License: Apache License
@Benchmark
public void parquet(ReadCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.addRecords(1);
  }
  recordReader.close();
  counters.addBytes(statistics.getReadOps(), statistics.getBytesRead());
  counters.addInvocation();
}