List of usage examples for org.apache.hadoop.fs FileSystem getStatistics
@Deprecated public static synchronized Statistics getStatistics(final String scheme, Class<? extends FileSystem> cls)
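Before the project-specific examples below, here is a minimal sketch of the call pattern they all share: fetch the per-scheme Statistics object that Hadoop keeps for a FileSystem implementation class, then read its counters. This sketch is illustrative rather than taken from any of the listed projects; the local temp path is an assumption, and the null check simply mirrors the defensive style of the examples. Note that the method is deprecated; recent Hadoop releases favor FileSystem.getGlobalStorageStatistics() instead.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetStatisticsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Do a little I/O so the counters have something to show.
    Path path = new Path("/tmp/getstatistics-example.txt"); // illustrative path, not from the examples
    try (FSDataOutputStream out = fs.create(path, true)) {
      out.writeBytes("hello statistics\n");
    }

    // Look up the counters kept for this FileSystem class under its URI scheme.
    // Checked for null defensively, as the examples below do.
    FileSystem.Statistics stats = FileSystem.getStatistics(fs.getUri().getScheme(), fs.getClass());
    if (stats != null) {
      System.out.println("bytes read:     " + stats.getBytesRead());
      System.out.println("bytes written:  " + stats.getBytesWritten());
      System.out.println("read ops:       " + stats.getReadOps());
      System.out.println("large read ops: " + stats.getLargeReadOps());
      System.out.println("write ops:      " + stats.getWriteOps());
    }
  }
}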
From source file: com.google.cloud.hadoop.fs.gcs.HadoopFileSystemIntegrationHelper.java
License: Open Source License
/**
 * Helper that reads text from the given file at the given offset
 * and returns it. If checkOverflow is true, it will make sure that
 * no more than 'len' bytes were read.
 */
protected String readTextFile(Path hadoopPath, int offset, int len, boolean checkOverflow) throws IOException {
  String text = null;
  FSDataInputStream readStream = null;
  long fileSystemBytesRead = 0;
  FileSystem.Statistics stats =
      FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  if (stats != null) {
    // Let it be null in case no stats have been added for our scheme yet.
    fileSystemBytesRead = stats.getBytesRead();
  }
  try {
    int bufferSize = len;
    bufferSize += checkOverflow ? 1 : 0;
    byte[] readBuffer = new byte[bufferSize];
    readStream = ghfs.open(hadoopPath, GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT);
    int numBytesRead;
    if (offset > 0) {
      numBytesRead = readStream.read(offset, readBuffer, 0, bufferSize);
    } else {
      numBytesRead = readStream.read(readBuffer);
    }
    Assert.assertEquals(len, numBytesRead);
    text = new String(readBuffer, 0, numBytesRead, StandardCharsets.UTF_8);
  } finally {
    if (readStream != null) {
      readStream.close();
    }
  }

  // After the read, the stats better be non-null for our ghfs scheme.
  stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  Assert.assertNotNull(stats);
  long endFileSystemBytesRead = stats.getBytesRead();
  int bytesReadStats = (int) (endFileSystemBytesRead - fileSystemBytesRead);
  if (statistics == FileSystemStatistics.EXACT) {
    Assert.assertEquals(
        String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
        len, bytesReadStats);
  } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
    Assert.assertTrue(String.format("Expected %d <= %d", len, bytesReadStats), len <= bytesReadStats);
  } else if (statistics == FileSystemStatistics.NONE) {
    Assert.assertEquals("FS statistics expected to be 0", 0, fileSystemBytesRead);
    Assert.assertEquals("FS statistics expected to be 0", 0, endFileSystemBytesRead);
  } else if (statistics == FileSystemStatistics.IGNORE) {
    // NO-OP
  }
  return text;
}
From source file: com.google.cloud.hadoop.fs.gcs.HadoopFileSystemIntegrationHelper.java
License: Open Source License
/**
 * Writes a file with the given buffer repeated numWrites times.
 *
 * @param hadoopPath Path of the file to create.
 * @param buffer Data to write.
 * @param numWrites Number of times to repeat the data.
 * @param overwrite If true, overwrite any existing file.
 * @return Number of bytes written.
 */
public int writeFile(Path hadoopPath, ByteBuffer buffer, int numWrites, boolean overwrite) throws IOException {
  int numBytesWritten = -1;
  int totalBytesWritten = 0;
  long fileSystemBytesWritten = 0;
  FileSystem.Statistics stats =
      FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  if (stats != null) {
    // Let it be null in case no stats have been added for our scheme yet.
    fileSystemBytesWritten = stats.getBytesWritten();
  }

  FSDataOutputStream writeStream = null;
  boolean allWritesSucceeded = false;
  try {
    writeStream = ghfs.create(hadoopPath, FsPermission.getDefault(), overwrite,
        GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT,
        GoogleHadoopFileSystemBase.REPLICATION_FACTOR_DEFAULT,
        GoogleHadoopFileSystemBase.BLOCK_SIZE_DEFAULT,
        null); // progressable
    for (int i = 0; i < numWrites; i++) {
      buffer.clear();
      writeStream.write(buffer.array(), 0, buffer.capacity());
      numBytesWritten = buffer.capacity();
      totalBytesWritten += numBytesWritten;
    }
    allWritesSucceeded = true;
  } finally {
    if (writeStream != null) {
      try {
        writeStream.close();
      } catch (IOException e) {
        // Ignore IO exceptions while closing if a write failed; otherwise the
        // exception that caused the write to fail gets superseded.
        // On the other hand, if all writes succeeded then we need to know about
        // the exception that was thrown during closing.
        if (allWritesSucceeded) {
          throw e;
        }
      }
    }
  }

  // After the write, the stats better be non-null for our ghfs scheme.
  stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
  Assert.assertNotNull(stats);
  long endFileSystemBytesWritten = stats.getBytesWritten();
  int bytesWrittenStats = (int) (endFileSystemBytesWritten - fileSystemBytesWritten);
  if (statistics == FileSystemStatistics.EXACT) {
    Assert.assertEquals(
        String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
        totalBytesWritten, bytesWrittenStats);
  } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
    Assert.assertTrue(String.format("Expected %d <= %d", totalBytesWritten, bytesWrittenStats),
        totalBytesWritten <= bytesWrittenStats);
  } else if (statistics == FileSystemStatistics.NONE) {
    // Do not perform any check because stats are either not maintained or are erratic.
  } else if (statistics == FileSystemStatistics.IGNORE) {
    // NO-OP
  }
  return totalBytesWritten;
}
From source file: com.twitter.hraven.etl.JobFilePartitioner.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  myConf = getConf();

  // Presume these are all HDFS paths, even when accessed as file://
  hdfs = FileSystem.get(myConf);

  // Grab input args and allow for -Dxyz style arguments
  String[] otherArgs = new GenericOptionsParser(myConf, args).getRemainingArgs();

  // Grab the arguments we're looking for.
  CommandLine commandLine = parseArgs(otherArgs);

  // Grab the input path argument
  input = commandLine.getOptionValue("i");
  LOG.info("input=" + input);

  // Grab the output path argument
  String output = commandLine.getOptionValue("o");
  LOG.info("output=" + output);

  skipExisting = commandLine.hasOption("s");
  LOG.info("skipExisting=" + skipExisting);

  moveFiles = commandLine.hasOption("m");
  LOG.info("moveFiles=" + moveFiles);

  if (skipExisting && moveFiles) {
    throw new IllegalArgumentException("Cannot use both options skipExisting and move simultaneously.");
  }

  if (commandLine.hasOption("x")) {
    try {
      maXretention = Integer.parseInt(commandLine.getOptionValue("x"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "maXretention option -x is not a valid number: " + commandLine.getOptionValue("x"), nfe);
    }
    // Additional check
    if (maXretention < 0) {
      throw new IllegalArgumentException(
          "Cannot retain less than 0 files. Specified maXretention option -x is: "
              + commandLine.getOptionValue("x"));
    }
    LOG.info("maXretention=" + maXretention);

    if (moveFiles) {
      throw new IllegalArgumentException("Cannot use both options maXretention and move simultaneously.");
    }
  } else {
    maXretention = Integer.MAX_VALUE;
  }

  outputPath = new Path(output);
  FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);
  if (!outputFileStatus.isDir()) {
    throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
  }

  Path inputPath = new Path(input);
  URI inputURI = inputPath.toUri();
  String inputScheme = inputURI.getScheme();
  LOG.info("input scheme is: " + inputScheme);

  // If the input directory is HDFS, then process as such. Assume a missing
  // scheme means HDFS.
  if ((inputScheme == null) || (hdfs.getUri().getScheme().equals(inputScheme))) {
    processHDFSSources(inputPath);
  } else if (inputScheme.equals("file")) {
    if (moveFiles) {
      throw new IllegalArgumentException(
          "Cannot move files that are not already in hdfs. Input is not HDFS: " + input);
    }
    processPlainFileSources(inputURI);
  } else {
    throw new IllegalArgumentException("Cannot process files from this URI scheme: " + inputScheme);
  }

  Statistics statistics = FileSystem.getStatistics(outputPath.toUri().getScheme(), hdfs.getClass());
  if (statistics != null) {
    LOG.info("HDFS bytes read: " + statistics.getBytesRead());
    LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
    LOG.info("HDFS read ops: " + statistics.getReadOps());
    LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
    LOG.info("HDFS write ops: " + statistics.getWriteOps());
  }
  return 0;
}
From source file: com.twitter.hraven.etl.JobFilePreprocessor.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
  // When we started processing. This is also the upper limit of files we
  // accept; the next run will pick up the new incoming files.
  long processingStartMillis = System.currentTimeMillis();

  Configuration hbaseConf = HBaseConfiguration.create(getConf());

  // Grab input args and allow for -Dxyz style arguments
  String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

  // Grab the arguments we're looking for.
  CommandLine commandLine = parseArgs(otherArgs);

  // Output should be an hdfs path.
  FileSystem hdfs = FileSystem.get(hbaseConf);

  // Grab the output path argument
  String output = commandLine.getOptionValue("o");
  LOG.info(" output=" + output);
  Path outputPath = new Path(output);
  FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);
  if (!outputFileStatus.isDir()) {
    throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
  }

  // Grab the input path argument
  String input;
  if (commandLine.hasOption("i")) {
    input = commandLine.getOptionValue("i");
  } else {
    input = hbaseConf.get("mapred.job.tracker.history.completed.location");
  }
  LOG.info("input=" + input);

  // Grab the batch-size argument
  int batchSize;
  if (commandLine.hasOption("b")) {
    try {
      batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException(
          "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
    }
    // Additional check
    if (batchSize < 1) {
      throw new IllegalArgumentException(
          "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
              + commandLine.getOptionValue("b"));
    }
  } else {
    batchSize = DEFAULT_BATCH_SIZE;
  }

  boolean forceAllFiles = commandLine.hasOption("f");
  LOG.info("forceAllFiles: " + forceAllFiles);

  Path inputPath = new Path(input);
  FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);
  if (!inputFileStatus.isDir()) {
    throw new IOException("Input is not a directory: " + inputFileStatus.getPath().getName());
  }

  // Grab the cluster argument
  String cluster = commandLine.getOptionValue("c");
  LOG.info("cluster=" + cluster);

  /**
   * Grab the size of huge files to be moved argument. An hbase cell can't
   * store files bigger than maxFileSize, hence there is no need to consider
   * them for raw loading.
   * Reference: {@link https://github.com/twitter/hraven/issues/59}
   */
  String maxFileSizeStr = commandLine.getOptionValue("s");
  LOG.info("maxFileSize=" + maxFileSizeStr);
  long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
  try {
    maxFileSize = Long.parseLong(maxFileSizeStr);
  } catch (NumberFormatException nfe) {
    throw new ProcessingException(
        "Caught NumberFormatException during conversion of maxFileSize to long", nfe);
  }

  ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);
  boolean success = true;
  try {
    // Figure out where we last left off (if anywhere at all)
    ProcessRecord lastProcessRecord = null;
    if (!forceAllFiles) {
      lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
    }

    long minModificationTimeMillis = 0;
    if (lastProcessRecord != null) {
      // Start of this time period is the end of the last period.
      minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
    }

    // Do a sanity check. The end time of the last scan better not be later
    // than when we started processing.
    if (minModificationTimeMillis > processingStartMillis) {
      throw new RuntimeException(
          "The last processing record has maxModificationMillis later than now: " + lastProcessRecord);
    }

    // Accept only jobFiles, and only those that fall in the desired range of
    // modification time.
    JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter =
        new JobFileModifiedRangePathFilter(hbaseConf, minModificationTimeMillis);

    String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

    ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
    LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
        + " that are modified since " + timestamp);

    // Get the files in the done folder. Need to traverse dirs under done
    // recursively for versions that include MAPREDUCE-323:
    // on/after hadoop 0.20.203.0 and on/after cdh3u5.
    FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath,
        jobFileModifiedRangePathFilter);

    LOG.info("Sorting " + jobFileStatusses.length + " job files.");
    Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

    // Process these files in batches at a time.
    int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
    LOG.info("Batch count: " + batchCount);
    for (int b = 0; b < batchCount; b++) {
      processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
    }
  } finally {
    processRecordService.close();
  }

  Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
  if (statistics != null) {
    LOG.info("HDFS bytes read: " + statistics.getBytesRead());
    LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
    LOG.info("HDFS read ops: " + statistics.getReadOps());
    LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
    LOG.info("HDFS write ops: " + statistics.getWriteOps());
  }

  // Return the status
  return success ? 0 : 1;
}
From source file: org.apache.orc.bench.ColumnProjectionBenchmark.java
License: Apache License
@Benchmark
public void parquet(ExtraCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  if ("taxi".equals(dataset)) {
    conf.set("columns", "vendor_id,pickup_time");
    conf.set("columns.types", "int,timestamp");
  } else if ("sales".equals(dataset)) {
    conf.set("columns", "sales_id,customer_id");
    conf.set("columns.types", "bigint,bigint");
  } else if ("github".equals(dataset)) {
    conf.set("columns", "actor,created_at");
    conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string,"
        + "id:int,login:string,url:string>,timestamp");
  } else {
    throw new IllegalArgumentException("Unknown data set " + dataset);
  }
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.records += 1;
  }
  recordReader.close();
  counters.bytesRead += statistics.getBytesRead();
  counters.reads += statistics.getReadOps();
  counters.invocations += 1;
}
From source file: org.apache.orc.bench.FullReadBenchmark.java
License: Apache License
@Benchmark
public void avro(ExtraCounters counters) throws Exception {
  Configuration conf = new Configuration();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "avro", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  FsInput file = new FsInput(path, conf);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
  GenericRecord record = null;
  while (dataFileReader.hasNext()) {
    record = dataFileReader.next(record);
    counters.records += 1;
  }
  counters.bytesRead += statistics.getBytesRead();
  counters.reads += statistics.getReadOps();
  counters.invocations += 1;
}
From source file: org.apache.orc.bench.FullReadBenchmark.java
License: Apache License
@Benchmark
public void parquet(ExtraCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.records += 1;
  }
  recordReader.close();
  counters.bytesRead += statistics.getBytesRead();
  counters.reads += statistics.getReadOps();
  counters.invocations += 1;
}
From source file: org.apache.orc.bench.hive.ColumnProjectionBenchmark.java
License: Apache License
@Benchmark
public void parquet(ReadCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  if ("taxi".equals(dataset)) {
    conf.set("columns", "vendor_id,pickup_time");
    conf.set("columns.types", "int,timestamp");
  } else if ("sales".equals(dataset)) {
    conf.set("columns", "sales_id,customer_id");
    conf.set("columns.types", "bigint,bigint");
  } else if ("github".equals(dataset)) {
    conf.set("columns", "actor,created_at");
    conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string,"
        + "id:int,login:string,url:string>,timestamp");
  } else {
    throw new IllegalArgumentException("Unknown data set " + dataset);
  }
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.addRecords(1);
  }
  recordReader.close();
  counters.addBytes(statistics.getReadOps(), statistics.getBytesRead());
  counters.addInvocation();
}
From source file: org.apache.orc.bench.hive.FullReadBenchmark.java
License: Apache License
@Benchmark
public void avro(ReadCounters counters) throws Exception {
  Configuration conf = new Configuration();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "avro", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  FsInput file = new FsInput(path, conf);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
  GenericRecord record = null;
  while (dataFileReader.hasNext()) {
    record = dataFileReader.next(record);
    counters.addRecords(1);
  }
  counters.addBytes(statistics.getReadOps(), statistics.getBytesRead());
  counters.addInvocation();
}
From source file: org.apache.orc.bench.hive.FullReadBenchmark.java
License: Apache License
@Benchmark
public void parquet(ReadCounters counters) throws Exception {
  JobConf conf = new JobConf();
  conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
  conf.set("fs.defaultFS", "track:///");
  Path path = Utilities.getVariant(root, dataset, "parquet", compression);
  FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
  statistics.reset();

  ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);
  NullWritable nada = NullWritable.get();
  FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
  org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
      new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(nada, value)) {
    counters.addRecords(1);
  }
  recordReader.close();
  counters.addBytes(statistics.getReadOps(), statistics.getBytesRead());
  counters.addInvocation();
}