List of usage examples for org.apache.hadoop.fs.FileSystem.getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
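Before the project-specific examples below, a minimal, self-contained sketch of a typical call. This is not taken from any of the listed sources; the path and the default Configuration are placeholders.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();   // picks up core-site.xml / hdfs-site.xml if present on the classpath
        Path path = new Path("/tmp/example.txt");   // placeholder path
        FileSystem fs = path.getFileSystem(conf);

        // getFileStatus returns metadata for the path, or throws FileNotFoundException if it does not exist
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length (bytes): " + status.getLen());
        System.out.println("is directory:   " + status.isDirectory());
        System.out.println("modified at:    " + status.getModificationTime());
        System.out.println("owner:          " + status.getOwner());
    }
}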
From source file:com.facebook.presto.hive.OrcFileWriterFactory.java
License:Apache License
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames,
        StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session) {
    if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    boolean isDwrf;
    if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        isDwrf = false;
    } else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        isDwrf = true;
    } else {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings()
            .splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()),
                            fileSystem.getFileStatus(path).getLen(), getOrcMaxMergeDistance(session),
                            getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false,
                            fileSystem.open(path), stats);
                } catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new OrcFileWriter(outputStream, rollbackAction, isDwrf, fileColumnNames,
                fileColumnTypes, compression, fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(),
                hiveStorageTimeZone, validationInputFactory));
    } catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
From source file:com.facebook.presto.hive.parquet.HdfsParquetDataSource.java
License:Apache License
public static HdfsParquetDataSource buildHdfsParquetDataSource(FileSystem fileSystem, Path path, long start,
        long length) {
    try {
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        return new HdfsParquetDataSource(path, size, inputStream);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed")
                || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
                format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length,
                        e.getMessage()), e);
    }
}
From source file:com.facebook.presto.hive.parquet.ParquetTester.java
License:Apache License
private static void assertFileContents(JobConf jobConf, ObjectInspector objectInspector, TempFile tempFile,
        Iterable<?> expectedValues, Type type) throws IOException, InterruptedException {
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();
    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);
    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(),
            dataSource, typeManager);
    assertEquals(parquetReader.getPosition(), 0);
    int rowsProcessed = 0;
    Iterator<?> iterator = expectedValues.iterator();
    for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
        ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
        Block block = parquetReader.readPrimitive(columnDescriptor, type);
        for (int i = 0; i < batchSize; i++) {
            assertTrue(iterator.hasNext());
            Object expected = iterator.next();
            Object actual = decodeObject(type, block, i);
            assertEquals(actual, expected);
        }
        rowsProcessed += batchSize;
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    }
    assertFalse(iterator.hasNext());
    assertEquals(parquetReader.getPosition(), rowsProcessed);
    parquetReader.close();
}
From source file:com.facebook.presto.hive.parquet.reader.ParquetMetadataReader.java
License:Apache License
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException {
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length,
                "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic),
                "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC),
                Arrays.toString(magic));
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray())
                                    .asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values,
                            metaData.total_compressed_size, metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData,
                fileMetaData.getCreated_by()), blocks);
    }
}
From source file:com.facebook.presto.hive.rcfile.RcFilePageSourceFactory.java
License:Apache License
@Override
public Optional<? extends ConnectorPageSource> createPageSource(Configuration configuration,
        ConnectorSession session, Path path, long start, long length, Properties schema,
        List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate,
        DateTimeZone hiveStorageTimeZone) {
    if (!isRcfileOptimizedReaderEnabled(session)) {
        return Optional.empty();
    }
    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    } else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema, hiveStorageTimeZone);
    } else {
        return Optional.empty();
    }
    long size;
    FSDataInputStream inputStream;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        size = fileSystem.getFileStatus(path).getLen();
        inputStream = fileSystem.open(path);
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        for (HiveColumnHandle column : columns) {
            readColumns.put(column.getHiveColumnIndex(), column.getHiveType().getType(typeManager));
        }
        RcFileReader rcFileReader = new RcFileReader(
                new HdfsRcFileDataSource(path.toString(), inputStream, size), rcFileEncoding,
                readColumns.build(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), start,
                length, new DataSize(1, Unit.MEGABYTE));
        return Optional.of(new RcFilePageSource(rcFileReader, columns, hiveStorageTimeZone, typeManager));
    } catch (Throwable e) {
        try {
            inputStream.close();
        } catch (IOException ignored) {
        }
        throw Throwables.propagate(e);
    }
}
From source file:com.facebook.presto.hive.RcFileFileWriterFactory.java
License:Apache License
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames,
        StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session) {
    if (!HiveSessionProperties.isRcfileOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    } else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = createTextVectorEncoding(schema, hiveStorageTimeZone);
    } else {
        return Optional.empty();
    }
    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings()
            .splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);
        Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path),
                            fileSystem.getFileStatus(path).getLen(), stats);
                } catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new RcFileFileWriter(outputStream, rollbackAction, rcFileEncoding, fileColumnTypes,
                codecName, fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(),
                validationInputFactory));
    } catch (Exception e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
From source file:com.flipkart.fdp.migration.distcp.utils.FileCountDriver.java
License:Apache License
public List<String> getFileStatusRecursive(Path path, FileSystem fs, String destBasePath) throws IOException {
    List<String> response = new ArrayList<String>();
    FileStatus file = fs.getFileStatus(path);
    if (file != null && file.isFile()) {
        response.add(trimExtension(file.getPath().toUri().getPath(), destBasePath));
        return response;
    }
    FileStatus[] fstats = fs.listStatus(path);
    if (fstats != null && fstats.length > 0) {
        for (FileStatus fstat : fstats) {
            if (fstat.isDirectory()) {
                response.addAll(getFileStatusRecursive(fstat.getPath(), fs, destBasePath));
            } else {
                response.add(trimExtension(fstat.getPath().toUri().getPath(), destBasePath));
            }
        }
    }
    return response;
}
From source file:com.floodCtr.Util.java
License:Open Source License
public static LocalResource newYarnAppResource(FileSystem fs, Path path, LocalResourceType type,
        LocalResourceVisibility vis) throws IOException {
    Path qualified = fs.makeQualified(path);
    FileStatus status = fs.getFileStatus(qualified);
    LocalResource resource = Records.newRecord(LocalResource.class);
    resource.setType(type);
    resource.setVisibility(vis);
    resource.setResource(ConverterUtils.getYarnUrlFromPath(qualified));
    resource.setTimestamp(status.getModificationTime());
    resource.setSize(status.getLen());
    return resource;
}
From source file:com.flyhz.avengers.framework.AvengersClient.java
License:Apache License
/**
 * Main run function for the client
 *
 * @return true if application completed successfully
 * @throws IOException
 * @throws YarnException
 */
private boolean run(String appName, List<String> commands) throws IOException, YarnException {
    LOG.info("Running Client");
    yarnClient.start();
    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    LOG.info("Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers());
    List<NodeReport> clusterNodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
    LOG.info("Got Cluster node info from ASM");
    for (NodeReport node : clusterNodeReports) {
        LOG.info("Got node report from ASM for" + ", nodeId=" + node.getNodeId() + ", nodeAddress"
                + node.getHttpAddress() + ", nodeRackName" + node.getRackName() + ", nodeNumContainers"
                + node.getNumContainers());
    }
    QueueInfo queueInfo = yarnClient.getQueueInfo(this.amQueue);
    LOG.info("Queue info" + ", queueName=" + queueInfo.getQueueName() + ", queueCurrentCapacity="
            + queueInfo.getCurrentCapacity() + ", queueMaxCapacity=" + queueInfo.getMaximumCapacity()
            + ", queueApplicationCount=" + queueInfo.getApplications().size() + ", queueChildQueueCount="
            + queueInfo.getChildQueues().size());
    List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
    for (QueueUserACLInfo aclInfo : listAclInfo) {
        for (QueueACL userAcl : aclInfo.getUserAcls()) {
            LOG.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl="
                    + userAcl.name());
        }
    }
    // Get a new application id
    YarnClientApplication app = yarnClient.createApplication();
    GetNewApplicationResponse appResponse = app.getNewApplicationResponse();
    // if needed
    // If we do not have min/max, we may not be able to correctly request
    // the required resources from the RM for the app master
    // Memory ask has to be a multiple of min and less than max.
    // Dump out information about cluster capability as seen by the resource
    // manager
    int maxMem = appResponse.getMaximumResourceCapability().getMemory();
    LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
    // A resource ask cannot exceed the max.
    if (amMemory > maxMem) {
        LOG.info("AM memory specified above max threshold of cluster. Using max value." + ", specified="
                + amMemory + ", max=" + maxMem);
        amMemory = maxMem;
    }
    // set the application name
    ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
    ApplicationId appId = appContext.getApplicationId();
    appContext.setApplicationName(appName);
    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);
    // set local resources for the application master
    // local files or archives as needed
    // In this scenario, the jar file for the application master is part of
    // the local resources
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    FileSystem fs = DistributedFileSystem.get(conf);
    Path src = new Path(appJar);
    Path dst = new Path(fs.getHomeDirectory(), "avengers/" + batchId + "/avengers.jar");
    if (copy) {
        LOG.info("copy local jar to hdfs");
        fs.copyFromLocalFile(false, true, src, dst);
        copy = false;
    }
    this.hdfsPath = dst.toUri().toString();
    LOG.info("hdfs hdfsPath = {}", dst);
    FileStatus destStatus = fs.getFileStatus(dst);
    LocalResource amJarRsrc = Records.newRecord(LocalResource.class);
    amJarRsrc.setType(LocalResourceType.FILE);
    amJarRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
    LOG.info("YarnURLFromPath ->{}", ConverterUtils.getYarnUrlFromPath(dst));
    amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
    amJarRsrc.setTimestamp(destStatus.getModificationTime());
    amJarRsrc.setSize(destStatus.getLen());
    localResources.put("avengers.jar", amJarRsrc);
    // Set the log4j properties if needed
    if (!log4jPropFile.isEmpty()) {
        Path log4jSrc = new Path(log4jPropFile);
        Path log4jDst = new Path(fs.getHomeDirectory(), "log4j.props");
        fs.copyFromLocalFile(false, true, log4jSrc, log4jDst);
        FileStatus log4jFileStatus = fs.getFileStatus(log4jDst);
        LocalResource log4jRsrc = Records.newRecord(LocalResource.class);
        log4jRsrc.setType(LocalResourceType.FILE);
        log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
        log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri()));
        log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime());
        log4jRsrc.setSize(log4jFileStatus.getLen());
        localResources.put("log4j.properties", log4jRsrc);
    }
    // The shell script has to be made available on the final container(s)
    // where it will be executed.
    // To do this, we need to first copy into the filesystem that is visible
    // to the yarn framework.
    // We do not need to set this as a local resource for the application
    // master as the application master does not need it.
    // Set local resource info into app master container launch context
    amContainer.setLocalResources(localResources);
    // Set the necessary security tokens as needed
    // amContainer.setContainerTokens(containerToken);
    // Set the env variables to be setup in the env where the application
    // master will be run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<String, String>();
    StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$()).append(File.pathSeparatorChar);
    for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
            YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
        classPathEnv.append(File.pathSeparatorChar);
        classPathEnv.append(c.trim());
    }
    classPathEnv.append(File.pathSeparatorChar).append("./log4j.properties");
    // add the runtime classpath needed for tests to work
    if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
        classPathEnv.append(':');
        classPathEnv.append(System.getProperty("java.class.path"));
    }
    LOG.info("CLASSPATH -> " + classPathEnv);
    env.put("CLASSPATH", classPathEnv.toString());
    amContainer.setEnvironment(env);
    for (String cmd : commands) {
        LOG.info("run command {},appId {}", cmd, appId.getId());
    }
    amContainer.setCommands(commands);
    // Set up resource type requirements
    // For now, only memory is supported so we set memory requirements
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(amMemory);
    appContext.setResource(capability);
    // Service data is a binary blob that can be passed to the application
    // Not needed in this scenario
    // amContainer.setServiceData(serviceData);
    // Setup security tokens
    if (UserGroupInformation.isSecurityEnabled()) {
        Credentials credentials = new Credentials();
        String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
        if (tokenRenewer == null || tokenRenewer.length() == 0) {
            throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
        }
        // For now, only getting tokens for the default file-system.
        final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials);
        if (tokens != null) {
            for (Token<?> token : tokens) {
                LOG.info("Got dt for " + fs.getUri() + "; " + token);
            }
        }
        DataOutputBuffer dob = new DataOutputBuffer();
        credentials.writeTokenStorageToStream(dob);
        ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
        amContainer.setTokens(fsTokens);
    }
    appContext.setAMContainerSpec(amContainer);
    // Set the priority for the application master
    Priority pri = Records.newRecord(Priority.class);
    pri.setPriority(amPriority);
    appContext.setPriority(pri);
    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(amQueue);
    // Submit the application to the applications manager
    // SubmitApplicationResponse submitResp =
    // applicationsManager.submitApplication(appRequest);
    // Ignore the response as either a valid response object is returned on
    // success
    // or an exception thrown to denote some form of a failure
    LOG.info("Submitting application to ASM");
    yarnClient.submitApplication(appContext);
    // Try submitting the same request again
    // app submission failure?
    // Monitor the application
    return monitorApplication(appId);
}
From source file:com.fullcontact.cassandra.io.compress.CompressionMetadata.java
License:Apache License
/**
 * Create metadata about given compressed file including uncompressed data length, chunk size
 * and list of the chunk offsets of the compressed data.
 * <p/>
 * This is an expensive operation! Don't create more than one for each
 * sstable.
 *
 * @param dataFilePath Path to the compressed file
 * @return metadata about given compressed file.
 */
public static CompressionMetadata create(String dataFilePath, FileSystem fs) {
    Descriptor desc = Descriptor.fromFilename(dataFilePath);
    try {
        return new CompressionMetadata(desc.filenameFor(Component.COMPRESSION_INFO),
                fs.getFileStatus(new Path(dataFilePath)).getLen(), fs);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
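Several of the examples above call getFileStatus(path).getLen() to size a data source before opening it. As a closing sketch (not drawn from any of the sources above; the class and method names are illustrative), the same call also serves as an existence and type check, since getFileStatus throws FileNotFoundException when the path is absent:

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class FileLength {
    private FileLength() {
    }

    /**
     * Returns the length of a regular file, or -1 if the path is missing or is a directory.
     */
    public static long lengthOrMinusOne(FileSystem fs, Path path) throws IOException {
        try {
            // throws FileNotFoundException when the path does not exist
            FileStatus status = fs.getFileStatus(path);
            return status.isFile() ? status.getLen() : -1L;
        } catch (FileNotFoundException e) {
            return -1L;
        }
    }
}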