/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.util;

import static com.uber.hoodie.common.util.FSUtils.getFs;

import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.MetadataNotFoundException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

/**
 * Utility functions for working with parquet files.
 */
public class ParquetUtils {

  /**
   * Read the rowKey list from the given parquet file.
   *
   * @param configuration configuration to build fs object
   * @param filePath      The parquet file path.
   * @return Set of row keys
   */
  public static Set<String> readRowKeysFromParquet(Configuration configuration, Path filePath) {
    return filterParquetRowKeys(configuration, filePath, new HashSet<>());
  }

  /**
   * Read the rowKey list matching the given filter, from the given parquet file. If the filter
   * is empty, this returns all the row keys.
   *
   * @param configuration configuration to build fs object
   * @param filePath      The parquet file path.
   * @param filter        record keys filter
   * @return Set of row keys matching candidateRecordKeys
   */
  public static Set<String> filterParquetRowKeys(Configuration configuration, Path filePath,
      Set<String> filter) {
    Optional<RecordKeysFilterFunction> filterFunction = Optional.empty();
    if (CollectionUtils.isNotEmpty(filter)) {
      filterFunction = Optional.of(new RecordKeysFilterFunction(filter));
    }
    Configuration conf = new Configuration(configuration);
    conf.addResource(getFs(filePath.toString(), conf).getConf());
    // Project only the record key field, so the reader skips the rest of the row payload.
    Schema readSchema = HoodieAvroUtils.getRecordKeySchema();
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    ParquetReader reader = null;
    Set<String> rowKeys = new HashSet<>();
    try {
      reader = AvroParquetReader.builder(filePath).withConf(conf).build();
      Object obj = reader.read();
      while (obj != null) {
        if (obj instanceof GenericRecord) {
          String recordKey =
              ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
          if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
            rowKeys.add(recordKey);
          }
        }
        obj = reader.read();
      }
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e) {
          // ignore
        }
      }
    }
    return rowKeys;
  }

  public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
    ParquetMetadata footer;
    try {
      // TODO(vc): Should we use the parallel reading version here?
      footer = ParquetFileReader.readFooter(getFs(parquetFilePath.toString(), conf).getConf(),
          parquetFilePath);
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
    }
    return footer;
  }

  /**
   * Get the schema of the given parquet file.
   */
  public static MessageType readSchema(Configuration configuration, Path parquetFilePath) {
    return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema();
  }

  private static List<String> readParquetFooter(Configuration configuration, Path parquetFilePath,
      String... footerNames) {
    List<String> footerVals = new ArrayList<>();
    ParquetMetadata footer = readMetadata(configuration, parquetFilePath);
    Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
    for (String footerName : footerNames) {
      if (metadata.containsKey(footerName)) {
        footerVals.add(metadata.get(footerName));
      } else {
        throw new MetadataNotFoundException("Could not find index in Parquet footer. "
            + "Looked for key " + footerName + " in " + parquetFilePath);
      }
    }
    return footerVals;
  }

  public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
    return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
  }

  /**
   * Read out the bloom filter from the parquet file metadata.
   */
  public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration,
      Path parquetFilePath) {
    String footerVal = readParquetFooter(configuration, parquetFilePath,
        HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0);
    return new BloomFilter(footerVal);
  }

  public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
    List<String> minMaxKeys = readParquetFooter(configuration, parquetFilePath,
        HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER,
        HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
    if (minMaxKeys.size() != 2) {
      throw new HoodieException(String.format(
          "Could not read min/max record key out of footer correctly from %s. Read: %s",
          parquetFilePath, minMaxKeys));
    }
    return new String[] {minMaxKeys.get(0), minMaxKeys.get(1)};
  }

  /**
   * NOTE: This literally reads the entire file contents, and should therefore be used
   * with caution.
   */
  public static List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath) {
    ParquetReader reader = null;
    List<GenericRecord> records = new ArrayList<>();
    try {
      reader = AvroParquetReader.builder(filePath).withConf(configuration).build();
      Object obj = reader.read();
      while (obj != null) {
        if (obj instanceof GenericRecord) {
          records.add((GenericRecord) obj);
        }
        obj = reader.read();
      }
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e);
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e) {
          // ignore
        }
      }
    }
    return records;
  }

  static class RecordKeysFilterFunction implements Function<String, Boolean> {

    private final Set<String> candidateKeys;

    RecordKeysFilterFunction(Set<String> candidateKeys) {
      this.candidateKeys = candidateKeys;
    }

    @Override
    public Boolean apply(String recordKey) {
      return candidateKeys.contains(recordKey);
    }
  }
}
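
/**
 * Usage sketch, not part of the original class: a minimal, hypothetical example of
 * calling the helpers above from the same package. The parquet path is a placeholder,
 * and the file is assumed to have been written through Hoodie, so that the
 * _hoodie_record_key field and the bloom filter / min-max footer entries exist.
 * BloomFilter#mightContain is assumed to be the membership-test method on
 * com.uber.hoodie.common.BloomFilter. Declared package-private so this file still
 * compiles as a single compilation unit alongside the public ParquetUtils class.
 */
class ParquetUtilsExample {

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hypothetical path; replace with a real Hoodie-written parquet file.
    Path parquetFile = new Path("file:///tmp/hoodie/2016/05/01/some-data-file.parquet");

    // Read every row key in the file; passing an empty filter keeps all keys.
    Set<String> allKeys = ParquetUtils.readRowKeysFromParquet(conf, parquetFile);
    System.out.println("row keys in file: " + allKeys.size());

    // Footer-only reads: the min/max record keys and the bloom filter live in the
    // parquet key-value metadata, so no data pages are scanned for these calls.
    String[] minMax = ParquetUtils.readMinMaxRecordKeys(conf, parquetFile);
    BloomFilter bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(conf, parquetFile);
    System.out.println("min=" + minMax[0] + ", max=" + minMax[1]
        + ", bloom filter might contain min key: " + bloomFilter.mightContain(minMax[0]));
  }
}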