Java tutorial: AvroUtils — reading Avro file metadata and converting Avro GenericRecords to BigQuery TableRows
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.util;

import static com.google.common.base.MoreObjects.firstNonNull;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Verify.verify;
import static com.google.common.base.Verify.verifyNotNull;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.BaseEncoding;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.util.Arrays;
import java.util.List;

import javax.annotation.Nullable;

/**
 * A set of utilities for working with Avro files.
 *
 * <p>These utilities are based on the <a
 * href="https://avro.apache.org/docs/1.7.7/spec.html">Avro 1.7.7</a> specification.
 */
public class AvroUtils {

  /**
   * Avro file metadata.
   */
  public static class AvroMetadata {
    private byte[] syncMarker;
    private String codec;
    private String schemaString;

    AvroMetadata(byte[] syncMarker, String codec, String schemaString) {
      this.syncMarker = syncMarker;
      this.codec = codec;
      this.schemaString = schemaString;
    }

    /**
     * The JSON-encoded <a href="https://avro.apache.org/docs/1.7.7/spec.html#schemas">schema</a>
     * string for the file.
     */
    public String getSchemaString() {
      return schemaString;
    }

    /**
     * The <a href="https://avro.apache.org/docs/1.7.7/spec.html#Required+Codecs">codec</a> of the
     * file.
     */
    public String getCodec() {
      return codec;
    }

    /**
     * The 16-byte sync marker for the file. See the documentation for
     * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">Object
     * Container File</a> for more information.
     */
    public byte[] getSyncMarker() {
      return syncMarker;
    }
  }

  /**
   * Reads the {@link AvroMetadata} from the header of an Avro file.
   *
   * <p>This method parses the header of an Avro
   * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">
   * Object Container File</a>.
   *
   * @throws IOException if the file is in an invalid format.
   */
  public static AvroMetadata readMetadataFromFile(String fileName) throws IOException {
    String codec = null;
    String schemaString = null;
    byte[] syncMarker;
    try (InputStream stream =
        Channels.newInputStream(IOChannelUtils.getFactory(fileName).open(fileName))) {
      BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(stream, null);

      // The header of an object container file begins with a four-byte magic number, followed
      // by the file metadata (including the schema and codec), encoded as a map. Finally, the
      // header ends with the file's 16-byte sync marker.
      // See https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files for details on
      // the encoding of container files.

      // Read the magic number.
      byte[] magic = new byte[DataFileConstants.MAGIC.length];
      decoder.readFixed(magic);
      if (!Arrays.equals(magic, DataFileConstants.MAGIC)) {
        throw new IOException("Missing Avro file signature: " + fileName);
      }

      // Read the metadata to find the codec and schema.
      ByteBuffer valueBuffer = ByteBuffer.allocate(512);
      long numRecords = decoder.readMapStart();
      while (numRecords > 0) {
        for (long recordIndex = 0; recordIndex < numRecords; recordIndex++) {
          String key = decoder.readString();
          // readBytes() clears the buffer and returns a buffer where:
          //  - position is the start of the bytes read
          //  - limit is the end of the bytes read
          valueBuffer = decoder.readBytes(valueBuffer);
          byte[] bytes = new byte[valueBuffer.remaining()];
          valueBuffer.get(bytes);
          if (key.equals(DataFileConstants.CODEC)) {
            codec = new String(bytes, "UTF-8");
          } else if (key.equals(DataFileConstants.SCHEMA)) {
            schemaString = new String(bytes, "UTF-8");
          }
        }
        numRecords = decoder.mapNext();
      }
      if (codec == null) {
        codec = DataFileConstants.NULL_CODEC;
      }

      // Finally, read the sync marker.
      syncMarker = new byte[DataFileConstants.SYNC_SIZE];
      decoder.readFixed(syncMarker);
    }
    return new AvroMetadata(syncMarker, codec, schemaString);
  }

  /**
   * Formats BigQuery seconds-since-epoch into String matching JSON export. Thread-safe and
   * immutable.
   */
  private static final DateTimeFormatter DATE_AND_SECONDS_FORMATTER =
      DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZoneUTC();

  // Package private for BigQueryTableRowIterator to use.
  static String formatTimestamp(String timestamp) {
    // timestamp is in "seconds since epoch" format, with scientific notation.
    // e.g., "1.45206229112345E9" to mean "2016-01-06 06:38:11.123456 UTC".
    // Separate into seconds and microseconds.
    double timestampDoubleMicros = Double.parseDouble(timestamp) * 1000000;
    long timestampMicros = (long) timestampDoubleMicros;
    long seconds = timestampMicros / 1000000;
    int micros = (int) (timestampMicros % 1000000);
    String dayAndTime = DATE_AND_SECONDS_FORMATTER.print(seconds * 1000);

    // No sub-second component.
    if (micros == 0) {
      return String.format("%s UTC", dayAndTime);
    }

    // Sub-second component.
    int digits = 6;
    int subsecond = micros;
    while (subsecond % 10 == 0) {
      digits--;
      subsecond /= 10;
    }
    String formatString = String.format("%%0%dd", digits);
    String fractionalSeconds = String.format(formatString, subsecond);
    return String.format("%s.%s UTC", dayAndTime, fractionalSeconds);
  }
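  // Illustrative sketch (not part of the original utility): how the metadata
  // reader and the timestamp formatter might be exercised. The file path is
  // hypothetical, and IOChannelUtils must have an IOChannelFactory registered
  // for the path's scheme, or getFactory() will fail.
  private static void metadataAndTimestampExample() throws IOException {
    AvroMetadata metadata = readMetadataFromFile("/tmp/example.avro");
    System.out.println("codec:  " + metadata.getCodec());        // e.g. "null" or "deflate"
    System.out.println("schema: " + metadata.getSchemaString()); // JSON-encoded Avro schema

    // formatTimestamp takes seconds since the epoch (scientific notation
    // allowed) and prints with up to microsecond precision, trimming
    // trailing zeros from the fractional part.
    String t1 = formatTimestamp("1.0E9"); // "2001-09-09 01:46:40 UTC"
    String t2 = formatTimestamp("1.5");   // "1970-01-01 00:00:01.5 UTC"
  }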
  /**
   * Utility function to convert from an Avro {@link GenericRecord} to a BigQuery {@link TableRow}.
   *
   * See <a href="https://cloud.google.com/bigquery/exporting-data-from-bigquery#config">
   * "Avro format"</a> for more information.
   */
  public static TableRow convertGenericRecordToTableRow(GenericRecord record, TableSchema schema) {
    return convertGenericRecordToTableRow(record, schema.getFields());
  }

  private static TableRow convertGenericRecordToTableRow(
      GenericRecord record, List<TableFieldSchema> fields) {
    TableRow row = new TableRow();
    for (TableFieldSchema subSchema : fields) {
      // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the name field
      // is required, so it may not be null.
      Field field = record.getSchema().getField(subSchema.getName());
      Object convertedValue =
          getTypedCellValue(field.schema(), subSchema, record.get(field.name()));
      if (convertedValue != null) {
        // To match the JSON files exported by BigQuery, do not include null values in the output.
        row.set(field.name(), convertedValue);
      }
    }
    return row;
  }

  @Nullable
  private static Object getTypedCellValue(Schema schema, TableFieldSchema fieldSchema, Object v) {
    // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the mode field
    // is optional (and so it may be null), but defaults to "NULLABLE".
    String mode = firstNonNull(fieldSchema.getMode(), "NULLABLE");
    switch (mode) {
      case "REQUIRED":
        return convertRequiredField(schema.getType(), fieldSchema, v);
      case "REPEATED":
        return convertRepeatedField(schema, fieldSchema, v);
      case "NULLABLE":
        return convertNullableField(schema, fieldSchema, v);
      default:
        throw new UnsupportedOperationException(
            "Parsing a field with BigQuery field schema mode " + fieldSchema.getMode());
    }
  }

  private static List<Object> convertRepeatedField(
      Schema schema, TableFieldSchema fieldSchema, Object v) {
    Type arrayType = schema.getType();
    verify(
        arrayType == Type.ARRAY,
        "BigQuery REPEATED field %s should be Avro ARRAY, not %s",
        fieldSchema.getName(),
        arrayType);
    // REPEATED fields are represented as Avro arrays.
    if (v == null) {
      // Handle the case of an empty repeated field.
      return ImmutableList.of();
    }
    @SuppressWarnings("unchecked")
    List<Object> elements = (List<Object>) v;
    ImmutableList.Builder<Object> values = ImmutableList.builder();
    Type elementType = schema.getElementType().getType();
    for (Object element : elements) {
      values.add(convertRequiredField(elementType, fieldSchema, element));
    }
    return values.build();
  }
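  // Illustrative sketch (not part of the original utility): how a BigQuery
  // REPEATED field flows through convertRepeatedField. The schema and field
  // names here are hypothetical.
  private static void repeatedFieldExample() {
    // A REPEATED STRING column arrives as an Avro ARRAY of strings.
    Schema tagsSchema =
        new Schema.Parser().parse("{\"type\": \"array\", \"items\": \"string\"}");
    TableFieldSchema tagsField =
        new TableFieldSchema().setName("tags").setType("STRING").setMode("REPEATED");

    // Non-null values convert element by element into an immutable list.
    List<Object> tags =
        convertRepeatedField(tagsSchema, tagsField, Arrays.asList("a", "b")); // ["a", "b"]

    // A null value means an empty repeated field, not a null cell.
    List<Object> empty = convertRepeatedField(tagsSchema, tagsField, null); // []
  }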
  private static Object convertRequiredField(Type avroType, TableFieldSchema fieldSchema, Object v) {
    // REQUIRED fields are represented as the corresponding Avro types. For example, a BigQuery
    // INTEGER type maps to an Avro LONG type.
    checkNotNull(v, "REQUIRED field %s should not be null", fieldSchema.getName());

    ImmutableMap<String, Type> fieldMap =
        ImmutableMap.<String, Type>builder()
            .put("STRING", Type.STRING)
            .put("BYTES", Type.BYTES)
            .put("INTEGER", Type.LONG)
            .put("FLOAT", Type.DOUBLE)
            .put("BOOLEAN", Type.BOOLEAN)
            .put("TIMESTAMP", Type.LONG)
            .put("RECORD", Type.RECORD)
            .put("DATE", Type.STRING)
            .put("DATETIME", Type.STRING)
            .put("TIME", Type.STRING)
            .build();

    // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the type field
    // is required, so it may not be null.
    String bqType = fieldSchema.getType();
    Type expectedAvroType = fieldMap.get(bqType);
    verifyNotNull(expectedAvroType, "Unsupported BigQuery type: %s", bqType);
    verify(
        avroType == expectedAvroType,
        "Expected Avro schema type %s, not %s, for BigQuery %s field %s",
        expectedAvroType,
        avroType,
        bqType,
        fieldSchema.getName());
    switch (fieldSchema.getType()) {
      case "STRING":
      case "DATE":
      case "DATETIME":
      case "TIME":
        // Avro will use a CharSequence to represent String objects, but it may not always use
        // java.lang.String; for example, it may prefer org.apache.avro.util.Utf8.
        verify(v instanceof CharSequence, "Expected CharSequence (String), got %s", v.getClass());
        return v.toString();
      case "INTEGER":
        verify(v instanceof Long, "Expected Long, got %s", v.getClass());
        return ((Long) v).toString();
      case "FLOAT":
        verify(v instanceof Double, "Expected Double, got %s", v.getClass());
        return v;
      case "BOOLEAN":
        verify(v instanceof Boolean, "Expected Boolean, got %s", v.getClass());
        return v;
      case "TIMESTAMP":
        // TIMESTAMP data types are represented as Avro LONG types. They are converted back to
        // Strings with variable precision (up to six digits) to match the JSON files exported
        // by BigQuery.
        verify(v instanceof Long, "Expected Long, got %s", v.getClass());
        Double doubleValue = ((Long) v) / 1000000.0;
        return formatTimestamp(doubleValue.toString());
      case "RECORD":
        verify(v instanceof GenericRecord, "Expected GenericRecord, got %s", v.getClass());
        return convertGenericRecordToTableRow((GenericRecord) v, fieldSchema.getFields());
      case "BYTES":
        verify(v instanceof ByteBuffer, "Expected ByteBuffer, got %s", v.getClass());
        ByteBuffer byteBuffer = (ByteBuffer) v;
        byte[] bytes = new byte[byteBuffer.limit()];
        byteBuffer.get(bytes);
        return BaseEncoding.base64().encode(bytes);
      default:
        throw new UnsupportedOperationException(
            String.format(
                "Unexpected BigQuery field schema type %s for field named %s",
                fieldSchema.getType(), fieldSchema.getName()));
    }
  }

  @Nullable
  private static Object convertNullableField(
      Schema avroSchema, TableFieldSchema fieldSchema, Object v) {
    // NULLABLE fields are represented as an Avro Union of the corresponding type and "null".
    verify(
        avroSchema.getType() == Type.UNION,
        "Expected Avro schema type UNION, not %s, for BigQuery NULLABLE field %s",
        avroSchema.getType(),
        fieldSchema.getName());
    List<Schema> unionTypes = avroSchema.getTypes();
    verify(
        unionTypes.size() == 2,
        "BigQuery NULLABLE field %s should be an Avro UNION of NULL and another type, not %s",
        fieldSchema.getName(),
        unionTypes);

    if (v == null) {
      return null;
    }

    Type firstType = unionTypes.get(0).getType();
    if (!firstType.equals(Type.NULL)) {
      return convertRequiredField(firstType, fieldSchema, v);
    }
    return convertRequiredField(unionTypes.get(1).getType(), fieldSchema, v);
  }
}
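Putting the pieces together, here is a minimal end-to-end sketch of the public entry point. Everything in it is hypothetical illustration rather than part of the utility: it assumes the same imports as AvroUtils plus org.apache.avro.generic.GenericData, and it hand-builds an Avro schema shaped the way a BigQuery export shapes one (a NULLABLE column becomes a union with "null").

// Avro side: a record with one plain (REQUIRED-style) field and one
// union-with-null (NULLABLE-style) field.
Schema avroSchema = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"Example\", \"fields\": ["
        + "{\"name\": \"name\", \"type\": \"string\"},"
        + "{\"name\": \"count\", \"type\": [\"null\", \"long\"]}]}");
GenericRecord record = new GenericData.Record(avroSchema);
record.put("name", "widget");
record.put("count", 42L);

// BigQuery side: the matching table schema.
TableSchema tableSchema = new TableSchema().setFields(ImmutableList.of(
    new TableFieldSchema().setName("name").setType("STRING").setMode("REQUIRED"),
    new TableFieldSchema().setName("count").setType("INTEGER").setMode("NULLABLE")));

TableRow row = AvroUtils.convertGenericRecordToTableRow(record, tableSchema);
// row is {"name": "widget", "count": "42"}. Note that INTEGER values come back
// as Strings, and null-valued NULLABLE fields are omitted from the row
// entirely, both to match BigQuery's JSON export.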