Java tutorial: Apache Arrow's JsonFileWriter

The class below, org.apache.arrow.vector.ipc.JsonFileWriter, uses Jackson's streaming JsonGenerator to write an Arrow schema, any dictionary batches, and a sequence of record batches (VectorSchemaRoot instances) to a JSON file.
/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package org.apache.arrow.vector.ipc;

import static org.apache.arrow.vector.BufferLayout.BufferType.*;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.google.common.collect.ImmutableList;

import io.netty.buffer.ArrowBuf;

import org.apache.arrow.vector.*;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.BufferLayout.BufferType;
import org.apache.arrow.vector.TypeLayout;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.arrow.vector.util.DictionaryUtility;
import org.apache.commons.codec.binary.Hex;

import com.fasterxml.jackson.core.JsonEncoding;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.util.DefaultPrettyPrinter;
import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter;
import com.fasterxml.jackson.databind.MappingJsonFactory;

public class JsonFileWriter implements AutoCloseable {

  public static final class JSONWriteConfig {
    private final boolean pretty;

    private JSONWriteConfig(boolean pretty) {
      this.pretty = pretty;
    }

    private JSONWriteConfig() {
      this.pretty = false;
    }

    public JSONWriteConfig pretty(boolean pretty) {
      return new JSONWriteConfig(pretty);
    }
  }

  public static JSONWriteConfig config() {
    return new JSONWriteConfig();
  }

  private final JsonGenerator generator;
  private Schema schema;

  public JsonFileWriter(File outputFile) throws IOException {
    this(outputFile, config());
  }

  public JsonFileWriter(File outputFile, JSONWriteConfig config) throws IOException {
    MappingJsonFactory jsonFactory = new MappingJsonFactory();
    this.generator = jsonFactory.createGenerator(outputFile, JsonEncoding.UTF8);
    if (config.pretty) {
      DefaultPrettyPrinter prettyPrinter = new DefaultPrettyPrinter();
      prettyPrinter.indentArraysWith(NopIndenter.instance);
      this.generator.setPrettyPrinter(prettyPrinter);
    }
    // Allow writing of floating point NaN values not as strings
    this.generator.configure(JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS, false);
  }

  public void start(Schema schema, DictionaryProvider provider) throws IOException {
    List<Field> fields = new ArrayList<>(schema.getFields().size());
    Set<Long> dictionaryIdsUsed = new HashSet<>();
    this.schema = schema;  // Store original Schema to ensure batches written match

    // Convert fields with dictionaries to have dictionary type
    for (Field field : schema.getFields()) {
      fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed));
    }
    Schema updatedSchema = new Schema(fields, schema.getCustomMetadata());

    generator.writeStartObject();
    generator.writeObjectField("schema", updatedSchema);

    // Write all dictionaries that were used
    if (!dictionaryIdsUsed.isEmpty()) {
      writeDictionaryBatches(generator, dictionaryIdsUsed, provider);
    }

    // Start writing of record batches
    generator.writeArrayFieldStart("batches");
  }

  private void writeDictionaryBatches(JsonGenerator generator, Set<Long> dictionaryIdsUsed,
      DictionaryProvider provider) throws IOException {
    generator.writeArrayFieldStart("dictionaries");
    for (Long id : dictionaryIdsUsed) {
      generator.writeStartObject();
      generator.writeObjectField("id", id);

      generator.writeFieldName("data");
      Dictionary dictionary = provider.lookup(id);
      FieldVector vector = dictionary.getVector();
      List<Field> fields = ImmutableList.of(vector.getField());
      List<FieldVector> vectors = ImmutableList.of(vector);
      VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getValueCount());
      writeBatch(root);

      generator.writeEndObject();
    }
    generator.writeEndArray();
  }

  public void write(VectorSchemaRoot recordBatch) throws IOException {
    if (!recordBatch.getSchema().equals(schema)) {
      throw new IllegalArgumentException("record batches must have the same schema: " + schema);
    }
    writeBatch(recordBatch);
  }

  private void writeBatch(VectorSchemaRoot recordBatch) throws IOException {
    generator.writeStartObject();
    {
      generator.writeObjectField("count", recordBatch.getRowCount());
      generator.writeArrayFieldStart("columns");
      for (Field field : recordBatch.getSchema().getFields()) {
        FieldVector vector = recordBatch.getVector(field.getName());
        writeFromVectorIntoJson(field, vector);
      }
      generator.writeEndArray();
    }
    generator.writeEndObject();
  }

  private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOException {
    List<BufferType> vectorTypes = TypeLayout.getTypeLayout(field.getType()).getBufferTypes();
    List<ArrowBuf> vectorBuffers = vector.getFieldBuffers();
    if (vectorTypes.size() != vectorBuffers.size()) {
      throw new IllegalArgumentException("vector types and inner vector buffers are not the same size: " +
          vectorTypes.size() + " != " + vectorBuffers.size());
    }
    generator.writeStartObject();
    {
      generator.writeObjectField("name", field.getName());
      int valueCount = vector.getValueCount();
      generator.writeObjectField("count", valueCount);
      final int scale = (vector instanceof DecimalVector) ? ((DecimalVector) vector).getScale() : 0;
      for (int v = 0; v < vectorTypes.size(); v++) {
        BufferType bufferType = vectorTypes.get(v);
        ArrowBuf vectorBuffer = vectorBuffers.get(v);
        generator.writeArrayFieldStart(bufferType.getName());
        final int bufferValueCount = (bufferType.equals(OFFSET)) ? valueCount + 1 : valueCount;
        for (int i = 0; i < bufferValueCount; i++) {
          if (bufferType.equals(DATA) && (vector.getMinorType() == Types.MinorType.VARCHAR ||
              vector.getMinorType() == Types.MinorType.VARBINARY)) {
            writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v - 1), vector, i, scale);
          } else {
            writeValueToGenerator(bufferType, vectorBuffer, null, vector, i, scale);
          }
        }
        generator.writeEndArray();
      }
      List<Field> fields = field.getChildren();
      List<FieldVector> children = vector.getChildrenFromFields();
      if (fields.size() != children.size()) {
        throw new IllegalArgumentException("fields and children are not the same size: " +
            fields.size() + " != " + children.size());
      }
      if (fields.size() > 0) {
        generator.writeArrayFieldStart("children");
        for (int i = 0; i < fields.size(); i++) {
          Field childField = fields.get(i);
          FieldVector childVector = children.get(i);
          writeFromVectorIntoJson(childField, childVector);
        }
        generator.writeEndArray();
      }
    }
    generator.writeEndObject();
  }

  private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, ArrowBuf offsetBuffer,
      FieldVector vector, final int index, final int scale) throws IOException {
    if (bufferType.equals(TYPE)) {
      generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH));
    } else if (bufferType.equals(OFFSET)) {
      generator.writeNumber(buffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH));
    } else if (bufferType.equals(VALIDITY)) {
      generator.writeNumber(vector.isNull(index) ? 0 : 1);
    } else if (bufferType.equals(DATA)) {
      switch (vector.getMinorType()) {
        case TINYINT:
          generator.writeNumber(TinyIntVector.get(buffer, index));
          break;
        case SMALLINT:
          generator.writeNumber(SmallIntVector.get(buffer, index));
          break;
        case INT:
          generator.writeNumber(IntVector.get(buffer, index));
          break;
        case BIGINT:
          generator.writeNumber(BigIntVector.get(buffer, index));
          break;
        case FLOAT4:
          generator.writeNumber(Float4Vector.get(buffer, index));
          break;
        case FLOAT8:
          generator.writeNumber(Float8Vector.get(buffer, index));
          break;
        case DATEDAY:
          generator.writeNumber(DateDayVector.get(buffer, index));
          break;
        case DATEMILLI:
          generator.writeNumber(DateMilliVector.get(buffer, index));
          break;
        case TIMESEC:
          generator.writeNumber(TimeSecVector.get(buffer, index));
          break;
        case TIMEMILLI:
          generator.writeNumber(TimeMilliVector.get(buffer, index));
          break;
        case TIMEMICRO:
          generator.writeNumber(TimeMicroVector.get(buffer, index));
          break;
        case TIMENANO:
          generator.writeNumber(TimeNanoVector.get(buffer, index));
          break;
        case TIMESTAMPSEC:
          generator.writeNumber(TimeStampSecVector.get(buffer, index));
          break;
        case TIMESTAMPMILLI:
          generator.writeNumber(TimeStampMilliVector.get(buffer, index));
          break;
        case TIMESTAMPMICRO:
          generator.writeNumber(TimeStampMicroVector.get(buffer, index));
          break;
        case TIMESTAMPNANO:
          generator.writeNumber(TimeStampNanoVector.get(buffer, index));
          break;
        case TIMESTAMPSECTZ:
          generator.writeNumber(TimeStampSecTZVector.get(buffer, index));
          break;
        case TIMESTAMPMILLITZ:
          generator.writeNumber(TimeStampMilliTZVector.get(buffer, index));
          break;
        case TIMESTAMPMICROTZ:
          generator.writeNumber(TimeStampMicroTZVector.get(buffer, index));
          break;
        case TIMESTAMPNANOTZ:
          generator.writeNumber(TimeStampNanoTZVector.get(buffer, index));
          break;
        case BIT:
          generator.writeNumber(BitVectorHelper.get(buffer, index));
          break;
        case VARBINARY: {
          assert offsetBuffer != null;
          String hexString = Hex.encodeHexString(BaseVariableWidthVector.get(buffer, offsetBuffer, index));
          generator.writeObject(hexString);
          break;
        }
        case VARCHAR: {
          assert offsetBuffer != null;
          byte[] b = (BaseVariableWidthVector.get(buffer, offsetBuffer, index));
          generator.writeString(new String(b, "UTF-8"));
          break;
        }
        case DECIMAL: {
          BigDecimal decimalValue = DecimalUtility.getBigDecimalFromArrowBuf(buffer, index, scale);
          // We write the unscaled value, because the scale is stored in the type metadata.
          generator.writeString(decimalValue.unscaledValue().toString());
          break;
        }
        default:
          throw new UnsupportedOperationException("minor type: " + vector.getMinorType());
      }
    }
  }

  @Override
  public void close() throws IOException {
    generator.writeEndArray();
    generator.writeEndObject();
    generator.close();
  }
}
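For orientation, here is a minimal usage sketch; it is not part of the file above. It assumes the Arrow Java API from roughly the same era as this class (RootAllocator, VectorSchemaRoot.create, IntVector, DictionaryProvider.MapDictionaryProvider), and the class name JsonFileWriterExample, the column "x", and the output file "example.json" are invented for the example. Exact signatures can differ between Arrow versions, so treat it as illustrative rather than canonical. The flow mirrors the writer's contract: start() writes the schema (plus any dictionaries), each write() appends one record batch to the "batches" array, and close() terminates the JSON document.

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.ipc.JsonFileWriter;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

public class JsonFileWriterExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical example schema: one nullable 32-bit signed int column named "x"
    Schema schema = new Schema(Arrays.asList(
        Field.nullable("x", new ArrowType.Int(32, true))));

    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator);
         JsonFileWriter writer = new JsonFileWriter(new File("example.json"),
             JsonFileWriter.config().pretty(true))) {

      // Fill a single three-row batch: x = [1, 2, 3]
      IntVector x = (IntVector) root.getVector("x");
      x.allocateNew(3);
      for (int i = 0; i < 3; i++) {
        x.setSafe(i, i + 1);
      }
      x.setValueCount(3);
      root.setRowCount(3);

      // No dictionary-encoded columns in this schema, so an empty provider suffices
      DictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();

      writer.start(root.getSchema(), provider); // writes "schema" (and "dictionaries" if any were used)
      writer.write(root);                       // appends one entry to the "batches" array
    }                                           // writer.close() ends "batches" and the enclosing object
  }
}

Because the writer drives a streaming JsonGenerator, the call order matters: start() must come before any write(), every batch passed to write() must have the same Schema object contents that were passed to start(), and close() must run (here via try-with-resources) or the output file is left with an unterminated JSON document.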