org.apache.arrow.vector.file.json.JsonFileReader.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.arrow.vector.file.json.JsonFileReader.java

Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.arrow.vector.file.json;

import static com.fasterxml.jackson.core.JsonToken.END_ARRAY;
import static com.fasterxml.jackson.core.JsonToken.END_OBJECT;
import static com.fasterxml.jackson.core.JsonToken.START_ARRAY;
import static com.fasterxml.jackson.core.JsonToken.START_OBJECT;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.arrow.vector.schema.ArrowVectorType.OFFSET;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.BitVector;
import org.apache.arrow.vector.BufferBacked;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.SmallIntVector;
import org.apache.arrow.vector.TimeStampSecVector;
import org.apache.arrow.vector.TimeStampMilliVector;
import org.apache.arrow.vector.TimeStampMicroVector;
import org.apache.arrow.vector.TimeStampNanoVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.UInt1Vector;
import org.apache.arrow.vector.UInt2Vector;
import org.apache.arrow.vector.UInt4Vector;
import org.apache.arrow.vector.UInt8Vector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.ValueVector.Mutator;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.complex.NullableMapVector;
import org.apache.arrow.vector.schema.ArrowVectorType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.MappingJsonFactory;
import com.google.common.base.Objects;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;

/**
 * Reads Arrow record batches from a JSON file.
 *
 * <p>The layout this reader expects, in order, is:
 * <pre>
 * {
 *   "schema": { ... },
 *   "batches": [
 *     {
 *       "count": N,
 *       "columns": [
 *         { "name": ..., "count": N, &lt;one array per vector type of the field layout&gt;, "children": [ ... ] }
 *       ]
 *     }
 *   ]
 * }
 * </pre>
 * Field order is significant: every field is checked by name as it is consumed from the parser.
 *
 * <p>Usage: call {@link #start()} once to obtain the schema, then call {@link #read()} repeatedly
 * until it returns {@code null}, and finally {@link #close()} the reader.
 */
public class JsonFileReader implements AutoCloseable {
    private final File inputFile;
    private final JsonParser parser;
    private final BufferAllocator allocator;
    /** Schema read by {@link #start()}; {@code null} until then. */
    private Schema schema;

    /**
     * Opens a JSON parser on the given file.
     *
     * @param inputFile the JSON file to read
     * @param allocator the allocator used to create the vectors of each record batch
     * @throws JsonParseException if the parser cannot be created for the file content
     * @throws IOException if the file cannot be opened
     */
    public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException {
        this.inputFile = inputFile;
        this.allocator = allocator;
        MappingJsonFactory jsonFactory = new MappingJsonFactory();
        this.parser = jsonFactory.createParser(inputFile);
    }

    /**
     * Reads the file header: the opening object, the {@code "schema"} field and the start of
     * the {@code "batches"} array. After this call the parser is positioned so that
     * {@link #read()} can consume the first batch.
     *
     * @return the schema declared in the file
     * @throws IllegalStateException if the expected tokens or field names are not found
     * @throws IOException on read or parse failure
     */
    public Schema start() throws JsonParseException, IOException {
        readToken(START_OBJECT);
        this.schema = readNextField("schema", Schema.class);
        nextFieldIs("batches");
        readToken(START_ARRAY);
        return schema;
    }

    /**
     * Reads the next record batch from the {@code "batches"} array.
     *
     * <p>If parsing fails part-way through, the partially populated batch is closed before the
     * exception is propagated, so its vectors are not left open.
     *
     * @return the next batch (owned by the caller), or {@code null} when the end of the
     *         {@code "batches"} array has been reached
     * @throws IllegalStateException if {@link #start()} has not been called, or if an expected
     *         token or field name is missing
     * @throws IllegalArgumentException if the next token is neither a batch object nor the end
     *         of the array, or if a column does not match the schema
     * @throws IOException on read or parse failure
     */
    public VectorSchemaRoot read() throws IOException {
        if (schema == null) {
            throw new IllegalStateException("start() must be called before read()");
        }
        JsonToken t = parser.nextToken();
        if (t == END_ARRAY) {
            return null;
        }
        if (t != START_OBJECT) {
            throw new IllegalArgumentException("Invalid token: " + t);
        }
        VectorSchemaRoot recordBatch = new VectorSchemaRoot(schema, allocator);
        try {
            int count = readNextField("count", Integer.class);
            recordBatch.setRowCount(count);
            nextFieldIs("columns");
            readToken(START_ARRAY);
            // Columns must appear in the same order as the schema's fields.
            for (Field field : schema.getFields()) {
                FieldVector vector = recordBatch.getVector(field.getName());
                readVector(field, vector);
            }
            readToken(END_ARRAY);
            readToken(END_OBJECT);
            return recordBatch;
        } catch (IOException | RuntimeException e) {
            // Do not leak the batch when the input is malformed.
            try {
                recordBatch.close();
            } catch (RuntimeException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Reads one column object into {@code vector}, recursing into the field's children.
     *
     * @param field the schema field describing the column
     * @param vector the vector to populate
     * @throws IllegalArgumentException if the field layout and the vector disagree on the number
     *         of inner vectors or children, if the column name does not match the field, or if
     *         the column count is negative
     * @throws IllegalStateException if an expected token or field name is missing
     * @throws IOException on read or parse failure
     */
    private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException {
        List<ArrowVectorType> vectorTypes = field.getTypeLayout().getVectorTypes();
        List<BufferBacked> fieldInnerVectors = vector.getFieldInnerVectors();
        if (vectorTypes.size() != fieldInnerVectors.size()) {
            throw new IllegalArgumentException("vector types and inner vectors are not the same size: "
                    + vectorTypes.size() + " != " + fieldInnerVectors.size());
        }
        readToken(START_OBJECT);
        {
            String name = readNextField("name", String.class);
            if (!Objects.equal(field.getName(), name)) {
                throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name);
            }
            int count = readNextField("count", Integer.class);
            if (count < 0) {
                throw new IllegalArgumentException("Invalid count for field " + name + ": " + count);
            }
            for (int v = 0; v < vectorTypes.size(); v++) {
                ArrowVectorType vectorType = vectorTypes.get(v);
                BufferBacked innerVector = fieldInnerVectors.get(v);
                nextFieldIs(vectorType.getName());
                readToken(START_ARRAY);
                ValueVector valueVector = (ValueVector) innerVector;

                // An offset vector holds one more entry than the number of values.
                int innerVectorCount = vectorType.equals(OFFSET) ? count + 1 : count;
                // Size the buffer for the declared count before allocating: most values below are
                // written with the non-"safe" set(...), which does not grow the buffer.
                valueVector.setInitialCapacity(innerVectorCount);
                valueVector.allocateNew();
                Mutator mutator = valueVector.getMutator();

                for (int i = 0; i < innerVectorCount; i++) {
                    parser.nextToken();
                    setValueFromParser(valueVector, i);
                }
                mutator.setValueCount(innerVectorCount);
                readToken(END_ARRAY);
            }
            // The "children" array is only expected when the field declares children.
            List<Field> fields = field.getChildren();
            if (!fields.isEmpty()) {
                List<FieldVector> vectorChildren = vector.getChildrenFromFields();
                if (fields.size() != vectorChildren.size()) {
                    throw new IllegalArgumentException("fields and children are not the same size: " + fields.size()
                            + " != " + vectorChildren.size());
                }
                nextFieldIs("children");
                readToken(START_ARRAY);
                for (int i = 0; i < fields.size(); i++) {
                    Field childField = fields.get(i);
                    FieldVector childVector = vectorChildren.get(i);
                    readVector(childField, childVector);
                }
                readToken(END_ARRAY);
            }
            if (vector instanceof NullableMapVector) {
                // The map vector's value count is assigned directly from the column count.
                ((NullableMapVector) vector).valueCount = count;
            }
        }
        readToken(END_OBJECT);
    }

    /**
     * Decodes a hexadecimal string into bytes.
     *
     * @param hexString the hex-encoded value
     * @return the decoded bytes
     * @throws IOException if the string is not valid hex (the decoder failure is kept as the cause)
     */
    private byte[] decodeHexSafe(String hexString) throws IOException {
        try {
            return Hex.decodeHex(hexString.toCharArray());
        } catch (DecoderException e) {
            throw new IOException("Unable to decode hex string: " + hexString, e);
        }
    }

    /**
     * Reads the parser's current value and stores it at index {@code i} of {@code valueVector},
     * converting according to the vector's minor type. Binary values are read as hex strings and
     * character values are encoded as UTF-8.
     *
     * @param valueVector the vector to write into
     * @param i the index to write at
     * @throws UnsupportedOperationException if the vector's minor type is not handled here
     * @throws IOException on read or parse failure, or if a binary value is not valid hex
     */
    private void setValueFromParser(ValueVector valueVector, int i) throws IOException {
        switch (valueVector.getMinorType()) {
        case BIT:
            ((BitVector) valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 1 : 0);
            break;
        case TINYINT:
            ((TinyIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
            break;
        case SMALLINT:
            ((SmallIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
            break;
        case INT:
            ((IntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
            break;
        case BIGINT:
            ((BigIntVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class));
            break;
        case UINT1:
            ((UInt1Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
            break;
        case UINT2:
            ((UInt2Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
            break;
        case UINT4:
            ((UInt4Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
            break;
        case UINT8:
            ((UInt8Vector) valueVector).getMutator().set(i, parser.readValueAs(Long.class));
            break;
        case FLOAT4:
            ((Float4Vector) valueVector).getMutator().set(i, parser.readValueAs(Float.class));
            break;
        case FLOAT8:
            ((Float8Vector) valueVector).getMutator().set(i, parser.readValueAs(Double.class));
            break;
        case VARBINARY:
            ((VarBinaryVector) valueVector).getMutator().setSafe(i,
                    decodeHexSafe(parser.readValueAs(String.class)));
            break;
        case VARCHAR:
            ((VarCharVector) valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8));
            break;
        case TIMESTAMPSEC:
            ((TimeStampSecVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class));
            break;
        case TIMESTAMPMILLI:
            ((TimeStampMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class));
            break;
        case TIMESTAMPMICRO:
            ((TimeStampMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class));
            break;
        case TIMESTAMPNANO:
            ((TimeStampNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class));
            break;
        default:
            throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType());
        }
    }

    /**
     * Closes the underlying JSON parser.
     *
     * @throws IOException if closing the parser fails
     */
    @Override
    public void close() throws IOException {
        parser.close();
    }

    /**
     * Checks that the next field has the expected name, then reads its value as type {@code c}.
     *
     * @param expectedFieldName the name the next field must have
     * @param c the type to bind the field value to
     * @return the field value
     * @throws IllegalStateException if the next field has a different name or is missing
     * @throws IOException on read or parse failure
     */
    private <T> T readNextField(String expectedFieldName, Class<T> c) throws IOException, JsonParseException {
        nextFieldIs(expectedFieldName);
        // Advance from the field name to the first token of its value.
        parser.nextToken();
        return parser.readValueAs(c);
    }

    /**
     * Advances the parser and checks that it landed on a field with the expected name.
     *
     * @param expectedFieldName the name the next field must have
     * @throws IllegalStateException if the next token is not a field name or the name differs
     * @throws IOException on read or parse failure
     */
    private void nextFieldIs(String expectedFieldName) throws IOException, JsonParseException {
        String name = parser.nextFieldName();
        if (name == null || !name.equals(expectedFieldName)) {
            throw new IllegalStateException("Expected " + expectedFieldName + " but got " + name);
        }
    }

    /**
     * Advances the parser and checks that the next token is the expected one.
     *
     * @param expected the token that must come next
     * @throws IllegalStateException if a different token (or end of input) is found
     * @throws IOException on read or parse failure
     */
    private void readToken(JsonToken expected) throws JsonParseException, IOException {
        JsonToken t = parser.nextToken();
        if (t != expected) {
            throw new IllegalStateException("Expected " + expected + " but got " + t);
        }
    }

}