org.apache.tajo.storage.sequencefile.SequenceFileScanner.java Source code

Introduction

Here is the source code for org.apache.tajo.storage.sequencefile.SequenceFileScanner.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.storage.sequencefile;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.*;
import org.apache.tajo.catalog.Column;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.datum.Datum;
import org.apache.tajo.datum.NullDatum;
import org.apache.tajo.storage.*;
import org.apache.tajo.storage.fragment.Fragment;
import org.apache.tajo.util.BytesUtils;

import java.io.IOException;

public class SequenceFileScanner extends FileScanner {
    private static final Log LOG = LogFactory.getLog(SequenceFileScanner.class);

    private FileSystem fs;
    private SequenceFile.Reader reader;
    private SerializerDeserializer serde;
    private byte[] nullChars;
    private char delimiter;

    private int currentIdx = 0;
    private int[] projectionMap;

    private boolean hasBinarySerDe = false;
    private long totalBytes = 0L;

    private long start, end;
    private boolean more = true;

    /**
     * Whether a field is null or not. Because length is 0 does not means the
     * field is null. In particular, a 0-length string is not null.
     */
    private boolean[] fieldIsNull;

    /**
     * The start positions and lengths of fields. Only valid when the data is parsed.
     */
    private int[] fieldStart;
    private int[] fieldLength;

    private int elementOffset, elementSize;

    private Writable EMPTY_KEY;

    public SequenceFileScanner(Configuration conf, Schema schema, TableMeta meta, Fragment fragment)
            throws IOException {
        super(conf, schema, meta, fragment);
    }

    @Override
    public void init() throws IOException {
        // FileFragment information
        if (fs == null) {
            fs = FileScanner.getFileSystem((TajoConf) conf, fragment.getPath());
        }

        reader = new SequenceFile.Reader(fs, fragment.getPath(), conf);

        String nullCharacters = StringEscapeUtils
                .unescapeJava(this.meta.getOption(StorageConstants.SEQUENCEFILE_NULL, NullDatum.DEFAULT_TEXT));
        if (StringUtils.isEmpty(nullCharacters)) {
            nullChars = NullDatum.get().asTextBytes();
        } else {
            nullChars = nullCharacters.getBytes();
        }

        String delim = meta.getOption(StorageConstants.SEQUENCEFILE_DELIMITER,
                StorageConstants.DEFAULT_FIELD_DELIMITER);
        this.delimiter = StringEscapeUtils.unescapeJava(delim).charAt(0);

        this.start = fragment.getStartKey();
        this.end = start + fragment.getLength();

        if (fragment.getStartKey() > reader.getPosition())
            reader.sync(this.start);

        more = start < end;

        if (targets == null) {
            targets = schema.toArray();
        }

        fieldIsNull = new boolean[schema.getColumns().size()];
        fieldStart = new int[schema.getColumns().size()];
        fieldLength = new int[schema.getColumns().size()];

        prepareProjection(targets);

        try {
            String serdeClass = this.meta.getOption(StorageConstants.SEQUENCEFILE_SERDE,
                    TextSerializerDeserializer.class.getName());
            serde = (SerializerDeserializer) Class.forName(serdeClass).newInstance();

            if (serde instanceof BinarySerializerDeserializer) {
                hasBinarySerDe = true;
            }

            Class<? extends Writable> keyClass = (Class<? extends Writable>) Class
                    .forName(reader.getKeyClassName());
            EMPTY_KEY = keyClass.newInstance();

        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
            throw new IOException(e);
        }
        super.init();
    }

    public Writable getKey() {
        return EMPTY_KEY;
    }

    private void prepareProjection(Column[] targets) {
        projectionMap = new int[targets.length];

        int tid;
        for (int i = 0; i < targets.length; i++) {
            tid = schema.getColumnId(targets[i].getQualifiedName());
            projectionMap[i] = tid;
        }
    }

    @Override
    public Tuple next() throws IOException {
        if (!more)
            return null;

        long pos = reader.getPosition();
        boolean remaining = reader.next(EMPTY_KEY);

        if (pos >= end && reader.syncSeen()) {
            more = false;
        } else {
            more = remaining;
        }

        if (more) {
            Tuple tuple = null;
            byte[][] cells;

            if (hasBinarySerDe) {
                BytesWritable bytesWritable = new BytesWritable();
                reader.getCurrentValue(bytesWritable);
                tuple = makeTuple(bytesWritable);
                totalBytes += (long) bytesWritable.getBytes().length;
            } else {
                Text text = new Text();
                reader.getCurrentValue(text);
                cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap,
                        schema.getColumns().size());
                totalBytes += (long) text.getBytes().length;
                tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
            }
            currentIdx++;
            return tuple;
        } else {
            return null;
        }
    }

    /**
     * In hive, LazyBinarySerDe is serialized as follows: start A B A B A B end bytes[] ->
     * |-----|---------|--- ... ---|-----|---------|
     *
     * Section A is one null-byte, corresponding to eight struct fields in Section
     * B. Each bit indicates whether the corresponding field is null (0) or not null
     * (1). Each field is a LazyBinaryObject.
     *
     * Following B, there is another section A and B. This pattern repeats until the
     * all struct fields are serialized.
     *
     * So, tajo must make a tuple after parsing hive style BinarySerDe.
     */
    private Tuple makeTuple(BytesWritable value) throws IOException {
        Tuple tuple = new VTuple(schema.getColumns().size());

        int start = 0;
        int length = value.getLength();

        /**
         * Please note that one null byte is followed by eight fields, then more
         * null byte and fields.
         */
        int structByteEnd = start + length;
        byte[] bytes = value.getBytes();

        byte nullByte = bytes[start];
        int lastFieldByteEnd = start + 1;

        // Go through all bytes in the byte[]
        for (int i = 0; i < schema.getColumns().size(); i++) {
            fieldIsNull[i] = true;
            if ((nullByte & (1 << (i % 8))) != 0) {
                fieldIsNull[i] = false;
                parse(schema.getColumn(i), bytes, lastFieldByteEnd);

                fieldStart[i] = lastFieldByteEnd + elementOffset;
                fieldLength[i] = elementSize;
                lastFieldByteEnd = fieldStart[i] + fieldLength[i];

                for (int j = 0; j < projectionMap.length; j++) {
                    if (projectionMap[j] == i) {
                        Datum datum = serde.deserialize(schema.getColumn(i), bytes, fieldStart[i], fieldLength[i],
                                nullChars);
                        tuple.put(i, datum);
                    }
                }
            }

            // next byte is a null byte if there are more bytes to go
            if (7 == (i % 8)) {
                if (lastFieldByteEnd < structByteEnd) {
                    nullByte = bytes[lastFieldByteEnd];
                    lastFieldByteEnd++;
                } else {
                    // otherwise all null afterwards
                    nullByte = 0;
                    lastFieldByteEnd++;
                }
            }
        }

        return tuple;
    }

    /**
     * Check a particular field and set its size and offset in bytes based on the
     * field type and the bytes arrays.
     *
     * For void, boolean, byte, short, int, long, float and double, there is no
     * offset and the size is fixed. For string, the first four bytes are used to store the size.
     * So the offset is 4 and the size is computed by concating the first four bytes together.
     * The first four bytes are defined with respect to the offset in the bytes arrays.
     *
     * @param col
     *          catalog column information
     * @param bytes
     *          bytes arrays store the table row
     * @param offset
     *          offset of this field
     */
    private void parse(Column col, byte[] bytes, int offset) throws IOException {
        switch (col.getDataType().getType()) {
        case BOOLEAN:
        case BIT:
            elementOffset = 0;
            elementSize = 1;
            break;
        case INT2:
            elementOffset = 0;
            elementSize = 2;
            break;
        case INT4:
        case INT8:
            elementOffset = 0;
            elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
            break;
        case FLOAT4:
            elementOffset = 0;
            elementSize = 4;
            break;
        case FLOAT8:
            elementOffset = 0;
            elementSize = 8;
            break;
        case BLOB:
        case PROTOBUF:
        case INET4:
        case CHAR:
        case TEXT:
            elementOffset = 1;
            elementSize = bytes[offset];
            break;
        default:
            elementOffset = 0;
            elementSize = 0;
        }
    }

    @Override
    public void reset() throws IOException {
        if (reader != null) {
            reader.sync(0);
        }
    }

    @Override
    public void close() throws IOException {
        if (reader != null)
            reader.close();

        if (tableStats != null) {
            tableStats.setReadBytes(totalBytes);
            tableStats.setNumRows(currentIdx);
        }
    }

    @Override
    public boolean isProjectable() {
        return true;
    }

    @Override
    public boolean isSelectable() {
        return true;
    }

    @Override
    public boolean isSplittable() {
        return true;
    }
}