org.apache.hawq.pxf.service.BridgeOutputBuilder.java Source code

Introduction

Here is the source code for org.apache.hawq.pxf.service.BridgeOutputBuilder.java
Source

package org.apache.hawq.pxf.service;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.hawq.pxf.api.BadRecordException;
import org.apache.hawq.pxf.api.OneField;
import org.apache.hawq.pxf.api.OutputFormat;
import org.apache.hawq.pxf.api.io.DataType;
import org.apache.hawq.pxf.service.io.BufferWritable;
import org.apache.hawq.pxf.service.io.GPDBWritable;
import org.apache.hawq.pxf.service.io.GPDBWritable.TypeMismatchException;
import org.apache.hawq.pxf.service.io.Text;
import org.apache.hawq.pxf.service.io.Writable;
import org.apache.hawq.pxf.service.utilities.ProtocolData;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import static org.apache.hawq.pxf.api.io.DataType.TEXT;

/**
 * Builds the output records that the java process pipes to the HAWQ backend.
 * The output record is serialized and the resulting bytes are piped to the
 * HAWQ segment. Every output record implements {@link Writable}; the job of
 * BridgeOutputBuilder is to translate a list of {@link OneField} objects
 * (obtained from the Resolver) into such a record.
 * <p>
 * Two output shapes are produced, selected by
 * {@code inputData.outputFormat()}: a single reusable {@link GPDBWritable} for
 * {@link OutputFormat#BINARY}, and {@link Text} / {@link BufferWritable}
 * objects for every other format.
 */
public class BridgeOutputBuilder {
    private ProtocolData inputData;
    private Writable output = null;
    private LinkedList<Writable> outputList = null;
    private Writable partialLine = null;
    private GPDBWritable errorRecord = null;
    private int[] schema;
    private String[] colNames;
    private boolean samplingEnabled = false;
    private boolean isPartialLine = false;

    private static final byte DELIM = 10; /* (byte)'\n'; */

    private static final Log LOG = LogFactory.getLog(BridgeOutputBuilder.class);

    /**
     * Types that are serialized as strings and are therefore interchangeable
     * when matching a record field against the schema. Built once instead of
     * on every {@link #isStringType(DataType)} call.
     */
    private static final Set<DataType> STRING_TYPES = EnumSet.of(DataType.VARCHAR, DataType.BPCHAR, DataType.TEXT,
            DataType.NUMERIC, DataType.TIMESTAMP, DataType.DATE);

    /**
     * Constructs a BridgeOutputBuilder.
     *
     * @param input input data, like requested output format and schema
     *            information
     */
    public BridgeOutputBuilder(ProtocolData input) {
        inputData = input;
        outputList = new LinkedList<Writable>();
        makeErrorRecord();
        samplingEnabled = (inputData.getStatsSampleRatio() > 0);
    }

    /**
     * Creates the dedicated error record when the output format is binary;
     * does nothing for any other format.
     * <p>
     * We need a separate GPDBWritable record to represent the error record.
     * Just setting the errorFlag on the "output" GPDBWritable variable is not
     * good enough, since the GPDBWritable is built only after the first record
     * is read from the file. And if we encounter an error while fetching the
     * first record from the file, then the output member will be null. The
     * reason we cannot count on the schema to build the GPDBWritable output
     * variable before reading the first record, is because the schema does not
     * account for arrays - we cannot know from the schema the length of an
     * array. We find out only after fetching the first record.
     */
    void makeErrorRecord() {
        if (inputData.outputFormat() != OutputFormat.BINARY) {
            return;
        }

        // The error record has a single text column that carries the message.
        int[] errSchema = { TEXT.getOID() };
        errorRecord = new GPDBWritable(errSchema);
        errorRecord.setError(true);
    }

    /**
     * Returns the error record. If the output format is not binary, error
     * records are not supported, and the given exception will be thrown.
     * <p>
     * The same error record instance is reused across calls; only its message
     * column is overwritten.
     *
     * @param ex exception whose message is stored in the record
     * @return error record
     * @throws Exception the given exception, if the output format is not
     *             binary
     */
    public Writable getErrorOutput(Exception ex) throws Exception {
        if (inputData.outputFormat() != OutputFormat.BINARY) {
            throw ex;
        }

        errorRecord.setString(0, ex.getMessage());
        return errorRecord;
    }

    /**
     * Translates recFields (obtained from the Resolver) into an output record.
     * <p>
     * The returned list is this builder's internal list: it is cleared and
     * refilled on every call, so callers must consume it before calling this
     * method again.
     *
     * @param recFields record fields to be serialized
     * @return list of Writable objects with serialized row
     * @throws BadRecordException if building the output record failed
     */
    public LinkedList<Writable> makeOutput(List<OneField> recFields) throws BadRecordException {
        // The binary record is created lazily, on the first row.
        if (output == null && inputData.outputFormat() == OutputFormat.BINARY) {
            makeGPDBWritableOutput();
        }

        outputList.clear();

        fillOutputRecord(recFields);

        return outputList;
    }

    /**
     * Returns the pending partial line, i.e. the trailing data of the last
     * chunk handled by {@link #convertTextDataToLines(byte[])} that did not
     * end with a line delimiter.
     *
     * @return the buffered partial line, or null if there is none
     */
    public Writable getPartialLine() {
        return partialLine;
    }

    /**
     * Creates the GPDBWritable object. The object is created one time and is
     * refilled from recFields for each record sent. As a side effect the
     * schema type codes and column names are cached from the input data.
     *
     * @return empty GPDBWritable object with set columns
     */
    GPDBWritable makeGPDBWritableOutput() {
        int numActualFields = inputData.getColumns();
        schema = new int[numActualFields];
        colNames = new String[numActualFields];

        for (int i = 0; i < numActualFields; i++) {
            schema[i] = inputData.getColumn(i).columnTypeCode();
            colNames[i] = inputData.getColumn(i).columnName();
        }

        GPDBWritable gpdbOutput = new GPDBWritable(schema);
        output = gpdbOutput;

        return gpdbOutput;
    }

    /**
     * Fills the output record based on the fields in recFields, dispatching on
     * the requested output format.
     *
     * @param recFields record fields
     * @throws BadRecordException if building the output record failed
     */
    void fillOutputRecord(List<OneField> recFields) throws BadRecordException {
        if (inputData.outputFormat() == OutputFormat.BINARY) {
            fillGPDBWritable(recFields);
        } else {
            fillText(recFields);
        }
    }

    /**
     * Fills a GPDBWritable object based on recFields. The input record
     * recFields must correspond to schema. If the record has more or less
     * fields than the schema we throw an exception. We require that the type of
     * field[i] in recFields corresponds to the type of field[i] in the schema.
     *
     * @param recFields record fields
     * @throws BadRecordException if building the output record failed
     */
    void fillGPDBWritable(List<OneField> recFields) throws BadRecordException {
        int size = recFields.size();
        if (size == 0) { // size 0 means the resolver couldn't deserialize any
                         // of the record fields
            throw new BadRecordException("No fields in record");
        } else if (size != schema.length) {
            throw new BadRecordException("Record has " + size + " fields but the schema size is " + schema.length);
        }

        // Walk the list with its iterator rather than get(i), so traversal
        // stays linear even when the resolver hands us a linked list.
        int colIdx = 0;
        for (OneField current : recFields) {
            if (!isTypeInSchema(current.type, schema[colIdx])) {
                throw new BadRecordException(
                        "For field " + colNames[colIdx] + " schema requires type "
                                + DataType.get(schema[colIdx]).toString()
                                + " but input record has type " + DataType.get(current.type).toString());
            }

            fillOneGPDBWritableField(current, colIdx);
            colIdx++;
        }

        outputList.add(output);
    }

    /**
     * Tests if data type is a string type. String type is a type that can be
     * serialized as string, such as varchar, bpchar, text, numeric, timestamp,
     * date.
     *
     * @param type data type
     * @return whether data type is string type
     */
    boolean isStringType(DataType type) {
        return STRING_TYPES.contains(type);
    }

    /**
     * Tests if record field type and schema type correspond. They correspond
     * when the schema type is unsupported (no check is possible), when both
     * are the same type, or when both are string types.
     *
     * @param recType record type code
     * @param schemaType schema type code
     * @return whether record type and schema type match
     */
    boolean isTypeInSchema(int recType, int schemaType) {
        DataType dtRec = DataType.get(recType);
        DataType dtSchema = DataType.get(schemaType);

        return (dtSchema == DataType.UNSUPPORTED_TYPE || dtRec == dtSchema
                || (isStringType(dtRec) && isStringType(dtSchema)));
    }

    /**
     * Fills a Text object based on recFields.
     * <p>
     * A BYTEA field is emitted as raw bytes (split into lines only when
     * sampling is enabled); any other field is treated as a String and
     * terminated with a newline.
     *
     * @param recFields record fields
     * @throws BadRecordException if text formatted record has more than one
     *             field
     */
    void fillText(List<OneField> recFields) throws BadRecordException {
        /*
         * For the TEXT case there must be only one record in the list
         */
        if (recFields.size() != 1) {
            throw new BadRecordException(
                    "BridgeOutputBuilder must receive one field when handling the TEXT format");
        }

        OneField fld = recFields.get(0);
        Object val = fld.val;
        if (DataType.get(fld.type) == DataType.BYTEA) {// from LineBreakAccessor
            if (samplingEnabled) {
                convertTextDataToLines((byte[]) val);
            } else {
                output = new BufferWritable((byte[]) val);
                outputList.add(output); // TODO break output into lines
            }
        } else { // from QuotedLineBreakAccessor
            String textRec = (String) val;
            output = new Text(textRec + "\n");
            outputList.add(output);
        }
    }

    /**
     * Breaks raw bytes into lines. Used only for sampling.
     *
     * When sampling a data source, we have to make sure that we deal with
     * actual rows (lines) and not bigger chunks of data such as used by
     * LineBreakAccessor for performance. The input byte array is broken into
     * lines, each one stored in the outputList. In case the read data doesn't
     * end with a line delimiter, which can happen when reading chunks of bytes,
     * the partial line is stored separately, and is being completed when
     * reading the next chunk of data.
     *
     * @param val input raw data to break into lines
     */
    void convertTextDataToLines(byte[] val) {
        int len = val.length;
        int start = 0;

        while (start < len) {
            int end = ArrayUtils.indexOf(val, DELIM, start);
            if (end == ArrayUtils.INDEX_NOT_FOUND) {
                // data finished in the middle of the line
                end = len;
                isPartialLine = true;
            } else {
                end++; // include the DELIM character
                isPartialLine = false;
            }
            byte[] line = Arrays.copyOfRange(val, start, end);

            BufferWritable writable;
            if (partialLine != null) {
                // continue the line left over from the previous chunk
                writable = (BufferWritable) partialLine;
                writable.append(line);
                partialLine = null;
            } else {
                writable = new BufferWritable(line);
            }

            if (isPartialLine) {
                // still no delimiter: keep buffering until the next chunk
                partialLine = writable;
            } else {
                outputList.add(writable);
            }
            start = end;
        }
    }

    /**
     * Fills one GPDBWritable field.
     *
     * @param oneField field
     * @param colIdx column index
     * @throws BadRecordException if the field value doesn't match the schema
     *             (wraps the TypeMismatchException raised by GPDBWritable)
     * @throws UnsupportedOperationException if the field type is not supported
     */
    void fillOneGPDBWritableField(OneField oneField, int colIdx) throws BadRecordException {
        int type = oneField.type;
        Object val = oneField.val;
        GPDBWritable gpdbOutput = (GPDBWritable) output;
        try {
            switch (DataType.get(type)) {
            case INTEGER:
                gpdbOutput.setInt(colIdx, (Integer) val);
                break;
            case FLOAT8:
                gpdbOutput.setDouble(colIdx, (Double) val);
                break;
            case REAL:
                gpdbOutput.setFloat(colIdx, (Float) val);
                break;
            case BIGINT:
                gpdbOutput.setLong(colIdx, (Long) val);
                break;
            case SMALLINT:
                gpdbOutput.setShort(colIdx, (Short) val);
                break;
            case BOOLEAN:
                gpdbOutput.setBoolean(colIdx, (Boolean) val);
                break;
            case BYTEA:
                // Pass a private copy so the record never shares the
                // resolver's array; a null value stays null.
                byte[] bts = (val == null) ? null : ((byte[]) val).clone();
                gpdbOutput.setBytes(colIdx, bts);
                break;
            case VARCHAR:
            case BPCHAR:
            case CHAR:
            case TEXT:
            case NUMERIC:
            case TIMESTAMP:
            case DATE:
                // null stays null rather than becoming an empty string
                gpdbOutput.setString(colIdx, ObjectUtils.toString(val, null));
                break;
            default:
                String valClassName = (val != null) ? val.getClass().getSimpleName() : null;
                throw new UnsupportedOperationException(valClassName + " is not supported for HAWQ conversion");
            }
        } catch (TypeMismatchException e) {
            throw new BadRecordException(e);
        }
    }
}