com.aliyun.odps.ship.common.RecordConverter.java Source code

Java tutorial

Introduction

Here is the source code for com.aliyun.odps.ship.common.RecordConverter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.aliyun.odps.ship.common;

import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.TimeZone;

import org.apache.commons.cli.ParseException;

import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.data.ArrayRecord;
import com.aliyun.odps.data.Record;

public class RecordConverter {

    private final byte[] nullBytes;
    private ArrayRecord r = null;
    TableSchema schema;
    String nullTag;
    SimpleDateFormat dateFormatter;

    DecimalFormat doubleFormat;
    String charset;
    String defaultCharset;

    public RecordConverter(TableSchema schema, String nullTag, String dateFormat, String tz, String charset)
            throws UnsupportedEncodingException {

        this.schema = schema;
        this.nullTag = nullTag;

        if (dateFormat == null) {
            this.dateFormatter = new SimpleDateFormat(Constants.DEFAULT_DATE_FORMAT_PATTERN);
        } else {
            dateFormatter = new SimpleDateFormat(dateFormat);
        }
        dateFormatter.setLenient(false);
        if (tz != null) {
            TimeZone t = TimeZone.getTimeZone(tz);
            if (!tz.equalsIgnoreCase("GMT") && t.getID().equals("GMT")) {
                System.err.println(
                        Constants.WARNING_INDICATOR + "possible invalid time zone: " + tz + ", fall back to GMT");
            }
            dateFormatter.setTimeZone(t);
        }

        doubleFormat = new DecimalFormat();
        doubleFormat.setMinimumFractionDigits(0);
        doubleFormat.setMaximumFractionDigits(20);

        setCharset(charset);
        r = new ArrayRecord(schema.getColumns().toArray(new Column[0]));
        nullBytes = nullTag.getBytes(defaultCharset);
    }

    public RecordConverter(TableSchema schema, String nullTag, String dateFormat, String tz)
            throws UnsupportedEncodingException {
        this(schema, nullTag, dateFormat, tz, Constants.REMOTE_CHARSET);
    }

    /**
     * tunnel record to byte[] array
     */
    public byte[][] format(Record r) throws UnsupportedEncodingException {

        int cols = schema.getColumns().size();
        byte[][] line = new byte[cols][];
        byte[] colValue = null;
        for (int i = 0; i < cols; i++) {

            OdpsType t = schema.getColumn(i).getType();
            switch (t) {
            case BIGINT: {
                Long v = r.getBigint(i);
                colValue = v == null ? null : v.toString().getBytes(defaultCharset);
                break;
            }
            case DOUBLE: {
                Double v = r.getDouble(i);
                if (v == null) {
                    colValue = null;
                } else if (v.equals(Double.POSITIVE_INFINITY) || v.equals(Double.NEGATIVE_INFINITY)) {
                    colValue = v.toString().getBytes(defaultCharset);
                } else {
                    colValue = doubleFormat.format(v).replaceAll(",", "").getBytes(defaultCharset);
                }
                break;
            }
            case DATETIME: {
                Date v = r.getDatetime(i);
                if (v == null) {
                    colValue = null;
                } else {
                    colValue = dateFormatter.format(v).getBytes(defaultCharset);
                }
                break;
            }
            case BOOLEAN: {
                Boolean v = r.getBoolean(i);
                colValue = v == null ? null : v.toString().getBytes(defaultCharset);
                break;
            }
            case STRING: {
                byte[] v = r.getBytes(i);
                if (v == null) {
                    colValue = null;
                } else if (Util.isIgnoreCharset(charset)) {
                    colValue = v;
                } else {
                    // data at ODPS side is always utf-8
                    colValue = new String(v, Constants.REMOTE_CHARSET).getBytes(charset);
                }
                break;
            }
            case DECIMAL: {
                BigDecimal v = r.getDecimal(i);
                colValue = v == null ? null : v.toPlainString().getBytes(defaultCharset);
                break;
            }
            default:
                throw new RuntimeException("Unknown column type: " + t);
            }

            if (colValue == null) {
                line[i] = nullBytes;
            } else {
                line[i] = colValue;
            }
        }
        return line;
    }

    /**
     * byte array to tunnel record
     */
    public Record parse(byte[][] line) throws ParseException, UnsupportedEncodingException {

        if (line == null) {
            return null;
        }
        int cols = schema.getColumns().size();

        if (line.length != cols) {
            throw new ParseException(
                    Constants.ERROR_INDICATOR + "column mismatch, expected " + schema.getColumns().size()
                            + " columns, " + line.length + " columns found, please check data or delimiter\n");
        }

        boolean isIgnoreCharset = Util.isIgnoreCharset(charset);

        int idx = 0;
        for (byte[] v : line) {
            OdpsType type = schema.getColumn(idx).getType();
            String eMsg = "";
            try {
                if (Arrays.equals(v, nullBytes)) {
                    r.set(idx, null);
                    idx++;
                    continue;
                }

                switch (type) {
                case BIGINT: {
                    String vStr = new String(v, defaultCharset);
                    r.setBigint(idx, Long.valueOf(vStr));
                    break;
                }
                case DOUBLE: {
                    String vStr = new String(v, defaultCharset);
                    r.setDouble(idx, Double.valueOf(vStr));
                    break;
                }
                case DATETIME: {
                    String vStr = new String(v, defaultCharset);
                    r.setDatetime(idx, dateFormatter.parse(vStr));
                    break;
                }
                case BOOLEAN: {
                    String vStr = new String(v, defaultCharset);
                    vStr = vStr.trim().toLowerCase();
                    if (vStr.equals("true") || vStr.equals("false")) {
                        r.setBoolean(idx, vStr.equals("true"));
                    } else if (vStr.equals("0") || vStr.equals("1")) {
                        r.setBoolean(idx, vStr.equals("1"));
                    } else {
                        eMsg = "invalid boolean type, expect: 'true'|'false'|'0'|'1'";
                        throw new IllegalArgumentException(eMsg);
                    }
                    break;
                }
                case STRING:
                    try {
                        if (isIgnoreCharset) {
                            r.setString(idx, v);
                        } else {
                            r.setString(idx, new String(v, charset));
                        }
                    } catch (IllegalArgumentException e) {
                        // for big than 8M
                        eMsg = "string type big than 8M";
                        throw new IllegalArgumentException(eMsg);
                    }
                    break;
                case DECIMAL:
                    String vStr = new String(v, defaultCharset);
                    r.setDecimal(idx, new BigDecimal(vStr));
                    break;
                default:
                    eMsg = "Unknown column type";
                    throw new IllegalArgumentException(eMsg);
                }
            } catch (Exception e) {
                String val;
                String vStr;
                if (isIgnoreCharset) {
                    vStr = new String(v, Constants.REMOTE_CHARSET);
                } else {
                    vStr = new String(v, charset);
                }
                if (vStr.length() > 20) {
                    val = vStr.substring(0, 17) + "...";
                } else {
                    val = vStr;
                }
                throw new ParseException(Constants.ERROR_INDICATOR + "format error - " + ":" + (idx + 1) + ", "
                        + type + ":'" + val + "'  " + eMsg);
            }
            idx++;
        }
        return r;
    }

    private void setCharset(String charset) {
        if (Util.isIgnoreCharset(charset)) {
            this.charset = null;
            this.defaultCharset = Constants.REMOTE_CHARSET;
        } else {
            this.charset = charset;
            this.defaultCharset = charset;
        }
    }

}