org.apache.pig.data.DefaultTuple.java Source code

Introduction

Here is the source code for org.apache.pig.data.DefaultTuple.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.data;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.util.ObjectSerializer;

/**
 * A default implementation of Tuple. This class will be created by the DefaultTupleFactory.
 */
public class DefaultTuple extends AbstractTuple {

    private static final long serialVersionUID = 2L;
    protected List<Object> mFields;

    /**
     * Default constructor. This constructor is public so that hadoop can call it directly. However, inside pig you
     * should never be calling this function. Use TupleFactory instead.
     * <br>Time complexity: O(1), after allocation
     */
    public DefaultTuple() {
        mFields = new ArrayList<Object>();
    }

    /**
     * Construct a tuple with a known number of fields. Package level so that callers cannot directly invoke it.
     * <br>Resulting tuple is filled pre-filled with null elements. Time complexity: O(N), after allocation
     *
     * @param size
     *            Number of fields to allocate in the tuple.
     */
    DefaultTuple(int size) {
        mFields = new ArrayList<Object>(size);
        for (int i = 0; i < size; i++)
            mFields.add(null);
    }

    /**
     * Construct a tuple from an existing list of objects. Package level so that callers cannot directly invoke it.
     * <br>Time complexity: O(N) plus running time of input object iteration, after allocation
     * @param c
     *            List of objects to turn into a tuple.
     */
    DefaultTuple(List<Object> c) {
        mFields = new ArrayList<Object>(c);
    }

    /**
     * Construct a tuple from an existing list of objects. Package level so that callers cannot directly invoke it.
     * <br>Time complexity: O(1)
     *
     * @param c
     *            List of objects to turn into a tuple. This list will be kept as part of the tuple.
     * @param junk
     *            Just used to differentiate from the constructor above that copies the list.
     */
    DefaultTuple(List<Object> c, int junk) {
        mFields = c;
    }

    /**
     * Find the size of the tuple. Used to be called arity().
     *
     * @return number of fields in the tuple.
     */
    @Override
    public int size() {
        return mFields.size();
    }

    /**
     * Get the value in a given field.
     *
     * @param fieldNum
     *            Number of the field to get the value for.
     * @return value, as an Object.
     * @throws ExecException
     *             if the field number is greater than or equal to the number of fields in the tuple.
     */
    @Override
    public Object get(int fieldNum) throws ExecException {
        return mFields.get(fieldNum);
    }

    /**
     * Get all of the fields in the tuple as a list.
     *
     * @return List&lt;Object&gt; containing the fields of the tuple in order.
     */
    @Override
    public List<Object> getAll() {
        return mFields;
    }

    /**
     * Set the value in a given field.
     *
     * @param fieldNum
     *            Number of the field to set the value for.
     * @param val
     *            Object to put in the indicated field.
     * @throws ExecException
     *             if the field number is greater than or equal to the number of fields in the tuple.
     */
    @Override
    public void set(int fieldNum, Object val) throws ExecException {
        mFields.set(fieldNum, val);
    }

    /**
     * Append a field to a tuple. This method is not efficient as it may force copying of existing data in order to grow
     * the data structure. Whenever possible you should construct your Tuple with the newTuple(int) method and then fill
     * in the values with set(), rather than construct it with newTuple() and append values.
     *
     * @param val
     *            Object to append to the tuple.
     */
    @Override
    public void append(Object val) {
        mFields.add(val);
    }

    /**
     * Determine the size of tuple in memory. This is used by data bags to determine their memory size. This need not be
     * exact, but it should be a decent estimation.
     *
     * @return estimated memory size.
     */
    @Override
    public long getMemorySize() {
        Iterator<Object> i = mFields.iterator();

        // rest of the fixed portion of mfields size is accounted within empty_tuple_size
        long mfields_var_size = SizeUtil.roundToEight(4 + 4 * mFields.size());
        // in java hotspot 32bit vm, there seems to be a minimum tuple size of 96
        // which is probably from the minimum size of this array list
        mfields_var_size = Math.max(40, mfields_var_size);

        // fixed overhead = 48 bytes
        //8 - tuple object header
        //8 - mFields reference
        //32 - mFields array list fixed size
        long sum = 48 + mfields_var_size;
        while (i.hasNext()) {
            sum += SizeUtil.getPigObjMemSize(i.next());
        }
        return sum;
    }

    @Override
    public int compareTo(Object other) {
        if (other instanceof Tuple) {
            Tuple t = (Tuple) other;
            int mySz = mFields.size();
            int tSz = t.size();
            if (tSz < mySz) {
                return 1;
            } else if (tSz > mySz) {
                return -1;
            } else {
                for (int i = 0; i < mySz; i++) {
                    try {
                        int c = DataType.compare(mFields.get(i), t.get(i));
                        if (c != 0) {
                            return c;
                        }
                    } catch (ExecException e) {
                        throw new RuntimeException("Unable to compare tuples", e);
                    }
                }
                return 0;
            }
        } else {
            return DataType.compare(this, other);
        }
    }

    public static class DefaultTupleRawComparator extends WritableComparator implements TupleRawComparator {
        private final Log mLog = LogFactory.getLog(getClass());
        private boolean[] mAsc;
        private boolean mWholeTuple;
        private boolean mHasNullField;
        private TupleFactory mFact;

        public DefaultTupleRawComparator() {
            super(DefaultTuple.class);
        }

        @Override
        public Configuration getConf() {
            return null;
        }

        @Override
        public void setConf(Configuration conf) {
            try {
                mAsc = (boolean[]) ObjectSerializer.deserialize(conf.get("pig.sortOrder"));
            } catch (IOException ioe) {
                mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
                throw new RuntimeException(ioe);
            }
            if (mAsc == null) {
                mAsc = new boolean[1];
                mAsc[0] = true;
            }
            // If there's only one entry in mAsc, it means it's for the whole
            // tuple. So we can't be looking for each column.
            mWholeTuple = (mAsc.length == 1);
            mFact = TupleFactory.getInstance();
        }

        @Override
        public boolean hasComparedTupleNull() {
            return mHasNullField;
        }

        /**
         * Compare two DefaultTuples as raw bytes. We assume the Tuples are NOT PigNullableWritable, so client classes
         * need to deal with Null and Index.
         */
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            ByteBuffer bb1 = ByteBuffer.wrap(b1, s1, l1);
            ByteBuffer bb2 = ByteBuffer.wrap(b2, s2, l2);
            int rc = compareDefaultTuple(bb1, bb2, true); // FIXME adjust for secondary sort asc
            return rc;
        }

        /**
         * Compare two DefaultTuples as raw bytes.
         */
        private int compareDefaultTuple(ByteBuffer bb1, ByteBuffer bb2, boolean topLevel) {
            mHasNullField = false;
            // store the position in case of deserialization
            int s1 = bb1.position();
            int s2 = bb2.position();
            int rc = 0;
            byte tupleType1 = bb1.get();
            byte tupleType2 = bb2.get();
            assert (tupleType1 == tupleType2 && tupleType1 == DataType.TUPLE);
            // first compare sizes
            int sz1 = bb1.getInt();
            int sz2 = bb2.getInt();
            if (sz1 > sz2) {
                return 1;
            } else if (sz1 < sz2) {
                return -1;
            } else {
                // if sizes are the same, compare field by field
                for (int i = 0; i < sz1 && rc == 0; i++) {
                    byte dt1 = bb1.get();
                    byte dt2 = bb2.get();
                    if (dt1 == dt2) {
                        switch (dt1) {
                        case DataType.NULL:
                            if (topLevel) // we are scanning the top-level Tuple (original call)
                                mHasNullField = true;
                            rc = 0;
                            break;
                        case DataType.BOOLEAN:
                        case DataType.BYTE:
                            byte bv1 = bb1.get();
                            byte bv2 = bb2.get();
                            rc = (bv1 < bv2 ? -1 : (bv1 == bv2 ? 0 : 1));
                            break;
                        case DataType.INTEGER:
                            int iv1 = bb1.getInt();
                            int iv2 = bb2.getInt();
                            rc = (iv1 < iv2 ? -1 : (iv1 == iv2 ? 0 : 1));
                            break;
                        case DataType.LONG:
                            long lv1 = bb1.getLong();
                            long lv2 = bb2.getLong();
                            rc = (lv1 < lv2 ? -1 : (lv1 == lv2 ? 0 : 1));
                            break;
                        case DataType.FLOAT:
                            float fv1 = bb1.getFloat();
                            float fv2 = bb2.getFloat();
                            rc = Float.compare(fv1, fv2);
                            break;
                        case DataType.DOUBLE:
                            double dv1 = bb1.getDouble();
                            double dv2 = bb2.getDouble();
                            rc = Double.compare(dv1, dv2);
                            break;
                        case DataType.BIGINTEGER: {
                            if (bb1.get() != DataType.BYTEARRAY || bb2.get() != DataType.BYTEARRAY) {
                                throw new RuntimeException(
                                        "Issue in comparing raw bytes for DefaultTuple! BIGINTEGER was not serialized with BYTEARRAY");
                            }

                            int basz1 = bb1.getInt();
                            int basz2 = bb2.getInt();
                            byte[] ba1 = new byte[basz1];
                            byte[] ba2 = new byte[basz2];
                            bb1.get(ba1);
                            bb2.get(ba2);
                            rc = new BigInteger(ba1).compareTo(new BigInteger(ba2));
                            break;
                        }
                        case DataType.BIGDECIMAL: {
                            byte catype1 = bb1.get();
                            byte catype2 = bb2.get();
                            int casz1 = (catype1 == DataType.CHARARRAY) ? bb1.getShort() : bb1.getInt();
                            int casz2 = (catype2 == DataType.CHARARRAY) ? bb2.getShort() : bb2.getInt();
                            byte[] ca1 = new byte[casz1];
                            byte[] ca2 = new byte[casz2];
                            bb1.get(ca1);
                            bb2.get(ca2);
                            String str1 = null, str2 = null;
                            try {
                                str1 = new String(ca1, DataReaderWriter.UTF8);
                                str2 = new String(ca2, DataReaderWriter.UTF8);
                            } catch (UnsupportedEncodingException uee) {
                                mLog.warn("Unsupported string encoding", uee);
                                uee.printStackTrace();
                            }
                            if (str1 != null && str2 != null)
                                rc = new BigDecimal(str1).compareTo(new BigDecimal(str2));
                            break;
                        }
                        case DataType.DATETIME:
                            long dtv1 = bb1.getLong();
                            bb1.position(bb1.position() + 2); // move cursor forward without read the timezone bytes
                            long dtv2 = bb2.getLong();
                            bb2.position(bb2.position() + 2);
                            rc = (dtv1 < dtv2 ? -1 : (dtv1 == dtv2 ? 0 : 1));
                            break;
                        case DataType.BYTEARRAY:
                            int basz1 = bb1.getInt();
                            int basz2 = bb2.getInt();
                            byte[] ba1 = new byte[basz1];
                            byte[] ba2 = new byte[basz2];
                            bb1.get(ba1);
                            bb2.get(ba2);
                            rc = DataByteArray.compare(ba1, ba2);
                            break;
                        case DataType.CHARARRAY:
                        case DataType.BIGCHARARRAY:
                            int casz1 = (dt1 == DataType.CHARARRAY) ? bb1.getShort() : bb1.getInt();
                            int casz2 = (dt1 == DataType.CHARARRAY) ? bb2.getShort() : bb2.getInt();
                            byte[] ca1 = new byte[casz1];
                            byte[] ca2 = new byte[casz2];
                            bb1.get(ca1);
                            bb2.get(ca2);
                            String str1 = null, str2 = null;
                            try {
                                str1 = new String(ca1, DataReaderWriter.UTF8);
                                str2 = new String(ca2, DataReaderWriter.UTF8);
                            } catch (UnsupportedEncodingException uee) {
                                mLog.warn("Unsupported string encoding", uee);
                                uee.printStackTrace();
                            }
                            if (str1 != null && str2 != null)
                                rc = str1.compareTo(str2);
                            break;
                        case DataType.TUPLE:
                            // put back the cursor to before DataType.TUPLE
                            bb1.position(bb1.position() - 1);
                            bb2.position(bb2.position() - 1);
                            rc = compareDefaultTuple(bb1, bb2, false);
                            break;
                        default:
                            mLog.info(
                                    "Unsupported DataType for binary comparison, switching to object deserialization: "
                                            + DataType.genTypeToNameMap().get(dt1) + "(" + dt1 + ")");
                            Tuple t1 = mFact.newTuple();
                            Tuple t2 = mFact.newTuple();
                            try {
                                t1.readFields(new DataInputStream(
                                        new ByteArrayInputStream(bb1.array(), s1, bb1.limit())));
                                t2.readFields(new DataInputStream(
                                        new ByteArrayInputStream(bb2.array(), s2, bb2.limit())));
                            } catch (IOException ioe) {
                                mLog.error("Unable to instantiate tuples for comparison: " + ioe.getMessage());
                                throw new RuntimeException(ioe.getMessage(), ioe);
                            }
                            // delegate to compareTuple
                            return compareTuple(t1, t2);
                        }
                    } else { // compare DataTypes
                        if (dt1 < dt2)
                            rc = -1;
                        else
                            rc = 1;
                    }
                    // flip if the order is descending
                    if (rc != 0) {
                        if (!mWholeTuple && !mAsc[i])
                            rc *= -1;
                        else if (mWholeTuple && !mAsc[0])
                            rc *= -1;
                    }
                }
            }
            return rc;
        }

        @Override
        public int compare(Object o1, Object o2) {
            NullableTuple nt1 = (NullableTuple) o1;
            NullableTuple nt2 = (NullableTuple) o2;
            int rc = 0;

            // if either are null, handle differently
            if (!nt1.isNull() && !nt2.isNull()) {
                rc = compareTuple((Tuple) nt1.getValueAsPigType(), (Tuple) nt2.getValueAsPigType());
            } else {
                // for sorting purposes two nulls are equal
                if (nt1.isNull() && nt2.isNull())
                    rc = 0;
                else if (nt1.isNull())
                    rc = -1;
                else
                    rc = 1;
                if (mWholeTuple && !mAsc[0])
                    rc *= -1;
            }
            return rc;
        }

        private int compareTuple(Tuple t1, Tuple t2) {
            int sz1 = t1.size();
            int sz2 = t2.size();
            if (sz2 < sz1) {
                return 1;
            } else if (sz2 > sz1) {
                return -1;
            } else {
                for (int i = 0; i < sz1; i++) {
                    try {
                        int c = DataType.compare(t1.get(i), t2.get(i));
                        if (c != 0) {
                            if (!mWholeTuple && !mAsc[i])
                                c *= -1;
                            else if (mWholeTuple && !mAsc[0])
                                c *= -1;
                            return c;
                        }
                    } catch (ExecException e) {
                        throw new RuntimeException("Unable to compare tuples", e);
                    }
                }
                return 0;
            }
        }

    }

    @Override
    public int hashCode() {
        int hash = 17;
        for (Iterator<Object> it = mFields.iterator(); it.hasNext();) {
            Object o = it.next();
            if (o != null) {
                hash = 31 * hash + o.hashCode();
            }
        }
        return hash;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeByte(DataType.TUPLE);
        int sz = size();
        out.writeInt(sz);
        for (int i = 0; i < sz; i++) {
            DataReaderWriter.writeDatum(out, mFields.get(i));
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Clear our fields, in case we're being reused.
        mFields.clear();

        // Make sure it's a tuple.
        byte b = in.readByte();
        if (b != DataType.TUPLE) {
            int errCode = 2112;
            String msg = "Unexpected data while reading tuple " + "from binary file.";
            throw new ExecException(msg, errCode, PigException.BUG);
        }
        // Read the number of fields
        int sz = in.readInt();
        for (int i = 0; i < sz; i++) {
            try {
                append(DataReaderWriter.readDatum(in));
            } catch (ExecException ee) {
                throw ee;
            }
        }
    }

    public static Class<? extends TupleRawComparator> getComparatorClass() {
        return DefaultTupleRawComparator.class;
    }
}