com.moz.fiji.schema.FormattedEntityId.java Source code

Java tutorial

Introduction

Here is the source code for com.moz.fiji.schema.FormattedEntityId.java

Source

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.moz.fiji.schema;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;

import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.moz.fiji.annotations.ApiAudience;
import com.moz.fiji.schema.avro.ComponentType;
import com.moz.fiji.schema.avro.RowKeyEncoding;
import com.moz.fiji.schema.avro.RowKeyFormat2;
import com.moz.fiji.schema.util.ByteArrayFormatter;
import com.moz.fiji.schema.util.Hasher;

/**
 * Implements the Formatted Entity Id row key. This allows users to specify keys composed
 * of one or more components of the primitive types of string, integer or long with some
 * flexibility to specify hash prefixing.
 *
 * <h2>Goals</h2>
 * <ul>
 * <li>Allow logical hierarchies in the row key. E.g. store_id:product_id?.</li>
 * <li>Simplify ordering for primitive types (especially ints and longs).</li>
 * <li>Allow range scans over part of the key.</li>
 * <li>Simplify load distribution.</li>
 * <li>Allow nullable components while preserving ordering.</li>
 * </ul>
 *
 * <h2>Specifying formatted row keys in the layout</h2>
 * In the layout file for the table:
 * <ul>
 *   <li>Use RowKeyFormat2</li>
 *   <li>Set RowKeyEncoding to FORMATTED.</li>
 *   <li>Set the hash spec to specify the hashing algorithm and hash size
 *   for the hash prefix to your key.</li>
 *   <li>Set range_scan_start_index to the element at which you would like
 *   to support range scans. Components to the left of this one will be
 *   used to calculate the hash.</li>
 *   <li>Set nullable_start_index to determine which components can be null.</li>
 *   <li>Specify the primitive types of the composite key using
 *   {@link com.moz.fiji.schema.avro.RowKeyComponent}</li>
 * </ul>
 * <p>For more details, refer to {@link RowKeyFormat2}</p>
 *
 * <h2>Hbase encoding of formatted entity IDs</h2>
 * <ul>
 *   <li>Key begins with a hash of hash_size bytes as specified in the
 *   {@link com.moz.fiji.schema.avro.HashSpec}</li>
 *   <li>String components are UTF-8 encoded followed by '\u0000'</li>
 *   <li>Integers and longs are in their twos complement representation with the MSB toggled for
 *   preserving ordering.</li>
 *   <li>Integers are 4 byte long.</li>
 *   <li>Longs are 8 byte long.</li>
 *   <li>In order to preserve ordering, all components to the right of a null component must be
 *   null.</li>
 * </ul>
 *
 * <h2>Example</h2>
 * <p>
 * The json below creates a composite key with 3 components of type string, integer and long.
 * There will be a two byte hash of the first two components prefixed to the key. By default,
 * all except the first components can be set to null.
 * </p>
 *
 * <code>
 * keys_format : {
 *   encoding : "FORMATTED",
 *   salt : {
 *     hash_size : 2
 *   },
 *   range_scan_start_index: 2,
 *   components : [ {
 *     name : "dummy",
 *     type : "STRING"
 *   }, {
 *     name : "anint",
 *     type : "INTEGER"
 *   }, {
 *     name : "along",
 *     type : "LONG"
 *   } ]
 * }
 * </code>
 */
@ApiAudience.Private
public final class FormattedEntityId extends EntityId {
    // HBase row key bytes. The encoded components of the row key
    // potentially including a hash prefix, as specified in the row key format.
    private byte[] mHBaseRowKey;

    private List<Object> mComponentValues;

    private RowKeyFormat2 mRowKeyFormat;

    private static final Logger LOG = LoggerFactory.getLogger(FormattedEntityId.class);

    /**
     * Convert class of object to the correct ComponentType.
     * @param obj Input object (key component).
     * @return ComponentType representing the class of the input object.
     */
    private static ComponentType getType(Object obj) {
        if (obj instanceof String) {
            return ComponentType.STRING;
        } else if (obj instanceof Integer) {
            return ComponentType.INTEGER;
        } else if (obj instanceof Long) {
            return ComponentType.LONG;
        }
        throw new EntityIdException("Unexpected type for Component " + obj.getClass().getName());
    }

    /**
     * Creates a FormattedEntityId from the specified Fiji row key.
     *
     * @param fijiRowKey An ordered <b>mutable</b> list of objects of row key components. The
     *    contents of this list may be modified in case of any type promotions from
     *    Integer to Long.
     * @param format The RowKeyFormat2 as specified in the layout file.
     * @return a new FormattedEntityId with the specified Fiji row key.
     */
    static FormattedEntityId getEntityId(List<Object> fijiRowKey, RowKeyFormat2 format) {
        Preconditions.checkNotNull(format);
        Preconditions.checkNotNull(format.getSalt());
        Preconditions.checkNotNull(fijiRowKey);

        // Validity check for fiji  Row Key.
        if (fijiRowKey.size() > format.getComponents().size()) {
            throw new EntityIdException("Too many components in fiji Row Key");
        }
        if (fijiRowKey.size() < format.getComponents().size()) {
            int krk = fijiRowKey.size();
            int fgc = format.getComponents().size();
            if (LOG.isDebugEnabled()) {
                LOG.debug(
                        "{} components found in {}, {} expected by row key format."
                                + "The last {} components implicitly set to null",
                        krk, fijiRowKey.toString(), fgc, fgc - krk);
            }
        }
        if (fijiRowKey.size() < format.getNullableStartIndex()) {
            throw new EntityIdException("Too few components in fiji Row key");
        }

        // Validate the components passed in against the row key format such
        // as prevent non-null component from following null components and checking
        // component types against the format specified.
        boolean hasSeenNull = false;
        for (int i = 0; i < fijiRowKey.size(); i++) {
            // Prevent non-null components that follow null components
            if (hasSeenNull) {
                if (null == fijiRowKey.get(i)) {
                    continue;
                } else {
                    throw new EntityIdException("Non null component follows null component");
                }
            } else if (null == fijiRowKey.get(i)) {
                // we found a null, check if this is at a position greater than or equal to
                // nullable_start_index (the position from which null values are allowed)
                // also set the flag indicating we've seen a null value
                if (format.getNullableStartIndex() <= i) {
                    hasSeenNull = true;
                    continue;
                } else {
                    throw new EntityIdException("Unexpected null component in fiji row key." + String
                            .format("Expected at least %d non-null components", format.getNullableStartIndex()));
                }
            } else {
                // for non-null components ensure that the type matches the format spec
                ComponentType type = getType(fijiRowKey.get(i));
                if (null == type || type != format.getComponents().get(i).getType()) {
                    if (type == ComponentType.INTEGER
                            && format.getComponents().get(i).getType() == ComponentType.LONG) {
                        fijiRowKey.set(i, ((Integer) fijiRowKey.get(i)).longValue());
                    } else {
                        throw new EntityIdException(
                                String.format("Invalid type for component %s at index %d in fijiRowKey",
                                        fijiRowKey.get(i).toString(), i));
                    }
                }
            }
        }

        byte[] hbaseRowKey = null;

        hbaseRowKey = makeHbaseRowKey(format, fijiRowKey);

        return new FormattedEntityId(format, hbaseRowKey, fijiRowKey);
    }

    /**
     * Creates a FormattedEntityId from the specified HBase row key.
     *
     * @param hbaseRowKey A byte[] containing the HBase row key.
     * @param format The RowKeyFormat as specified in the layout file.
     * @return a new FormattedEntityId with the specified HBase row key.
     */
    static FormattedEntityId fromHBaseRowKey(byte[] hbaseRowKey, RowKeyFormat2 format) {
        Preconditions.checkNotNull(format);
        Preconditions.checkNotNull(format.getSalt());
        Preconditions.checkNotNull(hbaseRowKey);
        // we modify the hbaseRowKey in the makeFijiRowKey code for integer encoding, so we make
        // a copy of it to pass to makeFijiRowKey:
        byte[] cloneHbaseKey = hbaseRowKey.clone();
        List<Object> fijiRowKey = makeFijiRowKey(format, cloneHbaseKey);
        return new FormattedEntityId(format, hbaseRowKey, fijiRowKey);
    }

    /**
     * Create an hbase row key, which is a byte array from the given formatted fijiRowKey.
     * This method requires that the fijiRowKey argument is the correct length for the specified
     * format.
     * The following encoding will be used to ensure correct ordering:
     * Strings are UTF-8 encoded and terminated by a null byte. Strings cannot contain "\u0000".
     * Integers are exactly 4 bytes long.
     * Longs are exactly 8 bytes long.
     * Both integers and longs have the sign bit flipped so that their values are wrapped around to
     * create the correct lexicographic ordering. (i.e. after converting to byte array,
     * MIN_INT < 0 < MAX_INT).
     * Hashed components are exactly hash_size bytes long and are the first component of
     * the hbase key.
     * Except for the first, all components of a fijiRowKey can be null. However, to maintain
     * ordering, all components to the right of a null component must also be null. Nullable index
     * in the row key format specifies which component (and hence following components) are nullable.
     * By default, the hash only uses the first component, but this can be changed using the Range
     * Scan index.
     *
     * @param format The formatted row key format for this table.
     * @param fijiRowKey An ordered list of Objects of the key components.
     * @return A byte array representing the encoded Hbase row key.
     */
    private static byte[] makeHbaseRowKey(RowKeyFormat2 format, List<Object> fijiRowKey) {

        ArrayList<byte[]> hbaseKey = new ArrayList<byte[]>();
        final byte zeroDelim = 0;

        int pos;
        for (pos = 0; pos < fijiRowKey.size(); pos++) {
            // we have already done the validation check for null cascades.
            if (null == fijiRowKey.get(pos)) {
                continue;
            }
            byte[] tempBytes;
            switch (getType(fijiRowKey.get(pos))) {
            case STRING:
                if (((String) fijiRowKey.get(pos)).contains("\u0000")) {
                    throw new EntityIdException("String component cannot contain \u0000");
                }
                try {
                    hbaseKey.add(((String) fijiRowKey.get(pos)).getBytes("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    LOG.error(e.toString());
                    throw new EntityIdException(String.format("UnsupportedEncoding for component %d", pos));
                }
                break;
            case INTEGER:
                int temp = (Integer) fijiRowKey.get(pos);
                tempBytes = ByteBuffer.allocate(Integer.SIZE / Byte.SIZE).putInt(temp).array();
                tempBytes[0] = (byte) ((int) tempBytes[0] ^ (int) Byte.MIN_VALUE);
                hbaseKey.add(tempBytes);
                break;
            case LONG:
                long templong = (Long) fijiRowKey.get(pos);
                tempBytes = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).putLong(templong).array();
                tempBytes[0] = (byte) ((int) tempBytes[0] ^ (int) Byte.MIN_VALUE);
                hbaseKey.add(tempBytes);
                break;
            default:
                throw new RuntimeException("Invalid code path");
            }
        }

        // hash stuff
        int hashUpto = format.getRangeScanStartIndex() - 1;
        ByteArrayOutputStream tohash = new ByteArrayOutputStream();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        for (pos = 0; pos <= hashUpto && pos < hbaseKey.size(); pos++) {
            tohash.write(hbaseKey.get(pos), 0, hbaseKey.get(pos).length);
        }
        byte[] hashed = Arrays.copyOfRange(Hasher.hash(tohash.toByteArray()), 0, format.getSalt().getHashSize());
        baos.write(hashed, 0, hashed.length);

        // to materialize or not to materialize that is the question
        if (format.getSalt().getSuppressKeyMaterialization()) {
            return baos.toByteArray();
        } else {
            for (pos = 0; pos < hbaseKey.size(); pos++) {
                baos.write(hbaseKey.get(pos), 0, hbaseKey.get(pos).length);
                if (format.getComponents().get(pos).getType() == ComponentType.STRING
                        || format.getComponents().get(pos) == null) {
                    // empty strings will be encoded as null, hence we need to delimit them too
                    baos.write(zeroDelim);
                }
            }
            return baos.toByteArray();
        }
    }

    @Override
    public byte[] getHBaseRowKey() {
        return mHBaseRowKey;
    }

    /**
     * Decode a byte array containing an hbase row key into an ordered list corresponding to
     * the key format in the layout file.
     *
     * @param format The row key format as specified in the layout file.
     * @param hbaseRowKey A byte array containing the hbase row key.
     * @return An ordered list of component values in the key.
     */
    private static List<Object> makeFijiRowKey(RowKeyFormat2 format, byte[] hbaseRowKey) {
        if (hbaseRowKey.length == 0) {
            throw new EntityIdException("Invalid hbase row key");
        }
        List<Object> fijiRowKey = new ArrayList<Object>();
        // skip over the hash
        int pos = format.getSalt().getHashSize();
        // we are suppressing materialization, so the components cannot be retrieved.
        int fijiRowElem = 0;
        if (format.getSalt().getSuppressKeyMaterialization()) {
            if (pos < hbaseRowKey.length) {
                throw new EntityIdException("Extra bytes in key after hash when materialization is" + "suppressed");
            }
            return null;
        }
        ByteBuffer buf;

        while (fijiRowElem < format.getComponents().size() && pos < hbaseRowKey.length) {
            switch (format.getComponents().get(fijiRowElem).getType()) {
            case STRING:
                // Read the row key until we encounter a Null (0) byte or end.
                int endpos = pos;
                while (endpos < hbaseRowKey.length && (hbaseRowKey[endpos] != (byte) 0)) {
                    endpos += 1;
                }
                String str = null;
                try {
                    str = new String(hbaseRowKey, pos, endpos - pos, "UTF-8");
                } catch (UnsupportedEncodingException e) {
                    LOG.error(e.toString());
                    throw new EntityIdException(String.format("UnsupportedEncoding for component %d", fijiRowElem));
                }
                fijiRowKey.add(str);
                pos = endpos + 1;
                break;
            case INTEGER:
                // Toggle highest order bit to return to original 2's complement.
                hbaseRowKey[pos] = (byte) ((int) hbaseRowKey[pos] ^ (int) Byte.MIN_VALUE);
                try {
                    buf = ByteBuffer.wrap(hbaseRowKey, pos, Integer.SIZE / Byte.SIZE);
                } catch (IndexOutOfBoundsException e) {
                    throw new EntityIdException("Malformed hbase Row Key");
                }
                fijiRowKey.add(Integer.valueOf(buf.getInt()));
                pos = pos + Integer.SIZE / Byte.SIZE;
                break;
            case LONG:
                // Toggle highest order bit to return to original 2's complement.
                hbaseRowKey[pos] = (byte) ((int) hbaseRowKey[pos] ^ (int) Byte.MIN_VALUE);
                try {
                    buf = ByteBuffer.wrap(hbaseRowKey, pos, Long.SIZE / Byte.SIZE);
                } catch (IndexOutOfBoundsException e) {
                    throw new EntityIdException("Malformed hbase Row Key");
                }
                fijiRowKey.add(Long.valueOf(buf.getLong()));
                pos = pos + Long.SIZE / Byte.SIZE;
                break;
            default:
                throw new RuntimeException("Invalid code path");
            }
            fijiRowElem += 1;
        }

        // Fail if there are extra bytes in hbase row key.
        if (pos < hbaseRowKey.length) {
            throw new EntityIdException("Extra bytes in hbase row key cannot be mapped to any " + "component");
        }

        // Fail if we encounter nulls before it is legal to do so.
        if (fijiRowElem < format.getNullableStartIndex()) {
            throw new EntityIdException("Too few components decoded from hbase row key. Component " + "number "
                    + fijiRowElem + " cannot be null");
        }

        // finish up with nulls for everything that wasn't in the key
        for (; fijiRowElem < format.getComponents().size(); fijiRowElem++) {
            fijiRowKey.add(null);
        }

        return fijiRowKey;
    }

    /**
     * Creates a new FormattedEntityId.
     * @param format Format of the row key as specified in the layout file.
     * @param hbaseRowKey Byte array containing the hbase row key.
     * @param fijiRowKey An ordered list of row key components.
     */
    private FormattedEntityId(RowKeyFormat2 format, byte[] hbaseRowKey, List<Object> fijiRowKey) {
        mRowKeyFormat = Preconditions.checkNotNull(format);
        Preconditions.checkArgument(format.getEncoding() == RowKeyEncoding.FORMATTED);
        Preconditions.checkNotNull(format.getSalt(),
                "Formatted entityIds may not specify a null 'salt' field in RowKeyFormat2.");
        mHBaseRowKey = hbaseRowKey;
        if (format.getSalt().getSuppressKeyMaterialization()) {
            mComponentValues = null;
        } else {
            mComponentValues = fijiRowKey;
        }
    }

    /** {@inheritDoc} */
    @Override
    @SuppressWarnings("unchecked")
    public <T> T getComponentByIndex(int idx) {
        Preconditions.checkState(!mRowKeyFormat.getSalt().getSuppressKeyMaterialization(),
                "Cannot retrieve components as materialization is suppressed");
        Preconditions.checkArgument(idx >= 0 && idx < mComponentValues.size());
        return (T) mComponentValues.get(idx);
    }

    /** {@inheritDoc} */
    @Override
    public List<Object> getComponents() {
        Preconditions.checkState(!mRowKeyFormat.getSalt().getSuppressKeyMaterialization(),
                "Cannot retrieve components as materialization is suppressed");
        return Collections.unmodifiableList(mComponentValues);
    }

    /** {@inheritDoc} */
    @Override
    public String toString() {
        if (!mRowKeyFormat.getSalt().getSuppressKeyMaterialization()) {
            return Objects.toStringHelper(FormattedEntityId.class)
                    .add("components", Joiner.on(",").join(mComponentValues))
                    .add("hbase", Bytes.toStringBinary(mHBaseRowKey)).toString();
        } else {
            return Objects.toStringHelper(FormattedEntityId.class).addValue("components are suppressed")
                    .add("hbase", Bytes.toStringBinary(mHBaseRowKey)).toString();
        }
    }

    /** {@inheritDoc} */
    @Override
    public String toShellString() {
        if (mRowKeyFormat.getSalt().getSuppressKeyMaterialization()) {
            return String.format("hbase=hex:%s", ByteArrayFormatter.toHex(mHBaseRowKey));
        }

        /** Set of characters which must be escaped */
        HashSet<Character> escapeSet = Sets.newHashSet('"', '\\', '\'');
        ArrayList<String> componentStrings = Lists.newArrayList();
        for (Object component : mComponentValues) {
            if (component == null) {
                componentStrings.add("null");
            } else {
                String unescapedString = component.toString();
                ArrayList<Character> escapedString = Lists.newArrayList();
                for (char c : unescapedString.toCharArray()) {
                    if (escapeSet.contains(c)) {
                        escapedString.add('\\');
                    }
                    escapedString.add(c);
                }
                if (getType(component) == ComponentType.STRING) {
                    componentStrings.add(String.format("'%s'", StringUtils.join(escapedString, "")));
                } else {
                    componentStrings.add(component.toString());
                }
            }
        }
        return "[" + StringUtils.join(componentStrings, ", ") + "]";
    }
}