Java tutorial
/******************************************************************************* * Copyright (C) 2015 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. ******************************************************************************/ package com.google.cloud.dataflow.sdk.runners.worker; import com.google.common.math.LongMath; import com.google.common.primitives.Longs; import java.math.RoundingMode; import java.util.ArrayList; import java.util.Arrays; /** * This module provides routines for encoding a sequence of typed * entities into a byte array. The resulting byte arrays can be * lexicographically compared to yield the same comparison value that * would have been generated if the encoded items had been compared * one by one according to their type. * <p> * More precisely, suppose: * <ol> * <li> byte array A is generated by encoding the sequence of items [A_1..A_n] * <li> byte array B is generated by encoding the sequence of items [B_1..B_n] * <li> The types match; i.e., for all i: A_i was encoded using * the same routine as B_i * </ol> * Then: * Comparing A vs. B lexicographically is the same as comparing * the vectors [A_1..A_n] and [B_1..B_n] lexicographically. * * <p> * <b>This class is NOT thread safe.</b> */ public class OrderedCode { // We want to encode a few extra symbols in strings: // <sep> Separator between items // <infinity> Infinite string // // Therefore we need an alphabet with at least 258 characters. We // achieve this by using two-letter sequences starting with '\0' and '\xff' // as extra symbols: // <sep> encoded as => \0\1 // \0 encoded as => \0\xff // \xff encoded as => \xff\x00 // <infinity> encoded as => \xff\xff // // The remaining two letter sequences starting with '\0' and '\xff' // are currently unused. public static final byte ESCAPE1 = 0x00; public static final byte NULL_CHARACTER = (byte) 0xff; // Combined with ESCAPE1 public static final byte SEPARATOR = 0x01; // Combined with ESCAPE1 public static final byte ESCAPE2 = (byte) 0xff; public static final byte INFINITY = (byte) 0xff; // Combined with ESCAPE2 public static final byte FF_CHARACTER = 0x00; // Combined with ESCAPE2 public static final byte[] ESCAPE1_SEPARATOR = { ESCAPE1, SEPARATOR }; public static final byte[] INFINITY_ENCODED = { ESCAPE2, INFINITY }; /** * This array maps encoding length to header bits in the first two bytes for * SignedNumIncreasing encoding. */ private static final byte[][] LENGTH_TO_HEADER_BITS = { { 0, 0 }, { (byte) 0x80, 0 }, { (byte) 0xc0, 0 }, { (byte) 0xe0, 0 }, { (byte) 0xf0, 0 }, { (byte) 0xf8, 0 }, { (byte) 0xfc, 0 }, { (byte) 0xfe, 0 }, { (byte) 0xff, 0 }, { (byte) 0xff, (byte) 0x80 }, { (byte) 0xff, (byte) 0xc0 } }; /** * This array maps encoding lengths to the header bits that overlap with * the payload and need fixing during readSignedNumIncreasing. */ private static final long[] LENGTH_TO_MASK = { 0L, 0x80L, 0xc000L, 0xe00000L, 0xf0000000L, 0xf800000000L, 0xfc0000000000L, 0xfe000000000000L, 0xff00000000000000L, 0x8000000000000000L, 0L }; /** * This array maps the number of bits in a number to the encoding * length produced by WriteSignedNumIncreasing. * For positive numbers, the number of bits is 1 plus the most significant * bit position (the highest bit position in a positive long is 63). * For a negative number n, we count the bits in ~n. * That is, length = BITS_TO_LENGTH[log2Floor(n < 0 ? ~n : n) + 1]. */ private static final short[] BITS_TO_LENGTH = { 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10 }; /** * stores the current encoded value as a list of byte arrays. Note that this * is manipulated as we read/write items. * Note that every item will fit on at most one array. One array may * have more than one item (eg when used for decoding). While encoding, * one array will have exactly one item. While returning the encoded array * we will merge all the arrays in this list. */ private final ArrayList<byte[]> encodedArrays = new ArrayList<>(); /** * This is the current position on the first array. Will be non-zero * only if the ordered code was created using encoded byte array. */ private int firstArrayPosition = 0; /** * Creates OrderedCode from scractch. Typically used at encoding time. */ public OrderedCode() { } /** * Creates OrderedCode from a given encoded byte array. Typically used at * decoding time. * * <p> * <b> For better performance, it uses the input array provided (not a copy). * Therefore the input array should not be modified.</b> */ public OrderedCode(byte[] encodedByteArray) { encodedArrays.add(encodedByteArray); } /** * Adds the given byte array item to the OrderedCode. It encodes the input * byte array, followed by a separator and appends the result to its * internal encoded byte array store. * * <p> * It works with the input array, * so the input array 'value' should not be modified till the method returns. * * @param value bytes to be written. * @see #readBytes() */ public void writeBytes(byte[] value) { // Determine the length of the encoded array int encodedLength = 2; // for separator for (byte b : value) { if ((b == ESCAPE1) || (b == ESCAPE2)) { encodedLength += 2; } else { encodedLength++; } } byte[] encodedArray = new byte[encodedLength]; int copyStart = 0; int outIndex = 0; for (int i = 0; i < value.length; i++) { byte b = value[i]; if (b == ESCAPE1) { System.arraycopy(value, copyStart, encodedArray, outIndex, i - copyStart); outIndex += i - copyStart; encodedArray[outIndex++] = ESCAPE1; encodedArray[outIndex++] = NULL_CHARACTER; copyStart = i + 1; } else if (b == ESCAPE2) { System.arraycopy(value, copyStart, encodedArray, outIndex, i - copyStart); outIndex += i - copyStart; encodedArray[outIndex++] = ESCAPE2; encodedArray[outIndex++] = FF_CHARACTER; copyStart = i + 1; } } if (copyStart < value.length) { System.arraycopy(value, copyStart, encodedArray, outIndex, value.length - copyStart); outIndex += value.length - copyStart; } encodedArray[outIndex++] = ESCAPE1; encodedArray[outIndex] = SEPARATOR; encodedArrays.add(encodedArray); } /** * Encodes the long item, in big-endian format, and appends the result to its * internal encoded byte array store. * <p> * Note that the specified long is treated like a uint64, e.g. * {@code new OrderedCode().writeNumIncreasing(-1L).getEncodedBytes()} * is greater than * {@code new OrderedCode().writeNumIncreasing(Long.MAX_VALUE).getEncodedBytes()}. * * @see #readNumIncreasing() */ public void writeNumIncreasing(long value) { // Values are encoded with a single byte length prefix, followed // by the actual value in big-endian format with leading 0 bytes // dropped. byte[] bufer = new byte[9]; // 8 bytes for value plus one byte for length int len = 0; while (value != 0) { len++; bufer[9 - len] = (byte) (value & 0xff); value >>>= 8; } bufer[9 - len - 1] = (byte) len; len++; byte[] encodedArray = new byte[len]; System.arraycopy(bufer, 9 - len, encodedArray, 0, len); encodedArrays.add(encodedArray); } /** * Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0. */ int log2Floor(long n) { if (n < 0) { throw new IllegalArgumentException("must be non-negative"); } return n == 0 ? -1 : LongMath.log2(n, RoundingMode.FLOOR); } /** * Calculates the encoding length in bytes of the signed number n. */ int getSignedEncodingLength(long n) { return BITS_TO_LENGTH[log2Floor(n < 0 ? ~n : n) + 1]; } /** * Encodes the long item, in big-endian format, and appends the result to its * internal encoded byte array store. * <p> * Note that the specified long is treated like an int64, i.e. * {@code new OrderedCode().writeNumIncreasing(-1L).getEncodedBytes()} * is less than * {@code new OrderedCode().writeNumIncreasing(0L).getEncodedBytes()}. * * @see #readSignedNumIncreasing() */ public void writeSignedNumIncreasing(long val) { long x = val < 0 ? ~val : val; if (x < 64) { // Fast path for encoding length == 1. byte[] encodedArray = new byte[] { (byte) (LENGTH_TO_HEADER_BITS[1][0] ^ val) }; encodedArrays.add(encodedArray); return; } // buf = val in network byte order, sign extended to 10 bytes. byte signByte = val < 0 ? (byte) 0xff : 0; byte[] buf = new byte[2 + Longs.BYTES]; buf[0] = buf[1] = signByte; System.arraycopy(Longs.toByteArray(val), 0, buf, 2, Longs.BYTES); int len = getSignedEncodingLength(x); if (len < 2) { throw new IllegalStateException( "Invalid length (" + len + ")" + " returned by getSignedEncodingLength(" + x + ")"); } int beginIndex = buf.length - len; buf[beginIndex] ^= LENGTH_TO_HEADER_BITS[len][0]; buf[beginIndex + 1] ^= LENGTH_TO_HEADER_BITS[len][1]; byte[] encodedArray = new byte[len]; System.arraycopy(buf, beginIndex, encodedArray, 0, len); encodedArrays.add(encodedArray); } /** * Encodes and appends INFINITY item to its internal encoded byte array * store. * * @see #readInfinity() */ public void writeInfinity() { writeTrailingBytes(INFINITY_ENCODED); } /** * Appends the byte array item to its internal encoded byte array * store. This is used for the last item and is not encoded. It * also can be used to write a fixed number of bytes that will be * read back using {@link #readBytes(int)}. * * <p> * It stores the input array in the store, * so the input array 'value' should not be modified. * * @param value bytes to be written. * @see #readTrailingBytes() * @see #readBytes(int) */ public void writeTrailingBytes(byte[] value) { if ((value == null) || (value.length == 0)) { throw new IllegalArgumentException("Value cannot be null or have 0 elements"); } encodedArrays.add(value); } /** * Returns the next byte array item from its encoded byte array store and * removes the item from the store. * * @see #writeBytes(byte[]) */ public byte[] readBytes() { if ((encodedArrays == null) || (encodedArrays.size() == 0) || ((encodedArrays.get(0)).length - firstArrayPosition <= 0)) { throw new IllegalArgumentException("Invalid encoded byte array"); } // Determine the length of the decoded array // We only scan up to "length-2" since a valid string must end with // a two character terminator: 'ESCAPE1 SEPARATOR' byte[] store = encodedArrays.get(0); int decodedLength = 0; boolean valid = false; int i = firstArrayPosition; while (i < store.length - 1) { byte b = store[i++]; if (b == ESCAPE1) { b = store[i++]; if (b == SEPARATOR) { valid = true; break; } else if (b == NULL_CHARACTER) { decodedLength++; } else { throw new IllegalArgumentException("Invalid encoded byte array"); } } else if (b == ESCAPE2) { b = store[i++]; if (b == FF_CHARACTER) { decodedLength++; } else { throw new IllegalArgumentException("Invalid encoded byte array"); } } else { decodedLength++; } } if (!valid) { throw new IllegalArgumentException("Invalid encoded byte array"); } byte[] decodedArray = new byte[decodedLength]; int copyStart = firstArrayPosition; int outIndex = 0; int j = firstArrayPosition; while (j < store.length - 1) { byte b = store[j++]; // note that j has been incremented if (b == ESCAPE1) { System.arraycopy(store, copyStart, decodedArray, outIndex, j - copyStart - 1); outIndex += j - copyStart - 1; // ESCAPE1 SEPARATOR ends component // ESCAPE1 NULL_CHARACTER represents '\0' b = store[j++]; if (b == SEPARATOR) { if ((store.length - j) == 0) { // we are done with the first array encodedArrays.remove(0); firstArrayPosition = 0; } else { firstArrayPosition = j; } return decodedArray; } else if (b == NULL_CHARACTER) { decodedArray[outIndex++] = 0x00; } // else not required - handled during length determination copyStart = j; } else if (b == ESCAPE2) { System.arraycopy(store, copyStart, decodedArray, outIndex, j - copyStart - 1); outIndex += j - copyStart - 1; // ESCAPE2 FF_CHARACTER represents '\xff' // ESCAPE2 INFINITY is an error b = store[j++]; if (b == FF_CHARACTER) { decodedArray[outIndex++] = (byte) 0xff; } // else not required - handled during length determination copyStart = j; } } // not required due to the first phase, but need to entertain the compiler throw new IllegalArgumentException("Invalid encoded byte array"); } /** * Returns the next long item (encoded in big-endian format via * {@code writeNumIncreasing(long)}) from its internal encoded byte array * store and removes the item from the store. * * @see #writeNumIncreasing(long) */ public long readNumIncreasing() { if ((encodedArrays == null) || (encodedArrays.size() == 0) || ((encodedArrays.get(0)).length - firstArrayPosition < 1)) { throw new IllegalArgumentException("Invalid encoded byte array"); } byte[] store = encodedArrays.get(0); // Decode length byte int len = store[firstArrayPosition]; if ((firstArrayPosition + len + 1 > store.length) || len > 8) { throw new IllegalArgumentException("Invalid encoded byte array"); } long result = 0; for (int i = 0; i < len; i++) { result <<= 8; result |= (store[firstArrayPosition + i + 1] & 0xff); } if ((store.length - firstArrayPosition - len - 1) == 0) { // we are done with the first array encodedArrays.remove(0); firstArrayPosition = 0; } else { firstArrayPosition = firstArrayPosition + len + 1; } return result; } /** * Returns the next long item (encoded via * {@code writeSignedNumIncreasing(long)}) from its internal encoded byte * array store and removes the item from the store. * * @see #writeSignedNumIncreasing(long) */ public long readSignedNumIncreasing() { if ((encodedArrays == null) || (encodedArrays.size() == 0) || ((encodedArrays.get(0)).length - firstArrayPosition < 1)) { throw new IllegalArgumentException("Invalid encoded byte array"); } byte[] store = encodedArrays.get(0); long xorMask = ((store[firstArrayPosition] & 0x80) == 0) ? ~0L : 0L; // Store first byte as an int rather than a (signed) byte -- to avoid // accidental byte-to-int promotion later, which would extend the byte's // sign bit (if any). int firstByte = (store[firstArrayPosition] & 0xff) ^ (int) (xorMask & 0xff); // Now calculate and test length, and set x to raw (unmasked) result. int len; long x; if (firstByte != 0xff) { len = 7 - log2Floor(firstByte ^ 0xff); if (store.length - firstArrayPosition < len) { throw new IllegalArgumentException("Invalid encoded byte array"); } x = xorMask; // Sign extend using xorMask. for (int i = firstArrayPosition; i < firstArrayPosition + len; i++) { x = (x << 8) | (store[i] & 0xff); } } else { len = 8; if (store.length - firstArrayPosition < len) { throw new IllegalArgumentException("Invalid encoded byte array"); } int secondByte = (store[firstArrayPosition + 1] & 0xff) ^ (int) (xorMask & 0xff); if (secondByte >= 0x80) { if (secondByte < 0xc0) { len = 9; } else { int thirdByte = (store[firstArrayPosition + 2] & 0xff) ^ (int) (xorMask & 0xff); if (secondByte == 0xc0 && thirdByte < 0x80) { len = 10; } else { // Either len > 10 or len == 10 and #bits > 63. throw new IllegalArgumentException("Invalid encoded byte array"); } } if (store.length - firstArrayPosition < len) { throw new IllegalArgumentException("Invalid encoded byte array"); } } x = Longs.fromByteArray( Arrays.copyOfRange(store, firstArrayPosition + len - 8, firstArrayPosition + len)); } x ^= LENGTH_TO_MASK[len]; // Remove spurious header bits. if (len != getSignedEncodingLength(x)) { throw new IllegalArgumentException("Invalid encoded byte array"); } if ((store.length - firstArrayPosition - len) == 0) { // We are done with the first array. encodedArrays.remove(0); firstArrayPosition = 0; } else { firstArrayPosition = firstArrayPosition + len; } return x; } /** * Removes INFINITY item from its internal encoded byte array store * if present. Returns whether INFINITY was present. * * @see #writeInfinity() */ public boolean readInfinity() { if ((encodedArrays == null) || (encodedArrays.size() == 0) || ((encodedArrays.get(0)).length - firstArrayPosition < 1)) { throw new IllegalArgumentException("Invalid encoded byte array"); } byte[] store = encodedArrays.get(0); if (store.length - firstArrayPosition < 2) { return false; } if ((store[firstArrayPosition] == ESCAPE2) && (store[firstArrayPosition + 1] == INFINITY)) { if ((store.length - firstArrayPosition - 2) == 0) { // we are done with the first array encodedArrays.remove(0); firstArrayPosition = 0; } else { firstArrayPosition = firstArrayPosition + 2; } return true; } else { return false; } } /** * Returns the trailing byte array item from its internal encoded byte array * store and removes the item from the store. * * @see #writeTrailingBytes(byte[]) */ public byte[] readTrailingBytes() { // one item is contained within one byte array if ((encodedArrays == null) || (encodedArrays.size() != 1)) { throw new IllegalArgumentException("Invalid encoded byte array"); } byte[] store = encodedArrays.get(0); encodedArrays.remove(0); assert encodedArrays.size() == 0; return Arrays.copyOfRange(store, firstArrayPosition, store.length); } /** * Reads (unencoded) {@code len} bytes. * * @see #writeTrailingBytes(byte[]) */ public byte[] readBytes(int len) { if ((encodedArrays == null) || (encodedArrays.size() == 0) || ((encodedArrays.get(0)).length - firstArrayPosition < len)) { throw new IllegalArgumentException("Invalid encoded byte array"); } byte[] store = encodedArrays.get(0); byte[] result; if (store.length - firstArrayPosition == len) { // We are done with the first array. result = encodedArrays.remove(0); firstArrayPosition = 0; } else { result = new byte[len]; System.arraycopy(store, firstArrayPosition, result, 0, len); firstArrayPosition = firstArrayPosition + len; } return result; } /** * Returns the encoded bytes that represent the current state of the * OrderedCode. * * <p> * <b> NOTE: This method returns OrederedCode's internal array (not a * copy) for better performance. Therefore the returned array should not be * modified.</b> */ public byte[] getEncodedBytes() { if (encodedArrays.size() == 0) { return new byte[0]; } if ((encodedArrays.size() == 1) && (firstArrayPosition == 0)) { return encodedArrays.get(0); } int totalLength = 0; for (int i = 0; i < encodedArrays.size(); i++) { byte[] bytes = encodedArrays.get(i); if (i == 0) { totalLength += bytes.length - firstArrayPosition; } else { totalLength += bytes.length; } } byte[] encodedBytes = new byte[totalLength]; int destPos = 0; for (int i = 0; i < encodedArrays.size(); i++) { byte[] bytes = encodedArrays.get(i); if (i == 0) { System.arraycopy(bytes, firstArrayPosition, encodedBytes, destPos, bytes.length - firstArrayPosition); destPos += bytes.length - firstArrayPosition; } else { System.arraycopy(bytes, 0, encodedBytes, destPos, bytes.length); destPos += bytes.length; } } // replace the store with merged array, so that repeated calls // don't need to merge. The reads can handle both the versions. encodedArrays.clear(); encodedArrays.add(encodedBytes); firstArrayPosition = 0; return encodedBytes; } /** * Returns true if this has more encoded bytes that haven't been read, * false otherwise. Return value of true doesn't imply anything about * validity of remaining data. * @return true if it has more encoded bytes that haven't been read, * false otherwise. */ public boolean hasRemainingEncodedBytes() { // We delete an array after fully consuming it. return encodedArrays != null && encodedArrays.size() != 0; } }