co.cask.cdap.api.dataset.lib.IndexedTable.java Source code

Introduction

Here is the source code for co.cask.cdap.api.dataset.lib.IndexedTable.java
Source

/*
 * Copyright  2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.api.dataset.lib;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.RecordScanner;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.data.batch.SplitReader;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.dataset.table.Delete;
import co.cask.cdap.api.dataset.table.Get;
import co.cask.cdap.api.dataset.table.Increment;
import co.cask.cdap.api.dataset.table.Put;
import co.cask.cdap.api.dataset.table.Result;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scan;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.Table;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Longs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Type;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import javax.annotation.Nullable;

/**
 * Implements a table that creates and maintains indexes on values stored within a configured set of column names.
 *
 * <p>
 * This dataset uses two tables:
 * <ul>
 *   <li>the actual data table, which stores the raw, unmodified rows which are written; and</li>
 *   <li>an index table, with rows keyed by the indexed column and value (plus data row key for uniqueness),
 *   which contains a reference to the row key in the data table matching the indexed value.</li>
 * </ul>
 * </p>
 *
 * <p>The indexed values need not be unique.  When reading the data back by index value, a {@link Scanner} will be
 * returned, allowing the client to iterate through all matching rows. Exact matches as well as range lookups on
 * indexed values are supported.
 * </p>
 *
 * <p>Index entries are created by storing additional rows in a second table.  These index rows are keyed by
 * column name, column value, and original row key, each field separated by a single null byte delimiter.  It is not
 * recommended that the name of an index column contains the null byte (this can cause a degradation in performance
 * of index reads).
 * </p>
 *
 * <p>The columns to index can be configured in the {@link co.cask.cdap.api.dataset.DatasetProperties} used
 * when the dataset instance in created.  Multiple column names should be listed as a comma-separated string
 * (with no spaces):
 *
 * <p>
 * <pre><code>public class MyApp extends AbstractApplication {
 *   public void configure() {
 *     setName("MyApp");
 *     ...
 *     createDataset("indexedData", IndexedTable.class,
 *                   DatasetProperties.builder().add(
 *                       IndexedTableDefinition.INDEX_COLUMNS_CONF_KEY, "col1,col2").build());
 *     ...
 *   }
 * }</code></pre>
 * </p>
 * 
 * <p>
 * Note that this means that the column names which should be indexed cannot contain the comma character,
 * as it would break parsing of the configuration property.
 * </p>
 *
 * @see co.cask.cdap.api.dataset.lib.IndexedTableDefinition#INDEX_COLUMNS_CONF_KEY
 */
public class IndexedTable extends AbstractDataset implements Table {
    private static final Logger LOG = LoggerFactory.getLogger(IndexedTable.class);

    /**
     * Column key used to store the existence of a row in the secondary index.
     */
    private static final byte[] IDX_COL = { 'r' };
    private static final byte DELIMITER_BYTE = 0;
    private static final byte[] KEY_DELIMITER = new byte[] { DELIMITER_BYTE };

    private final boolean hasColumnWithDelimiter;
    // the two underlying tables
    private Table table, index;
    // the secondary index column
    private SortedSet<byte[]> indexedColumns;

    /**
     * Configuration time constructor.
     * 
     * @param name the name of the table
     * @param table table to use as the table
     * @param index table to use as the index
     * @param columnsToIndex the names of the data columns to index
     */
    public IndexedTable(String name, Table table, Table index, byte[][] columnsToIndex) {
        super(name, table, index);
        this.table = table;
        this.index = index;
        this.indexedColumns = Sets.newTreeSet(Bytes.BYTES_COMPARATOR);
        this.hasColumnWithDelimiter = hasDelimiterByte(columnsToIndex);
        Collections.addAll(this.indexedColumns, columnsToIndex);
    }

    /**
     * Checks if a set of columns contains the DELIMITER_BYTE.
     * This is needed because only when a column has a null byte in it do we need to check for false positive in the scan
     * on the index by checking against the column's value in the data table.
     * For instance, a false positive can arise with an index on the following two rows:
     * <ul>
     *  <li>Row #1: column name: a\0b, column value: c</li>
     *  <li>Row #2: column name: a, column value: b\0c</li>
     * </ul>
     * Because of how the rows are constructed in the index table, both will be prefixed (and therefore looked up by)
     * the row key prefix a\0b\0c.
     *
     * @param columns the set of columns being checked
     * @return true if any of the columns contain the DELIMITER_BYTE
     */
    private boolean hasDelimiterByte(byte[][] columns) {
        for (byte[] column : columns) {
            for (byte b : column) {
                if (b == DELIMITER_BYTE) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Read a row by row key from the data table.
     *
     * @param get the read operation, as if it were on a non-indexed table
     * @return the result of the read on the underlying primary table
     */
    @Override
    public Row get(Get get) {
        return table.get(get);
    }

    @Override
    public Row get(byte[] row) {
        return table.get(row);
    }

    @Override
    public byte[] get(byte[] row, byte[] column) {
        return table.get(row, column);
    }

    @Override
    public Row get(byte[] row, byte[][] columns) {
        return table.get(row, columns);
    }

    @Override
    public Row get(byte[] row, byte[] startColumn, byte[] stopColumn, int limit) {
        return table.get(row, startColumn, stopColumn, limit);
    }

    @Override
    public List<Row> get(List<Get> gets) {
        return table.get(gets);
    }

    /**
     * Reads table rows by the given secondary index key.  If no rows are indexed by the given key, then a
     * {@link co.cask.cdap.api.dataset.table.Scanner} with no results will be returned.
     * 
     * @return a Scanner returning rows from the data table, whose stored value for the given column matches the
     * given value.
     * @throws java.lang.IllegalArgumentException if the given column is not configured for indexing.
     */
    public Scanner readByIndex(byte[] column, byte[] value) {
        assertIndexedColumn(column);
        byte[] rowKeyPrefix = Bytes.concat(column, KEY_DELIMITER, value, KEY_DELIMITER);
        byte[] stopRow = Bytes.stopKeyForPrefix(rowKeyPrefix);
        Scanner indexScan = index.scan(rowKeyPrefix, stopRow);
        return new IndexScanner(indexScan, column, value);
    }

    /**
     * Reads table rows within the given secondary index key range. If no rows are indexed, falling within the given
     * range, then a {@link co.cask.cdap.api.dataset.table.Scanner} with no results will be returned.
     *
     * @param column the column to use for the index lookup
     * @param startValue the inclusive start of the range for which rows must fall within to be returned in the scan.
     *                   {@code null} means start from first row of the table
     * @param endValue the exclusive end of the range for which rows must fall within to be returned in the scan
     *                 {@code null} means end with the last row of the table
     * @return a Scanner returning rows from the data table, whose stored value for the given column is within the the
     *         given range.
     * @throws java.lang.IllegalArgumentException if the given column is not configured for indexing.
     */
    public Scanner scanByIndex(byte[] column, @Nullable byte[] startValue, @Nullable byte[] endValue) {
        assertIndexedColumn(column);
        // KEY_DELIMITER is not used at the end of the rowKeys, because they are used for a range scan,
        // instead of a fixed-match lookup
        byte[] startRow = startValue == null ? Bytes.concat(column, KEY_DELIMITER)
                : Bytes.concat(column, KEY_DELIMITER, startValue);
        byte[] stopRow = endValue == null ? Bytes.stopKeyForPrefix(Bytes.concat(column, KEY_DELIMITER))
                : Bytes.concat(column, KEY_DELIMITER, endValue);
        Scanner indexScan = index.scan(startRow, stopRow);
        return new IndexRangeScanner(indexScan, column, startValue, endValue);
    }

    private void assertIndexedColumn(byte[] column) {
        if (!indexedColumns.contains(column)) {
            throw new IllegalArgumentException(
                    "Column " + Bytes.toStringBinary(column) + " is not configured for indexing");
        }
    }

    /**
     * Writes a put to the data table. If any of the columns in the {@link Put} are configured to be indexed, the
     * appropriate indexes will be updated with the indexed values referencing the data table row.
     * 
     * @param put The put operation to store
     */
    @Override
    public void put(Put put) {
        // if different value exists, remove current index ref
        // add a new index ref unless same value already exists
        byte[] dataRow = put.getRow();
        // find which values need to be indexed
        Map<byte[], byte[]> putColumns = put.getValues();
        Set<byte[]> colsToIndex = Sets.newTreeSet(Bytes.BYTES_COMPARATOR);
        for (Map.Entry<byte[], byte[]> putEntry : putColumns.entrySet()) {
            if (indexedColumns.contains(putEntry.getKey())) {
                colsToIndex.add(putEntry.getKey());
            }
        }

        // first read the existing indexed values to find which have changed and need to be updated
        Row existingRow = table.get(dataRow, colsToIndex.toArray(new byte[colsToIndex.size()][]));
        for (Map.Entry<byte[], byte[]> entry : existingRow.getColumns().entrySet()) {
            if (!Arrays.equals(entry.getValue(), putColumns.get(entry.getKey()))) {
                index.delete(createIndexKey(dataRow, entry.getKey(), entry.getValue()), IDX_COL);
            } else {
                // value already indexed
                colsToIndex.remove(entry.getKey());
            }
        }

        // add new index entries for all values that have changed or did not exist
        for (byte[] col : colsToIndex) {
            index.put(createIndexKey(dataRow, col, putColumns.get(col)), IDX_COL, dataRow);
        }

        // store the data row
        table.put(put);
    }

    private byte[] createIndexKey(byte[] row, byte[] column, byte[] value) {
        return Bytes.concat(column, KEY_DELIMITER, value, KEY_DELIMITER, row);
    }

    @Override
    public void put(byte[] row, byte[] column, byte[] value) {
        Put put = new Put(row);
        put.add(column, value);
        put(put);
    }

    @Override
    public void put(byte[] row, byte[][] columns, byte[][] values) {
        Put put = new Put(row);
        for (int i = 0; i < columns.length; i++) {
            put.add(columns[i], values[i]);
        }
        put(put);
    }

    /**
     * Perform a delete on the data table.  Any index entries referencing the deleted row will also be removed.
     * 
     * @param delete The delete operation identifying the row and optional columns to remove
     */
    @Override
    public void delete(Delete delete) {
        if (delete.getColumns().isEmpty()) {
            // full row delete
            delete(delete.getRow());
            return;
        }
        delete(delete.getRow(), delete.getColumns().toArray(new byte[0][]));
    }

    @Override
    public void delete(byte[] row) {
        Row existingRow = table.get(row);
        if (existingRow.isEmpty()) {
            // no row to delete
            return;
        }

        // delete all index entries
        deleteIndexEntries(existingRow);

        // delete the row
        table.delete(row);
    }

    @Override
    public void delete(byte[] row, byte[] column) {
        delete(row, new byte[][] { column });
    }

    @Override
    public void delete(byte[] row, byte[][] columns) {
        Row existingRow = table.get(row, columns);
        if (existingRow.isEmpty()) {
            // no row to delete
            return;
        }

        // delete all index entries
        deleteIndexEntries(existingRow);

        // delete the row's columns
        table.delete(row, columns);
    }

    private void deleteIndexEntries(Row existingRow) {
        byte[] row = existingRow.getRow();
        for (Map.Entry<byte[], byte[]> entry : existingRow.getColumns().entrySet()) {
            if (indexedColumns.contains(entry.getKey())) {
                index.delete(createIndexKey(row, entry.getKey(), entry.getValue()), IDX_COL);
            }
        }
    }

    /**
     * Perform a swap operation by primary key.
     * Parameters are as if they were on a non-indexed table.
     * Note that if the swap is on the secondary key column,
     * then the index must be updated; otherwise, this is a
     * pass-through to the underlying table.
     */
    @Override
    public boolean compareAndSwap(byte[] row, byte[] column, byte[] expected, byte[] newValue) {
        // if the swap is on a column other than the column key, then
        // the index is not affected - just execute the swap.
        // also, if the swap is on the index column, but the old value
        // is the same as the new value, then the index is not affected either.
        if (!indexedColumns.contains(column) || Arrays.equals(expected, newValue)) {
            return table.compareAndSwap(row, column, expected, newValue);
        }

        // the swap is on the index column. it will only succeed if the current
        // value matches the expected value of the swap. if that value is not null,
        // then we must remove the row key from the index for that value.
        Delete idxDelete = null;
        if (expected != null) {
            idxDelete = new Delete(createIndexKey(row, column, expected), IDX_COL);
        }

        // if the new value is not null, then we must add the rowkey to the index
        // for that value.
        Put idxPut = null;
        if (newValue != null) {
            idxPut = new Put(createIndexKey(row, column, newValue), IDX_COL, row);
        }

        // apply all operations to both tables
        boolean success = table.compareAndSwap(row, column, expected, newValue);
        if (!success) {
            // do nothing: no changes
            return false;
        }
        if (idxDelete != null) {
            index.delete(idxDelete);
        }
        if (idxPut != null) {
            index.put(idxPut);
        }

        return true;
    }

    /**
     * Increments (atomically) the specified row and column by the specified amount, and returns the new value.
     * Note that performing this operation on an indexed column will generally have a negative impact on performance,
     * since up to three writes will need to be performed for every increment (one removing the index for the previous,
     * pre-increment value, one adding the index for the incremented value, and one for the increment itself).
     *
     * @see Table#incrementAndGet(byte[], byte[], long)
     */
    @Override
    public long incrementAndGet(byte[] row, byte[] column, long amount) {
        byte[] newValue = incrementAndGet(row, new byte[][] { column }, new long[] { amount }).get(column);
        return Bytes.toLong(newValue);
    }

    /**
     * Increments (atomically) the specified row and columns by the specified amounts, and returns the new values.
     * Note that performing this operation on an indexed column will generally have a negative impact on performance,
     * since up to three writes will need to be performed for every increment (one removing the index for the previous,
     * pre-increment value, one adding the index for the incremented value, and one for the increment itself).
     *
     * @see Table#incrementAndGet(byte[], byte[][], long[])
     */
    @Override
    public Row incrementAndGet(byte[] row, byte[][] columns, long[] amounts) {
        Preconditions.checkArgument(columns.length == amounts.length,
                "Size of columns and amounts arguments must match");

        Row existingRow = table.get(row, columns);
        byte[][] updatedValues = new byte[columns.length][];
        NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);

        for (int i = 0; i < columns.length; i++) {
            long existingValue = 0L;
            byte[] existingBytes = existingRow.get(columns[i]);
            if (existingBytes != null) {
                if (existingBytes.length != Bytes.SIZEOF_LONG) {
                    throw new NumberFormatException(
                            "Attempted to increment a value that is not convertible to long," + " row: "
                                    + Bytes.toStringBinary(row) + " column: " + Bytes.toStringBinary(columns[i]));
                }
                existingValue = Bytes.toLong(existingBytes);
                if (indexedColumns.contains(columns[i])) {
                    index.delete(createIndexKey(row, columns[i], existingBytes), IDX_COL);
                }
            }
            updatedValues[i] = Bytes.toBytes(existingValue + amounts[i]);
            result.put(columns[i], updatedValues[i]);
            if (indexedColumns.contains(columns[i])) {
                index.put(createIndexKey(row, columns[i], updatedValues[i]), IDX_COL, row);
            }
        }

        table.put(row, columns, updatedValues);
        return new Result(row, result);
    }

    /**
     * Increments (atomically) the specified row and columns by the specified amounts, and returns the new values.
     * Note that performing this operation on an indexed column will generally have a negative impact on performance,
     * since up to three writes will need to be performed for every increment (one removing the index for the previous,
     * pre-increment value, one adding the index for the incremented value, and one for the increment itself).
     *
     * @see Table#incrementAndGet(Increment)
     */
    @Override
    public Row incrementAndGet(Increment increment) {
        Map<byte[], Long> incrementValues = increment.getValues();
        return incrementAndGet(increment.getRow(),
                incrementValues.keySet().toArray(new byte[incrementValues.size()][]),
                Longs.toArray(incrementValues.values()));
    }

    /**
     * Increments (atomically) the specified row and column by the specified amount, without returning the new value.
     * Note that performing this operation on an indexed column throws {@link java.lang.IllegalArgumentException}.
     *
     * @see Table#increment(byte[], byte[], long)
     */
    @Override
    public void increment(byte[] row, byte[] column, long amount) {
        // read-less increments should not be used on indexed columns
        if (indexedColumns.contains(column)) {
            throw new IllegalArgumentException("Read-less increment is not supported on indexed column '"
                    + Bytes.toStringBinary(column) + "'");
        }
        table.increment(row, column, amount);
    }

    /**
     * Increments (atomically) the specified row and columns by the specified amounts, without returning the new values.
     * Note that performing this operation on an indexed column throws {@link java.lang.IllegalArgumentException}.
     *
     * @see Table#increment(byte[], byte[][], long[])
     */
    @Override
    public void increment(byte[] row, byte[][] columns, long[] amounts) {
        // read-less increments should not be used on indexed columns
        for (byte[] col : columns) {
            if (indexedColumns.contains(col)) {
                throw new IllegalArgumentException("Read-less increment is not supported on indexed column '"
                        + Bytes.toStringBinary(col) + "'");
            }
        }
        table.increment(row, columns, amounts);
    }

    /**
     * Increments (atomically) the specified row and columns by the specified amounts, without returning the new values.
     * Note that performing this operation on an indexed column throws {@link java.lang.IllegalArgumentException}.
     *
     * @see Table#increment(Increment)
     */
    @Override
    public void increment(Increment increment) {
        for (byte[] col : increment.getValues().keySet()) {
            if (indexedColumns.contains(col)) {
                throw new IllegalArgumentException("Read-less increment is not supported on indexed column '"
                        + Bytes.toStringBinary(col) + "'");
            }
        }
        table.increment(increment);
    }

    @Override
    public Scanner scan(@Nullable byte[] startRow, @Nullable byte[] stopRow) {
        return table.scan(startRow, stopRow);
    }

    @Override
    public Scanner scan(Scan scan) {
        return table.scan(scan);
    }

    /* BatchReadable implementation */

    @Override
    public List<Split> getSplits(int numSplits, byte[] start, byte[] stop) {
        return table.getSplits(numSplits, start, stop);
    }

    @Override
    public Type getRecordType() {
        return table.getRecordType();
    }

    @Override
    public List<Split> getSplits() {
        return table.getSplits();
    }

    @Override
    public RecordScanner<StructuredRecord> createSplitRecordScanner(Split split) {
        return table.createSplitRecordScanner(split);
    }

    @Override
    public SplitReader<byte[], Row> createSplitReader(Split split) {
        return table.createSplitReader(split);
    }

    /* BatchWritable implementation */

    @Override
    public void write(byte[] bytes, Put put) {
        put(put);
    }

    private abstract class AbstractIndexScanner implements Scanner {
        // scanner over index table
        private final Scanner baseScanner;
        private final byte[] column;

        public AbstractIndexScanner(Scanner baseScanner, byte[] column) {
            this.baseScanner = baseScanner;
            this.column = column;
        }

        /**
         * checks if a particular column value matches a criteria defined by the implementing class
         *
         * @param columnValue the column to check for a match
         * @return false to indicate to skip the corresponding row
         */
        protected abstract boolean matches(byte[] columnValue);

        @Nullable
        @Override
        public Row next() {
            // TODO: retrieve results in batches to minimize RPC overhead (requires multi-get support in table)
            // keep going until we hit a non-null, non-empty data row, or we exhaust the index
            for (Row indexRow = baseScanner.next(); indexRow != null; indexRow = baseScanner.next()) {
                byte[] rowkey = indexRow.get(IDX_COL);
                if (rowkey == null) {
                    LOG.warn("Row of Indexed table '{}' is missing index column. Row key: {}", getName(),
                            indexRow.getRow());
                    continue;
                }
                byte[] columnValue = Arrays.copyOfRange(indexRow.getRow(), column.length + 1,
                        indexRow.getRow().length - rowkey.length - 1);
                // Verify that datarow matches the expected row key to avoid issues with column name or value
                // containing the delimiter used. This is a sufficient check, as long as columns don't contain the null byte.
                if (matches(columnValue)) {
                    Row row = table.get(rowkey);
                    // If a column has null byte (the key delimiter) in it, then we need to check against the data row's column
                    // to be sure this row isn't a false positive in the scan.
                    // For reference, take a look at IndexedTableTest#testIndexKeyDelimiterAmbiguity
                    if (hasColumnWithDelimiter && !Bytes.equals(row.get(column), columnValue)) {
                        continue;
                    }
                    return row;
                }
            }
            // end of index
            return null;
        }

        @Override
        public void close() {
            baseScanner.close();
        }
    }

    // scanner that matches column values based upon exact match
    private class IndexScanner extends AbstractIndexScanner {
        private final byte[] value;

        public IndexScanner(Scanner baseScanner, byte[] column, byte[] value) {
            super(baseScanner, column);
            this.value = value;
        }

        @Override
        protected boolean matches(byte[] columnValue) {
            return Bytes.equals(columnValue, value);
        }
    }

    // scanner that matches column values based upon range
    private class IndexRangeScanner extends AbstractIndexScanner {
        private final byte[] start;
        private final byte[] end;

        public IndexRangeScanner(Scanner baseScanner, byte[] column, @Nullable byte[] start, @Nullable byte[] end) {
            super(baseScanner, column);
            this.start = start;
            this.end = end;
        }

        @Override
        protected boolean matches(byte[] columnValue) {
            return (start == null || Bytes.compareTo(columnValue, start) >= 0)
                    && (end == null || Bytes.compareTo(columnValue, end) < 0);
        }
    }
}