org.archive.hadoop.pig.HBaseStorage.java Source code

Introduction

Here is the source code for org.archive.hadoop.pig.HBaseStorage.java. The class is the Internet Archive's copy of Pig's org.apache.pig.backend.hadoop.hbase.HBaseStorage (an HBase-backed LoadFunc and StoreFunc), modified so that a Pig script can supply a per-row timestamp when storing; the changes are marked with <IA-MOD> comments in the listing below.
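
As a rough sketch of the constructor contract (the column names, row key, and the class name HBaseStorageExample below are hypothetical; in practice Pig instantiates the class from the arguments given in a script's LOAD or STORE clause):

import java.io.IOException;
import org.apache.commons.cli.ParseException;
import org.archive.hadoop.pig.HBaseStorage;

public class HBaseStorageExample {
    public static void main(String[] args) throws ParseException, IOException {
        // First argument: space-delimited family:qualifier columns; a column literally named
        // "timestamp" marks the tuple field that supplies the Put timestamp on store.
        // Second argument: the option string parsed by the constructor in the listing below.
        HBaseStorage storage = new HBaseStorage(
                "info:name info:age",                   // hypothetical columns to load or store
                "-loadKey -caching 500 -gt row_0100");  // prepend row key, cache 500 rows, key > "row_0100"
    }
}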

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.archive.hadoop.pig;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadPushDown;
import org.apache.pig.LoadStoreCaster;
import org.apache.pig.OrderedLoadFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.hbase.HBaseBinaryConverter;
import org.apache.pig.backend.hadoop.hbase.HBaseTableInputFormat.HBaseTableIFBuilder;
import org.apache.pig.builtin.Utf8StorageConverter;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

import com.google.common.collect.Lists;

/**
 * An HBase implementation of LoadFunc and StoreFunc
 * 
 * TODO(dmitriy) test that all this stuff works
 * TODO(dmitriy) documentation
 * 
 * Note (Kenji Nagahshi): this is a copy of org.apache.pig.backend.hadoop.hbase.HBaseStorage with a
 * small modification that allows a Pig script to pass a timestamp for each row. Currently,
 * timestamp support is implemented for write (store) only.
 */
public class HBaseStorage extends LoadFunc implements StoreFuncInterface, LoadPushDown, OrderedLoadFunc {

    private static final Log LOG = LogFactory.getLog(HBaseStorage.class);

    private final static String STRING_CASTER = "UTF8StorageConverter";
    private final static String BYTE_CASTER = "HBaseBinaryConverter";
    private final static String CASTER_PROPERTY = "pig.hbase.caster";

    private List<byte[][]> columnList_ = Lists.newArrayList();
    private HTable m_table;
    private Configuration m_conf;
    private RecordReader reader;
    private RecordWriter writer;
    private Scan scan;
    private String contextSignature = null;

    private final CommandLine configuredOptions_;
    private final static Options validOptions_ = new Options();
    private final static CommandLineParser parser_ = new GnuParser();
    private boolean loadRowKey_;
    private final long limit_;
    private final int caching_;

    protected transient byte[] gt_;
    protected transient byte[] gte_;
    protected transient byte[] lt_;
    protected transient byte[] lte_;

    private LoadCaster caster_;

    private ResourceSchema schema_;

    private RequiredFieldList requiredFieldList;

    private boolean initialized = false;

    // <IA-MOD>
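    // Index within the constructor's column list of the column named "timestamp", or -1 if none
    // was declared. On store, the tuple value at this position supplies the Put timestamp and is
    // skipped as a stored column (see putNext below).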
    private int timestampColumn = -1;
    // </IA-MOD>

    private static void populateValidOptions() {
        validOptions_.addOption("loadKey", false, "Load Key");
        validOptions_.addOption("gt", true,
                "Records must be greater than this value " + "(binary, double-slash-escaped)");
        validOptions_.addOption("lt", true, "Records must be less than this value (binary, double-slash-escaped)");
        validOptions_.addOption("gte", true, "Records must be greater than or equal to this value");
        validOptions_.addOption("lte", true, "Records must be less than or equal to this value");
        validOptions_.addOption("caching", true, "Number of rows scanners should cache");
        validOptions_.addOption("limit", true, "Per-region limit");
        validOptions_.addOption("caster", true, "Caster to use for converting values. A class name, "
                + "HBaseBinaryConverter, or Utf8StorageConverter. For storage, casters must implement LoadStoreCaster.");
    }

    /**
     * Constructor. Construct an HBase Table LoadFunc and StoreFunc to load or store the cells of the
     * provided columns.
     * 
     * @param columnList
     *            columns to load or store, presented as a space-delimited string.
     * @throws ParseException when unable to parse arguments
     * @throws IOException 
     */
    public HBaseStorage(String columnList) throws ParseException, IOException {
        this(columnList, "");
    }

    /**
     * Constructor. Construct an HBase Table LoadFunc and StoreFunc to load or store.
     * @param columnList
     * @param optString Loader options. Known options:<ul>
     * <li>-loadKey  Load the row key as the first column (flag; takes no value)
     * <li>-gt=minKeyVal
     * <li>-lt=maxKeyVal 
     * <li>-gte=minKeyVal
     * <li>-lte=maxKeyVal
     * <li>-limit=numRowsPerRegion max number of rows to retrieve per region
     * <li>-caching=numRows  number of rows to cache (faster scans, more memory).
     * </ul>
     * @throws ParseException 
     * @throws IOException 
     */
    public HBaseStorage(String columnList, String optString) throws ParseException, IOException {
        populateValidOptions();
        String[] colNames = columnList.split(" ");
        String[] optsArr = optString.split(" ");
        try {
            configuredOptions_ = parser_.parse(validOptions_, optsArr);
        } catch (ParseException e) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("[-loadKey] [-gt] [-gte] [-lt] [-lte] [-limit] [-caching] [-caster]", validOptions_);
            throw e;
        }

        loadRowKey_ = configuredOptions_.hasOption("loadKey");
        // <IA-MOD>
        int i = 0;
        for (String colName : colNames) {
            if (colName.equals("timestamp"))
                timestampColumn = i;
            columnList_.add(Bytes.toByteArrays(colName.split(":")));
            i++;
        }
        // </IA-MOD>

        m_conf = HBaseConfiguration.create();
        String defaultCaster = m_conf.get(CASTER_PROPERTY, STRING_CASTER);
        String casterOption = configuredOptions_.getOptionValue("caster", defaultCaster);
        if (STRING_CASTER.equalsIgnoreCase(casterOption)) {
            caster_ = new Utf8StorageConverter();
        } else if (BYTE_CASTER.equalsIgnoreCase(casterOption)) {
            caster_ = new HBaseBinaryConverter();
        } else {
            try {
                @SuppressWarnings("unchecked")
                Class<LoadCaster> casterClass = (Class<LoadCaster>) Class.forName(casterOption);
                caster_ = casterClass.newInstance();
            } catch (ClassCastException e) {
                LOG.error("Congifured caster does not implement LoadCaster interface.");
                throw new IOException(e);
            } catch (ClassNotFoundException e) {
                LOG.error("Configured caster class not found.", e);
                throw new IOException(e);
            } catch (InstantiationException e) {
                LOG.error("Unable to instantiate configured caster " + casterOption, e);
                throw new IOException(e);
            } catch (IllegalAccessException e) {
                LOG.error("Illegal Access Exception for configured caster " + casterOption, e);
                throw new IOException(e);
            }
        }

        caching_ = Integer.valueOf(configuredOptions_.getOptionValue("caching", "100"));
        limit_ = Long.valueOf(configuredOptions_.getOptionValue("limit", "-1"));
        initScan();
    }

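    /** Build the Scan and translate any -gt/-lt/-gte/-lte options into binary row-key filters. */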
    private void initScan() {
        scan = new Scan();
        // Set filters, if any.
        if (configuredOptions_.hasOption("gt")) {
            gt_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("gt")));
            addFilter(CompareOp.GREATER, gt_);
        }
        if (configuredOptions_.hasOption("lt")) {
            lt_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("lt")));
            addFilter(CompareOp.LESS, lt_);
        }
        if (configuredOptions_.hasOption("gte")) {
            gte_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("gte")));
            addFilter(CompareOp.GREATER_OR_EQUAL, gte_);
        }
        if (configuredOptions_.hasOption("lte")) {
            lte_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("lte")));
            addFilter(CompareOp.LESS_OR_EQUAL, lte_);
        }
    }

    private void addFilter(CompareOp op, byte[] val) {
        LOG.info("Adding filter " + op.toString() + " with value " + Bytes.toStringBinary(val));
        FilterList scanFilter = (FilterList) scan.getFilter();
        if (scanFilter == null) {
            scanFilter = new FilterList();
        }
        scanFilter.addFilter(new RowFilter(op, new BinaryComparator(val)));
        scan.setFilter(scanFilter);
    }

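    // On the first call, restore any projection pushed down by Pig from the UDFContext, then turn
    // each HBase Result into a Pig tuple of DataByteArray cells (row key first when -loadKey was
    // given); missing cells become nulls.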
    @Override
    public Tuple getNext() throws IOException {
        try {
            if (!initialized) {
                Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                        new String[] { contextSignature });

                String projectedFields = p.getProperty(contextSignature + "_projectedFields");
                if (projectedFields != null) {
                    requiredFieldList = (RequiredFieldList) ObjectSerializer.deserialize(projectedFields);
                    pushProjection(requiredFieldList);
                }
                initialized = true;
            }
            if (reader.nextKeyValue()) {
                ImmutableBytesWritable rowKey = (ImmutableBytesWritable) reader.getCurrentKey();
                Result result = (Result) reader.getCurrentValue();
                int tupleSize = columnList_.size();
                if (loadRowKey_) {
                    tupleSize++;
                }
                Tuple tuple = TupleFactory.getInstance().newTuple(tupleSize);

                int startIndex = 0;
                if (loadRowKey_) {
                    tuple.set(0, new DataByteArray(rowKey.get()));
                    startIndex++;
                }
                for (int i = 0; i < columnList_.size(); ++i) {
                    byte[] cell = result.getValue(columnList_.get(i)[0], columnList_.get(i)[1]);
                    if (cell != null)
                        tuple.set(i + startIndex, new DataByteArray(cell));
                    else
                        tuple.set(i + startIndex, null);
                }
                return tuple;
            }
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
        return null;
    }

    @Override
    public InputFormat getInputFormat() {
        TableInputFormat inputFormat = new HBaseTableIFBuilder().withLimit(limit_).withGt(gt_).withGte(gte_)
                .withLt(lt_).withLte(lte_).withConf(m_conf).build();
        return inputFormat;
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        this.reader = reader;
    }

    @Override
    public void setUDFContextSignature(String signature) {
        this.contextSignature = signature;
    }

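    // Configure the input side: point TableInputFormat at the target table (an optional "hbase://"
    // prefix is stripped), ship the HBase/ZooKeeper/Guava jars with the job, apply scanner caching
    // to the table, and serialize the Scan (columns and row filters) into the job configuration
    // unless one is already set.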
    @Override
    public void setLocation(String location, Job job) throws IOException {
        job.getConfiguration().setBoolean("pig.noSplitCombination", true);
        m_conf = job.getConfiguration();
        HBaseConfiguration.addHbaseResources(m_conf);

        // Make sure the HBase, ZooKeeper, and Guava jars get shipped.
        TableMapReduceUtil.addDependencyJars(job.getConfiguration(), org.apache.hadoop.hbase.client.HTable.class,
                com.google.common.collect.Lists.class, org.apache.zookeeper.ZooKeeper.class);

        String tablename = location;
        if (location.startsWith("hbase://")) {
            tablename = location.substring(8);
        }
        if (m_table == null) {
            m_table = new HTable(m_conf, tablename);
        }
        m_table.setScannerCaching(caching_);
        m_conf.set(TableInputFormat.INPUT_TABLE, tablename);

        // Set up scan if it is not already set up.
        if (m_conf.get(TableInputFormat.SCAN) != null) {
            return;
        }

        for (byte[][] col : columnList_) {
            scan.addColumn(col[0], col[1]);
        }
        if (requiredFieldList != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                    new String[] { contextSignature });
            p.setProperty(contextSignature + "_projectedFields", ObjectSerializer.serialize(requiredFieldList));
        }
        m_conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    }

    @Override
    public String relativeToAbsolutePath(String location, Path curDir) throws IOException {
        return location;
    }

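    /** Serialize the Scan to a Base64 string so it can be carried in the job configuration. */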
    private static String convertScanToString(Scan scan) {
        try {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            DataOutputStream dos = new DataOutputStream(out);
            scan.write(dos);
            return Base64.encodeBytes(out.toByteArray());
        } catch (IOException e) {
            LOG.error(e);
            return "";
        }

    }

    /**
     * Set up the caster to use for reading values out of, and writing to, HBase. 
     */
    @Override
    public LoadCaster getLoadCaster() throws IOException {
        return caster_;
    }

    /*
     * StoreFunc Methods
     * @see org.apache.pig.StoreFuncInterface#getOutputFormat()
     */

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        TableOutputFormat outputFormat = new TableOutputFormat();
        HBaseConfiguration.addHbaseResources(m_conf);
        outputFormat.setConf(m_conf);
        return outputFormat;
    }

    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
        if (!(caster_ instanceof LoadStoreCaster)) {
            LOG.error("Caster must implement LoadStoreCaster for writing to HBase.");
            throw new IOException("Bad Caster " + caster_.getClass());
        }
        schema_ = s;
    }

    // Suppressing rawtypes warnings for RecordWriter, which is not parameterized by StoreFuncInterface
    @Override
    public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
        this.writer = writer;
    }

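    // Convert a tuple into an HBase Put keyed by the first field. If a "timestamp" column was
    // declared (IA modification), the value at that position supplies the Put timestamp and is not
    // written out as a column; all other fields are converted to bytes and added under their
    // corresponding columns.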
    // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
    @SuppressWarnings("unchecked")
    @Override
    public void putNext(Tuple t) throws IOException {
        if (!initialized) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                    new String[] { contextSignature });
            String serializedSchema = p.getProperty(contextSignature + "_schema");
            if (serializedSchema != null) {
                schema_ = (ResourceSchema) ObjectSerializer.deserialize(serializedSchema);
            }
            initialized = true;
        }
        ResourceFieldSchema[] fieldSchemas = (schema_ == null) ? null : schema_.getFields();
        Put put = new Put(objToBytes(t.get(0),
                (fieldSchemas == null) ? DataType.findType(t.get(0)) : fieldSchemas[0].getType()));
        long ts = System.currentTimeMillis();
        // <IA-MOD>
        if (timestampColumn >= 0) {
            ts = (Long) t.get(timestampColumn + 1);
        }
        // </IA-MOD>

        for (int i = 1; i < t.size(); ++i) {
            // <IA-MOD>
            if (i - 1 == timestampColumn)
                continue;
            // </IA-MOD>
            put.add(columnList_.get(i - 1)[0], columnList_.get(i - 1)[1], ts, objToBytes(t.get(i),
                    (fieldSchemas == null) ? DataType.findType(t.get(i)) : fieldSchemas[i].getType()));
        }
        try {
            writer.write(null, put);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

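    /** Convert one Pig value to bytes, dispatching on its DataType (non-bytearray types go through the configured LoadStoreCaster). */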
    @SuppressWarnings("unchecked")
    private byte[] objToBytes(Object o, byte type) throws IOException {
        LoadStoreCaster caster = (LoadStoreCaster) caster_;
        if (o == null)
            return null;
        switch (type) {
        case DataType.BYTEARRAY:
            return ((DataByteArray) o).get();
        case DataType.BAG:
            return caster.toBytes((DataBag) o);
        case DataType.CHARARRAY:
            return caster.toBytes((String) o);
        case DataType.DOUBLE:
            return caster.toBytes((Double) o);
        case DataType.FLOAT:
            return caster.toBytes((Float) o);
        case DataType.INTEGER:
            return caster.toBytes((Integer) o);
        case DataType.LONG:
            return caster.toBytes((Long) o);

        // The type conversion here is unchecked. 
        // Relying on DataType.findType to do the right thing.
        case DataType.MAP:
            return caster.toBytes((Map<String, Object>) o);

        case DataType.NULL:
            return null;
        case DataType.TUPLE:
            return caster.toBytes((Tuple) o);
        case DataType.ERROR:
            throw new IOException("Unable to determine type of " + o.getClass());
        default:
            throw new IOException("Unable to find a converter for tuple field " + o);
        }
    }

    @Override
    public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
        return location;
    }

    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        this.contextSignature = signature;
    }

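    // Configure the output side: set the target table for TableOutputFormat (stripping an optional
    // "hbase://" prefix) and stash the serialized schema in the UDFContext so that putNext can
    // recover it on the backend.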
    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        if (location.startsWith("hbase://")) {
            job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location.substring(8));
        } else {
            job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location);
        }
        Properties props = UDFContext.getUDFContext().getUDFProperties(getClass(),
                new String[] { contextSignature });
        if (!props.containsKey(contextSignature + "_schema")) {
            props.setProperty(contextSignature + "_schema", ObjectSerializer.serialize(schema_));
        }
        m_conf = HBaseConfiguration.addHbaseResources(job.getConfiguration());
    }

    @Override
    public void cleanupOnFailure(String location, Job job) throws IOException {
    }

    /*
     * LoadPushDown Methods.
     */

    @Override
    public List<OperatorSet> getFeatures() {
        return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
    }

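    // Record the projection pushed down by Pig and trim columnList_ to just the projected columns;
    // the row key is dropped from the load when it is not among the required fields.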
    @Override
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {

        // colOffset is the offset in our columnList that we need to apply to indexes we get from requiredFields
        // (row key is not a real column)
        int colOffset = loadRowKey_ ? 1 : 0;
        // projOffset is the offset to the requiredFieldList we need to apply when figuring out which columns to prune.
        // (if key is pruned, we should skip row key's element in this list when trimming colList)
        int projOffset = colOffset;

        this.requiredFieldList = requiredFieldList;
        List<RequiredField> requiredFields = requiredFieldList.getFields();
        if (requiredFields.size() > (columnList_.size() + colOffset)) {
            throw new FrontendException(
                    "The list of columns to project from HBase is larger than HBaseStorage is configured to load.");
        }

        List<byte[][]> newColumns = Lists.newArrayListWithExpectedSize(requiredFields.size());

        // HBase Row Key is the first column in the schema when it's loaded, 
        // and is not included in the columnList (since it's not a proper column).
        if (loadRowKey_ && (requiredFields.size() < 1 || requiredFields.get(0).getIndex() != 0)) {
            loadRowKey_ = false;
            projOffset = 0;
        }

        for (int i = projOffset; i < requiredFields.size(); i++) {
            int fieldIndex = requiredFields.get(i).getIndex();
            newColumns.add(columnList_.get(fieldIndex - colOffset));
        }
        columnList_ = newColumns;
        return new RequiredFieldResponse(true);
    }

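    // OrderedLoadFunc support: wrap the input split in a WritableComparable that delegates
    // ordering to TableSplit.compareTo.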
    @Override
    public WritableComparable<InputSplit> getSplitComparable(InputSplit split) throws IOException {
        return new WritableComparable<InputSplit>() {
            TableSplit tsplit = new TableSplit();

            @Override
            public void readFields(DataInput in) throws IOException {
                tsplit.readFields(in);
            }

            @Override
            public void write(DataOutput out) throws IOException {
                tsplit.write(out);
            }

            @Override
            public int compareTo(InputSplit split) {
                return tsplit.compareTo((TableSplit) split);
            }
        };
    }

}