/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package com.twitter.elephantbird.pig.load;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.StringUtils;
import org.apache.pig.Slice;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import com.google.common.collect.Maps;
import com.twitter.elephantbird.pig.util.PigCounterHelper;
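// Context note (not in the original source): org.apache.pig.Slice belongs to
// Pig's pre-0.7 load/store API and was removed in the later InputSplit-based
// loader redesign; the HBase calls below (HBaseConfiguration constructor,
// HTable, Scan.addColumns) likewise target an older HBase client API.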
/**
 * HBase Slice to load a portion of a range of a table. The key range is
 * [start, end). Modeled on org.apache.hadoop.hbase.mapred.TableSplit.
 */
public class HBaseSlice implements Slice {

  /** A Generated Serial Version UID **/
  private static final long serialVersionUID = 9035916017187148965L;

  private static final Log LOG = LogFactory.getLog(HBaseSlice.class);

  private transient PigCounterHelper counterHelper_;

  // assigned during construction
  /** Table Name **/
  private final byte[] tableName_;
  /** Table Start Row **/
  private final byte[] startRow_;
  /** Table End Row **/
  private final byte[] endRow_;
  /** Table Region Location **/
  private final String regionLocation_;
  /** Input Columns **/
  private final byte[][] inputColumns_;
  /** Whether the row key should be loaded **/
  private final boolean loadRowKey_;
  /** BigInteger representations of the row range */
  private final BigInteger bigStart_;
  private final BigInteger bigEnd_;
  private final BigDecimal bigRange_;
  private Map<CompareFilter.CompareOp, String> innerFilters_ = Maps.newHashMap();
  private long limit_ = -1;

  // created as part of init
  /** The connection to the table in HBase **/
  private transient HTable m_table;
  /** The scanner over the table **/
  private transient ResultScanner m_scanner;
  private transient long seenRows_ = 0;
  private transient ArrayList<Object> mProtoTuple;

  /**
   * Records the last processed row, so that we can restart the scanner when
   * an exception occurs while scanning the table.
   */
  private transient byte[] m_lastRow_;

  /**
   * Constructor.
   *
   * @param tableName
   *          table name
   * @param startRow
   *          start row, inclusive
   * @param endRow
   *          end row, exclusive
   * @param inputColumns
   *          input columns
   * @param loadRowKey
   *          whether to prepend the row key to each tuple
   * @param location
   *          region location
   */
  public HBaseSlice(byte[] tableName, byte[] startRow, byte[] endRow,
      byte[][] inputColumns, boolean loadRowKey, final String location) {
    tableName_ = tableName;
    startRow_ = startRow;
    endRow_ = endRow;
    inputColumns_ = inputColumns;
    regionLocation_ = location;
    loadRowKey_ = loadRowKey;

    // We have to deal with different byte lengths of keys producing very
    // different BigIntegers (big-endianness is great this way), so pad the
    // shorter key. The code is mostly cribbed from HBase's Bytes class.
    byte[] startPadded;
    byte[] endPadded;
    if (startRow.length < endRow.length) {
      startPadded = Bytes.padTail(startRow, endRow.length - startRow.length);
      endPadded = endRow;
    } else if (endRow.length < startRow.length) {
      startPadded = startRow;
      endPadded = Bytes.padTail(endRow, startRow.length - endRow.length);
    } else {
      startPadded = startRow;
      endPadded = endRow;
    }
    byte[] prependHeader = { 1, 0 };
    bigStart_ = new BigInteger(Bytes.add(prependHeader, startPadded));
    bigEnd_ = new BigInteger(Bytes.add(prependHeader, endPadded));
    bigRange_ = new BigDecimal(bigEnd_.subtract(bigStart_));
  }
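  /*
   * Worked example of the arithmetic above (all values hypothetical): with
   * startRow = {0x0A} and endRow = {0x0A, 0x64}, the shorter key is
   * right-padded with zeros to {0x0A, 0x00} so both keys have equal length.
   * Prepending the {1, 0} header guarantees a positive, equal-magnitude
   * BigInteger for each key: bigStart_ = 0x01000A00 = 16779776 and
   * bigEnd_ = 0x01000A64 = 16779876, so bigRange_ = 100. A last-seen row of
   * {0x0A, 0x32} maps to 16779826, which getProgress() below reports as
   * (16779826 - 16779776) / 100 = 0.5.
   */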
  public void addFilter(CompareFilter.CompareOp compareOp, String filterValue) {
    innerFilters_.put(compareOp, filterValue);
  }

  /** @return table name */
  public byte[] getTableName() {
    return this.tableName_;
  }

  /** @return starting row key */
  public byte[] getStartRow() {
    return this.startRow_;
  }

  /** @return end row key */
  public byte[] getEndRow() {
    return this.endRow_;
  }

  /** @return input columns */
  public byte[][] getInputColumns() {
    return this.inputColumns_;
  }

  /** @return the region's hostname */
  public String getRegionLocation() {
    return this.regionLocation_;
  }

  @Override
  public long getStart() {
    // Not clear how to obtain this in a table...
    return 0;
  }

  @Override
  public long getLength() {
    // Not clear how to obtain this in a table...
    // it seems to be used only for sorting splits.
    return 0;
  }

  @Override
  public String[] getLocations() {
    return new String[] { regionLocation_ };
  }

  @Override
  public long getPos() throws IOException {
    // This should be the ordinal of the tuple in the range;
    // not clear how to calculate...
    return 0;
  }

  @Override
  public float getProgress() throws IOException {
    // No way to know the max; just return 0. Sorry, reporting on the last slice is janky.
    // So is reporting on the first slice, by the way -- it will start out too high, possibly at 100%.
    if (endRow_.length == 0)
      return 0;

    // Pad the last-seen key the same way the constructor padded the range endpoints.
    byte[] lastPadded = m_lastRow_;
    if (m_lastRow_.length < endRow_.length) {
      lastPadded = Bytes.padTail(m_lastRow_, endRow_.length - m_lastRow_.length);
    }
    if (m_lastRow_.length < startRow_.length) {
      lastPadded = Bytes.padTail(m_lastRow_, startRow_.length - m_lastRow_.length);
    }
    byte[] prependHeader = { 1, 0 };
    BigInteger bigLastRow = new BigInteger(Bytes.add(prependHeader, lastPadded));
    BigDecimal processed = new BigDecimal(bigLastRow.subtract(bigStart_));
    try {
      BigDecimal progress = processed.setScale(3).divide(bigRange_, BigDecimal.ROUND_HALF_DOWN);
      return progress.floatValue();
    } catch (java.lang.ArithmeticException e) {
      return 0;
    }
  }

  @Override
  public void init(DataStorage store) throws IOException {
    HBaseConfiguration conf = new HBaseConfiguration();
    // connect to the given table
    m_table = new HTable(conf, tableName_);
    // init the scanner
    initScanner();
  }

  /**
   * Init the table scanner.
   *
   * @throws IOException
   */
  private void initScanner() throws IOException {
    restart(startRow_);
    m_lastRow_ = startRow_;
  }

  /**
   * Restart scanning from survivable exceptions by creating a new scanner.
   *
   * @param startRow
   *          the start row
   * @throws IOException
   */
  private void restart(byte[] startRow) throws IOException {
    Scan scan;
    if ((endRow_ != null) && (endRow_.length > 0)) {
      scan = new Scan(startRow, endRow_);
    } else {
      scan = new Scan(startRow);
    }

    // Set filters, if any.
    FilterList scanFilter = null;
    if (!innerFilters_.isEmpty()) {
      scanFilter = new FilterList();
      for (Map.Entry<CompareFilter.CompareOp, String> entry : innerFilters_.entrySet()) {
        scanFilter.addFilter(new RowFilter(entry.getKey(),
            new BinaryComparator(Bytes.toBytesBinary(entry.getValue()))));
      }
      scan.setFilter(scanFilter);
    }

    scan.addColumns(inputColumns_);
    this.m_scanner = this.m_table.getScanner(scan);
  }
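  /*
   * Filter usage sketch (values hypothetical): calling, before init(),
   *   slice.addFilter(CompareFilter.CompareOp.GREATER_OR_EQUAL, "row0100");
   * makes restart() above wrap the value in a RowFilter with a
   * BinaryComparator, so region servers drop rows whose keys compare below
   * "row0100" before they ever reach next(). Note that because the filters
   * are keyed by CompareOp in a Map, only one filter per comparison operator
   * can be registered.
   */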
  @Override
  public boolean next(Tuple value) throws IOException {
    Result result;
    try {
      result = m_scanner.next();
    } catch (UnknownScannerException e) {
      // The scanner expired on the server; reopen it at the last row we saw.
      LOG.info("recovered from " + StringUtils.stringifyException(e));
      restart(m_lastRow_);
      if (m_lastRow_ != startRow_) {
        m_scanner.next(); // skip presumed already mapped row
      }
      result = this.m_scanner.next();
    }

    boolean hasMore = result != null && result.size() > 0
        && (limit_ < 0 || limit_ > seenRows_);
    if (hasMore) {
      if (counterHelper_ == null)
        counterHelper_ = new PigCounterHelper();
      counterHelper_.incrCounter(HBaseSlice.class.getName(),
          Bytes.toString(tableName_) + " rows read", 1);
      m_lastRow_ = result.getRow();
      convertResultToTuple(result, value);
      seenRows_ += 1;
    }
    return hasMore;
  }

  /**
   * Convert a row result to a tuple.
   *
   * @param result
   *          row result
   * @param tuple
   *          tuple
   */
  private void convertResultToTuple(Result result, Tuple tuple) {
    if (mProtoTuple == null)
      mProtoTuple = new ArrayList<Object>(inputColumns_.length + (loadRowKey_ ? 1 : 0));

    if (loadRowKey_) {
      mProtoTuple.add(new DataByteArray(result.getRow()));
    }
    for (byte[] column : inputColumns_) {
      byte[] value = result.getValue(column);
      if (value == null) {
        mProtoTuple.add(null);
      } else {
        mProtoTuple.add(new DataByteArray(value));
      }
    }
    Tuple newT = TupleFactory.getInstance().newTuple(mProtoTuple);
    mProtoTuple.clear();
    tuple.reference(newT);
  }

  @Override
  public void close() throws IOException {
    if (m_scanner != null) {
      m_scanner.close();
      m_scanner = null;
    }
  }

  @Override
  public String toString() {
    return regionLocation_ + ":" + Bytes.toString(startRow_) + ","
        + Bytes.toString(endRow_);
  }

  public void setLimit(String limit) {
    LOG.info("Setting Slice limit to " + Long.valueOf(limit));
    limit_ = Long.valueOf(limit);
  }
}
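/*
 * A minimal usage sketch, not part of the original class. Pig normally
 * constructs and drives Slice objects itself via a Slicer implementation;
 * this class only illustrates the init()/next()/close() lifecycle directly.
 * The table name "demo_table", column "cf:col1", and host "localhost" are
 * hypothetical, and a reachable HBase instance is assumed.
 */
class HBaseSliceExample {
  public static void main(String[] args) throws IOException {
    byte[] table = Bytes.toBytes("demo_table"); // hypothetical table
    byte[][] columns = { Bytes.toBytes("cf:col1") }; // hypothetical column
    // Empty start/end rows scan the whole table; loadRowKey = true prepends
    // the row key as the first tuple field.
    HBaseSlice slice = new HBaseSlice(table, new byte[0], new byte[0],
        columns, true, "localhost");
    slice.init(null); // the DataStorage argument is unused by init()
    Tuple tuple = TupleFactory.getInstance().newTuple();
    while (slice.next(tuple)) {
      System.out.println(tuple); // each field is a DataByteArray (or null)
    }
    slice.close();
  }
}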