uis.cipsi.rdd.opentsdb.TSDBInputFormat.java Source code

Introduction

Here is the source code for uis.cipsi.rdd.opentsdb.TSDBInputFormat.java, a Hadoop MapReduce InputFormat built on HBase's TableInputFormatBase that scans OpenTSDB tables, optionally filtering rows by metric, tag key/value UIDs, and time range.

Source

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package uis.cipsi.rdd.opentsdb;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

import com.google.protobuf.InvalidProtocolBufferException;

/**
 * Convert HBase tabular data into a format that is consumable by Map/Reduce.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class TSDBInputFormat extends TableInputFormatBase implements Configurable {

    /**
     * Converts the given Base64 string back into a Scan instance.
     *
     * @param base64
     *            The scan details.
     * @return The newly created Scan instance.
     * @throws IOException
     *             When reading the scan instance fails.
     */
    public static Scan convertStringToScan(String base64) throws IOException {
        byte[] decoded = Base64.decode(base64);
        ClientProtos.Scan scan;
        try {
            scan = ClientProtos.Scan.parseFrom(decoded);
        } catch (InvalidProtocolBufferException ipbe) {
            throw new IOException(ipbe);
        }

        return ProtobufUtil.toScan(scan);
    }

    private static final Log LOG = LogFactory.getLog(TSDBInputFormat.class);

    /** Job parameter that specifies the input table. */
    public static final String INPUT_TABLE = "hbase.mapreduce.inputtable";
    /**
     * If specified, use start keys of this table to split. This is useful when
     * you are preparing data for bulkload.
     */
    private static final String SPLIT_TABLE = "hbase.mapreduce.splittable";
    /**
     * Base-64 encoded scanner. All other SCAN_ confs are ignored if this is
     * specified. See
     * {@link org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#convertScanToString(Scan)}
     * for more details.
     */
    public static final String SCAN = "hbase.mapreduce.scan";
    /** Scan start row */
    public static final String SCAN_ROW_START = "hbase.mapreduce.scan.row.start";
    /** Scan stop row */
    public static final String SCAN_ROW_STOP = "hbase.mapreduce.scan.row.stop";
    /** Column Family to Scan */
    public static final String SCAN_COLUMN_FAMILY = "hbase.mapreduce.scan.column.family";
    /** Space delimited list of columns and column families to scan. */
    public static final String SCAN_COLUMNS = "hbase.mapreduce.scan.columns";
    /** The timestamp used to filter columns with a specific timestamp. */
    public static final String SCAN_TIMESTAMP = "hbase.mapreduce.scan.timestamp";

    /** The maximum number of versions to return. */
    public static final String SCAN_MAXVERSIONS = "hbase.mapreduce.scan.maxversions";
    /** Set to false to disable server-side caching of blocks for this scan. */
    public static final String SCAN_CACHEBLOCKS = "hbase.mapreduce.scan.cacheblocks";
    /** The number of rows for caching that will be passed to scanners. */
    public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";
    /** Set the maximum number of values to return for each call to next(). */
    public static final String SCAN_BATCHSIZE = "hbase.mapreduce.scan.batchsize";
    /** Specifies whether the input splits (map tasks) should be shuffled. */
    public static final String SHUFFLE_MAPS = "hbase.mapreduce.inputtable.shufflemaps";

    // OpenTSDB specific settings
    /**
     * Regex matching the metric and tag names (metrics|tagk|tagv) whose UIDs
     * should be looked up in the tsdb-uid table.
     */
    public static final String TSDB_UIDS = "net.opentsdb.tsdb.uid";
    /** The OpenTSDB metric to be retrieved. */
    public static final String METRICS = "net.opentsdb.rowkey";
    /** The tag key/value UIDs for the associated metric. */
    public static final String TAGKV = "net.opentsdb.tagkv";
    /** The scan start key. */
    public static final String TSDB_STARTKEY = "net.opentsdb.start";
    /** The scan end key. */
    public static final String TSDB_ENDKEY = "net.opentsdb.end";

    /** The start of the time range used to build the scan start row. */
    public static final String TSDB_TIMERANGE_START = "net.opentsdb.tsdb.scan.timerange.start";
    /** The end of the time range used to build the scan stop row. */
    public static final String TSDB_TIMERANGE_END = "net.opentsdb.tsdb.scan.timerange.end";

    /** The configuration. */
    private Configuration conf = null;

    /**
     * Returns the current configuration.
     *
     * @return The current configuration.
     * @see org.apache.hadoop.conf.Configurable#getConf()
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

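    /**
     * Converts a hex-escaped key such as "\x00\x01" into raw bytes by
     * stripping the "\x" escapes and parsing each remaining two-character
     * hex pair.
     */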
    public static byte[] hexStringToByteArray(String s) {
        s = s.replace("\\x", "");
        byte[] b = new byte[s.length() / 2];
        for (int i = 0; i < b.length; i++) {
            int index = i * 2;
            int v = Integer.parseInt(s.substring(index, index + 2), 16);
            b[i] = (byte) v;
        }
        return b;
    }

    /**
     * Sets the configuration. This is used to set the details for the table to
     * be scanned.
     *
     * @param configuration
     *            The configuration to set.
     * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
     */
    @Override
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION", justification = "Intentional")
    public void setConf(Configuration configuration) {
        this.conf = configuration;

        Scan scan = null;

        if (conf.get(SCAN) != null) {
            try {
                scan = convertStringToScan(conf.get(SCAN));
            } catch (IOException e) {
                LOG.error("An error occurred.", e);
            }
        } else {
            try {
                scan = new Scan();
                // Configuration for extracting the UIDs for the user specified
                // metric and tag names.
                if (conf.get(TSDB_UIDS) != null) {
                    // Get the UIDs for all specified column qualifiers
                    // (metrics|tagk|tagv).
                    String filter = String.format("^%s$", conf.get(TSDB_UIDS));
                    RegexStringComparator keyRegEx = new RegexStringComparator(filter);
                    RowFilter rowFilter = new RowFilter(CompareOp.EQUAL, keyRegEx);
                    scan.setFilter(rowFilter);
                } else {

                    // Configuration for extracting and filtering the required
                    // rows from the tsdb table.
                    String metrics = "";
                    String tags = "";
                    if (conf.get(METRICS) != null) {
                        String filter;
                        metrics = conf.get(METRICS);
                        if (conf.get(TAGKV) != null) {
                            // Extract rows matching a metric and its group of
                            // tags (alternative pattern: "^%s.{4}.*%s.*$").
                            filter = String.format("^%s.*%s.*$", conf.get(METRICS), conf.get(TAGKV));
                            tags = conf.get(TAGKV);
                        } else {
                            // Extract rows matching just the metric.
                            filter = String.format("^%s.+$", conf.get(METRICS));
                        }

                        RegexStringComparator keyRegEx = new RegexStringComparator(filter,
                                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
                        // keyRegEx.setCharset(Charset.forName("ISO-8859-1"));
                        RowFilter rowFilter = new RowFilter(CompareOp.EQUAL, keyRegEx);
                        scan.setFilter(rowFilter);
                    }
                    // Extract data for the supplied time range; if no range
                    // is provided, all data are extracted.
                    if (conf.get(TSDB_TIMERANGE_START) != null && conf.get(TSDB_TIMERANGE_END) != null) {
                        scan.setStartRow(hexStringToByteArray(metrics + conf.get(TSDB_TIMERANGE_START) + tags));
                        scan.setStopRow(hexStringToByteArray(metrics + conf.get(TSDB_TIMERANGE_END) + tags));
                    }
                }

                // False by default: full table scans generate too much
                // block-cache churn.
                scan.setCacheBlocks(conf.getBoolean(SCAN_CACHEBLOCKS, false));
            } catch (Exception e) {
                LOG.error(StringUtils.stringifyException(e));
            }
        }

        setScan(scan);
    }

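    /**
     * Initializes the underlying HBase table by opening a new connection from
     * the current configuration and binding it to the table named by
     * {@link #INPUT_TABLE}.
     */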
    @Override
    protected void initialize(JobContext context) throws IOException {
        // Do we have to worry about mismatches between the Configuration from
        // setConf and the one in this context?
        TableName tableName = TableName.valueOf(conf.get(INPUT_TABLE));
        try {
            initializeTable(ConnectionFactory.createConnection(new Configuration(conf)), tableName);
        } catch (Exception e) {
            LOG.error(StringUtils.stringifyException(e));
        }
    }

    /**
     * Parses a combined family and qualifier and adds either both or just the
     * family in case there is no qualifier. This assumes the older
     * colon-delimited notation, e.g. "family:qualifier".
     *
     * @param scan
     *            The Scan to update.
     * @param familyAndQualifier
     *            family and qualifier
     * @throws IllegalArgumentException
     *             When familyAndQualifier is invalid.
     */
    private static void addColumn(Scan scan, byte[] familyAndQualifier) {
        byte[][] fq = KeyValue.parseColumn(familyAndQualifier);
        if (fq.length == 1) {
            scan.addFamily(fq[0]);
        } else if (fq.length == 2) {
            scan.addColumn(fq[0], fq[1]);
        } else {
            throw new IllegalArgumentException("Invalid familyAndQualifier provided.");
        }
    }

    /**
     * Adds an array of columns specified using old format, family:qualifier.
     * <p>
     * Overrides previous calls to {@link Scan#addColumn(byte[], byte[])} for
     * any families in the input.
     *
     * @param scan
     *            The Scan to update.
     * @param columns
     *            array of columns, formatted as <code>family:qualifier</code>
     * @see Scan#addColumn(byte[], byte[])
     */
    public static void addColumns(Scan scan, byte[][] columns) {
        for (byte[] column : columns) {
            addColumn(scan, column);
        }
    }

    /**
     * Calculates the splits that will serve as input for the map tasks. The
     * number of splits matches the number of regions in a table. Splits are
     * shuffled if required.
     * 
     * @param context
     *            The current job context.
     * @return The list of input splits.
     * @throws IOException
     *             When creating the list of splits fails.
     * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        List<InputSplit> splits = super.getSplits(context);
        if (conf.getBoolean(SHUFFLE_MAPS, false)) {
            Collections.shuffle(splits);
        }
        return splits;
    }

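    /**
     * Returns the start and end keys used to create the splits. If
     * {@link #SPLIT_TABLE} is set, the region boundaries of that table are
     * used in place of the input table's.
     */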
    @Override
    protected Pair<byte[][], byte[][]> getStartEndKeys() throws IOException {
        if (conf.get(SPLIT_TABLE) != null) {
            TableName splitTableName = TableName.valueOf(conf.get(SPLIT_TABLE));
            try (Connection conn = ConnectionFactory.createConnection(getConf())) {
                try (RegionLocator rl = conn.getRegionLocator(splitTableName)) {
                    return rl.getStartEndKeys();
                }
            }
        }

        return super.getStartEndKeys();
    }

    /**
     * Sets the split table in a map-reduce job.
     */
    public static void configureSplitTable(Job job, TableName tableName) {
        job.getConfiguration().set(SPLIT_TABLE, tableName.getNameAsString());
    }
}
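
Usage

Below is a minimal sketch of how this InputFormat might be wired into a MapReduce job. The table name ("tsdb"), the hex-escaped metric UID, and the time-range values are hypothetical placeholders, not values from the source above; adjust them to match your own OpenTSDB installation.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Job;

import uis.cipsi.rdd.opentsdb.TSDBInputFormat;

public class TSDBInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();

        // Hypothetical values: the OpenTSDB data table and a metric UID
        // written in the "\x.." notation that hexStringToByteArray expects.
        conf.set(TSDBInputFormat.INPUT_TABLE, "tsdb");
        conf.set(TSDBInputFormat.METRICS, "\\x00\\x00\\x01");

        // Optional time range; setConf builds the scan start/stop rows as
        // metric UID + timestamp (+ tags).
        conf.set(TSDBInputFormat.TSDB_TIMERANGE_START, "\\x55\\x00\\x00\\x00");
        conf.set(TSDBInputFormat.TSDB_TIMERANGE_END, "\\x56\\x00\\x00\\x00");

        Job job = Job.getInstance(conf, "tsdb-scan");
        job.setJarByClass(TSDBInputFormatExample.class);
        job.setInputFormatClass(TSDBInputFormat.class);

        // TableInputFormatBase emits HBase row keys and full rows.
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Result.class);

        // ... configure a TableMapper and an output format, then submit.
        job.waitForCompletion(true);
    }
}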