com.ery.hadoop.mrddx.hbase.HbaseInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.ery.hadoop.mrddx.hbase.HbaseInputFormat.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ery.hadoop.mrddx.hbase;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.ery.hadoop.mrddx.DBGroupReducer;
import com.ery.hadoop.mrddx.DBMapper;
import com.ery.hadoop.mrddx.DBRecord;
import com.ery.hadoop.mrddx.IHandleFormat;
import com.ery.hadoop.mrddx.db.mapreduce.DBWritable;
import com.ery.hadoop.mrddx.log.MRLog;
import com.ery.hadoop.mrddx.util.StringUtil;

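/**
 * Input format that reads records from an HBase table, producing one input
 * split per region location that covers the requested row-key range.
 *
 * @createDate 2013-1-4
 * @version v1.0
 * @param <T> the {@link HbaseWritable} record type produced by the record reader
 */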
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HbaseInputFormat<T extends HbaseWritable> extends InputFormat<LongWritable, T>
        implements Configurable, IHandleFormat {
    // Logger shared by this input format and its nested split class.
    public static final Log LOG = LogFactory.getLog(HbaseInputFormat.class);

    /**
     * HBase input settings; created from the Hadoop configuration in
     * {@link #setConf(Configuration)}.
     */
    private HbaseConfiguration dbConf;

    /**
     * Returns the Hadoop configuration held by the current HBase input settings.
     *
     * @return the configuration obtained from {@code dbConf}
     */
    public Configuration getConf() {
        return this.dbConf.getConf();
    }

    /**
     * Returns the HBase input settings object used by this input format.
     *
     * @return the current {@code HbaseConfiguration}; whatever
     *         {@link #setConf(Configuration)} last stored
     */
    public HbaseConfiguration getDBConf() {
        return this.dbConf;
    }

    /**
     * Wraps the given Hadoop configuration in an input-flavoured
     * {@code HbaseConfiguration} and stores it on this instance.
     *
     * @param conf the Hadoop configuration to wrap
     */
    @Override
    public void setConf(Configuration conf) {
        HbaseConfiguration inputSettings = new HbaseConfiguration(conf, HbaseConfiguration.FLAG_HBASE_INPUT);
        this.dbConf = inputSettings;
    }

    /**
     * Initializes the map-part of the job with the appropriate input settings:
     * registers this class as the job's input format and records the input
     * class, table name and column relation in the job configuration.
     *
     * @param job The map-reduce job
     * @param inputClass the record class stored as the input class
     * @param tableName the name of the HBase table to read
     * @param srcTargetFieldNames the value stored as the HBase column relation
     */
    public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
            String srcTargetFieldNames) {
        job.setInputFormatClass(HbaseInputFormat.class);

        Configuration configuration = job.getConfiguration();
        HbaseConfiguration inputSettings = new HbaseConfiguration(configuration,
                HbaseConfiguration.FLAG_HBASE_INPUT);
        inputSettings.setInputClass(inputClass);
        inputSettings.setInputTableName(tableName);
        inputSettings.setInputHBaseColumnRelation(srcTargetFieldNames);
    }

    /**
     * Stores the HBase query conditions in the given configuration.
     *
     * @param job the configuration that receives the settings
     * @param timerange time range as long values; converted to strings before
     *            being stored
     * @param startrow the start row of the query
     * @param stoprow the stop row of the query
     * @param timestamp the query timestamp
     * @param filters the query filters
     * @param familyColumns the family/column entries to query
     * @param familys the column families to query
     */
    public static void setInputQueryCondition(Configuration job, long[] timerange, String startrow, String stoprow,
            long timestamp, String[] filters, String[] familyColumns, String[] familys) {
        HbaseConfiguration inputSettings = new HbaseConfiguration(job, HbaseConfiguration.FLAG_HBASE_INPUT);

        // Time-related conditions.
        inputSettings.setInputHBaseQueryTimerange(StringUtil.valueOfLongToString(timerange));

        // Row-key range.
        inputSettings.setInputHBaseQueryStartRow(startrow);
        inputSettings.setInputHBaseQueryStopRow(stoprow);

        inputSettings.setInputHBaseQueryTimestamp(timestamp);

        // Filters and column selection.
        inputSettings.setInputHBaseQueryFilters(filters);
        inputSettings.setInputHBaseQueryFamilyColumns(familyColumns);
        inputSettings.setInputHBaseQueryFamilys(familys);
    }

    /**
     * Counts the region locations of the configured input table that fall in
     * the given row-key range, after validating the configured query
     * conditions. The full region list and the matched region list are both
     * written to the log before returning.
     *
     * @param job the job configuration holding the HBase input settings
     * @param startKey first row key of the range, may be {@code null}
     * @param endKey last row key of the range, may be {@code null}
     * @return the number of matching region locations (always at least one)
     * @throws Exception an {@link IOException} when the table name is missing,
     *             validation fails or no region matches; anything thrown by
     *             the HBase client is propagated
     */
    public static int getTableHRegionInfoCount(Configuration job, String startKey, String endKey) throws Exception {
        HbaseConfiguration inputSettings = new HbaseConfiguration(job, HbaseConfiguration.FLAG_HBASE_INPUT);
        String tableName = inputSettings.getInputTableName();

        String failure = null;
        List<HRegionLocation> regions = null;
        if (null == tableName) {
            failure = "The name of table is null!";
        } else if (!validateCondition(inputSettings, tableName)) {
            failure = "validate condition error!";
        } else {
            regions = getTableHRegionInfo(job, tableName, startKey, endKey);
            if (null == regions || regions.isEmpty()) {
                failure = "The account of table'regionInfo is zero!";
            }
        }

        if (null != failure) {
            MRLog.error(LOG, failure);
            throw new IOException(failure);
        }

        // Log every region of the table, then the regions actually requested.
        printTableAllRegionInfo(job, tableName);
        printTableRequestRegionInfo(regions, tableName);

        return regions.size();
    }

    /**
     * Looks up the region locations of a table that cover a row-key range.
     * <p>
     * When both keys are {@code null} the range runs from the table's first
     * start key to its last end key. When only one key is {@code null}, the
     * missing side falls back to the table's first start key or last end key
     * respectively. Keys are converted between {@code String} and
     * {@code byte[]} with the platform default charset.
     * <p>
     * The {@link HTable} opened for the lookup is always closed before this
     * method returns.
     *
     * @param job the configuration used to connect to HBase
     * @param tableName the name of the table to inspect
     * @param startKey first row key of the range, or {@code null}
     * @param endKey last row key of the range, or {@code null}
     * @return the region locations returned by the HBase client for the range
     * @throws IOException if the table cannot be opened, queried or closed
     */
    public static List<HRegionLocation> getTableHRegionInfo(Configuration job, String tableName, String startKey,
            String endKey) throws IOException {
        HTable table = new HTable(job, tableName);
        try {
            byte[][] startKeys = table.getStartKeys();
            byte[][] endKeys = table.getEndKeys();

            // Case 1: no bounds requested - cover the whole table.
            if (null == startKey && null == endKey) {
                return table.getRegionsInRange(startKeys[0], endKeys[endKeys.length - 1]);
            }

            // Case 2: at least one bound requested - default the missing side
            // to the corresponding table boundary.
            if (startKey == null) {
                startKey = new String(startKeys[0]);
            }
            if (endKey == null) {
                endKey = new String(endKeys[endKeys.length - 1]);
            }

            return table.getRegionsInRange(startKey.getBytes(), endKey.getBytes());
        } finally {
            // Release the client-side table resources; the original code leaked them.
            table.close();
        }
    }

    /**
     * Validates the configured family/column query conditions against the
     * descriptor of the given table.
     * <p>
     * Each configured family-column entry must be non-blank, split into exactly
     * two parts on {@code HbaseConfiguration.sign_colon}, and name a family that
     * exists in the table. Each configured family must exist in the table as
     * well. Failures are logged and reported by returning {@code false}.
     * <p>
     * The {@link HBaseAdmin} used to fetch the table descriptor is closed as
     * soon as the descriptor has been read.
     *
     * @param dbConf the HBase input settings holding the query conditions
     * @param tableName the name of the table to validate against
     * @return {@code true} when all conditions are valid; {@code false} when
     *         {@code tableName} is {@code null} or any condition is invalid
     * @throws TableNotFoundException declared for a missing table;
     *             NOTE(review): presumably raised by the descriptor lookup - confirm
     * @throws IOException if communicating with HBase fails
     */
    protected static boolean validateCondition(HbaseConfiguration dbConf, String tableName)
            throws TableNotFoundException, IOException {
        if (null == tableName) {
            return false;
        }

        // Fetch the descriptor, then release the admin connection immediately;
        // the original code never closed it.
        HTableDescriptor tableDes;
        HBaseAdmin admin = new HBaseAdmin(dbConf.getConf());
        try {
            tableDes = admin.getTableDescriptor(tableName.getBytes());
        } finally {
            admin.close();
        }

        // Collect the names of all column families defined on the table.
        Set<String> familySets = new HashSet<String>();
        for (byte[] familyKey : tableDes.getFamiliesKeys()) {
            familySets.add(new String(familyKey));
        }

        // check column and family
        String familyColumns[] = dbConf.getInputHBaseQueryFamilyColumns();
        if (null != familyColumns) {
            for (int i = 0; i < familyColumns.length; i++) {
                if (null == familyColumns[i] || familyColumns[i].trim().length() <= 0) {
                    String meg = "The parameter of columnfamily is null!";
                    MRLog.error(LOG, meg);
                    return false;
                }

                String fcolumn[] = familyColumns[i].split(HbaseConfiguration.sign_colon);
                if (fcolumn.length != 2) {
                    String meg = "The parameter of columnfamily is format error !";
                    MRLog.error(LOG, meg);
                    return false;
                }

                // The family part of the entry must be a family of the table.
                HColumnDescriptor hcDesc = tableDes.getFamily(fcolumn[0].getBytes());
                if (!(null != hcDesc && familySets.contains(fcolumn[0])
                        && fcolumn[0].equals(new String(hcDesc.getName())))) {
                    String meg = "Column is not exist! column:" + fcolumn[0];
                    MRLog.error(LOG, meg);
                    return false;
                }
            }
        }

        // check family
        String familys[] = dbConf.getInputHBaseQueryFamilys();
        if (null != familys) {
            for (int i = 0; i < familys.length; i++) {
                if (!familySets.contains(familys[i])) {
                    String meg = "Family is not exist! family:" + familys[i];
                    MRLog.error(LOG, meg);
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Creates the record reader for one split, using the input class, table
     * name and column relation from the current HBase input settings.
     *
     * @param split the split to read; must be an {@link HbaseInputSplit}
     * @param context the task attempt context (not used here)
     * @return a new {@code HbaseRecordReader} for the split
     * @throws IOException declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @SuppressWarnings("unchecked")
    @Override
    public RecordReader<LongWritable, T> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        Class<T> recordClass = (Class<T>) this.dbConf.getInputClass();
        HbaseInputSplit rowRange = (HbaseInputSplit) split;
        Configuration configuration = this.getConf();
        HbaseConfiguration inputSettings = this.getDBConf();
        String tableName = this.dbConf.getInputTableName();
        String columnRelation = this.dbConf.getInputHBaseColumnRelation();

        return new HbaseRecordReader<T>(rowRange, recordClass, configuration, inputSettings, tableName,
                columnRelation);
    }

    /**
     * Builds one {@link HbaseInputSplit} per region location covering the
     * configured start/stop row range.
     * <p>
     * Each split normally spans its region's start key to end key. A configured
     * start row replaces the region start key only in the region that contains
     * it; likewise a configured stop row replaces the region end key only in
     * the region that contains it.
     *
     * @param job the job context whose configuration is used to reach HBase
     * @return the splits, in the order the region locations were returned
     * @throws IOException if the region lookup fails
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
        String startRow = this.dbConf.getInputHBaseQueryStartRow();
        String stopRow = this.dbConf.getInputHBaseQueryStopRow();
        String tableName = this.dbConf.getInputTableName();

        List<InputSplit> result = new ArrayList<InputSplit>();
        List<HRegionLocation> regionLocations = getTableHRegionInfo(job.getConfiguration(), tableName, startRow,
                stopRow);
        for (HRegionLocation location : regionLocations) {
            HRegionInfo region = location.getRegionInfo();

            // Use the requested start row only inside the region that holds it;
            // every other region starts at its own start key.
            String rangeStart;
            if (null != startRow && region.containsRow(startRow.getBytes())) {
                rangeStart = startRow;
            } else {
                rangeStart = new String(region.getStartKey());
            }

            // Same rule for the requested stop row and the region end key.
            String rangeEnd;
            if (null != stopRow && region.containsRow(stopRow.getBytes())) {
                rangeEnd = stopRow;
            } else {
                rangeEnd = new String(region.getEndKey());
            }

            result.add(new HbaseInputSplit(rangeStart, rangeEnd));
        }

        MRLog.info(LOG, "Finished hbase split!");
        return result;
    }

    /**
     * Logs every region of the table together with the server hosting it.
     * <p>
     * The {@link HTable} opened to read the region map is always closed before
     * this method returns.
     *
     * @param job the configuration used to connect to HBase
     * @param tableName the name of the table whose regions are logged
     * @throws IOException if the table cannot be opened, queried or closed
     */
    private static void printTableAllRegionInfo(Configuration job, String tableName) throws IOException {
        StringBuilder regionLog = new StringBuilder();
        HTable table = new HTable(job, tableName);
        try {
            regionLog.append("<<Table[");
            regionLog.append(new String(table.getTableName()));
            regionLog.append("]all RegionInfo>>");

            // Walk the map by entry to avoid a second lookup per region.
            NavigableMap<HRegionInfo, ServerName> mapHRegionInfo = table.getRegionLocations();
            for (Map.Entry<HRegionInfo, ServerName> entry : mapHRegionInfo.entrySet()) {
                ServerName server = entry.getValue();
                regionLog.append("\nHRegionInfo:");
                regionLog.append(entry.getKey().toString());
                regionLog.append("ServerInfo:");
                regionLog.append("{name=>:");
                regionLog.append(server.getServerName());
                regionLog.append(" ,HostAndPort=>:");
                regionLog.append(server.getHostAndPort());
                regionLog.append("}");
            }
        } finally {
            // Release the client-side table resources; the original code leaked them.
            table.close();
        }
        MRLog.info(LOG, regionLog.toString());
    }

    /**
     * Logs the region locations selected for the request, one line per
     * location with its region info and host/port. A {@code null} location, or
     * a location without region info, produces a warning and is skipped.
     *
     * @param lstHRegionInfo the region locations to log
     * @param tableName the table name shown in the log header
     */
    private static void printTableRequestRegionInfo(List<HRegionLocation> lstHRegionInfo, String tableName) {
        StringBuilder report = new StringBuilder();
        report.append("<<Table[").append(tableName).append("]request RegionInfo>>");

        for (Iterator<HRegionLocation> it = lstHRegionInfo.iterator(); it.hasNext();) {
            HRegionLocation location = it.next();
            // The line label is written before the null checks, as before.
            report.append("\nHRegionInfo:");
            if (location == null) {
                MRLog.warn(LOG, "??HRegionLocationnull!");
                continue;
            }

            HRegionInfo region = location.getRegionInfo();
            if (region == null) {
                MRLog.warn(LOG, "??HRegionInfonull!");
                continue;
            }

            report.append(region.toString())
                    .append("ServerInfo:")
                    .append("{HostAndPort=>:")
                    .append(location.getHostnamePort())
                    .append("}");
        }

        MRLog.info(LOG, report.toString());
    }

    /**
     * An {@link InputSplit} that carries the start and end row key of one
     * row-key range. It reports no locations and a length of zero, and is
     * serialized as two UTF strings (start, then end).
     */
    public static class HbaseInputSplit extends InputSplit implements Writable {
        private String start;
        private String end;

        /**
         * No-argument constructor; both keys stay unset until
         * {@link #readFields(DataInput)} fills them in.
         */
        public HbaseInputSplit() {
        }

        /**
         * Creates a split for the given row-key range and logs the range.
         *
         * @param start the row key the range starts at
         * @param end the row key the range ends at
         */
        public HbaseInputSplit(String start, String end) {
            this.start = start;
            this.end = end;
            String rangeMessage = "HBase Split rowkey range=>" + this.start + ":" + this.end;
            MRLog.info(LOG, rangeMessage);
        }

        /** @return the row key the range starts at */
        public String getStart() {
            return this.start;
        }

        /** @return the row key the range ends at */
        public String getEnd() {
            return this.end;
        }

        /** @return always an empty array; this split reports no locations */
        @Override
        public String[] getLocations() throws IOException {
            return new String[0];
        }

        /** @return always {@code 0}; the split size is not reported */
        @Override
        public long getLength() throws IOException {
            return 0L;
        }

        /** Reads the start key, then the end key, as UTF strings. */
        @Override
        public void readFields(DataInput input) throws IOException {
            String readStart = input.readUTF();
            String readEnd = input.readUTF();
            this.start = readStart;
            this.end = readEnd;
        }

        /** Writes the start key, then the end key, as UTF strings. */
        @Override
        public void write(DataOutput output) throws IOException {
            output.writeUTF(this.start);
            output.writeUTF(this.end);
        }
    }

    /**
     * Reads the mandatory {@code inputFieldNames} entry from the parameter map.
     *
     * @param paraMap the parameter map to read from
     * @return the value stored under {@code inputFieldNames}, untrimmed
     * @throws Exception if the entry is missing or blank
     */
    protected String getParamInputFieldNames(Map<String, String> paraMap) throws Exception {
        String fieldNames = paraMap.get("inputFieldNames");
        boolean present = fieldNames != null && fieldNames.trim().length() > 0;
        if (present) {
            return fieldNames;
        }

        String meg = "?<inputFieldNames>.";
        MRLog.error(LOG, meg);
        throw new Exception(meg);
    }

    /**
     * Validates the HBase input parameters of the job and configures the job
     * for reading from HBase.
     * <p>
     * The table name and the column relation are mandatory; a missing or
     * malformed value raises an exception. Missing optional query conditions
     * (time range, start/stop row, timestamp, filters, family columns,
     * families) only produce a warning. Every family-column entry must be
     * non-null and split into exactly two parts on {@code ":"}.
     * <p>
     * On success the job uses this input format, {@code DBRecord} as input
     * class and map output key/value class, {@code DBMapper} as mapper, and
     * optionally {@code DBGroupReducer} as combiner. The number of map tasks is
     * the number of regions in the requested row range; the number of reduce
     * tasks is the same, or zero when {@code getInputMapEnd()} is true.
     *
     * @param conf the job to validate and configure
     * @throws Exception if a mandatory parameter is missing or malformed, or if
     *             the region lookup fails
     */
    @Override
    public void handle(Job conf) throws Exception {
        // The table name is mandatory.
        HbaseConfiguration hconf = new HbaseConfiguration(conf.getConfiguration(),
                HbaseConfiguration.FLAG_HBASE_INPUT);
        String tableName = hconf.getInputTableName();
        if (null == tableName || tableName.trim().length() <= 0) {
            String meg = "[MR ERROR]HBase??<" + HbaseConfiguration.INPUT_TABLE + ">?.";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }

        // Validate the column relation against the declared input field names.
        String inputFieldName[] = hconf.getInputFieldNames();
        this.vParamSrcTargetFieldNames(hconf, inputFieldName);

        // Time range: optional, but must be well-formed when present.
        String timerange[] = hconf.getInputHBaseQueryTimerange();
        this.vParamQueryTimeRange(timerange);

        // Start row: optional.
        String startrow = hconf.getInputHBaseQueryStartRow();
        if (null == startrow || startrow.trim().length() <= 0) {
            MRLog.warn(LOG,
                    "[MR WARN]?startrow<" + HbaseConfiguration.INPUT_QUERY_STARTROW + ">.");
        }

        // Stop row: optional.
        String stoprow = hconf.getInputHBaseQueryStopRow();
        if (null == stoprow || stoprow.trim().length() <= 0) {
            MRLog.warn(LOG,
                    "[MR WARN]?stoprow<" + HbaseConfiguration.INPUT_QUERY_STOPROW + ">.");
        }

        // Timestamp: optional.
        long timestamp = hconf.getInputHBaseQueryTimestamp();
        if (timestamp <= -1) {
            MRLog.warn(LOG, "[MR WARN]?<" + HbaseConfiguration.INPUT_QUERY_TIMESTAMP
                    + ">.");
        }

        // Filters: optional.
        String filters = hconf.getInputHBaseQueryFilters();
        if (null == filters || filters.length() <= 0) {
            MRLog.warn(LOG, "[MR WARN]??<" + HbaseConfiguration.INPUT_QUERY_FILTER
                    + ">.");
        }

        // Family columns: optional, but each entry must look like "family:column".
        String familyColumns[] = hconf.getInputHBaseQueryFamilyColumns();
        if (null == familyColumns || familyColumns.length <= 0) {
            MRLog.warn(LOG,
                    "[MR WARN]?<" + HbaseConfiguration.INPUT_QUERY_FAMILYCOLUMNS + ">.");
        }

        if (null != familyColumns) {
            for (String tmp : familyColumns) {
                // A null entry is reported as a format error rather than
                // surfacing as a bare NullPointerException.
                if (null == tmp || tmp.split(":").length != 2) {
                    String meg = "[MR ERROR]?<" + HbaseConfiguration.INPUT_QUERY_FAMILYCOLUMNS
                            + ">.";
                    MRLog.error(LOG, meg);
                    throw new Exception(meg);
                }
            }
        }

        // Families: optional.
        String familys[] = hconf.getInputHBaseQueryFamilys();
        if (null == familys || familys.length <= 0) {
            MRLog.warn(LOG,
                    "[MR WARN]??<" + HbaseConfiguration.INPUT_QUERY_FAMILYS + ">.");
        }

        conf.setInputFormatClass(HbaseInputFormat.class);
        hconf.setInputClass(DBRecord.class);

        // One map task per region in the requested row range; this call also
        // validates the family/column conditions against the table.
        int taskNumber = HbaseInputFormat.getTableHRegionInfoCount(conf.getConfiguration(), startrow, stoprow);
        int reduceTasks = taskNumber;
        if (hconf.getInputMapEnd()) {
            // Map-only job: no reducers.
            reduceTasks = 0;
        }

        hconf.setNumMapTasks(taskNumber);
        hconf.setNumReduceTasks(reduceTasks);
        conf.setMapperClass(DBMapper.class);
        conf.setMapOutputKeyClass(DBRecord.class);
        conf.setMapOutputValueClass(DBRecord.class);
        if (hconf.getInputIsCombiner()) {
            conf.setCombinerClass(DBGroupReducer.class);
        }
    }

    /**
     * Validates the configured HBase column relation against the input field
     * names.
     * <p>
     * The relation must be present and must decode into at least one entry in
     * each of the two decoded lists. Every field named in a relation entry must
     * be one of the input field names, and an entry that names more than one
     * field requires a non-blank column split sign.
     *
     * @param hconf the HBase input settings to read the relation and split sign from
     * @param inputFieldName the declared input field names
     * @throws Exception if any of the rules above is violated
     */
    private void vParamSrcTargetFieldNames(HbaseConfiguration hconf, String[] inputFieldName) throws Exception {
        String columnRelation = hconf.getInputHBaseColumnRelation();
        if (columnRelation == null) {
            String meg = "[MR ERROR]<" + HbaseConfiguration.INPUT_HBASE_COLUMN_RELATION
                    + ">?";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }

        // Decode the relation into its two parallel lists.
        List<String[]> decodedColumns = new ArrayList<String[]>();
        List<String[]> decodedRelations = new ArrayList<String[]>();
        StringUtil.decodeOutColumnSplitRelation(columnRelation, decodedColumns, decodedRelations);
        if (decodedColumns.isEmpty() || decodedRelations.isEmpty()) {
            String meg = "[MR ERROR]<" + HbaseConfiguration.INPUT_HBASE_COLUMN_RELATION + ">?";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }

        String splitSign = hconf.getInputHBaseColumnSplitSign();
        Set<String> knownFields = StringUtil.parseStringArrayToSet(inputFieldName);
        boolean splitSignMissing = splitSign == null || splitSign.trim().length() <= 0;

        for (String[] relation : decodedRelations) {
            // Every referenced field must be a declared input field.
            for (String field : relation) {
                if (knownFields.contains(field)) {
                    continue;
                }
                String meg = "[MR ERROR]<" + HbaseConfiguration.INPUT_HBASE_COLUMN_RELATION
                        + ">?<" + HbaseConfiguration.SYS_INPUT_FIELD_NAMES_PROPERTY
                        + ">";
                LOG.error(meg);
                throw new Exception(meg);
            }

            // Combining several fields into one column needs a split sign.
            if (relation.length > 1 && splitSignMissing) {
                String meg = "[MR ERROR]<" + HbaseConfiguration.INPUT_HBASE_COLUMN_RELATION
                        + ">?1, ?<"
                        + HbaseConfiguration.INPUT_HBASE_COLUMN_SPLIT_SIGN + ">";
                LOG.error(meg);
                throw new Exception(meg);
            }
        }
    }

    /**
     * Validates the configured query time range.
     * <p>
     * A missing or empty range is allowed and only logged as a warning.
     * Otherwise the range must hold exactly two values, both parseable as
     * {@code long}, with the first not greater than the second.
     *
     * @param timerange the configured time range values, may be {@code null}
     * @throws Exception if the range does not hold exactly two values, a value
     *             is not a valid {@code long} (the {@link NumberFormatException}
     *             is attached as cause), or the first value exceeds the second
     */
    private void vParamQueryTimeRange(String[] timerange) throws Exception {
        if (null == timerange || timerange.length <= 0) {
            // The time range is optional; just note that it is absent.
            MRLog.warn(LOG, "[MR WARN]?<" + HbaseConfiguration.INPUT_QUERY_TIMERANGE
                    + ">.");
            return;
        }

        if (timerange.length != 2) {
            String meg = "[MR ERROR]HBase?<" + HbaseConfiguration.INPUT_QUERY_TIMERANGE
                    + ">,:132342155,32423532.";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }

        long trange[] = new long[2];
        try {
            trange[0] = Long.parseLong(timerange[0]);
            trange[1] = Long.parseLong(timerange[1]);
        } catch (NumberFormatException e) {
            // Long.parseLong reports every bad input, null included, as
            // NumberFormatException; keep it as the cause for diagnosis.
            String meg = "[MR ERROR]HBase?<" + HbaseConfiguration.INPUT_QUERY_TIMERANGE
                    + ">,??.";
            MRLog.error(LOG, meg);
            throw new Exception(meg, e);
        }

        if (trange[0] > trange[1]) {
            String meg = "[MR ERROR]HBase?<" + HbaseConfiguration.INPUT_QUERY_TIMERANGE
                    + ">,?.";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }
    }

}