org.trend.hgraph.mapreduce.lib.input.TableInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for org.trend.hgraph.mapreduce.lib.input.TableInputFormat.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.trend.hgraph.mapreduce.lib.input;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A customed <code>InputFormat</code> for generating multiple <code>Mapper</code>s for one HTable
 * region.
 * @author scott_miao
 * @see CalculateInputSplitMapper
 * @see CalculateInputSplitReducer
 */
public class TableInputFormat extends org.apache.hadoop.hbase.mapreduce.TableInputFormat {

    private static Logger LOGGER = LoggerFactory.getLogger(TableInputFormat.class);

    public static final String INPUT_SPLIT_PAIRS_INPUT_PATH = "hgraph.mapreduce.lib.input.splits.path";
    private String inputPath;

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        LinkedList<InputSplit> newSplits = new LinkedList<InputSplit>();
        FileSystem fs = null;
        InputStream is = null;
        LineIterator it = null;
        try {
            List<InputSplit> targets = super.getSplits(context);
            Map<String, List<TableSplit>> targetMap = createTargetMap(targets);

            HTable table = getHTable();
            byte[] tableName = table.getTableName();

            Path path = new Path(inputPath);
            fs = path.getFileSystem(this.getConf());
            is = fs.open(path);
            it = IOUtils.lineIterator(is, "UTF-8");
            String line = null;
            String[] lineSplits = null;
            String regionName = null;
            byte[] startKey = null;
            byte[] endKey = null;
            byte[] regionStartKey = null;
            byte[] regionEndKey = null;
            boolean nextRegion = false;

            HRegionLocation regionlocation = null;
            String location = null;
            HRegionInfo regionInfo = null;
            List<TableSplit> splits = null;
            TableSplit split = null;
            while (it.hasNext()) {
                if (!nextRegion) {
                    line = it.nextLine();
                } else {
                    nextRegion = false;
                }
                if (null == line || "".equals(line)) {
                    String msg = "skip a invalid line";
                    LOGGER.error(msg);
                    throw new IllegalStateException(msg);
                }

                lineSplits = getLineSplits(line);
                regionName = lineSplits[0];
                startKey = Bytes.toBytes(lineSplits[1]);
                endKey = Bytes.toBytes(lineSplits[2]);
                LOGGER.info("start to process region:" + regionName);
                regionlocation = table.getRegionLocation(startKey, false);
                if (null == regionlocation) {
                    String msg = "can not find location for region:" + regionName + " !!";
                    LOGGER.error(msg);
                    throw new IllegalStateException(msg);
                }

                regionInfo = regionlocation.getRegionInfo();
                location = regionlocation.getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0];
                if (!targetMap.containsKey(location)) {
                    String msg = "regionLocation:" + regionlocation + " not found in TableSplits !!";
                    LOGGER.error(msg);
                    throw new IllegalStateException(msg);
                }

                regionStartKey = regionInfo.getStartKey();
                regionEndKey = regionInfo.getEndKey();

                splits = targetMap.get(location);
                LOGGER.info("splits.size is " + splits.size());
                for (int a = 0; a < splits.size(); a++) {
                    split = splits.get(a);
                    if (Arrays.equals(split.getStartRow(), regionStartKey)
                            && Arrays.equals(split.getEndRow(), regionEndKey)) {
                        splits.remove(a);

                        if (Arrays.equals(HConstants.EMPTY_BYTE_ARRAY, regionStartKey)) {
                            startKey = regionStartKey;
                        }

                        split = new TableSplit(tableName, startKey, endKey, location);
                        newSplits.add(split);

                        if (it.hasNext()) {
                            while (it.hasNext()) {
                                line = it.nextLine();
                                lineSplits = getLineSplits(line);
                                // keep doing due to it may be in same region
                                if (regionName.equals(lineSplits[0])) {
                                    split = new TableSplit(tableName, Bytes.toBytes(lineSplits[1]),
                                            Bytes.toBytes(lineSplits[2]), location);
                                    newSplits.add(split);
                                } else {
                                    checkAndPatchRegionEndKey(newSplits, regionEndKey);

                                    nextRegion = true;
                                    break;
                                }
                            }
                        } else {
                            checkAndPatchRegionEndKey(newSplits, regionEndKey);
                        }
                        if (splits.size() == 0) {
                            // remove itself if it is empty
                            targetMap.remove(location);
                        }
                        break;
                    }
                }
            }

            // check if still any split not processed yet ?
            if (targetMap.size() > 0) {
                String msg = "targetMap.size:" + targetMap.size() + " still bigger than 0,"
                        + " which means that there still has TableSplit(s) not processed yet !!";
                LOGGER.error(msg);
                LOGGER.error("" + targetMap);
                throw new IllegalStateException(msg);
            }
        } catch (IOException e) {
            LOGGER.error("unknow error occues !!", e);
            throw e;
        } finally {
            if (null != it) {
                LineIterator.closeQuietly(it);
            }
            if (null != is) {
                IOUtils.closeQuietly(is);
            }
        }

        return newSplits;
    }

    private void checkAndPatchRegionEndKey(LinkedList<InputSplit> newSplits, byte[] regionEndKey) {
        if (Arrays.equals(HConstants.EMPTY_BYTE_ARRAY, regionEndKey)) {
            TableSplit tmpSplit = (TableSplit) newSplits.removeLast();
            newSplits.add(new TableSplit(tmpSplit.getTableName(), tmpSplit.getStartRow(), regionEndKey,
                    tmpSplit.getLocations()[0]));
        }
    }

    private static String[] getLineSplits(String line) {
        String[] lineSplits;
        lineSplits = line.split(CalculateInputSplitReducer.DELIMITER);
        if (lineSplits.length != 3) {
            String msg = "lineSplits.length:" + lineSplits.length + " is not equals to 3, lineSplits:"
                    + Arrays.toString(lineSplits);
            LOGGER.error(msg);
            throw new IllegalStateException(msg);
        }
        return lineSplits;
    }

    private static Map<String, List<TableSplit>> createTargetMap(List<InputSplit> targets) {
        Map<String, List<TableSplit>> targetMap = new HashMap<String, List<TableSplit>>();
        TableSplit target = null;
        String location = null;
        while (targets.size() > 0) {
            target = (TableSplit) targets.remove(0);
            location = target.getLocations()[0];
            if (!targetMap.containsKey(location)) {
                targetMap.put(location, new ArrayList<TableSplit>());
            }
            targetMap.get(location).add(target);
        }
        return targetMap;
    }

    @Override
    public void setConf(Configuration conf) {
        super.setConf(conf);
        String path = conf.get(INPUT_SPLIT_PAIRS_INPUT_PATH);
        if (null == path || "".equals(path)) {
            throw new IllegalArgumentException(INPUT_SPLIT_PAIRS_INPUT_PATH + " shall always not be null or empty");
        }
        inputPath = path;
    }

}