org.apache.crunch.io.hbase.RegionLocationTable.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.crunch.io.hbase.RegionLocationTable.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.crunch.io.hbase;

import javax.annotation.Nullable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;

import com.google.common.collect.Maps;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Provides lookup functionality for the region server location for row keys in an HBase table.
 * <p>
 * This is a helper class to optimize the locality of HFiles created with {@link HFileOutputFormatForCrunch}, by
 * specifying the name of the region server which is hosting the region of a given row as the preferred HDFS data node
 * for hosting the written HFile. This is intended to ensure that bulk-created HFiles will be available on the local
 * filesystem on the region servers using the created HFile, thus allowing short-circuit reads to the local file system
 * on the bulk-created HFiles.
 * <p>
 * Internally the table is a sorted map from region start key to the host name of the server hosting that region,
 * ordered with {@link Bytes#BYTES_COMPARATOR}. Instances are obtained either from live region locations via
 * {@link #create(String, List)} or from a previously serialized form via {@link #deserialize(DataInput)}.
 */
class RegionLocationTable {

    /**
     * Per-output configuration key which contains the path to a serialized region location table.
     */
    public static final String REGION_LOCATION_TABLE_PATH = "crunch.hfileregionlocation.path";

    private final String tableName;
    private final NavigableMap<byte[], String> regionStartToServerHostName;

    /**
     * Builds a region location table from a list of region locations.
     * <p>
     * Each region contributes one entry keyed by its start key. A region reporting a {@code null} start key is
     * registered under {@link HConstants#EMPTY_START_ROW}. If several locations share the same start key, the
     * last one in the list wins.
     *
     * @param tableName name of the HBase table the locations belong to
     * @param regionLocationList locations of the regions of the table
     * @return a table mapping region start keys to region server host names
     */
    public static RegionLocationTable create(String tableName, List<HRegionLocation> regionLocationList) {
        NavigableMap<byte[], String> regionStartToServerHostName = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
        for (HRegionLocation regionLocation : regionLocationList) {
            byte[] startKey = regionLocation.getRegionInfo().getStartKey();
            if (startKey == null) {
                startKey = HConstants.EMPTY_START_ROW;
            }
            // NOTE(review): assumes every location carries a server name; confirm for unassigned regions.
            regionStartToServerHostName.put(startKey, regionLocation.getServerName().getHostname());
        }
        return new RegionLocationTable(tableName, regionStartToServerHostName);
    }

    private RegionLocationTable(String tableName, NavigableMap<byte[], String> regionStartToServerHostName) {
        this.tableName = tableName;
        this.regionStartToServerHostName = regionStartToServerHostName;
    }

    /**
     * Returns the name of the HBase table to which this region location table applies.
     *
     * @return name of the related HBase table
     */
    public String getTableName() {
        return tableName;
    }

    /**
     * Returns the optional preferred node for a row.
     * <p>
     * The return value of this method is an {@link InetSocketAddress} to be in line with the HFile API (and
     * underlying HDFS API) which use InetSocketAddress. The port number is always 0 on the returned InetSocketAddress,
     * as it is not known from outside the scope of a region server. The HDFS API is implemented to deal "correctly"
     * with this, mapping host name to a random data node on the same machine, which is sufficient for the purposes
     * here.
     * <p>
     * The return value will be null if no preferred node is known for the given row.
     *
     * @param rowKey row key of the row for which the preferred node is to be calculated
     * @return socket address of the preferred storage node for the given row, or null
     */
    @Nullable
    public InetSocketAddress getPreferredNodeForRow(byte[] rowKey) {
        // The hosting region is the one with the greatest start key that is not greater than the row key.
        Map.Entry<byte[], String> matchingEntry = regionStartToServerHostName.floorEntry(rowKey);
        if (matchingEntry == null) {
            return null;
        }
        // Unresolved on purpose: only the host name is needed, and the port is unknown (always 0).
        return InetSocketAddress.createUnresolved(matchingEntry.getValue(), 0);
    }

    /**
     * Serialize this table to a {@link DataOutput}. The serialized value can be deserialized via the
     * {@link #deserialize(DataInput)} method.
     * <p>
     * The written layout is: the table name (UTF), the number of entries (int), and then for every entry the
     * length of the region start key (int), the start key bytes, and the host name (UTF).
     *
     * @param dataOutput output to which the table is to be serialized
     * @throws IOException if writing to the output fails
     */
    public void serialize(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(tableName);
        dataOutput.writeInt(regionStartToServerHostName.size());
        for (Map.Entry<byte[], String> regionToHostEntry : regionStartToServerHostName.entrySet()) {
            byte[] rowKey = regionToHostEntry.getKey();
            // Length prefix lets the reader know how many key bytes follow.
            dataOutput.writeInt(rowKey.length);
            dataOutput.write(rowKey);
            dataOutput.writeUTF(regionToHostEntry.getValue());
        }
    }

    /**
     * Deserialize a table which was serialized with the {@link #serialize(DataOutput)} method.
     * <p>
     * A non-positive entry count yields a table without entries. A negative row key length indicates corrupt
     * input and is reported as an {@link IOException} rather than surfacing as an unchecked allocation failure.
     *
     * @param dataInput input containing a serialized instance of this class
     * @return the deserialized table
     * @throws IOException if reading from the input fails, the input ends prematurely, or a negative row key
     *         length is encountered
     */
    public static RegionLocationTable deserialize(DataInput dataInput) throws IOException {
        NavigableMap<byte[], String> regionStartToServerHostName = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
        String tableName = dataInput.readUTF();
        int numEntries = dataInput.readInt();
        for (int i = 0; i < numEntries; i++) {
            int rowKeyLength = dataInput.readInt();
            if (rowKeyLength < 0) {
                // Guard the array allocation below: a negative length can only come from corrupt input.
                throw new IOException("Corrupt region location table for table '" + tableName
                        + "': negative row key length " + rowKeyLength + " at entry " + i);
            }
            byte[] rowKey = new byte[rowKeyLength];
            dataInput.readFully(rowKey, 0, rowKeyLength);
            String hostName = dataInput.readUTF();
            regionStartToServerHostName.put(rowKey, hostName);
        }
        return new RegionLocationTable(tableName, regionStartToServerHostName);
    }
}