org.vilcek.hive.kv.KVHiveInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for org.vilcek.hive.kv.KVHiveInputFormat.java

Source

/*
 * This file is part of Hive KV Storage Handler
 * Copyright 2012 Alexandre Vilcek (alexandre.vilcek@oracle.com)
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */

package org.vilcek.hive.kv;

import java.io.IOException;
import java.rmi.NotBoundException;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import oracle.kv.Consistency;
import oracle.kv.Depth;
import oracle.kv.Direction;
import oracle.kv.KVStoreException;
import oracle.kv.Key;
import oracle.kv.KeyRange;
import oracle.kv.impl.rep.RepNodeStatus;
import oracle.kv.impl.rep.admin.RepNodeAdminAPI;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.PartitionMap;
import oracle.kv.impl.topo.RepGroup;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.RepNode;
import oracle.kv.impl.topo.StorageNode;
import oracle.kv.impl.topo.StorageNodeId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.util.TopologyLocator;
import oracle.kv.impl.util.registry.RegistryUtils;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Hive input format that turns the partitions of an Oracle KV store ({@code oracle.kv}) into
 * input splits: {@link #getSplits(JobConf, int)} produces one {@code KVHiveInputSplit} per
 * partition reported by the store topology, and {@link #getRecordReader} reads a split through
 * a {@code KVHiveRecordReader}.
 *
 * @author Alexandre Vilcek (alexandre.vilcek@oracle.com)
 */
public class KVHiveInputFormat extends HiveInputFormat<LongWritable, MapWritable> {

    /** Separator between the helper host entries of the {@code KV_HOST_PORT} property. */
    private static final Pattern HOST_SEPARATOR = Pattern.compile(",");

    /*
     * Fixed scan settings handed unchanged to every split.
     * NOTE(review): the null consistency / timeout unit and the zero timeout presumably mean
     * "use the store defaults" -- confirm against KVHiveInputSplit / KVHiveRecordReader.
     */
    private static final Direction DIRECTION = Direction.FORWARD;
    private static final Depth DEPTH = Depth.PARENT_AND_DESCENDANTS;
    private static final Consistency CONSISTENCY = null;
    private static final long TIMEOUT = 0;
    private static final TimeUnit TIMEOUT_UNIT = null;

    /**
     * Creates the reader for one split.
     *
     * @param split    the split to read; handed to {@code KVHiveRecordReader}
     * @param job      the job configuration; handed to {@code KVHiveRecordReader}
     * @param reporter not used by this implementation
     * @return a new {@code KVHiveRecordReader} for {@code split}
     * @throws IOException declared by the overridden method
     */
    @Override
    public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
        return new KVHiveRecordReader(split, job);
    }

    /**
     * Builds one input split per partition of the KV store topology.
     *
     * <p>For every partition the replication nodes of its replication group are pinged (each
     * node at most once per call); the hosts of the nodes that answered become the split's
     * locations and its helper hosts. The optional properties {@code oracle.kv.parentKey},
     * {@code oracle.kv.subRange} and {@code oracle.kv.batchSize} are read from {@code conf} and
     * applied to every split.
     *
     * @param conf      job configuration; must define {@code ConfigProperties.KV_HOST_PORT}
     *                  (comma-separated helper hosts) and should define
     *                  {@code ConfigProperties.KV_NAME}
     * @param numSplits ignored; the number of splits always equals the number of partitions
     * @return the splits, one per partition; never {@code null}
     * @throws IOException if the helper hosts property is missing, the topology cannot be
     *                     obtained, or no input path is configured for the table
     */
    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        String kvHostPort = conf.get(ConfigProperties.KV_HOST_PORT);
        if (kvHostPort == null || kvHostPort.length() == 0) {
            throw new IOException("Missing required property: " + ConfigProperties.KV_HOST_PORT);
        }
        String[] kvHelperHosts = HOST_SEPARATOR.split(kvHostPort);
        String kvStoreName = conf.get(ConfigProperties.KV_NAME);

        Topology topology;
        try {
            topology = TopologyLocator.get(kvHelperHosts, 0);
        } catch (KVStoreException e) {
            // Surface the failure to the framework instead of returning null, which callers
            // of getSplits do not expect.
            throw new IOException("Unable to obtain KV store topology from helper hosts: " + kvHostPort, e);
        }
        RegistryUtils regUtils = new RegistryUtils(topology);
        PartitionMap partitionMap = topology.getPartitionMap();
        int nParts = partitionMap.getNPartitions();
        List<InputSplit> ret = new ArrayList<InputSplit>(nParts);

        Path[] tablePaths = FileInputFormat.getInputPaths(conf);
        if (nParts > 0 && tablePaths.length == 0) {
            throw new IOException("No input path configured for the KV-backed table");
        }

        // The scan restrictions are identical for every partition, so parse them only once.
        Key parentKey = null;
        String parentKeyValue = conf.get("oracle.kv.parentKey");
        if (parentKeyValue != null && parentKeyValue.length() > 0) {
            parentKey = Key.fromString(parentKeyValue);
        }
        KeyRange subRange = null;
        String subRangeValue = conf.get("oracle.kv.subRange");
        if (subRangeValue != null && subRangeValue.length() > 0) {
            subRange = KeyRange.fromString(subRangeValue);
        }
        int batchSize = conf.getInt("oracle.kv.batchSize", 0);

        // Ping results keyed by rep node id; a null value records a node that could not be reached.
        Map<Object, RepNodeStatus> statuses = new HashMap<Object, RepNodeStatus>();

        // Partition ids are enumerated starting at 1.
        for (int i = 1; i <= nParts; i++) {
            PartitionId partId = new PartitionId(i);
            RepGroupId repGroupId = topology.getRepGroupId(partId);
            RepGroup repGroup = topology.get(repGroupId);
            Collection<RepNode> repNodes = repGroup.getRepNodes();
            List<String> repNodeNames = new ArrayList<String>();
            List<String> repNodeNamesAndPorts = new ArrayList<String>();
            for (RepNode rn : repNodes) {
                RepNodeStatus rnStatus = lookupStatus(regUtils, rn, statuses);
                if (rnStatus == null) {
                    // Unreachable node: do not advertise it as a location for this split.
                    continue;
                }

                /*
                 * Replication-state filtering is currently disabled; the original check is kept
                 * here for reference:
                 *
                 * com.sleepycat.je.rep.ReplicatedEnvironment.State state = rnStatus.getReplicationState(); if (!state.isActive() ||
                 * (consistency == Consistency.ABSOLUTE && !state.isMaster())) { continue; }
                 */

                StorageNodeId snid = rn.getStorageNodeId();
                StorageNode sn = topology.get(snid);

                repNodeNames.add(sn.getHostname());
                repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort());
            }

            ret.add(new KVHiveInputSplit(tablePaths[0])
                    .setKVHelperHosts(repNodeNamesAndPorts.toArray(new String[0])).setKVStoreName(kvStoreName)
                    .setKVPart(i).setLocations(repNodeNames.toArray(new String[0])).setDirection(DIRECTION)
                    .setBatchSize(batchSize).setParentKey(parentKey).setSubRange(subRange).setDepth(DEPTH)
                    .setConsistency(CONSISTENCY).setTimeout(TIMEOUT).setTimeoutUnit(TIMEOUT_UNIT));
        }

        return ret.toArray(new InputSplit[ret.size()]);
    }

    /**
     * Returns the ping status of a replication node, contacting the node only the first time it
     * is seen. Failures are logged to {@code System.err} and cached as {@code null} so that a
     * node shared by several partitions is not contacted again.
     *
     * @param regUtils registry helper used to look up the node's admin interface
     * @param rn       the replication node to query
     * @param statuses cache of earlier results, updated in place
     * @return the node's status, or {@code null} if the node could not be reached
     */
    private static RepNodeStatus lookupStatus(RegistryUtils regUtils, RepNode rn,
            Map<Object, RepNodeStatus> statuses) {
        if (statuses.containsKey(rn.getResourceId())) {
            return statuses.get(rn.getResourceId());
        }

        RepNodeStatus rnStatus = null;
        try {
            RepNodeAdminAPI rna = regUtils.getRepNodeAdmin(rn.getResourceId());
            rnStatus = rna.ping();
        } catch (RemoteException re) {
            System.err.println("Ping failed for " + rn.getResourceId() + ": " + re.getMessage());
            re.printStackTrace();
        } catch (NotBoundException e) {
            System.err.println(
                    "No RMI service for RN: " + rn.getResourceId() + " message: " + e.getMessage());
        }
        statuses.put(rn.getResourceId(), rnStatus);
        return rnStatus;
    }
}