com.aerospike.hadoop.mapreduce.AerospikeInputFormat.java Source code

Java tutorial

Introduction

Below is the complete source code for com.aerospike.hadoop.mapreduce.AerospikeInputFormat.java.

Source

/* 
 * Copyright 2014 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.mapreduce;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.aerospike.client.AerospikeClient;
import com.aerospike.client.AerospikeException;
import com.aerospike.client.Host;
import com.aerospike.client.cluster.Node;
import com.aerospike.client.policy.ClientPolicy;

/**
 * An {@link InputFormat} for data stored in an Aerospike database.
 */
public class AerospikeInputFormat extends InputFormat<AerospikeKey, AerospikeRecord>
        implements org.apache.hadoop.mapred.InputFormat<AerospikeKey, AerospikeRecord> {

    private static final Log log = LogFactory.getLog(AerospikeInputFormat.class);

    // ---------------- NEW API ----------------

    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // Delegate to the old API.
        Configuration cfg = context.getConfiguration();
        JobConf jobconf = AerospikeConfigUtil.asJobConf(cfg);
        return Arrays.asList((InputSplit[]) getSplits(jobconf, jobconf.getNumMapTasks()));
    }

    public RecordReader<AerospikeKey, AerospikeRecord> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new AerospikeRecordReader();
    }

    // ---------------- OLD API ----------------

    public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        try {

            String oper = AerospikeConfigUtil.getInputOperation(job);
            String host = AerospikeConfigUtil.getInputHost(job);
            int port = AerospikeConfigUtil.getInputPort(job);
            String namespace = AerospikeConfigUtil.getInputNamespace(job);
            String setName = AerospikeConfigUtil.getInputSetName(job);
            String[] binNames = AerospikeConfigUtil.getInputBinNames(job);
            String numrangeBin = "";
            long numrangeBegin = 0;
            long numrangeEnd = 0;
            if (oper.equals("numrange")) {
                numrangeBin = AerospikeConfigUtil.getInputNumRangeBin(job);
                numrangeBegin = AerospikeConfigUtil.getInputNumRangeBegin(job);
                numrangeEnd = AerospikeConfigUtil.getInputNumRangeEnd(job);
            }

            log.info(String.format("using: %s %d %s %s", host, port, namespace, setName));

            AerospikeClient client = AerospikeClientSingleton.getInstance(new ClientPolicy(), host, port);
            Node[] nodes = client.getNodes();
            int nsplits = nodes.length;
            if (nsplits == 0) {
                throw new IOException("no Aerospike nodes found");
            }
            log.info(String.format("found %d nodes", nsplits));
            AerospikeSplit[] splits = new AerospikeSplit[nsplits];
            for (int ii = 0; ii < nsplits; ii++) {
                Node node = nodes[ii];
                String nodeName = node.getName();

                // We want to avoid 127.0.0.1 as a hostname
                // because this value will be transferred to a
                // different hadoop node to be processed.
                //
                List<Host> aliases = getAliases(node.getHost());
                Host nodehost = aliases.get(0);
                if (aliases.size() > 1) {
                    for (Host a : aliases) {
                        if (!a.name.equals("127.0.0.1")) {
                            nodehost = a;
                            break;
                        }
                    }
                }
                splits[ii] = new AerospikeSplit(oper, nodeName, nodehost.name, nodehost.port, namespace, setName,
                        binNames, numrangeBin, numrangeBegin, numrangeEnd);
                log.info("split: " + splits[ii]);
            }
            return splits;
        } catch (Exception ex) {
            throw new IOException("exception in getSplits", ex);
        }
    }

    public org.apache.hadoop.mapred.RecordReader<AerospikeKey, AerospikeRecord> getRecordReader(
            org.apache.hadoop.mapred.InputSplit split, JobConf job, Reporter reporter) throws IOException {
        return new AerospikeRecordReader((AerospikeSplit) split);
    }

    private List<Host> getAliases(Host host) {
        InetAddress[] addresses;

        try {
            addresses = InetAddress.getAllByName(host.name);
        } catch (UnknownHostException uhe) {
            throw new AerospikeException.Connection("Invalid host: " + host);
        }

        if (addresses.length == 0) {
            throw new AerospikeException.Connection("Failed to find addresses for " + host);
        }

        // Add capacity for current address aliases plus IPV6 address and hostname.
        List<Host> aliases = new ArrayList<Host>(addresses.length + 2);

        for (InetAddress address : addresses) {
            aliases.add(new Host(address.getHostAddress(), host.tlsName, host.port));
        }

        return aliases;
    }

}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab