reconcile.hbase.mapreduce.KeyListInputFormat.java Source code

Introduction

Here is the source code for reconcile.hbase.mapreduce.KeyListInputFormat.java
Source

/*
 * Copyright (c) 2008, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National
 * Laboratory. Written by Teresa Cottom, cottom1@llnl.gov CODE-400187 All rights reserved. This file is part of
 * RECONCILE
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License (as published by the Free Software Foundation) version 2, dated June 1991. This program is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA For full text see license.txt
 */
package reconcile.hbase.mapreduce;

import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import reconcile.hbase.table.DocSchema;

public class KeyListInputFormat extends FileInputFormat<ImmutableBytesWritable, Result> {
    /** Logger shared by the input format, its split class and its record reader. */
    static final Log LOG = LogFactory.getLog(KeyListInputFormat.class);

    /** Common prefix of every configuration property defined by this class. */
    static final String TOP = "reconcile.hbase.mapreduce.KeyListInputFormat";
    /** Property listing the column families to fetch; the record reader tokenizes it with StringUtils.split. */
    static final public String SCAN_FAMILIES = TOP + ".scanFamilies";
    /** Property switching on per-key logging in the record reader; parsed with Boolean.parseBoolean. */
    static final public String LOG_ON = TOP + ".logOn";
    /** Property giving the number of keys placed in each split. */
    static final String SPLIT_SIZE = TOP + ".keyListSplitSize";
    /** Property capping the number of splits; getSplits raises the split size when the cap would be exceeded. */
    static final String MAX_SPLITS = TOP + ".maxSplits";

    // Defaults are strings because they are handed to Configuration.get(name, default) and parsed afterwards.
    static final String DEFAULT_SPLIT_SIZE = "200";
    static final String DEFAULT_MAX_SPLITS = "100";

    /**
     * KeyListSplit - Split portion of a KeyListInputFormat.
     * <p>
     * Holds the HBase table name, the row keys assigned to this split and the host names
     * reported as the split's locations. The serialized layout is: table name, key count,
     * each key, host count, each host (all strings written with {@code writeUTF}).
     *
     * @author cottom1
     */
    public static class KeyListSplit extends InputSplit implements Writable {
        /** Shared empty array so {@code hosts} is never null. */
        private static final String[] NO_HOSTS = new String[0];

        private List<String> keys = new ArrayList<String>();
        private String[] hosts = NO_HOSTS;
        private String tableName;

        /** No-arg constructor required so the split can be instantiated and then filled by {@link #readFields}. */
        public KeyListSplit() {
        }

        /**
         * @param tableName name of the HBase table the keys belong to
         * @param keys row keys for this split; the list is copied
         * @param hosts host names to report as locations; {@code null} is treated as "no hosts"
         */
        public KeyListSplit(String tableName, List<String> keys, String[] hosts) {
            this.tableName = tableName;
            this.keys.addAll(keys);
            this.hosts = (hosts == null) ? NO_HOSTS : hosts;
            LOG.info("KeyListSplit: num keys(" + this.keys.size() + ") table(" + tableName + ") num hosts("
                    + this.hosts.length + ")");
        }

        /** @return the live list of row keys held by this split */
        public List<String> getKeys() {
            return keys;
        }

        /** @return the HBase table name this split reads from */
        public String getTableName() {
            return tableName;
        }

        /**
         * Serializes table name, keys and hosts, each collection preceded by its element count.
         */
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(tableName);
            out.writeInt(keys.size());
            for (String key : keys) {
                out.writeUTF(key);
            }
            out.writeInt(hosts.length);
            for (String host : hosts) {
                out.writeUTF(host);
            }
        }

        /**
         * Restores the state written by {@link #write(DataOutput)}, replacing whatever this
         * instance held before.
         */
        @Override
        public void readFields(DataInput in) throws IOException {
            tableName = in.readUTF();

            // Writable instances may be reused; drop keys from any previous deserialization
            // instead of appending to them.
            keys.clear();
            int keyCount = in.readInt();
            for (int i = 0; i < keyCount; ++i) {
                keys.add(in.readUTF());
            }

            int hostCount = in.readInt();
            String[] readHosts = new String[hostCount];
            for (int i = 0; i < hostCount; ++i) {
                readHosts[i] = in.readUTF();
            }
            hosts = readHosts;

            LOG.info("KeyListSplit: read num keys(" + keys.size() + ") num hosts(" + hosts.length + ") table("
                    + tableName + ")");
        }

        /** @return the host names recorded for this split (never {@code null}) */
        @Override
        public String[] getLocations() throws IOException {
            return hosts;
        }

        /** @return the number of keys in this split (used as the split's "length") */
        @Override
        public long getLength() throws IOException, InterruptedException {
            return keys.size();
        }

    }

    /**
     * KeyRowReader - Works off of a KeyListSplit, and loops over keys in that split
     * retrieving the records for pre-selected columns from HBase.
     * <p>
     * For every key a {@link Get} restricted to the configured column families is issued
     * through {@link DocSchema#get}; keys for which nothing is returned are skipped.
     *
     * @author cottom1
     *
     */
    private static class KeyRowReader extends RecordReader<ImmutableBytesWritable, Result> {
        private Result result;
        private ImmutableBytesWritable key;

        private DocSchema table;
        private boolean logOn = false;
        private TreeSet<String> scanFamilies = new TreeSet<String>();
        private KeyListSplit split;
        private int ndx;

        /**
         * Binds the reader to a {@link KeyListSplit} and loads the scan-family and logging
         * settings from the task configuration. Safe to call more than once: the position is
         * reset, the settings are reloaded, and the table wrapper is only re-created when the
         * table name changes.
         *
         * @param arg0 the split to read; must be a {@link KeyListSplit}
         * @param arg1 task context supplying the configuration
         */
        @Override
        public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException {
            KeyListSplit newSplit = (KeyListSplit) arg0;
            String newTable = newSplit.getTableName();
            String currentTable = (split == null) ? null : split.getTableName();
            boolean sameTable = (newTable == null) ? (currentTable == null) : newTable.equals(currentTable);

            // initialize() can be invoked repeatedly on one reader (createRecordReader calls it and
            // the framework may call it again); avoid building a second DocSchema for the same table.
            // NOTE(review): DocSchema presumably wraps an HBase table handle - confirm.
            if (table == null || !sameTable) {
                table = new DocSchema(newTable);
            }

            split = newSplit;
            ndx = 0;
            result = null;
            key = null;

            // Set the families retrieved by scan
            scanFamilies.clear();
            String value = arg1.getConfiguration().get(SCAN_FAMILIES);
            if (value != null) {
                for (String family : StringUtils.split(value)) {
                    scanFamilies.add(family);
                }
            }

            // Enable / disable logging
            logOn = false;
            value = arg1.getConfiguration().get(LOG_ON);
            if (value != null) {
                logOn = Boolean.parseBoolean(value);
            }
        }

        /** @return the row key of the record found by the last successful {@link #nextKeyValue()}, else null */
        @Override
        public ImmutableBytesWritable getCurrentKey() {
            return key;
        }

        /** @return the record found by the last successful {@link #nextKeyValue()}, else null */
        @Override
        public Result getCurrentValue() {
            return result;
        }

        /**
         * Advances through the split's keys until one yields a record.
         *
         * @return true if a record was loaded, false when the keys are exhausted
         */
        @Override
        public boolean nextKeyValue() throws IOException {
            result = null;
            key = null;

            List<String> keys = split.getKeys();
            while ((ndx < keys.size()) && result == null) {
                String stringKey = keys.get(ndx);
                if (logOn) {
                    LOG.info("Search for record with key:" + stringKey);
                }

                byte[] byteKey = Bytes.toBytes(stringKey);
                Get get = new Get(byteKey);

                // Add scan families
                for (String family : scanFamilies) {
                    if (logOn) {
                        LOG.info("... add get family(" + family + ")");
                    }
                    // Same UTF-8 encoding as the row key; String.getBytes() would use the platform charset.
                    get.addFamily(Bytes.toBytes(family));
                }

                Result fetched = table.get(get);
                // The HBase client reports a missing row as an empty Result rather than null;
                // treat both as "not found" so the loop moves on to the next key.
                if (fetched != null && fetched.isEmpty()) {
                    fetched = null;
                }
                result = fetched;
                if (result != null) {
                    key = new ImmutableBytesWritable(byteKey);
                }

                if (logOn) {
                    if (result == null) {
                        LOG.info("Unable to find record:" + stringKey);
                    } else {
                        LOG.info("Loaded record:" + stringKey);
                    }
                }
                ++ndx;
            }
            return result != null;
        }

        /** @return fraction of the split's keys already visited, 0 before the first key */
        @Override
        public float getProgress() {
            if (ndx == 0 || split == null) {
                return (float) 0.0;
            }
            return ndx / (float) split.getKeys().size();
        }

        /**
         * Currently a no-op.
         * NOTE(review): if DocSchema holds an HBase table handle it should probably be released
         * here - confirm against DocSchema's API.
         */
        @Override
        public void close() throws IOException {
        }
    }

    /**
     * Builds the record reader for one split and initializes it before handing it back.
     * <p>
     * NOTE(review): the Hadoop framework is documented to call {@code initialize} on the
     * returned reader as well, so the reader ends up initialized twice - confirm this is intended.
     *
     * @param arg0 the split to read
     * @param arg1 the task attempt context
     * @return an initialized reader over the split's keys
     */
    @Override
    public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit arg0, TaskAttemptContext arg1)
            throws IOException, InterruptedException {
        final RecordReader<ImmutableBytesWritable, Result> rowReader = new KeyRowReader();
        rowReader.initialize(arg0, arg1);
        return rowReader;
    }

    /**
     * Reports every input file as splittable, regardless of the context or file name.
     *
     * @param context the job context (unused)
     * @param filename the input file (unused)
     * @return always {@code true}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return true;
    }

    /**
     * Looks up the region location of every key and returns the distinct host names.
     *
     * @param connection HBase connection used for the region lookups
     * @param tableName name of the table the keys belong to
     * @param keys row keys to locate
     * @return the sorted, de-duplicated host names serving the given keys; empty if there are no keys
     * @throws IOException if a region lookup fails
     */
    public static String[] getLocations(HConnection connection, String tableName, List<String> keys)
            throws IOException {
        if (keys.isEmpty()) {
            return new String[0];
        }

        TreeSet<String> hosts = new TreeSet<String>();

        // Encode with Bytes.toBytes (UTF-8), the same encoding the record reader uses for row keys;
        // String.getBytes() depends on the platform charset and could locate the wrong region for
        // non-ASCII names. The table name is encoded once rather than per key.
        final byte[] tableBytes = Bytes.toBytes(tableName);

        for (String key : keys) {
            // NOTE(review): the 'true' argument forces a location reload for every key - confirm this is needed.
            HRegionLocation region = connection.getRegionLocation(tableBytes, Bytes.toBytes(key), true);
            String hostName = region.getServerAddress().getHostname();
            if (LOG.isDebugEnabled()) {
                LOG.debug("Key(" + key + ") found on host (" + hostName + ")");
            }
            hosts.add(hostName);
        }
        return hosts.toArray(new String[hosts.size()]);
    }

    // private HConnection getHConnection() throws ZooKeeperConnectionException
    // {
    //
    // Configuration conf = HBaseConfiguration.create();
    // return HConnectionManager.getConnection(conf);
    // }

    /**
     * Builds the splits for the job from the key-list files given as input paths.
     * <p>
     * Every input path must be an existing, non-directory file. Each line is trimmed; blank
     * lines and lines starting with {@code #} are ignored. The remaining keys are
     * de-duplicated and sorted, then cut into consecutive chunks of the configured split size
     * (default 200). If that would produce more than the configured maximum number of splits
     * (default 100), the split size is raised so the maximum is respected.
     *
     * @param context job context supplying the configuration and input paths
     * @return one {@link KeyListSplit} per chunk of keys; empty when no keys were found
     * @throws IOException if the table name is not configured, an input path is not a valid
     *         file, or reading a file / locating a region fails
     */
    @SuppressWarnings("deprecation")
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        List<InputSplit> splits = new ArrayList<InputSplit>();
        TreeSet<String> uniqueKeys = new TreeSet<String>();

        FileSystem fs = FileSystem.get(context.getConfiguration());

        final String tableName = context.getConfiguration().get(JobConfig.TABLE_CONF);
        if (tableName == null || tableName.length() == 0) {
            throw new IOException("HBase table name was not provided");
        }
        int splitSize = Integer.parseInt(context.getConfiguration().get(SPLIT_SIZE, DEFAULT_SPLIT_SIZE));

        final int maxSplits = Integer.parseInt(context.getConfiguration().get(MAX_SPLITS, DEFAULT_MAX_SPLITS));

        Path[] files = getInputPaths(context);
        for (Path file : files) {
            if (fs.isDirectory(file) || !fs.exists(file)) {
                throw new IOException("Not a valid key list file: " + file.toString());
            }
            loadKeys(fs, file, uniqueKeys);
        }

        List<String> keys = new ArrayList<String>(uniqueKeys);

        if (keys.size() > 0) {
            HConnection conn = HConnectionManager.getConnection(context.getConfiguration());

            int numSplits = Math.max((int) Math.ceil(keys.size() / (splitSize * 1.0)), 1);
            if (numSplits > maxSplits) {
                // Too many splits: grow the split size so that at most maxSplits are produced.
                splitSize = (int) Math.ceil(keys.size() / (maxSplits * 1.0));
                LOG.info("Overriding split size with (" + splitSize + ") to maintain max splits of (" + maxSplits
                        + ")");
                numSplits = Math.max((int) Math.ceil(keys.size() / (splitSize * 1.0)), 1);
            }

            LOG.info("There are (" + keys.size() + ") total keys. Split size(" + splitSize + ")  Num splits("
                    + numSplits + ")");
            int startNdx = 0;
            for (int i = 0; i < (numSplits - 1); ++i) {
                int endNdx = startNdx + splitSize;
                splits.add(createSplit(conn, tableName, keys, startNdx, endNdx));
                startNdx = endNdx;
            }
            // Add last split (takes whatever remains, which may be fewer than splitSize keys)
            splits.add(createSplit(conn, tableName, keys, startNdx, keys.size()));
        }
        return splits;
    }

    /**
     * Reads one key-list file and adds its keys (trimmed, skipping blank and '#' lines) to the given set.
     */
    private static void loadKeys(FileSystem fs, Path file, TreeSet<String> into) throws IOException {
        InputStream is = null;
        BufferedReader reader = null;
        try {
            is = fs.open(file);
            // Decode explicitly as UTF-8: the keys are later turned into row-key bytes with
            // Bytes.toBytes (UTF-8), so relying on the platform default charset could corrupt
            // non-ASCII keys.
            reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line = null;
            while ((line = reader.readLine()) != null) {
                String key = line.trim();
                if (key.length() > 0 && !key.startsWith("#")) {
                    into.add(key);
                }
            }
        } finally {
            // Close the outer reader first, then the raw stream in case the reader was never created.
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Creates the split covering {@code keys[startNdx, endNdx)} together with the hosts
     * serving those keys.
     *
     * @param connection HBase connection used to locate the keys
     * @param tableName table the keys belong to
     * @param keys the full key list
     * @param startNdx index of the first key in the split (inclusive)
     * @param endNdx index just past the last key in the split (exclusive)
     * @return the split for that key range
     * @throws IOException if locating the keys fails
     */
    private KeyListSplit createSplit(HConnection connection, String tableName, List<String> keys, int startNdx,
            int endNdx) throws IOException {
        final List<String> window = keys.subList(startNdx, endNdx);
        return new KeyListSplit(tableName, window, getLocations(connection, tableName, window));
    }

}