com.nearinfinity.honeycomb.hbase.bulkload.BulkLoadMapper.java Source code

Introduction

Here is the source code for com.nearinfinity.honeycomb.hbase.bulkload.BulkLoadMapper.java, a Hadoop MapReduce Mapper from the Honeycomb HBase backend. It parses delimited text lines into Honeycomb Row objects and emits the resulting HBase Put mutations, keyed by row key, for bulk loading. A hypothetical driver sketch showing how the required configuration might be supplied follows the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * 
 * Copyright 2013 Near Infinity Corporation.
 */

package com.nearinfinity.honeycomb.hbase.bulkload;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.nearinfinity.honeycomb.hbase.HBaseMetadata;
import com.nearinfinity.honeycomb.hbase.HBaseStore;
import com.nearinfinity.honeycomb.hbase.MetadataCache;
import com.nearinfinity.honeycomb.hbase.MutationFactory;
import com.nearinfinity.honeycomb.hbase.config.ConfigConstants;
import com.nearinfinity.honeycomb.mysql.Row;
import com.nearinfinity.honeycomb.mysql.schema.ColumnSchema;
import com.nearinfinity.honeycomb.mysql.schema.TableSchema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.text.ParseException;
import java.util.List;
import java.util.Set;

import static com.google.common.base.Preconditions.checkNotNull;
import static java.lang.String.format;

/**
 * Mapper for loading large amounts of data into Honeycomb format.
 */
public class BulkLoadMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    static final Logger LOG = Logger.getLogger(BulkLoadMapper.class);
    private RowParser rowParser;
    private String[] columns;
    private long tableId;
    private TableSchema schema;
    private MutationFactory mutationFactory;

    private static final String SQL_TABLE = "honeycomb.sql.table";
    private static final String SQL_COLUMNS = "honeycomb.sql.columns";
    private static final String SEPARATOR = "importtsv.separator";

    private static final String NOT_SET_ERROR = " not set.  Job will fail.";

    /** Reads the job configuration, validates the required settings, and initializes the row parser and mutation factory. */
    @Override
    protected void setup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();

        char separator = conf.get(SEPARATOR, " ").charAt(0);
        columns = conf.getStrings(SQL_COLUMNS);
        String sqlTable = conf.get(SQL_TABLE);
        String hbaseTable = conf.get(ConfigConstants.TABLE_NAME);
        String columnFamily = conf.get(ConfigConstants.COLUMN_FAMILY);

        // Check that necessary configuration variables are set
        checkNotNull(conf.get(HConstants.ZOOKEEPER_QUORUM), HConstants.ZOOKEEPER_QUORUM + NOT_SET_ERROR);
        checkNotNull(sqlTable, SQL_TABLE + NOT_SET_ERROR);
        checkNotNull(columns, SQL_COLUMNS + NOT_SET_ERROR);
        checkNotNull(hbaseTable, ConfigConstants.TABLE_NAME + NOT_SET_ERROR);
        checkNotNull(columnFamily, ConfigConstants.COLUMN_FAMILY + NOT_SET_ERROR);

        LOG.info("Zookeeper: " + conf.get(HConstants.ZOOKEEPER_QUORUM));
        LOG.info("SQL Table: " + sqlTable);
        LOG.info("HBase Table: " + hbaseTable);
        LOG.info("HBase Column Family: " + columnFamily);
        LOG.info("Input separator: '" + separator + "'");

        final HTablePool pool = new HTablePool(conf, 1);
        HBaseMetadata metadata = new HBaseMetadata(new PoolHTableProvider(hbaseTable, pool));
        metadata.setColumnFamily(columnFamily);
        HBaseStore store = new HBaseStore(metadata, null, new MetadataCache(metadata));

        tableId = store.getTableId(sqlTable);
        schema = store.getSchema(sqlTable);
        mutationFactory = new MutationFactory(store);
        mutationFactory.setColumnFamily(columnFamily);

        rowParser = new RowParser(schema, columns, separator);

        checkSqlColumnsMatch(sqlTable);
    }

    /** Verifies that every configured input column exists in the table schema, failing fast if any do not. */
    private void checkSqlColumnsMatch(String sqlTable) {
        Set<String> expectedColumns = Sets.newHashSet();
        for (ColumnSchema column : schema.getColumns()) {
            expectedColumns.add(column.getColumnName());
        }
        List<String> invalidColumns = Lists.newLinkedList();
        for (String column : columns) {
            if (!expectedColumns.contains(column)) {
                LOG.error("Found non-existent column " + column);
                invalidColumns.add(column);
            }
        }
        if (invalidColumns.size() > 0) {
            String expectedColumnString = Joiner.on(",").join(expectedColumns);
            String invalidColumnString = Joiner.on(",").join(invalidColumns);
            String message = String.format(
                    "In table %s the following columns (%s) are not valid. Expected columns: (%s)",
                    sqlTable, invalidColumnString, expectedColumnString);
            throw new IllegalStateException(message);
        }
    }

    /** Parses one input line into a Row and writes the resulting Puts; failed lines are counted instead of failing the job. */
    @Override
    public void map(LongWritable offset, Text line, Context context) {
        try {
            Row row = rowParser.parseRow(line.toString());

            List<Put> puts = mutationFactory.insert(tableId, row);

            for (Put put : puts) {
                context.write(new ImmutableBytesWritable(put.getRow()), put);
            }

            context.getCounter(Counters.ROWS).increment(1);
            context.getCounter(Counters.PUTS).increment(puts.size());

        } catch (IOException e) {
            LOG.error("CSVParser unable to parse line: " + line.toString(), e);
            context.getCounter(Counters.FAILED_ROWS).increment(1);
        } catch (IllegalArgumentException e) {
            LOG.error(format("The line %s was incorrectly formatted. Error %s", line.toString(), e.getMessage()));
            context.getCounter(Counters.FAILED_ROWS).increment(1);
        } catch (ParseException e) {
            LOG.error(format("Parsing failed on line %s with message %s", line.toString(), e.getMessage()));
            context.getCounter(Counters.FAILED_ROWS).increment(1);
        } catch (Exception e) {
            LOG.error(format("The following error %s occurred during mapping" + " for line %s", e.getMessage(),
                    line.toString()));
            context.getCounter(Counters.FAILED_ROWS).increment(1);
        }
    }

    /**
     * Counters for the map / reduce job.
     */
    public enum Counters {
        /**
         * Number of rows successfully processed by the job
         */
        ROWS,
        /**
         * Number of rows that failed during processing
         */
        FAILED_ROWS,
        /**
         * Total number of {@link Put}s generated by the job
         */
        PUTS
    }
}
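
Example usage

The mapper reads all of its settings from the job configuration in setup(), so a driver must populate them before the job is submitted. Below is a minimal, hypothetical driver sketch; the class name BulkLoadDriver, the table names, the column family value, and the input/output paths are assumptions for illustration and are not part of Honeycomb. It wires the mapper's (ImmutableBytesWritable, Put) output into HBase's HFileOutputFormat so the job produces HFiles suitable for bulk loading.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.nearinfinity.honeycomb.hbase.bulkload.BulkLoadMapper;
import com.nearinfinity.honeycomb.hbase.config.ConfigConstants;

public class BulkLoadDriver {
    public static void main(String[] args) throws Exception {
        // hbase-site.xml on the classpath supplies hbase.zookeeper.quorum, which setup() checks.
        Configuration conf = HBaseConfiguration.create();

        // Keys read by BulkLoadMapper.setup(); the values here are placeholders.
        conf.set("honeycomb.sql.table", "my_sql_table");          // SQL table name (hypothetical)
        conf.setStrings("honeycomb.sql.columns", "c1", "c2");     // column order of the input file (hypothetical)
        conf.set("importtsv.separator", ",");                     // field separator used by the row parser
        conf.set(ConfigConstants.TABLE_NAME, "honeycomb_table");  // HBase table name (hypothetical)
        conf.set(ConfigConstants.COLUMN_FAMILY, "cf");            // HBase column family (hypothetical)

        Job job = new Job(conf, "Honeycomb bulk load");
        job.setJarByClass(BulkLoadDriver.class);
        job.setMapperClass(BulkLoadMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));     // delimited input files
        FileOutputFormat.setOutputPath(job, new Path(args[1]));   // directory for generated HFiles

        // Sort and partition the Puts so the output HFiles line up with the target table's regions.
        HTable table = new HTable(conf, conf.get(ConfigConstants.TABLE_NAME));
        HFileOutputFormat.configureIncrementalLoad(job, table);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

After the job finishes, the generated HFiles still have to be handed to the region servers, for example with HBase's completebulkload tool (LoadIncrementalHFiles).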