Java tutorial: HBaseIndexerMapper
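The listing below walks through HBaseIndexerMapper from NGDATA's hbase-indexer project: a MapReduce TableMapper that converts HBase Result objects into Solr input documents, either writing them directly to Solr (in "cloud" or "classic" mode) or emitting them for the MapReduce framework to handle. Short illustrative comments have been added at a few points; they are marked as sketches and are not part of the original source.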
/*
 * Copyright 2013 NGDATA nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ngdata.hbaseindexer.mr;

import static com.ngdata.hbaseindexer.indexer.SolrServerFactory.createHttpSolrServers;
import static com.ngdata.hbaseindexer.indexer.SolrServerFactory.createSharder;
import static com.ngdata.hbaseindexer.util.solr.SolrConnectionParamUtil.getSolrMaxConnectionsPerRoute;
import static com.ngdata.hbaseindexer.util.solr.SolrConnectionParamUtil.getSolrMaxConnectionsTotal;
import static com.ngdata.hbaseindexer.util.solr.SolrConnectionParamUtil.getSolrMode;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;

import com.google.common.base.Charsets;
import com.google.common.collect.Maps;
import com.ngdata.hbaseindexer.conf.IndexerComponentFactory;
import com.ngdata.hbaseindexer.conf.IndexerComponentFactoryUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrServer;
import org.apache.solr.hadoop.SolrInputDocumentWritable;
import org.apache.solr.hadoop.SolrOutputFormat;
import org.apache.solr.hadoop.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codahale.metrics.Counting;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.SharedMetricRegistries;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.ngdata.hbaseindexer.SolrConnectionParams;
import com.ngdata.hbaseindexer.conf.IndexerConf;
import com.ngdata.hbaseindexer.conf.IndexerConf.RowReadMode;
import com.ngdata.hbaseindexer.conf.IndexerConfBuilder;
import com.ngdata.hbaseindexer.indexer.DirectSolrClassicInputDocumentWriter;
import com.ngdata.hbaseindexer.indexer.DirectSolrInputDocumentWriter;
import com.ngdata.hbaseindexer.indexer.Indexer;
import com.ngdata.hbaseindexer.indexer.ResultToSolrMapperFactory;
import com.ngdata.hbaseindexer.indexer.ResultWrappingRowData;
import com.ngdata.hbaseindexer.indexer.RowData;
import com.ngdata.hbaseindexer.indexer.Sharder;
import com.ngdata.hbaseindexer.indexer.SharderException;
import com.ngdata.hbaseindexer.indexer.SolrInputDocumentWriter;
import com.ngdata.hbaseindexer.metrics.IndexerMetricsUtil;
import com.ngdata.hbaseindexer.morphline.MorphlineResultToSolrMapper;
import com.ngdata.hbaseindexer.parse.ResultToSolrMapper;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Counter;
import com.yammer.metrics.core.Meter;
import com.yammer.metrics.core.Metric;
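// Note on dependencies: this class bridges two generations of the metrics library into Hadoop
// counters -- com.yammer.metrics (metrics 2.x) in copyIndexingMetricsToCounters() and
// com.codahale.metrics (metrics 3.x) in copyIndexingMetrics3ToCounters(), both near the end
// of the class.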
import com.yammer.metrics.core.MetricName;
import com.yammer.metrics.core.Timer;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.solr.client.solrj.SolrServer;

/**
 * Mapper for converting HBase Result objects into index documents.
 */
public class HBaseIndexerMapper extends TableMapper<Text, SolrInputDocumentWritable> {

    /** Configuration key for setting the name of the indexer. */
    public static final String INDEX_NAME_CONF_KEY = "hbase.indexer.indexname";

    /** Configuration key for setting the indexer component factory class. */
    public static final String INDEX_COMPONENT_FACTORY_KEY = "hbase.indexer.factory";

    /** Configuration key for setting the contents of the indexer config. */
    public static final String INDEX_CONFIGURATION_CONF_KEY = "hbase.indexer.configuration";

    /** Configuration key for setting the free-form index connection parameters. */
    public static final String INDEX_CONNECTION_PARAMS_CONF_KEY = "hbase.indexer.index.connectionparams";

    /** Configuration key for setting the direct write flag. */
    public static final String INDEX_DIRECT_WRITE_CONF_KEY = "hbase.indexer.directwrite";

    /** Configuration key for setting the HBase table name. */
    public static final String TABLE_NAME_CONF_KEY = "hbase.indexer.table.name";

    private static final String CONF_KEYVALUE_SEPARATOR = "=";
    private static final String CONF_VALUE_SEPARATOR = ";";

    private static final Logger LOG = LoggerFactory.getLogger(HBaseIndexerMapper.class);

    private Indexer indexer;
    private SolrInputDocumentWriter solrDocWriter;

    /**
     * Add the given index connection parameters to a Configuration.
     *
     * @param conf             the configuration in which to add the parameters
     * @param connectionParams index connection parameters
     */
    public static void configureIndexConnectionParams(Configuration conf, Map<String, String> connectionParams) {
        String confValue = Joiner.on(CONF_VALUE_SEPARATOR).withKeyValueSeparator(CONF_KEYVALUE_SEPARATOR)
                .join(connectionParams);
        conf.set(INDEX_CONNECTION_PARAMS_CONF_KEY, confValue);
    }
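    // Usage sketch (illustrative key/value names, not part of the original source): together with
    // getIndexConnectionParams() below, this round-trips a parameter map through a single
    // Configuration entry:
    //
    //   Map<String, String> params = ImmutableMap.of(
    //           "solr.zk", "zkhost:2181/solr",
    //           "solr.collection", "collection1");
    //   HBaseIndexerMapper.configureIndexConnectionParams(conf, params);
    //   // stored as "solr.zk=zkhost:2181/solr;solr.collection=collection1"
    //   Map<String, String> restored = HBaseIndexerMapper.getIndexConnectionParams(conf);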
    /**
     * Retrieve index connection parameters from a Configuration.
     *
     * @param conf configuration containing index connection parameters
     * @return index connection parameters
     */
    public static Map<String, String> getIndexConnectionParams(Configuration conf) {
        String confValue = conf.get(INDEX_CONNECTION_PARAMS_CONF_KEY);
        if (confValue == null) {
            LOG.warn("No connection parameters found in configuration");
            return ImmutableMap.of();
        }
        return Splitter.on(CONF_VALUE_SEPARATOR).withKeyValueSeparator(CONF_KEYVALUE_SEPARATOR).split(confValue);
    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        Utils.getLogConfigFile(context.getConfiguration());
        if (LOG.isTraceEnabled()) {
            LOG.trace("CWD is {}", new File(".").getCanonicalPath());
            TreeMap<String, String> map = new TreeMap<String, String>();
            for (Map.Entry<String, String> entry : context.getConfiguration()) {
                map.put(entry.getKey(), entry.getValue());
            }
            LOG.trace("Mapper configuration:\n{}", Joiner.on("\n").join(map.entrySet()));
        }

        String indexName = context.getConfiguration().get(INDEX_NAME_CONF_KEY);
        String indexerComponentFactory = context.getConfiguration().get(INDEX_COMPONENT_FACTORY_KEY);
        String indexConfiguration = context.getConfiguration().get(INDEX_CONFIGURATION_CONF_KEY);
        String tableName = context.getConfiguration().get(TABLE_NAME_CONF_KEY);

        if (indexName == null) {
            throw new IllegalStateException("No configuration value supplied for " + INDEX_NAME_CONF_KEY);
        }
        if (indexConfiguration == null) {
            throw new IllegalStateException("No configuration value supplied for " + INDEX_CONFIGURATION_CONF_KEY);
        }
        if (tableName == null) {
            throw new IllegalStateException("No configuration value supplied for " + TABLE_NAME_CONF_KEY);
        }

        Map<String, String> indexConnectionParams = getIndexConnectionParams(context.getConfiguration());
        IndexerComponentFactory factory = IndexerComponentFactoryUtil.getComponentFactory(indexerComponentFactory,
                new ByteArrayInputStream(indexConfiguration.getBytes(Charsets.UTF_8)), indexConnectionParams);
        IndexerConf indexerConf = factory.createIndexerConf();

        // Morphline settings from the job configuration override the indexer config's global params.
        String morphlineFile = context.getConfiguration().get(MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM);
        Map<String, String> params = indexerConf.getGlobalParams();
        if (morphlineFile != null) {
            params.put(MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM, morphlineFile);
        }

        String morphlineId = context.getConfiguration().get(MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM);
        if (morphlineId != null) {
            params.put(MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM, morphlineId);
        }

        // Pass all morphline variable and field settings through to the indexer.
        for (Map.Entry<String, String> entry : context.getConfiguration()) {
            if (entry.getKey().startsWith(MorphlineResultToSolrMapper.MORPHLINE_VARIABLE_PARAM + ".")) {
                params.put(entry.getKey(), entry.getValue());
            }
            if (entry.getKey().startsWith(MorphlineResultToSolrMapper.MORPHLINE_FIELD_PARAM + ".")) {
                params.put(entry.getKey(), entry.getValue());
            }
        }

        ResultToSolrMapper mapper = factory.createMapper(indexName);

        // TODO This would be better placed in the top-level job setup -- however, there isn't currently any
        // infrastructure to handle converting an in-memory model into XML (we can only interpret an
        // XML doc into the internal model), so we need to do this here for now
        if (indexerConf.getRowReadMode() != RowReadMode.NEVER) {
            LOG.warn("Changing row read mode from " + indexerConf.getRowReadMode() + " to " + RowReadMode.NEVER);
            indexerConf = new IndexerConfBuilder(indexerConf).rowReadMode(RowReadMode.NEVER).build();
        }
        indexerConf.setGlobalParams(params);

        try {
            indexer = createIndexer(indexName, context, indexerConf, tableName, mapper, indexConnectionParams);
        } catch (SharderException e) {
            throw new RuntimeException(e);
        }
    }
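    // Illustrative note (assumed property names, shown for clarity only): the morphline settings
    // read in setup() above are normally supplied as plain job configuration properties, e.g.
    //
    //   hadoop jar ... -D morphlineFile=/path/to/morphline.conf -D morphlineId=morphline1
    //
    // The authoritative property names are whatever the
    // MorphlineResultToSolrMapper.MORPHLINE_*_PARAM constants resolve to.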
    private Indexer createIndexer(String indexName, Context context, IndexerConf indexerConf, String tableName,
            ResultToSolrMapper mapper, Map<String, String> indexConnectionParams)
            throws IOException, SharderException {
        Configuration conf = context.getConfiguration();
        if (conf.getBoolean(INDEX_DIRECT_WRITE_CONF_KEY, false)) {
            String solrMode = getSolrMode(indexConnectionParams);
            if (solrMode.equals("cloud")) {
                DirectSolrInputDocumentWriter writer = createCloudSolrWriter(context, indexConnectionParams);
                solrDocWriter = wrapInBufferedWriter(context, writer);
                return Indexer.createIndexer(indexName, indexerConf, tableName, mapper, null, null, solrDocWriter);
            } else if (solrMode.equals("classic")) {
                DirectSolrClassicInputDocumentWriter classicSolrWriter = createClassicSolrWriter(context,
                        indexConnectionParams);
                Sharder sharder = createSharder(indexConnectionParams, classicSolrWriter.getNumServers());
                solrDocWriter = wrapInBufferedWriter(context, classicSolrWriter);
                return Indexer.createIndexer(indexName, indexerConf, tableName, mapper, null, sharder, solrDocWriter);
            } else {
                throw new RuntimeException(
                        "Only 'cloud' and 'classic' are valid values for solr.mode, but got " + solrMode);
            }
        } else {
            // Not writing directly to Solr: emit documents for the MapReduce framework to handle.
            solrDocWriter = new MapReduceSolrInputDocumentWriter(context);
            return Indexer.createIndexer(indexName, indexerConf, tableName, mapper, null, null, solrDocWriter);
        }
    }

    private DirectSolrInputDocumentWriter createCloudSolrWriter(Context context,
            Map<String, String> indexConnectionParams) throws IOException {
        String indexZkHost = indexConnectionParams.get(SolrConnectionParams.ZOOKEEPER);
        String collectionName = indexConnectionParams.get(SolrConnectionParams.COLLECTION);
        if (indexZkHost == null) {
            throw new IllegalStateException("No index ZK host defined");
        }
        if (collectionName == null) {
            throw new IllegalStateException("No collection name defined");
        }
        CloudSolrServer solrServer = new CloudSolrServer(indexZkHost);
        solrServer.setDefaultCollection(collectionName);
        return new DirectSolrInputDocumentWriter(context.getConfiguration().get(INDEX_NAME_CONF_KEY), solrServer);
    }

    private DirectSolrClassicInputDocumentWriter createClassicSolrWriter(Context context,
            Map<String, String> indexConnectionParams) throws IOException {
        // Pool HTTP connections across the configured Solr servers.
        PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager();
        connectionManager.setDefaultMaxPerRoute(getSolrMaxConnectionsPerRoute(indexConnectionParams));
        connectionManager.setMaxTotal(getSolrMaxConnectionsTotal(indexConnectionParams));

        HttpClient httpClient = new DefaultHttpClient(connectionManager);
        List<SolrServer> solrServers = createHttpSolrServers(indexConnectionParams, httpClient);
        return new DirectSolrClassicInputDocumentWriter(context.getConfiguration().get(INDEX_NAME_CONF_KEY),
                solrServers);
    }

    private SolrInputDocumentWriter wrapInBufferedWriter(Context context, SolrInputDocumentWriter writer)
            throws MalformedURLException {
        int bufferSize = context.getConfiguration().getInt(SolrOutputFormat.SOLR_RECORD_WRITER_BATCH_SIZE, 100);
        return new BufferedSolrInputDocumentWriter(writer, bufferSize,
                context.getCounter(HBaseIndexerCounters.OUTPUT_INDEX_DOCUMENTS),
                context.getCounter(HBaseIndexerCounters.OUTPUT_INDEX_DOCUMENT_BATCHES));
    }
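    // Illustrative note (assumed key names, shown for clarity only): with direct writes enabled,
    // the connection parameters select the Solr mode, e.g. something like
    //   solr.mode=cloud;solr.zk=zkhost:2181/solr;solr.collection=collection1
    // for cloud mode, versus an explicit list of HTTP Solr URLs for classic mode. The exact keys
    // are defined by SolrConnectionParams and SolrConnectionParamUtil, not by this class.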
    @Override
    protected void map(ImmutableBytesWritable key, Result result, Context context)
            throws IOException, InterruptedException {
        context.progress();
        context.getCounter(HBaseIndexerCounters.INPUT_ROWS).increment(1L);
        try {
            TableSplit tableSplit;
            if (context.getInputSplit() instanceof TableSplit) {
                tableSplit = (TableSplit) context.getInputSplit();
                indexer.indexRowData(
                        ImmutableList.<RowData>of(new ResultWrappingRowData(result, tableSplit.getTableName())));
            } else {
                throw new IOException("Input split not of type " + TableSplit.class + " but "
                        + context.getInputSplit().getClass());
            }
        } catch (SolrServerException e) {
            // These will only be thrown through if there is an exception on the server side.
            // Document-based errors will be swallowed and the counter will be incremented.
            throw new RuntimeException(e);
        } catch (SharderException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            solrDocWriter.close();
        } catch (SolrServerException e) {
            throw new RuntimeException(e);
        }
        copyIndexingMetricsToCounters(context);
        copyIndexingMetrics3ToCounters(context);
    }

    private void copyIndexingMetricsToCounters(Context context) {
        final String COUNTER_GROUP = "HBase Indexer Metrics";
        SortedMap<String, SortedMap<MetricName, Metric>> groupedMetrics = Metrics.defaultRegistry()
                .groupedMetrics(new IndexerMetricsUtil.IndexerMetricPredicate());
        for (Entry<String, SortedMap<MetricName, Metric>> metricsGroupEntry : groupedMetrics.entrySet()) {
            SortedMap<MetricName, Metric> metricsGroupMap = metricsGroupEntry.getValue();
            for (Entry<MetricName, Metric> metricEntry : metricsGroupMap.entrySet()) {
                MetricName metricName = metricEntry.getKey();
                Metric metric = metricEntry.getValue();
                String counterName = metricName.getType() + ": " + metricName.getName();
                if (metric instanceof Counter) {
                    Counter counter = (Counter) metric;
                    context.getCounter(COUNTER_GROUP, counterName).increment(counter.count());
                } else if (metric instanceof Meter) {
                    Meter meter = (Meter) metric;
                    context.getCounter(COUNTER_GROUP, counterName).increment(meter.count());
                } else if (metric instanceof Timer) {
                    Timer timer = (Timer) metric;
                    context.getCounter(COUNTER_GROUP, counterName).increment((long) timer.sum());
                }
            }
        }
    }

    private void copyIndexingMetrics3ToCounters(Context context) {
        for (String name : SharedMetricRegistries.names()) {
            MetricRegistry metricRegistry = SharedMetricRegistries.getOrCreate(name);
            for (Map.Entry<String, com.codahale.metrics.Counter> entry : metricRegistry.getCounters().entrySet()) {
                addCounting(context, entry.getKey(), entry.getValue(), 1);
            }
            for (Map.Entry<String, com.codahale.metrics.Histogram> entry : metricRegistry.getHistograms()
                    .entrySet()) {
                addCounting(context, entry.getKey(), entry.getValue(), 1);
            }
            for (Map.Entry<String, com.codahale.metrics.Meter> entry : metricRegistry.getMeters().entrySet()) {
                addCounting(context, entry.getKey(), entry.getValue(), 1);
            }
            for (Map.Entry<String, com.codahale.metrics.Timer> entry : metricRegistry.getTimers().entrySet()) {
                long nanosPerMilliSec = 1000 * 1000;
                addCounting(context, entry.getKey(), entry.getValue(), nanosPerMilliSec);
            }
        }
    }

    private void addCounting(Context context, String metricName, Counting value, long scale) {
        final String COUNTER_GROUP = "HBase Indexer Metrics";
        context.getCounter(COUNTER_GROUP, metricName).increment(value.getCount() / scale);
    }
}
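To see how the pieces fit together, here is a minimal driver sketch that wires the mapper into a map-only job. It is an illustration, not the project's actual driver (the real entry point in hbase-indexer is HBaseMapReduceIndexerTool): the table name, ZooKeeper address, collection name, and the inline indexer XML are placeholders, and the connection-parameter keys ("solr.mode", "solr.zk", "solr.collection") are assumptions based on how the class reads them via SolrConnectionParams.

// Hypothetical driver sketch (placeholder names and assumed connection-parameter keys;
// not the project's real entry point).
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.solr.hadoop.SolrInputDocumentWritable;

public class HBaseIndexerJobSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();

        // Identify the indexer and the HBase table to scan (placeholder values).
        conf.set(HBaseIndexerMapper.INDEX_NAME_CONF_KEY, "myindexer");
        conf.set(HBaseIndexerMapper.TABLE_NAME_CONF_KEY, "mytable");

        // The indexer config is shipped inline as a string; a real config goes here.
        conf.set(HBaseIndexerMapper.INDEX_CONFIGURATION_CONF_KEY,
                "<indexer table=\"mytable\">...</indexer>");

        // Write straight to Solr from the mappers in cloud mode; the parameter keys
        // below are assumptions about what SolrConnectionParams defines.
        conf.setBoolean(HBaseIndexerMapper.INDEX_DIRECT_WRITE_CONF_KEY, true);
        Map<String, String> connectionParams = ImmutableMap.of(
                "solr.mode", "cloud",
                "solr.zk", "zkhost:2181/solr",
                "solr.collection", "collection1");
        HBaseIndexerMapper.configureIndexConnectionParams(conf, connectionParams);

        Job job = Job.getInstance(conf, "hbase-indexer-sketch");
        job.setJarByClass(HBaseIndexerJobSketch.class);

        // Map-only job over the full table; documents go to Solr directly,
        // so no reducer or file output is needed.
        TableMapReduceUtil.initTableMapperJob("mytable", new Scan(), HBaseIndexerMapper.class,
                Text.class, SolrInputDocumentWritable.class, job);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(NullOutputFormat.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that the sketch omits HBaseIndexerMapper.INDEX_COMPONENT_FACTORY_KEY; setup() tolerates a missing factory name and hands null to IndexerComponentFactoryUtil.getComponentFactory, which is then responsible for choosing a default.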