Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.indexwriter.elastic2; import static org.elasticsearch.node.NodeBuilder.nodeBuilder; import java.io.BufferedReader; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.net.InetAddress; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.indexer.IndexWriter; import org.apache.nutch.indexer.NutchDocument; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.ListenableActionFuture; import org.elasticsearch.action.bulk.BulkItemResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.delete.DeleteRequestBuilder; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.client.Client; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings.Builder; import org.elasticsearch.common.transport.InetSocketTransportAddress; import org.elasticsearch.node.Node; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** */ public class ElasticIndexWriter implements IndexWriter { public static Logger LOG = LoggerFactory.getLogger(ElasticIndexWriter.class); private static final int DEFAULT_MAX_BULK_DOCS = 250; private static final int DEFAULT_MAX_BULK_LENGTH = 2500500; private Client client; private Node node; private String defaultIndex; private Configuration config; private BulkRequestBuilder bulk; private ListenableActionFuture<BulkResponse> execute; private int port = -1; private String host = null; private String clusterName = null; private int maxBulkDocs; private int maxBulkLength; private long indexedDocs = 0; private int bulkDocs = 0; private int bulkLength = 0; private boolean createNewBulk = false; @Override public void open(Configuration job) throws IOException { clusterName = job.get(ElasticConstants.CLUSTER); host = job.get(ElasticConstants.HOST); port = job.getInt(ElasticConstants.PORT, 9300); Builder settingsBuilder = Settings.builder(); BufferedReader reader = new BufferedReader(job.getConfResourceAsReader("elasticsearch.conf")); String line; String parts[]; while ((line = reader.readLine()) != null) { if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { line.trim(); parts = line.split("="); if (parts.length == 2) { settingsBuilder.put(parts[0].trim(), parts[1].trim()); } } } if (StringUtils.isNotBlank(clusterName)) settingsBuilder.put("cluster.name", clusterName); // Set the cluster name and build the settings Settings settings = settingsBuilder.build(); // Prefer TransportClient if (host != null && port > 1) { client = TransportClient.builder().settings(settings).build() .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port)); } else if (clusterName != null) { node = nodeBuilder().settings(settings).client(true).node(); client = node.client(); } bulk = client.prepareBulk(); defaultIndex = job.get(ElasticConstants.INDEX, "nutch"); maxBulkDocs = job.getInt(ElasticConstants.MAX_BULK_DOCS, DEFAULT_MAX_BULK_DOCS); maxBulkLength = job.getInt(ElasticConstants.MAX_BULK_LENGTH, DEFAULT_MAX_BULK_LENGTH); } @Override public void write(NutchDocument doc) throws IOException { String id = (String) doc.getFieldValue("id"); String type = doc.getDocumentMeta().get("type"); if (type == null) type = "doc"; IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id); Map<String, Object> source = new HashMap<String, Object>(); // Loop through all fields of this doc for (String fieldName : doc.getFieldNames()) { if (doc.getFieldValues(fieldName).size() > 1) { source.put(fieldName, doc.getFieldValue(fieldName)); // Loop through the values to keep track of the size of this document for (Object value : doc.getFieldValues(fieldName)) { bulkLength += value.toString().length(); } } else { source.put(fieldName, doc.getFieldValue(fieldName)); bulkLength += doc.getFieldValue(fieldName).toString().length(); } } request.setSource(source); // Add this indexing request to a bulk request bulk.add(request); indexedDocs++; bulkDocs++; if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) { LOG.info("Processing bulk request [docs = {}, length = {}, total docs = {}" + ", last doc in bulk = '{}']", bulkDocs, bulkLength, indexedDocs, id); // Flush the bulk of indexing requests createNewBulk = true; commit(); } } @Override public void delete(String key) throws IOException { try { DeleteRequestBuilder builder = client.prepareDelete(); builder.setIndex(defaultIndex); builder.setType("doc"); builder.setId(key); builder.execute().actionGet(); } catch (ElasticsearchException e) { throw makeIOException(e); } } public static IOException makeIOException(ElasticsearchException e) { final IOException ioe = new IOException(); ioe.initCause(e); return ioe; } @Override public void update(NutchDocument doc) throws IOException { write(doc); } @Override public void commit() throws IOException { if (execute != null) { // wait for previous to finish long beforeWait = System.currentTimeMillis(); BulkResponse actionGet = execute.actionGet(); if (actionGet.hasFailures()) { for (BulkItemResponse item : actionGet) { if (item.isFailed()) { throw new RuntimeException("First failure in bulk: " + item.getFailureMessage()); } } } long msWaited = System.currentTimeMillis() - beforeWait; LOG.info("Previous took in ms {}, including wait {}", actionGet.getTookInMillis(), msWaited); execute = null; } if (bulk != null) { if (bulkDocs > 0) { // start a flush, note that this is an asynchronous call execute = bulk.execute(); } bulk = null; } if (createNewBulk) { // Prepare a new bulk request bulk = client.prepareBulk(); bulkDocs = 0; bulkLength = 0; } } @Override public void close() throws IOException { // Flush pending requests LOG.info("Processing remaining requests [docs = {}, length = {}, total docs = {}]", bulkDocs, bulkLength, indexedDocs); createNewBulk = false; commit(); // flush one more time to finalize the last bulk LOG.info("Processing to finalize last execute"); createNewBulk = false; commit(); // Close client.close(); if (node != null) { node.close(); } } @Override public String describe() { StringBuffer sb = new StringBuffer("ElasticIndexWriter\n"); sb.append("\t").append(ElasticConstants.CLUSTER).append(" : elastic prefix cluster\n"); sb.append("\t").append(ElasticConstants.HOST).append(" : hostname\n"); sb.append("\t").append(ElasticConstants.PORT).append(" : port (default 9300)\n"); sb.append("\t").append(ElasticConstants.INDEX).append(" : elastic index command \n"); sb.append("\t").append(ElasticConstants.MAX_BULK_DOCS) .append(" : elastic bulk index doc counts. (default 250) \n"); sb.append("\t").append(ElasticConstants.MAX_BULK_LENGTH) .append(" : elastic bulk index length. (default 2500500 ~2.5MB)\n"); return sb.toString(); } @Override public void setConf(Configuration conf) { config = conf; String cluster = conf.get(ElasticConstants.CLUSTER); String host = conf.get(ElasticConstants.HOST); if (StringUtils.isBlank(cluster) && StringUtils.isBlank(host)) { String message = "Missing elastic.cluster and elastic.host. At least one of them should be set in nutch-site.xml "; message += "\n" + describe(); LOG.error(message); throw new RuntimeException(message); } } @Override public Configuration getConf() { return config; } }