org.apache.nutch.indexwriter.elasticrest.ElasticRestIndexWriter.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.indexwriter.elasticrest.ElasticRestIndexWriter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//TODO refactor the dependencies out of root ivy file

package org.apache.nutch.indexwriter.elasticrest;

import io.searchbox.client.JestClient;
import io.searchbox.client.JestClientFactory;
import io.searchbox.client.JestResult;
import io.searchbox.client.JestResultHandler;
import io.searchbox.client.config.HttpClientConfig;
import io.searchbox.core.Bulk;
import io.searchbox.core.BulkResult;
import io.searchbox.core.Delete;
import io.searchbox.core.Index;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.http.concurrent.BasicFuture;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.nio.conn.SchemeIOSessionStrategy;
import org.apache.http.nio.conn.ssl.SSLIOSessionStrategy;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.IndexWriterParams;
import org.apache.nutch.indexer.NutchDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.Date;
import java.util.concurrent.ExecutionException;

/**
 */
public class ElasticRestIndexWriter implements IndexWriter {
    public static Logger LOG = LoggerFactory.getLogger(ElasticRestIndexWriter.class);

    private static final int DEFAULT_MAX_BULK_DOCS = 250;
    private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
    private static final String DEFAULT_SEPARATOR = "_";
    private static final String DEFAULT_SINK = "others";

    private JestClient client;
    private String defaultIndex;
    private String defaultType = null;

    private Configuration config;

    private Bulk.Builder bulkBuilder;
    private int port = -1;
    private String host = null;
    private Boolean https = null;
    private String user = null;
    private String password = null;
    private Boolean trustAllHostnames = null;

    private int maxBulkDocs;
    private int maxBulkLength;
    private long indexedDocs = 0;
    private int bulkDocs = 0;
    private int bulkLength = 0;
    private boolean createNewBulk = false;
    private long millis;
    private BasicFuture<JestResult> basicFuture = null;

    private String[] languages = null;
    private String separator = null;
    private String sink = null;

    @Override
    public void open(Configuration conf, String name) throws IOException {
        //Implementation not required
    }

    @Override
    public void open(IndexWriterParams parameters) throws IOException {
        host = parameters.get(ElasticRestConstants.HOST);
        if (StringUtils.isBlank(host)) {
            String message = "Missing host. It should be set in index-writers.xml";
            message += "\n" + describe();
            LOG.error(message);
            throw new RuntimeException(message);
        }

        port = parameters.getInt(ElasticRestConstants.PORT, 9200);
        user = parameters.get(ElasticRestConstants.USER);
        password = parameters.get(ElasticRestConstants.PASSWORD);
        https = parameters.getBoolean(ElasticRestConstants.HTTPS, false);
        trustAllHostnames = parameters.getBoolean(ElasticRestConstants.HOSTNAME_TRUST, false);

        languages = parameters.getStrings(ElasticRestConstants.LANGUAGES);
        separator = parameters.get(ElasticRestConstants.SEPARATOR, DEFAULT_SEPARATOR);
        sink = parameters.get(ElasticRestConstants.SINK, DEFAULT_SINK);

        // trust ALL certificates
        SSLContext sslContext = null;
        try {
            sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {
                public boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
                    return true;
                }
            }).build();
        } catch (NoSuchAlgorithmException | KeyManagementException | KeyStoreException e) {
            LOG.error("Failed to instantiate sslcontext object: \n{}", ExceptionUtils.getStackTrace(e));
            throw new SecurityException();
        }

        // skip hostname checks
        HostnameVerifier hostnameVerifier = null;
        if (trustAllHostnames) {
            hostnameVerifier = NoopHostnameVerifier.INSTANCE;
        } else {
            hostnameVerifier = new DefaultHostnameVerifier();
        }

        SSLConnectionSocketFactory sslSocketFactory = new SSLConnectionSocketFactory(sslContext);
        SchemeIOSessionStrategy httpsIOSessionStrategy = new SSLIOSessionStrategy(sslContext, hostnameVerifier);

        JestClientFactory jestClientFactory = new JestClientFactory();
        URL urlOfElasticsearchNode = new URL(https ? "https" : "http", host, port, "");

        if (host != null && port > 1) {
            HttpClientConfig.Builder builder = new HttpClientConfig.Builder(urlOfElasticsearchNode.toString())
                    .multiThreaded(true).connTimeout(300000).readTimeout(300000);
            if (https) {
                if (user != null && password != null) {
                    builder.defaultCredentials(user, password);
                }
                builder.defaultSchemeForDiscoveredNodes("https").sslSocketFactory(sslSocketFactory) // this only affects sync calls
                        .httpsIOSessionStrategy(httpsIOSessionStrategy); // this only affects async calls
            }
            jestClientFactory.setHttpClientConfig(builder.build());
        } else {
            throw new IllegalStateException(
                    "No host or port specified. Please set the host and port in nutch-site.xml");
        }

        client = jestClientFactory.getObject();

        defaultIndex = parameters.get(ElasticRestConstants.INDEX, "nutch");
        defaultType = parameters.get(ElasticRestConstants.TYPE, "doc");

        maxBulkDocs = parameters.getInt(ElasticRestConstants.MAX_BULK_DOCS, DEFAULT_MAX_BULK_DOCS);
        maxBulkLength = parameters.getInt(ElasticRestConstants.MAX_BULK_LENGTH, DEFAULT_MAX_BULK_LENGTH);

        bulkBuilder = new Bulk.Builder().defaultIndex(defaultIndex).defaultType(defaultType);
    }

    private static Object normalizeValue(Object value) {
        if (value == null) {
            return null;
        }

        if (value instanceof Map || value instanceof Date) {
            return value;
        }

        return value.toString();
    }

    @Override
    public void write(NutchDocument doc) throws IOException {
        String id = (String) doc.getFieldValue("id");
        String type = doc.getDocumentMeta().get("type");
        if (type == null) {
            type = defaultType;
        }

        Map<String, Object> source = new HashMap<String, Object>();

        // Loop through all fields of this doc
        for (String fieldName : doc.getFieldNames()) {
            Set<Object> allFieldValues = new LinkedHashSet<>(doc.getField(fieldName).getValues());

            if (allFieldValues.size() > 1) {
                Object[] normalizedFieldValues = allFieldValues.stream().map(ElasticRestIndexWriter::normalizeValue)
                        .toArray();

                // Loop through the values to keep track of the size of this document
                for (Object value : normalizedFieldValues) {
                    bulkLength += value.toString().length();
                }

                source.put(fieldName, normalizedFieldValues);
            } else if (allFieldValues.size() == 1) {
                Object normalizedFieldValue = normalizeValue(allFieldValues.iterator().next());
                source.put(fieldName, normalizedFieldValue);
                bulkLength += normalizedFieldValue.toString().length();
            }
        }

        String index;
        if (languages != null && languages.length > 0) {
            String language = (String) doc.getFieldValue("lang");
            boolean exists = false;
            for (String lang : languages) {
                if (lang.equals(language)) {
                    exists = true;
                    break;
                }
            }
            if (exists) {
                index = getLanguageIndexName(language);
            } else {
                index = getSinkIndexName();
            }
        } else {
            index = defaultIndex;
        }
        Index indexRequest = new Index.Builder(source).index(index).type(type).id(id).build();

        // Add this indexing request to a bulk request
        bulkBuilder.addAction(indexRequest);

        indexedDocs++;
        bulkDocs++;

        if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
            LOG.info("Processing bulk request [docs = {}, length = {}, total docs = {}, last doc in bulk = '{}']",
                    bulkDocs, bulkLength, indexedDocs, id);
            // Flush the bulk of indexing requests
            createNewBulk = true;
            commit();
        }
    }

    @Override
    public void delete(String key) throws IOException {
        try {
            if (languages != null && languages.length > 0) {
                Bulk.Builder bulkBuilder = new Bulk.Builder().defaultType(defaultType);
                for (String lang : languages) {
                    bulkBuilder.addAction(
                            new Delete.Builder(key).index(getLanguageIndexName(lang)).type(defaultType).build());
                }
                bulkBuilder.addAction(new Delete.Builder(key).index(getSinkIndexName()).type(defaultType).build());
                client.execute(bulkBuilder.build());
            } else {
                client.execute(new Delete.Builder(key).index(defaultIndex).type(defaultType).build());
            }
        } catch (IOException e) {
            LOG.error(ExceptionUtils.getStackTrace(e));
            throw e;
        }
    }

    @Override
    public void update(NutchDocument doc) throws IOException {
        try {
            write(doc);
        } catch (IOException e) {
            LOG.error(ExceptionUtils.getStackTrace(e));
            throw e;
        }
    }

    @Override
    public void commit() throws IOException {
        if (basicFuture != null) {
            // wait for previous to finish
            long beforeWait = System.currentTimeMillis();
            try {
                JestResult result = basicFuture.get();
                if (result == null) {
                    throw new RuntimeException();
                }
                long msWaited = System.currentTimeMillis() - beforeWait;
                LOG.info("Previous took in ms {}, including wait {}", millis, msWaited);
            } catch (InterruptedException | ExecutionException e) {
                LOG.error("Error waiting for result ", e);
            }
            basicFuture = null;
        }
        if (bulkBuilder != null) {
            if (bulkDocs > 0) {
                // start a flush, note that this is an asynchronous call
                basicFuture = new BasicFuture<>(null);
                millis = System.currentTimeMillis();
                client.executeAsync(bulkBuilder.build(), new JestResultHandler<BulkResult>() {
                    @Override
                    public void completed(BulkResult bulkResult) {
                        basicFuture.completed(bulkResult);
                        millis = System.currentTimeMillis() - millis;
                    }

                    @Override
                    public void failed(Exception e) {
                        basicFuture.completed(null);
                        LOG.error("Failed result: ", e);
                    }
                });
            }
            bulkBuilder = null;
        }
        if (createNewBulk) {
            // Prepare a new bulk request
            bulkBuilder = new Bulk.Builder().defaultIndex(defaultIndex).defaultType(defaultType);
            bulkDocs = 0;
            bulkLength = 0;
        }
    }

    @Override
    public void close() throws IOException {
        // Flush pending requests
        LOG.info("Processing remaining requests [docs = {}, length = {}, total docs = {}]", bulkDocs, bulkLength,
                indexedDocs);
        createNewBulk = false;
        commit();

        // flush one more time to finalize the last bulk
        LOG.info("Processing to finalize last execute");
        createNewBulk = false;
        commit();

        // Close
        client.shutdownClient();
    }

    @Override
    public String describe() {
        StringBuffer sb = new StringBuffer("ElasticRestIndexWriter\n");
        sb.append("\t").append(ElasticRestConstants.HOST).append(" : hostname\n");
        sb.append("\t").append(ElasticRestConstants.PORT).append(" : port\n");
        sb.append("\t").append(ElasticRestConstants.INDEX).append(" : elastic index command \n");
        sb.append("\t").append(ElasticRestConstants.MAX_BULK_DOCS)
                .append(" : elastic bulk index doc counts. (default 250) \n");
        sb.append("\t").append(ElasticRestConstants.MAX_BULK_LENGTH)
                .append(" : elastic bulk index length. (default 2500500 ~2.5MB)\n");
        return sb.toString();
    }

    @Override
    public void setConf(Configuration conf) {
        config = conf;
    }

    @Override
    public Configuration getConf() {
        return config;
    }

    private String getLanguageIndexName(String lang) {
        return getComposedIndexName(defaultIndex, lang);
    }

    private String getSinkIndexName() {
        return getComposedIndexName(defaultIndex, sink);
    }

    private String getComposedIndexName(String prefix, String postfix) {
        return prefix + separator + postfix;
    }
}