/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.elasticsearch;

import static org.apache.beam.sdk.io.elasticsearch.ElasticSearchIOTestUtils.FAMOUS_SCIENTISTS;
import static org.apache.beam.sdk.io.elasticsearch.ElasticSearchIOTestUtils.NUM_SCIENTISTS;
import static org.apache.beam.sdk.io.elasticsearch.ElasticSearchIOTestUtils.countByMatch;
import static org.apache.beam.sdk.io.elasticsearch.ElasticSearchIOTestUtils.countByScientistName;
import static org.apache.beam.sdk.io.elasticsearch.ElasticSearchIOTestUtils.refreshIndexAndGetCurrentNumDocs;
import static org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BoundedElasticsearchSource;
import static org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.ConnectionConfiguration;
import static org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read;
import static org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.RetryConfiguration.DEFAULT_RETRY_PREDICATE;
import static org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write;
import static org.apache.beam.sdk.testing.SourceTestUtils.readFromSource;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.lessThan;
import static org.hamcrest.core.Is.isA;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import com.fasterxml.jackson.databind.JsonNode;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.RetryConfiguration.DefaultRetryPredicate;
import org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.RetryConfiguration.RetryPredicate;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFnTester;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.PCollection;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.nio.entity.NStringEntity;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;
import org.hamcrest.CustomMatcher;
import org.joda.time.Duration;
import org.junit.rules.ExpectedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Common test class for {@link ElasticsearchIO}. */
class ElasticsearchIOTestCommon implements Serializable {

  private static final Logger LOG = LoggerFactory.getLogger(ElasticsearchIOTestCommon.class);

  private static final RetryPredicate CUSTOM_RETRY_PREDICATE = new DefaultRetryPredicate(400);
  private static final int EXPECTED_RETRIES = 2;
  private static final int MAX_ATTEMPTS = 3;
  private static final String[] BAD_FORMATTED_DOC = {"{ \"x\" :a,\"y\":\"ab\" }"};
  private static final String OK_REQUEST =
      "{ \"index\" : { \"_index\" : \"test\", \"_type\" : \"doc\", \"_id\" : \"1\" } }\n"
          + "{ \"field1\" : 1 }\n";
  private static final String BAD_REQUEST =
      "{ \"index\" : { \"_index\" : \"test\", \"_type\" : \"doc\", \"_id\" : \"1\" } }\n"
          + "{ \"field1\" : @ }\n";

  static String getEsIndex() {
    return "beam" + Thread.currentThread().getId();
  }

  static final String ES_TYPE = "test";
  static final long NUM_DOCS_UTESTS = 400L;
  static final long NUM_DOCS_ITESTS = 50000L;
  static final float ACCEPTABLE_EMPTY_SPLITS_PERCENTAGE = 0.5f;
  private static final long AVERAGE_DOC_SIZE = 25L;

  private static final long BATCH_SIZE = 200L;
  private static final long BATCH_SIZE_BYTES = 2048L;

  private final long numDocs;
  private final ConnectionConfiguration connectionConfiguration;
  private final RestClient restClient;
  private final boolean useAsITests;

  private TestPipeline pipeline;
  private ExpectedException expectedException;

  ElasticsearchIOTestCommon(
      ConnectionConfiguration connectionConfiguration, RestClient restClient, boolean useAsITests) {
    this.connectionConfiguration = connectionConfiguration;
    this.restClient = restClient;
    this.numDocs = useAsITests ? NUM_DOCS_ITESTS : NUM_DOCS_UTESTS;
    this.useAsITests = useAsITests;
  }

  // lazy init of the test rules (cannot be static)
  void setPipeline(TestPipeline pipeline) {
    this.pipeline = pipeline;
  }

  void setExpectedException(ExpectedException expectedException) {
    this.expectedException = expectedException;
  }

  void testSplit(final int desiredBundleSizeBytes) throws Exception {
    if (!useAsITests) {
      ElasticSearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }
    PipelineOptions options = PipelineOptionsFactory.create();
    Read read = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
    BoundedElasticsearchSource initialSource =
        new BoundedElasticsearchSource(read, null, null, null);
    List<? extends BoundedSource<String>> splits =
        initialSource.split(desiredBundleSizeBytes, options);
    SourceTestUtils.assertSourcesEqualReferenceSource(initialSource, splits, options);
    long indexSize = BoundedElasticsearchSource.estimateIndexSize(connectionConfiguration);
    int expectedNumSources;
    if (desiredBundleSizeBytes == 0) {
      // desiredBundleSizeBytes is ignored because in ES 2.x there is no way to split shards,
      // so the expected number of sources is the number of ES shards
      // (by default, each index in Elasticsearch is allocated 5 primary shards)
      expectedNumSources = 5;
    } else {
      float expectedNumSourcesFloat = (float) indexSize / desiredBundleSizeBytes;
      expectedNumSources = (int) Math.ceil(expectedNumSourcesFloat);
    }
    assertEquals("Wrong number of splits", expectedNumSources, splits.size());
    int emptySplits = 0;
    for (BoundedSource<String> subSource : splits) {
      if (readFromSource(subSource, options).isEmpty()) {
        emptySplits += 1;
      }
    }
    assertThat(
        "There are too many empty splits, parallelism is sub-optimal",
        emptySplits,
        lessThan((int) (ACCEPTABLE_EMPTY_SPLITS_PERCENTAGE * splits.size())));
  }
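  /*
   * Worked example for the split arithmetic in testSplit (the numbers are illustrative
   * assumptions, not taken from a real run): with an estimated index size of 10,000 bytes and
   * desiredBundleSizeBytes = 2048, expectedNumSources = ceil(10000 / 2048) = ceil(4.88...) = 5.
   */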
  void testSizes() throws Exception {
    if (!useAsITests) {
      ElasticSearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }
    PipelineOptions options = PipelineOptionsFactory.create();
    Read read = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
    BoundedElasticsearchSource initialSource =
        new BoundedElasticsearchSource(read, null, null, null);
    // cannot use an equality assertion, as two Elasticsearch indexes never have exactly the same
    // size (due to internal Elasticsearch implementation details)
    long estimatedSize = initialSource.getEstimatedSizeBytes(options);
    LOG.info("Estimated size: {}", estimatedSize);
    assertThat("Wrong estimated size", estimatedSize, greaterThan(AVERAGE_DOC_SIZE * numDocs));
  }

  void testRead() throws Exception {
    if (!useAsITests) {
      ElasticSearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }
    PCollection<String> output =
        pipeline.apply(
            ElasticsearchIO.read()
                .withConnectionConfiguration(connectionConfiguration)
                // set to the default value, useful just to test parameter passing
                .withScrollKeepalive("5m")
                // set to the default value, useful just to test parameter passing
                .withBatchSize(100L));
    PAssert.thatSingleton(output.apply("Count", Count.globally())).isEqualTo(numDocs);
    pipeline.run();
  }

  void testReadWithQuery() throws Exception {
    if (!useAsITests) {
      ElasticSearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }
    String query =
        "{\n"
            + "  \"query\": {\n"
            + "    \"match\" : {\n"
            + "      \"scientist\" : {\n"
            + "        \"query\" : \"Einstein\"\n"
            + "      }\n"
            + "    }\n"
            + "  }\n"
            + "}";
    PCollection<String> output =
        pipeline.apply(
            ElasticsearchIO.read()
                .withConnectionConfiguration(connectionConfiguration)
                .withQuery(query));
    PAssert.thatSingleton(output.apply("Count", Count.globally()))
        .isEqualTo(numDocs / NUM_SCIENTISTS);
    pipeline.run();
  }
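  /*
   * Usage sketch of the read-with-query pattern exercised above, for reference. This is a
   * minimal sketch, not part of the tests; the address, index and type names are illustrative
   * assumptions, and "query" is a query JSON string like the one built above:
   *
   *   PCollection<String> docs =
   *       pipeline.apply(
   *           ElasticsearchIO.read()
   *               .withConnectionConfiguration(
   *                   ConnectionConfiguration.create(
   *                       new String[] {"http://localhost:9200"}, "my-index", "my-type"))
   *               .withQuery(query));
   */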
  /** Test reading metadata by reading back the id of a document after writing it. */
  void testReadWithMetadata() throws Exception {
    if (!useAsITests) {
      ElasticSearchIOTestUtils.insertTestDocuments(connectionConfiguration, 1, restClient);
    }
    PCollection<String> output =
        pipeline.apply(
            ElasticsearchIO.read()
                .withConnectionConfiguration(connectionConfiguration)
                .withMetadata());
    PAssert.that(output).satisfies(new ContainsStringCheckerFn("\"_id\":\"0\""));
    pipeline.run();
  }

  void testWrite() throws Exception {
    List<String> data =
        ElasticSearchIOTestUtils.createDocuments(
            numDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    pipeline
        .apply(Create.of(data))
        .apply(ElasticsearchIO.write().withConnectionConfiguration(connectionConfiguration));
    pipeline.run();

    long currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
    assertEquals(numDocs, currentNumDocs);

    int count = countByScientistName(connectionConfiguration, restClient, "Einstein");
    assertEquals(numDocs / NUM_SCIENTISTS, count);
  }

  void testWriteWithErrors() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE);
    List<String> input =
        ElasticSearchIOTestUtils.createDocuments(
            numDocs, ElasticSearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
    expectedException.expect(isA(IOException.class));
    expectedException.expectMessage(
        new CustomMatcher<String>("RegExp matcher") {
          @Override
          public boolean matches(Object o) {
            String message = (String) o;
            // This regexp checks that the two malformed documents are actually in error and that
            // the message contains their IDs. It also ensures that the root reason, root error
            // type, caused-by reason and caused-by error type are all present in the message.
            // To keep the test robust against changes in Elasticsearch error messages, only the
            // "failed to parse" root reason is matched exactly; the other messages are matched
            // using .+
            return message.matches(
                "(?is).*Error writing to Elasticsearch, some elements could not be inserted"
                    + ".*Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*"
                    + "Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*");
          }
        });
    // Bundle size is the runner's decision: we cannot force a bundle size,
    // so we test the Writer as a DoFn outside of a runner.
    try (DoFnTester<String, Void> fnTester = DoFnTester.of(new Write.WriteFn(write))) {
      // inserts into Elasticsearch
      fnTester.processBundle(input);
    }
  }
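  /*
   * Sketch of how a pipeline author would cap bulk request sizes, as the two tests below
   * exercise. The values are illustrative assumptions, not recommendations:
   *
   *   ElasticsearchIO.write()
   *       .withConnectionConfiguration(connectionConfiguration)
   *       .withMaxBatchSize(1000L)                      // flush after 1000 documents...
   *       .withMaxBatchSizeBytes(5L * 1024L * 1024L);   // ...or after ~5 MB, whichever is first
   */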
  void testWriteWithMaxBatchSize() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE);
    // Bundle size is the runner's decision: we cannot force a bundle size,
    // so we test the Writer as a DoFn outside of a runner.
    try (DoFnTester<String, Void> fnTester = DoFnTester.of(new Write.WriteFn(write))) {
      List<String> input =
          ElasticSearchIOTestUtils.createDocuments(
              numDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
      long numDocsProcessed = 0;
      long numDocsInserted = 0;
      for (String document : input) {
        fnTester.processElement(document);
        numDocsProcessed++;
        // check every 100 docs to avoid overloading ES
        if ((numDocsProcessed % 100) == 0) {
          // force a refresh of the index so that the inserted docs are searchable immediately
          long currentNumDocs =
              refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
          if ((numDocsProcessed % BATCH_SIZE) == 0) {
            /* batch end */
            assertEquals(
                "we are at the end of a batch, so all processed documents should be inserted",
                numDocsProcessed,
                currentNumDocs);
            numDocsInserted = currentNumDocs;
          } else {
            /* not batch end */
            assertEquals(
                "we are not at the end of a batch, so no more documents should be inserted",
                numDocsInserted,
                currentNumDocs);
          }
        }
      }
    }
  }

  void testWriteWithMaxBatchSizeBytes() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSizeBytes(BATCH_SIZE_BYTES);
    // Bundle size is the runner's decision: we cannot force a bundle size,
    // so we test the Writer as a DoFn outside of a runner.
    try (DoFnTester<String, Void> fnTester = DoFnTester.of(new Write.WriteFn(write))) {
      List<String> input =
          ElasticSearchIOTestUtils.createDocuments(
              numDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
      long numDocsProcessed = 0;
      long sizeProcessed = 0;
      long numDocsInserted = 0;
      long batchInserted = 0;
      for (String document : input) {
        fnTester.processElement(document);
        numDocsProcessed++;
        sizeProcessed += document.getBytes(StandardCharsets.UTF_8).length;
        // check every 40 docs to avoid overloading ES
        if ((numDocsProcessed % 40) == 0) {
          // force a refresh of the index so that the inserted docs are searchable immediately
          long currentNumDocs =
              refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
          if (sizeProcessed / BATCH_SIZE_BYTES > batchInserted) {
            /* batch end */
            assertThat(
                "we have passed a batch size boundary, so some documents should be inserted",
                currentNumDocs,
                greaterThan(numDocsInserted));
            numDocsInserted = currentNumDocs;
            batchInserted = (sizeProcessed / BATCH_SIZE_BYTES);
          } else {
            /* not batch end */
            assertEquals(
                "we are not at the end of a batch, so no more documents should be inserted",
                numDocsInserted,
                currentNumDocs);
          }
        }
      }
    }
  }

  /** Extracts the named field from the JSON document. */
  private static class ExtractValueFn implements Write.FieldValueExtractFn {
    private final String fieldName;

    private ExtractValueFn(String fieldName) {
      this.fieldName = fieldName;
    }

    @Override
    public String apply(JsonNode input) {
      return input.path(fieldName).asText();
    }
  }
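  /*
   * Sketch of plugging ExtractValueFn into a write so that a document field becomes the document
   * ID, as testWriteWithIdFn below exercises (repeated writes of the same value then become
   * updates rather than inserts):
   *
   *   ElasticsearchIO.write()
   *       .withConnectionConfiguration(connectionConfiguration)
   *       .withIdFn(new ExtractValueFn("scientist"));
   */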
  /**
   * Tests that when the scientist name is used as the document identifier, only as many documents
   * as there are scientists are created, since subsequent writes with the same name become
   * updates.
   */
  void testWriteWithIdFn() throws Exception {
    List<String> data =
        ElasticSearchIOTestUtils.createDocuments(
            numDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    pipeline
        .apply(Create.of(data))
        .apply(
            ElasticsearchIO.write()
                .withConnectionConfiguration(connectionConfiguration)
                .withIdFn(new ExtractValueFn("scientist")));
    pipeline.run();

    long currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
    assertEquals(NUM_SCIENTISTS, currentNumDocs);

    int count = countByScientistName(connectionConfiguration, restClient, "Einstein");
    assertEquals(1, count);
  }

  /**
   * Tests that documents are dynamically routed to different indexes rather than the one
   * specified in the configuration. Documents should be routed to an index with the same name as
   * the scientist in the document. Writing to multiple indexes adds significant work on the ES
   * server, and even a moderate number of docs can overload the bulk queue and workers; see
   * https://www.elastic.co/blog/why-am-i-seeing-bulk-rejections-in-my-elasticsearch-cluster for
   * details. We therefore limit this test to a small number of docs, to test routing behavior
   * only.
   */
  void testWriteWithIndexFn() throws Exception {
    long docsPerScientist = 10; // very conservative
    long adjustedNumDocs = docsPerScientist * FAMOUS_SCIENTISTS.length;
    List<String> data =
        ElasticSearchIOTestUtils.createDocuments(
            adjustedNumDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    pipeline
        .apply(Create.of(data))
        .apply(
            ElasticsearchIO.write()
                .withConnectionConfiguration(connectionConfiguration)
                .withIndexFn(new ExtractValueFn("scientist")));
    pipeline.run();

    // verify the counts on each index
    for (String scientist : FAMOUS_SCIENTISTS) {
      String index = scientist.toLowerCase();
      long count =
          refreshIndexAndGetCurrentNumDocs(restClient, index, connectionConfiguration.getType());
      assertEquals(scientist + " index holds incorrect count", docsPerScientist, count);
    }
  }

  /** Returns TYPE_0 or TYPE_1 based on the modulo 2 of the hash of the named field. */
  static class Modulo2ValueFn implements Write.FieldValueExtractFn {
    private final String fieldName;

    Modulo2ValueFn(String fieldName) {
      this.fieldName = fieldName;
    }

    @Override
    public String apply(JsonNode input) {
      return "TYPE_" + input.path(fieldName).asText().hashCode() % 2;
    }
  }
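  /*
   * Worked example for Modulo2ValueFn: for a document {"id": "7"}, new Modulo2ValueFn("id")
   * returns "TYPE_" + "7".hashCode() % 2 = "TYPE_" + 55 % 2 = "TYPE_1". Note that
   * String.hashCode() can be negative for longer values, which would yield "TYPE_-1"; the tests
   * below are unaffected because the same expression is used when writing and when verifying.
   */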
  /**
   * Tests that documents are dynamically routed to different types rather than the type given in
   * the configuration. Documents should be routed to a type of TYPE_0 or TYPE_1, selected by
   * applying a modulo-2 hash to the explicit id.
   *
   * <p>This test does not work with ES 6, because ES 6 does not allow an index to contain more
   * than one mapping type.
   */
  void testWriteWithTypeFn2x5x() throws Exception {
    // defensive coding: this test requires an even number of docs
    long adjustedNumDocs = (numDocs & 1) == 0 ? numDocs : numDocs + 1;
    List<String> data =
        ElasticSearchIOTestUtils.createDocuments(
            adjustedNumDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    pipeline
        .apply(Create.of(data))
        .apply(
            ElasticsearchIO.write()
                .withConnectionConfiguration(connectionConfiguration)
                .withTypeFn(new Modulo2ValueFn("id")));
    pipeline.run();

    for (int i = 0; i < 2; i++) {
      String type = "TYPE_" + i;
      long count =
          refreshIndexAndGetCurrentNumDocs(restClient, connectionConfiguration.getIndex(), type);
      assertEquals(type + " holds incorrect count", adjustedNumDocs / 2, count);
    }
  }

  /**
   * Tests that documents are correctly routed when index, type and document ID functions are
   * provided to override the defaults of taking the index and type from the configuration and
   * letting Elasticsearch auto-generate the document IDs. The document ID is taken from the
   * document's id field, the index from the scientist name, and the type from a modulo-2 hash of
   * the scientist name, so each index/type combination should hold all of that scientist's
   * documents.
   */
  void testWriteWithFullAddressing() throws Exception {
    List<String> data =
        ElasticSearchIOTestUtils.createDocuments(
            numDocs, ElasticSearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    pipeline
        .apply(Create.of(data))
        .apply(
            ElasticsearchIO.write()
                .withConnectionConfiguration(connectionConfiguration)
                .withIdFn(new ExtractValueFn("id"))
                .withIndexFn(new ExtractValueFn("scientist"))
                .withTypeFn(new Modulo2ValueFn("scientist")));
    pipeline.run();

    for (String scientist : FAMOUS_SCIENTISTS) {
      String index = scientist.toLowerCase();
      // all of a scientist's documents share a single type, derived from the scientist name
      String type = "TYPE_" + scientist.hashCode() % 2;
      long count = refreshIndexAndGetCurrentNumDocs(restClient, index, type);
      assertEquals("Incorrect count for " + index + "/" + type, numDocs / NUM_SCIENTISTS, count);
    }
  }

  /**
   * Tests partial updates by adding a group field to each document in the standard test set. The
   * group field is populated as the modulo 2 of the document id, allowing the test to check that
   * the documents are split into 2 groups.
   */
  void testWritePartialUpdate() throws Exception {
    if (!useAsITests) {
      ElasticSearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }

    // defensive coding to ensure our initial state is as expected
    long currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
    assertEquals(numDocs, currentNumDocs);

    // partial documents containing the ID and group only
    List<String> data = new ArrayList<>();
    for (int i = 0; i < numDocs; i++) {
      data.add(String.format("{\"id\" : %s, \"group\" : %s}", i, i % 2));
    }

    pipeline
        .apply(Create.of(data))
        .apply(
            ElasticsearchIO.write()
                .withConnectionConfiguration(connectionConfiguration)
                .withIdFn(new ExtractValueFn("id"))
                .withUsePartialUpdate(true));
    pipeline.run();

    currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);

    // check that we have not unwittingly modified existing behaviour
    assertEquals(numDocs, currentNumDocs);
    assertEquals(
        numDocs / NUM_SCIENTISTS,
        countByScientistName(connectionConfiguration, restClient, "Einstein"));

    // Partial update assertions
    assertEquals(numDocs / 2, countByMatch(connectionConfiguration, restClient, "group", "0"));
    assertEquals(numDocs / 2, countByMatch(connectionConfiguration, restClient, "group", "1"));
  }
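  /*
   * Sketch of the partial-update pattern exercised above: the written documents only need to
   * carry the ID plus the fields being changed, e.g. {"id" : 42, "group" : 0}, combined with
   *
   *   ElasticsearchIO.write()
   *       .withConnectionConfiguration(connectionConfiguration)
   *       .withIdFn(new ExtractValueFn("id"))
   *       .withUsePartialUpdate(true);
   */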
  /**
   * Function for checking whether any string in the iterable contains the expected substring.
   * Fails if no match is found.
   */
  private static class ContainsStringCheckerFn
      implements SerializableFunction<Iterable<String>, Void> {

    private String expectedSubString;

    ContainsStringCheckerFn(String expectedSubString) {
      this.expectedSubString = expectedSubString;
    }

    @Override
    public Void apply(Iterable<String> input) {
      for (String s : input) {
        if (s.contains(expectedSubString)) {
          return null;
        }
      }
      fail("No string found containing " + expectedSubString);
      return null;
    }
  }

  /** Test that the default retry predicate correctly parses the chosen error code. */
  public void testDefaultRetryPredicate(RestClient restClient) throws IOException {
    HttpEntity entity1 = new NStringEntity(BAD_REQUEST, ContentType.APPLICATION_JSON);
    Response response1 =
        restClient.performRequest("POST", "/_bulk", Collections.emptyMap(), entity1);
    assertTrue(CUSTOM_RETRY_PREDICATE.test(response1));

    HttpEntity entity2 = new NStringEntity(OK_REQUEST, ContentType.APPLICATION_JSON);
    Response response2 =
        restClient.performRequest("POST", "/_bulk", Collections.emptyMap(), entity2);
    assertFalse(DEFAULT_RETRY_PREDICATE.test(response2));
  }

  /**
   * Test that retries are invoked when Elasticsearch returns a specific error code. We invoke
   * this by issuing corrupt data and retrying on the `400` error code. Normal behaviour is to
   * retry on `429` only, but that is difficult to simulate reliably. The retry-failed log message
   * is used to verify the expected behavior.
   */
  public void testWriteRetry() throws Throwable {
    expectedException.expectCause(isA(IOException.class));
    // MAX_ATTEMPTS is 3, but only EXPECTED_RETRIES (2) retries are expected: the first attempt,
    // in which the error is identified and retrying starts, is not itself a retry.
    expectedException.expectMessage(
        String.format(ElasticsearchIO.Write.WriteFn.RETRY_FAILED_LOG, EXPECTED_RETRIES));

    ElasticsearchIO.Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withRetryConfiguration(
                ElasticsearchIO.RetryConfiguration.create(MAX_ATTEMPTS, Duration.millis(35000))
                    .withRetryPredicate(CUSTOM_RETRY_PREDICATE));
    pipeline.apply(Create.of(Arrays.asList(BAD_FORMATTED_DOC))).apply(write);
    pipeline.run();
  }
}
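/*
 * Sketch of the retry configuration exercised by testWriteRetry, for reference. The 400 predicate
 * used there is test-only; production use would normally keep the default predicate, which the
 * javadoc above notes retries on 429 only. The duration value is an illustrative assumption:
 *
 *   ElasticsearchIO.write()
 *       .withConnectionConfiguration(connectionConfiguration)
 *       .withRetryConfiguration(
 *           ElasticsearchIO.RetryConfiguration.create(3, Duration.standardSeconds(30)));
 */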