org.elasticsearch.xpack.ml.transforms.PainlessDomainSplitIT.java Source code

Java tutorial

Introduction

Here is the source code for org.elasticsearch.xpack.ml.transforms.PainlessDomainSplitIT.java

Source

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.transforms;

import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Response;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.rest.ESRestTestCase;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.utils.DomainSplitFunction;
import org.joda.time.DateTime;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.equalTo;

/**
 * REST integration tests for the Painless {@code domainSplit()} script function.
 *
 * <p>{@link #testIsolated()} runs the function through a search {@code script_fields} request for every
 * entry in {@link #tests}; {@link #testHRDSplit()} runs it through a datafeed feeding an anomaly detection
 * job and checks the split values that show up in the job's result records.
 */
public class PainlessDomainSplitIT extends ESRestTestCase {

    /**
     * A single expectation: the host name handed to {@code domainSplit()} and the sub-domain / domain
     * it is expected to be split into.
     *
     * <p>{@code subDomainExpected} may be {@code null}, in which case only the domain part is asserted.
     */
    static class TestConfiguration {
        public final String subDomainExpected;
        public final String domainExpected;
        public final String hostName;

        TestConfiguration(String subDomainExpected, String domainExpected, String hostName) {
            this.subDomainExpected = subDomainExpected;
            this.domainExpected = domainExpected;
            this.hostName = hostName;
        }
    }

    /** Captures the two elements of the {@code domain_split} array in a search response body. */
    private static final Pattern SEARCH_RESULT_PATTERN = Pattern.compile("domain_split\":\\[(.*?),(.*?)\\]");

    /** All host names exercised by the tests together with their expected split. */
    public static final ArrayList<TestConfiguration> tests;

    static {
        tests = new ArrayList<>();

        tests.add(new TestConfiguration("", "", ""));
        tests.add(new TestConfiguration("", "", "."));

        // Test cases from https://github.com/john-kurkowski/tldextract/tree/master/tldextract/tests

        tests.add(new TestConfiguration("www", "google.com", "www.google.com"));
        tests.add(new TestConfiguration("www.maps", "google.co.uk", "www.maps.google.co.uk"));
        tests.add(new TestConfiguration("www", "theregister.co.uk", "www.theregister.co.uk"));
        tests.add(new TestConfiguration("", "gmail.com", "gmail.com"));
        tests.add(new TestConfiguration("media.forums", "theregister.co.uk", "media.forums.theregister.co.uk"));
        tests.add(new TestConfiguration("www", "www.com", "www.www.com"));
        tests.add(new TestConfiguration("", "www.com", "www.com"));
        tests.add(new TestConfiguration("", "internalunlikelyhostname", "internalunlikelyhostname"));
        tests.add(new TestConfiguration("internalunlikelyhostname", "bizarre", "internalunlikelyhostname.bizarre"));
        tests.add(new TestConfiguration("", "internalunlikelyhostname.info", "internalunlikelyhostname.info")); // .info is a valid TLD
        tests.add(new TestConfiguration("internalunlikelyhostname", "information",
                "internalunlikelyhostname.information"));
        tests.add(new TestConfiguration("", "216.22.0.192", "216.22.0.192"));
        tests.add(new TestConfiguration("", "::1", "::1"));
        tests.add(new TestConfiguration("", "FE80:0000:0000:0000:0202:B3FF:FE1E:8329",
                "FE80:0000:0000:0000:0202:B3FF:FE1E:8329"));
        tests.add(new TestConfiguration("216.22", "project.coop", "216.22.project.coop"));
        tests.add(new TestConfiguration("www", "xn--h1alffa9f.xn--p1ai", "www.xn--h1alffa9f.xn--p1ai"));
        tests.add(new TestConfiguration("", "", ""));
        tests.add(new TestConfiguration("www", "parliament.uk", "www.parliament.uk"));
        tests.add(new TestConfiguration("www", "parliament.co.uk", "www.parliament.co.uk"));
        tests.add(new TestConfiguration("www.a", "cgs.act.edu.au", "www.a.cgs.act.edu.au"));
        tests.add(new TestConfiguration("www", "google.com.au", "www.google.com.au"));
        tests.add(new TestConfiguration("www", "metp.net.cn", "www.metp.net.cn"));
        tests.add(new TestConfiguration("www", "waiterrant.blogspot.com", "www.waiterrant.blogspot.com"));
        tests.add(new TestConfiguration("", "kittens.blogspot.co.uk", "kittens.blogspot.co.uk"));
        tests.add(new TestConfiguration("", "prelert.s3.amazonaws.com", "prelert.s3.amazonaws.com"));
        tests.add(new TestConfiguration("daves_bucket", "prelert.s3.amazonaws.com",
                "daves_bucket.prelert.s3.amazonaws.com"));
        tests.add(new TestConfiguration("example", "example", "example.example"));
        tests.add(new TestConfiguration("b.example", "example", "b.example.example"));
        tests.add(new TestConfiguration("a.b.example", "example", "a.b.example.example"));
        tests.add(new TestConfiguration("example", "local", "example.local"));
        tests.add(new TestConfiguration("b.example", "local", "b.example.local"));
        tests.add(new TestConfiguration("a.b.example", "local", "a.b.example.local"));
        tests.add(new TestConfiguration("r192494180984795-1-1041782-channel-live.ums", "ustream.tv",
                "r192494180984795-1-1041782-channel-live.ums.ustream.tv"));
        tests.add(new TestConfiguration("192.168.62.9", "prelert.com", "192.168.62.9.prelert.com"));

        // These are not valid DNS names
        tests.add(new TestConfiguration("kerberos.http.192.168", "62.222", "kerberos.http.192.168.62.222"));
        //tests.add(new TestConfiguration("192.168", "62.9\143\127", "192.168.62.9\143\127"));

        // no part of the DNS name can be longer than 63 octets
        /*
        String dnsLongerThan254Chars = "davesbucketdavesbucketdavesbucketdavesbucketdavesbucketdaves.bucketdavesbucketdavesbuc" +
            "ketdavesbucketdavesbucketdaves.bucketdavesbucketdavesbucketdavesbucketdavesbucket.davesbucketdavesbucketdaves" +
            "bucketdavesbucket.davesbucketdavesbucket.prelert.s3.amazonaws.com";
        String hrd = "prelert.s3.amazonaws.com";
        tests.add(new TestConfiguration(dnsLongerThan254Chars.substring(0, dnsLongerThan254Chars.length() - (hrd.length() + 1)),
            hrd, dnsLongerThan254Chars));
        */

        // [Zach] This breaks the script's JSON encoding, skipping for now
        //String bad = "0u1aof\209\1945\188hI4\236\197\205J\244\188\247\223\190F\2135\229gVE7\230i\215\231\205Qzay\225UJ\192
        // pw\216\231\204\194\216\193QV4g\196\207Whpvx.fVxl\194BjA\245kbYk\211XG\235\198\218B\252\219\225S\197\217I\2538n\229
        // \244\213\252\215Ly\226NW\242\248\244Q\220\245\221c\207\189\205Hxq5\224\240.\189Jt4\243\245t\244\198\199p\210\1987
        // r\2050L\239sR0M\190w\238\223\234L\226\2242D\233\210\206\195h\199\206tA\214J\192C\224\191b\188\201\251\198M\244h
        // \206.\198\242l\2114\191JBU\198h\207\215w\243\228R\1924\242\208\191CV\208p\197gDW\198P\217\195X\191Fp\196\197J\193
        // \245\2070\196zH\197\243\253g\239.adz.beacon.base.net";
        //hrd = "base.net";
        //tests.add(new TestConfiguration(bad.substring(0, bad.length() - (hrd.length() + 1)), hrd, bad));

        tests.add(new TestConfiguration("_example", "local", "_example.local"));
        tests.add(new TestConfiguration("www._maps", "google.co.uk", "www._maps.google.co.uk"));
        tests.add(new TestConfiguration("-forum", "theregister.co.uk", "-forum.theregister.co.uk"));
        tests.add(new TestConfiguration("www._yourmp", "parliament.uk", "www._yourmp.parliament.uk"));
        tests.add(new TestConfiguration("www.-a", "cgs.act.edu.au", "www.-a.cgs.act.edu.au"));
        tests.add(new TestConfiguration("", "-foundation.org", "-foundation.org"));
        tests.add(new TestConfiguration("www", "-foundation.org", "www.-foundation.org"));
        tests.add(new TestConfiguration("", "_nfsv4idmapdomain", "_nfsv4idmapdomain"));
        tests.add(new TestConfiguration("_nfsv4idmapdomain", "prelert.com", "_nfsv4idmapdomain.prelert.com"));

        // checkHighestRegisteredDomain() tests
        tests.add(new TestConfiguration(null, "example.com", "example.COM"));
        tests.add(new TestConfiguration(null, "example.com", "WwW.example.COM"));

        // TLD with only 1 rule.
        tests.add(new TestConfiguration(null, "domain.biz", "domain.biz"));
        tests.add(new TestConfiguration(null, "domain.biz", "b.domain.biz"));
        tests.add(new TestConfiguration(null, "domain.biz", "a.b.domain.biz"));

        // TLD with some 2-level rules.
        tests.add(new TestConfiguration(null, "example.com", "example.com"));
        tests.add(new TestConfiguration(null, "example.com", "b.example.com"));
        tests.add(new TestConfiguration(null, "example.com", "a.b.example.com"));
        tests.add(new TestConfiguration(null, "example.uk.com", "example.uk.com"));
        tests.add(new TestConfiguration(null, "example.uk.com", "b.example.uk.com"));
        tests.add(new TestConfiguration(null, "example.uk.com", "a.b.example.uk.com"));
        tests.add(new TestConfiguration(null, "test.ac", "test.ac"));
        tests.add(new TestConfiguration(null, "c.gov.cy", "c.gov.cy"));
        tests.add(new TestConfiguration(null, "c.gov.cy", "b.c.gov.cy"));
        tests.add(new TestConfiguration(null, "c.gov.cy", "a.b.c.gov.cy"));

        // more complex TLD
        tests.add(new TestConfiguration(null, "test.jp", "test.jp"));
        tests.add(new TestConfiguration(null, "test.jp", "www.test.jp"));
        tests.add(new TestConfiguration(null, "test.ac.jp", "test.ac.jp"));
        tests.add(new TestConfiguration(null, "test.ac.jp", "www.test.ac.jp"));
        tests.add(new TestConfiguration(null, "test.kyoto.jp", "test.kyoto.jp"));
        tests.add(new TestConfiguration(null, "b.ide.kyoto.jp", "b.ide.kyoto.jp"));
        tests.add(new TestConfiguration(null, "b.ide.kyoto.jp", "a.b.ide.kyoto.jp"));
        //tests.add(new TestConfiguration(null, "b.c.kobe.jp", "b.c.kobe.jp"));
        //tests.add(new TestConfiguration(null, "b.c.kobe.jp", "a.b.c.kobe.jp"));
        tests.add(new TestConfiguration(null, "city.kobe.jp", "city.kobe.jp"));
        tests.add(new TestConfiguration(null, "city.kobe.jp", "www.city.kobe.jp"));
        tests.add(new TestConfiguration(null, "test.us", "test.us"));
        tests.add(new TestConfiguration(null, "test.us", "www.test.us"));
        tests.add(new TestConfiguration(null, "test.ak.us", "test.ak.us"));
        tests.add(new TestConfiguration(null, "test.ak.us", "www.test.ak.us"));
        tests.add(new TestConfiguration(null, "test.k12.ak.us", "test.k12.ak.us"));
        tests.add(new TestConfiguration(null, "test.k12.ak.us", "www.test.k12.ak.us"));
        //tests.add(new TestConfiguration(null, ".com.cn", ".com.cn"));
        //tests.add(new TestConfiguration(null, ".?.cn", ".?.cn"));
        //tests.add(new TestConfiguration(null, ".?.cn", "www..?.cn"));
        //tests.add(new TestConfiguration(null, "shishi.?.cn", "shishi.?.cn"));
        //tests.add(new TestConfiguration(null, ".", "."));
        //tests.add(new TestConfiguration(null, ".", "www.."));
        //tests.add(new TestConfiguration(null, "shishi.", "shishi."));

        tests.add(new TestConfiguration(null, "xn--85x722f.com.cn", "xn--85x722f.com.cn"));
        tests.add(new TestConfiguration(null, "xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn"));
        tests.add(new TestConfiguration(null, "xn--85x722f.xn--55qx5d.cn", "www.xn--85x722f.xn--55qx5d.cn"));
        tests.add(new TestConfiguration(null, "shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn"));
        tests.add(new TestConfiguration(null, "xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s"));
        tests.add(new TestConfiguration(null, "xn--85x722f.xn--fiqs8s", "www.xn--85x722f.xn--fiqs8s"));
        tests.add(new TestConfiguration(null, "shishi.xn--fiqs8s", "shishi.xn--fiqs8s"));
    }

    /**
     * Reads the sub-domain (group 1) and domain (group 2) from the first match of {@code matcher} and
     * asserts them against {@code expected}.
     *
     * <p>If the matcher finds nothing, both actual values are treated as empty strings. The sub-domain
     * is only asserted when {@code expected.subDomainExpected} is non-null.
     *
     * @param expected the expectation to check against
     * @param matcher  a matcher over the response body whose pattern has two capturing groups
     */
    private void assertDomainSplit(TestConfiguration expected, Matcher matcher) {
        String actualSubDomain = "";
        String actualDomain = "";
        if (matcher.find()) {
            // Captured values may still carry the surrounding JSON quotes
            actualSubDomain = matcher.group(1).replace("\"", "");
            actualDomain = matcher.group(2).replace("\"", "");
        }

        String expectedTotal = "[" + expected.subDomainExpected + "," + expected.domainExpected + "]";
        String actualTotal = "[" + actualSubDomain + "," + actualDomain + "]";

        // domainSplit() tests had subdomain, testHighestRegisteredDomainCases() do not
        if (expected.subDomainExpected != null) {
            assertThat(
                    "Expected subdomain [" + expected.subDomainExpected + "] but found [" + actualSubDomain
                            + "]. Actual " + actualTotal + " vs Expected " + expectedTotal,
                    actualSubDomain, equalTo(expected.subDomainExpected));
        }

        assertThat(
                "Expected domain [" + expected.domainExpected + "] but found [" + actualDomain + "].  Actual "
                        + actualTotal + " vs Expected " + expectedTotal,
                actualDomain, equalTo(expected.domainExpected));
    }

    /**
     * Indexes a single document and, for every entry in {@link #tests}, runs a search whose
     * {@code domain_split} script field calls {@code domainSplit(params['host'], params)} with the
     * entry's host name, asserting the returned split.
     */
    public void testIsolated() throws Exception {
        Settings.Builder settings = Settings.builder().put(IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1)
                .put(IndexMetaData.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0);

        createIndex("painless", settings.build());
        client().performRequest("PUT", "painless/test/1", Collections.emptyMap(),
                new StringEntity("{\"test\": \"test\"}", ContentType.APPLICATION_JSON));
        client().performRequest("POST", "painless/_refresh");

        // The script needs the function's own params plus the host under test
        Map<String, Object> params = new HashMap<>(DomainSplitFunction.params.size() + 1);
        params.putAll(DomainSplitFunction.params);
        for (TestConfiguration testConfig : tests) {
            params.put("host", testConfig.hostName);
            String mapAsJson = Strings.toString(jsonBuilder().map(params));
            logger.info("params={}", mapAsJson);

            StringEntity body = new StringEntity(
                    "{\n"
                            + "    \"query\" : {\n"
                            + "        \"match_all\": {}\n"
                            + "    },\n"
                            + "    \"script_fields\" : {\n"
                            + "        \"domain_split\" : {\n"
                            + "            \"script\" : {\n"
                            + "                \"lang\": \"painless\",\n"
                            + "                \"inline\": \"" + DomainSplitFunction.function
                            + " return domainSplit(params['host'], params); \",\n"
                            + "                \"params\": " + mapAsJson + "\n"
                            + "            }\n"
                            + "        }\n"
                            + "    }\n"
                            + "}",
                    ContentType.APPLICATION_JSON);

            Response response = client().performRequest("GET", "painless/test/_search", Collections.emptyMap(),
                    body);
            String responseBody = EntityUtils.toString(response.getEntity());

            assertDomainSplit(testConfig, SEARCH_RESULT_PATTERN.matcher(responseBody));
        }
    }

    /**
     * Creates an anomaly detection job keyed on {@code domain_split}, indexes 100 hours of data for one
     * randomly chosen entry of {@link #tests} (with a burst of 100 documents for a different domain in
     * hour 64), feeds it through a datafeed that computes {@code domain_split} via {@code domainSplit()},
     * and then polls the job's records until the expected split appears.
     */
    public void testHRDSplit() throws Exception {

        // Create job
        String job = "{\n"
                + "      \"description\":\"Domain splitting\",\n"
                + "      \"analysis_config\" : {\n"
                + "          \"bucket_span\":\"3600s\",\n"
                + "          \"detectors\" :[{\"function\":\"count\", \"by_field_name\" : \"domain_split\"}]\n"
                + "      },\n"
                + "      \"data_description\" : {\n"
                + "          \"field_delimiter\":\",\",\n"
                + "          \"time_field\":\"time\"\n"
                + "          \n"
                + "      }\n"
                + "  }";

        client().performRequest("PUT", MachineLearning.BASE_PATH + "anomaly_detectors/hrd-split-job",
                Collections.emptyMap(), new StringEntity(job, ContentType.APPLICATION_JSON));
        client().performRequest("POST", MachineLearning.BASE_PATH + "anomaly_detectors/hrd-split-job/_open");

        // Create index to hold data
        Settings.Builder settings = Settings.builder().put(IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1)
                .put(IndexMetaData.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0);

        createIndex("painless", settings.build(),
                "\"test\": { \"properties\": { \"domain\": { \"type\": \"keyword\" },"
                        + "\"time\": { \"type\": \"date\" } } }");

        // Index some data
        DateTime baseTime = new DateTime().minusYears(1);
        TestConfiguration test = tests.get(randomInt(tests.size() - 1));

        // domainSplit() tests had subdomain, testHighestRegisteredDomainCases() did not, so we need a special case for sub.
        // Expected values are quoted as a whole so that no character in them is read as regex syntax.
        String expectedSub = test.subDomainExpected == null ? ".*" : Pattern.quote(test.subDomainExpected);
        String expectedHRD = Pattern.quote(test.domainExpected);
        Pattern pattern = Pattern
                .compile("domain_split\":\\[\"(" + expectedSub + "),(" + expectedHRD + ")\"[,\\]]");

        for (int i = 0; i < 100; i++) {

            // Same text the original string concatenation produced for the DateTime
            String timestamp = baseTime.plusHours(i).toDateTimeISO().toString();
            if (i == 64) {
                // Anomaly has 100 docs, but we don't care about the value
                for (int j = 0; j < 100; j++) {
                    client().performRequest("PUT", "painless/test/" + timestamp + "_" + j,
                            Collections.emptyMap(),
                            new StringEntity("{\"domain\": \"" + "bar.bar.com\", \"time\": \""
                                    + timestamp + "\"}", ContentType.APPLICATION_JSON));
                }
            } else {
                // Non-anomalous values will be what's seen when the anomaly is reported
                client().performRequest("PUT", "painless/test/" + timestamp, Collections.emptyMap(),
                        new StringEntity("{\"domain\": \"" + test.hostName + "\", \"time\": \""
                                + timestamp + "\"}", ContentType.APPLICATION_JSON));
            }
        }

        client().performRequest("POST", "painless/_refresh");

        // Create and start datafeed
        String body = "{\n"
                + "         \"job_id\":\"hrd-split-job\",\n"
                + "         \"indexes\":[\"painless\"],\n"
                + "         \"types\":[\"test\"],\n"
                + "         \"script_fields\": {\n"
                + "            \"domain_split\": {\n"
                + "               \"script\": \"return domainSplit(doc['domain'].value, params);\"\n"
                + "            }\n"
                + "         }\n"
                + "      }";

        client().performRequest("PUT", MachineLearning.BASE_PATH + "datafeeds/hrd-split-datafeed",
                Collections.emptyMap(), new StringEntity(body, ContentType.APPLICATION_JSON));
        client().performRequest("POST", MachineLearning.BASE_PATH + "datafeeds/hrd-split-datafeed/_start");

        boolean passed = awaitBusy(() -> {
            try {
                client().performRequest("POST", "/_refresh");

                Response response = client().performRequest("GET",
                        MachineLearning.BASE_PATH + "anomaly_detectors/hrd-split-job/results/records");
                String responseBody = EntityUtils.toString(response.getEntity());

                if (responseBody.contains("\"count\":2") == false) {
                    // Records not available yet; log what came back and let awaitBusy retry
                    logger.error(responseBody);
                    return false;
                }

                // Assertion failures are Errors, so they are not swallowed by the catch below
                assertDomainSplit(test, pattern.matcher(responseBody));
                return true;
            } catch (Exception e) {
                // Keep the throwable so the exception type and stack trace reach the log
                logger.error("Failed to fetch anomaly records for job [hrd-split-job]", e);
                return false;
            }

        }, 5, TimeUnit.SECONDS);

        if (!passed) {
            fail("Anomaly records were not found within 5 seconds");
        }
    }
}