fr.pilato.elasticsearch.crawler.fs.test.integration.FsCrawlerImplAllParametersIT.java Source code

Introduction

Here is the source code for fr.pilato.elasticsearch.crawler.fs.test.integration.FsCrawlerImplAllParametersIT.java
Source

/*
 * Licensed to David Pilato (the "Author") under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package fr.pilato.elasticsearch.crawler.fs.test.integration;

import com.google.common.base.Charsets;
import fr.pilato.elasticsearch.crawler.fs.FsCrawlerImpl;
import fr.pilato.elasticsearch.crawler.fs.client.SearchRequest;
import fr.pilato.elasticsearch.crawler.fs.client.SearchResponse;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.Attributes;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.Doc;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.File;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.Meta;
import fr.pilato.elasticsearch.crawler.fs.meta.job.FsJobFileHandler;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.Elasticsearch;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.Fs;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.FsSettings;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.Percentage;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.Rest;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.Server;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.TimeValue;
import fr.pilato.elasticsearch.crawler.fs.util.FsCrawlerUtil;
import org.apache.http.entity.StringEntity;
import org.apache.logging.log4j.Level;
import org.elasticsearch.client.ResponseException;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import static fr.pilato.elasticsearch.crawler.fs.FsCrawlerImpl.LOOP_INFINITE;
import static fr.pilato.elasticsearch.crawler.fs.client.ElasticsearchClient.extractFromPath;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasKey;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.isOneOf;
import static org.hamcrest.Matchers.iterableWithSize;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeNoException;
import static org.junit.Assume.assumeThat;

/**
 * Test all crawler settings
 */
public class FsCrawlerImplAllParametersIT extends AbstractITCase {

    private FsCrawlerImpl crawler = null;
    private Path currentTestResourceDir;

    private static final Path DEFAULT_RESOURCES = Paths.get(getUrl("samples", "common"));

    /**
     * We suppose that each test has its own set of files. Even if we duplicate them, that will make the code
     * more readable.
     * The temp folder which is used as a root is automatically cleaned after the test so we don't have to worry
     * about it.
     */
    @Before
    public void copyTestResources() throws IOException, URISyntaxException {
        Path testResourceTarget = rootTmpDir.resolve("resources");
        if (Files.notExists(testResourceTarget)) {
            Files.createDirectory(testResourceTarget);
        }

        String currentTestName = getCurrentTestName();
        // We copy files from the src dir to the temp dir
        staticLogger.info("  --> Launching test [{}]", currentTestName);
        String url = getUrl("samples", currentTestName);
        Path from = Paths.get(url);
        currentTestResourceDir = testResourceTarget.resolve(currentTestName);

        if (Files.exists(from)) {
            staticLogger.debug("  --> Copying test resources from [{}]", from);
        } else {
            staticLogger.debug("  --> Copying test resources from [{}]", DEFAULT_RESOURCES);
            from = DEFAULT_RESOURCES;
        }

        FsCrawlerUtil.copyDirs(from, currentTestResourceDir);

        staticLogger.debug("  --> Test resources ready in [{}]", currentTestResourceDir);
    }

    @Before
    public void cleanExistingIndex() throws IOException {
        logger.info(" -> Removing existing index [{}*]", getCrawlerName());
        elasticsearchClient.deleteIndex(getCrawlerName() + "*");
    }

    @After
    public void shutdownCrawler() throws InterruptedException, IOException {
        stopCrawler();
    }

    private Fs.Builder startCrawlerDefinition() throws IOException {
        return startCrawlerDefinition(currentTestResourceDir.toString(), TimeValue.timeValueSeconds(5));
    }

    private Fs.Builder startCrawlerDefinition(TimeValue updateRate) throws IOException {
        return startCrawlerDefinition(currentTestResourceDir.toString(), updateRate);
    }

    private Fs.Builder startCrawlerDefinition(String dir) throws IOException {
        return startCrawlerDefinition(dir, TimeValue.timeValueSeconds(5));
    }

    private Fs.Builder startCrawlerDefinition(String dir, TimeValue updateRate) {
        logger.info("  --> creating crawler for dir [{}]", dir);
        return Fs.builder().setUrl(dir).setUpdateRate(updateRate);
    }

    private Elasticsearch endCrawlerDefinition(String indexName) {
        return generateElasticsearchConfig(indexName, securityInstalled, 1, null);
    }

    private void startCrawler() throws Exception {
        startCrawler(getCrawlerName());
    }

    private void startCrawler(final String jobName) throws Exception {
        startCrawler(jobName, startCrawlerDefinition().build(), endCrawlerDefinition(jobName), null);
    }

    private FsCrawlerImpl startCrawler(final String jobName, Fs fs, Elasticsearch elasticsearch, Server server)
            throws Exception {
        return startCrawler(jobName, fs, elasticsearch, server, null, TimeValue.timeValueSeconds(10));
    }

    private FsCrawlerImpl startCrawler(final String jobName, Fs fs, Elasticsearch elasticsearch, Server server,
            Rest rest, TimeValue duration) throws Exception {
        logger.info("  --> starting crawler [{}]", jobName);

        // TODO do this rarely() createIndex(jobName);

        crawler = new FsCrawlerImpl(metadataDir, FsSettings.builder(jobName).setElasticsearch(elasticsearch)
                .setFs(fs).setServer(server).setRest(rest).build(), LOOP_INFINITE, false, rest != null);
        crawler.start();

        // We wait up to X seconds before considering a failing test
        assertThat("Job meta file should exists in ~/.fscrawler...", awaitBusy(() -> {
            try {
                new FsJobFileHandler(metadataDir).read(jobName);
                return true;
            } catch (IOException e) {
                return false;
            }
        }, duration.seconds(), TimeUnit.SECONDS), equalTo(true));

        countTestHelper(jobName, null, null);

        // Make sure we refresh indexed docs before launching tests
        refresh();

        return crawler;
    }

    private void stopCrawler() throws InterruptedException, IOException {
        if (crawler != null) {
            staticLogger.info("  --> Stopping crawler");
            crawler.close();
            crawler = null;
        }
    }

    /**
     * Check that we have the expected number of docs or at least one if expected is null
     *
     * @param indexName Index we will search in.
     * @param term      Term you search for. MatchAll if null.
     * @param expected  expected number of docs. Null if at least 1.
     * @return the search response if further tests are needed
     * @throws Exception in case of error
     */
    public SearchResponse countTestHelper(final String indexName, String term, final Integer expected)
            throws Exception {
        return countTestHelper(indexName, term, expected, null);
    }

    @Test
    public void test_default_settings() throws Exception {
        startCrawler();
    }

    @Test
    public void test_filesize() throws Exception {
        startCrawler();

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Map<String, Object> file = (Map<String, Object>) hit.getSource().get(Doc.FIELD_NAMES.FILE);
            assertThat(file, notNullValue());
            assertThat(file.get(File.FIELD_NAMES.FILESIZE), is(12230));
        }
    }

    @Test
    public void test_filesize_limit() throws Exception {
        Fs fs = startCrawlerDefinition().setIndexedChars(new Percentage(7)).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Object content = hit.getSource().get(Doc.FIELD_NAMES.CONTENT);
            Object indexedChars = extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE)
                    .get(File.FIELD_NAMES.INDEXED_CHARS);
            assertThat(content, notNullValue());
            assertThat(indexedChars, notNullValue());

            // Our original text should be truncated
            assertThat(content, is("Novo de"));
            assertThat(indexedChars, is(7));
        }
    }

    @Test
    public void test_filesize_limit_percentage() throws Exception {
        Fs fs = startCrawlerDefinition().setIndexedChars(Percentage.parse("0.1%")).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Object content = hit.getSource().get(Doc.FIELD_NAMES.CONTENT);
            Object indexedChars = extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE)
                    .get(File.FIELD_NAMES.INDEXED_CHARS);
            assertThat(content, notNullValue());
            assertThat(indexedChars, notNullValue());

            // Our original text should be truncated
            assertThat(content, is("Novo denique"));
            assertThat(indexedChars, is(12));
        }
    }

    @Test
    public void test_filesize_nolimit() throws Exception {
        Fs fs = startCrawlerDefinition().setIndexedChars(new Percentage(-1)).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Object content = hit.getSource().get(Doc.FIELD_NAMES.CONTENT);
            Object indexedChars = extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE)
                    .get(File.FIELD_NAMES.INDEXED_CHARS);
            assertThat(content, notNullValue());
            assertThat(indexedChars, nullValue());

            // Our original text should not be truncated so we must have its end extracted
            assertThat((String) content, containsString("haecque non diu sunt perpetrata."));
        }
    }

    @Test
    public void test_filesize_disabled() throws Exception {
        Fs fs = startCrawlerDefinition().setAddFilesize(false).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Map<String, Object> file = (Map<String, Object>) hit.getSource().get(Doc.FIELD_NAMES.FILE);
            assertThat(file, notNullValue());
            assertThat(file.get(File.FIELD_NAMES.FILESIZE), nullValue());
        }
    }

    @Test
    public void test_includes() throws Exception {
        Fs fs = startCrawlerDefinition().addInclude("*_include.txt").build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
        countTestHelper(getCrawlerName(), null, 1);
    }

    @Test
    public void test_default_metadata() throws Exception {
        startCrawler();

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            assertThat(hit.getSource().get(Doc.FIELD_NAMES.ATTACHMENT), nullValue());

            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.FILENAME),
                    notNullValue());
            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.CONTENT_TYPE),
                    notNullValue());
            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.URL),
                    notNullValue());
            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.FILESIZE),
                    notNullValue());
            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.INDEXING_DATE),
                    notNullValue());
            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.INDEXED_CHARS),
                    nullValue());
            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.LAST_MODIFIED),
                    notNullValue());

            assertThat(extractFromPath(hit.getSource(), Doc.FIELD_NAMES.META).get(Meta.FIELD_NAMES.TITLE),
                    notNullValue());
        }
    }

    @Test
    public void test_attributes() throws Exception {
        Fs fs = startCrawlerDefinition().setAttributesSupport(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            assertThat(
                    extractFromPath(hit.getSource(), Doc.FIELD_NAMES.ATTRIBUTES).get(Attributes.FIELD_NAMES.OWNER),
                    notNullValue());
        }
    }

    @Test
    public void test_remove_deleted_enabled() throws Exception {
        Fs fs = startCrawlerDefinition().setRemoveDeleted(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We should have two docs first
        countTestHelper(getCrawlerName(), null, 2, currentTestResourceDir);

        // We remove a file
        logger.info("  ---> Removing file deleted_roottxtfile.txt");
        Files.delete(currentTestResourceDir.resolve("deleted_roottxtfile.txt"));

        // We expect to have two files
        countTestHelper(getCrawlerName(), null, 1, currentTestResourceDir);
    }

    @Test
    public void test_remove_deleted_disabled() throws Exception {
        Fs fs = startCrawlerDefinition().setRemoveDeleted(false).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We should have two docs first
        countTestHelper(getCrawlerName(), null, 2, currentTestResourceDir);

        // We remove a file
        logger.info(" ---> Removing file deleted_roottxtfile.txt");
        Files.delete(currentTestResourceDir.resolve("deleted_roottxtfile.txt"));

        // We expect to have two files
        countTestHelper(getCrawlerName(), null, 2, currentTestResourceDir);
    }

    /**
     * Test case for https://github.com/dadoonet/fscrawler/issues/110
     * @throws Exception In case something is wrong
     */
    @Test
    public void test_rename_file() throws Exception {
        startCrawler();

        // We should have two docs first
        countTestHelper(getCrawlerName(), null, 1, currentTestResourceDir);

        // We rename the file
        logger.info(" ---> Renaming file roottxtfile.txt to renamed_roottxtfile.txt");
        // We create a copy of a file
        Files.move(currentTestResourceDir.resolve("roottxtfile.txt"),
                currentTestResourceDir.resolve("renamed_roottxtfile.txt"));

        // We expect to have one file only with a new name
        countTestHelper(getCrawlerName(), "file.filename:renamed_roottxtfile.txt", 1, currentTestResourceDir);
    }

    /**
     * Test case for issue #60: https://github.com/dadoonet/fscrawler/issues/60 : new files are not added
     */
    @Test
    public void test_add_new_file() throws Exception {
        startCrawler();

        // We should have one doc first
        SearchResponse response = countTestHelper(getCrawlerName(), null, 1, currentTestResourceDir);
        checkDocVersions(response, 1L);

        logger.info(" ---> Creating a new file new_roottxtfile.txt");
        Files.write(currentTestResourceDir.resolve("new_roottxtfile.txt"), "This is a second file".getBytes());

        // We expect to have two files
        response = countTestHelper(getCrawlerName(), null, 2, currentTestResourceDir);

        // It should be only version <= 2 for both docs
        checkDocVersions(response, 2L);

        logger.info(" ---> Creating a new file new_new_roottxtfile.txt");
        Files.write(currentTestResourceDir.resolve("new_new_roottxtfile.txt"), "This is a third file".getBytes());

        // We expect to have three files
        response = countTestHelper(getCrawlerName(), null, 3, currentTestResourceDir);

        // It should be only version <= 2 for all docs
        checkDocVersions(response, 2L);
    }

    /**
     * Iterate other response hits and check that _version is at most a given version
     * @param response The search response object
     * @param maxVersion Maximum version number we can have
     */
    private void checkDocVersions(SearchResponse response, long maxVersion) {
        // It should be only version <= maxVersion for all docs
        response.getHits().getHits().forEach(hit -> {
            // Read the document. This is needed since 5.0 as search does not return the _version field
            try {
                SearchResponse.Hit getHit = elasticsearchClient.get(hit.getIndex(), hit.getType(), hit.getId());
                assertThat(getHit.getVersion(), lessThanOrEqualTo(maxVersion));
            } catch (IOException e) {
                fail("We got an IOException: " + e.getMessage());
            }
        });
    }

    /**
     * Test case for issue #5: https://github.com/dadoonet/fscrawler/issues/5 : Support JSon documents
     */
    @Test
    public void test_json_support() throws Exception {
        Fs fs = startCrawlerDefinition().setJsonSupport(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        assertThat("We should have 2 doc for tweet in text field...", awaitBusy(() -> {
            try {
                SearchResponse response = elasticsearchClient.search(getCrawlerName(), null, "text:tweet");
                return response.getHits().getTotal() == 2;
            } catch (IOException e) {
                logger.warn("Caught exception while running the test", e);
                return false;
            }
        }), equalTo(true));
    }

    /**
     * Test case for issue #5: https://github.com/dadoonet/fscrawler/issues/5 : Support JSon documents
     */
    @Test
    public void test_json_disabled() throws Exception {
        Fs fs = startCrawlerDefinition().setJsonSupport(false).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        assertThat("We should have 0 doc for tweet in text field...", awaitBusy(() -> {
            try {
                SearchResponse response = elasticsearchClient.search(getCrawlerName(), null, "text:tweet");
                return response.getHits().getTotal() == 0;
            } catch (IOException e) {
                logger.warn("Caught exception while running the test", e);
                return false;
            }
        }), equalTo(true));

        assertThat("We should have 2 docs for tweet in _all...", awaitBusy(() -> {
            try {
                SearchResponse response = elasticsearchClient.search(getCrawlerName(), null, "_all:tweet");
                return response.getHits().getTotal() == 2;
            } catch (IOException e) {
                logger.warn("Caught exception while running the test", e);
                return false;
            }
        }), equalTo(true));
    }

    /**
     * Test case for issue #7: https://github.com/dadoonet/fscrawler/issues/7 : Use filename as ID
     */
    @Test
    public void test_filename_as_id() throws Exception {
        Fs fs = startCrawlerDefinition().setFilenameAsId(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        assertThat("Document should exists with [roottxtfile.txt] id...", awaitBusy(() -> {
            try {
                return elasticsearchClient.isExistingDocument(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                        "roottxtfile.txt");
            } catch (IOException e) {
                return false;
            }
        }), equalTo(true));
    }

    /**
     * Test case for issue #237:  https://github.com/dadoonet/fscrawler/issues/237 Delete json documents
     */
    @Test
    public void test_add_as_inner_object() throws Exception {
        Fs fs = startCrawlerDefinition().setJsonSupport(true).setAddAsInnerObject(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        assertThat("We should have 2 doc for tweet in object.text field...", awaitBusy(() -> {
            try {
                SearchResponse response = elasticsearchClient.search(getCrawlerName(), null, "object.text:tweet");
                return response.getHits().getTotal() == 2;
            } catch (IOException e) {
                logger.warn("Caught exception while running the test", e);
                return false;
            }
        }), equalTo(true));
    }

    @Test
    public void test_store_source() throws Exception {
        Fs fs = startCrawlerDefinition().setStoreSource(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            // We check that the field is in _source
            assertThat(hit.getSource().get(Doc.FIELD_NAMES.ATTACHMENT), notNullValue());
        }
    }

    @Test
    public void test_do_not_store_source() throws Exception {
        startCrawler();

        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null, "_source", "*");
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            // We check that the field has not been stored
            assertThat(hit.getFields(), nullValue());

            // We check that the field is not part of _source
            assertThat(hit.getSource().get(Doc.FIELD_NAMES.ATTACHMENT), nullValue());
        }
    }

    @Test
    public void test_defaults() throws Exception {
        startCrawler();

        // We expect to have one file
        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1);

        // The default configuration should not add file attributes
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            assertThat(hit.getSource().get(Doc.FIELD_NAMES.ATTRIBUTES), nullValue());
        }

    }

    @Test
    public void test_subdirs() throws Exception {
        startCrawler();

        // We expect to have two files
        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 2);

        // We check that the subdir document has his meta path data correctly set
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Object virtual = extractFromPath(hit.getSource(), Doc.FIELD_NAMES.PATH)
                    .get(fr.pilato.elasticsearch.crawler.fs.meta.doc.Path.FIELD_NAMES.VIRTUAL);
            assertThat(virtual, isOneOf("/subdir", "/"));
        }
    }

    @Test
    public void test_subdirs_deep_tree() throws Exception {
        startCrawler();

        // We expect to have two files
        countTestHelper(getCrawlerName(), null, 7);

        // Run aggs
        SearchResponse response = elasticsearchClient.searchJson(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                "{\n" + "  \"size\": 0, \n" + "  \"aggs\": {\n" + "    \"folders\": {\n" + "      \"terms\": {\n"
                        + "        \"field\": \"path.virtual.tree\"\n" + "      }\n" + "    }\n" + "  }\n" + "}");
        assertThat(response.getHits().getTotal(), is(7L));

        // aggregations
        assertThat(response.getAggregations(), hasKey("folders"));
        List<Object> buckets = (List) extractFromPath(response.getAggregations(), "folders").get("buckets");

        assertThat(buckets, iterableWithSize(7));
    }

    @Test
    public void test_subdirs_with_patterns() throws Exception {
        Fs fs = startCrawlerDefinition().addInclude("*.txt").build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We expect to have seven files
        countTestHelper(getCrawlerName(), null, 7);
    }

    @Test
    public void test_ignore_dir() throws Exception {
        Fs fs = startCrawlerDefinition().addExclude(".ignore").build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We expect to have one file
        countTestHelper(getCrawlerName(), null, 1);
    }

    @Test
    public void test_multiple_crawlers() throws Exception {
        Fs fs1 = startCrawlerDefinition(currentTestResourceDir.resolve("crawler1").toString()).build();
        Fs fs2 = startCrawlerDefinition(currentTestResourceDir.resolve("crawler2").toString()).build();
        FsCrawlerImpl crawler1 = startCrawler(getCrawlerName() + "_1", fs1,
                endCrawlerDefinition(getCrawlerName() + "_1"), null);
        FsCrawlerImpl crawler2 = startCrawler(getCrawlerName() + "_2", fs2,
                endCrawlerDefinition(getCrawlerName() + "_2"), null);
        // We should have one doc in index 1...
        countTestHelper(getCrawlerName() + "_1", null, 1);
        // We should have one doc in index 2...
        countTestHelper(getCrawlerName() + "_2", null, 1);

        crawler1.close();
        crawler2.close();
    }

    @Test
    public void test_filename_analyzer() throws Exception {
        startCrawler();

        // We should have one doc
        countTestHelper(getCrawlerName(), "file.filename:roottxtfile.txt", 1, null);
    }

    /**
     * Test for #83: https://github.com/dadoonet/fscrawler/issues/83
     */
    @Test
    public void test_time_value() throws Exception {
        Fs fs = startCrawlerDefinition(TimeValue.timeValueHours(1)).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We expect to have one file
        countTestHelper(getCrawlerName(), null, 1);
    }

    /**
     * You have to adapt this test to your own system (login / password and SSH connexion)
     * So this test is disabled by default
     */
    @Test
    @Ignore
    public void test_ssh() throws Exception {
        String username = "USERNAME";
        String password = "PASSWORD";
        String hostname = "localhost";

        Fs fs = startCrawlerDefinition().build();
        Server server = Server.builder().setHostname(hostname).setUsername(username).setPassword(password)
                .setProtocol(FsCrawlerImpl.PROTOCOL.SSH).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), server);

        countTestHelper(getCrawlerName(), null, 2);
    }

    /**
     * You have to adapt this test to your own system (login / pem file and SSH connexion)
     * So this test is disabled by default
     */
    @Test
    @Ignore
    public void test_ssh_with_key() throws Exception {
        String username = "USERNAME";
        String path_to_pem_file = "/path/to/private_key.pem";
        String hostname = "localhost";

        Fs fs = startCrawlerDefinition().build();
        Server server = Server.builder().setHostname(hostname).setUsername(username).setPemPath(path_to_pem_file)
                .setProtocol(FsCrawlerImpl.PROTOCOL.SSH).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), server);

        countTestHelper(getCrawlerName(), null, 1);
    }

    /**
     * Test for #105: https://github.com/dadoonet/fscrawler/issues/105
     */
    @Test
    public void test_unparsable() throws Exception {
        startCrawler();

        // We expect to have two files
        countTestHelper(getCrawlerName(), null, 2);
    }

    /**
     * Test for #103: https://github.com/dadoonet/fscrawler/issues/103
     */
    @Test
    public void test_index_content() throws Exception {
        Fs fs = startCrawlerDefinition().setIndexContent(false).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We expect to have one file
        countTestHelper(getCrawlerName(), null, 1);

        countTestHelper(getCrawlerName(), "content:file*", 0, null);
        countTestHelper(getCrawlerName(), "file.content_type:text*", 0, null);
    }

    @Test
    public void test_bulk_flush() throws Exception {
        Fs fs = startCrawlerDefinition().build();
        startCrawler(getCrawlerName(), fs, generateElasticsearchConfig(getCrawlerName(), securityInstalled, 100,
                TimeValue.timeValueSeconds(2)), null);

        countTestHelper(getCrawlerName(), null, 1);
    }

    @Test
    public void test_checksum_md5() throws Exception {
        try {
            MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            assumeNoException(e);
        }

        Fs fs = startCrawlerDefinition().setChecksum("MD5").build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Object checksum = extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.CHECKSUM);
            assertThat(checksum, is("caa71e1914ecbcf5ae4f46cf85de8648"));
        }
    }

    @Test
    public void test_checksum_sha1() throws Exception {
        try {
            MessageDigest.getInstance("SHA-1");
        } catch (NoSuchAlgorithmException e) {
            assumeNoException(e);
        }

        Fs fs = startCrawlerDefinition().setChecksum("SHA-1").build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
        SearchResponse searchResponse = countTestHelper(getCrawlerName(), null, 1, null);
        for (SearchResponse.Hit hit : searchResponse.getHits().getHits()) {
            Object checksum = extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(File.FIELD_NAMES.CHECKSUM);
            assertThat(checksum, is("81bf7dba781a1efbea6d9f2ad638ffe772ba4eab"));
        }
    }

    @Test
    public void test_checksum_non_existing_algorithm() throws Exception {
        Fs fs = startCrawlerDefinition().setChecksum("FSCRAWLER").build();
        crawler = new FsCrawlerImpl(metadataDir, FsSettings.builder(getCrawlerName())
                .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build());
        crawler.start();
        assertThat(crawler.isClosed(), is(true));
    }

    /**
     * Test case for issue #204: https://github.com/dadoonet/fscrawler/issues/204 : JSON files are indexed twice
     */
    @Test
    public void test_json_support_and_other_files() throws Exception {
        Fs fs = startCrawlerDefinition().setJsonSupport(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        assertThat("We should have 2 docs only...", awaitBusy(() -> {
            try {
                SearchResponse response = elasticsearchClient.search(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                        (String) null);
                return response.getHits().getTotal() == 2;
            } catch (IOException e) {
                logger.warn("Caught exception while running the test", e);
                return false;
            }
        }), equalTo(true));
    }

    /**
     * Test case for issue #185: https://github.com/dadoonet/fscrawler/issues/185 : Add xml_support setting
     */
    @Test
    public void test_xml_enabled() throws Exception {
        Fs fs = startCrawlerDefinition().setXmlSupport(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
        SearchResponse response = countTestHelper(getCrawlerName(), null, 3);

        countTestHelper(getCrawlerName(), "title:maeve", 1);
        countTestHelper(getCrawlerName(), "price:[5 TO 6]", 2);

        logger.info("XML documents converted to:");
        for (SearchResponse.Hit hit : response.getHits().getHits()) {
            logger.info("{}", hit.toString());
        }
    }

    /**
     * Test case for issue #185: https://github.com/dadoonet/fscrawler/issues/185 : Add xml_support setting
     */
    @Test
    public void test_xml_and_json_enabled() throws Exception {
        Fs fs = startCrawlerDefinition().setXmlSupport(true).setJsonSupport(true).build();

        logger.info("  --> starting crawler [{}]", getCrawlerName());

        crawler = new FsCrawlerImpl(metadataDir, FsSettings.builder(getCrawlerName())
                .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build());
        crawler.start();

        // We wait up to 10 seconds before considering a failing test
        assertThat("Job should not start", awaitBusy(() -> crawler.isClosed()), equalTo(true));
    }

    /**
     * Test case for #227: https://github.com/dadoonet/fscrawler/issues/227 : Add support for run only once
     */
    @Test
    public void test_single_loop() throws Exception {
        Fs fs = startCrawlerDefinition().build();

        logger.info("  --> starting crawler [{}]", getCrawlerName());

        crawler = new FsCrawlerImpl(
                metadataDir, FsSettings.builder(getCrawlerName())
                        .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build(),
                1, false, false);
        crawler.start();

        countTestHelper(getCrawlerName(), null, 1);

        assertThat("Job should stop after one run", crawler.isClosed(), is(true));
        assertThat(crawler.getRunNumber(), is(1));
    }

    /**
     * Test case for #227: https://github.com/dadoonet/fscrawler/issues/227 : Add support for run only once
     */
    @Test
    public void test_two_loops() throws Exception {
        Fs fs = startCrawlerDefinition().build();

        logger.info("  --> starting crawler [{}]", getCrawlerName());

        crawler = new FsCrawlerImpl(
                metadataDir, FsSettings.builder(getCrawlerName())
                        .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build(),
                2, false, false);
        crawler.start();

        countTestHelper(getCrawlerName(), null, 1);

        assertThat("Job should stop after two runs", awaitBusy(() -> crawler.isClosed()), is(true));
        assertThat(crawler.getRunNumber(), is(2));
    }

    /**
     * Test case for #205: https://github.com/dadoonet/fscrawler/issues/205 : Add support for update mapping
     */
    @Test
    public void test_update_mapping() throws Exception {
        elasticsearchClient.createIndex(getCrawlerName(), false,
                "{\n" + "  \"settings\": {\n" + "    \"analysis\": {\n" + "      \"analyzer\": {\n"
                        + "        \"fscrawler_path\": {\n" + "          \"tokenizer\": \"fscrawler_path\"\n"
                        + "        }\n" + "      },\n" + "      \"tokenizer\": {\n"
                        + "        \"fscrawler_path\": {\n" + "          \"type\": \"path_hierarchy\"\n"
                        + "        }\n" + "      }\n" + "    }\n" + "  }\n" + "}\n");
        elasticsearchClient.putMapping(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                "{ \"" + FsCrawlerUtil.INDEX_TYPE_DOC + "\" : {   \"_source\" : {\n" + "    \"excludes\" : [\n"
                        + "      \"attachment\"\n" + "    ]\n" + "  }\n} }");

        Fs fs = startCrawlerDefinition().build();

        logger.info("  --> starting crawler [{}]", getCrawlerName());

        crawler = new FsCrawlerImpl(
                metadataDir, FsSettings.builder(getCrawlerName())
                        .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build(),
                -1, true, false);
        crawler.start();

        countTestHelper(getCrawlerName(), null, 1);
    }

    /**
     * Test case for #205: https://github.com/dadoonet/fscrawler/issues/205 : Add support for update mapping
     */
    @Test
    public void test_update_mapping_but_dont_launch() throws Exception {
        elasticsearchClient.createIndex(getCrawlerName(), false,
                "{\n" + "  \"settings\": {\n" + "    \"analysis\": {\n" + "      \"analyzer\": {\n"
                        + "        \"fscrawler_path\": {\n" + "          \"tokenizer\": \"fscrawler_path\"\n"
                        + "        }\n" + "      },\n" + "      \"tokenizer\": {\n"
                        + "        \"fscrawler_path\": {\n" + "          \"type\": \"path_hierarchy\"\n"
                        + "        }\n" + "      }\n" + "    }\n" + "  }\n" + "}\n");
        elasticsearchClient.putMapping(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                "{ \"" + FsCrawlerUtil.INDEX_TYPE_DOC + "\" : {   \"_source\" : {\n" + "    \"excludes\" : [\n"
                        + "      \"attachment\"\n" + "    ]\n" + "  }\n} }");

        Fs fs = startCrawlerDefinition().build();

        logger.info("  --> starting crawler [{}]", getCrawlerName());

        crawler = new FsCrawlerImpl(
                metadataDir, FsSettings.builder(getCrawlerName())
                        .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build(),
                0, true, false);
        crawler.start();

        assertThat(crawler.isClosed(), is(true));
        assertThat(crawler.getRunNumber(), is(0));
    }

    /**
     * Test case for #205: https://github.com/dadoonet/fscrawler/issues/205 : Add support for update mapping
     */
    @Test(expected = ResponseException.class)
    public void test_fail_update_mapping() throws Exception {
        elasticsearchClient.createIndex(getCrawlerName());
        elasticsearchClient.putMapping(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                "{ \"" + FsCrawlerUtil.INDEX_TYPE_DOC + "\" : {\n" + "  \"properties\": {\n"
                        + "    \"content\": {\n" + "      \"type\": \"date\"\n" + "    }\n" + "  }\n" + "}" + " }");

        Fs fs = startCrawlerDefinition().build();

        logger.info("  --> starting crawler [{}]", getCrawlerName());

        crawler = new FsCrawlerImpl(
                metadataDir, FsSettings.builder(getCrawlerName())
                        .setElasticsearch(endCrawlerDefinition(getCrawlerName())).setFs(fs).build(),
                -1, true, false);

        crawler.start();
    }

    /**
     * Test case for #95: https://github.com/dadoonet/fscrawler/issues/95 : Folder index is not getting delete on delete of folder
     */
    @Test
    public void test_remove_folder_deleted_enabled() throws Exception {
        Fs fs = startCrawlerDefinition().setRemoveDeleted(true).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We should have 7 docs first
        countTestHelper(getCrawlerName(), null, 7, currentTestResourceDir);

        logContentOfDir(currentTestResourceDir, Level.DEBUG);

        // We remove a directory
        logger.info("  ---> Removing dir subdir1");
        Files.walkFileTree(currentTestResourceDir.resolve("subdir1"), new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });

        logContentOfDir(currentTestResourceDir, Level.DEBUG);

        // We expect to have 4 docs now
        countTestHelper(getCrawlerName(), null, 4, currentTestResourceDir);
    }

    /**
     * Test case for #155: https://github.com/dadoonet/fscrawler/issues/155 : New option: do not index folders
     */
    @Test
    public void test_ignore_folders() throws Exception {
        Fs fs = startCrawlerDefinition().setIndexFolders(false).build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // We expect to have two files
        countTestHelper(getCrawlerName(), null, 2);

        // We expect having no folders
        SearchRequest.Builder sr = SearchRequest.builder();
        SearchResponse response = elasticsearchClient.search(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_FOLDER,
                sr.build());
        staticLogger.trace("result {}", response.toString());
        assertThat(response.getHits().getTotal(), is(0L));
    }

    /**
     * Test case for #183: https://github.com/dadoonet/fscrawler/issues/183 : Optimize document and folder mappings
     * We want to make sure we can highlight documents even if we don't store fields
     */
    @Test
    public void test_highlight_documents() throws Exception {
        startCrawler();

        // We expect to have one file
        countTestHelper(getCrawlerName(), null, 1);

        // Let's test highlighting
        SearchResponse response = elasticsearchClient.searchJson(getCrawlerName(), FsCrawlerUtil.INDEX_TYPE_DOC,
                "{\n" + "  \"query\": {\n" + "    \"match\": {\n" + "      \"content\": \"exemplo\"\n" + "    }\n"
                        + "  },\n" + "  \"highlight\": {\n" + "    \"fields\": {\n" + "      \"content\": {}\n"
                        + "    }\n" + "  }\n" + "}");
        staticLogger.trace("result {}", response.toString());
        assertThat(response.getHits().getTotal(), is(1L));

        SearchResponse.Hit hit = response.getHits().getHits().get(0);
        assertThat(hit.getHighlight(), hasKey("content"));
        assertThat(hit.getHighlight().get("content"), hasSize(1));
        assertThat(hit.getHighlight().get("content").get(0), containsString("<em>exemplo</em>"));
    }

    /**
     * Test case for #230: https://github.com/dadoonet/fscrawler/issues/230 : Add support for compressed files
     * It's a long job, so we let it run up to 2 minutes
     */
    @Test
    public void test_zip() throws Exception {
        startCrawler(getCrawlerName(), startCrawlerDefinition().build(), endCrawlerDefinition(getCrawlerName()),
                null, null, TimeValue.timeValueMinutes(2));

        // We expect to have one file
        countTestHelper(getCrawlerName(), null, 1);
    }

    /**
     * Test case for #234: https://github.com/dadoonet/fscrawler/issues/234 : Support ingest pipeline processing
     */
    @Test
    public void test_ingest_pipeline() throws Exception {
        String crawlerName = getCrawlerName();

        // We can only run this test against a 5.0 cluster or >
        assumeThat("We skip the test as we are not running it with a 5.0 cluster or >",
                elasticsearchClient.isIngestSupported(), is(true));

        // Create an empty ingest pipeline
        String pipeline = "{\n" + "  \"description\" : \"describe pipeline\",\n" + "  \"processors\" : [\n"
                + "    {\n" + "      \"rename\": {\n" + "        \"field\": \"content\",\n"
                + "        \"target_field\": \"my_content_field\"\n" + "      }\n" + "    }\n" + "  ]\n" + "}";
        StringEntity entity = new StringEntity(pipeline, StandardCharsets.UTF_8);

        elasticsearchClient.getClient().performRequest("PUT", "_ingest/pipeline/" + crawlerName,
                Collections.emptyMap(), entity);

        Elasticsearch elasticsearch = endCrawlerDefinition(crawlerName);
        elasticsearch.setPipeline(crawlerName);

        startCrawler(crawlerName, startCrawlerDefinition().build(), elasticsearch, null);

        // We expect to have one file
        countTestHelper(crawlerName, "my_content_field:perniciosoque", 1);
    }

    /**
     * Test case for #251: https://github.com/dadoonet/fscrawler/issues/251 : Add a REST Layer
     */
    @Test
    public void test_with_rest_only() throws Exception {
        logger.info("  --> starting crawler [{}]", getCrawlerName());

        // TODO do this rarely() createIndex(jobName);

        crawler = new FsCrawlerImpl(metadataDir,
                FsSettings.builder(getCrawlerName()).setElasticsearch(endCrawlerDefinition(getCrawlerName()))
                        .setFs(startCrawlerDefinition().build()).setServer(null).setRest(rest).build(),
                0, false, true);
        crawler.start();

        // We expect to have one file
        //        countTestHelper(getCrawlerName(), null, 1);
    }

    /**
     * Test case for #136: https://github.com/dadoonet/fscrawler/issues/136 : Moving existing files does not index new files
     */
    @Test
    public void test_moving_files() throws Exception {
        String filename = "oldfile.txt";

        startCrawler();

        // Let's first create some files
        logger.info(" ---> Creating a file [{}]", filename);

        Path tmpDir = rootTmpDir.resolve("resources").resolve(getCurrentTestName() + "-tmp");
        if (Files.notExists(tmpDir)) {
            Files.createDirectory(tmpDir);
        }

        Path file = Files.createFile(tmpDir.resolve(filename));
        Files.write(file, "Hello world".getBytes(Charsets.UTF_8));

        // We should have 1 doc first
        countTestHelper(getCrawlerName(), null, 1);

        logContentOfDir(currentTestResourceDir, Level.DEBUG);

        // We remove a directory
        logger.info("  ---> Moving file [{}] to [{}]", file, currentTestResourceDir);
        Files.move(file, currentTestResourceDir.resolve(filename));

        logContentOfDir(currentTestResourceDir, Level.DEBUG);

        // We expect to have 4 docs now
        countTestHelper(getCrawlerName(), null, 2);
    }
}