grakn.core.deduplicator.AttributeDeduplicatorE2E.java Source code

Java tutorial

Introduction

Here is the source code for grakn.core.deduplicator.AttributeDeduplicatorE2E.java

Source

/*
 * GRAKN.AI - THE KNOWLEDGE GRAPH
 * Copyright (C) 2018 Grakn Labs Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

package grakn.core.deduplicator;

import grakn.client.GraknClient;
import grakn.core.concept.answer.ConceptMap;
import graql.lang.Graql;
import org.apache.commons.io.FileUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeroturnaround.exec.ProcessExecutor;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeoutException;

import static grakn.core.deduplicator.AttributeDeduplicatorE2EConstants.GRAKN_UNZIPPED_DIRECTORY;
import static grakn.core.deduplicator.AttributeDeduplicatorE2EConstants.assertGraknRunning;
import static grakn.core.deduplicator.AttributeDeduplicatorE2EConstants.assertGraknStopped;
import static grakn.core.deduplicator.AttributeDeduplicatorE2EConstants.assertZipExists;
import static grakn.core.deduplicator.AttributeDeduplicatorE2EConstants.unzipGrakn;
import static graql.lang.Graql.type;
import static graql.lang.Graql.var;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;

/**
 * End-to-end test that inserts many duplicate {@code name} attributes into a running Grakn server
 * and then checks that only one attribute per distinct value remains.
 *
 * <p>The class-level fixtures unzip a Grakn distribution into {@code GRAKN_UNZIPPED_DIRECTORY},
 * start the server with {@code ./grakn server start} before the tests, and stop the server and
 * delete the directory afterwards.
 */
public class AttributeDeduplicatorE2E {
    private static final Logger LOG = LoggerFactory.getLogger(AttributeDeduplicatorE2E.class);
    private final GraknClient localhostGrakn = new GraknClient("localhost:48555");
    // NOTE(review): this field is not read anywhere in this class; presumably it was meant to be
    // polled by waitUntilAllAttributesDeduplicated — confirm before removing.
    private final Path queuePath = GRAKN_UNZIPPED_DIRECTORY.resolve("server").resolve("db").resolve("queue");

    // Runs commands from inside the unzipped distribution and mirrors their output to this process.
    private static final ProcessExecutor commandExecutor = new ProcessExecutor()
            .directory(GRAKN_UNZIPPED_DIRECTORY.toFile()).redirectOutput(System.out).redirectError(System.err)
            .readOutput(true);

    /**
     * Prepares the distribution and starts the Grakn server before any test runs.
     *
     * @throws IOException if running the start command fails with an I/O error
     * @throws InterruptedException if the calling thread is interrupted while the command runs
     * @throws TimeoutException if the command execution times out
     */
    @BeforeClass
    public static void setup_prepareDistribution() throws IOException, InterruptedException, TimeoutException {
        assertZipExists();
        unzipGrakn();
        assertGraknStopped();
        commandExecutor.command("./grakn", "server", "start").execute();
        assertGraknRunning();
    }

    /**
     * Stops the Grakn server and deletes the unzipped distribution after all tests have run.
     *
     * @throws IOException if running the stop command or deleting the directory fails
     * @throws InterruptedException if the calling thread is interrupted while the command runs
     * @throws TimeoutException if the command execution times out
     */
    @AfterClass
    public static void cleanup_cleanupDistribution() throws IOException, InterruptedException, TimeoutException {
        commandExecutor.command("./grakn", "server", "stop").execute();
        assertGraknStopped();
        FileUtils.deleteDirectory(GRAKN_UNZIPPED_DIRECTORY.toFile());
    }

    /**
     * Inserts {@code numOfUniqueNames} distinct names, each {@code numOfDuplicatesPerName} times,
     * from 8 parallel threads, waits a fixed amount of time, and asserts that the number of
     * {@code name} attributes equals the number of distinct names.
     *
     * @throws InterruptedException if interrupted while inserting or while waiting
     * @throws ExecutionException if any of the parallel insertions fails
     */
    @Test
    public void shouldDeduplicateAttributes() throws InterruptedException, ExecutionException {
        int numOfUniqueNames = 10;
        int numOfDuplicatesPerName = 673;
        ExecutorService executorServiceForParallelInsertion = Executors.newFixedThreadPool(8);

        LOG.info("initiating the shouldDeduplicate10AttributesWithDuplicates test...");
        try (GraknClient.Session session = localhostGrakn.session("attribute_deduplicator_e2e")) {
            // insert numOfUniqueNames attributes, each with numOfDuplicatesPerName duplicates
            LOG.info("defining the schema...");
            defineParentChildSchema(session);
            LOG.info("inserting {} unique attributes with {} duplicates per attribute....",
                    numOfUniqueNames, numOfDuplicatesPerName);
            insertNameShuffled(session, numOfUniqueNames, numOfDuplicatesPerName,
                    executorServiceForParallelInsertion);

            // give the server time to deduplicate (fixed delay, see waitUntilAllAttributesDeduplicated)
            LOG.info("names and duplicates have been inserted. waiting for the deduplication to finish...");
            long timeoutMs = 10000;
            waitUntilAllAttributesDeduplicated(timeoutMs);
            LOG.info("deduplication has finished.");

            // verify deduplicated attributes
            LOG.info("verifying the number of attributes");
            int countAfterDeduplication = countTotalNames(session);
            assertThat(countAfterDeduplication, equalTo(numOfUniqueNames));
            LOG.info("test completed successfully. there are {} unique names found", countAfterDeduplication);
        } finally {
            // The pool's worker threads are non-daemon; without a shutdown they outlive the test.
            // On the success path every task has already completed, so this only releases the
            // threads; on a failure path it also discards insertions that have not started yet.
            executorServiceForParallelInsertion.shutdownNow();
        }
    }

    /**
     * Defines the schema used by the test: a string attribute {@code name}, roles {@code parent}
     * and {@code child}, an entity {@code person} and a relation {@code parentchild}.
     *
     * @param session the session in which a write transaction is opened and committed
     */
    private void defineParentChildSchema(GraknClient.Session session) {
        try (GraknClient.Transaction tx = session.transaction().write()) {
            tx.execute(Graql.define(type("name").sub("attribute").datatype(Graql.Token.DataType.STRING),
                    type("parent").sub("role"), type("child").sub("role"),
                    type("person").sub("entity").has("name").plays("parent").plays("child"),
                    type("parentchild").sub("relation").relates("parent").relates("child")));
            tx.commit();
        }
    }

    /**
     * Inserts {@code nameCount} distinct names, each repeated {@code duplicatePerNameCount} times,
     * in a deterministically shuffled order. Every insertion runs in its own write transaction on
     * {@code executorService}; the method blocks until all insertions have completed.
     *
     * @param session the session used to open one write transaction per insertion
     * @param nameCount number of distinct name values
     * @param duplicatePerNameCount number of times each name value is inserted
     * @param executorService executor on which the insertions run
     * @throws ExecutionException if any insertion completes exceptionally
     * @throws InterruptedException if interrupted while waiting for the insertions
     */
    private static void insertNameShuffled(GraknClient.Session session, int nameCount, int duplicatePerNameCount,
            ExecutorService executorService) throws ExecutionException, InterruptedException {

        List<String> duplicatedNames = new ArrayList<>();
        for (int i = 0; i < nameCount; ++i) {
            String name = "lorem ipsum dolor sit amet " + i;
            for (int j = 0; j < duplicatePerNameCount; ++j) {
                duplicatedNames.add(name);
            }
        }

        // Fixed seed: the interleaving of names is the same on every run.
        Collections.shuffle(duplicatedNames, new Random(1));

        List<CompletableFuture<Void>> asyncInsertions = new ArrayList<>();
        for (String name : duplicatedNames) {
            CompletableFuture<Void> asyncInsert = CompletableFuture.runAsync(() -> {
                try (GraknClient.Transaction tx = session.transaction().write()) {
                    tx.execute(Graql.insert(var().isa("name").val(name)));
                    tx.commit();
                }
            }, executorService);
            asyncInsertions.add(asyncInsert);
        }

        CompletableFuture.allOf(asyncInsertions.toArray(new CompletableFuture[0])).get();
    }

    /**
     * Blocks the calling thread for {@code timeoutMs} milliseconds.
     *
     * <p>Despite its name, this method does not observe the deduplication state; it is a fixed
     * delay that always lasts the full duration.
     *
     * @param timeoutMs how long to sleep, in milliseconds
     * @throws InterruptedException if interrupted while sleeping
     */
    private void waitUntilAllAttributesDeduplicated(long timeoutMs) throws InterruptedException {
        Thread.sleep(timeoutMs);
    }

    /**
     * Counts the {@code name} attribute instances visible in a read transaction.
     *
     * @param session the session in which the read transaction is opened
     * @return the count reported by the query, narrowed to {@code int}
     */
    private int countTotalNames(GraknClient.Session session) {
        try (GraknClient.Transaction tx = session.transaction().read()) {
            return tx.execute(Graql.match(var("x").isa("name")).get().count()).get(0).number().intValue();
        }
    }
}