com.edduarte.vokter.Context.java Source code

Introduction

Here is the source code for com.edduarte.vokter.Context.java
Source

/*
 * Copyright 2015 Eduardo Duarte
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.edduarte.vokter;

import com.edduarte.vokter.diff.Difference;
import com.edduarte.vokter.diff.DifferenceDetector;
import com.edduarte.vokter.document.Document;
import com.edduarte.vokter.document.DocumentBuilder;
import com.edduarte.vokter.document.DocumentCollection;
import com.edduarte.vokter.job.JobManager;
import com.edduarte.vokter.job.JobManagerHandler;
import com.edduarte.vokter.keyword.Keyword;
import com.edduarte.vokter.keyword.KeywordBuilder;
import com.edduarte.vokter.model.v1.SubscribeRequest;
import com.edduarte.vokter.parser.Parser;
import com.edduarte.vokter.parser.ParserPool;
import com.edduarte.vokter.parser.SimpleParser;
import com.mongodb.BulkWriteOperation;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import org.eclipse.jetty.server.Connector;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.handler.HandlerList;
import org.eclipse.jetty.server.nio.SelectChannelConnector;
import org.eclipse.jetty.util.component.LifeCycle;
import org.eclipse.jetty.util.thread.QueuedThreadPool;
import org.eclipse.jetty.webapp.WebAppContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.security.ProtectionDomain;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * @author Eduardo Duarte (<a href="mailto:hello@edduarte.com">hello@edduarte.com</a>)
 * @version 1.3.2
 * @since 1.0.0
 */
/**
 * Process-wide singleton that wires together the job manager, the parser
 * pool, the MongoDB databases and the embedded Jetty server, and that answers
 * the {@link JobManagerHandler} callbacks issued by scheduled jobs.
 *
 * <p>The instance is created eagerly in a static initializer; the heavier
 * resources (database client, language detector, document collection, parsers
 * and web server) are only set up by
 * {@link #start(int, int, String, int)}.
 */
public class Context implements LifeCycle.Listener, JobManagerHandler {

    private static final Logger logger = LoggerFactory.getLogger(Context.class);

    /** Name of the MongoDB database that stores fetched documents. */
    private static final String DOCUMENTS_DB = "vokter_documents_db";

    /** Name of the MongoDB database that stores parsed occurrences. */
    private static final String OCCURRENCES_DB = "vokter_occurrences_db";

    /** Name of the MongoDB database that stores detected differences. */
    private static final String DIFFERENCES_DB = "vokter_differences_db";

    /**
     * The singleton instance, or {@code null} when construction failed (the
     * failure is logged by the static initializer below).
     */
    private static final Context instance;

    static {
        Context aux;
        try {
            aux = new Context();
        } catch (Exception ex) {
            // Construction failures are logged rather than propagated, so
            // class loading itself never fails; callers of getInstance() get
            // null in that case.
            logger.error(ex.getMessage(), ex);
            aux = null;
        }
        instance = aux;
    }

    /**
     * A manager for all Quartz jobs, handling scheduling and persistence of
     * asynchronous processes for document differences detection and
     * keyword-difference matching.
     */
    private final JobManager jobManager;

    /**
     * A parser-pool that contains a set number of parsers. When the last
     * parser from the pool is removed, future parsing workers will be locked
     * until a used parser is placed back in the pool.
     */
    private final ParserPool parserPool;

    /**
     * The client for the used MongoDB database.
     */
    private MongoClient mongoClient;

    /**
     * The MongoDB database for fetched documents.
     */
    private DB documentsDB;

    /**
     * The MongoDB database for parsed occurrences for each document.
     */
    private DB occurrencesDB;

    /**
     * The MongoDB database for detected differences between snapshots for
     * each document.
     */
    private DB differencesDB;

    /**
     * The Bayesian detection model that allows language detection.
     */
    private LanguageDetector langDetector;

    /**
     * The collection instance that will contain all imported and processed
     * corpus during the context's execution.
     */
    private DocumentCollection collection;

    /**
     * Flag that is set once {@link #start(int, int, String, int)} has brought
     * the server up and cleared again when the server has stopped; it keeps
     * {@code start} from initializing the context twice.
     */
    private boolean initialized;

    /**
     * Flag that sets difference detection jobs to perform stopword filtering.
     */
    private boolean isStoppingEnabled = true;

    /**
     * Flag that sets difference detection jobs to perform stemming.
     */
    private boolean isStemmingEnabled = true;

    /**
     * Flag that sets difference detection jobs to ignore casing, i.e. the
     * document and keyword builders are configured with {@code ignoreCase()}
     * so that equal occurrences with different casing are matched.
     */
    private boolean ignoreCase = true;

    /**
     * Creates the context with its job manager and an empty parser pool.
     * Databases, parsers and the server are set up later by
     * {@link #start(int, int, String, int)}.
     *
     * @throws Exception if the job manager cannot be created
     */
    private Context() throws Exception {
        jobManager = JobManager.create("vokter_job_manager", 420, this);
        parserPool = new ParserPool();
    }

    /**
     * Returns the shared context.
     *
     * @return the singleton instance, or {@code null} if its construction
     *         failed during class initialization
     */
    public static Context getInstance() {
        return instance;
    }

    /**
     * Delegates the subscription request to the job manager.
     *
     * @param request the subscription request to schedule
     * @return the result reported by {@code JobManager.createJob}
     */
    public boolean createJob(final SubscribeRequest request) {
        return jobManager.createJob(request);
    }

    /**
     * Asks the job manager to cancel the matching job registered for the
     * given document and client.
     *
     * @param documentUrl the url of the watched document
     * @param clientUrl   the url of the subscribed client
     * @return the result reported by {@code JobManager.cancelMatchingJob}
     */
    public boolean cancelJob(String documentUrl, final String clientUrl) {
        return jobManager.cancelMatchingJob(documentUrl, clientUrl);
    }

    /**
     * Indexes the specified document and detects differences between an older
     * snapshot and the new one. Once differences are collected, saves the resulting
     * index of all occurrences of the new snapshot for future query and comparison
     * jobs.
     *
     * @param url the url of the document to snapshot
     * @return {@code false} if the new snapshot could not be built,
     *         {@code true} otherwise
     */
    @Override
    public boolean detectDifferences(String url) {

        // create a new document snapshot for the provided url
        DocumentBuilder builder = DocumentBuilder.fromUrl(url).withLanguageDetector(langDetector);

        if (isStoppingEnabled) {
            builder.withStopwords();
        }
        if (isStemmingEnabled) {
            builder.withStemming();
        }
        if (ignoreCase) {
            builder.ignoreCase();
        }
        Document newDocument = builder.build(occurrencesDB, parserPool);
        if (newDocument == null) {
            // A problem occurred during processing, mostly during the fetching phase.
            // This could happen if the page was unavailable at the time.
            return false;
        }

        // check if there is an older document in the collection
        Document oldDocument = collection.get(url);

        if (oldDocument != null) {
            // there was already a document for this url on the collection, so
            // detect differences between them and add them to the differences
            // database
            DifferenceDetector detector = new DifferenceDetector(oldDocument, newDocument, parserPool);
            List<Difference> results = detector.call();

            // previously stored differences are always discarded, even when
            // the new comparison found nothing
            removeExistingDifferences(url);

            if (!results.isEmpty()) {
                BulkWriteOperation bulkOp =
                        differencesDB.getCollection(url).initializeUnorderedBulkOperation();
                results.forEach(bulkOp::insert);
                bulkOp.execute();
            }
        }

        // replace the old document in the collection with the new one
        collection.remove(url);
        collection.add(newDocument);

        return true;
    }

    /**
     * Collects the existing differences that were stored in the database.
     *
     * <p>The cursor is consumed sequentially and closed before returning, so
     * no server-side cursor is left open.
     *
     * @param url the url of the document, used as the collection name
     * @return the stored differences, in the order returned by the cursor
     */
    @Override
    public List<Difference> getExistingDifferences(String url) {
        DBCollection diffColl = differencesDB.getCollection(url);
        try (DBCursor cursor = diffColl.find()) {
            return StreamSupport.stream(cursor.spliterator(), false)
                    .map(Difference::new)
                    .collect(Collectors.toList());
        }
    }

    /**
     * Removes existing differences for the specified url by dropping the
     * collection that holds them.
     *
     * @param url the url of the document, used as the collection name
     */
    @Override
    public void removeExistingDifferences(String url) {
        differencesDB.getCollection(url).drop();
    }

    /**
     * Process and build keyword objects based on this context configuration
     * (language detector, stopword filtering, stemming and case handling).
     *
     * @param keywordInput the raw keyword text
     * @return the keyword produced by the builder
     */
    @Override
    public Keyword buildKeyword(String keywordInput) {
        KeywordBuilder builder = KeywordBuilder.fromText(keywordInput).withLanguageDetector(langDetector);

        if (isStoppingEnabled) {
            builder.withStopwords();
        }
        if (isStemmingEnabled) {
            builder.withStemming();
        }
        if (ignoreCase) {
            builder.ignoreCase();
        }
        return builder.build(parserPool);
    }

    /**
     * Enables or disables stopword filtering for subsequently built documents
     * and keywords.
     *
     * @param isStoppingEnabled {@code true} to filter stopwords
     */
    public void setStopwordsEnabled(boolean isStoppingEnabled) {
        this.isStoppingEnabled = isStoppingEnabled;
    }

    /**
     * Enables or disables case-insensitive handling for subsequently built
     * documents and keywords.
     *
     * @param ignoreCase {@code true} to ignore casing
     */
    public void setIgnoreCase(boolean ignoreCase) {
        this.ignoreCase = ignoreCase;
    }

    /**
     * @return {@code true} if stopword filtering is enabled
     */
    public boolean isStoppingEnabled() {
        return isStoppingEnabled;
    }

    /**
     * @return {@code true} if stemming is enabled
     */
    public boolean isStemmingEnabled() {
        return isStemmingEnabled;
    }

    /**
     * Enables or disables stemming for subsequently built documents and
     * keywords.
     *
     * @param isStemmingEnabled {@code true} to perform stemming
     */
    public void setStemmingEnabled(boolean isStemmingEnabled) {
        this.isStemmingEnabled = isStemmingEnabled;
    }

    /**
     * @return {@code true} if casing is ignored
     */
    public boolean isIgnoringCase() {
        return ignoreCase;
    }

    /**
     * Starts this REST context at the specified port, using the specified
     * number of threads and the MongoDB instance at the specified host and
     * port. The call connects to the databases, loads the language detector,
     * initializes the job manager, fills the parser pool, starts the embedded
     * server and then blocks until the server has stopped. Calling it while
     * the context is already initialized is a no-op.
     *
     * @param port       the port the HTTP connector listens on
     * @param maxThreads the maximum size of the server thread pool; also
     *                   bounds the number of pooled parsers
     * @param dbHost     the MongoDB host
     * @param dbPort     the MongoDB port
     * @throws Exception if any of the initialization steps or the server
     *                   start-up fails
     */
    public void start(final int port, final int maxThreads, final String dbHost, final int dbPort)
            throws Exception {
        if (initialized) {
            return;
        }

        // Set JSP to always use Standard JavaC
        System.setProperty("org.apache.jasper.compiler.disablejsr199", "false");

        mongoClient = new MongoClient(dbHost, dbPort);

        documentsDB = mongoClient.getDB(DOCUMENTS_DB);
        occurrencesDB = mongoClient.getDB(OCCURRENCES_DB);
        differencesDB = mongoClient.getDB(DIFFERENCES_DB);

        List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
        langDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(languageProfiles)
                .build();

        collection = new DocumentCollection("vokter_production_collection", documentsDB, occurrencesDB);

        logger.info("Starting jobs...");
        jobManager.initialize();

        logger.info("Starting parsers...");
        // NOTE(review): the loop starts at 1, so maxThreads - 1 parsers are
        // pooled (none when maxThreads <= 1) - confirm this is intentional.
        for (int i = 1; i < maxThreads; i++) {
            Parser p = new SimpleParser();
            parserPool.place(p);
        }

        logger.info("Starting server...");
        Server server = new Server();
        server.setStopAtShutdown(true);

        // Increase server thread pool
        QueuedThreadPool threadPool = new QueuedThreadPool();
        threadPool.setMaxThreads(maxThreads);
        server.setThreadPool(threadPool);

        // Ensure a non-blocking connector (NIO) is used.
        Connector connector = new SelectChannelConnector();
        connector.setPort(port);
        connector.setMaxIdleTime(30000);
        server.setConnectors(new Connector[] { connector });

        // Retrieve the jar file.
        ProtectionDomain protectionDomain = Context.class.getProtectionDomain();
        String jarFilePath = protectionDomain.getCodeSource().getLocation().toExternalForm();

        // Associate the web context (which includes all files in the 'resources'
        // folder) to the jar file.
        WebAppContext context = new WebAppContext(jarFilePath, "/");
        context.setServer(server);

        // Add the handlers for the jar context file.
        HandlerList handlers = new HandlerList();
        handlers.addHandler(context);
        server.setHandler(handlers);
        // Register this context so lifeCycleStopped() releases its resources
        // when the server stops.
        server.addLifeCycleListener(this);

        server.start();

        logger.info("Server started at 'localhost:{}'", port);
        logger.info("Press Ctrl+C to shutdown the server...");
        initialized = true;

        // Block the calling thread until the server has stopped.
        server.join();
    }

    /** No action is taken while the server is starting. */
    @Override
    public void lifeCycleStarting(LifeCycle lifeCycle) {
    }

    /** No action is taken once the server has started. */
    @Override
    public void lifeCycleStarted(LifeCycle lifeCycle) {
    }

    /** No action is taken when the server reports a failure. */
    @Override
    public void lifeCycleFailure(LifeCycle lifeCycle, Throwable throwable) {
    }

    /** No action is taken while the server is stopping. */
    @Override
    public void lifeCycleStopping(LifeCycle lifeCycle) {
    }

    /**
     * Releases the context's resources once the server has stopped: stops the
     * job manager, clears the parser pool, closes the MongoDB client and
     * marks the context as not initialized so it can be started again.
     */
    @Override
    public void lifeCycleStopped(LifeCycle lifeCycle) {
        jobManager.stop();
        parserPool.clear();
        mongoClient.close();
        initialized = false;
    }
}