edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder.java Source code

Introduction

Here is the source code for edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder.java. The IndexBuilder runs as a background thread that rebuilds or updates the Vitro search index, delegating the actual indexing work to a back-end that implements IndexerIface.

Source

/* $This file is distributed under the terms of the license in /doc/license.txt$ */

package edu.cornell.mannlib.vitro.webapp.search.indexing;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpSession;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.hp.hpl.jena.query.QueryParseException;
import com.hp.hpl.jena.rdf.model.Statement;

import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.dao.IndividualDao;
import edu.cornell.mannlib.vitro.webapp.dao.WebappDaoFactory;
import edu.cornell.mannlib.vitro.webapp.search.beans.IndexerIface;
import edu.cornell.mannlib.vitro.webapp.search.beans.StatementToURIsToUpdate;
import edu.cornell.mannlib.vitro.webapp.utils.threads.VitroBackgroundThread;

/**
 * The IndexBuilder is used to rebuild or update a search index.
 * There should only be one IndexBuilder in a Vitro web application.
 * It delegates the actual indexing work to a back-end object that
 * implements IndexerIface.  An example of such a back-end is SolrIndexer.
 *
 * See the class SearchReindexingListener for an example of how a model change
 * listener can use an IndexBuilder to keep the full text index in sync with
 * updates to a model. It calls IndexBuilder.addToChanged().
 */
public class IndexBuilder extends VitroBackgroundThread {
    private WebappDaoFactory wdf;
    private final IndexerIface indexer;

    /** Statements that have changed in the model.  The SearchReindexingListener
     * and other similar objects will use methods on IndexBuilder to add statements
     * to this set.
     */
    private final HashSet<Statement> changedStmts = new HashSet<Statement>();

    /** This is a list of objects that will compute what URIs need to be
     * updated in the search index when a statement changes.  */
    private final List<StatementToURIsToUpdate> stmtToURIsToIndexFunctions;

    /** Indicates that a full index re-build has been requested. */
    private volatile boolean reindexRequested = false;

    /** Indicates that a stop of the indexing objects has been requested. */
    private volatile boolean stopRequested = false;

    /** Length of time to wait before looking for work (if not wakened sooner). */
    public static final long MAX_IDLE_INTERVAL = 1000 * 60; // msec

    /** Length of the pause between when new work arrives and when indexing starts. */
    public static final long WAIT_AFTER_NEW_WORK_INTERVAL = 500; //msec

    /** Flag so we can tell that the index is being updated. */
    public static final String FLAG_UPDATING = "updating";

    /** Flag so we can tell that the index is being rebuilt. */
    public static final String FLAG_REBUILDING = "rebuilding";

    /** List of IndexingEventListeners */
    protected LinkedList<IndexingEventListener> indexingEventListeners = new LinkedList<IndexingEventListener>();

    /** number of threads to use during a full index rebuild. */
    public static final int REINDEX_THREADS = 10;

    /** Max threads to use during an update.  Smaller updates will use fewer threads. */
    public static final int MAX_UPDATE_THREADS = 10;

    /** Number of individuals to index per update thread. */
    public static final int URIS_PER_UPDATE_THREAD = 50;

    private static final Log log = LogFactory.getLog(IndexBuilder.class);

    public static IndexBuilder getBuilder(ServletContext ctx) {
        Object o = ctx.getAttribute(IndexBuilder.class.getName());
        if (o instanceof IndexBuilder) {
            return (IndexBuilder) o;
        } else {
            log.error("IndexBuilder has not been initialized.");
            return null;
        }
    }
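
    /*
     * Illustrative usage sketch (added for this tutorial, not in the original
     * class): code with access to the ServletContext can look up the shared
     * IndexBuilder and request a rebuild or an update, for example from
     * inside a servlet:
     *
     *   IndexBuilder builder = IndexBuilder.getBuilder(getServletContext());
     *   if (builder != null) {
     *       builder.doIndexRebuild();
     *   }
     */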

    public IndexBuilder(IndexerIface indexer, WebappDaoFactory wdf,
            List<StatementToURIsToUpdate> stmtToURIsToIndexFunctions) {
        super("IndexBuilder");

        this.indexer = indexer;

        this.wdf = wdf;

        if (stmtToURIsToIndexFunctions != null)
            this.stmtToURIsToIndexFunctions = stmtToURIsToIndexFunctions;
        else
            this.stmtToURIsToIndexFunctions = Collections.emptyList();

        this.start();
    }

    protected IndexBuilder() {
        //for testing only
        this(null, null, null);
    }

    /**
     * Use this method to add statements that have changed in the model and
     * need to be considered for indexing.  Statements may be added to
     * changedStmts while indexing is in progress.
     *
     * If a statement has been added to or removed from the RDF model and you
     * would like the change to take effect in the search index, this is the
     * method you should use.  Follow the adding of your changes with a call
     * to doUpdateIndex().
     */
    public void addToChanged(Statement stmt) {
        log.debug("call to addToChanged()");
        synchronized (changedStmts) {
            changedStmts.add(stmt);
        }
    }

    /**
     * This method will cause the IndexBuilder to completely rebuild
     * the index.
     */
    public synchronized void doIndexRebuild() {
        log.debug("call to doIndexRebuild()");
        //set flag for full index rebuild
        this.reindexRequested = true;
        //wake up                           
        this.notifyAll();
    }

    /**
     * This will re-index the Individuals affected by statements that were
     * added with addToChanged().
     */
    public synchronized void doUpdateIndex() {
        log.debug("callto doUpdateIndex()");
        //wake up thread and it will attempt to index anything in changedUris
        this.notifyAll();
    }

    /**
     * Add a listener for indexing events.  Methods on listener will be called when
     * events happen in the IndexBuilder.  This is not a Jena ModelListener.
     */
    public synchronized void addIndexBuilderListener(IndexingEventListener listener) {
        indexingEventListeners.add(listener);
    }

    /**
     * This is called when the system shuts down.
     */
    public synchronized void stopIndexingThread() {
        stopRequested = true;
        this.notifyAll();
        this.interrupt();
    }

    @Override
    public void run() {
        while (!stopRequested) {
            try {
                if (reindexRequested) {
                    setWorkLevel(WorkLevel.WORKING, FLAG_REBUILDING);
                    log.debug("full re-index requested");

                    notifyListeners(IndexingEventListener.EventTypes.START_FULL_REBUILD);
                    indexRebuild();
                    notifyListeners(IndexingEventListener.EventTypes.FINISH_FULL_REBUILD);

                    setWorkLevel(WorkLevel.IDLE);
                } else {
                    boolean workToDo = false;
                    synchronized (changedStmts) {
                        workToDo = !changedStmts.isEmpty();
                    }
                    if (workToDo) {
                        setWorkLevel(WorkLevel.WORKING, FLAG_UPDATING);

                        //wait a bit to let more work accumulate before indexing
                        Thread.sleep(WAIT_AFTER_NEW_WORK_INTERVAL);
                        log.debug("work found for IndexBuilder, starting update");

                        notifyListeners(IndexingEventListener.EventTypes.START_UPDATE);
                        updatedIndex();
                        notifyListeners(IndexingEventListener.EventTypes.FINISHED_UPDATE);
                        setWorkLevel(WorkLevel.IDLE);
                    } else {
                        log.debug("there is no indexing working to do, waiting for work");
                        synchronized (this) {
                            this.wait(MAX_IDLE_INTERVAL);
                        }
                    }
                }
            } catch (InterruptedException e) {
                log.debug("woken up", e);
            } catch (Throwable e) {
                log.error(e, e);
            }
        }

        if (indexer != null)
            indexer.abortIndexingAndCleanUp();
    }

    public static void checkIndexOnRootLogin(HttpServletRequest req) {
        HttpSession session = req.getSession();
        ServletContext context = session.getServletContext();
        IndexBuilder indexBuilder = getBuilder(context);

        log.debug("Checking if the index is empty");
        if (indexBuilder != null && indexBuilder.indexer.isIndexEmpty()) {
            log.info("Search index is empty. Running a full index rebuild.");
            indexBuilder.doIndexRebuild();
        }
    }

    /* ******************** non-public methods ************************* */

    /**
     * Take the changed statements that have accumulated and determine which
     * URIs need to be updated in the index.
     */
    private Collection<String> changedStatementsToUris() {
        //inform the StatementToURIsToUpdate objects that indexing is starting
        for (StatementToURIsToUpdate stu : stmtToURIsToIndexFunctions) {
            stu.startIndexing();
        }

        Collection<String> urisToUpdate = new HashSet<String>();
        for (Statement stmt : getAndClearChangedStmts()) {
            for (StatementToURIsToUpdate stu : stmtToURIsToIndexFunctions) {
                urisToUpdate.addAll(stu.findAdditionalURIsToIndex(stmt));
            }
        }

        //inform StatementToURIsToUpdate that they are done
        for (StatementToURIsToUpdate stu : stmtToURIsToIndexFunctions) {
            stu.endIndxing();
        }

        return urisToUpdate;
    }

    private Statement[] getAndClearChangedStmts() {
        //get the statements that changed 
        Statement[] stmts = null;
        synchronized (changedStmts) {
            stmts = new Statement[changedStmts.size()];
            stmts = changedStmts.toArray(stmts);
            changedStmts.clear();
        }
        return stmts;
    }

    /**
     * Take the URIs derived from the changed statements, and split them into
     * lists of updated URIs and deleted URIs.
     */
    private UriLists makeAddAndDeleteLists(Collection<String> uris) {
        IndividualDao indDao = wdf.getIndividualDao();

        UriLists uriLists = new UriLists();
        for (String uri : uris) {
            if (uri != null) {
                try {
                    Individual ind = indDao.getIndividualByURI(uri);
                    if (ind != null) {
                        log.debug("uri to update or add to search index: " + uri);
                        uriLists.updatedUris.add(uri);
                    } else {
                        log.debug("found delete in changed uris: " + uri);
                        uriLists.deletedUris.add(uri);
                    }
                } catch (QueryParseException ex) {
                    log.error("could not get Individual " + uri, ex);
                }
            }
        }
        return uriLists;
    }

    /**
     * This rebuilds the whole index.
     */
    protected void indexRebuild() {
        log.info("Rebuild of search index is starting.");

        // clear out pending changed statements since we are doing a full index rebuild
        synchronized (changedStmts) {
            changedStmts.clear();
        }

        log.debug("Getting all URIs in the model");
        Iterator<String> uris = wdf.getIndividualDao().getAllOfThisTypeIterator();

        doBuild(uris, Collections.<String>emptyList(), REINDEX_THREADS);

        if (log != null) //log might be null if system is shutting down.
            log.info("Rebuild of search index is complete.");
    }

    protected void updatedIndex() {
        log.debug("Starting updateIndex()");

        UriLists uriLists = makeAddAndDeleteLists(changedStatementsToUris());
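        // e.g. 120 updated URIs with URIS_PER_UPDATE_THREAD = 50 gives
        // max(120 / 50, 1) = 2 threads, capped at MAX_UPDATE_THREADS.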
        int numberOfThreads = Math.min(MAX_UPDATE_THREADS,
                Math.max(uriLists.updatedUris.size() / URIS_PER_UPDATE_THREAD, 1));

        doBuild(uriLists.updatedUris.iterator(), uriLists.deletedUris, numberOfThreads);

        log.debug("Ending updateIndex()");
    }

    /**
     * Index the Individuals identified by the URIs in updates and, when this
     * is an incremental update rather than a full rebuild, first remove the
     * URIs in deletes from the index.
     *
     * @param updates URIs of Individuals to build documents for and add to the index.
     * @param deletes URIs of Individuals to remove from the index; only used
     * when this is an update, not a full rebuild.
     * @param numberOfThreads number of worker threads to use for indexing.
     */
    private void doBuild(Iterator<String> updates, Collection<String> deletes, int numberOfThreads) {
        boolean updateRequested = !reindexRequested;

        try {
            if (reindexRequested) {
                indexer.prepareForRebuild();
            }

            indexer.startIndexing();
            reindexRequested = false;

            if (updateRequested) {
                //if this is not a full reindex, deleted individuals need to be removed from the index
                for (String deleteMe : deletes) {
                    try {
                        indexer.removeFromIndex(deleteMe);
                    } catch (Exception ex) {
                        log.debug(
                                "could not remove individual " + deleteMe + " from index, usually this is harmless",
                                ex);
                    }
                }
            }

            indexUriList(updates, numberOfThreads);

        } catch (Exception e) {
            if (log != null)
                log.debug("Exception during indexing", e);
        }

        indexer.endIndexing();
    }

    /**
     * Use the back-end indexer to index each of the Individuals identified by
     * the URIs that the Iterator returns.
     */
    private void indexUriList(Iterator<String> updateUris, int numberOfThreads) {
        //make lists of work URIs for workers
        List<List<String>> workLists = makeWorkerUriLists(updateUris, numberOfThreads);

        //setup workers with work
        List<IndexWorkerThread> workers = new ArrayList<IndexWorkerThread>();
        for (int i = 0; i < numberOfThreads; i++) {
            Iterator<Individual> workToDo = new UriToIndividualIterator(workLists.get(i), wdf);
            workers.add(new IndexWorkerThread(indexer, i, workToDo));
        }

        // reset the counters so we can monitor the progress
        IndexWorkerThread.resetCounters(System.currentTimeMillis(), figureWorkLoad(workLists));

        log.debug("Starting the building and indexing of documents in worker threads");
        // starting worker threads        
        for (int i = 0; i < numberOfThreads; i++) {
            workers.get(i).start();
        }

        //waiting for all the work to finish
        for (int i = 0; i < numberOfThreads; i++) {
            try {
                workers.get(i).join();
            } catch (InterruptedException e) {
                //this thread will get interrupted if the system is trying to shut down.               
                if (log != null)
                    log.debug(e, e);
                for (IndexWorkerThread thread : workers) {
                    thread.requestStop();
                }
                return;
            }
        }
    }

    /* maybe ObjectSourceIface should be replaced with just an iterator. */
    protected class UriToIndividualIterator implements Iterator<Individual> {
        private final Iterator<String> uris;
        private final WebappDaoFactory wdf;

        public UriToIndividualIterator(Iterator<String> uris, WebappDaoFactory wdf) {
            this.uris = uris;
            this.wdf = wdf;
        }

        public UriToIndividualIterator(List<String> uris, WebappDaoFactory wdf) {
            this.uris = uris.iterator();
            this.wdf = wdf;
        }

        @Override
        public boolean hasNext() {
            return uris.hasNext();
        }

        /** may return null */
        @Override
        public Individual next() {
            String uri = uris.next();
            return wdf.getIndividualDao().getIndividualByURI(uri);
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove() is not supported");
        }
    }

    private static List<List<String>> makeWorkerUriLists(Iterator<String> uris, int workers) {
        List<List<String>> work = new ArrayList<List<String>>(workers);
        for (int i = 0; i < workers; i++) {
            work.add(new ArrayList<String>());
        }

        int counter = 0;
        while (uris.hasNext()) {
            work.get(counter % workers).add(uris.next());
            counter++;
        }
        log.info("Number of individuals to be indexed : " + counter + " by " + workers + " worker theads.");
        return work;
    }

    private long figureWorkLoad(List<List<String>> workLists) {
        long load = 0;
        for (List<String> list : workLists) {
            load += list.size();
        }
        return load;
    }

    public long getCompletedCount() {
        return IndexWorkerThread.getCount();
    }

    public long getTotalToDo() {
        return IndexWorkerThread.getCountToIndex();
    }

    private static class UriLists {
        private final List<String> updatedUris = new ArrayList<String>();
        private final List<String> deletedUris = new ArrayList<String>();
    }

    protected void notifyListeners(IndexingEventListener.EventTypes event) {
        for (IndexingEventListener listener : indexingEventListeners) {
            try {
                if (listener != null)
                    listener.notifyOfIndexingEvent(event);
            } catch (Throwable th) {
                log.error("problem during NotifyListeners(): ", th);
            }
        }
    }
}
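
Example

The class comment above points to SearchReindexingListener as the typical client of this API: a model change listener queues changed statements with addToChanged() and then nudges the builder with doUpdateIndex(), while doIndexRebuild() forces a full rebuild. The sketch below is a minimal, hypothetical listener that illustrates this pattern; the class name ExampleReindexingListener and its callback signature are invented for illustration and are not part of the Vitro codebase.

import com.hp.hpl.jena.rdf.model.Statement;

import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder;

/**
 * Minimal sketch of a change listener that keeps the search index in sync
 * with an RDF model by delegating to a shared IndexBuilder.
 */
public class ExampleReindexingListener {

    private final IndexBuilder indexBuilder;

    public ExampleReindexingListener(IndexBuilder indexBuilder) {
        this.indexBuilder = indexBuilder;
    }

    /** Called whenever a statement is added to or removed from the model. */
    public void statementChanged(Statement stmt) {
        // Queue the changed statement; this is safe while indexing is running.
        indexBuilder.addToChanged(stmt);
        // Wake the IndexBuilder thread so it picks up the pending change.
        indexBuilder.doUpdateIndex();
    }

    /** Request a complete rebuild, for example after a large data load. */
    public void rebuildIndex() {
        indexBuilder.doIndexRebuild();
    }
}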