podd.search.impl.IndexManager.java Source code

Introduction

Here is the source code for podd.search.impl.IndexManager.java, an index manager that inserts, updates, and deletes search-index records for PODD objects by communicating with a configured Solr server.

Source

/*
 * Copyright (c) 2009 - 2010. School of Information Technology and Electrical
 * Engineering, The University of Queensland.  This software is being developed
 * for the "Phenomics Ontology Driven Data Management Project (PODD)" project.
 * PODD is a National e-Research Architecture Taskforce (NeAT) project
 * co-funded by ANDS and ARCS.
 *
 * PODD is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PODD is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with PODD.  If not, see <http://www.gnu.org/licenses/>.
 */

package podd.search.impl;

import info.aduna.collections.iterators.CloseableIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.openrdf.model.Graph;
import org.springframework.util.StringUtils;
import podd.dataaccess.DAO;
import podd.dataaccess.misc.PostUpdater;
import podd.exception.DataAccessException;
import podd.exception.ObjectTripleGenerationException;
import podd.exception.PostProcessingException;
import podd.exception.SearchIndexingException;
import podd.model.entity.PoddEntity;
import podd.model.entity.PoddObject;
import podd.model.project.Project;
import podd.search.IndexFields;
import podd.triples.GraphParser;
import podd.triples.GraphParserFactory;
import podd.triples.ObjectTriplesGenerator;
// NOTE: EnumValue (used in getPostfix below) was not imported in the original
// listing; the package here is an assumption, by analogy with the other podd.model imports.
import podd.model.value.EnumValue;

import java.io.IOException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * An Index manager for inserting/updating/deleting index records by communicating with a configured Solr server
 *
 * @author Philip Wu
 */
public class IndexManager implements PostUpdater {

    static final Logger LOGGER = LoggerFactory.getLogger(IndexManager.class);

    /**
     * For Solr Dynamic fields. Anything that ends with _t will be regarded as text in solr.
     */
    public static final String INDEX_TEXT = "_t";
    /**
     * For Solr Dynamic fields. Anything that ends with _dt will be regarded as a date in solr.
     */
    public static final String INDEX_DATE = "_dt";
    /**
     * The StrField type is not analyzed, but indexed/stored verbatim.  
     */
    public static final String INDEX_STRING = "_s";
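
    // Example: a triple keyed "created" whose value is a java.util.Date is indexed
    // under the dynamic field "created_dt"; free-text values go under "<key>_t" and
    // enumerated values under "<key>_s" (see getPostfix and handleGraph below).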

    /**
     * The connection configuration to the solr server
     */
    protected SolrServer solrServer;

    /**
     * Controls whether indexing is forked onto a separate thread.
     * Set to false in test cases that need to wait for indexing to complete before making assertions.
     */
    protected boolean useThreads = true;
    /**
     * For better performance we can batch up the documents to be submitted for indexing
     */
    protected int batchSize = 20;

    /**
     * Used to fetch all the details of the poddObject from its triples
     */
    protected ObjectTriplesGenerator objectTriplesGenerator;

    /**
     * Controls the indexing of documents by caching behind the scenes
     */
    protected DocumentCacheManager cacheManager = new DocumentCacheManager();

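    /**
     * DAO used to iterate over all persisted PoddObjects when rebuilding the index
     */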
    protected DAO objectDao;

    public SolrServer getSolrServer() {
        return solrServer;
    }

    public void setSolrServer(SolrServer solrServer) {
        this.solrServer = solrServer;
    }

    public boolean isUseThreads() {
        return useThreads;
    }

    public void setUseThreads(boolean useThreads) {
        this.useThreads = useThreads;
    }

    public int getBatchSize() {
        return batchSize;
    }

    public void setBatchSize(int batchSize) {
        this.batchSize = batchSize;
    }

    public ObjectTriplesGenerator getObjectTriplesGenerator() {
        return objectTriplesGenerator;
    }

    public void setObjectTriplesGenerator(ObjectTriplesGenerator objectTriplesGenerator) {
        this.objectTriplesGenerator = objectTriplesGenerator;
    }

    public DAO getObjectDao() {
        return objectDao;
    }

    public void setObjectDao(DAO objectDao) {
        this.objectDao = objectDao;
    }

    @Override
    public void close() throws PostProcessingException {
        // TODO Auto-generated method stub

    }

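    // PostUpdater callbacks: persistence events (save/update/delete) are mirrored
    // into the search index.
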
    @Override
    public void doDelete(PoddEntity poddEntity) throws PostProcessingException {
        deleteIndexedPoddObject(poddEntity);
    }

    @Override
    public void doSave(PoddEntity poddEntity) throws PostProcessingException {
        indexPoddObject(poddEntity);
    }

    @Override
    public void doUpdate(PoddEntity poddEntity) throws PostProcessingException {
        indexPoddObject(poddEntity);
    }

    /**
     * Flushes the cache
     * FIXME: Why is this method synchronized?
     */
    public synchronized void flush() {
        try {
            LOGGER.info("Index manager invokes flushing.");
            cacheManager.flush();
        } catch (IOException e) {
            LOGGER.error("Found exception", e);
        } catch (SolrServerException e) {
            LOGGER.error("Found exception", e);
        }
    }

    /**
     * Add or update an entry of the poddObject in the search index
     *
     * @param poddObject
     */
    public void indexPoddObject(Object poddObject) {
        if (poddObject instanceof PoddObject) {

            PoddObject poddObj = (PoddObject) poddObject;
            LOGGER.debug("Indexing poddObject: " + poddObj.getPid());

            IndexGenerator indexGenerator = new IndexGenerator(poddObj);
            if (useThreads) {
                // Fork a thread to do the indexing submission
                Thread t = new Thread(indexGenerator);
                t.start();
            } else {
                // Do the indexing without forking a thread
                indexGenerator.run();
            }
        }
    }

    /**
     * Delete a podd object from the index if it exists.
     *
     * @param poddObject
     */
    public void deleteIndexedPoddObject(Object poddObject) {
        if (poddObject instanceof PoddObject) {
            PoddObject poddObj = (PoddObject) poddObject;
            LOGGER.debug("Deleting: " + poddObj.getPid());

            // Invoke delete on Solr, guarding on the PID since that is what the query uses
            if (poddObj.getPid() != null) {
                try {
                    // Before we delete, we must flush the cache
                    synchronized (this) {
                        flush();
                    }
                    solrServer.deleteByQuery("id:" + ClientUtils.escapeQueryChars(poddObj.getPid()));
                    solrServer.commit();
                } catch (SolrServerException e) {
                    LOGGER.error("Found exception", e);
                } catch (IOException e) {
                    LOGGER.error("Found exception", e);
                }
            }
        }
    }

    /**
     * Clears the index
     */
    public void clearIndex() {
        try {
            solrServer.deleteByQuery("*:*");
            solrServer.commit();
            LOGGER.debug("Index has been cleared");
        } catch (SolrServerException e) {
            LOGGER.error("Found exception", e);
        } catch (IOException e) {
            LOGGER.error("Found exception", e);
        }
    }

    /**
     * Clear and rebuild the index from all the podd objects currently available
     *
     * @return the number of objects indexed
     */
    public long rebuildIndex() {
        LOGGER.info("rebuilding index");
        // Clear the index first
        this.clearIndex();

        // Now rebuild the index
        long numIndexed = 0;
        try {
            CloseableIterator<PoddObject> it = objectDao.getAll();
            try {
                while (it.hasNext()) {
                    PoddObject poddObject = null;
                    // Avoid inconsistency errors
                    try {
                        poddObject = it.next();
                    } catch (RuntimeException e) {
                        LOGGER.error("Found exception", e);
                    }

                    if (poddObject != null) {
                        // No threading allowed; we wait for the process to complete entirely
                        IndexGenerator indexGenerator = new IndexGenerator(poddObject);
                        indexGenerator.run();

                        numIndexed++;
                    }
                }
            } finally {
                // CloseableIterator must be closed to release the underlying resources
                it.close();
            }
        } catch (DataAccessException e) {
            LOGGER.error("Found exception", e);
        }

        this.flush();

        return numIndexed;
    }

    /**
     * Inner class for generating an index
     */
    class IndexGenerator implements Runnable {

        private PoddObject poddObj;

        public IndexGenerator(PoddObject poddObj) {
            this.poddObj = poddObj;
        }

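        /**
         * Builds the Solr document and submits it to the cache manager;
         * indexing failures are logged rather than rethrown.
         */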
        public void run() {
            try {
                doIndexing();
            } catch (SearchIndexingException e) {
                LOGGER.error("Found exception", e);
            }
        }

        /**
         * Submits the PoddObject for indexing
         */
        public void doIndexing() throws SearchIndexingException {

            // Populate the fields of the document
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField(IndexFields.ID.toString(), poddObj.getPid());
            //doc.addField(IndexFields.CREATION_DATE.toString(), poddObj.getCreatedTime());
            doc.addField(IndexFields.OBJECT_TYPE.toString(), poddObj.getConcept().getConceptName());
            handleGraph(poddObj, doc);

            // Mainly for test cases
            if (doc.getFieldValue(IndexFields.DESCRIPTION.toString()) == null) {
                doc.addField(IndexFields.DESCRIPTION.toString(), poddObj.getLabel());
            }

            try {
                cacheManager.addDocument(poddObj.getPid(), doc);
            } catch (SolrServerException e) {
                LOGGER.error("Found exception", e);
            } catch (IOException e) {
                LOGGER.error("Found exception", e);
            }
        }

    }

    /**
     * Extracts the object's Graph triples and stores them in a document for indexing
     *
     * @param poddObj
     * @param doc
     * @throws SearchIndexingException
     */
    private void handleGraph(PoddObject poddObj, SolrInputDocument doc) throws SearchIndexingException {

        try {
            Project project = ObjectTriplesGenerator.getAncestor(poddObj);
            //LOGGER.info("project="+project);

            Graph graph = objectTriplesGenerator.getObjectTriples(poddObj);

            GraphParser parser = GraphParserFactory.getInstance().createGraphParser(graph, poddObj);
            parser.parse();
            Map<String, Object> fieldValueMap = parser.getFieldValueMap();
            if (fieldValueMap != null) {

                for (Map.Entry<String, Object> entry : fieldValueMap.entrySet()) {

                    Object fieldValue = entry.getValue();
                    String fieldName = entry.getKey() + getPostfix(fieldValue); // dynamic field

                    //LOGGER.debug("fieldName="+fieldName+ " fieldValue="+fieldValue);
                    Object existingFieldValue = doc.getFieldValue(fieldName);
                    if (existingFieldValue == null || !StringUtils.hasText(existingFieldValue.toString())) {
                        doc.addField(fieldName, fieldValue);
                    } else {
                        LOGGER.error("SolrDocument with index field name: " + fieldName
                                + ", already has a value of " + existingFieldValue);
                    }
                }
            }

            if (project != null) {
                doc.addField(IndexFields.PROJECT_ID.toString(), project.getPid());
            }
        } catch (ObjectTripleGenerationException e1) {
            LOGGER.error("Found exception", e1);
            // recast and throw
            throw new SearchIndexingException(e1);
        } catch (IllegalArgumentException e) {
            LOGGER.error("Found exception", e);
            // TODO: Don't recast to keep existing test cases working
        }
    }

    /**
     * Retrieve the dynamic-field postfix based on the runtime type of the value.
     * For example, a Date value for the key "created" is indexed as "created_dt".
     *
     * @param obj the field value about to be indexed
     * @return the dynamic-field postfix understood by the Solr schema
     */
    private String getPostfix(Object obj) {
        if (obj instanceof Date) {
            return INDEX_DATE;
        } else if (obj instanceof EnumValue) {
            return INDEX_STRING;
        } else {
            return INDEX_TEXT;
        }
    }

    /**
     * Controls the caching of Solr documents meant for indexing
     *
     * @author Philip Wu
     */
    class DocumentCacheManager {

        /**
         * Cache of SolrInputDocuments. This cache is flushed when the batch size has been reached,
         * or when somebody invokes a search. We don't want multiple SolrInputDocuments for the same
         * PoddObject to exist in the cache, so we use a map: PODD ID ---> SolrInputDocument
         * <p/>
         * Since we fork a thread to cache documents, the collection must be thread-safe.
         */
        private Map<String, SolrInputDocument> documentCache = Collections
                .synchronizedMap(new HashMap<String, SolrInputDocument>());

        /**
         * Adds a document to the cache and flushes once the batch size has been exceeded
         *
         * @param pid          the PODD ID, used as the cache key
         * @param solrDocument the document to be indexed
         * @throws IOException
         * @throws SolrServerException
         */
        public synchronized void addDocument(String pid, SolrInputDocument solrDocument)
                throws IOException, SolrServerException {
            // replace the existing solrDocument for that PID
            documentCache.put(pid, solrDocument);

            if (documentCache.size() > batchSize) {
                flush();
            }
        }

        /**
         * Flush the cache
         *
         * @throws IOException
         * @throws SolrServerException
         */
        public synchronized void flush() throws IOException, SolrServerException {
            LOGGER.info("Flushing Solr index, # = " + documentCache.size());
            if (documentCache.size() > 0) {
                try {
                    solrServer.add(documentCache.values());
                    solrServer.commit();
                } catch (SolrException e) {
                    LOGGER.error("Failed to commit document cache. Will commit each document one-by-one.");
                    LOGGER.error("Flushing document cache exception.", e);
                    solrServer.rollback();
                    Set<String> removalSet = new HashSet<String>();
                    // Commit each document that can be flushed and skip the offending documents by removing them
                    // TODO YF: Since we're clearing the documentCache at the end of the method, why do we need to remove the offending document?
                    // Iterate over entries to avoid a second lookup per key
                    for (Map.Entry<String, SolrInputDocument> entry : documentCache.entrySet()) {
                        String pid = entry.getKey();
                        SolrInputDocument doc = entry.getValue();
                        if (!doc.isEmpty()) {
                            try {
                                solrServer.add(doc);
                                solrServer.commit();
                            } catch (SolrException e2) {
                                solrServer.rollback();
                                // remove the document from cache:
                                removalSet.add(pid);
                                LOGGER.error("Fail-safe. Removing offending document: "
                                        + doc.getField(IndexFields.ID.toString()));
                            }
                        } else {
                            removalSet.add(pid);
                            LOGGER.error("Fail-safe. Removed empty document: "
                                    + doc.getField(IndexFields.ID.toString()));
                        }
                    }
                    for (String pid : removalSet) {
                        documentCache.remove(pid);
                    }
                } finally {
                    documentCache.clear();
                }
            }
            LOGGER.info("Finished flushing.");
        }
    }

}
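
Usage

For context, here is a minimal sketch of how the manager might be wired up and used outside of Spring. It assumes a SolrJ 1.x HTTP client (CommonsHttpSolrServer, consistent with the 2009-2010 era of this code) and an illustrative Solr URL; in PODD the SolrServer, DAO, and ObjectTriplesGenerator would normally be injected, so their construction is elided here.

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;

import podd.search.impl.IndexManager;

public class IndexManagerExample {
    public static void main(String[] args) throws Exception {
        // Point the manager at a running Solr instance (URL is illustrative)
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8983/solr");

        IndexManager manager = new IndexManager();
        manager.setSolrServer(solr);
        manager.setBatchSize(20);      // flush the document cache in batches of ~20
        manager.setUseThreads(false);  // index synchronously, e.g. in test cases

        // Clear the index; this needs only the Solr connection
        manager.clearIndex();

        // A full rebuild additionally needs the DAO and the triples generator,
        // which are project-specific and omitted from this sketch:
        // manager.setObjectDao(objectDao);
        // manager.setObjectTriplesGenerator(generator);
        // long indexed = manager.rebuildIndex();

        manager.flush();
    }
}

Setting useThreads to false keeps indexing on the calling thread, which is what the class itself does in rebuildIndex() and what its own comments recommend for test cases.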