org.roosster.store.EntryStore.java Source code

Java tutorial

Introduction

Here is the source code for org.roosster.store.EntryStore.java

Source

/*
 * This file is part of ROOSSTER.
 * Copyright 2004, Benjamin Reitzammer <benjamin@roosster.org>
 * All rights reserved.
 *
 * ROOSSTER is free software; you can redistribute it and/or modify
 * it under the terms of the Artistic License.
 *
 * You should have received a copy of the Artistic License
 * along with ROOSSTER; if not, go to
 * http://www.opensource.org/licenses/artistic-license.php for details
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.roosster.store;

import java.io.IOException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.roosster.Configuration;
import org.roosster.Constants;
import org.roosster.InitializeException;
import org.roosster.Plugin;
import org.roosster.Registry;
import org.roosster.util.DateUtil;

/**
 * TODO better synchronizing
 *
 * @author <a href="mailto:benjamin@roosster.org">Benjamin Reitzammer</a>
 */
public class EntryStore implements Plugin, Constants {
    /** Log4j logger used by this class. */
    private static Logger LOG = Logger.getLogger(EntryStore.class);

    /**
     * Configuration key that names the directory holding the index.
     */
    public static final String PROP_INDEXDIR = "store.indexdir";
    /** Configuration key that names the {@link Analyzer} class to load in {@link #init(Registry)}. */
    public static final String PROP_ANALYZER = "store.analyzerclass";

    /** Directory name used for the index when {@link #PROP_INDEXDIR} is not configured. */
    public static final String DEF_INDEXDIR = "index";

    /** Name of the index field that holds the hash computed from an Entry's URL. */
    private static final String URLHASH = "urlhash";

    /** Registry passed to {@link #init(Registry)}; used to reach the configuration. */
    private Registry registry = null;
    /** Location of the index, resolved in {@link #init(Registry)}. */
    private String indexDir = null;
    /** Analyzer class loaded in {@link #init(Registry)}; a new instance is created on demand. */
    private Class analyzerClass = null;
    /** Set to true at the end of {@link #init(Registry)} and reset to false by shutdown. */
    private boolean initialized = false;

    /**
     * Creates a store that is not yet initialized. Methods that check
     * {@link #isInitialized()} throw an <code>IllegalStateException</code>
     * until {@link #init(Registry)} has completed.
     */
    public EntryStore() {
    }

    /**
     * Initializes this plugin from the configuration of the given registry.
     *
     * <p>Loads the class named by the {@link #PROP_ANALYZER} property and
     * instantiates it once to verify that it is a usable {@link Analyzer}.
     * Then resolves the index directory: the {@link #PROP_INDEXDIR} property if
     * set, otherwise {@link #DEF_INDEXDIR} below the configured home directory,
     * otherwise the relative directory {@link #DEF_INDEXDIR}.
     *
     * @param registry the registry that provides the configuration
     * @exception InitializeException if the analyzer property is missing, or the
     * named class can't be loaded, instantiated or is not an {@link Analyzer}
     */
    public void init(Registry registry) throws InitializeException {
        this.registry = registry;
        Configuration conf = registry.getConfiguration();

        LOG.info("Initializing Plugin " + getClass());

        // Checked outside the try block: inside it, the generic catch below
        // would wrap this specific error into an unrelated "database init" one.
        String className = conf.getProperty(PROP_ANALYZER);
        if (className == null)
            throw new InitializeException("No '" + PROP_ANALYZER + "'-argument provided");

        try {
            LOG.debug("Trying to load analyzer-class: " + className);
            analyzerClass = Class.forName(className);

            // The instance itself is discarded; creating and casting it here
            // makes a wrong or non-instantiable class fail during init.
            Analyzer probe = (Analyzer) analyzerClass.newInstance();

        } catch (ClassCastException ex) {
            throw new InitializeException(
                    "Specified class is not an instance of " + Analyzer.class + ": " + className, ex);
        } catch (ClassNotFoundException ex) {
            throw new InitializeException("Can't load analyzer-class: " + className, ex);
        } catch (Exception ex) {
            throw new InitializeException("Exception occured during database init", ex);
        }

        // determine indexDir, falling back to a default below the home directory
        indexDir = conf.getProperty(PROP_INDEXDIR);
        if (indexDir == null || "".equals(indexDir)) {
            String homeDir = conf.getHomeDir();
            if (homeDir != null)
                indexDir = homeDir + "/" + DEF_INDEXDIR;
            else
                indexDir = DEF_INDEXDIR;
        }

        LOG.debug("Directory of index is: " + indexDir);
        LOG.info("Finished initialize of " + getClass());

        initialized = true;
    }

    /**
     * Marks this store as no longer initialized.
     *
     * @param registry not used by this implementation
     */
    public void shutdown(Registry registry) throws Exception {
        LOG.debug("Shutting down EntryStore!");
        this.initialized = false;
    }

    /**
     * @return <code>true</code> once {@link #init(Registry)} has completed and
     * until shutdown is called, <code>false</code> otherwise
     */
    public boolean isInitialized() {
        return this.initialized;
    }

    /**
     * Returns the maximum number of Entries per result list, as configured by
     * the <code>PROP_LIMIT</code> property.
     *
     * @return the configured limit, or 10 if the property is missing or is not
     * a valid integer
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public int getLimit() {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        final int defaultLimit = 10;

        String limitStr = registry.getConfiguration().getProperty(PROP_LIMIT);
        if (limitStr == null)
            return defaultLimit;

        try {
            return Integer.parseInt(limitStr);
        } catch (NumberFormatException ex) {
            // keep the lenient fallback, but leave a trace of the bad setting
            LOG.warn("Ignoring invalid value '" + limitStr + "' for property '" + PROP_LIMIT
                    + "', using default " + defaultLimit);
            return defaultLimit;
        }
    }

    /**
     * Returns the index of the first Entry of a result list, as configured by
     * the <code>PROP_OFFSET</code> property.
     *
     * @return the configured offset; 0 if the property is missing, negative or
     * not a valid integer
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public int getOffset() {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        String offsetStr = registry.getConfiguration().getProperty(PROP_OFFSET);
        if (offsetStr == null)
            return 0;

        try {
            int offset = Integer.parseInt(offsetStr);
            return offset < 0 ? 0 : offset;
        } catch (NumberFormatException ex) {
            // keep the lenient fallback, but leave a trace of the bad setting
            LOG.warn("Ignoring invalid value '" + offsetStr + "' for property '" + PROP_OFFSET
                    + "', using default 0");
            return 0;
        }
    }

    /**
     * Opens a reader on the index and asks it for its document count.
     *
     * @return the value of <code>IndexReader.numDocs()</code> for the index
     * @exception IOException if the index can't be opened or read
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public int getDocNum() throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        IndexReader indexReader = null;
        try {
            indexReader = getReader();
            return indexReader.numDocs();
        } finally {
            if (indexReader != null)
                indexReader.close();
        }
    }

    /**
     * Collects the text of every term stored in the <code>Entry.TAGS</code>
     * field of the index.
     *
     * @return a list of tag Strings, in the order the index enumerates them;
     * empty if no index exists or it contains no tags, never <code>null</code>
     * @throws IOException if the index can't be opened or read
     */
    public List getAllTags() throws IOException {
        List tags = new ArrayList();

        if (!IndexReader.indexExists(indexDir))
            return tags;

        IndexReader reader = null;
        TermEnum terms = null;
        try {
            LOG.info("Getting all Tags from index");

            reader = getReader();
            terms = reader.terms(new Term(Entry.TAGS, ""));

            // term() yields null once the enumeration has no (further) term,
            // e.g. when nothing sorts at or after the first possible tag term
            Term current = terms.term();
            while (current != null && Entry.TAGS.equals(current.field())) {
                tags.add(current.text());
                current = terms.next() ? terms.term() : null;
            }

        } finally {
            // close the enumeration before the reader it was obtained from
            try {
                if (terms != null)
                    terms.close();
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        return tags;
    }

    /**
     * Searches for all Entries, without applying offset or limit, and returns
     * the raw hits together with the searcher that produced them.
     *
     * @param pub if true, only Entries whose <code>Entry.PUBLIC</code> field is
     * "true" are matched; if false all Entries are matched
     * @return an <code>EntryHitList</code> built from the hits and the still open
     * searcher, or one built from two <code>null</code> values if no index exists
     * @exception IOException if the index can't be opened or searched
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public EntryHitList getAllEntries(boolean pub) throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (!IndexReader.indexExists(indexDir))
            return new EntryHitList(null, null);

        BooleanQuery query = new BooleanQuery();

        TermQuery term = new TermQuery(new Term(Entry.ENTRY_MARKER, Entry.ENTRY_MARKER));
        query.add(term, true, false);

        if (pub) {
            TermQuery pubTerm = new TermQuery(new Term(Entry.PUBLIC, "true"));
            query.add(pubTerm, true, false);
        }

        IndexSearcher searcher = new IndexSearcher(indexDir);
        boolean handedOver = false;
        try {
            Sort sort = determineSort();
            LOG.debug("Sort Instance: " + sort);

            Hits hits = searcher.search(query, sort);

            LOG.info("Found " + hits.length() + " matches for query: <" + query + "> Returning ALL!");

            EntryHitList result = new EntryHitList(hits, searcher);
            handedOver = true;
            return result;

        } finally {
            // The searcher stays open on success because it is passed on inside
            // the result (presumably EntryHitList closes it -- TODO confirm).
            // If anything failed before that hand-over, nobody else can close it.
            if (!handedOver) {
                try {
                    searcher.close();
                } catch (IOException closeEx) {
                    LOG.warn("Could not close IndexSearcher after failed search", closeEx);
                }
            }
        }
    }

    /**
     * Returns one page of Entries, using the configured offset and limit.
     *
     * @param pub if true, only Entries marked public are returned
     */
    public EntryList getEntries(boolean pub) throws IOException {
        int configuredOffset = getOffset();
        int configuredLimit = getLimit();
        return getEntries(configuredOffset, configuredLimit, pub);
    }

    /**
     * Returns the Entries between <code>offset</code> and
     * <code>offset + limit</code> of all matching Entries, sorted as configured.
     *
     * @param offset index of the first Entry to return
     * @param limit maximum number of Entries to return
     * @param pub if true, this method returns only Entries which pub: field is
     * set to "true", if false all Entries are returned
     * @return the requested Entries, or an empty list if no index exists
     * @exception IOException if the index can't be opened or searched
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public EntryList getEntries(int offset, int limit, boolean pub) throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (!IndexReader.indexExists(indexDir))
            return new EntryList();

        // match documents carrying the entry marker term, optionally requiring
        // the public flag as well
        BooleanQuery entryQuery = new BooleanQuery();
        entryQuery.add(new TermQuery(new Term(Entry.ENTRY_MARKER, Entry.ENTRY_MARKER)), true, false);
        if (pub)
            entryQuery.add(new TermQuery(new Term(Entry.PUBLIC, "true")), true, false);

        IndexSearcher indexSearcher = new IndexSearcher(indexDir);
        try {
            Sort sortOrder = determineSort();
            LOG.debug("Sort Instance: " + sortOrder);

            Hits found = indexSearcher.search(entryQuery, sortOrder);
            LOG.info("Found " + found.length() + " matches for query: <" + entryQuery + ">");

            return fillEntryList(found, offset, limit);
        } finally {
            indexSearcher.close();
        }
    }

    /**
     * Parses the query string against the <code>Entry.ALL</code> field, with
     * AND as the parser's operator, and returns one page of the matching
     * Entries using the configured offset and limit.
     *
     * @param queryStr the query in query parser syntax
     * @return an {@link EntryList EntryList}-object that's never <code>null</code>
     * @exception IOException if the index can't be opened or searched
     * @exception ParseException if the query string can't be parsed
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public EntryList search(String queryStr) throws IOException, ParseException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (!IndexReader.indexExists(indexDir))
            return new EntryList();

        QueryParser queryParser = new QueryParser(Entry.ALL, createAnalyzer());
        queryParser.setOperator(QueryParser.DEFAULT_OPERATOR_AND);
        Query parsed = queryParser.parse(queryStr);

        IndexSearcher indexSearcher = new IndexSearcher(indexDir);
        try {
            Sort sortOrder = determineSort();
            LOG.debug("Sort Instance: " + sortOrder);

            Hits found = indexSearcher.search(parsed, sortOrder);
            LOG.info("Found " + found.length() + " matches for query: <" + parsed + ">");

            return fillEntryList(found);
        } finally {
            indexSearcher.close();
        }
    }

    /**
     * Returns all Entries whose date in the given field lies within the
     * inclusive range from <code>after</code> to <code>before</code>. Offset
     * and limit are not applied.
     *
     * @param field name of the date field to search, may not be null or empty
     * @param after lower bound, defaults to the current Date if null
     * @param before upper bound, defaults to the current Date if null
     * @return an {@link EntryList EntryList}-object that's never <code>null</code>
     * @exception IOException if the index can't be opened or searched
     * @exception IllegalArgumentException if <code>field</code> is null or empty
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    public EntryList getEntriesByDate(String field, Date after, Date before) throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (field == null || "".equals(field))
            throw new IllegalArgumentException("Parameter 'field' is not allowed to be null or empty");

        if (!IndexReader.indexExists(indexDir))
            return new EntryList();

        Date lowerBound = (after != null) ? after : new Date();
        Date upperBound = (before != null) ? before : new Date();

        Term lowerTerm = new Term(field, DateUtil.formatSearchableEntryDate(lowerBound));
        Term upperTerm = new Term(field, DateUtil.formatSearchableEntryDate(upperBound));

        // third argument: both bounds are part of the range
        Query rangeQuery = new RangeQuery(lowerTerm, upperTerm, true);

        IndexSearcher indexSearcher = new IndexSearcher(indexDir);
        try {
            Sort sortOrder = determineSort();
            LOG.debug("Sort Instance: " + sortOrder);

            Hits found = indexSearcher.search(rangeQuery, sortOrder);
            LOG.debug("Found " + found.length() + " matches for query: <" + rangeQuery + ">");

            return fillEntryList(found, 0, Integer.MAX_VALUE);
        } finally {
            indexSearcher.close();
        }
    }

    /**
     * Looks up the Entry stored for the given URL. If several Entries are
     * stored for that URL a warning is logged and the first one is returned.
     *
     * @return <code>null</code> if there is no entry with the
     * specified URL, or if no index exists
     */
    public Entry getEntry(URL url) throws IOException {
        if (!IndexReader.indexExists(indexDir))
            return null;

        Entry[] matches = getEntries(url, null);
        if (matches.length > 1)
            LOG.warn("More than one Entry found for URL " + url);

        return matches.length == 0 ? null : matches[0];
    }

    /**
     * Adds a single Entry without forcing, i.e. with the duplicate check of
     * {@link #addEntries(Entry[], boolean)} enabled.
     */
    public Entry addEntry(Entry entry) throws IOException {
        final boolean force = false;
        return addEntry(entry, force);
    }

    /**
     * Adds a single Entry by wrapping it in an array and delegating to
     * {@link #addEntries(Entry[], boolean)}.
     *
     * @param force passed on unchanged
     * @return the first element of the array returned by the delegate
     */
    public Entry addEntry(Entry entry, boolean force) throws IOException {
        Entry[] single = new Entry[] { entry };
        Entry[] stored = addEntries(single, force);
        return stored[0];
    }

    /**
     * Adds the given Entries without forcing, i.e. with the duplicate check of
     * {@link #addEntries(Entry[], boolean)} enabled.
     */
    public Entry[] addEntries(Entry[] entries) throws IOException {
        final boolean force = false;
        return addEntries(entries, force);
    }

    /**
     * Stores the given Entries. Unless <code>force</code> is set, the index is
     * first checked for Entries with the same URLs, and nothing is stored if
     * any are found.
     *
     * @param entries the Entries to store; if null an empty array is returned
     * @param force if true the duplicate check is skipped
     * @return the array returned by the internal store operation
     * @exception DuplicateEntriesException if <code>force</code> is false and at
     * least one of the URLs is already stored in the index
     * @exception IOException if reading or writing the index fails
     */
    public Entry[] addEntries(Entry[] entries, boolean force) throws IOException {
        if (entries == null)
            return new Entry[0];

        boolean checkDuplicates = !force && IndexReader.indexExists(indexDir);
        if (checkDuplicates) {

            IndexReader indexReader = null;
            try {
                indexReader = getReader();

                // gather all offending URLs first, so they are reported together
                List alreadyStored = new ArrayList();
                for (int idx = 0; idx < entries.length; idx++) {
                    URL candidate = entries[idx].getUrl();
                    if (getEntries(candidate, indexReader).length > 0)
                        alreadyStored.add(candidate);
                }

                if (!alreadyStored.isEmpty())
                    throw new DuplicateEntriesException((URL[]) alreadyStored.toArray(new URL[0]));

            } finally {
                if (indexReader != null)
                    indexReader.close();
            }
        }

        return storeEntries(entries);
    }

    /**
     * Deletes the Entries stored for a single URL.
     *
     * @return number of deleted Entries
     */
    public int deleteEntry(URL url) throws IOException {
        URL[] single = new URL[] { url };
        return deleteEntries(single, null);
    }

    /**
     * Deletes the Entries stored for the given URLs, using a reader that is
     * opened and closed for this call.
     *
     * @return number of deleted Entries
     */
    public int deleteEntries(URL[] urls) throws IOException {
        final IndexReader noSharedReader = null;
        return deleteEntries(urls, noSharedReader);
    }

    /**
     * Deletes the given Entries from the index, identified by their URLs.
     * <code>null</code> elements of the array are skipped.
     *
     * @param entries the Entries to delete, may not be null
     * @return number of deleted Entries
     * @exception IllegalArgumentException if <code>entries</code> is null
     */
    public int deleteEntries(Entry[] entries) throws IOException {
        if (entries == null)
            throw new IllegalArgumentException("Parameter 'entries' is not allowed to be null!");

        List collected = new ArrayList(entries.length);
        for (int idx = 0; idx < entries.length; idx++) {
            if (entries[idx] == null)
                continue;
            collected.add(entries[idx].getUrl());
        }

        URL[] urls = (URL[]) collected.toArray(new URL[collected.size()]);
        return deleteEntries(urls, null);
    }

    // ============ private Helper methods ============

    /**
     * Reads all Entries whose URL hash field matches the hash of the given URL.
     *
     * @param url the URL to look up, may not be null
     * @param reader the reader to use; if null a reader is opened for this call
     * and closed again before returning. A reader passed in is left open.
     * @return the matching Entries, an empty array if there are none
     * @exception IOException if the index can't be opened or read
     * @exception IllegalStateException if the object was not properly initialized yet.
     * @exception IllegalArgumentException if parameter <code>url</code> is null.
     */
    private Entry[] getEntries(URL url, IndexReader reader) throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (url == null)
            throw new IllegalArgumentException("Parameter 'url' is not allowed to be null");

        boolean closeReader = false;
        TermDocs docs = null;
        try {
            LOG.debug("Getting Entry with URL: " + url);

            if (reader == null) {
                reader = getReader();
                closeReader = true;
            }

            Term term = new Term(URLHASH, computeHash(url));
            docs = reader.termDocs(term);

            List entries = new ArrayList();
            while (docs.next()) {
                entries.add(new Entry(reader.document(docs.doc()), 0));
            }

            LOG.debug("Found " + entries.size() + " entries for URL " + url);

            return (Entry[]) entries.toArray(new Entry[0]);

        } finally {
            // Release the TermDocs before the reader it came from, and make
            // sure the reader is still closed if closing the TermDocs fails.
            try {
                if (docs != null)
                    docs.close();
            } finally {
                if (closeReader && reader != null)
                    reader.close();
            }
        }
    }

    /**
     * Deletes all index documents whose URL hash field matches one of the
     * given URLs. The method is synchronized on this object, as is the method
     * that stores Entries, so the two can't run simultaneously on one instance.
     *
     * @param urls the URLs of the Entries that should be deleted, may not be null
     * @param reader the reader to delete through; if null a reader is opened
     * for this call and closed afterwards. A reader passed in is left open.
     * @return number of deleted Entries
     * @exception IOException if the writing to the index fails due to some I/O reason
     * @exception IllegalStateException if the object was not properly initialized yet.
     * @exception IllegalArgumentException if parameter <code>urls</code> is null.
     */
    private synchronized int deleteEntries(URL[] urls, IndexReader reader) throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (urls == null)
            throw new IllegalArgumentException("Parameter 'urls' is not allowed to be null");

        final boolean ownsReader = (reader == null);
        IndexReader target = reader;
        int deleted = 0;

        try {
            if (ownsReader)
                target = getReader();

            for (int idx = 0; idx < urls.length; idx++) {
                LOG.info("DELETING Entry for URL: " + urls[idx]);

                Term hashTerm = new Term(URLHASH, computeHash(urls[idx]));
                deleted += target.delete(hashTerm);
            }

            LOG.debug("Deleted " + deleted + " Entries for URLs: " + Arrays.asList(urls));

        } finally {
            // only close what was opened here
            if (ownsReader && target != null)
                target.close();
        }

        return deleted;
    }

    /**
     * Writes the given Entries to the index. If an index already exists, any
     * documents stored for the Entries' URLs are deleted first, so each URL is
     * replaced rather than duplicated. Every Entry gets its edited date set to
     * the time of this call and its document gets the URL hash field added.
     * Afterwards the "added since optimize" counter is persisted.
     *
     * <p>The method is synchronized on this object, as is the private delete
     * method, so it's not possible to simultaneously add and delete Entries
     * through one instance.
     *
     * @param entries the <code>Entry</code>-objects that should be added to the
     * store, may not be null
     * @return the array that was passed in
     * @exception IOException if the writing to the index fails due to some I/O reason
     * @exception IllegalStateException if the object was not properly initialized yet.
     * @exception IllegalArgumentException if parameter <code>entries</code> is null.
     */
    private synchronized Entry[] storeEntries(Entry[] entries) throws IOException {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        if (entries == null)
            throw new IllegalArgumentException("Parameter 'entries' is not allowed to be null");

        int newAdded = entries.length;

        IndexWriter writer = null;
        IndexReader reader = null;
        try {
            // remove documents already stored for these URLs, sharing one reader
            if (IndexReader.indexExists(indexDir)) {
                reader = getReader();
                for (int i = 0; i < entries.length; i++) {
                    deleteEntries(new URL[] { entries[i].getUrl() }, reader);
                }
                // NOTE(review): the reader is closed before the writer is opened,
                // presumably so its deletions are flushed and its lock released
                // first -- confirm against the Lucene version in use.
                reader.close();
                reader = null;
            }

            writer = getWriter();

            // one timestamp for the whole batch
            Date now = new Date();
            for (int i = 0; i < entries.length; i++) {
                LOG.info("Adding Entry to index: " + entries[i].getUrl().toString());

                entries[i].setEdited(now);

                Document doc = entries[i].getDocument();
                doc.add(Field.Keyword(URLHASH, computeHash(entries[i].getUrl())));
                writer.addDocument(doc);
            }

            // TODO defer this to worker thread in background
            if (optimizeNeeded(newAdded)) {
                LOG.debug("Optimize Threshold reached! Optimizing index!");
                writer.optimize();
                // sentinel: makes persistProperties reset the counter to "0"
                newAdded = Integer.MIN_VALUE;
            }

        } finally {
            if (writer != null)
                writer.close();

            // only non-null here if deleting failed before the explicit close above
            if (reader != null)
                reader.close();
        }

        persistProperties(newAdded);

        return entries;
    }

    /**
     * Decides whether the index should be optimized after adding
     * <code>newAdded</code> documents.
     *
     * @param newAdded number of documents added by the current operation
     * @return true if the stored <code>ADDED_SINCE_OPTIMIZE</code> count
     * (default "0") plus <code>newAdded</code> reaches the
     * <code>OPTIMIZE_THRESHOLD</code> property (default "1")
     */
    private boolean optimizeNeeded(int newAdded) {
        Configuration conf = registry.getConfiguration();
        String storedCount = conf.getProperty(ADDED_SINCE_OPTIMIZE, "0");
        String storedThreshold = conf.getProperty(OPTIMIZE_THRESHOLD, "1");

        int total = Integer.parseInt(storedCount) + newAdded;
        int limit = Integer.parseInt(storedThreshold);

        if (LOG.isDebugEnabled()) {
            LOG.debug("Added since last IndexWriter.optimize(): " + total);
            LOG.debug("Optimize Threshold: " + limit);
        }

        return total >= limit;
    }

    /**
     * Determines the sort order for searches from the
     * <code>PROP_SORTFIELD</code> property.
     *
     * <p>A field listed in <code>Entry.STRING_SORT_FIELDS</code> is sorted as
     * String, one listed in <code>Entry.INTEGER_SORT_FIELDS</code> as int. If
     * the property is missing, empty or names any other field, results are
     * sorted by relevance; an unknown field is additionally logged as warning.
     *
     * @return the sort order to use, never <code>null</code>
     * @exception IllegalStateException if the object was not properly initialized yet.
     */
    private Sort determineSort() {
        if (!isInitialized())
            throw new IllegalStateException("Database must be initialized before use!");

        String sortField = registry.getConfiguration().getProperty(PROP_SORTFIELD);

        LOG.debug("Specified Sort Field: " + sortField);

        if (sortField == null || "".equals(sortField))
            return Sort.RELEVANCE;

        for (int i = 0; i < Entry.STRING_SORT_FIELDS.length; i++) {
            if (Entry.STRING_SORT_FIELDS[i].equals(sortField))
                return new Sort(new SortField(sortField, SortField.STRING));
        }

        for (int i = 0; i < Entry.INTEGER_SORT_FIELDS.length; i++) {
            if (Entry.INTEGER_SORT_FIELDS[i].equals(sortField))
                return new Sort(new SortField(sortField, SortField.INT));
        }

        // Unknown fields have always ended up sorted by relevance, because the
        // former null check could never fire; keep that, but make it visible.
        LOG.warn("Illegal sort field: " + sortField + ", sorting by relevance instead");
        return Sort.RELEVANCE;
    }

    // ============ private Helper methods ============

    /**
     * Opens a writer on the index directory, creating the index if none exists
     * there yet, and sets the writer's maximum field length to 1000000.
     *
     * @return a new writer; the caller is responsible for closing it
     */
    private IndexWriter getWriter() throws IOException {
        boolean indexMissing = !IndexReader.indexExists(indexDir);

        IndexWriter indexWriter = new IndexWriter(indexDir, createAnalyzer(), indexMissing);
        indexWriter.maxFieldLength = 1000000;

        return indexWriter;
    }

    /**
     * Opens a reader on the index directory. Every call opens a new reader;
     * nothing is cached here.
     *
     * @return the newly opened reader; the caller is responsible for closing it
     */
    private IndexReader getReader() throws IOException {
        IndexReader opened = IndexReader.open(indexDir);
        return opened;
    }

    /**
     * Creates a new instance of the analyzer class loaded during init.
     *
     * @return a new analyzer instance
     * @exception IllegalStateException if the class can't be instantiated; the
     * original failure is attached as cause
     */
    private Analyzer createAnalyzer() throws IllegalStateException {
        try {
            return (Analyzer) analyzerClass.newInstance();
        } catch (InstantiationException ex) {
            throw analyzerFailure(ex);
        } catch (ExceptionInInitializerError ex) {
            throw analyzerFailure(ex);
        } catch (SecurityException ex) {
            throw analyzerFailure(ex);
        } catch (IllegalAccessException ex) {
            throw analyzerFailure(ex);
        }
    }

    /**
     * Builds the exception reported when the analyzer can't be instantiated,
     * keeping the original failure as cause.
     */
    private static IllegalStateException analyzerFailure(Throwable cause) {
        IllegalStateException failure = new IllegalStateException(
                "The provided Analyzer-class could not be instantiated: " + cause.getMessage());
        // initCause instead of the two-argument constructor, which older JDKs lack
        failure.initCause(cause);
        return failure;
    }

    /**
     * Converts hits into an EntryList, using the configured offset and limit.
     */
    private EntryList fillEntryList(Hits hits) throws IOException {
        int configuredOffset = getOffset();
        int configuredLimit = getLimit();
        return fillEntryList(hits, configuredOffset, configuredLimit);
    }

    /**
     * Converts the hits from index <code>offset</code> (inclusive) up to
     * <code>offset + limit</code> (exclusive) into Entries.
     *
     * @param hits the search result, may not be null
     * @param offset index of the first hit to convert; negative values are
     * treated as 0
     * @param limit maximum number of hits to convert
     * @return an EntryList created with the total number of hits, carrying the
     * offset and limit and the converted Entries; it contains no Entries if
     * <code>offset</code> lies beyond the last hit
     * @exception IOException if reading a hit from the index fails
     * @exception IllegalArgumentException if parameter <code>hits</code> is null.
     */
    private EntryList fillEntryList(Hits hits, int offset, int limit) throws IOException {
        if (hits == null)
            throw new IllegalArgumentException("Parameter 'hits' is not allowed to be null");

        // a negative offset would index the hits out of bounds
        if (offset < 0)
            offset = 0;

        int hitsNum = hits.length();

        EntryList entries = new EntryList(hitsNum);

        LOG.debug("Offset is : " + offset + " / Limit is: " + limit);

        entries.setLimit(limit);
        entries.setOffset(offset);

        // Computed as long: as int, offset + limit overflows for large limits
        // (e.g. Integer.MAX_VALUE with a positive offset) and would wrongly
        // yield an empty page.
        long requestedEnd = (long) offset + (long) limit;

        // Hits throws an IndexOutOfBoundsException just like an array when an
        // element outside its bounds is requested, so never go past hitsNum
        int lastElem = requestedEnd < hitsNum ? (int) requestedEnd : hitsNum;

        for (int i = offset; i < lastElem; i++) {
            entries.add(new Entry(hits.doc(i), hits.score(i)));
        }

        return entries;
    }

    /**
     * Updates and persists the <code>ADDED_SINCE_OPTIMIZE</code> counter.
     *
     * @param newAdded number of documents to add to the stored counter, or
     * <code>Integer.MIN_VALUE</code> to reset the counter to "0"
     */
    private void persistProperties(int newAdded) throws IOException {
        Configuration conf = registry.getConfiguration();

        String updatedCount;
        if (newAdded == Integer.MIN_VALUE) {
            updatedCount = "0";
        } else {
            int previous = Integer.parseInt(conf.getProperty(ADDED_SINCE_OPTIMIZE, "0"));
            updatedCount = String.valueOf(previous + newAdded);
        }

        conf.setProperty(ADDED_SINCE_OPTIMIZE, updatedCount);
        conf.persist(new String[] { ADDED_SINCE_OPTIMIZE });
    }

    /**
     * Computes the MD5 digest of the URL's String form and returns it as
     * lowercase hex String, two digits per byte.
     *
     * @param url the URL to hash
     * @return the hex representation of the digest
     * @exception IllegalStateException if MD5 is not available on this system
     */
    private String computeHash(URL url) {
        final String algorithm = "MD5";

        MessageDigest digester;
        try {
            digester = MessageDigest.getInstance(algorithm);
        } catch (NoSuchAlgorithmException ex) {
            throw new IllegalStateException("FATAL: Your system does not support '" + algorithm + "' hashing");
        }

        // NOTE(review): getBytes() uses the platform default charset, so a URL
        // with non-ASCII characters may hash differently on another platform
        byte[] digest = digester.digest(url.toString().getBytes());

        final String hexDigits = "0123456789abcdef";
        StringBuffer hex = new StringBuffer(digest.length * 2);
        for (int idx = 0; idx < digest.length; idx++) {
            // mask the signed byte into the range 0..255
            int unsigned = digest[idx] & 0xFF;
            hex.append(hexDigits.charAt(unsigned >>> 4));
            hex.append(hexDigits.charAt(unsigned & 0x0F));
        }

        return hex.toString();
    }

}