org.bibsonomy.lucene.util.generator.LuceneGenerateResourceIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.bibsonomy.lucene.util.generator.LuceneGenerateResourceIndex.java

Source

/**
 *
 *  BibSonomy-Lucene - A blue social bookmark and publication sharing system.
 *
 *  Copyright (C) 2006 - 2011 Knowledge & Data Engineering Group,
 *                            University of Kassel, Germany
 *                            http://www.kde.cs.uni-kassel.de/
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.bibsonomy.lucene.util.generator;

import static org.bibsonomy.lucene.util.LuceneBase.CFG_INDEX_ID_DELIMITER;
import static org.bibsonomy.lucene.util.LuceneBase.CFG_LUCENE_INDEX_PREFIX;
import static org.bibsonomy.lucene.util.LuceneBase.SQL_BLOCKSIZE;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.sql.SQLException;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.NoSuchDirectoryException;
import org.bibsonomy.lucene.database.LuceneDBInterface;
import org.bibsonomy.lucene.param.LucenePost;
import org.bibsonomy.lucene.util.LuceneBase;
import org.bibsonomy.lucene.util.LuceneResourceConverter;
import org.bibsonomy.model.Group;
import org.bibsonomy.model.Post;
import org.bibsonomy.model.Resource;

/**
 * reads bibsonomy data from database and builds lucene index for all resource
 * entries
 * 
 * @author sst
 * @author fei
 * @version $Id: LuceneGenerateResourceIndex.java,v 1.10 2011-05-28 12:27:35 nosebrain Exp $
 * 
 * @param <R> the resource of the index to generate
 */
public abstract class LuceneGenerateResourceIndex<R extends Resource> implements Runnable {
    /** logger shared by this generator and its subclasses */
    protected static final Log log = LogFactory.getLog(LuceneGenerateResourceIndex.class);

    /** database logic which supplies the posts that are written to the index */
    protected LuceneDBInterface<R> dbLogic;

    /**
     * base path of the resource index; the id of the concrete index is
     * appended to it, separated by CFG_INDEX_ID_DELIMITER
     */
    private String luceneResourceIndexPath;

    /** maximum length of fields in the lucene index */
    private IndexWriter.MaxFieldLength mfl;

    /** writes the resource index (created by createEmptyIndex()) */
    protected IndexWriter indexWriter;

    /** analyzer handed to the index writer; must be set before an index is created */
    private Analyzer analyzer = null;

    /** converts post model objects to lucene documents */
    private LuceneResourceConverter<R> resourceConverter;

    /** optional callback which is notified once by shutdown() and then discarded */
    private GenerateIndexCallback callback = null;

    /**
     * the progress-percentage if index-generation is running;
     * volatile because it is written by the generating thread and read by
     * other threads through getProgressPercentage()
     */
    private volatile int progressPercentage;

    /**
     * set to true if the generator is currently generating an index;
     * volatile so that a change made by the generating thread is visible to
     * other threads which try to start a generation
     */
    private volatile boolean isRunning;

    /**
     * Creates a new generator and directly loads its configuration
     * by delegating to {@code init()}.
     */
    public LuceneGenerateResourceIndex() {
        this.init();
    }

    /**
     * Initializes the runtime configuration, loads the database driver and
     * reads the index path and the maximum field length from
     * {@link LuceneBase}.
     * <p>
     * A driver which cannot be loaded is only logged, the remaining
     * initialization is still performed.
     */
    private void init() {
        // initialize run time configuration
        LuceneBase.initRuntimeConfiguration();

        // load the db drivers
        try {
            Class.forName(LuceneBase.getDbDriverName());
        } catch (final Exception e) {
            // hand the exception itself to the logger so that its type and
            // stack trace are not lost
            log.error(
                    "Error loading the mysql driver. Please check, that the mysql connector library is available. ["
                            + e.getMessage() + "]",
                    e);
        }

        // 1) index files: <base path><index prefix><resource name>
        this.luceneResourceIndexPath = LuceneBase.getIndexBasePath() + CFG_LUCENE_INDEX_PREFIX + getResourceName();

        // 2) maximal field width in the index
        this.mfl = LuceneBase.getMaximumFieldLength();
    }

    /**
     * Frees allocated resources: closes the index writer (if one was
     * created) and notifies the registered callback.
     * <p>
     * The callback is notified even if closing the writer fails, and it is
     * discarded afterwards so that it is called at most once.
     *
     * @throws IOException if closing the index writer fails
     * @throws CorruptIndexException if closing the index writer fails
     *         because the index is corrupt
     */
    public void shutdown() throws CorruptIndexException, IOException {
        try {
            // the writer is still null if no index was opened,
            // e.g. because the generation failed early
            if (indexWriter != null) {
                indexWriter.close();
            }
        } finally {
            if (callback != null) {
                callback.done();
                callback = null;
            }
        }
    }

    /**
     * Reads the data from the database and builds the index.
     * <p>
     * The steps are: delete the old index 0, create an empty index, fill it
     * from the database and optionally copy it to the redundant indices.
     * If a generation is already running, the call returns without doing
     * anything.
     * <p>
     * Database as well as index files are configured in the
     * lucene.properties file.
     *
     * @param copyRedundantIndices if set to true, the newly generated index
     *        will also be copied to the redundant indices
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws SQLException
     */
    public void generateIndex(final boolean copyRedundantIndices)
            throws CorruptIndexException, IOException, ClassNotFoundException, SQLException {
        // Allow only one index-generation at a time.
        // NOTE(review): check and set are not atomic; two threads entering at
        // the very same moment could both pass this guard.
        if (isRunning) {
            return;
        }
        isRunning = true;

        try {
            // Delete the old index, if exists
            deleteIndex(0);

            // open index
            createEmptyIndex();

            // generate index
            createIndexFromDatabase();

            // create redundant indeces
            if (copyRedundantIndices) {
                log.info("Creating " + LuceneBase.getRedundantCnt() + " redundant indeces.");
                this.copyRedundantIndeces();
            }
        } finally {
            // always release the flag; otherwise one failed run would block
            // every later generation
            isRunning = false;
        }
    }

    /**
     * Convenience variant of {@code generateIndex(boolean)} which always
     * copies the generated index to the redundant indices.
     *
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws SQLException
     */
    public void generateIndex() throws CorruptIndexException, IOException, ClassNotFoundException, SQLException {
        final boolean copyRedundantIndices = true;
        this.generateIndex(copyRedundantIndices);
    }

    /**
     * Opens the index writer on the directory of index 0, creating a new
     * index and possibly overwriting existing index files. The attributes
     * must already be configured (via init()).
     *
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void createEmptyIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        log.info("Creating empty lucene index...");

        // the generated index always gets the id 0
        final String indexPath = this.luceneResourceIndexPath + CFG_INDEX_ID_DELIMITER + "0";
        final File indexFolder = new File(indexPath);
        final Directory target = FSDirectory.open(indexFolder);

        // create flag is set => possibly overwrites existing index files
        this.indexWriter = new IndexWriter(target, this.getAnalyzer(), true, this.mfl);
    }

    /**
     * Fills the index with all posts of the database.
     * <p>
     * The posts are read block wise (SQL_BLOCKSIZE entries per query). Every
     * post is converted to a lucene document; documents of posts which
     * belong to a spammer group are not added. Afterwards the index is
     * optimized and the index writer is closed.
     * <p>
     * While running, the progress percentage is updated after each block; it
     * is always kept within 0..100 and set to 100 when the index was written.
     *
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void createIndexFromDatabase() throws CorruptIndexException, IOException {
        log.info("Filling index with database post entries.");

        // set up resource specific data structures
        setUp();

        // number of post entries to calculate progress
        //FIXME: the number of posts is wrong
        final int numberOfPosts = this.dbLogic.getNumberOfPosts();
        progressPercentage = 0;
        log.info("Number of post entries: " + numberOfPosts);

        // initialize variables
        final Integer lastTasId = this.dbLogic.getLastTasId();
        Date lastLogDate = this.dbLogic.getLastLogDate();

        if (lastLogDate == null) {
            // no log date available => fall back to one second ago
            lastLogDate = new Date(System.currentTimeMillis() - 1000);
        }

        int i = 0; // number of indexed entries
        int is = 0; // number of spam entries

        log.info("Start writing data to lucene index");

        // read block wise all posts
        List<LucenePost<R>> postList = null;
        int skip = 0;
        do {
            postList = this.dbLogic.getPostEntries(skip, SQL_BLOCKSIZE);
            skip += postList.size(); // TODO: += SQL_BLOCKSIZE?!
            log.info("Read " + skip + " entries.");

            // cycle through all posts of currently read block
            for (final LucenePost<R> postEntry : postList) {
                // update management fields
                postEntry.setLastLogDate(lastLogDate);
                postEntry.setLastTasId(lastTasId);

                // create index document from post model
                final Document post = this.resourceConverter.readPost(postEntry);

                // add (non-spam) document to index
                // FIXME: is this check necessary?
                if (isNotSpammer(postEntry)) {
                    indexWriter.addDocument(post);
                    i++;
                } else {
                    is++;
                }
            }
            progressPercentage = toPercentage(skip, numberOfPosts);
            log.info(progressPercentage + "% of index-generation done!");

            // a block which is not completely filled is the last one
        } while (postList.size() == SQL_BLOCKSIZE);

        // optimize index
        log.info("optimizing index " + luceneResourceIndexPath);
        indexWriter.optimize();

        // close the index writer
        log.info("closing index " + luceneResourceIndexPath);
        indexWriter.close();

        // the index is complete, independent of the (possibly wrong) post count
        progressPercentage = 100;

        // all done
        log.info("(" + i + " indexed entries, " + is + " not indexed spam entries)");
    }

    /**
     * Calculates a progress percentage which is guaranteed to be in 0..100.
     *
     * @param done number of processed entries
     * @param total expected number of entries; may be zero or too small
     * @return the rounded percentage, 0 if total is not positive
     */
    private static int toPercentage(final int done, final int total) {
        if (total <= 0) {
            // avoid a division by zero (NaN / Infinity)
            return 0;
        }
        final long percentage = Math.round(100.0 * done / total);
        return (int) Math.max(0L, Math.min(100L, percentage));
    }

    /**
     * Returns the progress of the index generation in percent, as last
     * stored by the generation.
     *
     * @return the progressPercentage
     */
    public int getProgressPercentage() {
        return this.progressPercentage;
    }

    /**
     * Runs the index-generation in a thread.
     * <p>
     * The index is generated without copying it to the redundant indices.
     * Failures are logged and not propagated; {@code shutdown()} is called
     * in any case afterwards.
     */
    @Override
    public void run() {
        try {
            this.generateIndex(false);
        } catch (final Exception generationFailure) {
            log.error("Failed to generate " + this.getResourceName() + "-index!", generationFailure);
        } finally {
            // release the writer and notify the callback, whatever happened
            try {
                this.shutdown();
            } catch (final Exception shutdownFailure) {
                log.error("Failed to close index-writer!", shutdownFailure);
            }
        }
    }

    /**
     * Tests that the given post does not belong to a spammer. A post counts
     * as spam as soon as one of its groups has a negative group id.
     *
     * @param post the post to check
     * @return <code>false</code> if a group with a negative id was found,
     *         <code>true</code> otherwise
     */
    protected boolean isNotSpammer(final Post<? extends Resource> post) {
        boolean noSpammerGroup = true;
        for (final Group candidate : post.getGroups()) {
            if (candidate.getGroupId() < 0) {
                // spammer group found => user is spammer, no need to look further
                noSpammerGroup = false;
                break;
            }
        }
        return noSpammerGroup;
    }

    //------------------------------------------------------------------------
    // helper methods
    //------------------------------------------------------------------------
    /**
     * Copies the generated index (id 0) to the indices with the ids
     * 1 .. redundantCnt - 1. The old content of each target index is deleted
     * first. A failure for one index is logged and does not stop the
     * copying of the remaining indices.
     */
    public void copyRedundantIndeces() {
        final String pathPrefix = this.luceneResourceIndexPath + CFG_INDEX_ID_DELIMITER;
        final File master = new File(pathPrefix + "0");

        int indexId = 1;
        while (indexId < LuceneBase.getRedundantCnt()) {
            try {
                this.deleteIndex(indexId);
                final File replica = new File(pathPrefix + indexId);
                log.info("Copying index " + indexId);
                this.copyDirectory(master, replica);
                log.info("Done.");
            } catch (final Exception copyFailure) {
                log.error("Error copying index to index file " + indexId, copyFailure);
            }
            indexId++;
        }
    }

    /**
     * Deletes all files of the index identified by its id. The index
     * directory itself is kept.
     * <p>
     * Failures are logged and not propagated: a missing directory results in
     * a warning, other I/O problems in an error.
     *
     * @param id the index-id
     */
    public void deleteIndex(final int id) {
        final File directory = new File(this.luceneResourceIndexPath + CFG_INDEX_ID_DELIMITER + id);
        Directory indexDirectory = null;
        try {
            indexDirectory = FSDirectory.open(directory);

            log.info("Deleting index " + directory.getAbsolutePath() + "...");
            for (final String filename : indexDirectory.listAll()) {
                indexDirectory.deleteFile(filename);
                log.debug("Deleted " + filename);
            }
            log.info("Success.");
        } catch (final NoSuchDirectoryException e) {
            log.warn("Tried to delete the lucene-index-directory but it could not be found.", e);
        } catch (final IOException e) {
            log.error("Could not delete directory-content before index-generation or index-copy.", e);
        } finally {
            // release the directory handle which was opened above
            if (indexDirectory != null) {
                try {
                    indexDirectory.close();
                } catch (final IOException e) {
                    log.warn("Could not close index directory " + directory.getAbsolutePath(), e);
                }
            }
        }
    }

    /**
     * Copies all files under srcDir to dstDir, recursively.
     * If dstDir does not exist, it will be created (including missing
     * parent directories). If srcDir is not a directory, it is copied as a
     * single file to dstDir.
     *
     * @param srcDir the directory (or file) to copy
     * @param dstDir the target directory (or file)
     * @throws IOException if the target directory cannot be created, the
     *         source directory cannot be listed or a file cannot be copied
     */
    public void copyDirectory(final File srcDir, final File dstDir) throws IOException {
        if (!srcDir.isDirectory()) {
            copyFile(srcDir, dstDir);
            return;
        }

        // mkdirs() returns false if the directory already exists, therefore
        // the final check decides whether the target is usable
        if (!dstDir.isDirectory() && !dstDir.mkdirs() && !dstDir.isDirectory()) {
            throw new IOException("Could not create directory " + dstDir.getAbsolutePath());
        }

        // list() returns null if the directory cannot be read
        final String[] children = srcDir.list();
        if (children == null) {
            throw new IOException("Could not list content of directory " + srcDir.getAbsolutePath());
        }

        for (final String child : children) {
            copyDirectory(new File(srcDir, child), new File(dstDir, child));
        }
    }

    /**
     * Copies the content of one file to another file. An existing
     * destination file is overwritten.
     * <p>
     * The data is transferred channel to channel in a loop, so files of any
     * size (also larger than 2 GB) are copied completely, and both files are
     * closed even if copying or closing one of them fails.
     *
     * @param source the file to read
     * @param dest the file to write
     * @throws IOException if one of the files cannot be opened or the
     *         transfer fails
     */
    public static void copyFile(final File source, final File dest) throws IOException {
        final FileInputStream input = new FileInputStream(source);
        try {
            final FileOutputStream output = new FileOutputStream(dest);
            try {
                final FileChannel in = input.getChannel();
                final FileChannel out = output.getChannel();

                final long size = in.size();
                long position = 0;
                // transferTo may move fewer bytes than requested
                while (position < size) {
                    final long transferred = in.transferTo(position, size - position, out);
                    if (transferred <= 0) {
                        // nothing left to read, e.g. the source was truncated meanwhile
                        break;
                    }
                    position += transferred;
                }
            } finally {
                // closing the stream also closes its channel
                output.close();
            }
        } finally {
            input.close();
        }
    }

    /**
     * Returns the name of the resource which is managed by the concrete
     * generator. The name is appended to the index base path and prefix to
     * build the path of the index, and it is used in log messages.
     * <p>
     * NOTE(review): this method is called during construction (by init()),
     * so implementations should not depend on state of the subclass which is
     * initialized later.
     *
     * @return get managed resource name
     */
    protected abstract String getResourceName();

    /**
     * Sets up resource specific data structures. Called at the beginning of
     * createIndexFromDatabase(), before any post is read. The default
     * implementation does nothing; subclasses may override it.
     */
    protected void setUp() {
        // noop
    }

    /**
     * Returns the database logic which supplies the posts for the index.
     *
     * @return the dbLogic
     */
    public LuceneDBInterface<R> getLogic() {
        return this.dbLogic;
    }

    /**
     * Sets the database logic which supplies the posts for the index.
     *
     * @param dbLogic the dbLogic to set
     */
    public void setLogic(final LuceneDBInterface<R> dbLogic) {
        this.dbLogic = dbLogic;
    }

    /**
     * Returns the analyzer which is handed to the index writer.
     *
     * @return the analyzer
     */
    public Analyzer getAnalyzer() {
        return this.analyzer;
    }

    /**
     * Sets the analyzer which is handed to the index writer.
     *
     * @param analyzer the analyzer to set
     */
    public void setAnalyzer(final Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Returns the converter which turns posts into lucene documents.
     *
     * @return the resourceConverter
     */
    public LuceneResourceConverter<R> getResourceConverter() {
        return this.resourceConverter;
    }

    /**
     * Sets the converter which turns posts into lucene documents.
     *
     * @param resourceConverter the resourceConverter to set
     */
    public void setResourceConverter(final LuceneResourceConverter<R> resourceConverter) {
        this.resourceConverter = resourceConverter;
    }

    /**
     * Registers the callback of this generator, replacing a previously
     * registered one.
     *
     * @param callback the callback to set
     */
    public void registerCallback(final GenerateIndexCallback callback) {
        this.callback = callback;
    }
}