com.nuvolect.deepdive.lucene.Index.java Source code

Java tutorial

Introduction

Here is the source code for com.nuvolect.deepdive.lucene.Index.java

Source

/*
 * Copyright (c) 2018 Nuvolect LLC.
 * This software is offered for free under conditions of the GPLv3 open source software license.
 * Contact Nuvolect LLC for a less restrictive commercial license if you would like to use the software
 * without the GPLv3 restrictions.
 */

package com.nuvolect.deepdive.lucene;

import com.nuvolect.deepdive.main.CConst;
import com.nuvolect.deepdive.util.LogUtil;
import com.nuvolect.deepdive.util.OmniFile;
import com.nuvolect.deepdive.util.OmniUtil;

import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.json.JSONException;
import org.json.JSONObject;
import org.lukhnos.portmobile.file.Paths;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;

/**
 * Index a Lucene database given an path.
 * index_state:{nil, filetree, indexing, complete, interrupted}
 * filetree : gen recursive list of files, { total_docs == 0, indexed_docs==0}
 * indexing : index files, populate { total_docs, indexed_docs }
 * complete : index is complete, are the same { total_docs, indexed_do
 * interrupted: index process was interrupted and is not complete
 */
public class Index {

    public static enum INDEX_STATE {
        nil, filetree, indexing, interrupted, complete
    };

    private static INDEX_STATE m_index_state = INDEX_STATE.nil;

    private static final Object m_lock = new Object();
    private static Thread m_indexThread = null;
    private static ThreadGroup m_threadGroup = null;
    private static String INDEX_THREAD_GROUP = "Index thread group";
    private static String INDEX_THREAD = "Index thread";
    private static int STACK_SIZE = 20 * 1024 * 1024;
    private static boolean m_fileTreeActive = false;
    private static final boolean[] m_interrupt = { false };
    private static final int[] m_totalDocs = { 0 };
    private static final int[] m_indexedDocs = { 0 };
    private static final String[] m_error = { "" };

    public static JSONObject index(final String volumeId, final String searchPath, final boolean forceIndex) {

        if (m_interrupt[0]) {

            LogUtil.log(LogUtil.LogType.INDEX, "Index canceled post interrupt");

            m_interrupt[0] = false;
            return responseInterruptIndexing();
        }

        OmniFile cacheDir = IndexUtil.getCacheDir(volumeId, searchPath);
        boolean cacheDirCreated = false;
        try {
            cacheDirCreated = OmniUtil.forceMkdir(cacheDir);
        } catch (IOException e) {
            return responseFolderCreateError(searchPath);
        }

        final String luceneDirPath = cacheDir.getAbsolutePath();

        boolean cacheDirExists = !cacheDirCreated;
        boolean indexingOngoing = m_indexThread != null && m_indexThread.isAlive();
        boolean indexingRequired = !cacheDirExists || forceIndex;

        synchronized (m_lock) {

            if (indexingOngoing) {

                if (m_fileTreeActive)
                    m_index_state = INDEX_STATE.filetree;
                else
                    m_index_state = INDEX_STATE.indexing;
            } else {
                if (indexingRequired)
                    m_index_state = INDEX_STATE.indexing;
                else
                    m_index_state = INDEX_STATE.complete;
            }
        }

        if (indexingRequired || indexingOngoing) {

            if (indexingOngoing) {

                // Nothing to do, let the background process run. Monitor m_indexedDocs for progress.
            } else {

                synchronized (m_lock) {
                    m_index_state = INDEX_STATE.filetree;
                    m_totalDocs[0] = 0;
                    m_indexedDocs[0] = 0;
                    m_error[0] = "";
                }
                m_threadGroup = new ThreadGroup(INDEX_THREAD_GROUP);
                m_indexThread = new Thread(m_threadGroup, new Runnable() {
                    @Override
                    public void run() {

                        //                        Analyzer analyzer = new org.apache.lucene.analysis.core.WhitespaceAnalyzer();
                        //                        Analyzer analyzer = new org.apache.lucene.analysis.core.KeywordAnalyzer();
                        //                        Analyzer analyzer = new org.apache.lucene.analysis.standard.StandardAnalyzer();
                        Analyzer analyzer = new org.apache.lucene.analysis.core.SimpleAnalyzer();
                        IndexWriterConfig config = new IndexWriterConfig(analyzer);
                        IndexWriter iwriter = null;

                        try {
                            Directory m_directory = FSDirectory.open(Paths.get(luceneDirPath));
                            iwriter = new IndexWriter(m_directory, config);
                            iwriter.deleteAll();
                            iwriter.commit();
                        } catch (IOException e) {
                            LogUtil.logException(LogUtil.LogType.INDEX, e);
                            m_error[0] = "IndexWriter constructor exception";
                        }

                        synchronized (m_lock) {
                            m_fileTreeActive = true;
                            m_index_state = INDEX_STATE.filetree;
                        }
                        Collection<OmniFile> files = IndexUtil.getFilePaths(volumeId, searchPath);

                        synchronized (m_lock) {
                            m_index_state = INDEX_STATE.indexing;
                            m_fileTreeActive = false;
                            m_totalDocs[0] = files.size();
                            m_indexedDocs[0] = 0;
                        }

                        try {

                            for (OmniFile file : files) {

                                if (m_interrupt[0]) {
                                    LogUtil.log(LogUtil.LogType.INDEX, "Iterator loop canceled");
                                    break;
                                }

                                String path = file.getPath();

                                //                                LogUtil.log(LogUtil.LogType.INDEX, "indexing: " + path);// this is a bit excessive
                                iwriter.addDocument(makeDoc(volumeId, path));
                                synchronized (m_lock) {
                                    ++m_indexedDocs[0];
                                }
                            }

                            iwriter.commit();
                            iwriter.close();
                            synchronized (m_lock) {
                                m_index_state = m_interrupt[0] ? INDEX_STATE.interrupted : INDEX_STATE.complete;
                                m_totalDocs[0] = m_indexedDocs[0];
                            }

                        } catch (Exception e) {
                            LogUtil.logException(LogUtil.LogType.INDEX, e);
                            m_error[0] = "IndexWriter addDocument exception";
                        }
                    }
                }, INDEX_THREAD, STACK_SIZE);

                m_indexThread.setPriority(Thread.MAX_PRIORITY);
                m_indexThread.start();
            }
        } else {

            // Indexing is complete
            // Get number of documents indexed
            try {
                Directory directory = FSDirectory.open(Paths.get(luceneDirPath));
                DirectoryReader ireader = DirectoryReader.open(directory);
                synchronized (m_lock) {
                    m_indexedDocs[0] = ireader.numDocs();
                    m_totalDocs[0] = m_indexedDocs[0];
                    m_index_state = INDEX_STATE.complete;
                }
                ireader.close();
                directory.close();
            } catch (IOException e) {
                LogUtil.logException(LogUtil.LogType.INDEX, e);
            }
        }

        JSONObject result = new JSONObject();
        try {
            synchronized (m_lock) {
                result.put("index_state", m_index_state.toString());
                result.put("error", m_error[0]);
                result.put("indexed_docs", m_indexedDocs[0]);
                result.put("total_docs", m_totalDocs[0]);
                //                result.put("full_path", cacheDir.getAbsolutePath());
                result.put("search_path", searchPath);
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }

        return result;
    }

    public static JSONObject interrupt() {

        synchronized (m_lock) {
            m_interrupt[0] = true;
        }
        LogUtil.log(LogUtil.LogType.INDEX,
                "interrupting---------------================================================");
        return responseInterruptIndexing();
    }

    private static JSONObject responseInterruptIndexing() {

        JSONObject result = new JSONObject();
        try {
            result.put("index_state", INDEX_STATE.interrupted.toString());
            result.put("error", "");
            result.put("indexed_docs", m_indexedDocs[0]);
            result.put("total_docs", m_totalDocs[0]);
            result.put("search_path", "");

        } catch (JSONException e) {
            e.printStackTrace();
        }

        return result;
    }

    private static JSONObject responseFolderCreateError(String searchPath) {

        JSONObject result = new JSONObject();
        try {
            result.put("index_state", INDEX_STATE.complete.toString());
            result.put("error", "Folder error: " + searchPath);
            result.put("indexed_docs", 0);
            result.put("total_docs", 0);
            result.put("search_path", searchPath);

        } catch (JSONException e) {
            e.printStackTrace();
        }

        return result;
    }

    /**
     * Build a single document to be indexed with additional data to be returned with search results.
     * @param volumeId // Volume of the file
     * @param path // Path to the file
     * @return
     * @throws IllegalArgumentException
     * @throws FileNotFoundException
     */
    private static Document makeDoc(String volumeId, String path)
            throws IllegalArgumentException, FileNotFoundException {

        Document doc = new Document();

        String fileName = FilenameUtils.getName(path);
        // Tokenize, index and store
        doc.add(new TextField(CConst.FIELD_FILENAME, fileName, Field.Store.YES));

        // Only stored, not indexed
        doc.add(new StoredField(CConst.FIELD_VOLUME, volumeId));

        // Only stored, not indexed
        doc.add(new StoredField(CConst.FIELD_PATH, path));

        // Index only, do not store
        OmniFile file = new OmniFile(volumeId, path);
        java.io.Reader reader = new java.io.FileReader(file.getStdFile());
        doc.add(new Field(CConst.FIELD_CONTENT, reader, TextField.TYPE_NOT_STORED));

        return doc;
    }
}