edu.indiana.d2i.htrc.io.index.lucene.LuceneClient.java Source code

Introduction

Here is the source code for edu.indiana.d2i.htrc.io.index.lucene.LuceneClient.java
Source

/*
#
# Copyright 2012 The Trustees of Indiana University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------
#
# Project: knn
# File:  LuceneClient.java
# Description:  
#
# -----------------------------------------------------------------
# 
 */

package edu.indiana.d2i.htrc.io.index.lucene;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

import edu.indiana.d2i.htrc.HTRCConstants;
import edu.indiana.d2i.htrc.io.index.Dictionary;
import edu.indiana.d2i.htrc.io.index.filter.DictionaryFilter;
import edu.indiana.d2i.htrc.io.index.filter.FrequencyFilter;
import edu.indiana.d2i.htrc.io.index.filter.HTRCFilter;
import edu.indiana.d2i.htrc.io.index.filter.StopWordFilter;
import edu.indiana.d2i.htrc.io.index.filter.WordLengthFilter;

/**
 * Lazily created, shared client that reads per-volume term-frequency vectors
 * from a Lucene index stored on a Hadoop {@link FileSystem}.
 *
 * <p>The index location is read from the configuration property
 * {@link HTRCConstants#LUCENE_INDEX_PATH}. Terms are passed through a filter
 * chain (stop words, dictionary membership, minimum frequency, minimum word
 * length) before being written into the result vector.
 *
 * <p>This class performs no synchronization of its own, neither around the
 * shared instance nor around the timing counter, so callers must not use it
 * concurrently from several threads.
 */
public class LuceneClient {

    private static final Log logger = LogFactory.getLog(LuceneClient.class);

    /** Shared instance; cleared again when that instance is closed. */
    private static LuceneClient client = null;

    private final IndexReader indexReader;
    private final IndexSearcher indexSearcher;
    private final Dictionary dictionary;

    private final HTRCFilter filter;

    /** Nanoseconds accumulated in the term filtering loop of {@link #getTFVector}. */
    private long elapsedTime = 0;

    /**
     * Opens the index and builds the dictionary and the term filter chain.
     *
     * @param conf Hadoop configuration providing the index path and filter thresholds
     * @throws IllegalArgumentException if the index path property is not set
     * @throws IOException if the index cannot be opened
     */
    private LuceneClient(Configuration conf) throws IOException {
        String directory = conf.get(HTRCConstants.LUCENE_INDEX_PATH);
        if (directory == null) {
            // new Path(null) would fail anyway; fail with a message naming the property.
            throw new IllegalArgumentException(
                    "Missing configuration property " + HTRCConstants.LUCENE_INDEX_PATH);
        }

        FileSystem fs = FileSystem.get(conf);
        Path indexPath = new Path(directory);
        Directory dir = new FileSystemDirectory(fs, indexPath, false, conf);

        // Open the index once and share the reader with the searcher, so that the
        // document ids returned by a search always refer to the reader used for
        // fetching term vectors.
        indexReader = IndexReader.open(dir);
        boolean initialized = false;
        try {
            indexSearcher = new IndexSearcher(indexReader);

            dictionary = new Dictionary(conf);

            // "stopwords.txt" is expected on the classpath (per the original author's note).
            HTRCFilter chain = new StopWordFilter("stopwords.txt");
            chain.addNextFilter(new DictionaryFilter(dictionary));
            chain.addNextFilter(
                    new FrequencyFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_FREQUENCE, 2)));
            chain.addNextFilter(
                    new WordLengthFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_LENGTH, 2)));
            filter = chain;

            initialized = true;
        } finally {
            if (!initialized) {
                // Do not leak the open reader when a later setup step fails.
                indexReader.close();
            }
        }
    }

    /**
     * Releases the searcher and the reader. If this instance is the shared one,
     * the shared reference is cleared so that the next call to
     * {@link #createLuceneClient(Configuration)} opens a fresh client instead of
     * handing out a closed one.
     *
     * @throws IOException if closing the searcher or the reader fails
     */
    public void close() throws IOException {
        if (client == this) {
            client = null;
        }
        try {
            indexSearcher.close();
        } finally {
            // Always attempt to close the reader, even if closing the searcher failed.
            indexReader.close();
        }
    }

    /**
     * Builds the term-frequency vector of one volume.
     *
     * <p>The volume is looked up by an exact match on the {@code id} field and its
     * terms are taken from the term vector of the {@code ocr} field. Only terms
     * accepted by the filter chain are written to the result, at the index the
     * dictionary assigns to them.
     *
     * @param volumeId value of the {@code id} field of the wanted document
     * @return sparse vector with one dimension per dictionary entry; empty if the
     *         document has no term vector for the {@code ocr} field
     * @throws IOException if the index cannot be read or no document has the given id
     */
    public Vector getTFVector(String volumeId) throws IOException {
        Vector result = new RandomAccessSparseVector(dictionary.size());

        logger.info("Get TF vector for " + volumeId);

        TermQuery termquery = new TermQuery(new Term("id", volumeId));
        // Only the best hit is used, so ask for a single result instead of maxDoc().
        TopDocs hits = indexSearcher.search(termquery, 1);
        ScoreDoc[] docs = hits.scoreDocs;
        if (docs == null || docs.length == 0) {
            throw new IOException("No document with id '" + volumeId + "' found in the Lucene index");
        }
        int docId = docs[0].doc; // the id is expected to match a single document
        TermPositionVector vector = (TermPositionVector) indexReader.getTermFreqVector(docId, "ocr");
        if (vector == null) {
            // No term vector stored for this document/field: there are no terms to add.
            logger.warn("No term vector in field 'ocr' for " + volumeId);
            return result;
        }

        long t0 = System.nanoTime();
        String[] terms = vector.getTerms();
        int[] freq = vector.getTermFrequencies();
        for (int j = 0; j < terms.length; j++) {
            if (filter.accept(terms[j], freq[j])) {
                result.setQuick(dictionary.get(terms[j]), freq[j]);
            }
        }
        long t1 = System.nanoTime();
        elapsedTime += t1 - t0;

        return result;
    }

    /**
     * Returns the time accumulated so far in the filtering loop of
     * {@link #getTFVector(String)}. Despite the method name, the value is measured
     * with {@link System#nanoTime()}, i.e. it is elapsed wall-clock time in
     * nanoseconds rather than CPU time.
     *
     * @return accumulated elapsed time in nanoseconds
     */
    public long getCPUTime() {
        return elapsedTime;
    }

    /**
     * Returns the shared client, creating it from {@code conf} on first use or
     * after the previous shared client has been closed. When a shared client
     * already exists, {@code conf} is ignored.
     *
     * @param conf Hadoop configuration used if a new client has to be created
     * @return the shared client instance
     * @throws IOException if a new client cannot be opened
     */
    public static LuceneClient createLuceneClient(Configuration conf) throws IOException {
        if (client == null) {
            client = new LuceneClient(conf);
        }
        return client;
    }
}