iit.cs570.assign1.web.managedbean.SIMBM25Searcher.java Source code

Java tutorial

Introduction

Here is the source code for iit.cs570.assign1.web.managedbean.SIMBM25Searcher.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package iit.cs570.assign1.web.managedbean;

import iit.cs570.assign1.web.util.TheCrawlersConstants;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import iit.cs570.assign1.web.interfaces.*;
import iit.cs570.assign1.web.util.SimRankUtil;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.faces.context.FacesContext;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;

/**
 * Re-ranks BM25 search hits with SimRank similarity scores read from a database.
 *
 * <p>For each of the leading BM25 hits, the SimRank table is queried with the hit's document
 * id. Every row returned contributes {@code bm25Score * simRankScore} to the running total of
 * the document id named in that row. The documents that collected a score are then loaded
 * from the Lucene index and returned together with a highlighted fragment of their text.
 *
 * <p>{@link #search} stores the parsed query in the {@code query} field, and
 * {@link #getBestFragment} reads it back, so a single instance should not run searches
 * concurrently.
 *
 * @author Neelesh
 */
public class SIMBM25Searcher {

    /** Number of leading BM25 hits whose SimRank rows are folded into the combined score. */
    private static final int MAX_SEED_RESULTS = 5;

    private List<SearchResult> results;
    // Declared but never assigned anywhere in this class.
    IndexSearcher indexSearcher;
    QueryParser queryParser;
    // Set by search(); getBestFragment() highlights against it.
    Query query;
    IndexReader reader;
    // Kept so that close() can release the directory as well as the reader.
    private final Directory indexDirectory;

    /**
     * Opens the Lucene index stored at the given path and prepares a query parser over the
     * {@code TheCrawlersConstants.TEXT} field.
     *
     * @param indexDirectoryPath file-system path of the Lucene index directory
     * @throws IOException if the directory or the index cannot be opened
     */
    public SIMBM25Searcher(String indexDirectoryPath) throws IOException {
        queryParser = new QueryParser(Version.LUCENE_42, TheCrawlersConstants.TEXT,
                new StandardAnalyzer(Version.LUCENE_42));
        Directory directory = FSDirectory.open(new File(indexDirectoryPath));
        try {
            reader = DirectoryReader.open(directory);
        } catch (IOException e) {
            // Do not leave the directory open when the index itself cannot be read.
            try {
                directory.close();
            } catch (IOException ignored) {
                // The failure to open the index is the error worth reporting.
            }
            throw e;
        }
        indexDirectory = directory;
    }

    /**
     * Builds a result list by combining the scores of the leading BM25 hits with the SimRank
     * scores stored for them.
     *
     * <p>At most {@code MAX_SEED_RESULTS} entries of {@code bm25Results} are used. Hits whose
     * file name has no entry in the document name/id map are skipped. Database and index
     * errors are printed and the affected hit or document is left out; they are not rethrown.
     * The returned list follows the iteration order of a {@link HashMap} and is therefore not
     * sorted by score.
     *
     * @param searchQuery the user's query text; parsed and later used for highlighting
     * @param bm25Results BM25 hits to use as seeds; may be {@code null} or shorter than
     *                    {@code MAX_SEED_RESULTS}
     * @return the documents that received a combined score and produced a highlighted
     *         fragment; never {@code null}
     * @throws IOException    declared for callers; index read failures are currently caught
     *                        and printed inside this method
     * @throws ParseException if {@code searchQuery} cannot be parsed
     */
    public List<SearchResult> search(String searchQuery, List<SearchResult> bm25Results)
            throws IOException, ParseException {

        results = new ArrayList<SearchResult>();
        System.out.println("Query to be processed is ---->" + searchQuery);
        query = queryParser.parse(searchQuery);

        Map<Integer, Float> combinedScore = new HashMap<Integer, Float>();

        // Never read past the end of the list when fewer hits than the seed limit were found.
        int seedCount = bm25Results == null ? 0 : Math.min(MAX_SEED_RESULTS, bm25Results.size());
        for (int i = 0; i < seedCount; i++) {
            SearchResult seed = bm25Results.get(i);
            Integer id = DocumentMapBuilder.getInstance().getDocumentNameIdMap().get(seed.getFileName());
            if (id == null) {
                // Unboxing a missing id would throw; an unmapped file simply contributes nothing.
                System.out.println("No document id found for file: " + seed.getFileName());
                continue;
            }
            float bm25Score = seed.getScore();
            System.out.println("Id of the file is given as: " + id);
            System.out.println("bm25 score is : " + bm25Score);
            addSimRankScores(id, bm25Score, combinedScore);
        }

        collectResults(combinedScore);
        return results;

    }

    /**
     * Runs the SimRank query for one seed document and adds {@code bm25Score * simRankScore}
     * to the total of every document id returned.
     */
    private void addSimRankScores(int id, float bm25Score, Map<Integer, Float> combinedScore) {
        PreparedStatement stmt = null;
        ResultSet rs = null;
        try {
            // NOTE(review): the connection is not closed here because SOURCE does not show
            // whether SimRankUtil hands out a shared or pooled connection - confirm ownership.
            Connection conn = SimRankUtil.getInstance().getConnection();
            stmt = conn.prepareStatement(TheCrawlersConstants.SELECT_SIMRANK_SQL);
            stmt.setInt(1, id);
            rs = stmt.executeQuery();
            while (rs.next()) {
                int col = rs.getInt(TheCrawlersConstants.SIMRANK_COL2);
                float score = rs.getFloat(TheCrawlersConstants.SIMRANK_COL3);

                float contribution = bm25Score * score;
                Float previous = combinedScore.get(col);
                combinedScore.put(col, previous == null ? contribution : previous + contribution);
            }
        } catch (Exception e) {
            // Best effort: a failed lookup for one seed must not abort the whole search.
            e.printStackTrace();
        } finally {
            closeQuietly(rs, stmt);
        }
    }

    /**
     * Loads every scored document from the index and appends a result for each one that
     * yields a highlighted fragment.
     */
    private void collectResults(Map<Integer, Float> combinedScore) {
        for (Map.Entry<Integer, Float> entry : combinedScore.entrySet()) {
            // Handled per document so one unreadable entry does not drop the remaining ones.
            try {
                Document doc = reader.document(entry.getKey());
                String summary = getBestFragment(doc);

                if (summary != null) {
                    SearchResult res = new SearchResult(doc.get(TheCrawlersConstants.TITLE),
                            doc.get(TheCrawlersConstants.FILE_NAME), entry.getValue(), summary);
                    results.add(res);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the given JDBC objects, printing (not propagating) any failure to close. */
    private static void closeQuietly(ResultSet rs, PreparedStatement stmt) {
        if (rs != null) {
            try {
                rs.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (stmt != null) {
            try {
                stmt.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Closes the index reader and the directory it was opened on.
     *
     * @throws IOException if closing the reader or the directory fails
     */
    public void close() throws IOException {
        try {
            reader.close();
        } finally {
            indexDirectory.close();
        }
    }

    /**
     * Returns the best highlighted fragment of the document's {@code TheCrawlersConstants.TEXT}
     * field for the query most recently parsed by {@link #search}.
     *
     * @param doc the document to summarise
     * @return the highlighted fragment, or {@code null} when the document has no stored text
     *         or the highlighter finds no fragment
     * @throws IOException                  if analysing the text fails
     * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
     */
    public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
        String text = doc.get(TheCrawlersConstants.TEXT);
        if (text == null) {
            // Nothing to highlight; passing null on to the highlighter would throw.
            return null;
        }
        Scorer scorer = new QueryScorer(query);
        Highlighter highlighter = new Highlighter(scorer);
        return highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_42),
                TheCrawlersConstants.TEXT, text);
    }
}