edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java Source code

Introduction

Here is the source code for edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java
Source

package edu.wayne.cs.severe.ir4se.lucene;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class SearchFiles {

    // private static int documentType; // 1 - file document; 2 - paragraph
    // document;

    /** Private constructor: this class only exposes static methods and is not meant to be instantiated. */
    private SearchFiles() {
    }

    /**
     * Runs one query against the Lucene index stored at {@code indexDirectoryPath} and writes the positions
     * and scores of the target documents to result files.
     * <p>
     * Before searching, every entry of {@code targetClasses} that {@code Indexer.isFileDocInIndex} reports as
     * absent is appended to {@code NotFound.txt} in the parent of the index directory.
     * <p>
     * With {@code runIndividualTerms == false} the whole query is run once and the results go to
     * {@code fileOutput}. With {@code runIndividualTerms == true} the whole query is run first (results in
     * {@code <fileOutput without .txt>_wholeQuery.txt}) and then every space-separated term of the query is
     * run as a separate query; the per-term hit listing goes to {@code ..._terms.txt} and the target
     * positions to {@code ..._terms_RelevantDocsPositions.txt}.
     *
     * @param system             system name, written as the first column of every whole-query result line
     * @param documentType       1 = file document, 2 = paragraph document; any other value produces no per-hit
     *                           output
     * @param queryNumber        identifier of the query, copied into the result files
     * @param indexDirectoryPath directory containing the Lucene index
     * @param queryString        query text, parsed against the {@code contents} field
     * @param fileOutput         path of the result file (base name for the derived files in term mode)
     * @param targetClasses      the relevant (target) documents to look for in the ranking
     * @param runIndividualTerms whether each query term is additionally run as its own query
     * @param append             whether the whole-query result file is appended to instead of overwritten
     * @throws Exception if the index cannot be read, the query cannot be parsed or an output file cannot be
     *                   written
     */
    @SuppressWarnings("deprecation")
    public static void search(String system, int documentType, int queryNumber, String indexDirectoryPath,
            String queryString, String fileOutput, String[] targetClasses, boolean runIndividualTerms,
            boolean append) throws Exception {

        logMissingTargets(indexDirectoryPath, targetClasses);

        IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexDirectoryPath)), true);
        try {
            int numDocs = reader.numDocs();
            System.out.println("The number of documents in the index is: " + numDocs);

            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
            MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35,
                    new String[] { "contents" }, analyzer);

            if (!runIndividualTerms) {
                writeWholeQueryResults(system, documentType, queryNumber, queryString, fileOutput, targetClasses,
                        append, searcher, parser, numDocs);
            } else {
                // Each query is divided in its constituent terms and each term is run as a separate query.
                // This helps determine which terms of a query tend to lead to the best results, i.e., to
                // finding the targets sooner. The whole query is run first, on the already opened index, so
                // the index is opened and the targets are checked only once.
                writeWholeQueryResults(system, documentType, queryNumber, queryString,
                        replaceTxtExtension(fileOutput, "_wholeQuery.txt"), targetClasses, append, searcher,
                        parser, numDocs);
                writeTermResults(documentType, queryNumber, queryString, fileOutput, targetClasses, reader,
                        searcher, parser, numDocs);
            }
        } finally {
            // Release the index even when parsing, searching or writing fails.
            reader.close();
        }
    }

    /** Appends one line to NotFound.txt (next to the index directory) for every target missing from the index. */
    private static void logMissingTargets(String indexDirectoryPath, String[] targetClasses) throws Exception {
        // File(parent, child) inserts the separator when the index path does not end with one.
        FileWriter notFound = new FileWriter(new File(indexDirectoryPath, "../NotFound.txt"), true);
        try {
            for (int i = 0; i < targetClasses.length; i++) {
                String target = targetClasses[i];
                if (!Indexer.isFileDocInIndex(indexDirectoryPath, target)) {
                    notFound.append("Target doc " + i + " - " + target + " not found in index!\n");
                }
            }
        } finally {
            notFound.close();
        }
    }

    /** Runs the complete query and writes, per target, its rank and score (or NOT_RETRIEVED) to fileOutput. */
    private static void writeWholeQueryResults(String system, int documentType, int queryNumber,
            String queryString, String fileOutput, String[] targetClasses, boolean append,
            IndexSearcher searcher, MultiFieldQueryParser parser, int numDocs) throws Exception {
        // Ask for every document; at least one hit slot is requested so an empty index is not rejected.
        TopScoreDocCollector collector = TopScoreDocCollector.create(Math.max(1, numDocs), true);
        Query query = parser.parse(queryString);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        System.out.println("The number of hits is: " + hits.length);

        // Result lines have the format:
        // system;queryNumber;target;position;numberOfHits;numberOfDocs;score
        FileWriter fwRelevant = new FileWriter(fileOutput, append);
        try {
            for (String target : targetClasses) {
                boolean found = false;
                for (int i = 0; i < hits.length; i++) {
                    Document d = searcher.doc(hits[i].doc);
                    String path = d.get("path");
                    float score = hits[i].score;

                    if (documentType == 2) {
                        String docPathAndName = path.toLowerCase() + "." + d.get("docName").toLowerCase();
                        if (target.equalsIgnoreCase(docPathAndName)) {
                            fwRelevant.write(system + ";" + queryNumber + ";" + target + ";" + (i + 1) + ";"
                                    + hits.length + ";" + numDocs + ";" + score + "\n");
                            found = true;
                            break;
                        }
                    } else if (documentType == 1) {
                        // NOTE(review): for file documents every hit is listed once per target and the
                        // target is never marked as found, so NOT_RETRIEVED is always written below.
                        // Behaviour kept as is - confirm whether this is intended.
                        String docName = replaceTxtExtension(new File(path.trim()).getName(), "");
                        fwRelevant.write((i + 1) + ". doc = " + docName + " score = " + score + "\n");
                    }
                }
                if (!found) {
                    fwRelevant.write(system + ";" + queryNumber + ";" + target + "; NOT_RETRIEVED" + "\n");
                }
            }
        } finally {
            fwRelevant.close();
        }
    }

    /** Runs every space-separated query term as its own query and writes the per-term result files. */
    private static void writeTermResults(int documentType, int queryNumber, String queryString,
            String fileOutput, String[] targetClasses, IndexReader reader, IndexSearcher searcher,
            MultiFieldQueryParser parser, int numDocs) throws Exception {
        FileWriter fw = new FileWriter(replaceTxtExtension(fileOutput, "_terms.txt"));
        try {
            // File with the results only for the relevant documents, one line per term:
            // queryNumber,term,termTF,termDF,relDoc,posRelDoc,scoreRelDoc,...
            FileWriter fwRelevant = new FileWriter(
                    replaceTxtExtension(fileOutput, "_terms_RelevantDocsPositions.txt"));
            try {
                fw.write(
                        "\n\n\n------------------------------------------------------------------------------------\n\n");
                fw.write("                               Results for query " + queryNumber + "\n");
                fw.write("------------------------------------------------------------------------------------\n\n");

                int termNumber = 0;
                for (String q : queryString.split(" ")) {
                    if (q.length() == 0) {
                        // Consecutive spaces yield empty terms, which cannot be parsed as a query.
                        continue;
                    }
                    termNumber++;

                    TopScoreDocCollector collector = TopScoreDocCollector.create(Math.max(1, numDocs), true);
                    Query query = parser.parse(q);
                    searcher.search(query, collector);
                    ScoreDoc[] hits = collector.topDocs().scoreDocs;

                    fw.write("TERM " + termNumber + ": " + q + "\n\n");
                    fwRelevant.write("\n" + queryNumber + "," + q);
                    for (int i = 0; i < hits.length; i++) {
                        Document d = searcher.doc(hits[i].doc);
                        String path = d.get("path");
                        float score = hits[i].score;
                        if (documentType == 2) {
                            String docName = d.get("docName");
                            fw.write((i + 1) + ". doc = " + path + " " + docName + " - score = " + score + "\n");
                            // NOTE(review): targets are compared with docName only here, while the
                            // whole-query run compares them with path + "." + docName - confirm.
                            for (int k = 0; k < targetClasses.length; k++) {
                                if (docName.equalsIgnoreCase(targetClasses[k])) {
                                    // "contents" is null when the field is not stored; count it as 0.
                                    String contents = d.get("contents");
                                    int frequency = contents == null ? 0 : countOccurrences(contents, q); // tf
                                    fwRelevant.write("," + frequency);
                                    fwRelevant.write("," + reader.docFreq(new Term("contents", q))); // df
                                    fwRelevant.write("," + path + "." + docName + "," + (i + 1) + "," + score);
                                    break;
                                }
                            }
                        } else if (documentType == 1) {
                            String docName = replaceTxtExtension(new File(path).getName(), "");
                            fw.write((i + 1) + ". doc = " + docName + " score = " + score + "\n");
                        }
                    }
                    fw.write("\n\n\n");
                }
            } finally {
                fwRelevant.close();
            }
        } finally {
            fw.close();
        }
    }

    /** Replaces a trailing ".txt" of fileName with replacement; appends replacement when there is none. */
    private static String replaceTxtExtension(String fileName, String replacement) {
        String extension = ".txt";
        String base = fileName.endsWith(extension)
                ? fileName.substring(0, fileName.length() - extension.length())
                : fileName;
        return base + replacement;
    }

    /**
     * Counts how many times {@code arg2} occurs in {@code arg1}. Overlapping occurrences are counted, e.g.
     * {@code "aa"} occurs twice in {@code "aaa"}.
     *
     * @param arg1 the text to search in
     * @param arg2 the text to look for
     * @return the number of occurrences; 0 when either argument is {@code null} or {@code arg2} is empty
     */
    public static int countOccurrences(String arg1, String arg2) {
        // An empty needle matches at every position and indexOf never returns -1 for it,
        // so without this guard the loop below would never terminate.
        if (arg1 == null || arg2 == null || arg2.length() == 0) {
            return 0;
        }
        int count = 0;
        int index = arg1.indexOf(arg2);
        while (index != -1) {
            count++;
            // Advance by one character only, so overlapping matches are still found.
            index = arg1.indexOf(arg2, index + 1);
        }
        return count;
    }

    /**
     * Runs every query file found in {@code queryDirPath}, descending recursively into sub-directories.
     * <p>
     * Blank lines in a query file are ignored. The first non-blank line is the query number, the second the
     * query text, the third the number of relevant documents, and the following lines are the paths and
     * names of the relevant documents. The results of a file are written to
     * {@code resultsDirPath/results_<file name>}; the queries of a sub-directory are written below
     * {@code resultsDirPath/<sub-directory name>}.
     * <p>
     * A failure while reading or running one query file is printed and does not stop the remaining files.
     * Nothing happens when {@code queryDirPath} is not a readable directory.
     *
     * @param system             system name passed on to {@code search}
     * @param documentType       document type passed on to {@code search}
     * @param indexDirectoryPath directory containing the Lucene index
     * @param queryDirPath       directory containing the query files
     * @param resultsDirPath     directory that receives the result files
     * @param runIndividualTerms whether each query term is additionally run as a separate query
     */
    public static void runQueriesFromDir(String system, int documentType, String indexDirectoryPath,
            String queryDirPath, String resultsDirPath, boolean runIndividualTerms) {
        File dirQuery = new File(queryDirPath);
        if (!dirQuery.isDirectory()) {
            return;
        }
        String[] children = dirQuery.list();
        if (children == null) {
            // list() returns null when the directory cannot be read.
            return;
        }

        for (String childName : children) {
            File child = new File(queryDirPath + "/" + childName);
            if (child.isFile()) {
                try {
                    int queryNumber = 1;
                    String query = "";
                    String[] targetDocs = null;

                    BufferedReader inQuery = new BufferedReader(new FileReader(child));
                    try {
                        // k is the index of the next relevant doc; it is negative while the three
                        // header lines (query number, query text, number of relevant docs) are read.
                        int k = -3;
                        String line;
                        while ((line = inQuery.readLine()) != null) {
                            String text = line.trim();
                            if (text.length() == 0) {
                                continue;
                            }
                            if (k == -3) {
                                queryNumber = Integer.parseInt(text);
                            } else if (k == -2) {
                                query = text;
                            } else if (k == -1) {
                                targetDocs = new String[Integer.parseInt(text)];
                            } else {
                                targetDocs[k] = text;
                            }
                            k++;
                        }
                    } finally {
                        // Close the reader even when a line cannot be parsed.
                        inQuery.close();
                    }

                    if (targetDocs == null) {
                        throw new IllegalStateException(
                                "Query file " + child + " has fewer than three non-blank lines");
                    }

                    String resultFilePath = resultsDirPath + "/results_" + child.getName();
                    // With runIndividualTerms set, search additionally runs each term of the query as a
                    // separate query, which shows which terms tend to find the targets sooner.
                    SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, query,
                            resultFilePath, targetDocs, runIndividualTerms, false);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else if (child.isDirectory()) {
                runQueriesFromDir(system, documentType, indexDirectoryPath, child.getAbsolutePath(),
                        resultsDirPath + "/" + child.getName(), runIndividualTerms);
            }
        }
    }

    /**
     * Runs all the queries stored in a single file.
     * <p>
     * The query entries are separated by blank lines. Each entry consists of: query number (1st line), query
     * terms (2nd line, lower-cased before searching), number of relevant docs (3rd line), and the paths and
     * names of the relevant docs (remaining lines, normalised with {@code processPath}).
     * <p>
     * A search is run only when an entry is pending, so leading, repeated or trailing blank lines do not
     * re-run a query. A failure is printed and stops the processing of the remaining entries. A message is
     * printed when {@code queryFilePath} is not an existing file.
     *
     * @param system             system name passed on to {@code search}
     * @param documentType       document type passed on to {@code search}
     * @param indexDirectoryPath directory containing the Lucene index
     * @param queryFilePath      file containing the query entries
     * @param resultsFilePath    result file passed on to {@code search} for every entry
     * @param runIndividualTerms whether each query term is additionally run as a separate query
     * @param append             passed on to {@code search}: whether results are appended to the result file
     */
    public static void runQueriesFromFile(String system, int documentType, String indexDirectoryPath,
            String queryFilePath, String resultsFilePath, boolean runIndividualTerms, boolean append) {
        File fileQuery = new File(queryFilePath);
        if (!fileQuery.isFile()) {
            System.out.println("Query file (" + queryFilePath + ") is not valid!");
            return;
        }

        try {
            BufferedReader inQuery = new BufferedReader(new FileReader(fileQuery));
            try {
                String[] targetDocs = null;
                int lineNumber = 0; // number of non-blank lines read for the current entry
                int queryNumber = 1;
                String query = "";

                String line;
                while ((line = inQuery.readLine()) != null) {
                    String text = line.trim();
                    if (text.isEmpty()) {
                        // A blank line ends the current entry; ignore it when no entry is pending.
                        if (lineNumber > 0) {
                            SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, query,
                                    resultsFilePath, targetDocs, runIndividualTerms, append);
                            lineNumber = 0;
                        }
                        continue;
                    }

                    lineNumber++;
                    switch (lineNumber) {
                    case 1:
                        queryNumber = Integer.parseInt(text);
                        break;
                    case 2:
                        query = text.toLowerCase();
                        break;
                    case 3:
                        targetDocs = new String[Integer.parseInt(text)];
                        break;
                    default:
                        targetDocs[lineNumber - 4] = processPath(text);
                        break;
                    }
                }

                // The last entry is not followed by a blank line when the file ends right after it.
                if (lineNumber > 0) {
                    SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, query,
                            resultsFilePath, targetDocs, runIndividualTerms, append);
                }
            } finally {
                // Close the reader even when an entry cannot be parsed or searched.
                inQuery.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs {@code query} on {@code searcher} with a custom {@link Collector} that is called for every
     * matching document, in any order.
     * <p>
     * This simulates the streaming search use case, where all hits are supposed to be processed, regardless
     * of their relevance. The collector currently does nothing with the hits it receives: it only records
     * the scorer and the doc base of the segment being searched.
     *
     * @param searcher the searcher to run the query on
     * @param query    the query to run
     * @throws IOException if the search fails
     */
    public static void doStreamingSearch(final IndexSearcher searcher, Query query) throws IOException {
        Collector streamingHitCollector = new Collector() {
            // Recorded for a future collect() implementation; not read at the moment.
            private Scorer scorer;
            private int docBase;

            public void collect(int docNumber) throws IOException {
                // NOTE(review): no per-hit processing is implemented (a comment in an earlier version
                // said docId and score were printed here) - confirm whether the output should be restored.
            }

            public boolean acceptsDocsOutOfOrder() {
                return true;
            }

            public void setNextReader(IndexReader reader, int docBase) throws IOException {
                this.docBase = docBase;
            }

            public void setScorer(Scorer scorer) throws IOException {
                this.scorer = scorer;
            }
        };

        searcher.search(query, streamingHitCollector);
    }

    /**
     * Normalises the path of a relevant document read from a query file: the path is lower-cased, line feeds
     * are removed, carriage returns and tabs become spaces, every {@code /} becomes a {@code .}, the space
     * after a comma is dropped, and one leading {@code .} is removed.
     *
     * @param path the path as read from the query file
     * @return the normalised path
     */
    public static String processPath(String path) {
        String res = path.toLowerCase();
        res = res.replace("\n", "");
        res = res.replace('\r', ' ');
        res = res.replace('\t', ' ');
        res = res.replace('/', '.');
        // res = res.replace("::", ".");
        res = res.replace(", ", ",");
        if (res.startsWith(".")) {
            // Drop the dot from the normalised value, not from the raw input.
            res = res.substring(1);
        }
        return res;
    }

}