lucenesearche.HW3.java Source code

Introduction

Here is the source code for lucenesearche.HW3.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package lucenesearche;

/**
 *
 * @author shantanu
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Formatter;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * To create Apache Lucene index in a folder and add files into this index based
 * on the input of the user.
 */
public class HW3 {
    private static Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    private static Analyzer sAnalyzer = new SimpleAnalyzer(Version.LUCENE_47);

    private IndexWriter writer;
    private ArrayList<File> queue = new ArrayList<File>();

    public static void main(String[] args) throws IOException {
        System.out.println(
                "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

        String indexLocation = null;
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        String s = br.readLine();

        HW3 indexer = null;
        try {
            indexLocation = s;
            indexer = new HW3(s);
        } catch (Exception ex) {
            System.out.println("Cannot create index..." + ex.getMessage());
            System.exit(-1);
        }
        String query1, query2, query3, query4;
        query1 = "Lucene_Results_Stopped.txt";
        query2 = "Lucene_Q2_top100.txt";
        query3 = "Lucene_Q3_top100.txt";
        query4 = "Lucene_Q4_top100.txt";

        File luceneFile = new File(query1); // change filename for each query
        int query_id;

        // ===================================================
        // read input from user until he enters q for quit
        // ===================================================
        while (!s.equalsIgnoreCase("q")) {
            try {
                System.out.println(
                        "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
                System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
                s = br.readLine();
                if (s.equalsIgnoreCase("q")) {
                    break;
                }

                // try to add file into the index
                indexer.indexFileOrDirectory(s);
            } catch (Exception e) {
                System.out.println("Error indexing " + s + " : " + e.getMessage());
            }
        }

        // ===================================================
        // after adding, we always have to call the
        // closeIndex, otherwise the index is not created
        // ===================================================
        indexer.closeIndex();

        // =========================================================
        // Now search
        // =========================================================
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
        IndexSearcher searcher = new IndexSearcher(reader);
        //TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        Formatter f = new Formatter();

        s = "";
        File file1 = new File(
                "C:\\Users\\shantanu\\Downloads\\NetBeansProjects\\LuceneSearchE\\src\\lucenesearche\\query_stopped.txt");
        ScoreDoc[] hits;
        try {
            BufferedReader b = new BufferedReader(new FileReader(file1));
            query_id = 1;
            FileInputStream fis = new FileInputStream(
                    "C:\\Users\\shantanu\\Downloads\\NetBeansProjects\\LuceneSearchE\\src\\lucenesearche\\query_stopped.txt");
            Scanner scanner = new Scanner(fis);

            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);

            while (scanner.hasNextLine()) {
                String line;

                //line = b.readLine();
                line = scanner.nextLine();
                if (line == null)
                    break;

                System.out.println(b.readLine());
                //s = br.readLine();
                if (s.equalsIgnoreCase("q")) {
                    break;
                }
                TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
                Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(line);
                searcher.search(q, collector);
                //System.out.println(searcher);

                hits = collector.topDocs().scoreDocs;

                System.out.println(hits.length);

                // 4. display results
                // change this for new query 
                //writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s","Query ID","Q0","Document Name","Rank","Cosine Similarity Score","System Name\n"));
                System.out.println("Found " + hits.length + " hits.");
                //System.out.println(f.format("%-10s %-10s %-80s %-10s %-40s %-20s","Query ID","Q0","Document Name","Rank","Cosine Similarity Score","System Name"));
                for (int i = 0; i < hits.length; ++i) {
                    Formatter fmt = new Formatter();
                    int docId = hits[i].doc;
                    Document d = searcher.doc(docId);
                    //System.out.println(d.get("filename"));
                    //System.out.println((i+1) +". " + d.get("path")+" "+ hits[i].score);
                    String a = d.get("filename");
                    String parts = a.substring(0, a.indexOf('.'));
                    //System.out.println(parts);

                    writer.append(String.format("%-10s %-10s %-30s %-10s %-30s", query_id, "Q0", parts, (i + 1),
                            hits[i].score));
                    writer.append('\n');
                    writer.flush();
                    //System.out.println(fmt.format("%-10s %-10s %-80s %-10s %-40s %-20s",""+query_id,"Q0",""+d.get("path"),""+(i + 1),""+hits[i].score,"Shantanu-SYS-001"));
                }

                // 5. term stats --> watch out for which "version" of the term
                // must be checked here instead!
                /*Term termInstance = new Term("contents", s);
                long termFreq = reader.totalTermFreq(termInstance);
                long docCount = reader.docFreq(termInstance);
                System.out.println(s + " Term Frequency " + termFreq
                   + " - Document Frequency " + docCount);*/
                query_id += 1;
            }
            writer.close();
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.toString());
            //break;
        }

    }

    /**
     * Constructor
     * 
     * @param indexDir
     *            the name of the folder in which the index should be created
     * @throws java.io.IOException
     *             when exception creating index.
     */
    HW3(String indexDir) throws IOException {

        FSDirectory dir = FSDirectory.open(new File(indexDir));

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, sAnalyzer);

        writer = new IndexWriter(dir, config);
    }

    /**
     * Indexes a file or directory
     * 
     * @param fileName
     *            the name of a text file or a folder we wish to add to the
     *            index
     * @throws java.io.IOException
     *             when exception
     */
    public void indexFileOrDirectory(String fileName) throws IOException {
        // ===================================================
        // gets the list of files in a folder (if user has submitted
        // the name of a folder) or gets a single file name (is user
        // has submitted only the file name)
        // ===================================================
        addFiles(new File(fileName));

        int originalNumDocs = writer.numDocs();
        for (File f : queue) {
            FileReader fr = null;
            try {
                Document doc = new Document();

                // ===================================================
                // add contents of file
                // ===================================================
                fr = new FileReader(f);
                doc.add(new TextField("contents", fr));
                doc.add(new StringField("path", f.getPath(), Field.Store.YES));
                doc.add(new StringField("filename", f.getName(), Field.Store.YES));

                writer.addDocument(doc);
                System.out.println("Added: " + f);
            } catch (Exception e) {
                System.out.println("Could not add: " + f);
            } finally {
                fr.close();
            }
        }

        int newNumDocs = writer.numDocs();
        System.out.println("");
        System.out.println("************************");
        System.out.println((newNumDocs - originalNumDocs) + " documents added.");
        System.out.println("************************");

        queue.clear();
    }

    private void addFiles(File file) {

        if (!file.exists()) {
            System.out.println(file + " does not exist.");
        }
        if (file.isDirectory()) {
            for (File f : file.listFiles()) {
                addFiles(f);
            }
        } else {
            String filename = file.getName().toLowerCase();
            // ===================================================
            // Only index text files
            // ===================================================
            if (filename.endsWith(".htm") || filename.endsWith(".html") || filename.endsWith(".xml")
                    || filename.endsWith(".txt")) {
                queue.add(file);
            } else {
                System.out.println("Skipped " + filename);
            }
        }
    }

    /**
     * Close the index.
     * 
     * @throws java.io.IOException
     *             when exception closing
     */
    public void closeIndex() throws IOException {
        writer.close();
    }
}