edu.cuhk.hccl.cmd.AppSearchEngine.java Source code

Java tutorial

Introduction

Here is the source code for edu.cuhk.hccl.cmd.AppSearchEngine.java

Source

/**
 * Copyright (C) 2014 Pengfei Liu <pfliu@se.cuhk.edu.hk>
 * The Chinese University of Hong Kong.
 *
 * This file is part of smart-search-web.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.cuhk.hccl.cmd;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import edu.cuhk.hccl.expander.QueryExpander;
import edu.cuhk.hccl.expander.WordNetExpander;
import edu.cuhk.hccl.expander.WordVectorExpander;

/**
 * This program indexes a folder of text documents and supports phrase queries
 * @author Pengfei Liu (pfliu@se.cuhk.edu.hk)
 *
 */
public class AppSearchEngine {

    /** Name of the indexed field holding the full text of a document. */
    public final static String CONTENT_FIELD = "content";

    /** Name of the stored field holding the path of a document. */
    public final static String PATH_FIELD = "path";

    /** Query expander selected from the command line; stays null for phrase and boolean queries. */
    private static QueryExpander expander = null;

    /**
     * Field type for document content: indexed, tokenized and stored, with
     * term vectors (including positions) so that term vectors can be read
     * back per document for the similarity computation.
     */
    public static final FieldType TERM_STORED = new FieldType();

    static {
        TERM_STORED.setIndexed(true);
        TERM_STORED.setTokenized(true);
        TERM_STORED.setStored(true);
        TERM_STORED.setStoreTermVectors(true);
        TERM_STORED.setStoreTermVectorPositions(true);
        TERM_STORED.freeze();
    }

    /**
     * Entry point: parses the command line, indexes the data folder in memory,
     * runs the query, prints the top hits and writes the hit list and the
     * pairwise cosine similarities of the hits to the two output files.
     *
     * Options -d, -q, -t, -k, -f and -s are mandatory. Option -m is mandatory
     * only for the expansion query types (WordNet, WordVector) and is ignored
     * for phrase and boolean queries. Any parameter problem is reported on
     * standard output and terminates the program with exit code -1.
     *
     * @param args command line arguments
     * @throws IOException if indexing, searching or writing the result files fails
     */
    public static void main(String[] args) throws IOException {

        // Get parameters
        CommandLineParser parser = new BasicParser();
        Options options = createOptions();

        File dataFolder = null;
        String queryStr = null;
        int topK = 0;
        File resultFile = null;
        String queryType = null;
        File similarityFile = null;

        try {
            CommandLine line = parser.parse(options, args);

            dataFolder = new File(requireOption(line, 'd'));
            queryStr = requireOption(line, 'q');
            queryType = requireOption(line, 't');

            topK = parseTopK(requireOption(line, 'k'));
            resultFile = new File(requireOption(line, 'f'));
            similarityFile = new File(requireOption(line, 's'));

            if (!dataFolder.isDirectory()) {
                throw new ParseException("Data folder is not an existing directory: " + dataFolder.getPath());
            }

            // Only the expansion query types need an expander (and therefore a model).
            boolean plainQuery = queryType.equalsIgnoreCase("PHRASE") || queryType.equalsIgnoreCase("BOOLEAN");
            if (!plainQuery) {
                if (queryType.equalsIgnoreCase("WordVector")) {
                    expander = new WordVectorExpander(requireOption(line, 'm'));
                } else if (queryType.equalsIgnoreCase("WordNet")) {
                    expander = new WordNetExpander(requireOption(line, 'm'));
                } else {
                    System.out.println("Please choose a correct expander: WordNet or WordVector!");
                    System.exit(-1);
                }
            }

        } catch (ParseException exp) {
            System.out.println("Error in parameters: \n" + exp.getMessage());
            System.exit(-1);
        }

        List<String> hitLines = new ArrayList<String>();
        List<String> simLines = new ArrayList<String>();

        StandardAnalyzer analyzer = new StandardAnalyzer();
        try {
            // Create Index
            Directory index = createIndex(dataFolder, analyzer);

            // Build query
            Query query = buildQuery(analyzer, queryStr, queryType);
            if (query == null) {
                // The query builder yields null when no term survives analysis
                // (e.g. only stop words, or an empty expansion).
                System.out.println("Error in parameters: \nQuery contains no searchable terms: " + queryStr);
                System.exit(-1);
            }

            // Search index for topK hits and compute similarities between them
            searchIndex(index, query, topK, hitLines, simLines);
        } finally {
            // Release resources even when indexing or searching failed
            analyzer.close();
            if (expander != null) {
                expander.close();
            }
        }

        // Save search results
        System.out.println("\n[INFO] Search results are saved in file: " + resultFile.getPath());
        FileUtils.writeLines(resultFile, hitLines, false);

        System.out.println("\n[INFO] Cosine similarities are saved in file: " + similarityFile.getPath());
        FileUtils.writeLines(similarityFile, simLines, false);
    }

    /**
     * Return the value of a mandatory option.
     * @param line parsed command line
     * @param opt short name of the option
     * @return the option value, never null
     * @throws ParseException if the option was not given
     */
    private static String requireOption(CommandLine line, char opt) throws ParseException {
        String value = line.getOptionValue(opt);
        if (value == null) {
            throw new ParseException("Missing required option: -" + opt);
        }
        return value;
    }

    /**
     * Parse the number of hits to return.
     * @param value raw value of the -k option
     * @return a strictly positive number of hits
     * @throws ParseException if the value is not a positive integer
     */
    private static int parseTopK(String value) throws ParseException {
        int topK;
        try {
            topK = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new ParseException("Option -k must be an integer, but was: " + value);
        }
        // The collector rejects a non-positive number of hits, so fail early with a clear message.
        if (topK <= 0) {
            throw new ParseException("Option -k must be a positive integer, but was: " + value);
        }
        return topK;
    }

    /**
     * Search the index, print the hits and collect the output lines.
     * One line "rank TAB path TAB score" is appended to hitLines for every hit,
     * and one line "path1 TAB path2 TAB similarity" is appended to simLines for
     * every unordered pair of hits.
     * @param index index to search
     * @param query query to run
     * @param topK maximum number of hits to return
     * @param hitLines receives the hit lines
     * @param simLines receives the similarity lines
     * @throws IOException if reading the index fails
     */
    private static void searchIndex(Directory index, Query query, int topK, List<String> hitLines,
            List<String> simLines) throws IOException {

        IndexReader reader = DirectoryReader.open(index);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(topK, true);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // Show search results; remember each path so the similarity loop
            // below does not have to load the stored documents again.
            System.out.println("\n[INFO] " + hits.length + " hits were returned:");
            String[] paths = new String[hits.length];

            for (int i = 0; i < hits.length; i++) {
                Document d = searcher.doc(hits[i].doc);
                paths[i] = d.get(PATH_FIELD);

                String line = (i + 1) + "\t" + paths[i] + "\t" + hits[i].score;

                System.out.println(line);

                hitLines.add(line);
            }

            // Compute cosine similarity between every pair of returned documents
            for (int m = 0; m < hits.length; m++) {
                Terms terms1 = reader.getTermVector(hits[m].doc, CONTENT_FIELD);

                for (int n = m + 1; n < hits.length; n++) {
                    Terms terms2 = reader.getTermVector(hits[n].doc, CONTENT_FIELD);

                    CosineDocumentSimilarity cosine = new CosineDocumentSimilarity(terms1, terms2);
                    double similarity = cosine.getCosineSimilarity();
                    simLines.add(paths[m] + "\t" + paths[n] + "\t" + similarity);
                }
            }
        } finally {
            reader.close();
        }
    }

    /***
     * Create Options for program parameters
     * @return the options understood by this program
     */
    private static Options createOptions() {

        Options options = new Options();

        options.addOption("d", "data", true, "Data set to index and search");
        options.addOption("q", "query", true, "Query string");
        options.addOption("t", "type", true, "Query type: boolean, phrase, WordNet or WordVector");
        options.addOption("m", "model", true, "Model file for word vectors");
        options.addOption("k", "topK", true, "Number of hits to return");
        options.addOption("f", "file", true, "Output search result to a file");
        options.addOption("s", "similarity", true, "Output similarity result to a file");
        return options;
    }

    /**
     * Build a Query. "PHRASE" yields a phrase query and "BOOLEAN" a boolean
     * query over the query string (both case-insensitive); any other type
     * expands the query string with the configured expander and builds a
     * boolean query over the expanded words.
     * @param analyzer analyzer used to tokenize the query text
     * @param queryStr query string given by the user
     * @param queryType type of the query
     * @return the query, or null when the query builder produces no query
     * @throws IllegalStateException if expansion is requested but no expander is configured
     */
    private static Query buildQuery(StandardAnalyzer analyzer, String queryStr, String queryType) {

        QueryParser parser = new QueryParser(CONTENT_FIELD, analyzer);

        if (queryType.equalsIgnoreCase("PHRASE")) {
            return parser.createPhraseQuery(CONTENT_FIELD, queryStr);
        }

        if (queryType.equalsIgnoreCase("BOOLEAN")) {
            return parser.createBooleanQuery(CONTENT_FIELD, queryStr);
        }

        if (expander == null) {
            throw new IllegalStateException(
                    "Query type '" + queryType + "' needs a query expander, but none is configured");
        }

        // Find synonymous words of queryStr
        List<String> synWords = expander.expandQuery(queryStr, 5);

        // createBooleanQuery analyzes plain text and does not parse query
        // syntax, so the words are joined with spaces only; a literal "OR"
        // would itself be treated as a word of the query text.
        StringBuilder expandedQuery = new StringBuilder();
        if (synWords != null) {
            for (String word : synWords) {
                if (expandedQuery.length() > 0) {
                    expandedQuery.append(' ');
                }
                expandedQuery.append(word);
            }
        }

        return parser.createBooleanQuery(CONTENT_FIELD, expandedQuery.toString());
    }

    /**
     * Create index of RAMDirectory from a data folder with text files.
     * Every file below the folder (recursively) becomes one document with its
     * path and its content.
     * @param dataFolder folder containing the text files
     * @param analyzer analyzer used to tokenize the file content
     * @return the in-memory index
     * @throws IOException if a file cannot be read or the index cannot be written
     */
    private static Directory createIndex(File dataFolder, StandardAnalyzer analyzer) throws IOException {

        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);
        IndexWriter writer = new IndexWriter(index, config);

        try {
            Collection<File> files = FileUtils.listFiles(dataFolder, null, true);
            for (File file : files) {
                String path = file.getPath();
                // NOTE(review): no charset is passed here, so the platform default
                // is presumably used - confirm the expected encoding of the data set.
                String content = FileUtils.readFileToString(file);

                Document doc = new Document();

                doc.add(new StringField(PATH_FIELD, path, Field.Store.YES));
                doc.add(new Field(CONTENT_FIELD, content, TERM_STORED));

                writer.addDocument(doc);

                System.out.println("[INFO] Indexing file: " + path);
            }

            System.out.println("\n[INFO]" + files.size() + " files has been indexed.");
        } finally {
            // Close the writer even if reading or indexing a file failed
            writer.close();
        }

        return index;
    }
}