Java tutorial
/** * Copyright (C) 2014 Pengfei Liu <pfliu@se.cuhk.edu.hk> * The Chinese University of Hong Kong. * * This file is part of smart-search-web. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.cuhk.hccl.cmd; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.List; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Terms; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import edu.cuhk.hccl.expander.QueryExpander; import edu.cuhk.hccl.expander.WordNetExpander; import edu.cuhk.hccl.expander.WordVectorExpander; /** * This program indexs a folder with text documents and supports phrase query * @author Pengfei Liu (pfliu@se.cuhk.edu.hk) * */ public class AppSearchEngine { public final static String CONTENT_FIELD = "content"; public final static String PATH_FIELD = "path"; private static QueryExpander expander = null; public static final FieldType TERM_STORED = new FieldType(); static { TERM_STORED.setIndexed(true); TERM_STORED.setTokenized(true); TERM_STORED.setStored(true); TERM_STORED.setStoreTermVectors(true); TERM_STORED.setStoreTermVectorPositions(true); TERM_STORED.freeze(); } public static void main(String[] args) throws IOException { // Get parameters CommandLineParser parser = new BasicParser(); Options options = createOptions(); File dataFolder = null; String queryStr = null; int topK = 0; File resultFile = null; String queryType = null; File similarityFile = null; try { CommandLine line = parser.parse(options, args); dataFolder = new File(line.getOptionValue('d')); queryStr = line.getOptionValue('q'); queryType = line.getOptionValue('t'); topK = Integer.parseInt(line.getOptionValue('k')); resultFile = new File(line.getOptionValue('f')); similarityFile = new File(line.getOptionValue('s')); if (line.hasOption('m')) { String modelPath = line.getOptionValue('m'); if (queryType.equalsIgnoreCase("WordVector")) { expander = new WordVectorExpander(modelPath); } else if (queryType.equalsIgnoreCase("WordNet")) { expander = new WordNetExpander(modelPath); } else { System.out.println("Please choose a correct expander: WordNet or WordVector!"); System.exit(-1); } } } catch (ParseException exp) { System.out.println("Error in parameters: \n" + exp.getMessage()); System.exit(-1); } // Create Index StandardAnalyzer analyzer = new StandardAnalyzer(); Directory index = createIndex(dataFolder, analyzer); // Build query Query query = buildQuery(analyzer, queryStr, queryType); // Search index for topK hits IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(topK, true); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // Show search results System.out.println("\n[INFO] " + hits.length + " hits were returned:"); List<String> hitLines = new ArrayList<String>(); for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); String line = (i + 1) + "\t" + d.get(PATH_FIELD) + "\t" + hits[i].score; System.out.println(line); hitLines.add(line); } // Compute cosine similarity between documents List<String> simLines = new ArrayList<String>(); for (int m = 0; m < hits.length; m++) { int doc1 = hits[m].doc; Terms terms1 = reader.getTermVector(doc1, CONTENT_FIELD); for (int n = m + 1; n < hits.length; n++) { int doc2 = hits[n].doc; Terms terms2 = reader.getTermVector(doc2, CONTENT_FIELD); CosineDocumentSimilarity cosine = new CosineDocumentSimilarity(terms1, terms2); double similarity = cosine.getCosineSimilarity(); String line = searcher.doc(doc1).get(PATH_FIELD) + "\t" + searcher.doc(doc2).get(PATH_FIELD) + "\t" + similarity; simLines.add(line); } } // Release resources reader.close(); if (expander != null) { expander.close(); } // Save search results System.out.println("\n[INFO] Search results are saved in file: " + resultFile.getPath()); FileUtils.writeLines(resultFile, hitLines, false); System.out.println("\n[INFO] Cosine similarities are saved in file: " + similarityFile.getPath()); FileUtils.writeLines(similarityFile, simLines, false); } /*** * Create Options for program parameters * @return */ private static Options createOptions() { Options options = new Options(); options.addOption("d", "data", true, "Data set to index and search"); options.addOption("q", "query", true, "Query string"); options.addOption("t", "type", true, "Query type: boolean, phrase or expand"); options.addOption("m", "model", true, "Model file for word vectors"); options.addOption("k", "topK", true, "Number of hits to return"); options.addOption("f", "file", true, "Output search result to a file"); options.addOption("s", "similarity", true, "Output similarity result to a file"); return options; } /** * Build a Query * @param analyzer * @param queryStr * @param queryType * @return */ private static Query buildQuery(StandardAnalyzer analyzer, String queryStr, String queryType) { QueryParser parser = new QueryParser(CONTENT_FIELD, analyzer); Query query = null; if (queryType.equalsIgnoreCase("PHRASE")) { query = parser.createPhraseQuery(CONTENT_FIELD, queryStr); } else if (queryType.equalsIgnoreCase("BOOLEAN")) { query = parser.createBooleanQuery(CONTENT_FIELD, queryStr); } else { // Find synonymous words of queryStr List<String> synWords = expander.expandQuery(queryStr, 5); StringBuffer newQueryBuffer = new StringBuffer(); for (String word : synWords) { newQueryBuffer.append(word); newQueryBuffer.append(" "); newQueryBuffer.append("OR"); newQueryBuffer.append(" "); } query = parser.createBooleanQuery(CONTENT_FIELD, newQueryBuffer.toString()); } return query; } /** * Create index of RAMDirectory from a data folder with text files * @param dataFolder * @param analyzer * @return * @throws IOException */ private static Directory createIndex(File dataFolder, StandardAnalyzer analyzer) throws IOException { Directory index = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer); IndexWriter writer = new IndexWriter(index, config); Collection<File> files = FileUtils.listFiles(dataFolder, null, true); for (File file : files) { String path = file.getPath(); String content = FileUtils.readFileToString(file); Document doc = new Document(); doc.add(new StringField(PATH_FIELD, path, Field.Store.YES)); doc.add(new Field(CONTENT_FIELD, content, TERM_STORED)); writer.addDocument(doc); System.out.println("[INFO] Indexing file: " + path); } System.out.println("\n[INFO]" + files.size() + " files has been indexed."); writer.close(); return index; } }