// Java tutorial
/*
 * Copyright (C) 2014
 *
 * This file is part of InformationRetrieval.
 *
 * InformationRetrieval is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * InformationRetrieval is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with InformationRetrieval. If not, see <http://www.gnu.org/licenses/>.
 */
package de.minecrawler.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import de.minecrawler.data.CrawledWebsite;
import de.minecrawler.data.CrawledWebsiteResult;

/**
 * General class for a search engine providing a Lucene based search on a
 * directory. The implementation defines the directory for the index.
 */
public abstract class AbstractSearchEngine {

    // Fields for the indices
    protected static final String FIELD_TITLE = "title";
    protected static final String FIELD_BODY = "body";
    protected static final String FIELD_URL = "url";

    /** Fields a query is matched against; the URL field is not searched. */
    private static final String[] FIELDS = { FIELD_BODY, FIELD_TITLE };

    protected final static Version LUCENE_VERSION = Version.LUCENE_46;
    protected final static Analyzer ANALYZER = new StandardAnalyzer(LUCENE_VERSION);

    /** Directory holding the index, as returned by {@link #createDirectory(Object...)}. */
    protected Directory dir;

    /**
     * Creates the search engine and obtains its index directory from
     * {@link #createDirectory(Object...)}.
     * <p>
     * Note that {@code createDirectory} is invoked from within this
     * constructor, i.e. before the constructor of the subclass has run.
     * Implementations must therefore only rely on the passed arguments and
     * not on their own instance fields.
     *
     * @param args
     *            Arguments handed over unchanged to
     *            {@link #createDirectory(Object...)}
     * @throws Exception
     *             Declared so that subclass constructors may propagate
     *             failures while setting up the index
     */
    public AbstractSearchEngine(Object... args) throws Exception {
        this.dir = createDirectory(args);
    }

    /**
     * Creates a directory for the index. Called once from the constructor
     * of this class, before any subclass field has been initialized.
     *
     * @param args
     *            Possible arguments to pass by
     * @return A directory for the index
     */
    protected abstract Directory createDirectory(Object... args);

    /**
     * Starts a search on the parsed documents using a search query. The default
     * maximum number of results is 10.
     *
     * @param queryString
     *            The query string <a href=
     *            "http://lucene.apache.org/core/4_1_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html"
     *            >Query Format</a>
     *
     * @return List of results
     */
    public List<CrawledWebsiteResult> search(String queryString) {
        return search(queryString, 10);
    }

    /**
     * Starts a search on the parsed documents using a search query.
     * <p>
     * The query is matched against the body and the title field. If the index
     * cannot be read or the query cannot be parsed, a message is printed and
     * an empty list is returned instead of throwing.
     *
     * @param queryString
     *            The query string <a href=
     *            "http://lucene.apache.org/core/4_1_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html"
     *            >Query Format</a>
     * @param limit
     *            The maximum number of results
     * @return List of results ordered as returned by Lucene; the rank stored
     *         in each result starts at 1
     */
    public List<CrawledWebsiteResult> search(String queryString, int limit) {
        try {
            DirectoryReader ireader = DirectoryReader.open(this.dir);
            try {
                IndexSearcher isearcher = new IndexSearcher(ireader);
                QueryParser parser = new MultiFieldQueryParser(LUCENE_VERSION, FIELDS, ANALYZER);
                Query query = parser.parse(queryString);

                ScoreDoc[] hits = isearcher.search(query, null, limit).scoreDocs;
                List<CrawledWebsiteResult> result = new ArrayList<CrawledWebsiteResult>(hits.length);
                for (int i = 0; i < hits.length; ++i) {
                    Document hitDoc = isearcher.doc(hits[i].doc);
                    CrawledWebsite website = extractWebsite(hitDoc);
                    result.add(new CrawledWebsiteResult(website, i + 1, hits[i].score));
                }
                return result;
            } finally {
                // Release the reader on every path; previously a failing
                // parse or search left it (and its file handles) open.
                ireader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return Collections.<CrawledWebsiteResult>emptyList();
        } catch (ParseException e) {
            System.out.println("Wrong query! Check your query format!");
            System.out.println(e.getMessage());
            return Collections.<CrawledWebsiteResult>emptyList();
        }
    }

    /**
     * Creates a wrapper from the document.
     *
     * @param doc
     *            The document containing the stored body, title and URL fields
     * @return Container class built from these three field values
     */
    protected CrawledWebsite extractWebsite(Document doc) {
        return new CrawledWebsite(doc.get(FIELD_BODY), doc.get(FIELD_TITLE), doc.get(FIELD_URL));
    }
}