List of usage examples for the org.apache.lucene.index.memory.MemoryIndex constructor MemoryIndex()
public MemoryIndex()
From source file:ch.sentric.hbase.prospective.Percolator.java
License:Apache License
/** * Tries to find a set of queries that match the given document. * * @param doc//from ww w . ja va 2s . c o m * the Lucene document * @return the matching queries * @throws IOException * if an I/O error occurs */ public Response<T> percolate(final Document doc, final Map<T, Query> queries) throws IOException { // first, parse the source doc into a MemoryIndex final MemoryIndex memoryIndex = new MemoryIndex(); for (final Fieldable field : doc.getFields()) { if (!field.isIndexed()) { continue; } final TokenStream tokenStream = field.tokenStreamValue(); if (tokenStream != null) { memoryIndex.addField(field.name(), tokenStream, field.getBoost()); } else { final Reader reader = field.readerValue(); if (reader != null) { memoryIndex.addField(field.name(), analyzer.reusableTokenStream(field.name(), reader), field.getBoost()); } else { final String value = field.stringValue(); if (value != null) { memoryIndex.addField(field.name(), analyzer.reusableTokenStream(field.name(), new CharSequenceReader(value)), field.getBoost()); } } } } // do the search final IndexSearcher searcher = memoryIndex.createSearcher(); final Map<T, Query> matches = new HashMap<T, Query>(0); if (queries != null && !queries.isEmpty()) { final ExistsCollector collector = new ExistsCollector(); for (final Map.Entry<T, Query> entry : queries.entrySet()) { collector.reset(); searcher.search(entry.getValue(), collector); if (collector.exists()) { matches.put(entry.getKey(), entry.getValue()); } } } return new Response<T>(matches); }
From source file:com.appspot.socialinquirer.server.service.impl.StackExchangeServiceImpl.java
License:Apache License
/**
 * Checks whether the user text matches at least one of the given keywords.
 *
 * <p>The text is indexed into a throw-away {@link MemoryIndex} under the field
 * {@code "text"}; every keyword is parsed as a query and added as an optional
 * (SHOULD) clause of a single boolean query. The text matches when that query
 * scores above zero.
 *
 * @param keywords the keywords; {@code null} or empty means nothing can match
 * @param userText the user text; {@code null} means nothing can match
 * @return true if at least one keyword query matches the text
 */
public boolean matchWithKeywords(List<String> keywords, String userText) {
    // Nothing to match against: answer directly instead of failing inside Lucene
    // (MemoryIndex rejects null text, and a null list cannot be iterated).
    if (keywords == null || keywords.isEmpty() || userText == null) {
        return false;
    }

    MemoryIndex index = new MemoryIndex();
    index.addField("text", userText, createEnglishAnalyzer());

    QueryParser parser = new QueryParser("text", createEnglishAnalyzer());
    BooleanQuery query = new BooleanQuery();
    for (String keyword : keywords) {
        if (keyword == null) {
            continue;
        }
        try {
            query.add(parser.parse(keyword), BooleanClause.Occur.SHOULD);
        } catch (ParseException ignored) {
            // Best effort: a keyword that is not valid query syntax is skipped so
            // that the remaining keywords can still be matched.
        }
    }

    float score = index.search(query);
    return score > 0.0f;
}
From source file:com.jaeksoft.searchlib.classifier.Classifier.java
License:Open Source License
public void classification(Client client, IndexDocument document) throws SearchLibException, ParseException, SyntaxError, IOException { rwl.r.lock();//from www.j a va 2 s.c om try { MemoryIndex index = new MemoryIndex(); LanguageEnum lang = document.getLang(); Analyzer analyzer = client.getSchema().getIndexPerFieldAnalyzer(lang); for (FieldContent fieldContent : document) { String fieldName = fieldContent.getField(); String concatValues = fieldContent.getMergedValues(" "); index.addField(fieldName, concatValues, analyzer); } if (method == ClassificationMethodEnum.MULTIVALUED) multivaluedClassification(client, document, lang, index); else if (method == ClassificationMethodEnum.BESTSCORE) bestScoreClassification(client, document, lang, index); } finally { rwl.r.unlock(); } }
From source file:com.jaeksoft.searchlib.parser.HtmlParser.java
License:Open Source License
/**
 * Parses an HTML stream and fills a new {@code ParserResultItem} with the title,
 * the raw HTML source, meta information, detected links, body text, language
 * and (when mapped) a generated title.
 *
 * <p>The method returns early, without producing the remaining fields, when no
 * HTML provider is available, when the page is non-canonical and non-canonical
 * pages are ignored, when the page is untitled and untitled pages are ignored,
 * or when the provider has no root node.
 *
 * @param streamLimiter source of the content, its detected charset and its origin URL
 * @param forcedLang    not referenced anywhere in this method body
 * @throws IOException        declared by the signature; also thrown to wrap a
 *                            {@code URISyntaxException} raised while resolving the current URL
 * @throws SearchLibException declared by the signature
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum forcedLang)
        throws IOException, SearchLibException {

    // --- Load parser configuration into instance fields ---
    titleBoost = getFloatProperty(ClassPropertyEnum.TITLE_BOOST);

    // TreeMap: iteration is ordered by tag name, so "h1" comes before "h2", etc.
    boostTagMap = new TreeMap<String, BoostTag>();
    boostTagMap.put("h1", new BoostTag(ClassPropertyEnum.H1_BOOST));
    boostTagMap.put("h2", new BoostTag(ClassPropertyEnum.H2_BOOST));
    boostTagMap.put("h3", new BoostTag(ClassPropertyEnum.H3_BOOST));
    boostTagMap.put("h4", new BoostTag(ClassPropertyEnum.H4_BOOST));
    boostTagMap.put("h5", new BoostTag(ClassPropertyEnum.H5_BOOST));
    boostTagMap.put("h6", new BoostTag(ClassPropertyEnum.H6_BOOST));

    ignoreMetaNoIndex = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOINDEX);
    ignoreMetaNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOFOLLOW);
    ignoreLinkNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_LINK_NOFOLLOW);
    ignoreUntitledDocuments = getBooleanProperty(ClassPropertyEnum.IGNORE_UNTITLED_DOCUMENTS);
    ignoreNonCanonical = getBooleanProperty(ClassPropertyEnum.IGNORE_NON_CANONICAL);

    // --- Choose the initial charset: source-document header, then detection, then default ---
    String currentCharset = null;
    String headerCharset = null;
    String detectedCharset = null;

    IndexDocument sourceDocument = getSourceDocument();
    if (sourceDocument != null) {
        FieldValueItem fieldValueItem = sourceDocument
                .getFieldValue(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0);
        if (fieldValueItem != null)
            headerCharset = fieldValueItem.getValue();
        // Fall back to the content-encoding field when no content-type charset is stored.
        if (headerCharset == null) {
            fieldValueItem = sourceDocument.getFieldValue(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(),
                    0);
            if (fieldValueItem != null)
                headerCharset = fieldValueItem.getValue();
        }
        currentCharset = headerCharset;
    }

    if (currentCharset == null) {
        detectedCharset = streamLimiter.getDetectedCharset();
        currentCharset = detectedCharset;
    }

    if (currentCharset == null) {
        currentCharset = getProperty(ClassPropertyEnum.DEFAULT_CHARSET).getValue();
    }

    // --- Build the HTML document provider ---
    String xPathExclusions = getProperty(ClassPropertyEnum.XPATH_EXCLUSION).getValue();
    // The set is only allocated when an exclusion expression is configured; it is
    // passed to the provider and later to getBodyTextContent.
    Set<Object> xPathExclusionsSet = null;
    if (!StringUtils.isEmpty(xPathExclusions))
        xPathExclusionsSet = new HashSet<Object>();

    HtmlParserEnum htmlParserEnum = HtmlParserEnum.find(getProperty(ClassPropertyEnum.HTML_PARSER).getValue());

    HtmlDocumentProvider htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
            xPathExclusions, xPathExclusionsSet);
    if (htmlProvider == null)
        return;

    // --- Resolve the current URL: base href, then stream origin URL, then source document URL ---
    URL currentURL = htmlProvider.getBaseHref();
    IndexDocument srcDoc = getSourceDocument();
    String streamOriginalUrl = streamLimiter.getOriginURL();
    try {
        if (currentURL == null && !StringUtils.isEmpty(streamOriginalUrl))
            currentURL = LinkUtils.newEncodedURL(streamOriginalUrl);
        if (currentURL == null && srcDoc != null) {
            FieldValueItem fvi = srcDoc.getFieldValue(UrlItemFieldEnum.INSTANCE.url.getName(), 0);
            if (fvi != null)
                currentURL = LinkUtils.newEncodedURL(fvi.getValue());
        }
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }

    // --- Canonical link handling ---
    URL canonicalURL = htmlProvider.getCanonicalLink(currentURL);
    if (canonicalURL != null) {
        String canUrl = canonicalURL.toExternalForm();
        addDetectedLink(canUrl);
        if (ignoreNonCanonical) {
            // NOTE(review): currentURL can still be null at this point (none of the
            // fallbacks above is guaranteed to succeed), which would make this call
            // throw a NullPointerException - confirm whether that can happen.
            String curUrl = currentURL.toExternalForm();
            if (!canUrl.equals(curUrl)) {
                isCanonical = false;
                return;
            }
        }
    }
    isCanonical = true;

    // --- Title ---
    String title = htmlProvider.getTitle();
    if (ignoreUntitledDocuments)
        if (title == null || title.length() == 0)
            return;

    ParserResultItem result = getNewParserResultItem();

    addFieldTitle(result, title);

    result.addField(ParserFieldEnum.htmlProvider, htmlProvider.getName());

    // Check ContentType charset in meta http-equiv
    String metaCharset = htmlProvider.getMetaCharset();

    String selectedCharset = selectCharset(headerCharset, metaCharset, detectedCharset);

    // Re-create the provider when the selected charset differs from the one used so far.
    if (selectedCharset != null) {
        if (!selectedCharset.equals(currentCharset)) {
            currentCharset = selectedCharset;
            // NOTE(review): unlike the first call, this result is not null-checked
            // before htmlProvider is dereferenced below - confirm it cannot be null.
            htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
                    xPathExclusions, xPathExclusionsSet);
        }
    }

    // --- Store the raw HTML source decoded with the current charset ---
    StringWriter writer = new StringWriter();
    IOUtils.copy(streamLimiter.getNewInputStream(), writer, currentCharset);
    result.addField(ParserFieldEnum.htmlSource, writer.toString());
    writer.close();

    HtmlNodeAbstract<?> rootNode = htmlProvider.getRootNode();
    if (rootNode == null)
        return;

    // --- Meta tags whose name starts with OPENSEARCHSERVER_FIELD map directly to fields ---
    for (HtmlNodeAbstract<?> metaNode : htmlProvider.getMetas()) {
        String metaName = metaNode.getAttributeText("name");
        if (metaName != null && metaName.startsWith(OPENSEARCHSERVER_FIELD)) {
            // The remainder of the name is split on '.' and passed as the field list.
            String field = metaName.substring(OPENSEARCHSERVER_FIELD_LENGTH);
            String[] fields = field.split("\\.");
            if (fields != null) {
                String content = metaNode.getAttributeText("content");
                result.addDirectFields(fields, content);
            }
        }
    }

    result.addField(ParserFieldEnum.charset, currentCharset);

    // --- Standard meta tags: keywords, description, robots, language hints ---
    String metaRobots = null;

    String metaDcLanguage = null;

    String metaContentLanguage = null;

    for (HtmlNodeAbstract<?> node : htmlProvider.getMetas()) {
        String attr_name = node.getAttributeText("name");
        String attr_http_equiv = node.getAttributeText("http-equiv");
        if ("keywords".equalsIgnoreCase(attr_name))
            result.addField(ParserFieldEnum.meta_keywords, HtmlDocumentProvider.getMetaContent(node));
        else if ("description".equalsIgnoreCase(attr_name))
            result.addField(ParserFieldEnum.meta_description, HtmlDocumentProvider.getMetaContent(node));
        else if ("robots".equalsIgnoreCase(attr_name))
            metaRobots = HtmlDocumentProvider.getMetaContent(node);
        else if ("dc.language".equalsIgnoreCase(attr_name))
            metaDcLanguage = HtmlDocumentProvider.getMetaContent(node);
        else if ("content-language".equalsIgnoreCase(attr_http_equiv))
            metaContentLanguage = HtmlDocumentProvider.getMetaContent(node);
    }

    // Robots directives are honoured unless the matching "ignore" property is set.
    boolean metaRobotsFollow = true;
    boolean metaRobotsNoIndex = false;
    if (metaRobots != null) {
        metaRobots = metaRobots.toLowerCase();
        if (metaRobots.contains("noindex") && !ignoreMetaNoIndex) {
            metaRobotsNoIndex = true;
            result.addField(ParserFieldEnum.meta_robots, "noindex");
        }
        if (metaRobots.contains("nofollow") && !ignoreMetaNoFollow) {
            metaRobotsFollow = false;
            result.addField(ParserFieldEnum.meta_robots, "nofollow");
        }
    }

    // --- Link extraction from <a>, <frame> and <img> nodes ---
    UrlFilterItem[] urlFilterList = getUrlFilterList();

    boolean removeFragment = ClassPropertyEnum.KEEP_REMOVE_LIST[1]
            .equalsIgnoreCase(getProperty(ClassPropertyEnum.URL_FRAGMENT).getValue());

    List<HtmlNodeAbstract<?>> nodes = rootNode.getAllNodes("a", "frame", "img");
    // Links are only collected when a source document exists and robots allow following.
    if (srcDoc != null && nodes != null && metaRobotsFollow) {
        for (HtmlNodeAbstract<?> node : nodes) {
            String href = null;
            String rel = null;
            String nodeName = node.getNodeName();
            if ("a".equals(nodeName)) {
                href = node.getAttributeText("href");
                rel = node.getAttributeText("rel");
            } else if ("frame".equals(nodeName) || "img".equals(nodeName)) {
                href = node.getAttributeText("src");
            }
            boolean follow = true;
            if (rel != null)
                if (rel.contains("nofollow") && !ignoreLinkNoFollow)
                    follow = false;
            URL newUrl = null;
            // Resolve the link only for non-javascript hrefs and when a current URL is known.
            if (href != null)
                if (!href.startsWith("javascript:"))
                    if (currentURL != null) {
                        href = StringEscapeUtils.unescapeXml(href);
                        newUrl = LinkUtils.getLink(currentURL, href, urlFilterList, removeFragment);
                    }
            if (newUrl != null) {
                // Same host (case-insensitive) => internal link, otherwise external.
                ParserFieldEnum field = null;
                if (newUrl.getHost().equalsIgnoreCase(currentURL.getHost())) {
                    if (follow)
                        field = ParserFieldEnum.internal_link;
                    else
                        field = ParserFieldEnum.internal_link_nofollow;
                } else {
                    if (follow)
                        field = ParserFieldEnum.external_link;
                    else
                        field = ParserFieldEnum.external_link_nofollow;
                }
                String link = newUrl.toExternalForm();
                result.addField(field, link);
                if (follow)
                    addDetectedLink(link);
            }
        }
    }

    // --- Body text (skipped when robots say "noindex") ---
    if (!metaRobotsNoIndex) {
        nodes = rootNode.getNodes("html", "body");
        // Fall back to the <html> node when there is no html/body node.
        if (nodes == null || nodes.size() == 0)
            nodes = rootNode.getNodes("html");
        if (nodes != null && nodes.size() > 0) {
            StringBuilder sb = new StringBuilder();
            getBodyTextContent(result, sb, nodes.get(0), true, null, 1024, xPathExclusionsSet);
            result.addField(ParserFieldEnum.body, sb);
        }
    }

    // Language identification:
    // tried in order: <html lang>, meta http-equiv content-language, meta dc.language.
    Locale lang = null;
    String langMethod = null;
    String[] pathHtml = { "html" };
    nodes = rootNode.getNodes(pathHtml);
    if (nodes != null && nodes.size() > 0) {
        langMethod = "html lang attribute";
        String l = nodes.get(0).getAttributeText("lang");
        if (l != null)
            lang = Lang.findLocaleISO639(l);
    }
    if (lang == null && metaContentLanguage != null) {
        langMethod = "meta http-equiv content-language";
        lang = Lang.findLocaleISO639(metaContentLanguage);
    }
    if (lang == null && metaDcLanguage != null) {
        langMethod = "meta dc.language";
        lang = Lang.findLocaleISO639(metaDcLanguage);
    }

    if (lang != null) {
        result.addField(ParserFieldEnum.lang, lang.getLanguage());
        result.addField(ParserFieldEnum.lang_method, langMethod);
    } else if (!metaRobotsNoIndex)
        // No declared language: run detection on the body field. The returned locale
        // is assigned to lang but not read again in this method.
        lang = result.langDetection(10000, ParserFieldEnum.body);

    // --- Generated title: "<host> - <first heading content or top body term>" ---
    if (getFieldMap().isMapped(ParserFieldEnum.generated_title)) {
        StringBuilder sb = new StringBuilder();
        try {
            if (!StringUtils.isEmpty(streamOriginalUrl))
                sb.append(new URI(streamOriginalUrl).getHost());
        } catch (URISyntaxException e) {
            // An invalid origin URL is logged; the title is then built without a host.
            Logging.error(e);
        }

        // Use the first boost tag (in map order) that has a non-null firstContent.
        String generatedTitle = null;
        for (Map.Entry<String, BoostTag> entry : boostTagMap.entrySet()) {
            BoostTag boostTag = entry.getValue();
            if (boostTag.firstContent != null) {
                generatedTitle = boostTag.firstContent;
                break;
            }
        }

        // No heading content: index the merged body text in a MemoryIndex and take
        // the first term returned by MoreLikeThis.retrieveInterestingTerms.
        if (generatedTitle == null) {
            final String FIELD_TITLE = "contents";

            MemoryIndex bodyMemoryIndex = new MemoryIndex();
            Analyzer bodyAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            String bodyText = result.getMergedBodyText(100000, " ", ParserFieldEnum.body);
            bodyMemoryIndex.addField(FIELD_TITLE, bodyText, bodyAnalyzer);

            IndexSearcher indexSearcher = bodyMemoryIndex.createSearcher();
            IndexReader indexReader = indexSearcher.getIndexReader();
            MoreLikeThis mlt = new MoreLikeThis(indexReader);
            mlt.setAnalyzer(bodyAnalyzer);
            mlt.setFieldNames(new String[] { FIELD_TITLE });
            mlt.setMinWordLen(3);
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);

            String[] words = mlt.retrieveInterestingTerms(0);
            if (words != null && words.length > 0)
                generatedTitle = words[0];
        }

        if (generatedTitle != null) {
            if (sb.length() > 0)
                sb.append(" - ");
            sb.append(generatedTitle);
        }

        // Titles longer than 67 characters are cut at the first space found from
        // index 60 onwards (or at 67 when there is none) and suffixed with "...".
        if (sb.length() > 67) {
            int pos = sb.indexOf(" ", 60);
            if (pos == -1)
                pos = 67;
            sb.setLength(pos);
            sb.append("...");
        }
        result.addField(ParserFieldEnum.generated_title, sb.toString());
    }

}
From source file:com.jaeksoft.searchlib.snippet.Fragment.java
License:Open Source License
public final double searchScore(final String fieldName, final CompiledAnalyzer analyzer, final Query query) { searchScore = 0;//from ww w . j av a 2s . com if (query == null || analyzer == null) return 0; MemoryIndex index = new MemoryIndex(); index.addField(fieldName, originalText, analyzer); searchScore = index.search(query); return searchScore; }
From source file:com.orientechnologies.lucene.operator.OLuceneTextOperator.java
License:Apache License
@Override public Object evaluateRecord(OIdentifiable iRecord, ODocument iCurrentResult, OSQLFilterCondition iCondition, Object iLeft, Object iRight, OCommandContext iContext) { OLuceneFullTextIndex index = involvedIndex(iRecord, iCurrentResult, iCondition, iLeft, iRight); if (index == null) { throw new OCommandExecutionException("Cannot evaluate lucene condition without index configuration."); }//from w ww. j ava 2s.com MemoryIndex memoryIndex = (MemoryIndex) iContext.getVariable("_memoryIndex"); if (memoryIndex == null) { memoryIndex = new MemoryIndex(); iContext.setVariable("_memoryIndex", memoryIndex); } memoryIndex.reset(); Document doc = index.buildDocument(iLeft); for (IndexableField field : doc.getFields()) { memoryIndex.addField(field.name(), field.stringValue(), index.analyzer(field.name())); } Query query = null; try { query = index.buildQuery(iRight); } catch (Exception e) { throw new OCommandExecutionException("Error executing lucene query.", e); } return memoryIndex.search(query) > 0.0f; }
From source file:com.orientechnologies.lucene.test.LuceneBooleanIndexTest.java
License:Apache License
@Test public void testMemoryIndex() throws ParseException { // TODO To be used in evaluate Record MemoryIndex index = new MemoryIndex(); Document doc = new Document(); doc.add(new StringField("text", "my text", Field.Store.YES)); StandardAnalyzer analyzer = new StandardAnalyzer(); for (IndexableField field : doc.getFields()) { index.addField(field.name(), field.stringValue(), analyzer); }//from w ww . j a va 2s.c o m QueryParser parser = new QueryParser("text", analyzer); float score = index.search(parser.parse("+text:my")); }
From source file:com.orientechnologies.lucene.tx.OLuceneTxChangesMultiRid.java
License:Apache License
/**
 * Tells whether the given document is matched by any delete query recorded for
 * the identity of {@code value}.
 *
 * <p>The document is indexed once into a {@link MemoryIndex} (every field
 * analyzed with a {@code KeywordAnalyzer}); each recorded entry is turned into a
 * delete query by the engine and searched against that index. Evaluation stops
 * at the first matching query.
 *
 * @param document the document to test
 * @param key      not used by this method
 * @param value    the record whose identity selects the recorded delete entries
 * @return true if at least one delete query matches the document; false when no
 *         entries are recorded or none matches
 */
public boolean isDeleted(Document document, Object key, OIdentifiable value) {
    List<String> deletedEntries = deleted.get(value.getIdentity().toString());
    if (deletedEntries == null || deletedEntries.isEmpty()) {
        return false;
    }

    // The document does not change between queries, so it is indexed a single
    // time with a single analyzer instead of being rebuilt for every entry.
    MemoryIndex memoryIndex = new MemoryIndex();
    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    for (IndexableField field : document.getFields()) {
        memoryIndex.addField(field.name(), field.stringValue(), keywordAnalyzer);
    }

    for (String entry : deletedEntries) {
        Query deleteQuery = engine.deleteQuery(entry, value);
        if (memoryIndex.search(deleteQuery) > 0.0f) {
            return true;
        }
    }
    return false;
}
From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java
License:Apache License
private IndexReader getReaderForField(String field) throws IOException { if (wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) { tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); cachedTokenStream = true;// w w w. j a v a 2 s .c o m } IndexReader reader = readers.get(field); if (reader == null) { MemoryIndex indexer = new MemoryIndex(); indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); tokenStream.reset(); IndexSearcher searcher = indexer.createSearcher(); reader = searcher.getIndexReader(); readers.put(field, reader); } return reader; }
From source file:org.apache.tika.eval.tokens.LuceneTokenCounter.java
License:Apache License
/**
 * Creates a token counter backed by a fresh {@link MemoryIndex} and keeps a
 * {@code LeafReader} view obtained from a searcher on that index.
 *
 * @param generalAnalyzer the analyzer stored for later use by this counter
 * @throws IOException declared by the signature
 */
public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException {
    this.generalAnalyzer = generalAnalyzer;
    this.memoryIndex = new MemoryIndex();
    // The reader obtained from the MemoryIndex searcher is cast to LeafReader.
    this.leafReader = (LeafReader) this.memoryIndex.createSearcher().getIndexReader();
}