List of usage examples for org.apache.lucene.index.memory MemoryIndex addField
public void addField(String fieldName, TokenStream stream, int positionIncrementGap)
From source file:ch.sentric.hbase.prospective.Percolator.java
License:Apache License
/** * Tries to find a set of queries that match the given document. * * @param doc/* w w w. j a v a 2 s . com*/ * the Lucene document * @return the matching queries * @throws IOException * if an I/O error occurs */ public Response<T> percolate(final Document doc, final Map<T, Query> queries) throws IOException { // first, parse the source doc into a MemoryIndex final MemoryIndex memoryIndex = new MemoryIndex(); for (final Fieldable field : doc.getFields()) { if (!field.isIndexed()) { continue; } final TokenStream tokenStream = field.tokenStreamValue(); if (tokenStream != null) { memoryIndex.addField(field.name(), tokenStream, field.getBoost()); } else { final Reader reader = field.readerValue(); if (reader != null) { memoryIndex.addField(field.name(), analyzer.reusableTokenStream(field.name(), reader), field.getBoost()); } else { final String value = field.stringValue(); if (value != null) { memoryIndex.addField(field.name(), analyzer.reusableTokenStream(field.name(), new CharSequenceReader(value)), field.getBoost()); } } } } // do the search final IndexSearcher searcher = memoryIndex.createSearcher(); final Map<T, Query> matches = new HashMap<T, Query>(0); if (queries != null && !queries.isEmpty()) { final ExistsCollector collector = new ExistsCollector(); for (final Map.Entry<T, Query> entry : queries.entrySet()) { collector.reset(); searcher.search(entry.getValue(), collector); if (collector.exists()) { matches.put(entry.getKey(), entry.getValue()); } } } return new Response<T>(matches); }
From source file:com.appspot.socialinquirer.server.service.impl.StackExchangeServiceImpl.java
License:Apache License
/**
 * Checks whether the user text matches at least one of the keywords.
 *
 * <p>The text is indexed in memory under the field {@code "text"}. Each keyword is parsed
 * as a query against that field and the parsed queries are combined as optional (SHOULD)
 * clauses. Keywords that are {@code null} or cannot be parsed are skipped.
 *
 * @param keywords the keywords; {@code null} or an empty list never matches
 * @param userText the user text; {@code null} never matches
 * @return true, if the combined query scores above zero against the text
 */
public boolean matchWithKeywords(List<String> keywords, String userText) {
    // Nothing to index or nothing to look for: by definition there is no match.
    // (MemoryIndex rejects a null text, and iterating a null list would fail.)
    if (userText == null || keywords == null || keywords.isEmpty()) {
        return false;
    }

    MemoryIndex index = new MemoryIndex();
    index.addField("text", userText, createEnglishAnalyzer());

    QueryParser parser = new QueryParser("text", createEnglishAnalyzer());
    BooleanQuery query = new BooleanQuery();
    for (String keyword : keywords) {
        if (keyword == null) {
            continue;
        }
        try {
            query.add(parser.parse(keyword), BooleanClause.Occur.SHOULD);
        } catch (ParseException ignored) {
            // Best effort: a keyword that is not valid query syntax simply cannot
            // contribute to the match; the remaining keywords are still evaluated.
        }
    }

    return index.search(query) > 0.0f;
}
From source file:com.jaeksoft.searchlib.classifier.Classifier.java
License:Open Source License
/**
 * Classifies a document using the configured classification method.
 *
 * <p>Every field of the document is loaded into an in-memory index (its values merged with
 * a single space, analyzed with the schema's per-field index analyzer for the document
 * language), then the index is handed to the multivalued or best-score routine. Any other
 * value of {@code method} leaves the document untouched. The whole operation runs under
 * the read lock.
 *
 * @param client   the client whose schema supplies the analyzer
 * @param document the document to classify
 */
public void classification(Client client, IndexDocument document)
        throws SearchLibException, ParseException, SyntaxError, IOException {
    rwl.r.lock();
    try {
        LanguageEnum documentLang = document.getLang();
        Analyzer perFieldAnalyzer = client.getSchema().getIndexPerFieldAnalyzer(documentLang);

        MemoryIndex memoryIndex = new MemoryIndex();
        for (FieldContent content : document) {
            memoryIndex.addField(content.getField(), content.getMergedValues(" "), perFieldAnalyzer);
        }

        if (method == ClassificationMethodEnum.MULTIVALUED) {
            multivaluedClassification(client, document, documentLang, memoryIndex);
        } else if (method == ClassificationMethodEnum.BESTSCORE) {
            bestScoreClassification(client, document, documentLang, memoryIndex);
        }
    } finally {
        rwl.r.unlock();
    }
}
From source file:com.jaeksoft.searchlib.parser.HtmlParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum forcedLang) throws IOException, SearchLibException { titleBoost = getFloatProperty(ClassPropertyEnum.TITLE_BOOST); boostTagMap = new TreeMap<String, BoostTag>(); boostTagMap.put("h1", new BoostTag(ClassPropertyEnum.H1_BOOST)); boostTagMap.put("h2", new BoostTag(ClassPropertyEnum.H2_BOOST)); boostTagMap.put("h3", new BoostTag(ClassPropertyEnum.H3_BOOST)); boostTagMap.put("h4", new BoostTag(ClassPropertyEnum.H4_BOOST)); boostTagMap.put("h5", new BoostTag(ClassPropertyEnum.H5_BOOST)); boostTagMap.put("h6", new BoostTag(ClassPropertyEnum.H6_BOOST)); ignoreMetaNoIndex = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOINDEX); ignoreMetaNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOFOLLOW); ignoreLinkNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_LINK_NOFOLLOW); ignoreUntitledDocuments = getBooleanProperty(ClassPropertyEnum.IGNORE_UNTITLED_DOCUMENTS); ignoreNonCanonical = getBooleanProperty(ClassPropertyEnum.IGNORE_NON_CANONICAL); String currentCharset = null; String headerCharset = null;// www . ja va 2s . 
com String detectedCharset = null; IndexDocument sourceDocument = getSourceDocument(); if (sourceDocument != null) { FieldValueItem fieldValueItem = sourceDocument .getFieldValue(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0); if (fieldValueItem != null) headerCharset = fieldValueItem.getValue(); if (headerCharset == null) { fieldValueItem = sourceDocument.getFieldValue(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(), 0); if (fieldValueItem != null) headerCharset = fieldValueItem.getValue(); } currentCharset = headerCharset; } if (currentCharset == null) { detectedCharset = streamLimiter.getDetectedCharset(); currentCharset = detectedCharset; } if (currentCharset == null) { currentCharset = getProperty(ClassPropertyEnum.DEFAULT_CHARSET).getValue(); } String xPathExclusions = getProperty(ClassPropertyEnum.XPATH_EXCLUSION).getValue(); Set<Object> xPathExclusionsSet = null; if (!StringUtils.isEmpty(xPathExclusions)) xPathExclusionsSet = new HashSet<Object>(); HtmlParserEnum htmlParserEnum = HtmlParserEnum.find(getProperty(ClassPropertyEnum.HTML_PARSER).getValue()); HtmlDocumentProvider htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter, xPathExclusions, xPathExclusionsSet); if (htmlProvider == null) return; URL currentURL = htmlProvider.getBaseHref(); IndexDocument srcDoc = getSourceDocument(); String streamOriginalUrl = streamLimiter.getOriginURL(); try { if (currentURL == null && !StringUtils.isEmpty(streamOriginalUrl)) currentURL = LinkUtils.newEncodedURL(streamOriginalUrl); if (currentURL == null && srcDoc != null) { FieldValueItem fvi = srcDoc.getFieldValue(UrlItemFieldEnum.INSTANCE.url.getName(), 0); if (fvi != null) currentURL = LinkUtils.newEncodedURL(fvi.getValue()); } } catch (URISyntaxException e) { throw new IOException(e); } URL canonicalURL = htmlProvider.getCanonicalLink(currentURL); if (canonicalURL != null) { String canUrl = canonicalURL.toExternalForm(); addDetectedLink(canUrl); if 
(ignoreNonCanonical) { String curUrl = currentURL.toExternalForm(); if (!canUrl.equals(curUrl)) { isCanonical = false; return; } } } isCanonical = true; String title = htmlProvider.getTitle(); if (ignoreUntitledDocuments) if (title == null || title.length() == 0) return; ParserResultItem result = getNewParserResultItem(); addFieldTitle(result, title); result.addField(ParserFieldEnum.htmlProvider, htmlProvider.getName()); // Check ContentType charset in meta http-equiv String metaCharset = htmlProvider.getMetaCharset(); String selectedCharset = selectCharset(headerCharset, metaCharset, detectedCharset); if (selectedCharset != null) { if (!selectedCharset.equals(currentCharset)) { currentCharset = selectedCharset; htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter, xPathExclusions, xPathExclusionsSet); } } StringWriter writer = new StringWriter(); IOUtils.copy(streamLimiter.getNewInputStream(), writer, currentCharset); result.addField(ParserFieldEnum.htmlSource, writer.toString()); writer.close(); HtmlNodeAbstract<?> rootNode = htmlProvider.getRootNode(); if (rootNode == null) return; for (HtmlNodeAbstract<?> metaNode : htmlProvider.getMetas()) { String metaName = metaNode.getAttributeText("name"); if (metaName != null && metaName.startsWith(OPENSEARCHSERVER_FIELD)) { String field = metaName.substring(OPENSEARCHSERVER_FIELD_LENGTH); String[] fields = field.split("\\."); if (fields != null) { String content = metaNode.getAttributeText("content"); result.addDirectFields(fields, content); } } } result.addField(ParserFieldEnum.charset, currentCharset); String metaRobots = null; String metaDcLanguage = null; String metaContentLanguage = null; for (HtmlNodeAbstract<?> node : htmlProvider.getMetas()) { String attr_name = node.getAttributeText("name"); String attr_http_equiv = node.getAttributeText("http-equiv"); if ("keywords".equalsIgnoreCase(attr_name)) result.addField(ParserFieldEnum.meta_keywords, 
HtmlDocumentProvider.getMetaContent(node)); else if ("description".equalsIgnoreCase(attr_name)) result.addField(ParserFieldEnum.meta_description, HtmlDocumentProvider.getMetaContent(node)); else if ("robots".equalsIgnoreCase(attr_name)) metaRobots = HtmlDocumentProvider.getMetaContent(node); else if ("dc.language".equalsIgnoreCase(attr_name)) metaDcLanguage = HtmlDocumentProvider.getMetaContent(node); else if ("content-language".equalsIgnoreCase(attr_http_equiv)) metaContentLanguage = HtmlDocumentProvider.getMetaContent(node); } boolean metaRobotsFollow = true; boolean metaRobotsNoIndex = false; if (metaRobots != null) { metaRobots = metaRobots.toLowerCase(); if (metaRobots.contains("noindex") && !ignoreMetaNoIndex) { metaRobotsNoIndex = true; result.addField(ParserFieldEnum.meta_robots, "noindex"); } if (metaRobots.contains("nofollow") && !ignoreMetaNoFollow) { metaRobotsFollow = false; result.addField(ParserFieldEnum.meta_robots, "nofollow"); } } UrlFilterItem[] urlFilterList = getUrlFilterList(); boolean removeFragment = ClassPropertyEnum.KEEP_REMOVE_LIST[1] .equalsIgnoreCase(getProperty(ClassPropertyEnum.URL_FRAGMENT).getValue()); List<HtmlNodeAbstract<?>> nodes = rootNode.getAllNodes("a", "frame", "img"); if (srcDoc != null && nodes != null && metaRobotsFollow) { for (HtmlNodeAbstract<?> node : nodes) { String href = null; String rel = null; String nodeName = node.getNodeName(); if ("a".equals(nodeName)) { href = node.getAttributeText("href"); rel = node.getAttributeText("rel"); } else if ("frame".equals(nodeName) || "img".equals(nodeName)) { href = node.getAttributeText("src"); } boolean follow = true; if (rel != null) if (rel.contains("nofollow") && !ignoreLinkNoFollow) follow = false; URL newUrl = null; if (href != null) if (!href.startsWith("javascript:")) if (currentURL != null) { href = StringEscapeUtils.unescapeXml(href); newUrl = LinkUtils.getLink(currentURL, href, urlFilterList, removeFragment); } if (newUrl != null) { ParserFieldEnum field = null; if 
(newUrl.getHost().equalsIgnoreCase(currentURL.getHost())) { if (follow) field = ParserFieldEnum.internal_link; else field = ParserFieldEnum.internal_link_nofollow; } else { if (follow) field = ParserFieldEnum.external_link; else field = ParserFieldEnum.external_link_nofollow; } String link = newUrl.toExternalForm(); result.addField(field, link); if (follow) addDetectedLink(link); } } } if (!metaRobotsNoIndex) { nodes = rootNode.getNodes("html", "body"); if (nodes == null || nodes.size() == 0) nodes = rootNode.getNodes("html"); if (nodes != null && nodes.size() > 0) { StringBuilder sb = new StringBuilder(); getBodyTextContent(result, sb, nodes.get(0), true, null, 1024, xPathExclusionsSet); result.addField(ParserFieldEnum.body, sb); } } // Identification de la langue: Locale lang = null; String langMethod = null; String[] pathHtml = { "html" }; nodes = rootNode.getNodes(pathHtml); if (nodes != null && nodes.size() > 0) { langMethod = "html lang attribute"; String l = nodes.get(0).getAttributeText("lang"); if (l != null) lang = Lang.findLocaleISO639(l); } if (lang == null && metaContentLanguage != null) { langMethod = "meta http-equiv content-language"; lang = Lang.findLocaleISO639(metaContentLanguage); } if (lang == null && metaDcLanguage != null) { langMethod = "meta dc.language"; lang = Lang.findLocaleISO639(metaDcLanguage); } if (lang != null) { result.addField(ParserFieldEnum.lang, lang.getLanguage()); result.addField(ParserFieldEnum.lang_method, langMethod); } else if (!metaRobotsNoIndex) lang = result.langDetection(10000, ParserFieldEnum.body); if (getFieldMap().isMapped(ParserFieldEnum.generated_title)) { StringBuilder sb = new StringBuilder(); try { if (!StringUtils.isEmpty(streamOriginalUrl)) sb.append(new URI(streamOriginalUrl).getHost()); } catch (URISyntaxException e) { Logging.error(e); } String generatedTitle = null; for (Map.Entry<String, BoostTag> entry : boostTagMap.entrySet()) { BoostTag boostTag = entry.getValue(); if (boostTag.firstContent != 
null) { generatedTitle = boostTag.firstContent; break; } } if (generatedTitle == null) { final String FIELD_TITLE = "contents"; MemoryIndex bodyMemoryIndex = new MemoryIndex(); Analyzer bodyAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36); String bodyText = result.getMergedBodyText(100000, " ", ParserFieldEnum.body); bodyMemoryIndex.addField(FIELD_TITLE, bodyText, bodyAnalyzer); IndexSearcher indexSearcher = bodyMemoryIndex.createSearcher(); IndexReader indexReader = indexSearcher.getIndexReader(); MoreLikeThis mlt = new MoreLikeThis(indexReader); mlt.setAnalyzer(bodyAnalyzer); mlt.setFieldNames(new String[] { FIELD_TITLE }); mlt.setMinWordLen(3); mlt.setMinTermFreq(1); mlt.setMinDocFreq(1); String[] words = mlt.retrieveInterestingTerms(0); if (words != null && words.length > 0) generatedTitle = words[0]; } if (generatedTitle != null) { if (sb.length() > 0) sb.append(" - "); sb.append(generatedTitle); } if (sb.length() > 67) { int pos = sb.indexOf(" ", 60); if (pos == -1) pos = 67; sb.setLength(pos); sb.append("..."); } result.addField(ParserFieldEnum.generated_title, sb.toString()); } }
From source file:com.jaeksoft.searchlib.snippet.Fragment.java
License:Open Source License
public final double searchScore(final String fieldName, final CompiledAnalyzer analyzer, final Query query) { searchScore = 0;//from w w w.j av a2 s. c om if (query == null || analyzer == null) return 0; MemoryIndex index = new MemoryIndex(); index.addField(fieldName, originalText, analyzer); searchScore = index.search(query); return searchScore; }
From source file:com.orientechnologies.lucene.operator.OLuceneTextOperator.java
License:Apache License
@Override public Object evaluateRecord(OIdentifiable iRecord, ODocument iCurrentResult, OSQLFilterCondition iCondition, Object iLeft, Object iRight, OCommandContext iContext) { OLuceneFullTextIndex index = involvedIndex(iRecord, iCurrentResult, iCondition, iLeft, iRight); if (index == null) { throw new OCommandExecutionException("Cannot evaluate lucene condition without index configuration."); }//from w w w.ja va 2 s. c om MemoryIndex memoryIndex = (MemoryIndex) iContext.getVariable("_memoryIndex"); if (memoryIndex == null) { memoryIndex = new MemoryIndex(); iContext.setVariable("_memoryIndex", memoryIndex); } memoryIndex.reset(); Document doc = index.buildDocument(iLeft); for (IndexableField field : doc.getFields()) { memoryIndex.addField(field.name(), field.stringValue(), index.analyzer(field.name())); } Query query = null; try { query = index.buildQuery(iRight); } catch (Exception e) { throw new OCommandExecutionException("Error executing lucene query.", e); } return memoryIndex.search(query) > 0.0f; }
From source file:com.orientechnologies.lucene.test.LuceneBooleanIndexTest.java
License:Apache License
@Test public void testMemoryIndex() throws ParseException { // TODO To be used in evaluate Record MemoryIndex index = new MemoryIndex(); Document doc = new Document(); doc.add(new StringField("text", "my text", Field.Store.YES)); StandardAnalyzer analyzer = new StandardAnalyzer(); for (IndexableField field : doc.getFields()) { index.addField(field.name(), field.stringValue(), analyzer); }//ww w . j av a 2s.co m QueryParser parser = new QueryParser("text", analyzer); float score = index.search(parser.parse("+text:my")); }
From source file:com.orientechnologies.lucene.tx.OLuceneTxChangesMultiRid.java
License:Apache License
public boolean isDeleted(Document document, Object key, OIdentifiable value) { boolean match = false; List<String> strings = deleted.get(value.getIdentity().toString()); if (strings != null) { MemoryIndex memoryIndex = new MemoryIndex(); for (String string : strings) { Query q = engine.deleteQuery(string, value); memoryIndex.reset();// w w w.ja v a 2 s. c om for (IndexableField field : document.getFields()) { memoryIndex.addField(field.name(), field.stringValue(), new KeywordAnalyzer()); } match = match || (memoryIndex.search(q) > 0.0f); } return match; } return match; }
From source file:org.elasticsearch.index.percolator.ExtractQueryTermsServiceTests.java
License:Apache License
public void testCreateQueryMetadataQuery() throws Exception { MemoryIndex memoryIndex = new MemoryIndex(false); memoryIndex.addField("field1", "the quick brown fox jumps over the lazy dog", new WhitespaceAnalyzer()); memoryIndex.addField("field2", "some more text", new WhitespaceAnalyzer()); memoryIndex.addField("_field3", "unhide me", new WhitespaceAnalyzer()); memoryIndex.addField("field4", "123", new WhitespaceAnalyzer()); IndexReader indexReader = memoryIndex.createSearcher().getIndexReader(); Query query = ExtractQueryTermsService.createQueryTermsQuery(indexReader, QUERY_TERMS_FIELD, UNKNOWN_QUERY_FIELD); assertThat(query, instanceOf(TermsQuery.class)); // no easy way to get to the terms in TermsQuery, // if there a less then 16 terms then it gets rewritten to bq and then we can easily check the terms BooleanQuery booleanQuery = (BooleanQuery) ((ConstantScoreQuery) query.rewrite(indexReader)).getQuery(); assertThat(booleanQuery.clauses().size(), equalTo(15)); assertClause(booleanQuery, 0, QUERY_TERMS_FIELD, "_field3\u0000me"); assertClause(booleanQuery, 1, QUERY_TERMS_FIELD, "_field3\u0000unhide"); assertClause(booleanQuery, 2, QUERY_TERMS_FIELD, "field1\u0000brown"); assertClause(booleanQuery, 3, QUERY_TERMS_FIELD, "field1\u0000dog"); assertClause(booleanQuery, 4, QUERY_TERMS_FIELD, "field1\u0000fox"); assertClause(booleanQuery, 5, QUERY_TERMS_FIELD, "field1\u0000jumps"); assertClause(booleanQuery, 6, QUERY_TERMS_FIELD, "field1\u0000lazy"); assertClause(booleanQuery, 7, QUERY_TERMS_FIELD, "field1\u0000over"); assertClause(booleanQuery, 8, QUERY_TERMS_FIELD, "field1\u0000quick"); assertClause(booleanQuery, 9, QUERY_TERMS_FIELD, "field1\u0000the"); assertClause(booleanQuery, 10, QUERY_TERMS_FIELD, "field2\u0000more"); assertClause(booleanQuery, 11, QUERY_TERMS_FIELD, "field2\u0000some"); assertClause(booleanQuery, 12, QUERY_TERMS_FIELD, "field2\u0000text"); assertClause(booleanQuery, 13, QUERY_TERMS_FIELD, "field4\u0000123"); assertClause(booleanQuery, 14, 
UNKNOWN_QUERY_FIELD, ""); }
From source file:org.elasticsearch.index.query.MoreLikeThisQueryBuilderTests.java
License:Apache License
/** * Here we could go overboard and use a pre-generated indexed random document for a given Item, * but for now we'd prefer to simply return the id as the content of the document and that for * every field./* ww w .j a v a2 s . com*/ */ private static Fields generateFields(String[] fieldNames, String text) throws IOException { MemoryIndex index = new MemoryIndex(); for (String fieldName : fieldNames) { index.addField(fieldName, text, new WhitespaceAnalyzer()); } return MultiFields.getFields(index.createSearcher().getIndexReader()); }