Example usage for org.apache.lucene.index.memory MemoryIndex addField

List of usage examples for org.apache.lucene.index.memory MemoryIndex addField

Introduction

This page collects example usages of org.apache.lucene.index.memory MemoryIndex addField.

Prototype

public void addField(String fieldName, TokenStream stream, int positionIncrementGap) 

Document

Iterates over the given token stream and adds the resulting terms to the index; equivalent to adding a tokenized, indexed, termVectorStored, unstored Lucene org.apache.lucene.document.Field.
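
A minimal sketch of this overload, assuming a Lucene version in which addField(String, TokenStream, int) is available as shown in the prototype above; the field name, sample text, and analyzer are illustrative only:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.memory.MemoryIndex;

public class MemoryIndexAddFieldSketch {
    public static void main(String[] args) throws IOException {
        MemoryIndex index = new MemoryIndex();
        Analyzer analyzer = new StandardAnalyzer();

        // Tokenize the text ourselves and hand the resulting stream to the index.
        TokenStream stream = analyzer.tokenStream("content", "the quick brown fox");

        // The third argument is the position increment gap inserted between multiple
        // values of the same field name; 10 is an arbitrary illustrative value.
        index.addField("content", stream, 10);

        // The single in-memory document can now be scored against ad-hoc queries,
        // for example: float score = index.search(query);
        analyzer.close();
    }
}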

Usage

From source file: ch.sentric.hbase.prospective.Percolator.java

License: Apache License

/**
* Tries to find a set of queries that match the given document.
* 
* @param doc
*            the Lucene document
* @return the matching queries
* @throws IOException
*             if an I/O error occurs
*/
public Response<T> percolate(final Document doc, final Map<T, Query> queries) throws IOException {
    // first, parse the source doc into a MemoryIndex
    final MemoryIndex memoryIndex = new MemoryIndex();

    for (final Fieldable field : doc.getFields()) {
        if (!field.isIndexed()) {
            continue;
        }

        final TokenStream tokenStream = field.tokenStreamValue();
        if (tokenStream != null) {
            memoryIndex.addField(field.name(), tokenStream, field.getBoost());
        } else {
            final Reader reader = field.readerValue();
            if (reader != null) {
                memoryIndex.addField(field.name(), analyzer.reusableTokenStream(field.name(), reader),
                        field.getBoost());
            } else {
                final String value = field.stringValue();
                if (value != null) {
                    memoryIndex.addField(field.name(),
                            analyzer.reusableTokenStream(field.name(), new CharSequenceReader(value)),
                            field.getBoost());
                }
            }
        }
    }

    // do the search
    final IndexSearcher searcher = memoryIndex.createSearcher();
    final Map<T, Query> matches = new HashMap<T, Query>(0);

    if (queries != null && !queries.isEmpty()) {
        final ExistsCollector collector = new ExistsCollector();
        for (final Map.Entry<T, Query> entry : queries.entrySet()) {
            collector.reset();
            searcher.search(entry.getValue(), collector);
            if (collector.exists()) {
                matches.put(entry.getKey(), entry.getValue());
            }
        }
    }

    return new Response<T>(matches);
}
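
For context, a hypothetical call site could look roughly like the following; the percolator instance, the String key type, and the field names are assumptions rather than part of the source above, and the Field constructor follows the Lucene 3.x API implied by the Fieldable usage in the method:

// Register the standing queries, keyed by an application-specific identifier.
Map<String, Query> queries = new HashMap<String, Query>();
queries.put("alert-lucene", new TermQuery(new Term("body", "lucene")));

// Build the incoming document and ask the percolator which registered queries match it.
Document doc = new Document();
doc.add(new Field("body", "percolating lucene documents entirely in memory",
        Field.Store.NO, Field.Index.ANALYZED));

Response<String> response = percolator.percolate(doc, queries);
// The returned Response wraps the map of queries that matched the document.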

From source file: com.appspot.socialinquirer.server.service.impl.StackExchangeServiceImpl.java

License: Apache License

/**
 * Match with keywords.
 *
 * @param keywords the keywords
 * @param userText the user text
 * @return true, if successful
 */
public boolean matchWithKeywords(List<String> keywords, String userText) {
    MemoryIndex index = new MemoryIndex();
    index.addField("text", userText, createEnglishAnalyzer());
    QueryParser parser = new QueryParser("text", createEnglishAnalyzer());
    BooleanQuery query = new BooleanQuery();
    for (String keyword : keywords) {
        try {
            query.add(parser.parse(keyword), BooleanClause.Occur.SHOULD);
        } catch (ParseException e) {
            // Skip keywords that fail to parse; they simply do not contribute to the query.
        }
    }

    float score = index.search(query);

    return score > 0.0f;
}
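
A hypothetical call site, assuming the surrounding service class is available as stackExchangeService; the keyword list and text are illustrative only:

List<String> keywords = Arrays.asList("lucene", "memory", "index");
String userText = "Scoring a single piece of text against keyword queries with an in-memory Lucene index";
boolean matched = stackExchangeService.matchWithKeywords(keywords, userText);
// true if at least one parsed keyword query scores above zero against the text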

From source file: com.jaeksoft.searchlib.classifier.Classifier.java

License: Open Source License

public void classification(Client client, IndexDocument document)
        throws SearchLibException, ParseException, SyntaxError, IOException {
    rwl.r.lock();
    try {
        MemoryIndex index = new MemoryIndex();
        LanguageEnum lang = document.getLang();
        Analyzer analyzer = client.getSchema().getIndexPerFieldAnalyzer(lang);
        for (FieldContent fieldContent : document) {
            String fieldName = fieldContent.getField();
            String concatValues = fieldContent.getMergedValues(" ");
            index.addField(fieldName, concatValues, analyzer);
        }
        if (method == ClassificationMethodEnum.MULTIVALUED)
            multivaluedClassification(client, document, lang, index);
        else if (method == ClassificationMethodEnum.BESTSCORE)
            bestScoreClassification(client, document, lang, index);

    } finally {
        rwl.r.unlock();
    }
}

From source file: com.jaeksoft.searchlib.parser.HtmlParser.java

License: Open Source License

@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum forcedLang)
        throws IOException, SearchLibException {

    titleBoost = getFloatProperty(ClassPropertyEnum.TITLE_BOOST);
    boostTagMap = new TreeMap<String, BoostTag>();
    boostTagMap.put("h1", new BoostTag(ClassPropertyEnum.H1_BOOST));
    boostTagMap.put("h2", new BoostTag(ClassPropertyEnum.H2_BOOST));
    boostTagMap.put("h3", new BoostTag(ClassPropertyEnum.H3_BOOST));
    boostTagMap.put("h4", new BoostTag(ClassPropertyEnum.H4_BOOST));
    boostTagMap.put("h5", new BoostTag(ClassPropertyEnum.H5_BOOST));
    boostTagMap.put("h6", new BoostTag(ClassPropertyEnum.H6_BOOST));
    ignoreMetaNoIndex = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOINDEX);
    ignoreMetaNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOFOLLOW);
    ignoreLinkNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_LINK_NOFOLLOW);
    ignoreUntitledDocuments = getBooleanProperty(ClassPropertyEnum.IGNORE_UNTITLED_DOCUMENTS);
    ignoreNonCanonical = getBooleanProperty(ClassPropertyEnum.IGNORE_NON_CANONICAL);

    String currentCharset = null;
    String headerCharset = null;
    String detectedCharset = null;

    IndexDocument sourceDocument = getSourceDocument();

    if (sourceDocument != null) {
        FieldValueItem fieldValueItem = sourceDocument
                .getFieldValue(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0);
        if (fieldValueItem != null)
            headerCharset = fieldValueItem.getValue();
        if (headerCharset == null) {
            fieldValueItem = sourceDocument.getFieldValue(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(),
                    0);
            if (fieldValueItem != null)
                headerCharset = fieldValueItem.getValue();
        }
        currentCharset = headerCharset;
    }

    if (currentCharset == null) {
        detectedCharset = streamLimiter.getDetectedCharset();
        currentCharset = detectedCharset;
    }

    if (currentCharset == null) {
        currentCharset = getProperty(ClassPropertyEnum.DEFAULT_CHARSET).getValue();
    }

    String xPathExclusions = getProperty(ClassPropertyEnum.XPATH_EXCLUSION).getValue();
    Set<Object> xPathExclusionsSet = null;
    if (!StringUtils.isEmpty(xPathExclusions))
        xPathExclusionsSet = new HashSet<Object>();

    HtmlParserEnum htmlParserEnum = HtmlParserEnum.find(getProperty(ClassPropertyEnum.HTML_PARSER).getValue());

    HtmlDocumentProvider htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
            xPathExclusions, xPathExclusionsSet);
    if (htmlProvider == null)
        return;

    URL currentURL = htmlProvider.getBaseHref();
    IndexDocument srcDoc = getSourceDocument();
    String streamOriginalUrl = streamLimiter.getOriginURL();
    try {
        if (currentURL == null && !StringUtils.isEmpty(streamOriginalUrl))
            currentURL = LinkUtils.newEncodedURL(streamOriginalUrl);
        if (currentURL == null && srcDoc != null) {
            FieldValueItem fvi = srcDoc.getFieldValue(UrlItemFieldEnum.INSTANCE.url.getName(), 0);
            if (fvi != null)
                currentURL = LinkUtils.newEncodedURL(fvi.getValue());
        }
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }

    URL canonicalURL = htmlProvider.getCanonicalLink(currentURL);
    if (canonicalURL != null) {
        String canUrl = canonicalURL.toExternalForm();
        addDetectedLink(canUrl);
        if (ignoreNonCanonical) {
            String curUrl = currentURL.toExternalForm();
            if (!canUrl.equals(curUrl)) {
                isCanonical = false;
                return;
            }
        }
    }
    isCanonical = true;

    String title = htmlProvider.getTitle();
    if (ignoreUntitledDocuments)
        if (title == null || title.length() == 0)
            return;

    ParserResultItem result = getNewParserResultItem();

    addFieldTitle(result, title);

    result.addField(ParserFieldEnum.htmlProvider, htmlProvider.getName());

    // Check ContentType charset in meta http-equiv
    String metaCharset = htmlProvider.getMetaCharset();

    String selectedCharset = selectCharset(headerCharset, metaCharset, detectedCharset);

    if (selectedCharset != null) {
        if (!selectedCharset.equals(currentCharset)) {
            currentCharset = selectedCharset;
            htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
                    xPathExclusions, xPathExclusionsSet);
        }
    }

    StringWriter writer = new StringWriter();
    IOUtils.copy(streamLimiter.getNewInputStream(), writer, currentCharset);
    result.addField(ParserFieldEnum.htmlSource, writer.toString());
    writer.close();

    HtmlNodeAbstract<?> rootNode = htmlProvider.getRootNode();
    if (rootNode == null)
        return;

    for (HtmlNodeAbstract<?> metaNode : htmlProvider.getMetas()) {
        String metaName = metaNode.getAttributeText("name");
        if (metaName != null && metaName.startsWith(OPENSEARCHSERVER_FIELD)) {
            String field = metaName.substring(OPENSEARCHSERVER_FIELD_LENGTH);
            String[] fields = field.split("\\.");
            if (fields != null) {
                String content = metaNode.getAttributeText("content");
                result.addDirectFields(fields, content);
            }
        }
    }

    result.addField(ParserFieldEnum.charset, currentCharset);

    String metaRobots = null;

    String metaDcLanguage = null;

    String metaContentLanguage = null;

    for (HtmlNodeAbstract<?> node : htmlProvider.getMetas()) {
        String attr_name = node.getAttributeText("name");
        String attr_http_equiv = node.getAttributeText("http-equiv");
        if ("keywords".equalsIgnoreCase(attr_name))
            result.addField(ParserFieldEnum.meta_keywords, HtmlDocumentProvider.getMetaContent(node));
        else if ("description".equalsIgnoreCase(attr_name))
            result.addField(ParserFieldEnum.meta_description, HtmlDocumentProvider.getMetaContent(node));
        else if ("robots".equalsIgnoreCase(attr_name))
            metaRobots = HtmlDocumentProvider.getMetaContent(node);
        else if ("dc.language".equalsIgnoreCase(attr_name))
            metaDcLanguage = HtmlDocumentProvider.getMetaContent(node);
        else if ("content-language".equalsIgnoreCase(attr_http_equiv))
            metaContentLanguage = HtmlDocumentProvider.getMetaContent(node);
    }

    boolean metaRobotsFollow = true;
    boolean metaRobotsNoIndex = false;
    if (metaRobots != null) {
        metaRobots = metaRobots.toLowerCase();
        if (metaRobots.contains("noindex") && !ignoreMetaNoIndex) {
            metaRobotsNoIndex = true;
            result.addField(ParserFieldEnum.meta_robots, "noindex");
        }
        if (metaRobots.contains("nofollow") && !ignoreMetaNoFollow) {
            metaRobotsFollow = false;
            result.addField(ParserFieldEnum.meta_robots, "nofollow");
        }
    }

    UrlFilterItem[] urlFilterList = getUrlFilterList();

    boolean removeFragment = ClassPropertyEnum.KEEP_REMOVE_LIST[1]
            .equalsIgnoreCase(getProperty(ClassPropertyEnum.URL_FRAGMENT).getValue());

    List<HtmlNodeAbstract<?>> nodes = rootNode.getAllNodes("a", "frame", "img");
    if (srcDoc != null && nodes != null && metaRobotsFollow) {
        for (HtmlNodeAbstract<?> node : nodes) {
            String href = null;
            String rel = null;
            String nodeName = node.getNodeName();
            if ("a".equals(nodeName)) {
                href = node.getAttributeText("href");
                rel = node.getAttributeText("rel");
            } else if ("frame".equals(nodeName) || "img".equals(nodeName)) {
                href = node.getAttributeText("src");
            }
            boolean follow = true;
            if (rel != null)
                if (rel.contains("nofollow") && !ignoreLinkNoFollow)
                    follow = false;
            URL newUrl = null;
            if (href != null)
                if (!href.startsWith("javascript:"))
                    if (currentURL != null) {
                        href = StringEscapeUtils.unescapeXml(href);
                        newUrl = LinkUtils.getLink(currentURL, href, urlFilterList, removeFragment);
                    }
            if (newUrl != null) {
                ParserFieldEnum field = null;
                if (newUrl.getHost().equalsIgnoreCase(currentURL.getHost())) {
                    if (follow)
                        field = ParserFieldEnum.internal_link;
                    else
                        field = ParserFieldEnum.internal_link_nofollow;
                } else {
                    if (follow)
                        field = ParserFieldEnum.external_link;
                    else
                        field = ParserFieldEnum.external_link_nofollow;
                }
                String link = newUrl.toExternalForm();
                result.addField(field, link);
                if (follow)
                    addDetectedLink(link);
            }
        }
    }

    if (!metaRobotsNoIndex) {
        nodes = rootNode.getNodes("html", "body");
        if (nodes == null || nodes.size() == 0)
            nodes = rootNode.getNodes("html");
        if (nodes != null && nodes.size() > 0) {
            StringBuilder sb = new StringBuilder();
            getBodyTextContent(result, sb, nodes.get(0), true, null, 1024, xPathExclusionsSet);
            result.addField(ParserFieldEnum.body, sb);
        }
    }

    // Language identification:
    Locale lang = null;
    String langMethod = null;
    String[] pathHtml = { "html" };
    nodes = rootNode.getNodes(pathHtml);
    if (nodes != null && nodes.size() > 0) {
        langMethod = "html lang attribute";
        String l = nodes.get(0).getAttributeText("lang");
        if (l != null)
            lang = Lang.findLocaleISO639(l);
    }
    if (lang == null && metaContentLanguage != null) {
        langMethod = "meta http-equiv content-language";
        lang = Lang.findLocaleISO639(metaContentLanguage);
    }
    if (lang == null && metaDcLanguage != null) {
        langMethod = "meta dc.language";
        lang = Lang.findLocaleISO639(metaDcLanguage);
    }

    if (lang != null) {
        result.addField(ParserFieldEnum.lang, lang.getLanguage());
        result.addField(ParserFieldEnum.lang_method, langMethod);
    } else if (!metaRobotsNoIndex)
        lang = result.langDetection(10000, ParserFieldEnum.body);

    if (getFieldMap().isMapped(ParserFieldEnum.generated_title)) {

        StringBuilder sb = new StringBuilder();
        try {
            if (!StringUtils.isEmpty(streamOriginalUrl))
                sb.append(new URI(streamOriginalUrl).getHost());
        } catch (URISyntaxException e) {
            Logging.error(e);
        }

        String generatedTitle = null;
        for (Map.Entry<String, BoostTag> entry : boostTagMap.entrySet()) {
            BoostTag boostTag = entry.getValue();
            if (boostTag.firstContent != null) {
                generatedTitle = boostTag.firstContent;
                break;
            }
        }

        if (generatedTitle == null) {
            final String FIELD_TITLE = "contents";

            MemoryIndex bodyMemoryIndex = new MemoryIndex();
            Analyzer bodyAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            String bodyText = result.getMergedBodyText(100000, " ", ParserFieldEnum.body);
            bodyMemoryIndex.addField(FIELD_TITLE, bodyText, bodyAnalyzer);

            IndexSearcher indexSearcher = bodyMemoryIndex.createSearcher();
            IndexReader indexReader = indexSearcher.getIndexReader();
            MoreLikeThis mlt = new MoreLikeThis(indexReader);
            mlt.setAnalyzer(bodyAnalyzer);
            mlt.setFieldNames(new String[] { FIELD_TITLE });
            mlt.setMinWordLen(3);
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);

            String[] words = mlt.retrieveInterestingTerms(0);
            if (words != null && words.length > 0)
                generatedTitle = words[0];
        }

        if (generatedTitle != null) {
            if (sb.length() > 0)
                sb.append(" - ");
            sb.append(generatedTitle);
        }

        if (sb.length() > 67) {
            int pos = sb.indexOf(" ", 60);
            if (pos == -1)
                pos = 67;
            sb.setLength(pos);
            sb.append("...");
        }
        result.addField(ParserFieldEnum.generated_title, sb.toString());
    }

}

From source file: com.jaeksoft.searchlib.snippet.Fragment.java

License: Open Source License

public final double searchScore(final String fieldName, final CompiledAnalyzer analyzer, final Query query) {
    searchScore = 0;
    if (query == null || analyzer == null)
        return 0;
    MemoryIndex index = new MemoryIndex();
    index.addField(fieldName, originalText, analyzer);
    searchScore = index.search(query);
    return searchScore;
}

From source file: com.orientechnologies.lucene.operator.OLuceneTextOperator.java

License: Apache License

@Override
public Object evaluateRecord(OIdentifiable iRecord, ODocument iCurrentResult, OSQLFilterCondition iCondition,
        Object iLeft, Object iRight, OCommandContext iContext) {

    OLuceneFullTextIndex index = involvedIndex(iRecord, iCurrentResult, iCondition, iLeft, iRight);

    if (index == null) {
        throw new OCommandExecutionException("Cannot evaluate lucene condition without index configuration.");
    }
    MemoryIndex memoryIndex = (MemoryIndex) iContext.getVariable("_memoryIndex");
    if (memoryIndex == null) {
        memoryIndex = new MemoryIndex();
        iContext.setVariable("_memoryIndex", memoryIndex);
    }
    memoryIndex.reset();
    Document doc = index.buildDocument(iLeft);

    for (IndexableField field : doc.getFields()) {
        memoryIndex.addField(field.name(), field.stringValue(), index.analyzer(field.name()));
    }
    Query query = null;
    try {
        query = index.buildQuery(iRight);
    } catch (Exception e) {
        throw new OCommandExecutionException("Error executing lucene query.", e);
    }
    return memoryIndex.search(query) > 0.0f;
}

From source file: com.orientechnologies.lucene.test.LuceneBooleanIndexTest.java

License: Apache License

@Test
public void testMemoryIndex() throws ParseException {
    // TODO To be used in evaluate Record
    MemoryIndex index = new MemoryIndex();

    Document doc = new Document();
    doc.add(new StringField("text", "my text", Field.Store.YES));
    StandardAnalyzer analyzer = new StandardAnalyzer();

    for (IndexableField field : doc.getFields()) {
        index.addField(field.name(), field.stringValue(), analyzer);
    }

    QueryParser parser = new QueryParser("text", analyzer);
    float score = index.search(parser.parse("+text:my"));

}

From source file: com.orientechnologies.lucene.tx.OLuceneTxChangesMultiRid.java

License: Apache License

public boolean isDeleted(Document document, Object key, OIdentifiable value) {
    boolean match = false;
    List<String> strings = deleted.get(value.getIdentity().toString());
    if (strings != null) {
        MemoryIndex memoryIndex = new MemoryIndex();
        for (String string : strings) {
            Query q = engine.deleteQuery(string, value);
            memoryIndex.reset();
            for (IndexableField field : document.getFields()) {
                memoryIndex.addField(field.name(), field.stringValue(), new KeywordAnalyzer());
            }
            match = match || (memoryIndex.search(q) > 0.0f);
        }
        return match;
    }
    return match;
}

From source file: org.elasticsearch.index.percolator.ExtractQueryTermsServiceTests.java

License: Apache License

public void testCreateQueryMetadataQuery() throws Exception {
    MemoryIndex memoryIndex = new MemoryIndex(false);
    memoryIndex.addField("field1", "the quick brown fox jumps over the lazy dog", new WhitespaceAnalyzer());
    memoryIndex.addField("field2", "some more text", new WhitespaceAnalyzer());
    memoryIndex.addField("_field3", "unhide me", new WhitespaceAnalyzer());
    memoryIndex.addField("field4", "123", new WhitespaceAnalyzer());

    IndexReader indexReader = memoryIndex.createSearcher().getIndexReader();
    Query query = ExtractQueryTermsService.createQueryTermsQuery(indexReader, QUERY_TERMS_FIELD,
            UNKNOWN_QUERY_FIELD);
    assertThat(query, instanceOf(TermsQuery.class));

    // no easy way to get to the terms in TermsQuery;
    // if there are fewer than 16 terms it gets rewritten to a BooleanQuery, and then we can easily check the terms
    BooleanQuery booleanQuery = (BooleanQuery) ((ConstantScoreQuery) query.rewrite(indexReader)).getQuery();
    assertThat(booleanQuery.clauses().size(), equalTo(15));
    assertClause(booleanQuery, 0, QUERY_TERMS_FIELD, "_field3\u0000me");
    assertClause(booleanQuery, 1, QUERY_TERMS_FIELD, "_field3\u0000unhide");
    assertClause(booleanQuery, 2, QUERY_TERMS_FIELD, "field1\u0000brown");
    assertClause(booleanQuery, 3, QUERY_TERMS_FIELD, "field1\u0000dog");
    assertClause(booleanQuery, 4, QUERY_TERMS_FIELD, "field1\u0000fox");
    assertClause(booleanQuery, 5, QUERY_TERMS_FIELD, "field1\u0000jumps");
    assertClause(booleanQuery, 6, QUERY_TERMS_FIELD, "field1\u0000lazy");
    assertClause(booleanQuery, 7, QUERY_TERMS_FIELD, "field1\u0000over");
    assertClause(booleanQuery, 8, QUERY_TERMS_FIELD, "field1\u0000quick");
    assertClause(booleanQuery, 9, QUERY_TERMS_FIELD, "field1\u0000the");
    assertClause(booleanQuery, 10, QUERY_TERMS_FIELD, "field2\u0000more");
    assertClause(booleanQuery, 11, QUERY_TERMS_FIELD, "field2\u0000some");
    assertClause(booleanQuery, 12, QUERY_TERMS_FIELD, "field2\u0000text");
    assertClause(booleanQuery, 13, QUERY_TERMS_FIELD, "field4\u0000123");
    assertClause(booleanQuery, 14, UNKNOWN_QUERY_FIELD, "");
}

From source file: org.elasticsearch.index.query.MoreLikeThisQueryBuilderTests.java

License: Apache License

/**
 * Here we could go overboard and use a pre-generated indexed random document for a given Item,
 * but for now we'd prefer to simply return the id as the content of the document and that for
 * every field.
 */
private static Fields generateFields(String[] fieldNames, String text) throws IOException {
    MemoryIndex index = new MemoryIndex();
    for (String fieldName : fieldNames) {
        index.addField(fieldName, text, new WhitespaceAnalyzer());
    }
    return MultiFields.getFields(index.createSearcher().getIndexReader());
}