Example usage of the org.apache.lucene.index.memory.MemoryIndex constructor MemoryIndex()

List of usage examples for the org.apache.lucene.index.memory.MemoryIndex constructor MemoryIndex()

Introduction

On this page you can find usage examples for the no-argument constructor MemoryIndex() of org.apache.lucene.index.memory.MemoryIndex.

Prototype

public MemoryIndex() 

Source Link

Document

Constructs an empty instance that will not store offsets or payloads.

Usage

From source file:ch.sentric.hbase.prospective.Percolator.java

License:Apache License

/**
 * Runs every supplied query against a single document and collects the ones that hit.
 * <p>
 * The indexed fields of {@code doc} are loaded into a transient {@link MemoryIndex};
 * each query is then executed against that index.
 *
 * @param doc
 *            the Lucene document to test
 * @param queries
 *            the candidate queries, keyed by identifier; may be {@code null} or empty
 * @return a response wrapping the entries of {@code queries} that matched the document
 * @throws IOException
 *             if an I/O error occurs
 */
public Response<T> percolate(final Document doc, final Map<T, Query> queries) throws IOException {
    // Step 1: load the indexed fields of the document into an in-memory index.
    final MemoryIndex index = new MemoryIndex();

    for (final Fieldable fieldable : doc.getFields()) {
        if (!fieldable.isIndexed()) {
            continue;
        }

        // Preference order for the field content: token stream, reader, plain string.
        final TokenStream preAnalyzed = fieldable.tokenStreamValue();
        if (preAnalyzed != null) {
            index.addField(fieldable.name(), preAnalyzed, fieldable.getBoost());
            continue;
        }

        final Reader contentReader = fieldable.readerValue();
        if (contentReader != null) {
            index.addField(fieldable.name(), analyzer.reusableTokenStream(fieldable.name(), contentReader),
                    fieldable.getBoost());
            continue;
        }

        final String text = fieldable.stringValue();
        if (text != null) {
            index.addField(fieldable.name(),
                    analyzer.reusableTokenStream(fieldable.name(), new CharSequenceReader(text)),
                    fieldable.getBoost());
        }
    }

    // Step 2: run each query against the in-memory index.
    final IndexSearcher searcher = index.createSearcher();
    final Map<T, Query> matched = new HashMap<T, Query>(0);

    if (queries == null || queries.isEmpty()) {
        return new Response<T>(matched);
    }

    final ExistsCollector collector = new ExistsCollector();
    for (final Map.Entry<T, Query> candidate : queries.entrySet()) {
        final Query query = candidate.getValue();
        collector.reset();
        searcher.search(query, collector);
        if (collector.exists()) {
            matched.put(candidate.getKey(), query);
        }
    }

    return new Response<T>(matched);
}

From source file:com.appspot.socialinquirer.server.service.impl.StackExchangeServiceImpl.java

License:Apache License

/**
 * Checks whether the user text matches at least one of the given keywords.
 * <p>
 * Each keyword is parsed as a query expression against the "text" field and added as a
 * SHOULD clause of one boolean query; the user text is indexed into a transient
 * {@link MemoryIndex} and searched with that query.
 *
 * @param keywords the keywords; {@code null} entries and entries that fail to parse are skipped
 * @param userText the user text
 * @return true if the combined keyword query scores above zero against the text; false
 *         otherwise, including when either argument is {@code null} or no keyword is usable
 */
public boolean matchWithKeywords(List<String> keywords, String userText) {
    // Nothing to index or nothing to look for: no match is possible.
    if (userText == null || keywords == null || keywords.isEmpty()) {
        return false;
    }

    QueryParser parser = new QueryParser("text", createEnglishAnalyzer());
    BooleanQuery query = new BooleanQuery();
    boolean hasClause = false;
    for (String keyword : keywords) {
        if (keyword == null) {
            continue;
        }
        try {
            query.add(parser.parse(keyword), BooleanClause.Occur.SHOULD);
            hasClause = true;
        } catch (ParseException ignored) {
            // Best effort: a keyword that is not a valid query expression is skipped
            // so that the remaining keywords can still be matched.
        }
    }

    // A query without clauses cannot match, so skip indexing the text altogether.
    if (!hasClause) {
        return false;
    }

    MemoryIndex index = new MemoryIndex();
    index.addField("text", userText, createEnglishAnalyzer());

    return index.search(query) > 0.0f;
}

From source file:com.jaeksoft.searchlib.classifier.Classifier.java

License:Open Source License

/**
 * Loads the given document into a transient {@link MemoryIndex} and runs the configured
 * classification method against it. The read lock is held for the whole operation.
 *
 * @param client   the client providing the schema (and passed on to the classification helpers)
 * @param document the document to classify; each of its field contents is indexed with its
 *                 values merged using a single space
 */
public void classification(Client client, IndexDocument document)
        throws SearchLibException, ParseException, SyntaxError, IOException {
    rwl.r.lock();
    try {
        final LanguageEnum language = document.getLang();
        final Analyzer perFieldAnalyzer = client.getSchema().getIndexPerFieldAnalyzer(language);

        final MemoryIndex memoryIndex = new MemoryIndex();
        for (FieldContent content : document) {
            memoryIndex.addField(content.getField(), content.getMergedValues(" "), perFieldAnalyzer);
        }

        // Any other method value leaves the document unclassified.
        if (method == ClassificationMethodEnum.MULTIVALUED) {
            multivaluedClassification(client, document, language, memoryIndex);
        } else if (method == ClassificationMethodEnum.BESTSCORE) {
            bestScoreClassification(client, document, language, memoryIndex);
        }
    } finally {
        rwl.r.unlock();
    }
}

From source file:com.jaeksoft.searchlib.parser.HtmlParser.java

License:Open Source License

/**
 * Parses an HTML stream and fills a new parser result item with the extracted fields:
 * title, provider name, raw HTML source, meta tags, charset, links, body text, language
 * and, when that field is mapped, a generated title.
 * <p>
 * The method returns before creating a result item when no HTML provider is available,
 * when the page is not canonical and IGNORE_NON_CANONICAL is set, or when the page has no
 * title and IGNORE_UNTITLED_DOCUMENTS is set. It also stops right after storing the title,
 * provider name and HTML source when the provider has no root node.
 *
 * @param streamLimiter source of the HTML content (detected charset, origin URL, input stream)
 * @param forcedLang    not referenced in this method body
 * @throws IOException        on I/O failure; also wraps a URISyntaxException raised while
 *                            building the current URL
 * @throws SearchLibException declared for the helper calls made here; not thrown directly
 *                            by this body
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum forcedLang)
        throws IOException, SearchLibException {

    // Load the parser configuration (boost values and ignore flags) from the class properties.
    titleBoost = getFloatProperty(ClassPropertyEnum.TITLE_BOOST);
    boostTagMap = new TreeMap<String, BoostTag>();
    boostTagMap.put("h1", new BoostTag(ClassPropertyEnum.H1_BOOST));
    boostTagMap.put("h2", new BoostTag(ClassPropertyEnum.H2_BOOST));
    boostTagMap.put("h3", new BoostTag(ClassPropertyEnum.H3_BOOST));
    boostTagMap.put("h4", new BoostTag(ClassPropertyEnum.H4_BOOST));
    boostTagMap.put("h5", new BoostTag(ClassPropertyEnum.H5_BOOST));
    boostTagMap.put("h6", new BoostTag(ClassPropertyEnum.H6_BOOST));
    ignoreMetaNoIndex = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOINDEX);
    ignoreMetaNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOFOLLOW);
    ignoreLinkNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_LINK_NOFOLLOW);
    ignoreUntitledDocuments = getBooleanProperty(ClassPropertyEnum.IGNORE_UNTITLED_DOCUMENTS);
    ignoreNonCanonical = getBooleanProperty(ClassPropertyEnum.IGNORE_NON_CANONICAL);

    // Charset resolution, in order of preference: the charset recorded on the source
    // document (contentTypeCharset, then contentEncoding), the charset detected from
    // the stream, and finally the configured default charset.
    String currentCharset = null;
    String headerCharset = null;
    String detectedCharset = null;

    IndexDocument sourceDocument = getSourceDocument();

    if (sourceDocument != null) {
        FieldValueItem fieldValueItem = sourceDocument
                .getFieldValue(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0);
        if (fieldValueItem != null)
            headerCharset = fieldValueItem.getValue();
        if (headerCharset == null) {
            fieldValueItem = sourceDocument.getFieldValue(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(),
                    0);
            if (fieldValueItem != null)
                headerCharset = fieldValueItem.getValue();
        }
        currentCharset = headerCharset;
    }

    if (currentCharset == null) {
        detectedCharset = streamLimiter.getDetectedCharset();
        currentCharset = detectedCharset;
    }

    if (currentCharset == null) {
        currentCharset = getProperty(ClassPropertyEnum.DEFAULT_CHARSET).getValue();
    }

    // The exclusion set is only allocated when an XPath exclusion expression is
    // configured; otherwise it stays null and is passed on as null.
    String xPathExclusions = getProperty(ClassPropertyEnum.XPATH_EXCLUSION).getValue();
    Set<Object> xPathExclusionsSet = null;
    if (!StringUtils.isEmpty(xPathExclusions))
        xPathExclusionsSet = new HashSet<Object>();

    HtmlParserEnum htmlParserEnum = HtmlParserEnum.find(getProperty(ClassPropertyEnum.HTML_PARSER).getValue());

    HtmlDocumentProvider htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
            xPathExclusions, xPathExclusionsSet);
    // No provider could be obtained: there is nothing to parse.
    if (htmlProvider == null)
        return;

    // Resolve the URL of the document: the provider's base href first, then the
    // stream's origin URL, then the url field of the source document.
    URL currentURL = htmlProvider.getBaseHref();
    IndexDocument srcDoc = getSourceDocument();
    String streamOriginalUrl = streamLimiter.getOriginURL();
    try {
        if (currentURL == null && !StringUtils.isEmpty(streamOriginalUrl))
            currentURL = LinkUtils.newEncodedURL(streamOriginalUrl);
        if (currentURL == null && srcDoc != null) {
            FieldValueItem fvi = srcDoc.getFieldValue(UrlItemFieldEnum.INSTANCE.url.getName(), 0);
            if (fvi != null)
                currentURL = LinkUtils.newEncodedURL(fvi.getValue());
        }
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }

    // Canonical link handling: the canonical URL is always registered as a detected
    // link; when IGNORE_NON_CANONICAL is set and it differs from the current URL,
    // the page is flagged as non-canonical and parsing stops.
    URL canonicalURL = htmlProvider.getCanonicalLink(currentURL);
    if (canonicalURL != null) {
        String canUrl = canonicalURL.toExternalForm();
        addDetectedLink(canUrl);
        if (ignoreNonCanonical) {
            // NOTE(review): currentURL can still be null here if none of the three
            // sources above produced a URL; toExternalForm() would then throw a
            // NullPointerException - confirm whether that can happen in practice.
            String curUrl = currentURL.toExternalForm();
            if (!canUrl.equals(curUrl)) {
                isCanonical = false;
                return;
            }
        }
    }
    isCanonical = true;

    // Optionally skip documents that have no (or an empty) title.
    String title = htmlProvider.getTitle();
    if (ignoreUntitledDocuments)
        if (title == null || title.length() == 0)
            return;

    ParserResultItem result = getNewParserResultItem();

    addFieldTitle(result, title);

    result.addField(ParserFieldEnum.htmlProvider, htmlProvider.getName());

    // Check ContentType charset in meta http-equiv
    String metaCharset = htmlProvider.getMetaCharset();

    String selectedCharset = selectCharset(headerCharset, metaCharset, detectedCharset);

    // If the selected charset differs from the one used so far, build the provider
    // again with the selected charset.
    if (selectedCharset != null) {
        if (!selectedCharset.equals(currentCharset)) {
            currentCharset = selectedCharset;
            // NOTE(review): unlike the first call above, the provider returned here is
            // not checked for null before it is dereferenced below - confirm.
            htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
                    xPathExclusions, xPathExclusionsSet);
        }
    }

    // Store the raw HTML source, decoded with the current charset.
    StringWriter writer = new StringWriter();
    IOUtils.copy(streamLimiter.getNewInputStream(), writer, currentCharset);
    result.addField(ParserFieldEnum.htmlSource, writer.toString());
    writer.close();

    HtmlNodeAbstract<?> rootNode = htmlProvider.getRootNode();
    if (rootNode == null)
        return;

    // Meta tags whose name starts with OPENSEARCHSERVER_FIELD are copied as direct
    // fields: the remainder of the name, split on dots, is handed over together with
    // the content attribute.
    for (HtmlNodeAbstract<?> metaNode : htmlProvider.getMetas()) {
        String metaName = metaNode.getAttributeText("name");
        if (metaName != null && metaName.startsWith(OPENSEARCHSERVER_FIELD)) {
            String field = metaName.substring(OPENSEARCHSERVER_FIELD_LENGTH);
            String[] fields = field.split("\\.");
            // String.split never returns null, so this check always passes.
            if (fields != null) {
                String content = metaNode.getAttributeText("content");
                result.addDirectFields(fields, content);
            }
        }
    }

    result.addField(ParserFieldEnum.charset, currentCharset);

    String metaRobots = null;

    String metaDcLanguage = null;

    String metaContentLanguage = null;

    // Second pass over the meta tags: keywords and description are stored directly,
    // robots and the language hints are kept for the steps below.
    for (HtmlNodeAbstract<?> node : htmlProvider.getMetas()) {
        String attr_name = node.getAttributeText("name");
        String attr_http_equiv = node.getAttributeText("http-equiv");
        if ("keywords".equalsIgnoreCase(attr_name))
            result.addField(ParserFieldEnum.meta_keywords, HtmlDocumentProvider.getMetaContent(node));
        else if ("description".equalsIgnoreCase(attr_name))
            result.addField(ParserFieldEnum.meta_description, HtmlDocumentProvider.getMetaContent(node));
        else if ("robots".equalsIgnoreCase(attr_name))
            metaRobots = HtmlDocumentProvider.getMetaContent(node);
        else if ("dc.language".equalsIgnoreCase(attr_name))
            metaDcLanguage = HtmlDocumentProvider.getMetaContent(node);
        else if ("content-language".equalsIgnoreCase(attr_http_equiv))
            metaContentLanguage = HtmlDocumentProvider.getMetaContent(node);
    }

    // Interpret the robots meta tag; each directive is honoured unless the matching
    // ignore flag is set.
    boolean metaRobotsFollow = true;
    boolean metaRobotsNoIndex = false;
    if (metaRobots != null) {
        metaRobots = metaRobots.toLowerCase();
        if (metaRobots.contains("noindex") && !ignoreMetaNoIndex) {
            metaRobotsNoIndex = true;
            result.addField(ParserFieldEnum.meta_robots, "noindex");
        }
        if (metaRobots.contains("nofollow") && !ignoreMetaNoFollow) {
            metaRobotsFollow = false;
            result.addField(ParserFieldEnum.meta_robots, "nofollow");
        }
    }

    UrlFilterItem[] urlFilterList = getUrlFilterList();

    // True when the URL_FRAGMENT property equals KEEP_REMOVE_LIST[1], ignoring case
    // (presumably the "remove" entry - verify against ClassPropertyEnum).
    boolean removeFragment = ClassPropertyEnum.KEEP_REMOVE_LIST[1]
            .equalsIgnoreCase(getProperty(ClassPropertyEnum.URL_FRAGMENT).getValue());

    // Link extraction from <a href>, <frame src> and <img src>. Skipped entirely when
    // there is no source document or when the robots meta tag forbids following.
    List<HtmlNodeAbstract<?>> nodes = rootNode.getAllNodes("a", "frame", "img");
    if (srcDoc != null && nodes != null && metaRobotsFollow) {
        for (HtmlNodeAbstract<?> node : nodes) {
            String href = null;
            String rel = null;
            String nodeName = node.getNodeName();
            if ("a".equals(nodeName)) {
                href = node.getAttributeText("href");
                rel = node.getAttributeText("rel");
            } else if ("frame".equals(nodeName) || "img".equals(nodeName)) {
                href = node.getAttributeText("src");
            }
            boolean follow = true;
            if (rel != null)
                if (rel.contains("nofollow") && !ignoreLinkNoFollow)
                    follow = false;
            // A link is only resolved when it has a non-javascript target and the
            // current URL is known.
            URL newUrl = null;
            if (href != null)
                if (!href.startsWith("javascript:"))
                    if (currentURL != null) {
                        href = StringEscapeUtils.unescapeXml(href);
                        newUrl = LinkUtils.getLink(currentURL, href, urlFilterList, removeFragment);
                    }
            if (newUrl != null) {
                // Same host as the current URL means internal, anything else external;
                // each kind has a follow and a nofollow variant.
                ParserFieldEnum field = null;
                if (newUrl.getHost().equalsIgnoreCase(currentURL.getHost())) {
                    if (follow)
                        field = ParserFieldEnum.internal_link;
                    else
                        field = ParserFieldEnum.internal_link_nofollow;
                } else {
                    if (follow)
                        field = ParserFieldEnum.external_link;
                    else
                        field = ParserFieldEnum.external_link_nofollow;
                }
                String link = newUrl.toExternalForm();
                result.addField(field, link);
                if (follow)
                    addDetectedLink(link);
            }
        }
    }

    // Body text extraction, skipped when the page asked not to be indexed. Uses the
    // html/body node when present and falls back to the html node.
    if (!metaRobotsNoIndex) {
        nodes = rootNode.getNodes("html", "body");
        if (nodes == null || nodes.size() == 0)
            nodes = rootNode.getNodes("html");
        if (nodes != null && nodes.size() > 0) {
            StringBuilder sb = new StringBuilder();
            getBodyTextContent(result, sb, nodes.get(0), true, null, 1024, xPathExclusionsSet);
            result.addField(ParserFieldEnum.body, sb);
        }
    }

    // Language identification: the lang attribute of <html> first, then the
    // content-language meta tag, then the dc.language meta tag. If none yields a
    // locale, detection on the body text is attempted (unless noindex applies).
    Locale lang = null;
    String langMethod = null;
    String[] pathHtml = { "html" };
    nodes = rootNode.getNodes(pathHtml);
    if (nodes != null && nodes.size() > 0) {
        langMethod = "html lang attribute";
        String l = nodes.get(0).getAttributeText("lang");
        if (l != null)
            lang = Lang.findLocaleISO639(l);
    }
    if (lang == null && metaContentLanguage != null) {
        langMethod = "meta http-equiv content-language";
        lang = Lang.findLocaleISO639(metaContentLanguage);
    }
    if (lang == null && metaDcLanguage != null) {
        langMethod = "meta dc.language";
        lang = Lang.findLocaleISO639(metaDcLanguage);
    }

    if (lang != null) {
        result.addField(ParserFieldEnum.lang, lang.getLanguage());
        result.addField(ParserFieldEnum.lang_method, langMethod);
    } else if (!metaRobotsNoIndex)
        lang = result.langDetection(10000, ParserFieldEnum.body);

    // Generated title, built only when that field is mapped: "<host> - <text>", where
    // the text is the first non-null boost tag content or, failing that, the first
    // interesting term MoreLikeThis reports for the body text.
    if (getFieldMap().isMapped(ParserFieldEnum.generated_title)) {

        StringBuilder sb = new StringBuilder();
        try {
            if (!StringUtils.isEmpty(streamOriginalUrl))
                sb.append(new URI(streamOriginalUrl).getHost());
        } catch (URISyntaxException e) {
            // A malformed origin URL only costs the host prefix; it is logged and ignored.
            Logging.error(e);
        }

        String generatedTitle = null;
        for (Map.Entry<String, BoostTag> entry : boostTagMap.entrySet()) {
            BoostTag boostTag = entry.getValue();
            if (boostTag.firstContent != null) {
                generatedTitle = boostTag.firstContent;
                break;
            }
        }

        if (generatedTitle == null) {
            final String FIELD_TITLE = "contents";

            // Index the merged body text in memory so MoreLikeThis can rank its terms.
            MemoryIndex bodyMemoryIndex = new MemoryIndex();
            Analyzer bodyAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            String bodyText = result.getMergedBodyText(100000, " ", ParserFieldEnum.body);
            bodyMemoryIndex.addField(FIELD_TITLE, bodyText, bodyAnalyzer);

            IndexSearcher indexSearcher = bodyMemoryIndex.createSearcher();
            IndexReader indexReader = indexSearcher.getIndexReader();
            MoreLikeThis mlt = new MoreLikeThis(indexReader);
            mlt.setAnalyzer(bodyAnalyzer);
            mlt.setFieldNames(new String[] { FIELD_TITLE });
            mlt.setMinWordLen(3);
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);

            String[] words = mlt.retrieveInterestingTerms(0);
            if (words != null && words.length > 0)
                generatedTitle = words[0];
        }

        if (generatedTitle != null) {
            if (sb.length() > 0)
                sb.append(" - ");
            sb.append(generatedTitle);
        }

        // Truncate long titles: cut at the first space found at or after index 60, or
        // at index 67 when there is none, then append an ellipsis.
        // NOTE(review): the first space may lie beyond index 67, in which case the
        // result is longer than 67 characters - confirm whether that is intended.
        if (sb.length() > 67) {
            int pos = sb.indexOf(" ", 60);
            if (pos == -1)
                pos = 67;
            sb.setLength(pos);
            sb.append("...");
        }
        result.addField(ParserFieldEnum.generated_title, sb.toString());
    }

}

From source file:com.jaeksoft.searchlib.snippet.Fragment.java

License:Open Source License

/**
 * Scores this fragment's original text against a query using a transient
 * {@link MemoryIndex}, and stores the result in {@code searchScore}.
 *
 * @param fieldName the field name under which the text is indexed
 * @param analyzer  the analyzer used to index the text; may be {@code null}
 * @param query     the query to evaluate; may be {@code null}
 * @return the score, or 0 when either the query or the analyzer is {@code null}
 */
public final double searchScore(final String fieldName, final CompiledAnalyzer analyzer, final Query query) {
    // Reset first so a stale score never survives a call that cannot search.
    searchScore = 0;

    final boolean searchable = query != null && analyzer != null;
    if (!searchable) {
        return 0;
    }

    final MemoryIndex memoryIndex = new MemoryIndex();
    memoryIndex.addField(fieldName, originalText, analyzer);
    searchScore = memoryIndex.search(query);
    return searchScore;
}

From source file:com.orientechnologies.lucene.operator.OLuceneTextOperator.java

License:Apache License

/**
 * Evaluates a Lucene condition against a single record by indexing the left operand
 * into a {@link MemoryIndex} and searching it with the query built from the right operand.
 * <p>
 * The memory index is kept in the command context under the variable "_memoryIndex" and
 * is reset on every call.
 *
 * @return {@code Boolean.TRUE} when the query scores above zero, {@code Boolean.FALSE} otherwise
 * @throws OCommandExecutionException if no Lucene index is involved or the query cannot be built
 */
@Override
public Object evaluateRecord(OIdentifiable iRecord, ODocument iCurrentResult, OSQLFilterCondition iCondition,
        Object iLeft, Object iRight, OCommandContext iContext) {

    final OLuceneFullTextIndex luceneIndex = involvedIndex(iRecord, iCurrentResult, iCondition, iLeft, iRight);
    if (luceneIndex == null) {
        throw new OCommandExecutionException("Cannot evaluate lucene condition without index configuration.");
    }

    // Fetch the memory index from the context, creating and registering it on first use.
    MemoryIndex scratchIndex = (MemoryIndex) iContext.getVariable("_memoryIndex");
    if (scratchIndex == null) {
        scratchIndex = new MemoryIndex();
        iContext.setVariable("_memoryIndex", scratchIndex);
    }
    scratchIndex.reset();

    final Document luceneDocument = luceneIndex.buildDocument(iLeft);
    for (IndexableField indexable : luceneDocument.getFields()) {
        scratchIndex.addField(indexable.name(), indexable.stringValue(), luceneIndex.analyzer(indexable.name()));
    }

    final Query luceneQuery;
    try {
        luceneQuery = luceneIndex.buildQuery(iRight);
    } catch (Exception e) {
        throw new OCommandExecutionException("Error executing lucene query.", e);
    }

    final boolean matched = scratchIndex.search(luceneQuery) > 0.0f;
    return matched;
}

From source file:com.orientechnologies.lucene.test.LuceneBooleanIndexTest.java

License:Apache License

/**
 * Indexes a one-field document into a {@link MemoryIndex} and searches it for a
 * required term. The resulting score is computed but not asserted on.
 */
@Test
public void testMemoryIndex() throws ParseException {
    // TODO To be used in evaluate Record
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();

    Document document = new Document();
    document.add(new StringField("text", "my text", Field.Store.YES));

    MemoryIndex memoryIndex = new MemoryIndex();
    for (IndexableField indexable : document.getFields()) {
        memoryIndex.addField(indexable.name(), indexable.stringValue(), standardAnalyzer);
    }

    QueryParser queryParser = new QueryParser("text", standardAnalyzer);
    Query requiredTerm = queryParser.parse("+text:my");
    float score = memoryIndex.search(requiredTerm);

}

From source file:com.orientechnologies.lucene.tx.OLuceneTxChangesMultiRid.java

License:Apache License

/**
 * Tells whether the given document is matched by one of the delete queries recorded for
 * the identity of {@code value}.
 * <p>
 * The document is indexed once into a transient {@link MemoryIndex} (every field through
 * a {@link KeywordAnalyzer}) and each delete query is run against it until one matches.
 *
 * @param document the Lucene document to test
 * @param key      not referenced in this method body
 * @param value    the record whose identity selects the recorded delete entries
 * @return true if at least one delete query matches the document; false otherwise,
 *         including when nothing is recorded for the identity
 */
public boolean isDeleted(Document document, Object key, OIdentifiable value) {
    List<String> strings = deleted.get(value.getIdentity().toString());
    if (strings == null || strings.isEmpty()) {
        return false;
    }

    // The document is the same for every delete query, so index it once up front
    // instead of resetting and rebuilding the index on each iteration.
    KeywordAnalyzer analyzer = new KeywordAnalyzer();
    MemoryIndex memoryIndex = new MemoryIndex();
    for (IndexableField field : document.getFields()) {
        memoryIndex.addField(field.name(), field.stringValue(), analyzer);
    }

    for (String string : strings) {
        Query q = engine.deleteQuery(string, value);
        // One matching query is enough; the remaining ones cannot change the answer.
        if (memoryIndex.search(q) > 0.0f) {
            return true;
        }
    }
    return false;
}

From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java

License:Apache License

/**
 * Returns the index reader for the given field, building it on first request from the
 * current token stream and remembering it in {@code readers}.
 *
 * @param field the field name
 * @return the cached reader, or a newly created one backed by a {@link MemoryIndex}
 * @throws IOException if the token stream cannot be consumed or reset
 */
private IndexReader getReaderForField(String field) throws IOException {
    // Wrap the token stream in a caching filter once, if caching was requested and
    // the stream is not caching already.
    final boolean needsCachingWrapper = wrapToCaching && !cachedTokenStream
            && !(tokenStream instanceof CachingTokenFilter);
    if (needsCachingWrapper) {
        tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
        cachedTokenStream = true;
    }

    final IndexReader known = readers.get(field);
    if (known != null) {
        return known;
    }

    final MemoryIndex memoryIndex = new MemoryIndex();
    memoryIndex.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
    // Rewind the stream after it has been consumed by the index.
    tokenStream.reset();

    final IndexReader created = memoryIndex.createSearcher().getIndexReader();
    readers.put(field, created);
    return created;
}

From source file:org.apache.tika.eval.tokens.LuceneTokenCounter.java

License:Apache License

/**
 * Creates a token counter backed by an empty {@link MemoryIndex} and keeps the leaf
 * reader of that index's searcher.
 *
 * @param generalAnalyzer the analyzer stored for later use by this counter
 * @throws IOException declared by this constructor; not thrown directly by its body
 */
public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException {
    this.generalAnalyzer = generalAnalyzer;
    this.memoryIndex = new MemoryIndex();
    // The reader is taken from a searcher created over the still-empty index.
    final IndexSearcher emptyIndexSearcher = this.memoryIndex.createSearcher();
    this.leafReader = (LeafReader) emptyIndexSearcher.getIndexReader();
}