org.apache.jena.query.text.TextIndexLucene.java Source code

Introduction

Here is the source code for org.apache.jena.query.text.TextIndexLucene.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.query.text;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.datatypes.TypeMapper;
import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.query.text.analyzer.MultilingualAnalyzer;
import org.apache.jena.sparql.util.NodeFactoryExtra;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.store.Directory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TextIndexLucene implements TextIndex {
    /** Logger shared by all instances of this class. */
    private static final Logger log = LoggerFactory.getLogger(TextIndexLucene.class);

    /** Number of hits requested from Lucene when the caller gives no limit or a non-positive one. */
    private static final int MAX_N = 10000;
    // prefix for storing datatype URIs in the index, to distinguish them from language tags
    private static final String DATATYPE_PREFIX = "^^";

    // Default highlighting markers: start of a hit, end of a hit, and the
    // separator placed between fragments (see HighlightOpts).
    private static final String RIGHT_ARROW = "\u21a6";
    private static final String LEFT_ARROW = "\u21a4";
    private static final String DIVIDES = "\u2223";
    // Regex group that reluctantly matches a run of characters from the Unicode
    // separator category (Z) plus the code points U+0F0B, U+0F0C, U+0F0D and
    // U+180E. Each code point must be written as a full four-digit unicode
    // escape: a backslash followed directly by digits is an octal escape in
    // Java and would put unrelated characters into the class.
    private static final String Z_MORE_SEPS = "([\\p{Z}\u0f0b\u0f0c\u0f0d\u180e]*?)";

    /**
     * Field type used for the entity and graph fields of a document: not
     * tokenized, stored, and indexed with {@code IndexOptions.DOCS}. The type
     * is frozen in the static initializer below.
     */
    public static final FieldType ftIRI;
    static {
        ftIRI = new FieldType();
        ftIRI.setTokenized(false);
        ftIRI.setStored(true);
        ftIRI.setIndexOptions(IndexOptions.DOCS);
        ftIRI.freeze();
    }
    /** Alias for Lucene's {@code StringField.TYPE_NOT_STORED}. */
    public static final FieldType ftString = StringField.TYPE_NOT_STORED;

    // Entity definition taken from the configuration; supplies the names of the
    // entity, graph, language, uid and text fields used throughout this class.
    private final EntityDefinition docDef;
    // Lucene directory handed to the constructor; readers and the writer are opened on it.
    private final Directory directory;
    // Index-time analyzer: the per-field wrapper assembled in the constructor.
    private final Analyzer analyzer;
    // Query-time analyzer: the configured query analyzer, or the index-time
    // analyzer when the configuration does not provide one.
    private final Analyzer queryAnalyzer;
    // Name of the query parser implementation to instantiate (see getQueryParser).
    private final String queryParserType;
    // Field type for text values: TextField.TYPE_STORED when the configuration
    // asks for values to be stored, TextField.TYPE_NOT_STORED otherwise.
    private final FieldType ftText;
    // Whether the configuration enables multilingual support.
    private final boolean isMultilingual;

    // The IndexWriter can't be final because we may have to recreate it if rollback() is called.
    // However, it needs to be volatile in case the next write transaction is on a different thread,
    // but we do not need locking because we are assuming that there can only be one writer
    // at a time (enforced elsewhere).
    private volatile IndexWriter indexWriter;

    /**
     * Constructs a new TextIndexLucene.
     *
     * @param directory The Lucene Directory for the index
     * @param config The config definition for the index instantiation.
     */
    public TextIndexLucene(Directory directory, TextIndexConfig config) {
        this.directory = directory;
        this.docDef = config.getEntDef();

        this.isMultilingual = config.isMultilingualSupport();
        if (this.isMultilingual && config.getEntDef().getLangField() == null) {
            //multilingual index cannot work without lang field
            docDef.setLangField("lang");
        }

        // create the analyzer as a wrapper that uses KeywordAnalyzer for
        // entity and graph fields and the configured analyzer(s) for all other
        Map<String, Analyzer> analyzerPerField = new HashMap<>();
        analyzerPerField.put(docDef.getEntityField(), new KeywordAnalyzer());
        if (docDef.getGraphField() != null)
            analyzerPerField.put(docDef.getGraphField(), new KeywordAnalyzer());
        if (docDef.getLangField() != null)
            analyzerPerField.put(docDef.getLangField(), new KeywordAnalyzer());

        for (String field : docDef.fields()) {
            Analyzer _analyzer = docDef.getAnalyzer(field);
            if (_analyzer != null) {
                analyzerPerField.put(field, _analyzer);
            }
        }

        Analyzer defaultAnalyzer = (null != config.getAnalyzer()) ? config.getAnalyzer() : new StandardAnalyzer();
        if (this.isMultilingual)
            defaultAnalyzer = new MultilingualAnalyzer(defaultAnalyzer);
        this.analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerPerField);
        this.queryAnalyzer = (null != config.getQueryAnalyzer()) ? config.getQueryAnalyzer() : this.analyzer;
        this.queryParserType = config.getQueryParser();
        this.ftText = config.isValueStored() ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED;
        if (config.isValueStored() && docDef.getLangField() == null)
            log.warn(
                    "Values stored but langField not set. Returned values will not have language tag or datatype.");

        openIndexWriter();
    }

    /**
     * Creates a new IndexWriter on the directory with the index-time analyzer,
     * stores it in {@code indexWriter} and commits immediately.
     *
     * @throws TextIndexException if the index format is too old or any other
     *         I/O error occurs while opening or committing
     */
    private void openIndexWriter() {
        try {
            indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));
            // Force a commit to create the index, otherwise querying before writing will cause an exception
            indexWriter.commit();
        } catch (IndexFormatTooOldException tooOld) {
            String message = "jena-text/Lucene cannot use indexes created before Jena 3.3.0. "
                    + "Please rebuild your text index using jena.textindexer from Jena 3.3.0 or above.";
            throw new TextIndexException(message, tooOld);
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }
    }

    /** @return the Lucene directory this index was constructed with */
    public Directory getDirectory() {
        return this.directory;
    }

    /** @return the index-time analyzer (the per-field wrapper built in the constructor) */
    public Analyzer getAnalyzer() {
        return this.analyzer;
    }

    /** @return the analyzer used when parsing query strings */
    public Analyzer getQueryAnalyzer() {
        return this.queryAnalyzer;
    }

    /** @return the current index writer; a different instance is in place after a rollback */
    public IndexWriter getIndexWriter() {
        return this.indexWriter;
    }

    /**
     * Delegates to the index writer's {@code prepareCommit()}.
     *
     * @throws TextIndexException wrapping any I/O failure
     */
    @Override
    public void prepareCommit() {
        try {
            this.indexWriter.prepareCommit();
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }
    }

    /**
     * Delegates to the index writer's {@code commit()}.
     *
     * @throws TextIndexException wrapping any I/O failure
     */
    @Override
    public void commit() {
        try {
            this.indexWriter.commit();
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }
    }

    /**
     * Rolls back the current index writer and then opens a replacement.
     *
     * @throws TextIndexException wrapping any I/O failure during the rollback
     *         or while reopening the writer
     */
    @Override
    public void rollback() {
        // Clear the field first, then roll back the writer it used to hold.
        IndexWriter previous = indexWriter;
        indexWriter = null;
        try {
            previous.rollback();
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }

        // The rollback will close the indexWriter, so we need to reopen it
        openIndexWriter();
    }

    /**
     * Closes the index writer.
     *
     * @throws TextIndexException wrapping any I/O failure
     */
    @Override
    public void close() {
        try {
            this.indexWriter.close();
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }
    }

    /**
     * Logs the entity (when debug logging is on) and replaces its document in
     * the index via {@code updateDocument}.
     *
     * @param entity the entity to write
     * @throws TextIndexException wrapping any I/O failure
     */
    @Override
    public void updateEntity(Entity entity) {
        if (log.isDebugEnabled()) {
            // Detailed form only at trace level and only for a non-null entity.
            boolean detailed = log.isTraceEnabled() && entity != null;
            if (detailed) {
                log.trace("Update entity: " + entity.toStringDetail());
            } else {
                log.debug("Update entity: " + entity);
            }
        }
        try {
            updateDocument(entity);
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }
    }

    /**
     * Builds the document for the entity and passes it to the writer's
     * {@code updateDocument}, keyed by a term on the entity field holding the
     * entity id.
     *
     * @param entity the entity to write
     * @throws IOException if the index writer fails
     */
    protected void updateDocument(Entity entity) throws IOException {
        Document document = doc(entity);
        Term entityTerm = new Term(docDef.getEntityField(), entity.getId());
        indexWriter.updateDocument(entityTerm, document);
        log.trace("updated: {}", document);
    }

    /**
     * Logs the entity (when debug logging is on) and adds its document to the
     * index via {@code addDocument}.
     *
     * @param entity the entity to add
     * @throws TextIndexException wrapping any I/O failure
     */
    @Override
    public void addEntity(Entity entity) {
        if (log.isDebugEnabled()) {
            // Detailed form only at trace level and only for a non-null entity.
            boolean detailed = log.isTraceEnabled() && entity != null;
            if (detailed) {
                log.trace("Add entity: " + entity.toStringDetail());
            } else {
                log.debug("Add entity: " + entity);
            }
        }
        try {
            addDocument(entity);
        } catch (IOException ioe) {
            throw new TextIndexException(ioe);
        }
    }

    /**
     * Builds the document for the entity and appends it to the index.
     *
     * @param entity the entity to add
     * @throws IOException if the index writer fails
     */
    protected void addDocument(Entity entity) throws IOException {
        Document document = doc(entity);
        indexWriter.addDocument(document);
        log.trace("added: {}", document);
    }

    /**
     * Deletes the documents whose uid field equals the checksum of the
     * entity's first property/value pair. Does nothing when the entity
     * definition has no uid field.
     *
     * @param entity the entity to delete
     * @throws TextIndexException wrapping any failure while computing the
     *         checksum or deleting
     */
    @Override
    public void deleteEntity(Entity entity) {
        String uidField = docDef.getUidField();
        if (uidField == null) {
            return;
        }

        if (log.isDebugEnabled()) {
            // Detailed form only at trace level and only for a non-null entity.
            boolean detailed = log.isTraceEnabled() && entity != null;
            if (detailed) {
                log.trace("Delete entity: " + entity.toStringDetail());
            } else {
                log.debug("Delete entity: " + entity);
            }
        }

        try {
            // Only the first entry of the entity's map is used to derive the uid.
            Entry<String, Object> first = entity.getMap().entrySet().iterator().next();
            String checksum = entity.getChecksum(first.getKey(), (String) first.getValue());
            indexWriter.deleteDocuments(new Term(uidField, checksum));
        } catch (Exception e) {
            throw new TextIndexException(e);
        }
    }

    /**
     * Converts an entity into a Lucene document.
     * <p>
     * The document always carries the entity id, plus the graph when a graph
     * field is defined. For each entry of the entity's map it adds the text
     * field and, depending on the entity definition, a language-or-datatype
     * field, a language-specific copy of the text (multilingual indexes only)
     * and a uid field holding the entry's checksum.
     *
     * @param entity the entity to convert
     * @return the populated document
     */
    protected Document doc(Entity entity) {
        Document document = new Document();
        document.add(new Field(docDef.getEntityField(), entity.getId(), ftIRI));

        String graphField = docDef.getGraphField();
        if (graphField != null) {
            document.add(new Field(graphField, entity.getGraph(), ftIRI));
        }

        String langField = docDef.getLangField();
        String uidField = docDef.getUidField();

        for (Entry<String, Object> property : entity.getMap().entrySet()) {
            String name = property.getKey();
            String text = (String) property.getValue();
            document.add(new Field(name, text, ftText));

            if (langField != null) {
                String lang = entity.getLanguage();
                RDFDatatype datatype = entity.getDatatype();
                boolean hasLang = lang != null && !lang.isEmpty();
                if (hasLang) {
                    document.add(new Field(langField, lang, StringField.TYPE_STORED));
                    if (this.isMultilingual) {
                        // add a field that uses a language-specific analyzer via MultilingualAnalyzer
                        document.add(new Field(name + "_" + lang, text, ftText));
                    }
                } else if (datatype != null && !datatype.equals(XSDDatatype.XSDstring)) {
                    // for non-string and non-langString datatypes, store the datatype in langField
                    String marker = DATATYPE_PREFIX + datatype.getURI();
                    document.add(new Field(langField, marker, StringField.TYPE_STORED));
                }
            }

            if (uidField != null) {
                String checksum = entity.getChecksum(name, text);
                document.add(new Field(uidField, checksum, StringField.TYPE_STORED));
            }
        }
        return document;
    }

    /**
     * Looks up the indexed record for an entity URI.
     *
     * @param uri the entity URI to look up
     * @return the first matching record (field name to node), or {@code null}
     *         when the index has no match
     * @throws TextIndexException wrapping any failure while opening the
     *         reader, parsing the lookup query or searching
     */
    @Override
    public Map<String, Node> get(String uri) {
        // try-with-resources: the reader must be closed once the lookup is
        // done, otherwise every call leaks an open index reader.
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            List<Map<String, Node>> records = get$(indexReader, uri);
            // More than one record is not treated as an error; the first wins.
            return records.isEmpty() ? null : records.get(0);
        } catch (Exception ex) {
            throw new TextIndexException(ex);
        }
    }

    /**
     * Instantiates the query parser selected by {@code queryParserType}, with
     * the entity definition's primary field as default field. An unrecognised
     * type is logged and replaced by the standard {@code QueryParser}.
     *
     * @param analyzer the analyzer the parser should use
     * @return a new parser instance
     */
    private QueryParser getQueryParser(Analyzer analyzer) {
        final String defaultField = docDef.getPrimaryField();
        final QueryParser parser;
        switch (queryParserType) {
        case "QueryParser":
            parser = new QueryParser(defaultField, analyzer);
            break;
        case "AnalyzingQueryParser":
            parser = new AnalyzingQueryParser(defaultField, analyzer);
            break;
        case "ComplexPhraseQueryParser":
            parser = new ComplexPhraseQueryParser(defaultField, analyzer);
            break;
        default:
            log.warn("Unknown query parser type '" + queryParserType + "'. Defaulting to standard QueryParser");
            parser = new QueryParser(defaultField, analyzer);
            break;
        }
        return parser;
    }

    /**
     * Parses a query string with the configured parser type, with leading
     * wildcards allowed.
     *
     * @param queryString the Lucene query syntax to parse
     * @param analyzer the analyzer handed to the parser
     * @return the parsed query
     * @throws ParseException if the string is not a valid query
     */
    private Query parseQuery(String queryString, Analyzer analyzer) throws ParseException {
        QueryParser parser = getQueryParser(analyzer);
        parser.setAllowLeadingWildcard(true);
        return parser.parse(queryString);
    }

    /**
     * Searches the entity field for the given URI and converts the hit (at
     * most one is requested from Lucene) into a record mapping field names to
     * nodes. The record holds the entity as a URI node and, for every field
     * of the entity definition that has stored values, the last of those
     * values as a literal node.
     *
     * @param indexReader an open reader on the index
     * @param uri the entity URI to look up; it is escaped before parsing
     * @return one record per matching document, possibly empty
     * @throws ParseException if the generated lookup query cannot be parsed
     * @throws IOException if reading the index fails
     */
    private List<Map<String, Node>> get$(IndexReader indexReader, String uri) throws ParseException, IOException {
        String entityField = docDef.getEntityField();
        String qs = entityField + ":" + QueryParserBase.escape(uri);
        Query query = parseQuery(qs, queryAnalyzer);
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        ScoreDoc[] sDocs = indexSearcher.search(query, 1).scoreDocs;
        List<Map<String, Node>> records = new ArrayList<>();

        for (ScoreDoc sd : sDocs) {
            Document doc = indexSearcher.doc(sd.doc);
            String[] ids = doc.getValues(entityField);
            if (ids.length != 1) {
                log.warn("Expected exactly one value for field '{}' in document {} but found {}",
                        entityField, sd.doc, ids.length);
                if (ids.length == 0) {
                    // Nothing to identify the entity by; skip instead of
                    // failing on the array access below.
                    continue;
                }
            }
            Map<String, Node> record = new HashMap<>();
            record.put(entityField, NodeFactory.createURI(ids[0]));

            for (String f : docDef.fields()) {
                for (String v : doc.getValues(f)) {
                    // A later value of the same field overwrites an earlier one.
                    record.put(f, entryToNode(v));
                }
            }
            // Add the record once per document, not once per field.
            records.add(record);
        }
        return records;
    }

    /**
     * Queries the index with the class-wide default limit ({@code MAX_N}).
     *
     * @param property the property whose field is searched
     * @param qs the query string
     * @param graphURI the graph to restrict to, or {@code null} for none
     * @param lang the language to restrict to
     * @return the matching hits
     */
    @Override
    public List<TextHit> query(Node property, String qs, String graphURI, String lang) {
        return this.query(property, qs, graphURI, lang, MAX_N);
    }

    /**
     * Queries the index without highlighting.
     *
     * @param property the property whose field is searched
     * @param qs the query string
     * @param graphURI the graph to restrict to, or {@code null} for none
     * @param lang the language to restrict to
     * @param limit maximum number of hits; a non-positive value is replaced by
     *        the default maximum further down the call chain
     * @return the matching hits
     */
    @Override
    public List<TextHit> query(Node property, String qs, String graphURI, String lang, int limit) {
        // Forward the caller's limit; it must not be replaced by MAX_N here.
        return query(property, qs, graphURI, lang, limit, null);
    }

    /**
     * Opens a reader on the index, runs the query and closes the reader again.
     *
     * @param property the property whose field is searched
     * @param qs the query string
     * @param graphURI the graph to restrict to, or {@code null} for none
     * @param lang the language to restrict to
     * @param limit maximum number of hits
     * @param highlight highlighting options, or {@code null} for plain results
     * @return the matching hits
     * @throws TextIndexParseException if the query string cannot be parsed
     * @throws TextIndexException wrapping any other failure
     */
    @Override
    public List<TextHit> query(Node property, String qs, String graphURI, String lang, int limit,
            String highlight) {
        try (IndexReader reader = DirectoryReader.open(directory)) {
            List<TextHit> hits = query$(reader, property, qs, graphURI, lang, limit, highlight);
            return hits;
        } catch (ParseException parseError) {
            throw new TextIndexParseException(qs, parseError.getMessage());
        } catch (Exception other) {
            throw new TextIndexException(other);
        }
    }

    /**
     * Converts scored documents into text hits without highlighting.
     * <p>
     * When the document stores a value for {@code field}, the hit carries it
     * as a literal: typed when the language field holds a datatype marker,
     * language-tagged when it holds anything else, plain when it is absent.
     *
     * @param sDocs the scored documents returned by the search
     * @param indexSearcher the searcher used to load each document
     * @param query the executed query (not used by this method)
     * @param field the name of the field holding the matched text
     * @return one hit per scored document, in the same order
     * @throws IOException if a document cannot be loaded
     */
    private List<TextHit> simpleResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field)
            throws IOException {
        List<TextHit> hits = new ArrayList<>();

        for (ScoreDoc scoreDoc : sDocs) {
            Document doc = indexSearcher.doc(scoreDoc.doc);
            log.trace("simpleResults[{}]: {}", scoreDoc.doc, doc);
            String entity = doc.get(docDef.getEntityField());

            Node literal = null;
            String lexical = doc.get(field);
            if (lexical != null) {
                String tag = doc.get(docDef.getLangField());
                if (tag == null) {
                    literal = NodeFactory.createLiteral(lexical);
                } else if (!tag.startsWith(DATATYPE_PREFIX)) {
                    literal = NodeFactory.createLiteral(lexical, tag);
                } else {
                    String datatypeUri = tag.substring(DATATYPE_PREFIX.length());
                    TypeMapper typeMapper = TypeMapper.getInstance();
                    literal = NodeFactory.createLiteral(lexical, typeMapper.getSafeTypeByName(datatypeUri));
                }
            }

            Node graph = null;
            String graphField = docDef.getGraphField();
            if (graphField != null) {
                String graf = doc.get(graphField);
                if (graf != null) {
                    graph = TextQueryFuncs.stringToNode(graf);
                }
            }

            Node subject = TextQueryFuncs.stringToNode(entity);
            hits.add(new TextHit(subject, scoreDoc.score, literal, graph));
        }

        return hits;
    }

    /**
     * Highlighting options parsed from a {@code |}-separated option string.
     * Each option has the form {@code key:value}; recognised keys are
     * {@code m} (maximum fragments), {@code z} (fragment size), {@code s}
     * (start marker), {@code e} (end marker), {@code f} (fragment separator),
     * {@code jh} and {@code jf} (set to {@code n} to turn off joining of
     * highlights and of fragments respectively). Anything else is ignored and
     * the defaults below stay in effect.
     */
    class HighlightOpts {
        int maxFrags = 3;
        int fragSize = 128;
        String start = RIGHT_ARROW;
        String end = LEFT_ARROW;
        String fragSep = DIVIDES;
        boolean joinHi = true;
        boolean joinFrags = true;

        /**
         * @param optStr the option string, e.g. {@code "m:5 | jh:n"}
         */
        public HighlightOpts(String optStr) {
            for (String rawOpt : optStr.trim().split("\\|")) {
                String opt = rawOpt.trim();
                int colon = opt.indexOf(':');
                if (colon < 0) {
                    continue;
                }
                String key = opt.substring(0, colon);
                String value = opt.substring(colon + 1);
                switch (key) {
                case "m":
                    maxFrags = intOrDefault(value, maxFrags);
                    break;
                case "z":
                    fragSize = intOrDefault(value, fragSize);
                    break;
                case "s":
                    start = value;
                    break;
                case "e":
                    end = value;
                    break;
                case "f":
                    fragSep = value;
                    break;
                case "jh":
                    if ("n".equals(value)) {
                        joinHi = false;
                    }
                    break;
                case "jf":
                    if ("n".equals(value)) {
                        joinFrags = false;
                    }
                    break;
                default:
                    break;
                }
            }
        }

        /** Parses {@code text} as an int, returning {@code fallback} when it is not a valid number. */
        private int intOrDefault(String text, int fallback) {
            try {
                return Integer.parseInt(text);
            } catch (NumberFormatException ignored) {
                // A malformed number leaves the current setting untouched.
                return fallback;
            }
        }
    }

    /**
     * Renders highlighted fragments as one string, with {@code opts.fragSep}
     * between consecutive fragments.
     * <p>
     * When {@code opts.joinHi} is set, an end marker followed only by
     * separator characters and then a start marker is replaced by just those
     * separator characters, so adjacent highlights merge into one.
     *
     * @param frags the fragments to render
     * @param opts the highlighting options supplying markers and separator
     * @return the concatenated fragment text
     */
    private String frags2string(TextFragment[] frags, HighlightOpts opts) {
        // The markers are literal text chosen by the user, so they are quoted
        // before going into the regex; the pattern is compiled once, not per
        // fragment.
        Pattern adjacentHighlights = opts.joinHi
                ? Pattern.compile(Pattern.quote(opts.end) + Z_MORE_SEPS + Pattern.quote(opts.start))
                : null;

        StringBuilder rez = new StringBuilder();
        String sep = "";
        for (TextFragment f : frags) {
            String s = f.toString();
            if (adjacentHighlights != null) {
                s = adjacentHighlights.matcher(s).replaceAll("$1");
            }
            rez.append(sep).append(s);
            sep = opts.fragSep;
        }

        return rez.toString();
    }

    /**
     * Converts scored documents into text hits whose literal is the
     * highlighted rendering of the matched text.
     * <p>
     * The highlighted text is language-tagged only when the document's
     * language field holds a real language tag. When that field is absent or
     * holds a datatype marker (a value starting with {@code DATATYPE_PREFIX}),
     * a plain literal is produced: a datatype marker is not a language tag.
     *
     * @param sDocs the scored documents returned by the search
     * @param indexSearcher the searcher used to load each document
     * @param query the executed query, used to score the highlights
     * @param field the name of the field holding the matched text
     * @param highlight the option string parsed by {@code HighlightOpts}
     * @return one hit per scored document, in the same order
     * @throws IOException if a document cannot be loaded or analysed
     * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
     */
    private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field,
            String highlight) throws IOException, InvalidTokenOffsetsException {
        List<TextHit> results = new ArrayList<>();

        HighlightOpts opts = new HighlightOpts(highlight);

        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));

        for (ScoreDoc sd : sDocs) {
            Document doc = indexSearcher.doc(sd.doc);
            log.trace("highlightResults[{}]: {}", sd.doc, doc);
            String entity = doc.get(docDef.getEntityField());

            Node literal = null;
            String lexical = doc.get(field);
            if (lexical != null) {
                String docLang = doc.get(docDef.getLangField());
                TokenStream tokenStream = analyzer.tokenStream(field, lexical);
                TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags,
                        opts.maxFrags);
                String rez = frags2string(frags, opts);

                if (docLang == null || docLang.startsWith(DATATYPE_PREFIX)) {
                    // No language information available for this value.
                    literal = NodeFactory.createLiteral(rez);
                } else {
                    literal = NodeFactory.createLiteral(rez, docLang);
                }
            }

            String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
            Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;

            Node subject = TextQueryFuncs.stringToNode(entity);
            TextHit hit = new TextHit(subject, sd.score, literal, graph);
            results.add(hit);
        }
        return results;
    }

    /**
     * Builds the Lucene query string from the text, language and graph
     * constraints, runs it and converts the hits.
     * <p>
     * The language constraint applies only when a language field is defined
     * and {@code lang} is non-empty: {@code "none"} excludes documents that
     * have any value in the language field, any other value must match it. On
     * a multilingual index a real language also redirects the search to the
     * field named {@code <field>_<lang>}.
     *
     * @param indexReader an open reader on the index
     * @param property the property whose field is searched
     * @param qs the query string
     * @param graphURI the graph to restrict to, or {@code null} for none
     * @param lang the language to restrict to
     * @param limit maximum number of hits; non-positive means {@code MAX_N}
     * @param highlight highlighting options, or {@code null} for plain results
     * @return the hits, highlighted when {@code highlight} is given
     * @throws ParseException if the assembled query string cannot be parsed
     * @throws IOException if reading the index fails
     * @throws InvalidTokenOffsetsException if highlighting fails
     */
    private List<TextHit> query$(IndexReader indexReader, Node property, String qs, String graphURI, String lang,
            int limit, String highlight) throws ParseException, IOException, InvalidTokenOffsetsException {
        String textField = docDef.getField(property);
        String langClause = null;

        //for language-based search extension
        String langField = getDocDef().getLangField();
        if (langField != null && StringUtils.isNotEmpty(lang)) {
            if ("none".equals(lang)) {
                langClause = "-" + langField + ":*";
            } else {
                if (this.isMultilingual) {
                    String baseField = (textField != null) ? textField : docDef.getPrimaryField();
                    textField = baseField + "_" + lang;
                }
                langClause = langField + ":" + lang;
            }
        }

        String textClause;
        String effectiveField;
        if (textField != null) {
            textClause = textField + ":" + qs;
            effectiveField = textField;
        } else {
            textClause = qs;
            effectiveField = docDef.getPrimaryField();
        }

        String queryString = textClause;
        if (langClause != null) {
            queryString = "(" + queryString + ") AND " + langClause;
        }
        if (graphURI != null) {
            String graphClause = getDocDef().getGraphField() + ":" + QueryParserBase.escape(graphURI);
            queryString = "(" + queryString + ") AND " + graphClause;
        }

        log.debug("Lucene query: {} ({})", queryString, limit);

        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        Query query = parseQuery(queryString, queryAnalyzer);
        int maxHits = (limit <= 0) ? MAX_N : limit;
        ScoreDoc[] sDocs = indexSearcher.search(query, maxHits).scoreDocs;

        return (highlight != null)
                ? highlightResults(sDocs, indexSearcher, query, effectiveField, highlight)
                : simpleResults(sDocs, indexSearcher, query, effectiveField);
    }

    /** @return the entity definition this index was configured with */
    @Override
    public EntityDefinition getDocDef() {
        return this.docDef;
    }

    /**
     * Wraps a stored field value in a literal node, passing {@code null} for
     * both remaining arguments of {@code createLiteralNode}.
     *
     * @param v the stored string value
     * @return the literal node created for {@code v}
     */
    private Node entryToNode(String v) {
        // TEMP
        return NodeFactoryExtra.createLiteralNode(v, null, null);
    }
}