org.talend.dataquality.semantic.api.LocalDictionaryCache.java Source code

Introduction

Here is the source code for org.talend.dataquality.semantic.api.LocalDictionaryCache.java
Source

// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.semantic.api;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.talend.dataquality.semantic.index.ClassPathDirectory;
import org.talend.dataquality.semantic.index.DictionarySearcher;
import org.talend.dataquality.semantic.model.DQDocument;

public class LocalDictionaryCache {

    private static final Logger LOGGER = Logger.getLogger(LocalDictionaryCache.class);

    private SearcherManager mgr;

    LocalDictionaryCache(String contextName) {
        try {
            URI ddPath = CategoryRegistryManager.getInstance(contextName).getDictionaryURI();
            Directory dir = ClassPathDirectory.open(ddPath);
            mgr = new SearcherManager(dir, null);
        } catch (IOException e) {
            LOGGER.error("Failed to read local dictionary cache! ", e);
        } catch (URISyntaxException e) {
            LOGGER.error("Failed to parse index URI! ", e);
        }
    }

    private List<DQDocument> dqDocListFromTopDocs(String categoryName, TopDocs docs) throws IOException {
        mgr.maybeRefresh();
        IndexSearcher searcher = mgr.acquire();
        IndexReader reader = searcher.getIndexReader();
        List<DQDocument> dqDocList = new ArrayList<>();
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            Document luceneDoc = reader.document(scoreDoc.doc);
            DQDocument dqDoc = DictionaryUtils.dictionaryEntryFromDocument(luceneDoc, categoryName);
            dqDocList.add(dqDoc);
        }
        mgr.release(searcher);
        return dqDocList;
    }

    /**
     * get a list of DQDocument of all dictionary entries.
     */
    public List<DQDocument> listDocuments(String categoryName, int offset, int n) {
        try {
            TopDocs docs = sendListDocumentsQuery(categoryName, offset, n);
            return dqDocListFromTopDocs(categoryName, docs);
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return Collections.emptyList();
    }

    private Query getListDocumentsQuery(String categoryName) throws IOException {
        return new TermQuery(new Term(DictionarySearcher.F_WORD, categoryName));
    }

    private TopDocs sendListDocumentsQuery(String categoryName, int offset, int n) throws IOException {
        mgr.maybeRefresh();
        IndexSearcher searcher = mgr.acquire();
        TopDocs result = null;
        if (offset <= 0) {
            result = searcher.search(getListDocumentsQuery(categoryName), n);
        } else {
            TopDocs topDocs = searcher.search(getListDocumentsQuery(categoryName), offset + n);
            Query q = new TermQuery(new Term(DictionarySearcher.F_WORD, categoryName));
            result = searcher.searchAfter(topDocs.scoreDocs[Math.min(topDocs.totalHits, offset) - 1], q, n);
        }
        mgr.release(searcher);
        return result;
    }

    /**
     * Suggest dictionary values
     * 
     * @param categoryName the category name
     * @param input the string to search
     * @return all dictionary values containing the input string
     */
    public Set<String> suggestValues(String categoryName, String input) {
        return suggestValues(categoryName, input, 100);
    }

    /**
     * Suggest dictionary values
     * 
     * @param categoryName the category name
     * @param input the string to search
     * @param num number of results
     * @return all dictionary values containing the input string
     */
    public Set<String> suggestValues(String categoryName, String input, int num) {
        if (input != null) {
            final String trimmedInput = input.trim();
            if (trimmedInput.length() >= 2) {
                Set<String> values = doSuggestValues(categoryName, trimmedInput, num, true);
                if (values.isEmpty()) {
                    return doSuggestValues(categoryName, trimmedInput, num, false);
                } else {
                    return values;
                }
            }
        }
        return Collections.emptySet();
    }

    private Set<String> doSuggestValues(String categoryName, String input, int num, boolean isPrefixSearch) {
        String jointInput = DictionarySearcher.getJointTokens(input);
        String queryString = isPrefixSearch ? jointInput + "*" : "*" + jointInput + "*";

        final BooleanQuery booleanQuery = new BooleanQuery();
        final Query catQuery = new TermQuery(new Term(DictionarySearcher.F_WORD, categoryName));
        booleanQuery.add(catQuery, BooleanClause.Occur.MUST);
        final Query wildcardQuery = new WildcardQuery(new Term(DictionarySearcher.F_SYNTERM, queryString));
        booleanQuery.add(wildcardQuery, BooleanClause.Occur.MUST);

        Set<String> results = new TreeSet<String>();

        try {
            mgr.maybeRefresh();
            IndexSearcher searcher = mgr.acquire();
            IndexReader reader = searcher.getIndexReader();
            TopDocs topDocs = searcher.search(booleanQuery, num);
            mgr.release(searcher);
            for (int i = 0; i < topDocs.scoreDocs.length; i++) {
                Document doc = reader.document(topDocs.scoreDocs[i].doc);
                IndexableField[] fields = doc.getFields(DictionarySearcher.F_RAW);
                for (IndexableField f : fields) {
                    final String str = f.stringValue();
                    if (isPrefixSearch) {
                        if (StringUtils.startsWithIgnoreCase(str, input) || StringUtils
                                .startsWithIgnoreCase(DictionarySearcher.getJointTokens(str), jointInput)) {
                            results.add(str);
                        }
                    } else {// infix search
                        if (StringUtils.containsIgnoreCase(str, input) || StringUtils
                                .containsIgnoreCase(DictionarySearcher.getJointTokens(str), jointInput)) {
                            results.add(str);
                        }
                    }
                }
            }
        } catch (IOException e) {
            LOGGER.trace(e.getMessage(), e);
        }
        return results;
    }
}