com.mothsoft.alexis.dao.DocumentDaoImpl.java Source code

Introduction

Here is the source code for com.mothsoft.alexis.dao.DocumentDaoImpl.java, a Spring @Repository implementing DocumentDao that combines JPA persistence with Hibernate Search (Lucene 3.5) full-text queries over Document entities.

Source

/*   Copyright 2012 Tim Garrett, Mothsoft LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.mothsoft.alexis.dao;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.persistence.EntityManager;
import javax.persistence.LockModeType;
import javax.persistence.PersistenceContext;
import javax.persistence.Query;

import org.apache.commons.lang.time.StopWatch;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Version;
import org.hibernate.CacheMode;
import org.hibernate.QueryException;
import org.hibernate.ScrollMode;
import org.hibernate.ScrollableResults;
import org.hibernate.Session;
import org.hibernate.search.FullTextQuery;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.hibernate.search.SearchFactory;
import org.hibernate.search.indexes.IndexReaderAccessor;
import org.springframework.stereotype.Repository;

import com.mothsoft.alexis.domain.DataRange;
import com.mothsoft.alexis.domain.Document;
import com.mothsoft.alexis.domain.DocumentContent;
import com.mothsoft.alexis.domain.DocumentScore;
import com.mothsoft.alexis.domain.DocumentState;
import com.mothsoft.alexis.domain.DocumentTerm;
import com.mothsoft.alexis.domain.Edge;
import com.mothsoft.alexis.domain.Graph;
import com.mothsoft.alexis.domain.ImportantNamedEntity;
import com.mothsoft.alexis.domain.ImportantTerm;
import com.mothsoft.alexis.domain.Node;
import com.mothsoft.alexis.domain.SortOrder;
import com.mothsoft.alexis.domain.StopWords;
import com.mothsoft.alexis.domain.TFIDF;
import com.mothsoft.alexis.domain.TopicDocument;
import com.mothsoft.alexis.security.CurrentUserUtil;

@Repository
public class DocumentDaoImpl implements DocumentDao {

    private static final Logger logger = Logger.getLogger(DocumentDaoImpl.class);
    private static final DocumentState ANY_DOCUMENT_STATE = null;
    private static final Date NO_DATE = null;
    private static final String CONTENT_TEXT_FIELD_NAME = "content.text";

    @PersistenceContext
    private EntityManager em;

    public DocumentDaoImpl() {
    }

    public void setEm(final EntityManager em) {
        this.em = em;
    }

    public void add(final Document document) {
        this.em.persist(document);
    }

    public void add(final DocumentContent content) {
        this.em.persist(content);
    }

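    // Transitions every document currently in queryState to nextState. The
    // entities are loaded and mutated in memory, so the changes are written by
    // the persistence context's dirty checking at flush time rather than by a
    // single bulk UPDATE statement.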
    public void bulkUpdateDocumentState(DocumentState queryState, DocumentState nextState) {

        final Query query = this.em
                .createQuery("SELECT d FROM Document d WHERE d.intState = :queryState ORDER BY d.id ASC");
        query.setParameter("queryState", queryState.getValue());

        @SuppressWarnings("unchecked")
        final List<Document> documents = query.getResultList();

        for (final Document document : documents) {
            document.setState(nextState);
        }
    }

    public Document findByUrl(final String url) {
        final Query query = this.em.createQuery("FROM Document WHERE url = :url");
        query.setParameter("url", url);

        @SuppressWarnings("unchecked")
        final List<Document> results = query.getResultList();

        // return the document only on an exact single match; zero or multiple
        // matches yield null
        if (results.size() == 1) {
            return results.get(0);
        }

        return null;
    }

    public Document get(final Long id) {
        if (CurrentUserUtil.isSystem()) {
            return this.em.find(Document.class, id);
        } else {
            final Long userId = CurrentUserUtil.getCurrentUserId();

            final Query query = this.em
                    .createQuery("select d from Document d inner join d.documentUsers du inner join du.user user "
                            + "where user.id = :userId and d.id = :docId");
            query.setParameter("userId", userId);
            query.setParameter("docId", id);

            final Document document = (Document) query.getSingleResult();
            return document;
        }
    }

    public void update(final Document document) {
        this.em.merge(document);
    }

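    // Lists a user's documents, newest first, by delegating to the full-text
    // search path and unwrapping the DocumentScore holders it returns.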
    public DataRange<Document> listDocumentsByOwner(final Long userId, final int first, final int count) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final SortOrder sortOrder = SortOrder.DATE_DESC;
        final DataRange<DocumentScore> scoredRange = this.searchWithAllOptions(userId, false, null, null, sortOrder,
                null /* ignore start date */, null /* ignore end date */, first, count);

        final List<Document> range = new ArrayList<Document>(scoredRange.getRange().size());

        for (final DocumentScore scoredDoc : scoredRange.getRange()) {
            range.add(scoredDoc.getDocument());
        }

        final DataRange<Document> dataRange = new DataRange<Document>(range, scoredRange.getFirstRow(),
                scoredRange.getTotalRowsAvailable());

        stopWatch.stop();
        logger.debug(stopWatch.toString());

        return dataRange;
    }

    public DataRange<Document> listDocumentsInTopicsByOwner(final Long userId, final int first, final int count) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final SortOrder sortOrder = SortOrder.DATE_DESC;
        final DataRange<DocumentScore> scoredRange = this.searchWithAllOptions(userId, true,
                DocumentState.MATCHED_TO_TOPICS, null, sortOrder, null, null, first, count);

        final List<Document> range = new ArrayList<Document>(scoredRange.getRange().size());

        for (final DocumentScore scoredDoc : scoredRange.getRange()) {
            range.add(scoredDoc.getDocument());
        }

        final DataRange<Document> dataRange = new DataRange<Document>(range, scoredRange.getFirstRow(),
                scoredRange.getTotalRowsAvailable());

        stopWatch.stop();
        logger.debug(stopWatch.toString());

        return dataRange;
    }

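    // Claims the oldest document in the given state, marking it locked via
    // Document.lock() and merging that change back to the database.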
    public Document findAndLockOneDocument(final DocumentState state) {
        final Query query = this.em.createQuery("from Document where intState = :state order by id asc");
        query.setParameter("state", state.getValue());
        query.setMaxResults(1);

        @SuppressWarnings("unchecked")
        final List<Document> results = query.getResultList();

        if (results.isEmpty()) {
            return null;
        }

        final Document document = results.get(0);
        document.lock();
        this.em.merge(document);
        return document;
    }

    public List<ImportantTerm> getImportantTerms(Long userId, Date startDate, Date endDate, int count,
            boolean filterStopWords) {
        final FullTextQuery fullTextQuery = this.buildFullTextQuery(null, userId, startDate, endDate, false,
                ANY_DOCUMENT_STATE, FullTextQuery.DOCUMENT_ID);
        return getImportantTerms(fullTextQuery, count, filterStopWords);
    }

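    // Aggregates counts and TF-IDF scores for every term in the documents the
    // Lucene query matched, reading each document's stored term frequency
    // vector, then returns the top terms sorted by TF-IDF, each carrying the
    // maximum TF-IDF seen for normalization.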
    @SuppressWarnings("unchecked")
    private List<ImportantTerm> getImportantTerms(FullTextQuery fullTextQuery, int count, boolean filterStopWords) {
        final long start = System.currentTimeMillis();
        final List<Object[]> results = fullTextQuery.list();
        final LinkedHashMap<String, Tuple<Integer, Float>> termCountMap = new LinkedHashMap<String, Tuple<Integer, Float>>();

        final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate());
        final SearchFactory searchFactory = fullTextSession.getSearchFactory();
        final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor();
        final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class);
        final IndexSearcher searcher = new IndexSearcher(reader);

        final List<ImportantTerm> importantTerms;
        final int numDocs;
        try {
            numDocs = reader.numDocs();
            Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME);

            if (logger.isDebugEnabled()) {
                logger.debug(String.format("Found %d matching Lucene documents of %d in reader", results.size(),
                        numDocs));
            }

            // loop over all the matching documents
            for (final Object[] ith : results) {
                int docId = ((Number) ith[0]).intValue();
                final TermFreqVector tfv = reader.getTermFreqVector(docId, CONTENT_TEXT_FIELD_NAME);

                if (tfv == null) {
                    continue;
                }

                final String[] terms = tfv.getTerms();
                final int[] freqs = tfv.getTermFrequencies();

                // total document size
                int size = 0;

                for (int freq : freqs) {
                    size += freq;
                }

                if (logger.isDebugEnabled()) {
                    logger.debug(
                            String.format("Lucene document %d has %d terms, to be merged with running count %d",
                                    docId, size, termCountMap.size()));
                }

                // loop over the terms and aggregate the counts and tf-idf
                for (int i = 0; i < terms.length; i++) {
                    final String term = terms[i];
                    final int termCount = freqs[i];

                    // indexed loop keeps terms and frequencies aligned even when
                    // a term is skipped; stop words are dropped only on request
                    if (filterStopWords && StopWords.ENGLISH.contains(term)) {
                        continue;
                    }

                    luceneTerm = luceneTerm.createTerm(term);

                    final Tuple<Integer, Float> countScore;
                    if (termCountMap.containsKey(term)) {
                        countScore = termCountMap.get(term);
                        countScore.t1 += termCount;
                        countScore.t2 += (TFIDF.score(term, termCount, size, numDocs,
                                searcher.docFreq(luceneTerm)));
                    } else {
                        countScore = new Tuple<Integer, Float>();
                        countScore.t1 = termCount;
                        countScore.t2 = (TFIDF.score(term, termCount, size, numDocs, searcher.docFreq(luceneTerm)));
                        termCountMap.put(term, countScore);
                    }
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Completed Lucene document processing.");
            }

            importantTerms = new ArrayList<ImportantTerm>(termCountMap.size());

            // find max TF-IDF
            float maxTfIdf = 0.0f;
            for (final Tuple<Integer, Float> ith : termCountMap.values()) {
                if (ith.t2 > maxTfIdf) {
                    maxTfIdf = ith.t2;
                }
            }

            for (final Map.Entry<String, Tuple<Integer, Float>> entry : termCountMap.entrySet()) {
                final int ithCount = entry.getValue().t1;
                final float ithTfIdf = entry.getValue().t2;
                importantTerms.add(new ImportantTerm(entry.getKey(), ithCount, ithTfIdf, maxTfIdf));
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Completed term aggregation, will clear term map");
            }

            termCountMap.clear();

        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                searcher.close();
            } catch (IOException e) {
                logger.warn("Failed to close searcher: " + e, e);
            }
            ira.close(reader);
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Sorting terms");
        }

        // sort by TF-IDF, descending
        Collections.sort(importantTerms, new Comparator<ImportantTerm>() {
            @Override
            public int compare(ImportantTerm term1, ImportantTerm term2) {
                return term2.getTfIdf().compareTo(term1.getTfIdf());
            }
        });

        if (logger.isDebugEnabled()) {
            logger.debug("Term sort complete");
        }

        if (importantTerms.isEmpty() || importantTerms.size() < count) {
            if (logger.isDebugEnabled()) {
                logger.debug("Will return full list.");
            }
            logger.debug("Timer: " + (System.currentTimeMillis() - start));
            return importantTerms;
        } else {
            if (logger.isDebugEnabled()) {
                logger.debug(
                        "Will return sublist containing " + count + " of " + importantTerms.size() + " terms.");
            }

            logger.debug("Timer: " + (System.currentTimeMillis() - start));
            return importantTerms.subList(0, count);
        }
    }

    @SuppressWarnings("unchecked")
    public List<ImportantTerm> getImportantTerms(Long documentId, int howMany, boolean filterStopWords) {

        final Query query;

        if (filterStopWords) {
            query = this.em.createQuery("select dt from DocumentTerm dt join dt.document d join dt.term t "
                    + " where d.id = :documentId and t.valueLowercase NOT IN (:stopWords) "
                    + "   and dt.tfIdf is not null order by dt.tfIdf DESC");
            query.setParameter("stopWords", StopWords.ENGLISH);
        } else {
            query = this.em.createQuery("select dt from DocumentTerm dt join dt.document d join dt.term t "
                    + " where d.id = :documentId and dt.tfIdf is not null order by dt.tfIdf DESC");
        }
        query.setParameter("documentId", documentId);
        query.setMaxResults(howMany);

        final List<DocumentTerm> documentTerms = query.getResultList();

        float maxTfIdf = -1.0f;

        for (final DocumentTerm documentTerm : documentTerms) {
            if (documentTerm.getTfIdf() > maxTfIdf) {
                maxTfIdf = documentTerm.getTfIdf();
            }
        }

        final List<ImportantTerm> importantTerms = new ArrayList<ImportantTerm>(documentTerms.size());

        for (final DocumentTerm documentTerm : documentTerms) {
            importantTerms.add(new ImportantTerm(documentTerm.getTerm().getValueLowercase(),
                    documentTerm.getCount(), documentTerm.getTfIdf(), maxTfIdf));
        }

        return importantTerms;
    }

    public List<Document> listTopDocuments(Long userId, Date startDate, Date endDate, int count) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final Query query = this.em
                .createQuery("select d from Topic topic join topic.topicDocuments td join td.document d "
                        + " where topic.userId = :userId "
                        + "   and td.creationDate > :startDate and td.creationDate < :endDate "
                        + "   and td.score > 0.2 "
                        + " order by td.score desc");
        query.setParameter("userId", userId);
        query.setParameter("startDate", startDate);
        query.setParameter("endDate", endDate);
        query.setFirstResult(0);
        query.setMaxResults(count);

        query.setLockMode(LockModeType.NONE);

        @SuppressWarnings("unchecked")
        final List<Document> range = query.getResultList();

        stopWatch.stop();
        logger.debug(stopWatch.toString());

        return range;
    }

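    // Streams search results as a forward-only, read-only cursor with a small
    // fetch size and caching disabled, suited to batch jobs that walk large
    // result sets without exhausting memory.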
    @Override
    public ScrollableResults scrollableSearch(Long userId, DocumentState state, String queryString,
            SortOrder sortOrder, Date startDate, Date endDate) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final FullTextQuery fullTextQuery = this.buildFullTextQuery(queryString, userId, startDate, endDate, false,
                state, FullTextQuery.THIS, FullTextQuery.SCORE);

        final Sort sort;
        switch (sortOrder) {
        case DATE_ASC:
            sort = new Sort(new SortField("id", SortField.LONG));
            break;
        case DATE_DESC:
            sort = new Sort(new SortField("id", SortField.LONG, true));
            break;
        case RELEVANCE:
            sort = new Sort(SortField.FIELD_SCORE, new SortField("id", SortField.LONG, true));
            break;
        default:
            throw new IllegalArgumentException("Unexpected SortOrder: " + sortOrder.name());
        }
        fullTextQuery.setSort(sort);

        fullTextQuery.setFetchSize(50);
        fullTextQuery.setReadOnly(true);
        fullTextQuery.setCacheable(false);
        fullTextQuery.setCacheMode(CacheMode.IGNORE);

        final ScrollableResults result = fullTextQuery.scroll(ScrollMode.FORWARD_ONLY);

        stopWatch.stop();
        logger.debug(stopWatch.toString());

        return result;
    }

    public DataRange<DocumentScore> searchByOwnerAndExpression(Long userId, String queryString, SortOrder sortOrder,
            Date startDate, Date endDate, int first, int count) {
        final boolean requireTopicsForUser = false;
        return searchWithAllOptions(userId, requireTopicsForUser, null, queryString, sortOrder, startDate, endDate,
                first, count);
    }

    public int searchResultCount(Long userId, DocumentState state, String queryString, Date startDate,
            Date endDate) {
        final DataRange<DocumentScore> range = searchByOwnerAndStateAndExpression(userId, state, queryString,
                startDate, endDate, 0, 1);
        return range.getTotalRowsAvailable();
    }

    public DataRange<DocumentScore> searchByOwnerAndStateAndExpression(Long userId, DocumentState state,
            String queryString, Date startDate, Date endDate, int first, int count) {
        final boolean requireTopicsForUser = false;
        return searchWithAllOptions(userId, requireTopicsForUser, state, queryString, null /* default */, startDate,
                endDate, first, count);
    }

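    // The core search path shared by the list/search methods above: projects
    // each matching entity together with its relevance score and applies the
    // requested sort (relevance plus id as a tie-breaker by default).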
    private DataRange<DocumentScore> searchWithAllOptions(final Long userId, final boolean requireTopicsForUser,
            final DocumentState state, final String queryString, final SortOrder sortOrder, final Date startDate,
            final Date endDate, final int first, final int count) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final FullTextQuery fullTextQuery = this.buildFullTextQuery(queryString, userId, startDate, endDate,
                requireTopicsForUser, state, FullTextQuery.THIS, FullTextQuery.SCORE);

        fullTextQuery.setFirstResult(first);
        fullTextQuery.setMaxResults(count);

        // optional sort order
        if (sortOrder == null || sortOrder == SortOrder.RELEVANCE) {
            final Sort defaultSort = new Sort(SortField.FIELD_SCORE, new SortField("id", SortField.LONG, true));
            fullTextQuery.setSort(defaultSort);
        } else if (sortOrder == SortOrder.DATE_DESC) {
            final Sort sort = new Sort(new SortField("creationDate", SortField.LONG, true));
            fullTextQuery.setSort(sort);
        } else if (sortOrder == SortOrder.DATE_ASC) {
            final Sort sort = new Sort(new SortField("creationDate", SortField.LONG));
            fullTextQuery.setSort(sort);
        }

        @SuppressWarnings("unchecked")
        final List<Object[]> results = fullTextQuery.list();
        final List<DocumentScore> range = new ArrayList<DocumentScore>(results.size());

        // copy to DocumentScore holder objects
        for (final Object[] ith : results) {
            final Document ithDoc = (Document) ith[0];
            final Float ithScore = (Float) ith[1];
            range.add(new DocumentScore(ithDoc, ithScore));
        }

        final int totalRows = fullTextQuery.getResultSize();
        final DataRange<DocumentScore> result = new DataRange<DocumentScore>(range, first, totalRows);

        stopWatch.stop();
        logger.debug(stopWatch.toString());
        return result;
    }

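    // Composes the Lucene query: an optional parsed text query across title,
    // description, content text, and author, AND-ed with mandatory filters for
    // user visibility, creation-date range, topic ownership, and document state.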
    private FullTextQuery buildFullTextQuery(final String queryString, final Long userId, final Date startDate,
            final Date endDate, final boolean requireTopicsForUser, final DocumentState state,
            final String... projectionConstants) {
        final String[] fields = new String[] { "title", "description", CONTENT_TEXT_FIELD_NAME, "author" };
        final MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields,
                new StandardAnalyzer(Version.LUCENE_35));

        org.apache.lucene.search.BooleanQuery compositeQuery = new org.apache.lucene.search.BooleanQuery();

        if (queryString != null) {
            org.apache.lucene.search.Query luceneTextQuery;
            try {
                luceneTextQuery = parser.parse(queryString);
                compositeQuery.add(luceneTextQuery, Occur.MUST);
            } catch (ParseException e) {
                throw new QueryException(e);
            }
        }

        org.apache.lucene.search.Query luceneSecurityQuery = NumericRangeQuery.newLongRange("user", userId, userId,
                true, true);
        compositeQuery.add(luceneSecurityQuery, Occur.MUST);

        if (startDate != null || endDate != null) {
            final Long startMillis = startDate == null ? 0 : startDate.getTime();
            final Long endMillis = endDate == null ? Long.MAX_VALUE : endDate.getTime();
            org.apache.lucene.search.Query dateRangeQuery = NumericRangeQuery.newLongRange("creationDate",
                    startMillis, endMillis, true, true);
            compositeQuery.add(dateRangeQuery, Occur.MUST);
        }

        if (requireTopicsForUser) {
            org.apache.lucene.search.Query topicUserQuery = NumericRangeQuery.newLongRange("topicUser", userId,
                    userId, true, true);
            compositeQuery.add(topicUserQuery, Occur.MUST);
        }

        if (state != null) {
            final int stateInt = state.getValue();
            org.apache.lucene.search.Query stateQuery = NumericRangeQuery.newIntRange("state", stateInt, stateInt,
                    true, true);
            compositeQuery.add(stateQuery, Occur.MUST);
        }

        final Session session = (Session) this.em.getDelegate();
        final FullTextSession fullTextSession = Search.getFullTextSession(session);
        final FullTextQuery fullTextQuery = fullTextSession.createFullTextQuery(compositeQuery)
                .setProjection(projectionConstants);
        return fullTextQuery;
    }

    // FIXME - this arose after making documents shared by multiple users while
    // topics are still private. Users were seeing the names of other users'
    // topics. Tried filters, formulas, left joins, and about everything else I
    // could think of.
    // Left joins were especially troublesome as it seemed impossible to write
    // joins that would handle all 3 of the following scenarios:
    // 1.) No topics assigned to a document, 2.) No topics *for the current
    // user* assigned to a document, 3.) Topics assigned to the current user.
    // Invariably, one of these 3 would be broken.
    // This is more performant than most of the other options I considered, but
    // it still requires circumventing what a framework like Hibernate or JPA
    // seemingly ought to provide. It is also not lazy-loadable and should be
    // used with great care on large collections or on objects whose collections
    // may never be read.
    public List<TopicDocument> getTopicDocuments(final Long documentId) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final Long userId = CurrentUserUtil.getCurrentUserId();

        final Query query = this.em.createQuery("select td " + "from TopicDocument td join td.topic topic "
                + "where td.document.id = :documentId and topic.userId = :userId " + "order by td.score desc");
        query.setParameter("userId", userId);
        query.setParameter("documentId", documentId);
        @SuppressWarnings("unchecked")
        final List<TopicDocument> filteredTopicDocuments = (List<TopicDocument>) query.getResultList();

        stopWatch.stop();
        logger.debug(stopWatch.toString());
        return filteredTopicDocuments;
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * com.mothsoft.alexis.dao.DocumentDao#getRelatedTerms(java.lang.String,
     * java.lang.Long, int)
     */
    @SuppressWarnings("unchecked")
    public Graph getRelatedTerms(final String queryString, final Long userId, final int howMany) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        final FullTextQuery fullTextQuery = this.buildFullTextQuery(queryString, userId, NO_DATE, NO_DATE, false,
                DocumentState.MATCHED_TO_TOPICS, FullTextQuery.ID);

        // find the specified number of terms from the most recent 100 documents
        // that match the query
        final Sort sort = new Sort(new SortField("creationDate", SortField.LONG, true));
        fullTextQuery.setSort(sort);
        fullTextQuery.setFirstResult(0);
        fullTextQuery.setMaxResults(100);

        final List<Long> documentIds = new ArrayList<Long>(100);
        final List<Long> termIds = new ArrayList<Long>(100);

        final List<Object[]> results = fullTextQuery.list();

        for (final Object[] ith : results) {
            final Long id = (Long) ith[0];
            documentIds.add(id);
        }

        final Map<String, Node> nodes = new LinkedHashMap<String, Node>();
        final Node root = new Node(queryString, Boolean.TRUE);
        nodes.put(queryString, root);

        final Map<String, Edge> edges = new HashMap<String, Edge>();

        if (!documentIds.isEmpty()) {
            final Session session = (Session) this.em.getDelegate();
            final org.hibernate.SQLQuery termsQuery = session.createSQLQuery("SELECT term.id "
                    + "        FROM document_term dt INNER JOIN term on term.id = dt.term_id "
                    + "        WHERE dt.document_id IN (:documentIds) GROUP BY term.id ORDER BY SUM(dt.tf_idf) DESC");
            termsQuery.setParameterList("documentIds", documentIds);
            termsQuery.setMaxResults(100);
            termIds.addAll((List<Long>) termsQuery.list());
        }

        if (!documentIds.isEmpty() && !termIds.isEmpty()) {

            final Session session = (Session) this.em.getDelegate();
            final org.hibernate.SQLQuery associationsQuery = session.createSQLQuery(
                    "SELECT CONCAT(a.term_value) term_a_value, CONCAT(b.term_value) term_b_value, SUM(da.association_weight) sum_weight "
                            + "      FROM document_association da "
                            + "      INNER JOIN term a ON da.term_a_id = a.id "
                            + "        AND a.part_of_speech NOT IN (1, 3, 18, 19, 25, 39, 40) "
                            + "        AND length(a.term_value) > 2 "
                            + "      INNER JOIN term b ON da.term_b_id = b.id "
                            + "        AND b.part_of_speech NOT IN (1, 3, 18, 19, 25, 39, 40) "
                            + "        AND length(b.term_value) > 2 "
                            + "      WHERE da.document_id IN (:documentIds) AND (da.term_a_id IN (:termIds) OR da.term_b_id IN (:termIds)) "
                            + "      GROUP BY a.id, b.id ORDER BY sum_weight DESC");
            associationsQuery.setParameterList("documentIds", documentIds);
            associationsQuery.setParameterList("termIds", termIds);
            associationsQuery.setMaxResults(howMany);

            final List<Object[]> relatedTermsResults = associationsQuery.list();

            final Set<String> aNodeKeys = new HashSet<String>();
            final Set<String> bNodeKeys = new HashSet<String>();

            for (final Object[] ith : relatedTermsResults) {
                final String a = (String) ith[0];
                final String b = (String) ith[1];

                if (!nodes.containsKey(a)) {
                    final Node node = new Node(a);
                    nodes.put(a, node);
                }

                if (!nodes.containsKey(b)) {
                    final Node node = new Node(b);
                    nodes.put(b, node);
                }

                if (a.equals(b)) {
                    continue;
                }

                final String edgeKey = a + "||" + b;
                final String edgeKeyInverse = b + "||" + a;
                if (!edges.containsKey(edgeKey) && !edges.containsKey(edgeKeyInverse)) {
                    final Node nodeA = nodes.get(a);
                    final Node nodeB = nodes.get(b);

                    aNodeKeys.add(a);
                    bNodeKeys.add(b);

                    final Edge edge = new Edge(nodeA, nodeB);
                    edges.put(edgeKey, edge);
                }
            }

            // "orphan" handling, any b that is not also an a needs an edge from
            // root
            final Set<String> orphanKeys = new HashSet<String>();
            orphanKeys.addAll(bNodeKeys);
            orphanKeys.removeAll(aNodeKeys);

            for (final String orphanKey : orphanKeys) {
                final Node orphan = nodes.get(orphanKey);
                final Edge orphanToParent = new Edge(root, orphan);
                edges.put(root.getName() + "||" + orphan.getName(), orphanToParent);
            }
        }

        final List<Node> nodeList = new ArrayList<Node>(nodes.size());
        // keep root as first element
        nodes.remove(root.getName());
        nodeList.add(root);
        nodeList.addAll(nodes.values());

        final Graph graph = new Graph(nodeList, new ArrayList<Edge>(edges.values()));

        stopWatch.stop();
        logger.info("Related terms search took: " + stopWatch.toString());

        return graph;
    }

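    // Uses a JPQL constructor expression to aggregate named-entity counts over
    // a user's documents in the given date window, most frequent first.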
    @SuppressWarnings("unchecked")
    @Override
    public List<ImportantNamedEntity> getImportantNamedEntities(Long userId, Date startDate, Date endDate,
            int howMany) {
        final Query query = this.em
                .createQuery("SELECT NEW com.mothsoft.alexis.domain.ImportantNamedEntity(ne.name, sum(ne.count)) "
                        + "FROM DocumentNamedEntity ne JOIN ne.document document JOIN document.documentUsers documentUser "
                        + "WHERE document.creationDate >= :startDate AND document.creationDate <= :endDate AND documentUser.user.id = :userId "
                        + "GROUP BY ne.name ORDER BY sum(ne.count) DESC");
        query.setParameter("startDate", startDate);
        query.setParameter("endDate", endDate);
        query.setParameter("userId", userId);
        query.setMaxResults(howMany);
        return query.getResultList();
    }

    @SuppressWarnings("unchecked")
    @Override
    public List<ImportantNamedEntity> getImportantNamedEntitiesForDocument(Long documentId, int howMany) {
        final Query query = this.em
                .createQuery("SELECT NEW com.mothsoft.alexis.domain.ImportantNamedEntity(ne.name, sum(ne.count)) "
                        + "FROM DocumentNamedEntity ne JOIN ne.document document WHERE document.id = :documentId "
                        + "GROUP BY ne.name ORDER BY sum(ne.count) DESC");
        query.setParameter("documentId", documentId);
        query.setMaxResults(howMany);
        return query.getResultList();
    }

    // simple mutable pair used to accumulate a per-term (count, TF-IDF) total
    private static class Tuple<T1, T2> {
        public T1 t1;
        public T2 t2;
    }
}
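
Usage

For context, the sketch below shows how a caller might exercise this DAO. It is not part of the original source: DocumentSearchExample and its printFirstPage method are hypothetical, and Document.getId() and DocumentScore.getScore() are assumed accessors; the DAO calls themselves mirror the API in the listing above.

package com.mothsoft.alexis.example;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import com.mothsoft.alexis.dao.DocumentDao;
import com.mothsoft.alexis.domain.DataRange;
import com.mothsoft.alexis.domain.DocumentScore;
import com.mothsoft.alexis.domain.SortOrder;

@Service
public class DocumentSearchExample {

    @Autowired
    private DocumentDao documentDao;

    // Fetch the first page (rows 0-9) of relevance-ranked matches for a user,
    // with no restriction on either end of the date range.
    @Transactional(readOnly = true)
    public void printFirstPage(final Long userId, final String queryString) {
        final DataRange<DocumentScore> page = documentDao.searchByOwnerAndExpression(userId,
                queryString, SortOrder.RELEVANCE, null /* no start date */,
                null /* no end date */, 0, 10);

        for (final DocumentScore scored : page.getRange()) {
            // getScore() is assumed; getDocument() appears in the listing above
            System.out.println(scored.getDocument().getId() + ": " + scored.getScore());
        }
    }
}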