edu.sabanciuniv.sentilab.sare.controllers.entitymanagers.LexiconBuilderController.java Source code

Java tutorial

Introduction

Here is the source code for edu.sabanciuniv.sentilab.sare.controllers.entitymanagers.LexiconBuilderController.java

Source

/*
 * Sentilab SARE: a Sentiment Analysis Research Environment
 * Copyright (C) 2013 Sabanci University Sentilab
 * http://sentilab.sabanciuniv.edu
 * 
 * This file is part of SARE.
 * 
 * SARE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *  
 * SARE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with SARE. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.sabanciuniv.sentilab.sare.controllers.entitymanagers;

import java.util.*;

import javax.persistence.*;

import org.apache.commons.lang3.Validate;

import com.google.common.base.Predicate;
import com.google.common.collect.*;

import edu.sabanciuniv.sentilab.sare.models.base.document.*;
import edu.sabanciuniv.sentilab.sare.models.base.documentStore.*;
import edu.sabanciuniv.sentilab.utils.*;
import edu.sabanciuniv.sentilab.utils.text.nlp.base.LinguisticToken;

/**
 * A controller for {@link LexiconBuilderDocumentStore} operations.
 * @author Mus'ab Husaini
 */
public class LexiconBuilderController extends PersistentDocumentStoreController {

    /**
     * Builds the query that selects the documents of a builder, ordered by descending weight.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} whose documents are selected.
     * @param seen when not {@code null}, only documents whose {@code flag} equals this value are selected;
     * {@code null} means no filtering.
     * @return the prepared (not yet executed) {@link TypedQuery}.
     */
    private TypedQuery<LexiconBuilderDocument> getDocumentsQuery(EntityManager em,
            LexiconBuilderDocumentStore builder, Boolean seen) {
        // The optional filter clause is spliced into the JPQL text; values are always bound as parameters.
        TypedQuery<LexiconBuilderDocument> query = em.createQuery(
                String.format("SELECT doc FROM LexiconBuilderDocument doc " + "WHERE doc.store=:builder %s "
                        + "ORDER BY doc.weight DESC", seen != null ? " AND doc.flag=:seen" : ""),
                LexiconBuilderDocument.class);
        query.setParameter("builder", builder);
        if (seen != null) {
            query.setParameter("seen", seen);
        }

        return query;
    }

    /**
     * Refreshes the state of the given builder based on its base corpus and adds any missing documents.
     * A new {@link LexiconBuilderDocument} is created, added to the builder and persisted for every
     * {@link FullTextDocument} of the corpus that the builder does not yet wrap.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to refresh; its corpus must not be {@code null}.
     * @return the supplied {@link LexiconBuilderDocumentStore} object.
     */
    public LexiconBuilderDocumentStore refreshBuilder(EntityManager em, LexiconBuilderDocumentStore builder) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(builder, CannedMessages.NULL_ARGUMENT, "builder");
        Validate.notNull(builder.getCorpus(), CannedMessages.NULL_ARGUMENT, "builder.corpus");

        // Select only the corpus documents that have no counterpart in this builder yet.
        TypedQuery<FullTextDocument> ftdQuery = em.createQuery("SELECT d FROM FullTextDocument d "
                + "WHERE d.store=:corpus "
                + "AND NOT EXISTS (SELECT bd FROM LexiconBuilderDocument bd WHERE bd.store=:builder AND bd.baseDocument=d)",
                FullTextDocument.class);

        ftdQuery.setParameter("corpus", builder.getCorpus()).setParameter("builder", builder);

        for (FullTextDocument document : ftdQuery.getResultList()) {
            LexiconBuilderDocument lbd = new LexiconBuilderDocument(document);
            builder.addDocument(lbd);
            em.persist(lbd);
        }
        return builder;
    }

    /**
     * Finds the builder associated with a given corpus and lexicon.
     * @param em the {@link EntityManager} to use.
     * @param corpus the {@link DocumentCorpus} being used to build the lexicon.
     * @param lexicon the {@link Lexicon} being built.
     * @return the {@link LexiconBuilderDocumentStore} object found, if any; {@code null} otherwise.
     */
    public LexiconBuilderDocumentStore findBuilder(EntityManager em, DocumentCorpus corpus, Lexicon lexicon) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(corpus, CannedMessages.NULL_ARGUMENT, "corpus");
        Validate.notNull(lexicon, CannedMessages.NULL_ARGUMENT, "lexicon");

        TypedQuery<LexiconBuilderDocumentStore> query = em.createQuery(
                "SELECT b FROM LexiconBuilderDocumentStore b "
                        + "WHERE b.baseStore=:corpus AND :lexicon MEMBER OF b.referencedObjects",
                LexiconBuilderDocumentStore.class);
        query.setMaxResults(1).setParameter("corpus", corpus).setParameter("lexicon", lexicon);

        return this.getSingleResult(query);
    }

    /**
     * Gets the documents being used to build the lexicon, ordered by descending weight.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to use.
     * @param seen whether to show only seen or unseen documents; {@code null} means no filtering.
     * @return the {@link List} of {@link LexiconBuilderDocument} objects.
     */
    public List<LexiconBuilderDocument> getDocuments(EntityManager em, LexiconBuilderDocumentStore builder,
            Boolean seen) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(builder, CannedMessages.NULL_ARGUMENT, "builder");

        return this.getDocumentsQuery(em, builder, seen).getResultList();
    }

    /**
     * Gets all the documents being used to build the lexicon, seen or not, ordered by descending weight.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to use.
     * @return the {@link List} of {@link LexiconBuilderDocument} objects.
     */
    public List<LexiconBuilderDocument> getDocuments(EntityManager em, LexiconBuilderDocumentStore builder) {
        return this.getDocuments(em, builder, null);
    }

    /**
     * Gets the document at the given rank, i.e. at the given zero-based offset in the weight-ordered
     * list of all documents of the builder.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to use.
     * @param rank the rank of the document. If {@code null}, this returns the same result as {@code getNextDocument}.
     * @return the {@link LexiconBuilderDocument} at the given rank, with its rank set; {@code null} if the
     * lookup yields no document.
     */
    public LexiconBuilderDocument getDocument(EntityManager em, LexiconBuilderDocumentStore builder, Long rank) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(builder, CannedMessages.NULL_ARGUMENT, "builder");

        if (rank == null) {
            return this.getNextDocument(em, builder);
        }

        TypedQuery<LexiconBuilderDocument> query = this.getDocumentsQuery(em, builder, null);
        query.setFirstResult(rank.intValue());
        query.setMaxResults(1);
        LexiconBuilderDocument doc = this.getSingleResult(query);
        // A rank past the end of the list produces no document; do not dereference it.
        if (doc != null) {
            doc.setRank(rank);
        }
        return doc;
    }

    /**
     * Gets the previously seen tokens for the given {@link LexiconBuilderDocumentStore}.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to use.
     * @return the {@link List} of {@link LexiconDocument} objects stored under the builder.
     */
    public List<LexiconDocument> getSeenTokens(EntityManager em, LexiconBuilderDocumentStore builder) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(builder, CannedMessages.NULL_ARGUMENT, "builder");

        TypedQuery<LexiconDocument> query = em
                .createQuery("SELECT doc FROM LexiconDocument doc WHERE doc.store=:builder", LexiconDocument.class);
        query.setParameter("builder", builder);

        return query.getResultList();
    }

    /**
     * Gets a value indicating whether the provided token has been previously seen or not.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to use.
     * @param token the token to look for; matched against the title of the stored tokens.
     * @return {@code true} if the token was seen; {@code false} otherwise.
     */
    public boolean isSeenToken(EntityManager em, LexiconBuilderDocumentStore builder, String token) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(builder, CannedMessages.NULL_ARGUMENT, "builder");

        TypedQuery<LexiconDocument> query = em.createQuery(
                "SELECT doc FROM LexiconDocument doc " + "WHERE doc.store=:builder AND doc.title=:token",
                LexiconDocument.class);
        // One row is enough to answer the question.
        query.setMaxResults(1).setParameter("builder", builder).setParameter("token", token);

        return !query.getResultList().isEmpty();
    }

    /**
     * Gets the unseen document with the next highest weight.
     * @param em the {@link EntityManager} to use.
     * @param builder the {@link LexiconBuilderDocumentStore} to use.
     * @return the {@link LexiconBuilderDocument} with the highest weight among unseen documents, with its
     * rank set to its position among all documents of the builder; {@code null} if there is no unseen document.
     */
    public LexiconBuilderDocument getNextDocument(EntityManager em, LexiconBuilderDocumentStore builder) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(builder, CannedMessages.NULL_ARGUMENT, "builder");

        // Execute the "top unseen" query exactly once and reuse its result.
        TypedQuery<LexiconBuilderDocument> unseenQuery = this.getDocumentsQuery(em, builder, false);
        unseenQuery.setMaxResults(1);
        List<LexiconBuilderDocument> unseen = unseenQuery.getResultList();
        if (unseen.size() != 1) {
            return null;
        }

        LexiconBuilderDocument document = unseen.get(0);
        // The rank is the position of the document in the full (seen and unseen) weight-ordered list.
        List<LexiconBuilderDocument> allDocuments = this.getDocumentsQuery(em, builder, null).getResultList();
        document.setRank((long) allDocuments.indexOf(document));
        return document;
    }

    /**
     * Sets the provided document and all tokens contained therein as having been seen.
     * Tokens are only recorded when the base document is a {@link FullTextDocument} and the store is a
     * {@link LexiconBuilderDocumentStore}; a token is recorded at most once, compared case-insensitively
     * against the tokens already stored for the builder.
     * @param em the {@link EntityManager} to use.
     * @param document the {@link LexiconBuilderDocument} object to mark as seen; its store and base document
     * must not be {@code null}.
     * @param seenTags a delimited list of POS tags to mark as seen; {@code null} means all tokens.
     * @return the supplied {@link LexiconBuilderDocument}.
     */
    public LexiconBuilderDocument setSeenDocument(EntityManager em, LexiconBuilderDocument document,
            String seenTags) {
        Validate.notNull(em, CannedMessages.NULL_ARGUMENT, "em");
        Validate.notNull(document, CannedMessages.NULL_ARGUMENT, "document");
        Validate.notNull(document.getStore(), CannedMessages.NULL_ARGUMENT, "document.store");
        Validate.notNull(document.getBaseDocument(), CannedMessages.NULL_ARGUMENT, "document.baseDocument");

        document.setSeen(true);

        if (document.getBaseDocument() instanceof FullTextDocument
                && document.getStore() instanceof LexiconBuilderDocumentStore) {
            // Work on a mutable copy so tokens created below are taken into account for the rest of the
            // document; otherwise a token occurring several times would be persisted several times.
            List<LexiconDocument> seenTokens = new ArrayList<LexiconDocument>(
                    this.getSeenTokens(em, (LexiconBuilderDocumentStore) document.getStore()));
            FullTextDocument ftDoc = (FullTextDocument) document.getBaseDocument();
            for (final LinguisticToken token : ftDoc.getParsedContent().getTokens()) {
                // When a tag filter is given, skip tokens whose POS tag does not match it.
                if (seenTags != null && !token.getPosTag().is(seenTags)) {
                    continue;
                }

                final String tokenText = token.toString();
                LexiconDocument seenToken = Iterables.find(seenTokens, new Predicate<LexiconDocument>() {
                    @Override
                    public boolean apply(LexiconDocument candidate) {
                        return candidate.getContent().equalsIgnoreCase(tokenText);
                    }
                }, null);

                if (seenToken == null) {
                    seenToken = (LexiconDocument) new LexiconDocument().setContent(tokenText)
                            .setStore(document.getStore());

                    em.persist(seenToken);
                    seenTokens.add(seenToken);
                }
            }
        }

        if (em.contains(document)) {
            em.refresh(document);
            // refresh() reloads the entity from the database and discards unflushed changes, so the
            // flag is re-applied to guarantee the document ends up marked as seen.
            // NOTE(review): assumes setSeen is idempotent - confirm against LexiconBuilderDocument.
            document.setSeen(true);
        } else {
            em.persist(document);
        }

        return document;
    }

    /**
     * Sets the provided document and all tokens contained therein as having been seen, regardless of
     * their POS tags.
     * @param em the {@link EntityManager} to use.
     * @param document the {@link LexiconBuilderDocument} object to mark as seen.
     * @return the supplied {@link LexiconBuilderDocument}.
     */
    public LexiconBuilderDocument setSeenDocument(EntityManager em, LexiconBuilderDocument document) {
        return this.setSeenDocument(em, document, null);
    }
}