eu.project.ttc.models.index.MemoryTermIndex.java Source code

Introduction

Here is the source code for eu.project.ttc.models.index.MemoryTermIndex.java

Source

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.models.index;

import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.models.Document;
import eu.project.ttc.models.OccurrenceStore;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermBuilder;
import eu.project.ttc.models.TermClass;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.TermWord;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.index.selectors.TermSelector;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
import eu.project.ttc.utils.TermSuiteUtils;

/**
 * The in-memory implementation of a {@link TermIndex}.
 * 
 * @author Damien Cram
 *
 */
public class MemoryTermIndex implements TermIndex {
    private static final Logger LOGGER = LoggerFactory.getLogger(MemoryTermIndex.class);
    private static final String MSG_NO_SUCH_PROVIDER = "No such value provider: %s";
    public static final String MSG_NO_SUCH_TERM = "No such term in term index: %s";

    private static final String MEASURE_WR = "wr";
    private static final String MEASURE_WRLOG = "wrLog";
    private static final String MEASURE_FREQUENCY = "frequency";

    /**
     * The occurrence store 
     */
    private OccurrenceStore occurrenceStore;

    /*
     * The root index of terms. Variants must not be referenced at
     * this level of the index. They must be indexed from their
     * base term instead.
     */
    private Map<Integer, Term> termsById = Maps.newHashMap();
    private List<Term> rankedTerms = Lists.newArrayList();
    private Map<String, Term> termsByGroupingKey = Maps.newHashMap();
    private Map<String, CustomTermIndex> customIndexes = Maps.newHashMap();
    private Map<String, TermMeasure> termMeasures = Maps.newHashMap();
    private Map<String, Word> wordIndex = Maps.newHashMap();
    private Map<String, Document> documents = Maps.newHashMap();
    private Set<TermClass> termClasses = Sets.newHashSet();

    private String name;
    private Lang lang;
    private String corpusId;

    private int currentDocumentId = 0;
    private int currentId = 0;
    private int nbWordAnnotations = 0;
    private int nbSpottedTerms = 0;

    public MemoryTermIndex(String name, Lang lang, OccurrenceStore occurrenceStore) {
        this.lang = lang;
        this.name = name;
        this.occurrenceStore = occurrenceStore;
        this.termMeasures.put(MEASURE_WR, new WRMeasure(this));
        this.termMeasures.put(MEASURE_WRLOG, new WRLogMeasure(this));
        this.termMeasures.put(MEASURE_FREQUENCY, new FrequencyMeasure(this));
    }

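    /**
     * Adds a term to the index, registers it in every custom index,
     * and makes sure all of its words are present in the word index.
     */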
    @Override
    public void addTerm(Term term) {
        Preconditions.checkArgument(!this.termsByGroupingKey.containsKey(term.getGroupingKey()));
        Preconditions.checkNotNull(term.getId());
        Preconditions.checkArgument(!this.termsById.containsKey(term.getId()));

        this.termsByGroupingKey.put(term.getGroupingKey(), term);
        this.termsById.put(term.getId(), term);
        for (CustomTermIndex termIndex : this.customIndexes.values())
            termIndex.indexTerm(this, term);
        for (TermWord tw : term.getWords())
            privateAddWord(tw.getWord(), false);
    }

    @Override
    public void addWord(Word word) {
        privateAddWord(word, true);
    }

    private void privateAddWord(Word word, boolean failIfAlreadyPresent) {
        if (failIfAlreadyPresent)
            Preconditions.checkArgument(!this.wordIndex.containsKey(word.getLemma()));
        this.wordIndex.put(word.getLemma(), word);
    }

    @Override
    public Collection<Term> getTerms() {
        return Collections.unmodifiableCollection(this.termsByGroupingKey.values());
    }

    @Override
    public Word getWord(String wordId) {
        return this.wordIndex.get(wordId);
    }

    public Word addWord(WordAnnotation anno) {
        String swKey = anno.getLemma();
        Word word = this.wordIndex.get(swKey);
        if (word == null) {
            word = new Word(anno.getLemma(), anno.getStem());
            this.wordIndex.put(swKey, word);
        }
        return word;
    }

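    /**
     * Registers one spotted occurrence. The corresponding term is created
     * and added to the index on first encounter; the keepOccurrence flag
     * is forwarded to Term#addOccurrence.
     */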
    @Override
    public Term addTermOccurrence(TermOccAnnotation annotation, String fileUrl, boolean keepOccurrence) {
        this.nbSpottedTerms++;
        String termGroupingKey = TermSuiteUtils.getGroupingKey(annotation);
        Term term = this.termsByGroupingKey.get(termGroupingKey);
        if (term == null) {
            TermBuilder builder = TermBuilder.start(this);
            for (int i = 0; i < annotation.getWords().size(); i++) {
                WordAnnotation wa = annotation.getWords(i);
                Word w = this.addWord(wa);
                builder.addWord(w, annotation.getPattern(i));
            }
            builder.setSpottingRule(annotation.getSpottingRuleName());
            term = builder.createAndAddToIndex();
        }
        term.addOccurrence(new TermOccurrence(term, annotation.getCoveredText(), this.getDocument(fileUrl),
                annotation.getBegin(), annotation.getEnd()), keepOccurrence);
        return term;
    }

    @Override
    public Iterator<Term> singleWordTermIterator() {
        return new SingleMultiWordIterator(true);
    }

    @Override
    public Iterator<Term> multiWordTermIterator() {
        return new SingleMultiWordIterator(false);
    }

    @Override
    public Iterator<Term> compoundWordTermIterator() {
        return new CompoundIterator();
    }

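    /*
     * Base iterator over all terms currently stored in the index.
     */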
    private abstract class TermIterator extends AbstractIterator<Term> {
        protected Term t;
        protected Iterator<Term> it;

        private TermIterator() {
            super();
            this.it = MemoryTermIndex.this.termsByGroupingKey.values().iterator();
        }
    }

    private class SingleMultiWordIterator extends TermIterator {
        /*
         * Iterates over single-word terms when true, over multi-word
         * terms when false.
         */
        private boolean singleMultiWordToggle;

        private SingleMultiWordIterator(boolean singleMultiWordToggle) {
            super();
            this.singleMultiWordToggle = singleMultiWordToggle;
        }

        @Override
        protected Term computeNext() {
            while (it.hasNext()) {
                if ((t = it.next()).isSingleWord() == this.singleMultiWordToggle)
                    return t;
            }
            return endOfData();
        }
    }

    private class CompoundIterator extends TermIterator {

        @Override
        protected Term computeNext() {
            while (it.hasNext()) {
                if ((t = it.next()).isSingleWord() && t.isCompound())
                    return t;
            }
            return endOfData();
        }

    }

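    /**
     * Scans forward from the internal counter and returns the first
     * term id not yet present in the index.
     */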
    @Override
    public int newId() {
        while (this.termsById.containsKey(currentId))
            this.currentId++;
        return this.currentId;
    }

    //   @Override
    //   public TermBuilder newTerm(String termId) {
    //      return new TermBuilder(this).setGroupingKey(termId);
    //   }

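    /**
     * Returns the custom index registered under indexName, creating it
     * on the fly from the default TermValueProviders if it does not
     * exist yet.
     */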
    @Override
    public CustomTermIndex getCustomIndex(String indexName) {
        if (this.customIndexes.get(indexName) == null) {
            TermValueProvider valueProvider = TermValueProviders.get(indexName, this);
            createCustomIndex(indexName, valueProvider);
        }
        return this.customIndexes.get(indexName);
    }

    @Override
    public CustomTermIndex createCustomIndex(String indexName, TermValueProvider valueProvider) {
        //      Preconditions.checkArgument(
        //            !this.customIndexes.containsKey(indexName),
        //            String.format("Custom term index %s already exists.", indexName));
        Preconditions.checkArgument(valueProvider != null, MSG_NO_SUCH_PROVIDER, indexName);
        CustomTermIndexImpl customIndex = new CustomTermIndexImpl(valueProvider);
        this.customIndexes.put(indexName, customIndex);

        LOGGER.debug("Indexing {} terms to index {}", this.getTerms().size(), indexName);
        for (Term t : this.getTerms())
            customIndex.indexTerm(this, t);
        return customIndex;
    }

    @Override
    public void dropCustomIndex(String indexName) {
        this.customIndexes.remove(indexName);
    }

    @Override
    public Collection<Word> getWords() {
        return Collections.unmodifiableCollection(this.wordIndex.values());
    }

    @Override
    public Term getTermByGroupingKey(String groupingKey) {
        //      Preconditions.checkArgument(this.termsByGroupingKey.containsKey(groupingKey),MSG_NO_SUCH_TERM, groupingKey);
        return this.termsByGroupingKey.get(groupingKey);
    }

    @Override
    public Term getTermById(int id) {
        return this.termsById.get(id);
    }

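    /**
     * Removes from the word index every word that is no longer
     * referenced by any term of the index.
     */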
    @Override
    public void cleanOrphanWords() {
        Set<String> usedWordLemmas = Sets.newHashSet();
        for (Term t : getTerms()) {
            for (TermWord tw : t.getWords())
                usedWordLemmas.add(tw.getWord().getLemma());
        }
        Iterator<Entry<String, Word>> it = wordIndex.entrySet().iterator();
        Entry<String, Word> entry;
        while (it.hasNext()) {
            entry = it.next();
            if (!usedWordLemmas.contains(entry.getValue().getLemma()))
                it.remove();
        }
    }

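    /**
     * Removes the term from the main and custom indexes, detaches its
     * variation links, cleans context vectors, and drops its occurrences
     * from the occurrence store.
     */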
    @Override
    public void removeTerm(Term t) {

        removeTermOnly(t);
        occurrenceStore.removeTerm(t);
    }

    private void removeTermOnly(Term t) {
        termsByGroupingKey.remove(t.getGroupingKey());
        termsById.remove(t.getId());

        // remove from custom indexes
        for (CustomTermIndex customIndex : customIndexes.values())
            customIndex.removeTerm(this, t);

        // remove from variants
        Set<TermVariation> toRem = Sets.newHashSet();
        for (TermVariation v : t.getVariations())
            toRem.add(v);
        for (TermVariation v : toRem)
            t.removeTermVariation(v);

        // remove from bases
        toRem = Sets.newHashSet();
        for (TermVariation v : t.getBases())
            toRem.add(v);
        for (TermVariation v : toRem)
            v.getBase().removeTermVariation(v);

        /*
         * Removes the term from context vectors.
         * 
         * We assume that if this term has a context vector, then any
         * other term may have this term as a co-term, so all of them
         * must be checked for removal.
         */
        if (t.isContextVectorComputed()) {
            for (Term o : termsById.values()) {
                if (o.isContextVectorComputed())
                    o.getContextVector().removeCoTerm(t);
            }
        }
    }

    @Override
    public String getName() {
        return this.name;
    }

    @Override
    public int hashCode() {
        return this.name.hashCode();
    }

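    /**
     * Returns the Document registered for this url, creating and
     * registering a new one on first access.
     */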
    @Override
    public Document getDocument(String url) {
        Document document = documents.get(url);
        if (document == null) {
            document = new Document(currentDocumentId++, url);
            documents.put(url, document);
        }
        return document;
    }

    @Override
    public Collection<Document> getDocuments() {
        return this.documents.values();
    }

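    /**
     * Indexes every term occurrence within its source document, which is
     * required by the context generation step.
     */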
    @Override
    public void createOccurrenceIndex() {
        for (Term t : this.getTerms()) {
            for (TermOccurrence o : t.getOccurrences()) {
                /*
                 * Explicitly index all occurrences within each source document. The context
                 * generation would not work without this step.
                 * 
                 * FIXME Move these occurrence indexes inside the present AE (because the 
                 * indexes are never used anywhere else).
                 */
                o.getSourceDocument().indexTermOccurrence(o);
            }
        }
    }

    @Override
    public void clearOccurrenceIndex() {
        for (Document d : this.getDocuments())
            d.clearOccurrenceIndex();
    }

    @Override
    public String toString() {
        return MoreObjects.toStringHelper(this).addValue(name).add("terms", this.termsById.size()).toString();
    }

    @Override
    public Lang getLang() {
        return this.lang;
    }

    @Override
    public String getCorpusId() {
        return corpusId;
    }

    @Override
    public void setCorpusId(String corpusId) {
        this.corpusId = corpusId;
    }

    @Override
    public Collection<TermClass> getTermClasses() {
        return Collections.unmodifiableSet(termClasses);
    }

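    /**
     * Groups the given terms into a new TermClass headed by classHead
     * and links every member term back to that class.
     */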
    @Override
    public void classifyTerms(Term classHead, Iterable<Term> classTerms) {
        Preconditions.checkArgument(Iterables.contains(classTerms, classHead),
                "head must be contained in class terms");
        TermClass termClass = new TermClass(classHead, classTerms);
        this.termClasses.add(termClass);
        for (Term t2 : termClass)
            t2.setTermClass(termClass);
    }

    @Override
    public void setWordAnnotationsNum(int nbWordAnnotations) {
        this.nbWordAnnotations = nbWordAnnotations;
    }

    @Override
    public int getWordAnnotationsNum() {
        return this.nbWordAnnotations;
    }

    @Override
    public TermMeasure getWRMeasure() {
        return this.termMeasures.get(MEASURE_WR);
    }

    @Override
    public TermMeasure getWRLogMeasure() {
        return this.termMeasures.get(MEASURE_WRLOG);
    }

    @Override
    public TermMeasure getFrequencyMeasure() {
        return this.termMeasures.get(MEASURE_FREQUENCY);
    }

    @Override
    public Iterable<TermMeasure> getMeasures() {
        return this.termMeasures.values();
    }

    @Override
    public int getSpottedTermsNum() {
        return nbSpottedTerms;
    }

    @Override
    public void setSpottedTermsNum(int spottedTermsNum) {
        this.nbSpottedTerms = spottedTermsNum;
    }

    @Override
    public OccurrenceStore getOccurrenceStore() {
        return this.occurrenceStore;
    }

    @Override
    public void deleteMany(TermSelector selector) {
        List<Term> rem = Lists.newArrayList();
        for (Term t : termsById.values()) {
            if (selector.select(t))
                rem.add(t);
        }
        for (Term t : rem)
            removeTermOnly(t);
        occurrenceStore.deleteMany(selector);

    }

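    /**
     * Imports a processed CAS: counts its word annotations and turns every
     * TermOccAnnotation into a term occurrence of this index.
     */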
    @Override
    public void importCas(JCas cas, boolean keepOccurrence) {
        SourceDocumentInformation sdi = JCasUtils.getSourceDocumentAnnotation(cas).get();
        FSIterator<Annotation> iterator = cas.getAnnotationIndex().iterator();
        while (iterator.hasNext()) {
            Annotation anno = iterator.next();
            if (anno instanceof WordAnnotation) {
                this.nbWordAnnotations++;
            } else if (anno instanceof TermOccAnnotation) {
                addTermOccurrence((TermOccAnnotation) anno, sdi.getUri(), keepOccurrence);
            }
        }
    }
}
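
Example

The following sketch shows how the class above might be instantiated and queried. It is only an illustration, not part of the original source: the MemoryOccurrenceStore class and the Lang.EN constant are assumed to exist elsewhere in the TermSuite code base and may differ in the actual project, and the grouping key is a placeholder. In a real pipeline the index is populated through importCas(JCas, boolean) rather than by hand.

import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.models.OccurrenceStore;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.index.MemoryTermIndex;
// Assumed in-memory implementation of OccurrenceStore; the concrete
// class and package shipped with the project may differ.
import eu.project.ttc.models.occstore.MemoryOccurrenceStore;

public class MemoryTermIndexExample {

    public static void main(String[] args) {
        // Build an empty index for English backed by an in-memory occurrence store.
        OccurrenceStore store = new MemoryOccurrenceStore();
        MemoryTermIndex index = new MemoryTermIndex("example-index", Lang.EN, store);

        // Normally the index is filled by calling importCas(cas, true) for each
        // processed document; here we only show read access by grouping key.
        Term term = index.getTermByGroupingKey("n: energy"); // hypothetical grouping key
        if (term == null) {
            System.out.println("No such term in the index yet.");
        } else {
            System.out.println(term.getGroupingKey());
        }
    }
}

As the class name suggests, everything is held in plain in-memory hash maps, so lookups by id, grouping key, or lemma are simple map accesses with no persistence layer behind them.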