com.joliciel.talismane.tokeniser.patterns.IntervalPatternTokeniser.java Source code

Introduction

Here is the source code for com.joliciel.talismane.tokeniser.patterns.IntervalPatternTokeniser.java.

Source

///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2012 Assaf Urieli
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane.  If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.talismane.tokeniser.patterns;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.joliciel.talismane.filters.FilterService;
import com.joliciel.talismane.filters.Sentence;
import com.joliciel.talismane.machineLearning.ClassificationObserver;
import com.joliciel.talismane.machineLearning.Decision;
import com.joliciel.talismane.machineLearning.DecisionMaker;
import com.joliciel.talismane.machineLearning.features.FeatureResult;
import com.joliciel.talismane.machineLearning.features.FeatureService;
import com.joliciel.talismane.machineLearning.features.RuntimeEnvironment;
import com.joliciel.talismane.tokeniser.TaggedToken;
import com.joliciel.talismane.tokeniser.Token;
import com.joliciel.talismane.tokeniser.TokenSequence;
import com.joliciel.talismane.tokeniser.Tokeniser;
import com.joliciel.talismane.tokeniser.TokeniserDecisionFactory;
import com.joliciel.talismane.tokeniser.TokeniserOutcome;
import com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence;
import com.joliciel.talismane.tokeniser.TokeniserService;
import com.joliciel.talismane.tokeniser.features.TokenFeatureService;
import com.joliciel.talismane.tokeniser.features.TokeniserContext;
import com.joliciel.talismane.tokeniser.features.TokeniserContextFeature;
import com.joliciel.talismane.tokeniser.filters.TokenFilter;
import com.joliciel.talismane.tokeniser.filters.TokenSequenceFilter;
import com.joliciel.talismane.tokeniser.filters.TokenPlaceholder;
import com.joliciel.talismane.utils.PerformanceMonitor;

/**
 * The interval pattern tokeniser first splits the text into individual tokens based on a list of separators,
 * and assigns each separator a default decision.
 * 
 * The tokeniser then takes a list of patterns, and for each pattern in the list, tries to match it to a sequence of tokens within the sentence.
 * If a match is found, the final decision for each token interval in the sequence is deferred to a decision maker.
 * If not, the default decisions are retained.
 * 
 * Overlapping sequences are handled gracefully: if a given interval is second in sequence A but first in sequence B, it will receive the
 * n-gram feature from sequence A and the contextual features from sequence B, and the final decision will be taken based on the
 * combination of all features. However, this can result in a strange compound that exists neither in any pattern nor in the training corpus.
 * 
 * The motivation for this pattern tokeniser is to concentrate training and decisions on the difficult cases, rather than diluting the
 * training model with countless obvious cases.
 * 
 * @author Assaf Urieli
 *
 */
class IntervalPatternTokeniser implements PatternTokeniser {
    private static final Log LOG = LogFactory.getLog(IntervalPatternTokeniser.class);
    private static final PerformanceMonitor MONITOR = PerformanceMonitor.getMonitor(IntervalPatternTokeniser.class);

    private static final DecimalFormat df = new DecimalFormat("0.0000");

    private DecisionMaker<TokeniserOutcome> decisionMaker;

    private TokeniserService tokeniserService;
    private TokeniserPatternService tokeniserPatternService;
    private TokenFeatureService tokenFeatureService;
    private FilterService filterService;
    private FeatureService featureService;

    private TokeniserPatternManager tokeniserPatternManager;
    private int beamWidth;
    private Set<TokeniserContextFeature<?>> tokeniserContextFeatures;
    private List<TokenSequenceFilter> tokenSequenceFilters = new ArrayList<TokenSequenceFilter>();

    private List<ClassificationObserver<TokeniserOutcome>> observers = new ArrayList<ClassificationObserver<TokeniserOutcome>>();

    private List<TokenFilter> tokenFilters = new ArrayList<TokenFilter>();
    private TokeniserDecisionFactory tokeniserDecisionFactory = new TokeniserDecisionFactory();

    /**
     * Constructs a tokeniser using the separator defaults and test patterns provided by the given
     * pattern manager, the features to test on matched intervals, and a beam width.
     */
    public IntervalPatternTokeniser(TokeniserPatternManager tokeniserPatternManager,
            Set<TokeniserContextFeature<?>> tokeniserContextFeatures, int beamWidth) {
        this.tokeniserPatternManager = tokeniserPatternManager;
        this.beamWidth = beamWidth;
        this.tokeniserContextFeatures = tokeniserContextFeatures;
    }

    @Override
    public List<TokenSequence> tokenise(String text) {
        Sentence sentence = filterService.getSentence();
        sentence.setText(text);
        return this.tokenise(sentence);
    }

    @Override
    public List<TokenSequence> tokenise(Sentence sentence) {
        List<TokenisedAtomicTokenSequence> decisionSequences = this.tokeniseWithDecisions(sentence);
        List<TokenSequence> tokenSequences = new ArrayList<TokenSequence>();
        for (TokenisedAtomicTokenSequence decisionSequence : decisionSequences) {
            tokenSequences.add(decisionSequence.inferTokenSequence());
        }
        return tokenSequences;
    }

    @Override
    public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(String text) {
        Sentence sentence = filterService.getSentence();
        sentence.setText(text);
        return this.tokeniseWithDecisions(sentence);
    }

    @Override
    public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(Sentence sentence) {
        MONITOR.startTask("tokeniseWithDecisions");
        try {
            // apply any pre-tokenisation decisions via filters
            // we only want one placeholder per start index - the first one that gets added
            Map<Integer, TokenPlaceholder> placeholderMap = new HashMap<Integer, TokenPlaceholder>();
            for (TokenFilter tokenFilter : this.tokenFilters) {
                Set<TokenPlaceholder> myPlaceholders = tokenFilter.apply(sentence.getText());
                for (TokenPlaceholder placeholder : myPlaceholders) {
                    if (!placeholderMap.containsKey(placeholder.getStartIndex())) {
                        placeholderMap.put(placeholder.getStartIndex(), placeholder);
                    }
                }
                if (LOG.isTraceEnabled()) {
                    if (myPlaceholders.size() > 0) {
                        LOG.trace("TokenFilter: " + tokenFilter);
                        LOG.trace("placeholders: " + myPlaceholders);
                    }
                }
            }

            Set<TokenPlaceholder> placeholders = new HashSet<TokenPlaceholder>(placeholderMap.values());

            // Initially, separate the sentence into tokens using the separators provided
            TokenSequence tokenSequence = this.tokeniserService.getTokenSequence(sentence, Tokeniser.SEPARATORS,
                    placeholders);

            // apply any pre-processing filters that have been added
            for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) {
                tokenSequenceFilter.apply(tokenSequence);
            }

            // Assign each separator its default value
            List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence);
            List<Decision<TokeniserOutcome>> defaultDecisions = new ArrayList<Decision<TokeniserOutcome>>(
                    defaultOutcomes.size());
            for (TokeniserOutcome outcome : defaultOutcomes) {
                Decision<TokeniserOutcome> tokeniserDecision = this.tokeniserDecisionFactory
                        .createDefaultDecision(outcome);
                tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName());
                tokeniserDecision.addAuthority("_" + "DefaultDecision");
                defaultDecisions.add(tokeniserDecision);
            }
            List<TokenisedAtomicTokenSequence> sequences = null;

            // For each test pattern, see if anything in the sentence matches it
            if (this.decisionMaker != null) {
                Set<Token> tokensToCheck = new HashSet<Token>();
                MONITOR.startTask("pattern matching");
                try {
                    for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) {
                        Set<Token> tokensToCheckForThisPattern = new HashSet<Token>();
                        List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(tokenSequence);
                        for (TokenPatternMatchSequence tokenPatternMatch : matchesForThisPattern) {
                            if (LOG.isTraceEnabled())
                                tokensToCheckForThisPattern.addAll(tokenPatternMatch.getTokensToCheck());
                            tokensToCheck.addAll(tokenPatternMatch.getTokensToCheck());
                        }
                        if (LOG.isTraceEnabled()) {
                            if (tokensToCheckForThisPattern.size() > 0) {
                                LOG.trace("Parsed pattern: " + parsedPattern);
                                LOG.trace("tokensToCheck: " + tokensToCheckForThisPattern);
                            }
                        }
                    }
                } finally {
                    MONITOR.endTask("pattern matching");
                }

                // we want to create the n most likely token sequences
                // the sequence has to correspond to a token pattern

                // initially create a heap with a single, empty sequence
                PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
                TokenisedAtomicTokenSequence emptySequence = this.getTokeniserService()
                        .getTokenisedAtomicTokenSequence(sentence, 0);
                heap.add(emptySequence);
                int i = 0;
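                // i tracks the index of the current atomic token, so that
                // defaultDecisions.get(i) stays aligned with the token being examined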
                for (Token token : tokenSequence.listWithWhiteSpace()) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Token : \"" + token.getText() + "\"");
                    }
                    // build a new heap for this iteration
                    PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap;
                    heap = new PriorityQueue<TokenisedAtomicTokenSequence>();

                    // limit the heap breadth to the beam width
                    int maxSequences = previousHeap.size() > this.getBeamWidth() ? this.getBeamWidth()
                            : previousHeap.size();
                    for (int j = 0; j < maxSequences; j++) {
                        TokenisedAtomicTokenSequence history = previousHeap.poll();

                        // Find the separating & non-separating decisions
                        List<Decision<TokeniserOutcome>> decisions = null;
                        if (tokensToCheck.contains(token)) {
                            // test the features on the current token
                            TokeniserContext context = new TokeniserContext(token, history);
                            List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>();
                            MONITOR.startTask("analyse features");
                            try {
                                for (TokeniserContextFeature<?> feature : tokeniserContextFeatures) {
                                    RuntimeEnvironment env = this.featureService.getRuntimeEnvironment();
                                    FeatureResult<?> featureResult = feature.check(context, env);
                                    if (featureResult != null) {
                                        tokenFeatureResults.add(featureResult);
                                    }
                                }

                                if (LOG.isTraceEnabled()) {
                                    for (FeatureResult<?> featureResult : tokenFeatureResults) {
                                        LOG.trace(featureResult.toString());
                                    }
                                }
                            } finally {
                                MONITOR.endTask("analyse features");
                            }

                            MONITOR.startTask("make decision");
                            try {
                                decisions = this.decisionMaker.decide(tokenFeatureResults);

                                for (ClassificationObserver<TokeniserOutcome> observer : this.observers)
                                    observer.onAnalyse(token, tokenFeatureResults, decisions);

                                for (Decision<TokeniserOutcome> decision : decisions) {
                                    decision.addAuthority(this.getClass().getSimpleName());
                                    for (TokenPatternMatch tokenMatch : token.getMatches()) {
                                        decision.addAuthority(tokenMatch.getPattern().toString());
                                    }
                                }
                            } finally {
                                MONITOR.endTask("make decision");
                            }
                        } else {
                            decisions = new ArrayList<Decision<TokeniserOutcome>>();
                            decisions.add(defaultDecisions.get(i));
                        }

                        MONITOR.startTask("heap sort");
                        try {
                            for (Decision<TokeniserOutcome> decision : decisions) {
                                TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService
                                        .getTaggedToken(token, decision);

                                TokenisedAtomicTokenSequence tokenisedSequence = this.getTokeniserService()
                                        .getTokenisedAtomicTokenSequence(history);
                                tokenisedSequence.add(taggedToken);
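                                // only statistically-made decisions are recorded against
                                // the sequence; default decisions are deterministic and
                                // carry no probability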
                                if (decision.isStatistical())
                                    tokenisedSequence.addDecision(decision);
                                heap.add(tokenisedSequence);
                            }
                        } finally {
                            MONITOR.endTask("heap sort");
                        }

                    } // next sequence in the old heap
                    i++;
                } // next token

                sequences = new ArrayList<TokenisedAtomicTokenSequence>();
                i = 0;
                while (!heap.isEmpty()) {
                    sequences.add(heap.poll());
                    i++;
                    if (i >= this.getBeamWidth())
                        break;
                }
            } else {
                sequences = new ArrayList<TokenisedAtomicTokenSequence>();
                TokenisedAtomicTokenSequence defaultSequence = this.getTokeniserService()
                        .getTokenisedAtomicTokenSequence(sentence, 0);
                int i = 0;
                for (Token token : tokenSequence.listWithWhiteSpace()) {
                    TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService.getTaggedToken(token,
                            defaultDecisions.get(i++));
                    defaultSequence.add(taggedToken);
                }
                sequences.add(defaultSequence);
            } // have decision maker?

            LOG.debug("####Final token sequences:");
            int j = 1;
            for (TokenisedAtomicTokenSequence sequence : sequences) {
                TokenSequence newTokenSequence = sequence.inferTokenSequence();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Token sequence " + (j++) + ", score=" + df.format(sequence.getScore()));
                    LOG.debug("Atomic sequence: " + sequence);
                    LOG.debug("Resulting sequence: " + newTokenSequence);
                }
                // need to re-apply the pre-processing filters, because the tokens are all new
                // Question: why can't we conserve the initial tokens when they haven't changed at all?
                // Answer: because the tokenSequence and index in the sequence is referenced by the token.
                // Question: should we create a separate class, Token and TokenInSequence,
                // one with index & sequence access & one without?
                for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) {
                    tokenSequenceFilter.apply(newTokenSequence);
                }
                if (LOG.isDebugEnabled()) {
                    LOG.debug("After filters: " + newTokenSequence);
                }
            }

            return sequences;
        } finally {
            MONITOR.endTask("tokeniseWithDecisions");
        }
    }

    /**
     * The test patterns - only token sequences matching these patterns will
     * be submitted for further decision.
     * @return the test patterns provided by the pattern manager
     */
    public List<String> getTestPatterns() {
        return this.getTokeniserPatternManager().getTestPatterns();
    }

    /**
     * The decision maker which decides any separators within token
     * sub-sequences that need further testing.
     * @return the decision maker, or null if only default decisions should be applied
     */
    public DecisionMaker<TokeniserOutcome> getDecisionMaker() {
        return decisionMaker;
    }

    public void setDecisionMaker(DecisionMaker<TokeniserOutcome> decisionMaker) {
        this.decisionMaker = decisionMaker;
    }

    public TokeniserService getTokeniserService() {
        return tokeniserService;
    }

    public void setTokeniserService(TokeniserService tokeniserService) {
        this.tokeniserService = tokeniserService;
    }

    public TokeniserPatternManager getTokeniserPatternManager() {
        return tokeniserPatternManager;
    }

    public void setTokeniserPatternManager(TokeniserPatternManager tokeniserPatternManager) {
        this.tokeniserPatternManager = tokeniserPatternManager;
    }

    public int getBeamWidth() {
        return beamWidth;
    }

    public TokeniserPatternService getTokeniserPatternService() {
        return tokeniserPatternService;
    }

    public void setTokeniserPatternService(TokeniserPatternService tokeniserPatternService) {
        this.tokeniserPatternService = tokeniserPatternService;
    }

    public TokenFeatureService getTokenFeatureService() {
        return tokenFeatureService;
    }

    public void setTokenFeatureService(TokenFeatureService tokenFeatureService) {
        this.tokenFeatureService = tokenFeatureService;
    }

    /**
     * Filters to be applied to the atomic token sequence prior to pattern matching,
     * and re-applied to each token sequence inferred from the final decisions.
     * @return the token sequence filters
     */
    public List<TokenSequenceFilter> getTokenSequenceFilters() {
        return tokenSequenceFilters;
    }

    public void addTokenSequenceFilter(TokenSequenceFilter tokenSequenceFilter) {
        this.tokenSequenceFilters.add(tokenSequenceFilter);
    }

    @Override
    public void addObserver(ClassificationObserver<TokeniserOutcome> observer) {
        this.observers.add(observer);
    }

    public List<TokenFilter> getTokenFilters() {
        return tokenFilters;
    }

    public void addTokenFilter(TokenFilter tokenFilter) {
        this.tokenFilters.add(tokenFilter);
    }

    public FilterService getFilterService() {
        return filterService;
    }

    public void setFilterService(FilterService filterService) {
        this.filterService = filterService;
    }

    public FeatureService getFeatureService() {
        return featureService;
    }

    public void setFeatureService(FeatureService featureService) {
        this.featureService = featureService;
    }

}
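
Example

Below is a minimal usage sketch for the class above. It is not part of the original source: the wiring shown is an assumption (in Talismane these collaborators are normally obtained through the framework's configuration and service locators), and the helper class IntervalPatternTokeniserUsageSketch and its parameters are hypothetical. Only the constructor, setters and methods visible in the listing are called; since IntervalPatternTokeniser is package-private, such a helper would have to live in the same package.

package com.joliciel.talismane.tokeniser.patterns;

import java.util.List;
import java.util.Set;

import com.joliciel.talismane.filters.FilterService;
import com.joliciel.talismane.machineLearning.DecisionMaker;
import com.joliciel.talismane.machineLearning.features.FeatureService;
import com.joliciel.talismane.tokeniser.TokenSequence;
import com.joliciel.talismane.tokeniser.TokeniserOutcome;
import com.joliciel.talismane.tokeniser.TokeniserService;
import com.joliciel.talismane.tokeniser.features.TokeniserContextFeature;

// Hypothetical helper, for illustration only.
public class IntervalPatternTokeniserUsageSketch {
    public static List<TokenSequence> tokenise(
            TokeniserPatternManager patternManager,
            Set<TokeniserContextFeature<?>> contextFeatures,
            TokeniserService tokeniserService,
            FilterService filterService,
            FeatureService featureService,
            DecisionMaker<TokeniserOutcome> decisionMaker,
            String text) {
        // beam width 10: keep the 10 highest-scoring atomic sequences at each token
        IntervalPatternTokeniser tokeniser =
                new IntervalPatternTokeniser(patternManager, contextFeatures, 10);
        // collaborators used by tokenise()/tokeniseWithDecisions() above
        tokeniser.setTokeniserService(tokeniserService);
        tokeniser.setFilterService(filterService);
        tokeniser.setFeatureService(featureService);
        // optional: with no decision maker, every token keeps its default outcome
        tokeniser.setDecisionMaker(decisionMaker);
        return tokeniser.tokenise(text);
    }
}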