com.joliciel.talismane.tokeniser.patterns.CompoundPatternTokeniser.java Source code

Introduction

Here is the source code for com.joliciel.talismane.tokeniser.patterns.CompoundPatternTokeniser.java.
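
This class is package-private and wired together through setters, so in practice it is obtained and configured through Talismane's pattern tokeniser services rather than constructed directly. Purely as an illustrative sketch, assuming same-package access and pre-built dependencies (a TokeniserPatternManager, a set of TokenPatternMatchFeature instances, the various services, and a trained DecisionMaker, none of which are shown here), usage would look roughly like this:

CompoundPatternTokeniser tokeniser =
        new CompoundPatternTokeniser(patternManager, features, 10); // beam width of 10, chosen arbitrarily
tokeniser.setTokeniserService(tokeniserService);
tokeniser.setFilterService(filterService);
tokeniser.setFeatureService(featureService);
tokeniser.setDecisionMaker(decisionMaker); // if no decision maker is set, only default decisions are applied
List<TokenSequence> tokenSequences = tokeniser.tokenise("Text to tokenise.");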

Source

///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2012 Assaf Urieli
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane.  If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.talismane.tokeniser.patterns;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.joliciel.talismane.filters.FilterService;
import com.joliciel.talismane.filters.Sentence;
import com.joliciel.talismane.machineLearning.ClassificationObserver;
import com.joliciel.talismane.machineLearning.Decision;
import com.joliciel.talismane.machineLearning.DecisionMaker;
import com.joliciel.talismane.machineLearning.features.FeatureResult;
import com.joliciel.talismane.machineLearning.features.FeatureService;
import com.joliciel.talismane.machineLearning.features.RuntimeEnvironment;
import com.joliciel.talismane.tokeniser.TaggedToken;
import com.joliciel.talismane.tokeniser.Token;
import com.joliciel.talismane.tokeniser.TokenSequence;
import com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence;
import com.joliciel.talismane.tokeniser.Tokeniser;
import com.joliciel.talismane.tokeniser.TokeniserDecisionFactory;
import com.joliciel.talismane.tokeniser.TokeniserOutcome;
import com.joliciel.talismane.tokeniser.TokeniserService;
import com.joliciel.talismane.tokeniser.features.TokenFeatureService;
import com.joliciel.talismane.tokeniser.features.TokenPatternMatchFeature;
import com.joliciel.talismane.tokeniser.filters.TokenFilter;
import com.joliciel.talismane.tokeniser.filters.TokenPlaceholder;
import com.joliciel.talismane.tokeniser.filters.TokenSequenceFilter;
import com.joliciel.talismane.utils.PerformanceMonitor;

/**
 * The compound pattern tokeniser first splits the text into individual tokens based on a list of separators,
 * each separator being assigned a default join/separate decision.
 * 
 * The tokeniser then takes a list of patterns, and for each pattern in the list, tries to match it to a sequence of tokens within the sentence.
 * If a match is found, a join/separate decision is taken for the sequence as a whole. If not, the default decisions are retained.
 * However, to allow for rare overlapping sequences, if the join/separate decision would result in default decisions for the entire sequence,
 * we only mark the first interval in the sequence, and allow another pattern to match the remaining tokens.
 * Otherwise, we skip all tokens in this sequence before attempting further matches.
 * 
 * The motivation for this pattern tokeniser is to concentrate training and decisions on difficult cases, rather than
 * diluting the training model with countless obvious cases.
 * Furthermore, it virtually eliminates the strange broken compounds which were possible lower down in the beam with the interval approach:
 * the n-gram features used in that approach generally contained no counter-examples, leading to the "missing category" phenomenon,
 * where the missing category receives a relatively high score.
 * 
 * @author Assaf Urieli
 *
 */
class CompoundPatternTokeniser implements PatternTokeniser {
    private static final Log LOG = LogFactory.getLog(CompoundPatternTokeniser.class);
    private static final PerformanceMonitor MONITOR = PerformanceMonitor.getMonitor(CompoundPatternTokeniser.class);

    private static final DecimalFormat df = new DecimalFormat("0.0000");

    private DecisionMaker<TokeniserOutcome> decisionMaker;

    private TokeniserService tokeniserService;
    private TokeniserPatternService tokeniserPatternService;
    private TokenFeatureService tokenFeatureService;
    private FilterService filterService;
    private FeatureService featureService;

    private TokeniserPatternManager tokeniserPatternManager;
    private int beamWidth;
    private Set<TokenPatternMatchFeature<?>> features;
    private List<TokenSequenceFilter> tokenSequenceFilters = new ArrayList<TokenSequenceFilter>();

    private List<ClassificationObserver<TokeniserOutcome>> observers = new ArrayList<ClassificationObserver<TokeniserOutcome>>();
    private TokeniserDecisionFactory tokeniserDecisionFactory = new TokeniserDecisionFactory();

    private List<TokenFilter> tokenFilters = new ArrayList<TokenFilter>();

    /**
     * Constructs a tokeniser with the given pattern manager, pattern-match features
     * and beam width.
     */
    public CompoundPatternTokeniser(TokeniserPatternManager tokeniserPatternManager,
            Set<TokenPatternMatchFeature<?>> features, int beamWidth) {
        this.tokeniserPatternManager = tokeniserPatternManager;
        this.beamWidth = beamWidth;
        this.features = features;
    }

    @Override
    public List<TokenSequence> tokenise(String text) {
        Sentence sentence = filterService.getSentence();
        sentence.setText(text);
        return this.tokenise(sentence);
    }

    @Override
    public List<TokenSequence> tokenise(Sentence sentence) {
        List<TokenisedAtomicTokenSequence> decisionSequences = this.tokeniseWithDecisions(sentence);
        List<TokenSequence> tokenSequences = new ArrayList<TokenSequence>();
        for (TokenisedAtomicTokenSequence decisionSequence : decisionSequences) {
            tokenSequences.add(decisionSequence.inferTokenSequence());
        }
        return tokenSequences;
    }

    @Override
    public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(String text) {
        Sentence sentence = filterService.getSentence();
        sentence.setText(text);
        return this.tokeniseWithDecisions(sentence);
    }

    @Override
    public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(Sentence sentence) {
        MONITOR.startTask("tokeniseWithDecisions");
        try {
            // apply any pre-tokenisation decisions via filters
            // we only want one placeholder per start index - the first one that gets added
            Map<Integer, TokenPlaceholder> placeholderMap = new HashMap<Integer, TokenPlaceholder>();
            for (TokenFilter tokenFilter : this.tokenFilters) {
                Set<TokenPlaceholder> myPlaceholders = tokenFilter.apply(sentence.getText());
                for (TokenPlaceholder placeholder : myPlaceholders) {
                    if (!placeholderMap.containsKey(placeholder.getStartIndex())) {
                        placeholderMap.put(placeholder.getStartIndex(), placeholder);
                    }
                }
                if (LOG.isTraceEnabled()) {
                    if (myPlaceholders.size() > 0) {
                        LOG.trace("TokenFilter: " + tokenFilter);
                        LOG.trace("placeholders: " + myPlaceholders);
                    }
                }
            }

            Set<TokenPlaceholder> placeholders = new HashSet<TokenPlaceholder>(placeholderMap.values());

            // Initially, separate the sentence into tokens using the separators provided
            TokenSequence tokenSequence = this.tokeniserService.getTokenSequence(sentence, Tokeniser.SEPARATORS,
                    placeholders);

            // apply any pre-processing filters that have been added
            for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) {
                tokenSequenceFilter.apply(tokenSequence);
            }

            // Assign each separator its default value
            List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence);
            List<Decision<TokeniserOutcome>> defaultDecisions = new ArrayList<Decision<TokeniserOutcome>>(
                    defaultOutcomes.size());
            for (TokeniserOutcome outcome : defaultOutcomes) {
                Decision<TokeniserOutcome> tokeniserDecision = this.tokeniserDecisionFactory
                        .createDefaultDecision(outcome);
                tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName());
                tokeniserDecision.addAuthority("_" + "DefaultDecision");
                defaultDecisions.add(tokeniserDecision);
            }

            List<TokenisedAtomicTokenSequence> sequences = null;

            // For each test pattern, see if anything in the sentence matches it
            if (this.decisionMaker != null) {
                List<TokenPatternMatchSequence> matchingSequences = new ArrayList<TokenPatternMatchSequence>();
                Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
                Map<TokenPatternMatchSequence, TokenPatternMatch> primaryMatchMap = new HashMap<TokenPatternMatchSequence, TokenPatternMatch>();
                Set<Token> matchedTokens = new HashSet<Token>();

                MONITOR.startTask("pattern matching");
                try {
                    for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) {
                        List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(tokenSequence);
                        for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                            matchingSequences.add(matchSequence);
                            matchedTokens.addAll(matchSequence.getTokensToCheck());

                            TokenPatternMatch primaryMatch = null;
                            Token token = matchSequence.getTokensToCheck().get(0);

                            Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                            if (matchSequences == null) {
                                matchSequences = new TreeSet<TokenPatternMatchSequence>();
                                tokenMatchSequenceMap.put(token, matchSequences);
                            }
                            matchSequences.add(matchSequence);

                            for (TokenPatternMatch patternMatch : matchSequence.getTokenPatternMatches()) {
                                if (patternMatch.getToken().equals(token)) {
                                    primaryMatch = patternMatch;
                                    break;
                                }
                            }

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("Found match: " + primaryMatch);
                            }
                            primaryMatchMap.put(matchSequence, primaryMatch);
                        }
                    }
                } finally {
                    MONITOR.endTask("pattern matching");
                }

                // we want to create the n most likely token sequences,
                // each of which must be consistent with the matched token patterns
                Map<TokenPatternMatchSequence, List<Decision<TokeniserOutcome>>> matchSequenceDecisionMap = new HashMap<TokenPatternMatchSequence, List<Decision<TokeniserOutcome>>>();

                for (TokenPatternMatchSequence matchSequence : matchingSequences) {
                    TokenPatternMatch match = primaryMatchMap.get(matchSequence);
                    LOG.debug("next pattern match: " + match.toString());
                    List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>();
                    MONITOR.startTask("analyse features");
                    try {
                        for (TokenPatternMatchFeature<?> feature : features) {
                            RuntimeEnvironment env = this.featureService.getRuntimeEnvironment();
                            FeatureResult<?> featureResult = feature.check(match, env);
                            if (featureResult != null) {
                                tokenFeatureResults.add(featureResult);
                            }
                        }

                        if (LOG.isTraceEnabled()) {
                            for (FeatureResult<?> featureResult : tokenFeatureResults) {
                                LOG.trace(featureResult.toString());
                            }
                        }
                    } finally {
                        MONITOR.endTask("analyse features");
                    }

                    List<Decision<TokeniserOutcome>> decisions = null;
                    MONITOR.startTask("make decision");
                    try {
                        decisions = this.decisionMaker.decide(tokenFeatureResults);

                        for (ClassificationObserver<TokeniserOutcome> observer : this.observers)
                            observer.onAnalyse(match.getToken(), tokenFeatureResults, decisions);

                        for (Decision<TokeniserOutcome> decision : decisions) {
                            decision.addAuthority("_" + this.getClass().getSimpleName());
                            decision.addAuthority("_" + "Patterns");
                            decision.addAuthority(match.getPattern().getName());
                        }
                    } finally {
                        MONITOR.endTask("make decision");
                    }

                    matchSequenceDecisionMap.put(matchSequence, decisions);
                }

                // initially create a heap with a single, empty sequence
                PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
                TokenisedAtomicTokenSequence emptySequence = this.getTokeniserService()
                        .getTokenisedAtomicTokenSequence(sentence, 0);
                heap.add(emptySequence);

                for (int i = 0; i < tokenSequence.listWithWhiteSpace().size(); i++) {
                    Token token = tokenSequence.listWithWhiteSpace().get(i);
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Token : \"" + token.getText() + "\"");
                    }

                    // build a new heap for this iteration
                    PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap;
                    heap = new PriorityQueue<TokenisedAtomicTokenSequence>();

                    if (i == 0) {
                        // first token is always "separate" from the outside world
                        Decision<TokeniserOutcome> decision = this.tokeniserDecisionFactory
                                .createDefaultDecision(TokeniserOutcome.SEPARATE);
                        decision.addAuthority("_" + this.getClass().getSimpleName());
                        decision.addAuthority("_" + "DefaultDecision");

                        TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService.getTaggedToken(token,
                                decision);

                        TokenisedAtomicTokenSequence newSequence = this.getTokeniserService()
                                .getTokenisedAtomicTokenSequence(emptySequence);
                        newSequence.add(taggedToken);
                        heap.add(newSequence);
                        continue;
                    }

                    // limit the heap breadth to K
                    int maxSequences = previousHeap.size() > this.getBeamWidth() ? this.getBeamWidth()
                            : previousHeap.size();
                    MONITOR.startTask("heap sort");
                    try {
                        for (int j = 0; j < maxSequences; j++) {
                            TokenisedAtomicTokenSequence history = previousHeap.poll();

                            // Find the separating & non-separating decisions
                            if (history.size() > i) {
                                // token already added as part of a sequence introduced by another token
                                heap.add(history);
                            } else if (tokenMatchSequenceMap.containsKey(token)) {
                                // token begins one or more match sequences
                                // these are ordered from shortest to longest (via TreeSet)
                                List<TokenPatternMatchSequence> matchSequences = new ArrayList<TokenPatternMatchSequence>(
                                        tokenMatchSequenceMap.get(token));

                                // Since sequences P1..Pn contain each other,
                                // there can be exactly matchSequences.size() consistent solutions
                                // Assume the default is separate
                                // 0: all separate
                                // 1: join P1, separate rest
                                // 2: join P2, separate rest
                                // ...
                                // n: join Pn
                                // We need to add each of these to the heap
                                // by taking the product of all probabilities consistent with each solution
                                // The probabilities for each solution are (j=join, s=separate)
                                // All separate: s1 x s2 x ... x sn
                                // P1: j1 x s2 x ... x sn
                                // P2: j1 x j2 x ... x sn
                                // ...
                                // Pn: j1 x j2 x ... x jn
                                // Any solution of the form s1 x j2 would be inconsistent, and is not considered
                                // If Pi and Pj start and end on the exact same token, then the solution for both is
                                // Pi: j1 x ... x ji x jj x sj+1 ... x sn
                                // Pj: j1 x ... x ji x jj x sj+1 ... x sn
                                // Note of course that we're never likely to have more than two Ps here,
                                // but we need a solution that handles more, just to be sure.
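                                // Worked example with hypothetical numbers: two nested sequences
                                // P1 and P2, with join probabilities j1=0.8 (s1=0.2) and
                                // j2=0.6 (s2=0.4), yield three consistent solutions:
                                //   all separate: s1 x s2 = 0.2 x 0.4 = 0.08
                                //   join P1:      j1 x s2 = 0.8 x 0.4 = 0.32
                                //   join P2:      j1 x j2 = 0.8 x 0.6 = 0.48
                                // The normalisation below rescales these to sum to 1:
                                // 0.08/0.88, 0.32/0.88 and 0.48/0.88.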
                                TokeniserOutcome defaultOutcome = defaultDecisions
                                        .get(token.getIndexWithWhiteSpace()).getOutcome();
                                TokeniserOutcome otherOutcome = null;
                                if (defaultOutcome == TokeniserOutcome.SEPARATE)
                                    otherOutcome = TokeniserOutcome.JOIN;
                                else
                                    otherOutcome = TokeniserOutcome.SEPARATE;

                                double[] decisionProbs = new double[matchSequences.size() + 1];
                                for (int k = 0; k < decisionProbs.length; k++)
                                    decisionProbs[k] = 1;

                                // Note: k=0 is the default decision (e.g. all separate),
                                // and k=1 corresponds to the first pattern; p indexes the
                                // patterns starting at 1
                                int p = 1;
                                int prevEndIndex = -1;
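                                // prevEndIndex tracks where the previous (shorter) sequence ended:
                                // when a sequence ends on the same token as the previous one
                                // (endIndex <= prevEndIndex), the k+1 branches below make the two
                                // co-terminal sequences share a single solution slot, as described above.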
                                for (TokenPatternMatchSequence matchSequence : matchSequences) {
                                    int endIndex = matchSequence.getTokensToCheck()
                                            .get(matchSequence.getTokensToCheck().size() - 1).getEndIndex();
                                    List<Decision<TokeniserOutcome>> decisions = matchSequenceDecisionMap
                                            .get(matchSequence);
                                    for (Decision<TokeniserOutcome> decision : decisions) {
                                        for (int k = 0; k < decisionProbs.length; k++) {
                                            if (decision.getOutcome() == defaultOutcome) {
                                                // e.g. separate in most cases
                                                if (k < p && endIndex > prevEndIndex)
                                                    decisionProbs[k] *= decision.getProbability();
                                                else if (k + 1 < p && endIndex <= prevEndIndex)
                                                    decisionProbs[k] *= decision.getProbability();
                                            } else {
                                                // e.g. join in most cases
                                                if (k >= p && endIndex > prevEndIndex)
                                                    decisionProbs[k] *= decision.getProbability();
                                                else if (k + 1 >= p && endIndex <= prevEndIndex)
                                                    decisionProbs[k] *= decision.getProbability();
                                            }
                                        } // next k
                                    } // next decision (only 2 of these)
                                    prevEndIndex = endIndex;
                                    p++;
                                }

                                // transform to probability distribution
                                double sumProbs = 0;
                                for (int k = 0; k < decisionProbs.length; k++)
                                    sumProbs += decisionProbs[k];

                                if (sumProbs > 0)
                                    for (int k = 0; k < decisionProbs.length; k++)
                                        decisionProbs[k] /= sumProbs;

                                // Apply the default decision.
                                // Since this is the default decision for all tokens in the sequence, we don't add
                                // the other tokens for now: they get examined one at a time, in case one of them
                                // starts its own separate match sequence.
                                Decision<TokeniserOutcome> defaultDecision = this.tokeniserDecisionFactory
                                        .createDecision(defaultOutcome.getCode(), decisionProbs[0]);
                                defaultDecision.addAuthority("_" + this.getClass().getSimpleName());
                                defaultDecision.addAuthority("_" + "Patterns");
                                for (TokenPatternMatchSequence matchSequence : matchSequences) {
                                    defaultDecision.addAuthority(matchSequence.getTokenPattern().getName());
                                }

                                TaggedToken<TokeniserOutcome> defaultTaggedToken = this.tokeniserService
                                        .getTaggedToken(token, defaultDecision);
                                TokenisedAtomicTokenSequence defaultSequence = this.getTokeniserService()
                                        .getTokenisedAtomicTokenSequence(history);
                                defaultSequence.add(defaultTaggedToken);
                                defaultSequence.addDecision(defaultDecision);
                                heap.add(defaultSequence);

                                // Apply one non-default decision per match sequence
                                for (int k = 0; k < matchSequences.size(); k++) {
                                    TokenPatternMatchSequence matchSequence = matchSequences.get(k);
                                    double prob = decisionProbs[k + 1];
                                    Decision<TokeniserOutcome> decision = this.tokeniserDecisionFactory
                                            .createDecision(otherOutcome.getCode(), prob);
                                    decision.addAuthority("_" + this.getClass().getSimpleName());
                                    decision.addAuthority("_" + "Patterns");
                                    decision.addAuthority(matchSequence.getTokenPattern().getName());

                                    TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService
                                            .getTaggedToken(token, decision);

                                    TokenisedAtomicTokenSequence newSequence = this.getTokeniserService()
                                            .getTokenisedAtomicTokenSequence(history);
                                    newSequence.add(taggedToken);
                                    newSequence.addDecision(decision);

                                    // Since the decision is NOT the default decision for the tokens in this sequence,
                                    // add all of the sequence's other tokens to the solution as well
                                    for (Token tokenInSequence : matchSequence.getTokensToCheck()) {
                                        if (tokenInSequence.equals(token)) {
                                            continue;
                                        }
                                        Decision<TokeniserOutcome> decisionInSequence = this.tokeniserDecisionFactory
                                                .createDefaultDecision(decision.getOutcome());
                                        decisionInSequence.addAuthority("_" + this.getClass().getSimpleName());
                                        decisionInSequence.addAuthority("_" + "DecisionInSequence");
                                        decisionInSequence.addAuthority("_" + "DecisionInSequence_non_default");
                                        decisionInSequence.addAuthority("_" + "Patterns");
                                        TaggedToken<TokeniserOutcome> taggedTokenInSequence = this.tokeniserService
                                                .getTaggedToken(tokenInSequence, decisionInSequence);
                                        newSequence.add(taggedTokenInSequence);
                                    }

                                    heap.add(newSequence);

                                } // next sequence
                            } else {
                                // token doesn't start a match sequence, and hasn't already been added to the current sequence
                                Decision<TokeniserOutcome> decision = defaultDecisions.get(i);
                                if (matchedTokens.contains(token)) {
                                    decision = this.tokeniserDecisionFactory
                                            .createDefaultDecision(decision.getOutcome());
                                    decision.addAuthority("_" + this.getClass().getSimpleName());
                                    decision.addAuthority("_" + "DecisionInSequence");
                                    decision.addAuthority("_" + "DecisionInSequence_default");
                                    decision.addAuthority("_" + "Patterns");
                                }
                                TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService
                                        .getTaggedToken(token, decision);

                                TokenisedAtomicTokenSequence newSequence = this.getTokeniserService()
                                        .getTokenisedAtomicTokenSequence(history);
                                newSequence.add(taggedToken);
                                heap.add(newSequence);
                            }

                        } // next sequence in the old heap
                    } finally {
                        MONITOR.endTask("heap sort");
                    }
                } // next token

                sequences = new ArrayList<TokenisedAtomicTokenSequence>();
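                // drain the heap best-first, keeping at most beamWidth sequences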
                int k = 0;
                while (!heap.isEmpty()) {
                    sequences.add(heap.poll());
                    k++;
                    if (k >= this.getBeamWidth())
                        break;
                }
            } else {
                sequences = new ArrayList<TokenisedAtomicTokenSequence>();
                TokenisedAtomicTokenSequence defaultSequence = this.getTokeniserService()
                        .getTokenisedAtomicTokenSequence(sentence, 0);
                int i = 0;
                for (Token token : tokenSequence.listWithWhiteSpace()) {
                    TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService.getTaggedToken(token,
                            defaultDecisions.get(i++));
                    defaultSequence.add(taggedToken);
                }
                sequences.add(defaultSequence);
            } // have decision maker?

            LOG.debug("####Final token sequences:");
            int j = 1;
            for (TokenisedAtomicTokenSequence sequence : sequences) {
                TokenSequence newTokenSequence = sequence.inferTokenSequence();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Token sequence " + (j++) + ", score=" + df.format(sequence.getScore()));
                    LOG.debug("Atomic sequence: " + sequence);
                    LOG.debug("Resulting sequence: " + newTokenSequence);
                }
                // need to re-apply the pre-processing filters, because the tokens are all new
                // Question: why can't we conserve the initial tokens when they haven't changed at all?
                // Answer: because the tokenSequence and index in the sequence is referenced by the token.
                // Question: should we create a separate class, Token and TokenInSequence,
                // one with index & sequence access & one without?
                for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) {
                    tokenSequenceFilter.apply(newTokenSequence);
                }
                if (LOG.isDebugEnabled()) {
                    LOG.debug("After filters:      " + newTokenSequence);
                }
            }

            return sequences;
        } finally {
            MONITOR.endTask("tokeniseWithDecisions");
        }
    }

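    /**
     * Builds a new atomic token sequence from the given history, tagging this token with
     * the given decision (which contributes to the sequence score only if statistical).
     * If a match sequence is provided, the same decision is applied to each of its other
     * tokens as well.
     */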
    TokenisedAtomicTokenSequence applyDecision(Token token, Decision<TokeniserOutcome> decision,
            TokenisedAtomicTokenSequence history, TokenPatternMatchSequence matchSequence,
            Decision<TokeniserOutcome> defaultDecision) {
        TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService.getTaggedToken(token, decision);

        TokenisedAtomicTokenSequence tokenisedSequence = this.getTokeniserService()
                .getTokenisedAtomicTokenSequence(history);
        tokenisedSequence.add(taggedToken);
        if (decision.isStatistical())
            tokenisedSequence.addDecision(decision);

        if (matchSequence != null) {
            for (Token otherToken : matchSequence.getTokensToCheck()) {
                if (otherToken.equals(token)) {
                    continue;
                }
                TaggedToken<TokeniserOutcome> anotherTaggedToken = this.tokeniserService.getTaggedToken(otherToken,
                        decision);
                tokenisedSequence.add(anotherTaggedToken);
            }
        }

        return tokenisedSequence;

    }

    /**
     * The test patterns: only token sequences matching these patterns will be
     * submitted to the decision maker for further analysis.
     */
    public List<String> getTestPatterns() {
        return this.getTokeniserPatternManager().getTestPatterns();
    }

    /**
     * The decision maker used to decide join/separate outcomes for any separators
     * within token sub-sequences that need further testing.
     */
    public DecisionMaker<TokeniserOutcome> getDecisionMaker() {
        return decisionMaker;
    }

    public void setDecisionMaker(DecisionMaker<TokeniserOutcome> decisionMaker) {
        this.decisionMaker = decisionMaker;
    }

    public TokeniserService getTokeniserService() {
        return tokeniserService;
    }

    public void setTokeniserService(TokeniserService tokeniserService) {
        this.tokeniserService = tokeniserService;
    }

    public TokeniserPatternManager getTokeniserPatternManager() {
        return tokeniserPatternManager;
    }

    public void setTokeniserPatternManager(TokeniserPatternManager tokeniserPatternManager) {
        this.tokeniserPatternManager = tokeniserPatternManager;
    }

    public int getBeamWidth() {
        return beamWidth;
    }

    public TokeniserPatternService getTokeniserPatternService() {
        return tokeniserPatternService;
    }

    public void setTokeniserPatternService(TokeniserPatternService tokeniserPatternService) {
        this.tokeniserPatternService = tokeniserPatternService;
    }

    public TokenFeatureService getTokenFeatureService() {
        return tokenFeatureService;
    }

    public void setTokenFeatureService(TokenFeatureService tokenFeatureService) {
        this.tokenFeatureService = tokenFeatureService;
    }

    /**
     * Filters to be applied to the atomic token sequence, prior to tokenising.
     */
    public List<TokenSequenceFilter> getTokenSequenceFilters() {
        return tokenSequenceFilters;
    }

    public void addTokenSequenceFilter(TokenSequenceFilter tokenSequenceFilter) {
        this.tokenSequenceFilters.add(tokenSequenceFilter);
    }

    @Override
    public void addObserver(ClassificationObserver<TokeniserOutcome> observer) {
        this.observers.add(observer);
    }

    public List<TokenFilter> getTokenFilters() {
        return tokenFilters;
    }

    public void addTokenFilter(TokenFilter tokenFilter) {
        this.tokenFilters.add(tokenFilter);
    }

    public FilterService getFilterService() {
        return filterService;
    }

    public void setFilterService(FilterService filterService) {
        this.filterService = filterService;
    }

    public FeatureService getFeatureService() {
        return featureService;
    }

    public void setFeatureService(FeatureService featureService) {
        this.featureService = featureService;
    }

}