org.trnltk.morphology.contextless.parser.ContextlessMorphologicParser.java Source code

Java tutorial

Introduction

Here is the source code for org.trnltk.morphology.contextless.parser.ContextlessMorphologicParser.java

Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.morphology.contextless.parser;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.log4j.Logger;
import org.trnltk.model.letter.TurkishSequence;
import org.trnltk.model.lexicon.Root;
import org.trnltk.model.morpheme.MorphemeContainer;
import org.trnltk.model.suffix.SuffixFormApplication;
import org.trnltk.morphology.contextless.rootfinder.RootFinderChain;
import org.trnltk.morphology.morphotactics.SuffixGraphState;
import org.trnltk.morphology.morphotactics.SuffixGraphStateType;
import org.trnltk.util.MorphemeContainerFormatter;

import java.util.*;

/**
 * A form-based morphologic parser implementation which does not consider the context of the input.
 * <p>
 * Parsing a surface is done in three steps (see {@link #parse(TurkishSequence)}):
 * <ol>
 * <li>find the possible roots for every prefix of the input and create initial morpheme containers
 * around them (optionally using predefined paths),</li>
 * <li>apply the mandatory transitions to those containers,</li>
 * <li>traverse the suffix form graph until no candidate is left, collecting every container which
 * reaches a terminal state with nothing left to parse.</li>
 * </ol>
 * An optional {@link ContextlessMorphologicParserListener} can be registered with
 * {@link #setListener(ContextlessMorphologicParserListener)} to be notified about candidates which
 * are dropped during traversal.
 */
public class ContextlessMorphologicParser implements MorphologicParser {
    private static final Logger logger = Logger.getLogger(ContextlessMorphologicParser.class);

    private final MandatoryTransitionApplier mandatoryTransitionApplier;
    // optional, may be null; only consulted in traverseCandidates
    private ContextlessMorphologicParserListener listener;

    private final SuffixFormGraph suffixFormGraph;
    // optional, may be null; see findInitialMorphemeContainers
    private final PredefinedPaths predefinedPaths;
    private final RootFinderChain rootFinderChain;
    private final SuffixApplier suffixApplier;

    /**
     * Creates a parser working on the given suffix form graph.
     *
     * @param suffixFormGraph graph whose nodes and edges are traversed while parsing; its suffix graph
     *                        is also handed to the internal {@link MandatoryTransitionApplier}
     * @param predefinedPaths predefined morpheme containers for some roots; may be {@code null}, in which
     *                        case every root starts from its default state
     * @param rootFinderChain used to find root candidates for each prefix of the input
     * @param suffixApplier   used to try applying a suffix form to a morpheme container
     */
    public ContextlessMorphologicParser(final SuffixFormGraph suffixFormGraph,
            final PredefinedPaths predefinedPaths, final RootFinderChain rootFinderChain,
            final SuffixApplier suffixApplier) {
        this.suffixFormGraph = suffixFormGraph;
        this.predefinedPaths = predefinedPaths;
        this.rootFinderChain = rootFinderChain;
        this.suffixApplier = suffixApplier;
        this.mandatoryTransitionApplier = new MandatoryTransitionApplier(suffixFormGraph.getSuffixGraph(),
                suffixApplier);
    }

    /**
     * Parses every input with {@link #parse(TurkishSequence)}.
     *
     * @param inputs surfaces to parse
     * @return a new list holding, in input order, the parse results of each input
     */
    @Override
    public List<List<MorphemeContainer>> parseAll(List<TurkishSequence> inputs) {
        // Lists.transform returns a lazy view; copying it forces each input to be parsed exactly once
        return new ArrayList<List<MorphemeContainer>>(
                Lists.transform(inputs, new Function<TurkishSequence, List<MorphemeContainer>>() {
                    @Override
                    public List<MorphemeContainer> apply(TurkishSequence input) {
                        return parse(input);
                    }
                }));
    }

    /**
     * Parses every input with {@link #parseStr(String)}.
     *
     * @param inputs surfaces to parse
     * @return a new list holding, in input order, the parse results of each input
     */
    @Override
    public List<List<MorphemeContainer>> parseAllStr(List<String> inputs) {
        // Lists.transform returns a lazy view; copying it forces each input to be parsed exactly once
        return new ArrayList<List<MorphemeContainer>>(
                Lists.transform(inputs, new Function<String, List<MorphemeContainer>>() {
                    @Override
                    public List<MorphemeContainer> apply(String input) {
                        return parseStr(input);
                    }
                }));
    }

    /**
     * Wraps the given string into a {@link TurkishSequence} and parses it.
     *
     * @param input surface to parse
     * @return the parse results, see {@link #parse(TurkishSequence)}
     */
    @Override
    public List<MorphemeContainer> parseStr(String input) {
        return this.parse(new TurkishSequence(input));
    }

    /**
     * Finds all parse results for the given surface.
     *
     * @param input surface to parse
     * @return the morpheme containers which reached a terminal state with no remaining surface;
     *         empty if there is none
     * @throws IllegalStateException if traversing finishes while there are still candidates left
     */
    @Override
    public LinkedList<MorphemeContainer> parse(final TurkishSequence input) {
        // * find initial containers --> find possible roots and create containers around them
        // * apply mandatory transitions
        // * traverse until there are no candidates --> find all results

        if (logger.isDebugEnabled()) {
            logger.debug("Parsing input " + input);
        }

        final List<MorphemeContainer> candidateMorphemeContainers = this.findInitialMorphemeContainers(input);

        if (logger.isDebugEnabled()) {
            logger.debug(
                    String.format("Found %d candidate morpheme containers", candidateMorphemeContainers.size()));
            for (MorphemeContainer morphemeContainer : candidateMorphemeContainers) {
                logger.debug("\t " + morphemeContainer.toString());
            }
        }

        logger.debug("Applying mandatory transitions to candidates");

        final List<MorphemeContainer> candidateMorphemeContainersWithMandatoryTransitions = mandatoryTransitionApplier
                .applyMandatoryTransitionsToMorphemeContainers(candidateMorphemeContainers, input);

        final LinkedList<MorphemeContainer> results = new LinkedList<MorphemeContainer>();
        final LinkedList<MorphemeContainer> newCandidates = this
                .traverseCandidates(candidateMorphemeContainersWithMandatoryTransitions, results, input);

        // the traversal recurses until nothing is left, so anything returned here is a programming error
        if (CollectionUtils.isNotEmpty(newCandidates)) {
            throw new IllegalStateException(
                    "There are still parse morpheme containers to traverse, but traversing is finished : "
                            + newCandidates.toString());
        }

        return results;
    }

    /**
     * Traverses all given candidates, and recursively the new candidates produced from them, until
     * there is nothing left to traverse.
     *
     * @param candidates containers to traverse in this round
     * @param results    output parameter; containers which are in a terminal state and have a blank
     *                   remaining surface are appended to it
     * @param input      the complete surface being parsed
     * @return the candidates left after the recursive traversal
     */
    private LinkedList<MorphemeContainer> traverseCandidates(final List<MorphemeContainer> candidates,
            final List<MorphemeContainer> results, final TurkishSequence input) {
        // * traverse all containers recursively --> go through the suffix graph for all containers, apply transitions and traverse the new containers

        if (logger.isDebugEnabled()) {
            logger.debug("Gonna traverse " + candidates.size() + " candidates:");
            for (MorphemeContainer candidate : candidates) {
                logger.debug("\t " + candidate);
            }
        }

        LinkedList<MorphemeContainer> newCandidates = new LinkedList<MorphemeContainer>();
        for (MorphemeContainer candidateMorphemeContainer : candidates) {
            if (logger.isDebugEnabled()) {
                logger.debug(" Traversing candidate: " + candidateMorphemeContainer);
            }
            final List<MorphemeContainer> morphemeContainersForCandidate = this
                    .traverseCandidate(candidateMorphemeContainer, input);
            for (MorphemeContainer morphemeContainerForCandidate : morphemeContainersForCandidate) {
                if (SuffixGraphStateType.TERMINAL.equals(morphemeContainerForCandidate.getLastState().getType())) {
                    if (StringUtils.isBlank(morphemeContainerForCandidate.getRemainingSurface())) {
                        // terminal state and the whole surface is consumed: this is a parse result
                        results.add(morphemeContainerForCandidate);
                        if (logger.isDebugEnabled()) {
                            logger.debug("Found a terminal result --------------------->");
                            logger.debug(morphemeContainerForCandidate);
                            logger.debug(MorphemeContainerFormatter
                                    .formatMorphemeContainerWithForms(morphemeContainerForCandidate));
                        }
                    } else {
                        // terminal state but some surface is left unparsed: dead end, drop it
                        if (logger.isDebugEnabled()) {
                            logger.debug("Found a terminal result, but there is still remaining to parse : "
                                    + morphemeContainerForCandidate);
                        }
                    }
                } else {
                    // not terminal yet: traverse it in the next round
                    newCandidates.add(morphemeContainerForCandidate);
                }
            }
        }

        // call recursively until nothing to traverse!
        if (CollectionUtils.isNotEmpty(newCandidates)) {
            List<MorphemeContainer> previousCandidates = newCandidates;
            newCandidates = this.traverseCandidates(previousCandidates, results, input);
            if (listener != null) {
                // candidates of this round which are not among the ones returned by the deeper rounds
                // NOTE(review): relies on MorphemeContainer equals/hashCode for the set difference
                Sets.SetView<MorphemeContainer> invalidatedMorphemeContainers = Sets.difference(
                        new HashSet<MorphemeContainer>(previousCandidates),
                        new HashSet<MorphemeContainer>(newCandidates));
                for (MorphemeContainer invalidatedMorphemeContainer : invalidatedMorphemeContainers) {
                    if (SuffixGraphStateType.TERMINAL
                            .equals(invalidatedMorphemeContainer.getLastState().getType())) {
                        // a completely parsed terminal container is a result, not an invalidated one
                        if (StringUtils.isBlank(invalidatedMorphemeContainer.getRemainingSurface())) {
                            continue;
                        }
                    }
                    this.listener.onMorphemeContainerInvalidated(invalidatedMorphemeContainer);
                }
            }
        }

        return newCandidates;
    }

    /**
     * Traverses one container: tries every applicable suffix edge of the container's current graph
     * node and collects the containers created by the successful applications.
     *
     * @param initialContainer container to traverse
     * @param input            the complete surface being parsed
     * @return a list with only {@code initialContainer} if it is already in a terminal state; otherwise
     *         the new containers, possibly none
     * @throws IllegalStateException if the suffix form graph has no node for the container's last state
     *                               and phonetic attributes
     */
    private LinkedList<MorphemeContainer> traverseCandidate(final MorphemeContainer initialContainer,
            final TurkishSequence input) {
        // * traverse one container --> try all possible suffix transitions for the container and find the new containers
        //  >>> where the transitions are applied

        if (SuffixGraphStateType.TERMINAL.equals(initialContainer.getLastState().getType())) {
            return Lists.newLinkedList(Arrays.asList(initialContainer));
        }

        final SuffixFormGraphNodeKey currentSuffixFormGraphNodeKey = new SuffixFormGraphNodeKey(
                initialContainer.getLastState(), initialContainer.getPhoneticAttributes());
        final SuffixFormGraphNode currentNode = this.suffixFormGraph.getNode(currentSuffixFormGraphNodeKey);
        if (currentNode == null) {
            throw new IllegalStateException("Node not found for key : " + currentSuffixFormGraphNodeKey.getState()
                    + " set: "
                    + new PhoneticAttributeSets().getNumberForSet(currentSuffixFormGraphNodeKey.getPhonAttrSet()));
        }

        final LinkedList<MorphemeContainer> newCandidates = new LinkedList<MorphemeContainer>();

        final Set<SuffixFormGraphSuffixEdge> edges = this
                .getApplicableSuffixesOfNodeForMorphemeContainer(currentNode, initialContainer);
        if (logger.isDebugEnabled()) {
            if (CollectionUtils.isEmpty(edges)) {
                logger.debug(String.format(
                        "  No applicable transition edges found for morpheme_container from node %s", currentNode));
            } else {
                logger.debug(
                        String.format("  Found applicable transition edges for morpheme_container from node %s: %s",
                                currentNode, edges));
            }
        }

        for (SuffixFormGraphSuffixEdge transitionEdge : edges) {
            final SuffixFormApplication suffixFormApplication = transitionEdge.getSuffixFormApplication();

            if (logger.isDebugEnabled()) {
                logger.debug(String.format("   Going to try suffixFormApplication : %s", suffixFormApplication));
            }

            final SuffixGraphState targetState = transitionEdge.getTargetSuffixFormGraphNode()
                    .getSuffixFormGraphNodeKey().getState();
            final MorphemeContainer morphemeContainerForSuffixFormApplication = this.suffixApplier
                    .trySuffixFormApplication(initialContainer, suffixFormApplication, targetState,
                            transitionEdge.getPhoneticExpectations(), input);
            // a null container is skipped here; presumably it means the suffix form could not be applied
            if (morphemeContainerForSuffixFormApplication != null) {
                newCandidates.add(morphemeContainerForSuffixFormApplication);
            }

            if (logger.isDebugEnabled()) {
                logger.debug(String.format("   Applied edge : %s . Applied morpheme container %s", transitionEdge,
                        morphemeContainerForSuffixFormApplication));
            }
        }

        return newCandidates;
    }

    /**
     * Finds the edges of the given node which can be tried for the given container. An edge is kept if
     * <ul>
     * <li>the remaining surface of the container starts with the edge's actual suffix form, and</li>
     * <li>the edge's suffix is not among the suffixes of the container since the last derivation.</li>
     * </ul>
     *
     * @param node              current graph node of the container
     * @param morphemeContainer container to find the applicable edges for
     * @return a filtered view of the node's edges (Guava {@code Sets.filter} views are evaluated lazily)
     */
    private Set<SuffixFormGraphSuffixEdge> getApplicableSuffixesOfNodeForMorphemeContainer(
            final SuffixFormGraphNode node, final MorphemeContainer morphemeContainer) {
        if (logger.isDebugEnabled()) {
            logger.debug("  Finding applicable suffixes for morpheme_container from node " + node + " : "
                    + morphemeContainer);
            logger.debug("   Found outputs " + node.getEdges());
        }

        Set<SuffixFormGraphSuffixEdge> edges = node.getEdges();

        edges = Sets.filter(edges, new Predicate<SuffixFormGraphSuffixEdge>() {
            @Override
            public boolean apply(SuffixFormGraphSuffixEdge input) {
                final String appliedSuffixForm = input.getSuffixFormApplication().getActualSuffixForm();
                return morphemeContainer.getRemainingSurface().startsWith(appliedSuffixForm);
            }
        });
        if (logger.isDebugEnabled()) {
            logger.debug("   Filtered out suffix forms which are not beginning of remaining surface "
                    + morphemeContainer.getRemainingSurface() + " : " + edges);
        }

        edges = Sets.filter(edges, new Predicate<SuffixFormGraphSuffixEdge>() {
            @Override
            public boolean apply(SuffixFormGraphSuffixEdge input) {
                return !morphemeContainer.getSuffixesSinceDerivationSuffix()
                        .contains(input.getSuffixFormApplication().getSuffixForm().getSuffix());
            }
        });
        if (logger.isDebugEnabled()) {
            logger.debug("   Filtered out the applied suffixes since last derivation "
                    + morphemeContainer.getSuffixesSinceDerivationSuffix() + " : " + edges);
        }

        return edges;
    }

    /**
     * Finds the root candidates for every non-empty prefix of the input and creates the initial
     * morpheme containers around them.
     * <p>
     * If predefined paths exist for a root, the applicable ones (those whose surface so far is a prefix
     * of the input) are copied for this input; otherwise a fresh container in the root's default state
     * is created.
     *
     * @param input the complete surface being parsed
     * @return the initial candidates, possibly empty
     * @throws NullPointerException  if no predefined paths are configured and a root has no default state
     * @throws IllegalStateException if predefined paths are configured and a root has no default state
     */
    private LinkedList<MorphemeContainer> findInitialMorphemeContainers(final TurkishSequence input) {
        // find roots for input and create containers around them

        final LinkedList<MorphemeContainer> candidates = new LinkedList<MorphemeContainer>();

        for (int i = 1; i <= input.length(); i++) {
            final TurkishSequence partialInput = input.subsequence(0, i);

            final List<Root> roots = this.rootFinderChain.findRootsForPartialInput(partialInput, input);

            if (logger.isDebugEnabled()) {
                logger.debug(String.format("Found %d root candidates for partial input '%s':", roots.size(),
                        partialInput));
                for (Root root : roots) {
                    logger.debug("\t " + root.toString());
                }
            }

            if (this.predefinedPaths == null) {
                for (Root root : roots) {
                    final String remainingInput = input.substring(root.getSequence().length());
                    final SuffixGraphState defaultStateForRoot = this.suffixFormGraph.getDefaultStateForRoot(root);
                    // message is formatted only on failure; the root is passed as an argument so that
                    // its text is never interpreted as a format string
                    Validate.notNull(defaultStateForRoot, "No node found for root %s", root);

                    final MorphemeContainer morphemeContainer = new MorphemeContainer(root, defaultStateForRoot,
                            remainingInput);
                    candidates.add(morphemeContainer);
                }
            } else {
                for (Root root : roots) {
                    final String remainingInput = input.substring(root.getSequence().length());

                    final SuffixGraphState defaultStateForRoot = this.suffixFormGraph.getDefaultStateForRoot(root);
                    if (defaultStateForRoot == null) {
                        throw new IllegalStateException("No default state found for root " + root);
                    }

                    if (this.predefinedPaths.hasPathsForRoot(root)) {
                        final Set<MorphemeContainer> predefinedMorphemeContainers = this.predefinedPaths
                                .getPaths(root);
                        if (logger.isDebugEnabled()) {
                            logger.debug("Found predefined morpheme containers for root candidate " + root + " : "
                                    + predefinedMorphemeContainers);
                        }
                        for (MorphemeContainer predefinedMorphemeContainer : predefinedMorphemeContainers) {
                            if (input.startsWith(predefinedMorphemeContainer.getSurfaceSoFar())) {
                                if (logger.isDebugEnabled()) {
                                    logger.debug("Predefined morpheme_container is applicable "
                                            + predefinedMorphemeContainer);
                                }
                                //entry is cloned and since the remaining surface can be different, it is set.
                                MorphemeContainer clone = new MorphemeContainer(predefinedMorphemeContainer, input);
                                candidates.add(clone);
                            } else {
                                if (logger.isDebugEnabled()) {
                                    logger.debug("Predefined morpheme container is not applicable, skipping "
                                            + predefinedMorphemeContainer);
                                }
                            }
                        }
                    } else {
                        candidates.add(new MorphemeContainer(root, defaultStateForRoot, remainingInput));
                    }
                }
            }
        }

        return candidates;
    }

    /**
     * Registers the listener to be notified about the candidates dropped during traversal.
     *
     * @param listener listener to notify; {@code null} disables notifications
     */
    public void setListener(ContextlessMorphologicParserListener listener) {
        this.listener = listener;
    }
}