org.trnltk.morphology.contextless.rootfinder.BruteForceVerbRootFinder.java Source code

Introduction

Here is the source code for org.trnltk.morphology.contextless.rootfinder.BruteForceVerbRootFinder.java
Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.morphology.contextless.rootfinder;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.Validate;
import org.apache.commons.lang3.tuple.Pair;
import org.trnltk.model.letter.TurkishSequence;

import org.trnltk.model.lexicon.*;
import org.trnltk.model.letter.TurkishAlphabet;
import org.trnltk.model.suffix.SuffixFormSequence;
import org.trnltk.morphology.morphotactics.SuffixFormSequenceApplier;
import org.trnltk.morphology.phonetics.PhoneticsAnalyzer;
import org.trnltk.morphology.phonetics.PhoneticsEngine;
import org.trnltk.model.letter.TurkishChar;
import org.trnltk.model.letter.TurkicLetter;

import java.util.*;

/**
 * Finds the possible roots by brute force.
 * <p/>
 * Checks for the signs of the orthographic changes, and finds roots according to that.
 * Considers progressive vowel drop (başla+ıyor -> başlıyor), voicing (git+er -> gider), aorist A (yap+ar), aorist I (gel+ir),
 * causatives and passives.
 * <p/>
 * Returns phonetically valid verbs. For example 'ürk' and 'büyült' are valid, but 'zanh' is not valid.
 * <p/>
 * In verbs voicing only occurs on roots ending with 't', so others (pçk) are ignored.
 * Ignores inverse harmony, since verbs don't have it.
 */
public class BruteForceVerbRootFinder implements RootFinder {
    private final PhoneticsEngine phoneticsEngine = new PhoneticsEngine(new SuffixFormSequenceApplier());
    private final PhoneticsAnalyzer phoneticsAnalyzer = new PhoneticsAnalyzer();
    private static final SuffixFormSequence INFINITIVE_SUFFIX_FORM = new SuffixFormSequence("mAk");

    @Override
    public boolean handles(TurkishSequence partialInput, TurkishSequence wholeSurface) {
        if (partialInput == null || partialInput.isBlank())
            return false;

        if (wholeSurface == null || wholeSurface.isBlank())
            return false;

        if (!wholeSurface.startsWith(partialInput))
            return false;

        if (partialInput.length() < 2) // not possible except (d,diyor) and (y,yiyor). but they are already in the dictionary
            return false;

        final TurkishChar lastVowel = partialInput.getLastVowel();
        if (lastVowel == null)
            return false;

        if (wholeSurface.length() > partialInput.length()) {
            final TurkishChar firstCharAfterPartialInput = wholeSurface.charAt(partialInput.length());
            if (Character.isUpperCase(firstCharAfterPartialInput.getCharValue()))
                return false;
        }

        return true;
    }

    @Override
    @SuppressWarnings({ "UnnecessaryLocalVariable", "ConstantConditions" })
    public Collection<DynamicRoot> findRootsForPartialInput(TurkishSequence partialInput,
            TurkishSequence wholeSurface) {
        final TurkishChar lastVowel = partialInput.getLastVowel();

        final TurkishSequence rootSeq = partialInput;
        final TurkishSequence lemmaSeq = rootSeq;
        final TurkishSequence lemmaRootSeq = lemmaSeq;
        final PrimaryPos primaryPos = PrimaryPos.Verb;
        final SecondaryPos secondaryPos = null;
        final EnumSet<LexemeAttribute> lexemeAttributes = EnumSet.noneOf(LexemeAttribute.class);
        final DynamicLexeme lexeme = new DynamicLexeme(lemmaSeq.getUnderlyingString(),
                lemmaRootSeq.getUnderlyingString(), primaryPos, secondaryPos, lexemeAttributes);
        final EnumSet<PhoneticExpectation> phoneticExpectations = EnumSet.noneOf(PhoneticExpectation.class);
        final EnumSet<PhoneticAttribute> phoneticAttributes = phoneticsAnalyzer
                .calculatePhoneticAttributes(partialInput, lexemeAttributes);

        final DynamicRoot noAttrRoot = new DynamicRoot(rootSeq, lexeme, phoneticAttributes, phoneticExpectations);

        this.setLexemeAndPhoneticAttributes(Arrays.asList(noAttrRoot));
        this.setLemma(Arrays.asList(noAttrRoot));

        final TurkishChar lastChar = partialInput.getLastChar();
        final TurkicLetter lastLetter = lastChar.getLetter();

        final boolean partialSurfaceCanBeRootOfAVerb = this.seemsLikeAValidVerbRoot(partialInput);

        if (wholeSurface.equals(partialInput))
            return partialSurfaceCanBeRootOfAVerb ? Arrays.asList(noAttrRoot)
                    : Collections.<DynamicRoot>emptyList();

        final TurkishChar firstCharAfterPartialInput = wholeSurface.charAt(partialInput.length());

        final TurkicLetter firstLetterAfterPartialInput = firstCharAfterPartialInput.getLetter();

        final String wholeSurfaceStr = wholeSurface.getUnderlyingString();
        final String partialInputStr = partialInput.getUnderlyingString();

        final boolean mightHaveProgressiveVowelDrop = !lastLetter.isVowel() && strStartsWithAnyAdditionOfStr(
                wholeSurfaceStr, partialInputStr, Arrays.asList("iyor", "yor", "uyor", "yor"));

        final boolean mightHaveAorist_A = !lastLetter.isVowel()
                && strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr, Arrays.asList("ar", "er"));

        // no Aorist_I for -ur, -r
        final boolean mightHaveAorist_I = !lastLetter.isVowel()
                && strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr, Arrays.asList("r", "ir"));

        // for other letters, no voicing in verbs. {git+er->gider} vs {yapar, aar, diker}
        final boolean voicingMightHaveHappened = lastLetter.equals(TurkishAlphabet.L_d)
                && firstLetterAfterPartialInput.isVowel();

        final Set<DynamicRoot> possibleProgressiveVowelDropRoots = mightHaveProgressiveVowelDrop
                ? this.getProgressiveDropRoots(noAttrRoot, lastVowel)
                : new HashSet<DynamicRoot>();

        final Set<DynamicRoot> possibleAorist_A_Roots = mightHaveAorist_A ? this.getAorist_A_Roots(noAttrRoot)
                : new HashSet<DynamicRoot>();

        final Set<DynamicRoot> possibleAorist_I_Roots = mightHaveAorist_I ? this.getAorist_I_Roots(noAttrRoot)
                : new HashSet<DynamicRoot>();

        final Set<DynamicRoot> possibleCausativeRoots = this.getPossibleCausativeRoots(lastLetter, partialInput,
                wholeSurface, noAttrRoot);
        final Set<DynamicRoot> possiblePassiveRoots = this.getPossiblePassiveRoots(lastLetter, partialInput,
                wholeSurface, noAttrRoot);

        if (voicingMightHaveHappened) {
            Function<DynamicRoot, DynamicRoot> voicingRootFunction = new Function<DynamicRoot, DynamicRoot>() {
                @Override
                public DynamicRoot apply(DynamicRoot input) {
                    return getPossibleVoicingRoot(input);
                }
            };

            final Collection<DynamicRoot> possibleProgressiveVowelDropRoots_voicing = Collections2
                    .transform(ImmutableSet.copyOf(possibleProgressiveVowelDropRoots), voicingRootFunction);
            possibleProgressiveVowelDropRoots.addAll(possibleProgressiveVowelDropRoots_voicing);

            final Collection<DynamicRoot> possibleAorist_A_Roots_voicing = Collections2
                    .transform(ImmutableSet.copyOf(possibleAorist_A_Roots), voicingRootFunction);
            possibleAorist_A_Roots.addAll(possibleAorist_A_Roots_voicing);

            final Collection<DynamicRoot> possibleAorist_I_Roots_voicing = Collections2
                    .transform(ImmutableSet.copyOf(possibleAorist_I_Roots), voicingRootFunction);
            possibleAorist_A_Roots.addAll(possibleAorist_I_Roots_voicing);

            final Collection<DynamicRoot> possibleCausativeRoots_voicing = Collections2
                    .transform(ImmutableSet.copyOf(possibleCausativeRoots), voicingRootFunction);
            possibleCausativeRoots.addAll(possibleCausativeRoots_voicing);

            final Collection<DynamicRoot> possiblePassiveRoots_voicing = Collections2
                    .transform(ImmutableSet.copyOf(possiblePassiveRoots), voicingRootFunction);
            possiblePassiveRoots.addAll(possiblePassiveRoots_voicing);
        }

        final HashSet<DynamicRoot> generatedRoots = new HashSet<DynamicRoot>();

        generatedRoots.add(noAttrRoot);

        if (voicingMightHaveHappened)
            generatedRoots.add(this.getPossibleVoicingRoot(noAttrRoot));

        generatedRoots.addAll(possibleProgressiveVowelDropRoots);
        generatedRoots.addAll(possibleAorist_A_Roots);
        generatedRoots.addAll(possibleAorist_I_Roots);
        generatedRoots.addAll(possibleCausativeRoots);
        generatedRoots.addAll(possiblePassiveRoots);

        this.setLexemeAndPhoneticAttributes(generatedRoots);
        this.setLemma(generatedRoots);

        return Collections2.filter(generatedRoots, new Predicate<DynamicRoot>() {
            @Override
            public boolean apply(DynamicRoot input) {
                return seemsLikeAValidVerbRoot(new TurkishSequence(input.getLexeme().getLemmaRoot()));
            }
        });
    }

    private void setLexemeAndPhoneticAttributes(Collection<DynamicRoot> generatedRoots) {
        for (DynamicRoot generatedRoot : generatedRoots) {
            final DynamicLexeme lexeme = generatedRoot.getLexeme();
            final TurkishSequence rootSeq = generatedRoot.getSequence();
            final String rootStr = rootSeq.getUnderlyingString();
            generatedRoot.setPhoneticAttributes(
                    this.phoneticsAnalyzer.calculatePhoneticAttributes(rootSeq, lexeme.getAttributes()));
            if (rootStr.endsWith("d") && lexeme.getLemmaRoot().endsWith("t")) {
                lexeme.getAttributes().remove(LexemeAttribute.NoVoicing);
                lexeme.getAttributes().add(LexemeAttribute.Voicing);
            } else {
                lexeme.getAttributes().remove(LexemeAttribute.Voicing);
                lexeme.getAttributes().add(LexemeAttribute.NoVoicing);
            }
        }
    }

    private void setLemma(Collection<DynamicRoot> generatedRoots) {
        for (DynamicRoot generatedRoot : generatedRoots) {
            final DynamicLexeme lexeme = generatedRoot.getLexeme();
            final Pair<TurkishSequence, String> applicationPair = this.phoneticsEngine.apply(
                    new TurkishSequence(lexeme.getLemmaRoot()), generatedRoot.getPhoneticAttributes(),
                    INFINITIVE_SUFFIX_FORM, lexeme.getAttributes());
            final TurkishSequence word = applicationPair.getLeft();
            final String appliedSuffixForm = applicationPair.getRight();
            Validate.isTrue(!word.isBlank());
            Validate.notBlank(appliedSuffixForm);
            lexeme.setLemma(word.getUnderlyingString() + appliedSuffixForm);
        }
    }

    private boolean seemsLikeAValidVerbRoot(TurkishSequence partialInput) {
        final TurkishChar lastChar = partialInput.getLastChar();
        final TurkicLetter lastLetter = lastChar.getLetter();
        final TurkishChar previousChar = partialInput.charAt(partialInput.length() - 2);
        final TurkicLetter previousLetter = previousChar.getLetter();

        return lastLetter.isVowel() || previousLetter.isVowel()
                || (Arrays.asList(TurkishAlphabet.L_l, TurkishAlphabet.L_r, TurkishAlphabet.L_n)
                        .contains(previousLetter) && !lastLetter.isContinuant());
    }

private Set<DynamicRoot> getProgressiveDropRoots(DynamicRoot noAttrRoot, TurkishChar lastVowel) {
    /*
    bala - +Iyor --> balyor
    elle  - +Iyor --> elliyor
    oyna  - +Iyor --> oynuyor
    syle - +Iyor --> sylyor
    kaz  - +Iyor --> kazyor
    kaz   - +Iyor --> kazyor

    bayor   : balamak or balmak (skip balumak)
    elliyor   : ellemek or ellimek (skip ellmek)
    oynuyor   : oynamak or oynumak (skip oynmak)
    sylyor  : sylemek or sylmek (skip sylimek)
    kazyor   : kazamak or kazmak (skip kazumak)
    */

    final List<Character> droppedVowels = new ArrayList<Character>();
    final boolean lastVowelIsFrontal = lastVowel.getLetter().isFrontal();
    final boolean lastVowelIsRounded = lastVowel.getLetter().isRounded();

    // since there is no inverse harmony in verbs, we can determine the dropped vowel
    if (!lastVowelIsFrontal) {
        droppedVowels.add('a');
        if (!lastVowelIsRounded)
            droppedVowels.add('');
        else
            droppedVowels.add('u');
    } else {
        droppedVowels.add('e');
        if (!lastVowelIsRounded)
            droppedVowels.add('i');
        else
            droppedVowels.add('');
    }

    final HashSet<DynamicRoot> generatedRoots = new HashSet<DynamicRoot>();

    for (Character droppedVowel : droppedVowels) {
        final DynamicRoot generatedRoot = new DynamicRoot(noAttrRoot);
        generatedRoot.getLexeme().setLemmaRoot(generatedRoot.getLexeme().getLemmaRoot() + droppedVowel);
        generatedRoot.getLexeme().getAttributes().add(LexemeAttribute.ProgressiveVowelDrop);

        generatedRoots.add(generatedRoot);
    }

    return generatedRoots;
}

    private Set<DynamicRoot> getAorist_A_Roots(DynamicRoot noAttrRoot) {
        final DynamicRoot generatedRoot = new DynamicRoot(noAttrRoot);
        generatedRoot.getLexeme().getAttributes().add(LexemeAttribute.Aorist_A);
        return Sets.newHashSet(generatedRoot);
    }

    private Set<DynamicRoot> getAorist_I_Roots(DynamicRoot noAttrRoot) {
        final DynamicRoot generatedRoot = new DynamicRoot(noAttrRoot);
        generatedRoot.getLexeme().getAttributes().add(LexemeAttribute.Aorist_I);
        return Sets.newHashSet(generatedRoot);
    }

    private Set<DynamicRoot> getPossibleCausativeRoots(TurkicLetter lastLetter, TurkishSequence partialInput,
            TurkishSequence wholeSurface, DynamicRoot noAttrRoot) {
        // no voicing can happen on causative_t
        final String wholeSurfaceStr = wholeSurface.getUnderlyingString();
        final String partialInputStr = partialInput.getUnderlyingString();

        final boolean mightHaveCausative_t = wholeSurfaceStr.startsWith(partialInputStr + 't')
                && (lastLetter.isContinuant() || lastLetter.isVowel());

        final boolean mightHaveCausative_Ir = this.strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr,
                Arrays.asList("r", "ir", "ur", "r"));

        // no voicing can happen on causative_It
        final boolean mightHaveCausative_It = this.strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr,
                Arrays.asList("t", "it", "ut", "t"));

        final boolean mightHaveCausative_Ar = this.strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr,
                Arrays.asList("ar", "er"));

        final boolean mightHaveCausative_dIr = this.strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr,
                Arrays.asList("dr", "dir", "dur", "dr", "tr", "tir", "tur", "tr"));

        final ImmutableMap<LexemeAttribute, Boolean> mightHaveCausatives = new ImmutableMap.Builder<LexemeAttribute, Boolean>()
                .put(LexemeAttribute.Causative_t, mightHaveCausative_t)
                .put(LexemeAttribute.Causative_Ir, mightHaveCausative_Ir)
                .put(LexemeAttribute.Causative_It, mightHaveCausative_It)
                .put(LexemeAttribute.Causative_Ar, mightHaveCausative_Ar)
                .put(LexemeAttribute.Causative_dIr, mightHaveCausative_dIr).build();

        final HashSet<DynamicRoot> causativeRoots = new HashSet<DynamicRoot>();

        for (Map.Entry<LexemeAttribute, Boolean> lexemeAttributeBooleanEntry : mightHaveCausatives.entrySet()) {
            final LexemeAttribute causativeAttr = lexemeAttributeBooleanEntry.getKey();
            final Boolean mightHaveHappened = lexemeAttributeBooleanEntry.getValue();

            if (!mightHaveHappened)
                continue;

            // cannot have other causatives at the same time
            // cannot have any other passive at the same time
            // cannot have progressive vowel drop at the same time
            // cannot have aorist_A or aorist_I at the same time

            final DynamicRoot generatedRoot = new DynamicRoot(noAttrRoot);

            generatedRoot.getLexeme().setAttributes(EnumSet.of(causativeAttr));

            generatedRoot.setPhoneticAttributes(this.phoneticsAnalyzer.calculatePhoneticAttributes(partialInput,
                    generatedRoot.getLexeme().getAttributes()));

            causativeRoots.add(generatedRoot);

        }

        return causativeRoots;
    }

    private Set<DynamicRoot> getPossiblePassiveRoots(TurkicLetter lastLetter, TurkishSequence partialInput,
            TurkishSequence wholeSurface, DynamicRoot noAttrRoot) {
        final String wholeSurfaceStr = wholeSurface.getUnderlyingString();
        final String partialInputStr = partialInput.getUnderlyingString();

        final boolean mightHavePassive_Il = (!lastLetter.isVowel() && this.strStartsWithAnyAdditionOfStr(
                wholeSurfaceStr, partialInputStr, Arrays.asList("l", "il", "ul", "l")))
                || (lastLetter.isVowel() && wholeSurfaceStr.startsWith(partialInputStr + 'l'));

        final boolean mightHavePassive_In = (!lastLetter.isVowel() && this.strStartsWithAnyAdditionOfStr(
                wholeSurfaceStr, partialInputStr, Arrays.asList("n", "in", "un", "n")))
                || (lastLetter.isVowel() && wholeSurfaceStr.startsWith(partialInputStr + 'n'));

        final boolean mightHavePassive_InIl = (!lastLetter.isVowel() && this.strStartsWithAnyAdditionOfStr(
                wholeSurfaceStr, partialInputStr, Arrays.asList("nl", "inil", "unul", "nl")))
                || (lastLetter.isVowel() && this.strStartsWithAnyAdditionOfStr(wholeSurfaceStr, partialInputStr,
                        Arrays.asList("nl", "nil", "nul", "nl")));

        final ImmutableMap<LexemeAttribute, Boolean> mightHavePassives = new ImmutableMap.Builder<LexemeAttribute, Boolean>()
                .put(LexemeAttribute.Passive_Il, mightHavePassive_Il)
                .put(LexemeAttribute.Passive_In, mightHavePassive_In)
                .put(LexemeAttribute.Passive_InIl, mightHavePassive_InIl).build();

        final HashSet<DynamicRoot> passiveRoots = new HashSet<DynamicRoot>();

        for (Map.Entry<LexemeAttribute, Boolean> lexemeAttributeBooleanEntry : mightHavePassives.entrySet()) {
            final LexemeAttribute passiveAttr = lexemeAttributeBooleanEntry.getKey();
            final Boolean mightHaveHappened = lexemeAttributeBooleanEntry.getValue();

            if (!mightHaveHappened)
                continue;

            // cannot have other passives at the same time
            // cannot have any other causative at the same time
            // cannot have progressive vowel drop at the same time
            // cannot have aorist_A or aorist_I at the same time

            final DynamicRoot generatedRoot = new DynamicRoot(noAttrRoot);

            generatedRoot.getLexeme().setAttributes(EnumSet.of(passiveAttr));

            generatedRoot.setPhoneticAttributes(this.phoneticsAnalyzer.calculatePhoneticAttributes(partialInput,
                    generatedRoot.getLexeme().getAttributes()));

            passiveRoots.add(generatedRoot);

        }

        return passiveRoots;
    }

    private DynamicRoot getPossibleVoicingRoot(DynamicRoot root) {
        // return only the reverse_voiced root
        Validate.isTrue(root.getSequence().getLastChar().getLetter().equals(TurkishAlphabet.L_d),
                "This is weird! This method should have been called after possible voicing was already checked.");
        final DynamicRoot cloneRoot = new DynamicRoot(root);
        // ignoring Voicing+ProgressiveVowelDrop
        final String orgLemmaRoot = cloneRoot.getLexeme().getLemmaRoot();
        cloneRoot.getLexeme()
                .setLemma(orgLemmaRoot.substring(0, orgLemmaRoot.length() - 1) + TurkishAlphabet.L_t.charValue());
        cloneRoot.getLexeme().setLemmaRoot(cloneRoot.getLexeme().getLemma());
        cloneRoot.getLexeme().getAttributes().add(LexemeAttribute.Voicing);
        return cloneRoot;
    }

    private boolean strStartsWithAnyAdditionOfStr(String wholeSurfaceStr, String partialInputStr,
            List<String> suffixes) {
        for (String suffix : suffixes) {
            if (wholeSurfaceStr.startsWith(partialInputStr + suffix))
                return true;
        }
        return false;
    }
}