org.trnltk.morphology.lexicon.ImmutableRootGenerator.java Source code

Introduction

Here is the source code for org.trnltk.morphology.lexicon.ImmutableRootGenerator.java
Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.morphology.lexicon;

import com.google.common.base.Predicate;
import com.google.common.collect.*;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.commons.lang3.tuple.Pair;
import org.trnltk.model.lexicon.ImmutableLexeme;
import org.trnltk.model.lexicon.ImmutableRoot;
import org.trnltk.model.lexicon.Lexeme;
import org.trnltk.model.lexicon.LexemeAttribute;
import org.trnltk.model.lexicon.PrimaryPos;
import org.trnltk.model.lexicon.PhoneticAttribute;
import org.trnltk.model.lexicon.PhoneticExpectation;
import org.trnltk.model.letter.TurkicLetter;
import org.trnltk.model.letter.TurkishAlphabet;
import org.trnltk.morphology.phonetics.PhoneticsAnalyzer;

import java.util.Collection;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;

public class ImmutableRootGenerator {
    private static final ImmutableSet<LexemeAttribute> MODIFIERS_TO_WATCH = Sets.immutableEnumSet(
            LexemeAttribute.Doubling, LexemeAttribute.LastVowelDrop, LexemeAttribute.ProgressiveVowelDrop,
            LexemeAttribute.InverseHarmony, LexemeAttribute.Voicing, LexemeAttribute.VoicingOpt,
            LexemeAttribute.Special, LexemeAttribute.EndsWithAyn);

    private static final ImmutableMap<Pair<String, PrimaryPos>, String> rootChanges = new ImmutableMap.Builder<Pair<String, PrimaryPos>, String>()
            .put(Pair.of("ben", PrimaryPos.Pronoun), "ban").put(Pair.of("sen", PrimaryPos.Pronoun), "san")
            .put(Pair.of("demek", PrimaryPos.Verb), "di").put(Pair.of("yemek", PrimaryPos.Verb), "yi")
            .put(Pair.of("hepsi", PrimaryPos.Pronoun), "hep").put(Pair.of("ora", PrimaryPos.Pronoun), "or")
            .put(Pair.of("bura", PrimaryPos.Pronoun), "bur").put(Pair.of("ura", PrimaryPos.Pronoun), "ur")
            .put(Pair.of("nere", PrimaryPos.Pronoun), "ner").put(Pair.of("ieri", (PrimaryPos) null), "ier") // applicable to all forms of the word
            .put(Pair.of("dar", (PrimaryPos) null), "dar") // applicable to all forms of the word
            .put(Pair.of("birbiri", PrimaryPos.Pronoun), "birbir").build();

    private PhoneticsAnalyzer phoneticsAnalyzer = new PhoneticsAnalyzer();

    public Collection<ImmutableRoot> generateAll(final Set<Lexeme> lexemes) {
        HashSet<ImmutableRoot> all = new HashSet<ImmutableRoot>();
        for (Lexeme lexeme : lexemes) {
            all.addAll(this.generate(lexeme));
        }
        return all;
    }

    public HashSet<ImmutableRoot> generate(final Lexeme lexeme) {
        if (CollectionUtils.containsAny(lexeme.getAttributes(), MODIFIERS_TO_WATCH)) {
            return this.generateModifiedRootNodes(lexeme);
        } else {
            Set<PhoneticAttribute> phoneticAttributes = phoneticsAnalyzer
                    .calculatePhoneticAttributes(lexeme.getLemmaRoot(), lexeme.getAttributes());
            final ImmutableRoot root = new ImmutableRoot(lexeme.getLemmaRoot(), lexeme,
                    Sets.immutableEnumSet(phoneticAttributes), null);
            return Sets.newHashSet(root);
        }
    }

    private HashSet<ImmutableRoot> generateModifiedRootNodes(final Lexeme lexeme) {
        final Set<LexemeAttribute> lexemeAttributes = lexeme.getAttributes();
        if (lexemeAttributes.contains(LexemeAttribute.Special))
            return this.handleSpecialRoots(lexeme);

        if (lexemeAttributes.contains(LexemeAttribute.EndsWithAyn)) { //kind of hack, didn't like it :(
            // if the word ends with Ayn
            // create roots with that attribute, and without that attribute
            // when creating with that attribute, add a VowelStart expectation

            final HashSet<ImmutableRoot> immutableRoots = new HashSet<ImmutableRoot>();

            final EnumSet<LexemeAttribute> lexemeAttributesWithoutAyn = EnumSet.copyOf(lexemeAttributes);
            lexemeAttributesWithoutAyn.remove(LexemeAttribute.EndsWithAyn);
            final Lexeme lexemeWithoutAttrEndsWithAyn = new ImmutableLexeme(lexeme.getLemma(),
                    lexeme.getLemmaRoot(), lexeme.getPrimaryPos(), lexeme.getSecondaryPos(),
                    Sets.immutableEnumSet(lexemeAttributesWithoutAyn));
            final HashSet<ImmutableRoot> rootsWithoutAynApplied = this
                    .generateModifiedRootNodes(lexemeWithoutAttrEndsWithAyn);
            immutableRoots.addAll(rootsWithoutAynApplied);

            for (ImmutableRoot immutableRoot : rootsWithoutAynApplied) {
                final ImmutableSet<PhoneticAttribute> phoneticAttributesWithoutAynApplied = immutableRoot
                        .getPhoneticAttributes();
                final HashSet<PhoneticAttribute> phoneticAttributesWithAynApplied = Sets
                        .newHashSet(phoneticAttributesWithoutAynApplied);
                phoneticAttributesWithAynApplied.remove(PhoneticAttribute.LastLetterVowel);
                phoneticAttributesWithAynApplied.add(PhoneticAttribute.LastLetterConsonant);
                final ImmutableRoot immutableRootWithAynApplied = new ImmutableRoot(immutableRoot.getSequence(),
                        immutableRoot.getLexeme(), Sets.immutableEnumSet(phoneticAttributesWithAynApplied),
                        Sets.immutableEnumSet(PhoneticExpectation.VowelStart));
                immutableRoots.add(immutableRootWithAynApplied);
            }

            // just before returning, set correct lexeme again
            final HashSet<ImmutableRoot> immutableRootsWithCorrectLexemeAttr = new HashSet<ImmutableRoot>();
            for (ImmutableRoot immutableRoot : immutableRoots) {
                immutableRootsWithCorrectLexemeAttr.add(new ImmutableRoot(immutableRoot.getSequence(), lexeme,
                        immutableRoot.getPhoneticAttributes(), immutableRoot.getPhoneticExpectations()));
            }

            return immutableRootsWithCorrectLexemeAttr;
        }

        final String lemmaRoot = lexeme.getLemmaRoot();
        String modifiedRootStr = lexeme.getLemmaRoot();

        final EnumSet<PhoneticAttribute> originalPhoneticAttrs = phoneticsAnalyzer
                .calculatePhoneticAttributes(lexeme.getLemmaRoot(), null);
        final EnumSet<PhoneticAttribute> modifiedPhoneticAttrs = phoneticsAnalyzer
                .calculatePhoneticAttributes(lexeme.getLemmaRoot(), null);

        final EnumSet<PhoneticExpectation> originalPhoneticExpectations = EnumSet.noneOf(PhoneticExpectation.class);
        final EnumSet<PhoneticExpectation> modifiedPhoneticExpectations = EnumSet.noneOf(PhoneticExpectation.class);

        if (CollectionUtils.containsAny(lexemeAttributes,
                Sets.immutableEnumSet(LexemeAttribute.Voicing, LexemeAttribute.VoicingOpt))) {
            final TurkicLetter lastLetter = TurkishAlphabet
                    .getLetter(modifiedRootStr.charAt(modifiedRootStr.length() - 1));
            final TurkicLetter voicedLastLetter = lemmaRoot.endsWith("nk") ? TurkishAlphabet.L_g
                    : TurkishAlphabet.voice(lastLetter);
            Validate.notNull(voicedLastLetter);
            modifiedRootStr = modifiedRootStr.substring(0, modifiedRootStr.length() - 1)
                    + voicedLastLetter.charValue();

            modifiedPhoneticAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);

            if (!lexemeAttributes.contains(LexemeAttribute.VoicingOpt)) {
                originalPhoneticExpectations.add(PhoneticExpectation.ConsonantStart);
            }

            modifiedPhoneticExpectations.add(PhoneticExpectation.VowelStart);
        }

        if (lexemeAttributes.contains(LexemeAttribute.Doubling)) {
            modifiedRootStr += modifiedRootStr.charAt(modifiedRootStr.length() - 1);
            originalPhoneticExpectations.add(PhoneticExpectation.ConsonantStart);
            modifiedPhoneticExpectations.add(PhoneticExpectation.VowelStart);
        }

        if (lexemeAttributes.contains(LexemeAttribute.LastVowelDrop)) {
            modifiedRootStr = modifiedRootStr.substring(0, modifiedRootStr.length() - 2)
                    + modifiedRootStr.charAt(modifiedRootStr.length() - 1);
            if (!PrimaryPos.Verb.equals(lexeme.getPrimaryPos()))
                originalPhoneticExpectations.add(PhoneticExpectation.ConsonantStart);

            modifiedPhoneticExpectations.add(PhoneticExpectation.VowelStart);
        }

        if (lexemeAttributes.contains(LexemeAttribute.InverseHarmony)) {
            originalPhoneticAttrs.add(PhoneticAttribute.LastVowelFrontal);
            originalPhoneticAttrs.remove(PhoneticAttribute.LastVowelBack);
            modifiedPhoneticAttrs.add(PhoneticAttribute.LastVowelFrontal);
            modifiedPhoneticAttrs.remove(PhoneticAttribute.LastVowelBack);
        }

        if (lexemeAttributes.contains(LexemeAttribute.ProgressiveVowelDrop)) {
            modifiedRootStr = modifiedRootStr.substring(0, modifiedRootStr.length() - 1);
            if (this.hasVowel(modifiedRootStr)) {
                modifiedPhoneticAttrs.clear();
                modifiedPhoneticAttrs.addAll(phoneticsAnalyzer.calculatePhoneticAttributes(modifiedRootStr, null));
            }
            modifiedPhoneticExpectations.add(PhoneticExpectation.VowelStart);
        }

        ImmutableRoot originalRoot = new ImmutableRoot(lexeme.getLemmaRoot(), lexeme,
                Sets.immutableEnumSet(originalPhoneticAttrs), Sets.immutableEnumSet(originalPhoneticExpectations));
        ImmutableRoot modifiedRoot = new ImmutableRoot(modifiedRootStr, lexeme,
                Sets.immutableEnumSet(modifiedPhoneticAttrs), Sets.immutableEnumSet(modifiedPhoneticExpectations));

        if (originalRoot.equals(modifiedRoot))
            return Sets.newHashSet(originalRoot);
        else
            return Sets.newHashSet(originalRoot, modifiedRoot);
    }

    private HashSet<ImmutableRoot> handleSpecialRoots(final Lexeme originalLexeme) {
        String changedRootStr = rootChanges.get(Pair.of(originalLexeme.getLemma(), originalLexeme.getPrimaryPos()));
        if (StringUtils.isBlank(changedRootStr))
            changedRootStr = rootChanges.get(Pair.of(originalLexeme.getLemma(), (PrimaryPos) null));

        Validate.notNull(changedRootStr, "Unhandled root change : " + originalLexeme);

        final Set<LexemeAttribute> attributes = originalLexeme.getAttributes();
        final EnumSet<LexemeAttribute> newAttributes = EnumSet.copyOf(attributes);
        newAttributes.remove(LexemeAttribute.Special);
        final Lexeme modifiedLexeme = new ImmutableLexeme(originalLexeme.getLemma(), originalLexeme.getLemmaRoot(),
                originalLexeme.getPrimaryPos(), originalLexeme.getSecondaryPos(),
                Sets.immutableEnumSet(newAttributes));

        final String unchangedRootStr = originalLexeme.getLemmaRoot();

        final ImmutableRoot rootUnchanged = new ImmutableRoot(unchangedRootStr, modifiedLexeme,
                Sets.immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(unchangedRootStr,
                        modifiedLexeme.getAttributes())),
                null);

        final ImmutableRoot rootChanged = new ImmutableRoot(changedRootStr, modifiedLexeme, Sets.immutableEnumSet(
                phoneticsAnalyzer.calculatePhoneticAttributes(changedRootStr, modifiedLexeme.getAttributes())),
                null);

        return Sets.newHashSet(rootUnchanged, rootChanged);

    }

    private boolean hasVowel(String str) {
        return Iterables.any(Lists.newArrayList(ArrayUtils.toObject(str.toCharArray())),
                new Predicate<Character>() {
                    @Override
                    public boolean apply(Character input) {
                        return TurkishAlphabet.getLetter(input).isVowel();
                    }
                });
    }
}