org.trnltk.util.MorphemeContainerFormatter.java Source code

Java tutorial

Introduction

Here is the source code for org.trnltk.util.MorphemeContainerFormatter.java

Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.util;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.json.JSONArray;
import org.json.JSONObject;
import org.trnltk.model.lexicon.Lexeme;
import org.trnltk.model.lexicon.PrimaryPos;
import org.trnltk.model.lexicon.Root;
import org.trnltk.model.lexicon.SecondaryPos;
import org.trnltk.model.morpheme.MorphemeContainer;
import org.trnltk.model.suffix.*;
import org.trnltk.morphology.morphotactics.SuffixGraphState;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

@SuppressWarnings("JavaDoc")
public class MorphemeContainerFormatter {

    private static final ImmutableSet<Pair<PrimaryPos, SecondaryPos>> DERIVATION_GROUPING_FORMAT_SECONDARY_POS_TO_SKIP = new ImmutableSet.Builder<Pair<PrimaryPos, SecondaryPos>>()
            .add(Pair.of(PrimaryPos.Adverb, SecondaryPos.Question))
            .add(Pair.of(PrimaryPos.Adverb, SecondaryPos.Time))
            .add(Pair.of(PrimaryPos.Adjective, SecondaryPos.Question)).build();

    /**
     * @param morphemeContainer the MC
     * @return kitap+Noun+A3sg+Pnon+Dat for word 'kitaba'
     */
    public static String formatMorphemeContainer(final MorphemeContainer morphemeContainer) {
        final Root root = morphemeContainer.getRoot();
        final Lexeme lexeme = root.getLexeme();

        final StringBuilder b = new StringBuilder();
        b.append(String.format("%s+%s", lexeme.getLemmaRoot(),
                morphemeContainer.getRootState().getPrimaryPos().getStringForm()));

        if (lexeme.getSecondaryPos() != null)
            b.append("+").append(lexeme.getSecondaryPos());

        if (morphemeContainer.hasTransitions()) {
            for (SuffixTransition suffixTransition : morphemeContainer.getSuffixTransitions()) {
                // don't add free transitions to the formatted string
                final Suffix suffix = suffixTransition.getSuffixFormApplication().getSuffixForm().getSuffix();
                if (suffix instanceof FreeTransitionSuffix || suffix instanceof ConditionalFreeTransitionSuffix)
                    continue;

                b.append("+").append(formatTransition(suffixTransition, false));
            }
        }

        return b.toString();
    }

    @SuppressWarnings("UnusedDeclaration")
    public static Collection<String> formatMorphemeContainers(
            final Collection<MorphemeContainer> morphemeContainers) {
        return Collections2.transform(morphemeContainers, new Function<MorphemeContainer, String>() {
            @Override
            public String apply(MorphemeContainer input) {
                return MorphemeContainerFormatter.formatMorphemeContainer(input);
            }
        });
    }

    public static List<String> formatMorphemeContainers(final List<MorphemeContainer> morphemeContainers) {
        return Lists.transform(morphemeContainers, new Function<MorphemeContainer, String>() {
            @Override
            public String apply(MorphemeContainer input) {
                return MorphemeContainerFormatter.formatMorphemeContainer(input);
            }
        });
    }

    /**
     * @param morphemeContainer the MC
     * @return kitab(kitap)+Noun+A3sg+Pnon+Dat(+yA[a]) for word 'kitaba'
     */
    public static String formatMorphemeContainerWithForms(final MorphemeContainer morphemeContainer) {
        final StringBuilder b = new StringBuilder();
        final Root root = morphemeContainer.getRoot();
        final Lexeme lexeme = root.getLexeme();

        b.append(String.format("%s(%s)+%s", root.getSequence().getUnderlyingString(), lexeme.getLemma(),
                morphemeContainer.getRootState().getPrimaryPos().getStringForm()));
        if (lexeme.getSecondaryPos() != null)
            b.append("+").append(lexeme.getSecondaryPos().getStringForm());

        if (morphemeContainer.hasTransitions()) {
            for (SuffixTransition suffixTransition : morphemeContainer.getSuffixTransitions()) {
                // don't add free transitions to the formatted string
                final Suffix suffix = suffixTransition.getSuffixFormApplication().getSuffixForm().getSuffix();
                if (suffix instanceof FreeTransitionSuffix || suffix instanceof ConditionalFreeTransitionSuffix)
                    continue;

                b.append("+").append(formatTransition(suffixTransition, true));
            }
        }

        return b.toString();
    }

    /**
     * @param morphemeContainer the MC
     * @return (1,"kitap+Noun+A3sg+Pnon+Dat")
     */
    public static String formatMorphemeContainerWithDerivationGrouping(MorphemeContainer morphemeContainer) {
        return formatMorphemeContainerWithDerivationGrouping(morphemeContainer, true);
    }

    /**
     * @param morphemeContainer the MC
     * @return (1,"kitap+Noun+A3sg+Pnon+Dat") or ("kitap+Noun+A3sg+Pnon+Dat")
     */
    public static String formatMorphemeContainerWithDerivationGrouping(MorphemeContainer morphemeContainer,
            boolean addIndices) {
        final Lexeme lexeme = morphemeContainer.getRoot().getLexeme();
        final PrimaryPos primaryPos = lexeme.getPrimaryPos();
        final SecondaryPos secondaryPos = lexeme.getSecondaryPos();

        final String lemmaRoot = lexeme.getLemmaRoot();

        final String secondaryPosStr;
        if (secondaryPos != null) {
            if (DERIVATION_GROUPING_FORMAT_SECONDARY_POS_TO_SKIP.contains(Pair.of(primaryPos, secondaryPos)))
                secondaryPosStr = null;
            else
                secondaryPosStr = secondaryPos.getStringForm();
        } else {
            secondaryPosStr = null;
        }

        final String formattedLexeme = Joiner.on("+").skipNulls()
                .join(Arrays.asList(lemmaRoot, primaryPos.getStringForm(), secondaryPosStr));

        final List<List<String>> groups = new ArrayList<List<String>>();
        List<String> currentGroup = new ArrayList<String>(Arrays.asList(formattedLexeme));

        for (SuffixTransition suffixTransition : morphemeContainer.getSuffixTransitions()) {
            if (suffixTransition.isDerivational()) {
                groups.add(currentGroup);
                currentGroup = new ArrayList<String>(
                        Arrays.asList(suffixTransition.getTargetState().getPrimaryPos().getStringForm()));
            }

            final Suffix suffix = suffixTransition.getSuffixFormApplication().getSuffixForm().getSuffix();
            if (suffix instanceof FreeTransitionSuffix || suffix instanceof ConditionalFreeTransitionSuffix)
                //noinspection UnnecessaryContinue
                continue;
            else
                currentGroup.add(suffix.getPrettyName());

        }

        groups.add(currentGroup);

        final List<String> formattedGroups = Lists.transform(groups, new Function<List<String>, String>() {
            @Override
            public String apply(List<String> input) {
                return Joiner.on("+").join(input);
            }
        });

        final StringBuilder builder = new StringBuilder();
        for (int i = 0; i < formattedGroups.size(); i++) {
            final String formattedGroup = formattedGroups.get(i);
            builder.append("(");
            if (addIndices)
                builder.append(i + 1).append(",");
            builder.append("\"").append(formattedGroup).append("\")");
        }

        return builder.toString();
    }

    /**
     * @param morphemeContainer the MC
     * @return {Root:"drd", LemmaRoot:"drt", Parts:[{POS:"Num", SPOS:"Card"}, {POS:"Adj", Suffixes:["Ord"]}, {POS:"Adj", Suffixes:["Ness", "Sth"]}, {POS:"Noun", Suffixes:["A3Sg", "Pnon", "Nom"]}]}
     */
    public static String formatMorphemeContainerDetailed(MorphemeContainer morphemeContainer) {
        try {
            final List<JSONObject> parts = new ArrayList<JSONObject>();
            JSONObject currentPart = null;

            List<SuffixTransition> suffixTransitions = morphemeContainer.getSuffixTransitions();
            for (int i = 0; i < suffixTransitions.size(); i++) {
                SuffixTransition suffixTransition = suffixTransitions.get(i);
                final Suffix suffix = suffixTransition.getSuffixFormApplication().getSuffixForm().getSuffix();

                final SuffixGraphState targetState = suffixTransition.getTargetState();
                final boolean isDerivational = suffixTransition.isDerivational();
                if (isDerivational || i == 0) {
                    if (currentPart != null)
                        parts.add(currentPart);

                    currentPart = new JSONObject();
                    final String currentPos = targetState.getPrimaryPos().getStringForm();
                    final SecondaryPos currentSecondaryPos = targetState.getSecondaryPos();
                    final String currentSpos = currentSecondaryPos == null ? null
                            : currentSecondaryPos.getStringForm();
                    currentPart.put("POS", currentPos);
                    if (StringUtils.isNotBlank(currentSpos))
                        currentPart.put("SPOS", currentSpos);
                }

                if (suffix instanceof FreeTransitionSuffix || suffix instanceof ConditionalFreeTransitionSuffix)
                    continue;

                JSONArray suffixes;
                if (currentPart.has("Suffixes")) {
                    suffixes = currentPart.getJSONArray("Suffixes");
                } else {
                    suffixes = new JSONArray();
                    currentPart.put("Suffixes", suffixes);
                }

                suffixes.put(suffix.getPrettyName());
            }

            if (currentPart != null)
                parts.add(currentPart);

            final Lexeme lexeme = morphemeContainer.getRoot().getLexeme();
            final String rootStr = morphemeContainer.getRoot().getSequence().getUnderlyingString();
            final String lemmaRoot = lexeme.getLemmaRoot();
            final PrimaryPos primaryPos = lexeme.getPrimaryPos();
            final SecondaryPos secondaryPos = lexeme.getSecondaryPos();

            final JSONObject parentObject = new JSONObject();
            parentObject.put("Root", rootStr);
            parentObject.put("LemmaRoot", lemmaRoot);
            parentObject.put("RootPos", primaryPos.getStringForm());
            if (secondaryPos != null)
                parentObject.put("RootSpos", secondaryPos.getStringForm());
            if (CollectionUtils.isNotEmpty(parts))
                parentObject.put("Parts", parts);

            return parentObject.toString();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    private static String formatTransition(final SuffixTransition suffixTransition, final boolean includeForm) {
        final StringBuilder b = new StringBuilder();

        if (suffixTransition.isDerivational()) {
            final SuffixGraphState targetState = suffixTransition.getTargetState();
            b.append(targetState.getPrimaryPos().getStringForm()).append("+");
            if (targetState.getSecondaryPos() != null)
                b.append(targetState.getSecondaryPos().getStringForm()).append("+");
        }

        final SuffixForm suffixForm = suffixTransition.getSuffixFormApplication().getSuffixForm();
        final String actualSuffixForm = suffixTransition.getSuffixFormApplication().getActualSuffixForm();
        if (includeForm && StringUtils.isNotBlank(actualSuffixForm)
                && StringUtils.isAlphanumeric(actualSuffixForm)) {
            b.append(String.format("%s(%s[%s])", suffixForm.getSuffix().getPrettyName(),
                    suffixForm.getForm().getSuffixFormStr(), actualSuffixForm));
        } else {
            b.append(suffixForm.getSuffix().getPrettyName());
        }

        return b.toString();
    }

    public static Collection<String> formatMorphemeContainersWithDerivationGrouping(
            Collection<MorphemeContainer> morphemeContainers) {
        return Collections2.transform(morphemeContainers, new Function<MorphemeContainer, String>() {
            @Override
            public String apply(MorphemeContainer input) {
                return MorphemeContainerFormatter.formatMorphemeContainerWithDerivationGrouping(input);
            }
        });
    }
}