org.trnltk.model.letter.TurkishAlphabet.java Source code

Java tutorial

Introduction

Here is the source code for org.trnltk.model.letter.TurkishAlphabet.java

Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.model.letter;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSetMultimap;
import org.apache.commons.lang3.StringUtils;
import org.trnltk.util.Constants;

import java.util.Arrays;

import static org.trnltk.model.letter.TurkicLetter.builder;

/**
 * Contains Turkish Letters, Turkish Letter equivalent chars, several helper methods.
 * TurkishAlphabet only contains small case letters.
 */
@SuppressWarnings({"UnusedDeclaration", "WeakerAccess"})
public class TurkishAlphabet {

    // Turkish specific characters.
    public static final char C_CC = '\u00c7'; // 
    public static final char C_cc = '\u00e7'; // 
    public static final char C_GG = '\u011e'; // 
    public static final char C_gg = '\u011f'; // 
    public static final char C_ii = '\u0131'; // 
    public static final char C_II = '\u0130'; // 
    public static final char C_OO = '\u00d6'; // 
    public static final char C_oo = '\u00f6'; // 
    public static final char C_SS = '\u015e'; // 
    public static final char C_ss = '\u015f'; // 
    public static final char C_UU = '\u00dc'; // 
    public static final char C_uu = '\u00fc'; // 

    // letters used in turkish text having circumflex.
    public static final char A_CIRC = '\u00c2'; // 
    public static final char a_CIRC = '\u00e2'; // 
    public static final char I_CIRC = '\u00ce'; // 
    public static final char i_CIRC = '\u00ee'; // 
    public static final char U_CIRC = '\u00db'; // 
    public static final char u_CIRC = '\u00fb'; // 

    /**
     * Turkish Letters. q,x,w is also added for foreign proper nouns. They are marked as 'foreign'
     */
    public static final TurkicLetter L_a = builder('a', 1).vowel().build();
    public static final TurkicLetter L_b = builder('b', 2).build();
    public static final TurkicLetter L_c = builder('c', 3).build();
    public static final TurkicLetter L_cc = builder(C_cc, 4).notInAscii().voiceless().similarAscii('c').build();
    public static final TurkicLetter L_d = builder('d', 5).build();
    public static final TurkicLetter L_e = builder('e', 6).vowel().frontalVowel().build();
    public static final TurkicLetter L_f = builder('f', 7).continuant().voiceless().build();
    public static final TurkicLetter L_g = builder('g', 8).build();
    public static final TurkicLetter L_gg = builder(C_gg, 9).continuant().notInAscii().similarAscii('g').build();
    public static final TurkicLetter L_h = builder('h', 10).continuant().voiceless().build();
    public static final TurkicLetter L_ii = builder(C_ii, 11).vowel().notInAscii().similarAscii('i').build();
    public static final TurkicLetter L_i = builder('i', 12).vowel().frontalVowel().build();
    public static final TurkicLetter L_j = builder('j', 13).continuant().build();
    public static final TurkicLetter L_k = builder('k', 14).voiceless().build();
    public static final TurkicLetter L_l = builder('l', 15).continuant().build();
    public static final TurkicLetter L_m = builder('m', 16).continuant().build();
    public static final TurkicLetter L_n = builder('n', 17).continuant().build();
    public static final TurkicLetter L_o = builder('o', 18).vowel().roundedVowel().build();
    public static final TurkicLetter L_oo = builder(C_oo, 19).vowel().frontalVowel().roundedVowel().notInAscii().similarAscii('o').build();
    public static final TurkicLetter L_p = builder('p', 20).voiceless().build();
    public static final TurkicLetter L_r = builder('r', 21).continuant().build();
    public static final TurkicLetter L_s = builder('s', 22).continuant().voiceless().build();
    public static final TurkicLetter L_ss = builder(C_ss, 23).continuant().notInAscii().voiceless().similarAscii('s').build();
    public static final TurkicLetter L_t = builder('t', 24).voiceless().build();
    public static final TurkicLetter L_u = builder('u', 25).vowel().roundedVowel().build();
    public static final TurkicLetter L_uu = builder(C_uu, 26).vowel().roundedVowel().frontalVowel().similarAscii('u').notInAscii().build();
    public static final TurkicLetter L_v = builder('v', 27).continuant().build();
    public static final TurkicLetter L_y = builder('y', 28).continuant().build();
    public static final TurkicLetter L_z = builder('z', 29).continuant().build();
    // Not Turkish but sometimes appears in geographical names etc.
    public static final TurkicLetter L_q = builder('q', 30).foreign().build();
    public static final TurkicLetter L_w = builder('w', 31).foreign().build();
    public static final TurkicLetter L_x = builder('x', 32).foreign().build();
    // Circumflexed letters
    public static final TurkicLetter L_ac = builder(a_CIRC, 33).vowel().similarAscii('a').notInAscii().build();
    public static final TurkicLetter L_ic = builder(i_CIRC, 34).vowel().frontalVowel().similarAscii('i').notInAscii().build();
    public static final TurkicLetter L_uc = builder(u_CIRC, 35).vowel().frontalVowel().similarAscii('u').roundedVowel().notInAscii().build();

    // Punctuations
    public static final TurkicLetter P_Dot = builder('.', 33).build();
    public static final TurkicLetter P_Comma = builder(',', 34).build();
    public static final TurkicLetter P_Hyphen = builder('-', 35).build();
    public static final TurkicLetter P_Colon = builder(':', 36).build();
    public static final TurkicLetter P_Semicolon = builder(';', 37).build();
    public static final TurkicLetter P_Plus = builder('+', 38).build();
    public static final TurkicLetter P_Popen = builder('(', 39).build();
    public static final TurkicLetter P_Pclose = builder(')', 40).build();
    public static final TurkicLetter P_Bopen = builder('[', 41).build();
    public static final TurkicLetter P_Bclose = builder(']', 42).build();
    public static final TurkicLetter P_CBopen = builder('{', 43).build();
    public static final TurkicLetter P_CBclose = builder('}', 44).build();
    public static final TurkicLetter P_QuestionMark = builder('?', 45).build();
    public static final TurkicLetter P_ExcMark = builder('!', 46).build();
    public static final TurkicLetter P_SQuote = builder('\'', 47).build();
    public static final TurkicLetter P_DQuote = builder('\"', 48).build();
    public static final TurkicLetter P_Slash = builder('/', 49).build();
    public static final TurkicLetter P_Percent = builder('%', 50).build();
    public static final TurkicLetter P_Number = builder('#', 51).build();
    public static final TurkicLetter P_Dollar = builder('$', 52).build();
    public static final TurkicLetter P_Yen = builder('', 53).build();
    public static final TurkicLetter P_Pound = builder('', 54).build();
    public static final TurkicLetter P_Euro = builder('', 55).build();

    // numbers
    public static final TurkicLetter N_0 = builder('0', 100).build();
    public static final TurkicLetter N_1 = builder('1', 101).build();
    public static final TurkicLetter N_2 = builder('2', 102).build();
    public static final TurkicLetter N_3 = builder('3', 103).build();
    public static final TurkicLetter N_4 = builder('4', 104).build();
    public static final TurkicLetter N_5 = builder('5', 105).build();
    public static final TurkicLetter N_6 = builder('6', 106).build();
    public static final TurkicLetter N_7 = builder('7', 107).build();
    public static final TurkicLetter N_8 = builder('8', 108).build();
    public static final TurkicLetter N_9 = builder('9', 109).build();


    public static final TurkicLetter[] TURKISH_LETTERS = {
            L_a, L_b, L_c, L_cc, L_d, L_e, L_f, L_g,
            L_gg, L_h, L_ii, L_i, L_j, L_k, L_l, L_m,
            L_n, L_o, L_oo, L_p, L_r, L_s, L_ss, L_t,
            L_u, L_uu, L_v, L_y, L_z, L_q, L_w, L_x,
            L_ac, L_ic, L_uc,
            P_Dot, P_Comma, P_Hyphen, P_Colon, P_Semicolon,
            P_Plus, P_Popen, P_Pclose, P_Bopen, P_Bclose, P_CBopen, P_CBclose,
            P_QuestionMark, P_ExcMark, P_SQuote, P_DQuote, P_Slash, P_Percent, P_Number,
            P_Dollar, P_Yen, P_Pound, P_Euro,
            N_0, N_1, N_2, N_3, N_4, N_5, N_6, N_7, N_8, N_9
    };

    public static final TurkicLetter[] TURKISH_ALPHA_LETTERS = {
            L_a, L_ac, L_b, L_c, L_cc, L_d, L_e, L_f, L_g,
            L_gg, L_h, L_ii, L_i, L_ic, L_j, L_k, L_l, L_m,
            L_n, L_o, L_oo, L_p, L_q, L_r, L_s, L_ss, L_t,
            L_u, L_uu, L_uc, L_v, L_w, L_x, L_y, L_z
    };

    public static final TurkicLetter[] TURKISH_PUNC_LETTERS = {
            P_Dot, P_Comma, P_Hyphen, P_Colon, P_Semicolon,
            P_Plus, P_Popen, P_Pclose, P_Bopen, P_Bclose, P_CBopen, P_CBclose,
            P_QuestionMark, P_ExcMark, P_SQuote, P_DQuote, P_Slash, P_Percent, P_Number,
            P_Dollar, P_Yen, P_Pound, P_Euro
    };

    public static final TurkicLetter[] TURKISH_NUMERIC_LETTERS = {
            N_0, N_1, N_2, N_3, N_4, N_5, N_6, N_7, N_8, N_9
    };

    // 0x15f is the maximum char value in turkish specific characters. It is the size
    // of our lookup tables. This could be done better, but for now it works.
    private static final int MAX_CHAR_VALUE = 0x20ac + 1;
    private static final TurkicLetter[] CHAR_TO_LETTER_LOOKUP = new TurkicLetter[MAX_CHAR_VALUE];
    private static final boolean[] VALID_CHAR_TABLE = new boolean[MAX_CHAR_VALUE];

    static {
        Arrays.fill(CHAR_TO_LETTER_LOOKUP, TurkicLetter.UNDEFINED);
        Arrays.fill(VALID_CHAR_TABLE, false);
        for (TurkicLetter turkicLetter : TURKISH_LETTERS) {
            final char c = turkicLetter.charValue();
            CHAR_TO_LETTER_LOOKUP[c] = turkicLetter;
            VALID_CHAR_TABLE[c] = true;

            char upperCase = StringUtils.upperCase(String.valueOf(c), Constants.TURKISH_LOCALE).charAt(0);
            if (upperCase != c) {
                CHAR_TO_LETTER_LOOKUP[upperCase] = turkicLetter;
                VALID_CHAR_TABLE[upperCase] = true;
            }

        }
    }

    protected static final ImmutableMap<TurkicLetter, TurkicLetter> devoicingMap = new ImmutableMap.Builder<TurkicLetter, TurkicLetter>()
            .put(L_b, L_p)
            .put(L_c, L_cc)
            .put(L_d, L_t)
            .put(L_g, L_k)
            .put(L_gg, L_k)
            .build();

    protected static final ImmutableMap<TurkicLetter, TurkicLetter> voicingMap = new ImmutableMap.Builder<TurkicLetter, TurkicLetter>().
            put(L_p, L_b).
            put(L_k, L_gg).
            put(L_cc, L_c).
            put(L_t, L_d).
            put(L_g, L_gg).
            build();

    public static final ImmutableSetMultimap<TurkicLetter, TurkicLetter> Inverse_Voicing_Map = new ImmutableSetMultimap.Builder<TurkicLetter, TurkicLetter>()
            .put(TurkishAlphabet.L_b, TurkishAlphabet.L_p)
            .put(TurkishAlphabet.L_c, TurkishAlphabet.L_cc)
            .put(TurkishAlphabet.L_d, TurkishAlphabet.L_t)
            .put(TurkishAlphabet.L_g, TurkishAlphabet.L_k)
            .put(TurkishAlphabet.L_gg, TurkishAlphabet.L_g)
            .put(TurkishAlphabet.L_gg, TurkishAlphabet.L_k)
            .build();
    public static final ImmutableSet<TurkicLetter> Devoicable_Letters = ImmutableSet.copyOf(org.trnltk.model.letter.TurkishAlphabet.devoicingMap.keySet());
    public static final ImmutableSet<TurkicLetter> Voicable_Letters = ImmutableSet.copyOf(org.trnltk.model.letter.TurkishAlphabet.voicingMap.keySet());

    private TurkishAlphabet() {
        throw new UnsupportedOperationException();
    }

    /**
     * Devoices a turkish letter.
     * <ul>
     * <li>b -> p (*)</li>
     * <li>c -> </li>
     * <li>d -> t</li>
     * <li>g -> k (*)</li>
     * <li> -> k (*)</li>
     * <li>Otherwise -> null</li>
     * </ul>
     * <p/>
     * * = not really applicable, since there is no suffix starting with b, g or 
     *
     * @param l Letter to devoice
     * @return Devoiced letter or null if letter is not devoicable
     */
    public static TurkicLetter devoice(TurkicLetter l) {
        return devoicingMap.get(l);
    }

    /**
     * Voices a turkish letter.
     * <ul>
     * <li>p -> b</li>
     * <li>k -> </li>
     * <li> -> c</li>
     * <li>t -> d</li>
     * <li>g -> </li>
     * <li>Otherwise -> null</li>
     * </ul>
     *
     * @param l Letter to voice
     * @return Voiced letter or null if letter is not voicable
     */
    public static TurkicLetter voice(TurkicLetter l) {
        return voicingMap.get(l);
    }

    /**
     * Returns the TurkicLetter equivalent of character c.
     *
     * @param c input character
     * @return TurkishLetter equivalent.
     * @throws IllegalArgumentException if input character is out of alphabet.
     */
    public static TurkicLetter getLetter(char c) {
        if (c >= MAX_CHAR_VALUE || !VALID_CHAR_TABLE[c])
            return TurkicLetter.builder(c, 9999).build();
        else
            return CHAR_TO_LETTER_LOOKUP[c];
    }

    public static TurkishChar getChar(char c) {
        final TurkicLetter letterForChar = getLetter(c);
        return new TurkishChar(c, letterForChar);
    }

    /**
     * Checks if a character is part of TurkishAlphabet.
     *
     * @param c character to check
     * @return true if it is part of the Turkish alphabet. false otherwise
     */
    public static boolean isValid(char c) {
        return c < MAX_CHAR_VALUE && VALID_CHAR_TABLE[c];
    }

    public static String capitalize(String str) {
        if (StringUtils.isEmpty(str))
            return str;

        final char c = str.charAt(0);
        final char upperChar = StringUtils.upperCase(String.valueOf(c), Constants.TURKISH_LOCALE).charAt(0);
        return String.valueOf(upperChar) + str.substring(1);
    }

    public static String uncapitalize(String str) {
        if (StringUtils.isEmpty(str))
            return str;

        final char c = str.charAt(0);
        final char upperChar = StringUtils.lowerCase(String.valueOf(c), Constants.TURKISH_LOCALE).charAt(0);
        return String.valueOf(upperChar) + str.substring(1);
    }

}