org.trnltk.morphology.contextless.parser.PhoneticAttributeSets.java Source code

Java tutorial

Introduction

Here is the source code for org.trnltk.morphology.contextless.parser.PhoneticAttributeSets.java

Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.morphology.contextless.parser;

import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.trnltk.model.lexicon.PhoneticAttribute;
import org.trnltk.model.lexicon.PhoneticAttributeMetadata;

import java.util.HashSet;
import java.util.Set;

/**
 * Stores all valid combinations of {@link PhoneticAttribute}s.
 * <p/>
 * The combinations are computed once, when an instance is constructed.
 * <p/>
 * Valid combinations are found in the following way:
 * <ol>
 * <li>Compute the power set of all {@link PhoneticAttribute}s</li>
 * <li>Discard the sets that are not valid</li>
 * </ol>
 * <p/>
 * Validity of a combination is checked using the {@link PhoneticAttributeMetadata}, where rules are defined for each
 * {@link PhoneticAttribute} in terms of what other {@link PhoneticAttribute}s could make sense with an attribute.
 * <p/>
 * The quality of the metadata rules is regularly checked in the build system.
 * That means valid {@link PhoneticAttribute} combinations are checked against natural combinations found in a big
 * corpus. The metadata must not allow any combination which does not exist in the corpus.
 */
public class PhoneticAttributeSets {

    /** Valid attribute sets, keyed by the number produced by {@link #getNumberForSet(Set)}. */
    private final ImmutableMap<Long, Set<PhoneticAttribute>> validPhoneticAttributeSetsMap;

    /** The values of {@link #validPhoneticAttributeSetsMap}, kept to avoid re-deriving the view. */
    private final ImmutableCollection<Set<PhoneticAttribute>> validPhoneticAttributeSets;

    /**
     * Computes and stores all valid phonetic attribute sets.
     * <p/>
     * NOTE: this constructor calls {@link #findValidSets()}, which in turn calls {@link #getNumberForSet(Set)}.
     * Both are overridable, so overrides run before the subclass' own fields are initialized.
     */
    public PhoneticAttributeSets() {
        this.validPhoneticAttributeSetsMap = this.findValidSets();
        this.validPhoneticAttributeSets = validPhoneticAttributeSetsMap.values();
    }

    /**
     * Enumerates the power set of all {@link PhoneticAttribute}s and keeps the subsets accepted by
     * {@link PhoneticAttributeMetadata#isValid}.
     *
     * @return an immutable map from the number of a valid set (see {@link #getNumberForSet(Set)}) to that set
     */
    protected ImmutableMap<Long, Set<PhoneticAttribute>> findValidSets() {

        // An insertion-ordered set keeps the enum declaration order. A HashSet of enum constants is ordered
        // by identity hash codes, which differ between JVM runs and would make the resulting order arbitrary.
        final Set<PhoneticAttribute> allPhoneticAttributes = Sets
                .newLinkedHashSet(Lists.newArrayList(PhoneticAttribute.values()));

        // Guava documents that powerSet rejects input sets with more than 30 elements.
        final Set<Set<PhoneticAttribute>> phoneticAttributePowerSets = Sets.powerSet(allPhoneticAttributes);

        final ImmutableMap.Builder<Long, Set<PhoneticAttribute>> validPhoneticAttributeSetsMapBuilder = new ImmutableMap.Builder<Long, Set<PhoneticAttribute>>();
        for (Set<PhoneticAttribute> set : phoneticAttributePowerSets) {
            if (PhoneticAttributeMetadata.isValid(set)) {
                validPhoneticAttributeSetsMapBuilder.put(getNumberForSet(set), set);
            }
        }

        return validPhoneticAttributeSetsMapBuilder.build();
    }

    /**
     * @return the valid phonetic attribute sets, keyed by the number computed with {@link #getNumberForSet(Set)}
     */
    public ImmutableMap<Long, Set<PhoneticAttribute>> getValidPhoneticAttributeSetsMap() {
        return validPhoneticAttributeSetsMap;
    }

    /**
     * @return the valid phonetic attribute sets
     */
    public ImmutableCollection<Set<PhoneticAttribute>> getValidPhoneticAttributeSets() {
        return validPhoneticAttributeSets;
    }

    /**
     * Encodes a set of phonetic attributes as a bit mask.
     * <p/>
     * Every {@link PhoneticAttribute} constant occupies one bit, in the order of {@link PhoneticAttribute#values()}:
     * the first constant maps to the most significant used bit and the last constant maps to bit 0. A bit is set
     * when the given set contains the corresponding attribute.
     *
     * @param set the attributes to encode; must not be {@code null}
     * @return the bit mask of the given set, {@code 0} for an empty set
     */
    public long getNumberForSet(Set<PhoneticAttribute> set) {
        long number = 0L;
        for (PhoneticAttribute phoneticAttribute : PhoneticAttribute.values()) {
            // Shift first, then set the lowest bit: no trailing shift needs to be undone afterwards, so the
            // most significant bit is neither dropped nor sign-extended when many attributes exist.
            number = number << 1;
            if (set.contains(phoneticAttribute)) {
                number = number | 0x1L;
            }
        }

        return number;
    }
}