ca.phon.ipadictionary.impl.TransliterationDictionary.java Source code

Introduction

Here is the source code for ca.phon.ipadictionary.impl.TransliterationDictionary.java
Source

/*
 * Phon - An open source tool for research in phonology.
 * Copyright (C) 2005 - 2015, Gregory Hedlund <ghedlund@mun.ca> and Yvan Rose <yrose@mun.ca>
 * Dept of Linguistics, Memorial University <https://phon.ca>
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package ca.phon.ipadictionary.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import ca.phon.ipadictionary.IPADictionary;
import ca.phon.ipadictionary.exceptions.IPADictionaryExecption;
import ca.phon.ipadictionary.spi.IPADictionarySPI;
import ca.phon.ipadictionary.spi.LanguageInfo;
import ca.phon.ipadictionary.spi.Metadata;
import ca.phon.ipadictionary.spi.NameInfo;
import ca.phon.util.Language;
import de.susebox.jtopas.Flags;
import de.susebox.jtopas.StandardTokenizer;
import de.susebox.jtopas.StandardTokenizerProperties;
import de.susebox.jtopas.StringSource;
import de.susebox.jtopas.Token;
import de.susebox.jtopas.Tokenizer;
import de.susebox.jtopas.TokenizerException;
import de.susebox.jtopas.TokenizerProperties;
import de.susebox.jtopas.TokenizerSource;

/**
 * An IPADictionary implementation that uses a tokenizer and lookup
 * table for generating ipa transcriptions.
 * 
 */
public class TransliterationDictionary implements IPADictionarySPI, LanguageInfo, NameInfo, Metadata {

    private enum MetadataToken {
        NAME("name"), LANGUAGE("lang"), PREPROCESSEXPR("prefind"), PREPROCESSREPLACE("prereplace"), POSTPROCESSEXPR(
                "postfind"), POSTPROCESSREPLACE("postreplace"), OTHER("other");

        private String value;

        private MetadataToken(String v) {
            this.value = v;
        }

        @Override
        public String toString() {
            return this.value;
        }
    }

    private static final Logger LOGGER = Logger.getLogger(TransliterationDictionary.class.getName());

    // token <-> phone mappings
    private Map<String, String> tokenMap;

    private URL mapFile;

    /**
     * Dictionary name
     * Loaded from the dictionary file using the MetadataToken 'name'
     * E.g.,
     * 
     * #name English
     */
    private String name = null;

    /**
     * Dictionary language
     * Loaded from the dictionary file using the MetadataToken 'lang'
     * followed by the appropriate 3-letter ISO-639-3 code.
     * E.g.,
     * 
     * #lang eng
     */
    private Language language = null;

    /**
     * Other metadata values.  Common values are
     * 'provider' and 'website'.  E.g.,
     * 
     * #provider University of Here
     * #website http://www.uoh.org/
     */
    private Map<String, String> metadata = new TreeMap<String, String>();

    /**
     * Regex pattern for pre-processing text
     */
    private Pattern preFindPattern;

    /**
     * Replace expression used for each instance of preFindPattern found
     * in the orthographic text
     */
    private String preReplaceExpr;

    /**
     * Regex pattern for post-processing text
     * 
     */
    private Pattern postFindPattern;

    /**
     * Replace expression used for each instance of postFindPattern found
     */
    private String postReplaceExpr;

    public TransliterationDictionary(URL mapFile) {
        super();
        this.mapFile = mapFile;
    }

    @Override
    public String getName() {
        if (name == null) {
            try {
                readMetadataFromStream(mapFile.openStream());
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, e.getLocalizedMessage(), e);
            }
        }
        return name;
    }

    @Override
    public Language getLanguage() {
        if (language == null) {
            try {
                readMetadataFromStream(mapFile.openStream());
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, e.getLocalizedMessage(), e);
            }
        }
        return language;
    }

    @Override
    public String[] lookup(String orthography) throws IPADictionaryExecption {

        if (preFindPattern != null && preReplaceExpr != null) {
            final Matcher m = preFindPattern.matcher(orthography);
            orthography = m.replaceAll(preReplaceExpr);
        }

        final StringBuilder builder = new StringBuilder();
        final Tokenizer tokenizer = createTokenizer();

        try {
            final TokenizerSource source = new StringSource(orthography);
            tokenizer.setSource(source);

            while (tokenizer.hasMoreToken()) {
                final Token token = tokenizer.nextToken();
                if (token.getType() == Token.SPECIAL_SEQUENCE) {
                    builder.append(token.getCompanion());
                } else if (token.getType() == Token.NORMAL) {
                    // add unknown sequences to return value
                    builder.append(token.getImage());

                }
            }
        } catch (TokenizerException e) {
            LOGGER.log(Level.SEVERE, e.getLocalizedMessage(), e);
        }

        String builderStr = builder.toString();

        if (postFindPattern != null && postReplaceExpr != null) {
            final Matcher m = postFindPattern.matcher(builderStr);
            builderStr = m.replaceAll(postReplaceExpr);
        }

        return new String[] { builderStr };
    }

    private Map<String, String> getTokenMap() {
        if (tokenMap == null) {
            try {
                readTokenMap(mapFile.openStream());
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, e.getLocalizedMessage(), e);
            }
        }
        return tokenMap;
    }

    /**
     * (RegEx) Pattern used to read dicationary entries from file
     */
    private Pattern getPattern() {
        String regex = "(.*)" + "(\\p{Space}+)" + "(.*)";
        return Pattern.compile(regex);
    }

    private void readTokenMap(InputStream is) throws IOException {
        tokenMap = new LinkedHashMap<String, String>();

        final InputStreamReader in = new InputStreamReader(is, "UTF-8");
        final BufferedReader reader = new BufferedReader(in);

        final Pattern dictPattern = getPattern();

        String line = null;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("#")) {
                // ignore as a comment
                continue;
            }

            final Matcher m = dictPattern.matcher(line);
            if (m.matches()) {
                final String key = StringUtils.strip(m.group(1));
                final String ipa = StringUtils.strip(m.group(3));

                if (key.length() > 0 && ipa.length() > 0) {
                    tokenMap.put(key, ipa);
                }
            }
        }
        reader.close();
    }

    /*
     * Create the tokenizer that will be used on the input string
     */
    private Tokenizer createTokenizer() {
        final TokenizerProperties props = new StandardTokenizerProperties();
        props.setSeparators(null);
        props.setParseFlags(Flags.F_KEEP_DATA | Flags.F_SINGLE_LINE_STRING);

        final Map<String, String> map = getTokenMap();

        for (String key : map.keySet()) {
            props.addSpecialSequence(key, map.get(key));
        }

        final Tokenizer tokenizer = new StandardTokenizer(props);
        return tokenizer;
    }

    @Override
    public String getMetadataValue(String key) {
        return metadata.get(key);
    }

    @Override
    public Iterator<String> metadataKeyIterator() {
        return metadata.keySet().iterator();
    }

    /**
     * Read dictionary metadata from the given stream.
     * Reading will end when the first non-commented
     * line is encountered (i.e., the first transcription
     * pair.)
     * 
     * @param is
     * @throws IOException if an error occurs while
     *  reading from the stream
     */
    private void readMetadataFromStream(InputStream is) throws IOException {
        InputStreamReader in = new InputStreamReader(is);
        BufferedReader reader = new BufferedReader(in);

        String line = null;
        while ((line = reader.readLine()) != null) {
            if (!line.startsWith("#"))
                break;

            String comment = line.substring(1);

            // check to see if the comment is a metadata instruction
            int firstTokenEnd = comment.indexOf(' ');
            String metaTkn = comment.substring(0, firstTokenEnd);

            //         for(MetadataToken metaTkn:MetadataToken.values()) {
            //            if(metaTkn.toString().equalsIgnoreCase(firstToken)) {
            // process metadata
            String metaDataInfo = comment.substring(firstTokenEnd + 1);
            processMetadata(metaTkn, metaDataInfo);
            //            }
            //         }
        }
        reader.close();
    }

    /**
     * Process metadata value
     * 
     * @param token the type of metadata to process
     * @param value the value of the metadata
     */
    private void processMetadata(String token, String value) {
        if (token.equalsIgnoreCase(MetadataToken.NAME.toString())) {
            // set name as value
            this.name = value;
        } else if (token.equalsIgnoreCase(MetadataToken.LANGUAGE.toString())) {
            // attempt to load language
            final Language lang = Language.parseLanguage(value);
            this.language = lang;
        } else if (token.equalsIgnoreCase(MetadataToken.PREPROCESSEXPR.toString())) {
            preFindPattern = Pattern.compile(value);
        } else if (token.equalsIgnoreCase(MetadataToken.PREPROCESSREPLACE.toString())) {
            preReplaceExpr = value;
        } else if (token.equalsIgnoreCase(MetadataToken.POSTPROCESSEXPR.toString())) {
            postFindPattern = Pattern.compile(value);
        } else if (token.equalsIgnoreCase(MetadataToken.POSTPROCESSREPLACE.toString())) {
            postReplaceExpr = value;
        } else {
            metadata.put(token, value);
        }
    }

    @Override
    public void install(IPADictionary dict) {
        dict.putExtension(LanguageInfo.class, this);
        dict.putExtension(NameInfo.class, this);
        dict.putExtension(Metadata.class, this);
    }

}