Java tutorial
/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.language.de;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.fst.FSTLookup;
import marytts.language.de.phonemiser.Inflection;
import marytts.language.de.phonemiser.PhonemiseDenglish;
import marytts.language.de.phonemiser.Result;
import marytts.modules.synthesis.PAConverter;
import marytts.server.MaryProperties;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;

/**
 * The phonemiser module -- java implementation.
 *
 * <p>For every token of the input document this module produces a phonemic
 * transcription, trying in turn: user dictionary, lexicon, the same two
 * lookups on a Unicode-normalised form, the English lexicon (if configured),
 * {@link PhonemiseDenglish}, and finally letter-to-sound rules.</p>
 *
 * <p>If the property <code>de.phonemiser.logunknown</code> is set, words that
 * were transcribed by rules and words that were found in the English lexicon
 * are counted and written to log files on {@link #shutdown()}.</p>
 *
 * @author Marc Schröder
 */
public class JPhonemiser extends marytts.modules.JPhonemiser {
    private Inflection inflection;
    /** English lexicon; stays null unless <code>de.phonemiser.useenglish</code> is set and loading succeeds. */
    private FSTLookup usEnglishLexicon = null;
    /** Target file for the unknown-word statistics; null means this logging is disabled. */
    private String logUnknownFileName = null;
    private Map<String, Integer> unknown2Frequency = null;
    /** Target file for the English-word statistics; null means this logging is disabled. */
    private String logEnglishFileName = null;
    private Map<String, Integer> english2Frequency = null;
    private PhonemiseDenglish phonemiseDenglish;

    public JPhonemiser() throws IOException, MaryConfigurationException {
        super("JPhonemiser_de", MaryDataType.PARTSOFSPEECH, MaryDataType.PHONEMES,
                "de.allophoneset", "de.userdict", "de.lexicon", "de.lettertosound");
    }

    /**
     * Initialise the module: set up the helper objects, optionally prepare the
     * word-frequency logging and optionally load the English lexicon.
     *
     * @throws Exception if the superclass startup or a helper object fails to initialise.
     */
    @Override
    public void startup() throws Exception {
        super.startup();
        phonemiseDenglish = new PhonemiseDenglish(this);
        inflection = new Inflection();
        if (MaryProperties.getBoolean("de.phonemiser.logunknown")) {
            String logBasepath = MaryProperties.maryBase() + File.separator + "log" + File.separator;
            File logDir = new File(logBasepath);
            try {
                if (!logDir.isDirectory()) {
                    logger.info("Creating log directory " + logDir.getCanonicalPath());
                    FileUtils.forceMkdir(logDir);
                }
                logUnknownFileName = MaryProperties.getFilename("de.phonemiser.logunknown.filename",
                        logBasepath + "de_unknown.txt");
                unknown2Frequency = new HashMap<String, Integer>();
                logEnglishFileName = MaryProperties.getFilename("de.phonemiser.logenglish.filename",
                        logBasepath + "de_english-words.txt");
                english2Frequency = new HashMap<String, Integer>();
            } catch (IOException e) {
                // getCanonicalPath() can itself throw IOException, which would escape this
                // handler and abort startup; getAbsolutePath() cannot, so logging is merely
                // disabled as the message says.
                logger.info("Could not create log directory " + logDir.getAbsolutePath()
                        + " Logging disabled!", e);
            }
        }
        if (MaryProperties.getBoolean("de.phonemiser.useenglish")) {
            InputStream usLexStream = MaryProperties.getStream("en_US.lexicon");
            if (usLexStream != null) {
                try {
                    usEnglishLexicon = new FSTLookup(usLexStream, MaryProperties.getProperty("en_US.lexicon"));
                } catch (Exception e) {
                    logger.info("Cannot load English lexicon '"
                            + MaryProperties.getProperty("en_US.lexicon") + "'", e);
                }
            }
        }
    }

    /**
     * Write the collected word statistics (unknown words and English words) to
     * their log files, if logging was enabled in {@link #startup()}. Each file is
     * written independently, so a failure on one does not prevent the other.
     */
    public void shutdown() {
        /* print unknown words */
        writeFrequencyLog(logUnknownFileName, unknown2Frequency);
        /* print english words */
        writeFrequencyLog(logEnglishFileName, english2Frequency);
    }

    /**
     * Write one "frequency word" line per entry to the given file, UTF-8 encoded
     * and ordered by ascending frequency. Does nothing if either argument is null.
     * Failures are logged, never thrown: this is best-effort output at shutdown.
     *
     * @param fileName       path of the file to (over)write, or null if logging is disabled.
     * @param word2Frequency the counts to write, or null if logging is disabled.
     */
    private void writeFrequencyLog(String fileName, Map<String, Integer> word2Frequency) {
        if (fileName == null || word2Frequency == null) {
            return;
        }
        // group the words by frequency; the TreeMap yields ascending frequencies
        SortedMap<Integer, List<String>> freq2Words = new TreeMap<Integer, List<String>>();
        for (Map.Entry<String, Integer> entry : word2Frequency.entrySet()) {
            List<String> words = freq2Words.get(entry.getValue());
            if (words == null) {
                words = new ArrayList<String>();
                freq2Words.put(entry.getValue(), words);
            }
            words.add(entry.getKey());
        }
        PrintWriter log = null;
        try {
            log = new PrintWriter(new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"));
            for (Map.Entry<Integer, List<String>> entry : freq2Words.entrySet()) {
                for (String word : entry.getValue()) {
                    log.println(entry.getKey() + " " + word);
                }
            }
            // PrintWriter swallows write errors; checkError() flushes and reports them.
            if (log.checkError()) {
                logger.info("Error printing log files for english and unknown words: could not write "
                        + fileName);
            }
        } catch (Exception e) {
            // Deliberately broad: shutdown must not fail because of statistics output.
            logger.info("Error printing log files for english and unknown words", e);
        } finally {
            if (log != null) {
                log.close();
            }
        }
    }

    /**
     * Add a transcription ("ph" attribute) and the method used to obtain it
     * ("g2p_method" attribute) to every token of the document that does not
     * already carry a usable transcription.
     *
     * @param d input data holding the document to phonemise; the document is modified in place.
     * @return a new MaryData of this module's output type wrapping the same document.
     * @throws Exception if processing fails.
     */
    @Override
    public MaryData process(MaryData d) throws Exception {
        Document doc = d.getDocument();
        inflection.determineEndings(doc);
        NodeIterator it = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.TOKEN);
        Element t = null;
        while ((t = (Element) it.nextNode()) != null) {
            String text;

            // Do not touch tokens for which a transcription is already
            // given (exception: transcription contains a '*' character:
            if (t.hasAttribute("ph") && t.getAttribute("ph").indexOf('*') == -1) {
                continue;
            }
            if (t.hasAttribute("sounds_like")) {
                text = t.getAttribute("sounds_like");
            } else {
                text = MaryDomUtils.tokenText(t);
            }

            String pos = null;
            // use part-of-speech if available
            if (t.hasAttribute("pos")) {
                pos = t.getAttribute("pos");
            }

            boolean isEnglish = false;
            if (t.hasAttribute("xml:lang")
                    && MaryUtils.subsumes(Locale.ENGLISH, MaryUtils.string2locale(t.getAttribute("xml:lang")))) {
                isEnglish = true;
            }

            if (text != null && !text.equals("")) {
                // If text consists of several parts (e.g., because that was
                // inserted into the sounds_like attribute), each part
                // is transcribed separately.
                StringBuilder ph = new StringBuilder();
                String g2pMethod = null;
                StringTokenizer st = new StringTokenizer(text, " -");
                while (st.hasMoreTokens()) {
                    String graph = st.nextToken();
                    StringBuilder helper = new StringBuilder();
                    String phon = null;
                    if (isEnglish && usEnglishLexicon != null) {
                        phon = phonemiseEn(graph);
                        if (phon != null) {
                            helper.append("foreign:en");
                        }
                    }
                    if (phon == null) {
                        phon = phonemise(graph, pos, helper);
                    }
                    if (phon == null) {
                        // phonemise() may fail; skip this part instead of appending the
                        // literal text "null" or hitting a NullPointerException below.
                        continue;
                    }
                    if (ph.length() == 0) { // first part
                        // The g2pMethod of the combined beast is
                        // the g2pMethod of the first constituent.
                        g2pMethod = helper.toString();
                        ph.append(phon);
                    } else { // following parts
                        ph.append(" - ");
                        // Reduce primary to secondary stress:
                        ph.append(phon.replace('\'', ','));
                    }
                }

                if (ph.length() > 0) {
                    setPh(t, ph.toString());
                    t.setAttribute("g2p_method", g2pMethod);
                }
            }
        }
        MaryData result = new MaryData(outputType(), d.getLocale());
        result.setDocument(doc);
        return result;
    }

    /**
     * Phonemise the word text. This starts with a simple lexicon lookup,
     * followed by some heuristics, and finally applies letter-to-sound rules
     * if nothing else was successful.
     * @param text the textual (graphemic) form of a word.
     * @param pos the part-of-speech of the word, passed on to the userdict and
     * lexicon lookups; may be null.
     * @param g2pMethod This is an awkward way to return a second
     * String parameter via a StringBuilder. If a phonemisation of the text is
     * found, this parameter will be filled with the method of phonemisation
     * ("userdict", "lexicon", "foreign:en", "phonemiseDenglish", "compound", "rules").
     * @return a phonemisation of the text if one can be generated, or
     * null if no phonemisation method was successful.
     */
    @Override
    public String phonemise(String text, String pos, StringBuilder g2pMethod) {
        // First, try a simple userdict and lexicon lookup:
        String result = userdictLookup(text, pos);
        if (result != null) {
            g2pMethod.append("userdict");
            return result;
        }

        result = lexiconLookup(text, pos);
        if (result != null) {
            g2pMethod.append("lexicon");
            return result;
        }

        /**
         // Not found? Try a compound "analysis":
         result = compoundSearch(text);
         //logger.debug("Compound here: "+compoundSearch(text));
         if (result != null) {
             g2pMethod.append("compound");
             return result;
         }**/

        // Lookup attempts failed. Try normalising exotic letters
        // (diacritics on vowels, etc.), look up again:
        String normalised = MaryUtils.normaliseUnicodeLetters(text, Locale.GERMAN);
        if (!normalised.equals(text)) {
            // First, try a simple userdict and lexicon lookup:
            result = userdictLookup(normalised, pos);
            if (result != null) {
                g2pMethod.append("userdict");
                return result;
            }
            result = lexiconLookup(normalised, pos);
            if (result != null) {
                g2pMethod.append("lexicon");
                return result;
            }
            /**
             // Not found? Try a compound "analysis":
             result = compoundSearch(normalised);
             if (result != null) {
                 g2pMethod.append("compound");
                 return result;
             }**/
        }

        // plain English word must be looked up in English lexicon before phonemiseDenglish starts
        if (usEnglishLexicon != null) {
            String englishTranscription = phonemiseEn(text);
            if (englishTranscription != null) {
                g2pMethod.append("foreign:en");
                logger.debug(text + " is English");
                if (logEnglishFileName != null) {
                    incrementFrequency(english2Frequency, text.trim());
                }
                return englishTranscription;
            }
        }

        Result resultingWord = phonemiseDenglish.processWord(text, usEnglishLexicon != null);
        result = resultingWord.getTranscription();
        boolean usedOtherLanguageToPhonemise = resultingWord.isUsedOtherLanguageToPhonemise();
        //logger.debug("input for PD: "+text);
        if (result != null) {
            result = allophoneSet.splitAllophoneString(result);
            g2pMethod.append(usedOtherLanguageToPhonemise ? "phonemiseDenglish" : "compound");
            return result;
        }

        // Cannot find it in the lexicon -- apply letter-to-sound rules
        // to the normalised form
        String phones = lts.predictPronunciation(normalised);
        result = lts.syllabify(phones);
        if (result != null) {
            if (logUnknownFileName != null) {
                incrementFrequency(unknown2Frequency, text.trim());
            }
            g2pMethod.append("rules");
            return result;
        }
        return null;
    }

    /**
     * Increase the count stored for word by one, starting at 1 for a word not
     * seen before. Does nothing if the map is null.
     *
     * @param word2Frequency the counter map to update; may be null.
     * @param word           the word to count.
     */
    private void incrementFrequency(Map<String, Integer> word2Frequency, String word) {
        if (word2Frequency == null) {
            return;
        }
        Integer count = word2Frequency.get(word);
        word2Frequency.put(word, count == null ? 1 : count + 1);
    }

    /**
     * Try to determine an English transcription of the text according to English rules, but using German Sampa.
     * @param text Word to transcribe
     * @return the transcription, or null if none could be determined.
     */
    public String phonemiseEn(String text) {
        assert usEnglishLexicon != null; // We get here only if there is an English lexicon
        String normalisedEn = MaryUtils.normaliseUnicodeLetters(text, Locale.US);
        // Explicit locale: with the default locale (e.g. Turkish) "I" would not become "i"
        // and the lexicon lookup would miss.
        normalisedEn = normalisedEn.toLowerCase(Locale.US);
        String[] transcriptions = usEnglishLexicon.lookup(normalisedEn);
        assert transcriptions != null; // if nothing is found, an array of length 0 is returned.
        if (transcriptions.length == 0) {
            return null;
        }
        String usSampa = transcriptions[0];
        String deSampa = PAConverter.sampaEnString2sampaDeString(usSampa);
        //logger.debug("converted "+usSampa+" to "+deSampa);
        return deSampa;
    }

    /**
     * This method tries to decompose a compound. It calls itself recursively.
     * @param text the word to be transcribed.
     * @return the SAMPA transcription of text, or null if none was found.
     */
    /*
    private String compoundSearch(String text) {
        // Chop off longest possible prefixes and try to look them up
        // in the lexicon. Any part must have a minimum length of 3 characters.
        //System.out.println("Compound Search is starting with: "+text);
        for (int i=text.length() - 3; i >= 3; i--) { //-3!!! >= 3!!!
            String firstPhon = null;
            String fugePhon = null;
            String restPhon = null;
            String prefix = text.substring(0, i);
            firstPhon = userdictLookup(prefix);
            if (firstPhon == null) firstPhon = lexiconLookup(prefix);
            if (firstPhon != null) { // found a valid prefix
                String rest = text.substring(i);
                logger.debug("Rest is: "+rest);
                // Is the rest a simple lexical entry?
                restPhon = userdictLookup(rest);
                if (restPhon == null) restPhon = lexiconLookup(rest);
                // Or can the rest be analysed as a compound?
                if (restPhon == null) restPhon = compoundSearch(rest);
                // Or does it help if we cut off a Fuge?
                if (restPhon == null) {
                    String [] helper = fugeSearch(rest);
                    // it no longer seems to get in here
                    //logger.debug("fugeSearch(rest) is: " + fugeSearch(rest));
                    if (helper != null && helper.length == 2) {
                        fugePhon = helper[0];
                        String rest2 = helper[1];
                        assert fugePhon != null;
                        assert rest2 != null;
                        restPhon = userdictLookup(rest2);
                        if (restPhon == null) restPhon = lexiconLookup(rest2);
                        if (restPhon == null) restPhon = compoundSearch(rest2);
                    }
                }
                if (restPhon != null) // success!
                    return firstPhon + (fugePhon != null ? fugePhon : "") + "-" + restPhon;
            }
        }
        return null;
    }
    */

    /**
     * Try to cut off a Fuge morpheme at the beginning of suffix.
     * @param suffix a part of a word with a prefix already removed.
     * @return a two-item String array. First string is the transcription of the
     * Fuge found, second is the suffix after the Fuge was removed. Returns
     * null if no Fuge was found.
     */
    /*
    private String[] fugeSearch(String suffix) {
        String fugePhon = null;
        int fugeLength = 0;
        if (suffix.startsWith("es")) {
            fugePhon = "@s";
            fugeLength = 2;
        } else if (suffix.startsWith("en")) {
            fugePhon = "@n";
            fugeLength = 2;
        } else if (suffix.startsWith("n")) {
            fugePhon = "n";
            fugeLength = 1;
        } else if (suffix.startsWith("s")) {
            fugePhon = "s";
            fugeLength = 1;
        } else if (suffix.startsWith("e")) {
            fugePhon = "@";
            fugeLength = 1;
        }
        if (fugePhon != null) { // found a Fuge
            String[] returnValue = new String[2];
            returnValue[0] = fugePhon;
            returnValue[1] = suffix.substring(fugeLength);
            return returnValue;
        } else {
            return null;
        }
    }
    */
}