Java tutorial
/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool; import org.apache.commons.lang.StringUtils; import org.languagetool.chunking.Chunker; import org.languagetool.databroker.ResourceDataBroker; import org.languagetool.language.Contributor; import org.languagetool.rules.Rule; import org.languagetool.rules.patterns.Unifier; import org.languagetool.rules.patterns.UnifierConfiguration; import org.languagetool.synthesis.Synthesizer; import org.languagetool.tagging.Tagger; import org.languagetool.tagging.disambiguation.Disambiguator; import org.languagetool.tagging.disambiguation.xx.DemoDisambiguator; import org.languagetool.tagging.xx.DemoTagger; import org.languagetool.tokenizers.*; import org.languagetool.tools.MultiKeyProperties; import org.languagetool.tools.StringTools; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Constructor; import java.net.URL; import java.util.*; /** * Base class for any supported language (English, German, etc). Language classes * are detected at runtime by searching the classpath for files named * {@code META-INF/org/languagetool/language-module.properties}. Those file(s) * need to contain a key {@code languageClasses} which specifies the fully qualified * class name(s), e.g. {@code org.languagetool.language.English}. Use commas to specify * more than one class. */ public abstract class Language { private static final String PROPERTIES_PATH = "META-INF/org/languagetool/language-module.properties"; private static final String PROPERTIES_KEY = "languageClasses"; private static List<Language> externalLanguages = new ArrayList<>(); private boolean isExternalLanguage = false; private List<String> externalRuleFiles = new ArrayList<>(); /** * All languages supported by LanguageTool. This includes at least a "demo" language * for testing. */ public static Language[] LANGUAGES = getLanguages(); private static Language[] getLanguages() { final List<Language> languages = new ArrayList<>(); final Set<String> languageClassNames = new HashSet<>(); try { final Enumeration<URL> propertyFiles = Language.class.getClassLoader().getResources(PROPERTIES_PATH); while (propertyFiles.hasMoreElements()) { final URL url = propertyFiles.nextElement(); try (InputStream inputStream = url.openStream()) { // We want to be able to read properties file with duplicate key, as produced by // Maven when merging files: final MultiKeyProperties props = new MultiKeyProperties(inputStream); final List<String> classNamesStr = props.getProperty(PROPERTIES_KEY); if (classNamesStr == null) { throw new RuntimeException("Key '" + PROPERTIES_KEY + "' not found in " + url); } for (String classNames : classNamesStr) { final String[] classNamesSplit = classNames.split("\\s*,\\s*"); for (String className : classNamesSplit) { if (languageClassNames.contains(className)) { // avoid duplicates - this way we are robust against problems with the maven assembly // plugin which aggregates files more than once (in case the deployment descriptor // contains both <format>zip</format> and <format>dir</format>): continue; } languages.add(createLanguageObjects(url, className)); languageClassNames.add(className); } } } } } catch (IOException e) { throw new RuntimeException(e); } return languages.toArray(new Language[languages.size()]); } private static Language createLanguageObjects(URL url, String className) { try { final Class<?> aClass = Class.forName(className); final Constructor<?> constructor = aClass.getConstructor(); return (Language) constructor.newInstance(); } catch (ClassNotFoundException e) { throw new RuntimeException( "Class '" + className + "' specified in " + url + " could not be found in classpath", e); } catch (Exception e) { throw new RuntimeException( "Object for class '" + className + "' specified in " + url + " could not created", e); } } /** * All languages supported by LanguageTool, but without the demo language. */ public static final Language[] REAL_LANGUAGES = getRealLanguages(); /** * Returns all languages supported by LanguageTool but without the demo language. * In contrast to Language.REAL_LANGUAGES contains external languages as well. * @return All supported languages. * @since 2.6 */ public static Language[] getRealLanguages() { List<Language> result = new ArrayList<>(); for (Language lang : LANGUAGES) { if (!"xx".equals(lang.getShortName())) { // skip demo language result.add(lang); } } return result.toArray(new Language[result.size()]); } private static final Language[] BUILTIN_LANGUAGES = LANGUAGES; private static final Disambiguator DEMO_DISAMBIGUATOR = new DemoDisambiguator(); private static final Tagger DEMO_TAGGER = new DemoTagger(); private static final SentenceTokenizer SENTENCE_TOKENIZER = new SimpleSentenceTokenizer(); private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer(); private UnifierConfiguration unifierConfiguration = new UnifierConfiguration(); private UnifierConfiguration disambiguationUnifierConfiguration = new UnifierConfiguration(); // ------------------------------------------------------------------------- /** * Get this language's two character code, e.g. <code>en</code> for English. * The country parameter (e.g. "US"), if any, is not returned. * @return language code */ public abstract String getShortName(); /** * Get this language's name in English, e.g. <code>English</code> or * <code>German (Germany)</code>. * @return language name */ public abstract String getName(); /** * Set this language's name in English. * @since 2.6 */ public abstract void setName(final String name); /** * Get this language's country options , e.g. <code>US</code> (as in <code>en-US</code>) or * <code>PL</code> (as in <code>pl-PL</code>). * @return String[] - array of country options for the language. */ public abstract String[] getCountries(); /** * Get this language's variant, e.g. <code>valencia</code> (as in <code>ca-ES-valencia</code>) * or <code>null</code>. * Attention: not to be confused with "country" option * @return String - variant for the language. * @since 2.3 */ public String getVariant() { return null; } /** * Get enabled rules different from the default ones for this language variant. * * @return enabled rules for the language variant. * @since 2.4 */ public List<String> getDefaultEnabledRulesForVariant() { return new ArrayList<>(); } /** * Get disabled rules different from the default ones for this language variant. * * @return disabled rules for the language variant. * @since 2.4 */ public List<String> getDefaultDisabledRulesForVariant() { return new ArrayList<>(); } /** * Get the name(s) of the maintainer(s) for this language or <code>null</code>. */ public abstract Contributor[] getMaintainers(); /** * Get the rules classes that should run for texts in this language. * @since 1.4 (signature modified in 2.7) */ public abstract List<Rule> getRelevantRules(ResourceBundle messages) throws IOException; // ------------------------------------------------------------------------- /** * Get this language's Java locale, not considering the country code. */ public Locale getLocale() { return new Locale(getShortName()); } /** * Get this language's Java locale, considering language code and country code (if any). * @since 2.1 */ public Locale getLocaleWithCountryAndVariant() { if (getCountries().length > 0) { if (getVariant() != null) { return new Locale(getShortName(), getCountries()[0], getVariant()); } else { return new Locale(getShortName(), getCountries()[0]); } } else { return getLocale(); } } /** * Get the location of the rule file(s) in a form like {@code /org/languagetool/rules/de/grammar.xml}. */ public List<String> getRuleFileNames() { final List<String> ruleFiles = new ArrayList<>(); ruleFiles.addAll(getExternalRuleFiles()); final ResourceDataBroker dataBroker = JLanguageTool.getDataBroker(); ruleFiles.add(dataBroker.getRulesDir() + "/" + getShortName() + "/" + JLanguageTool.PATTERN_FILE); if (getShortNameWithCountryAndVariant().length() > 2) { final String fileName = getShortName() + "/" + getShortNameWithCountryAndVariant() + "/" + JLanguageTool.PATTERN_FILE; if (dataBroker.ruleFileExists(fileName)) { ruleFiles.add(dataBroker.getRulesDir() + "/" + fileName); } } return ruleFiles; } /** * @since 2.6 */ public List<String> getExternalRuleFiles() { return externalRuleFiles; } /** * Adds an external rule file to the language. After running this method, * one has to run JLanguageTool.activateDefaultPatternRules() to make sure * that all external rules are activated. * * @param externalRuleFile Absolute file path to rules. * @since 2.6 */ public void addExternalRuleFile(String externalRuleFile) { externalRuleFiles.add(externalRuleFile); } /** * Languages that have country variants need to overwrite this to select their most common variant. * @return default country variant or {@code null} * @since 1.8 */ public Language getDefaultLanguageVariant() { return null; } /** * Get this language's part-of-speech disambiguator implementation or {@code null}. */ public Disambiguator getDisambiguator() { return DEMO_DISAMBIGUATOR; } /** * Get this language's part-of-speech tagger implementation. The tagger must not * be {@code null}, but it can be a trivial pseudo-tagger that only assigns {@code null} tags. */ public Tagger getTagger() { return DEMO_TAGGER; } /** * Get this language's sentence tokenizer implementation. */ public SentenceTokenizer getSentenceTokenizer() { return SENTENCE_TOKENIZER; } /** * Get this language's word tokenizer implementation. */ public Tokenizer getWordTokenizer() { return WORD_TOKENIZER; } /** * Get this language's chunker implementation or {@code null}. * @since 2.3 */ public Chunker getChunker() { return null; } /** * Get this language's part-of-speech synthesizer implementation or {@code null}. */ public Synthesizer getSynthesizer() { return null; } /** * Get this language's feature unifier. * @return Feature unifier for analyzed tokens. */ public Unifier getUnifier() { return unifierConfiguration.createUnifier(); } /** * Get this language's feature unifier used for disambiguation. * Note: it might be different from the normal rule unifier. * @return Feature unifier for analyzed tokens. */ public Unifier getDisambiguationUnifier() { return disambiguationUnifierConfiguration.createUnifier(); } /** * @since 2.3 */ public UnifierConfiguration getUnifierConfiguration() { return unifierConfiguration; } /** * @since 2.3 */ public UnifierConfiguration getDisambiguationUnifierConfiguration() { return disambiguationUnifierConfiguration; } /** * Get the name of the language translated to the current locale, * if available. Otherwise, get the untranslated name. */ public final String getTranslatedName(final ResourceBundle messages) { try { return messages.getString(getShortNameWithCountryAndVariant()); } catch (final MissingResourceException e) { try { return messages.getString(getShortName()); } catch (final MissingResourceException e1) { return getName(); } } } /** * Get the short name of the language with country and variant (if any), if it is * a single-country language. For generic language classes, get only a two- or * three-character code. * @since 1.8 */ public final String getShortNameWithCountryAndVariant() { String name = getShortName(); if (getCountries().length == 1 && !name.contains("-x-")) { // e.g. "de-DE-x-simple-language" name += "-" + getCountries()[0]; if (getVariant() != null) { // e.g. "ca-ES-valencia" name += "-" + getVariant(); } } return name; } /** * Start symbols used by {@link org.languagetool.rules.GenericUnpairedBracketsRule}. * Note that the array must be of equal length as {@link #getUnpairedRuleEndSymbols()} and the sequence of * starting symbols must match exactly the sequence of ending symbols. */ public String[] getUnpairedRuleStartSymbols() { return new String[] { "[", "(", "{", "\"", "'" }; } /** * End symbols used by {@link org.languagetool.rules.GenericUnpairedBracketsRule}. * @see #getUnpairedRuleStartSymbols() */ public String[] getUnpairedRuleEndSymbols() { return new String[] { "]", ")", "}", "\"", "'" }; } // ------------------------------------------------------------------------- /** * Re-inits the built-in languages and adds the specified ones. */ public static void reInit(final List<Language> languages) { LANGUAGES = new Language[BUILTIN_LANGUAGES.length + languages.size()]; int i = BUILTIN_LANGUAGES.length; System.arraycopy(BUILTIN_LANGUAGES, 0, LANGUAGES, 0, BUILTIN_LANGUAGES.length); for (final Language lang : languages) { LANGUAGES[i++] = lang; } externalLanguages = languages; } /** * Return languages that are not built-in but have been added manually. */ public static List<Language> getExternalLanguages() { return externalLanguages; } /** * Return all languages supported by LanguageTool. * @return A list of all languages, including external ones and country variants (e.g. {@code en-US}) */ public static List<Language> getAllLanguages() { final List<Language> langList = new ArrayList<>(); Collections.addAll(langList, LANGUAGES); langList.addAll(externalLanguages); return langList; } /** * Get the Language object for the given language name. * * @param languageName e.g. <code>English</code> or <code>German</code> (case is significant) * @return a Language object or {@code null} if there is no such language */ public static Language getLanguageForName(final String languageName) { for (Language element : Language.LANGUAGES) { if (languageName.equals(element.getName())) { return element; } } return null; } /** * Get the Language object for the given short language name. * * @param langCode e.g. <code>en</code> or <code>es-US</code> * @return a Language object * @throws IllegalArgumentException if the language is not supported or if the language code is invalid */ public static Language getLanguageForShortName(final String langCode) { final Language language = getLanguageForShortNameOrNull(langCode); if (language == null) { final List<String> codes = new ArrayList<>(); for (Language realLanguage : LANGUAGES) { codes.add(realLanguage.getShortNameWithCountryAndVariant()); } Collections.sort(codes); throw new IllegalArgumentException("'" + langCode + "' is not a language code known to LanguageTool." + " Supported language codes are: " + StringUtils.join(codes, ", ") + ". The list of languages is read from " + PROPERTIES_PATH + " in the Java classpath. See http://wiki.languagetool.org/java-api for details."); } return language; } /** * Return whether a language with the given language code is supported. Which languages * are supported depends on the classpath when the {@code Language} object is initialized. * * @param langCode e.g. {@code en} or {@code en-US} * @return true if the language is supported * @throws IllegalArgumentException in some cases of an invalid language code format * @since 2.1 */ public static boolean isLanguageSupported(final String langCode) { return getLanguageForShortNameOrNull(langCode) != null; } private static Language getLanguageForShortNameOrNull(final String langCode) { StringTools.assureSet(langCode, "langCode"); Language result = null; if (langCode.contains("-x-")) { // e.g. "de-DE-x-simple-language" for (Language element : Language.LANGUAGES) { if (element.getShortName().equalsIgnoreCase(langCode)) { return element; } } } else if (langCode.contains("-")) { final String[] parts = langCode.split("-"); if (parts.length == 2) { // e.g. en-US for (Language element : Language.LANGUAGES) { if (parts[0].equalsIgnoreCase(element.getShortName()) && element.getCountries().length == 1 && parts[1].equalsIgnoreCase(element.getCountries()[0])) { result = element; break; } } } else if (parts.length == 3) { // e.g. ca-ES-valencia for (Language element : Language.LANGUAGES) { if (parts[0].equalsIgnoreCase(element.getShortName()) && element.getCountries().length == 1 && parts[1].equalsIgnoreCase(element.getCountries()[0]) && parts[2].equalsIgnoreCase(element.getVariant())) { result = element; break; } } } else { throw new IllegalArgumentException("'" + langCode + "' isn't a valid language code"); } } else { for (Language element : Language.LANGUAGES) { if (langCode.equalsIgnoreCase(element.getShortName())) { result = element; break; } } } return result; } /** * Get the best match for a locale, using American English as the final fallback if nothing * else fits. The returned language will be a country variant language (e.g. British English, not just English) * if available. * @since 1.8 * @throws RuntimeException if no language was found and American English as a fallback is not available */ public static Language getLanguageForLocale(final Locale locale) { final Language language = getLanguageForLanguageNameAndCountry(locale); if (language != null) { return language; } else { final Language firstFallbackLanguage = getLanguageForLanguageNameOnly(locale); if (firstFallbackLanguage != null) { return firstFallbackLanguage; } } for (Language aLanguage : REAL_LANGUAGES) { if (aLanguage.getShortNameWithCountryAndVariant().equals("en-US")) { return aLanguage; } } throw new RuntimeException("No appropriate language found, not even en-US. Supported languages: " + Arrays.toString(REAL_LANGUAGES)); } private static Language getLanguageForLanguageNameAndCountry(Locale locale) { for (Language language : Language.REAL_LANGUAGES) { if (language.getShortName().equals(locale.getLanguage())) { final List<String> countryVariants = Arrays.asList(language.getCountries()); if (countryVariants.contains(locale.getCountry())) { return language; } } } return null; } private static Language getLanguageForLanguageNameOnly(Locale locale) { // use default variant if available: for (Language language : Language.REAL_LANGUAGES) { if (language.getShortName().equals(locale.getLanguage()) && language.hasVariant()) { final Language defaultVariant = language.getDefaultLanguageVariant(); if (defaultVariant != null) { return defaultVariant; } } } // use the first match otherwise (which should be the only match): for (Language language : Language.REAL_LANGUAGES) { if (language.getShortName().equals(locale.getLanguage()) && !language.hasVariant()) { return language; } } return null; } @Override public final String toString() { return getName(); } /** * Whether this is a country variant of another language, i.e. whether it doesn't * directly extend {@link Language}, but a subclass of {@link Language}. * @since 1.8 */ public final boolean isVariant() { for (Language language : LANGUAGES) { final boolean skip = language.getShortNameWithCountryAndVariant() .equals(getShortNameWithCountryAndVariant()); if (!skip && language.getClass().isAssignableFrom(getClass())) { return true; } } return false; } /** * Whether this class has at least one subclass that implements variants of this language. * @since 1.8 */ public final boolean hasVariant() { for (Language language : LANGUAGES) { final boolean skip = language.getShortNameWithCountryAndVariant() .equals(getShortNameWithCountryAndVariant()); if (!skip && getClass().isAssignableFrom(language.getClass())) { return true; } } return false; } public boolean isExternal() { return isExternalLanguage; } /** * Sets the language as external. Useful for * making a copy of an existing language. * @since 2.6 */ public void makeExternal() { isExternalLanguage = true; } /** * Return true if this is the same language as the given one, considering country * variants only if set for both languages. For example: en = en, en = en-GB, en-GB = en-GB, * but en-US != en-GB * @since 1.8 */ public boolean equalsConsiderVariantsIfSpecified(Language otherLanguage) { if (getShortName().equals(otherLanguage.getShortName())) { final boolean thisHasCountry = hasCountry(); final boolean otherHasCountry = otherLanguage.hasCountry(); return !(thisHasCountry && otherHasCountry) || getShortNameWithCountryAndVariant() .equals(otherLanguage.getShortNameWithCountryAndVariant()); } else { return false; } } private boolean hasCountry() { return getCountries().length == 1; } }