Back to project page sdk-hyphenation.
The source code is released under:
GNU Lesser General Public License
If you think the Android project sdk-hyphenation listed in this page is inappropriate, such as containing malicious code/tools or violating the copyright, please email info at java2s dot com, thanks.
package org.silpa.hyphenation.text; //w w w. j av a 2 s . c om import android.content.Context; import org.silpa.guesslanguage.GuessLanguage; import org.silpa.hyphenation.R; import org.silpa.hyphenation.text.Utf8TexParser.TexParserException; import org.silpa.hyphenation.util.ErrorHandler; import org.silpa.hyphenation.util.List; import org.silpa.hyphenation.util.LoggingErrorHandler; import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Map; import java.util.logging.Logger; /** * insert soft hyphens at all allowed locations uses TeX hyphenation tables */ public class Hyphenator { //Hyphens from the wikipedia article: https://en.wikipedia.org/wiki/Hyphen#Unicode public static final char HYPHEN = '\u2010'; public static final char HYPHEN_MINUS = '\u002d'; public static final char SOFT_HYPHEN = '\u00ad'; public static final char NON_BREAKING_HYPHEN = '\u2011'; private static final char ZERO_WIDTH_SPACE = '\u200b'; private final ForwardingErrorHandler errorHandler; private RuleDefinition ruleSet; private final ByteScanner b; // Guess Language private GuessLanguage guessLanguage; private Context mContext; private static Map<String, String> indicHyphenRules = new HashMap<>(); static { indicHyphenRules.put("as", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n2?2\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n?1\n2?1\n2?1\n2??1\n2?1\n2??2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("bn", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n2?2\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n?1\n2?1\n2?1\n2??1\n2?1\n2??2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("gu", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2??2\n2?2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("hi", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("kn", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2?1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("ml", "\\patterns{\n2??2\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1??1\n1??1\n1?1\n1?1\n1?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2?1\n2?1\n2??2\n???2\n???2\n???2\n???2\n???2\n???2\n2?????\n2?????\n2?????\n2?????\n2?????\n2?????\n2?\n2?\n2?\n2?\n2?\n2?\n}\n\\hyphenation{\n}"); indicHyphenRules.put("mr", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("or", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2?1\n2?1\n2?1\n2??1\n2??2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("pa", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2??2\n2?2\n2?2\n}\n\\hyphenation{\n}"); indicHyphenRules.put("ta", "\\patterns{\n2??2\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1??1\n1??1\n1?1\n1?1\n1?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2?1\n2?1\n2?1\n2??1\n}\n\\hyphenation{\n}"); indicHyphenRules.put("te", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}"); } /** * Constructor * Creates an uninitialized instance of Hyphenator. The same instance can be * reused for different hyphenation tables. * * @param context context of application */ public Hyphenator(Context context) { errorHandler = new ForwardingErrorHandler(new LoggingErrorHandler(Logger.getLogger(this.getClass().getCanonicalName()))); b = new ByteScanner(errorHandler); this.mContext = context; this.guessLanguage = new GuessLanguage(this.mContext); } public RuleDefinition getRuleSet() { return ruleSet; } public void setRuleSet(RuleDefinition scanner) { this.ruleSet = scanner; } public ErrorHandler getErrorHandler() { return errorHandler.getTarget(); } /** * installs error handler. * * @param eh ErrorHandler used while parsing and hyphenating * @see org.silpa.hyphenation.util.ErrorHandler */ public void setErrorHandler(ErrorHandler eh) { errorHandler.setTarget(eh); } /** * Loads a hyphenation table with a reader. This enables the use of UTF-8 pattern files. * Note that escape codes in the original tex-files are not supported, e.g. ^^f6. * This method also differs in that multiple calls to loadTable are not joined, only the * most recent pattern file is used. * Only "\pattern{" and "\hyphenation{" groups are supported. * * @param reader a reader containing hyphenation patterns (most likely a file) * @throws TexParserException if there are problems reading the input */ public void loadTable(Reader reader) throws TexParserException { Utf8TexParser parser = new Utf8TexParser(); ruleSet = parser.parse(reader); } /** * loads hyphenation table * * @param in hyphenation table * @throws java.io.IOException IOException while reading rules */ public void loadTable(java.io.InputStream in) throws java.io.IOException { int[] codelist = new int[256]; { for (int i = 0; i != 256; ++i) codelist[i] = i; } loadTable(in, codelist); } /** * loads hyphenation table and code list for non-ucs encoding * * @param in hyphenation table * @param codelist an array of 256 elements. maps one-byte codes to UTF codes * @throws java.io.IOException IOException while reading rules */ public void loadTable(java.io.InputStream in, int[] codelist) throws java.io.IOException { b.scan(in, codelist); ruleSet = b; } /** * performs hyphenation * * @param phrase string to hyphenate * @return the string with soft hyphens inserted */ public String hyphenate(String phrase) { return hyphenate(phrase, 1, 1); } /** * performs hyphenation * * @param phrase string to hyphenate * @param leftHyphenMin unbreakable characters at the beginning of each word in the * phrase * @param rightHyphenMin unbreakable characters at the end of each word in the phrase * @return the string with soft hyphens inserted */ public String hyphenate(String phrase, int leftHyphenMin, int rightHyphenMin) { // Check input leftHyphenMin = Math.max(leftHyphenMin, 1); rightHyphenMin = Math.max(rightHyphenMin, 1); // Ignore short phrases (early out) if (phrase.length() < rightHyphenMin + leftHyphenMin) { return phrase; } int processedOffset = Integer.MIN_VALUE; int ich = 0; char[] sourcePhraseChars = new char[phrase.length() + 1]; sourcePhraseChars[sourcePhraseChars.length - 1] = (char) 0; phrase.getChars(0, phrase.length(), sourcePhraseChars, 0); char[] hyphenatedPhraseChars = new char[sourcePhraseChars.length * 2 - 1]; int ihy = 0; boolean inword = false; while (true) { if (inword) { if (Character.isLetter(sourcePhraseChars[ich])) { ich++; } else { // last character will be reprocessed in the other // state int length = ich - processedOffset; String word = new String(sourcePhraseChars, processedOffset, length).toLowerCase(); int[] hyphenQualificationPoints = ruleSet .getException(word); if (hyphenQualificationPoints == null) { char[] wordChars = extractWord(sourcePhraseChars, processedOffset, length); hyphenQualificationPoints = applyHyphenationRules( wordChars, length); } // now inserting soft hyphens if (leftHyphenMin + rightHyphenMin <= length) { for (int i = 0; i < leftHyphenMin - 1; i++) { hyphenatedPhraseChars[ihy++] = sourcePhraseChars[processedOffset++]; } for (int i = leftHyphenMin - 1; i < length - rightHyphenMin; i++) { hyphenatedPhraseChars[ihy++] = sourcePhraseChars[processedOffset++]; if (hyphenQualificationPoints[i] % 2 == 1) hyphenatedPhraseChars[ihy++] = SOFT_HYPHEN; } for (int i = length - rightHyphenMin; i < length; i++) { hyphenatedPhraseChars[ihy++] = sourcePhraseChars[processedOffset++]; } } else { //Word is to short to hyphenate, so just copy for (int i = 0; i != length; ++i) { hyphenatedPhraseChars[ihy++] = sourcePhraseChars[processedOffset++]; } } inword = false; } } else { if (Character.isLetter(sourcePhraseChars[ich])) { processedOffset = ich; inword = true; // processedOffset remembers the start of the word } else { if (sourcePhraseChars[ich] == (char) 0) break; // zero is a guard inserted earlier hyphenatedPhraseChars[ihy++] = sourcePhraseChars[ich]; if (sourcePhraseChars[ich] == HYPHEN_MINUS || sourcePhraseChars[ich] == HYPHEN) { hyphenatedPhraseChars[ihy++] = ZERO_WIDTH_SPACE; } } ich++; } } return new String(hyphenatedPhraseChars, 0, ihy); } /** * performs hyphenation with auto detection of language. * Object must be created with Hyphenator(Context context) * * @param phrase string to hyphenate * @return hyphenated string */ public String hyphenateWithDetectLangauge(String phrase) { return hyphenateWithDetectLangauge(phrase, 1, 1); } /** * performs hyphenation with auto detection of language. * Object must be created with Hyphenator(Context context) * * @param phrase string to hyphenate * @param leftHyphenMin unbreakable characters at the beginning of each word in the * phrase * @param rightHyphenMin unbreakable characters at the end of each word in the phrase * @return the string with soft hyphens inserted */ public String hyphenateWithDetectLangauge(String phrase, int leftHyphenMin, int rightHyphenMin) { if (guessLanguage == null) { return null; } String lang = guessLanguage.guessLanguage(phrase); if (indicHyphenRules.get(lang) == null) { return phrase; } try { if (lang.equals("en")) { this.loadTable(this.mContext.getResources().openRawResource(R.raw.silpa_sdk_hyph_en)); } else { RuleDefinition rules = new Utf8TexParser().parse(indicHyphenRules.get(lang)); this.setRuleSet(rules); } } catch (TexParserException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return hyphenate(phrase, leftHyphenMin, rightHyphenMin); } /** * Extract a word from a char array. The word is converted to lower case and * a '.' character is appended to the beginning and end of the new array. * * @param chars The character array to extract a smaller section from * @param wordStart First character to include from the source array <b>chars</b>. * @param wordLength Number of characters to include from the source array * <b>chars</b> * @return Word converted so lower case and surrounded by '.' */ private char[] extractWord(char[] chars, int wordStart, int wordLength) { char[] echars = new char[wordLength + 2]; echars[0] = echars[echars.length - 1] = '.'; for (int i = 0; i < wordLength; i++) { echars[1 + i] = Character.toLowerCase(chars[wordStart + i]); } return echars; } /** * Generate a hyphen qualification points for a word by applying rules. * * @param wordChars Word surrounded by '.' characters * @param length Length of the word (excluding '.' characters) * @return hyphen qualification points for the word */ @SuppressWarnings("rawtypes") private int[] applyHyphenationRules(final char[] wordChars, final int length) { int[] hyphenQualificationPoints = new int[wordChars.length + 1]; for (int istart = 0; istart < length; istart++) { List rules = ruleSet.getPatternTree((int) wordChars[istart]); int i = istart; java.util.Enumeration rulesEnumeration = rules.elements(); while (rulesEnumeration.hasMoreElements()) { rules = (List) rulesEnumeration.nextElement(); if (((Character) rules.head()).charValue() == wordChars[i]) { rules = rules.longTail(); // values int[] nodevalues = (int[]) rules.head(); for (int inv = 0; inv < nodevalues.length; inv++) { if (nodevalues[inv] > hyphenQualificationPoints[istart + inv]) { hyphenQualificationPoints[istart + inv] = nodevalues[inv]; } } i++; if (i == wordChars.length) { break; } rulesEnumeration = rules.longTail().elements(); // child // nodes } } } int[] newvalues = new int[length]; System.arraycopy(hyphenQualificationPoints, 2, newvalues, 0, length); // save // 12 // bytes; // senseless hyphenQualificationPoints = newvalues; return hyphenQualificationPoints; } private class ForwardingErrorHandler implements ErrorHandler { private ErrorHandler target; public ForwardingErrorHandler(ErrorHandler target) { this.target = target; } public ErrorHandler getTarget() { return target; } public void setTarget(ErrorHandler target) { this.target = target; } public void debug(String domain, String message) { target.debug(domain, message); } public void info(String s) { target.info(s); } public void warning(String s) { target.warning(s); } public void error(String s) { target.error(s); } public void exception(String s, Exception e) { target.exception(s, e); } } }