Here you can find the source of normalizeWidth(String text)
Normalize the <a href="https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms"> width</a> of characters in the supplied text.
Parameter | Description |
---|---|
text | the text whose halfwidth/fullwidth characters should be normalized |
public static String normalizeWidth(String text)
//package com.java2s;
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2007 Didier Briel and Tiago Saboga
               2007 Zoltan Bartko - bartkozoltan@bartkozoltan.com
               2008 Andrzej Sawula
               2010-2013 Alex Buloichik
               2015 Zoltan Bartko, Aaron Madlon-Kay
               2016 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

import java.text.Normalizer;

/**
 * Text utilities for normalizing halfwidth/fullwidth character forms.
 */
public class Main {

    /** Fullwidth forms of U+FF61..U+FF9F (halfwidth CJK punctuation and Katakana), in code point order. */
    private static final String FULLWIDTH_KATAKANA =
            "\u3002\u300C\u300D\u3001\u30FB\u30F2\u30A1\u30A3\u30A5\u30A7\u30A9\u30E3\u30E5\u30E7\u30C3\u30FC"
            + "\u30A2\u30A4\u30A6\u30A8\u30AA\u30AB\u30AD\u30AF\u30B1\u30B3\u30B5\u30B7\u30B9\u30BB\u30BD\u30BF"
            + "\u30C1\u30C4\u30C6\u30C8\u30CA\u30CB\u30CC\u30CD\u30CE\u30CF\u30D2\u30D5\u30D8\u30DB\u30DE\u30DF"
            + "\u30E0\u30E1\u30E2\u30E4\u30E6\u30E8\u30E9\u30EA\u30EB\u30EC\u30ED\u30EF\u30F3\u3099\u309A";

    // Non-ASCII pieces used by the unit decompositions below.
    private static final String DEGREE = "\u00B0";
    private static final String SQUARED = "\u00B2";
    private static final String CUBED = "\u00B3";
    private static final String MICRO = "\u03BC";
    private static final String OHM = "\u03A9";
    private static final String LITRE = "\u2113";

    /**
     * Normalize the
     * <a href="https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms">
     * width</a> of characters in the supplied text. Specifically:
     * <ul>
     * <li>Fullwidth ASCII characters (and the ideographic space) will become
     * halfwidth
     * <li>Halfwidth Katakana characters will become fullwidth
     * <li>Halfwidth Hangul (consonants, vowels and the filler) will become
     * fullwidth
     * <li>Halfwidth arrows and box/shape symbols will become fullwidth
     * <li>Letter-like symbols and squared Latin abbreviations will be
     * decomposed to plain letters (e.g. U+33AF becomes "rad/s" followed by a
     * superscript two)
     * </ul>
     * If no character needed converting, the very same {@code text} instance
     * is returned without any further processing. Otherwise the converted
     * text is additionally NFC-normalized, which for example composes a
     * Katakana letter with a following converted voiced sound mark.
     * <p>
     * This method was adapted from <a href=
     * "https://bitbucket.org/okapiframework/okapi/src/52143104fcfc7eda204d04dfbbc273189f3a7f0f/okapi/steps/fullwidthconversion/src/main/java/net/sf/okapi/steps/fullwidthconversion/FullWidthConversionStep.java">
     * FullWidthConversionStep.java</a> in the Okapi Framework under GPLv2+.
     *
     * @param text
     *            the text to normalize; must not be {@code null}
     * @return Normalized-width text
     * @throws NullPointerException
     *             if {@code text} is {@code null}
     */
    public static String normalizeWidth(String text) {
        StringBuilder sb = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            String expansion = decompose(ch);
            if (expansion != null) {
                sb.append(expansion);
            } else {
                sb.append(convertWidth(ch));
            }
        }
        String result = sb.toString();
        if (text.equals(result)) {
            // No characters were changed. Return the original text so that
            // composition of unrelated characters is not affected.
            return text;
        }
        return normalizeUnicode(result);
    }

    /**
     * Map a single character to its width-normalized single-character
     * equivalent, or return it unchanged when no mapping applies.
     */
    private static char convertWidth(char ch) {
        // Fullwidth ASCII -> ASCII
        if (ch >= 0xFF01 && ch <= 0xFF5E) {
            return (char) (ch - 0xFEE0);
        }
        // Ideographic space -> ASCII space
        if (ch == 0x3000) {
            return ' ';
        }
        // Halfwidth CJK punctuation and Katakana -> fullwidth
        if (ch >= 0xFF61 && ch <= 0xFF9F) {
            return FULLWIDTH_KATAKANA.charAt(ch - 0xFF61);
        }
        // Halfwidth Hangul consonants (U+FFA1 inclusive) -> U+3131..U+314E
        if (ch >= 0xFFA1 && ch <= 0xFFBE) {
            return (char) (ch - 0xCE70);
        }
        // Halfwidth Hangul vowels come in four runs separated by unassigned
        // code points, so each run needs its own offset.
        if (ch >= 0xFFC2 && ch <= 0xFFC7) {
            return (char) (ch - 0xCE73); // -> U+314F..U+3154
        }
        if (ch >= 0xFFCA && ch <= 0xFFCF) {
            return (char) (ch - 0xCE75); // -> U+3155..U+315A
        }
        if (ch >= 0xFFD2 && ch <= 0xFFD7) {
            return (char) (ch - 0xCE77); // -> U+315B..U+3160
        }
        if (ch >= 0xFFDA && ch <= 0xFFDC) {
            return (char) (ch - 0xCE79); // -> U+3161..U+3163
        }
        switch (ch) {
        // Hangul filler
        case 0xFFA0:
            return (char) 0x3164;
        // Halfwidth symbols
        case 0xFFE8:
            return (char) 0x2502;
        case 0xFFE9:
            return (char) 0x2190;
        case 0xFFEA:
            return (char) 0x2191;
        case 0xFFEB:
            return (char) 0x2192;
        case 0xFFEC:
            return (char) 0x2193;
        case 0xFFED:
            return (char) 0x25A0;
        case 0xFFEE:
            return (char) 0x25CB;
        // Letter-like symbols with a single-letter equivalent
        case 0x212A:
            return 'K';
        case 0x212B:
            return (char) 0x00C5;
        default:
            return ch;
        }
    }

    /**
     * Return the multi-character replacement for a letter-like symbol or a
     * squared Latin abbreviation, or {@code null} when {@code ch} is not one
     * of the handled characters.
     */
    private static String decompose(char ch) {
        switch (ch) {
        // Letter-like symbols
        case 0x2100: return "a/c";
        case 0x2101: return "a/s";
        case 0x2103: return DEGREE + "C";
        case 0x2105: return "c/o";
        case 0x2109: return DEGREE + "F";
        case 0x2116: return "No";
        // Squared Latin Abbreviations 1
        case 0x3371: return "hPa";
        case 0x3372: return "da";
        case 0x3373: return "AU";
        case 0x3374: return "bar";
        case 0x3375: return "oV";
        case 0x3376: return "pc";
        case 0x3377: return "dm";
        case 0x3378: return "dm" + SQUARED;
        case 0x3379: return "dm" + CUBED;
        case 0x337A: return "IU";
        // Squared Latin Abbreviations 2
        case 0x3380: return "pA";
        case 0x3381: return "nA";
        case 0x3382: return MICRO + "A";
        case 0x3383: return "mA";
        case 0x3384: return "kA";
        case 0x3385: return "KB";
        case 0x3386: return "MB";
        case 0x3387: return "GB";
        case 0x3388: return "cal";
        case 0x3389: return "kcal";
        case 0x338A: return "pF";
        case 0x338B: return "nF";
        case 0x338C: return MICRO + "F";
        case 0x338D: return MICRO + "g";
        case 0x338E: return "mg";
        case 0x338F: return "kg";
        case 0x3390: return "Hz";
        case 0x3391: return "kHz";
        case 0x3392: return "MHz";
        case 0x3393: return "GHz";
        case 0x3394: return "THz";
        case 0x3395: return MICRO + LITRE;
        case 0x3396: return "m" + LITRE;
        case 0x3397: return "d" + LITRE;
        case 0x3398: return "k" + LITRE;
        case 0x3399: return "fm";
        case 0x339A: return "nm";
        case 0x339B: return MICRO + "m";
        case 0x339C: return "mm";
        case 0x339D: return "cm";
        case 0x339E: return "km";
        case 0x339F: return "mm" + SQUARED;
        case 0x33A0: return "cm" + SQUARED;
        case 0x33A1: return "m" + SQUARED;
        case 0x33A2: return "km" + SQUARED;
        case 0x33A3: return "mm" + CUBED;
        case 0x33A4: return "cm" + CUBED;
        case 0x33A5: return "m" + CUBED;
        case 0x33A6: return "km" + CUBED;
        case 0x33A7: return "m/s";
        case 0x33A8: return "m/s" + SQUARED;
        case 0x33A9: return "Pa";
        case 0x33AA: return "kPa";
        case 0x33AB: return "MPa";
        case 0x33AC: return "GPa";
        case 0x33AD: return "rad";
        case 0x33AE: return "rad/s";
        case 0x33AF: return "rad/s" + SQUARED;
        case 0x33B0: return "ps";
        case 0x33B1: return "ns";
        case 0x33B2: return MICRO + "s";
        case 0x33B3: return "ms";
        case 0x33B4: return "pV";
        case 0x33B5: return "nV";
        case 0x33B6: return MICRO + "V";
        case 0x33B7: return "mV";
        case 0x33B8: return "kV";
        case 0x33B9: return "MV";
        case 0x33BA: return "pW";
        case 0x33BB: return "nW";
        case 0x33BC: return MICRO + "W";
        case 0x33BD: return "mW";
        case 0x33BE: return "kW";
        case 0x33BF: return "MW";
        case 0x33C0: return "k" + OHM;
        case 0x33C1: return "M" + OHM;
        case 0x33C2: return "a.m.";
        case 0x33C3: return "Bq";
        case 0x33C4: return "cc";
        case 0x33C5: return "cd";
        case 0x33C6: return "C/kg";
        case 0x33C7: return "Co.";
        case 0x33C8: return "dB";
        case 0x33C9: return "Gy";
        case 0x33CA: return "ha";
        case 0x33CB: return "HP";
        case 0x33CC: return "in";
        case 0x33CD: return "KK";
        case 0x33CE: return "KM";
        case 0x33CF: return "Kt";
        case 0x33D0: return "lm";
        case 0x33D1: return "ln";
        case 0x33D2: return "log";
        case 0x33D3: return "lx";
        case 0x33D4: return "mb";
        case 0x33D5: return "mil";
        case 0x33D6: return "mol";
        case 0x33D7: return "pH";
        case 0x33D8: return "p.m.";
        case 0x33D9: return "PPM";
        case 0x33DA: return "PR";
        case 0x33DB: return "sr";
        case 0x33DC: return "Sv";
        case 0x33DD: return "Wb";
        // Volt per metre and ampere per metre: the unit symbols are capital.
        case 0x33DE: return "V/m";
        case 0x33DF: return "A/m";
        // Squared Latin Abbreviations 3
        case 0x33FF: return "gal";
        default:
            return null;
        }
    }

    /**
     * Apply Unicode NFC normalization to a string.
     *
     * @param text
     *            the character sequence to normalize
     * @return {@code text} as a string, converted to NFC if it was not
     *         already in that form
     */
    public static String normalizeUnicode(CharSequence text) {
        if (Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
            return text.toString();
        }
        return Normalizer.normalize(text, Normalizer.Form.NFC);
    }
}