// Java tutorial
/* * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package java.net; import java.io.InputStream; import java.io.IOException; import java.security.AccessController; import java.security.PrivilegedAction; import sun.net.idn.StringPrep; import sun.net.idn.Punycode; import sun.text.normalizer.UCharacterIterator; /** * Provides methods to convert internationalized domain names (IDNs) between * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation. * Internationalized domain names can use characters from the entire range of * Unicode, while traditional domain names are restricted to ASCII characters. * ACE is an encoding of Unicode strings that uses only ASCII characters and * can be used with software (such as the Domain Name System) that only * understands traditional domain names. 
* * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert * domain name string back and forth. * * <p>The behavior of aforementioned conversion process can be adjusted by various flags: * <ul> * <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted * can contain code points that are unassigned in Unicode 3.2, which is the * Unicode version on which IDN conversion is based. If the flag is not used, * the presence of such unassigned code points is treated as an error. * <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>. * It is an error if they don't meet the requirements. * </ul> * These flags can be logically OR'ed together. * * <p>The security consideration is important with respect to internationalization * domain name support. For example, English domain names may be <i>homographed</i> * - maliciously misspelled by substitution of non-Latin letters. * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a> * discusses security issues of IDN support as well as possible solutions. * Applications are responsible for taking adequate security measures when using * international domain names. 
* * @author Edward Wang * @since 1.6 * */ public final class IDN { /** * Flag to allow processing of unassigned code points */ public static final int ALLOW_UNASSIGNED = 0x01; /** * Flag to turn on the check against STD-3 ASCII rules */ public static final int USE_STD3_ASCII_RULES = 0x02; /** * Translates a string from Unicode to ASCII Compatible Encoding (ACE), * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. * * <p>ToASCII operation can fail. ToASCII fails if any step of it fails. * If ToASCII operation fails, an IllegalArgumentException will be thrown. * In this case, the input string should not be used in an internationalized domain name. * * <p> A label is an individual part of a domain name. The original ToASCII operation, * as defined in RFC 3490, only operates on a single label. This method can handle * both label and entire domain name, by assuming that labels in a domain name are * always separated by dots. The following characters are recognized as dots: * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), * and \uFF61 (halfwidth ideographic full stop). if dots are * used as label separators, this method also changes all of them to \u002E (full stop) * in output translated string. 
* * @param input the string to be processed * @param flag process flag; can be 0 or any logical OR of possible flags * * @return the translated {@code String} * * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification */ public static String toASCII(String input, int flag) { int p = 0, q = 0; StringBuilder out = new StringBuilder(); if (isRootLabel(input)) { return "."; } while (p < input.length()) { q = searchDots(input, p); out.append(toASCIIInternal(input.substring(p, q), flag)); if (q != (input.length())) { // has more labels, or keep the trailing dot as at present out.append('.'); } p = q + 1; } return out.toString(); } /** * Translates a string from Unicode to ASCII Compatible Encoding (ACE), * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. * * <p> This convenience method works as if by invoking the * two-argument counterpart as follows: * <blockquote> * {@link #toASCII(String, int) toASCII}(input, 0); * </blockquote> * * @param input the string to be processed * * @return the translated {@code String} * * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification */ public static String toASCII(String input) { return toASCII(input, 0); } /** * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. * * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified. * * <p> A label is an individual part of a domain name. The original ToUnicode operation, * as defined in RFC 3490, only operates on a single label. This method can handle * both label and entire domain name, by assuming that labels in a domain name are * always separated by dots. 
The following characters are recognized as dots: * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), * and \uFF61 (halfwidth ideographic full stop). * * @param input the string to be processed * @param flag process flag; can be 0 or any logical OR of possible flags * * @return the translated {@code String} */ public static String toUnicode(String input, int flag) { int p = 0, q = 0; StringBuilder out = new StringBuilder(); if (isRootLabel(input)) { return "."; } while (p < input.length()) { q = searchDots(input, p); out.append(toUnicodeInternal(input.substring(p, q), flag)); if (q != (input.length())) { // has more labels, or keep the trailing dot as at present out.append('.'); } p = q + 1; } return out.toString(); } /** * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. * * <p> This convenience method works as if by invoking the * two-argument counterpart as follows: * <blockquote> * {@link #toUnicode(String, int) toUnicode}(input, 0); * </blockquote> * * @param input the string to be processed * * @return the translated {@code String} */ public static String toUnicode(String input) { return toUnicode(input, 0); } /* ---------------- Private members -------------- */ // ACE Prefix is "xn--" private static final String ACE_PREFIX = "xn--"; private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length(); private static final int MAX_LABEL_LENGTH = 63; // single instance of nameprep private static StringPrep namePrep = null; static { InputStream stream = null; try { final String IDN_PROFILE = "uidna.spp"; if (System.getSecurityManager() != null) { stream = AccessController.doPrivileged(new PrivilegedAction<>() { public InputStream run() { return StringPrep.class.getResourceAsStream(IDN_PROFILE); } }); } else { stream = StringPrep.class.getResourceAsStream(IDN_PROFILE); } namePrep = new StringPrep(stream); 
stream.close(); } catch (IOException e) { // should never reach here assert false; } } /* ---------------- Private operations -------------- */ // // to suppress the default zero-argument constructor // private IDN() { } // // toASCII operation; should only apply to a single label // private static String toASCIIInternal(String label, int flag) { // step 1 // Check if the string contains code points outside the ASCII range 0..0x7c. boolean isASCII = isAllASCII(label); StringBuffer dest; // step 2 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here if (!isASCII) { UCharacterIterator iter = UCharacterIterator.getInstance(label); try { dest = namePrep.prepare(iter, flag); } catch (java.text.ParseException e) { throw new IllegalArgumentException(e); } } else { dest = new StringBuffer(label); } // step 8, move forward to check the smallest number of the code points // the length must be inside 1..63 if (dest.length() == 0) { throw new IllegalArgumentException("Empty label is not a legal name"); } // step 3 // Verify the absence of non-LDH ASCII code points // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f // Verify the absence of leading and trailing hyphen boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0); if (useSTD3ASCIIRules) { for (int i = 0; i < dest.length(); i++) { int c = dest.charAt(i); if (isNonLDHAsciiCodePoint(c)) { throw new IllegalArgumentException("Contains non-LDH ASCII characters"); } } if (dest.charAt(0) == '-' || dest.charAt(dest.length() - 1) == '-') { throw new IllegalArgumentException("Has leading or trailing hyphen"); } } if (!isASCII) { // step 4 // If all code points are inside 0..0x7f, skip to step 8 if (!isAllASCII(dest.toString())) { // step 5 // verify the sequence does not begin with ACE prefix if (!startsWithACEPrefix(dest)) { // step 6 // encode the sequence with punycode try { dest = Punycode.encode(dest, null); } catch (java.text.ParseException e) { throw new IllegalArgumentException(e); } dest 
= toASCIILower(dest); // step 7 // prepend the ACE prefix dest.insert(0, ACE_PREFIX); } else { throw new IllegalArgumentException("The input starts with the ACE Prefix"); } } } // step 8 // the length must be inside 1..63 if (dest.length() > MAX_LABEL_LENGTH) { throw new IllegalArgumentException("The label in the input is too long"); } return dest.toString(); } // // toUnicode operation; should only apply to a single label // private static String toUnicodeInternal(String label, int flag) { boolean[] caseFlags = null; StringBuffer dest; // step 1 // find out if all the codepoints in input are ASCII boolean isASCII = isAllASCII(label); if (!isASCII) { // step 2 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here try { UCharacterIterator iter = UCharacterIterator.getInstance(label); dest = namePrep.prepare(iter, flag); } catch (Exception e) { // toUnicode never fails; if any step fails, return the input string return label; } } else { dest = new StringBuffer(label); } // step 3 // verify ACE Prefix if (startsWithACEPrefix(dest)) { // step 4 // Remove the ACE Prefix String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length()); try { // step 5 // Decode using punycode StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null); // step 6 // Apply toASCII String toASCIIOut = toASCII(decodeOut.toString(), flag); // step 7 // verify if (toASCIIOut.equalsIgnoreCase(dest.toString())) { // step 8 // return output of step 5 return decodeOut.toString(); } } catch (Exception ignored) { // no-op } } // just return the input return label; } // // LDH stands for "letter/digit/hyphen", with characters restricted to the // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen // <->. // Non LDH refers to characters in the ASCII range, but which are not // letters, digits or the hyphen. 
// // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F // private static boolean isNonLDHAsciiCodePoint(int ch) { return (0x0000 <= ch && ch <= 0x002C) || (0x002E <= ch && ch <= 0x002F) || (0x003A <= ch && ch <= 0x0040) || (0x005B <= ch && ch <= 0x0060) || (0x007B <= ch && ch <= 0x007F); } // // search dots in a string and return the index of that character; // or if there is no dots, return the length of input string // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), // and \uFF61 (halfwidth ideographic full stop). // private static int searchDots(String s, int start) { int i; for (i = start; i < s.length(); i++) { if (isLabelSeparator(s.charAt(i))) { break; } } return i; } // // to check if a string is a root label, ".". // private static boolean isRootLabel(String s) { return (s.length() == 1 && isLabelSeparator(s.charAt(0))); } // // to check if a character is a label separator, i.e. a dot character. // private static boolean isLabelSeparator(char c) { return (c == '.' 
|| c == '\u3002' || c == '\uFF0E' || c == '\uFF61'); } // // to check if a string only contains US-ASCII code point // private static boolean isAllASCII(String input) { boolean isASCII = true; for (int i = 0; i < input.length(); i++) { int c = input.charAt(i); if (c > 0x7F) { isASCII = false; break; } } return isASCII; } // // to check if a string starts with ACE-prefix // private static boolean startsWithACEPrefix(StringBuffer input) { boolean startsWithPrefix = true; if (input.length() < ACE_PREFIX_LENGTH) { return false; } for (int i = 0; i < ACE_PREFIX_LENGTH; i++) { if (toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)) { startsWithPrefix = false; } } return startsWithPrefix; } private static char toASCIILower(char ch) { if ('A' <= ch && ch <= 'Z') { return (char) (ch + 'a' - 'A'); } return ch; } private static StringBuffer toASCIILower(StringBuffer input) { StringBuffer dest = new StringBuffer(); for (int i = 0; i < input.length(); i++) { dest.append(toASCIILower(input.charAt(i))); } return dest; } }