Some Unicode characters can be encoded in more than one way.
A character can be represented by either a single Basic Multilingual Plane (BMP) character or a surrogate pair.
The JavaScript String normalize() method returns a string containing the Unicode Normalization Form of the given string.
str.normalize([form])
The optional form argument is one of "NFC", "NFD", "NFKC", or "NFKD". The form specifies the Unicode Normalization Form.
It defaults to "NFC".
These values have the following meanings:
Form | Meaning |
---|---|
NFC | Canonical Decomposition, followed by Canonical Composition. |
NFD | Canonical Decomposition. |
NFKC | Compatibility Decomposition, followed by Canonical Composition. |
NFKD | Compatibility Decomposition. |
For example, consider the following:
// U+00C5 is the Latin capital letter A with ring above.
console.log(String.fromCharCode(0x00C5));

// U+212B is the same letter.
console.log(String.fromCharCode(0x212B));

// U+0041 and U+030A combined together is the same letter.
console.log(String.fromCharCode(0x0041, 0x030A));

// The three encodings print identically but consist of different UTF-16
// code units, so strict equality treats them as three distinct strings.
const a1 = String.fromCharCode(0x00C5);
const a2 = String.fromCharCode(0x212B);
const a3 = String.fromCharCode(0x0041, 0x030A);

console.log(a1, a2, a3);

console.log(a1 === a2); // false
console.log(a1 === a3); // false
console.log(a2 === a3); // false
We can determine if a string is already normalized by checking it against the return value of normalize():
// Three encodings of the same visible letter (A with ring above).
const a1 = String.fromCharCode(0x00C5);
const a2 = String.fromCharCode(0x212B);
const a3 = String.fromCharCode(0x0041, 0x030A);

// A string is already in a given normalization form exactly when
// normalize(form) returns a string equal to the original.

// U+00C5 is the NFC/NFKC normalized form of U+212B
console.log(a1 === a1.normalize("NFD")); // false
console.log(a1 === a1.normalize("NFC")); // true
console.log(a1 === a1.normalize("NFKD")); // false
console.log(a1 === a1.normalize("NFKC")); // true

// U+212B is non-normalized
console.log(a2 === a2.normalize("NFD")); // false
console.log(a2 === a2.normalize("NFC")); // false
console.log(a2 === a2.normalize("NFKD")); // false
console.log(a2 === a2.normalize("NFKC")); // false

// U+0041/U+030A is the NFD/NFKD normalized form of U+212B
console.log(a3 === a3.normalize("NFD")); // true
console.log(a3 === a3.normalize("NFC")); // false
console.log(a3 === a3.normalize("NFKD")); // true
console.log(a3 === a3.normalize("NFKC")); // false

// Normalizing every operand to the same form makes the three
// encodings compare equal.
console.log(a1.normalize("NFD") === a2.normalize("NFD")); // true
console.log(a2.normalize("NFKC") === a3.normalize("NFKC")); // true
console.log(a1.normalize("NFC") === a3.normalize("NFC")); // true
Another example
// Two encodings of the same visible character (n with tilde):
// a single precomposed code point, and a base letter plus a combining mark.
let string1 = '\u00F1';
let string2 = '\u006E\u0303';

console.log(string1 === string2); // false
console.log(string1.length); // 1
console.log(string2.length); // 2

// Canonical equivalence normalization
string1 = string1.normalize('NFD');
string2 = string2.normalize('NFD');

console.log(string1 === string2); // true
console.log(string1.length); // 2
console.log(string2.length); // 2

// Specify "NFC" to get the composed canonical form:
// multiple code points are replaced with single code points where possible.
string1 = string1.normalize('NFC');
string2 = string2.normalize('NFC');

console.log(string1 === string2); // true
console.log(string1.length); // 1
console.log(string2.length); // 1
console.log(string2.codePointAt(0).toString(16)); // f1

// Compatibility normalization
string1 = string1.normalize('NFKD');
string2 = string2.normalize('NFKD');

console.log(string1);
console.log(string2);
console.log(string1 === string2); // true
console.log(string1.length); // 2
console.log(string2.length); // 2