Java tutorial
//package com.java2s;

/**
 * String utilities for producing text that is safe to embed in XML 1.0 documents.
 */
public class Main {

    /**
     * Returns a copy of the input containing only characters that are valid
     * according to the {@code Char} production of the XML 1.0 standard. For
     * reference, please see
     * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the standard</a>.
     *
     * <p>The input is processed by Unicode code point rather than by Java
     * {@code char}, so a well-formed surrogate pair (one supplementary
     * character stored as two code units) is kept intact, while an unpaired
     * surrogate is dropped.
     *
     * @param s the String whose non-valid characters should be removed; may be {@code null}
     * @return {@code s} stripped of non-valid characters, or an empty String
     *         if {@code s} is {@code null} or empty; never {@code null}
     */
    public static String stripNonValidXMLCharacters(String s) {
        // Honour the documented contract: null or empty input yields "".
        if (s == null || s.isEmpty()) {
            return "";
        }

        // The result can never be longer than the input, so presize to avoid regrowth.
        StringBuilder out = new StringBuilder(s.length());

        int i = 0;
        while (i < s.length()) {
            // Full Unicode code point starting at index i (may span two chars).
            int codePoint = s.codePointAt(i);
            if (isValidXmlChar(codePoint)) {
                out.appendCodePoint(codePoint);
            }
            // Advance by the number of code units (Java chars) this code point occupies.
            i += Character.charCount(codePoint);
        }
        return out.toString();
    }

    /**
     * Tells whether a code point is allowed by the XML 1.0 {@code Char} production:
     * TAB, LF, CR, U+0020..U+D7FF, U+E000..U+FFFD and U+10000..U+10FFFF.
     */
    private static boolean isValidXmlChar(int codePoint) {
        // The large BMP range is tested first because it matches most real-world text.
        return (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }
}