Here you can find the source of cleanText(String s)
public static String cleanText(String s)
//package com.java2s;

/**
 * Text clean-up helpers for strings that were copy/pasted from word
 * processors and therefore contain typographic punctuation.
 */
public class Main {
    /** Canonical bullet character (code point 8226) that every bullet variant is mapped to. */
    public static final char BULLET = 8226;

    /**
     * Trims {@code s} and replaces typographic quotes, dashes, arrows, special
     * spaces and bullets with plain equivalents.
     *
     * <p>A warning is printed to {@code System.out} for every character above
     * 127, and a second warning for every such character that has no
     * replacement; characters without a replacement are copied unchanged.
     *
     * <p>The result is trimmed again after conversion, because a leading or
     * trailing special space (160 or 8195) is not removed by the initial
     * {@link String#trim()} and would otherwise come back as an ordinary
     * leading/trailing space.
     *
     * <p>NOTE(review): codes 165 and 208-213 are treated as bullet, dashes and
     * quotes (as seen after pasting from MS Word). In Unicode those code
     * points are also Latin-1 symbols/letters (e.g. 209 is N-tilde), so such
     * characters are rewritten too - confirm this is intended for your input.
     *
     * @param s the text to clean; must not be {@code null}
     * @return the cleaned text, without leading or trailing whitespace
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String cleanText(String s) {
        // Trim first so the warning below reports the same string as before.
        s = s.trim();
        int len = s.length();
        // Local, single-threaded accumulator: StringBuilder avoids the
        // needless synchronization of StringBuffer.
        StringBuilder cleanValue = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char ch = s.charAt(i);
            if (ch > 127) {
                System.out.println(
                        "WARNING: Non ASCII character " + ch + " (" + (int) ch + ") in following string\n" + s);
            }
            String convertedCh = replacementFor(ch);
            if (convertedCh == null) {
                if (ch > 127) {
                    System.out.println("WARNING: Unhandled non ASCII character " + ch + "(" + (int) ch + ")");
                }
                // No mapping: keep the character as it is.
                cleanValue.append(ch);
            } else {
                cleanValue.append(convertedCh);
            }
        }
        // Special spaces at the edges have become ' ' by now; strip them.
        return cleanValue.toString().trim();
    }

    /**
     * Returns the replacement text for {@code ch}, or {@code null} when the
     * character has no special handling.
     */
    private static String replacementFor(char ch) {
        // These strange character codes are what we see after copy/pasting
        // text from MS Word.
        switch (ch) {
        case 210: // Open double quote
        case 8220:
        case 211: // Close double quote
        case 8221:
            return "\"";
        case 212: // Open single quote
        case 8216:
        case 213: // Close single quote
        case 8217:
            return "'";
        case 8218: // Low single quote, rendered as a comma
            return ",";
        case 8594: // Right arrow
            return "->";
        case 65533: // Replacement character, treated like a dash
        case 208: // Dashes
        case 209:
        case 8211:
        case 8212:
            return "-";
        case 8195: // Funny space which messes up Java parsing.
        case 160:
            return " ";
        case 165: // Bullet point
        case BULLET:
            // Convert all bullets to a single special value.
            // It will be converted later.
            return String.valueOf(BULLET);
        default:
            return null;
        }
    }
}