Usage examples for org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4
public static final String unescapeHtml4(final String input)
Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.
From source file:org.codelabor.example.xss.commons.StringEscapeUtilsTest.java
/**
 * Escapes a small HTML fragment with {@code escapeHtml4}, reverses it with
 * {@code unescapeHtml4}, and logs the value at every stage.
 */
public void testEscapeHtml() {
    final String original = "<b>hello, world!</b>";
    logger.debug("beforeReplace: {}", original);

    final String escaped = StringEscapeUtils.escapeHtml4(original);
    logger.debug("StringEscapeUtils.escapeHtml4: {}", escaped);

    final String restored = StringEscapeUtils.unescapeHtml4(escaped);
    logger.debug("StringEscapeUtils.unescapeHtml4: {}", restored);
}
From source file:org.coursera.courier.grammar.ParseUtils.java
private static String unescapeDocstring(String escaped) { // unescape "/*" and "*/" String commentUnescaped = escaped.replace("/*", "/*").replace("*/", "*/"); return StringEscapeUtils.unescapeHtml4(commentUnescaped); }
From source file:org.dbgl.util.searchengine.WebSearchEngine.java
/**
 * Converts an HTML fragment to plain text: line-break tags become newlines, a few entities are
 * replaced by hand, the result is stripped of surrounding whitespace and finally all remaining
 * HTML 4 entities are decoded.
 *
 * @param htmlChunk the HTML fragment to convert
 * @return the unescaped, stripped text
 */
protected static String unescapeHtml(final String htmlChunk) {
    String result = replaceTag(HTML_BR_UNCLOSED, "\n", htmlChunk);
    result = replaceTag(HTML_BR_CLOSED, "\n", result);
    result = replaceTag(HTML_BR_CLOSED_ALT, "\n", result);
    // Handled before unescapeHtml4 on purpose: unescapeHtml4 would turn &nbsp; into U+00A0
    // rather than a plain space, and &apos; is not an HTML 4 entity so it would be left as is.
    // NOTE(review): assumes replaceTag(search, replacement, text) accepts an entity as the
    // search term - confirm against its definition.
    result = replaceTag("&nbsp;", " ", result);
    result = replaceTag("&apos;", "'", result);
    return StringEscapeUtils.unescapeHtml4(StringUtils.strip(result));
}
From source file:org.dice_research.topicmodeling.io.reuters.ReutersStringParser.java
public String parseString(String s) { StringBuilder newString = new StringBuilder(); char chars[] = s.toCharArray(); char c;/*from ww w .ja v a 2 s.c o m*/ /* * 0 - normal state * 1 - saw "&" before * 2 - saw "&[#A-Za-z]" before * 3 - saw a whitespace character before */ int state = 0; int diffToPos = 0; for (int pos = 0; pos < chars.length; ++pos) { c = chars[pos]; switch (state) { case 0: { switch (c) { case '\r': case '\n': case '\t': case 0xA0: case ' ': { newString.append(' '); state = 3; break; } case '&': { state = 1; break; } default: { newString.append(c); break; } } break; } case 1: { switch (c) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '#': { state = 2; diffToPos = 2; break; } case '\r': case '\n': case '\t': case 0xA0: case ' ': { newString.append(chars[pos - 1]); newString.append(' '); state = 3; break; } default: { newString.append(chars[pos - 1]); newString.append(c); state = 0; } } break; } case 2: { if (diffToPos > 7) { // no encoded character has such a long encoding newString.append(s.substring(pos - diffToPos, pos + 1)); state = 0; break; } switch (c) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 
'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { ++diffToPos; break; } case ';': { newString.append(StringEscapeUtils.unescapeHtml4(s.substring(pos - diffToPos, pos + 1))); state = 0; break; } case '\r': case '\n': case '\t': case 0xA0: case ' ': { newString.append(s.substring(pos - diffToPos, pos + 1)); state = 3; break; } default: { newString.append(s.substring(pos - diffToPos, pos + 1)); state = 0; } } break; } case 3: { switch (c) { case '\r': case '\n': case '\t': case 0xA0: case ' ': { // nothing to do break; } case '&': { state = 1; break; } default: { newString.append(c); state = 0; break; } } break; } } // switch (state) } return newString.toString(); }
From source file:org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.NewsDeMarkupRemovingSupplierDecorator.java
/**
 * Decodes the HTML-encoded character found in {@code text} and appends the decoded form to
 * {@code cleanText}.
 *
 * @param cleanText the builder receiving the decoded text
 * @param text      the text containing the encoded character
 * @param pos       index in {@code text} at which the encoded character starts
 * @param length    number of characters the encoded character occupies
 */
private void handleHtmlEncodedChar(StringBuilder cleanText, String text, int pos, int length) {
    final int end = pos + length;
    final String encoded = text.substring(pos, end);
    final String decoded = StringEscapeUtils.unescapeHtml4(encoded);
    cleanText.append(decoded);
}
From source file:org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.SimpleHtmlCleaner.java
/**
 * Removes markup tags ({@code <...>}) from the given text and decodes HTML character references
 * ({@code &name;}, {@code &#number;}) via {@code StringEscapeUtils.unescapeHtml4}.
 *
 * <p>A tag that is never closed and a reference that is never terminated by {@code ;} are left
 * in the output unchanged.
 *
 * @param text the text to clean; must not be {@code null}
 * @return the text without tags and with character references decoded
 */
public String clearText(String text) {
    // Largest allowed distance between '&' and the terminating ';'. The longest HTML 4 named
    // reference (&thetasym;) and the longest numeric references (&#1114111;, &#x10FFFF;) all
    // have their ';' at offset 9.
    final int maxEntityOffset = 9;

    States state = States.NORMAL_TEXT;
    char c;
    int diffToPos;
    // Start index of the tag or reference currently being scanned.
    int startPos = 0;
    // Index of the first character that has not been copied to the output yet.
    int lastPos = 0;
    StringBuilder cleanedString = new StringBuilder();
    char chars[] = text.toCharArray();
    for (int i = 0; i < chars.length; ++i) {
        c = chars[i];

        if (state == States.ENCODED_CHAR_STARTED && (i - startPos) > maxEntityOffset) {
            // Too long to be a reference: give up on it and handle the current character as
            // normal text, so that a '<' or '&' at this position is not missed.
            state = States.NORMAL_TEXT;
        }

        switch (state) {
        case NORMAL_TEXT: {
            if (c == '<') {
                state = States.TAG_STARTED;
                startPos = i;
            } else if (c == '&') {
                state = States.ENCODED_CHAR_STARTED;
                startPos = i;
            }
            break;
        }
        case TAG_STARTED: {
            if (c == '>') {
                // Copy everything in front of the tag and skip the tag itself.
                cleanedString.append(text, lastPos, startPos);
                lastPos = i + 1;
                state = States.NORMAL_TEXT;
            }
            break;
        }
        case ENCODED_CHAR_STARTED: {
            // Distance from the '&' to the current character (always >= 1 here).
            diffToPos = i - startPos;
            boolean alphanumeric = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                    || (c >= '0' && c <= '9');
            if (alphanumeric) {
                // still inside the reference name
            } else if (c == '#') {
                // '#' is only valid directly behind the '&'
                if (diffToPos > 1) {
                    state = States.NORMAL_TEXT;
                }
            } else if (c == '<') {
                state = States.TAG_STARTED;
                startPos = i;
            } else if (c == '&') {
                // The previous '&' was literal; this one may start a reference.
                startPos = i;
            } else if (c == ';') {
                cleanedString.append(text, lastPos, startPos);
                lastPos = i + 1;
                cleanedString.append(StringEscapeUtils.unescapeHtml4(text.substring(startPos, lastPos)));
                state = States.NORMAL_TEXT;
            } else {
                state = States.NORMAL_TEXT;
            }
            break;
        }
        default:
            break;
        } // switch (state)
    } // for (int i = 0; i < chars.length; ++i)
    cleanedString.append(text.substring(lastPos));
    return cleanedString.toString();
}
From source file:org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.WikipediaMarkupDeletingDecorator.java
/**
 * Decodes every HTML character reference ({@code &name;} or {@code &#number;}) in the given
 * text and leaves all other characters untouched.
 *
 * @param text the text that may contain character references
 * @return the text with every matched reference passed through {@code unescapeHtml4}
 */
private static String unescapeSymbols(final String text) {
    final Matcher entityMatcher = Pattern.compile("(&[#\\p{Alnum}][\\p{Alnum}]*;)").matcher(text);
    final StringBuilder result = new StringBuilder();
    int copiedUpTo = 0;
    while (entityMatcher.find()) {
        // plain text between the previous match and this one
        result.append(text, copiedUpTo, entityMatcher.start());
        result.append(StringEscapeUtils.unescapeHtml4(entityMatcher.group()));
        copiedUpTo = entityMatcher.end();
    }
    // remainder behind the last match
    result.append(text, copiedUpTo, text.length());
    return result.toString();
}
From source file:org.drftpd.util.HttpUtils.java
public static String htmlToString(String input) { String str = input.replaceAll("\n", ""); str = StringEscapeUtils.unescapeHtml4(str); str = Normalizer.normalize(str, Normalizer.Form.NFD); str = str.replaceAll("\\P{InBasic_Latin}", ""); while (str.contains("<")) { int startPos = str.indexOf("<"); int endPos = str.indexOf(">", startPos); if (endPos > startPos) { String beforeTag = str.substring(0, startPos); String afterTag = str.substring(endPos + 1); str = beforeTag + afterTag;//from w w w .j ava2 s. c om } } return str; }
From source file:org.eclipse.agail.recommenderserver.Recommenders.java
private static ListOfWFs updateFlowsNodes(ListOfWFs wflist) { for (int i = 0; i < wflist.getWfList().size(); i++) { // UPDATE LINK wflist.getWfList().get(i).setHref("https://flows.nodered.org" + wflist.getWfList().get(i).getHref()); char[] out = new char[12000]; URL url;/* www . j a va 2 s .c o m*/ try { url = new URL(wflist.getWfList().get(i).getHref().toString()); HttpsURLConnection con = (HttpsURLConnection) url.openConnection(); con.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2"); con.connect(); InputStream input = con.getInputStream(); byte[] bytes = IOUtils.toByteArray(input); String str = new String(bytes); String substr = str; // ADD DESCRIPTION // flow-description"> or "flow-title"> int start = str.lastIndexOf("flow-description\">"); if (start != -1) { substr = str.substring(start); start = substr.indexOf(">"); start += 1; substr = substr.substring(start); } else { start = str.lastIndexOf("flow-title\">"); start += 12; substr = str.substring(start); start = substr.indexOf("<p>"); start += 3; substr = substr.substring(start); } int end = substr.indexOf("</p>"); String desc = substr.substring(0, end); wflist.getWfList().get(i).setDescription(desc); // ADD JS CODE if (wflist.getWfList().get(i).getType().equals("flow")) { start = str.indexOf("javascript\">"); start += 12; substr = str.substring(start); end = substr.indexOf("</pre>"); StringEscapeUtils util = new StringEscapeUtils(); String code = substr.substring(0, end); code = util.unescapeHtml4(code); // code = code.replace(""", "\""); // code = code.replace("<", "<"); // code = code.replace(">", ">"); // code = code.replace("=", "="); // code = code.replace("'", "'"); // code = code.replace("/", "/"); wflist.getWfList().get(i).setJavascriptCode(code); } // ADD INSTALL COMMAND if (wflist.getWfList().get(i).getType().equals("node")) { start = str.indexOf("<code>npm install "); start += 6; substr = str.substring(start); end = 
substr.indexOf("</code>"); String command = substr.substring(0, end); wflist.getWfList().get(i).setInstallCommand(command); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } return wflist; }
From source file:org.eclipse.agail.recommenderserver.Test.java
public static void decodingHtml() { StringEscapeUtils util = new StringEscapeUtils(); String test = "" < > = ' /"; System.out.println("Before: " + test); test = util.unescapeHtml4(test); System.out.println("After: " + test); }