Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringEscapeUtils unescapeHtml4.

Prototype

public static final String unescapeHtml4(final String input)

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:org.codelabor.example.xss.commons.StringEscapeUtilsTest.java

/**
 * Escapes a small HTML fragment, unescapes the result again and logs the value at each step.
 * Nothing is asserted; the method only writes debug output.
 */
public void testEscapeHtml() {
    final String original = "<b>hello, world!</b>";
    logger.debug("beforeReplace: {}", original);

    // Forward direction: markup characters become entity references.
    final String escaped = StringEscapeUtils.escapeHtml4(original);
    logger.debug("StringEscapeUtils.escapeHtml4: {}", escaped);

    // Reverse direction: entity references are turned back into characters.
    final String restored = StringEscapeUtils.unescapeHtml4(escaped);
    logger.debug("StringEscapeUtils.unescapeHtml4: {}", restored);
}

From source file:org.coursera.courier.grammar.ParseUtils.java

/**
 * Restores the comment delimiters in an escaped doc string and then decodes all remaining
 * HTML entity references.
 *
 * @param escaped the escaped doc string; must not be {@code null}
 * @return the doc string with delimiters and entity references decoded
 */
private static String unescapeDocstring(String escaped) {
    // "&#47;&#42;" stands for the comment opener, "&#42;&#47;" for the comment closer.
    final String withOpeners = escaped.replace("&#47;&#42;", "/*");
    final String withDelimiters = withOpeners.replace("&#42;&#47;", "*/");
    return StringEscapeUtils.unescapeHtml4(withDelimiters);
}

From source file:org.dbgl.util.searchengine.WebSearchEngine.java

/**
 * Converts a chunk of HTML to plain text: the three line-break tag variants become newlines,
 * {@code &nbsp;} and {@code &apos;} are replaced explicitly, the text is stripped and finally
 * all remaining entity references are decoded.
 *
 * @param htmlChunk the HTML fragment to convert
 * @return the stripped, unescaped text
 */
protected static String unescapeHtml(final String htmlChunk) {
    // Line breaks first, one pass per tag variant.
    String text = replaceTag(HTML_BR_UNCLOSED, "\n", htmlChunk);
    text = replaceTag(HTML_BR_CLOSED, "\n", text);
    text = replaceTag(HTML_BR_CLOSED_ALT, "\n", text);

    // These two references are substituted by hand before the generic decoding step.
    text = replaceTag("&nbsp;", " ", text);
    text = replaceTag("&apos;", "'", text);

    final String stripped = StringUtils.strip(text);
    return StringEscapeUtils.unescapeHtml4(stripped);
}

From source file:org.dice_research.topicmodeling.io.reuters.ReutersStringParser.java

/**
 * Collapses whitespace and decodes HTML character references in the given string.
 * <p>
 * Every run of whitespace characters (CR, LF, tab, U+00A0 or space) is replaced by a single
 * space. A candidate reference ({@code '&'}, then {@code '#'} or an ASCII letter/digit, then
 * further ASCII letters/digits, then {@code ';'}) is handed to
 * {@link StringEscapeUtils#unescapeHtml4(String)}. Text that starts with {@code '&'} but does not
 * form such a reference is copied to the result unchanged; this includes an unfinished
 * reference at the very end of the input.
 *
 * @param s the text to clean; must not be {@code null}
 * @return the cleaned text
 */
public String parseString(String s) {
    // A candidate whose ';' would lie more than this many characters behind its '&' is given up.
    // NOTE(review): this keeps the original limit, which leaves longer HTML 4 names such as
    // "&epsilon;" or "&thetasym;" undecoded.
    final int maxDistanceToAmpersand = 7;

    StringBuilder newString = new StringBuilder(s.length());
    char[] chars = s.toCharArray();
    /*
     * 0 - normal state
     * 1 - saw "&" before
     * 2 - saw "&[#A-Za-z0-9]" before
     * 3 - saw a whitespace character before
     */
    int state = 0;
    // Index of the '&' that opened the current candidate reference (meaningful in states 1 and 2).
    int entityStart = 0;
    int pos = 0;
    while (pos < chars.length) {
        char c = chars[pos];
        switch (state) {
        case 0:
        case 3: {
            if (isCollapsibleWhitespace(c)) {
                // Only the first character of a whitespace run produces output.
                if (state == 0) {
                    newString.append(' ');
                    state = 3;
                }
            } else if (c == '&') {
                entityStart = pos;
                state = 1;
            } else {
                newString.append(c);
                state = 0;
            }
            ++pos;
            break;
        }
        case 1: {
            if (c == '#' || isEntityNameChar(c)) {
                state = 2;
                ++pos;
            } else {
                // A lone '&'. It is emitted and c is examined again in the normal state (pos is
                // not advanced), so whitespace is collapsed and another '&' opens a new candidate.
                newString.append('&');
                state = 0;
            }
            break;
        }
        case 2: {
            boolean withinLimit = (pos - entityStart) <= maxDistanceToAmpersand;
            if (withinLimit && isEntityNameChar(c)) {
                ++pos;
            } else if (withinLimit && c == ';') {
                newString.append(StringEscapeUtils.unescapeHtml4(s.substring(entityStart, pos + 1)));
                state = 0;
                ++pos;
            } else {
                // Not a reference after all: copy what was collected so far and examine c again
                // in the normal state (pos is not advanced).
                newString.append(s, entityStart, pos);
                state = 0;
            }
            break;
        }
        default:
            throw new IllegalStateException("unexpected parser state " + state);
        } // switch (state)
    }

    // The input ended inside a candidate reference: keep the collected text instead of dropping it.
    if (state == 1 || state == 2) {
        newString.append(s, entityStart, chars.length);
    }

    return newString.toString();
}

/** Tells whether {@code c} is one of the characters that are collapsed into a single space. */
private static boolean isCollapsibleWhitespace(char c) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == 0xA0;
}

/** Tells whether {@code c} is an ASCII letter or digit, i.e. may continue a candidate reference. */
private static boolean isEntityNameChar(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
}

From source file:org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.NewsDeMarkupRemovingSupplierDecorator.java

/**
 * Decodes the encoded character found at {@code text[pos, pos + length)} and appends the
 * decoded form to {@code cleanText}.
 *
 * @param cleanText the builder receiving the decoded text
 * @param text      the text containing the encoded character
 * @param pos       index of the first character of the encoded form
 * @param length    number of characters of the encoded form
 */
private void handleHtmlEncodedChar(StringBuilder cleanText, String text, int pos, int length) {
    final int end = pos + length;
    final String encoded = text.substring(pos, end);
    final String decoded = StringEscapeUtils.unescapeHtml4(encoded);
    cleanText.append(decoded);
}

From source file:org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.SimpleHtmlCleaner.java

/**
 * Removes markup tags from the given text and decodes the HTML character references in it.
 * <p>
 * Everything from a {@code '<'} up to and including the next {@code '>'} is dropped. A candidate
 * reference ({@code '&'}, optionally {@code '#'} directly after it, ASCII letters/digits,
 * {@code ';'}) is replaced by the result of {@link StringEscapeUtils#unescapeHtml4(String)}.
 * All other characters, including an unterminated tag or reference, are copied unchanged.
 *
 * @param text the text to clean; must not be {@code null}
 * @return the cleaned text
 */
public String clearText(String text) {
    // '&' + up to eight name characters + ';' (e.g. "&thetasym;", "&#x10FFFF;"). The limit is
    // sized so that every HTML 4 named reference is still decoded.
    final int maxEntityLength = 10;

    States state = States.NORMAL_TEXT;
    // Index of the '<' or '&' that opened the tag / candidate reference currently being read.
    int startPos = 0;
    // Index of the first character that has not been copied to the result yet.
    int lastPos = 0;
    StringBuilder cleanedString = new StringBuilder();
    char[] chars = text.toCharArray();
    for (int i = 0; i < chars.length; ++i) {
        char c = chars[i];

        if (state == States.ENCODED_CHAR_STARTED) {
            // Distance from the opening '&' to the current character (always >= 1 here).
            int diffToPos = i - startPos;
            boolean withinLimit = diffToPos < maxEntityLength;
            if (withinLimit && c == ';') {
                cleanedString.append(text, lastPos, startPos);
                lastPos = i + 1;
                cleanedString.append(StringEscapeUtils.unescapeHtml4(text.substring(startPos, lastPos)));
                state = States.NORMAL_TEXT;
                continue;
            }
            // '#' is only accepted directly after the '&'.
            if (withinLimit && (isAsciiLetterOrDigit(c) || (c == '#' && diffToPos == 1))) {
                continue;
            }
            // Not a reference after all. The collected characters stay in the text untouched and
            // c is handled as normal text below, so a '<' or '&' at this position is not lost.
            state = States.NORMAL_TEXT;
        }

        if (state == States.TAG_STARTED) {
            if (c == '>') {
                // Copy the text in front of the tag and skip the tag itself.
                cleanedString.append(text, lastPos, startPos);
                lastPos = i + 1;
                state = States.NORMAL_TEXT;
            }
        } else if (state == States.NORMAL_TEXT) {
            if (c == '<') {
                state = States.TAG_STARTED;
                startPos = i;
            } else if (c == '&') {
                state = States.ENCODED_CHAR_STARTED;
                startPos = i;
            }
        }
    }
    cleanedString.append(text, lastPos, text.length());
    return cleanedString.toString();
}

/** Tells whether {@code c} is an ASCII letter or digit. */
private static boolean isAsciiLetterOrDigit(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
}

From source file:org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.WikipediaMarkupDeletingDecorator.java

/**
 * Decodes every HTML entity reference ({@code '&'}, then {@code '#'} or an alphanumeric
 * character, further alphanumeric characters, {@code ';'}) found in the given text and leaves
 * everything else untouched.
 *
 * @param text the text to process; must not be {@code null}
 * @return the text with all matched references decoded
 */
private static String unescapeSymbols(final String text) {
    final Matcher entityMatcher = Pattern.compile("(&[#\\p{Alnum}][\\p{Alnum}]*;)").matcher(text);
    final StringBuilder result = new StringBuilder();

    // Index of the first character that has not been copied to the result yet.
    int copiedUpTo = 0;
    while (entityMatcher.find()) {
        // Plain text in front of the reference, then the decoded reference itself.
        result.append(text, copiedUpTo, entityMatcher.start());
        result.append(StringEscapeUtils.unescapeHtml4(entityMatcher.group()));
        copiedUpTo = entityMatcher.end();
    }
    result.append(text, copiedUpTo, text.length());
    return result.toString();
}

From source file:org.drftpd.util.HttpUtils.java

/**
 * Reduces an HTML fragment to plain Basic Latin text.
 * <p>
 * Line feeds are removed, entity references are decoded, the text is normalized to NFD, every
 * character outside the Basic Latin block is dropped, and finally every tag (the text from a
 * {@code '<'} up to and including the next {@code '>'}) is removed. A {@code '<'} that is not
 * followed by any {@code '>'} is not a tag; it and the text after it are kept as they are.
 *
 * @param input the HTML fragment; must not be {@code null}
 * @return the plain text
 */
public static String htmlToString(String input) {
    String str = input.replace("\n", "");
    str = StringEscapeUtils.unescapeHtml4(str);
    str = Normalizer.normalize(str, Normalizer.Form.NFD);
    str = str.replaceAll("\\P{InBasic_Latin}", "");

    StringBuilder plain = new StringBuilder(str.length());
    // Index of the first character that has not been copied to the result yet.
    int copyFrom = 0;
    int tagStart = str.indexOf('<', copyFrom);
    while (tagStart >= 0) {
        int tagEnd = str.indexOf('>', tagStart);
        if (tagEnd < 0) {
            // No closing '>' anywhere behind this '<': stop here instead of looping forever.
            break;
        }
        plain.append(str, copyFrom, tagStart);
        copyFrom = tagEnd + 1;
        tagStart = str.indexOf('<', copyFrom);
    }
    plain.append(str, copyFrom, str.length());
    return plain.toString();
}

From source file:org.eclipse.agail.recommenderserver.Recommenders.java

/**
 * Completes every entry of the given list with data scraped from its page on
 * flows.nodered.org: the absolute link, the description, and, depending on the entry type,
 * the JavaScript code ("flow") or the install command ("node").
 * <p>
 * This is a best-effort operation: when downloading or parsing the page of an entry fails, the
 * stack trace is printed and processing continues with the next entry.
 *
 * @param wflist the list whose entries are updated in place
 * @return the same {@code wflist} instance
 */
private static ListOfWFs updateFlowsNodes(ListOfWFs wflist) {

    for (int i = 0; i < wflist.getWfList().size(); i++) {

        // UPDATE LINK
        wflist.getWfList().get(i).setHref("https://flows.nodered.org" + wflist.getWfList().get(i).getHref());

        try {
            URL url = new URL(wflist.getWfList().get(i).getHref().toString());
            HttpsURLConnection con = (HttpsURLConnection) url.openConnection();
            con.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2");
            con.connect();

            String str;
            // Close the response stream even when reading it fails.
            try (InputStream input = con.getInputStream()) {
                byte[] bytes = IOUtils.toByteArray(input);
                // Decode with a fixed charset instead of the platform default.
                // NOTE(review): UTF-8 is assumed for the page - confirm against the response headers.
                str = new String(bytes, java.nio.charset.StandardCharsets.UTF_8);
            }
            String substr = str;

            // ADD DESCRIPTION
            // flow-description"> or "flow-title">
            int start = str.lastIndexOf("flow-description\">");
            if (start != -1) {
                substr = str.substring(start);
                start = substr.indexOf(">");
                start += 1;
                substr = substr.substring(start);
            } else {
                // Fallback: first paragraph after the title.
                // NOTE(review): neither marker lookup here is checked for -1.
                start = str.lastIndexOf("flow-title\">");
                start += 12;
                substr = str.substring(start);
                start = substr.indexOf("<p>");
                start += 3;
                substr = substr.substring(start);
            }

            // A missing "</p>" makes substring throw; the catch below then skips this entry.
            int end = substr.indexOf("</p>");
            String desc = substr.substring(0, end);
            wflist.getWfList().get(i).setDescription(desc);

            // ADD JS CODE
            if (wflist.getWfList().get(i).getType().equals("flow")) {

                start = str.indexOf("javascript\">");
                start += 12;
                substr = str.substring(start);
                end = substr.indexOf("</pre>");

                // The code is embedded HTML-escaped in the page.
                String code = StringEscapeUtils.unescapeHtml4(substr.substring(0, end));
                wflist.getWfList().get(i).setJavascriptCode(code);
            }

            // ADD INSTALL COMMAND
            if (wflist.getWfList().get(i).getType().equals("node")) {
                start = str.indexOf("<code>npm install ");
                // Skip the "<code>" tag itself (6 characters).
                start += 6;
                substr = str.substring(start);
                end = substr.indexOf("</code>");

                String command = substr.substring(0, end);
                wflist.getWfList().get(i).setInstallCommand(command);
            }

        } catch (Exception e) {
            // Best effort: report the failure and go on with the next entry.
            e.printStackTrace();
        }
    }

    return wflist;
}

From source file:org.eclipse.agail.recommenderserver.Test.java

/**
 * Prints a sample string containing several HTML entity references before and after decoding
 * it with {@link StringEscapeUtils#unescapeHtml4(String)}.
 */
public static void decodingHtml() {
    final String encoded = "&quot; &lt; &gt; &#x3D; &#39; &#x2F;";
    System.out.println("Before: " + encoded);

    // unescapeHtml4 is a static method, so no StringEscapeUtils instance is needed.
    final String decoded = StringEscapeUtils.unescapeHtml4(encoded);
    System.out.println("After: " + decoded);
}