Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find example usages of org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4.

Prototype

public static final String unescapeHtml4(final String input)

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:org.esigate.impl.UrlRewriter.java

/**
 * Decodes the HTML entity escapes contained in a URL.
 *
 * @param url the URL text, possibly containing entity escapes
 * @return the result of {@code StringEscapeUtils.unescapeHtml4(url)}
 */
private String unescapeHtml(String url) {
    // Unescape entities such as &amp; or &#39;.
    // NOTE(review): &apos; is not an HTML 4 entity and, per the Commons Lang
    // documentation, is presumably left untouched by unescapeHtml4 - confirm.
    return StringEscapeUtils.unescapeHtml4(url);
}

From source file:org.experiment.wikipedia.wikiclean.WikiClean.java

/**
 * Produces the cleaned text of a wiki page by running its markup through a fixed
 * sequence of removal passes, unescaping HTML entities, stripping HTML tags and
 * folding repeated newlines.
 *
 * @param page the page to clean
 * @return the trimmed cleaned text; when {@code withTitle} is set, the page title and a
 *         blank line are prepended
 */
public String clean(String page) {
    String text = getWikiMarkup(page);

    // The footer is only dropped when it was not explicitly requested.
    if (!withFooter) {
        text = removeFooter(text);
    }

    // Markup removal passes, applied in this exact sequence.
    text = removeRefs(text);
    text = removeInterWikiLinks(text);
    text = removeParentheticals(text);
    text = fixUnitConversion(text);
    text = ImageCaptionsRemover.remove(text);
    text = DoubleBracesRemover.remove(text);
    text = removeHtmlComments(text);
    text = removeEmphasis(text);
    text = removeHeadings(text);
    text = removeCategoryLinks(text);
    text = removeLinks(text);
    text = removeMath(text);
    text = removeGallery(text);
    text = removeNoToc(text);
    text = removeIndentation(text);

    text = TableRemover.remove(text);

    // For some reason, some HTML entities are doubly encoded, so decode twice.
    String decodedOnce = StringEscapeUtils.unescapeHtml4(text);
    String decodedTwice = StringEscapeUtils.unescapeHtml4(decodedOnce);
    text = removeHtmlTags(decodedTwice);

    // Finally, fold multiple newlines and drop surrounding whitespace.
    String body = compressMultipleNewlines(text).trim();

    return withTitle ? getTitle(page) + "\n\n" + body : body;
}

From source file:org.htmlcleaner.XWikiDOMSerializer.java

/**
 * Removes any existing CDATA section markers and unescapes HTML entities in the text that
 * lies outside the CDATA blocks; text inside a block is copied verbatim.
 *
 * @param content the text input to transform
 * @return the transformed content that will be wrapped inside a CDATA block
 */
private String processCDATABlocks(String content) {
    StringBuilder buffer = new StringBuilder();
    Matcher cdata = CDATA_PATTERN.matcher(content);
    int position = 0;
    while (cdata.find()) {
        int blockStart = cdata.start();
        // Text between the previous block and this one is entity-decoded.
        buffer.append(StringEscapeUtils.unescapeHtml4(content.substring(position, blockStart)));
        // Group 1 is excluded from the copied body.
        // NOTE(review): 9 is presumably the length of the "<![CDATA[" opening marker and
        // 3 the length of "]]>" - confirm against CDATA_PATTERN.
        int bodyEnd = cdata.end() - cdata.group(1).length();
        buffer.append(content.substring(blockStart + 9, bodyEnd));
        position = bodyEnd + 3;
    }
    // Copy the remaining text data into the result buffer.
    if (position < content.length()) {
        buffer.append(StringEscapeUtils.unescapeHtml4(content.substring(position)));
    }
    // Ensure there's no stray <![CDATA[ or ]]> remaining.
    return buffer.toString().replace("<![CDATA[", "").replace("]]>", "");
}

From source file:org.intermine.bio.io.gff3.GFF3Record.java

/**
 * Parses an attribute string of {@code name=value[,value...]} pairs separated by
 * {@code ;} and stores each name with its list of values in {@code attributes}.
 *
 * <p>The input is HTML-unescaped first. Values may be double-quoted or bare and are
 * separated by commas. Every value is passed through {@code XmlUtil.fixEntityNames};
 * values of attributes other than {@code Target} and {@code Gap} are URL-decoded as
 * UTF-8 beforehand.
 *
 * @param argAttributeString the raw attribute text
 * @param line the complete source line, used only in error messages
 * @throws IOException if a pair has no {@code =}, or if an opening quote is followed
 *         immediately by another quote or by no closing quote at all (the lone value
 *         {@code ""} is accepted and yields an empty list)
 */
private void parseAttribute(String argAttributeString, String line) throws IOException {
    String unescaped = StringEscapeUtils.unescapeHtml4(argAttributeString);

    // Split on ';' except where it is directly preceded by a backslash.
    String[] pairs = unescaped.split("(?<!\\\\);");

    for (String rawPair : pairs) {
        String pair = rawPair.trim();
        if (pair.length() == 0) {
            continue;
        }

        int equalsIndex = pair.indexOf("=");
        if (equalsIndex == -1) {
            throw new IOException(
                    "the attributes section must contain name=value pairs, " + "while parsing: " + line);
        }

        String attName = pair.substring(0, equalsIndex);
        String remaining = pair.substring(equalsIndex + 1).trim();
        List<String> valList = new ArrayList<String>();

        // A value that is exactly "" produces an empty value list.
        if (!"\"\"".equals(remaining)) {
            while (remaining.length() > 0) {
                if (remaining.startsWith("\"")) {
                    // Quoted value: consume up to the closing quote.
                    remaining = remaining.substring(1);
                    int closingQuote = remaining.indexOf("\"");
                    if (closingQuote <= 0) {
                        throw new IOException("unmatched quote in this line: " + line
                                + " (reading attribute: " + attName + ", " + remaining + ")");
                    }
                    valList.add(remaining.substring(0, closingQuote));
                    remaining = remaining.substring(closingQuote + 1).trim();
                    if (remaining.startsWith(",")) {
                        remaining = remaining.substring(1).trim();
                    }
                } else {
                    // Bare value: consume up to the next comma, or to the end.
                    int commaIndex = remaining.indexOf(",");
                    if (commaIndex == -1) {
                        valList.add(remaining);
                        remaining = "";
                    } else {
                        valList.add(remaining.substring(0, commaIndex));
                        remaining = remaining.substring(commaIndex + 1).trim();
                    }
                }
            }
        }

        // Decode values; Target and Gap are exempt from URL decoding.
        boolean urlDecode = !"Target".equals(attName) && !"Gap".equals(attName);
        for (int i = 0; i < valList.size(); i++) {
            String decoded = valList.get(i);
            if (urlDecode) {
                decoded = URLDecoder.decode(decoded, "UTF-8");
            }
            valList.set(i, XmlUtil.fixEntityNames(decoded));
        }
        attributes.put(attName, valList);
    }
}

From source file:org.jamwiki.parser.jflex.AbstractHeadingTag.java

/**
 * Builds the tag name for a heading from its table-of-contents text.
 *
 * @param lexer the lexer whose parser input holds the table of contents
 * @param tocText the table-of-contents text of the heading
 * @return the value returned by the table of contents' {@code buildUniqueName} for the
 *         unescaped text
 */
private String buildTagName(JFlexLexer lexer, String tocText) {
    // re-convert any &uuml; or other (converted by the parser) entities back
    return lexer.getParserInput().getTableOfContents()
            .buildUniqueName(StringEscapeUtils.unescapeHtml4(tocText));
}

From source file:org.jamwiki.parser.jflex.AbstractHeadingTag.java

/**
 * Parse a Mediawiki heading of the form "==heading==" and return the
 * resulting HTML output.
 *
 * <p>When the lexer mode is at or below {@code JFlexParser.MODE_SLICE}, the section name is
 * recorded on the parser output and the trimmed raw text is returned unchanged.
 *
 * @param lexer the lexer processing the heading
 * @param raw the raw heading text matched by the lexer
 * @param args additional arguments forwarded to the generate methods
 * @return the trimmed raw text in slice/splice mode, otherwise the generated output
 * @throws ParserException declared by this method; presumably raised by the helper calls
 * @throws IllegalStateException if the mode is above {@code MODE_SLICE} and the lexer is not
 *         a {@code JAMWikiLexer}
 */
public String parse(JFlexLexer lexer, String raw, Object... args) throws ParserException {
    if (logger.isTraceEnabled()) {
        logger.trace("heading: " + raw + " (" + lexer.yystate() + ")");
    }
    // the wikiheading tag may match a preceding newline, so strip it
    final String heading = raw.trim();
    final int level = this.generateTagLevel(heading, args);
    final String tagText = this.generateTagText(heading, args);
    final String tocText = this.buildTocText(lexer, tagText);
    // built before the mode check, exactly as the generated name is needed further down
    final String tagName = this.buildTagName(lexer, tocText);

    if (lexer.getMode() <= JFlexParser.MODE_SLICE) {
        lexer.getParserOutput().setSectionName(StringEscapeUtils.unescapeHtml4(tocText));
        return heading;
    }

    if (lexer instanceof JAMWikiLexer) {
        JAMWikiLexer wikiLexer = (JAMWikiLexer) lexer;
        if (wikiLexer.paragraphIsOpen()) {
            // close any open paragraph
            wikiLexer.popTag("p");
        }
        return this.generateOutput(wikiLexer, tagName, tocText, tagText, level, heading, args);
    }

    throw new IllegalStateException(
            "Cannot parse heading tags except with instances of JAMWikiLexer or in slice/splice mode");
}

From source file:org.jamwiki.parser.jflex.ImageLinkTag.java

/**
 * Parses the pipe-delimited parameter text of an image link into an
 * {@code ImageMetadata}.
 *
 * <p>Each token is matched, in order, against the border, horizontal alignment and
 * vertical alignment enums, then against the size ("...px"), "alt" and "link" patterns.
 * Tokens that match none of these are joined with "|" and treated as the caption. After
 * all tokens are consumed, a series of adjustments is applied to the collected values
 * (see the inline comments); these adjustments depend on the border type and on each
 * other's results, so their order matters.
 *
 * @param parserInput passed through to {@code JFlexParserUtil.parseFragment} for the caption
 * @param parserOutput passed through to {@code JFlexParserUtil.parseFragment} for the caption
 * @param mode passed through to {@code JFlexParserUtil.parseFragment} for the caption
 * @param paramText the raw parameter text; when blank, a default {@code ImageMetadata}
 *        is returned
 * @return the populated metadata, never {@code null}
 * @throws ParserException declared by this method; presumably raised while parsing the
 *         caption fragment
 */
private ImageMetadata parseImageParams(ParserInput parserInput, ParserOutput parserOutput, int mode,
        String paramText) throws ParserException {
    ImageMetadata imageMetadata = new ImageMetadata();
    if (StringUtils.isBlank(paramText)) {
        return imageMetadata;
    }
    String[] tokens = paramText.split("\\|");
    Matcher matcher;
    String caption = "";
    tokenLoop: for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];
        if (StringUtils.isBlank(token)) {
            continue;
        }
        token = token.trim();
        for (ImageBorderEnum border : EnumSet.allOf(ImageBorderEnum.class)) {
            // special case - for legacy reasons Mediawiki supports "thumbnail" instead of "thumb"
            if (StringUtils.equalsIgnoreCase(token, "thumbnail")) {
                token = "thumb";
            }
            if (border.toString().equalsIgnoreCase(token)) {
                if (border == ImageBorderEnum.BORDER) {
                    // border can be combined with frameless, so set a second attribute to track it
                    imageMetadata.setBordered(true);
                    if (imageMetadata.getBorder() == ImageBorderEnum.FRAMELESS) {
                        continue tokenLoop;
                    }
                }
                imageMetadata.setBorder(border);
                continue tokenLoop;
            }
        }
        for (ImageHorizontalAlignmentEnum horizontalAlignment : EnumSet
                .allOf(ImageHorizontalAlignmentEnum.class)) {
            if (horizontalAlignment.toString().equalsIgnoreCase(token)) {
                imageMetadata.setHorizontalAlignment(horizontalAlignment);
                continue tokenLoop;
            }
        }
        for (ImageVerticalAlignmentEnum verticalAlignment : EnumSet.allOf(ImageVerticalAlignmentEnum.class)) {
            if (verticalAlignment.toString().equalsIgnoreCase(token)) {
                imageMetadata.setVerticalAlignment(verticalAlignment);
                continue tokenLoop;
            }
        }
        // if none of the above tokens matched then check for size or caption
        if (token.toLowerCase().endsWith("px")) {
            matcher = IMAGE_SIZE_PATTERN.matcher(token);
            if (matcher.find()) {
                String maxWidth = matcher.group(1);
                if (!StringUtils.isBlank(maxWidth)) {
                    imageMetadata.setMaxWidth(Integer.valueOf(maxWidth));
                }
                String maxHeight = matcher.group(2);
                if (!StringUtils.isBlank(maxHeight)) {
                    imageMetadata.setMaxHeight(Integer.valueOf(maxHeight));
                }
                continue tokenLoop;
            }
        }
        if (token.toLowerCase().startsWith("alt")) {
            matcher = IMAGE_ALT_PATTERN.matcher(token);
            if (matcher.find()) {
                imageMetadata.setAlt(matcher.group(1).trim());
                continue tokenLoop;
            }
        }
        if (token.toLowerCase().startsWith("link")) {
            matcher = IMAGE_LINK_PATTERN.matcher(token);
            if (matcher.find()) {
                imageMetadata.setLink(matcher.group(1).trim());
                continue tokenLoop;
            }
        }
        // this is a bit hackish.  string together any remaining content (re-joined with
        // "|") as a possible caption; it is parsed and trimmed down after the loop
        caption += (StringUtils.isBlank(caption)) ? token : "|" + token;
    }
    // parse the caption, then drop everything up to and including the FIRST "|" in the
    // parsed result to handle syntax of the form "[[File:Example.gif|caption1|caption2]]".
    // If that pipe is the last character, the caption becomes a single space.
    if (!StringUtils.isBlank(caption)) {
        caption = JFlexParserUtil.parseFragment(parserInput, parserOutput, caption, mode);
        int pos = caption.indexOf('|');
        if (pos != -1) {
            caption = (pos >= (caption.length() - 1)) ? " " : caption.substring(pos + 1);
        }
        imageMetadata.setCaption(caption);
    }
    if (imageMetadata.getVerticalAlignment() != ImageVerticalAlignmentEnum.NOT_SPECIFIED
            && (imageMetadata.getBorder() == ImageBorderEnum.THUMB
                    || imageMetadata.getBorder() == ImageBorderEnum.FRAME)) {
        // per spec, vertical alignment can only be set for non-thumb and non-frame
        imageMetadata.setVerticalAlignment(ImageVerticalAlignmentEnum.NOT_SPECIFIED);
    }
    if (imageMetadata.getBorder() == ImageBorderEnum.THUMB
            || imageMetadata.getBorder() == ImageBorderEnum.FRAME) {
        // per spec, link can only be set for non-thumb and non-frame
        imageMetadata.setLink(null);
    }
    if (imageMetadata.getBorder() != ImageBorderEnum.THUMB && imageMetadata.getBorder() != ImageBorderEnum.FRAME
            && imageMetadata.getBorder() != ImageBorderEnum._GALLERY) {
        // per spec, captions are only displayed for thumbnails, framed images
        // and galleries, but the caption will be used as the alt and title for
        // other image types.
        if (!StringUtils.isBlank(imageMetadata.getCaption())) {
            // avoid double-escaping since the link builder escapes the title tag
            imageMetadata.setTitle(StringEscapeUtils.unescapeHtml4(imageMetadata.getCaption()));
        }
        if (imageMetadata.getAlt() == null) {
            // avoid double-escaping since the link builder escapes the alt tag
            imageMetadata.setAlt(StringEscapeUtils.unescapeHtml4(imageMetadata.getCaption()));
        }
        imageMetadata.setCaption(null);
    }
    if (imageMetadata.getBorder() == ImageBorderEnum.FRAME) {
        // per spec, frame cannot be resized
        imageMetadata.setMaxHeight(-1);
        imageMetadata.setMaxWidth(-1);
    }
    if ((imageMetadata.getBorder() == ImageBorderEnum.THUMB
            || imageMetadata.getBorder() == ImageBorderEnum.FRAMELESS) && imageMetadata.getMaxWidth() <= 0) {
        imageMetadata.setMaxWidth(DEFAULT_THUMBNAIL_WIDTH);
    }
    if (imageMetadata.getBordered() && (imageMetadata.getBorder() != ImageBorderEnum.BORDER
            && imageMetadata.getBorder() != ImageBorderEnum.FRAMELESS)) {
        // thumb, frame, etc handle borders differently
        imageMetadata.setBordered(false);
    }
    if (imageMetadata.getBorder() == ImageBorderEnum._GALLERY) {
        // internal use only
        imageMetadata.setHorizontalAlignment(ImageHorizontalAlignmentEnum.CENTER);
        // 10 pixels is for padding
        imageMetadata.setGalleryHeight(imageMetadata.getMaxHeight() + 10);
        // galleries use either the file name or nothing as the alt tag
        if (!StringUtils.isBlank(imageMetadata.getCaption())) {
            imageMetadata.setAlt("");
        }
    }
    return imageMetadata;
}

From source file:org.jbpm.designer.web.profile.impl.JbpmProfileImpl.java

/**
 * Creates a marshaller that converts JSON diagram models through
 * {@code Bpmn2JsonUnmarshaller}.
 *
 * @return a new anonymous {@code IDiagramMarshaller}
 */
public IDiagramMarshaller createMarshaller() {
    return new IDiagramMarshaller() {
        /**
         * Unmarshalls the JSON model, saves the resulting resource to memory and returns
         * it as UTF-8 text with HTML entities unescaped.
         */
        public String parseModel(String jsonModel, String preProcessingData) throws Exception {
            DroolsFactoryImpl.init();
            BpsimFactoryImpl.init();
            Bpmn2JsonUnmarshaller jsonUnmarshaller = new Bpmn2JsonUnmarshaller();
            JBPMBpmn2ResourceImpl resource = (JBPMBpmn2ResourceImpl) jsonUnmarshaller.unmarshall(jsonModel,
                    preProcessingData);
            ByteArrayOutputStream serialized = new ByteArrayOutputStream();
            resource.save(serialized, new HashMap<Object, Object>());
            String xml = serialized.toString("UTF-8");
            return StringEscapeUtils.unescapeHtml4(xml);
        }

        /**
         * Returns the first content element of the unmarshalled resource, or the default
         * definitions when unmarshalling fails with a JSON parse or I/O error.
         */
        public Definitions getDefinitions(String jsonModel, String preProcessingData) {
            Definitions definitions;
            try {
                Bpmn2JsonUnmarshaller jsonUnmarshaller = new Bpmn2JsonUnmarshaller();
                JBPMBpmn2ResourceImpl resource = (JBPMBpmn2ResourceImpl) jsonUnmarshaller
                        .unmarshall(jsonModel, preProcessingData);
                definitions = (Definitions) resource.getContents().get(0);
            } catch (JsonParseException e) {
                definitions = getDefaultDefinitions();
            } catch (IOException e) {
                definitions = getDefaultDefinitions();
            }
            return definitions;
        }

        /** Returns the resource produced by unmarshalling the JSON model. */
        public Resource getResource(String jsonModel, String preProcessingData) throws Exception {
            return (JBPMBpmn2ResourceImpl) new Bpmn2JsonUnmarshaller().unmarshall(jsonModel,
                    preProcessingData);
        }
    };
}

From source file:org.jbpm.formModeler.service.bb.mvc.components.FactoryURL.java

/**
 * Parses a textual factory URL into its component and property names.
 *
 * <p>The text is parsed with the {@code msgf} format; the first two parsed elements are
 * HTML-unescaped and used as the component name and property name respectively.
 *
 * @param value the text to parse
 * @return a {@code FactoryURL} built from the two unescaped names
 * @throws ParseException if {@code msgf} cannot parse the value; the error offset is the
 *         parse position's error index
 */
public static FactoryURL getURL(String value) throws ParseException {
    ParsePosition position = new ParsePosition(0);
    Object[] parts = msgf.parse(value, position);
    if (parts == null) {
        int errorIndex = position.getErrorIndex();
        throw new ParseException("Cannot parse " + value + ". Error at position " + errorIndex, errorIndex);
    }
    return new FactoryURL(StringEscapeUtils.unescapeHtml4((String) parts[0]),
            StringEscapeUtils.unescapeHtml4((String) parts[1]));
}

From source file:org.jbpm.formModeler.service.bb.mvc.controller.requestChain.MultipartProcessor.java

/**
 * Wraps multipart POST requests in a {@code RequestMultipartWrapper} and installs the
 * wrapper via {@code ControllerServletHelper}.
 *
 * <p>Requests that are not POST, have no content type starting with "multipart", or
 * arrive while multipart processing is disabled in the HTTP settings are left untouched.
 * If building the wrapper fails with an {@code IOException}, the failure is logged and,
 * when a markup generator is available, a redirect response to the error page is set.
 *
 * @param request the command request carrying the servlet request and response
 * @return {@code false} if building the multipart wrapper failed with an
 *         {@code IOException}, {@code true} otherwise
 * @throws Exception declared by this method; exceptions other than the handled
 *         {@code IOException} propagate to the caller
 */
public boolean processRequest(CommandRequest request) throws Exception {
    HTTPSettings settings = HTTPSettings.lookup();
    HttpServletRequest servletRequest = request.getRequestObject();
    HttpServletResponse servletResponse = request.getResponseObject();
    String contentType = servletRequest.getContentType();
    String method = servletRequest.getMethod();

    boolean multipartPost = "POST".equalsIgnoreCase(method) && contentType != null
            && contentType.startsWith("multipart") && settings.isMultipartProcessing();
    if (!multipartPost) {
        return true;
    }

    log.debug("Found multipart request. Building wrapper");

    String tmpDir = SessionTmpDirFactory.getTmpDir(servletRequest);
    if (log.isDebugEnabled()) {
        log.debug("Extracting to dir " + tmpDir);
    }

    // the configured maximum is multiplied by 1024 and reported as bytes
    int maxSize = settings.getMaxPostSize() * 1024;
    if (log.isDebugEnabled()) {
        log.debug("Max post size is : " + maxSize + " bytes");
        log.debug("Framework encoding is: " + settings.getEncoding());
    }

    try {
        RequestMultipartWrapper wrapper = new RequestMultipartWrapper(servletRequest, tmpDir, maxSize,
                settings.getEncoding());
        log.debug("Multipart request parsed: ");
        log.debug("getting files from request");
        ControllerServletHelper.lookup().initThreadLocal(wrapper, servletResponse);
    } catch (IOException ioe) {
        log.warn("IOException processing multipart ", ioe);
        log.warn("Invalid " + method + ": URL=" + servletRequest.getRequestURL() + ". QueryString="
                + servletRequest.getQueryString());
        URLMarkupGenerator markupGenerator = URLMarkupGenerator.lookup();
        if (markupGenerator != null) {
            Map redirectParams = new HashMap();
            redirectParams.put(RedirectionHandler.PARAM_PAGE_TO_REDIRECT, errorRedirectPage);
            String uri = ContextTag.getContextPath(markupGenerator.getMarkup(
                    "org.jbpm.formModeler.service.mvc.components.RedirectionHandler", "redirectToSection",
                    redirectParams), servletRequest);
            // the generated markup is entity-escaped; decode it before redirecting
            uri = StringEscapeUtils.unescapeHtml4(uri);
            ControllerStatus.lookup().setResponse(
                    new RedirectToURLResponse(uri, !uri.startsWith(servletRequest.getContextPath())));
        }
        return false;
    }
    return true;
}