List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4
public static final String unescapeHtml4(final String input)
Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.
From source file:org.esigate.impl.UrlRewriter.java
private String unescapeHtml(String url) { // Unescape entities, ex: ' or ' url = StringEscapeUtils.unescapeHtml4(url); return url; }
From source file:org.experiment.wikipedia.wikiclean.WikiClean.java
public String clean(String page) { String content = getWikiMarkup(page); if (!withFooter) { content = removeFooter(content); }//from ww w . j av a2s .c o m content = removeRefs(content); content = removeInterWikiLinks(content); content = removeParentheticals(content); content = fixUnitConversion(content); content = ImageCaptionsRemover.remove(content); content = DoubleBracesRemover.remove(content); content = removeHtmlComments(content); content = removeEmphasis(content); content = removeHeadings(content); content = removeCategoryLinks(content); content = removeLinks(content); content = removeMath(content); content = removeGallery(content); content = removeNoToc(content); content = removeIndentation(content); content = TableRemover.remove(content); // For some reason, some HTML entities are doubly encoded. content = StringEscapeUtils.unescapeHtml4(StringEscapeUtils.unescapeHtml4(content)); content = removeHtmlTags(content); // Finally, fold multiple newlines. content = compressMultipleNewlines(content); if (withTitle) { return getTitle(page) + "\n\n" + content.trim(); } return content.trim(); }
From source file:org.htmlcleaner.XWikiDOMSerializer.java
/** * Remove any existing CDATA section and unencode HTML entities that are not inside a CDATA block. * * @param content the text input to transform * @return the transformed content that will be wrapped inside a CDATA block *///ww w .j a va 2 s . c o m private String processCDATABlocks(String content) { StringBuffer result = new StringBuffer(); Matcher matcher = CDATA_PATTERN.matcher(content); int cursor = 0; while (matcher.find()) { result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor, matcher.start()))); result.append(content.substring(matcher.start() + 9, matcher.end() - matcher.group(1).length())); cursor = matcher.end() - matcher.group(1).length() + 3; } // Copy the remaining text data in the result buffer if (cursor < content.length()) { result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor))); } // Ensure ther's no invalid <![CDATA[ or ]]> remaining. String contentResult = result.toString().replace("<![CDATA[", "").replace("]]>", ""); return contentResult; }
From source file:org.intermine.bio.io.gff3.GFF3Record.java
private void parseAttribute(String argAttributeString, String line) throws IOException { String attributeString = StringEscapeUtils.unescapeHtml4(argAttributeString); String[] sTok = attributeString.split("(?<!\\\\);"); for (int j = 0; j < sTok.length; j++) { String attVal = sTok[j].trim(); if (attVal.length() == 0) { continue; }//from www . j a v a 2s. com String attName; List<String> valList = new ArrayList<String>(); int spaceIndx = attVal.indexOf("="); if (spaceIndx == -1) { throw new IOException( "the attributes section must contain name=value pairs, " + "while parsing: " + line); } else { attName = attVal.substring(0, spaceIndx); attributeString = attVal.substring(spaceIndx + 1).trim(); if (!"\"\"".equals(attributeString)) { while (attributeString.length() > 0) { if (attributeString.startsWith("\"")) { attributeString = attributeString.substring(1); int quoteIndx = attributeString.indexOf("\""); if (quoteIndx > 0) { valList.add(attributeString.substring(0, quoteIndx)); attributeString = attributeString.substring(quoteIndx + 1).trim(); if (attributeString.startsWith(",")) { attributeString = attributeString.substring(1).trim(); } } else { throw new IOException("unmatched quote in this line: " + line + " (reading attribute: " + attName + ", " + attributeString + ")"); } } else { int commaIndx = attributeString.indexOf(","); if (commaIndx == -1) { valList.add(attributeString); attributeString = ""; } else { valList.add(attributeString.substring(0, commaIndx)); attributeString = attributeString.substring(commaIndx + 1).trim(); } } } } } // Decode values for (int i = 0; i < valList.size(); i++) { String value = valList.get(i); if (!"Target".equals(attName) && !"Gap".equals(attName)) { value = URLDecoder.decode(value, "UTF-8"); } value = XmlUtil.fixEntityNames(value); valList.set(i, value); } attributes.put(attName, valList); } }
From source file:org.jamwiki.parser.jflex.AbstractHeadingTag.java
/** * *//*from w w w .jav a 2 s . co m*/ private String buildTagName(JFlexLexer lexer, String tocText) { // re-convert any ü or other (converted by the parser) entities back String tagName = StringEscapeUtils.unescapeHtml4(tocText); return lexer.getParserInput().getTableOfContents().buildUniqueName(tagName); }
From source file:org.jamwiki.parser.jflex.AbstractHeadingTag.java
/** * Parse a Mediawiki heading of the form "==heading==" and return the * resulting HTML output.//from w ww. ja v a 2 s. c o m */ public String parse(JFlexLexer lexer, String raw, Object... args) throws ParserException { if (logger.isTraceEnabled()) { logger.trace("heading: " + raw + " (" + lexer.yystate() + ")"); } // the wikiheading tag may match a preceding newline, so strip it raw = raw.trim(); int level = this.generateTagLevel(raw, args); String tagText = this.generateTagText(raw, args); String tocText = this.buildTocText(lexer, tagText); String tagName = this.buildTagName(lexer, tocText); if (lexer.getMode() <= JFlexParser.MODE_SLICE) { String sectionName = StringEscapeUtils.unescapeHtml4(tocText); lexer.getParserOutput().setSectionName(sectionName); return raw; } if (!(lexer instanceof JAMWikiLexer)) { throw new IllegalStateException( "Cannot parse heading tags except with instances of JAMWikiLexer or in slice/splice mode"); } JAMWikiLexer jamwikiLexer = (JAMWikiLexer) lexer; if (jamwikiLexer.paragraphIsOpen()) { // close any open paragraph jamwikiLexer.popTag("p"); } return this.generateOutput(jamwikiLexer, tagName, tocText, tagText, level, raw, args); }
From source file:org.jamwiki.parser.jflex.ImageLinkTag.java
/** * *//*from w w w. j ava2s.c o m*/ private ImageMetadata parseImageParams(ParserInput parserInput, ParserOutput parserOutput, int mode, String paramText) throws ParserException { ImageMetadata imageMetadata = new ImageMetadata(); if (StringUtils.isBlank(paramText)) { return imageMetadata; } String[] tokens = paramText.split("\\|"); Matcher matcher; String caption = ""; tokenLoop: for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; if (StringUtils.isBlank(token)) { continue; } token = token.trim(); for (ImageBorderEnum border : EnumSet.allOf(ImageBorderEnum.class)) { // special case - for legacy reasons Mediawiki supports "thumbnail" instead of "thumb" if (StringUtils.equalsIgnoreCase(token, "thumbnail")) { token = "thumb"; } if (border.toString().equalsIgnoreCase(token)) { if (border == ImageBorderEnum.BORDER) { // border can be combined with frameless, so set a second attribute to track it imageMetadata.setBordered(true); if (imageMetadata.getBorder() == ImageBorderEnum.FRAMELESS) { continue tokenLoop; } } imageMetadata.setBorder(border); continue tokenLoop; } } for (ImageHorizontalAlignmentEnum horizontalAlignment : EnumSet .allOf(ImageHorizontalAlignmentEnum.class)) { if (horizontalAlignment.toString().equalsIgnoreCase(token)) { imageMetadata.setHorizontalAlignment(horizontalAlignment); continue tokenLoop; } } for (ImageVerticalAlignmentEnum verticalAlignment : EnumSet.allOf(ImageVerticalAlignmentEnum.class)) { if (verticalAlignment.toString().equalsIgnoreCase(token)) { imageMetadata.setVerticalAlignment(verticalAlignment); continue tokenLoop; } } // if none of the above tokens matched then check for size or caption if (token.toLowerCase().endsWith("px")) { matcher = IMAGE_SIZE_PATTERN.matcher(token); if (matcher.find()) { String maxWidth = matcher.group(1); if (!StringUtils.isBlank(maxWidth)) { imageMetadata.setMaxWidth(Integer.valueOf(maxWidth)); } String maxHeight = matcher.group(2); if (!StringUtils.isBlank(maxHeight)) { 
imageMetadata.setMaxHeight(Integer.valueOf(maxHeight)); } continue tokenLoop; } } if (token.toLowerCase().startsWith("alt")) { matcher = IMAGE_ALT_PATTERN.matcher(token); if (matcher.find()) { imageMetadata.setAlt(matcher.group(1).trim()); continue tokenLoop; } } if (token.toLowerCase().startsWith("link")) { matcher = IMAGE_LINK_PATTERN.matcher(token); if (matcher.find()) { imageMetadata.setLink(matcher.group(1).trim()); continue tokenLoop; } } // this is a bit hackish. string together any remaining content as a possible // caption, then parse it and strip out anything after the first pipe character caption += (StringUtils.isBlank(caption)) ? token : "|" + token; } // parse the caption and strip anything prior to the last "|" to handle syntax of // the form "[[File:Example.gif|caption1|caption2]]". if (!StringUtils.isBlank(caption)) { caption = JFlexParserUtil.parseFragment(parserInput, parserOutput, caption, mode); int pos = caption.indexOf('|'); if (pos != -1) { caption = (pos >= (caption.length() - 1)) ? " " : caption.substring(pos + 1); } imageMetadata.setCaption(caption); } if (imageMetadata.getVerticalAlignment() != ImageVerticalAlignmentEnum.NOT_SPECIFIED && (imageMetadata.getBorder() == ImageBorderEnum.THUMB || imageMetadata.getBorder() == ImageBorderEnum.FRAME)) { // per spec, vertical alignment can only be set for non-thumb and non-frame imageMetadata.setVerticalAlignment(ImageVerticalAlignmentEnum.NOT_SPECIFIED); } if (imageMetadata.getBorder() == ImageBorderEnum.THUMB || imageMetadata.getBorder() == ImageBorderEnum.FRAME) { // per spec, link can only be set for non-thumb and non-frame imageMetadata.setLink(null); } if (imageMetadata.getBorder() != ImageBorderEnum.THUMB && imageMetadata.getBorder() != ImageBorderEnum.FRAME && imageMetadata.getBorder() != ImageBorderEnum._GALLERY) { // per spec, captions are only displayed for thumbnails, framed images // and galleries, but the caption will be used as the alt and title for // other image types. 
if (!StringUtils.isBlank(imageMetadata.getCaption())) { // avoid double-escaping since the link builder escapes the title tag imageMetadata.setTitle(StringEscapeUtils.unescapeHtml4(imageMetadata.getCaption())); } if (imageMetadata.getAlt() == null) { // avoid double-escaping since the link builder escapes the alt tag imageMetadata.setAlt(StringEscapeUtils.unescapeHtml4(imageMetadata.getCaption())); } imageMetadata.setCaption(null); } if (imageMetadata.getBorder() == ImageBorderEnum.FRAME) { // per spec, frame cannot be resized imageMetadata.setMaxHeight(-1); imageMetadata.setMaxWidth(-1); } if ((imageMetadata.getBorder() == ImageBorderEnum.THUMB || imageMetadata.getBorder() == ImageBorderEnum.FRAMELESS) && imageMetadata.getMaxWidth() <= 0) { imageMetadata.setMaxWidth(DEFAULT_THUMBNAIL_WIDTH); } if (imageMetadata.getBordered() && (imageMetadata.getBorder() != ImageBorderEnum.BORDER && imageMetadata.getBorder() != ImageBorderEnum.FRAMELESS)) { // thumb, frame, etc handle borders differently imageMetadata.setBordered(false); } if (imageMetadata.getBorder() == ImageBorderEnum._GALLERY) { // internal use only imageMetadata.setHorizontalAlignment(ImageHorizontalAlignmentEnum.CENTER); // 10 pixels is for padding imageMetadata.setGalleryHeight(imageMetadata.getMaxHeight() + 10); // galleries use either the file name or nothing as the alt tag if (!StringUtils.isBlank(imageMetadata.getCaption())) { imageMetadata.setAlt(""); } } return imageMetadata; }
From source file:org.jbpm.designer.web.profile.impl.JbpmProfileImpl.java
public IDiagramMarshaller createMarshaller() { return new IDiagramMarshaller() { public String parseModel(String jsonModel, String preProcessingData) throws Exception { DroolsFactoryImpl.init();//from www . ja va 2 s . c o m BpsimFactoryImpl.init(); Bpmn2JsonUnmarshaller unmarshaller = new Bpmn2JsonUnmarshaller(); JBPMBpmn2ResourceImpl res; res = (JBPMBpmn2ResourceImpl) unmarshaller.unmarshall(jsonModel, preProcessingData); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); res.save(outputStream, new HashMap<Object, Object>()); return StringEscapeUtils.unescapeHtml4(outputStream.toString("UTF-8")); } public Definitions getDefinitions(String jsonModel, String preProcessingData) { try { Bpmn2JsonUnmarshaller unmarshaller = new Bpmn2JsonUnmarshaller(); JBPMBpmn2ResourceImpl res = (JBPMBpmn2ResourceImpl) unmarshaller.unmarshall(jsonModel, preProcessingData); return (Definitions) res.getContents().get(0); } catch (JsonParseException e) { return getDefaultDefinitions(); } catch (IOException e) { return getDefaultDefinitions(); } } public Resource getResource(String jsonModel, String preProcessingData) throws Exception { Bpmn2JsonUnmarshaller unmarshaller = new Bpmn2JsonUnmarshaller(); return (JBPMBpmn2ResourceImpl) unmarshaller.unmarshall(jsonModel, preProcessingData); } }; }
From source file:org.jbpm.formModeler.service.bb.mvc.components.FactoryURL.java
public static FactoryURL getURL(String value) throws ParseException { ParsePosition pPos = new ParsePosition(0); Object[] o = msgf.parse(value, pPos); if (o == null) throw new ParseException("Cannot parse " + value + ". Error at position " + pPos.getErrorIndex(), pPos.getErrorIndex());//ww w . ja v a 2 s.c o m String componentName = StringEscapeUtils.unescapeHtml4((String) o[0]); String propertyName = StringEscapeUtils.unescapeHtml4((String) o[1]); return new FactoryURL(componentName, propertyName); }
From source file:org.jbpm.formModeler.service.bb.mvc.controller.requestChain.MultipartProcessor.java
/**
 * Wraps multipart POST requests in a {@code RequestMultipartWrapper} and registers the
 * wrapper through {@code ControllerServletHelper}.
 *
 * <p>Requests that are not POST, have no content type starting with "multipart", or arrive
 * while multipart processing is disabled in the HTTP settings are left untouched.</p>
 *
 * @param request the command request carrying the servlet request and response
 * @return {@code false} when building the wrapper fails with an {@code IOException},
 *         {@code true} otherwise
 * @throws Exception declared by the signature
 */
public boolean processRequest(CommandRequest request) throws Exception {
    HTTPSettings httpSettings = HTTPSettings.lookup();
    HttpServletRequest httpReq = request.getRequestObject();
    HttpServletResponse httpRes = request.getResponseObject();
    String contentType = httpReq.getContentType();
    String method = httpReq.getMethod();

    boolean multipartPost = "POST".equalsIgnoreCase(method) && contentType != null
            && contentType.startsWith("multipart") && httpSettings.isMultipartProcessing();
    if (!multipartPost) {
        return true;
    }

    log.debug("Found multipart request. Building wrapper");
    String tmpDir = SessionTmpDirFactory.getTmpDir(httpReq);
    if (log.isDebugEnabled()) {
        log.debug("Extracting to dir " + tmpDir);
    }

    // the configured maximum is multiplied by 1024 and logged as bytes
    int maxSize = httpSettings.getMaxPostSize() * 1024;
    if (log.isDebugEnabled()) {
        log.debug("Max post size is : " + maxSize + " bytes");
        log.debug("Framework encoding is: " + httpSettings.getEncoding());
    }

    try {
        RequestMultipartWrapper wrapper = new RequestMultipartWrapper(httpReq, tmpDir, maxSize,
                httpSettings.getEncoding());
        log.debug("Multipart request parsed: ");
        log.debug("getting files from request");
        ControllerServletHelper.lookup().initThreadLocal(wrapper, httpRes);
    } catch (IOException ioe) {
        log.warn("IOException processing multipart ", ioe);
        log.warn("Invalid " + method + ": URL=" + httpReq.getRequestURL() + ". QueryString="
                + httpReq.getQueryString());
        URLMarkupGenerator markupGenerator = URLMarkupGenerator.lookup();
        if (markupGenerator != null) {
            // redirect to the configured error page
            Map paramsMap = new HashMap();
            paramsMap.put(RedirectionHandler.PARAM_PAGE_TO_REDIRECT, errorRedirectPage);
            String uri = ContextTag.getContextPath(
                    markupGenerator.getMarkup("org.jbpm.formModeler.service.mvc.components.RedirectionHandler",
                            "redirectToSection", paramsMap),
                    httpReq);
            // the generated markup is HTML-escaped; the redirect needs the plain URL
            uri = StringEscapeUtils.unescapeHtml4(uri);
            ControllerStatus.lookup()
                    .setResponse(new RedirectToURLResponse(uri, !uri.startsWith(httpReq.getContextPath())));
        }
        return false;
    }
    return true;
}