List of usage examples for the org.apache.commons.lang3.StringEscapeUtils method unescapeHtml4
public static final String unescapeHtml4(final String input)
Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.
From source file:com.romeikat.datamessie.core.rss.task.rssCrawling.SourceCrawler.java
/**
 * Reads the description text of a feed entry and decodes any HTML entity
 * escapes it contains.
 *
 * @param entry the feed entry to read from
 * @return the unescaped description value; a missing description is passed
 *         to {@code unescapeHtml4} as {@code null}
 */
private String getDescription(final SyndEntry entry) {
    String rawText = null;
    if (entry.getDescription() != null) {
        rawText = entry.getDescription().getValue();
    }
    return StringEscapeUtils.unescapeHtml4(rawText);
}
From source file:de.darkblue.bongloader2.utils.ToolBox.java
/** * cleans the string from all unneccessary things like html entities. * * @param in/*w w w.j a v a 2 s.c o m*/ * @return */ public static String cleanString(String in) { return StringEscapeUtils.unescapeHtml4(in); }
From source file:com.ryan.ryanreader.reddit.prepared.RedditPreparedComment.java
/**
 * Collects the links found in this comment's HTML body.
 *
 * @return whatever {@code LinkHandler.computeAllLinks} returns for the
 *         entity-unescaped {@code src.body_html}
 */
public HashSet<String> computeAllLinks() {
    final String decodedBody = StringEscapeUtils.unescapeHtml4(src.body_html);
    return LinkHandler.computeAllLinks(decodedBody);
}
From source file:net.krautchan.data.KCPosting.java
@Override public String toString() { String retVal = "Kc-Num: " + getKcNummer() + "\n"; retVal += "Hash: " + getThreadDbId() + "\n"; retVal += "Created: " + getCreationDate() + "\n"; retVal += "Author: " + getUser() + "\n"; retVal += "T-Code: " + getTripCode() + "\n"; retVal += "Title: " + getTitle().replaceAll("\\s+", " ") + "\n"; retVal += "Sage: " + isSage() + "\n"; retVal += "Content: " + StringEscapeUtils.unescapeHtml4(getOriginalContent()).replaceAll("\\s+", " ") .replaceAll("\\s</", "</").replaceAll("<img (.+?)\">", "<img $1\" />") .replaceAll("<a (.+?)\" >", "<a $1\">") + "\n"; retVal += "Imgs: \n"; String[] uids = getFileUids().toArray(new String[this.getFileUids().size()]); for (int i = 0; i < uids.length; i++) { if (uids[i] != null) { retVal += " " + getFile((uids[i])) + "\n"; } else {// ww w. j a v a2s. com retVal += " -\n"; } } return retVal; }
From source file:com.nttec.everychan.chans.sevenchan.SevenchanReader.java
/**
 * Interprets an {@code <img>} tag found while reading a page.
 * <ul>
 * <li>For a "multithumbfirst"/"multithumb" image, the most recently added
 * attachment (if any) is updated with the thumbnail URL, file size, pixel
 * dimensions and original file name found in the tag.</li>
 * <li>The locked/sticky icons mark the current thread as closed/sticky.</li>
 * <li>For any other image, its {@code src} value is remembered in
 * {@code lastThumbnail}.</li>
 * </ul>
 *
 * @param imgTag the text of the image tag
 */
@Override
protected void parseThumbnail(String imgTag) {
    if (imgTag.contains("class=\"multithumbfirst\"") || imgTag.contains("class=\"multithumb\"")) {
        if (currentAttachments.size() > 0) {
            AttachmentModel attachment = currentAttachments.get(currentAttachments.size() - 1);

            // Thumbnail URL: the text between src=" and the next double quote.
            int start, end;
            if ((start = imgTag.indexOf("src=\"")) != -1 && (end = imgTag.indexOf('\"', start + 5)) != -1) {
                attachment.thumbnail = imgTag.substring(start + 5, end);
            }

            // File size: every match overwrites the previous one, so the last match wins.
            Matcher byteSizeMatcher = ATTACHMENT_SIZE_PATTERN.matcher(imgTag);
            while (byteSizeMatcher.find()) {
                try {
                    String digits = byteSizeMatcher.group(1);
                    int multiplier = 1;
                    String prefix = byteSizeMatcher.group(2);
                    if (prefix != null) {
                        if (prefix.equalsIgnoreCase("k")) {
                            multiplier = 1024;
                        } else if (prefix.equalsIgnoreCase("m")) {
                            multiplier = 1024 * 1024;
                        }
                    }
                    // The parsed number is divided by 1024 and scaled by the unit multiplier.
                    int value = Math.round(Float.parseFloat(digits) / 1024 * multiplier);
                    attachment.size = value;
                } catch (NumberFormatException ignored) {
                    // Best effort: an unparsable size leaves attachment.size untouched.
                }
            }

            // Pixel dimensions: both numbers are parsed before either field is written,
            // so a failure on the height does not leave a half-updated attachment.
            Matcher pxSizeMatcher = ATTACHMENT_PX_SIZE_PATTERN.matcher(imgTag);
            int indexEndPxSize = -1;
            while (pxSizeMatcher.find()) {
                try {
                    int width = Integer.parseInt(pxSizeMatcher.group(1));
                    int height = Integer.parseInt(pxSizeMatcher.group(2));
                    attachment.width = width;
                    attachment.height = height;
                    indexEndPxSize = pxSizeMatcher.end();
                } catch (NumberFormatException ignored) {
                    // Best effort: skip this match and keep looking.
                }
            }

            // The original file name is only searched for after the last parsed dimensions.
            if (indexEndPxSize != -1) {
                Matcher originalNameMatcher = ATTACHMENT_ORIGINAL_NAME_PATTERN.matcher(imgTag);
                if (originalNameMatcher.find(indexEndPxSize)) {
                    // group(1) is null when the group did not take part in the match,
                    // so it must be checked before trim() is called on it.
                    String rawName = originalNameMatcher.group(1);
                    if (rawName != null) {
                        String originalName = rawName.trim();
                        if (originalName.length() > 0) {
                            attachment.originalName = StringEscapeUtils.unescapeHtml4(originalName);
                        }
                    }
                }
            }
        }
    } else if (imgTag.contains("/css/locked.gif")) {
        currentThread.isClosed = true;
    } else if (imgTag.contains("/css/sticky.gif")) {
        currentThread.isSticky = true;
    } else {
        int start, end;
        if ((start = imgTag.indexOf("src=\"")) != -1 && (end = imgTag.indexOf('\"', start + 5)) != -1) {
            lastThumbnail = imgTag.substring(start + 5, end);
        }
    }
}
From source file:com.wellsandwhistles.android.redditsp.fragments.PostListingFragment.java
/**
 * Updates the header once subreddit details are available. The subtitle is
 * the subscriber count when the listing has no explicit order or is sorted
 * by HOT, and the listing's human-readable URL otherwise. The header itself
 * is set on the UI thread.
 */
private void onSubredditReceived() {
    final String subtitle;
    if (mPostListingURL.getOrder() != null && mPostListingURL.getOrder() != PostSort.HOT) {
        subtitle = mPostListingURL.humanReadableUrl();
    } else if (mSubreddit.subscribers == null) {
        subtitle = getString(R.string.header_subscriber_count_unknown);
    } else {
        final String formattedCount =
                NumberFormat.getNumberInstance(Locale.getDefault()).format(mSubreddit.subscribers);
        subtitle = getContext().getString(R.string.header_subscriber_count, formattedCount);
    }

    getActivity().runOnUiThread(new Runnable() {
        @Override
        public void run() {
            final String headerTitle = StringEscapeUtils.unescapeHtml4(mSubreddit.title);
            setHeader(headerTitle, subtitle);
            getActivity().invalidateOptionsMenu();
        }
    });
}
From source file:com.nttec.everychan.chans.krautchan.KrautReader.java
private void parseAttachment(String html) { Matcher attachmentMatcher = ATTACHMENT_LINKS_PATTERN.matcher(html); if (attachmentMatcher.find()) { AttachmentModel model = new AttachmentModel(); model.type = AttachmentModel.TYPE_OTHER_FILE; model.size = -1;//from w ww . ja v a 2s . com model.width = -1; model.height = -1; model.path = "/files/" + attachmentMatcher.group(1); String thumbnailGroup = attachmentMatcher.group(2); model.thumbnail = thumbnailGroup == null ? null : "/thumbnails/" + thumbnailGroup; String ext = model.path.substring(model.path.lastIndexOf('.') + 1).toLowerCase(Locale.US); if (ext.equals("png") || ext.equals("jpg") || ext.equals("jpeg")) model.type = AttachmentModel.TYPE_IMAGE_STATIC; else if (ext.equals("gif")) model.type = AttachmentModel.TYPE_IMAGE_GIF; else if (ext.equals("webm")) model.type = AttachmentModel.TYPE_VIDEO; else if (ext.equals("mp3") || ext.equals("ogg")) model.type = AttachmentModel.TYPE_AUDIO; Matcher origFilenameMatcher = ATTACHMENT_FILENAME_PATTERN.matcher(html); if (origFilenameMatcher.find()) { model.originalName = StringEscapeUtils .unescapeHtml4(RegexUtils.removeHtmlTags(origFilenameMatcher.group(1)).trim()); } Matcher infoMatcher = ATTACHMENT_INFO_PATTERN.matcher(html); if (infoMatcher.find()) { Matcher pxSizeMatcher = ATTACHMENT_PX_SIZE_PATTERN.matcher(infoMatcher.group(2)); if (pxSizeMatcher.find()) { try { int width = Integer.parseInt(pxSizeMatcher.group(1)); int height = Integer.parseInt(pxSizeMatcher.group(2)); model.width = width; model.height = height; } catch (NumberFormatException e) { } } Matcher byteSizeMatcher = ATTACHMENT_SIZE_PATTERN.matcher(infoMatcher.group(3)); if (byteSizeMatcher.find()) { try { String digits = byteSizeMatcher.group(1).replace(',', '.'); int multiplier = 1; String prefix = byteSizeMatcher.group(2); if (prefix != null) { if (prefix.equalsIgnoreCase("k")) multiplier = 1024; else if (prefix.equalsIgnoreCase("m")) multiplier = 1024 * 1024; } int value = Math.round(Float.parseFloat(digits) 
/ 1024 * multiplier); model.size = value; } catch (NumberFormatException e) { } } } ++currentThread.attachmentsCount; currentAttachments.add(model); } }
From source file:com.nttec.everychan.api.util.WakabaReader.java
/**
 * Dispatches on the filter that was just recognised in the input and updates
 * the post/thread currently being assembled accordingly.
 *
 * @param filterIndex index of the matched filter (one of the {@code FILTER_*} constants)
 * @throws IOException declared for the reading helpers called from here
 */
private void handleFilter(int filterIndex) throws IOException {
    // While in date mode, any filter other than the end-of-date one clears the date buffer.
    if (inDate && filterIndex != FILTER_ENDDATE) {
        dateBuffer.setLength(0);
    }

    switch (filterIndex) {
        case FILTER_THREAD_END: {
            finalizeThread();
            break;
        }
        case FILTER_ATTACHMENT: {
            final String attachmentHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            parseAttachment(attachmentHtml);
            break;
        }
        case FILTER_ATTACHMENT_THUMBNAIL: {
            final String thumbnailHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            parseThumbnail(thumbnailHtml);
            break;
        }
        case FILTER_POSTNUMBER: {
            final String rawNumber = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            currentPost.number = rawNumber.trim();
            break;
        }
        case FILTER_SUBJECT_OP:
        case FILTER_SUBJECT: {
            final String rawSubject = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            currentPost.subject = StringEscapeUtils.unescapeHtml4(rawSubject).trim();
            break;
        }
        case FILTER_POSTERNAME_OP:
        case FILTER_POSTERNAME: {
            final String nameHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            parseNameEmail(nameHtml);
            inDate = true;
            break;
        }
        case FILTER_TRIPCODE: {
            final String tripHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            final String tripText = RegexUtils.removeHtmlTags(tripHtml);
            currentPost.trip = StringEscapeUtils.unescapeHtml4(tripText).trim();
            inDate = true;
            break;
        }
        case FILTER_ENDDATE: {
            // The buffer ends with the end-of-date opening sequence; cut that tail off.
            final int dateLength = dateBuffer.length() - FILTERS_OPEN[FILTER_ENDDATE].length;
            if (dateLength > 0) {
                parseDate(dateBuffer.substring(0, dateLength).trim());
            }
            inDate = false;
            dateBuffer.setLength(0);
            break;
        }
        case FILTER_OMITTEDPOSTS: {
            final String omittedText = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            parseOmittedString(omittedText);
            break;
        }
        case FILTER_START_COMMENT: {
            skipUntilSequence(FILTERS_CLOSE[filterIndex]);
            currentPost.comment = readPostComment();
            finalizePost();
            break;
        }
    }
}
From source file:com.screenslicer.core.scrape.type.Result.java
public void addUrl(Node node, String href, String title, boolean textSibling, boolean anchorSibling, boolean loneBlock, boolean image) { ++numUrls;//from ww w.j av a 2 s . c om String cleanHref = CommonUtil.strip(href, false); String cleanTitle = StringEscapeUtils .unescapeHtml4(StringEscapeUtils.unescapeXml(CommonUtil.strip(title, false))); cleanTitle = titleJunk.matcher(cleanTitle).replaceAll(""); String noPunctTitle = cleanTitle.replaceAll("\\p{Punct}", ""); if (CommonUtil.isEmpty(noPunctTitle)) { cleanTitle = ""; } String coreTitle = cleanTitle.replace(" ", "").replace(" ...", ""); if (!cleanHref.isEmpty() && !"#".equals(cleanHref) && !cleanTitle.isEmpty() && (!coreTitle.contains("/") || coreTitle.contains(" ") || !coreTitle.contains("."))) { int existingScore = 0; existingScore += titleHasTextSibling ? 0 : 1; existingScore += titleHasAnchorSibling ? 0 : 1; existingScore += !titleHasLoneBlock ? 0 : 1; existingScore += urlTitle == null || urlTitle.matches("^\\p{Punct}.*$") ? -10 : 0; int curScore = 0; curScore += textSibling ? 0 : 1; curScore += anchorSibling ? 0 : 1; curScore += !loneBlock ? 0 : 1; curScore += cleanTitle.matches("^\\p{Punct}.*$") ? -10 : 0; int curCompare = 0; if (url != null) { curCompare += cleanTitle.matches("^\\p{Punct}.*$") ? -5 : 0; curCompare += urlTitle == null || urlTitle.matches("^\\p{Punct}.*$") ? 9 : 0; curCompare += curScore < existingScore ? -2 : curScore > existingScore ? 2 : 0; curCompare += curScore <= 1 && curScore <= existingScore ? -1 : 0; curCompare += cleanTitle.length() <= urlTitle.length() / 2 ? -2 : cleanTitle.length() > urlTitle.length() * 2 ? 2 : 0; curCompare += cleanHref.contains("#") && !url.contains("#") ? -2 : !cleanHref.contains("#") && url.contains("#") ? 2 : 0; int nearestBlock = Util.nearestBlock(node); int existingBlock = Util.nearestBlock(urlNodes.get(url)); curCompare += nearestBlock > existingBlock ? -2 : nearestBlock == existingBlock ? 
0 : 2; } if (!hasImgUrl && CommonUtil.isEmpty(altUrl) && CommonUtil.isEmpty(altUrlTitle) && (isImg(cleanTitle) || isImg(cleanHref))) { altUrl = cleanHref; altUrlTitle = cleanTitle; urlNodes.put(cleanHref, node); hasImgUrl = true; } else if (url == null || !image && titleHasImage || curCompare > 1) { if (url != null) { priorUrl = url; priorUrlTitle = urlTitle; addToSummary(urlTitle, true, urlNodes.get(url)); fallbackUrls.add(url); fallbackUrlTitles.add(urlTitle); } url = cleanHref; urlNodes.put(url, node); urlTitle = cleanTitle; titleHasTextSibling = textSibling; titleHasAnchorSibling = anchorSibling; titleHasLoneBlock = loneBlock; titleHasImage = image; } else if (curCompare > -1) { fallbackUrls.add(cleanHref); urlNodes.put(cleanHref, node); fallbackUrlTitles.add(cleanTitle); addToSummary(cleanTitle, true, node); } else { addToSummary(cleanTitle, true, node); } } }
From source file:com.screenslicer.core.scrape.type.ScrapeResult.java
/**
 * Registers a candidate link (URL plus title) for this scrape result. The
 * candidate is cleaned up, scored against the currently selected URL, and
 * then either stored as the image URL, promoted to the main URL (demoting
 * the previous one to a fallback), stored as a fallback, or only added to
 * the summary. Candidates with an unusable URL or title are counted but
 * otherwise ignored.
 *
 * @param node          the node the link was found in
 * @param href          the raw link target
 * @param title         the raw link title
 * @param textSibling   text-sibling flag for the candidate; when unset it adds one point to the score
 * @param anchorSibling anchor-sibling flag for the candidate; when unset it adds one point to the score
 * @param loneBlock     lone-block flag for the candidate; when set it adds one point to the score
 * @param image         image flag for the candidate; a non-image candidate replaces a
 *                      current URL whose {@code titleHasImage} flag is set
 */
public void addUrl(Node node, String href, String title, boolean textSibling, boolean anchorSibling,
        boolean loneBlock, boolean image) {
    ++numUrls;

    final String cleanHref = CommonUtil.strip(href, false);
    final String strippedTitle = CommonUtil.strip(title, false);
    String cleanTitle = StringEscapeUtils.unescapeHtml4(StringEscapeUtils.unescapeXml(strippedTitle));
    cleanTitle = titleJunk.matcher(cleanTitle).replaceAll("");
    // A title that consists of punctuation only counts as no title at all.
    if (CommonUtil.isEmpty(cleanTitle.replaceAll("\\p{Punct}", ""))) {
        cleanTitle = "";
    }
    final String coreTitle = cleanTitle.replace(" ", "").replace(" ...", "");

    // Reject empty or "#" links, empty titles, and titles containing both "/" and "." but no space.
    if (cleanHref.isEmpty() || "#".equals(cleanHref) || cleanTitle.isEmpty()
            || (coreTitle.contains("/") && !coreTitle.contains(" ") && coreTitle.contains("."))) {
        return;
    }

    final boolean newTitleStartsWithPunct = cleanTitle.matches("^\\p{Punct}.*$");
    final boolean oldTitleWeak = urlTitle == null || urlTitle.matches("^\\p{Punct}.*$");

    int existingScore = 0;
    if (!titleHasTextSibling) {
        existingScore += 1;
    }
    if (!titleHasAnchorSibling) {
        existingScore += 1;
    }
    if (titleHasLoneBlock) {
        existingScore += 1;
    }
    if (oldTitleWeak) {
        existingScore -= 10;
    }

    int curScore = 0;
    if (!textSibling) {
        curScore += 1;
    }
    if (!anchorSibling) {
        curScore += 1;
    }
    if (loneBlock) {
        curScore += 1;
    }
    if (newTitleStartsWithPunct) {
        curScore -= 10;
    }

    // Positive values favour the new candidate, negative ones the current URL.
    int curCompare = 0;
    if (url != null) {
        if (newTitleStartsWithPunct) {
            curCompare -= 5;
        }
        if (oldTitleWeak) {
            curCompare += 9;
        }

        if (curScore < existingScore) {
            curCompare -= 2;
        } else if (curScore > existingScore) {
            curCompare += 2;
        }
        if (curScore <= 1 && curScore <= existingScore) {
            curCompare -= 1;
        }

        final int newLength = cleanTitle.length();
        final int oldLength = urlTitle.length();
        if (newLength <= oldLength / 2) {
            curCompare -= 2;
        } else if (newLength > oldLength * 2) {
            curCompare += 2;
        }

        final boolean newHasHash = cleanHref.contains("#");
        final boolean oldHasHash = url.contains("#");
        if (newHasHash && !oldHasHash) {
            curCompare -= 2;
        } else if (!newHasHash && oldHasHash) {
            curCompare += 2;
        }

        final int nearestBlock = NodeUtil.nearestBlock(node);
        final int existingBlock = NodeUtil.nearestBlock(urlNodes.get(url));
        if (nearestBlock > existingBlock) {
            curCompare -= 2;
        } else if (nearestBlock != existingBlock) {
            curCompare += 2;
        }
    }

    if (!hasImgUrl && CommonUtil.isEmpty(altUrl) && CommonUtil.isEmpty(altUrlTitle)
            && (isImg(cleanTitle) || isImg(cleanHref))) {
        // First image-like candidate becomes the alternative (image) URL.
        altUrl = cleanHref;
        altUrlTitle = cleanTitle;
        urlNodes.put(cleanHref, node);
        hasImgUrl = true;
    } else if (url == null || (!image && titleHasImage) || curCompare > 1) {
        // The candidate becomes the main URL; a previous main URL is kept as a fallback.
        if (url != null) {
            priorUrl = url;
            priorUrlTitle = urlTitle;
            addToSummary(urlTitle, true, urlNodes.get(url));
            fallbackUrls.add(url);
            fallbackUrlTitles.add(urlTitle);
        }
        url = cleanHref;
        urlNodes.put(url, node);
        urlTitle = cleanTitle;
        titleHasTextSibling = textSibling;
        titleHasAnchorSibling = anchorSibling;
        titleHasLoneBlock = loneBlock;
        titleHasImage = image;
    } else if (curCompare > -1) {
        fallbackUrls.add(cleanHref);
        urlNodes.put(cleanHref, node);
        fallbackUrlTitles.add(cleanTitle);
        addToSummary(cleanTitle, true, node);
    } else {
        addToSummary(cleanTitle, true, node);
    }
}