Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringEscapeUtils unescapeHtml4.

Prototype

public static final String unescapeHtml4(final String input) 

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:com.romeikat.datamessie.core.rss.task.rssCrawling.SourceCrawler.java

/**
 * Extracts the description text of a feed entry and unescapes its HTML entities.
 *
 * @param entry the feed entry to read the description from
 * @return the result of {@code StringEscapeUtils.unescapeHtml4} applied to the description
 *         value; when the entry has no description, {@code null} is handed to that call
 */
private String getDescription(final SyndEntry entry) {
    final String rawDescription;
    if (entry.getDescription() == null) {
        rawDescription = null;
    } else {
        rawDescription = entry.getDescription().getValue();
    }
    return StringEscapeUtils.unescapeHtml4(rawDescription);
}

From source file:de.darkblue.bongloader2.utils.ToolBox.java

/**
 * cleans the string from all unneccessary things like html entities.
 *
 * @param in/*w w w.j a v  a  2  s.c  o m*/
 * @return
 */
public static String cleanString(String in) {
    return StringEscapeUtils.unescapeHtml4(in);
}

From source file:com.ryan.ryanreader.reddit.prepared.RedditPreparedComment.java

/**
 * Computes the links contained in {@code src.body_html} after its HTML entities are unescaped.
 *
 * @return whatever {@code LinkHandler.computeAllLinks} produces for the unescaped body
 */
public HashSet<String> computeAllLinks() {
    final String unescapedBody = StringEscapeUtils.unescapeHtml4(src.body_html);
    return LinkHandler.computeAllLinks(unescapedBody);
}

From source file:net.krautchan.data.KCPosting.java

/**
 * Renders this posting as a multi-line text dump: one labelled line per header field, the
 * unescaped and whitespace-normalised content, and one line per attached file uid.
 *
 * @return the textual dump; every line ends with {@code "\n"}
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append("Kc-Num: ").append(getKcNummer()).append("\n");
    text.append("Hash: ").append(getThreadDbId()).append("\n");
    text.append("Created: ").append(getCreationDate()).append("\n");
    text.append("Author: ").append(getUser()).append("\n");
    text.append("T-Code: ").append(getTripCode()).append("\n");
    text.append("Title: ").append(getTitle().replaceAll("\\s+", " ")).append("\n");
    text.append("Sage: ").append(isSage()).append("\n");

    // Unescape entities, then tidy the markup one rewrite at a time.
    String content = StringEscapeUtils.unescapeHtml4(getOriginalContent());
    content = content.replaceAll("\\s+", " "); // collapse whitespace runs
    content = content.replaceAll("\\s</", "</"); // drop whitespace before a closing tag
    content = content.replaceAll("<img (.+?)\">", "<img $1\" />"); // self-close <img> tags
    content = content.replaceAll("<a (.+?)\" >", "<a $1\">"); // drop stray space inside <a ...>
    text.append("Content: ").append(content).append("\n");

    text.append("Imgs: \n");
    final String[] fileUids = getFileUids().toArray(new String[this.getFileUids().size()]);
    for (final String uid : fileUids) {
        if (uid == null) {
            // A missing uid is shown as a dash placeholder.
            text.append("   -\n");
        } else {
            text.append("   ").append(getFile(uid)).append("\n");
        }
    }
    return text.toString();
}

From source file:com.nttec.everychan.chans.sevenchan.SevenchanReader.java

/**
 * Interprets a single {@code <img>} tag.
 *
 * <ul>
 * <li>A tag carrying {@code class="multithumbfirst"} or {@code class="multithumb"} updates the
 * most recently added attachment (thumbnail URL, size, pixel dimensions, original name); it is
 * ignored when there is no attachment yet.</li>
 * <li>A tag referencing {@code /css/locked.gif} marks the current thread as closed.</li>
 * <li>A tag referencing {@code /css/sticky.gif} marks the current thread as sticky.</li>
 * <li>Any other tag has its {@code src} value remembered in {@code lastThumbnail}.</li>
 * </ul>
 *
 * @param imgTag the text of the image tag to inspect
 */
@Override
protected void parseThumbnail(String imgTag) {
    final boolean multiThumb = imgTag.contains("class=\"multithumbfirst\"")
            || imgTag.contains("class=\"multithumb\"");

    if (!multiThumb) {
        if (imgTag.contains("/css/locked.gif")) {
            currentThread.isClosed = true;
        } else if (imgTag.contains("/css/sticky.gif")) {
            currentThread.isSticky = true;
        } else {
            final int srcStart = imgTag.indexOf("src=\"");
            if (srcStart != -1) {
                final int srcEnd = imgTag.indexOf('\"', srcStart + 5);
                if (srcEnd != -1) {
                    lastThumbnail = imgTag.substring(srcStart + 5, srcEnd);
                }
            }
        }
        return;
    }

    if (currentAttachments.size() <= 0) {
        return;
    }
    final AttachmentModel target = currentAttachments.get(currentAttachments.size() - 1);

    // Thumbnail URL: the value of the src="..." attribute, if it is properly closed.
    final int thumbStart = imgTag.indexOf("src=\"");
    if (thumbStart != -1) {
        final int thumbEnd = imgTag.indexOf('\"', thumbStart + 5);
        if (thumbEnd != -1) {
            target.thumbnail = imgTag.substring(thumbStart + 5, thumbEnd);
        }
    }

    // File size: every match overwrites the previous one, so the last parsable match wins.
    final Matcher sizeMatcher = ATTACHMENT_SIZE_PATTERN.matcher(imgTag);
    while (sizeMatcher.find()) {
        try {
            final String number = sizeMatcher.group(1);
            final String unitPrefix = sizeMatcher.group(2);
            final int factor;
            if (unitPrefix == null) {
                factor = 1;
            } else if (unitPrefix.equalsIgnoreCase("k")) {
                factor = 1024;
            } else if (unitPrefix.equalsIgnoreCase("m")) {
                factor = 1024 * 1024;
            } else {
                factor = 1;
            }
            target.size = Math.round(Float.parseFloat(number) / 1024 * factor);
        } catch (NumberFormatException ignored) {
            // Best effort: an unparsable size leaves the attachment's size untouched.
        }
    }

    // Pixel dimensions: both numbers must parse before either field is written.
    final Matcher dimensionMatcher = ATTACHMENT_PX_SIZE_PATTERN.matcher(imgTag);
    int dimensionsEnd = -1;
    while (dimensionMatcher.find()) {
        try {
            final int parsedWidth = Integer.parseInt(dimensionMatcher.group(1));
            final int parsedHeight = Integer.parseInt(dimensionMatcher.group(2));
            target.width = parsedWidth;
            target.height = parsedHeight;
            dimensionsEnd = dimensionMatcher.end();
        } catch (NumberFormatException ignored) {
            // Best effort: skip this match and keep any previously parsed dimensions.
        }
    }
    if (dimensionsEnd == -1) {
        return;
    }

    // The original file name is only searched for after the last parsed dimensions.
    final Matcher nameMatcher = ATTACHMENT_ORIGINAL_NAME_PATTERN.matcher(imgTag);
    if (!nameMatcher.find(dimensionsEnd)) {
        return;
    }
    final String trimmedName = nameMatcher.group(1).trim();
    if (trimmedName.length() > 0) {
        target.originalName = StringEscapeUtils.unescapeHtml4(trimmedName);
    }
}

From source file:com.wellsandwhistles.android.redditsp.fragments.PostListingFragment.java

/**
 * Updates the header once subreddit data is available.
 *
 * <p>The subtitle is the human-readable listing URL when an explicit sort order other than
 * {@code PostSort.HOT} is set; otherwise it is the formatted subscriber count, or the
 * "unknown" string when the count is {@code null}. The header itself is set on the UI thread.
 */
private void onSubredditReceived() {

    final String subtitle;

    if (mPostListingURL.getOrder() != null && mPostListingURL.getOrder() != PostSort.HOT) {
        subtitle = mPostListingURL.humanReadableUrl();

    } else if (mSubreddit.subscribers == null) {
        subtitle = getString(R.string.header_subscriber_count_unknown);

    } else {
        final NumberFormat countFormat = NumberFormat.getNumberInstance(Locale.getDefault());
        subtitle = getContext().getString(R.string.header_subscriber_count,
                countFormat.format(mSubreddit.subscribers));
    }

    final Runnable headerUpdate = new Runnable() {
        @Override
        public void run() {
            final String unescapedTitle = StringEscapeUtils.unescapeHtml4(mSubreddit.title);
            setHeader(unescapedTitle, subtitle);
            getActivity().invalidateOptionsMenu();
        }
    };
    getActivity().runOnUiThread(headerUpdate);

}

From source file:com.nttec.everychan.chans.krautchan.KrautReader.java

/**
 * Parses one attachment out of an HTML fragment and appends it to {@code currentAttachments}.
 *
 * <p>Nothing happens unless {@code ATTACHMENT_LINKS_PATTERN} matches. When it does, the model
 * gets its path and optional thumbnail from the match, a type derived from the file
 * extension, and (when the corresponding patterns match) the original file name, pixel
 * dimensions and size. Size, width and height stay at {@code -1} when they cannot be parsed.
 *
 * @param html the HTML fragment describing the attachment
 */
private void parseAttachment(String html) {
    final Matcher linkMatcher = ATTACHMENT_LINKS_PATTERN.matcher(html);
    if (!linkMatcher.find()) {
        return;
    }

    final AttachmentModel attachment = new AttachmentModel();
    attachment.size = -1;
    attachment.width = -1;
    attachment.height = -1;
    attachment.path = "/files/" + linkMatcher.group(1);

    final String thumbnailFile = linkMatcher.group(2);
    if (thumbnailFile == null) {
        attachment.thumbnail = null;
    } else {
        attachment.thumbnail = "/thumbnails/" + thumbnailFile;
    }

    // Classify by lower-cased file extension; anything unrecognised is a generic file.
    final String extension = attachment.path.substring(attachment.path.lastIndexOf('.') + 1)
            .toLowerCase(Locale.US);
    if ("png".equals(extension) || "jpg".equals(extension) || "jpeg".equals(extension)) {
        attachment.type = AttachmentModel.TYPE_IMAGE_STATIC;
    } else if ("gif".equals(extension)) {
        attachment.type = AttachmentModel.TYPE_IMAGE_GIF;
    } else if ("webm".equals(extension)) {
        attachment.type = AttachmentModel.TYPE_VIDEO;
    } else if ("mp3".equals(extension) || "ogg".equals(extension)) {
        attachment.type = AttachmentModel.TYPE_AUDIO;
    } else {
        attachment.type = AttachmentModel.TYPE_OTHER_FILE;
    }

    final Matcher filenameMatcher = ATTACHMENT_FILENAME_PATTERN.matcher(html);
    if (filenameMatcher.find()) {
        final String tagFreeName = RegexUtils.removeHtmlTags(filenameMatcher.group(1)).trim();
        attachment.originalName = StringEscapeUtils.unescapeHtml4(tagFreeName);
    }

    final Matcher infoMatcher = ATTACHMENT_INFO_PATTERN.matcher(html);
    if (infoMatcher.find()) {
        // Pixel dimensions: both numbers must parse before either field is written.
        final Matcher dimensionMatcher = ATTACHMENT_PX_SIZE_PATTERN.matcher(infoMatcher.group(2));
        if (dimensionMatcher.find()) {
            try {
                final int parsedWidth = Integer.parseInt(dimensionMatcher.group(1));
                final int parsedHeight = Integer.parseInt(dimensionMatcher.group(2));
                attachment.width = parsedWidth;
                attachment.height = parsedHeight;
            } catch (NumberFormatException ignored) {
                // Best effort: keep the -1 placeholders.
            }
        }

        final Matcher sizeMatcher = ATTACHMENT_SIZE_PATTERN.matcher(infoMatcher.group(3));
        if (sizeMatcher.find()) {
            try {
                // A decimal comma is turned into a decimal point before parsing.
                final String number = sizeMatcher.group(1).replace(',', '.');
                final String unitPrefix = sizeMatcher.group(2);
                final int factor;
                if (unitPrefix == null) {
                    factor = 1;
                } else if (unitPrefix.equalsIgnoreCase("k")) {
                    factor = 1024;
                } else if (unitPrefix.equalsIgnoreCase("m")) {
                    factor = 1024 * 1024;
                } else {
                    factor = 1;
                }
                attachment.size = Math.round(Float.parseFloat(number) / 1024 * factor);
            } catch (NumberFormatException ignored) {
                // Best effort: keep the -1 placeholder.
            }
        }
    }

    ++currentThread.attachmentsCount;
    currentAttachments.add(attachment);
}

From source file:com.nttec.everychan.api.util.WakabaReader.java

/**
 * Reacts to a matched filter by reading the text up to that filter's closing sequence and
 * storing it on the current post or thread, or by delegating to the matching parse helper.
 *
 * @param filterIndex the {@code FILTER_*} constant identifying the matched filter; values
 *        without a case below only trigger the date-buffer reset
 * @throws IOException presumably propagated from the read/skip helpers (their declarations
 *         are outside this snippet)
 */
private void handleFilter(int filterIndex) throws IOException {
    // While inside a date, any filter except the end-of-date one discards the buffered text.
    if (inDate && filterIndex != FILTER_ENDDATE) {
        dateBuffer.setLength(0);
    }

    switch (filterIndex) {
    case FILTER_THREAD_END: {
        finalizeThread();
        break;
    }
    case FILTER_ATTACHMENT: {
        final String attachmentHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        parseAttachment(attachmentHtml);
        break;
    }
    case FILTER_ATTACHMENT_THUMBNAIL: {
        final String thumbnailHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        parseThumbnail(thumbnailHtml);
        break;
    }
    case FILTER_POSTNUMBER: {
        final String rawNumber = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        currentPost.number = rawNumber.trim();
        break;
    }
    case FILTER_SUBJECT_OP:
    case FILTER_SUBJECT: {
        final String rawSubject = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        currentPost.subject = StringEscapeUtils.unescapeHtml4(rawSubject).trim();
        break;
    }
    case FILTER_POSTERNAME_OP:
    case FILTER_POSTERNAME: {
        final String nameHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        parseNameEmail(nameHtml);
        inDate = true;
        break;
    }
    case FILTER_TRIPCODE: {
        final String tripHtml = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        final String tripText = RegexUtils.removeHtmlTags(tripHtml);
        currentPost.trip = StringEscapeUtils.unescapeHtml4(tripText).trim();
        inDate = true;
        break;
    }
    case FILTER_ENDDATE: {
        // The buffer only holds a date if it is longer than the end-of-date marker itself;
        // the marker's length is cut off the end before parsing.
        final int bufferedLength = dateBuffer.length();
        final int markerLength = FILTERS_OPEN[FILTER_ENDDATE].length;
        if (bufferedLength > markerLength) {
            parseDate(dateBuffer.substring(0, bufferedLength - markerLength).trim());
        }
        inDate = false;
        dateBuffer.setLength(0);
        break;
    }
    case FILTER_OMITTEDPOSTS: {
        final String omittedText = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        parseOmittedString(omittedText);
        break;
    }
    case FILTER_START_COMMENT: {
        skipUntilSequence(FILTERS_CLOSE[filterIndex]);
        currentPost.comment = readPostComment();
        finalizePost();
        break;
    }
    }
}

From source file:com.screenslicer.core.scrape.type.Result.java

/**
 * Registers a candidate link and decides which role it plays for this result.
 *
 * <p>The href is stripped; the title is stripped, XML-unescaped, HTML-unescaped and passed
 * through {@code titleJunk}. A title consisting only of punctuation is treated as empty.
 * {@code numUrls} is incremented for every call, but a candidate is only considered further
 * when its href is non-empty and not {@code "#"}, its title is non-empty, and the title does
 * not contain both {@code "/"} and {@code "."}.
 *
 * <p>A considered candidate ends up in exactly one role:
 * <ol>
 * <li>alternative (image) URL, when none is set yet and {@code isImg} accepts the title or href;</li>
 * <li>primary URL, when there is none yet, when the current primary was flagged as an image
 * and this one is not, or when the comparison score exceeds 1; the previous primary is
 * demoted to "prior"/fallback and added to the summary;</li>
 * <li>fallback URL (also added to the summary), when the comparison score is 0 or 1;</li>
 * <li>summary text only, otherwise.</li>
 * </ol>
 *
 * @param node the node the link was found at; stored in {@code urlNodes} and passed to
 *        {@code addToSummary}
 * @param href the raw link target
 * @param title the raw link title
 * @param textSibling scoring flag; {@code false} earns one point. Remembered as
 *        {@code titleHasTextSibling} if this link becomes primary
 * @param anchorSibling scoring flag; {@code false} earns one point. Remembered as
 *        {@code titleHasAnchorSibling} if this link becomes primary
 * @param loneBlock scoring flag; {@code true} earns one point. Remembered as
 *        {@code titleHasLoneBlock} if this link becomes primary
 * @param image remembered as {@code titleHasImage} if this link becomes primary; a non-image
 *        candidate replaces a primary whose {@code titleHasImage} is set
 */
public void addUrl(Node node, String href, String title, boolean textSibling, boolean anchorSibling,
        boolean loneBlock, boolean image) {
    ++numUrls;
    String cleanHref = CommonUtil.strip(href, false);
    String cleanTitle = StringEscapeUtils
            .unescapeHtml4(StringEscapeUtils.unescapeXml(CommonUtil.strip(title, false)));
    cleanTitle = titleJunk.matcher(cleanTitle).replaceAll("");
    // A title with nothing left after removing punctuation carries no information.
    String noPunctTitle = cleanTitle.replaceAll("\\p{Punct}", "");
    if (CommonUtil.isEmpty(noPunctTitle)) {
        cleanTitle = "";
    }
    // NOTE(review): with plain spaces as written here, the second replace(" ...") can never
    // match after the first replace removed every space, and coreTitle.contains(" ") below is
    // always false. Possibly one of these literals was originally a different whitespace
    // character (e.g. a non-breaking space) — confirm against the upstream source.
    String coreTitle = cleanTitle.replace(" ", "").replace(" ...", "");
    if (!cleanHref.isEmpty() && !"#".equals(cleanHref) && !cleanTitle.isEmpty()
            && (!coreTitle.contains("/") || coreTitle.contains(" ") || !coreTitle.contains("."))) {
        // Score of the current primary link; a missing or punctuation-led title costs 10.
        int existingScore = 0;
        existingScore += titleHasTextSibling ? 0 : 1;
        existingScore += titleHasAnchorSibling ? 0 : 1;
        existingScore += !titleHasLoneBlock ? 0 : 1;
        existingScore += urlTitle == null || urlTitle.matches("^\\p{Punct}.*$") ? -10 : 0;
        // Score of the candidate, computed with the same rules.
        int curScore = 0;
        curScore += textSibling ? 0 : 1;
        curScore += anchorSibling ? 0 : 1;
        curScore += !loneBlock ? 0 : 1;
        curScore += cleanTitle.matches("^\\p{Punct}.*$") ? -10 : 0;
        // Candidate-versus-primary comparison; stays 0 when there is no primary yet.
        int curCompare = 0;
        if (url != null) {
            curCompare += cleanTitle.matches("^\\p{Punct}.*$") ? -5 : 0;
            curCompare += urlTitle == null || urlTitle.matches("^\\p{Punct}.*$") ? 9 : 0;
            curCompare += curScore < existingScore ? -2 : curScore > existingScore ? 2 : 0;
            curCompare += curScore <= 1 && curScore <= existingScore ? -1 : 0;
            // NOTE(review): urlTitle is dereferenced here without the null check used two
            // lines above; in this method url and urlTitle are always assigned together.
            curCompare += cleanTitle.length() <= urlTitle.length() / 2 ? -2
                    : cleanTitle.length() > urlTitle.length() * 2 ? 2 : 0;
            // Prefer hrefs without a '#'.
            curCompare += cleanHref.contains("#") && !url.contains("#") ? -2
                    : !cleanHref.contains("#") && url.contains("#") ? 2 : 0;
            // A smaller nearestBlock value than the primary's is rewarded, a larger one penalised.
            int nearestBlock = Util.nearestBlock(node);
            int existingBlock = Util.nearestBlock(urlNodes.get(url));
            curCompare += nearestBlock > existingBlock ? -2 : nearestBlock == existingBlock ? 0 : 2;
        }
        if (!hasImgUrl && CommonUtil.isEmpty(altUrl) && CommonUtil.isEmpty(altUrlTitle)
                && (isImg(cleanTitle) || isImg(cleanHref))) {
            // Role 1: first image-like link becomes the alternative URL.
            altUrl = cleanHref;
            altUrlTitle = cleanTitle;
            urlNodes.put(cleanHref, node);
            hasImgUrl = true;
        } else if (url == null || !image && titleHasImage || curCompare > 1) {
            // Role 2: candidate becomes the primary; any previous primary is demoted.
            if (url != null) {
                priorUrl = url;
                priorUrlTitle = urlTitle;
                addToSummary(urlTitle, true, urlNodes.get(url));
                fallbackUrls.add(url);
                fallbackUrlTitles.add(urlTitle);
            }
            url = cleanHref;
            urlNodes.put(url, node);
            urlTitle = cleanTitle;
            titleHasTextSibling = textSibling;
            titleHasAnchorSibling = anchorSibling;
            titleHasLoneBlock = loneBlock;
            titleHasImage = image;
        } else if (curCompare > -1) {
            // Role 3: close enough to keep as a fallback.
            fallbackUrls.add(cleanHref);
            urlNodes.put(cleanHref, node);
            fallbackUrlTitles.add(cleanTitle);
            addToSummary(cleanTitle, true, node);
        } else {
            // Role 4: only the title text is kept.
            addToSummary(cleanTitle, true, node);
        }
    }
}

From source file:com.screenslicer.core.scrape.type.ScrapeResult.java

/**
 * Registers a candidate link and decides which role it plays for this scrape result.
 *
 * <p>The href is stripped; the title is stripped, XML-unescaped, HTML-unescaped and passed
 * through {@code titleJunk}. A title consisting only of punctuation is treated as empty.
 * {@code numUrls} is incremented for every call, but a candidate is only considered further
 * when its href is non-empty and not {@code "#"}, its title is non-empty, and the title does
 * not contain both {@code "/"} and {@code "."}.
 *
 * <p>A considered candidate ends up in exactly one role:
 * <ol>
 * <li>alternative (image) URL, when none is set yet and {@code isImg} accepts the title or href;</li>
 * <li>primary URL, when there is none yet, when the current primary was flagged as an image
 * and this one is not, or when the comparison score exceeds 1; the previous primary is
 * demoted to "prior"/fallback and added to the summary;</li>
 * <li>fallback URL (also added to the summary), when the comparison score is 0 or 1;</li>
 * <li>summary text only, otherwise.</li>
 * </ol>
 *
 * @param node the node the link was found at; stored in {@code urlNodes} and passed to
 *        {@code addToSummary}
 * @param href the raw link target
 * @param title the raw link title
 * @param textSibling scoring flag; {@code false} earns one point. Remembered as
 *        {@code titleHasTextSibling} if this link becomes primary
 * @param anchorSibling scoring flag; {@code false} earns one point. Remembered as
 *        {@code titleHasAnchorSibling} if this link becomes primary
 * @param loneBlock scoring flag; {@code true} earns one point. Remembered as
 *        {@code titleHasLoneBlock} if this link becomes primary
 * @param image remembered as {@code titleHasImage} if this link becomes primary; a non-image
 *        candidate replaces a primary whose {@code titleHasImage} is set
 */
public void addUrl(Node node, String href, String title, boolean textSibling, boolean anchorSibling,
        boolean loneBlock, boolean image) {
    ++numUrls;
    String cleanHref = CommonUtil.strip(href, false);
    String cleanTitle = StringEscapeUtils
            .unescapeHtml4(StringEscapeUtils.unescapeXml(CommonUtil.strip(title, false)));
    cleanTitle = titleJunk.matcher(cleanTitle).replaceAll("");
    // A title with nothing left after removing punctuation carries no information.
    String noPunctTitle = cleanTitle.replaceAll("\\p{Punct}", "");
    if (CommonUtil.isEmpty(noPunctTitle)) {
        cleanTitle = "";
    }
    // NOTE(review): with plain spaces as written here, the second replace(" ...") can never
    // match after the first replace removed every space, and coreTitle.contains(" ") below is
    // always false. Possibly one of these literals was originally a different whitespace
    // character (e.g. a non-breaking space) — confirm against the upstream source.
    String coreTitle = cleanTitle.replace(" ", "").replace(" ...", "");
    if (!cleanHref.isEmpty() && !"#".equals(cleanHref) && !cleanTitle.isEmpty()
            && (!coreTitle.contains("/") || coreTitle.contains(" ") || !coreTitle.contains("."))) {
        // Score of the current primary link; a missing or punctuation-led title costs 10.
        int existingScore = 0;
        existingScore += titleHasTextSibling ? 0 : 1;
        existingScore += titleHasAnchorSibling ? 0 : 1;
        existingScore += !titleHasLoneBlock ? 0 : 1;
        existingScore += urlTitle == null || urlTitle.matches("^\\p{Punct}.*$") ? -10 : 0;
        // Score of the candidate, computed with the same rules.
        int curScore = 0;
        curScore += textSibling ? 0 : 1;
        curScore += anchorSibling ? 0 : 1;
        curScore += !loneBlock ? 0 : 1;
        curScore += cleanTitle.matches("^\\p{Punct}.*$") ? -10 : 0;
        // Candidate-versus-primary comparison; stays 0 when there is no primary yet.
        int curCompare = 0;
        if (url != null) {
            curCompare += cleanTitle.matches("^\\p{Punct}.*$") ? -5 : 0;
            curCompare += urlTitle == null || urlTitle.matches("^\\p{Punct}.*$") ? 9 : 0;
            curCompare += curScore < existingScore ? -2 : curScore > existingScore ? 2 : 0;
            curCompare += curScore <= 1 && curScore <= existingScore ? -1 : 0;
            // NOTE(review): urlTitle is dereferenced here without the null check used two
            // lines above; in this method url and urlTitle are always assigned together.
            curCompare += cleanTitle.length() <= urlTitle.length() / 2 ? -2
                    : cleanTitle.length() > urlTitle.length() * 2 ? 2 : 0;
            // Prefer hrefs without a '#'.
            curCompare += cleanHref.contains("#") && !url.contains("#") ? -2
                    : !cleanHref.contains("#") && url.contains("#") ? 2 : 0;
            // A smaller nearestBlock value than the primary's is rewarded, a larger one penalised.
            int nearestBlock = NodeUtil.nearestBlock(node);
            int existingBlock = NodeUtil.nearestBlock(urlNodes.get(url));
            curCompare += nearestBlock > existingBlock ? -2 : nearestBlock == existingBlock ? 0 : 2;
        }
        if (!hasImgUrl && CommonUtil.isEmpty(altUrl) && CommonUtil.isEmpty(altUrlTitle)
                && (isImg(cleanTitle) || isImg(cleanHref))) {
            // Role 1: first image-like link becomes the alternative URL.
            altUrl = cleanHref;
            altUrlTitle = cleanTitle;
            urlNodes.put(cleanHref, node);
            hasImgUrl = true;
        } else if (url == null || !image && titleHasImage || curCompare > 1) {
            // Role 2: candidate becomes the primary; any previous primary is demoted.
            if (url != null) {
                priorUrl = url;
                priorUrlTitle = urlTitle;
                addToSummary(urlTitle, true, urlNodes.get(url));
                fallbackUrls.add(url);
                fallbackUrlTitles.add(urlTitle);
            }
            url = cleanHref;
            urlNodes.put(url, node);
            urlTitle = cleanTitle;
            titleHasTextSibling = textSibling;
            titleHasAnchorSibling = anchorSibling;
            titleHasLoneBlock = loneBlock;
            titleHasImage = image;
        } else if (curCompare > -1) {
            // Role 3: close enough to keep as a fallback.
            fallbackUrls.add(cleanHref);
            urlNodes.put(cleanHref, node);
            fallbackUrlTitles.add(cleanTitle);
            addToSummary(cleanTitle, true, node);
        } else {
            // Role 4: only the title text is kept.
            addToSummary(cleanTitle, true, node);
        }
    }
}