Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringEscapeUtils unescapeHtml4.

Prototype

public static final String unescapeHtml4(final String input)

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:com.nttec.everychan.ui.presentation.Subscriptions.java

/**
 * Turns an HTML fragment into comment text in three steps: {@code <br>} and {@code <p>} tags
 * (optionally written self-closed, e.g. {@code <br/>}) are replaced by a single space, the
 * result is passed through {@code RegexUtils.removeHtmlTags}, and finally HTML entities are
 * unescaped.
 *
 * @param html the HTML fragment; must not be {@code null}
 * @return the text produced by the three steps above
 */
private static String htmlToComment(String html) {
    final String withSpaces = html.replaceAll("<(br|p)/?>", " ");
    final String withoutTags = RegexUtils.removeHtmlTags(withSpaces);
    return StringEscapeUtils.unescapeHtml4(withoutTags);
}

From source file:net.chuzarski.crowdednews.utils.reddit.RedditRequest.java

/**
 * Creates a single {@code RedditPost} from the JSON object describing one post.
 *
 * <p>The fields {@code title}, {@code name}, {@code url}, {@code id}, {@code stickied},
 * {@code created_utc} and {@code domain} are read from the object; only the title is
 * HTML-unescaped.
 *
 * @param postObj the JSON object holding the post's fields
 * @return the post built from those fields
 * @throws JSONException propagated from the {@code JSONObject} getters
 * @throws RedditException with {@code RedditErrors.REDDIT_SELF_POST} if the domain starts with
 *         {@code "self."}
 */
public RedditPost createSingleRedditPost(JSONObject postObj) throws JSONException, RedditException {
    final String title = StringEscapeUtils.unescapeHtml4(postObj.getString("title"));
    final String name = postObj.getString("name");
    final String url = postObj.getString("url");
    final String id = postObj.getString("id");
    final boolean stickied = postObj.getBoolean("stickied");
    final long createdUTC = postObj.getLong("created_utc");
    final String domain = postObj.getString("domain");

    // Reject self posts. Only the prefix is tested: a substring test would also reject ordinary
    // link domains that merely contain "self." (for example "myself.example.com").
    // NOTE(review): assumes self posts report a domain of the form "self.<subreddit>" - confirm
    // against the Reddit listing format.
    if (domain.startsWith("self.")) {
        throw new RedditException(RedditErrors.REDDIT_SELF_POST);
    }

    return new RedditPost.Builder(title, url).redditName(name).redditId(id).isStickied(stickied)
            .timeCreated(createdUTC).linkDomain(domain).build();
}

From source file:com.romeikat.datamessie.core.rss.task.maintenance.MaintenanceTask.java

/**
 * Maintenance step: for every document ID returned by {@code documentDao.getIds}, unescapes the
 * HTML entities in the stored raw content, sets the document's state to
 * {@code DocumentProcessingState.DOWNLOADED} and deletes the document's named entity
 * occurrences.
 *
 * <p>IDs are split into batches of {@code batchSize}; {@code taskExecution.checkpoint()} is
 * called after each batch.
 *
 * @param taskExecution the running task execution, checkpointed after every batch
 * @throws TaskCancelledException declared by this method; presumably raised by
 *         {@code checkpoint()} when the task was cancelled (confirm against TaskExecution)
 */
private void unescapeHtmlCharsFromContent(final TaskExecution taskExecution) throws TaskCancelledException {
    final HibernateSessionProvider sessionProvider = new HibernateSessionProvider(sessionFactory);
    // Get all IDs
    final List<Long> ids = documentDao.getIds(sessionProvider.getStatelessSession());
    sessionProvider.closeStatelessSession();
    // Process IDs in batches
    final List<List<Long>> batches = CollectionUtil.splitIntoSubListsBySize(ids, batchSize);
    for (final List<Long> batch : batches) {
        // NOTE(review): the instance is never used after construction, so the processing is
        // presumably started by the ParallelProcessing constructor - confirm.
        new ParallelProcessing<Long>(sessionFactory, batch) {
            @Override
            public void doProcessing(final HibernateSessionProvider sessionProvider, final Long documentId) {
                // Unescape characters (documents without raw content are skipped)
                final RawContent rawContent = rawContentDao.getEntity(sessionProvider.getStatelessSession(),
                        documentId);
                if (rawContent == null) {
                    return;
                }
                final String content = rawContent.getContent();
                final String unescapedContent = StringEscapeUtils.unescapeHtml4(content);
                // Store the unescaped content
                rawContent.setContent(unescapedContent);
                rawContentDao.update(sessionProvider.getStatelessSession(), rawContent);
                // Remove any preprocessed information: reset the document state ...
                final Document document = documentDao.getEntity(sessionProvider.getStatelessSession(),
                        documentId);
                document.setState(DocumentProcessingState.DOWNLOADED);
                documentDao.update(sessionProvider.getStatelessSession(), document);
                // ... and delete the named entity occurrences recorded for the document
                final List<NamedEntityOccurrence> namedEntities = namedEntityOccurrenceDao
                        .getByDocument(sessionProvider.getStatelessSession(), documentId);
                for (final NamedEntityOccurrence namedEntity : namedEntities) {
                    namedEntityOccurrenceDao.delete(sessionProvider.getStatelessSession(), namedEntity);
                }
                LOG.info("Unescaped HTML characters from content of document {}", documentId);
            }
        };
        taskExecution.checkpoint();
    }
}

From source file:com.github.naoghuman.cm.model.category.CategoryModel.java

/**
 * Restores this model from the given stream. Values are consumed in a fixed order: id, matrix
 * id and generation time as longs, then title and description as objects. Each object is
 * converted with {@link String#valueOf(Object)} and HTML-unescaped before it is set.
 *
 * @param in the stream to read from
 * @throws IOException propagated from the stream
 * @throws ClassNotFoundException propagated from {@code readObject}
 */
@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    final long storedId = in.readLong();
    setId(storedId);

    final long storedMatrixId = in.readLong();
    setMatrixId(storedMatrixId);

    final long storedGenerationTime = in.readLong();
    setGenerationTime(storedGenerationTime);

    final Object storedTitle = in.readObject();
    setTitle(StringEscapeUtils.unescapeHtml4(String.valueOf(storedTitle)));

    final Object storedDescription = in.readObject();
    setDescription(StringEscapeUtils.unescapeHtml4(String.valueOf(storedDescription)));
}

From source file:com.nttec.everychan.chans.krautchan.KrautReader.java

/**
 * Handles one matched filter: depending on {@code filterIndex}, reads (or skips) input up to the
 * filter's closing sequence {@code FILTERS_CLOSE[filterIndex]} and stores the result in
 * {@code currentPost}, or finalizes the current post/thread.
 *
 * @param filterIndex index of the matched filter (one of the {@code FILTER_*} constants); values
 *        without a matching case are ignored
 * @throws IOException declared by this method; presumably propagated from the read/skip helpers
 */
private void handleFilter(int filterIndex) throws IOException {
    switch (filterIndex) {
    case FILTER_THREAD_END:
        finalizeThread();
        break;
    case FILTER_POSTNUMBER:
        currentPost.number = readUntilSequence(FILTERS_CLOSE[filterIndex]).trim();
        break;
    case FILTER_COUNTRYBALL:
    case FILTER_COUNTRYBALL_WAR:
        // Both variants share the parser; the flag tells it which of the two matched.
        parseIcon(readUntilSequence(FILTERS_CLOSE[filterIndex]), filterIndex == FILTER_COUNTRYBALL_WAR);
        break;
    case FILTER_SUBJECT:
        currentPost.subject = StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex]))
                .trim();
        currentPost.subject = CryptoUtils.fixCloudflareEmails(currentPost.subject);
        break;
    case FILTER_POSTERNAME:
        currentPost.name = StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex]))
                .trim();
        break;
    case FILTER_TRIPCODE:
        // Appended (not assigned): FILTER_ADMINMARK below appends to the same field.
        currentPost.trip += StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex]))
                .trim();
        break;
    case FILTER_ADMINMARK:
        // Skip to the next '>' before reading the mark's text.
        skipUntilSequence(">".toCharArray());
        currentPost.trip += StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex]))
                .trim();
        break;
    case FILTER_DATE:
        String date = readUntilSequence(FILTERS_CLOSE[filterIndex]);
        int ms = 0;
        try {
            // Split off the part after the last '.'; its numeric value divided by 1000 is added
            // to the parsed time below. NOTE(review): presumably microseconds - confirm against
            // the page's date format.
            int dotPosition = date.lastIndexOf('.');
            if (dotPosition != -1) {
                ms = Integer.parseInt(date.substring(dotPosition + 1)) / 1000;
                date = date.substring(0, dotPosition);
            }
        } catch (NumberFormatException e) {
            // Ignored: ms stays 0 and date keeps its original text.
        }
        try {
            currentPost.timestamp = KRAUT_DATEFORMAT.parse(date).getTime() + ms;
        } catch (Exception e) {
            Logger.e(TAG, "unable to parse date", e);
        }
        break;
    case FILTER_SAGE:
        currentPost.sage = true;
        break;
    case FILTER_ATTACHMENT:
    case FILTER_ATTACHMENT_OP:
        // The block read here is split on "</div>" and each piece is parsed separately.
        String[] attachments = readUntilSequence(FILTERS_CLOSE[filterIndex]).split("</div>");
        for (String attachment : attachments)
            parseAttachment(attachment);
        break;
    case FILTER_START_COMMENT:
        // Reading the comment also completes the current post.
        skipUntilSequence(FILTERS_CLOSE[filterIndex]);
        currentPost.comment = readPostComment();
        finalizePost();
        break;
    case FILTER_OMITTEDPOSTS:
        parseOmittedString(readUntilSequence(FILTERS_CLOSE[filterIndex]));
        break;
    }
}

From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

/**
 * Scrapes the website entered in the view: navigates the browser engine to the URL, takes the
 * body's inner HTML, unescapes HTML entities, parses the result with Jsoup, selects the elements
 * matching the view's selector and writes their HTML (each followed by a blank line) to the
 * view's output.
 *
 * <p>The input fields and the scrape button are disabled while the method runs. They are
 * re-enabled in a {@code finally} block so that the view does not stay locked when navigation or
 * parsing throws; the exception itself still propagates to the caller.
 */
public void processByUi4j() {
    // Disable fields in view.
    scrapeView.setWebsiteUrlTextFieldEnabled(false);
    scrapeView.setSelectorTextFieldEnabled(false);
    scrapeView.setScrapeButtonEnabled(false);
    scrapeView.setWorkInProgress(true);
    try {
        scrapeView.setOutput("");

        scrapeView.setProgressBarTaskText("initializing");
        logger.info("Start processing...");
        long beginTime = System.currentTimeMillis();

        // Output input parameters.
        if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
            logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \""
                    + scrapeView.getSelector() + "\"");
        }

        // Navigate to the requested page.
        scrapeView.setProgressBarTaskText("requesting page");
        logger.info("Requesting page...");
        Page page = browserEngine.navigate(scrapeView.getWebsiteUrl());
        //page.show();
        logger.info("Requesting of page completed.");

        scrapeView.setProgressBarTaskText("viewing page as HTML");
        logger.info("View page as HTML");
        String html = page.getDocument().getBody().getInnerHTML();

        // Unescape html.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        html = StringEscapeUtils.unescapeHtml4(html);

        logger.info("Get selector");
        String selector = scrapeView.getSelector();
        if (!html.isEmpty() && !selector.isEmpty()) {
            scrapeView.setProgressBarTaskText("parsing HTML");
            logger.info("Parse HTML");
            Document doc = Jsoup.parse(html);

            scrapeView.setProgressBarTaskText("selecting elements in HTML");
            logger.info("select elements in HTML");
            Elements selectedElements = doc.select(selector);

            if (!selectedElements.isEmpty()) {
                scrapeView.setProgressBarTaskText("parsing selected elements");
                logger.info("Parse extracted elements");
                StringBuilder sb = new StringBuilder();
                for (Element element : selectedElements) {
                    // Each element's HTML is followed by an empty line.
                    sb.append(element.html());
                    sb.append("\n");
                    sb.append("\n");
                }
                scrapeView.setOutput(sb.toString());
            }
        }

        browserEngine.clearCookies();

        long endTime = System.currentTimeMillis();
        logger.info("Process time: " + (endTime - beginTime) + " ms.");
        logger.info("Processing complete.");
    } finally {
        // Enable fields in view, even if processing failed.
        scrapeView.setWorkInProgress(false);
        scrapeView.setScrapeButtonEnabled(true);
        scrapeView.setSelectorTextFieldEnabled(true);
        scrapeView.setWebsiteUrlTextFieldEnabled(true);
    }
}

From source file:com.nttec.everychan.api.AbstractVichanModule.java

/**
 * Maps one post, given as a JSON object, to a {@code PostModel}.
 *
 * <p>Read from the object: {@code no}, {@code name}, {@code sub}, {@code com}, {@code email},
 * {@code trip}, {@code capcode}, {@code country}/{@code country_name}, {@code id}, {@code time},
 * {@code resto}, {@code spoiler}, {@code extra_files} and {@code embed}, plus whatever
 * {@code mapAttachment} reads. Name and subject are HTML-unescaped; the comment is stored as
 * received.
 *
 * @param object the JSON object describing the post
 * @param boardName board name, handed on to {@code mapAttachment}
 * @return the populated model
 */
protected PostModel mapPostModel(JSONObject object, String boardName) {
    final PostModel post = new PostModel();
    post.number = Long.toString(object.getLong("no"));

    final String rawName = object.optString("name", "Anonymous");
    post.name = StringEscapeUtils.unescapeHtml4(RegexUtils.removeHtmlSpanTags(rawName));
    post.subject = StringEscapeUtils.unescapeHtml4(object.optString("sub", ""));
    post.comment = object.optString("com", "");
    post.email = object.optString("email", "");
    post.trip = object.optString("trip", "");

    // A capcode other than "none" is appended to the trip after "##".
    final String capcode = object.optString("capcode", "none");
    if (!capcode.equals("none")) {
        post.trip += "##" + capcode;
    }

    // Country flag badge.
    final String countryCode = object.optString("country", "");
    if (!countryCode.isEmpty()) {
        final BadgeIconModel flag = new BadgeIconModel();
        flag.source = "/static/flags/" + countryCode.toLowerCase(Locale.US) + ".png";
        flag.description = object.optString("country_name");
        post.icons = new BadgeIconModel[] { flag };
    }

    post.op = false;

    // Poster ID: the ID "Heaven" marks the post as sage and gets no color.
    final String posterId = object.optString("id", "");
    final boolean heaven = posterId.equalsIgnoreCase("Heaven");
    post.sage = heaven || post.email.toLowerCase(Locale.US).contains("sage");
    if (!posterId.isEmpty()) {
        post.name += (" ID:" + posterId);
        if (!heaven) {
            post.color = CryptoUtils.hashIdColor(posterId);
        }
    }

    post.timestamp = object.getLong("time") * 1000;

    // A "resto" of "0" means the post is its own parent thread.
    final String parent = object.optString("resto", "0");
    post.parentThread = parent.equals("0") ? post.number : parent;

    // File attachments: the root file first, then any extra files.
    final boolean spoiler = object.optInt("spoiler") == 1;
    List<AttachmentModel> collected = null;
    final AttachmentModel primary = mapAttachment(object, boardName, spoiler);
    if (primary != null) {
        collected = new ArrayList<>();
        collected.add(primary);
        final JSONArray extras = object.optJSONArray("extra_files");
        final int extraCount = (extras == null) ? 0 : extras.length();
        for (int idx = 0; idx < extraCount; ++idx) {
            final AttachmentModel extra = mapAttachment(extras.getJSONObject(idx), boardName, spoiler);
            if (extra != null) {
                collected.add(extra);
            }
        }
    }

    // Embedded media: added only if a link can be extracted from the embed markup.
    final String embedMarkup = object.optString("embed", "");
    if (!embedMarkup.isEmpty()) {
        final AttachmentModel embedded = new AttachmentModel();
        embedded.type = AttachmentModel.TYPE_OTHER_NOTFILE;
        final Matcher link = ATTACHMENT_EMBEDDED_LINK.matcher(embedMarkup);
        if (link.find()) {
            embedded.path = link.group(1);
            if (embedded.path.startsWith("//")) {
                // Protocol-relative URL: prepend the scheme currently in use.
                embedded.path = (useHttps() ? "https:" : "http:") + embedded.path;
            }
            final Matcher thumb = ATTACHMENT_EMBEDDED_THUMB.matcher(embedMarkup);
            if (thumb.find()) {
                embedded.thumbnail = thumb.group(1);
                if (embedded.thumbnail.startsWith("//")) {
                    embedded.thumbnail = (useHttps() ? "https:" : "http:") + embedded.thumbnail;
                }
            }
            embedded.isSpoiler = spoiler;
            embedded.size = -1;
            if (collected == null) {
                collected = Collections.singletonList(embedded);
            } else {
                collected.add(embedded);
            }
        }
    }

    if (collected != null) {
        post.attachments = collected.toArray(new AttachmentModel[collected.size()]);
    }
    return post;
}

From source file:com.romeikat.datamessie.core.base.service.download.ContentDownloader.java

/**
 * Downloads the HTML content behind the given URL, following at most one redirection.
 *
 * <p>A redirection is looked for, in this order: the response URL of the connection (server-side
 * redirect), {@code <link rel="canonical">}, {@code <meta http-equiv="refresh">} and
 * {@code <meta property="og:url">}. The first candidate accepted by {@code isValidRedirection}
 * wins; the content is then downloaded from the redirected URL. If that download fails, the
 * original URL is downloaded instead and the redirection is discarded.
 *
 * <p>The resulting HTML is passed through {@code xmlUtil.stripNonValidXMLCharacters} and then
 * HTML-unescaped.
 *
 * @param url the URL to download
 * @return a result holding the original URL (only if a redirection was followed), the URL
 *         finally used, the processed content, the time this method was entered and the HTTP
 *         status code taken from a caught {@code HttpStatusException}, if any
 */
public DownloadResult downloadContent(String url) {
    LOG.debug("Downloading content from {}", url);
    // In case of a new redirection for that source, use redirected URL
    URLConnection urlConnection = null;
    String originalUrl = null;
    org.jsoup.nodes.Document jsoupDocument = null;
    Integer statusCode = null;
    final LocalDateTime downloaded = LocalDateTime.now();
    try {
        urlConnection = getConnection(url);
        // Server-side redirection
        final String responseUrl = getResponseUrl(urlConnection);
        if (responseUrl != null) {
            final String redirectedUrl = getRedirectedUrl(url, responseUrl);
            if (isValidRedirection(url, redirectedUrl)) {
                originalUrl = url;
                url = redirectedUrl;
                closeUrlConnection(urlConnection);
                urlConnection = getConnection(url);
                LOG.debug("Redirection (server): {} -> {}", originalUrl, url);
            }
        }
        // Download content for further redirects
        final InputStream urlInputStream = asInputStream(urlConnection, true, false);
        final Charset charset = getCharset(urlConnection);
        jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url);
        final Elements metaTagsHtmlHeadLink;
        Elements metaTagsHtmlHeadMeta = null;
        // Meta redirection (<link rel="canonical" .../>)
        if (originalUrl == null) {
            metaTagsHtmlHeadLink = jsoupDocument.select("html head link");
            for (final Element metaTag : metaTagsHtmlHeadLink) {
                final Attributes metaTagAttributes = metaTag.attributes();
                if (metaTagAttributes.hasKey("rel")
                        && metaTagAttributes.get("rel").equalsIgnoreCase("canonical")
                        && metaTagAttributes.hasKey("href")) {
                    final String redirectedUrl = metaTagAttributes.get("href").trim();
                    if (isValidRedirection(url, redirectedUrl)) {
                        originalUrl = url;
                        url = redirectedUrl;
                        // Discard the parsed document so that the redirected URL is downloaded below
                        jsoupDocument = null;
                        LOG.debug("Redirection (<link rel=\"canonical\" .../>): {} -> {}", originalUrl, url);
                        break;
                    }
                }
            }
        }
        // Meta redirection (<meta http-equiv="refresh" .../>)
        if (originalUrl == null) {
            metaTagsHtmlHeadMeta = jsoupDocument.select("html head meta");
            for (final Element metaTag : metaTagsHtmlHeadMeta) {
                final Attributes metaTagAttributes = metaTag.attributes();
                if (metaTagAttributes.hasKey("http-equiv")
                        && metaTagAttributes.get("http-equiv").equalsIgnoreCase("refresh")
                        && metaTagAttributes.hasKey("content")) {
                    // The redirect target is whatever follows the first '=' once spaces are removed
                    final String[] parts = metaTagAttributes.get("content").replace(" ", "").split("=", 2);
                    if (parts.length > 1) {
                        final String redirectedUrl = parts[1];
                        if (isValidRedirection(url, redirectedUrl)) {
                            originalUrl = url;
                            url = redirectedUrl;
                            jsoupDocument = null;
                            LOG.debug("Redirection (<meta http-equiv=\"refresh\" .../>): {} -> {}", originalUrl,
                                    url);
                            break;
                        }
                    }
                }
            }
        }
        // Meta redirection (<meta property="og:url" .../>)
        // (metaTagsHtmlHeadMeta is always set here: reaching this block with originalUrl == null
        // means the previous block was entered)
        if (originalUrl == null) {
            for (final Element metaTag : metaTagsHtmlHeadMeta) {
                final Attributes metaTagAttributes = metaTag.attributes();
                if (metaTagAttributes.hasKey("property")
                        && metaTagAttributes.get("property").equalsIgnoreCase("og:url")
                        && metaTagAttributes.hasKey("content")) {
                    final String redirectedUrl = metaTagAttributes.get("content").trim();
                    if (isValidRedirection(url, redirectedUrl)) {
                        originalUrl = url;
                        url = redirectedUrl;
                        jsoupDocument = null;
                        LOG.debug("Redirection (<meta property=\"og:url\" .../>): {} -> {}", originalUrl, url);
                        break;
                    }
                }
            }
        }
    } catch (final Exception e) {
        if (e instanceof HttpStatusException) {
            statusCode = ((HttpStatusException) e).getStatusCode();
        }
        LOG.warn("Could not determine redirected URL for " + url, e);
    } finally {
        closeUrlConnection(urlConnection);
    }
    // Download content (if not yet done)
    String content = null;
    try {
        if (jsoupDocument == null) {
            LOG.debug("Downloading content from {}", url);
            urlConnection = getConnection(url);
            final InputStream urlInputStream = asInputStream(urlConnection, true, false);
            final Charset charset = getCharset(urlConnection);
            jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url);
        }
    } catch (final Exception e) {
        if (e instanceof HttpStatusException) {
            statusCode = ((HttpStatusException) e).getStatusCode();
        }
        // Without a redirection there is nothing to fall back to
        if (originalUrl == null) {
            LOG.warn("Could not download content from " + url, e);
        }
        // If the redirected URL does not exist and a original URL is available, use the
        // original URL instead
        else {
            try {
                LOG.debug(
                        "Could not download content from redirected URL {}, downloading content from original URL {} instead",
                        url, originalUrl);
                urlConnection = getConnection(originalUrl);
                final InputStream urlInputStream = asInputStream(urlConnection, true, false);
                final Charset charset = getCharset(urlConnection);
                // The content comes from the original URL, so that is also its base URI
                jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), originalUrl);
                url = originalUrl;
                originalUrl = null;
                statusCode = null;
            } catch (final Exception e2) {
                // Report the URL and the exception of the fallback attempt, not of the first one
                LOG.warn("Could not download content from original URL " + originalUrl, e2);
            }
        }
    } finally {
        closeUrlConnection(urlConnection);
    }
    if (jsoupDocument != null) {
        content = jsoupDocument.html();
    }
    // Strip non-valid characters as specified by the XML 1.0 standard
    final String validContent = xmlUtil.stripNonValidXMLCharacters(content);
    // Unescape HTML characters
    final String unescapedContent = StringEscapeUtils.unescapeHtml4(validContent);
    // Done
    final DownloadResult downloadResult = new DownloadResult(originalUrl, url, unescapedContent, downloaded,
            statusCode);
    return downloadResult;
}

From source file:com.google.publicalerts.cap.CapUtil.java

/**
 * @return {@true} if the input string contains HTML entities, {@code false}
 * otherwise/*from   www . j a  v a2 s  .com*/
 */
public static boolean containsHtmlEntities(String s) {
    return !StringEscapeUtils.unescapeHtml4(s).equals(s);
}

From source file:com.sangupta.comparator.HTMLComparer.java

/**
 * Test presence of each attribute from <code>st1</code> in <code>st2</code>. Also, the
 * values should be identical./*from w  w  w . j  ava  2 s.  com*/
 * 
 * @param st1
 * @param st2
 * @return
 */
private static boolean testAttributes(StartTag st1, StartTag st2) {
    List<Attribute> attributes1 = st1.getAttributes();

    if (attributes1.size() == 0) {
        return true;
    }

    for (Attribute attribute1 : attributes1) {
        String value2 = st2.getAttributeValue(attribute1.getName());
        if (value2 == null) {
            System.out.println("Attribute not present in stream2: attribute1=" + attribute1.getBegin()
                    + "; tag2=" + st2.getBegin());
            return false;
        }

        String value1 = StringEscapeUtils.unescapeHtml4(attribute1.getValue());
        value2 = StringEscapeUtils.unescapeHtml4(value2);

        if (!(value1.equals(value2))) {
            System.out.println("Attribute value mismatch: attribute1=" + attribute1.getBegin() + "; tag2="
                    + st2.getBegin());
            return false;
        }
    }

    return true;
}