Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringEscapeUtils unescapeHtml4.

Prototype

public static final String unescapeHtml4(final String input) 

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:com.romeikat.datamessie.core.processing.task.documentProcessing.cleaning.DocumentCleaner.java

/**
 * Produces the cleaned content of a document in three steps: content extraction,
 * boilerplate removal and HTML entity unescaping.
 *
 * @param statelessSession         session handed through to the tag extractor
 * @param documentsProcessingCache cache handed through to the tag extractor
 * @param document                 document being processed
 * @param rawContent               raw content the text is extracted from
 * @return a result wrapping the cleaned, unescaped content
 * @throws Exception declared by the signature; the visible code does not show which step throws
 */
public DocumentCleaningResult clean(final StatelessSession statelessSession,
        final DocumentsProcessingCache documentsProcessingCache, final Document document,
        final RawContent rawContent) throws Exception {
    // Step 1: pull the relevant content out of the raw document.
    final String extracted = tagExctractor.extractContent(statelessSession, documentsProcessingCache,
            rawContent, document);

    // Step 2: strip boilerplate from the extracted content.
    final String withoutBoilerplate = boilerplateRemover.removeBoilerplate(extracted);

    // Step 3: turn HTML entity escapes back into the characters they stand for.
    final String unescaped = StringEscapeUtils.unescapeHtml4(withoutBoilerplate);

    return new DocumentCleaningResult(unescaped);
}

From source file:jobhunter.cb.Client.java

/**
 * Fetches the page at {@code url} with jsoup and maps it to a {@link Job}.
 *
 * <p>Progress is reported through {@code update} in three steps (connecting, parsing,
 * done). The description is the concatenation of the unescaped HTML of the
 * "pnlJobDescription" element and of the "section-body" elements inside
 * "job-requirements" and "job-snapshot-section", in that order.
 *
 * @return the populated job
 * @throws IOException        if fetching the page fails
 * @throws URISyntaxException if {@code url} cannot be parsed as a URI
 */
public Job execute() throws IOException, URISyntaxException {
    l.debug("Connecting to {}", url);

    update("Connecting", 1L);
    final Document page = Jsoup.connect(url).get();

    update("Parsing HTML", 2L);
    final Job result = Job.of();
    result.setPortal(CareerBuilderPlugin.portal);
    result.setLink(url);

    // The external id is the first "job_did" query parameter (name matched case-insensitively).
    URLEncodedUtils.parse(new URI(url), "UTF-8").stream()
            .filter(pair -> pair.getName().equalsIgnoreCase("job_did"))
            .findFirst()
            .ifPresent(pair -> result.setExtId(pair.getValue()));

    result.setPosition(page.getElementById("job-title").text());
    result.setAddress(page.getElementById("CBBody_Location").text());
    result.getCompany().setName(page.getElementById("CBBody_CompanyName").text());

    final StringBuilder text = new StringBuilder();

    // Main description panel first ...
    text.append(StringEscapeUtils.unescapeHtml4(page.getElementById("pnlJobDescription").html()));

    // ... followed by the body of each additional section, in page order.
    for (String sectionId : new String[] { "job-requirements", "job-snapshot-section" }) {
        final Element section = page.getElementById(sectionId);
        text.append(StringEscapeUtils.unescapeHtml4(section.getElementsByClass("section-body").html()));
    }

    result.setDescription(text.toString());
    update("Done", 3L);
    return result;
}

From source file:com.wellsandwhistles.android.redditsp.fragments.CommentPropertiesDialog.java

/**
 * Adds one property row per attribute of the comment passed in the "comment"
 * fragment argument: ID, author, optional author flair, creation time, edit time
 * (or "never"), score, subreddit and, when present, the body with HTML entities
 * unescaped.
 *
 * @param context activity used to build the rows and format dates
 * @param items   layout the rows are appended to
 */
@Override
protected void prepare(AppCompatActivity context, LinearLayout items) {

    final RedditComment shown = getArguments().getParcelable("comment");

    items.addView(propView(context, "ID", shown.name, true));
    items.addView(propView(context, R.string.props_author, shown.author, false));

    // The flair row is only shown when there is flair text.
    final boolean hasFlair = shown.author_flair_text != null && shown.author_flair_text.length() > 0;
    if (hasFlair) {
        items.addView(propView(context, R.string.props_author_flair, shown.author_flair_text, false));
    }

    // Timestamps are multiplied by 1000 before formatting.
    items.addView(propView(context, R.string.props_created,
            SRTime.formatDateTime(shown.created_utc * 1000, context), false));

    // "edited" is only formatted as a time when it is a Long; anything else is shown as "never".
    if (!(shown.edited instanceof Long)) {
        items.addView(propView(context, R.string.props_edited, R.string.props_never, false));
    } else {
        items.addView(propView(context, R.string.props_edited,
                SRTime.formatDateTime((Long) shown.edited * 1000, context), false));
    }

    // Score is upvotes minus downvotes.
    items.addView(propView(context, R.string.props_score, String.valueOf(shown.ups - shown.downs), false));

    items.addView(propView(context, R.string.props_subreddit, shown.subreddit, false));

    final boolean hasBody = shown.body != null && shown.body.length() > 0;
    if (hasBody) {
        items.addView(propView(context, R.string.props_body_markdown,
                StringEscapeUtils.unescapeHtml4(shown.body), false));
    }
}

From source file:emily.command.fun.CatFactCommand.java

/**
 * Returns a cat fact with HTML entities unescaped, or the "not today" template
 * text when no fact could be obtained.
 */
@Override
public String execute(DiscordBot bot, String[] args, MessageChannel channel, User author,
        Message inputMessage) {
    final String fact = getCatFact();
    if (fact == null) {
        // No fact available: answer with the fallback template for this channel.
        return Templates.command.catfact_not_today.formatGuild(channel);
    }
    return StringEscapeUtils.unescapeHtml4(fact);
}

From source file:controllers.ClaController.java

/**
 * Returns the first value submitted for {@code key}, trimmed, with HTML 4 entities
 * unescaped and optionally truncated.
 *
 * <p>Truncation never cuts a surrogate pair in half: if the cut would fall between a
 * high and a low surrogate, the result is one char shorter than {@code maxLength}.
 *
 * @param key       name of the field to read
 * @param fields    submitted fields, keyed by field name
 * @param maxLength maximum length of the result in chars; zero or a negative value
 *                  disables truncation
 * @return the cleaned (and possibly truncated) value
 * @throws IllegalStateException if the field is absent, has no values, or its first
 *                               value is {@code null}
 */
public static String getField(String key, Map<String, String[]> fields, int maxLength) {
    String[] values = fields.get(key);
    // A null first element is treated like a missing field instead of failing with an NPE below.
    if (values == null || values.length == 0 || values[0] == null) {
        throw new IllegalStateException(key + " can not be missing");
    }
    String value = StringEscapeUtils.unescapeHtml4(values[0].trim());
    if (maxLength > 0 && value.length() > maxLength) {
        int end = maxLength;
        // Unescaping numeric entities can yield supplementary characters; do not leave
        // a dangling high surrogate at the end of the truncated value.
        if (Character.isHighSurrogate(value.charAt(end - 1)) && Character.isLowSurrogate(value.charAt(end))) {
            end--;
        }
        value = value.substring(0, end);
    }
    return value;
}

From source file:emily.command.fun.JokeCommand.java

/**
 * Sends the "please wait" template to the channel and passes a callback that later
 * replaces that message with a joke.
 *
 * <p>In 80 out of 100 equally likely draws the joke comes from the "reddit" command
 * with the argument "jokes"; otherwise it is fetched via {@code getJokeFromWeb} using
 * the URL-encoded author name. Occurrences of the author's name in the joke are
 * replaced by a mention before HTML entities are unescaped. If no joke is available
 * the message is replaced with the "not today" template instead.
 *
 * @return always the empty string; the visible output is produced through the callback
 */
@Override
public String execute(DiscordBot bot, String[] args, MessageChannel channel, User author,
        Message inputMessage) {
    bot.out.sendAsyncMessage(channel, Templates.command.joke_wait.formatGuild(channel), placeholder -> {
        String joke = "";
        final boolean useReddit = new Random().nextInt(100) < 80;
        if (!useReddit) {
            try {
                joke = getJokeFromWeb(URLEncoder.encode(author.getName(), "UTF-8"));
            } catch (UnsupportedEncodingException ignored) {
                // Deliberately ignored: the joke stays empty and the "not today" reply is used.
            }
        } else {
            joke = CommandHandler.getCommand("reddit").execute(bot, new String[] { "jokes" }, channel,
                    author, null);
        }

        if (joke == null || joke.isEmpty()) {
            bot.out.editAsync(placeholder, Templates.command.joke_not_today.formatGuild(channel));
            return;
        }

        // Turn the plain author name into a mention of the author.
        final String authorName = author.getName();
        final String mention = "<@" + author.getId() + ">";
        bot.out.editAsync(placeholder, StringEscapeUtils.unescapeHtml4(joke.replace(authorName, mention)));
    });
    return "";
}

From source file:jobhunter.dice.Client.java

/**
 * Fetches the page at {@code url} with jsoup and fills a {@link Job} from its
 * {@code <meta>} tags.
 *
 * <p>The "twitter:text:*" meta names are mapped to position, company name, address and
 * salary. The unescaped contents of "job_description_web" and "skills" are appended to
 * the description in the order the tags appear on the page.
 *
 * @return the populated job
 * @throws IOException        if fetching the page fails
 * @throws URISyntaxException declared by the signature; not thrown by the visible code
 */
public Job execute() throws IOException, URISyntaxException {
    l.debug("Connecting to {}", url);

    update("Connecting", 1L);
    final Document page = Jsoup.connect(url).get();

    update("Parsing HTML", 2L);
    final Job result = Job.of();
    result.setPortal(DicePlugin.portal);
    result.setLink(url);

    final StringBuilder text = new StringBuilder();

    for (Element tag : page.getElementsByTag("meta")) {
        l.debug("Checking {}", tag.toString());

        // Dispatch on the meta name; "content" is only read for names we care about.
        switch (tag.attr("name")) {
        case "twitter:text:job_title":
            result.setPosition(tag.attr("content"));
            break;
        case "twitter:text:company":
            result.getCompany().setName(tag.attr("content"));
            break;
        case "twitter:text:city":
            result.setAddress(tag.attr("content"));
            break;
        case "twitter:text:salary":
            result.setSalary(tag.attr("content"));
            break;
        case "twitter:text:job_description_web":
        case "twitter:text:skills":
            text.append(StringEscapeUtils.unescapeHtml4(tag.attr("content")));
            break;
        default:
            break;
        }
    }

    result.setDescription(text.toString());

    update("Done", 3L);
    return result;
}

From source file:com.nttec.everychan.chans.fourchan.FourchanJsonMapper.java

/**
 * Builds a {@code PostModel} from the JSON object of a single post.
 *
 * <p>Required keys are "no" and "time"; every other key is read with a default. Name and
 * subject have HTML entities unescaped, the comment has its spoiler tags rewritten and
 * is optionally linkified, and an attachment is only attached when the post carries a
 * non-zero "tim" value.
 *
 * @param object    JSON object describing the post
 * @param boardName board name, used to build the attachment and thumbnail paths
 * @return the populated model
 */
static PostModel mapPostModel(JSONObject object, String boardName) {
    final PostModel post = new PostModel();
    post.number = Long.toString(object.getLong("no"));

    final String nameWithoutSpans = RegexUtils.removeHtmlSpanTags(object.optString("name", "Anonymous"));
    post.name = StringEscapeUtils.unescapeHtml4(nameWithoutSpans);
    post.subject = StringEscapeUtils.unescapeHtml4(object.optString("sub", ""));

    final String markedUp = RegexUtils.replaceAll(object.optString("com", ""), S_TAG, "<$1aibspoiler>");
    if (LINKIFY) {
        post.comment = RegexUtils.linkify(markedUp);
    } else {
        post.comment = markedUp;
    }
    post.email = null;

    // The capcode, when there is one, is appended to the tripcode.
    post.trip = object.optString("trip", "");
    final String capcode = object.optString("capcode", "none");
    if (!"none".equals(capcode)) {
        post.trip += "##" + capcode;
    }

    final String country = object.optString("country", "");
    if (!country.isEmpty()) {
        final BadgeIconModel flag = new BadgeIconModel();
        // Formerly considered: (boardName.equals("pol") ? "troll/" : "") between the prefix and the code.
        flag.source = "s.4cdn.org/image/country/" + country.toLowerCase(Locale.US) + ".gif";
        flag.description = object.optString("country_name");
        post.icons = new BadgeIconModel[] { flag };
    }

    post.op = false;

    // The id "Heaven" marks the post as sage; any other non-empty id gets a colour.
    final String posterId = object.optString("id", "");
    final boolean heaven = posterId.equalsIgnoreCase("Heaven");
    post.sage = heaven;
    if (!posterId.isEmpty()) {
        post.name += (" ID:" + posterId);
        if (!heaven) {
            post.color = CryptoUtils.hashIdColor(posterId);
        }
    }

    post.timestamp = object.getLong("time") * 1000;

    // A "resto" of "0" means the post belongs to the thread with its own number.
    final String parent = object.optString("resto", "0");
    post.parentThread = "0".equals(parent) ? post.number : parent;

    final String ext = object.optString("ext", "");
    if (ext.isEmpty()) {
        return post;
    }

    final AttachmentModel file = new AttachmentModel();
    if (".jpg".equals(ext) || ".png".equals(ext)) {
        file.type = AttachmentModel.TYPE_IMAGE_STATIC;
    } else if (".gif".equals(ext)) {
        file.type = AttachmentModel.TYPE_IMAGE_GIF;
    } else if (".webm".equals(ext)) {
        file.type = AttachmentModel.TYPE_VIDEO;
    } else {
        file.type = AttachmentModel.TYPE_OTHER_FILE;
    }

    // Positive sizes are divided by 1024 and rounded; -1 (absent) and 0 are kept as they are.
    final int rawSize = object.optInt("fsize", -1);
    file.size = rawSize > 0 ? Math.round(rawSize / 1024f) : rawSize;
    file.width = object.optInt("w", -1);
    file.height = object.optInt("h", -1);
    file.originalName = object.optString("filename", "") + ext;
    file.isSpoiler = object.optInt("spoiler") == 1;

    // Without a "tim" value no paths can be built, so the attachment is dropped.
    final long tim = object.optLong("tim");
    if (tim != 0) {
        final String boardPath = "/" + boardName + "/";
        file.thumbnail = "t.4cdn.org" + boardPath + tim + "s.jpg";
        file.path = "i.4cdn.org" + boardPath + tim + ext;
        post.attachments = new AttachmentModel[] { file };
    }
    return post;
}

From source file:co.foxdev.foxbot.utils.Utils.java

/**
 * Fetches a URL posted in chat and builds a one-line, colourised summary of it.
 *
 * <p>Depending on the response the summary is an error line (status other than
 * 200/301/302), a content-type/size line (non-HTML content), a YouTube or Reddit
 * specific line, or a generic title/content-type/size line.
 *
 * <p>A missing, malformed or out-of-range Content-Length header is reported as
 * "Unknown" size, and a missing Content-Type header as "Unknown" content type, instead
 * of aborting the whole lookup.
 *
 * @param stringToParse the URL to fetch
 * @param sender        user who posted the URL; the nick is used in the summary
 * @return the summary line, or {@code null} if the URL is rejected with an
 *         {@link IllegalArgumentException} or any other error occurs (the latter is logged)
 */
public static String parseChatUrl(String stringToParse, User sender) {
    try {
        Connection conn = Jsoup.connect(stringToParse);

        conn.followRedirects(true).userAgent(
                "FoxBot // http://foxbot.foxdev.co // Seeing this? It means your web address was posted on IRC and FoxBot is getting page info (title, size, content type) to send to the channel. Nothing to worry about.")
                .timeout(3000).maxBodySize(100000).ignoreContentType(true);

        Connection.Response response = conn.execute();
        Document doc = response.parse();
        String size = formatContentLength(response.header("Content-Length"));

        // The Content-Type header may be absent; keep only the media type before any ';' parameters.
        String rawContentType = response.contentType();
        String contentType;
        if (rawContentType == null) {
            contentType = "Unknown";
        } else {
            int paramStart = rawContentType.indexOf(';');
            contentType = paramStart >= 0 ? rawContentType.substring(0, paramStart) : rawContentType;
        }

        if (response.statusCode() != 200 && response.statusCode() != 302 && response.statusCode() != 301) {
            return colourise(String.format("(%s's URL) &cError: &r%s %s ", munge(sender.getNick()),
                    response.statusCode(), response.statusMessage()));
        }

        if (!contentType.contains("html")) {
            return colourise(String.format("(%s's URL) &2Content Type: &r%s &2Size: &r%s",
                    munge(sender.getNick()), contentType, size));
        }

        String title = doc.title() == null || doc.title().isEmpty() ? "No title found" : doc.title();

        if (stringToParse.matches("^https?://(www\\.)?youtube\\.com/watch.*")) {
            title = doc.select("span#eow-title").first().text();
            String views = doc.select("span.watch-view-count").first().text();
            String likes = doc.select("span.likes-count").first().text();
            String dislikes = doc.select("span.dislikes-count").first().text();
            String uploader = doc.select("a.g-hovercard.yt-uix-sessionlink.yt-user-name").first().text();

            return colourise(String.format(
                    "(%s's URL) &2Title: &r%s &2Uploader: &r%s &2Views: &r%s &2Rating: &a%s&r/&c%s",
                    munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), uploader, views, likes,
                    dislikes));
        }

        if (stringToParse.matches("^https?://(www\\.)?reddit\\.com/r/.*/comments/.*")) {
            String poster = doc.select("p.tagline").select("a.author").text().split(" ")[0];
            String comments = doc.select("a.comments").first().text().split(" ")[0];
            String likes = doc.select("span.upvotes").first().text().split(" ")[0];
            String dislikes = doc.select("span.downvotes").first().text().split(" ")[0];

            return colourise(String.format(
                    "(%s's URL) &2Title: &r%s &2Poster: &r%s &2Comments: &r%s &2Rating: &6%s&r/&9%s",
                    munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), poster, comments, likes,
                    dislikes));
        }
        return colourise(String.format("(%s's URL) &2Title: &r%s &2Content Type: &r%s &2Size: &r%s",
                munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), contentType, size));
    } catch (IllegalArgumentException ignored) {
        // Deliberately silent. NOTE(review): presumably this is what Jsoup.connect throws for
        // text that is not a fetchable URL - confirm.
    } catch (Exception ex) {
        foxbot.getLogger().error("Error occurred while parsing URL", ex);
    }
    return null;
}

/**
 * Formats a Content-Length header value as whole kilobytes, e.g. "12kb".
 * Returns "Unknown" when the header is absent or is not a valid number.
 */
private static String formatContentLength(String contentLength) {
    if (contentLength == null) {
        return "Unknown";
    }
    try {
        // Parsed as long so bodies larger than Integer.MAX_VALUE bytes are still reported.
        return (Long.parseLong(contentLength.trim()) / 1024) + "kb";
    } catch (NumberFormatException ex) {
        // NumberFormatException is an IllegalArgumentException; letting it escape would
        // make the caller silently return null for an otherwise usable response.
        return "Unknown";
    }
}

From source file:com.datumbox.framework.utilities.text.cleaners.HTMLCleaner.java

/**
 * Removes non-text tags from the given markup, rewrites every match of
 * {@code REMOVE_ATTRIBUTES_PATTERN} to {@code <$1$2>} and finally unescapes HTML 4
 * entities.
 *
 * @param text markup to clean
 * @return the cleaned, unescaped text
 */
public static String removeNonTextTagsAndAttributes(String text) {
    final String withoutNonTextTags = removeNonTextTags(text);

    // replaceAll returns its input unchanged when nothing matches, so no prior find() is needed.
    final String withoutAttributes = REMOVE_ATTRIBUTES_PATTERN.matcher(withoutNonTextTags)
            .replaceAll("<$1$2>");

    return StringEscapeUtils.unescapeHtml4(withoutAttributes);
}