List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4
public static final String unescapeHtml4(final String input)
Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.
From source file:com.romeikat.datamessie.core.processing.task.documentProcessing.cleaning.DocumentCleaner.java
public DocumentCleaningResult clean(final StatelessSession statelessSession, final DocumentsProcessingCache documentsProcessingCache, final Document document, final RawContent rawContent) throws Exception { // Extract/*from w ww . j av a 2s . com*/ final String extractedContent = tagExctractor.extractContent(statelessSession, documentsProcessingCache, rawContent, document); // Remove boilerplate String cleanedContent = boilerplateRemover.removeBoilerplate(extractedContent); cleanedContent = StringEscapeUtils.unescapeHtml4(cleanedContent); // Done return new DocumentCleaningResult(cleanedContent); }
From source file:jobhunter.cb.Client.java
public Job execute() throws IOException, URISyntaxException { l.debug("Connecting to {}", url); update("Connecting", 1L); final Document doc = Jsoup.connect(url).get(); update("Parsing HTML", 2L); final Job job = Job.of(); job.setPortal(CareerBuilderPlugin.portal); job.setLink(url);//from www.j av a 2 s . c om URLEncodedUtils.parse(new URI(url), "UTF-8").stream() .filter(nvp -> nvp.getName().equalsIgnoreCase("job_did")).findFirst() .ifPresent(param -> job.setExtId(param.getValue())); job.setPosition(doc.getElementById("job-title").text()); job.setAddress(doc.getElementById("CBBody_Location").text()); job.getCompany().setName(doc.getElementById("CBBody_CompanyName").text()); StringBuilder description = new StringBuilder(); description.append(StringEscapeUtils.unescapeHtml4(doc.getElementById("pnlJobDescription").html())); Element div = doc.getElementById("job-requirements"); description.append(StringEscapeUtils.unescapeHtml4(div.getElementsByClass("section-body").html())); div = doc.getElementById("job-snapshot-section"); description.append(StringEscapeUtils.unescapeHtml4(div.getElementsByClass("section-body").html())); job.setDescription(description.toString()); update("Done", 3L); return job; }
From source file:com.wellsandwhistles.android.redditsp.fragments.CommentPropertiesDialog.java
@Override protected void prepare(AppCompatActivity context, LinearLayout items) { final RedditComment comment = getArguments().getParcelable("comment"); items.addView(propView(context, "ID", comment.name, true)); items.addView(propView(context, R.string.props_author, comment.author, false)); if (comment.author_flair_text != null && comment.author_flair_text.length() > 0) { items.addView(propView(context, R.string.props_author_flair, comment.author_flair_text, false)); }//www . j a v a2 s . c o m items.addView(propView(context, R.string.props_created, SRTime.formatDateTime(comment.created_utc * 1000, context), false)); if (comment.edited instanceof Long) { items.addView(propView(context, R.string.props_edited, SRTime.formatDateTime((Long) comment.edited * 1000, context), false)); } else { items.addView(propView(context, R.string.props_edited, R.string.props_never, false)); } items.addView(propView(context, R.string.props_score, String.valueOf(comment.ups - comment.downs), false)); items.addView(propView(context, R.string.props_subreddit, comment.subreddit, false)); if (comment.body != null && comment.body.length() > 0) { items.addView(propView(context, R.string.props_body_markdown, StringEscapeUtils.unescapeHtml4(comment.body), false)); } }
From source file:emily.command.fun.CatFactCommand.java
/**
 * Returns a cat fact with HTML entities unescaped, or the "not today" template text
 * for the channel when no fact could be obtained.
 */
@Override
public String execute(DiscordBot bot, String[] args, MessageChannel channel, User author,
        Message inputMessage) {
    final String fact = getCatFact();
    if (fact == null) {
        return Templates.command.catfact_not_today.formatGuild(channel);
    }
    return StringEscapeUtils.unescapeHtml4(fact);
}
From source file:controllers.ClaController.java
/**
 * Reads the first value submitted for {@code key}, trims it, unescapes HTML entities and
 * optionally truncates the result.
 *
 * @param key name of the field to read
 * @param fields submitted fields, each mapped to its array of values
 * @param maxLength maximum number of characters to keep; values of zero or less disable
 *        truncation
 * @return the trimmed, unescaped and possibly truncated value
 * @throws IllegalStateException if the field has no entry or an empty value array
 */
public static String getField(String key, Map<String, String[]> fields, int maxLength) {
    final String[] candidates = fields.get(key);
    if (candidates == null || candidates.length == 0) {
        throw new IllegalStateException(key + " can not be missing");
    }

    final String unescaped = StringEscapeUtils.unescapeHtml4(candidates[0].trim());
    if (maxLength <= 0) {
        return unescaped;
    }

    final int end = Math.min(unescaped.length(), maxLength);
    return unescaped.substring(0, end);
}
From source file:emily.command.fun.JokeCommand.java
/**
 * Posts the "please wait" template to the channel and, once that message exists, edits
 * it into a joke.
 *
 * <p>In 80 out of 100 random draws the joke comes from the {@code reddit} command with
 * the argument {@code "jokes"}; otherwise it is requested from the web using the
 * URL-encoded author name. Occurrences of the author's name in the joke are replaced by
 * a mention of the author before HTML entities are unescaped. When no joke is available
 * the message is edited to the "not today" template instead.
 *
 * @return always the empty string; the visible output is delivered asynchronously
 */
@Override
public String execute(DiscordBot bot, String[] args, MessageChannel channel, User author,
        Message inputMessage) {
    bot.out.sendAsyncMessage(channel, Templates.command.joke_wait.formatGuild(channel), message -> {
        String joke = "";
        final boolean useReddit = new Random().nextInt(100) < 80;
        if (useReddit) {
            joke = CommandHandler.getCommand("reddit").execute(bot, new String[] { "jokes" }, channel,
                    author, null);
        } else {
            try {
                final String encodedName = URLEncoder.encode(author.getName(), "UTF-8");
                joke = getJokeFromWeb(encodedName);
            } catch (UnsupportedEncodingException ignored) {
                // leaves the joke empty, which selects the "not today" branch below
            }
        }

        if (joke == null || joke.isEmpty()) {
            bot.out.editAsync(message, Templates.command.joke_not_today.formatGuild(channel));
        } else {
            final String mention = "<@" + author.getId() + ">";
            final String personalised = joke.replace(author.getName(), mention);
            bot.out.editAsync(message, StringEscapeUtils.unescapeHtml4(personalised));
        }
    });
    return "";
}
From source file:jobhunter.dice.Client.java
public Job execute() throws IOException, URISyntaxException { l.debug("Connecting to {}", url); update("Connecting", 1L); final Document doc = Jsoup.connect(url).get(); update("Parsing HTML", 2L); final Job job = Job.of(); job.setPortal(DicePlugin.portal);// www . j a va 2 s .c o m job.setLink(url); StringBuilder description = new StringBuilder(); for (Element meta : doc.getElementsByTag("meta")) { l.debug("Checking {}", meta.toString()); if (meta.attr("name").equals("twitter:text:job_title")) job.setPosition(meta.attr("content")); if (meta.attr("name").equals("twitter:text:company")) job.getCompany().setName(meta.attr("content")); if (meta.attr("name").equals("twitter:text:city")) job.setAddress(meta.attr("content")); if (meta.attr("name").equals("twitter:text:salary")) job.setSalary(meta.attr("content")); if (meta.attr("name").equals("twitter:text:job_description_web")) { description.append(StringEscapeUtils.unescapeHtml4(meta.attr("content"))); } if (meta.attr("name").equals("twitter:text:skills")) { description.append(StringEscapeUtils.unescapeHtml4(meta.attr("content"))); } } job.setDescription(description.toString()); update("Done", 3L); return job; }
From source file:com.nttec.everychan.chans.fourchan.FourchanJsonMapper.java
/**
 * Converts one post JSON object into a {@link PostModel}.
 *
 * <p>Fields read from the JSON: {@code no}, {@code name}, {@code sub}, {@code com},
 * {@code trip}, {@code capcode}, {@code country}, {@code country_name}, {@code id},
 * {@code time}, {@code resto}, and, when {@code ext} is present, the attachment fields
 * {@code fsize}, {@code w}, {@code h}, {@code filename}, {@code spoiler} and {@code tim}.
 * {@code no} and {@code time} are read with {@code getLong}; all others use the
 * {@code opt*} accessors with the defaults shown below.
 *
 * @param object JSON object describing a single post
 * @param boardName board name used to build the attachment and thumbnail paths
 * @return the populated model
 */
static PostModel mapPostModel(JSONObject object, String boardName) {
    final PostModel post = new PostModel();
    post.number = Long.toString(object.getLong("no"));

    final String rawName = object.optString("name", "Anonymous");
    post.name = StringEscapeUtils.unescapeHtml4(RegexUtils.removeHtmlSpanTags(rawName));
    post.subject = StringEscapeUtils.unescapeHtml4(object.optString("sub", ""));

    final String body = RegexUtils.replaceAll(object.optString("com", ""), S_TAG, "<$1aibspoiler>");
    if (LINKIFY) {
        post.comment = RegexUtils.linkify(body);
    } else {
        post.comment = body;
    }

    post.email = null;
    post.trip = object.optString("trip", "");
    final String capcode = object.optString("capcode", "none");
    if (!"none".equals(capcode)) {
        post.trip += "##" + capcode;
    }

    final String countryCode = object.optString("country", "");
    if (!countryCode.isEmpty()) {
        final BadgeIconModel flag = new BadgeIconModel();
        // a board-specific "troll/" path prefix for "pol" was disabled in the original source
        flag.source = "s.4cdn.org/image/country/" + countryCode.toLowerCase(Locale.US) + ".gif";
        flag.description = object.optString("country_name");
        post.icons = new BadgeIconModel[] { flag };
    }

    post.op = false;

    final String posterId = object.optString("id", "");
    final boolean heaven = posterId.equalsIgnoreCase("Heaven");
    post.sage = heaven;
    if (!posterId.isEmpty()) {
        post.name += (" ID:" + posterId);
        if (!heaven) {
            post.color = CryptoUtils.hashIdColor(posterId);
        }
    }

    post.timestamp = object.getLong("time") * 1000;

    post.parentThread = object.optString("resto", "0");
    if (post.parentThread.equals("0")) {
        // a "resto" of "0" makes the post its own parent thread
        post.parentThread = post.number;
    }

    final String ext = object.optString("ext", "");
    if (ext.isEmpty()) {
        return post;
    }

    final AttachmentModel file = new AttachmentModel();
    if (ext.equals(".jpg") || ext.equals(".png")) {
        file.type = AttachmentModel.TYPE_IMAGE_STATIC;
    } else if (ext.equals(".gif")) {
        file.type = AttachmentModel.TYPE_IMAGE_GIF;
    } else if (ext.equals(".webm")) {
        file.type = AttachmentModel.TYPE_VIDEO;
    } else {
        file.type = AttachmentModel.TYPE_OTHER_FILE;
    }

    file.size = object.optInt("fsize", -1);
    if (file.size > 0) {
        // positive sizes are divided by 1024 and rounded
        file.size = Math.round(file.size / 1024f);
    }
    file.width = object.optInt("w", -1);
    file.height = object.optInt("h", -1);
    file.originalName = object.optString("filename", "") + ext;
    file.isSpoiler = object.optInt("spoiler") == 1;

    final long tim = object.optLong("tim");
    if (tim != 0) {
        final String timText = Long.toString(tim);
        file.thumbnail = "t.4cdn.org/" + boardName + "/" + timText + "s.jpg";
        file.path = "i.4cdn.org/" + boardName + "/" + timText + ext;
        post.attachments = new AttachmentModel[] { file };
    }
    return post;
}
From source file:co.foxdev.foxbot.utils.Utils.java
public static String parseChatUrl(String stringToParse, User sender) { try {//from w w w . ja v a2s. com Connection conn = Jsoup.connect(stringToParse); conn.followRedirects(true).userAgent( "FoxBot // http://foxbot.foxdev.co // Seeing this? It means your web address was posted on IRC and FoxBot is getting page info (title, size, content type) to send to the channel. Nothing to worry about.") .timeout(3000).maxBodySize(100000).ignoreContentType(true); Connection.Response response = conn.execute(); Document doc = response.parse(); String size = response.header("Content-Length") == null ? "Unknown" : (Integer.parseInt(response.header("Content-Length")) / 1024) + "kb"; String contentType = response.contentType().contains(";") ? response.contentType().split(";")[0] : response.contentType(); if (response.statusCode() != 200 && response.statusCode() != 302 && response.statusCode() != 301) { return colourise(String.format("(%s's URL) &cError: &r%s %s ", munge(sender.getNick()), response.statusCode(), response.statusMessage())); } if (!contentType.contains("html")) { return colourise(String.format("(%s's URL) &2Content Type: &r%s &2Size: &r%s", munge(sender.getNick()), contentType, size)); } String title = doc.title() == null || doc.title().isEmpty() ? 
"No title found" : doc.title(); if (stringToParse.matches("^https?://(www\\.)?youtube\\.com/watch.*")) { title = doc.select("span#eow-title").first().text(); String views = doc.select("span.watch-view-count").first().text(); String likes = doc.select("span.likes-count").first().text(); String dislikes = doc.select("span.dislikes-count").first().text(); String uploader = doc.select("a.g-hovercard.yt-uix-sessionlink.yt-user-name").first().text(); return colourise(String.format( "(%s's URL) &2Title: &r%s &2Uploader: &r%s &2Views: &r%s &2Rating: &a%s&r/&c%s", munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), uploader, views, likes, dislikes)); } if (stringToParse.matches("^https?://(www\\.)?reddit\\.com/r/.*/comments/.*")) { String poster = doc.select("p.tagline").select("a.author").text().split(" ")[0]; String comments = doc.select("a.comments").first().text().split(" ")[0]; String likes = doc.select("span.upvotes").first().text().split(" ")[0]; String dislikes = doc.select("span.downvotes").first().text().split(" ")[0]; return colourise(String.format( "(%s's URL) &2Title: &r%s &2Poster: &r%s &2Comments: &r%s &2Rating: &6%s&r/&9%s", munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), poster, comments, likes, dislikes)); } return colourise(String.format("(%s's URL) &2Title: &r%s &2Content Type: &r%s &2Size: &r%s", munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), contentType, size)); } catch (IllegalArgumentException ignored) { } catch (Exception ex) { foxbot.getLogger().error("Error occurred while parsing URL", ex); } return null; }
From source file:com.datumbox.framework.utilities.text.cleaners.HTMLCleaner.java
public static String removeNonTextTagsAndAttributes(String text) { text = removeNonTextTags(text);//from w w w. j a va 2 s . c o m Matcher m = REMOVE_ATTRIBUTES_PATTERN.matcher(text); if (m.find()) { text = m.replaceAll("<$1$2>"); } text = StringEscapeUtils.unescapeHtml4(text); return text; }