List of usage examples for org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4
public static final String unescapeHtml4(final String input)
Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.
From source file:com.nttec.everychan.ui.presentation.Subscriptions.java
/**
 * Converts an HTML fragment into plain comment text.
 *
 * <p>The steps are applied in this order: every {@code <br>} / {@code <p>} tag (optionally
 * self-closed) is replaced by a single space, the result is passed through
 * {@code RegexUtils.removeHtmlTags}, and finally HTML entities are unescaped.
 *
 * @param html the HTML fragment; must not be {@code null}
 * @return the text produced by the steps above
 */
private static String htmlToComment(String html) {
    String breaksAsSpaces = html.replaceAll("<(br|p)/?>", " ");
    String withoutTags = RegexUtils.removeHtmlTags(breaksAsSpaces);
    return StringEscapeUtils.unescapeHtml4(withoutTags);
}
From source file:net.chuzarski.crowdednews.utils.reddit.RedditRequest.java
/** * Creates a single RedditPost object from the given index of the postsJSONArray * @return RedditPost//ww w .j a v a 2 s .c om * @throws JSONException */ public RedditPost createSingleRedditPost(JSONObject postObj) throws JSONException, RedditException { RedditPost post; //all data in a post String title; String url; String name; String id; String domain; long createdUTC; boolean stickied; title = StringEscapeUtils.unescapeHtml4(postObj.getString("title")); name = postObj.getString("name"); url = postObj.getString("url"); id = postObj.getString("id"); stickied = postObj.getBoolean("stickied"); createdUTC = postObj.getLong("created_utc"); domain = postObj.getString("domain"); //meet our conditions if (domain.contains("self.")) { throw new RedditException(RedditErrors.REDDIT_SELF_POST); } post = new RedditPost.Builder(title, url).redditName(name).redditId(id).isStickied(stickied) .timeCreated(createdUTC).linkDomain(domain).build(); return post; }
From source file:com.romeikat.datamessie.core.rss.task.maintenance.MaintenanceTask.java
private void unescapeHtmlCharsFromContent(final TaskExecution taskExecution) throws TaskCancelledException { final HibernateSessionProvider sessionProvider = new HibernateSessionProvider(sessionFactory); // Get all IDs final List<Long> ids = documentDao.getIds(sessionProvider.getStatelessSession()); sessionProvider.closeStatelessSession(); // Process IDs in batches final List<List<Long>> batches = CollectionUtil.splitIntoSubListsBySize(ids, batchSize); for (final List<Long> batch : batches) { new ParallelProcessing<Long>(sessionFactory, batch) { @Override//from ww w.j ava 2 s. co m public void doProcessing(final HibernateSessionProvider sessionProvider, final Long documentId) { // Unescape characters final RawContent rawContent = rawContentDao.getEntity(sessionProvider.getStatelessSession(), documentId); if (rawContent == null) { return; } final String content = rawContent.getContent(); final String unescapedContent = StringEscapeUtils.unescapeHtml4(content); // Remove any preprocessed information rawContent.setContent(unescapedContent); rawContentDao.update(sessionProvider.getStatelessSession(), rawContent); final Document document = documentDao.getEntity(sessionProvider.getStatelessSession(), documentId); document.setState(DocumentProcessingState.DOWNLOADED); documentDao.update(sessionProvider.getStatelessSession(), document); final List<NamedEntityOccurrence> namedEntities = namedEntityOccurrenceDao .getByDocument(sessionProvider.getStatelessSession(), documentId); for (final NamedEntityOccurrence namedEntity : namedEntities) { namedEntityOccurrenceDao.delete(sessionProvider.getStatelessSession(), namedEntity); } LOG.info("Unescaped HTML characters from content of document {}", documentId); } }; taskExecution.checkpoint(); } }
From source file:com.github.naoghuman.cm.model.category.CategoryModel.java
/**
 * Restores this model from the given input.
 *
 * <p>Three {@code long} values are read and applied as id, matrix id and generation time,
 * followed by two objects that are converted with {@code String.valueOf}, HTML-unescaped and
 * applied as title and description. Each value is applied before the next one is read.
 *
 * @param in the input to read from
 * @throws IOException if reading from {@code in} fails
 * @throws ClassNotFoundException if the class of a stored object cannot be found
 */
@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    final long id = in.readLong();
    this.setId(id);

    final long matrixId = in.readLong();
    this.setMatrixId(matrixId);

    final long generationTime = in.readLong();
    this.setGenerationTime(generationTime);

    final Object storedTitle = in.readObject();
    this.setTitle(StringEscapeUtils.unescapeHtml4(String.valueOf(storedTitle)));

    final Object storedDescription = in.readObject();
    this.setDescription(StringEscapeUtils.unescapeHtml4(String.valueOf(storedDescription)));
}
From source file:com.nttec.everychan.chans.krautchan.KrautReader.java
/**
 * Handles the filter with the given index by reading or skipping input up to the filter's
 * closing sequence ({@code FILTERS_CLOSE[filterIndex]}) and updating {@code currentPost}.
 *
 * <p>Filter indexes that match no case are ignored.
 *
 * @param filterIndex one of the {@code FILTER_*} constants
 * @throws IOException declared for the read/skip helpers
 */
private void handleFilter(int filterIndex) throws IOException {
    switch (filterIndex) {
        case FILTER_THREAD_END:
            finalizeThread();
            break;

        case FILTER_POSTNUMBER:
            currentPost.number = readUntilSequence(FILTERS_CLOSE[filterIndex]).trim();
            break;

        case FILTER_COUNTRYBALL:
        case FILTER_COUNTRYBALL_WAR:
            // The second argument tells parseIcon whether the "war" variant matched.
            parseIcon(readUntilSequence(FILTERS_CLOSE[filterIndex]), filterIndex == FILTER_COUNTRYBALL_WAR);
            break;

        case FILTER_SUBJECT:
            currentPost.subject =
                    StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
            currentPost.subject = CryptoUtils.fixCloudflareEmails(currentPost.subject);
            break;

        case FILTER_POSTERNAME:
            currentPost.name =
                    StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
            break;

        case FILTER_TRIPCODE:
            currentPost.trip +=
                    StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
            break;

        case FILTER_ADMINMARK:
            // Skip the remainder of the opening tag, then append the mark text to the trip.
            skipUntilSequence(">".toCharArray());
            currentPost.trip +=
                    StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
            break;

        case FILTER_DATE: {
            String dateText = readUntilSequence(FILTERS_CLOSE[filterIndex]);
            int extraMillis = 0;
            try {
                // Split off the part after the last dot; it is divided by 1000 and added to the
                // parsed time below.
                int lastDot = dateText.lastIndexOf('.');
                if (lastDot != -1) {
                    extraMillis = Integer.parseInt(dateText.substring(lastDot + 1)) / 1000;
                    dateText = dateText.substring(0, lastDot);
                }
            } catch (NumberFormatException ignored) {
                // Non-numeric suffix: keep the whole text and a zero offset.
            }
            try {
                currentPost.timestamp = KRAUT_DATEFORMAT.parse(dateText).getTime() + extraMillis;
            } catch (Exception e) {
                Logger.e(TAG, "unable to parse date", e);
            }
            break;
        }

        case FILTER_SAGE:
            currentPost.sage = true;
            break;

        case FILTER_ATTACHMENT:
        case FILTER_ATTACHMENT_OP: {
            // Each "</div>"-terminated chunk is handed to parseAttachment.
            String[] chunks = readUntilSequence(FILTERS_CLOSE[filterIndex]).split("</div>");
            for (int i = 0; i < chunks.length; i++) {
                parseAttachment(chunks[i]);
            }
            break;
        }

        case FILTER_START_COMMENT:
            skipUntilSequence(FILTERS_CLOSE[filterIndex]);
            currentPost.comment = readPostComment();
            finalizePost();
            break;

        case FILTER_OMITTEDPOSTS:
            parseOmittedString(readUntilSequence(FILTERS_CLOSE[filterIndex]));
            break;
    }
}
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByUi4j() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);// w w w . j av a 2 s .c o m scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Navigate to blank page. scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); Page page = browserEngine.navigate(scrapeView.getWebsiteUrl()); //page.show(); logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as HTML"); logger.info("View page as HTML"); String html = page.getDocument().getBody().getInnerHTML(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); html = StringEscapeUtils.unescapeHtml4(html); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!html.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(html); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } browserEngine.clearCookies(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }
From source file:com.nttec.everychan.api.AbstractVichanModule.java
/**
 * Maps the JSON representation of a post to a {@code PostModel}.
 *
 * <p>Name and subject are HTML-unescaped (span tags are stripped from the name first via
 * {@code RegexUtils.removeHtmlSpanTags}); the comment is taken over as is. A capcode other than
 * {@code "none"} is appended to the trip, a non-empty {@code country} yields a flag icon, a
 * non-empty {@code id} is appended to the name, and attachments are collected from the post
 * itself, its {@code extra_files} and its {@code embed} markup.
 *
 * @param object the JSON object of the post
 * @param boardName the board name, passed on to {@code mapAttachment}
 * @return the populated model
 */
protected PostModel mapPostModel(JSONObject object, String boardName) {
    PostModel model = new PostModel();
    model.number = Long.toString(object.getLong("no"));

    String nameWithoutSpans = RegexUtils.removeHtmlSpanTags(object.optString("name", "Anonymous"));
    model.name = StringEscapeUtils.unescapeHtml4(nameWithoutSpans);
    model.subject = StringEscapeUtils.unescapeHtml4(object.optString("sub", ""));
    model.comment = object.optString("com", "");
    model.email = object.optString("email", "");
    model.trip = object.optString("trip", "");

    String capcode = object.optString("capcode", "none");
    if (!capcode.equals("none")) {
        model.trip += "##" + capcode;
    }

    String countryCode = object.optString("country", "");
    if (!countryCode.isEmpty()) {
        BadgeIconModel flag = new BadgeIconModel();
        flag.source = "/static/flags/" + countryCode.toLowerCase(Locale.US) + ".png";
        flag.description = object.optString("country_name");
        model.icons = new BadgeIconModel[] { flag };
    }

    model.op = false;

    // The id "Heaven" marks a post as sage and gets no color.
    String id = object.optString("id", "");
    boolean isHeaven = id.equalsIgnoreCase("Heaven");
    model.sage = isHeaven || model.email.toLowerCase(Locale.US).contains("sage");
    if (!id.isEmpty()) {
        model.name += (" ID:" + id);
        if (!isHeaven) {
            model.color = CryptoUtils.hashIdColor(id);
        }
    }

    model.timestamp = object.getLong("time") * 1000;

    // A "resto" of "0" means the post is its own parent thread.
    String parent = object.optString("resto", "0");
    model.parentThread = parent.equals("0") ? model.number : parent;

    boolean isSpoiler = object.optInt("spoiler") == 1;
    List<AttachmentModel> attachments = null;
    AttachmentModel rootAttachment = mapAttachment(object, boardName, isSpoiler);
    if (rootAttachment != null) {
        // Extra files are only considered when the post has a root attachment.
        attachments = new ArrayList<>();
        attachments.add(rootAttachment);
        JSONArray extraFiles = object.optJSONArray("extra_files");
        int extraCount = (extraFiles == null) ? 0 : extraFiles.length();
        for (int i = 0; i < extraCount; ++i) {
            AttachmentModel extra = mapAttachment(extraFiles.getJSONObject(i), boardName, isSpoiler);
            if (extra != null) {
                attachments.add(extra);
            }
        }
    }

    String embed = object.optString("embed", "");
    if (!embed.isEmpty()) {
        AttachmentModel embedAttachment = new AttachmentModel();
        embedAttachment.type = AttachmentModel.TYPE_OTHER_NOTFILE;
        Matcher linkMatcher = ATTACHMENT_EMBEDDED_LINK.matcher(embed);
        if (linkMatcher.find()) {
            // Protocol-relative URLs get the scheme selected by useHttps().
            String path = linkMatcher.group(1);
            if (path.startsWith("//")) {
                path = (useHttps() ? "https:" : "http:") + path;
            }
            embedAttachment.path = path;

            Matcher thumbMatcher = ATTACHMENT_EMBEDDED_THUMB.matcher(embed);
            if (thumbMatcher.find()) {
                String thumbnail = thumbMatcher.group(1);
                if (thumbnail.startsWith("//")) {
                    thumbnail = (useHttps() ? "https:" : "http:") + thumbnail;
                }
                embedAttachment.thumbnail = thumbnail;
            }

            embedAttachment.isSpoiler = isSpoiler;
            embedAttachment.size = -1;

            if (attachments == null) {
                attachments = Collections.singletonList(embedAttachment);
            } else {
                attachments.add(embedAttachment);
            }
        }
    }

    if (attachments != null) {
        model.attachments = attachments.toArray(new AttachmentModel[attachments.size()]);
    }
    return model;
}
From source file:com.romeikat.datamessie.core.base.service.download.ContentDownloader.java
public DownloadResult downloadContent(String url) { LOG.debug("Downloading content from {}", url); // In case of a new redirection for that source, use redirected URL URLConnection urlConnection = null; String originalUrl = null;// w w w .ja va 2 s . c om org.jsoup.nodes.Document jsoupDocument = null; Integer statusCode = null; final LocalDateTime downloaded = LocalDateTime.now(); try { urlConnection = getConnection(url); // Server-side redirection final String responseUrl = getResponseUrl(urlConnection); if (responseUrl != null) { final String redirectedUrl = getRedirectedUrl(url, responseUrl); if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; closeUrlConnection(urlConnection); urlConnection = getConnection(url); LOG.debug("Redirection (server): {} -> {}", originalUrl, url); } } // Download content for further redirects final InputStream urlInputStream = asInputStream(urlConnection, true, false); final Charset charset = getCharset(urlConnection); jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url); final Elements metaTagsHtmlHeadLink; Elements metaTagsHtmlHeadMeta = null; // Meta redirection (<link rel="canonical" .../>) if (originalUrl == null) { metaTagsHtmlHeadLink = jsoupDocument.select("html head link"); for (final Element metaTag : metaTagsHtmlHeadLink) { final Attributes metaTagAttributes = metaTag.attributes(); if (metaTagAttributes.hasKey("rel") && metaTagAttributes.get("rel").equalsIgnoreCase("canonical") && metaTagAttributes.hasKey("href")) { final String redirectedUrl = metaTagAttributes.get("href").trim(); if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; jsoupDocument = null; LOG.debug("Redirection (<link rel=\"canonical\" .../>): {} -> {}", originalUrl, url); break; } } } } // Meta redirection (<meta http-equiv="refresh" .../>) if (originalUrl == null) { metaTagsHtmlHeadMeta = jsoupDocument.select("html head meta"); for (final Element metaTag : metaTagsHtmlHeadMeta) 
{ final Attributes metaTagAttributes = metaTag.attributes(); if (metaTagAttributes.hasKey("http-equiv") && metaTagAttributes.get("http-equiv").equalsIgnoreCase("refresh") && metaTagAttributes.hasKey("content")) { final String[] parts = metaTagAttributes.get("content").replace(" ", "").split("=", 2); if (parts.length > 1) { final String redirectedUrl = parts[1]; if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; jsoupDocument = null; LOG.debug("Redirection (<meta http-equiv=\"refresh\" .../>): {} -> {}", originalUrl, url); break; } } } } } // Meta redirection (<meta property="og:url" .../>) if (originalUrl == null) { for (final Element metaTag : metaTagsHtmlHeadMeta) { final Attributes metaTagAttributes = metaTag.attributes(); if (metaTagAttributes.hasKey("property") && metaTagAttributes.get("property").equalsIgnoreCase("og:url") && metaTagAttributes.hasKey("content")) { final String redirectedUrl = metaTagAttributes.get("content").trim(); if (isValidRedirection(url, redirectedUrl)) { originalUrl = url; url = redirectedUrl; jsoupDocument = null; LOG.debug("Redirection (<meta property=\"og:url\" .../>): {} -> {}", originalUrl, url); break; } } } } } catch (final Exception e) { if (e instanceof HttpStatusException) { statusCode = ((HttpStatusException) e).getStatusCode(); } LOG.warn("Could not determine redirected URL for " + url, e); } finally { closeUrlConnection(urlConnection); } // Download content (if not yet done) String content = null; try { if (jsoupDocument == null) { LOG.debug("Downloading content from {}", url); urlConnection = getConnection(url); final InputStream urlInputStream = asInputStream(urlConnection, true, false); final Charset charset = getCharset(urlConnection); jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url); } } catch (final Exception e) { if (e instanceof HttpStatusException) { statusCode = ((HttpStatusException) e).getStatusCode(); } // If the redirected URL does not exist, use the original 
URL instead if (originalUrl == null) { LOG.warn("Could not download content from " + url, e); } // If the redirected URL does not exist and a original URL is available, use the // original URL instead else { try { LOG.debug( "Could not download content from redirected URL {}, downloading content from original URL {} instead", url, originalUrl); urlConnection = getConnection(originalUrl); final InputStream urlInputStream = asInputStream(urlConnection, true, false); final Charset charset = getCharset(urlConnection); jsoupDocument = Jsoup.parse(urlInputStream, charset.name(), url); url = originalUrl; originalUrl = null; statusCode = null; } catch (final Exception e2) { LOG.warn("Could not download content from original URL " + url, e); } } } finally { closeUrlConnection(urlConnection); } if (jsoupDocument != null) { content = jsoupDocument.html(); } // Strip non-valid characters as specified by the XML 1.0 standard final String validContent = xmlUtil.stripNonValidXMLCharacters(content); // Unescape HTML characters final String unescapedContent = StringEscapeUtils.unescapeHtml4(validContent); // Done final DownloadResult downloadResult = new DownloadResult(originalUrl, url, unescapedContent, downloaded, statusCode); return downloadResult; }
From source file:com.google.publicalerts.cap.CapUtil.java
/** * @return {@true} if the input string contains HTML entities, {@code false} * otherwise/*from www . j a v a2 s .com*/ */ public static boolean containsHtmlEntities(String s) { return !StringEscapeUtils.unescapeHtml4(s).equals(s); }
From source file:com.sangupta.comparator.HTMLComparer.java
/** * Test presence of each attribute from <code>st1</code> in <code>st2</code>. Also, the * values should be identical./*from w w w . j ava 2 s. com*/ * * @param st1 * @param st2 * @return */ private static boolean testAttributes(StartTag st1, StartTag st2) { List<Attribute> attributes1 = st1.getAttributes(); if (attributes1.size() == 0) { return true; } for (Attribute attribute1 : attributes1) { String value2 = st2.getAttributeValue(attribute1.getName()); if (value2 == null) { System.out.println("Attribute not present in stream2: attribute1=" + attribute1.getBegin() + "; tag2=" + st2.getBegin()); return false; } String value1 = StringEscapeUtils.unescapeHtml4(attribute1.getValue()); value2 = StringEscapeUtils.unescapeHtml4(value2); if (!(value1.equals(value2))) { System.out.println("Attribute value mismatch: attribute1=" + attribute1.getBegin() + "; tag2=" + st2.getBegin()); return false; } } return true; }