List of utility methods for working with an HTML Jsoup Document
void | applyCacheKeysToResourceUrls(Document document, long pluginModifiedTimestamp, Locale locale) apply Cache Keys To Resource Urls String cacheKey = getCacheKeyPathSegments(pluginModifiedTimestamp, locale); Elements injectedScripts = document.select("script[data-spark-injected]"); for (Element script : injectedScripts) { script.attr("src", cacheKey + "/" + script.attr("src")); Elements injectedStyles = document.select("link[data-spark-injected]"); for (Element style : injectedStyles) { style.attr("href", cacheKey + "/" + style.attr("href")); ... |
org.jsoup.nodes.Document | convertLinksToAbsolute(String link, org.jsoup.nodes.Document doc) convert Links To Absolute doc.setBaseUri(getBaseLink(link)); Elements links = doc.select("a"); for (Element e : links) { e.setBaseUri(doc.baseUri()); if (!e.attr("href").startsWith("#")) { e.attr("href", e.attr("abs:href")); links = doc.select("img"); for (Element e : links) { e.setBaseUri(doc.baseUri()); e.attr("src", e.attr("abs:src")); links = doc.select("script"); for (Element e : links) { e.setBaseUri(doc.baseUri()); e.attr("src", e.attr("abs:src")); links = doc.select("link"); for (Element e : links) { e.setBaseUri(doc.baseUri()); e.attr("href", e.attr("abs:href")); return doc; |
String | detectLanguage(Document doc) detect Language Element htmlTag = doc.select("html").first(); if (htmlTag.attributes().hasKey("lang")) { return htmlTag.attr("lang"); if (htmlTag.attributes().hasKey("xml:lang")) { return htmlTag.attr("xml:lang"); return null; ... |
Document | emptyDocument() empty Document return Jsoup.parse(HTML_HEADER + HTML_FOOTER);
|
Document | formatDocument(Document doc) format Document doc.getElementsByTag("script").remove(); doc.getElementsByTag("link").attr("rel", "stylesheet").remove(); doc.getElementsByTag("style").remove(); doc.getElementsByTag("img").addClass("img-responsive"); return doc; |
String | getAllText(Document document) get All Text StringBuilder text = new StringBuilder(); for (TextNode textNode : document.textNodes()) { text.append(textNode.getWholeText()); return text.toString(); |
int[] | getCategoryIds(final Document html) get Category Ids final String breadcrumbData = html.getElementsByTag("script").stream().map(Element::data) .filter(data -> data.contains("PageTemplate.set({breadcrumb:")).findFirst().get(); final String regex = Pattern.quote("PageTemplate.set({breadcrumb: [") + "([0-9,-]+)" + Pattern.quote("]});"); final String[] categoryIds = getRegexGroup(breadcrumbData, regex, 1).get().split(","); return Stream.of(categoryIds).mapToInt(Integer::parseInt).toArray(); |
List | getContainersForLink(Document document, String link) get Containers For Link List<Element> elements = new ArrayList<>(); for (Element element : document.body().getAllElements()) { if (containsLink(element, link)) elements.add(element); return elements; |
Elements | getDivForClass(Document document, String className) get Div For Class return document.select("div[id= " + className + "]"); |
Document | getDocument(CloseableHttpClient client, String url) get Document return Jsoup.parse(getString(client, url));
|