Java Utility Methods: HTML Jsoup Document

List of utility methods for working with an HTML Jsoup Document

Description

The methods for working with an HTML Jsoup Document are organized by topic.

Method

voidapplyCacheKeysToResourceUrls(Document document, long pluginModifiedTimestamp, Locale locale)
apply Cache Keys To Resource Urls
String cacheKey = getCacheKeyPathSegments(pluginModifiedTimestamp, locale);
Elements injectedScripts = document.select("script[data-spark-injected]");
for (Element script : injectedScripts) {
    script.attr("src", cacheKey + "/" + script.attr("src"));
Elements injectedStyles = document.select("link[data-spark-injected]");
for (Element style : injectedStyles) {
    style.attr("href", cacheKey + "/" + style.attr("href"));
...
org.jsoup.nodes.DocumentconvertLinksToAbsolute(String link, org.jsoup.nodes.Document doc)
convert Links To Absolute
doc.setBaseUri(getBaseLink(link));
Elements links = doc.select("a");
for (Element e : links) {
    e.setBaseUri(doc.baseUri());
    if (!e.attr("href").startsWith("#")) {
        e.attr("href", e.attr("abs:href"));
links = doc.select("img");
for (Element e : links) {
    e.setBaseUri(doc.baseUri());
    e.attr("src", e.attr("abs:src"));
links = doc.select("script");
for (Element e : links) {
    e.setBaseUri(doc.baseUri());
    e.attr("src", e.attr("abs:src"));
links = doc.select("link");
for (Element e : links) {
    e.setBaseUri(doc.baseUri());
    e.attr("href", e.attr("abs:href"));
return doc;
StringdetectLanguage(Document doc)
detect Language
Element htmlTag = doc.select("html").first();
if (htmlTag.attributes().hasKey("lang")) {
    return htmlTag.attr("lang");
if (htmlTag.attributes().hasKey("xml:lang")) {
    return htmlTag.attr("xml:lang");
return null;
...
DocumentemptyDocument()
empty Document
return Jsoup.parse(HTML_HEADER + HTML_FOOTER);
DocumentformatDocument(Document doc)
format Document
doc.getElementsByTag("script").remove();
doc.getElementsByTag("link").attr("rel", "stylesheet").remove();
doc.getElementsByTag("style").remove();
doc.getElementsByTag("img").addClass("img-responsive");
return doc;
StringgetAllText(Document document)
get All Text
StringBuilder text = new StringBuilder();
for (TextNode textNode : document.textNodes()) {
    text.append(textNode.getWholeText());
return text.toString();
int[]getCategoryIds(final Document html)
get Category Ids
final String breadcrumbData = html.getElementsByTag("script").stream().map(Element::data)
        .filter(data -> data.contains("PageTemplate.set({breadcrumb:")).findFirst().get();
final String regex = Pattern.quote("PageTemplate.set({breadcrumb: [") + "([0-9,-]+)"
        + Pattern.quote("]});");
final String[] categoryIds = getRegexGroup(breadcrumbData, regex, 1).get().split(",");
return Stream.of(categoryIds).mapToInt(Integer::parseInt).toArray();
ListgetContainersForLink(Document document, String link)
get Containers For Link
List<Element> elements = new ArrayList<>();
for (Element element : document.body().getAllElements()) {
    if (containsLink(element, link))
        elements.add(element);
return elements;
ElementsgetDivForClass(Document document, String className)
get Div For Class
return document.select("div[id= " + className + "]");
DocumentgetDocument(CloseableHttpClient client, String url)
get Document
return Jsoup.parse(getString(client, url));