List of utility methods for parsing HTML with Jsoup
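String | getMetaValue(String html, String metaKey) Returns the content attribute of the first element in the document head whose name attribute equals metaKey, or an empty string if any exception occurs. try { Document doc = Jsoup.parse(html); Elements name = doc.head().getElementsByAttributeValue("name", metaKey); return name.get(0).attr("content"); } catch (Exception ex) { return ""; |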
String | getPlainText(String htmlText) Turn a hunk of text that contains HTML into plaintext. Document d = Jsoup.parse(htmlText);
return d.text();
|
String | getPlainTextFromHtml(String html) Extracts plain text from given HTML String. html = html.replaceAll("(?i)>\\s*\\n*\\s*<b>", "><br><b>"); html = html.replaceAll("(?i)\\s*\\n*\\s*</?b>\\s*", " "); String breakTagPlaceholder = "%BREAK%"; html = html.replaceAll("(?i)(<br[^>]*>|\\n)", breakTagPlaceholder); html = html.replaceAll("(?i)(<p>|\\n)", breakTagPlaceholder); html = Jsoup.parse(html).text(); html = html.replaceAll("\\u00A0", " "); html = html.replaceAll(breakTagPlaceholder, "\n"); ... |
String | getTitle(String htmlContent) Get the title of the HTML. Document doc = Jsoup.parse(htmlContent); Elements titleNode = doc.select("head > title"); return titleNode.isEmpty() ? null : doc.title(); |
List<String> | htmlArray2textArray(List<String> htmlArray) Converts each HTML string in the list to its plain text; returns an empty list when htmlArray is null. List<String> cleanTextArray = new ArrayList<>(); if (htmlArray == null) { return cleanTextArray; for (String html : htmlArray) { cleanTextArray.add(Jsoup.parse(html).text()); return cleanTextArray; ... |
boolean | isHTMLEmpty(String textToCheck) Checks whether the given HTML is empty: returns true when the body has no child elements, or when a child element's string form is "" or "<p><br></p>". String emptyContent = "<p><br></p>"; Document doc = Jsoup.parse(textToCheck); Elements elements = doc.select("body").first().children(); if (elements.size() == 0) return true; for (Element el : elements) { if ("".equals(el.toString()) || emptyContent.equals(el.toString())) return true; ... |
Document | parse(final String html) Parses the specified html code. Document doc = Jsoup.parseBodyFragment(html);
doc.outputSettings().prettyPrint(false);
return doc;
|
Document | parse(InputStream input, String documentIRI, String encoding) Parses the given input stream into a Document; a null documentIRI is treated as an empty string. if (documentIRI == null) { documentIRI = ""; if (encoding == null) { int c; do { c = input.read(); } while (c != -1 && Character.isWhitespace(c)); ... |
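String | parse(String html) Parses the HTML and returns its text content; returns an empty string if a NoClassDefFoundError is thrown. try { Document doc = Jsoup.parse(html); return doc.text(); } catch (NoClassDefFoundError e) { return ""; |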
Document | parse(String html) Converts an HTML String to a Document (which is much easier to work with). Document document = Jsoup.parse(html); if (setting != null) return document.outputSettings(setting); return document; |