List of utility methods for parsing HTML with Jsoup
String | getErrorMessage(String htmlStr) We need to show custom error message returned from the configuration management system. String errorMsg = html2text(htmlStr);
errorMsg = errorMsg.substring(
errorMsg.indexOf(HTTP_ERROR_MSG_START_OFFSET) + HTTP_ERROR_MSG_START_OFFSET.length(),
errorMsg.indexOf(HTTP_ERROR_MSG_END_OFFSET));
return errorMsg;
|
String | getExplanation(String html) get Explanation String text = Jsoup.parse(html).text(); int indexof_explanation = -1; int indexof_tomorrowspic = -1; int indexof_wekeepanarchive = -1; indexof_explanation = text.indexOf(EXPLANATION); indexof_tomorrowspic = text.indexOf(TOMORROWS_PIC); indexof_wekeepanarchive = text.indexOf(WE_KEEP_AN_ARCHIVE); if (indexof_explanation == -1 || (indexof_tomorrowspic == -1 && indexof_wekeepanarchive == -1)) { ... |
String | getFirstImageSrc(String html) get First Image Src if (html == null) return null; Elements es = Jsoup.parseBodyFragment(html).select("img"); if (es != null && es.size() > 0) return es.first().attr("src"); return null; |
String | getFirstSentence(final String html) Returns the first sentence of the specified HTML text. final Document newDoc = Document.createShell(""); final Element newBody = newDoc.body(); final Document document = parse(html); final Element body = document.body(); for (final Node node : body.childNodes()) { if (node instanceof TextNode) { final String text = ((TextNode) node).text(); final String[] parts = text.split("\\.(\\s+|$)", 2); ... |
List
| getFirstTableFromHTML(String result) Converts the first table in a HTML snippet to a list of list of strings. Document doc = parse(result); Element table = doc.select("table").get(0); List<List<String>> rowList = table.select("tr").stream() .map(tr -> tr.select("td").stream().map(element -> element.text()).collect(toList())) .collect(toList()); return rowList; |
Document | getHtml(String url, String ruta_fich) Gets the html. Document doc = Jsoup.connect(url).timeout(0).get();
return doc;
|
String | getHtmlBodyContent(String html) get Html Body Content if (html == null) return null; Document doc = Jsoup.parseBodyFragment(html); if (doc != null) { return doc.body().html(); return null; |
Elements | getHtmlInTag(String html, String tag) get html tag include tag too input (html):so you can remove the input tag by using #removeTag(String) return parse(html).child(0).getElementsByTag(tag);
|
String | getImageCredit(String html) get Image Credit String text = Jsoup.parse(html).text(); indices = new ArrayList<>(); int credit_index = -1; int credit_length = -1; for (String credit : CREDIT_STRINGS) { indices.add(new Integer(credit_index = text.indexOf(credit))); if (credit_index != -1) { credit_length = credit.length() + 1; ... |
List | getJSFileLinks(String html) get JS File Links Document doc = Jsoup.parse(html); Elements scriptSrc = doc.select("script[src]"); List<String> jsFileLinks = new ArrayList<String>(); for (Element script : scriptSrc) { if (script.attr("abs:src").endsWith(".js")) { jsFileLinks.add(script.attr("abs:src")); return jsFileLinks; |