List of utility methods for parsing HTML with Jsoup
String | extractRssUrl(String html, URI base) extract Rss Url Document d = Jsoup.parse(html); Elements links = d.getElementsByTag(LINK); for (Element link : links) { if (ALTERNATE.equalsIgnoreCase(link.attr(REL))) { String type = link.attr(TYPE); if (RSS.equalsIgnoreCase(type) || ATOM.equalsIgnoreCase(type)) { String href = link.attr(HREF); String title = link.attr(TITLE); ... |
String | filter(String html) Cleans the HTML with Jsoup.clean using content_filter; returns an empty string when the input is blank. return StringUtil.isBlank(html) ? "" : Jsoup.clean(html, content_filter); |
String | fixHtml(String htmlContent, String outputFile, String contentFile) fix Html Document doc = Jsoup.parseBodyFragment(htmlContent); doc.outputSettings().charset("ASCII"); String relPrefixPath = computeRelPath(outputFile, contentFile); if (!relPrefixPath.isEmpty()) { Elements imgElements = doc.getElementsByTag("img"); for (Element e : imgElements) { String src = e.attr("src"); if (src != null) { ... |
String | getContentFromHTML(String html) Parses the HTML and returns the text of its body. Document doc = Jsoup.parse(html);
return doc.body().text();
|
List | getDistinctImageUrls(String htmlContent) Finds all image inclusions (looks for img tags).
Document doc = Jsoup.parse(htmlContent); Elements els = doc.select(IMG_SELECTOR); List<String> images = new ArrayList<>(els.size()); for (Element e : els) { String path = e.attr(SRC_ATTR); if (!images.contains(path)) { images.add(path); return images; |
Document | getDoc(Connection conn) get Doc Document doc = null; int retry = 0; IOException e1 = null; while (retry++ < 3) { try { doc = conn.get(); return doc; } catch (IOException e) { ... |
Document | getDoc(File file) Parses the file as UTF-8; if parsing throws, prints the stack trace and returns null. Document result = null; try { result = Jsoup.parse(file, "UTF-8"); } catch (Exception e) { e.printStackTrace(); return result; |
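Document | getDoc(String path) Reads the file at the given path as UTF-8, replaces <br> tags and newlines with LINE_START, and parses the result. String fileContent = readFile(path, StandardCharsets.UTF_8).replaceAll("(?i)<br[^>]*>", LINE_START) .replaceAll("\n", LINE_START); return Jsoup.parse(fileContent); |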
Document | getDoc(String url) get Doc try { Document document = Jsoup.connect(url).timeout(10000).get(); if (document == null) { document = Jsoup.connect(url).timeout(10000).get(); return document; } catch (IOException e) { System.out.println("get document error," + e.getMessage()); ... |
String | getDoctypeName(InputStream s) get Doctype Name final org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(s, "us-ascii", "", org.jsoup.parser.Parser.xmlParser()); List<org.jsoup.nodes.Node> nods = doc.childNodes(); for (org.jsoup.nodes.Node node : nods) if (node instanceof org.jsoup.nodes.DocumentType) { org.jsoup.nodes.DocumentType documentType = (org.jsoup.nodes.DocumentType) node; final String res = documentType.attr("name"); if (res != null) ... |