Java tutorial
package org.brnvrn;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * TaskWarrior is a project with a web site featuring a big HTML page on which
 * lots of tools/extensions are described.
 *
 * <p>This program parses the TaskWarrior tools HTML pages and extracts the data
 * in JSON format (written to {@code data-tools.json} in the working directory).
 * The HTML pages used here are extracts loaded from the classpath
 * ({@code Taskwarrior-Tools.html} and {@code Taskwarrior-Tools-Obsolete.html}).
 */
public class Main {

    /** Matches an ISO-like date such as {@code 2015-05-23} (four, two and two digits). */
    static final Pattern p = Pattern.compile("(\\d{4}-\\d{2}-\\d{2})");

    /** Base URI handed to jsoup so that relative links in the pages can be resolved. */
    private static final String BASE_URI = "http://taskwarrior.org/";

    /**
     * Entry point: loads both HTML pages, collects every tool they describe and
     * writes the resulting list as indented JSON to {@code data-tools.json}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Document doc;         // the HTML tool page
        Document docObsolete; // the HTML page listing obsolete tools
        try {
            // The live page could be fetched instead with:
            //   Jsoup.connect("http://taskwarrior.org/tools/").get();
            doc = loadResource("Taskwarrior-Tools.html");
            docObsolete = loadResource("Taskwarrior-Tools-Obsolete.html");
        } catch (IOException e) {
            // Without the input pages there is nothing to parse; stop here instead of
            // failing later with a NullPointerException on a missing document.
            e.printStackTrace();
            System.exit(1);
            return;
        }

        List<Tool> tools = new ArrayList<>(100);
        parseDocument(tools, doc, false);
        ObjectMapper objectMapper = parseDocument(tools, docObsolete, true);

        // try-with-resources guarantees the file handle is released even if serialization fails.
        try (FileOutputStream out = new FileOutputStream("data-tools.json")) {
            objectMapper.writeValue(out, tools);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Loads a classpath resource through the context class loader and parses it as UTF-8 HTML.
     *
     * @param resourceName name of the resource to load
     * @return the parsed document
     * @throws IOException if the resource does not exist or cannot be read
     */
    private static Document loadResource(String resourceName) throws IOException {
        ClassLoader classloader = Thread.currentThread().getContextClassLoader();
        try (InputStream input = classloader.getResourceAsStream(resourceName)) {
            if (input == null) {
                throw new IOException("Resource not found on classpath: " + resourceName);
            }
            return Jsoup.parse(input, "UTF-8", BASE_URI);
        }
    }

    /**
     * Parses an HTML document and adds the tools it describes to the list.
     *
     * @param tools    list that receives the parsed tools
     * @param doc      document to parse
     * @param obsolete flag passed to every {@code Tool} created from this document
     * @return an {@code ObjectMapper} configured for indented output
     */
    private static ObjectMapper parseDocument(List<Tool> tools, Document doc, boolean obsolete) {
        // Selector syntax: http://jsoup.org/apidocs/org/jsoup/select/Selector.html
        Elements category_div = doc.select("div.container div.row:has(table)");

        // we loop over each category table
        System.out.println("Parsing " + (obsolete ? "obsolete" : "") + " doc. ###");
        System.out.println(" Found " + category_div.size() + " categories.");
        for (Element tool_div : category_div) {
            String category = tool_div.select("strong").text();
            parseCategory(tools, tool_div, category, obsolete);
        }
        System.out.println(" Got " + tools.size() + " tools.");

        ObjectMapper objectMapper = new ObjectMapper();
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
        return objectMapper;
    }

    /**
     * Parses the HTML containing a category table and the comments interleaved
     * between its rows.
     *
     * <p>Comments are applied to the tool currently being built; the next
     * non-heading {@code tr} completes that tool, adds it to the list and starts
     * a fresh one.
     *
     * @param tools    list that receives the parsed tools
     * @param tool_div element wrapping the category table
     * @param category category name assigned to every tool of this table
     * @param obsolete flag passed to every {@code Tool} created here
     */
    private static void parseCategory(List<Tool> tools, Element tool_div, String category, boolean obsolete) {
        Element tbody = tool_div.select("tbody").first();
        if (tbody == null) {
            // A table without a body has no rows to parse.
            System.out.println(" No table body found for category: " + category);
            return;
        }

        Tool tool = new Tool(obsolete);
        for (Node child : tbody.childNodes()) {
            switch (child.nodeName()) {
                case "#comment":
                    parseComment(tool, (Comment) child);
                    break;
                case "tr":
                    Element tr = (Element) child;
                    if (tr.select("th").size() > 0) { // Skip headings
                        break;
                    }
                    tool.setCategory(category);
                    if (!parseTrTool(tool, tr)) {
                        System.out.println(" Could not parse: " + tr.outerHtml());
                    }
                    tools.add(tool);
                    tool = new Tool(obsolete);
                    break;
                default:
                    // Text nodes and any other node types carry no tool data.
                    break;
            }
        }
    }

    /**
     * Parses a {@code tr} HTML element describing a tool.
     *
     * <p>Expected cells: 0 = name and link, 1 = description, 2 = license,
     * 3 = compatibility. Cells that are present are still applied to the tool
     * when others are missing.
     *
     * @param tool is to be updated
     * @param tr   brings the data
     * @return true if every expected cell was found
     */
    private static boolean parseTrTool(Tool tool, Element tr) {
        Element nameLink = tr.select("td:eq(0)").first();
        if (nameLink == null) {
            return false;
        }
        boolean success = true;
        tool.setName(nameLink.text());
        tool.setUrl(nameLink.getElementsByTag("a").attr("href"));

        Element licenseCell = tr.select("td:eq(2)").first();
        if (licenseCell != null) {
            tool.setLicense(licenseCell.text());
        } else {
            success = false;
        }

        Element compatibilityCell = tr.select("td:eq(3)").first();
        if (compatibilityCell != null) {
            tool.setCompatibility(compatibilityCell.text());
        } else {
            success = false;
        }

        // More complicated: we extract and remove known nodes, the rest will be the description
        Element tdDescription = tr.select("td:eq(1)").first();
        if (tdDescription == null) {
            return false;
        }

        Elements smalls = tdDescription.getElementsByTag("small");
        for (Element small : smalls) {
            boolean consumed = false;

            Element author = small.getElementsContainingText("Author").first();
            if (author != null) {
                String authorsString = author.text();
                authorsString = authorsString.substring(authorsString.indexOf(":") + 1);
                tool.addAuthor(authorsString.split(","));
                consumed = true;
            }

            Element sourceCode = small.getElementsContainingText("ource").last();
            if (sourceCode != null) {
                tool.setUrl_src(sourceCode.attr("href"));
                consumed = true;
            }

            // Detach once, even when the same <small> carries both author and source:
            // NOTE(review): older jsoup releases appear to reject remove() on an
            // already detached node - confirm against the jsoup version in use.
            if (consumed) {
                small.remove();
            }
        }

        tdDescription.getElementsByTag("br").remove();
        tool.setDescription(Jsoup.clean(tdDescription.html(), Whitelist.relaxed()));
        // ownText would miss the links contained in the description
        tool.setDescriptionText(tdDescription.text());

        bestEffortThemeLanguage(tool);
        return success;
    }

    /**
     * Applies an HTML comment found in the table to the tool; such comments
     * contain info about the last update and/or the last verification.
     *
     * @param tool    tool to update
     * @param comment comment node to inspect
     * @return false if the comment has no data, true otherwise
     */
    private static boolean parseComment(Tool tool, Comment comment) {
        String field = comment.getData();
        if (field == null) {
            return false;
        }
        // First letters are left out of the keywords so that both capitalizations match.
        if (field.contains("erified")) {
            Matcher m = p.matcher(field);
            if (m.find()) {
                tool.setVerified(m.group(0));
            }
        } else if (field.contains("pdate")) {
            Matcher m = p.matcher(field);
            if (m.find()) {
                tool.setLast_update(m.group(0));
            }
        } else {
            System.out.println(" # Unknown comment " + field);
        }
        return true;
    }

    /**
     * As an extra: tries to extract theme and language info from the description.
     * MUST be called after the description is set; does nothing when it is not.
     *
     * @param tool tool whose description is scanned and whose languages/themes are updated
     */
    private static void bestEffortThemeLanguage(Tool tool) {
        String html = tool.getDescription();
        if (html == null) {
            return;
        }
        // Locale.ROOT keeps the matching independent of the default locale
        // (e.g. the Turkish lower-casing of 'I' would otherwise break "git"/"time"/"vim").
        String lowerHtml = html.toLowerCase(Locale.ROOT);
        String text = tool.getDescriptionText();
        String lowerText = text == null ? "" : text.toLowerCase(Locale.ROOT);

        if (lowerHtml.contains("python")) {
            tool.addLanguage("Python");
        }
        if (lowerHtml.contains("lua")) {
            tool.addLanguage("Lua");
        }
        if (lowerHtml.contains("php")) {
            tool.addLanguage("PHP");
        }
        // GUI, GTK and XMPP are matched case-sensitively.
        if (html.contains("GUI") || html.contains("GTK") || lowerHtml.contains("graphic")) {
            tool.addTheme("GUI");
        }
        if (html.contains("XMPP")) {
            tool.addTheme("XMPP");
        }
        if (lowerHtml.contains("android")) {
            tool.addTheme("Android");
        }
        if (lowerHtml.contains("osx") || lowerHtml.contains("os x")) {
            tool.addTheme("OSX");
        }
        if (lowerHtml.contains("web")) {
            tool.addTheme("Web");
        }
        if (lowerHtml.contains("vim")) {
            tool.addTheme("Vim");
        }
        // Only this check looks at the plain text rather than the HTML of the description.
        if (lowerText.contains("git")) {
            tool.addTheme("Git");
        }
        if (lowerHtml.contains("ledger")) {
            tool.addTheme("Ledger");
        }
        if (lowerHtml.contains("time")) {
            tool.addTheme("Time");
        }
        if (lowerHtml.contains("mail") || lowerHtml.contains("smtp")) {
            tool.addTheme("Mail");
        }
    }
}