org.brnvrn.Main.java Source code

Java tutorial

Introduction

Here is the source code for org.brnvrn.Main.java

Source

package org.brnvrn;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the TaskWarrior tools catalogue from HTML into JSON.
 * <p>
 * TaskWarrior is a project whose web site features a large HTML page describing many
 * tools/extensions. This program parses that page (and the page listing obsolete tools)
 * and writes the extracted data, as JSON, to {@code data-tools.json} in the working directory.
 * The HTML pages used here are extracts loaded as classpath resources (stored in the jar file).
 */
public class Main {

    /**
     * Matches a date stamp made of a four-digit group and two two-digit groups separated by
     * dashes, e.g. 2015-05-23 (presumably yyyy-MM-dd; the pattern does not validate ranges).
     */
    static final Pattern p = Pattern.compile("(\\d{4}-\\d{2}-\\d{2})");

    /** Shared JSON writer; indentation is enabled so the generated file is human readable. */
    private static final ObjectMapper JSON_MAPPER =
            new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);

    /** Base URI handed to jsoup when parsing the bundled pages. */
    private static final String BASE_URI = "http://taskwarrior.org/";

    /**
     * Loads the two bundled HTML pages, extracts every tool they describe and writes the
     * resulting list to {@code data-tools.json}.
     * <p>
     * If a page cannot be loaded the program reports the problem and stops without writing
     * anything, instead of continuing with a missing document.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Document doc; // the HTML tool page
        Document docObsolete;
        try {
            // The live page could be fetched with Jsoup.connect("http://taskwarrior.org/tools/").get()
            // instead; the bundled extracts are used here.
            doc = loadBundledPage("Taskwarrior-Tools.html");
            docObsolete = loadBundledPage("Taskwarrior-Tools-Obsolete.html");
        } catch (IOException e) {
            System.err.println("Could not load the bundled HTML pages; nothing was written.");
            e.printStackTrace();
            return;
        }

        List<Tool> tools = new ArrayList<Tool>(100);
        parseDocument(tools, doc, false);
        parseDocument(tools, docObsolete, true);

        try (FileOutputStream out = new FileOutputStream("data-tools.json")) {
            JSON_MAPPER.writeValue(out, tools);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a classpath resource and parses it as a UTF-8 HTML document.
     *
     * @param resourceName name of the resource, resolved by the context class loader
     * @return the parsed document
     * @throws IOException if the resource is missing or cannot be read
     */
    private static Document loadBundledPage(String resourceName) throws IOException {
        ClassLoader classloader = Thread.currentThread().getContextClassLoader();
        try (InputStream input = classloader.getResourceAsStream(resourceName)) {
            if (input == null)
                throw new IOException("Resource not found on the classpath: " + resourceName);
            return Jsoup.parse(input, "UTF-8", BASE_URI);
        }
    }

    /**
     * Parse a HTML document, add tools to the list
     *
     * @param tools    list receiving one entry per tool row found
     * @param doc      the parsed HTML page
     * @param obsolete flag passed to every {@link Tool} created from this page
     */
    private static void parseDocument(List<Tool> tools, Document doc, boolean obsolete) {
        // http://jsoup.org/apidocs/org/jsoup/select/Selector.html
        Elements categoryDivs = doc.select("div.container div.row:has(table)"); // we loop over each category table
        System.out.println("Parsing " + (obsolete ? "obsolete" : "") + " doc.   ###");
        System.out.println(" Found " + categoryDivs.size() + " categories.");

        for (Element categoryDiv : categoryDivs) {
            // NOTE(review): this is the combined text of every <strong> inside the row,
            // not just a heading — confirm descriptions never contain <strong>.
            String category = categoryDiv.select("strong").text();
            parseCategory(tools, categoryDiv, category, obsolete);
        }
        System.out.println(" Got " + tools.size() + " tools.");
    }

    /**
     * Parse the HTML containing a category table and the interleaved comments.
     * <p>
     * Comments are applied to the tool under construction, so a comment contributes to the
     * next data row that follows it. Every data row adds one tool to the list, even when it
     * could only be parsed partially (a message is printed in that case).
     *
     * @param tools    list receiving the parsed tools
     * @param tool_div element containing the category table
     * @param category category name stored on each tool of this table
     * @param obsolete flag passed to every {@link Tool} created
     */
    private static void parseCategory(List<Tool> tools, Element tool_div, String category, boolean obsolete) {
        Element tbody = tool_div.select("tbody").first();
        if (tbody == null) {
            System.out.println("  No table body found for category: " + category);
            return;
        }

        Tool tool = new Tool(obsolete);
        for (Node child : tbody.childNodes()) {
            switch (child.nodeName()) {
            case "#comment":
                parseComment(tool, (Comment) child);
                break;
            case "tr":
                Element tr = (Element) child;
                if (tr.select("th").size() > 0) // Skip headings
                    break;
                tool.setCategory(category);
                if (!parseTrTool(tool, tr))
                    System.out.println("  Could not parse: " + tr.outerHtml());
                tools.add(tool);
                tool = new Tool(obsolete);
                break;
            }
        }
    }

    /**
     * Parse a tr HTML element describing the tool.
     * <p>
     * Cells are read by position: 0 = name/link, 1 = description, 2 = license,
     * 3 = compatibility. Missing cells are skipped rather than aborting the whole run.
     *
     * @param tool is to be updated
     * @param tr   brings the data
     * @return true if every expected cell was present, false if the row was parsed only
     *         partially or not at all
     */
    private static boolean parseTrTool(Tool tool, Element tr) {
        boolean success = true;

        Element nameLink = tr.select("td:eq(0)").first();
        if (nameLink == null)
            return false;
        tool.setName(nameLink.text());
        tool.setUrl(nameLink.getElementsByTag("a").attr("href"));

        Element tdLicense = tr.select("td:eq(2)").first();
        if (tdLicense != null)
            tool.setLicense(tdLicense.text());
        else
            success = false;

        Element tdCompatibility = tr.select("td:eq(3)").first();
        if (tdCompatibility != null)
            tool.setCompatibility(tdCompatibility.text());
        else
            success = false;

        // More complicated: We will extract and remove known nodes, the rest will be description
        Element tdDescription = tr.select("td:eq(1)").first();
        if (tdDescription == null)
            return false; // no description: the theme/language heuristics cannot run either

        for (Element small : tdDescription.getElementsByTag("small")) {
            boolean recognised = false;

            Element author = small.getElementsContainingText("Author").first();
            if (author != null) {
                String authorsString = author.text();
                authorsString = authorsString.substring(authorsString.indexOf(":") + 1).trim();
                // Trim around the commas so "Author: A, B" yields "A" and "B", not " A" and " B".
                tool.addAuthor(authorsString.split("\\s*,\\s*"));
                recognised = true;
            }

            Element sourceCode = small.getElementsContainingText("ource").last();
            if (sourceCode != null) {
                tool.setUrl_src(sourceCode.attr("href"));
                recognised = true;
            }

            // Detach at most once: a <small> carrying both author and source information
            // must not be removed twice.
            if (recognised && small.parent() != null)
                small.remove();
        }
        tdDescription.getElementsByTag("br").remove();
        tool.setDescription(Jsoup.clean(tdDescription.html(), Whitelist.relaxed())); // ownText will miss the contained links in the description
        tool.setDescriptionText(tdDescription.text());

        bestEffortThemeLanguage(tool);

        return success;
    }

    /**
     * The HTML comments in the table contain info about last update and/or last verification.
     * <p>
     * A comment containing "erified" sets the verification date, otherwise one containing
     * "pdate" sets the last-update date; the first letter is left out of the search so that
     * either capitalisation matches. Any other comment is reported on standard output.
     *
     * @param tool    the tool to update
     * @param comment the HTML comment node
     * @return false if the comment carries no data, true otherwise (including unknown comments)
     */
    private static boolean parseComment(Tool tool, Comment comment) {
        String field = comment.getData();
        if (field == null)
            return false;
        if (field.contains("erified")) {
            Matcher m = p.matcher(field);
            if (m.find())
                tool.setVerified(m.group(0));
        } else if (field.contains("pdate")) {
            Matcher m = p.matcher(field);
            if (m.find())
                tool.setLast_update(m.group(0));
        } else
            System.out.println("  # Unknown comment  " + field);
        return true;
    }

    /**
     * As an extra: tries to extract theme and language info from the description using plain
     * substring matching.
     * MUST be called after the description (and description text) is set!
     * <p>
     * Case-insensitive checks lower-case with {@link Locale#ROOT} so the result does not
     * depend on the default locale of the machine.
     *
     * @param tool the tool whose description is inspected and whose languages/themes are added
     */
    private static void bestEffortThemeLanguage(Tool tool) {
        String html = tool.getDescription();
        String htmlLower = html.toLowerCase(Locale.ROOT);
        // Only the Git check looks at the plain text instead of the HTML
        // (presumably so that link URLs do not trigger it — confirm).
        String textLower = tool.getDescriptionText().toLowerCase(Locale.ROOT);

        if (htmlLower.contains("python")) {
            tool.addLanguage("Python");
        }
        if (htmlLower.contains("lua")) {
            tool.addLanguage("Lua");
        }
        if (htmlLower.contains("php")) {
            tool.addLanguage("PHP");
        }
        // GUI and GTK are matched case-sensitively, as acronyms.
        if (html.contains("GUI") || html.contains("GTK") || htmlLower.contains("graphic")) {
            tool.addTheme("GUI");
        }
        if (html.contains("XMPP")) {
            tool.addTheme("XMPP");
        }
        if (htmlLower.contains("android")) {
            tool.addTheme("Android");
        }
        if (htmlLower.contains("osx") || htmlLower.contains("os x")) {
            tool.addTheme("OSX");
        }
        if (htmlLower.contains("web")) {
            tool.addTheme("Web");
        }
        if (htmlLower.contains("vim")) {
            tool.addTheme("Vim");
        }
        if (textLower.contains("git")) {
            tool.addTheme("Git");
        }
        if (htmlLower.contains("ledger")) {
            tool.addTheme("Ledger");
        }
        if (htmlLower.contains("time")) {
            tool.addTheme("Time");
        }
        if (htmlLower.contains("mail") || htmlLower.contains("smtp")) {
            tool.addTheme("Mail");
        }
    }

}