Example usage for org.jsoup.nodes Element toString

List of usage examples for org.jsoup.nodes Element toString

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.toString().

Prototype

public String toString() 

Source Link

Document

Gets this node's outer HTML.

Usage

From source file: sk.svec.jan.acb.extraction.DiscussionFinder.java

/**
 * Parses the HTML file at {@code path}, locates the repeated "document part"
 * elements in it and writes one XML file per part, plus a per-author XML file.
 *
 * <p>Steps, as visible in this method:
 * <ol>
 *   <li>Resets the instance counters/flags and derives {@code today} and
 *       {@code yesterday} (format {@code "dd. MM. yyyy"}) from the input
 *       file's last-modified timestamp.</li>
 *   <li>Parses the file as UTF-8 with jsoup and calls {@code traversePage}
 *       and {@code findDocumentParts} on the parsed document.</li>
 *   <li>For every element matching {@code documentPartNode}: extracts text,
 *       author and date through {@code DocumentPartFinder}, reads the source
 *       link from the matching {@code .link} file, and writes
 *       {@code <fileName>_comment<i>.xml} into the results directory.</li>
 *   <li>Appends the comment to (or creates) an author file under
 *       {@code <first three path segments>/author/}; entries without an author
 *       go to {@code deletedLinksLog.xml}.</li>
 * </ol>
 *
 * <p>Output and link directories are derived by replacing the substring
 * {@code "extracted"} in the input directory with {@code "results"} and
 * {@code "links"} respectively.
 *
 * @param path path of the HTML file to process; it is split on {@code '/'}
 *             and is expected to contain a {@code '.'} file extension
 * @throws Exception if parsing, reading the link file, hashing or writing any
 *                   of the XML files fails
 */
public void findData(String path) throws Exception {
    dateCount = 0;
    maxDepth = 0;
    foundDateStringSwitch = false;
    foundDate = false;
    File input = new File(path);

    // "today" is the file's modification date, not the wall-clock date.
    Date todayDate = new Date(input.lastModified());
    SimpleDateFormat dateFormat = new SimpleDateFormat("dd. MM. yyyy");
    today = dateFormat.format(todayDate);
    Date yesterdayDate = new Date(todayDate.getTime() - 24L * 3600 * 1000);
    yesterday = dateFormat.format(yesterdayDate);

    Document doc = Jsoup.parse(input, "UTF-8");
    Node node = doc;
    // Using EscapeMode.xhtml gives output without named entities (correct encoding).
    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    traversePage(node);

    String filePath = path.substring(0, path.lastIndexOf("/") + 1);
    String outputPath = filePath.replace("extracted", "results");

    // File name without directory and without extension.
    String fileName = path.substring(path.lastIndexOf("/") + 1, path.lastIndexOf("."));

    // One map per tree depth level, 0..maxDepth inclusive.
    allLevels = new ArrayList<HashMap<String, Integer>>();
    for (int i = 0; i <= maxDepth; i++) {
        allLevels.add(new HashMap<String, Integer>());
    }

    boolean findDocumentParts = findDocumentParts(node);
    if (findDocumentParts) {
        Elements documentParts = doc.select(documentPartNode);
        int i = 0;
        for (Element documentPart : documentParts) {
            // Re-parse the part's outer HTML in isolation and strip the nodes
            // the finder marked for removal before taking its text.
            DocumentPartFinder dpf = new DocumentPartFinder(documentPart.toString(), today, yesterday);
            for (Node nod : dpf.getNodesToRemove()) {
                dpf.removeNodes(dpf.getNode(), nod);
            }
            String text = dpf.getDoc().text();

            // Missing values are represented by the literal string "null".
            if (text.trim().length() == 0) {
                text = "null";
            }

            String name;
            if (dpf.getAuthor() == null) {
                name = "null";
                nullAuthor++;
            } else {
                name = dpf.getAuthor().trim();
            }

            String date;
            if (dpf.getDate() == null) {
                date = "null";
            } else {
                date = dpf.getDate().trim();
            }

            String title = "diskusia";

            // Remove HTML tags.
            title = html2text(title);
            name = html2text(name);
            date = html2text(date);

            date = findDateRegex(date);

            // Load the link from the external ".link" file. The scanner is closed
            // explicitly so the file handle is not leaked on every iteration.
            String linkPath = filePath.replace("extracted", "links");
            linkPath = linkPath + fileName + ".link";
            String link;
            Scanner linkScanner = new Scanner(new File(linkPath));
            try {
                // "\\A" matches only the start of input, so next() returns the whole file.
                link = linkScanner.useDelimiter("\\A").next();
            } finally {
                linkScanner.close();
            }

            String xmlPath = (outputPath + fileName + "_comment" + i + ".xml");

            linkAndPath.add("<a href=\"" + link + "\">" + link
                    + "</a> - <a href=\"/WebStructureDetection-web/getfile?name=" + xmlPath + "\"> " + xmlPath
                    + "</a>");

            WriteXMLFile wxmlf = new WriteXMLFile();
            wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlPath);

            // Author file name: if no author was found, the entry goes to a special log file
            // with empty fields and the comment's XML path as its title.
            String xmlFileName;
            if ("null".equals(name)) {
                xmlFileName = "deletedLinksLog.xml";
                name = "";
                date = "";
                text = "";
                title = xmlPath;
            } else {
                // NOTE(review): getBytes() uses the platform default charset, so the hash of
                // non-ASCII names may differ between machines - confirm before changing.
                xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes()))
                        + ".xml";
            }

            // Author directory root = first three segments of the output path,
            // e.g. output/sk/cas/.
            StringTokenizer st = new StringTokenizer(outputPath, "/");
            StringBuilder authorRoot = new StringBuilder();
            for (int j = 0; j < 3; j++) {
                authorRoot.append(st.nextToken()).append("/");
            }
            String outputPath2 = authorRoot.toString();
            String xmlAuthorPath = outputPath2 + "author/" + xmlFileName;
            new File(outputPath2 + "author/").mkdirs();

            // Save the author entry: append when the file already exists, otherwise create it.
            if (!"null".equals(text)) {
                File f = new File(xmlAuthorPath);
                if (f.isFile()) {
                    wxmlf.addToXmlFile(link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath);
                } else {
                    wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(),
                            xmlAuthorPath);
                }

            }
            System.out.println("username: " + name);
            System.out.println("date: " + date);
            System.out.println("text: " + text);

            System.out.println("comment " + i + "extracted succesfully\n");
            i++;
        }
    }

}

From source file: uk.co.certait.htmlexporter.demo.DemoTwo.java

/**
 * Downloads a league-table page, takes the outer HTML of the last element
 * with class {@code fulltable}, wraps it via {@code generateHTML} and exports
 * the result as {@code league.html}, {@code ./league.xlsx} and
 * {@code ./league.ods}.
 *
 * @throws Exception if fetching the page or writing any output fails
 */
public DemoTwo() throws Exception {
    URL leagueTableUrl = new URL("http://news.bbc.co.uk/sport1/hi/football/eng_prem/table/8102708.stm");
    // 10000 ms fetch timeout.
    Document page = Jsoup.parse(leagueTableUrl, 10000);
    Elements fullTables = page.getElementsByClass("fulltable");

    // Keep the outer HTML of the last match only.
    // NOTE(review): stays null when nothing matches - confirm generateHTML accepts null.
    String table = fullTables.isEmpty()
            ? null
            : fullTables.get(fullTables.size() - 1).toString();

    String html = generateHTML(table);
    saveFile("league.html", html.getBytes());

    ExcelExporter excelExporter = new ExcelExporter();
    excelExporter.exportHtml(html, new File("./league.xlsx"));

    OdsExporter odsExporter = new OdsExporter();
    odsExporter.exportHtml(html, new File("./league.ods"));
}