List of usage examples for org.jsoup.nodes.Element.toString()
public String toString()
From source file: sk.svec.jan.acb.extraction.DiscussionFinder.java
public void findData(String path) throws Exception { dateCount = 0;/*from www . ja v a 2 s .co m*/ maxDepth = 0; foundDateStringSwitch = false; foundDate = false; File input = new File(path); Date todayDate = new Date(input.lastModified()); SimpleDateFormat dateFormat = new SimpleDateFormat("dd. MM. yyyy"); today = dateFormat.format(todayDate); Date yesterdayDate = new Date(todayDate.getTime() - 1 * 24 * 3600 * 1000); yesterday = dateFormat.format(yesterdayDate); Document doc = Jsoup.parse(input, "UTF-8"); Node node = doc; //Using EscapeMode.xhtml will give you output without entities. //sprvne kdovanie doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); traversePage(node); String filePath = path.substring(0, path.lastIndexOf("/") + 1); String outputPath = filePath.replace("extracted", "results"); //create folder for comments String fileName = path.substring(path.lastIndexOf("/") + 1, path.lastIndexOf(".")); // String commentFolderPath = outputPath + fileName + "_comments/"; // new File(commentFolderPath).mkdirs(); //initialize allLevels = new ArrayList<HashMap<String, Integer>>(); for (int i = 0; i <= maxDepth; i++) { allLevels.add(new HashMap<String, Integer>()); } boolean findDocumentParts = findDocumentParts(node); if (findDocumentParts) { // System.out.println(documentPart); Elements documentParts = doc.select(documentPartNode); int i = 0; for (Element documentPart : documentParts) { // System.out.println(documentPart.toString()+"\n"); DocumentPartFinder dpf = new DocumentPartFinder(documentPart.toString(), today, yesterday); // System.out.println("celly komentar "+dpf.getDoc().text()); for (Node nod : dpf.getNodesToRemove()) { // System.out.println(nod); dpf.removeNodes(dpf.getNode(), nod); } // System.out.println("XXXXXXXX"); String text = dpf.getDoc().text(); if (text.trim().length() == 0) { text = "null"; } //ak nenajdeme text alebo autora tak nevypiseme nic // if (text.trim().length() != 0 && dpf.getAuthor() != null) { String name; if 
(dpf.getAuthor() == null) { name = "null"; nullAuthor++; } else { name = dpf.getAuthor().trim(); } String date; if (dpf.getDate() == null) { date = "null"; } else { date = dpf.getDate().trim(); } String title = "diskusia"; //remove html tags title = html2text(title); name = html2text(name); date = html2text(date); //odstrani autor: xxx, datum: xxx atd // if (name.indexOf(":") != -1) { // name = name.substring(name.indexOf(":") + 1); // } date = findDateRegex(date); //nacitanie linku z exkterneho suboru String linkPath = filePath.replace("extracted", "links"); linkPath = linkPath + fileName + ".link"; String link = new Scanner(new File(linkPath)).useDelimiter("\\A").next(); String xmlPath = (outputPath + fileName + "_comment" + i + ".xml"); linkAndPath.add("<a href=\"" + link + "\">" + link + "</a> - <a href=\"/WebStructureDetection-web/getfile?name=" + xmlPath + "\"> " + xmlPath + "</a>"); WriteXMLFile wxmlf = new WriteXMLFile(); wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlPath); //cesty pre autora, ak nenaslo, ulozi do specialneho suboru String xmlFileName; if (name.compareTo("null") == 0) { xmlFileName = "deletedLinksLog.xml"; name = ""; date = ""; text = ""; title = xmlPath; } else { xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes())) + ".xml"; } // String xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes())) + ".xml"; StringTokenizer st = new StringTokenizer(outputPath, "/"); //cesta k suboru output/sk/cas/ napriklad String outputPath2 = ""; for (int j = 0; j < 3; j++) { outputPath2 += st.nextToken() + "/"; // System.out.println(st.nextToken()); } String xmlAuthorPath = outputPath2 + "author/" + xmlFileName; new File(outputPath2 + "author/").mkdirs(); //ulozenie autora if (text.compareTo("null") != 0) { File f = new File(xmlAuthorPath); if (f.isFile()) { wxmlf.addToXmlFile(link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath); 
} else { wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath); } } System.out.println("username: " + name); System.out.println("date: " + date); System.out.println("text: " + text); System.out.println("comment " + i + "extracted succesfully\n"); // } i++; } } }
From source file: uk.co.certait.htmlexporter.demo.DemoTwo.java
/**
 * Downloads the league-table page, takes the markup of the last element with CSS class
 * {@code fulltable}, wraps it via {@code generateHTML} and exports the result as
 * {@code league.html}, {@code ./league.xlsx} and {@code ./league.ods}.
 *
 * <p>If the page has no {@code fulltable} element, {@code null} is passed to
 * {@code generateHTML}.
 *
 * @throws Exception if fetching the page, generating the HTML or any export fails
 */
public DemoTwo() throws Exception {
    URL pageUrl = new URL("http://news.bbc.co.uk/sport1/hi/football/eng_prem/table/8102708.stm");
    Document page = Jsoup.parse(pageUrl, 10000);

    // Walk all matches in order; the last one wins.
    Elements tables = page.getElementsByClass("fulltable");
    String tableMarkup = null;
    for (int idx = 0; idx < tables.size(); idx++) {
        tableMarkup = tables.get(idx).toString();
    }

    String html = generateHTML(tableMarkup);
    byte[] htmlBytes = html.getBytes();
    saveFile("league.html", htmlBytes);

    File excelTarget = new File("./league.xlsx");
    new ExcelExporter().exportHtml(html, excelTarget);

    File odsTarget = new File("./league.ods");
    new OdsExporter().exportHtml(html, odsTarget);
}