Java tutorial
/* * Copyright 2015 Jn vec * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package sk.svec.jan.acb.extraction; import java.io.File; import java.security.MessageDigest; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Scanner; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.util.regex.Pattern.CASE_INSENSITIVE; import org.apache.commons.codec.binary.Hex; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities; import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import static sk.svec.jan.acb.extraction.Finder.html2text; /** * * @author Jn vec */ public class DiscussionFinder { private Node authorNode; private Node dateNode; private Node textNode; private List<HashMap<String, Integer>> allLevels; private int maxDepth; private boolean foundDateStringSwitch; private String documentPartNode; private boolean foundDate; private String date; private int dateCount; private int dateScore; private String today; private String yesterday; private List<String> linkAndPath; private int nullAuthor; public List<String> getLinkAndPath() { return linkAndPath; } public Node getAuthorNode() { return authorNode; } public Node getDateNode() { return dateNode; } public Node getTextNode() { return textNode; } public String getDate() { return date; } public int getDateScore() { return dateScore; } public int getDateCount() { return dateCount; } public int getNullAuthor() { return nullAuthor; } public DiscussionFinder(String path) throws Exception { linkAndPath = new ArrayList<String>(); File folder = new File(path + "extracted/"); File[] listOfFiles = folder.listFiles(); for (File listOfFile : listOfFiles) { if (listOfFile.isFile()) { findData(path + "extracted/" + listOfFile.getName()); } } } public void findData(String path) throws Exception { dateCount = 0; maxDepth = 0; foundDateStringSwitch = false; foundDate = false; File input = new File(path); Date todayDate = new Date(input.lastModified()); SimpleDateFormat dateFormat = new SimpleDateFormat("dd. MM. yyyy"); today = dateFormat.format(todayDate); Date yesterdayDate = new Date(todayDate.getTime() - 1 * 24 * 3600 * 1000); yesterday = dateFormat.format(yesterdayDate); Document doc = Jsoup.parse(input, "UTF-8"); Node node = doc; //Using EscapeMode.xhtml will give you output without entities. //sprvne kdovanie doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); traversePage(node); String filePath = path.substring(0, path.lastIndexOf("/") + 1); String outputPath = filePath.replace("extracted", "results"); //create folder for comments String fileName = path.substring(path.lastIndexOf("/") + 1, path.lastIndexOf(".")); // String commentFolderPath = outputPath + fileName + "_comments/"; // new File(commentFolderPath).mkdirs(); //initialize allLevels = new ArrayList<HashMap<String, Integer>>(); for (int i = 0; i <= maxDepth; i++) { allLevels.add(new HashMap<String, Integer>()); } boolean findDocumentParts = findDocumentParts(node); if (findDocumentParts) { // System.out.println(documentPart); Elements documentParts = doc.select(documentPartNode); int i = 0; for (Element documentPart : documentParts) { // System.out.println(documentPart.toString()+"\n"); DocumentPartFinder dpf = new DocumentPartFinder(documentPart.toString(), today, yesterday); // System.out.println("celly komentar "+dpf.getDoc().text()); for (Node nod : dpf.getNodesToRemove()) { // System.out.println(nod); dpf.removeNodes(dpf.getNode(), nod); } // System.out.println("XXXXXXXX"); String text = dpf.getDoc().text(); if (text.trim().length() == 0) { text = "null"; } //ak nenajdeme text alebo autora tak nevypiseme nic // if (text.trim().length() != 0 && dpf.getAuthor() != null) { String name; if (dpf.getAuthor() == null) { name = "null"; nullAuthor++; } else { name = dpf.getAuthor().trim(); } String date; if (dpf.getDate() == null) { date = "null"; } else { date = dpf.getDate().trim(); } String title = "diskusia"; //remove html tags title = html2text(title); name = html2text(name); date = html2text(date); //odstrani autor: xxx, datum: xxx atd // if (name.indexOf(":") != -1) { // name = name.substring(name.indexOf(":") + 1); // } date = findDateRegex(date); //nacitanie linku z exkterneho suboru String linkPath = filePath.replace("extracted", "links"); linkPath = linkPath + fileName + ".link"; String link = new Scanner(new File(linkPath)).useDelimiter("\\A").next(); String xmlPath = (outputPath + fileName + "_comment" + i + ".xml"); linkAndPath.add("<a href=\"" + link + "\">" + link + "</a> - <a href=\"/WebStructureDetection-web/getfile?name=" + xmlPath + "\"> " + xmlPath + "</a>"); WriteXMLFile wxmlf = new WriteXMLFile(); wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlPath); //cesty pre autora, ak nenaslo, ulozi do specialneho suboru String xmlFileName; if (name.compareTo("null") == 0) { xmlFileName = "deletedLinksLog.xml"; name = ""; date = ""; text = ""; title = xmlPath; } else { xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes())) + ".xml"; } // String xmlFileName = Hex.encodeHexString(MessageDigest.getInstance("MD5").digest(name.getBytes())) + ".xml"; StringTokenizer st = new StringTokenizer(outputPath, "/"); //cesta k suboru output/sk/cas/ napriklad String outputPath2 = ""; for (int j = 0; j < 3; j++) { outputPath2 += st.nextToken() + "/"; // System.out.println(st.nextToken()); } String xmlAuthorPath = outputPath2 + "author/" + xmlFileName; new File(outputPath2 + "author/").mkdirs(); //ulozenie autora if (text.compareTo("null") != 0) { File f = new File(xmlAuthorPath); if (f.isFile()) { wxmlf.addToXmlFile(link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath); } else { wxmlf.createXmlFile(name.trim(), link.trim(), title.trim(), date.trim(), text.trim(), xmlAuthorPath); } } System.out.println("username: " + name); System.out.println("date: " + date); System.out.println("text: " + text); System.out.println("comment " + i + "extracted succesfully\n"); // } i++; } } } private String findDateRegex(String date) { //dni cislom, oddelovac: medzera,bodka,pomlcka a mozno medzera, mesiace cislom a slovom, roky 2 a 4 miestne String daysNum = "\\d{1,2}"; String monthsEN = "(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t|tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?"; String monthsSK = "|jan(?:ur|ura)?|feb(?:rur|rura)?|mar(?:ec|ca)?|apr(?:l|la)?|mj(?:a)?|jn(?:a)?|jl(?:a)?|aug(?:usta)?|sep(?:tember|tembra)?|okt(?:ber|bra)?|nov(?:ember|embra)?|dec(?:ember|embra)?"; String monthsCZ = "|led(?:en|na)?|nor(?:a)?|bez(?:en|na)?|dub(?:en|na)?|kvt(?:en|na)?|?erv(?:en|na)?|?erven(?:ce|ec)?|srp(?:en|na)?|z(?:)?|j(?:en|na)?|list(?:opad|opadu)?|pros(?:inec|ince)?"; String monthsNum = "|\\d{1,2})"; String separator = "( |\\.|-|,|/) ?"; // medzera, bodka, pomlcka, nic a potom bud medzera alebo ne String years = "(\\d{2,4})?"; String todayYesterday = "|(dnes|v?era|today|yesterday)..?([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]"; //dnes 12:25 String strPattern = daysNum.concat(separator).concat(monthsEN).concat(monthsCZ).concat(monthsSK) .concat(monthsNum).concat(separator).concat(years).concat(todayYesterday); Pattern pattern = Pattern.compile(strPattern, CASE_INSENSITIVE); Matcher matcher = pattern.matcher(date); while (matcher.find()) { String foundExpression = matcher.group(); // System.out.println(foundExpression); // System.out.println(value); //convert today/yesterday to actual date Pattern todayYesterdayPattern = Pattern.compile(todayYesterday.substring(1), CASE_INSENSITIVE); Matcher todayYesterdayMatcher = todayYesterdayPattern.matcher(foundExpression); while (todayYesterdayMatcher.find()) { foundExpression = foundExpression.replaceAll("dnes|today", today); foundExpression = foundExpression.replaceAll("v?era|yesterday", yesterday); } return foundExpression; } return "null"; } private boolean findDocumentParts(Node root) { Node node = root; int depth = 0; while (node != null) { if (node.nodeName().compareTo("#text") != 0) { HashMap<String, Integer> level = allLevels.get(depth); // System.out.println(depth + " " + allLevels.size()); if (level.containsKey(node.nodeName() + "[class=" + node.attr("class") + "]")) { Integer get = level.get(node.nodeName() + "[class=" + node.attr("class") + "]"); level.put(node.nodeName() + "[class=" + node.attr("class") + "]", get + 1); } else { level.put(node.nodeName() + "[class=" + node.attr("class") + "]", 1); } } if (node.childNodeSize() > 0) { node = node.childNode(0); depth++; } else { while (node.nextSibling() == null && depth > 0) { node = node.parentNode(); depth--; } if (node == root) { break; } node = node.nextSibling(); } } //ak je 0 alebo 1 datum, vratime false, kedze sa to neda zistit if (dateCount < 2) { return false; } else { return findOnePart(dateCount); } } private boolean findOnePart(int commentCount) { int i = 0; for (HashMap<String, Integer> level : allLevels) { for (String nodeWithClass : level.keySet()) { if (level.get(nodeWithClass).compareTo(commentCount) == 0) { // System.out.println(i + " " + nodeWithClass+" "); nodeWithClass = nodeWithClass.replace("[class=]", ""); documentPartNode = nodeWithClass; // System.out.println("xxxxxxx"+documentPartNode); return true; } } i++; } return false; } private void traversePage(Node root) { Node node = root; int depth = 0; while (node != null) { // System.out.println(depth + " " + node.nodeName() + " " + node.childNodeSize()); // if(node.attr("class").compareTo("contribution")==0){ // System.out.println(depth); // } if (maxDepth < depth) { maxDepth = depth; } boolean analyze = analyze(node); if (analyze) { break; } if (node.childNodeSize() > 0) { node = node.childNode(0); depth++; } else { while (node.nextSibling() == null && depth > 0) { node = node.parentNode(); depth--; } if (node == root) { break; } node = node.nextSibling(); } } } private boolean analyze(Node node) { // System.out.println(node.nodeName()); for (Attribute attribute : node.attributes().asList()) { String key = attribute.getKey(); String value = attribute.getValue(); // System.out.println(" attr:" + key + " value:" + value); if (!foundDateStringSwitch) { foundDateStringSwitch = findDate(node, value); } if (foundDateStringSwitch) { boolean foundDateString = findDate(node, value); if (foundDateString) { String child = node.childNode(0).toString(); foundDate = findDateValue(node, child); dateScore = 10; } } else { foundDate = findDateValue(node, value); dateScore = 5; } } return false; // return foundDate && foundAuthor && foundText; } public boolean findDate(Node node, String value) { //slovo datum v texte nehladame if (node.nodeName().compareTo("#text") == 0) { return false; } //hladame text date, datum, dtum Pattern datePattern = Pattern.compile("date|datum|dtum", CASE_INSENSITIVE); Matcher dateMatcher = datePattern.matcher(value); while (dateMatcher.find()) { // if (value.contains("date") || value.contains("datum") || value.contains("dtum")) { return true; } return false; } public boolean findDateValue(Node node, String value) { //dni cislom, oddelovac: medzera,bodka,pomlcka a mozno medzera, mesiace cislom a slovom, roky 2 a 4 miestne String daysNum = "\\d{1,2}"; String monthsEN = "(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t|tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?"; String monthsSK = "|jan(?:ur|ura)?|feb(?:rur|rura)?|mar(?:ec|ca)?|apr(?:l|la)?|mj(?:a)?|jn(?:a)?|jl(?:a)?|aug(?:usta)?|sep(?:tember|tembra)?|okt(?:ber|bra)?|nov(?:ember|embra)?|dec(?:ember|embra)?"; String monthsCZ = "|led(?:en|na)?|nor(?:a)?|bez(?:en|na)?|dub(?:en|na)?|kvt(?:en|na)?|?erv(?:en|na)?|?erven(?:ce|ec)?|srp(?:en|na)?|z(?:)?|j(?:en|na)?|list(?:opad|opadu)?|pros(?:inec|ince)?"; String monthsNum = "|\\d{1,2})"; String separator = "( |\\.|-) ?"; // medzera, bodka, pomlcka, nic a potom bud medzera alebo ne String years = "(\\d{2,4})?"; String todayYesterday = "|(dnes|v?era|today|yesterday)..?([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]"; String strPattern = daysNum.concat(separator).concat(monthsEN).concat(monthsCZ).concat(monthsSK) .concat(monthsNum).concat(separator).concat(years).concat(todayYesterday); Pattern pattern = Pattern.compile(strPattern, CASE_INSENSITIVE); Matcher matcher = pattern.matcher(value); while (matcher.find()) { // String foundExpression = matcher.group(); // System.out.println(foundExpression); // System.out.println(value); //convert today/yesterday to actual date Pattern todayYesterdayPattern = Pattern.compile(todayYesterday.substring(1), CASE_INSENSITIVE); Matcher todayYesterdayMatcher = todayYesterdayPattern.matcher(value); while (todayYesterdayMatcher.find()) { value = value.replaceAll("dnes|today", today); value = value.replaceAll("v?era|yesterday", yesterday); } date = value; dateCount++; dateNode = node; //// System.out.println("date: " + date.trim()); return true; } return false; } }