List of usage examples for the org.jsoup.select.Elements constructor Elements()
public Elements()
From source file:module.entities.NameFinder.RegexNameFinder.java
/** * @param args the command line arguments *///from w ww .j a v a 2 s . c om public static void main(String[] args) throws SQLException, IOException { if (args.length == 1) { Config.configFile = args[0]; } long lStartTime = System.currentTimeMillis(); Timestamp startTime = new Timestamp(lStartTime); System.out.println("Regex Name Finder process started at: " + startTime); DB.initPostgres(); regexerId = DB.LogRegexFinder(lStartTime); initLexicons(); JSONObject obj = new JSONObject(); TreeMap<Integer, String> consultations = DB.getDemocracitConsultationBody(); Document doc; int count = 0; TreeMap<Integer, String> consFoundNames = new TreeMap<>(); TreeMap<Integer, String> consFoundRoles = new TreeMap<>(); for (int consId : consultations.keySet()) { String consBody = consultations.get(consId); String signName = "", roleName = ""; doc = Jsoup.parse(consBody); Elements allPars = new Elements(); Elements paragraphs = doc.select("p"); for (Element par : paragraphs) { if (par.html().contains("<br>")) { String out = "<p>" + par.html().replaceAll("<br>", "</p><p>") + "</p>"; Document internal_doc = Jsoup.parse(out); Elements subparagraphs = internal_doc.select("p"); allPars.addAll(subparagraphs); } else { allPars.add(par); } // System.out.println(formatedText); } String signature = getSignatureFromParagraphs(allPars); // System.out.println(signature); if (signature.contains("#")) { String[] sign_tokens = signature.split("#"); signName = sign_tokens[0]; if (sign_tokens.length > 1) { roleName = sign_tokens[1]; } consFoundNames.put(consId, signName.trim()); consFoundRoles.put(consId, roleName.trim()); count++; } else { System.err.println("--" + consId); } // } DB.insertDemocracitConsultationMinister(consFoundNames, consFoundRoles); TreeMap<Integer, String> consultationsCompletedText = DB.getDemocracitCompletedConsultationBody(); Document doc2; TreeMap<Integer, String> complConsFoundNames = new TreeMap<>(); int count2 = 0; for (int consId : consultationsCompletedText.keySet()) 
{ String consBody = consultationsCompletedText.get(consId); String signName = "", roleName = ""; doc2 = Jsoup.parse(consBody); // if (doc.text().contains("<br>")) { // doc.text().replaceAll("(<[Bb][Rr]>)+", "<p>"); // } Elements allPars = new Elements(); Elements paragraphs = doc2.select("p"); for (Element par : paragraphs) { if (par.html().contains("<br>")) { String out = "<p>" + par.html().replaceAll("<br>", "</p><p>") + "</p>"; Document internal_doc = Jsoup.parse(out); Elements subparagraphs = internal_doc.select("p"); allPars.addAll(subparagraphs); } else { allPars.add(par); } } String signature = getSignatureFromParagraphs(allPars); if (signature.contains("#")) { String[] sign_tokens = signature.split("#"); signName = sign_tokens[0]; if (sign_tokens.length > 1) { roleName = sign_tokens[1]; } consFoundNames.put(consId, signName.trim()); consFoundRoles.put(consId, roleName.trim()); // System.out.println(consId); // System.out.println(signName.trim()); // System.out.println("***************"); count2++; } else { System.err.println("++" + consId); } } DB.insertDemocracitConsultationMinister(complConsFoundNames, consFoundRoles); long lEndTime = System.currentTimeMillis(); System.out.println("Regex Name Finder process finished at: " + startTime); obj.put("message", "Regex Name Finder finished with no errors"); obj.put("details", ""); DB.UpdateLogRegexFinder(lEndTime, regexerId, obj); DB.close(); }
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
private Map<String, ConfluenceLink> buildTableOfContentsLinkMap() { final Map<String, ConfluenceLink> titleLinkMap = new HashMap<>(); final Document document = SWAGGER_DOCUMENT.get(); final Elements tocElements = document.select(".toc"); final Elements tocCategoryElements = tocElements.select(".sectlevel1").first().children(); final Elements tocFilteredCategoryElements = new Elements(); for (final Element tocCategoryElement : tocCategoryElements) { final Element categoryLinkElement = tocCategoryElement.children().first(); tocFilteredCategoryElements.add(categoryLinkElement); }// w w w .j a va2 s .c o m final Elements tocIndividualElements = tocElements.select(".sectlevel2"); addLinksByType(titleLinkMap, tocFilteredCategoryElements, PageType.CATEGORY, null); int categoryCount = 1; for (final Element tocIndividualElement : tocIndividualElements) { final Elements tocIndividualElementLinks = tocIndividualElement.select("a"); addLinksByType(titleLinkMap, tocIndividualElementLinks, INDIVIDUAL, categoryCount); categoryCount++; } return titleLinkMap; }
From source file:be.ibridge.kettle.jsoup.JsoupInput.java
private void parseJsoup() throws Exception { // Read JSOUP source if (data.file != null) { data.jsoupReader = Jsoup.parse(new File(data.filename), "UTF-8"); } else {/*ww w.j a va 2 s . c om*/ if (meta.isReadUrl()) { data.jsoupReader = Jsoup.parse(new URL(data.stringToParse), 1000); } else { // read string data.jsoupReader = Jsoup.parse(data.stringToParse); } } List<Elements> resultList = new ArrayList<Elements>(); data.nrrecords = -1; data.recordnr = 0; String prevPath = ""; for (int i = 0; i < data.nrInputFields; i++) { String path = meta.getInputFields()[i].getPath(); Elements ja = data.jsoupReader.select(path); if (ja.size() > 0 && (data.nrrecords != -1 && data.nrrecords != ja.size() && ja != null)) { throw new KettleException(BaseMessages.getString(PKG, "JsoupInput.Error.BadStructure", ja.size(), path, prevPath, data.nrrecords)); } resultList.add(ja); if (data.nrrecords == -1 && ja != null) { data.nrrecords = ja.size(); } prevPath = path; } data.resultList = new ArrayList<Elements>(); Iterator<Elements> it = resultList.iterator(); while (it.hasNext()) { Elements j = it.next(); if (j == null || j.size() == 0) { if (data.nrrecords == -1) { data.nrrecords = 1; } // The object is empty means that we do not // find Jsoup path // We need here to create a dummy structure j = new Elements(); for (int i = 0; i < data.nrrecords; i++) { j.add(null); } } data.resultList.add(j); } resultList = null; if (log.isDetailed()) { logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.NrRecords", data.nrrecords)); } }
From source file:eu.sisob.uma.extractors.adhoc.cvfilesinside.InternalCVFilesExtractor.java
/** * * @param input_file//from w ww .j a v a 2 s .c o m * @param data_dir * @param output_file * @param error_sw */ public static void extract_cv_files(File input_file, File data_dir, File output_file/*, File output_file_2, File results_dir,*/, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; int idEmail = -1; int idScoreEmail = -1; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) 
idResearcherWebAddressType = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; else if (column_name.equals(FileFormatConversor.CSV_COL_EMAIL)) idEmail = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_EMAIL)) idScoreEmail = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idResearcherWebAddressType != -1 && idResearcherWebAddressExt != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { if (true) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_FIRSTNAME + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) header += "\"" + 
FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"" + CSV_SEPARATOR; header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); // DOWNLOAD HERE THE HOME PAGE //FileUtils.write(output_file_2, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { // DOWNLOAD HERE THE HOME PAGE // if(!results_dir.exists()) // results_dir.mkdirs(); // File homepage_results_dirs = new File(results_dir, "HOMEPAGE"); // if(!homepage_results_dirs.exists()) // homepage_results_dirs.mkdirs(); //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); Document content = null; String researcher_page_url = nextLine[idResearcherWebAddress]; File temp_file = null; if (p1.matcher(researcher_page_url).matches()) { } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); temp_file = File.createTempFile("internal-cv-files-", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp_file); long sizeInBytes = temp_file.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = null; } else { String text_content = FileUtils.readFileToString(temp_file); String check_string = ""; if (text_content.length() <= 100) { check_string = text_content.substring(0, text_content.length()); } else { check_string = text_content.substring(0, 100); } if (check_string.toLowerCase().contains("html")) { content = Jsoup.parse(text_content); 
content.setBaseUri(researcher_page_url); // DOWNLOAD HERE THE HOME PAGE // String filename = nextLine[idStaffIdentifier] + "_HOMEPAGE_" + MD5(researcher_page_url) + ".html"; // FileUtils.copyFile(temp_file, new File(homepage_results_dirs, filename)); // // String result = ""; // result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; // if(idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; // if(idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; // if(idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; // if(idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; // if(idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; // result += "\"" + filename + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; // result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; // if(idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; // result += "\r\n"; // // try { // FileUtils.write(output_file_2, result, "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } else { throw new Exception(researcher_page_url + " is not html document"); } } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + researcher_page_url + " could not loaded"); content = null; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root") .error("" + researcher_page_url + " could not loaded (out of memory)", ex2); error_sw.append("" + researcher_page_url + " could not loaded (out of memory)"); content = null; } finally { if (temp_file != null) temp_file.delete(); } } //Add sources to output { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; result += "\"HOMEPAGE\"" + CSV_SEPARATOR; result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } if (content != null) { Elements links = content.select("a[href]"); Elements links_worepeat = new Elements(); for (Element link : links) { boolean b = false; for (Element link_worepeat : links_worepeat) { if (link.absUrl("href").equals(link_worepeat.absUrl("href"))) { b = true; break; } } if (!b) links_worepeat.add(link); } for (Element link : links_worepeat) { boolean b = false; link.setBaseUri(researcher_page_url); String clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("CV found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "CV"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "B"; } else { score = "A"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + 
CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } b = false; link.setBaseUri(researcher_page_url); clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : pub_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("PUB found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "PUB"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "-"; } else { score = "-"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + 
CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } } reader.close(); } // reader = null; // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // reader.readNext(); // // int newIdResearcherWebpage = 3; // if(idFirstName != -1) newIdResearcherWebpage++; // if(idName != -1) newIdResearcherWebpage++; // if(idEmail != -1) newIdResearcherWebpage++; // if(idInstitutionName != -1) newIdResearcherWebpage++; // if(idWebAddress != -1) newIdResearcherWebpage++; // // List<Object[]> urls_times = new ArrayList<Object[]>(); // while ((nextLine = reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // // Object[] url_time = new Object[2]; // url_time[0] = url; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url_time[0])){ // u[1] = (Integer)u[1] + 1; // b = true; // break; // } // } // // if(!b){ // url_time[1] = new Integer(1); // urls_times.add(url_time); // } // } // // reader.close(); // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // nextLine = reader.readNext(); // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", false); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // // while ((nextLine = 
reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url) && ((Integer)u[1] == 1)){ // b = true; // break; // } // } // // if(b){ // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // } // } // // reader.close(); } catch (Exception ex) { String error_msg = "Error extracting cv files from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }
From source file:com.screenslicer.core.util.BrowserUtil.java
/**
 * Resolves an {@code HtmlNode} description to the best matching element of the page.
 *
 * <p>A unique id match is tried first. Otherwise every populated attribute of the node
 * contributes a list of candidate elements; each appearance of an element in a list is
 * worth two votes plus one more when the element is not hidden, and the element with
 * the most votes is converted.
 *
 * @param browser  browser used to open the page and to read the current url
 * @param htmlNode description of the element being looked for
 * @param body     parsed page, or {@code null} to open it from the browser
 * @param recurse  passed through to the element-level {@code toElement} overload
 * @return the result of the element-level {@code toElement} overload
 * @throws ActionFailed declared by the called helpers
 */
private static WebElement toElement(Browser browser, HtmlNode htmlNode, Element body, boolean recurse)
        throws ActionFailed {
    if (body == null) {
        body = BrowserUtil.openElement(browser, true, null, null, null, null);
    }

    // Shortcut: an id that matches exactly one element.
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        Elements byId = body.getElementsByAttributeValue("id", htmlNode.id);
        if (byId.size() == 1) {
            WebElement direct = toElement(browser, byId.get(0), htmlNode, recurse);
            if (direct != null) {
                return direct;
            }
        }
    }

    List<Elements> candidateLists = new ArrayList<Elements>();
    if (!CommonUtil.isEmpty(htmlNode.tagName)) {
        candidateLists.add(body.getElementsByTag(htmlNode.tagName));
    } else if (!CommonUtil.isEmpty(htmlNode.href)) {
        candidateLists.add(body.getElementsByTag("a"));
    }
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        candidateLists.add(body.getElementsByAttributeValue("id", htmlNode.id));
    }
    if (!CommonUtil.isEmpty(htmlNode.name)) {
        candidateLists.add(body.getElementsByAttributeValue("name", htmlNode.name));
    }
    if (!CommonUtil.isEmpty(htmlNode.type)) {
        candidateLists.add(body.getElementsByAttributeValue("type", htmlNode.type));
    }
    if (!CommonUtil.isEmpty(htmlNode.value)) {
        candidateLists.add(body.getElementsByAttributeValue("value", htmlNode.value));
    }
    if (!CommonUtil.isEmpty(htmlNode.title)) {
        candidateLists.add(body.getElementsByAttributeValue("title", htmlNode.title));
    }
    if (!CommonUtil.isEmpty(htmlNode.role)) {
        candidateLists.add(body.getElementsByAttributeValue("role", htmlNode.role));
    }
    if (!CommonUtil.isEmpty(htmlNode.alt)) {
        candidateLists.add(body.getElementsByAttributeValue("alt", htmlNode.alt));
    }

    if (htmlNode.classes != null && htmlNode.classes.length > 0) {
        // Count for every element how many of the wanted classes it carries ...
        Map<Element, Integer> classHits = new HashMap<Element, Integer>();
        for (int c = 0; c < htmlNode.classes.length; c++) {
            for (Element hit : body.getElementsByClass(htmlNode.classes[c])) {
                Integer soFar = classHits.get(hit);
                classHits.put(hit, soFar == null ? 1 : soFar + 1);
            }
        }
        // ... and keep only the elements with the highest count.
        int bestHits = 0;
        for (Integer hits : classHits.values()) {
            bestHits = Math.max(bestHits, hits);
        }
        Elements bestByClass = new Elements();
        for (Map.Entry<Element, Integer> hit : classHits.entrySet()) {
            if (hit.getValue() == bestHits) {
                bestByClass.add(hit.getKey());
            }
        }
        candidateLists.add(bestByClass);
    }

    if (!CommonUtil.isEmpty(htmlNode.href)) {
        Elements links = body.getElementsByAttribute("href");
        Elements weighted = new Elements();
        String currentUrl = browser.getCurrentUrl();
        String wanted = htmlNode.href;
        for (Element link : links) {
            String found = link.attr("href");
            // A link is listed once per weight point: exact 3, fuzzy suffix 2,
            // fuzzy substring or canonical-uri match 1.
            int weight;
            if (wanted.equalsIgnoreCase(found)) {
                weight = 3;
            } else if (htmlNode.fuzzy && found != null && found.endsWith(wanted)) {
                weight = 2;
            } else if (htmlNode.fuzzy && found != null && found.contains(wanted)) {
                weight = 1;
            } else {
                String uriWanted = UrlUtil.toCanonicalUri(currentUrl, wanted);
                String uriFound = UrlUtil.toCanonicalUri(currentUrl, found);
                weight = uriWanted.equalsIgnoreCase(uriFound) ? 1 : 0;
            }
            for (int n = 0; n < weight; n++) {
                weighted.add(link);
            }
        }
        candidateLists.add(weighted);
    }

    if (!CommonUtil.isEmpty(htmlNode.innerText)) {
        String quotedText = Pattern.quote(htmlNode.innerText);
        candidateLists.add(body.getElementsMatchingText(quotedText));
        candidateLists.add(body.getElementsMatchingText("^\\s*" + quotedText + "\\s*$"));
    }
    if (htmlNode.multiple != null) {
        candidateLists.add(body.getElementsByAttribute("multiple"));
    }

    // Tally the votes: 2 per appearance, 3 when the element is visible.
    Map<Element, Integer> votes = new HashMap<Element, Integer>();
    for (Elements candidates : candidateLists) {
        for (Element candidate : candidates) {
            Integer tally = votes.get(candidate);
            int before = tally == null ? 0 : tally;
            int gained = NodeUtil.isHidden(candidate) ? 2 : 3;
            votes.put(candidate, before + gained);
        }
    }

    Element winner = null;
    int winningVotes = 0;
    for (Map.Entry<Element, Integer> vote : votes.entrySet()) {
        if (vote.getValue() > winningVotes) {
            winningVotes = vote.getValue();
            winner = vote.getKey();
        }
    }
    return toElement(browser, winner, htmlNode, recurse);
}
From source file:com.screenslicer.core.util.Util.java
public static WebElement toElement(RemoteWebDriver driver, HtmlNode htmlNode, Element body) throws ActionFailed { if (body == null) { body = Util.openElement(driver, null, null, null); }//from w ww . j a v a 2s . co m if (!CommonUtil.isEmpty(htmlNode.id)) { WebElement element = toElement(driver, body.getElementById(htmlNode.id)); if (element != null) { return element; } } List<Elements> selected = new ArrayList<Elements>(); if (!CommonUtil.isEmpty(htmlNode.tagName)) { selected.add(body.getElementsByTag(htmlNode.tagName)); } else if (!CommonUtil.isEmpty(htmlNode.href)) { selected.add(body.getElementsByTag("a")); } if (!CommonUtil.isEmpty(htmlNode.name)) { selected.add(body.getElementsByAttributeValue("name", htmlNode.name)); } if (!CommonUtil.isEmpty(htmlNode.type)) { selected.add(body.getElementsByAttributeValue("type", htmlNode.type)); } if (!CommonUtil.isEmpty(htmlNode.value)) { selected.add(body.getElementsByAttributeValue("value", htmlNode.value)); } if (!CommonUtil.isEmpty(htmlNode.title)) { selected.add(body.getElementsByAttributeValue("title", htmlNode.title)); } if (htmlNode.classes != null && htmlNode.classes.length > 0) { Map<Element, Integer> found = new HashMap<Element, Integer>(); for (int i = 0; i < htmlNode.classes.length; i++) { Elements elements = body.getElementsByClass(htmlNode.classes[i]); for (Element element : elements) { if (!found.containsKey(element)) { found.put(element, 0); } found.put(element, found.get(element) + 1); } } Elements elements = new Elements(); for (int i = htmlNode.classes.length; i > 0; i--) { for (Map.Entry<Element, Integer> entry : found.entrySet()) { if (entry.getValue() == i) { elements.add(entry.getKey()); } } if (!elements.isEmpty()) { break; } } selected.add(elements); } if (!CommonUtil.isEmpty(htmlNode.href)) { Elements hrefs = body.getElementsByAttribute("href"); Elements toAdd = new Elements(); String currentUrl = driver.getCurrentUrl(); String hrefGiven = htmlNode.href; for (Element href : hrefs) { String 
hrefFound = href.attr("href"); if (hrefGiven.equalsIgnoreCase(hrefFound)) { toAdd.add(href); } else { String uriGiven = Util.toCanonicalUri(currentUrl, hrefGiven); String uriFound = Util.toCanonicalUri(currentUrl, hrefFound); if (uriGiven.equalsIgnoreCase(uriFound)) { toAdd.add(href); } } } selected.add(toAdd); } if (!CommonUtil.isEmpty(htmlNode.innerText)) { selected.add(body.getElementsMatchingText(Pattern.quote(htmlNode.innerText))); } if (htmlNode.multiple != null) { selected.add(body.getElementsByAttribute("multiple")); } Map<Element, Integer> votes = new HashMap<Element, Integer>(); for (Elements elements : selected) { for (Element element : elements) { if (!Util.isHidden(element)) { if (!votes.containsKey(element)) { votes.put(element, 0); } votes.put(element, votes.get(element) + 1); } } } int maxVote = 0; Element maxElement = null; for (Map.Entry<Element, Integer> entry : votes.entrySet()) { if (entry.getValue() > maxVote) { maxVote = entry.getValue(); maxElement = entry.getKey(); } } return toElement(driver, maxElement); }
From source file:org.asqatasun.processor.DOMHandlerImpl.java
/** * This method should be called at each first selection of a RuleImplementation * to reset all the local collections./*from w w w . j ava2 s. c o m*/ * * @return the current instance */ @Override public DOMHandler beginCssLikeSelection() { // reset the local collection of elements selectedElements = new Elements(); return this; }
From source file:org.asqatasun.processor.DOMHandlerImpl.java
/**
 * Returns the number of elements matched by the "*" selector.
 * The value is computed on the first call (while the cached value is still -1) and
 * the selection used for counting is cleared again afterwards.
 *
 * @return the cached total number of elements
 */
@Override
public int getTotalNumberOfElements() {
    if (totalNumberOfElement != -1) {
        return totalNumberOfElement;
    }
    totalNumberOfElement = cssLikeSelectNodeSet("*").getSelectedElementNumber();
    // counting must not leave a selection behind
    selectedElements = new Elements();
    return totalNumberOfElement;
}
From source file:org.asqatasun.rules.elementchecker.text.TextEmptinessCheckerTest.java
/**
 * Prepares the fixtures shared by the tests: an empty element collection,
 * a bare {@code div} element and the mocked collaborators.
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    // plain jsoup fixtures
    elements = new Elements();
    element = new Element(Tag.valueOf("div"), "");

    // mocked collaborators
    mockTextElementBuilder = createMock(TextElementBuilder.class);
    mockSSPHandler = createMock(SSPHandler.class);
    mockTestSolutionHandler = createMock(TestSolutionHandler.class);
    mockProcessRemarkService = createMock(ProcessRemarkService.class);
}
From source file:org.asqatasun.rules.elementchecker.text.TextNotIdenticalToAttributeCheckerTest.java
/**
 * Prepares the fixtures shared by the tests: an empty element collection,
 * a {@code div} element whose alt attribute is "test" and the mocked collaborators.
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    // plain jsoup fixtures
    elements = new Elements();
    element = new Element(Tag.valueOf("div"), "");
    element.attr(AttributeStore.ALT_ATTR, "test");

    // mocked collaborators
    mockTextElementBuilder = createMock(TextElementBuilder.class);
    mockSSPHandler = createMock(SSPHandler.class);
    mockTestSolutionHandler = createMock(TestSolutionHandler.class);
    mockProcessRemarkService = createMock(ProcessRemarkService.class);
}