List of usage examples for org.dom4j Element elementIterator
Iterator<Element> elementIterator(QName qName);
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Reporting.ReportingUtils.java
License:Open Source License
public static void extractExpressionFromCSVID(String xmlfile, String csvfile, String outfile) throws Exception { org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlfile); org.dom4j.Element root = document.getRootElement(); CSVReader csvreader = new CSVReader(new FileReader(csvfile), ';'); HashMap<String, List<String>> hm = new HashMap<String, List<String>>(); String[] nextLine;//from w ww. j a va 2s . c o m while ((nextLine = csvreader.readNext()) != null) { hm.put(nextLine[0], new ArrayList<String>()); } for (Iterator i1 = root.elementIterator("blockinfo"); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); String id = e1.attribute("id_entity").getValue(); //if(!id.equals("786643")) continue; List<String> al = hm.get(id); if (al == null) continue; int PA = 0, AS = 0; if (al.size() != 0) { int last = al.size() - 1; String[] ss = al.get(last).split("\t"); PA = Integer.parseInt(ss[0]); AS = Integer.parseInt(ss[1]); al.remove(last); } for (Object obj : e1.elements()) { org.dom4j.Element exp = (org.dom4j.Element) obj; String aux_content = exp.element("Content").getText().replace("\n", " ").replace("\r\n", " "); if (!al.contains(exp.getName() + "\t\t" + aux_content + "\r\n")) { if (exp.getName().contains("Activity")) PA++; if (exp.getName().contains("Studies")) AS++; al.add(exp.getName() + "\t\t" + aux_content + "\r\n"); } } //content += "\tPA: " + PA + " AS: " + AS + "\r\n"; al.add(PA + "\t" + AS); hm.put(id, al); } csvreader = new CSVReader(new FileReader(csvfile), ';'); while ((nextLine = csvreader.readNext()) != null) { String key = nextLine[0]; //System.out.println("ID: " + key + " " + hm.get(key)); List<String> al = hm.get(key); int size = al.size(); System.out.println(key + "\t" + (size != 0 ? al.get(size - 1) : "")); //for(String s : hm.get(key)) // System.out.println(s); } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Reporting.ReportingUtils.java
License:Open Source License
public static void extractExpressions(String xmlfile, String csvfile, String outfile) throws Exception { org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlfile); org.dom4j.Element root = document.getRootElement(); CSVReader csvreader = new CSVReader(new FileReader(csvfile), ';'); HashMap<String, String> hm = new HashMap<String, String>(); String[] nextLine;/*www .ja v a2s. com*/ while ((nextLine = csvreader.readNext()) != null) { //hm.put(nextLine[0], ""); } for (Iterator i1 = root.elementIterator("blockinfo"); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); String id = e1.attribute("id_entity").getValue(); //if(!id.equals("786643")) continue; int PA = 0, AS = 0; for (Object obj : e1.elements()) { org.dom4j.Element exp = (org.dom4j.Element) obj; if (exp.getName().contains("Activity")) { hm.put(exp.element("Title_name").getText(), ""); } } } csvreader = new CSVReader(new FileReader(csvfile), ';'); for (String key : hm.keySet()) { System.out.println(key); } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.CleanerResearchersWebpages.java
License:Open Source License
/** * /*from w w w. j a v a 2 s . c o m*/ * @param elementResearcher * @param path * @param sInstitutionName * @param sWebAddress * @param sUnitOfAssessment_Description * @param sResearchGroupDescription * @param sResearchName * @param sResearchInitials * @param sStaffIndentifier * @return */ @Override protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) { for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); String url = e5.getText(); if (!url.equals("")) { String ext = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT); if (ext == null || ext == "") ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML; String type = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE); if (type == null || type == "") ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV; String filename = ResearchersPagePostProcessor.getHashFileName(type, url, ext); ResearchersPagePostProcessor.cleanFile(ResearchersPagePostProcessor.getCleanerProperties(), path, filename, filename); } } return true; }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.DownloaderResearchersWebPagesXMLFormat.java
License:Open Source License
public static void downloadResearchesPages(String destDir, LocalFormatType downloading_format_type, org.dom4j.Element elInstitution, boolean redownload) { try {//w ww. j a v a 2 s .c om String sInstitutionName = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchName = ""; String sResearchInitials = ""; org.dom4j.Element e1 = elInstitution; //(org.dom4j.Element) i1.next(); sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText(); File dirBase = null; dirBase = DownloaderResearchersWebPagesXMLFormat.giveMeTheDirectory(destDir, false); File dirI = null; if (downloading_format_type.equals(LocalFormatType.TREE_DIRECTORY)) dirI = DownloaderResearchersWebPagesXMLFormat.giveMeTheDirectory( destDir + "\\" + sInstitutionName.replaceAll("[^a-z^A-Z]", "") + "\\", redownload); for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); File dirUAD = null; if (downloading_format_type.equals(LocalFormatType.TREE_DIRECTORY)) dirUAD = DownloaderResearchersWebPagesXMLFormat.giveMeTheDirectory(dirI.getPath() + "\\" + sUnitOfAssessment_Description.replaceAll("[^a-z^A-Z]", "") + "\\", redownload); for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); sResearchName = e4.element(XMLTags.RESEARCHER_LASTNAME).getText(); sResearchInitials = e4.element(XMLTags.RESEARCHER_INITIALS).getText(); String sAux = sResearchName.replaceAll("[^a-z^A-Z]", "") + "#" + sResearchInitials.replaceAll("[^a-z^A-Z]", ""); File dirR = null; if 
(downloading_format_type.equals(LocalFormatType.TREE_DIRECTORY)) dirR = DownloaderResearchersWebPagesXMLFormat .giveMeTheDirectory(dirUAD.getPath() + "\\" + sAux + "\\", false); else dirR = dirBase; for (Iterator i5 = e4.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); String url = e5.getText(); if (!url.equals("")) { String ext = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT); if (ext == null || ext == "") ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML; String type = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE); if (type == null || type == "") ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV; String fileDownloaded = ResearchersPagePostProcessor .downloadAndClean(dirR.getAbsolutePath(), type, url, ext, true, redownload); fileDownloaded = fileDownloaded; } } } } } } catch (Exception ex) { ProjectLogger.LOGGER.error("ERROR: " + ex.getMessage()); } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.DownloaderResearchersWebPagesXMLFormat.java
License:Open Source License
public static void downloadAllResearchersPagesWithThreads(String xmlFile, String destDir, LocalFormatType downloading_format_type, int numberThreads, boolean redownload) { try {/*from w w w. ja v a 2 s . c om*/ org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); //if(redownload) // FileFootils.deleteDir(destDir + "\\"); String sInstitutionName = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchName = ""; String sResearchInitials = ""; File dir = new File(destDir + "\\"); if (!dir.exists()) { if (!dir.mkdir()) throw new Exception("Cant create " + dir.getPath()); } MonitorThread monitor = new MonitorThread(numberThreads); //downloaderResearchesPages[] ath = new DownloaderResearchersWebPagesXMLFormat[numberThreads]; //for(int i = 0; i < ath.length; i++) ath[i] = null; boolean bExit = false; boolean bAnyWorks = true; for (Iterator i1 = root.elementIterator("Institution"); i1.hasNext() && !bExit;) { if (monitor.canCreateNewThread()) { DownloaderResearchersWebPagesXMLFormat ath = new DownloaderResearchersWebPagesXMLFormat(destDir, downloading_format_type, (org.dom4j.Element) i1.next(), monitor, redownload); ath.start(); } else { sleep(5000); } } } catch (Exception ex) { // } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.DownloaderResearchersWebPagesXMLFormat.java
License:Open Source License
public static void downloadAllResearchersPages(String xmlFile, String destDir, LocalFormatType downloading_format_type, boolean redownload) { try {//from w ww . j a v a 2s . c o m //if(true) return; org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); FileFootils.deleteDir(destDir + "\\"); String sInstitutionName = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchName = ""; String sResearchInitials = ""; File dir = new File(destDir + "\\"); if (!dir.mkdir()) throw new Exception("Cant create " + dir.getPath()); for (Iterator i1 = root.elementIterator("Institution"); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); DownloaderResearchersWebPagesXMLFormat.downloadResearchesPages(destDir, downloading_format_type, e1, redownload); } } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsInFolder.java
License:Open Source License
/** * Reader folder of one researcher and takes the uri of clean file for to make infoblock. * @param elementResearcher/* www. j a v a2s .c o m*/ * @param path * @param sInstitutionName * @param sWebAddress * @param sUnitOfAssessment_Description * @param sResearchGroupDescription * @param sResearchName * @param sResearchInitials * @param sStaffIndentifier */ @Override protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) { File fAux = new File(path); File[] adirRW = fAux.listFiles(); if (adirRW != null) { for (File file : adirRW) { if (!file.isDirectory()) continue; boolean b = false; for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); String sURL = e5.getText(); byte[] bytes = sURL.getBytes(); String sAuxxx = path + "\\" + Integer.toHexString(MurmurHash.hash(bytes, 5)); if (file.getPath().equals(sAuxxx.replace("\\\\", "\\"))) { b = true; File dirFinalFiles = new File(file.getPath()); File[] afinalFiles = dirFinalFiles.listFiles(); //Search all clean files for (File finalFile : afinalFiles) { if (finalFile.getName().contains("clean_")) { //FIXME if (!finalFile.getName().contains("pub")) if (finalFile.exists()) { rootOut.addElement("infoblock") .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT, sStaffIndentifier) .addAttribute( DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT, DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER) .addAttribute( DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING, DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC) .addText(finalFile.getAbsolutePath()); } } } } } if (!b) Logger.getLogger("MyLog").warning("FILE MISSED: " + file.getPath()); } } return true; }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsOnXMLFileForTextMiningCreator.java
License:Open Source License
/** * Reader folder of one researcher and takes the uri of clean file for to make infoblock. * Note: Read comment in top of file./*w ww.j a v a 2 s.co m*/ * @param elementResearcher * @param path * @param sInstitutionName * @param sWebAddress * @param sUnitOfAssessment_Description * @param sResearchGroupDescription * @param sResearchName * @param sResearchInitials * @param sStaffIndentifier */ @Override protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) { File fAux = new File(path); File[] adirRW = fAux.listFiles(); if (adirRW != null) { for (File file : adirRW) { for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); String sURL = e5.getText(); byte[] bytes = sURL.getBytes(); String sAuxxx = path + "\\" + Integer.toHexString(MurmurHash.hash(bytes, 5)); if (file.getPath().equals(sAuxxx.replace("\\\\", "\\"))) { File dirFinalFiles = new File(file.getPath()); File[] afinalFiles = dirFinalFiles.listFiles(); //Search all clean files for (File finalFile : afinalFiles) { if (finalFile.getName().contains("clean_")) { //FIXME if (!finalFile.getName().contains("pub")) if (finalFile.exists()) { rootOut.addElement("infoblock") .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT, sStaffIndentifier) .addAttribute( DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT, DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER) .addAttribute( DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING, DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC) .addText(finalFile.getAbsolutePath()); } } } } else { Logger.getLogger("MyLog").warning( "DIR NOT EQUAL: " + file.getPath() + " != " + sAuxxx.replace("\\\\", "\\")); } } } } return 
true; }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsOnXMLFileForTextMiningCreatorWithFilter.java
License:Open Source License
/** * /* ww w .j av a 2 s . c om*/ * @param elementResearcher * @param path * @param sInstitutionName * @param sWebAddress * @param sUnitOfAssessment_Description * @param sResearchGroupDescription * @param sResearchName * @param sResearchInitials * @param sStaffIndentifier */ @Override protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) { File fAux = new File(path); File[] adirRW = fAux.listFiles(); if (adirRW != null) { for (File file : adirRW) { for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); String sURL = e5.getText(); byte[] bytes = sURL.getBytes(); String sAuxxx = path + "\\" + Integer.toHexString(MurmurHash.hash(bytes, 5)); if (file.getPath().equals(sAuxxx.replace("\\\\", "\\"))) { File fileURL = new File(file.getPath() + "\\clean_index.html"); boolean b = true; org.jsoup.nodes.Document doc = null; try { // org.jsoup.nodes.Document doc2 = Jsoup.connect(sURL).get(); doc = Jsoup.parse(fileURL, "UTF-8", sURL); org.jsoup.select.Elements els = doc.body().select(xpathExp); //(":containsOwn(" + sLiteralExp + ")"); //logger.info("URL: " + fileURL.getPath() + " uri: " + doc.baseUri()); //logger.log(Level.INFO, fileURL.getAbsolutePath()); if (els.size() > 0) { //logger.info(fileURL.getAbsolutePath()); Logger.getLogger("MyLog").log(Level.INFO, "(" + xpathExp + ") => " + fileURL.getAbsolutePath()); hitsTable[0][0]++; rootOut.addElement("infoblock") .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT, sStaffIndentifier) .addAttribute( DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT, DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER) .addAttribute( DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING, 
DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC) .addText(fileURL.getAbsolutePath()); sContentForView += sURL + " - " + els.first().text() + " - " + (els.first().absUrl("href")) + "\r\n\r\n"; } else hitsTable[1][0]++; } catch (Exception ex) { Logger.getLogger("MyLog").log(Level.SEVERE, "ERROR URL: " + fileURL.getPath() + " Msg: " + ex.getMessage() + "."); } } } } } return true; }
From source file:eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.ConversorFromXMLtoCSV.java
License:Open Source License
/**
 * Writes a row to the "not found" CSV writer for a researcher that has no non-empty web
 * address.
 *
 * <p>The row holds, in order: staff identifier, whole name, first name, last name,
 * initial, unit-of-assessment description and institution name. Researchers with at least
 * one non-empty web address produce no output.
 *
 * @param elementResearcher researcher element whose web-address children are inspected
 * @param path not used by this implementation
 * @param sInstitutionName written as the last column of the row
 * @param sWebAddress not used by this implementation
 * @param sUnitOfAssessment_Description written as the sixth column of the row
 * @param sResearchGroupDescription not used by this implementation
 * @param researcherNameInfo source of the name columns
 * @param sStaffIndentifier written as the first column of the row
 * @return always {@code true}
 */
@Override
protected boolean actionsInResearcherNode(org.dom4j.Element elementResearcher, String path,
        String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description,
        String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
    // Stop at the first web address that carries some text.
    boolean foundAddress = false;
    Iterator addresses = elementResearcher.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS);
    while (!foundAddress && addresses.hasNext()) {
        org.dom4j.Element address = (org.dom4j.Element) addresses.next();
        foundAddress = !address.getText().equals("");
    }

    if (foundAddress) {
        return true;
    }

    String[] row = {
        sStaffIndentifier,
        researcherNameInfo.whole_name,
        researcherNameInfo.first_name,
        researcherNameInfo.last_name,
        researcherNameInfo.initial,
        sUnitOfAssessment_Description,
        sInstitutionName
    };
    this.writer_nofound.writeNext(row);
    return true;
}