List of usage examples for org.dom4j Element elementIterator
Iterator<Element> elementIterator(QName qName);
From source file:edu.umd.cs.marmoset.utilities.ParseWebXml.java
License:Apache License
public static ParseWebXml parse(String webXmlFileName) throws FileNotFoundException, DocumentException { File file = new File(webXmlFileName); FileInputStream fis = new FileInputStream(file); SAXReader reader = new SAXReader(); Document document = reader.read(fis); ParseWebXml webXml = new ParseWebXml(); Element root = document.getRootElement(); for (Iterator<?> ii = root.elementIterator("servlet-mapping"); ii.hasNext();) { Element elt = (Element) ii.next(); //System.out.print("name: " +elt.getName()); String urlPattern = null; String servletName = null; for (int jj = 0; jj < elt.nodeCount(); jj++) { Node node = elt.node(jj); if (node.getName() == null) continue; if (node.getName().equals(SERVLET_NAME)) { servletName = node.getText().trim(); if (webXml.tryToMapServlet(servletName, urlPattern)) break; } else if (node.getName().equals(SERVLET_URL_PATTERN)) { urlPattern = node.getText().trim(); if (webXml.tryToMapServlet(servletName, urlPattern)) break; }// w w w .j av a2 s . c om } //System.out.println(" is mapped thusly: " +servletName +" => "+ urlPattern); } for (Iterator<?> ii = root.elementIterator("filter-mapping"); ii.hasNext();) { Element elt = (Element) ii.next(); //System.out.print("name: " +elt.getName()); String filterName = null; String urlPattern = null; for (int jj = 0; jj < elt.nodeCount(); jj++) { Node node = elt.node(jj); if (node.getName() == null) continue; if (node.getName().equals(FILTER_NAME)) { filterName = node.getText().trim(); if (webXml.tryToCreateFilter(filterName, urlPattern)) break; } else if (node.getName().equals(FILTER_URL_PATTERN)) { urlPattern = node.getText().trim(); if (webXml.tryToCreateFilter(filterName, urlPattern)) break; } } //System.out.println(" is mapped thusly: " +filterName+ " => "+ urlPattern); } return webXml; }
From source file:edu.vt.middleware.ldap.dsml.AbstractDsml.java
License:Open Source License
/** * This will take a DSML <code>Element</code> containing an entry of type * <entry/> and convert it to an LDAP entry. * * @param entryElement <code>Element</code> of DSML content * * @return <code>LdapEntry</code> *///from w ww.jav a 2 s.co m protected LdapEntry createLdapEntry(final Element entryElement) { final LdapEntry ldapEntry = this.beanFactory.newLdapEntry(); ldapEntry.setDn(""); if (entryElement != null) { final String name = entryElement.attributeValue("dn"); if (name != null) { ldapEntry.setDn(name); } if (entryElement.hasContent()) { // load the attribute elements final Iterator<?> attrIterator = entryElement.elementIterator("attr"); while (attrIterator.hasNext()) { final Element attrElement = (Element) attrIterator.next(); final String attrName = attrElement.attributeValue("name"); if (attrName != null && attrElement.hasContent()) { final LdapAttribute ldapAttribute = this.beanFactory.newLdapAttribute(); ldapAttribute.setName(attrName); final Iterator<?> valueIterator = attrElement.elementIterator("value"); while (valueIterator.hasNext()) { final Element valueElement = (Element) valueIterator.next(); final String value = valueElement.getText(); if (value != null) { final String encoding = valueElement.attributeValue("encoding"); if (encoding != null && "base64".equals(encoding)) { ldapAttribute.getValues().add(LdapUtil.base64Decode(value)); } else { ldapAttribute.getValues().add(value); } } } ldapEntry.getLdapAttributes().addAttribute(ldapAttribute); } } } } return ldapEntry; }
From source file:edu.vt.middleware.ldap.dsml.Dsmlv1.java
License:Open Source License
/** * This will take a DSML <code>Element</code> containing an entry of type * <dsml:entry name="name"/> and convert it to an LDAP entry. * * @param entryElement <code>Element</code> of DSML content * * @return <code>LdapEntry</code> *///from w ww . ja v a2s. c om protected LdapEntry createLdapEntry(final Element entryElement) { final LdapEntry ldapEntry = this.beanFactory.newLdapEntry(); ldapEntry.setDn(""); if (entryElement != null) { final String name = entryElement.attributeValue("dn"); if (name != null) { ldapEntry.setDn(name); } if (entryElement.hasContent()) { final Iterator<?> ocIterator = entryElement.elementIterator("objectclass"); while (ocIterator.hasNext()) { final Element ocElement = (Element) ocIterator.next(); if (ocElement != null && ocElement.hasContent()) { final String ocName = "objectClass"; final LdapAttribute ldapAttribute = this.beanFactory.newLdapAttribute(); ldapAttribute.setName(ocName); final Iterator<?> valueIterator = ocElement.elementIterator("oc-value"); while (valueIterator.hasNext()) { final Element valueElement = (Element) valueIterator.next(); if (valueElement != null) { final String value = valueElement.getText(); if (value != null) { final String encoding = valueElement.attributeValue("encoding"); if (encoding != null && "base64".equals(encoding)) { ldapAttribute.getValues().add(LdapUtil.base64Decode(value)); } else { ldapAttribute.getValues().add(value); } } } } ldapEntry.getLdapAttributes().addAttribute(ldapAttribute); } } ldapEntry.getLdapAttributes() .addAttributes(super.createLdapEntry(entryElement).getLdapAttributes().getAttributes()); } } return ldapEntry; }
From source file:edu.wustl.geneconnect.bizlogic.AbstractBizLogicFactory.java
License:BSD License
/** * This method updates module map by parsing xml file * @param xmlFileName file to be parsed//from ww w. j av a 2 s . co m * @return moduleMap Map */ public final Map updateModuleMap(String xmlFileName) { Map moduleMap = new HashMap(); SAXReader saxReader = new SAXReader(); InputStream inputStream = this.getClass().getClassLoader().getResourceAsStream(xmlFileName); Document document = null; try { document = saxReader.read(inputStream); Element businessLogics = document.getRootElement(); Iterator businessLogicIterator = businessLogics .elementIterator(GCConstants.BUSINESS_LOGIC_ELEMENT_ITERATOR); Element businessLogic = null; Element businessAction = null; Element instanceType = null; String instanceTypeString = null; String businessActionString = null; /** * Iterate over bizlogic.xml file and find the class ned to instantiate and return it. */ while (businessLogicIterator.hasNext()) { try { businessLogic = (Element) businessLogicIterator.next(); businessAction = businessLogic.element(GCConstants.BUSINESS_ACTION_ELEMENT); instanceType = businessLogic.element(GCConstants.INSTANCE_TYPE_ELEMENT); // moduleMap.put(businessAction.getStringValue(), Class.forName( // instanceType.getStringValue()).newInstance()); moduleMap.put(businessAction.getStringValue(), instanceType.getStringValue()); } catch (Exception e) { e.printStackTrace(); } } } catch (DocumentException e) { throw new GCRuntimeException(e); } catch (Exception e) { throw new GCRuntimeException(e); } return moduleMap; }
From source file:eu.sisob.uma.crawler.AirResearchersWebPagesExtractor.java
License:Open Source License
/** * In this block the crawler will try to extract the departments web adresses. * The block works with a org.dom4j.Element * Notes:/*from ww w . ja va 2s. com*/ * The function iterate the institution elemento taking all the UNIT_OF_ASSESSMENT to search all of them in same crawler call. * The UNIT_OF_ASSESSMENT will be stores in subjects array, next, it will be given to the crawler. * * @param elementInstitution * @param path * @param sInstitutionName * @param sWebAddress * @return */ @Override protected boolean actionsInInstitutionNode(org.dom4j.Element elementInstitution, String path, String sInstitutionName, String sWebAddress) { if (refuseExecution) return false; String crawler_data_folder = this.work_dir.getAbsolutePath() + File.separator + CRAWLER_DATA_FOLDERNAME; List<String> subjects = new ArrayList<String>(); String sSeed = sWebAddress; String sContainPattern = sSeed.replace("http://www.", ""); int index = sContainPattern.indexOf("/"); if (index == -1) index = sContainPattern.length() - 1; sContainPattern = sContainPattern.substring(0, index); ProjectLogger.LOGGER.info("Department phase - " + sInstitutionName); /* * Taking subjects to search its web adresses */ String sUnitOfAssessment_Description = ""; for (Iterator<org.dom4j.Element> i2 = elementInstitution.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2 .hasNext();) { sUnitOfAssessment_Description = i2.next().element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); subjects.add(sUnitOfAssessment_Description); ProjectLogger.LOGGER.info( "\tAdding subject '" + sUnitOfAssessment_Description + "' to search its section webpages"); } /* * Crawling to search the departments */ CrawlerDepartamentsV3Controller controllerDepts = null; try { String university_crawler_data_folder = crawler_data_folder + File.separator + sInstitutionName.replaceAll("\\W+", "").toLowerCase() + "-crawler-data"; File university_crawler_data_dir = new File(university_crawler_data_folder); if (university_crawler_data_dir.exists()) FileFootils.deleteDir(university_crawler_data_dir); controllerDepts = new CrawlerDepartamentsV3Controller(university_crawler_data_folder, this.keywords_data_dir, subjects); controllerDepts.addSeed(sSeed); controllerDepts.setPolitenessDelay(200); controllerDepts.setMaximumCrawlDepth(3); controllerDepts.setMaximumPagesToFetch(-1); controllerDepts.setContainPattern(sContainPattern); controllerDepts.clearPossibleResults(); ProjectLogger.LOGGER .info("Begin crawling: " + sInstitutionName + " (" + sWebAddress + ") - [" + sSeed + "]"); long lTimerAux = java.lang.System.currentTimeMillis(); controllerDepts.start(CrawlerDepartamentsV3.class, 1); lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux; ProjectLogger.LOGGER .info("End crawling: " + sInstitutionName + " - Time: " + lTimerAux + " ms - [" + sSeed + "]"); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage(), ex); } finally { if (CrawlerTrace.isTraceUrlsActive() && controllerDepts != null) controllerDepts.closeCrawlerTrace(); controllerDepts.releaseResources(); } /* * Update results */ if (controllerDepts != null) { if (CrawlerTrace.isTraceSearchActive()) { CandidateTypeURL.printResults("Results of: " + sInstitutionName + " (" + sWebAddress + ") by TYPE", controllerDepts.getPossibleResultsTYPE()); } /* * Adding departments web addresses to xml document */ for (Iterator<org.dom4j.Element> i2 = elementInstitution.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2 .hasNext();) { org.dom4j.Element e2 = i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); TreeMap<String, List<CandidateTypeURL>> t = controllerDepts.getPossibleResultsTYPE(); Iterator<String> it = t.keySet().iterator(); // String department_of = CrawlerDepartamentsV3Controller.DEPARTMENT_OF_RESULT_TAG + sUnitOfAssessment_Description; //FIXME, TEST THIS //while(it.hasNext()) //{ // String department_of = it.next(); // if(department_of.toLowerCase().equals(CrawlerDepartamentsV3Controller.DEPARTMENT_OF_RESULT_TAG + sUnitOfAssessment_Description.toLowerCase())) // { List<CandidateTypeURL> lst = t.get(department_of); if (lst != null) { for (CandidateTypeURL ss : lst) { ProjectLogger.LOGGER .info("Add department '" + department_of + "' the url '" + ss.sURL + "'"); e2.addElement(XMLTags.DEPARTMENT_WEB_ADDRESS).addText(ss.sURL); } } // break; // } //} } } return true; }
From source file:eu.sisob.uma.crawler.AirResearchersWebPagesExtractor.java
License:Open Source License
/** * /* w w w.ja v a 2 s .c o m*/ * @param elementUnitOfAssessment * @param path * @param sInstitutionName * @param sWebAddress * @param sUnitOfAssessment_Description * @return */ @Override protected boolean actionsInUnitOfAssessmentNode(org.dom4j.Element elementUnitOfAssessment, String path, String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description) { if (refuseExecution) return false; String crawler_data_folder = this.work_dir + File.separator + CRAWLER_DATA_FOLDERNAME; List<String> department_web_addresses = new ArrayList<String>(); List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>(); String seed = sWebAddress; String contain_pattern = seed.replace("http://www.", ""); int index = contain_pattern.indexOf("/"); if (index == -1) index = contain_pattern.length() - 1; contain_pattern = contain_pattern.substring(0, index); /* * Taking departments webpages to search in the researchers webpages */ for (Iterator<org.dom4j.Element> department_web_address_it = elementUnitOfAssessment .elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); department_web_address_it.hasNext();) { org.dom4j.Element department_web_address_element = (org.dom4j.Element) department_web_address_it.next(); if (!department_web_address_element.getText().equals("")) department_web_addresses.add(department_web_address_element.getText()); } /* * If there is not department webpage, then, add the university web to find staff page and something similar */ if (department_web_addresses.isEmpty()) { ProjectLogger.LOGGER.info("There is not dept webpages for [" + sUnitOfAssessment_Description + " - " + sInstitutionName + "]. Adding " + sWebAddress); //department_web_addresses.add(sWebAddress); } /* * Taking researchers info to search the researchers webs */ for (Iterator<org.dom4j.Element> research_group_it = elementUnitOfAssessment .elementIterator(XMLTags.RESEARCHGROUP); research_group_it.hasNext();) { org.dom4j.Element research_group_element = research_group_it.next(); for (Iterator<org.dom4j.Element> reseacher_it = research_group_element .elementIterator(XMLTags.RESEARCHER); reseacher_it.hasNext();) { org.dom4j.Element reseacher_element = reseacher_it.next(); String initials = reseacher_element.element(XMLTags.RESEARCHER_INITIALS).getText(); String last_name = reseacher_element.element(XMLTags.RESEARCHER_LASTNAME).getText(); String first_name = reseacher_element.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? "" : reseacher_element.element(XMLTags.RESEARCHER_FIRSTNAME).getText(); String whole_name = reseacher_element.element(XMLTags.RESEARCHER_NAME) == null ? "" : reseacher_element.element(XMLTags.RESEARCHER_NAME).getText(); ResearcherNameInfo rsi = new ResearcherNameInfo(last_name, initials, first_name, whole_name); researchers.add(rsi); } } if (researchers.size() > 0 && !department_web_addresses.isEmpty()) { /* * Crawling to search the researchers */ CrawlerResearchesPagesV3Controller controllerReseachers = null; try { String university_subject_crawler_data_folder = crawler_data_folder + File.separator + sInstitutionName.replaceAll("\\W+", "").toLowerCase() + "-" + sUnitOfAssessment_Description.replaceAll("\\W+", "").toLowerCase() + "-crawler-data"; File university_subject_crawler_data_dir = new File(university_subject_crawler_data_folder); if (university_subject_crawler_data_dir.exists()) FileFootils.deleteDir(university_subject_crawler_data_dir); controllerReseachers = new CrawlerResearchesPagesV3Controller( university_subject_crawler_data_folder, this.keywords_data_dir, researchers); String sSeeds = ""; for (String s : department_web_addresses) { controllerReseachers.addSeed(s); sSeeds += s + ","; } controllerReseachers.setPolitenessDelay(200); controllerReseachers.setMaximumCrawlDepth(3); controllerReseachers.setMaximumPagesToFetch(-1); controllerReseachers.setContainPattern(contain_pattern); controllerReseachers.clearInterestingUrlsDetected(); ProjectLogger.LOGGER.info("Begin crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " - [" + StringUtils.join(department_web_addresses, ",") + "]"); long lTimerAux = java.lang.System.currentTimeMillis(); controllerReseachers.start(CrawlerResearchesPagesV3.class, 1); controllerReseachers.postProcessResults(); lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux; ProjectLogger.LOGGER.info( "End crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " - Time: " + lTimerAux + " ms - [" + StringUtils.join(department_web_addresses, ",") + "]"); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage(), ex); } finally { if (CrawlerTrace.isTraceUrlsActive() && controllerReseachers != null) controllerReseachers.closeCrawlerTrace(); } /* * Update results */ if (controllerReseachers != null) { /* * Print the researchers */ if (CrawlerTrace.isTraceSearchActive()) { CandidateTypeURL .printResults( "Results of: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " (" + sWebAddress + ") by TYPE", controllerReseachers.getInterestingUrlsDetected()); } counterTotal[0] = 0; counterSuccess[0] = 0; try { /* * Add researcher webs to xml document */ for (Iterator<org.dom4j.Element> research_group_it = elementUnitOfAssessment .elementIterator(XMLTags.RESEARCHGROUP); research_group_it.hasNext();) { org.dom4j.Element research_group_element = research_group_it.next(); for (Iterator<org.dom4j.Element> researcher_it = research_group_element .elementIterator(XMLTags.RESEARCHER); researcher_it.hasNext();) { counterTotal[0]++; org.dom4j.Element researcher_element = researcher_it.next(); String initials = researcher_element.element(XMLTags.RESEARCHER_INITIALS).getText(); String last_name = researcher_element.element(XMLTags.RESEARCHER_LASTNAME).getText(); String first_name = researcher_element.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? "" : researcher_element.element(XMLTags.RESEARCHER_FIRSTNAME).getText(); String whole_name = researcher_element.element(XMLTags.RESEARCHER_NAME) == null ? "" : researcher_element.element(XMLTags.RESEARCHER_NAME).getText(); ResearcherNameInfo researcher_name_info = new ResearcherNameInfo(last_name, initials, first_name, whole_name); researcher_name_info.first_name = CandidateTypeURL .getCanonicalName(researcher_name_info.first_name); researcher_name_info.last_name = CandidateTypeURL .getCanonicalName(researcher_name_info.last_name); researcher_name_info.initial = CandidateTypeURL .getCanonicalName(researcher_name_info.initial); researcher_name_info.whole_name = CandidateTypeURL .getCanonicalName(researcher_name_info.whole_name); TreeMap<String, List<CandidateTypeURL>> t = controllerReseachers .getInterestingUrlsDetected(); List<CandidateTypeURL> lst = t .get(CrawlerResearchesPagesV3Controller.RESEARCHER_RESULT_TAG); boolean bExist = false; if (lst != null) { //FIXME, contains and remove better boolean lock1 = true; for (CandidateTypeURL ss : lst) { if (researcher_name_info.equals(ss.data)) { ProjectLogger.LOGGER.info("Add researcher '" + researcher_name_info + "' the url '" + ss.sURL + "'"); researcher_element.addElement(XMLTags.RESEARCHER_WEB_ADDRESS) .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE, ss.sSubType) .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT, ss.sExt) .addText(ss.sURL); lock1 = false; bExist = true; } } } if (bExist) { counterSuccess[0]++; } else { ProjectLogger.LOGGER.warn("No webpage for " + researcher_name_info); } } } } catch (Exception ex) { ProjectLogger.LOGGER.error("Error", ex); } /* * Show a little counting result */ ProjectLogger.LOGGER.info("Researches results: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); counterTotal[1] += 1; counterSuccess[1] += counterSuccess[0] > 0 ? 1 : 0; counterSuccess[2] += counterSuccess[0]; counterTotal[2] += counterTotal[0]; } } return true; }
From source file:eu.sisob.uma.crawler.MatrixResultBuilder.java
License:Open Source License
/** * /*from w w w . j a v a 2 s . c o m*/ * @param elementUnitOfAssessment * @param path * @param sInstitutionName * @param sWebAddress * @param sUnitOfAssessment_Description * @return */ protected boolean actionsInUnitOfAssessmentNode(org.dom4j.Element elementUnitOfAssessment, String path, String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description) { if (!this.dept_axis.containsKey(sUnitOfAssessment_Description)) this.dept_axis.put(sUnitOfAssessment_Description, dept_axis.size()); int counterTotal = 0; int counterSuccess = 0; for (Iterator<org.dom4j.Element> research_group_it = elementUnitOfAssessment .elementIterator(XMLTags.RESEARCHGROUP); research_group_it.hasNext();) { org.dom4j.Element research_group_element = research_group_it.next(); for (Iterator<org.dom4j.Element> researcher_it = research_group_element .elementIterator(XMLTags.RESEARCHER); researcher_it.hasNext();) { org.dom4j.Element researcher_element = researcher_it.next(); counterTotal++; if (researcher_element.elements(XMLTags.RESEARCHER_WEB_ADDRESS).size() > 0) counterSuccess++; } } this.resultsMatrix.get(sInstitutionName).put(sUnitOfAssessment_Description, new AbstractMap.SimpleEntry<Integer, Integer>(counterSuccess, counterTotal)); return true; }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.LocalResearchersWebPagesExtractor.java
License:Open Source License
public static void P1_step_collectResearcherLinks(String xmlFilePath, int numberOfCrawlers, String sControlInstitutionName) { try {//from w ww . j av a2s.co m /* * rootfolder is a folder where intermediate crawl data is * stored. */ String rootFolder = "temp/"; FileFootils.deleteDir(rootFolder); /* * numberOfCrawlers shows the number of concurrent threads * that should be initiated for crawling. */ File xmlFile = new File(xmlFilePath); org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); String sInstitutionName = ""; String sWebAddress = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchers = ""; String sResearchersInitials = ""; PageFetcher.startConnectionMonitorThread(); WebCrawler.setTraceLinkName(true); WebCrawler.setTracePageName(true); TreeMap<String, TreeMap<String, List<CandidateTypeURL>>> finalResults = new TreeMap<String, TreeMap<String, List<CandidateTypeURL>>>(); boolean bFlagInstitutionName = false; String sControlUnitOfAssessment_Description = ""; boolean bFlagUnitOfAssessmentName = false; boolean bSaveFile = true; boolean bSetEmptyAllResearchers = true; if (bSetEmptyAllResearchers) { File fField = new File(xmlFilePath.replace(".xml", "backup.xml")); FileOutputStream fileOS = new java.io.FileOutputStream(fField, false); OutputStreamWriter writer = new java.io.OutputStreamWriter(fileOS, "UTF-8"); BufferedWriter bw = new java.io.BufferedWriter(writer); String sOut = document.asXML(); bw.write(sOut); bw.close(); ProjectLogger.LOGGER.info(xmlFilePath + " backuped."); } int[] counterSuccess = new int[3]; int[] counterTotal = new int[3]; for (int i = 0; i < counterSuccess.length; i++) counterSuccess[i] = 0; for (int i = 0; i < counterTotal.length; i++) counterTotal[i] = 0; for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) { bSaveFile = false; org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText(); sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText(); if (sWebAddress.charAt(sWebAddress.length() - 1) != '/') sWebAddress += "/"; if (!sInstitutionName.toLowerCase().contains(sControlInstitutionName.toLowerCase()) && !bFlagInstitutionName) continue; bFlagInstitutionName = true; List<String> subjects = new ArrayList<String>(); ProjectLogger.LOGGER.info("Department phase - " + sInstitutionName); boolean bNeedToSearchDeparmentWebAddress = false; for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20); if (e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS) != null && e2.element("DepartamentWebAddress").elements().size() != 0) { ProjectLogger.LOGGER .info("\tExist departments webaddress for " + sUnitOfAssessment_Description); } else { subjects.add(sUnitOfAssessment_Description); ProjectLogger.LOGGER .info("\tNot exist departments webaddress for " + sUnitOfAssessment_Description); bNeedToSearchDeparmentWebAddress = true; } } String sSeed = sWebAddress; String sContainPattern = sSeed.replace("http://www.", ""); int iAux = sContainPattern.indexOf("/"); sContainPattern = sContainPattern.substring(0, iAux); if (bNeedToSearchDeparmentWebAddress) { CrawlerDepartamentsV2Controller_deprecated controllerDepts = new CrawlerDepartamentsV2Controller_deprecated( rootFolder + sInstitutionName.replace(" ", ".") + ".Researchers", subjects); controllerDepts.addSeed(sSeed); controllerDepts.setPolitenessDelay(200); controllerDepts.setMaximumCrawlDepth(3); controllerDepts.setMaximumPagesToFetch(-1); controllerDepts.setContainPattern(sContainPattern); controllerDepts.clearPossibleResults(); ProjectLogger.LOGGER .info("======================================================================"); ProjectLogger.LOGGER.info("Begin crawling: " + sInstitutionName + " (" + sWebAddress + ")"); long lTimerAux = java.lang.System.currentTimeMillis(); controllerDepts.start(CrawlerDepartamentsV2_deprecated.class, 1); lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux; ProjectLogger.LOGGER.info("Extracting Links in: " + lTimerAux + " ms"); ProjectLogger.LOGGER .info("======================================================================"); CandidateTypeURL.printResults( "Results of: " + sInstitutionName + " (" + sWebAddress + ") by TYPE", controllerDepts.getPossibleResultsTYPE()); for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION) .getText(); TreeMap<String, List<CandidateTypeURL>> t = controllerDepts.getPossibleResultsTYPE(); Iterator<String> it = t.keySet().iterator(); while (it.hasNext()) { String s = it.next(); if (s.toLowerCase() .equals("department of " + sUnitOfAssessment_Description.toLowerCase())) { if (e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS) != null && e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS).elements().size() != 0) { throw new Exception(sUnitOfAssessment_Description + " must be empty."); } List<CandidateTypeURL> lst = t.get(s); for (CandidateTypeURL ss : lst) { e2.addElement(XMLTags.DEPARTMENT_WEB_ADDRESS).addText(ss.sURL); bSaveFile = true; } break; } } } } ProjectLogger.LOGGER.info("Researcher phase - " + sInstitutionName); if (sContainPattern != "") sContainPattern = sContainPattern; for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20); List<String> lstDepartmentWebAddress = new ArrayList<String>(); for (Iterator i3 = e2.elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); if (!e3.getText().equals("")) lstDepartmentWebAddress.add(e3.getText()); } if (lstDepartmentWebAddress.size() > 0) { ProjectLogger.LOGGER .info("\tExist departments webaddress for " + sUnitOfAssessment_Description); boolean bExistResearcherWebAddress = false; List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>(); for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); if (bSetEmptyAllResearchers) { boolean aux = true; while (aux) { org.dom4j.Element eaux = e4.element(XMLTags.RESEARCHER_WEB_ADDRESS); if (eaux != null) e4.remove(eaux); else aux = false; } } if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) == null) { String initials = e4.element(XMLTags.RESEARCHER_INITIALS).getText(); String last_name = e4.element(XMLTags.RESEARCHER_LASTNAME).getText(); String first_name = e4.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? "" : e4.element(XMLTags.RESEARCHER_FIRSTNAME).getText(); String whole_name = e4.element(XMLTags.RESEARCHER_NAME) == null ? "" : e4.element(XMLTags.RESEARCHER_NAME).getText(); ResearcherNameInfo rsi = new ResearcherNameInfo(last_name, initials, first_name, whole_name); researchers.add(rsi); bExistResearcherWebAddress = false; } else if (bSetEmptyAllResearchers) { throw new Exception( "XML element of " + e4.element(XMLTags.RESEARCHER_INITIALS).getText() + "," + e4.element(XMLTags.RESEARCHER_LASTNAME).getText() + " must not have researcher web address at this moment"); } } } if (!bExistResearcherWebAddress) { ProjectLogger.LOGGER.info("\tMiss researchers webaddress for " + sUnitOfAssessment_Description + ". Try to search."); CrawlerResearchesPagesV2Controller_deprecated controllerReseachers = new CrawlerResearchesPagesV2Controller_deprecated( rootFolder + sInstitutionName.replace(" ", ".") + "_" + sUnitOfAssessment_Description.replace(" ", "."), researchers); String sSeeds = ""; for (String s : lstDepartmentWebAddress) { controllerReseachers.addSeed(s); sSeeds += s + ","; } controllerReseachers.setPolitenessDelay(200); controllerReseachers.setMaximumCrawlDepth(3); controllerReseachers.setMaximumPagesToFetch(-1); controllerReseachers.setContainPattern(sContainPattern); controllerReseachers.clearInterestingUrlsDetected(); if (!sUnitOfAssessment_Description.contains(sControlUnitOfAssessment_Description) && !bFlagUnitOfAssessmentName) continue; bFlagUnitOfAssessmentName = true; ProjectLogger.LOGGER .info("======================================================================"); ProjectLogger.LOGGER.info("Begin crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " (" + sSeeds + ")"); long lTimerAux = java.lang.System.currentTimeMillis(); controllerReseachers.start(CrawlerResearchesPagesV2_deprecated.class, 1); controllerReseachers.postProcessResults(); lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux; ProjectLogger.LOGGER.info("Extracting Links in: " + lTimerAux + " ms"); ProjectLogger.LOGGER .info("======================================================================"); CandidateTypeURL.printResults( "Results of: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " (" + sWebAddress + ") by TYPE", controllerReseachers.getInterestingUrlsDetected()); counterTotal[0] = 0; counterSuccess[0] = 0; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { counterTotal[0]++; org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); String initials = e4.element(XMLTags.RESEARCHER_INITIALS) == null ? "" : e4.element(XMLTags.RESEARCHER_INITIALS).getText(); String last_name = e4.element(XMLTags.RESEARCHER_LASTNAME) == null ? "" : e4.element(XMLTags.RESEARCHER_LASTNAME).getText(); String first_name = e4.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? "" : e4.element(XMLTags.RESEARCHER_FIRSTNAME).getText(); String whole_name = e4.element(XMLTags.RESEARCHER_NAME) == null ? "" : e4.element(XMLTags.RESEARCHER_NAME).getText(); ResearcherNameInfo rsi = new ResearcherNameInfo(last_name, initials, first_name, whole_name); TreeMap<String, List<CandidateTypeURL>> t = controllerReseachers .getInterestingUrlsDetected(); List<CandidateTypeURL> lst = t.get( CrawlerResearchesPagesV2Controller_deprecated.RESEARCHER_RESULT_TAG); boolean bExist = false; if (lst != null) { boolean lock1 = true; for (CandidateTypeURL ss : lst) { if (rsi.equals(ss.data)) { e4.addElement(XMLTags.RESEARCHER_WEB_ADDRESS) .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE, ss.sSubType) .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT, ss.sExt) .addText(ss.sURL); lock1 = false; bSaveFile = true; bExist = true; } } } if (bExist) { counterSuccess[0]++; } } } ProjectLogger.LOGGER.info("Researches results: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); } else { ProjectLogger.LOGGER.info( "\tExist researchers webaddress for " + sUnitOfAssessment_Description + "."); counterTotal[0] = 0; counterSuccess[0] = 0; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { counterTotal[0]++; org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) { counterSuccess[0]++; } } } ProjectLogger.LOGGER.info( "Results exist: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); } } else { ProjectLogger.LOGGER .info("\tNot exist departments webaddress for " + sUnitOfAssessment_Description); counterTotal[0] = 0; counterSuccess[0] = 0; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { counterTotal[0]++; org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) { counterSuccess[0]++; } } } if (counterSuccess[0] > 0) ProjectLogger.LOGGER.info( "\tExist researchers webaddress for " + sUnitOfAssessment_Description + "."); else ProjectLogger.LOGGER.info("\tNot exist researchers webaddress for " + sUnitOfAssessment_Description + "."); ProjectLogger.LOGGER .info("Results exist: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); } } counterSuccess[1] += counterSuccess[0]; counterTotal[1] += counterTotal[0]; if (bSaveFile) { File fField = new File(xmlFilePath); FileOutputStream fileOS = new java.io.FileOutputStream(fField, false); OutputStreamWriter writer = new java.io.OutputStreamWriter(fileOS, "UTF-8"); BufferedWriter bw = new java.io.BufferedWriter(writer); String sOut = document.asXML(); bw.write(sOut); bw.close(); ProjectLogger.LOGGER.info(xmlFile + " updated."); } } ProjectLogger.LOGGER.info("Researches results:" + counterSuccess[1] + " / " + counterTotal[1]); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage(), ex); } finally { PageFetcher.stopConnectionMonitorThread(); } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.LocalResearchersWebPagesExtractor.java
License:Open Source License
/** * Check effectivity of the recollection and recount found web pages. (of PROCESS STEP 1) * @param showOnlyBad//from w w w. ja v a 2 s . co m * @param topPercent */ public static void P1_checkEffectivityCollectResearcherLinks(String xmlFile, boolean showOnlyBad, float topPercent) { try { org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); String sInstitutionName = ""; String sWebAddress = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchers = ""; String sResearchersInitials = ""; int[] counterSuccess = new int[2]; int[] counterTotal = new int[2]; for (int i = 0; i < counterSuccess.length; i++) counterSuccess[i] = 0; for (int i = 0; i < counterTotal.length; i++) counterTotal[i] = 0; if (showOnlyBad) { ProjectLogger.LOGGER.info("Show only departments with less than " + topPercent + "%.\r\n"); } for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText(); sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText(); for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20); boolean bExistDept = false; String sURLs = ""; for (Iterator i5 = e2.elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); sURLs += " " + e5.getText(); bExistDept = true; } // if(e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS) != null) // { // bExistDept = true; // //ProjectLogger.LOGGER.info("\tExist departments webaddress for " + sUnitOfAssessment_Description); // } // else // { // bExistDept = false; // } String sOut = ""; if (!bExistDept) { sOut = "FAIL: " + sInstitutionName + "(" + sWebAddress + ") departments webaddress: " + sUnitOfAssessment_Description; } else { sOut = "SUCCESS: " + sInstitutionName + "(" + sWebAddress + ") departments webaddress: " + sUnitOfAssessment_Description + " URLS= " + sURLs; } counterTotal[0] = 0; counterSuccess[0] = 0; String researchersText = ""; String researchersMissText = ""; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); counterTotal[0]++; if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) { researchersText += ", " + e4.elementText(XMLTags.RESEARCHER_LASTNAME) + " " + e4.elementText(XMLTags.RESEARCHER_INITIALS); counterSuccess[0]++; } else { researchersMissText += ", " + e4.elementText(XMLTags.RESEARCHER_LASTNAME) + " " + e4.elementText(XMLTags.RESEARCHER_INITIALS); } } } int percent = (counterSuccess[0] * 100) / counterTotal[0]; if (showOnlyBad) { if (percent <= topPercent) { ProjectLogger.LOGGER.info(""); ProjectLogger.LOGGER.info("BAD RESULTS: " + sOut); ProjectLogger.LOGGER.info("\tResearchers found: " + counterSuccess[0] + "/" + counterTotal[0] + "\t(" + percent + " %)"); ProjectLogger.LOGGER.info("\tFound: " + researchersText); ProjectLogger.LOGGER.info("\tMiss: " + researchersMissText); } } else { ProjectLogger.LOGGER.info(""); ProjectLogger.LOGGER.info(sOut); ProjectLogger.LOGGER.info("\tResearchers found: " + counterSuccess[0] + "/" + counterTotal[0] + "\t(" + percent + " %)"); ProjectLogger.LOGGER.info("\tFound: " + researchersText); ProjectLogger.LOGGER.info("\tMiss: " + researchersMissText); } counterTotal[1] += counterTotal[0]; counterSuccess[1] += counterSuccess[0]; } } ProjectLogger.LOGGER.info(""); ProjectLogger.LOGGER.info("TOTAL Researchers found: " + counterSuccess[1] + "/" + counterTotal[1]); } catch (Exception ex) { ProjectLogger.LOGGER.info(ex.getMessage()); } }
From source file:eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.LocalResearchersWebPagesExtractor.java
License:Open Source License
public static void P2_redownloadInstitution(String xmlFile, String Institution, String destDir) throws Exception { org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); if (e1.element(XMLTags.INSTITUTION_NAME).getText().equals(Institution)) { DownloaderResearchersWebPagesXMLFormat.downloadResearchesPages(destDir, LocalFormatType.PLAIN_DIRECTORY, e1, true); }//from w w w. j av a 2s. com } }