Example usage for org.dom4j Element elementIterator

List of usage examples for org.dom4j Element elementIterator

Introduction

On this page you can find example usages of org.dom4j Element elementIterator.

Prototype

Iterator<Element> elementIterator(QName qName);

Source Link

Document

Returns an iterator over the elements contained in this element which match the given fully qualified name.

Usage

From source file:edu.umd.cs.marmoset.utilities.ParseWebXml.java

License:Apache License

/**
 * Parses a web.xml file and collects its servlet and filter mappings.
 *
 * <p>For each {@code servlet-mapping} child of the root element the child nodes are scanned in
 * document order. Whenever a node named {@code SERVLET_NAME} or {@code SERVLET_URL_PATTERN} is
 * read, the name/pattern pair seen so far is passed to {@code tryToMapServlet}, and scanning of
 * that mapping stops as soon as the call returns {@code true}. Each {@code filter-mapping} child
 * is processed the same way with {@code FILTER_NAME}, {@code FILTER_URL_PATTERN} and
 * {@code tryToCreateFilter}.
 *
 * @param webXmlFileName path of the web.xml file to read
 * @return the populated {@code ParseWebXml}
 * @throws FileNotFoundException if the file cannot be opened
 * @throws DocumentException if the file cannot be parsed as XML
 */
public static ParseWebXml parse(String webXmlFileName) throws FileNotFoundException, DocumentException {
    File file = new File(webXmlFileName);

    FileInputStream fis = new FileInputStream(file);
    SAXReader reader = new SAXReader();
    Document document;
    try {
        document = reader.read(fis);
    } finally {
        // Always release the file handle; previously the stream was never closed.
        try {
            fis.close();
        } catch (java.io.IOException ignored) {
            // A failure to close cannot change the parse outcome and the method's
            // throws clause does not allow IOException, so it is deliberately dropped.
        }
    }

    ParseWebXml webXml = new ParseWebXml();

    Element root = document.getRootElement();

    for (Iterator<?> ii = root.elementIterator("servlet-mapping"); ii.hasNext();) {
        Element elt = (Element) ii.next();

        String urlPattern = null;
        String servletName = null;
        for (int jj = 0; jj < elt.nodeCount(); jj++) {
            Node node = elt.node(jj);
            // Nodes without a name are skipped.
            if (node.getName() == null)
                continue;
            if (node.getName().equals(SERVLET_NAME)) {
                servletName = node.getText().trim();
                if (webXml.tryToMapServlet(servletName, urlPattern))
                    break;
            } else if (node.getName().equals(SERVLET_URL_PATTERN)) {
                urlPattern = node.getText().trim();
                if (webXml.tryToMapServlet(servletName, urlPattern))
                    break;
            }
        }
    }

    for (Iterator<?> ii = root.elementIterator("filter-mapping"); ii.hasNext();) {
        Element elt = (Element) ii.next();

        String filterName = null;
        String urlPattern = null;
        for (int jj = 0; jj < elt.nodeCount(); jj++) {
            Node node = elt.node(jj);
            if (node.getName() == null)
                continue;
            if (node.getName().equals(FILTER_NAME)) {
                filterName = node.getText().trim();
                if (webXml.tryToCreateFilter(filterName, urlPattern))
                    break;
            } else if (node.getName().equals(FILTER_URL_PATTERN)) {
                urlPattern = node.getText().trim();
                if (webXml.tryToCreateFilter(filterName, urlPattern))
                    break;
            }
        }
    }

    return webXml;
}

From source file:edu.vt.middleware.ldap.dsml.AbstractDsml.java

License:Open Source License

/**
 * Converts a DSML {@code <entry/>} element into an LDAP entry.
 *
 * <p>The DN is read from the element's {@code dn} attribute and stays the empty string when the
 * element is null or has no such attribute. Each {@code attr} child that carries a {@code name}
 * attribute and has content becomes one LDAP attribute; its values are the texts of its
 * {@code value} children, base64-decoded when the child's {@code encoding} attribute is
 * {@code base64}.
 *
 * @param  entryElement  <code>Element</code> of DSML content, may be null
 *
 * @return  <code>LdapEntry</code>
 */
protected LdapEntry createLdapEntry(final Element entryElement) {
    final LdapEntry result = this.beanFactory.newLdapEntry();
    result.setDn("");

    if (entryElement == null) {
        return result;
    }

    final String dn = entryElement.attributeValue("dn");
    if (dn != null) {
        result.setDn(dn);
    }

    if (!entryElement.hasContent()) {
        return result;
    }

    // load the attribute elements
    for (final Iterator<?> attrs = entryElement.elementIterator("attr"); attrs.hasNext();) {
        final Element attr = (Element) attrs.next();
        final String attrName = attr.attributeValue("name");
        if (attrName == null || !attr.hasContent()) {
            continue;
        }

        final LdapAttribute attribute = this.beanFactory.newLdapAttribute();
        attribute.setName(attrName);

        for (final Iterator<?> values = attr.elementIterator("value"); values.hasNext();) {
            final Element valueElement = (Element) values.next();
            final String text = valueElement.getText();
            if (text == null) {
                continue;
            }
            // "base64".equals(null) is false, so an absent encoding falls through to the raw text.
            if ("base64".equals(valueElement.attributeValue("encoding"))) {
                attribute.getValues().add(LdapUtil.base64Decode(text));
            } else {
                attribute.getValues().add(text);
            }
        }
        result.getLdapAttributes().addAttribute(attribute);
    }

    return result;
}

From source file:edu.vt.middleware.ldap.dsml.Dsmlv1.java

License:Open Source License

/**
 * Converts a DSML version 1 entry element into an LDAP entry.
 *
 * <p>The DN is read from the element's {@code dn} attribute and stays the empty string when the
 * element is null or has no such attribute. Each {@code objectclass} child with content yields
 * one attribute named {@code objectClass} whose values are the texts of its {@code oc-value}
 * children (base64-decoded when {@code encoding="base64"}). The attributes produced by the
 * superclass implementation for the same element are added afterwards.
 *
 * @param  entryElement  <code>Element</code> of DSML content, may be null
 *
 * @return  <code>LdapEntry</code>
 */
protected LdapEntry createLdapEntry(final Element entryElement) {
    final LdapEntry result = this.beanFactory.newLdapEntry();
    result.setDn("");

    if (entryElement == null) {
        return result;
    }

    final String dn = entryElement.attributeValue("dn");
    if (dn != null) {
        result.setDn(dn);
    }

    if (!entryElement.hasContent()) {
        return result;
    }

    for (final Iterator<?> classes = entryElement.elementIterator("objectclass"); classes.hasNext();) {
        final Element classElement = (Element) classes.next();
        if (classElement == null || !classElement.hasContent()) {
            continue;
        }

        final LdapAttribute objectClass = this.beanFactory.newLdapAttribute();
        objectClass.setName("objectClass");

        for (final Iterator<?> values = classElement.elementIterator("oc-value"); values.hasNext();) {
            final Element valueElement = (Element) values.next();
            if (valueElement == null) {
                continue;
            }
            final String text = valueElement.getText();
            if (text == null) {
                continue;
            }
            // "base64".equals(null) is false, so an absent encoding keeps the raw text.
            if ("base64".equals(valueElement.attributeValue("encoding"))) {
                objectClass.getValues().add(LdapUtil.base64Decode(text));
            } else {
                objectClass.getValues().add(text);
            }
        }
        result.getLdapAttributes().addAttribute(objectClass);
    }

    // Merge in whatever the superclass extracts from the same element.
    result.getLdapAttributes()
            .addAttributes(super.createLdapEntry(entryElement).getLdapAttributes().getAttributes());

    return result;
}

From source file:edu.wustl.geneconnect.bizlogic.AbstractBizLogicFactory.java

License:BSD License

/**
 * Builds the module map by parsing an XML resource.
 *
 * <p>The resource is looked up through the class loader of this object's class. For every child
 * of the root element named {@code GCConstants.BUSINESS_LOGIC_ELEMENT_ITERATOR}, the string value
 * of its {@code GCConstants.BUSINESS_ACTION_ELEMENT} child is mapped to the string value of its
 * {@code GCConstants.INSTANCE_TYPE_ELEMENT} child. An entry that fails (for example because one
 * of the two children is missing) is reported with a stack trace and skipped.
 *
 * @param xmlFileName class-path name of the file to be parsed
 * @return moduleMap Map from business action to instance type name
 * @throws GCRuntimeException if the resource is not found or cannot be parsed
 */
public final Map updateModuleMap(String xmlFileName) {
    Map moduleMap = new HashMap();
    SAXReader saxReader = new SAXReader();
    InputStream inputStream = this.getClass().getClassLoader().getResourceAsStream(xmlFileName);
    if (inputStream == null) {
        // getResourceAsStream signals "not found" with null; fail with a clear cause instead of
        // handing a null stream to the parser.
        throw new GCRuntimeException(
                new java.io.FileNotFoundException("Resource not found on class path: " + xmlFileName));
    }

    try {
        Document document = saxReader.read(inputStream);
        Element businessLogics = document.getRootElement();
        Iterator businessLogicIterator = businessLogics
                .elementIterator(GCConstants.BUSINESS_LOGIC_ELEMENT_ITERATOR);
        /*
         * Iterate over the business-logic entries and record which class name belongs to
         * which business action.
         */
        while (businessLogicIterator.hasNext()) {
            try {
                Element businessLogic = (Element) businessLogicIterator.next();
                Element businessAction = businessLogic.element(GCConstants.BUSINESS_ACTION_ELEMENT);
                Element instanceType = businessLogic.element(GCConstants.INSTANCE_TYPE_ELEMENT);
                moduleMap.put(businessAction.getStringValue(), instanceType.getStringValue());
            } catch (Exception e) {
                // Best effort: a malformed entry must not prevent the remaining ones from loading.
                e.printStackTrace();
            }
        }
    } catch (DocumentException e) {
        throw new GCRuntimeException(e);
    } catch (Exception e) {
        throw new GCRuntimeException(e);
    } finally {
        // The stream was previously never closed.
        try {
            inputStream.close();
        } catch (java.io.IOException ignored) {
            // Closing a fully consumed (or failed) read-only stream; nothing to recover.
        }
    }
    return moduleMap;
}

From source file:eu.sisob.uma.crawler.AirResearchersWebPagesExtractor.java

License:Open Source License

/**
 * In this block the crawler will try to extract the departments web adresses. 
 * The block works with a org.dom4j.Element
 * Notes:/*from  ww  w  . ja  va 2s. com*/
 *  The function iterate the institution elemento taking all the UNIT_OF_ASSESSMENT to search all of them in same crawler call.     
 *  The UNIT_OF_ASSESSMENT will be stores in subjects array, next, it will be given to the crawler.
 * 
 * @param elementInstitution      
 * @param path
 * @param sInstitutionName
 * @param sWebAddress     
 * @return  
 */
@Override
protected boolean actionsInInstitutionNode(org.dom4j.Element elementInstitution, String path,
        String sInstitutionName, String sWebAddress) {
    if (refuseExecution)
        return false;

    String crawler_data_folder = this.work_dir.getAbsolutePath() + File.separator + CRAWLER_DATA_FOLDERNAME;

    List<String> subjects = new ArrayList<String>();

    String sSeed = sWebAddress;
    String sContainPattern = sSeed.replace("http://www.", "");
    int index = sContainPattern.indexOf("/");
    if (index == -1)
        index = sContainPattern.length() - 1;
    sContainPattern = sContainPattern.substring(0, index);

    ProjectLogger.LOGGER.info("Department phase - " + sInstitutionName);

    /*
     * Taking subjects to search its web adresses         
     */
    String sUnitOfAssessment_Description = "";
    for (Iterator<org.dom4j.Element> i2 = elementInstitution.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2
            .hasNext();) {
        sUnitOfAssessment_Description = i2.next().element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();
        subjects.add(sUnitOfAssessment_Description);
        ProjectLogger.LOGGER.info(
                "\tAdding subject '" + sUnitOfAssessment_Description + "' to search its section webpages");
    }

    /*
     * Crawling to search the departments
     */
    CrawlerDepartamentsV3Controller controllerDepts = null;

    try {
        String university_crawler_data_folder = crawler_data_folder + File.separator
                + sInstitutionName.replaceAll("\\W+", "").toLowerCase() + "-crawler-data";
        File university_crawler_data_dir = new File(university_crawler_data_folder);
        if (university_crawler_data_dir.exists())
            FileFootils.deleteDir(university_crawler_data_dir);

        controllerDepts = new CrawlerDepartamentsV3Controller(university_crawler_data_folder,
                this.keywords_data_dir, subjects);
        controllerDepts.addSeed(sSeed);
        controllerDepts.setPolitenessDelay(200);
        controllerDepts.setMaximumCrawlDepth(3);
        controllerDepts.setMaximumPagesToFetch(-1);
        controllerDepts.setContainPattern(sContainPattern);
        controllerDepts.clearPossibleResults();

        ProjectLogger.LOGGER
                .info("Begin crawling: " + sInstitutionName + " (" + sWebAddress + ") - [" + sSeed + "]");
        long lTimerAux = java.lang.System.currentTimeMillis();

        controllerDepts.start(CrawlerDepartamentsV3.class, 1);

        lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux;
        ProjectLogger.LOGGER
                .info("End crawling: " + sInstitutionName + " - Time: " + lTimerAux + " ms - [" + sSeed + "]");

    } catch (Exception ex) {
        ProjectLogger.LOGGER.error(ex.getMessage(), ex);
    } finally {
        if (CrawlerTrace.isTraceUrlsActive() && controllerDepts != null)
            controllerDepts.closeCrawlerTrace();

        controllerDepts.releaseResources();
    }

    /*
     * Update results
     */
    if (controllerDepts != null) {
        if (CrawlerTrace.isTraceSearchActive()) {
            CandidateTypeURL.printResults("Results of: " + sInstitutionName + " (" + sWebAddress + ") by TYPE",
                    controllerDepts.getPossibleResultsTYPE());
        }

        /*
         * Adding departments web addresses to xml document
         */
        for (Iterator<org.dom4j.Element> i2 = elementInstitution.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2
                .hasNext();) {
            org.dom4j.Element e2 = i2.next();
            sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();

            TreeMap<String, List<CandidateTypeURL>> t = controllerDepts.getPossibleResultsTYPE();
            Iterator<String> it = t.keySet().iterator();

            //
            String department_of = CrawlerDepartamentsV3Controller.DEPARTMENT_OF_RESULT_TAG
                    + sUnitOfAssessment_Description;

            //FIXME, TEST THIS
            //while(it.hasNext())
            //{
            //    String department_of = it.next();
            //    if(department_of.toLowerCase().equals(CrawlerDepartamentsV3Controller.DEPARTMENT_OF_RESULT_TAG + sUnitOfAssessment_Description.toLowerCase()))
            //    {
            List<CandidateTypeURL> lst = t.get(department_of);
            if (lst != null) {
                for (CandidateTypeURL ss : lst) {
                    ProjectLogger.LOGGER
                            .info("Add department '" + department_of + "' the url '" + ss.sURL + "'");
                    e2.addElement(XMLTags.DEPARTMENT_WEB_ADDRESS).addText(ss.sURL);
                }
            }
            //        break;
            //    }
            //}
        }
    }

    return true;
}

From source file:eu.sisob.uma.crawler.AirResearchersWebPagesExtractor.java

License:Open Source License

/**
 * /* w w w.ja v a  2 s  .c o m*/
 * @param elementUnitOfAssessment
 * @param path
 * @param sInstitutionName
 * @param sWebAddress
 * @param sUnitOfAssessment_Description     
 * @return  
 */
@Override
protected boolean actionsInUnitOfAssessmentNode(org.dom4j.Element elementUnitOfAssessment, String path,
        String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description) {
    if (refuseExecution)
        return false;

    String crawler_data_folder = this.work_dir + File.separator + CRAWLER_DATA_FOLDERNAME;

    List<String> department_web_addresses = new ArrayList<String>();
    List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>();

    String seed = sWebAddress;
    String contain_pattern = seed.replace("http://www.", "");
    int index = contain_pattern.indexOf("/");
    if (index == -1)
        index = contain_pattern.length() - 1;
    contain_pattern = contain_pattern.substring(0, index);

    /*
     * Taking departments webpages to search in the researchers webpages
     */
    for (Iterator<org.dom4j.Element> department_web_address_it = elementUnitOfAssessment
            .elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); department_web_address_it.hasNext();) {
        org.dom4j.Element department_web_address_element = (org.dom4j.Element) department_web_address_it.next();
        if (!department_web_address_element.getText().equals(""))
            department_web_addresses.add(department_web_address_element.getText());
    }

    /*
     * If there is not department webpage, then, add the university web to find staff page and something similar
     */
    if (department_web_addresses.isEmpty()) {
        ProjectLogger.LOGGER.info("There is not dept webpages for [" + sUnitOfAssessment_Description + " - "
                + sInstitutionName + "]. Adding " + sWebAddress);
        //department_web_addresses.add(sWebAddress);
    }

    /*
     * Taking researchers info to search the researchers webs
     */
    for (Iterator<org.dom4j.Element> research_group_it = elementUnitOfAssessment
            .elementIterator(XMLTags.RESEARCHGROUP); research_group_it.hasNext();) {
        org.dom4j.Element research_group_element = research_group_it.next();

        for (Iterator<org.dom4j.Element> reseacher_it = research_group_element
                .elementIterator(XMLTags.RESEARCHER); reseacher_it.hasNext();) {
            org.dom4j.Element reseacher_element = reseacher_it.next();

            String initials = reseacher_element.element(XMLTags.RESEARCHER_INITIALS).getText();
            String last_name = reseacher_element.element(XMLTags.RESEARCHER_LASTNAME).getText();
            String first_name = reseacher_element.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? ""
                    : reseacher_element.element(XMLTags.RESEARCHER_FIRSTNAME).getText();
            String whole_name = reseacher_element.element(XMLTags.RESEARCHER_NAME) == null ? ""
                    : reseacher_element.element(XMLTags.RESEARCHER_NAME).getText();

            ResearcherNameInfo rsi = new ResearcherNameInfo(last_name, initials, first_name, whole_name);
            researchers.add(rsi);
        }
    }

    if (researchers.size() > 0 && !department_web_addresses.isEmpty()) {
        /*
         * Crawling to search the researchers
         */
        CrawlerResearchesPagesV3Controller controllerReseachers = null;
        try {
            String university_subject_crawler_data_folder = crawler_data_folder + File.separator
                    + sInstitutionName.replaceAll("\\W+", "").toLowerCase() + "-"
                    + sUnitOfAssessment_Description.replaceAll("\\W+", "").toLowerCase() + "-crawler-data";
            File university_subject_crawler_data_dir = new File(university_subject_crawler_data_folder);
            if (university_subject_crawler_data_dir.exists())
                FileFootils.deleteDir(university_subject_crawler_data_dir);

            controllerReseachers = new CrawlerResearchesPagesV3Controller(
                    university_subject_crawler_data_folder, this.keywords_data_dir, researchers);
            String sSeeds = "";
            for (String s : department_web_addresses) {
                controllerReseachers.addSeed(s);
                sSeeds += s + ",";
            }

            controllerReseachers.setPolitenessDelay(200);
            controllerReseachers.setMaximumCrawlDepth(3);
            controllerReseachers.setMaximumPagesToFetch(-1);
            controllerReseachers.setContainPattern(contain_pattern);
            controllerReseachers.clearInterestingUrlsDetected();

            ProjectLogger.LOGGER.info("Begin crawling: " + sUnitOfAssessment_Description + " - "
                    + sInstitutionName + " - [" + StringUtils.join(department_web_addresses, ",") + "]");
            long lTimerAux = java.lang.System.currentTimeMillis();

            controllerReseachers.start(CrawlerResearchesPagesV3.class, 1);

            controllerReseachers.postProcessResults();

            lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux;
            ProjectLogger.LOGGER.info(
                    "End crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " - Time: "
                            + lTimerAux + " ms - [" + StringUtils.join(department_web_addresses, ",") + "]");
        } catch (Exception ex) {
            ProjectLogger.LOGGER.error(ex.getMessage(), ex);
        } finally {
            if (CrawlerTrace.isTraceUrlsActive() && controllerReseachers != null)
                controllerReseachers.closeCrawlerTrace();
        }

        /*
         * Update results
         */
        if (controllerReseachers != null) {
            /*
             * Print the researchers
             */
            if (CrawlerTrace.isTraceSearchActive()) {
                CandidateTypeURL
                        .printResults(
                                "Results of: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " ("
                                        + sWebAddress + ") by TYPE",
                                controllerReseachers.getInterestingUrlsDetected());
            }

            counterTotal[0] = 0;
            counterSuccess[0] = 0;

            try {
                /*
                 * Add researcher webs to xml document             
                 */
                for (Iterator<org.dom4j.Element> research_group_it = elementUnitOfAssessment
                        .elementIterator(XMLTags.RESEARCHGROUP); research_group_it.hasNext();) {
                    org.dom4j.Element research_group_element = research_group_it.next();

                    for (Iterator<org.dom4j.Element> researcher_it = research_group_element
                            .elementIterator(XMLTags.RESEARCHER); researcher_it.hasNext();) {
                        counterTotal[0]++;
                        org.dom4j.Element researcher_element = researcher_it.next();

                        String initials = researcher_element.element(XMLTags.RESEARCHER_INITIALS).getText();
                        String last_name = researcher_element.element(XMLTags.RESEARCHER_LASTNAME).getText();
                        String first_name = researcher_element.element(XMLTags.RESEARCHER_FIRSTNAME) == null
                                ? ""
                                : researcher_element.element(XMLTags.RESEARCHER_FIRSTNAME).getText();
                        String whole_name = researcher_element.element(XMLTags.RESEARCHER_NAME) == null ? ""
                                : researcher_element.element(XMLTags.RESEARCHER_NAME).getText();

                        ResearcherNameInfo researcher_name_info = new ResearcherNameInfo(last_name, initials,
                                first_name, whole_name);
                        researcher_name_info.first_name = CandidateTypeURL
                                .getCanonicalName(researcher_name_info.first_name);
                        researcher_name_info.last_name = CandidateTypeURL
                                .getCanonicalName(researcher_name_info.last_name);
                        researcher_name_info.initial = CandidateTypeURL
                                .getCanonicalName(researcher_name_info.initial);
                        researcher_name_info.whole_name = CandidateTypeURL
                                .getCanonicalName(researcher_name_info.whole_name);

                        TreeMap<String, List<CandidateTypeURL>> t = controllerReseachers
                                .getInterestingUrlsDetected();

                        List<CandidateTypeURL> lst = t
                                .get(CrawlerResearchesPagesV3Controller.RESEARCHER_RESULT_TAG);

                        boolean bExist = false;
                        if (lst != null) {
                            //FIXME, contains and remove better
                            boolean lock1 = true;
                            for (CandidateTypeURL ss : lst) {
                                if (researcher_name_info.equals(ss.data)) {
                                    ProjectLogger.LOGGER.info("Add researcher '" + researcher_name_info
                                            + "' the url '" + ss.sURL + "'");
                                    researcher_element.addElement(XMLTags.RESEARCHER_WEB_ADDRESS)
                                            .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE, ss.sSubType)
                                            .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT, ss.sExt)
                                            .addText(ss.sURL);
                                    lock1 = false;
                                    bExist = true;
                                }
                            }
                        }
                        if (bExist) {
                            counterSuccess[0]++;
                        } else {
                            ProjectLogger.LOGGER.warn("No webpage for " + researcher_name_info);
                        }
                    }
                }
            } catch (Exception ex) {
                ProjectLogger.LOGGER.error("Error", ex);
            }

            /*
             * Show a little counting result
             */
            ProjectLogger.LOGGER.info("Researches results: " + sInstitutionName + " - "
                    + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]);
            counterTotal[1] += 1;
            counterSuccess[1] += counterSuccess[0] > 0 ? 1 : 0;

            counterSuccess[2] += counterSuccess[0];
            counterTotal[2] += counterTotal[0];
        }
    }

    return true;
}

From source file:eu.sisob.uma.crawler.MatrixResultBuilder.java

License:Open Source License

/**
 * /*from w w  w  .  j a  v  a 2 s  .  c  o  m*/
 * @param elementUnitOfAssessment
 * @param path
 * @param sInstitutionName
 * @param sWebAddress
 * @param sUnitOfAssessment_Description     
 * @return  
 */
protected boolean actionsInUnitOfAssessmentNode(org.dom4j.Element elementUnitOfAssessment, String path,
        String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description) {
    if (!this.dept_axis.containsKey(sUnitOfAssessment_Description))
        this.dept_axis.put(sUnitOfAssessment_Description, dept_axis.size());

    int counterTotal = 0;
    int counterSuccess = 0;
    for (Iterator<org.dom4j.Element> research_group_it = elementUnitOfAssessment
            .elementIterator(XMLTags.RESEARCHGROUP); research_group_it.hasNext();) {
        org.dom4j.Element research_group_element = research_group_it.next();

        for (Iterator<org.dom4j.Element> researcher_it = research_group_element
                .elementIterator(XMLTags.RESEARCHER); researcher_it.hasNext();) {
            org.dom4j.Element researcher_element = researcher_it.next();

            counterTotal++;
            if (researcher_element.elements(XMLTags.RESEARCHER_WEB_ADDRESS).size() > 0)
                counterSuccess++;
        }
    }

    this.resultsMatrix.get(sInstitutionName).put(sUnitOfAssessment_Description,
            new AbstractMap.SimpleEntry<Integer, Integer>(counterSuccess, counterTotal));

    return true;
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.LocalResearchersWebPagesExtractor.java

License:Open Source License

/**
 * Process step 1: crawls institution and department web sites to collect researcher
 * web page links and writes them back into the XML file.
 *
 * <p>What the method does, in order:
 * <ol>
 * <li>deletes the intermediate folder {@code temp/} and parses {@code xmlFilePath};</li>
 * <li>writes a backup of the parsed document to the path obtained by replacing
 *     {@code ".xml"} with {@code "backup.xml"};</li>
 * <li>skips institutions until the first one whose name contains
 *     {@code sControlInstitutionName} (case-insensitive), then processes that one and
 *     every following institution;</li>
 * <li>department phase: when a unit of assessment has no department web address, crawls
 *     the institution site and adds the addresses found for
 *     {@code "department of <unit description>"};</li>
 * <li>researcher phase: removes existing researcher web addresses, crawls the department
 *     addresses and adds a web address element for every matching crawl result;</li>
 * <li>rewrites {@code xmlFilePath} after an institution if elements were added.</li>
 * </ol>
 *
 * <p>Any exception is logged and swallowed; the connection monitor thread is stopped in
 * every case.
 *
 * @param xmlFilePath path of the XML file to read and update
 * @param numberOfCrawlers currently not used; every crawl is started with one crawler
 * @param sControlInstitutionName name fragment of the first institution to process
 */
public static void P1_step_collectResearcherLinks(String xmlFilePath, int numberOfCrawlers,
        String sControlInstitutionName) {
    try {
        /*
         * rootFolder is a folder where intermediate crawl data is stored.
         */
        String rootFolder = "temp/";

        FileFootils.deleteDir(rootFolder);

        File xmlFile = new File(xmlFilePath);

        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        org.dom4j.Document document = reader.read(xmlFile);
        org.dom4j.Element root = document.getRootElement();

        PageFetcher.startConnectionMonitorThread();

        WebCrawler.setTraceLinkName(true);
        WebCrawler.setTracePageName(true);

        boolean bFlagInstitutionName = false;
        String sControlUnitOfAssessment_Description = "";
        boolean bFlagUnitOfAssessmentName = false;

        boolean bSetEmptyAllResearchers = true;

        if (bSetEmptyAllResearchers) {
            // Existing researcher addresses are about to be removed: keep a copy first.
            writeDocumentUtf8(document, new File(xmlFilePath.replace(".xml", "backup.xml")));
            ProjectLogger.LOGGER.info(xmlFilePath + " backuped.");
        }

        // Totals over every processed unit of assessment.
        int overallSuccess = 0;
        int overallTotal = 0;

        for (Iterator<?> i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) {
            boolean bSaveFile = false;

            org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

            String sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText();
            String sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText();
            if (sWebAddress.charAt(sWebAddress.length() - 1) != '/')
                sWebAddress += "/";

            // Skip everything before the control institution; once found, process all the rest.
            if (!sInstitutionName.toLowerCase().contains(sControlInstitutionName.toLowerCase())
                    && !bFlagInstitutionName)
                continue;
            bFlagInstitutionName = true;

            List<String> subjects = new ArrayList<String>();

            ProjectLogger.LOGGER.info("Department phase - " + sInstitutionName);

            boolean bNeedToSearchDeparmentWebAddress = false;
            for (Iterator<?> i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) {
                org.dom4j.Element e2 = (org.dom4j.Element) i2.next();

                String sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION)
                        .getText();
                //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20);

                // NOTE(review): this tests for child elements of the address element, while the
                // addresses added further down are stored as text - confirm the intended check.
                org.dom4j.Element eDepartmentWebAddress = e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS);
                if (eDepartmentWebAddress != null && eDepartmentWebAddress.elements().size() != 0) {
                    ProjectLogger.LOGGER
                            .info("\tExist departments webaddress for " + sUnitOfAssessment_Description);
                } else {
                    subjects.add(sUnitOfAssessment_Description);
                    ProjectLogger.LOGGER
                            .info("\tNot exist departments webaddress for " + sUnitOfAssessment_Description);
                    bNeedToSearchDeparmentWebAddress = true;
                }
            }

            // Host part of the institution address; crawlers are restricted to URLs containing it.
            String sSeed = sWebAddress;
            String sContainPattern = sSeed.replace("http://www.", "");
            int iAux = sContainPattern.indexOf("/");
            sContainPattern = sContainPattern.substring(0, iAux);

            if (bNeedToSearchDeparmentWebAddress) {
                CrawlerDepartamentsV2Controller_deprecated controllerDepts = new CrawlerDepartamentsV2Controller_deprecated(
                        rootFolder + sInstitutionName.replace(" ", ".") + ".Researchers", subjects);
                controllerDepts.addSeed(sSeed);
                controllerDepts.setPolitenessDelay(200);
                controllerDepts.setMaximumCrawlDepth(3);
                controllerDepts.setMaximumPagesToFetch(-1);
                controllerDepts.setContainPattern(sContainPattern);
                controllerDepts.clearPossibleResults();

                ProjectLogger.LOGGER
                        .info("======================================================================");
                ProjectLogger.LOGGER.info("Begin crawling: " + sInstitutionName + " (" + sWebAddress + ")");
                long lTimerAux = System.currentTimeMillis();

                controllerDepts.start(CrawlerDepartamentsV2_deprecated.class, 1);

                lTimerAux = System.currentTimeMillis() - lTimerAux;
                ProjectLogger.LOGGER.info("Extracting Links in: " + lTimerAux + " ms");
                ProjectLogger.LOGGER
                        .info("======================================================================");

                CandidateTypeURL.printResults(
                        "Results of: " + sInstitutionName + " (" + sWebAddress + ") by TYPE",
                        controllerDepts.getPossibleResultsTYPE());

                for (Iterator<?> i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) {
                    org.dom4j.Element e2 = (org.dom4j.Element) i2.next();
                    String sUnitOfAssessment_Description = e2
                            .element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();

                    TreeMap<String, List<CandidateTypeURL>> t = controllerDepts.getPossibleResultsTYPE();

                    for (String s : t.keySet()) {
                        if (s.toLowerCase()
                                .equals("department of " + sUnitOfAssessment_Description.toLowerCase())) {
                            org.dom4j.Element eDepartmentWebAddress = e2
                                    .element(XMLTags.DEPARTMENT_WEB_ADDRESS);
                            if (eDepartmentWebAddress != null
                                    && eDepartmentWebAddress.elements().size() != 0) {
                                throw new Exception(sUnitOfAssessment_Description + " must be empty.");
                            }

                            for (CandidateTypeURL ss : t.get(s)) {
                                e2.addElement(XMLTags.DEPARTMENT_WEB_ADDRESS).addText(ss.sURL);
                                bSaveFile = true;
                            }
                            // Only the first matching result type is used.
                            break;
                        }
                    }
                }
            }

            ProjectLogger.LOGGER.info("Researcher phase - " + sInstitutionName);

            for (Iterator<?> i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) {
                org.dom4j.Element e2 = (org.dom4j.Element) i2.next();

                String sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION)
                        .getText();
                //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20);

                // Counters of this unit only; added to the overall totals at the end of the iteration.
                int unitSuccess = 0;
                int unitTotal = 0;

                List<String> lstDepartmentWebAddress = new ArrayList<String>();
                for (Iterator<?> i3 = e2.elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); i3.hasNext();) {
                    org.dom4j.Element e3 = (org.dom4j.Element) i3.next();
                    if (!e3.getText().equals(""))
                        lstDepartmentWebAddress.add(e3.getText());
                }

                if (lstDepartmentWebAddress.size() > 0) {
                    ProjectLogger.LOGGER
                            .info("\tExist departments webaddress for " + sUnitOfAssessment_Description);

                    // NOTE(review): never set to true anywhere, so the "else" branch below is
                    // currently unreachable.
                    boolean bExistResearcherWebAddress = false;

                    List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>();

                    for (Iterator<?> i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) {
                        org.dom4j.Element e3 = (org.dom4j.Element) i3.next();

                        for (Iterator<?> i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) {
                            org.dom4j.Element e4 = (org.dom4j.Element) i4.next();

                            if (bSetEmptyAllResearchers) {
                                // Drop every previously stored web address of this researcher.
                                org.dom4j.Element eaux = e4.element(XMLTags.RESEARCHER_WEB_ADDRESS);
                                while (eaux != null) {
                                    e4.remove(eaux);
                                    eaux = e4.element(XMLTags.RESEARCHER_WEB_ADDRESS);
                                }
                            }

                            if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) == null) {
                                researchers.add(readResearcherNameInfo(e4));
                                bExistResearcherWebAddress = false;
                            } else if (bSetEmptyAllResearchers) {
                                throw new Exception(
                                        "XML element of " + e4.element(XMLTags.RESEARCHER_INITIALS).getText()
                                                + "," + e4.element(XMLTags.RESEARCHER_LASTNAME).getText()
                                                + " must not have researcher web address at this moment");
                            }
                        }
                    }

                    if (!bExistResearcherWebAddress) {
                        ProjectLogger.LOGGER.info("\tMiss researchers webaddress for "
                                + sUnitOfAssessment_Description + ". Try to search.");

                        CrawlerResearchesPagesV2Controller_deprecated controllerReseachers = new CrawlerResearchesPagesV2Controller_deprecated(
                                rootFolder + sInstitutionName.replace(" ", ".") + "_"
                                        + sUnitOfAssessment_Description.replace(" ", "."),
                                researchers);

                        String sSeeds = "";

                        for (String s : lstDepartmentWebAddress) {
                            controllerReseachers.addSeed(s);
                            sSeeds += s + ",";
                        }

                        controllerReseachers.setPolitenessDelay(200);
                        controllerReseachers.setMaximumCrawlDepth(3);
                        controllerReseachers.setMaximumPagesToFetch(-1);
                        controllerReseachers.setContainPattern(sContainPattern);
                        controllerReseachers.clearInterestingUrlsDetected();

                        if (!sUnitOfAssessment_Description.contains(sControlUnitOfAssessment_Description)
                                && !bFlagUnitOfAssessmentName)
                            continue;
                        bFlagUnitOfAssessmentName = true;

                        ProjectLogger.LOGGER
                                .info("======================================================================");
                        ProjectLogger.LOGGER.info("Begin crawling: " + sUnitOfAssessment_Description + " - "
                                + sInstitutionName + " (" + sSeeds + ")");
                        long lTimerAux = System.currentTimeMillis();

                        controllerReseachers.start(CrawlerResearchesPagesV2_deprecated.class, 1);

                        controllerReseachers.postProcessResults();

                        lTimerAux = System.currentTimeMillis() - lTimerAux;
                        ProjectLogger.LOGGER.info("Extracting Links in: " + lTimerAux + " ms");
                        ProjectLogger.LOGGER
                                .info("======================================================================");

                        CandidateTypeURL.printResults(
                                "Results of: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " ("
                                        + sWebAddress + ") by TYPE",
                                controllerReseachers.getInterestingUrlsDetected());

                        // Crawl results tagged as researcher pages; null when nothing was found.
                        List<CandidateTypeURL> lst = controllerReseachers.getInterestingUrlsDetected()
                                .get(CrawlerResearchesPagesV2Controller_deprecated.RESEARCHER_RESULT_TAG);

                        for (Iterator<?> i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) {
                            org.dom4j.Element e3 = (org.dom4j.Element) i3.next();

                            for (Iterator<?> i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) {
                                unitTotal++;
                                org.dom4j.Element e4 = (org.dom4j.Element) i4.next();

                                ResearcherNameInfo rsi = readResearcherNameInfo(e4);

                                boolean bExist = false;
                                if (lst != null) {
                                    for (CandidateTypeURL ss : lst) {
                                        if (rsi.equals(ss.data)) {
                                            e4.addElement(XMLTags.RESEARCHER_WEB_ADDRESS)
                                                    .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE,
                                                            ss.sSubType)
                                                    .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT,
                                                            ss.sExt)
                                                    .addText(ss.sURL);
                                            bSaveFile = true;
                                            bExist = true;
                                        }
                                    }
                                }
                                if (bExist) {
                                    unitSuccess++;
                                }
                            }
                        }

                        ProjectLogger.LOGGER.info("Researches results: " + sInstitutionName + " - "
                                + sUnitOfAssessment_Description + " - " + unitSuccess + " / " + unitTotal);

                    } else {
                        ProjectLogger.LOGGER.info(
                                "\tExist researchers webaddress for " + sUnitOfAssessment_Description + ".");

                        int[] counts = countResearchersWithWebAddress(e2);
                        unitSuccess = counts[0];
                        unitTotal = counts[1];

                        ProjectLogger.LOGGER.info(
                                "Results exist: " + sInstitutionName + " - " + sUnitOfAssessment_Description
                                        + " - " + unitSuccess + " / " + unitTotal);
                    }
                } else {
                    ProjectLogger.LOGGER
                            .info("\tNot exist departments webaddress for " + sUnitOfAssessment_Description);

                    int[] counts = countResearchersWithWebAddress(e2);
                    unitSuccess = counts[0];
                    unitTotal = counts[1];

                    if (unitSuccess > 0)
                        ProjectLogger.LOGGER.info(
                                "\tExist researchers webaddress for " + sUnitOfAssessment_Description + ".");
                    else
                        ProjectLogger.LOGGER.info("\tNot exist researchers webaddress for "
                                + sUnitOfAssessment_Description + ".");

                    ProjectLogger.LOGGER
                            .info("Results exist: " + sInstitutionName + " - " + sUnitOfAssessment_Description
                                    + " - " + unitSuccess + " / " + unitTotal);
                }

                // Accumulate per unit, so that every unit contributes to the final total.
                overallSuccess += unitSuccess;
                overallTotal += unitTotal;
            }

            if (bSaveFile) {
                writeDocumentUtf8(document, new File(xmlFilePath));
                ProjectLogger.LOGGER.info(xmlFile + " updated.");
            }
        }

        ProjectLogger.LOGGER.info("Researches results:" + overallSuccess + " / " + overallTotal);
    } catch (Exception ex) {
        ProjectLogger.LOGGER.error(ex.getMessage(), ex);
    } finally {
        PageFetcher.stopConnectionMonitorThread();
    }
}

/** Writes the document as UTF-8 XML to the target file, closing the stream even when writing fails. */
private static void writeDocumentUtf8(org.dom4j.Document document, File target) throws java.io.IOException {
    FileOutputStream fileOS = new FileOutputStream(target, false);
    try {
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fileOS, "UTF-8"));
        bw.write(document.asXML());
        bw.flush();
    } finally {
        fileOS.close();
    }
}

/** Builds the name info of a researcher element; a missing name part is read as an empty string. */
private static ResearcherNameInfo readResearcherNameInfo(org.dom4j.Element researcher) {
    String initials = researcher.element(XMLTags.RESEARCHER_INITIALS) == null ? ""
            : researcher.element(XMLTags.RESEARCHER_INITIALS).getText();
    String last_name = researcher.element(XMLTags.RESEARCHER_LASTNAME) == null ? ""
            : researcher.element(XMLTags.RESEARCHER_LASTNAME).getText();
    String first_name = researcher.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? ""
            : researcher.element(XMLTags.RESEARCHER_FIRSTNAME).getText();
    String whole_name = researcher.element(XMLTags.RESEARCHER_NAME) == null ? ""
            : researcher.element(XMLTags.RESEARCHER_NAME).getText();

    return new ResearcherNameInfo(last_name, initials, first_name, whole_name);
}

/** Returns {success, total}: researchers whose web address element has child elements, and all researchers. */
private static int[] countResearchersWithWebAddress(org.dom4j.Element unitOfAssessment) {
    int success = 0;
    int total = 0;
    for (Iterator<?> i3 = unitOfAssessment.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) {
        org.dom4j.Element e3 = (org.dom4j.Element) i3.next();

        for (Iterator<?> i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) {
            total++;

            org.dom4j.Element e4 = (org.dom4j.Element) i4.next();

            if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null
                    && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) {
                success++;
            }
        }
    }
    return new int[] { success, total };
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.LocalResearchersWebPagesExtractor.java

License:Open Source License

/**
 * Checks the effectiveness of the collection done by process step 1: for every unit of
 * assessment of every institution it logs whether department web addresses exist and
 * how many researchers were counted as found, followed by a grand total.
 *
 * <p>A unit without researchers is reported with 0 %. Any exception is logged and
 * swallowed.
 *
 * @param xmlFile location of the XML file to analyse
 * @param showOnlyBad when {@code true}, only units whose percentage of found researchers
 *            is less than or equal to {@code topPercent} are logged
 * @param topPercent threshold percentage used when {@code showOnlyBad} is set
 */
public static void P1_checkEffectivityCollectResearcherLinks(String xmlFile, boolean showOnlyBad,
        float topPercent) {
    try {
        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        org.dom4j.Document document = reader.read(xmlFile);
        org.dom4j.Element root = document.getRootElement();

        // Totals over all units of assessment.
        int overallSuccess = 0;
        int overallTotal = 0;

        if (showOnlyBad) {
            ProjectLogger.LOGGER.info("Show only departments with less than " + topPercent + "%.\r\n");
        }

        for (Iterator<?> i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) {
            org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

            String sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText();
            String sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText();

            for (Iterator<?> i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) {
                org.dom4j.Element e2 = (org.dom4j.Element) i2.next();

                String sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION)
                        .getText();
                //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20);

                boolean bExistDept = false;
                StringBuilder sURLs = new StringBuilder();

                for (Iterator<?> i5 = e2.elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); i5.hasNext();) {
                    org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                    sURLs.append(" ").append(e5.getText());
                    bExistDept = true;
                }

                String sOut;
                if (!bExistDept) {
                    sOut = "FAIL: " + sInstitutionName + "(" + sWebAddress + ") departments webaddress: "
                            + sUnitOfAssessment_Description;
                } else {
                    sOut = "SUCCESS: " + sInstitutionName + "(" + sWebAddress + ") departments webaddress: "
                            + sUnitOfAssessment_Description + " URLS= " + sURLs;
                }

                int unitTotal = 0;
                int unitSuccess = 0;

                StringBuilder researchersText = new StringBuilder();
                StringBuilder researchersMissText = new StringBuilder();
                for (Iterator<?> i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) {
                    org.dom4j.Element e3 = (org.dom4j.Element) i3.next();

                    for (Iterator<?> i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) {
                        org.dom4j.Element e4 = (org.dom4j.Element) i4.next();
                        unitTotal++;

                        String researcherLabel = ", " + e4.elementText(XMLTags.RESEARCHER_LASTNAME) + " "
                                + e4.elementText(XMLTags.RESEARCHER_INITIALS);

                        // NOTE(review): this tests for child elements of the web address element;
                        // confirm that this matches how the addresses are stored.
                        if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null
                                && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) {
                            researchersText.append(researcherLabel);
                            unitSuccess++;
                        } else {
                            researchersMissText.append(researcherLabel);
                        }
                    }
                }

                // Guard against units without researchers: dividing by zero would abort the report.
                int percent = unitTotal == 0 ? 0 : (unitSuccess * 100) / unitTotal;

                if (!showOnlyBad || percent <= topPercent) {
                    ProjectLogger.LOGGER.info("");
                    ProjectLogger.LOGGER.info(showOnlyBad ? "BAD RESULTS: " + sOut : sOut);
                    ProjectLogger.LOGGER.info("\tResearchers found: " + unitSuccess + "/" + unitTotal
                            + "\t(" + percent + " %)");
                    ProjectLogger.LOGGER.info("\tFound: " + researchersText);
                    ProjectLogger.LOGGER.info("\tMiss: " + researchersMissText);
                }

                overallTotal += unitTotal;
                overallSuccess += unitSuccess;
            }
        }

        ProjectLogger.LOGGER.info("");
        ProjectLogger.LOGGER.info("TOTAL Researchers found: " + overallSuccess + "/" + overallTotal);
    } catch (Exception ex) {
        // Log as an error and keep the stack trace.
        ProjectLogger.LOGGER.error(ex.getMessage(), ex);
    }
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.LocalResearchersWebPagesExtractor.java

License:Open Source License

/**
 * Downloads the researcher pages again for every institution of the XML file whose
 * name equals the given institution name.
 *
 * @param xmlFile location of the XML file to read
 * @param Institution exact institution name to look for
 * @param destDir destination directory handed to the downloader
 * @throws Exception if the XML cannot be read or the download fails
 */
public static void P2_redownloadInstitution(String xmlFile, String Institution, String destDir)
        throws Exception {
    org.dom4j.Element rootElement = new org.dom4j.io.SAXReader().read(xmlFile).getRootElement();

    Iterator institutions = rootElement.elementIterator(XMLTags.INSTITUTION);
    while (institutions.hasNext()) {
        org.dom4j.Element institutionElement = (org.dom4j.Element) institutions.next();

        String currentName = institutionElement.element(XMLTags.INSTITUTION_NAME).getText();
        if (!currentName.equals(Institution)) {
            // Not the requested institution: nothing to download.
            continue;
        }

        DownloaderResearchersWebPagesXMLFormat.downloadResearchesPages(destDir,
                LocalFormatType.PLAIN_DIRECTORY, institutionElement, true);
    }
}