eu.sisob.uma.crawler.AirResearchersWebPagesExtractor.java Source code

Java tutorial

Introduction

Here is the source code for eu.sisob.uma.crawler.AirResearchersWebPagesExtractor.java

Source

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.uma.crawler;

import eu.sisob.uma.crawler.ResearchersCrawlers.CandidateTypeURL;
import eu.sisob.uma.crawler.ResearchersCrawlers.CrawlerDepartamentsV3;
import eu.sisob.uma.crawler.ResearchersCrawlers.CrawlerDepartamentsV3Controller;
import eu.sisob.uma.crawler.ResearchersCrawlers.CrawlerResearchesPagesV3;
import eu.sisob.uma.crawler.ResearchersCrawlers.CrawlerResearchesPagesV3Controller;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.IteratorReseachersFile;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.LocalFormatType;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.ResearcherNameInfo;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.XMLTags;
import eu.sisob.uma.footils.File.FileFootils;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;

/**
 * Locates department web pages of institutions and, starting from those pages, the web pages of
 * the researchers listed in the source XML, writing every URL found back into the XML elements.
 * <p>
 * The hooks overridden here ({@code beginActions}, {@code endActions},
 * {@code actionsInInstitutionNode}, {@code actionsInUnitOfAssessmentNode}) are declared by
 * {@link IteratorReseachersFile}; presumably the superclass invokes them while walking the
 * document (confirm against {@code IteratorReseachersFile}).
 *
 * @author Daniel López González (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
public class AirResearchersWebPagesExtractor extends IteratorReseachersFile {
    final static String CRAWLER_DATA_FOLDERNAME = "tmp-crawler-data";

    /** Suffix of every per-crawl data folder created below {@link #CRAWLER_DATA_FOLDERNAME}. */
    private static final String CRAWLER_DATA_SUFFIX = "-crawler-data";

    /** Counter slot: researchers of the unit of assessment currently being processed. */
    private static final int COUNTER_CURRENT_UNIT = 0;
    /** Counter slot: units crawled (total) / units with at least one researcher page (success). */
    private static final int COUNTER_DEPARTMENTS = 1;
    /** Counter slot: researchers accumulated over the whole run. */
    private static final int COUNTER_OVERALL = 2;

    // Crawl settings shared by the department crawl and the researcher crawl.
    // NOTE(review): units/semantics are defined by the crawler controllers (delay presumably in
    // milliseconds, -1 presumably "no page limit") - confirm against the controller classes.
    private static final int POLITENESS_DELAY = 200;
    private static final int MAXIMUM_CRAWL_DEPTH = 3;
    private static final int MAXIMUM_PAGES_TO_FETCH = -1;
    private static final int NUMBER_OF_CRAWLERS = 1;

    int[] counterSuccess;
    int[] counterTotal;
    boolean refuseExecution;

    public int n_universities = 0;
    public int n_depts = 0;

    File keywords_data_dir;

    /**
     * Creates an extractor that works on a whole XML document.
     *
     * @param sourceXmlDocument  document handed to the superclass
     * @param keywords_data_dir  directory passed on to the crawler controllers as keyword data
     * @param work_dir           working directory; the temporary crawler folder
     *                           ({@value #CRAWLER_DATA_FOLDERNAME}) is created inside it
     * @param split              not used by this class
     */
    public AirResearchersWebPagesExtractor(org.dom4j.Document sourceXmlDocument, File keywords_data_dir,
            File work_dir, boolean split) {
        super(sourceXmlDocument, work_dir, LocalFormatType.PLAIN_DIRECTORY);
        initState(keywords_data_dir);
    }

    /**
     * Creates an extractor that works on an XML element.
     *
     * @param root               root element handed to the superclass
     * @param keywords_data_dir  directory passed on to the crawler controllers as keyword data
     * @param work_dir           working directory; the temporary crawler folder
     *                           ({@value #CRAWLER_DATA_FOLDERNAME}) is created inside it
     * @param split              not used by this class
     */
    public AirResearchersWebPagesExtractor(org.dom4j.Element root, File keywords_data_dir, File work_dir,
            boolean split) {
        super(root, work_dir, LocalFormatType.PLAIN_DIRECTORY);
        initState(keywords_data_dir);
    }

    /** Initialises the state shared by both constructors. */
    private void initState(File keywords_data_dir) {
        // Freshly allocated int arrays are already zero-filled.
        counterSuccess = new int[3];
        counterTotal = new int[3];
        refuseExecution = false;
        this.keywords_data_dir = keywords_data_dir;
    }

    /**
     * Makes sure the temporary crawler data folder exists inside the working directory.
     *
     * @throws Exception declared by the overridden hook; not thrown by this implementation itself
     */
    @Override
    protected void beginActions() throws Exception {
        File dir = new File(this.work_dir, CRAWLER_DATA_FOLDERNAME);
        // mkdirs() also creates missing parents; a failure is reported instead of being ignored.
        if (!dir.isDirectory() && !dir.mkdirs()) {
            ProjectLogger.LOGGER.warn("Could not create crawler data folder '" + dir.getAbsolutePath() + "'");
        }
    }

    /**
     * Logs the accumulated counters of the run.
     *
     * @throws Exception declared by the overridden hook; not thrown by this implementation itself
     */
    @Override
    protected void endActions() throws Exception {
        ProjectLogger.LOGGER.info("Researches results: Total :" + " - " + counterSuccess[COUNTER_OVERALL] + " / "
                + counterTotal[COUNTER_OVERALL] + " - N Depts: " + counterTotal[COUNTER_DEPARTMENTS]
                + " - N Depts with a least one res: " + counterSuccess[COUNTER_DEPARTMENTS]);
    }

    /**
     * Crawls the institution web site looking for the department pages of all its units of
     * assessment and adds every URL found as a {@code DEPARTMENT_WEB_ADDRESS} child of the
     * matching unit-of-assessment element.
     * <p>
     * All unit-of-assessment descriptions of the institution are collected first so that they
     * can be searched in a single crawler run.
     *
     * @param elementInstitution institution element whose units of assessment are processed
     * @param path               not used by this implementation
     * @param sInstitutionName   institution name, used for logging and for the data folder name
     * @param sWebAddress        institution web address, used as crawl seed
     * @return {@code false} if execution is refused, {@code true} otherwise (also when the crawl
     *         itself failed; the failure is only logged)
     */
    @Override
    protected boolean actionsInInstitutionNode(org.dom4j.Element elementInstitution, String path,
            String sInstitutionName, String sWebAddress) {
        if (refuseExecution) {
            return false;
        }

        String sSeed = sWebAddress;
        String sContainPattern = extractContainPattern(sSeed);

        ProjectLogger.LOGGER.info("Department phase - " + sInstitutionName);

        /*
         * Taking subjects to search its web addresses
         */
        List<String> subjects = new ArrayList<String>();
        for (Iterator<org.dom4j.Element> unitIt = elementInstitution
                .elementIterator(XMLTags.UNIT_OF_ASSESSMENT); unitIt.hasNext();) {
            String description = unitIt.next().element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();
            subjects.add(description);
            ProjectLogger.LOGGER.info("\tAdding subject '" + description + "' to search its section webpages");
        }

        /*
         * Crawling to search the departments
         */
        CrawlerDepartamentsV3Controller controllerDepts = null;
        try {
            String university_crawler_data_folder = crawlerDataFolder() + File.separator
                    + toFolderToken(sInstitutionName) + CRAWLER_DATA_SUFFIX;
            File university_crawler_data_dir = new File(university_crawler_data_folder);
            if (university_crawler_data_dir.exists()) {
                FileFootils.deleteDir(university_crawler_data_dir);
            }

            controllerDepts = new CrawlerDepartamentsV3Controller(university_crawler_data_folder,
                    this.keywords_data_dir, subjects);
            controllerDepts.addSeed(sSeed);
            controllerDepts.setPolitenessDelay(POLITENESS_DELAY);
            controllerDepts.setMaximumCrawlDepth(MAXIMUM_CRAWL_DEPTH);
            controllerDepts.setMaximumPagesToFetch(MAXIMUM_PAGES_TO_FETCH);
            controllerDepts.setContainPattern(sContainPattern);
            controllerDepts.clearPossibleResults();

            ProjectLogger.LOGGER
                    .info("Begin crawling: " + sInstitutionName + " (" + sWebAddress + ") - [" + sSeed + "]");
            long startMillis = System.currentTimeMillis();

            controllerDepts.start(CrawlerDepartamentsV3.class, NUMBER_OF_CRAWLERS);

            long elapsedMillis = System.currentTimeMillis() - startMillis;
            ProjectLogger.LOGGER.info(
                    "End crawling: " + sInstitutionName + " - Time: " + elapsedMillis + " ms - [" + sSeed + "]");
        } catch (Exception ex) {
            ProjectLogger.LOGGER.error(ex.getMessage(), ex);
        } finally {
            // The controller is still null when its constructor (or the folder clean-up) failed;
            // touching it then would replace the logged error with a NullPointerException.
            if (controllerDepts != null) {
                if (CrawlerTrace.isTraceUrlsActive()) {
                    controllerDepts.closeCrawlerTrace();
                }
                controllerDepts.releaseResources();
            }
        }

        if (controllerDepts == null) {
            return true;
        }

        /*
         * Update results
         */
        TreeMap<String, List<CandidateTypeURL>> resultsByType = controllerDepts.getPossibleResultsTYPE();
        if (CrawlerTrace.isTraceSearchActive()) {
            CandidateTypeURL.printResults("Results of: " + sInstitutionName + " (" + sWebAddress + ") by TYPE",
                    resultsByType);
        }

        /*
         * Adding departments web addresses to xml document
         */
        for (Iterator<org.dom4j.Element> unitIt = elementInstitution
                .elementIterator(XMLTags.UNIT_OF_ASSESSMENT); unitIt.hasNext();) {
            org.dom4j.Element unitElement = unitIt.next();
            String description = unitElement.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();

            // FIXME (from the original author, still untested): the lookup is case-sensitive; a
            // case-insensitive comparison of the result keys was considered as an alternative.
            String department_of = CrawlerDepartamentsV3Controller.DEPARTMENT_OF_RESULT_TAG + description;
            List<CandidateTypeURL> candidates = resultsByType.get(department_of);
            if (candidates == null) {
                continue;
            }
            for (CandidateTypeURL candidate : candidates) {
                ProjectLogger.LOGGER
                        .info("Add department '" + department_of + "' the url '" + candidate.sURL + "'");
                unitElement.addElement(XMLTags.DEPARTMENT_WEB_ADDRESS).addText(candidate.sURL);
            }
        }

        return true;
    }

    /**
     * Crawls the department pages recorded for a unit of assessment looking for the web pages of
     * its researchers, adds every page found as a {@code RESEARCHER_WEB_ADDRESS} child of the
     * matching researcher element and updates the run counters.
     * <p>
     * Nothing is crawled (and no counter is touched) when the unit has no department web address
     * or no researcher.
     *
     * @param elementUnitOfAssessment       unit-of-assessment element to process
     * @param path                          not used by this implementation
     * @param sInstitutionName              institution name, used for logging and folder naming
     * @param sWebAddress                   institution web address; only its host restricts the crawl
     * @param sUnitOfAssessment_Description unit description, used for logging and folder naming
     * @return {@code false} if execution is refused, {@code true} otherwise
     */
    @Override
    protected boolean actionsInUnitOfAssessmentNode(org.dom4j.Element elementUnitOfAssessment, String path,
            String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description) {
        if (refuseExecution) {
            return false;
        }

        String contain_pattern = extractContainPattern(sWebAddress);

        List<String> department_web_addresses = collectDepartmentWebAddresses(elementUnitOfAssessment);
        if (department_web_addresses.isEmpty()) {
            // The university address is deliberately NOT used as a fallback seed, so the message
            // must not claim that it is being added.
            ProjectLogger.LOGGER.info("There is not dept webpages for [" + sUnitOfAssessment_Description + " - "
                    + sInstitutionName + "]. Skipping researchers crawl (" + sWebAddress
                    + " is not used as fallback)");
        }

        List<ResearcherNameInfo> researchers = collectResearchers(elementUnitOfAssessment);

        if (researchers.isEmpty() || department_web_addresses.isEmpty()) {
            return true;
        }

        CrawlerResearchesPagesV3Controller controllerReseachers = crawlResearcherPages(sInstitutionName,
                sUnitOfAssessment_Description, contain_pattern, department_web_addresses, researchers);
        if (controllerReseachers == null) {
            return true;
        }

        /*
         * Update results
         */
        if (CrawlerTrace.isTraceSearchActive()) {
            CandidateTypeURL.printResults("Results of: " + sUnitOfAssessment_Description + " - "
                    + sInstitutionName + " (" + sWebAddress + ") by TYPE",
                    controllerReseachers.getInterestingUrlsDetected());
        }

        attachResearcherWebAddresses(elementUnitOfAssessment, controllerReseachers);

        /*
         * Show a little counting result
         */
        ProjectLogger.LOGGER.info("Researches results: " + sInstitutionName + " - "
                + sUnitOfAssessment_Description + " - " + counterSuccess[COUNTER_CURRENT_UNIT] + " / "
                + counterTotal[COUNTER_CURRENT_UNIT]);
        counterTotal[COUNTER_DEPARTMENTS] += 1;
        counterSuccess[COUNTER_DEPARTMENTS] += counterSuccess[COUNTER_CURRENT_UNIT] > 0 ? 1 : 0;

        counterSuccess[COUNTER_OVERALL] += counterSuccess[COUNTER_CURRENT_UNIT];
        counterTotal[COUNTER_OVERALL] += counterTotal[COUNTER_CURRENT_UNIT];

        return true;
    }

    /** Returns the non-empty texts of the {@code DEPARTMENT_WEB_ADDRESS} children of the unit. */
    private static List<String> collectDepartmentWebAddresses(org.dom4j.Element elementUnitOfAssessment) {
        List<String> addresses = new ArrayList<String>();
        for (Iterator<org.dom4j.Element> addressIt = elementUnitOfAssessment
                .elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); addressIt.hasNext();) {
            String address = addressIt.next().getText();
            if (!address.equals("")) {
                addresses.add(address);
            }
        }
        return addresses;
    }

    /** Returns the name information of every researcher of every research group of the unit. */
    private static List<ResearcherNameInfo> collectResearchers(org.dom4j.Element elementUnitOfAssessment) {
        List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>();
        for (Iterator<org.dom4j.Element> groupIt = elementUnitOfAssessment
                .elementIterator(XMLTags.RESEARCHGROUP); groupIt.hasNext();) {
            org.dom4j.Element groupElement = groupIt.next();
            for (Iterator<org.dom4j.Element> researcherIt = groupElement
                    .elementIterator(XMLTags.RESEARCHER); researcherIt.hasNext();) {
                researchers.add(readResearcherName(researcherIt.next()));
            }
        }
        return researchers;
    }

    /**
     * Reads the name parts of a researcher element. Initials and last name are required (a
     * missing element raises a NullPointerException, as before); first name and whole name
     * default to the empty string when absent.
     */
    private static ResearcherNameInfo readResearcherName(org.dom4j.Element researcherElement) {
        String initials = researcherElement.element(XMLTags.RESEARCHER_INITIALS).getText();
        String last_name = researcherElement.element(XMLTags.RESEARCHER_LASTNAME).getText();

        org.dom4j.Element firstNameElement = researcherElement.element(XMLTags.RESEARCHER_FIRSTNAME);
        String first_name = firstNameElement == null ? "" : firstNameElement.getText();

        org.dom4j.Element wholeNameElement = researcherElement.element(XMLTags.RESEARCHER_NAME);
        String whole_name = wholeNameElement == null ? "" : wholeNameElement.getText();

        return new ResearcherNameInfo(last_name, initials, first_name, whole_name);
    }

    /**
     * Runs the researcher crawl seeded with the department pages. Failures are logged, not
     * propagated.
     *
     * @return the controller holding the crawl results, or {@code null} if it could not be created
     */
    private CrawlerResearchesPagesV3Controller crawlResearcherPages(String sInstitutionName,
            String sUnitOfAssessment_Description, String contain_pattern, List<String> department_web_addresses,
            List<ResearcherNameInfo> researchers) {
        CrawlerResearchesPagesV3Controller controllerReseachers = null;
        try {
            String university_subject_crawler_data_folder = crawlerDataFolder() + File.separator
                    + toFolderToken(sInstitutionName) + "-" + toFolderToken(sUnitOfAssessment_Description)
                    + CRAWLER_DATA_SUFFIX;
            File university_subject_crawler_data_dir = new File(university_subject_crawler_data_folder);
            if (university_subject_crawler_data_dir.exists()) {
                FileFootils.deleteDir(university_subject_crawler_data_dir);
            }

            controllerReseachers = new CrawlerResearchesPagesV3Controller(university_subject_crawler_data_folder,
                    this.keywords_data_dir, researchers);
            for (String seed : department_web_addresses) {
                controllerReseachers.addSeed(seed);
            }

            controllerReseachers.setPolitenessDelay(POLITENESS_DELAY);
            controllerReseachers.setMaximumCrawlDepth(MAXIMUM_CRAWL_DEPTH);
            controllerReseachers.setMaximumPagesToFetch(MAXIMUM_PAGES_TO_FETCH);
            controllerReseachers.setContainPattern(contain_pattern);
            controllerReseachers.clearInterestingUrlsDetected();

            String seeds = StringUtils.join(department_web_addresses, ",");
            ProjectLogger.LOGGER.info("Begin crawling: " + sUnitOfAssessment_Description + " - "
                    + sInstitutionName + " - [" + seeds + "]");
            long startMillis = System.currentTimeMillis();

            controllerReseachers.start(CrawlerResearchesPagesV3.class, NUMBER_OF_CRAWLERS);

            controllerReseachers.postProcessResults();

            long elapsedMillis = System.currentTimeMillis() - startMillis;
            ProjectLogger.LOGGER.info("End crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName
                    + " - Time: " + elapsedMillis + " ms - [" + seeds + "]");
        } catch (Exception ex) {
            ProjectLogger.LOGGER.error(ex.getMessage(), ex);
        } finally {
            // NOTE(review): the department crawl also calls releaseResources() on its controller;
            // check whether this controller offers (and needs) the same clean-up.
            if (controllerReseachers != null && CrawlerTrace.isTraceUrlsActive()) {
                controllerReseachers.closeCrawlerTrace();
            }
        }
        return controllerReseachers;
    }

    /**
     * Adds a {@code RESEARCHER_WEB_ADDRESS} element to each researcher for every detected URL
     * whose data equals the researcher's canonical name, and resets and fills the current-unit
     * counters. Any exception is logged and ends the processing of the unit.
     */
    private void attachResearcherWebAddresses(org.dom4j.Element elementUnitOfAssessment,
            CrawlerResearchesPagesV3Controller controllerReseachers) {
        counterTotal[COUNTER_CURRENT_UNIT] = 0;
        counterSuccess[COUNTER_CURRENT_UNIT] = 0;

        try {
            // The detected URLs do not change while the XML is updated, so look them up once.
            TreeMap<String, List<CandidateTypeURL>> detected = controllerReseachers.getInterestingUrlsDetected();
            List<CandidateTypeURL> candidates = detected == null ? null
                    : detected.get(CrawlerResearchesPagesV3Controller.RESEARCHER_RESULT_TAG);

            for (Iterator<org.dom4j.Element> groupIt = elementUnitOfAssessment
                    .elementIterator(XMLTags.RESEARCHGROUP); groupIt.hasNext();) {
                org.dom4j.Element groupElement = groupIt.next();

                for (Iterator<org.dom4j.Element> researcherIt = groupElement
                        .elementIterator(XMLTags.RESEARCHER); researcherIt.hasNext();) {
                    counterTotal[COUNTER_CURRENT_UNIT]++;
                    org.dom4j.Element researcher_element = researcherIt.next();

                    ResearcherNameInfo researcher_name_info = readResearcherName(researcher_element);
                    researcher_name_info.first_name = CandidateTypeURL
                            .getCanonicalName(researcher_name_info.first_name);
                    researcher_name_info.last_name = CandidateTypeURL
                            .getCanonicalName(researcher_name_info.last_name);
                    researcher_name_info.initial = CandidateTypeURL
                            .getCanonicalName(researcher_name_info.initial);
                    researcher_name_info.whole_name = CandidateTypeURL
                            .getCanonicalName(researcher_name_info.whole_name);

                    boolean found = false;
                    if (candidates != null) {
                        // FIXME (from the original author): a contains/remove based lookup would
                        // be better than scanning every candidate for every researcher.
                        for (CandidateTypeURL candidate : candidates) {
                            if (researcher_name_info.equals(candidate.data)) {
                                ProjectLogger.LOGGER.info("Add researcher '" + researcher_name_info
                                        + "' the url '" + candidate.sURL + "'");
                                researcher_element.addElement(XMLTags.RESEARCHER_WEB_ADDRESS)
                                        .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE, candidate.sSubType)
                                        .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT, candidate.sExt)
                                        .addText(candidate.sURL);
                                found = true;
                            }
                        }
                    }

                    if (found) {
                        counterSuccess[COUNTER_CURRENT_UNIT]++;
                    } else {
                        ProjectLogger.LOGGER.warn("No webpage for " + researcher_name_info);
                    }
                }
            }
        } catch (Exception ex) {
            ProjectLogger.LOGGER.error("Error", ex);
        }
    }

    /** Returns the absolute path of the temporary crawler data folder inside the working directory. */
    private String crawlerDataFolder() {
        return this.work_dir.getAbsolutePath() + File.separator + CRAWLER_DATA_FOLDERNAME;
    }

    /**
     * Turns a free-text name into a token usable in a folder name: non-word characters are
     * removed and the rest is lower-cased independently of the default locale.
     */
    private static String toFolderToken(String name) {
        return name.replaceAll("\\W+", "").toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Derives the pattern that restricts a crawl from a web address: the scheme
     * ({@code http://} or {@code https://}) and a leading {@code www.} are dropped and everything
     * from the first {@code /} on is cut off, e.g. {@code http://www.uma.es/dept} becomes
     * {@code uma.es}. An address without any {@code /} after the host is kept whole.
     */
    private static String extractContainPattern(String webAddress) {
        String host = stripPrefixIgnoreCase(webAddress, "http://");
        host = stripPrefixIgnoreCase(host, "https://");
        host = stripPrefixIgnoreCase(host, "www.");
        int slash = host.indexOf('/');
        return slash == -1 ? host : host.substring(0, slash);
    }

    /** Returns {@code text} without {@code prefix} if it starts with it (ignoring case). */
    private static String stripPrefixIgnoreCase(String text, String prefix) {
        if (text.regionMatches(true, 0, prefix, 0, prefix.length())) {
            return text.substring(prefix.length());
        }
        return text;
    }
}