eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.IteratorReseachersFile.java Source code

Java tutorial

Introduction

Here is the source code for eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.IteratorReseachersFile.java

Source

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format;

import java.io.File;
import java.util.Iterator;
import org.apache.log4j.Logger;

/**
 * Walks a researchers XML document (institution, unit of assessment, research group, researcher,
 * researcher web address) and invokes an overridable callback at each level, handing it the local
 * folder associated with that node.
 *
 * With {@code LocalFormatType.TREE_DIRECTORY} the folder passed to the callbacks follows this layout
 * (the last level, named after a hash of the URL, is described by the original author but is not
 * built by this class):
 *      base directory / university folder / discipline folder / researcher folder / web page folder
 * Example:
 *  download_pages\UniversityofBirmingham\Chemistry\Worth#G\3767dadc
 *
 * With {@code LocalFormatType.PLAIN_DIRECTORY} every callback receives the working directory followed
 * by a backslash. For any other format type the callbacks receive an empty path.
 *
 * NOTE: paths are assembled with literal backslashes, exactly as the callbacks have always received them.
 *
 * @author Daniel López González (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
abstract public class IteratorReseachersFile {
    /**
     * Characters stripped from names before they are used as folder names.
     * The pattern text is kept exactly as it has always been so that folder names do not change.
     */
    private static final java.util.regex.Pattern NON_LETTERS = java.util.regex.Pattern.compile("[^a-z^A-Z]");

    /**
     * XML file to parse; takes precedence over the other sources when not null.
     */
    protected File source_file_xml;
    /**
     * Already parsed document used as source when no file is configured.
     */
    protected org.dom4j.Document sourceXmlDocument;
    /**
     * Document actually iterated; set by {@link #iterate()} when the source is a file or a document.
     */
    protected org.dom4j.Document document;
    /**
     * Element whose institution children are iterated. It is either supplied directly or
     * overwritten by {@link #iterate()} with the root element of the file/document source.
     */
    protected org.dom4j.Element root;
    /**
     * Base directory from which the paths passed to the callbacks are built.
     */
    protected File work_dir;

    /**
     * Selects how the paths passed to the callbacks are built (tree or plain directory).
     */
    protected LocalFormatType local_format_type;

    /**
     * Creates an iterator without any source. One of the {@code IteratorReseachersFile(...)}
     * initialiser methods must be called before {@link #iterate()} can do any work.
     * Kept so that subclasses relying on the implicit no-argument constructor still compile.
     */
    public IteratorReseachersFile() {
    }

    /**
     * Creates an iterator that reads the researchers from an XML file.
     *
     * @param source_file_xml XML file to parse when {@link #iterate()} is called
     * @param work_dir base directory used to build the paths passed to the callbacks
     * @param local_format_type directory layout used to build those paths
     */
    public IteratorReseachersFile(File source_file_xml, File work_dir, LocalFormatType local_format_type) {
        assignSources(source_file_xml, null, null, work_dir, local_format_type);
    }

    /**
     * Creates an iterator that reads the researchers from an already parsed document.
     *
     * @param sourceXmlDocument document whose root element is iterated
     * @param work_dir base directory used to build the paths passed to the callbacks
     * @param local_format_type directory layout used to build those paths
     */
    public IteratorReseachersFile(org.dom4j.Document sourceXmlDocument, File work_dir,
            LocalFormatType local_format_type) {
        assignSources(null, sourceXmlDocument, null, work_dir, local_format_type);
    }

    /**
     * Creates an iterator that reads the researchers from the given element.
     *
     * @param root element whose institution children are iterated
     * @param work_dir base directory used to build the paths passed to the callbacks
     * @param local_format_type directory layout used to build those paths
     */
    public IteratorReseachersFile(org.dom4j.Element root, File work_dir, LocalFormatType local_format_type) {
        assignSources(null, null, root, work_dir, local_format_type);
    }

    /**
     * Legacy initialiser: despite its name this is an ordinary method (it has a return type),
     * not a constructor. It configures an XML file as the source and clears the other sources.
     *
     * @param source_file_xml XML file to parse when {@link #iterate()} is called
     * @param work_dir base directory used to build the paths passed to the callbacks
     * @param local_format_type directory layout used to build those paths
     */
    public void IteratorReseachersFile(File source_file_xml, File work_dir, LocalFormatType local_format_type) {
        assignSources(source_file_xml, null, null, work_dir, local_format_type);
    }

    /**
     * Legacy initialiser: despite its name this is an ordinary method (it has a return type),
     * not a constructor. It configures a parsed document as the source and clears the other sources.
     *
     * @param sourceXmlDocument document whose root element is iterated
     * @param work_dir base directory used to build the paths passed to the callbacks
     * @param local_format_type directory layout used to build those paths
     */
    public void IteratorReseachersFile(org.dom4j.Document sourceXmlDocument, File work_dir,
            LocalFormatType local_format_type) {
        assignSources(null, sourceXmlDocument, null, work_dir, local_format_type);
    }

    /**
     * Legacy initialiser: despite its name this is an ordinary method (it has a return type),
     * not a constructor. It configures an element as the source and clears the other sources.
     *
     * @param root element whose institution children are iterated
     * @param work_dir base directory used to build the paths passed to the callbacks
     * @param local_format_type directory layout used to build those paths
     */
    public void IteratorReseachersFile(org.dom4j.Element root, File work_dir, LocalFormatType local_format_type) {
        assignSources(null, null, root, work_dir, local_format_type);
    }

    /**
     * Stores the source (at most one of file, document and element is expected to be non-null)
     * together with the working directory and the directory layout.
     */
    private void assignSources(File sourceFile, org.dom4j.Document sourceDocument, org.dom4j.Element sourceRoot,
            File workDir, LocalFormatType formatType) {
        this.source_file_xml = sourceFile;
        this.sourceXmlDocument = sourceDocument;
        this.root = sourceRoot;
        this.work_dir = workDir;
        this.local_format_type = formatType;
    }

    /**
     * Removes every character matched by {@link #NON_LETTERS} so the text can be used as a folder name.
     */
    private static String lettersOnly(String text) {
        return NON_LETTERS.matcher(text).replaceAll("");
    }

    /**
     * Returns the text of an optional element, or an empty string when the element is absent.
     */
    private static String textOrEmpty(org.dom4j.Element element) {
        return element == null ? "" : element.getText();
    }

    /**
     * Returns {@code fallback} when {@code value} is null or empty, otherwise {@code value}.
     */
    private static String defaultIfEmpty(String value, String fallback) {
        return (value == null || value.isEmpty()) ? fallback : value;
    }

    /**
     * Iterates the configured source and invokes the callbacks for every institution, unit of
     * assessment, researcher and non-empty researcher web address. The iteration stops as soon as
     * a callback returns {@code false} or {@link #beginActions()} throws; {@link #endActions()} is
     * still invoked afterwards and its exceptions are only logged.
     *
     * @return {@code false} when no source is configured, when {@link #beginActions()} failed or
     *         when a callback asked to stop; {@code true} otherwise
     * @throws Exception if the XML file cannot be read, or if a required child element is missing
     *         (which surfaces as a {@link NullPointerException})
     */
    public boolean iterate() throws Exception {
        // Resolve the source: a file wins over a parsed document, which wins over a bare root element.
        if (source_file_xml != null) {
            org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
            document = reader.read(source_file_xml);
            root = document.getRootElement();
        } else if (sourceXmlDocument != null) {
            document = sourceXmlDocument;
            root = document.getRootElement();
        } else if (root == null) {
            return false;
        }

        final String dirBase = work_dir + "\\";

        boolean end = false;

        try {
            beginActions();
        } catch (Exception ex) {
            end = true;
            Logger.getLogger("root").error("", ex);
        }

        for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext() && !end;) {
            org.dom4j.Element institution = (org.dom4j.Element) i1.next();

            String sInstitutionName = institution.element(XMLTags.INSTITUTION_NAME).getText();
            String sWebAddress = institution.element(XMLTags.INSTITUTION_WEBADDRESS).getText();

            String dirI = "";
            if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY))
                dirI = dirBase + "\\" + lettersOnly(sInstitutionName) + "\\";
            else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY))
                dirI = dirBase;

            end = !actionsInInstitutionNode(institution, dirI, sInstitutionName, sWebAddress);

            for (Iterator i2 = institution.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext() && !end;) {
                org.dom4j.Element unit = (org.dom4j.Element) i2.next();

                String sUnitOfAssessment_Description = unit.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION)
                        .getText();

                String dirUAD = "";
                if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY))
                    dirUAD = dirI + "\\" + lettersOnly(sUnitOfAssessment_Description) + "\\";
                else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY))
                    dirUAD = dirBase;

                end = !actionsInUnitOfAssessmentNode(unit, dirUAD, sInstitutionName, sWebAddress,
                        sUnitOfAssessment_Description);

                // Research groups have no callback and no folder of their own; only their description is passed on.
                for (Iterator i3 = unit.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext() && !end;) {
                    org.dom4j.Element group = (org.dom4j.Element) i3.next();
                    String sResearchGroupDescription = group.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText();

                    for (Iterator i4 = group.elementIterator(XMLTags.RESEARCHER); i4.hasNext() && !end;) {
                        org.dom4j.Element researcher = (org.dom4j.Element) i4.next();

                        String sResearchLastName = researcher.element(XMLTags.RESEARCHER_LASTNAME).getText();
                        String sResearchInitials = researcher.element(XMLTags.RESEARCHER_INITIALS).getText();
                        // First name and whole name are optional in the XML.
                        String sResearchFirstName = textOrEmpty(researcher.element(XMLTags.RESEARCHER_FIRSTNAME));
                        String sResearchName = textOrEmpty(researcher.element(XMLTags.RESEARCHER_NAME));

                        String sStaffIndentifier = researcher.element(XMLTags.RESEARCHER_STAFFIDENTIFIER)
                                .getText();

                        String sAux = lettersOnly(sResearchLastName) + "#" + lettersOnly(sResearchInitials);

                        String dirR = "";
                        if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY))
                            dirR = dirUAD + "\\" + sAux + "\\";
                        else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY))
                            dirR = dirBase;

                        ResearcherNameInfo rsi = new ResearcherNameInfo(sResearchLastName, sResearchInitials,
                                sResearchFirstName, sResearchName);
                        end = !actionsInResearcherNode(researcher, dirR, sInstitutionName, sWebAddress,
                                sUnitOfAssessment_Description, sResearchGroupDescription, rsi, sStaffIndentifier);

                        for (Iterator i5 = researcher.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS); i5.hasNext()
                                && !end;) {
                            org.dom4j.Element webAddress = (org.dom4j.Element) i5.next();

                            String url = webAddress.getText();
                            if (url.equals("")) {
                                // Empty addresses are skipped without notifying the callback.
                                continue;
                            }

                            // Missing or empty attributes fall back to their documented defaults.
                            String ext = defaultIfEmpty(
                                    webAddress.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT),
                                    XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML);
                            String type = defaultIfEmpty(
                                    webAddress.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE),
                                    XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV);

                            // The callback receives the researcher element, not the web address element.
                            end = !actionsInResearcherWebPageNode(researcher, dirR, sInstitutionName, sWebAddress,
                                    sUnitOfAssessment_Description, sResearchGroupDescription, rsi,
                                    sStaffIndentifier, url, ext, type);
                        }
                    }
                }
            }
        }

        try {
            endActions();
        } catch (Exception ex) {
            Logger.getLogger("root").error("", ex);
        }

        return !end;
    }

    /**
     * Hook invoked once before the iteration starts. The default implementation does nothing.
     *
     * @throws Exception to abort the iteration; the exception is logged and
     *         {@link #iterate()} returns {@code false}
     */
    protected void beginActions() throws Exception {
    }

    /**
     * Hook invoked for every institution element. The default implementation does nothing.
     *
     * @param elementUnitOfAssessment the institution element (despite the parameter name)
     * @param path folder associated with the institution
     * @param sInstitutionName text of the institution name element
     * @param sWebAddress text of the institution web address element
     * @return {@code true} to continue iterating, {@code false} to stop
     */
    protected boolean actionsInInstitutionNode(org.dom4j.Element elementUnitOfAssessment, String path,
            String sInstitutionName, String sWebAddress) {
        return true;
    }

    /**
     * Hook invoked for every unit of assessment element. The default implementation does nothing.
     *
     * @param elementUnitOfAssessment the unit of assessment element
     * @param path folder associated with the unit of assessment
     * @param sInstitutionName text of the enclosing institution name element
     * @param sWebAddress text of the enclosing institution web address element
     * @param sUnitOfAssessment_Description text of the unit of assessment description element
     * @return {@code true} to continue iterating, {@code false} to stop
     */
    protected boolean actionsInUnitOfAssessmentNode(org.dom4j.Element elementUnitOfAssessment, String path,
            String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description) {
        return true;
    }

    /**
     * Hook invoked for every researcher element. The default implementation does nothing.
     *
     * @param elementResearcher the researcher element
     * @param path folder associated with the researcher
     * @param sInstitutionName text of the enclosing institution name element
     * @param sWebAddress text of the enclosing institution web address element
     * @param sUnitOfAssessment_Description text of the enclosing unit of assessment description element
     * @param sResearchGroupDescription text of the enclosing research group description element
     * @param researcherNameInfo last name, initials, first name and whole name of the researcher
     * @param sStaffIndentifier text of the researcher staff identifier element
     * @return {@code true} to continue iterating, {@code false} to stop
     */
    protected boolean actionsInResearcherNode(org.dom4j.Element elementResearcher, String path,
            String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description,
            String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
        return true;
    }

    /**
     * Hook invoked for every non-empty web address of a researcher. The default implementation
     * does nothing.
     *
     * @param elementResearcher the researcher element that owns the web address
     * @param path folder associated with the researcher
     * @param sInstitutionName text of the enclosing institution name element
     * @param sWebAddress text of the enclosing institution web address element
     * @param sUnitOfAssessment_Description text of the enclosing unit of assessment description element
     * @param sResearchGroupDescription text of the enclosing research group description element
     * @param researcherNameInfo last name, initials, first name and whole name of the researcher
     * @param sStaffIndentifier text of the researcher staff identifier element
     * @param url text of the web address element
     * @param ext value of the extension attribute, or its default when missing or empty
     * @param type value of the type attribute, or its default when missing or empty
     * @return {@code true} to continue iterating, {@code false} to stop
     */
    protected boolean actionsInResearcherWebPageNode(org.dom4j.Element elementResearcher, String path,
            String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description,
            String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier,
            String url, String ext, String type) {
        return true;
    }

    /**
     * Hook invoked once after the iteration has finished or was stopped. The default
     * implementation does nothing.
     *
     * @throws Exception on failure; the exception is logged by {@link #iterate()} and does not
     *         change its result
     */
    protected void endActions() throws Exception {
    }
}