eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.FileFormatConversor.java Source code

Introduction

Here is the source code for eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.FileFormatConversor.java
Source

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Malaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format;

import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import org.apache.log4j.Logger;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;

/**
 * Converts researcher data between a semicolon-separated CSV layout and an XML
 * layout whose element names come from {@link XMLTags}.
 *
 * <p>The CSV side is identified by header names (the {@code CSV_COL_*} constants);
 * column order in the file does not matter.
 *
 * @author Daniel Lopez Gonzalez (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
public class FileFormatConversor {
    /** Header of the staff-identifier column; required by both CSV readers in this class. */
    public static final String CSV_COL_ID = "ID";
    /** Header of the optional full-name column. */
    public static final String CSV_COL_NAME = "NAME";
    /** Header of the optional first-name column. */
    public static final String CSV_COL_FIRSTNAME = "FIRSTNAME";
    /** Header of the last-name column; required by both CSV readers in this class. */
    public static final String CSV_COL_LASTNAME = "LASTNAME";
    /** Header of the initials column; required by both CSV readers in this class. */
    public static final String CSV_COL_INITIALS = "INITIALS";
    /** Header of the e-mail column; not read by this class. */
    public static final String CSV_COL_EMAIL = "EMAIL";
    /** Header of the subject column, written to XML as the unit-of-assessment description. */
    public static final String CSV_COL_SUBJECT = "SUBJECT";
    /** Header of the institution-name column. */
    public static final String CSV_COL_INSTITUTION_NAME = "INSTITUTION_NAME";
    /** Header of the institution web-address column. */
    public static final String CSV_COL_INSTITUTION_URL = "INSTITUTION_URL";
    /** Header of the optional researcher web-page address column. */
    public static final String CSV_COL_RESEARCHER_PAGE_URL = "RESEARCHER_PAGE_URL";
    /** Header of the optional column giving the type attribute of the researcher web page. */
    public static final String CSV_COL_RESEARCHER_PAGE_TYPE = "RESEARCHER_PAGE_TYPE";
    /** Header of the optional column giving the ext attribute of the researcher web page. */
    public static final String CSV_COL_RESEARCHER_PAGE_EXT = "RESEARCHER_PAGE_EXT";
    /** Header of the URL-score column; not read by this class. */
    public static final String CSV_COL_SCORE_URL = "SCORE_URL";
    /** Header of the e-mail-score column; not read by this class. */
    public static final String CSV_COL_SCORE_EMAIL = "SCORE_EMAIL";
    /** Header of the CV-file column; not read by this class. */
    public static final String CSV_COL_CV_FILE = "CV_FILE";
    /** Header of the CV-file-score column; not read by this class. */
    public static final String CSV_COL_SCORE_CV_FILE = "SCORE_CV_FILE";

    /** Same logger the class has always used (looked up by the name "root"). */
    private static final Logger LOG = Logger.getLogger("root");

    /** Field separator of every CSV file read or written here. */
    private static final char CSV_SEPARATOR = ';';

    /**
     * Positions of the known columns inside one CSV file, discovered from its header row.
     * A position of -1 means the header does not contain that column.
     */
    private static final class CsvColumns {
        int staffIdentifier = -1;
        int name = -1;
        int firstName = -1;
        int lastName = -1;
        int initials = -1;
        int subject = -1;
        int institutionName = -1;
        int institutionUrl = -1;
        int researcherPageUrl = -1;
        int researcherPageType = -1;
        int researcherPageExt = -1;

        /**
         * Maps header names to positions. Matching is exact; unknown headers are ignored
         * and, if a header is repeated, the last occurrence wins.
         *
         * @param header first row of the file, or null when the file is empty
         */
        static CsvColumns fromHeader(String[] header) {
            CsvColumns cols = new CsvColumns();
            if (header == null) {
                return cols;
            }
            for (int i = 0; i < header.length; i++) {
                String columnName = header[i];
                if (CSV_COL_ID.equals(columnName)) {
                    cols.staffIdentifier = i;
                } else if (CSV_COL_NAME.equals(columnName)) {
                    cols.name = i;
                } else if (CSV_COL_FIRSTNAME.equals(columnName)) {
                    cols.firstName = i;
                } else if (CSV_COL_LASTNAME.equals(columnName)) {
                    cols.lastName = i;
                } else if (CSV_COL_INITIALS.equals(columnName)) {
                    cols.initials = i;
                } else if (CSV_COL_SUBJECT.equals(columnName)) {
                    cols.subject = i;
                } else if (CSV_COL_INSTITUTION_NAME.equals(columnName)) {
                    cols.institutionName = i;
                } else if (CSV_COL_INSTITUTION_URL.equals(columnName)) {
                    cols.institutionUrl = i;
                } else if (CSV_COL_RESEARCHER_PAGE_URL.equals(columnName)) {
                    cols.researcherPageUrl = i;
                } else if (CSV_COL_RESEARCHER_PAGE_TYPE.equals(columnName)) {
                    cols.researcherPageType = i;
                } else if (CSV_COL_RESEARCHER_PAGE_EXT.equals(columnName)) {
                    cols.researcherPageExt = i;
                }
            }
            return cols;
        }

        /** True when the header has the three columns that identify a researcher. */
        boolean hasIdentityColumns() {
            return lastName != -1 && initials != -1 && staffIdentifier != -1;
        }

        /** True when the given row is long enough to hold the three identity cells. */
        boolean hasIdentityCells(String[] row) {
            return hasCell(row, lastName) && hasCell(row, initials) && hasCell(row, staffIdentifier);
        }

        /** True when the header has every column the CSV-to-XML conversion requires. */
        boolean hasColumnsForXml() {
            return hasIdentityColumns() && institutionUrl != -1 && institutionName != -1 && subject != -1;
        }

        /** True when the given row is long enough to hold every cell the CSV-to-XML conversion requires. */
        boolean hasCellsForXml(String[] row) {
            return hasIdentityCells(row) && hasCell(row, institutionUrl) && hasCell(row, institutionName)
                    && hasCell(row, subject);
        }
    }

    /** True when {@code index} denotes a present column and {@code row} is long enough to contain it. */
    private static boolean hasCell(String[] row, int index) {
        return index != -1 && index < row.length;
    }

    /** Returns the cell at {@code index}, or an empty string when the column or the cell is absent. */
    private static String cell(String[] row, int index) {
        return hasCell(row, index) ? row[index] : "";
    }

    /**
     * Creates the researcher CSV files from an in-memory researcher XML document.
     * The work is delegated to {@link ConversorFromXMLtoCSV}.
     *
     * @param sourceXmlDocument researcher XML document to convert
     * @param filePathCSV output CSV file
     * @param filePathCSV_nofound second output CSV file handed to the conversor
     *        (presumably the researchers for which nothing was found - confirm in ConversorFromXMLtoCSV)
     * @return the result of the conversion, or false if the conversion threw an exception
     * @throws FileNotFoundException declared for callers; see ConversorFromXMLtoCSV
     * @throws IOException declared for callers; see ConversorFromXMLtoCSV
     */
    public static boolean createResearchersCSVFileFromXML(org.dom4j.Document sourceXmlDocument, File filePathCSV,
            File filePathCSV_nofound) throws FileNotFoundException, IOException {
        ConversorFromXMLtoCSV c = new ConversorFromXMLtoCSV(sourceXmlDocument, filePathCSV, filePathCSV_nofound);
        try {
            return c.iterate();
        } catch (Exception ex) {
            // Keep the historical "log and report failure" behaviour, but do not lose the cause.
            LOG.error("Error iterating document to create '" + filePathCSV + "'", ex);
            return false;
        }
    }

    /**
     * Creates the researcher CSV files from several researcher XML files, appending the
     * content of every readable document to the same pair of CSV writers.
     *
     * <p>Documents that cannot be parsed are logged and skipped. Both writers are always
     * closed before returning.
     *
     * @param fileXmlDocuments XML files to convert, in order; null is treated as empty
     * @param filePathCSV output CSV file
     * @param filePathCSV_nofound second output CSV file handed to the conversor
     * @return the result of the most recent conversion that finished without throwing;
     *         false if no document was converted or a writer could not be created
     * @throws FileNotFoundException declared for callers; creation failures are logged, not thrown
     * @throws IOException declared for callers; creation failures are logged, not thrown
     */
    public static boolean createResearchersCSVFileFromXML(File[] fileXmlDocuments, File filePathCSV,
            File filePathCSV_nofound) throws FileNotFoundException, IOException {
        boolean success = false;
        CSVWriter writer = null;
        CSVWriter writer_nofound = null;

        try {
            try {
                writer = new CSVWriter(new FileWriter(filePathCSV), CSV_SEPARATOR);
            } catch (Exception ex) {
                LOG.error("Error create writer to create '" + filePathCSV + "'", ex);
            }

            try {
                writer_nofound = new CSVWriter(new FileWriter(filePathCSV_nofound), CSV_SEPARATOR);
            } catch (Exception ex) {
                LOG.error("Error create writer to create '" + filePathCSV_nofound + "'", ex);
            }

            if (writer == null || writer_nofound == null || fileXmlDocuments == null) {
                return false;
            }

            // Stays true until one document has actually been handed to the conversor.
            boolean first = true;

            for (int i = 0; i < fileXmlDocuments.length; i++) {
                boolean last = (i == fileXmlDocuments.length - 1);

                Document doc;
                try {
                    doc = new org.dom4j.io.SAXReader().read(fileXmlDocuments[i]);
                } catch (DocumentException ex) {
                    LOG.error("Error opening xml document '" + fileXmlDocuments[i].getPath()
                            + "' to create '" + filePathCSV + "'", ex);
                    continue;
                }

                ConversorFromXMLtoCSV c = new ConversorFromXMLtoCSV(doc, writer, writer_nofound, first, last);
                try {
                    success = c.iterate();
                } catch (Exception ex) {
                    LOG.error("Error iterating document to create '" + filePathCSV + "'", ex);
                }
                first = false;
            }
        } finally {
            // The conversor may already have closed the writers for the last document;
            // closing again is expected to be harmless, and it covers every path on
            // which the last document never reached the conversor.
            closeQuietly(writer, filePathCSV);
            closeQuietly(writer_nofound, filePathCSV_nofound);
        }

        return success;
    }

    /** Closes a writer if it was created, logging (not throwing) a failure so the result is not masked. */
    private static void closeQuietly(CSVWriter writer, File target) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException ex) {
            LOG.error("Error closing writer of '" + target + "'", ex);
        }
    }

    /**
     * Creates a researcher XML file from a researcher CSV file (per the original
     * author's note, the XML file is the one used by the crawler).
     *
     * <p>The header must contain {@link #CSV_COL_ID}, {@link #CSV_COL_LASTNAME},
     * {@link #CSV_COL_INITIALS}, {@link #CSV_COL_SUBJECT}, {@link #CSV_COL_INSTITUTION_NAME}
     * and {@link #CSV_COL_INSTITUTION_URL}. Researchers are grouped under one element per
     * institution and, inside it, one per subject. Single quotes are removed from every cell.
     * Nothing is written when the header is incomplete or a row lacks a required cell.
     *
     * @param filePathCSV input CSV file
     * @param filePathXml output XML file, overwritten and encoded as UTF-8
     * @return true if the XML file was written; false if the CSV header or a row is incomplete
     * @throws FileNotFoundException if a file cannot be opened
     * @throws IOException if reading the CSV or writing the XML fails
     */
    public static boolean createResearchersXMLFileFromCSV(File filePathCSV, File filePathXml)
            throws FileNotFoundException, IOException {
        Document document;

        CSVReader reader = new CSVReader(new FileReader(filePathCSV), CSV_SEPARATOR);
        try {
            CsvColumns cols = CsvColumns.fromHeader(reader.readNext());
            if (!cols.hasColumnsForXml()) {
                return false;
            }
            document = buildResearchersDocument(reader, cols);
        } finally {
            // Close on every path, including a rejected header and read errors.
            reader.close();
        }

        if (document == null) {
            return false;
        }

        writeDocument(document, filePathXml);
        return true;
    }

    /**
     * Reads the data rows that follow the header and builds the XML tree
     * root / institution / unit of assessment / research group / researcher.
     *
     * @return the document, or null when a row is too short to hold a required cell
     */
    private static Document buildResearchersDocument(CSVReader reader, CsvColumns cols) throws IOException {
        Document document = DocumentHelper.createDocument();
        Element root = document.addElement("root");

        // No CSV column maps to a research-group description, so every researcher of a
        // unit of assessment ends up in one group with an empty description.
        final String researchGroupDescription = "";

        int rowNumber = 1; // the header was row 1
        String[] row;
        while ((row = reader.readNext()) != null) {
            rowNumber++;

            if (!cols.hasCellsForXml(row)) {
                // Same verdict checkResearchersCSV gives for a short row: the file is invalid.
                LOG.error("CSV row " + rowNumber + " has only " + row.length
                        + " column(s); a required column is missing");
                return null;
            }

            // The values are embedded in single-quoted XPath literals below, so quotes
            // must not appear in them.
            for (int k = 0; k < row.length; k++) {
                row[k] = row[k].replace("'", "");
            }

            String institutionName = row[cols.institutionName];
            Element eInstitution = findByLabel(root, XMLTags.INSTITUTION, XMLTags.INSTITUTION_NAME,
                    institutionName);
            if (eInstitution == null) {
                eInstitution = root.addElement(XMLTags.INSTITUTION);
                eInstitution.addElement(XMLTags.INSTITUTION_NAME).addCDATA(institutionName);
                eInstitution.addElement(XMLTags.INSTITUTION_WEBADDRESS).addCDATA(row[cols.institutionUrl]);
            }

            String subject = row[cols.subject];
            Element eUnitOfAssessment = findByLabel(eInstitution, XMLTags.UNIT_OF_ASSESSMENT,
                    XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION, subject);
            if (eUnitOfAssessment == null) {
                eUnitOfAssessment = eInstitution.addElement(XMLTags.UNIT_OF_ASSESSMENT);
                eUnitOfAssessment.addElement(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).addCDATA(subject);
            }

            Element eResearchGroup = findByLabel(eUnitOfAssessment, XMLTags.RESEARCHGROUP,
                    XMLTags.RESEARCHGROUP_DESCRIPTION, researchGroupDescription);
            if (eResearchGroup == null) {
                eResearchGroup = eUnitOfAssessment.addElement(XMLTags.RESEARCHGROUP);
                eResearchGroup.addElement(XMLTags.RESEARCHGROUP_DESCRIPTION).addCDATA(researchGroupDescription);
            }

            addResearcher(eResearchGroup, row, cols);
        }

        return document;
    }

    /**
     * Looks under {@code parent} for a {@code containerTag} child whose {@code labelTag}
     * child has exactly the text {@code label}.
     *
     * @param label text to match; must not contain a single quote
     * @return the matching container element, or null when there is none
     */
    private static Element findByLabel(Element parent, String containerTag, String labelTag, String label) {
        Node labelNode = parent.selectSingleNode(containerTag + "/" + labelTag + "[text()='" + label + "']");
        return labelNode == null ? null : labelNode.getParent();
    }

    /** Appends one researcher element, built from the given CSV row, to the research group. */
    private static void addResearcher(Element eResearchGroup, String[] row, CsvColumns cols) {
        Element eResearcher = eResearchGroup.addElement(XMLTags.RESEARCHER);
        eResearcher.addElement(XMLTags.RESEARCHER_STAFFIDENTIFIER).addCDATA(row[cols.staffIdentifier]);

        // Optional columns: the element is emitted whenever the column exists in the
        // header; a row too short to reach it contributes an empty value.
        if (cols.name != -1) {
            eResearcher.addElement(XMLTags.RESEARCHER_NAME).addCDATA(cell(row, cols.name));
        }
        if (cols.firstName != -1) {
            eResearcher.addElement(XMLTags.RESEARCHER_FIRSTNAME).addCDATA(cell(row, cols.firstName));
        }

        eResearcher.addElement(XMLTags.RESEARCHER_LASTNAME).addCDATA(row[cols.lastName]);
        eResearcher.addElement(XMLTags.RESEARCHER_INITIALS).addCDATA(row[cols.initials]);

        String researcherWebAddress = cell(row, cols.researcherPageUrl).trim();
        if (researcherWebAddress.equals("")) {
            return;
        }

        // Defaults apply only when the cell is absent; a present but empty cell is kept as is.
        String researcherWebAddressType = hasCell(row, cols.researcherPageType)
                ? row[cols.researcherPageType]
                : XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV;
        String researcherWebAddressExt = hasCell(row, cols.researcherPageExt)
                ? row[cols.researcherPageExt]
                : XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML;

        eResearcher.addElement(XMLTags.RESEARCHER_WEB_ADDRESS)
                .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE, researcherWebAddressType)
                .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT, researcherWebAddressExt)
                .addCDATA(researcherWebAddress);
    }

    /** Serialises the document to the given file as UTF-8, replacing any previous content. */
    private static void writeDocument(Document document, File filePathXml) throws IOException {
        FileOutputStream fileOS = new FileOutputStream(filePathXml, false);
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fileOS, "UTF-8"));
            bw.write(document.asXML());
            bw.close();
        } finally {
            // Releases the file handle even when the write above failed.
            fileOS.close();
        }
    }

    /**
     * Checks that a researcher CSV file has the columns a later processing step needs
     * and that every data row is long enough to contain them.
     *
     * <p>{@link #CSV_COL_ID}, {@link #CSV_COL_LASTNAME} and {@link #CSV_COL_INITIALS} are
     * always required. In addition, {@link #CSV_COL_RESEARCHER_PAGE_URL} is required when
     * {@code urls} is true; otherwise {@link #CSV_COL_INSTITUTION_URL} and
     * {@link #CSV_COL_INSTITUTION_NAME} are required. Cell contents are not inspected.
     *
     * @param filePathCSV CSV file to check
     * @param urls selects which additional columns are required (see above)
     * @return true if the header has the required columns and no row is too short;
     *         false otherwise, including for an empty file
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    public static boolean checkResearchersCSV(File filePathCSV, boolean urls)
            throws FileNotFoundException, IOException {
        CSVReader reader = new CSVReader(new FileReader(filePathCSV), CSV_SEPARATOR);
        try {
            CsvColumns cols = CsvColumns.fromHeader(reader.readNext());

            boolean modeColumnsPresent = urls
                    ? cols.researcherPageUrl != -1
                    : cols.institutionUrl != -1 && cols.institutionName != -1;
            if (!cols.hasIdentityColumns() || !modeColumnsPresent) {
                return false;
            }

            String[] row;
            while ((row = reader.readNext()) != null) {
                boolean modeCellsPresent = urls
                        ? hasCell(row, cols.researcherPageUrl)
                        : hasCell(row, cols.institutionUrl) && hasCell(row, cols.institutionName);
                if (!cols.hasIdentityCells(row) || !modeCellsPresent) {
                    return false;
                }
            }
            return true;
        } finally {
            // Close on every path, including a rejected header.
            reader.close();
        }
    }
}