eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsOnXMLFileForTextMiningCreator.java Source code

Introduction

Here is the source code for eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsOnXMLFileForTextMiningCreator.java
Source

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.uma.crawler.ResearchersCrawlers.Workers;

import eu.sisob.uma.api.prototypetextmining.globals.DataExchangeLiterals;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.IteratorReseachersFile;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.LocalFormatType;
import eu.sisob.uma.crawler.ResearchersCrawlers.Utils.MurmurHash;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.ResearcherNameInfo;
import java.io.*;
import java.util.Iterator;
import java.util.logging.Logger;
import org.dom4j.DocumentFactory;
import org.dom4j.Element;

/**
 * This class creates an XML file containing infoblock elements intended for parsing by the
 * prototypeTextMiningGate object.
 * The format is like this:
 * <pre>{@code
 *  <infoblock DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT=sStaffIndentifier
 *             MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT=ID_TEXTMININGPARSER_GATERESEARCHER
 *             MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING=ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC>
 *  Local URI of webpage
 *  </infoblock>
 * }</pre>
 *
 * @author Daniel López González (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
public class ExportDocumentsOnXMLFileForTextMiningCreator extends IteratorReseachersFile {
    /** Destination file that receives the serialized XML document in {@link #endActions()}. */
    File dest_file_xml;
    /** In-memory output document; created in {@link #beginActions()}. */
    org.dom4j.Document docOut;
    /** Root element ("root") of {@link #docOut}; every infoblock element is appended to it. */
    org.dom4j.Element rootOut;
    /** Wall-clock time in milliseconds captured in {@link #beginActions()}; not read in this class. */
    long lTimerAux;
    /** Allocated as a 2x5 table in {@link #beginActions()}; not read in this class. */
    int hitsTable[][];

    /**
     * Creates the exporter.
     *
     * @param source_file_xml   forwarded unchanged to the superclass constructor
     * @param downloadPagesDir  forwarded unchanged to the superclass constructor
     * @param dest_file_xml     file the resulting XML document is written to (overwritten, not appended)
     * @param local_format_type forwarded unchanged to the superclass constructor
     */
    public ExportDocumentsOnXMLFileForTextMiningCreator(File source_file_xml, File downloadPagesDir,
            File dest_file_xml, LocalFormatType local_format_type) {
        // Must be an explicit superclass constructor invocation; calling the class name
        // like a method does not initialize the parent.
        super(source_file_xml, downloadPagesDir, local_format_type);
        this.dest_file_xml = dest_file_xml;
    }

    /**
     * Prepares a fresh output document with a single "root" element and resets the auxiliary
     * timer and hits table.
     */
    @Override
    protected void beginActions() {
        docOut = new DocumentFactory().createDocument();
        rootOut = docOut.addElement("root");

        hitsTable = new int[2][5];

        lTimerAux = java.lang.System.currentTimeMillis();
    }

    /**
     * Scans the folder of one researcher and adds one infoblock element per clean file found in
     * the sub-folders that correspond to the researcher's web addresses.
     * <p>
     * A sub-folder corresponds to a "ResearcherWebAddress" child element when its name equals the
     * hexadecimal MurmurHash (seed 5) of the element's text. Every (sub-folder, address) pair that
     * does not match is reported as a warning on the "MyLog" logger.
     *
     * @param elementResearcher             researcher element whose "ResearcherWebAddress" children are read
     * @param path                          folder of the researcher on the local disk
     * @param sInstitutionName              not used by this implementation
     * @param sWebAddress                   not used by this implementation
     * @param sUnitOfAssessment_Description not used by this implementation
     * @param sResearchGroupDescription     not used by this implementation
     * @param researcherNameInfo            not used by this implementation
     * @param sStaffIndentifier             value stored in the entity-id attribute of each infoblock
     * @return always {@code true}
     */
    @Override
    protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName,
            String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription,
            ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
        File researcherDir = new File(path);
        File[] pageDirs = researcherDir.listFiles();

        if (pageDirs == null) {
            // path is not a directory (or could not be read): nothing to export for this researcher.
            return true;
        }

        for (File pageDir : pageDirs) {
            for (Iterator<?> it = elementResearcher.elementIterator("ResearcherWebAddress"); it.hasNext();) {
                Element webAddress = (Element) it.next();
                String sURL = webAddress.getText();

                // NOTE(review): getBytes() uses the platform default charset. It is kept as-is because
                // the folder names on disk were presumably produced the same way - confirm before changing.
                String expectedDirName = Integer.toHexString(MurmurHash.hash(sURL.getBytes(), 5));

                // pageDir is a direct child of researcherDir, so comparing names is equivalent to
                // comparing full paths, without depending on the platform's path separator.
                if (pageDir.getName().equals(expectedDirName)) {
                    addCleanFiles(pageDir, sStaffIndentifier);
                } else {
                    Logger.getLogger("MyLog").warning(
                            "DIR NOT EQUAL: " + pageDir.getPath() + " != "
                                    + new File(researcherDir, expectedDirName).getPath());
                }
            }
        }

        return true;
    }

    /**
     * Appends one infoblock element to the output root for every clean, non-publication file
     * directly inside {@code pageDir}.
     *
     * @param pageDir           folder holding the files downloaded for one web address
     * @param sStaffIndentifier value stored in the entity-id attribute of each infoblock
     */
    private void addCleanFiles(File pageDir, String sStaffIndentifier) {
        File[] pageFiles = pageDir.listFiles();
        if (pageFiles == null) {
            // The matching entry is a plain file or unreadable; listFiles() signals that with null.
            return;
        }

        for (File candidate : pageFiles) {
            String name = candidate.getName();
            // FIXME (carried over from the original code): files whose name contains "pub" are skipped.
            if (name.contains("clean_") && !name.contains("pub") && candidate.exists()) {
                rootOut.addElement("infoblock")
                        .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT,
                                sStaffIndentifier)
                        .addAttribute(
                                DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT,
                                DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER)
                        .addAttribute(
                                DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING,
                                DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC)
                        .addText(candidate.getAbsolutePath());
            }
        }
    }

    /**
     * Serializes the output document as UTF-8 text into the destination file, replacing any
     * previous content, and logs the export on the "MyLog" logger.
     *
     * @throws Exception if the destination file cannot be opened or written
     */
    @Override
    protected void endActions() throws Exception {
        FileOutputStream fileOS = new java.io.FileOutputStream(dest_file_xml, false);
        try {
            BufferedWriter bw = new java.io.BufferedWriter(new java.io.OutputStreamWriter(fileOS, "UTF-8"));
            bw.write(docOut.asXML());
            // Push buffered characters down to the stream before it is closed below.
            bw.flush();
        } finally {
            // Always release the file handle, even when serialization or writing fails.
            fileOS.close();
        }
        Logger.getLogger("MyLog").info(dest_file_xml + " export!.");
    }

}