Java tutorial
/*
 * Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
 * http://iaia.lcc.uma.es | http://www.uma.es
 *
 * This file is part of SISOB Data Extractor.
 *
 * SISOB Data Extractor is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or (at
 * your option) any later version.
 *
 * SISOB Data Extractor is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 * Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
 */
package eu.sisob.uma.NPL.Researchers;

import au.com.bytecode.opencsv.CSVReader;

import eu.sisob.uma.api.prototypetextmining.DataInputRepository;
import eu.sisob.uma.api.prototypetextmining.MiddleData;
import eu.sisob.uma.api.prototypetextmining.RepositoryPreprocessDataMiddleData;
import eu.sisob.uma.api.prototypetextmining.TextMiningParserMonitor;
import eu.sisob.uma.api.prototypetextmining.globals.DataExchangeLiterals;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.FileFormatConversor;

import gate.Document;
import gate.Gate;
import gate.creole.ResourceInstantiationException;
import gate.util.GateException;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.*;
import org.dom4j.DocumentException;

/**
 * @author Daniel López González (dlopezgonzalez@gmail.com) for the SISOB
 *         PROJECT (http://sisob.lcc.uma.es/)
 *
 * Data extraction tasks (prototype). See: DataExtractionPrototype_1.pdf.
 * Includes: PROCESS STEP 4.
 */
public class GateDataExtractorSingle {

    /**
     * Initialises the GATE framework and registers the ANNIE and Tools
     * plugin directories.
     *
     * @param path path of the GATE home directory
     * @throws GateException
     * @throws MalformedURLException
     */
    public static void GateUp(String path) throws GateException, MalformedURLException {
        ProjectLogger.LOGGER.info("Initialising GATE ...");

        File home_path = new File(path);
        Gate.setGateHome(home_path);
        File plugins_path = new File(home_path, "plugins");
        Gate.setPluginsHome(plugins_path);
        Gate.init();

        File gateHome = new File(Gate.getGateHome().getAbsolutePath());
        File pluginsHome = new File(gateHome, "plugins");
        Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "ANNIE").toURI().toURL());
        Gate.getCreoleRegister().registerDirectories(new File(pluginsHome, "Tools").toURI().toURL());

        ProjectLogger.LOGGER.info("Done!");
    }
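
    /*
     * Usage sketch (not part of the original sources; the installation path
     * below is hypothetical). GateUp must run once before any document is
     * created through gate.Factory, and it expects the ANNIE and Tools
     * plugin directories under <gate-home>/plugins:
     *
     *     GateDataExtractorSingle.GateUp("/opt/gate");
     */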
    /**
     * PROCESS STEP 4.
     * The Data Extractor uses GATE (Cunningham et al., 2011) for processing
     * and annotating the provided data, in order to extract useful
     * information about the researchers.
     *
     * Inputs:
     * - Data in the form of blocks of information useful or interesting for
     *   extraction, obtained from the third module. XML format:
     *     <root>
     *       <infoblock id=researcherid type=I_INDEX_DATA_TYPE>content or URL</infoblock>
     *     </root>
     * Outputs:
     * - Processed and annotated useful data stored in a repository.
     *   See TextMiningParserGateResearcher.iniAnnotatorCollectors.
     *
     * @param infoblocksXmlFile
     * @param verbose
     * @param verbose_dir
     * @return RepositoryPreprocessDataMiddleData filled with the extracted data
     * @throws DocumentException
     */
    public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromXml(File infoblocksXmlFile,
            boolean verbose, File verbose_dir) throws DocumentException {
        RepositoryPreprocessDataMiddleData preprocessedRep = new RepositoryPreprocessDataMiddleData();

        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        org.dom4j.Document document = reader.read(infoblocksXmlFile);
        org.dom4j.Element root = document.getRootElement();

        int count = 0;
        for (Iterator i = root.elementIterator("infoblock"); i.hasNext();) {
            org.dom4j.Element ib = (org.dom4j.Element) i.next();
            MiddleData aoPreProcessData = new MiddleData(
                    ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT),
                    ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT),
                    ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING),
                    ib.getText(),
                    null,
                    verbose,
                    verbose_dir);
            preprocessedRep.addData(aoPreProcessData);
            count++;
        }

        ProjectLogger.LOGGER.info(count + " documents added");
        return preprocessedRep;
    }
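
    /*
     * Illustrative input for createPreprocessRepositoryFromXml (the id,
     * type and URL are invented; the real attribute names are whatever the
     * DataExchangeLiterals constants resolve to):
     *
     *     <root>
     *       <infoblock id="r0042" type="I_INDEX_DATA_TYPE">
     *         http://www.example.edu/staff/jsmith.html
     *       </infoblock>
     *     </root>
     */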
    /**
     * Builds a pre-process repository from a CSV listing of researchers and
     * their web pages, optionally splitting each page into keyword-delimited
     * blocks that are written to dest_dir before being added.
     *
     * @param reader
     * @param separator
     * @param data_dir
     * @param verbose
     * @param verbose_dir
     * @param split_by_keyword
     * @param blocks_and_keywords
     * @param dest_dir
     * @return RepositoryPreprocessDataMiddleData filled with the pre-processed data
     * @throws IOException
     */
    public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromCSV(CSVReader reader,
            char separator, File data_dir, boolean verbose, File verbose_dir, boolean split_by_keyword,
            HashMap<String, String[]> blocks_and_keywords, File dest_dir) throws IOException {
        RepositoryPreprocessDataMiddleData preprocessedRep = new RepositoryPreprocessDataMiddleData();

        String[] nextLine;
        int idStaffIdentifier = -1;
        int idName = -1;
        int idFirstName = -1;
        int idLastName = -1;
        int idInitials = -1;
        int idUnitOfAssessment_Description = -1;
        int idInstitutionName = -1;
        int idWebAddress = -1;
        int idResearcherWebAddress = -1;
        int idResearcherWebAddressType = -1;
        int idResearcherWebAddressExt = -1;

        // Map the header row to column indexes.
        if ((nextLine = reader.readNext()) != null) {
            for (int i = 0; i < nextLine.length; i++) {
                String column_name = nextLine[i];
                if (column_name.equals(FileFormatConversor.CSV_COL_ID))
                    idStaffIdentifier = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_NAME))
                    idName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME))
                    idFirstName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME))
                    idLastName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS))
                    idInitials = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT))
                    idUnitOfAssessment_Description = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME))
                    idInstitutionName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL))
                    idWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL))
                    idResearcherWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE))
                    idResearcherWebAddressType = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT))
                    idResearcherWebAddressExt = i;
            }
        }

        if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) {
            Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+");
            int count = 0;
            while ((nextLine = reader.readNext()) != null) {
                String file_reference = nextLine[idResearcherWebAddress];
                String researcher_page_url = nextLine[idResearcherWebAddress];
                // A bare file name (no scheme or slashes) refers to a local file in data_dir.
                if (p1.matcher(researcher_page_url).matches()) {
                    File f = new File(data_dir, researcher_page_url);
                    if (!f.exists()) {
                        throw new FileNotFoundException(researcher_page_url + " not found in the folder.");
                    }
                    researcher_page_url = f.toURI().toURL().toString();
                }
                String id = nextLine[idStaffIdentifier];
                String lastname = nextLine[idLastName];
                String initials = nextLine[idInitials];
                String name = idName != -1 ? nextLine[idName] : "";
                String firstname = idFirstName != -1 ? nextLine[idFirstName] : "";

                if (split_by_keyword && blocks_and_keywords != null) {
                    DocumentSplitter splitter = new DocumentSplitter(blocks_and_keywords);
                    Document doc = null;
                    boolean document_loaded = false;
                    try {
                        ProjectLogger.LOGGER.info("Opening " + researcher_page_url);
                        doc = gate.Factory.newDocument(new URL(researcher_page_url));
                        document_loaded = true;
                    } catch (Exception ex) {
                        ProjectLogger.LOGGER.error("Document not loaded ", ex);
                    }
                    if (document_loaded) {
                        List<Entry<String, String>> blocks = splitter.SplitDocument(doc.getContent().toString());
                        for (Entry<String, String> block : blocks) {
                            String desc;
                            try {
                                desc = CVBlocks.CVBLOCK_DESCRIPTIONS[Integer.parseInt(block.getKey())];
                            } catch (Exception ex) {
                                desc = CVBlocks.CVBLOCK_DESCRIPTIONS[CVBlocks.CVBLOCK_REST];
                            }

                            // Derive a file name for the block and write it to dest_dir.
                            String output_filename = file_reference;
                            int index_slash = file_reference.substring(0, file_reference.length() - 1).lastIndexOf("/");
                            if (index_slash != -1) {
                                output_filename = file_reference.substring(index_slash);
                            }
                            output_filename = desc + "-" + id + "-" + output_filename.replaceAll("[^A-Za-z0-9]", "-");
                            File output_file = new File(dest_dir, output_filename);
                            FileUtils.write(output_file, block.getValue(), "UTF-8", false);
                            String output_fileurl = output_file.toURI().toURL().toString();

                            HashMap<String, String> extra_data = new HashMap<String, String>();
                            extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_LASTNAME, lastname);
                            extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_INITIALS, initials);
                            extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_NAME, name);
                            extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_FIRSTNAME, firstname);
                            extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_DOCUMENT_NAME, output_filename);
                            extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_BLOCK_TYPE, block.getKey());

                            MiddleData md = new MiddleData(id,
                                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER,
                                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC,
                                    output_fileurl,
                                    extra_data,
                                    verbose,
                                    verbose_dir);
                            preprocessedRep.addData(md);
                        }
                    }
                } else {
                    HashMap<String, String> extra_data = new HashMap<String, String>();
                    extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_LASTNAME, lastname);
                    extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_INITIALS, initials);
                    extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_NAME, name);
                    extra_data.put(DataExchangeLiterals.MIDDLE_ELEMENT_XML_EXTRADATA_FIRSTNAME, firstname);

                    MiddleData md = new MiddleData(id,
                            DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER,
                            DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC,
                            researcher_page_url,
                            extra_data,
                            verbose,
                            verbose_dir);
                    preprocessedRep.addData(md);
                }
                count++;
            }
            ProjectLogger.LOGGER.info(count + " documents added");
        } else {
            ProjectLogger.LOGGER.info("Some required columns are missing from the CSV header");
        }
        return preprocessedRep;
    }
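
    /*
     * Illustrative CSV input for createPreprocessRepositoryFromCSV. The
     * header names are placeholders for whatever the FileFormatConversor
     * constants resolve to, and the row values are invented. The id,
     * lastname, initials and researcher page URL columns must all be
     * present, otherwise the method logs a message and returns an empty
     * repository:
     *
     *     id;name;firstname;lastname;initials;researcher_page_url
     *     r0042;John Smith;John;Smith;J.S.;http://www.example.edu/staff/jsmith.html
     *
     * A URL value that matches "([a-zA-Z0-9#._-]+)+" (a bare file name with
     * no scheme or slashes) is resolved against data_dir and converted to a
     * file:// URL before the page is loaded.
     */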
    /**
     * Convenience overload of createPreprocessRepositoryFromCSV that reads
     * the CSV from a file.
     *
     * @param csv_filepath
     * @param separator
     * @param data_dir
     * @param verbose
     * @param verbose_dir
     * @param split_by_keyword
     * @param blocks_and_keywords
     * @param dest_dir
     * @return RepositoryPreprocessDataMiddleData filled with the pre-processed data
     * @throws IOException
     */
    public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromCSVFile(File csv_filepath,
            char separator, File data_dir, boolean verbose, File verbose_dir, boolean split_by_keyword,
            HashMap<String, String[]> blocks_and_keywords, File dest_dir) throws IOException {
        CSVReader reader = new CSVReader(new FileReader(csv_filepath), separator);
        return createPreprocessRepositoryFromCSV(reader, separator, data_dir, verbose, verbose_dir,
                split_by_keyword, blocks_and_keywords, dest_dir);
    }

    /**
     * Convenience overload with verbosity and splitting disabled.
     *
     * @param csv_filepath
     * @param separator
     * @param data_dir
     * @return RepositoryPreprocessDataMiddleData filled with the pre-processed data
     * @throws IOException
     */
    public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromCSVFile(File csv_filepath,
            char separator, File data_dir) throws IOException {
        CSVReader reader = new CSVReader(new FileReader(csv_filepath), separator);
        return createPreprocessRepositoryFromCSV(reader, separator, data_dir, false, null, false, null, null);
    }

    /**
     * Convenience overload that reads the CSV from an in-memory string.
     *
     * @param csv_content
     * @param separator
     * @param data_dir
     * @return RepositoryPreprocessDataMiddleData filled with the pre-processed data
     * @throws IOException
     */
    public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromCSVContent(String csv_content,
            char separator, File data_dir) throws IOException {
        CSVReader reader = new CSVReader(new StringReader(csv_content), separator);
        return createPreprocessRepositoryFromCSV(reader, separator, data_dir, false, null, false, null, null);
    }

    /**
     * Launches the GATE-based parsers over the pre-processed repository and
     * stores the annotated results in processedRep.
     *
     * @param preprocessedRep
     * @param processedRep
     * @return processedRep filled with the extracted data, or null on error
     */
    public static DataInputRepository extractInformation(RepositoryPreprocessDataMiddleData preprocessedRep,
            DataInputRepository processedRep) {
        try {
            TextMiningParserMonitor monitor = new TextMiningParserMonitor(1,
                    TextMiningParserGateResearcher.class,
                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER,
                    processedRep,
                    5,
                    preprocessedRep);
            monitor.launchParsers();
            ProjectLogger.LOGGER.info("PROCESS DONE!");
            return processedRep;
        } catch (org.dom4j.DocumentException spe) {
            StringBuffer sb = new StringBuffer(spe.toString());
            sb.append("\nLocMSG: " + spe.getLocalizedMessage());
            sb.append("\nMSG: " + spe.getMessage());
            System.out.println(sb.toString());
            return null;
        } catch (Exception ex) {
            ProjectLogger.LOGGER.error("Error: " + ex.getMessage());
            return null;
        }
    }
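
    /*
     * End-to-end usage sketch. The paths are hypothetical, and the
     * no-argument DataInputRepository constructor is an assumption (its
     * definition is not part of this file):
     *
     *     GateDataExtractorSingle.GateUp("/opt/gate");
     *     RepositoryPreprocessDataMiddleData pre =
     *             GateDataExtractorSingle.createPreprocessRepositoryFromCSVFile(
     *                     new File("researchers.csv"), ';', new File("pages"));
     *     DataInputRepository results =
     *             GateDataExtractorSingle.extractInformation(pre, new DataInputRepository());
     */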
    /*
     * Reads a tab-separated txt file of id pairs:
     *
     *     ID1 <tab> ID2
     *     ID1 <tab> ID2
     *     ID1 <tab> ID2
     *
     * where each ID2 names a CV file in cvs_directory, and builds a
     * pre-process repository whose entries are keyed by ID1.
     */
    /**
     * @param csv_file
     * @param cvs_directory
     * @param withSplitter
     * @param verbose
     * @param verbose_dir
     * @return RepositoryPreprocessDataMiddleData filled with the CV contents
     */
    public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryOfCVsFromTxt(File csv_file,
            File cvs_directory, boolean withSplitter, boolean verbose, File verbose_dir) {
        RepositoryPreprocessDataMiddleData preprocessedRep = new RepositoryPreprocessDataMiddleData();

        HashMap<String, String> idToCvPath = new HashMap<String, String>();
        HashMap<String, String> id2Toid1 = new HashMap<String, String>();

        if (cvs_directory.exists()) {
            // Index the CV files whose base name is a numeric id.
            for (File f : cvs_directory.listFiles()) {
                if (f.isFile()) {
                    try {
                        String name = f.getName().substring(0, f.getName().lastIndexOf("."));
                        Integer.parseInt(name);
                        idToCvPath.put(name, f.getAbsolutePath());
                    } catch (Exception ex) {
                        ProjectLogger.LOGGER.error("Not a CV file: " + ex.getMessage());
                    }
                }
            }

            // Read the ID1 <-> ID2 mapping.
            FileReader fr;
            CSVReader reader;
            try {
                fr = new FileReader(csv_file);
                reader = new CSVReader(fr, '\t');
                String[] nextLine;
                while ((nextLine = reader.readNext()) != null) {
                    String id_researcher_1 = nextLine[0];
                    String id_researcher_2 = nextLine[1];
                    id2Toid1.put(id_researcher_2, id_researcher_1);
                }
            } catch (FileNotFoundException ex) {
                ProjectLogger.LOGGER.error("Error", ex);
            } catch (IOException ex) {
                ProjectLogger.LOGGER.error("Error", ex);
            }

            for (String id2 : idToCvPath.keySet()) {
                if (id2Toid1.containsKey(id2)) {
                    File cv_file = new File(idToCvPath.get(id2));
                    boolean document_loaded = false;
                    if (cv_file.exists()) {
                        if (withSplitter) {
                            DocumentSplitter splitter = new DocumentSplitter(
                                    CVBlocks.getCVBlocksAndKeywords(new File("keywords")));
                            Document doc = null;
                            try {
                                ProjectLogger.LOGGER.info("Opening " + cv_file.getName());
                                doc = gate.Factory.newDocument(cv_file.toURI().toURL());
                                document_loaded = true;
                            } catch (MalformedURLException ex) {
                                ProjectLogger.LOGGER.error("Document not loaded ", ex);
                            } catch (ResourceInstantiationException ex) {
                                ProjectLogger.LOGGER.error("Document not loaded ", ex);
                            } catch (Exception ex) {
                                ProjectLogger.LOGGER.error("Document not loaded ", ex);
                            }
                            if (document_loaded) {
                                List<Entry<String, String>> blocks = splitter.SplitDocument(doc.getContent().toString());
                                for (Entry<String, String> block : blocks) {
                                    if (block.getKey().equals(CVBlocks.CVBLOCK_PROFESSIONAL_ACTIVITY.toString())
                                            || block.getKey().equals(CVBlocks.CVBLOCK_UNIVERSITY_STUDIES.toString())
                                            || block.getKey().equals(CVBlocks.CVBLOCK_PERSONAL.toString())
                                            || block.getKey().equals("")) {
                                        MiddleData aoPreProcessData = new MiddleData(id2Toid1.get(id2),
                                                DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER,
                                                DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC,
                                                block.getValue(),
                                                null,
                                                verbose,
                                                verbose_dir);
                                        preprocessedRep.addData(aoPreProcessData);
                                    }
                                }
                            }
                        } else {
                            Document doc = null;
                            try {
                                ProjectLogger.LOGGER.info("Opening " + cv_file.getName());
                                doc = gate.Factory.newDocument(cv_file.toURI().toURL());
                                doc.setName(cv_file.getName());
                                document_loaded = true;
                            } catch (MalformedURLException ex) {
                                ProjectLogger.LOGGER.error("Document not loaded ", ex);
                            } catch (ResourceInstantiationException ex) {
                                ProjectLogger.LOGGER.error("Document not loaded ", ex);
                            } catch (Exception ex) {
                                ProjectLogger.LOGGER.error("Document not loaded ", ex);
                            }
                            if (document_loaded) {
                                MiddleData aoPreProcessData = new MiddleData(id2Toid1.get(id2),
                                        DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER,
                                        DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC,
                                        doc.getContent().toString(),
                                        null,
                                        verbose,
                                        verbose_dir);
                                preprocessedRep.addData(aoPreProcessData);
                            }
                        }
                    }
                }
            }
        }
        return preprocessedRep;
    }
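
    /*
     * Illustrative mapping file for createPreprocessRepositoryOfCVsFromTxt
     * (ids invented). Each tab-separated line links a researcher id (ID1)
     * to the numeric base name of a CV file (ID2), e.g. "17.txt" inside
     * cvs_directory:
     *
     *     r0042 <tab> 17
     *     r0043 <tab> 21
     */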
}