eu.sisob.uma.NPL.Researchers.DataResearcherAugmentedInformation.java Source code

Java tutorial

Introduction

Here is the source code for eu.sisob.uma.NPL.Researchers.DataResearcherAugmentedInformation.java

Source

/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.uma.NPL.Researchers;

import eu.sisob.uma.NPL.Researchers.Data.TraductionTablesOperations;
import eu.sisob.uma.NPL.Researchers.Freebase.LocationDataResolver;
import eu.sisob.uma.api.h2dbpool.H2DBPool;
import eu.sisob.uma.api.prototypetextmining.globals.CVItemExtracted;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.apache.log4j.Logger;
import org.dom4j.Element;

/**
 * Class with heuristic to able to extract more information from information extracted.
 *      For example:
 *          - Extract 
 * 
 *** @author Daniel Lpez Gonzlez (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
public class DataResearcherAugmentedInformation {
    /*
     * Enrich researchers data information with other modules
     *  - Location of universities and organizations
     * 
     * @param doc
     * @param resolver 
     */
    /**
     *
     * @param doc
     * @param resolver
     */
    public static void resolveLocationOfEntities(org.dom4j.Document doc, LocationDataResolver resolver) {
        boolean verbose = resolver.verbose;
        org.dom4j.Element root = doc.getRootElement();

        for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) {
            org.dom4j.Element ib = (org.dom4j.Element) i.next();

            // Professional activities
            List<org.dom4j.Element> profs = new ArrayList<org.dom4j.Element>();
            for (Object obj : ib.elements()) {
                org.dom4j.Element prof = (org.dom4j.Element) obj;
                if (prof.getName().startsWith(CVItemExtracted.ProfessionalActivity.class.getSimpleName()))
                    profs.add(prof);
            }

            for (org.dom4j.Element prof : profs) {
                String entity_name = "";
                String element_name = "";

                /* */

                /*
                 * Trying to extract more information about the organization detected, like the location for example
                 * 
                 * Location searchs: 
                 *     Normally, Entity3_entityName contains Entity2_entityName and so on, so the heurstic will try
                 *     to resolve the date first for the 3, next for the 2, and next for the 1.
                 * 
                 *     Once time the location will searched, the algoritm will take the first occurrence of each entity (cities, regions, countries).
                 *     But after, the algoritm will eliminate regions with the same name in cities, and regions with the same name in countries.
                 */
                org.dom4j.Element ent_name_3 = prof
                        .element(CVItemExtracted.ProfessionalActivity.Entity3_entityName);
                org.dom4j.Element ent_type_3 = prof.element(CVItemExtracted.ProfessionalActivity.Entity3_type);

                if (ent_name_3 != null && ent_type_3 != null) {
                    if (ent_type_3.getText().equals(
                            eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) {
                        //"University of Massachusetts"
                        entity_name = ent_name_3.getText();
                        element_name = ent_name_3.getName();
                    }
                } else {
                    org.dom4j.Element ent_name_2 = prof
                            .element(CVItemExtracted.ProfessionalActivity.Entity2_entityName);
                    org.dom4j.Element ent_type_2 = prof.element(CVItemExtracted.ProfessionalActivity.Entity2_type);

                    if (ent_name_2 != null && ent_type_2 != null) {
                        if (ent_type_2.getText().equals(
                                eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) {
                            entity_name = ent_name_2.getText();
                            element_name = ent_name_2.getName();
                        }
                    } else {
                        org.dom4j.Element ent_name_1 = prof
                                .element(CVItemExtracted.ProfessionalActivity.Entity1_entityName);
                        org.dom4j.Element ent_type_1 = prof
                                .element(CVItemExtracted.ProfessionalActivity.Entity1_type);

                        if (ent_name_1 != null && ent_type_1 != null) {
                            if (ent_type_1.getText().equals(
                                    eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) {
                                entity_name = ent_name_1.getText();
                                element_name = ent_name_1.getName();
                            }
                        }
                    }
                }

                entity_name = entity_name.replace("  ", " ").trim();

                if (!entity_name.equals("")) {
                    ProjectLogger.LOGGER.info("\tTry to resolve => " + entity_name);
                    LocationDataResolver.LocationTupleWithEntity location = resolver.resolve(entity_name);
                    if (location != null) {
                        ProjectLogger.LOGGER.info("\tLocation solved => " + entity_name + " = " + location);

                        HashMap<String, String> map = new HashMap<String, String>();

                        map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_city, "city");
                        map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionName, "region");
                        map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionCode, "region_code");
                        map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryName, "country");
                        map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryCode, "country_code");
                        map.put(element_name, "canonic_name");

                        Element place = null;

                        // Update locations and entity name using map object
                        for (String key : map.keySet()) {
                            String value = location.getByName(map.get(key));

                            place = prof.element(key);
                            if (place == null) {
                                prof.addElement(key).setText(value);
                            } else {
                                ProjectLogger.LOGGER
                                        .info("\tChange '" + key + "' with '" + place.getText() + "' by " + value);
                                place.setText(value);
                            }
                        }
                    }
                }
            }

            // Accredited Studies
            List<org.dom4j.Element> studies = new ArrayList<org.dom4j.Element>();
            for (Object obj : ib.elements()) {
                org.dom4j.Element study = (org.dom4j.Element) obj;
                if (study.getName().startsWith(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName()))
                    studies.add(study);
            }

            for (org.dom4j.Element study : studies) {
                String entity_name = "";
                String element_name = "";

                /* */

                /*
                 * Trying to extract more information about the organization detected, like the location for example
                 * 
                 * Location searchs: 
                 *     Normally, Entity3_entityName contains Entity2_entityName and so on, so the heurstic will try
                 *     to resolve the date first for the 3, next for the 2, and next for the 1.
                 * 
                 *     Once time the location will searched, the algoritm will take the first occurrence of each entity (cities, regions, countries).
                 *     But after, the algoritm will eliminate regions with the same name in cities, and regions with the same name in countries.
                 */
                org.dom4j.Element ent_name_3 = study
                        .element(CVItemExtracted.AccreditedUniversityStudies.Entity3_entityName);
                org.dom4j.Element ent_type_3 = study
                        .element(CVItemExtracted.AccreditedUniversityStudies.Entity3_type);

                if (ent_name_3 != null && ent_type_3 != null) {
                    if (ent_type_3.getText().equals(
                            eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) {
                        //"University of Massachusetts"
                        entity_name = ent_name_3.getText();
                        element_name = ent_name_3.getName();
                    }
                } else {
                    org.dom4j.Element ent_name_2 = study
                            .element(CVItemExtracted.AccreditedUniversityStudies.Entity2_entityName);
                    org.dom4j.Element ent_type_2 = study
                            .element(CVItemExtracted.AccreditedUniversityStudies.Entity2_type);

                    if (ent_name_2 != null && ent_type_2 != null) {
                        if (ent_type_2.getText().equals(
                                eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) {
                            entity_name = ent_name_2.getText();
                            element_name = ent_name_2.getName();
                        }
                    } else {
                        org.dom4j.Element ent_name_1 = study
                                .element(CVItemExtracted.AccreditedUniversityStudies.Entity1_entityName);
                        org.dom4j.Element ent_type_1 = study
                                .element(CVItemExtracted.AccreditedUniversityStudies.Entity1_type);

                        if (ent_name_1 != null && ent_type_1 != null) {
                            if (ent_type_1.getText().equals(
                                    eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) {
                                entity_name = ent_name_1.getText();
                                element_name = ent_name_1.getName();
                            }
                        }
                    }
                }

                entity_name = entity_name.replace("  ", " ").trim();

                if (!entity_name.equals("")) {
                    ProjectLogger.LOGGER.info("\tTry to resolve => " + entity_name);
                    LocationDataResolver.LocationTupleWithEntity location = resolver.resolve(entity_name);
                    if (location != null) {
                        ProjectLogger.LOGGER.info("\tLocation solved => " + entity_name + " = " + location);

                        HashMap<String, String> map = new HashMap<String, String>();

                        map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_city, "city");
                        map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionName, "region");
                        map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionCode, "region_code");
                        map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryName, "country");
                        map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryCode, "country_code");
                        map.put(element_name, "canonic_name");

                        Element place = null;

                        // Update locations and entity name using map object
                        for (String key : map.keySet()) {
                            String value = location.getByName(map.get(key));

                            place = study.element(key);
                            if (place == null) {
                                study.addElement(key).setText(value);
                            } else {
                                ProjectLogger.LOGGER
                                        .info("\tChange '" + key + "' with '" + place.getText() + "' by " + value);
                                place.setText(value);
                            }
                        }
                    }
                }
            }

        }
    }

    /*
     * Enrich researchers data information with other modules
     *  - From professional activities and university studies get the standar academic position
     *    F.E:  Lect. of Chemistry => Lecturer, 4
     * 
     * @param doc
     * @param resolver 
     */
    /**
     *
     * @param doc
     * @param dbpool_academic_trad_tables
     */
    public static void resolveAcademicPosistion(org.dom4j.Document doc, H2DBPool dbpool_academic_trad_tables) {
        org.dom4j.Element root = doc.getRootElement();

        Connection cnn = null;

        try {
            cnn = dbpool_academic_trad_tables.getConnection();
        } catch (ClassNotFoundException ex) {
            Logger.getRootLogger().error(ex.toString());
            cnn = null;
            return;
        } catch (SQLException ex) {
            Logger.getRootLogger().error(ex.toString());
            cnn = null;
            return;
        }

        for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) {
            org.dom4j.Element ib = (org.dom4j.Element) i.next();

            // Professional activities
            List<org.dom4j.Element> profs = new ArrayList<org.dom4j.Element>();
            for (Object obj : ib.elements()) {
                org.dom4j.Element prof = (org.dom4j.Element) obj;
                if (prof.getName().startsWith(CVItemExtracted.ProfessionalActivity.class.getSimpleName()))
                    profs.add(prof);
            }

            for (org.dom4j.Element prof : profs) {
                String title_name = "";

                /* */

                /*
                 * Try to get the standar cademic position of prof acti
                 */
                org.dom4j.Element title_name_element = prof
                        .element(CVItemExtracted.ProfessionalActivity.Title_name);

                if (title_name_element != null) {
                    title_name = title_name_element.getText();
                }

                while (title_name.contains("  "))
                    title_name = title_name.replace("  ", " ").trim();

                if (!title_name.equals("")) {
                    ProjectLogger.LOGGER.info("\tTry to resolve => " + title_name);

                    Integer id_type = TraductionTablesOperations.getTypeListFromTraductionTable(cnn, title_name,
                            TraductionTablesOperations.TRAD_TABLE_PROF_ACTIVITIES, "cvn_trad_", "id_");

                    if (id_type != null) {
                        String standard_type = TraductionTablesOperations.getProfActivityStandardName(cnn, id_type);
                        ProjectLogger.LOGGER.info("\tResolve => " + title_name + " => " + standard_type);
                        String key = CVItemExtracted.ProfessionalActivity.Position;
                        String value = standard_type;
                        Element position = prof.element(key);
                        if (position == null) {
                            prof.addElement(key).setText(standard_type);
                        } else {
                            position.setText(standard_type);
                            ProjectLogger.LOGGER
                                    .info("\tChange '" + key + "' with '" + position.getText() + "' by " + value);
                        }
                    }
                }
            }

            /*
             * Try to get the standard cademic position of univ study
             */
            List<org.dom4j.Element> studies = new ArrayList<org.dom4j.Element>();
            for (Object obj : ib.elements()) {
                org.dom4j.Element prof = (org.dom4j.Element) obj;
                if (prof.getName().startsWith(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName()))
                    profs.add(prof);
            }

            for (org.dom4j.Element study : studies) {
                String title_name = "";

                /* */

                /*
                 * 
                 */
                org.dom4j.Element title_name_element = study
                        .element(CVItemExtracted.AccreditedUniversityStudies.Title_name);

                if (title_name_element != null) {
                    title_name = title_name_element.getText();
                }

                while (!title_name.contains("  "))
                    title_name = title_name.replace("  ", " ").trim();

                if (!title_name.equals("")) {
                    ProjectLogger.LOGGER.info("\tTry to resolve => " + title_name);

                    Integer id_type = TraductionTablesOperations.getTypeListFromTraductionTable(cnn, title_name,
                            TraductionTablesOperations.TRAD_TABLE_UNIVERSITY_STUDIES, "cvn_trad_", "id_");

                    if (id_type != null) {
                        String standard_type = TraductionTablesOperations.getUniversityStudyStandardName(cnn,
                                id_type);
                        ProjectLogger.LOGGER.info("\tResolve => " + title_name + " => " + standard_type);
                        String key = CVItemExtracted.AccreditedUniversityStudies.Position;
                        String value = standard_type;
                        Element position = study.element(key);
                        if (position == null) {
                            study.addElement(key).setText(standard_type);
                        } else {
                            position.setText(standard_type);
                            ProjectLogger.LOGGER
                                    .info("\tChange '" + key + "' with '" + position.getText() + "' by " + value);
                        }
                    }
                }
            }

        }
    }
}