Example usage for org.dom4j Element elementIterator

List of usage examples for org.dom4j Element elementIterator

Introduction

On this page you can find example usages of the org.dom4j Element.elementIterator method.

Prototype

Iterator<Element> elementIterator(QName qName);

Source Link

Document

Returns an iterator over the elements contained in this element which match the given fully qualified name.

Usage

From source file:eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.IteratorReseachersFile.java

License:Open Source License

/**
 * Walks the researchers XML tree (institution, unit of assessment, research group,
 * researcher, researcher web address) and invokes the matching {@code actionsIn...}
 * callback for every node visited.
 *
 * <p>{@code beginActions()} is called before the walk and {@code endActions()} after it;
 * exceptions thrown by either are logged rather than propagated. As soon as any callback
 * returns {@code false} the walk stops.
 *
 * @return {@code false} if no XML source is available, if {@code beginActions()} failed,
 *         or if any callback returned {@code false}; {@code true} otherwise
 * @throws Exception if reading the XML file fails or a node callback throws
 */
public boolean iterate() throws Exception {
    if (source_file_xml == null && sourceXmlDocument != null) {
        document = sourceXmlDocument;
        root = document.getRootElement();
    } else if (source_file_xml != null && sourceXmlDocument != null) {
        // NOTE(review): the file is only parsed when a document was ALSO supplied; this
        // mirrors the original condition and looks unintentional - confirm with the
        // constructors before relaxing it.
        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        document = reader.read(source_file_xml);
        root = document.getRootElement();
    } else if (root == null) {
        // Nothing to iterate over. (Previously a missing file AND missing document
        // caused a NullPointerException instead of reaching this check.)
        return false;
    }

    // Pattern used to derive directory names from free text.
    // NOTE(review): the second '^' is a literal inside the character class, so '^'
    // characters are kept as well; left untouched because existing directories depend on it.
    final String nonLetters = "[^a-z^A-Z]";

    String sInstitutionName = "";
    String sWebAddress = "";
    String sUnitOfAssessment_Description = "";
    String sResearchGroupDescription = "";
    String sResearchName = "";
    String sResearchFirstName = "";
    String sResearchLastName = "";
    String sResearchInitials = "";
    String sStaffIndentifier = "";

    String dirBase = work_dir + "\\";

    boolean end = false;

    try {
        beginActions();
    } catch (Exception ex) {
        // A failed start-up skips the whole walk but still runs endActions() below.
        end = true;
        Logger.getLogger("root").error("", ex);
    }

    for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext() && !end;) {
        org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

        sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText();
        sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText();

        String dirI = "";
        if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY)) {
            dirI = dirBase + "\\" + sInstitutionName.replaceAll(nonLetters, "") + "\\";
        } else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY)) {
            dirI = dirBase;
        }

        end = !actionsInInstitutionNode(e1, dirI, sInstitutionName, sWebAddress);

        for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext() && !end;) {
            org.dom4j.Element e2 = (org.dom4j.Element) i2.next();

            sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();

            String dirUAD = "";
            if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY)) {
                dirUAD = dirI + "\\" + sUnitOfAssessment_Description.replaceAll(nonLetters, "") + "\\";
            } else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY)) {
                dirUAD = dirBase;
            }

            end = !actionsInUnitOfAssessmentNode(e2, dirUAD, sInstitutionName, sWebAddress,
                    sUnitOfAssessment_Description);

            for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext() && !end;) {
                org.dom4j.Element e3 = (org.dom4j.Element) i3.next();
                sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText();

                for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext() && !end;) {
                    org.dom4j.Element e4 = (org.dom4j.Element) i4.next();

                    sResearchLastName = e4.element(XMLTags.RESEARCHER_LASTNAME).getText();
                    sResearchInitials = e4.element(XMLTags.RESEARCHER_INITIALS).getText();

                    // First name and full name are optional children.
                    org.dom4j.Element firstNameElement = e4.element(XMLTags.RESEARCHER_FIRSTNAME);
                    sResearchFirstName = firstNameElement == null ? "" : firstNameElement.getText();
                    org.dom4j.Element nameElement = e4.element(XMLTags.RESEARCHER_NAME);
                    sResearchName = nameElement == null ? "" : nameElement.getText();

                    sStaffIndentifier = e4.element(XMLTags.RESEARCHER_STAFFIDENTIFIER).getText();

                    String sAux = sResearchLastName.replaceAll(nonLetters, "") + "#"
                            + sResearchInitials.replaceAll(nonLetters, "");

                    String dirR = "";
                    if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY)) {
                        dirR = dirUAD + "\\" + sAux + "\\";
                    } else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY)) {
                        dirR = dirBase;
                    }

                    ResearcherNameInfo rsi = new ResearcherNameInfo(sResearchLastName, sResearchInitials,
                            sResearchFirstName, sResearchName);
                    end = !actionsInResearcherNode(e4, dirR, sInstitutionName, sWebAddress,
                            sUnitOfAssessment_Description, sResearchGroupDescription, rsi, sStaffIndentifier);

                    for (Iterator i5 = e4.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS); i5.hasNext()
                            && !end;) {
                        org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                        String url = e5.getText();
                        if (url.equals("")) {
                            continue;
                        }

                        // Missing or empty attributes fall back to their defaults. Compare
                        // by content, not by reference.
                        String ext = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT);
                        if (ext == null || ext.isEmpty()) {
                            ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML;
                        }
                        String type = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE);
                        if (type == null || type.isEmpty()) {
                            // The default type belongs in 'type'; it used to overwrite 'ext'.
                            type = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV;
                        }

                        end = !actionsInResearcherWebPageNode(e4, dirR, sInstitutionName, sWebAddress,
                                sUnitOfAssessment_Description, sResearchGroupDescription, rsi,
                                sStaffIndentifier, url, ext, type);
                    }
                }
            }
        }
    }

    try {
        endActions();
    } catch (Exception ex) {
        Logger.getLogger("root").error("", ex);
    }

    return !end;
}

From source file:eu.sisob.uma.NPL.Researchers.Data.ViewCreator_CSVandSheets.java

License:Open Source License

/**
 * Exports the CV items found in the {@code blockinfo} elements of {@code document} to one
 * CSV file per item family (agent identification, professional activity, accredited
 * university studies) and, optionally, gathers those CSV files into a single spreadsheet.
 *
 * <p>If any of the CSV files cannot be created the error is logged and the method returns
 * without exporting anything.
 *
 * @param document           XML document whose root contains the {@code blockinfo} elements
 * @param dest               directory in which the CSV files and the spreadsheet are written
 * @param create_spreadsheet whether to also build the spreadsheet with one sheet per CSV file
 * @param debug_mode         NOTE(review): currently has no effect on the output; the
 *                           Pattern/Content/Domain fields are skipped either way
 */
public static void createViewFilesFromDataExtracted(org.dom4j.Document document, File dest,
        boolean create_spreadsheet, boolean debug_mode) {
    List<ViewsExporterUnit> units = new ArrayList<ViewsExporterUnit>();
    ViewsExporterUnit new_unit = null;

    // --- Agent identification: XML field name -> CSV column header ---
    new_unit = new ViewsExporterUnit(CVItemExtracted.AgentIdentification.class.getSimpleName(),
            dest.getAbsolutePath() + File.separator + agent_identification_file);

    new_unit.map.put(CVItemExtracted.AgentIdentification.FirstFamilyName, "First Last Name");
    new_unit.map.put(CVItemExtracted.AgentIdentification.SecondFamilyName, "Second Last Name");
    new_unit.map.put(CVItemExtracted.AgentIdentification.GivenName, "First Name");
    new_unit.map.put(CVItemExtracted.AgentIdentification.Gender, "Gender");
    new_unit.map.put(CVItemExtracted.AgentIdentification.Nationality, "Nationality");
    new_unit.map.put(CVItemExtracted.AgentIdentification.BirthCity, "Birth City");
    new_unit.map.put(CVItemExtracted.AgentIdentification.BirthRegion, "Birth Region");
    new_unit.map.put(CVItemExtracted.AgentIdentification.BirthCountry, "Birth Country");
    new_unit.map.put(CVItemExtracted.AgentIdentification.BirthDateDayMonthYear, "Birthday Day");
    new_unit.map.put(CVItemExtracted.AgentIdentification.BirthDateMonthYear, "Birthday Month");
    new_unit.map.put(CVItemExtracted.AgentIdentification.BirthDateYear, "Birthday Year");
    new_unit.map.put(CVItemExtracted.AgentIdentification.Email, "Email");
    new_unit.map.put(CVItemExtracted.AgentIdentification.Phone, "Phone");
    new_unit.createMapIndex();
    try {
        new_unit.createNewCsv();
    } catch (IOException ex) {
        ProjectLogger.LOGGER.error(ex.getMessage());
        return;
    }
    units.add(new_unit);

    // --- Professional activity ---
    new_unit = new ViewsExporterUnit(CVItemExtracted.ProfessionalActivity.class.getSimpleName(),
            dest.getAbsolutePath() + File.separator + professional_activity_file);

    new_unit.map.put(CVItemExtracted.ProfessionalActivity.Title_name, "Literal Position Name");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.Position, "Position Name"); //MISS IN GATE
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.PositionNumber, "Position Number");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_dayMonthYear, "Start Date Day");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_monthYear, "Start Date Month");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_year, "Start Date Year");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_duration, "Duration");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.Entity1_entityName, "Entity 1");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.Entity2_entityName, "Entity 2");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.Entity3_entityName, "Entity 3");
    // TODO: a "complete institution (e1+e2+e3)" column was planned here but never added.
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_city, "City");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionName, "Region");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionCode, "Region Code");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryName, "Country");
    new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryCode, "Country Code");
    new_unit.createMapIndex();
    try {
        new_unit.createNewCsv();
    } catch (IOException ex) {
        ProjectLogger.LOGGER.error(ex.getMessage());
        return;
    }
    units.add(new_unit);

    // --- Accredited university studies ---
    new_unit = new ViewsExporterUnit(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName(),
            dest.getAbsolutePath() + File.separator + accredited_university_studies_file);

    // TODO: a "type degree" column was planned here but never added.
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Title_name, "Literal Study Name");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Position, "Study Name");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Hons, "hons");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.DateTitle_dayMonthYear,
            "Acchievement Date Day");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.DateTitle_MonthYear,
            "Acchievement Date Month");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.DateTitle_year, "Acchievement Date Year");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Entity1_entityName, "Entity 1");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Entity2_entityName, "Entity 2");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Entity3_entityName, "Entity 3");
    // TODO: a "complete institution (e1+e2+e3)" column was planned here but never added.
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_city, "City");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionName, "Region");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionCode, "Region Code");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryName, "Country");
    new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryCode, "Country Code");
    new_unit.createMapIndex();
    try {
        new_unit.createNewCsv();
    } catch (IOException ex) {
        ProjectLogger.LOGGER.error(ex.getMessage());
        return;
    }
    units.add(new_unit);

    org.dom4j.Element root = document.getRootElement();

    // Expected input shape:
    //   <blockinfo id_annotationrecollecting="default" id_entity="3626" URL="file:/...">
    //     <ProfessionalActivityNoCurrent>
    //       <Content>Research Fellow, University of Leicester (1984</Content>
    //       <DateInit_year>1984</DateInit_year>
    //       <Pattern>ProfessionalActivityPattern1</Pattern>
    //       <Entity_entityName>University of Leicester</Entity_entityName>
    //       <Title_name>Research Fellow</Title_name>
    //     </ProfessionalActivityNoCurrent>
    //     ...
    //   </blockinfo>
    for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) {
        org.dom4j.Element elInfoBlock = (org.dom4j.Element) i.next();
        String id_entity = elInfoBlock.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT);

        for (Object oCVNItem : elInfoBlock.elements()) {
            org.dom4j.Element elCVNItem = (org.dom4j.Element) oCVNItem;
            String className = elCVNItem.getName();

            // The first unit whose tag is a prefix of the element name handles the item.
            ViewsExporterUnit unit_ref = null;
            for (ViewsExporterUnit unit : units) {
                if (className.startsWith(unit.tag)) {
                    unit_ref = unit;
                    break;
                }
            }

            if (unit_ref == null) {
                ProjectLogger.LOGGER.info(className + " has not views exporter unit");
                continue;
            }

            LinkedHashMap<String, String> values = new LinkedHashMap<String, String>();
            for (Object oCVNItemField : elCVNItem.elements()) {
                org.dom4j.Element elCVNItemField = (org.dom4j.Element) oCVNItemField;
                String fieldName = elCVNItemField.getName();

                // Pattern/Content/Domain are never exported.
                // NOTE(review): the original read their text when debug_mode was set but
                // then discarded it, so debug_mode made no difference; behaviour kept.
                if (fieldName.equals("Pattern") || fieldName.equals("Content") || fieldName.equals("Domain")) {
                    continue;
                }

                values.put(fieldName, elCVNItemField.getText());
            }

            // attributeValue returns null when the attribute is absent; treat that like an
            // empty value (i.e. "add") instead of failing with a NullPointerException.
            String action_mode = elCVNItem.attributeValue("action_mode");
            if (action_mode == null) {
                action_mode = "";
            }

            try {
                if (action_mode.equals("") || action_mode.equals("add")) {
                    String[] line = unit_ref.createNewLine(id_entity, values);
                    unit_ref.AddCsv(line);
                } else if (action_mode.equals("overwrite")) {
                    String[] line = unit_ref.createNewLine(id_entity, values);
                    unit_ref.UpdateCsv(id_entity, line);
                }
            } catch (Exception ex) {
                ProjectLogger.LOGGER.error(ex.getMessage());
            }
        }
    }

    // Close every CSV and record the largest row/column count for the spreadsheet model.
    int max_r = 0;
    int max_c = 0;

    for (ViewsExporterUnit unit : units) {
        if (max_c < unit.map_index.size() + 1)
            max_c = unit.map_index.size() + 1;

        if (max_r < unit.lines)
            max_r = unit.lines;

        try {
            unit.closeCsv();
        } catch (IOException ex) {
            ProjectLogger.LOGGER.error(ex.getMessage());
        }
    }

    if (!create_spreadsheet) {
        return;
    }

    // Build the spreadsheet: one sheet per exporter unit, filled from that unit's CSV file.
    TableModel model = new DefaultTableModel(max_r, max_c);
    final File file = new File(dest.getAbsolutePath() + File.separator + spreadsheet_w_all_data_file);

    try {
        SpreadSheet.createEmpty(model).saveAs(file);

        SpreadSheet spread_sheet = SpreadSheet.createFromFile(file);

        int i = 0;
        for (ViewsExporterUnit unit : units) {
            Sheet sheet = null;
            if (i > 0) {
                sheet = spread_sheet.addSheet(i, unit.tag);
            } else {
                // The freshly created document already has a first sheet; reuse it.
                sheet = spread_sheet.getSheet(i);
                sheet.setName(unit.tag);
            }

            sheet.setRowCount(unit.lines);
            sheet.setColumnCount(unit.map_index.size() + 1);

            InputStreamReader fw1 = new InputStreamReader(new FileInputStream(unit.filepath), "UTF-8");
            try {
                CSVReader reader = new CSVReader(fw1, csv_separator);

                String[] line = null;
                int r = 0;

                while ((line = reader.readNext()) != null) {
                    for (int c = 0; c < line.length; c++) {
                        sheet.setValueAt(line[c], c, r);
                    }
                    r++;
                }
            } finally {
                // Release the file handle even if reading or filling the sheet fails.
                fw1.close();
            }
            i++;
        }

        spread_sheet.saveAs(file);

    } catch (Exception ex) {
        ProjectLogger.LOGGER.error(ex.getMessage());
    }
}

From source file:eu.sisob.uma.NPL.Researchers.DataResearcherAugmentedInformation.java

License:Open Source License

/**
 * Completes the professional activities and accredited studies of every {@code blockinfo}
 * element of {@code doc} with location data (city, region, region code, country, country
 * code) and replaces the detected entity name by the canonical name given by the resolver.
 *
 * <p>For each item the entity slots are inspected from 3 down to 1; the first slot that has
 * both a name and a type element is the only one considered, and it is used only when its
 * type is "university".
 *
 * @param doc      document whose root contains the {@code blockinfo} elements; modified in place
 * @param resolver resolves an entity name to its location tuple
 */
public static void resolveLocationOfEntities(org.dom4j.Document doc, LocationDataResolver resolver) {
    org.dom4j.Element root = doc.getRootElement();
    String profPrefix = CVItemExtracted.ProfessionalActivity.class.getSimpleName();
    String studyPrefix = CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName();

    for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) {
        org.dom4j.Element ib = (org.dom4j.Element) i.next();

        // Professional activities
        List<org.dom4j.Element> profs = new ArrayList<org.dom4j.Element>();
        for (Object obj : ib.elements()) {
            org.dom4j.Element prof = (org.dom4j.Element) obj;
            if (prof.getName().startsWith(profPrefix))
                profs.add(prof);
        }

        for (org.dom4j.Element prof : profs) {
            org.dom4j.Element[] names = {
                    prof.element(CVItemExtracted.ProfessionalActivity.Entity3_entityName),
                    prof.element(CVItemExtracted.ProfessionalActivity.Entity2_entityName),
                    prof.element(CVItemExtracted.ProfessionalActivity.Entity1_entityName) };
            org.dom4j.Element[] types = {
                    prof.element(CVItemExtracted.ProfessionalActivity.Entity3_type),
                    prof.element(CVItemExtracted.ProfessionalActivity.Entity2_type),
                    prof.element(CVItemExtracted.ProfessionalActivity.Entity1_type) };

            // Child element name -> field name understood by the location tuple.
            HashMap<String, String> map = new HashMap<String, String>();
            map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_city, "city");
            map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionName, "region");
            map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionCode, "region_code");
            map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryName, "country");
            map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryCode, "country_code");

            applyResolvedLocation(prof, selectUniversityEntity(names, types), map, resolver);
        }

        // Accredited Studies
        List<org.dom4j.Element> studies = new ArrayList<org.dom4j.Element>();
        for (Object obj : ib.elements()) {
            org.dom4j.Element study = (org.dom4j.Element) obj;
            if (study.getName().startsWith(studyPrefix))
                studies.add(study);
        }

        for (org.dom4j.Element study : studies) {
            org.dom4j.Element[] names = {
                    study.element(CVItemExtracted.AccreditedUniversityStudies.Entity3_entityName),
                    study.element(CVItemExtracted.AccreditedUniversityStudies.Entity2_entityName),
                    study.element(CVItemExtracted.AccreditedUniversityStudies.Entity1_entityName) };
            org.dom4j.Element[] types = {
                    study.element(CVItemExtracted.AccreditedUniversityStudies.Entity3_type),
                    study.element(CVItemExtracted.AccreditedUniversityStudies.Entity2_type),
                    study.element(CVItemExtracted.AccreditedUniversityStudies.Entity1_type) };

            // Child element name -> field name understood by the location tuple.
            HashMap<String, String> map = new HashMap<String, String>();
            map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_city, "city");
            map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionName, "region");
            map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionCode, "region_code");
            map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryName, "country");
            map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryCode, "country_code");

            applyResolvedLocation(study, selectUniversityEntity(names, types), map, resolver);
        }
    }
}

/**
 * Picks the entity-name element to resolve from parallel arrays ordered by priority.
 *
 * @param names entity-name elements, highest priority first; entries may be {@code null}
 * @param types entity-type elements parallel to {@code names}; entries may be {@code null}
 * @return the name element of the first slot that has both a name and a type, provided that
 *         type is "university"; {@code null} otherwise
 */
private static org.dom4j.Element selectUniversityEntity(org.dom4j.Element[] names, org.dom4j.Element[] types) {
    for (int k = 0; k < names.length; k++) {
        if (names[k] != null && types[k] != null) {
            // NOTE(review): a populated slot that is not a university ends the search;
            // lower slots are only tried when this one is missing. Kept as in the original.
            boolean isUniversity = types[k].getText().equals(
                    eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University);
            return isUniversity ? names[k] : null;
        }
    }
    return null;
}

/**
 * Resolves the location of the entity named by {@code entityElement} and writes the result
 * into {@code item}, creating or overwriting one child element per key of {@code map}.
 *
 * @param item          CV item element to update
 * @param entityElement element holding the entity name, or {@code null} if none was selected
 * @param map           child element name to location field name; the entity element's own
 *                      name is added to it, mapped to the canonical entity name
 * @param resolver      resolves the entity name to its location tuple
 */
private static void applyResolvedLocation(org.dom4j.Element item, org.dom4j.Element entityElement,
        HashMap<String, String> map, LocationDataResolver resolver) {
    if (entityElement == null) {
        return;
    }

    // Collapse every run of spaces to a single one (a single replace() pass leaves runs
    // of three or more spaces partially collapsed).
    String entity_name = entityElement.getText();
    while (entity_name.contains("  "))
        entity_name = entity_name.replace("  ", " ");
    entity_name = entity_name.trim();

    if (entity_name.equals("")) {
        return;
    }

    ProjectLogger.LOGGER.info("\tTry to resolve => " + entity_name);
    LocationDataResolver.LocationTupleWithEntity location = resolver.resolve(entity_name);
    if (location == null) {
        return;
    }
    ProjectLogger.LOGGER.info("\tLocation solved => " + entity_name + " = " + location);

    map.put(entityElement.getName(), "canonic_name");

    // Update locations and entity name using map object
    for (String key : map.keySet()) {
        String value = location.getByName(map.get(key));

        org.dom4j.Element place = item.element(key);
        if (place == null) {
            item.addElement(key).setText(value);
        } else {
            ProjectLogger.LOGGER
                    .info("\tChange '" + key + "' with '" + place.getText() + "' by " + value);
            place.setText(value);
        }
    }
}

From source file:eu.sisob.uma.NPL.Researchers.DataResearcherAugmentedInformation.java

License:Open Source License

/**
 *
 * @param doc
 * @param dbpool_academic_trad_tables
 */
public static void resolveAcademicPosistion(org.dom4j.Document doc, H2DBPool dbpool_academic_trad_tables) {
    org.dom4j.Element root = doc.getRootElement();

    Connection cnn = null;

    try {
        cnn = dbpool_academic_trad_tables.getConnection();
    } catch (ClassNotFoundException ex) {
        Logger.getRootLogger().error(ex.toString());
        cnn = null;
        return;
    } catch (SQLException ex) {
        Logger.getRootLogger().error(ex.toString());
        cnn = null;
        return;
    }

    for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) {
        org.dom4j.Element ib = (org.dom4j.Element) i.next();

        // Professional activities
        List<org.dom4j.Element> profs = new ArrayList<org.dom4j.Element>();
        for (Object obj : ib.elements()) {
            org.dom4j.Element prof = (org.dom4j.Element) obj;
            if (prof.getName().startsWith(CVItemExtracted.ProfessionalActivity.class.getSimpleName()))
                profs.add(prof);
        }

        for (org.dom4j.Element prof : profs) {
            String title_name = "";

            /* */

            /*
             * Try to get the standar cademic position of prof acti
             */
            org.dom4j.Element title_name_element = prof
                    .element(CVItemExtracted.ProfessionalActivity.Title_name);

            if (title_name_element != null) {
                title_name = title_name_element.getText();
            }

            while (title_name.contains("  "))
                title_name = title_name.replace("  ", " ").trim();

            if (!title_name.equals("")) {
                ProjectLogger.LOGGER.info("\tTry to resolve => " + title_name);

                Integer id_type = TraductionTablesOperations.getTypeListFromTraductionTable(cnn, title_name,
                        TraductionTablesOperations.TRAD_TABLE_PROF_ACTIVITIES, "cvn_trad_", "id_");

                if (id_type != null) {
                    String standard_type = TraductionTablesOperations.getProfActivityStandardName(cnn, id_type);
                    ProjectLogger.LOGGER.info("\tResolve => " + title_name + " => " + standard_type);
                    String key = CVItemExtracted.ProfessionalActivity.Position;
                    String value = standard_type;
                    Element position = prof.element(key);
                    if (position == null) {
                        prof.addElement(key).setText(standard_type);
                    } else {
                        position.setText(standard_type);
                        ProjectLogger.LOGGER
                                .info("\tChange '" + key + "' with '" + position.getText() + "' by " + value);
                    }
                }
            }
        }

        /*
         * Try to get the standard cademic position of univ study
         */
        List<org.dom4j.Element> studies = new ArrayList<org.dom4j.Element>();
        for (Object obj : ib.elements()) {
            org.dom4j.Element prof = (org.dom4j.Element) obj;
            if (prof.getName().startsWith(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName()))
                profs.add(prof);
        }

        for (org.dom4j.Element study : studies) {
            String title_name = "";

            /* */

            /*
             * 
             */
            org.dom4j.Element title_name_element = study
                    .element(CVItemExtracted.AccreditedUniversityStudies.Title_name);

            if (title_name_element != null) {
                title_name = title_name_element.getText();
            }

            while (!title_name.contains("  "))
                title_name = title_name.replace("  ", " ").trim();

            if (!title_name.equals("")) {
                ProjectLogger.LOGGER.info("\tTry to resolve => " + title_name);

                Integer id_type = TraductionTablesOperations.getTypeListFromTraductionTable(cnn, title_name,
                        TraductionTablesOperations.TRAD_TABLE_UNIVERSITY_STUDIES, "cvn_trad_", "id_");

                if (id_type != null) {
                    String standard_type = TraductionTablesOperations.getUniversityStudyStandardName(cnn,
                            id_type);
                    ProjectLogger.LOGGER.info("\tResolve => " + title_name + " => " + standard_type);
                    String key = CVItemExtracted.AccreditedUniversityStudies.Position;
                    String value = standard_type;
                    Element position = study.element(key);
                    if (position == null) {
                        study.addElement(key).setText(standard_type);
                    } else {
                        position.setText(standard_type);
                        ProjectLogger.LOGGER
                                .info("\tChange '" + key + "' with '" + position.getText() + "' by " + value);
                    }
                }
            }
        }

    }
}

From source file:eu.sisob.uma.NPL.Researchers.GateDataExtractorSingle.java

License:Open Source License

/**
 * PROCESS STEP 4.
 * The Data Extractor uses GATE (Cunningham et al., 2011) for processing and annotating the
 * provided data, in order to extract useful information about the researchers.
 *
 * This method only builds the input repository: it parses the given XML file and
 * adds one {@code MiddleData} entry per "infoblock" child of the root element.
 * Each entry is created from three attributes of the element (named by the
 * {@code DataExchangeLiterals.MIDDLE_ELEMENT_XML_*} constants) and the element text.
 *
 * Expected input format:
 * <pre>
 *  &lt;root&gt;
 *    &lt;infoblock id=researcherid type=I_INDEX_DATA_TYPE&gt;content or URL&lt;/infoblock&gt;
 *  &lt;/root&gt;
 * </pre>
 * See also TextMiningParserGateResearcher.iniAnnotatorCollectors.
 *
 * @param infoblocksXmlFile XML file containing the "infoblock" elements
 * @param verbose passed through to every created {@code MiddleData}
 * @param verbose_dir passed through to every created {@code MiddleData}
 * @return repository holding one entry per "infoblock" element
 * @throws DocumentException if the XML file cannot be read or parsed
 */
public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromXml(File infoblocksXmlFile,
        boolean verbose, File verbose_dir) throws DocumentException {
    RepositoryPreprocessDataMiddleData preprocessedRep = new RepositoryPreprocessDataMiddleData();

    org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
    org.dom4j.Document document = reader.read(infoblocksXmlFile);

    org.dom4j.Element root = document.getRootElement();

    // Number of entries added, reported in the log once parsing is done.
    int count = 0;

    for (Iterator i = root.elementIterator("infoblock"); i.hasNext();) {
        org.dom4j.Element ib = (org.dom4j.Element) i.next();

        MiddleData aoPreProcessData = new MiddleData(
                ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT),
                ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT),
                ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING),
                ib.getText(), null, verbose, verbose_dir);

        preprocessedRep.addData(aoPreProcessData);
        count++;
    }

    ProjectLogger.LOGGER.info(count + " documents added");
    return preprocessedRep;
}

From source file:fr.ens.transcriptome.teolenn.DesignReader.java

License:Open Source License

/**
 * Read the design file and run the design
 * @param designFile The design file/*from  w ww . jav a  2  s.  co m*/
 * @param genomeFile The genome file
 * @param genomeMaskedFile The genome masked file
 * @param outputDir The output dir
 * @throws TeolennException if an error occurs while computing the design
 * @throws IOException if an error occurs while reading the design file
 * @throws DocumentException if an error occurs while parsing the design file
 */
public void readDesign(final File designFile, final File genomeFile, final File genomeMaskedFile,
        File outputDir) throws TeolennException, IOException, DocumentException {

    this.design = new DesignCommand();
    this.constants = new Properties();

    logger.info(Globals.APP_NAME + " version " + Globals.APP_VERSION + " (" + Globals.APP_BUILD_NUMBER + " "
            + Globals.APP_BUILD_DATE + ")");

    SAXReader saxReader = new SAXReader();
    Document document = saxReader.read(new FileReader(designFile));

    Element root = document.getRootElement();
    Element designElement = root;

    double designFileVersion = 0.0;
    for (Iterator i1 = designElement.elementIterator("formatversion"); i1.hasNext();)
        designFileVersion = Double.parseDouble(((Element) i1.next()).getTextTrim());

    if (designFileVersion != Globals.DESIGN_FILE_VERSION) {
        System.err.println("Invalid version of your " + Globals.APP_NAME + " design file.");
        System.exit(1);
    }

    // constants element
    this.constants = getElementConstants(designElement);

    for (Iterator i2 = designElement.elementIterator("startposition"); i2.hasNext();) {

        final String sp = ((Element) i2.next()).getTextTrim();
        if ("1".equals(sp))
            this.design.setStart1(true);
        else
            this.design.setStart1(false);
    }
    setConstant("startPosition", "" + this.design.isStart1());

    // oligolength element
    for (Iterator i3 = designElement.elementIterator("oligolength"); i3.hasNext();)
        this.design.setOligoLength(Integer.parseInt(getValue(i3)));
    setConstant("oligolength", "" + this.design.getOligoLength());

    // oligoIntervallength element
    for (Iterator i4 = designElement.elementIterator("oligointervallength"); i4.hasNext();)
        this.design.setOligoIntervalLength(Integer.parseInt(getValue(i4)));
    setConstant("oligointervallength", "" + this.design.getOligoIntervalLength());

    // genomefile element
    if (genomeFile != null)
        this.design.setGenomeFile(genomeFile);
    else
        for (Iterator i5 = designElement.elementIterator("genomefile"); i5.hasNext();)
            this.design.setGenomeFile(new File(getValue(i5)));
    setConstant("genomefile", "" + this.design.getGenomeFile().getAbsolutePath());

    // genomemakedfile element
    if (genomeMaskedFile != null)
        this.design.setGenomeMaskedFile(genomeMaskedFile);
    else
        for (Iterator i6 = designElement.elementIterator("genomemaskedfile"); i6.hasNext();) {
            final String filename = getValue(i6);
            if (!"".equals(filename))
                this.design.setGenomeMaskedFile(new File(filename));
        }
    setConstant("genomemaskedfile", "" + this.design.getGenomeMaskedFile().getAbsolutePath());

    // outputdir element
    if (outputDir != null)
        this.design.setOutputDir(outputDir);
    else
        for (Iterator i7 = designElement.elementIterator("outputdir"); i7.hasNext();) {
            final String path = getValue(i7);
            if (!"".equals(path))
                this.design.setOutputDir((new File(path)).getCanonicalFile());
        }
    setConstant("outputdir", "" + this.design.getOutputDir().getAbsolutePath());

    if (this.design.getGenomeFile() == null || !this.design.getGenomeFile().isFile())
        throw new InvalidParameterException("genome file is not found"
                + (this.design.getGenomeFile() == null ? "." : ": " + this.design.getGenomeFile()));

    if (this.design.getGenomeMaskedFile() != null && !this.design.getGenomeMaskedFile().isFile())
        throw new InvalidParameterException("genome masked file is not found"
                + (this.design.getGenomeMaskedFile() == null ? "." : ": " + this.design.getGenomeMaskedFile()));

    // Test the validity of the outptdir
    if (this.design.getOutputDir() == null || !this.design.getOutputDir().isDirectory())
        throw new InvalidParameterException("output directory is not found"
                + (this.design.getOutputDir() == null ? "." : ": " + this.design.getOutputDir()));

    isSkipElementEnable(designElement, "sequencefilters");

    final DesignCommand d = design;

    // Test if phases must be skipped
    d.setSkipSequenceCreation(isSkipElementEnable(designElement, "sequencecreation"));
    d.setSkipSequenceFilters(isSkipElementEnable(designElement, "sequencefilters"));
    d.setSkipMeasurementsComputation(isSkipElementEnable(designElement, "measurements"));
    d.setSkipMeasurementsFilters(isSkipElementEnable(designElement, "measurementfilters"));
    d.setSkipSelector(isSkipElementEnable(designElement, "selector"));

    // Set the sequenceFilters
    d.setSequenceFiltersList(parseSequenceFilters(designElement));

    // Set the measurements
    d.setMeasurementsList(parseMeasurements(designElement));

    // Set the measurement filters
    d.setMeasurementFiltersList(parseMeasurementFilters(designElement));

    // Set the selector
    d.setSelector(parseSelector(designElement));

    // Set the weights
    d.setWeightSetters(parseSelectWeights(designElement));

    // Set the outputs
    d.setOutputsList(parseOutput(designElement));

}

From source file:fr.ens.transcriptome.teolenn.DesignReader.java

License:Open Source License

/**
 * Build the sequence filters declared under the "sequencefilters" elements.
 *
 * Every "sequencefilter" child must have a "name"; nameless entries are
 * skipped with a warning. An optional "class" child registers the filter as a
 * plug-in before the filter is looked up in the registry. Filters unknown to
 * the registry are skipped with a warning.
 *
 * @param rootElement root element of the document
 * @return a list of SequenceFilter objects
 * @throws IOException if an error occurs while parsing
 */
private List<SequenceFilter> parseSequenceFilters(final Element rootElement) throws IOException {

    final List<SequenceFilter> result = new ArrayList<SequenceFilter>();

    final Iterator groupIt = rootElement.elementIterator("sequencefilters");
    while (groupIt.hasNext()) {
        final Element group = (Element) groupIt.next();

        final Iterator filterIt = group.elementIterator("sequencefilter");
        while (filterIt.hasNext()) {
            final Element filterElement = (Element) filterIt.next();

            // When several "name" children exist the last one wins
            String name = null;
            final Iterator nameIt = filterElement.elementIterator("name");
            while (nameIt.hasNext())
                name = ((Element) nameIt.next()).getTextTrim();

            if (name == null) {
                logger.warning("Filter without name.");
                continue;
            }

            // Plug-in filters declare their class: register it under the name
            final Iterator classIt = filterElement.elementIterator("class");
            while (classIt.hasNext()) {
                final String className = ((Element) classIt.next()).getTextTrim();
                SequenceFilterRegistery.addSequenceFilterType(name, className);
            }

            // Read the parameters first, then instantiate the filter
            final Properties params = getElementParameters(filterElement);
            final SequenceFilter sequenceFilter = SequenceFilterRegistery.getSequenceFilter(name);

            if (sequenceFilter == null) {
                logger.warning("Unknown sequence filter: " + name);
                continue;
            }

            // Forward the initialization parameters to the filter
            for (Map.Entry<Object, Object> param : params.entrySet())
                sequenceFilter.setInitParameter((String) param.getKey(), (String) param.getValue());

            result.add(sequenceFilter);
        }
    }

    // Apply the default initialization parameters once all filters are known
    for (SequenceFilter sequenceFilter : result)
        this.design.setDefaultModuleInitParameters(sequenceFilter);

    return result;
}

From source file:fr.ens.transcriptome.teolenn.DesignReader.java

License:Open Source License

/**
 * Parse the "measurements" element of the DOM.
 *
 * The returned list always starts with the chromosome, oligo start and oligo
 * length measurements. User declared measurements are appended after them;
 * declarations named like the chromosome or oligo start measurement are
 * ignored (case insensitive) so they cannot be added twice.
 *
 * @param rootElement root element of the document
 * @return a list of Measurement objects
 * @throws IOException if an error occurs while parsing
 */
private List<Measurement> parseMeasurements(final Element rootElement) throws IOException {

    final List<Measurement> list = new ArrayList<Measurement>();

    list.add(new ChromosomeMeasurement());
    list.add(new OligoStartMeasurement());
    list.add(new OligoLengthMeasurement());

    for (Iterator i = rootElement.elementIterator("measurements"); i.hasNext();) {
        final Element measurements = (Element) i.next();

        for (Iterator i2 = measurements.elementIterator("measurement"); i2.hasNext();) {
            final Element measurement = (Element) i2.next();

            String measurementName = null;

            for (Iterator i3 = measurement.elementIterator("name"); i3.hasNext();) {
                final Element name = (Element) i3.next();
                measurementName = name.getTextTrim();
            }

            if (measurementName == null) {
                logger.warning("Measurement without name.");
                continue;
            }

            // equalsIgnoreCase is used instead of comparing toLowerCase() results:
            // the latter depends on the default locale (e.g. Turkish dotless i).

            // Skip if user attempt to add another Scaffold measurement
            if (ChromosomeMeasurement.MEASUREMENT_NAME.equalsIgnoreCase(measurementName))
                continue;

            // Skip if user attempt to add another oligo start measurement
            if (OligoStartMeasurement.MEASUREMENT_NAME.equalsIgnoreCase(measurementName))
                continue;

            // NOTE(review): the oligo length measurement is also added by default
            // above but is not filtered here - confirm whether that is intended.

            // Add the measurement to registery if it is a plug in
            for (Iterator i4 = measurement.elementIterator("class"); i4.hasNext();) {
                final Element clazz = (Element) i4.next();
                String measurementClass = clazz.getTextTrim();
                MeasurementRegistery.addMeasurementType(measurementName, measurementClass);
            }

            // Get the parameters of the measurement
            final Properties properties = getElementParameters(measurement);

            final Measurement m = MeasurementRegistery.getMeasurement(measurementName);
            if (m == null)
                logger.warning("Unknown measurement: " + measurementName);
            else {

                // Set the initialization parameters for the measurement
                for (Map.Entry<Object, Object> entry : properties.entrySet())
                    m.setInitParameter((String) entry.getKey(), (String) entry.getValue());

                list.add(m);
            }
        }

    }

    // Set the default initialization parameters of the measurements
    for (Measurement m : list)
        this.design.setDefaultModuleInitParameters(m);

    return list;
}

From source file:fr.ens.transcriptome.teolenn.DesignReader.java

License:Open Source License

/**
 * Parse the "measurementfilters" element of the DOM.
 *
 * Every "measurementfilter" child must have a "name"; nameless entries are
 * skipped with a warning. An optional "class" child registers the filter as a
 * plug-in before the filter is looked up in the registry. Filters unknown to
 * the registry are skipped with a warning.
 *
 * @param rootElement root element of the document
 * @return a list of MeasurementFilter objects
 * @throws IOException if an error occurs while parsing
 */
private List<MeasurementFilter> parseMeasurementFilters(final Element rootElement) throws IOException {

    final List<MeasurementFilter> list = new ArrayList<MeasurementFilter>();

    for (Iterator i = rootElement.elementIterator("measurementfilters"); i.hasNext();) {
        final Element filters = (Element) i.next();

        for (Iterator i2 = filters.elementIterator("measurementfilter"); i2.hasNext();) {
            final Element filter = (Element) i2.next();

            String measurementFilterName = null;

            // When several "name" children exist the last one wins
            for (Iterator i3 = filter.elementIterator("name"); i3.hasNext();) {
                final Element name = (Element) i3.next();
                measurementFilterName = name.getTextTrim();
            }

            if (measurementFilterName == null) {
                logger.warning("Measurement filter without name.");
                continue;
            }

            // Add the measurement filter to registery if it is a plug in
            for (Iterator i4 = filter.elementIterator("class"); i4.hasNext();) {
                final Element clazz = (Element) i4.next();
                String measurementFilterClass = clazz.getTextTrim();
                MeasurementFilterRegistery.addMeasurementFilterType(measurementFilterName,
                        measurementFilterClass);
            }

            // Get the parameters of the measurement filters
            final Properties properties = getElementParameters(filter);

            final MeasurementFilter mf = MeasurementFilterRegistery
                    .getMeasuremrentFilter(measurementFilterName);
            if (mf == null)
                // The message used to say "Unknown measurement", which pointed at
                // the wrong section of the design file.
                logger.warning("Unknown measurement filter: " + measurementFilterName);
            else {

                // Set the initialization parameters for the measurement filters
                for (Map.Entry<Object, Object> entry : properties.entrySet())
                    mf.setInitParameter((String) entry.getKey(), (String) entry.getValue());

                list.add(mf);
            }
        }

    }

    // Set the defaults initialization parameters of the measurements filters
    for (MeasurementFilter mf : list)
        this.design.setDefaultModuleInitParameters(mf);

    return list;
}

From source file:fr.ens.transcriptome.teolenn.DesignReader.java

License:Open Source License

/**
 * Parse the "selector" element of the DOM.
 *
 * Only the first "selector" child of the root element is used. It must have a
 * "name"; an optional "class" child registers the selector as a plug-in
 * before the selector is looked up in the registry.
 *
 * @param rootElement root element of the document
 * @return a selector object
 * @throws TeolennException if there is no selector element, if the selector
 *           has no name or if the selector is unknown
 */
private SequenceSelector parseSelector(final Element rootElement) throws TeolennException {

    final Iterator i = rootElement.elementIterator("selector");

    if (!i.hasNext())
        throw new TeolennException("No selector found.");

    final Element selector = (Element) i.next();

    String selectorName = null;

    // When several "name" children exist the last one wins
    for (Iterator i1 = selector.elementIterator("name"); i1.hasNext();) {
        final Element name = (Element) i1.next();
        selectorName = name.getTextTrim();
    }

    // Fail with an explicit message instead of handing a null name to the
    // registry, consistently with the other parse methods of this class.
    if (selectorName == null) {
        logger.warning("Selector without name.");
        throw new TeolennException("Selector without name.");
    }

    // Add the selector to registery if it is a plug in
    for (Iterator i2 = selector.elementIterator("class"); i2.hasNext();) {
        final Element clazz = (Element) i2.next();
        String selectorClass = clazz.getTextTrim();
        SequenceSelectorRegistery.addSequenceSelectorType(selectorName, selectorClass);
    }

    // Get the parameters of the selector
    final Properties properties = getElementParameters(selector);

    SequenceSelector s = SequenceSelectorRegistery.getSequenceSelector(selectorName);

    if (s == null) {
        logger.warning("Unknown selector: " + selectorName);
        throw new TeolennException("Unknown selector: " + selectorName);
    }

    // Set the initialization parameters for the selector
    for (Map.Entry<Object, Object> entry : properties.entrySet())
        s.setInitParameter((String) entry.getKey(), (String) entry.getValue());

    // Set defaults parameters
    this.design.setDefaultModuleInitParameters(s);

    return s;
}