Usage examples for org.dom4j.Element#elementIterator
Iterator<Element> elementIterator(QName qName);
From source file:eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.IteratorReseachersFile.java
License:Open Source License
/**
 * Walks the researchers XML tree (institution, unit of assessment, research group,
 * researcher, researcher web address) and invokes the matching action callback on
 * every node visited.
 *
 * The XML is taken from {@code source_file_xml} when it is set, otherwise from
 * {@code sourceXmlDocument}, otherwise from an already assigned {@code root}.
 * The traversal stops as soon as a callback returns {@code false} or
 * {@code beginActions()} throws; {@code endActions()} is always attempted.
 *
 * @return {@code true} if the traversal ran to completion, {@code false} if no XML
 *         source was available or the traversal was stopped early
 * @throws Exception if the XML file cannot be parsed or a node callback throws
 */
public boolean iterate() throws Exception {
    // Pick the XML source. The file wins when both a file and a document are set
    // (unchanged behaviour); a file without a document is now actually read, and
    // a missing file AND document no longer ends in a NullPointerException.
    if (source_file_xml != null) {
        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        document = reader.read(source_file_xml);
        root = document.getRootElement();
    } else if (sourceXmlDocument != null) {
        document = sourceXmlDocument;
        root = document.getRootElement();
    } else if (root == null) {
        return false;
    }

    String dirBase = work_dir + "\\";
    boolean end = false;

    try {
        beginActions();
    } catch (Exception ex) {
        // A failing start-up hook cancels the traversal but still runs endActions().
        end = true;
        Logger.getLogger("root").error("", ex);
    }

    for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext() && !end;) {
        org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

        String sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText();
        String sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText();

        // NOTE(review): "[^a-z^A-Z]" also keeps the '^' character; "[^a-zA-Z]" was
        // probably intended, but changing it would rename existing directories.
        String dirI = "";
        if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY))
            dirI = dirBase + "\\" + sInstitutionName.replaceAll("[^a-z^A-Z]", "") + "\\";
        else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY))
            dirI = dirBase;

        end = !actionsInInstitutionNode(e1, dirI, sInstitutionName, sWebAddress);

        for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext() && !end;) {
            org.dom4j.Element e2 = (org.dom4j.Element) i2.next();

            String sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText();

            String dirUAD = "";
            if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY))
                dirUAD = dirI + "\\" + sUnitOfAssessment_Description.replaceAll("[^a-z^A-Z]", "") + "\\";
            else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY))
                dirUAD = dirBase;

            end = !actionsInUnitOfAssessmentNode(e2, dirUAD, sInstitutionName, sWebAddress,
                    sUnitOfAssessment_Description);

            for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext() && !end;) {
                org.dom4j.Element e3 = (org.dom4j.Element) i3.next();

                String sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText();

                for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext() && !end;) {
                    org.dom4j.Element e4 = (org.dom4j.Element) i4.next();

                    // Last name, initials and staff identifier are mandatory;
                    // first name and full name are optional and default to "".
                    String sResearchLastName = e4.element(XMLTags.RESEARCHER_LASTNAME).getText();
                    String sResearchInitials = e4.element(XMLTags.RESEARCHER_INITIALS).getText();
                    String sResearchFirstName = e4.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? ""
                            : e4.element(XMLTags.RESEARCHER_FIRSTNAME).getText();
                    String sResearchName = e4.element(XMLTags.RESEARCHER_NAME) == null ? ""
                            : e4.element(XMLTags.RESEARCHER_NAME).getText();
                    String sStaffIndentifier = e4.element(XMLTags.RESEARCHER_STAFFIDENTIFIER).getText();

                    String sAux = sResearchLastName.replaceAll("[^a-z^A-Z]", "") + "#"
                            + sResearchInitials.replaceAll("[^a-z^A-Z]", "");

                    String dirR = "";
                    if (local_format_type.equals(LocalFormatType.TREE_DIRECTORY))
                        dirR = dirUAD + "\\" + sAux + "\\";
                    else if (local_format_type.equals(LocalFormatType.PLAIN_DIRECTORY))
                        dirR = dirBase;

                    ResearcherNameInfo rsi = new ResearcherNameInfo(sResearchLastName, sResearchInitials,
                            sResearchFirstName, sResearchName);

                    end = !actionsInResearcherNode(e4, dirR, sInstitutionName, sWebAddress,
                            sUnitOfAssessment_Description, sResearchGroupDescription, rsi, sStaffIndentifier);

                    for (Iterator i5 = e4.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS); i5.hasNext() && !end;) {
                        org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                        String url = e5.getText();
                        if (url.equals(""))
                            continue; // empty addresses are skipped without calling the callback

                        // Missing or empty attributes fall back to their defaults.
                        // (Strings are compared by content, not with '=='.)
                        String ext = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT);
                        if (ext == null || ext.isEmpty())
                            ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML;

                        // The default is assigned to 'type' itself; it used to overwrite 'ext'.
                        String type = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE);
                        if (type == null || type.isEmpty())
                            type = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV;

                        end = !actionsInResearcherWebPageNode(e4, dirR, sInstitutionName, sWebAddress,
                                sUnitOfAssessment_Description, sResearchGroupDescription, rsi,
                                sStaffIndentifier, url, ext, type);
                    }
                }
            }
        }
    }

    try {
        endActions();
    } catch (Exception ex) {
        Logger.getLogger("root").error("", ex);
    }

    return !end;
}
From source file:eu.sisob.uma.NPL.Researchers.Data.ViewCreator_CSVandSheets.java
License:Open Source License
/** * * @param document//from ww w. java 2s . c om * @param dest * @param create_spreadsheet * @param debug_mode */ public static void createViewFilesFromDataExtracted(org.dom4j.Document document, File dest, boolean create_spreadsheet, boolean debug_mode) { List<ViewsExporterUnit> units = new ArrayList<ViewsExporterUnit>(); ViewsExporterUnit new_unit = null; //new_unit = new ViewsExporterUnit("AgentIdentification", dest.getAbsolutePath() + File.separator + agent_identification_file); new_unit = new ViewsExporterUnit(CVItemExtracted.AgentIdentification.class.getSimpleName(), dest.getAbsolutePath() + File.separator + agent_identification_file); //Put from XML. Remove "get"s and think that richer data machine will put new fields to xml new_unit.map.put(CVItemExtracted.AgentIdentification.FirstFamilyName, "First Last Name"); // "FirstFamilyName" new_unit.map.put(CVItemExtracted.AgentIdentification.SecondFamilyName, "Second Last Name"); new_unit.map.put(CVItemExtracted.AgentIdentification.GivenName, "First Name"); new_unit.map.put(CVItemExtracted.AgentIdentification.Gender, "Gender"); new_unit.map.put(CVItemExtracted.AgentIdentification.Nationality, "Nationality"); new_unit.map.put(CVItemExtracted.AgentIdentification.BirthCity, "Birth City"); new_unit.map.put(CVItemExtracted.AgentIdentification.BirthRegion, "Birth Region"); new_unit.map.put(CVItemExtracted.AgentIdentification.BirthCountry, "Birth Country"); new_unit.map.put(CVItemExtracted.AgentIdentification.BirthDateDayMonthYear, "Birthday Day"); new_unit.map.put(CVItemExtracted.AgentIdentification.BirthDateMonthYear, "Birthday Month"); new_unit.map.put(CVItemExtracted.AgentIdentification.BirthDateYear, "Birthday Year"); new_unit.map.put(CVItemExtracted.AgentIdentification.Email, "Email"); new_unit.map.put(CVItemExtracted.AgentIdentification.Phone, "Phone"); new_unit.createMapIndex(); try { new_unit.createNewCsv(); } catch (IOException ex) { ProjectLogger.LOGGER.error(ex.getMessage()); return; } units.add(new_unit); 
//new_unit = new ViewsExporterUnit("ProfessionalActivity", dest.getAbsolutePath() + File.separator + professional_activity_file); new_unit = new ViewsExporterUnit(CVItemExtracted.ProfessionalActivity.class.getSimpleName(), dest.getAbsolutePath() + File.separator + professional_activity_file); new_unit.map.put(CVItemExtracted.ProfessionalActivity.Title_name, "Literal Position Name"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.Position, "Position Name"); //MISS IN GATE new_unit.map.put(CVItemExtracted.ProfessionalActivity.PositionNumber, "Position Number"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_dayMonthYear, "Start Date Day"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_monthYear, "Start Date Month"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_year, "Start Date Year"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.DateInit_duration, "Duration"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.Entity1_entityName, "Entity 1"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.Entity2_entityName, "Entity 2"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.Entity3_entityName, "Entity 3"); //NEW //new_unit.map.put(CVItemExtracted.ProfessionalActivity., "Complete Instituion (e1+e2+e3)"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_city, "City"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionName, "Region"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionCode, "Region Code"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryName, "Country"); new_unit.map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryCode, "Country Code"); new_unit.createMapIndex(); try { new_unit.createNewCsv(); } catch (IOException ex) { ProjectLogger.LOGGER.error(ex.getMessage()); return; } units.add(new_unit); //new_unit = new ViewsExporterUnit("AccreditedUniversityStudies", dest.getAbsolutePath() + 
File.separator + accredited_university_studies_file); new_unit = new ViewsExporterUnit(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName(), dest.getAbsolutePath() + File.separator + accredited_university_studies_file); //NEW //new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies, "Type Degree"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Title_name, "Literal Study Name"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Position, "Study Name"); //NEW new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Hons, "hons"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.DateTitle_dayMonthYear, "Acchievement Date Day"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.DateTitle_MonthYear, "Acchievement Date Month"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.DateTitle_year, "Acchievement Date Year"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Entity1_entityName, "Entity 1"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Entity2_entityName, "Entity 2"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.Entity3_entityName, "Entity 3"); //NEW //new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies., "Complete Institution (e1+e2+e3)"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_city, "City"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionName, "Region"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionCode, "Region Code"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryName, "Country"); new_unit.map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryCode, "Country Code"); new_unit.createMapIndex(); try { new_unit.createNewCsv(); } catch (IOException ex) { ProjectLogger.LOGGER.error(ex.getMessage()); return; } units.add(new_unit); org.dom4j.Element root = document.getRootElement(); boolean 
bLock = false; int count = 0; bLock = false; for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) { org.dom4j.Element elInfoBlock = (org.dom4j.Element) i.next(); // <blockinfo id_annotationrecollecting="default" id_entity="3626" URL="file:/..."> // <ProfessionalActivityNoCurrent> // <Content>Research Fellow, University of Leicester (1984</Content> // <DateInit_year>1984</DateInit_year> // <Pattern>ProfessionalActivityPattern1</Pattern> // <Entity_entityName>University of Leicester</Entity_entityName> // <Title_name>Research Fellow</Title_name> // </ProfessionalActivityNoCurrent> // ... // </blockinfo> String id_entity = elInfoBlock.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT); int countElements = 0; for (Object oCVNItem : elInfoBlock.elements()) { countElements++; org.dom4j.Element elCVNItem = (org.dom4j.Element) oCVNItem; String className = elCVNItem.getName(); ViewsExporterUnit unit_ref = null; for (ViewsExporterUnit unit : units) { if (className.startsWith(unit.tag)) { unit_ref = unit; break; } } if (unit_ref == null) { ProjectLogger.LOGGER.info(className + " has not views exporter unit"); } else { LinkedHashMap<String, String> values = new LinkedHashMap<String, String>(); for (Object oCVNItemField : elCVNItem.elements()) { org.dom4j.Element elCVNItemField = (org.dom4j.Element) oCVNItemField; String methodName = elCVNItemField.getName(); String value = ""; String value_name = ""; if (methodName.equals("Pattern")) { if (debug_mode) value = elCVNItemField.getText(); } else if (methodName.equals("Content")) { if (debug_mode) value = elCVNItemField.getText(); } else if ((methodName.equals("Domain"))) { if (debug_mode) value = elCVNItemField.getText(); } else { value_name = elCVNItemField.getName(); value = elCVNItemField.getText(); values.put(value_name, value); } } if (elCVNItem.attributeValue("action_mode").equals("") || elCVNItem.attributeValue("action_mode").equals("add")) { try { String[] line = 
unit_ref.createNewLine(id_entity, values); unit_ref.AddCsv(line); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } } else if (elCVNItem.attributeValue("action_mode").equals("overwrite")) { try { String[] line = unit_ref.createNewLine(id_entity, values); unit_ref.UpdateCsv(id_entity, line); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } } } } } int max_r = 0; int max_c = 0; for (ViewsExporterUnit unit : units) { if (max_c < unit.map_index.size() + 1) max_c = unit.map_index.size() + 1; if (max_r < unit.lines) max_r = unit.lines; try { unit.closeCsv(); } catch (IOException ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } } if (create_spreadsheet) { //Create open document spread sheet TableModel model = new DefaultTableModel(max_r, max_c); // Save the data to an ODS file and open it. final File file = new File(dest.getAbsolutePath() + File.separator + spreadsheet_w_all_data_file); try { SpreadSheet.createEmpty(model).saveAs(file); SpreadSheet spread_sheet = SpreadSheet.createFromFile(file); int i = 0; for (ViewsExporterUnit unit : units) { Sheet sheet = null; if (i > 0) { sheet = spread_sheet.addSheet(i, unit.tag); } else { sheet = spread_sheet.getSheet(i); sheet.setName(unit.tag); } sheet.setRowCount(unit.lines); sheet.setColumnCount(unit.map_index.size() + 1); InputStreamReader fw1 = new InputStreamReader(new FileInputStream(unit.filepath), "UTF-8"); CSVReader reader = new CSVReader(fw1, csv_separator); String[] line = null; int r = 0; while ((line = reader.readNext()) != null) { for (int c = 0; c < line.length; c++) { sheet.setValueAt(line[c], c, r); } r++; //System.out.println(r); } i++; } spread_sheet.saveAs(file); } catch (FileNotFoundException ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } catch (IOException ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage()); } finally { } } }
From source file:eu.sisob.uma.NPL.Researchers.DataResearcherAugmentedInformation.java
License:Open Source License
/** * * @param doc//w ww.ja v a2 s . com * @param resolver */ public static void resolveLocationOfEntities(org.dom4j.Document doc, LocationDataResolver resolver) { boolean verbose = resolver.verbose; org.dom4j.Element root = doc.getRootElement(); for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) { org.dom4j.Element ib = (org.dom4j.Element) i.next(); // Professional activities List<org.dom4j.Element> profs = new ArrayList<org.dom4j.Element>(); for (Object obj : ib.elements()) { org.dom4j.Element prof = (org.dom4j.Element) obj; if (prof.getName().startsWith(CVItemExtracted.ProfessionalActivity.class.getSimpleName())) profs.add(prof); } for (org.dom4j.Element prof : profs) { String entity_name = ""; String element_name = ""; /* */ /* * Trying to extract more information about the organization detected, like the location for example * * Location searchs: * Normally, Entity3_entityName contains Entity2_entityName and so on, so the heurstic will try * to resolve the date first for the 3, next for the 2, and next for the 1. * * Once time the location will searched, the algoritm will take the first occurrence of each entity (cities, regions, countries). * But after, the algoritm will eliminate regions with the same name in cities, and regions with the same name in countries. 
*/ org.dom4j.Element ent_name_3 = prof .element(CVItemExtracted.ProfessionalActivity.Entity3_entityName); org.dom4j.Element ent_type_3 = prof.element(CVItemExtracted.ProfessionalActivity.Entity3_type); if (ent_name_3 != null && ent_type_3 != null) { if (ent_type_3.getText().equals( eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) { //"University of Massachusetts" entity_name = ent_name_3.getText(); element_name = ent_name_3.getName(); } } else { org.dom4j.Element ent_name_2 = prof .element(CVItemExtracted.ProfessionalActivity.Entity2_entityName); org.dom4j.Element ent_type_2 = prof.element(CVItemExtracted.ProfessionalActivity.Entity2_type); if (ent_name_2 != null && ent_type_2 != null) { if (ent_type_2.getText().equals( eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) { entity_name = ent_name_2.getText(); element_name = ent_name_2.getName(); } } else { org.dom4j.Element ent_name_1 = prof .element(CVItemExtracted.ProfessionalActivity.Entity1_entityName); org.dom4j.Element ent_type_1 = prof .element(CVItemExtracted.ProfessionalActivity.Entity1_type); if (ent_name_1 != null && ent_type_1 != null) { if (ent_type_1.getText().equals( eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) { entity_name = ent_name_1.getText(); element_name = ent_name_1.getName(); } } } } entity_name = entity_name.replace(" ", " ").trim(); if (!entity_name.equals("")) { ProjectLogger.LOGGER.info("\tTry to resolve => " + entity_name); LocationDataResolver.LocationTupleWithEntity location = resolver.resolve(entity_name); if (location != null) { ProjectLogger.LOGGER.info("\tLocation solved => " + entity_name + " = " + location); HashMap<String, String> map = new HashMap<String, String>(); map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_city, "city"); map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionName, "region"); 
map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_regionCode, "region_code"); map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryName, "country"); map.put(CVItemExtracted.ProfessionalActivity.PlaceJob_countryCode, "country_code"); map.put(element_name, "canonic_name"); Element place = null; // Update locations and entity name using map object for (String key : map.keySet()) { String value = location.getByName(map.get(key)); place = prof.element(key); if (place == null) { prof.addElement(key).setText(value); } else { ProjectLogger.LOGGER .info("\tChange '" + key + "' with '" + place.getText() + "' by " + value); place.setText(value); } } } } } // Accredited Studies List<org.dom4j.Element> studies = new ArrayList<org.dom4j.Element>(); for (Object obj : ib.elements()) { org.dom4j.Element study = (org.dom4j.Element) obj; if (study.getName().startsWith(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName())) studies.add(study); } for (org.dom4j.Element study : studies) { String entity_name = ""; String element_name = ""; /* */ /* * Trying to extract more information about the organization detected, like the location for example * * Location searchs: * Normally, Entity3_entityName contains Entity2_entityName and so on, so the heurstic will try * to resolve the date first for the 3, next for the 2, and next for the 1. * * Once time the location will searched, the algoritm will take the first occurrence of each entity (cities, regions, countries). * But after, the algoritm will eliminate regions with the same name in cities, and regions with the same name in countries. 
*/ org.dom4j.Element ent_name_3 = study .element(CVItemExtracted.AccreditedUniversityStudies.Entity3_entityName); org.dom4j.Element ent_type_3 = study .element(CVItemExtracted.AccreditedUniversityStudies.Entity3_type); if (ent_name_3 != null && ent_type_3 != null) { if (ent_type_3.getText().equals( eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) { //"University of Massachusetts" entity_name = ent_name_3.getText(); element_name = ent_name_3.getName(); } } else { org.dom4j.Element ent_name_2 = study .element(CVItemExtracted.AccreditedUniversityStudies.Entity2_entityName); org.dom4j.Element ent_type_2 = study .element(CVItemExtracted.AccreditedUniversityStudies.Entity2_type); if (ent_name_2 != null && ent_type_2 != null) { if (ent_type_2.getText().equals( eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) { entity_name = ent_name_2.getText(); element_name = ent_name_2.getName(); } } else { org.dom4j.Element ent_name_1 = study .element(CVItemExtracted.AccreditedUniversityStudies.Entity1_entityName); org.dom4j.Element ent_type_1 = study .element(CVItemExtracted.AccreditedUniversityStudies.Entity1_type); if (ent_name_1 != null && ent_type_1 != null) { if (ent_type_1.getText().equals( eu.sisob.uma.api.prototypetextmining.gatedataextractor.Literals.EntityType_University)) { entity_name = ent_name_1.getText(); element_name = ent_name_1.getName(); } } } } entity_name = entity_name.replace(" ", " ").trim(); if (!entity_name.equals("")) { ProjectLogger.LOGGER.info("\tTry to resolve => " + entity_name); LocationDataResolver.LocationTupleWithEntity location = resolver.resolve(entity_name); if (location != null) { ProjectLogger.LOGGER.info("\tLocation solved => " + entity_name + " = " + location); HashMap<String, String> map = new HashMap<String, String>(); map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_city, "city"); 
map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionName, "region"); map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_regionCode, "region_code"); map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryName, "country"); map.put(CVItemExtracted.AccreditedUniversityStudies.PlaceTitle_countryCode, "country_code"); map.put(element_name, "canonic_name"); Element place = null; // Update locations and entity name using map object for (String key : map.keySet()) { String value = location.getByName(map.get(key)); place = study.element(key); if (place == null) { study.addElement(key).setText(value); } else { ProjectLogger.LOGGER .info("\tChange '" + key + "' with '" + place.getText() + "' by " + value); place.setText(value); } } } } } } }
From source file:eu.sisob.uma.NPL.Researchers.DataResearcherAugmentedInformation.java
License:Open Source License
/** * * @param doc/* ww w . j a va 2 s. co m*/ * @param dbpool_academic_trad_tables */ public static void resolveAcademicPosistion(org.dom4j.Document doc, H2DBPool dbpool_academic_trad_tables) { org.dom4j.Element root = doc.getRootElement(); Connection cnn = null; try { cnn = dbpool_academic_trad_tables.getConnection(); } catch (ClassNotFoundException ex) { Logger.getRootLogger().error(ex.toString()); cnn = null; return; } catch (SQLException ex) { Logger.getRootLogger().error(ex.toString()); cnn = null; return; } for (Iterator i = root.elementIterator("blockinfo"); i.hasNext();) { org.dom4j.Element ib = (org.dom4j.Element) i.next(); // Professional activities List<org.dom4j.Element> profs = new ArrayList<org.dom4j.Element>(); for (Object obj : ib.elements()) { org.dom4j.Element prof = (org.dom4j.Element) obj; if (prof.getName().startsWith(CVItemExtracted.ProfessionalActivity.class.getSimpleName())) profs.add(prof); } for (org.dom4j.Element prof : profs) { String title_name = ""; /* */ /* * Try to get the standar cademic position of prof acti */ org.dom4j.Element title_name_element = prof .element(CVItemExtracted.ProfessionalActivity.Title_name); if (title_name_element != null) { title_name = title_name_element.getText(); } while (title_name.contains(" ")) title_name = title_name.replace(" ", " ").trim(); if (!title_name.equals("")) { ProjectLogger.LOGGER.info("\tTry to resolve => " + title_name); Integer id_type = TraductionTablesOperations.getTypeListFromTraductionTable(cnn, title_name, TraductionTablesOperations.TRAD_TABLE_PROF_ACTIVITIES, "cvn_trad_", "id_"); if (id_type != null) { String standard_type = TraductionTablesOperations.getProfActivityStandardName(cnn, id_type); ProjectLogger.LOGGER.info("\tResolve => " + title_name + " => " + standard_type); String key = CVItemExtracted.ProfessionalActivity.Position; String value = standard_type; Element position = prof.element(key); if (position == null) { prof.addElement(key).setText(standard_type); } else { 
position.setText(standard_type); ProjectLogger.LOGGER .info("\tChange '" + key + "' with '" + position.getText() + "' by " + value); } } } } /* * Try to get the standard cademic position of univ study */ List<org.dom4j.Element> studies = new ArrayList<org.dom4j.Element>(); for (Object obj : ib.elements()) { org.dom4j.Element prof = (org.dom4j.Element) obj; if (prof.getName().startsWith(CVItemExtracted.AccreditedUniversityStudies.class.getSimpleName())) profs.add(prof); } for (org.dom4j.Element study : studies) { String title_name = ""; /* */ /* * */ org.dom4j.Element title_name_element = study .element(CVItemExtracted.AccreditedUniversityStudies.Title_name); if (title_name_element != null) { title_name = title_name_element.getText(); } while (!title_name.contains(" ")) title_name = title_name.replace(" ", " ").trim(); if (!title_name.equals("")) { ProjectLogger.LOGGER.info("\tTry to resolve => " + title_name); Integer id_type = TraductionTablesOperations.getTypeListFromTraductionTable(cnn, title_name, TraductionTablesOperations.TRAD_TABLE_UNIVERSITY_STUDIES, "cvn_trad_", "id_"); if (id_type != null) { String standard_type = TraductionTablesOperations.getUniversityStudyStandardName(cnn, id_type); ProjectLogger.LOGGER.info("\tResolve => " + title_name + " => " + standard_type); String key = CVItemExtracted.AccreditedUniversityStudies.Position; String value = standard_type; Element position = study.element(key); if (position == null) { study.addElement(key).setText(standard_type); } else { position.setText(standard_type); ProjectLogger.LOGGER .info("\tChange '" + key + "' with '" + position.getText() + "' by " + value); } } } } } }
From source file:eu.sisob.uma.NPL.Researchers.GateDataExtractorSingle.java
License:Open Source License
/** * PROCESS STEPS 4//from w w w. ja va 2 s . c o m * The Data Extractor uses GATE (Cunningham et al., 2011) for processing and annotating the * provided data, in order to extract useful information about the researchers. * Inputs: * - Data in the form of blocks of information useful or interesting for extraction * obtained from the third module. * Format XML: * <root> * <infoblock id=researcherid type=I_INDEX_DATA_TYPE>content or URL<7infoblock> * Outputs: * - Processed and annotated useful data stored in a repository. See TextMiningParserGateResearcher.iniAnnotatorCollectors * @param infoblocksXmlFile * @param verbose * @param verbose_dir * @return RepositoryCVN filled with extracted data * @throws DocumentException */ public static RepositoryPreprocessDataMiddleData createPreprocessRepositoryFromXml(File infoblocksXmlFile, boolean verbose, File verbose_dir) throws DocumentException { RepositoryPreprocessDataMiddleData preprocessedRep = new RepositoryPreprocessDataMiddleData(); org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(infoblocksXmlFile); //("ResearcherPagesMonkeyTask.xml"); org.dom4j.Element root = document.getRootElement(); boolean bLock = false; int N_MAX = 100, count = 0; Random randomGenerator = new Random(); bLock = false; for (Iterator i = root.elementIterator("infoblock"); i.hasNext();) { org.dom4j.Element ib = (org.dom4j.Element) i.next(); MiddleData aoPreProcessData = new MiddleData( ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT), ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT), ib.attributeValue(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING), ib.getText(), null, verbose, verbose_dir); { //if(N_MAX > count) //if(ib.getText().contains("2f21a5ff")) { preprocessedRep.addData(aoPreProcessData); bLock = true; count++; } } } ProjectLogger.LOGGER.info(count + " documents added"); return preprocessedRep; }
From source file:fr.ens.transcriptome.teolenn.DesignReader.java
License:Open Source License
/** * Read the design file and run the design * @param designFile The design file/*from w ww . jav a 2 s. co m*/ * @param genomeFile The genome file * @param genomeMaskedFile The genome masked file * @param outputDir The output dir * @throws TeolennException if an error occurs while computing the design * @throws IOException if an error occurs while reading the design file * @throws DocumentException if an error occurs while parsing the design file */ public void readDesign(final File designFile, final File genomeFile, final File genomeMaskedFile, File outputDir) throws TeolennException, IOException, DocumentException { this.design = new DesignCommand(); this.constants = new Properties(); logger.info(Globals.APP_NAME + " version " + Globals.APP_VERSION + " (" + Globals.APP_BUILD_NUMBER + " " + Globals.APP_BUILD_DATE + ")"); SAXReader saxReader = new SAXReader(); Document document = saxReader.read(new FileReader(designFile)); Element root = document.getRootElement(); Element designElement = root; double designFileVersion = 0.0; for (Iterator i1 = designElement.elementIterator("formatversion"); i1.hasNext();) designFileVersion = Double.parseDouble(((Element) i1.next()).getTextTrim()); if (designFileVersion != Globals.DESIGN_FILE_VERSION) { System.err.println("Invalid version of your " + Globals.APP_NAME + " design file."); System.exit(1); } // constants element this.constants = getElementConstants(designElement); for (Iterator i2 = designElement.elementIterator("startposition"); i2.hasNext();) { final String sp = ((Element) i2.next()).getTextTrim(); if ("1".equals(sp)) this.design.setStart1(true); else this.design.setStart1(false); } setConstant("startPosition", "" + this.design.isStart1()); // oligolength element for (Iterator i3 = designElement.elementIterator("oligolength"); i3.hasNext();) this.design.setOligoLength(Integer.parseInt(getValue(i3))); setConstant("oligolength", "" + this.design.getOligoLength()); // oligoIntervallength element for (Iterator i4 = 
designElement.elementIterator("oligointervallength"); i4.hasNext();) this.design.setOligoIntervalLength(Integer.parseInt(getValue(i4))); setConstant("oligointervallength", "" + this.design.getOligoIntervalLength()); // genomefile element if (genomeFile != null) this.design.setGenomeFile(genomeFile); else for (Iterator i5 = designElement.elementIterator("genomefile"); i5.hasNext();) this.design.setGenomeFile(new File(getValue(i5))); setConstant("genomefile", "" + this.design.getGenomeFile().getAbsolutePath()); // genomemakedfile element if (genomeMaskedFile != null) this.design.setGenomeMaskedFile(genomeMaskedFile); else for (Iterator i6 = designElement.elementIterator("genomemaskedfile"); i6.hasNext();) { final String filename = getValue(i6); if (!"".equals(filename)) this.design.setGenomeMaskedFile(new File(filename)); } setConstant("genomemaskedfile", "" + this.design.getGenomeMaskedFile().getAbsolutePath()); // outputdir element if (outputDir != null) this.design.setOutputDir(outputDir); else for (Iterator i7 = designElement.elementIterator("outputdir"); i7.hasNext();) { final String path = getValue(i7); if (!"".equals(path)) this.design.setOutputDir((new File(path)).getCanonicalFile()); } setConstant("outputdir", "" + this.design.getOutputDir().getAbsolutePath()); if (this.design.getGenomeFile() == null || !this.design.getGenomeFile().isFile()) throw new InvalidParameterException("genome file is not found" + (this.design.getGenomeFile() == null ? "." : ": " + this.design.getGenomeFile())); if (this.design.getGenomeMaskedFile() != null && !this.design.getGenomeMaskedFile().isFile()) throw new InvalidParameterException("genome masked file is not found" + (this.design.getGenomeMaskedFile() == null ? "." 
: ": " + this.design.getGenomeMaskedFile())); // Test the validity of the outptdir if (this.design.getOutputDir() == null || !this.design.getOutputDir().isDirectory()) throw new InvalidParameterException("output directory is not found" + (this.design.getOutputDir() == null ? "." : ": " + this.design.getOutputDir())); isSkipElementEnable(designElement, "sequencefilters"); final DesignCommand d = design; // Test if phases must be skipped d.setSkipSequenceCreation(isSkipElementEnable(designElement, "sequencecreation")); d.setSkipSequenceFilters(isSkipElementEnable(designElement, "sequencefilters")); d.setSkipMeasurementsComputation(isSkipElementEnable(designElement, "measurements")); d.setSkipMeasurementsFilters(isSkipElementEnable(designElement, "measurementfilters")); d.setSkipSelector(isSkipElementEnable(designElement, "selector")); // Set the sequenceFilters d.setSequenceFiltersList(parseSequenceFilters(designElement)); // Set the measurements d.setMeasurementsList(parseMeasurements(designElement)); // Set the measurement filters d.setMeasurementFiltersList(parseMeasurementFilters(designElement)); // Set the selector d.setSelector(parseSelector(designElement)); // Set the weights d.setWeightSetters(parseSelectWeights(designElement)); // Set the outputs d.setOutputsList(parseOutput(designElement)); }
From source file:fr.ens.transcriptome.teolenn.DesignReader.java
License:Open Source License
/** * Parse the "sequencefilters" element of the DOM. * @param rootElement root element of the document * @return a list of SequenceFilter objects * @throws IOException if an error occurs while parsing *//* w ww.j a v a2 s.co m*/ private List<SequenceFilter> parseSequenceFilters(final Element rootElement) throws IOException { List<SequenceFilter> list = new ArrayList<SequenceFilter>(); for (Iterator i = rootElement.elementIterator("sequencefilters"); i.hasNext();) { final Element filters = (Element) i.next(); for (Iterator i2 = filters.elementIterator("sequencefilter"); i2.hasNext();) { final Element filter = (Element) i2.next(); String filterName = null; for (Iterator i3 = filter.elementIterator("name"); i3.hasNext();) { final Element name = (Element) i3.next(); filterName = name.getTextTrim(); } if (filterName == null) { logger.warning("Filter without name."); continue; } // Add the sequence filter to the registery if it is a plug in for (Iterator i4 = filter.elementIterator("class"); i4.hasNext();) { final Element clazz = (Element) i4.next(); String filterClass = clazz.getTextTrim(); SequenceFilterRegistery.addSequenceFilterType(filterName, filterClass); } // Get the parameters of the sequenceFilter final Properties properties = getElementParameters(filter); final SequenceFilter f = SequenceFilterRegistery.getSequenceFilter(filterName); if (f == null) logger.warning("Unknown sequence filter: " + filterName); else { for (Map.Entry<Object, Object> entry : properties.entrySet()) // Set the initialization parameters for the sequence filter f.setInitParameter((String) entry.getKey(), (String) entry.getValue()); list.add(f); } } } // Set the defaults initialization parameters of the sequence filters for (SequenceFilter sq : list) this.design.setDefaultModuleInitParameters(sq); return list; }
From source file:fr.ens.transcriptome.teolenn.DesignReader.java
License:Open Source License
/** * Parse the "measurements" element of the DOM. * @param rootElement root element of the document * @return a list of Measurement objects * @throws IOException if an error occurs while parsing */// w ww . j a va 2s.c o m private List<Measurement> parseMeasurements(final Element rootElement) throws IOException { final List<Measurement> list = new ArrayList<Measurement>(); list.add(new ChromosomeMeasurement()); list.add(new OligoStartMeasurement()); list.add(new OligoLengthMeasurement()); for (Iterator i = rootElement.elementIterator("measurements"); i.hasNext();) { final Element measurements = (Element) i.next(); for (Iterator i2 = measurements.elementIterator("measurement"); i2.hasNext();) { final Element measurement = (Element) i2.next(); String measurementName = null; for (Iterator i3 = measurement.elementIterator("name"); i3.hasNext();) { final Element name = (Element) i3.next(); measurementName = name.getTextTrim(); } if (measurementName == null) { logger.warning("Measurement without name."); continue; } // Skip if user attempt to add another Scaffold measurement if (ChromosomeMeasurement.MEASUREMENT_NAME.toLowerCase().equals(measurementName.toLowerCase())) continue; // Skip if user attempt to add another oligo start measurement if (OligoStartMeasurement.MEASUREMENT_NAME.toLowerCase().equals(measurementName.toLowerCase())) continue; // Add the measurement to registery if it is a plug in for (Iterator i4 = measurement.elementIterator("class"); i4.hasNext();) { final Element clazz = (Element) i4.next(); String measurementClass = clazz.getTextTrim(); MeasurementRegistery.addMeasurementType(measurementName, measurementClass); } // Get the parameters of the measurement final Properties properties = getElementParameters(measurement); final Measurement m = MeasurementRegistery.getMeasurement(measurementName); if (m == null) logger.warning("Unknown measurement: " + measurementName); else { // Set the initialization parameters for the measurement for 
(Map.Entry<Object, Object> entry : properties.entrySet()) m.setInitParameter((String) entry.getKey(), (String) entry.getValue()); list.add(m); } } } // Set the default initialization parameters of the measurements for (Measurement m : list) this.design.setDefaultModuleInitParameters(m); return list; }
From source file:fr.ens.transcriptome.teolenn.DesignReader.java
License:Open Source License
/** * Parse the "measurementfilters" element of the DOM. * @param rootElement root element of the document * @return a list of MeasurementFilter objects * @throws IOException if an error occurs while parsing *//*from ww w . j a va 2 s. c om*/ private List<MeasurementFilter> parseMeasurementFilters(final Element rootElement) throws IOException { final List<MeasurementFilter> list = new ArrayList<MeasurementFilter>(); for (Iterator i = rootElement.elementIterator("measurementfilters"); i.hasNext();) { final Element filters = (Element) i.next(); for (Iterator i2 = filters.elementIterator("measurementfilter"); i2.hasNext();) { final Element filter = (Element) i2.next(); String measurementFilterName = null; for (Iterator i3 = filter.elementIterator("name"); i3.hasNext();) { final Element name = (Element) i3.next(); measurementFilterName = name.getTextTrim(); } if (measurementFilterName == null) { logger.warning("Measurement filter without name."); continue; } // Add the measurement to registery if it is a plug in for (Iterator i4 = filter.elementIterator("class"); i4.hasNext();) { final Element clazz = (Element) i4.next(); String measurementClass = clazz.getTextTrim(); MeasurementFilterRegistery.addMeasurementFilterType(measurementFilterName, measurementClass); } // Get the parameters of the measurement filters final Properties properties = getElementParameters(filter); final MeasurementFilter mf = MeasurementFilterRegistery .getMeasuremrentFilter(measurementFilterName); if (mf == null) logger.warning("Unknown measurement: " + measurementFilterName); else { // Set the initialization parameters for the measurement filters for (Map.Entry<Object, Object> entry : properties.entrySet()) mf.setInitParameter((String) entry.getKey(), (String) entry.getValue()); list.add(mf); } } } // Set the defaults initialization parameters of the measurements filters for (MeasurementFilter mf : list) this.design.setDefaultModuleInitParameters(mf); return list; }
From source file:fr.ens.transcriptome.teolenn.DesignReader.java
License:Open Source License
/** * Parse the "selector" element of the DOM. * @param rootElement root element of the document * @return a selector objects//from w w w. j av a 2 s. c om * @throws TeolennException if an error occurs while parsing */ private SequenceSelector parseSelector(final Element rootElement) throws TeolennException { for (Iterator i = rootElement.elementIterator("selector"); i.hasNext();) { final Element selector = (Element) i.next(); String selectorName = null; for (Iterator i1 = selector.elementIterator("name"); i1.hasNext();) { final Element name = (Element) i1.next(); selectorName = name.getTextTrim(); } // Add the selector to registery if it is a plug in for (Iterator i2 = selector.elementIterator("class"); i2.hasNext();) { final Element clazz = (Element) i2.next(); String selectorClass = clazz.getTextTrim(); SequenceSelectorRegistery.addSequenceSelectorType(selectorName, selectorClass); } // Get the parameters of the measurement final Properties properties = getElementParameters(selector); SequenceSelector s = SequenceSelectorRegistery.getSequenceSelector(selectorName); if (s == null) { logger.warning("Unknown selector: " + selectorName); throw new TeolennException("Unknown selector: " + selectorName); } // Set the initialization parameters for the selector for (Map.Entry<Object, Object> entry : properties.entrySet()) s.setInitParameter((String) entry.getKey(), (String) entry.getValue()); // Set defaults parameters this.design.setDefaultModuleInitParameters(s); return s; } throw new TeolennException("No selector found."); }