Java tutorial
/* * To change this template, choose Tools | Templates * and open the template in the editor. */ /* Copyright (c) 2014 "(IA)2 Research Group. Universidad de Mlaga" http://iaia.lcc.uma.es | http://www.uma.es This file is part of SISOB Data Extractor. SISOB Data Extractor is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. SISOB Data Extractor is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>. */ package eu.sisob.uma.crawler.ResearchersCrawlers.deprecated; import eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.CrawlerDepartamentsV2_deprecated; import eu.sisob.uma.crawler.ResearchersCrawlers.CandidateTypeURL; import eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.CrawlerResearchesPagesV2_deprecated; import eu.sisob.uma.api.crawler4j.crawler.PageFetcher; import eu.sisob.uma.api.crawler4j.crawler.WebCrawler; import eu.sisob.uma.crawler.ProjectLogger; import eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.CrawlerDepartamentsV2Controller_deprecated; import eu.sisob.uma.crawler.ResearchersCrawlers.deprecated.CrawlerResearchesPagesV2Controller_deprecated; import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.LocalFormatType; import java.io.*; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.TreeMap; import eu.sisob.uma.crawler.ResearchersCrawlers.Workers.*; import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.ResearcherNameInfo; import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.XMLTags; import eu.sisob.uma.footils.File.FileFootils; import org.apache.log4j.Logger; /** ** @author Daniel Lpez Gonzlez (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/) * Data extraction tasks (prototype). See: DataExtractionPrototype_1.pdf. * * This class has steps to locate and download researcher webpages from university webpages from xml file * * SKETCH of xml format: * <Institution> * <InstitutionName>...</InstitutionName> * <WebAddress>...</WebAddress> * <UnitOfAssessment> * <UnitOfAssessment_Description></UnitOfAssessment_Description> * <DepartamentWebAddress></DepartamentWebAddress> * <ResearchGroup> * <ResearchGroupDescription> * </ResearchGroupDescription> * <Researcher> * <StaffIdentifier></StaffIdentifier> * <FirstName></FirstName> * <LastName></LastName> * <Initials></Initials> * <ResearcherWebAddress> * </ResearcherWebAddress> * </Researcher> * </ResearchGroup> * * See IteratorReseachersFile for see more datails of the format of xml file. * * INCLUDE: PROCESS STEPS 1, 2, 3 */ public class LocalResearchersWebPagesExtractor { /* * PROCESS STEP 1 * Launches the crawler in order to find the researcher's personal web pages * within the corresponding university website. * * First tries to search each department webpage, next, tries to search the researcher's pages * within departments. * * @param xmlFile */ public static void P1_step_collectResearcherLinks(String xmlFilePath, int numberOfCrawlers) { P1_step_collectResearcherLinks(xmlFilePath, numberOfCrawlers, ""); } public static void P1_step_collectResearcherLinks(String xmlFilePath, int numberOfCrawlers, String sControlInstitutionName) { try { /* * rootfolder is a folder where intermediate crawl data is * stored. */ String rootFolder = "temp/"; FileFootils.deleteDir(rootFolder); /* * numberOfCrawlers shows the number of concurrent threads * that should be initiated for crawling. */ File xmlFile = new File(xmlFilePath); org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); String sInstitutionName = ""; String sWebAddress = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchers = ""; String sResearchersInitials = ""; PageFetcher.startConnectionMonitorThread(); WebCrawler.setTraceLinkName(true); WebCrawler.setTracePageName(true); TreeMap<String, TreeMap<String, List<CandidateTypeURL>>> finalResults = new TreeMap<String, TreeMap<String, List<CandidateTypeURL>>>(); boolean bFlagInstitutionName = false; String sControlUnitOfAssessment_Description = ""; boolean bFlagUnitOfAssessmentName = false; boolean bSaveFile = true; boolean bSetEmptyAllResearchers = true; if (bSetEmptyAllResearchers) { File fField = new File(xmlFilePath.replace(".xml", "backup.xml")); FileOutputStream fileOS = new java.io.FileOutputStream(fField, false); OutputStreamWriter writer = new java.io.OutputStreamWriter(fileOS, "UTF-8"); BufferedWriter bw = new java.io.BufferedWriter(writer); String sOut = document.asXML(); bw.write(sOut); bw.close(); ProjectLogger.LOGGER.info(xmlFilePath + " backuped."); } int[] counterSuccess = new int[3]; int[] counterTotal = new int[3]; for (int i = 0; i < counterSuccess.length; i++) counterSuccess[i] = 0; for (int i = 0; i < counterTotal.length; i++) counterTotal[i] = 0; for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) { bSaveFile = false; org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText(); sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText(); if (sWebAddress.charAt(sWebAddress.length() - 1) != '/') sWebAddress += "/"; if (!sInstitutionName.toLowerCase().contains(sControlInstitutionName.toLowerCase()) && !bFlagInstitutionName) continue; bFlagInstitutionName = true; List<String> subjects = new ArrayList<String>(); ProjectLogger.LOGGER.info("Department phase - " + sInstitutionName); boolean bNeedToSearchDeparmentWebAddress = false; for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20); if (e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS) != null && e2.element("DepartamentWebAddress").elements().size() != 0) { ProjectLogger.LOGGER .info("\tExist departments webaddress for " + sUnitOfAssessment_Description); } else { subjects.add(sUnitOfAssessment_Description); ProjectLogger.LOGGER .info("\tNot exist departments webaddress for " + sUnitOfAssessment_Description); bNeedToSearchDeparmentWebAddress = true; } } String sSeed = sWebAddress; String sContainPattern = sSeed.replace("http://www.", ""); int iAux = sContainPattern.indexOf("/"); sContainPattern = sContainPattern.substring(0, iAux); if (bNeedToSearchDeparmentWebAddress) { CrawlerDepartamentsV2Controller_deprecated controllerDepts = new CrawlerDepartamentsV2Controller_deprecated( rootFolder + sInstitutionName.replace(" ", ".") + ".Researchers", subjects); controllerDepts.addSeed(sSeed); controllerDepts.setPolitenessDelay(200); controllerDepts.setMaximumCrawlDepth(3); controllerDepts.setMaximumPagesToFetch(-1); controllerDepts.setContainPattern(sContainPattern); controllerDepts.clearPossibleResults(); ProjectLogger.LOGGER .info("======================================================================"); ProjectLogger.LOGGER.info("Begin crawling: " + sInstitutionName + " (" + sWebAddress + ")"); long lTimerAux = java.lang.System.currentTimeMillis(); controllerDepts.start(CrawlerDepartamentsV2_deprecated.class, 1); lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux; ProjectLogger.LOGGER.info("Extracting Links in: " + lTimerAux + " ms"); ProjectLogger.LOGGER .info("======================================================================"); CandidateTypeURL.printResults( "Results of: " + sInstitutionName + " (" + sWebAddress + ") by TYPE", controllerDepts.getPossibleResultsTYPE()); for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION) .getText(); TreeMap<String, List<CandidateTypeURL>> t = controllerDepts.getPossibleResultsTYPE(); Iterator<String> it = t.keySet().iterator(); while (it.hasNext()) { String s = it.next(); if (s.toLowerCase() .equals("department of " + sUnitOfAssessment_Description.toLowerCase())) { if (e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS) != null && e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS).elements().size() != 0) { throw new Exception(sUnitOfAssessment_Description + " must be empty."); } List<CandidateTypeURL> lst = t.get(s); for (CandidateTypeURL ss : lst) { e2.addElement(XMLTags.DEPARTMENT_WEB_ADDRESS).addText(ss.sURL); bSaveFile = true; } break; } } } } ProjectLogger.LOGGER.info("Researcher phase - " + sInstitutionName); if (sContainPattern != "") sContainPattern = sContainPattern; for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20); List<String> lstDepartmentWebAddress = new ArrayList<String>(); for (Iterator i3 = e2.elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); if (!e3.getText().equals("")) lstDepartmentWebAddress.add(e3.getText()); } if (lstDepartmentWebAddress.size() > 0) { ProjectLogger.LOGGER .info("\tExist departments webaddress for " + sUnitOfAssessment_Description); boolean bExistResearcherWebAddress = false; List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>(); for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); if (bSetEmptyAllResearchers) { boolean aux = true; while (aux) { org.dom4j.Element eaux = e4.element(XMLTags.RESEARCHER_WEB_ADDRESS); if (eaux != null) e4.remove(eaux); else aux = false; } } if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) == null) { String initials = e4.element(XMLTags.RESEARCHER_INITIALS).getText(); String last_name = e4.element(XMLTags.RESEARCHER_LASTNAME).getText(); String first_name = e4.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? "" : e4.element(XMLTags.RESEARCHER_FIRSTNAME).getText(); String whole_name = e4.element(XMLTags.RESEARCHER_NAME) == null ? "" : e4.element(XMLTags.RESEARCHER_NAME).getText(); ResearcherNameInfo rsi = new ResearcherNameInfo(last_name, initials, first_name, whole_name); researchers.add(rsi); bExistResearcherWebAddress = false; } else if (bSetEmptyAllResearchers) { throw new Exception( "XML element of " + e4.element(XMLTags.RESEARCHER_INITIALS).getText() + "," + e4.element(XMLTags.RESEARCHER_LASTNAME).getText() + " must not have researcher web address at this moment"); } } } if (!bExistResearcherWebAddress) { ProjectLogger.LOGGER.info("\tMiss researchers webaddress for " + sUnitOfAssessment_Description + ". Try to search."); CrawlerResearchesPagesV2Controller_deprecated controllerReseachers = new CrawlerResearchesPagesV2Controller_deprecated( rootFolder + sInstitutionName.replace(" ", ".") + "_" + sUnitOfAssessment_Description.replace(" ", "."), researchers); String sSeeds = ""; for (String s : lstDepartmentWebAddress) { controllerReseachers.addSeed(s); sSeeds += s + ","; } controllerReseachers.setPolitenessDelay(200); controllerReseachers.setMaximumCrawlDepth(3); controllerReseachers.setMaximumPagesToFetch(-1); controllerReseachers.setContainPattern(sContainPattern); controllerReseachers.clearInterestingUrlsDetected(); if (!sUnitOfAssessment_Description.contains(sControlUnitOfAssessment_Description) && !bFlagUnitOfAssessmentName) continue; bFlagUnitOfAssessmentName = true; ProjectLogger.LOGGER .info("======================================================================"); ProjectLogger.LOGGER.info("Begin crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " (" + sSeeds + ")"); long lTimerAux = java.lang.System.currentTimeMillis(); controllerReseachers.start(CrawlerResearchesPagesV2_deprecated.class, 1); controllerReseachers.postProcessResults(); lTimerAux = java.lang.System.currentTimeMillis() - lTimerAux; ProjectLogger.LOGGER.info("Extracting Links in: " + lTimerAux + " ms"); ProjectLogger.LOGGER .info("======================================================================"); CandidateTypeURL.printResults( "Results of: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " (" + sWebAddress + ") by TYPE", controllerReseachers.getInterestingUrlsDetected()); counterTotal[0] = 0; counterSuccess[0] = 0; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { counterTotal[0]++; org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); String initials = e4.element(XMLTags.RESEARCHER_INITIALS) == null ? "" : e4.element(XMLTags.RESEARCHER_INITIALS).getText(); String last_name = e4.element(XMLTags.RESEARCHER_LASTNAME) == null ? "" : e4.element(XMLTags.RESEARCHER_LASTNAME).getText(); String first_name = e4.element(XMLTags.RESEARCHER_FIRSTNAME) == null ? "" : e4.element(XMLTags.RESEARCHER_FIRSTNAME).getText(); String whole_name = e4.element(XMLTags.RESEARCHER_NAME) == null ? "" : e4.element(XMLTags.RESEARCHER_NAME).getText(); ResearcherNameInfo rsi = new ResearcherNameInfo(last_name, initials, first_name, whole_name); TreeMap<String, List<CandidateTypeURL>> t = controllerReseachers .getInterestingUrlsDetected(); List<CandidateTypeURL> lst = t.get( CrawlerResearchesPagesV2Controller_deprecated.RESEARCHER_RESULT_TAG); boolean bExist = false; if (lst != null) { boolean lock1 = true; for (CandidateTypeURL ss : lst) { if (rsi.equals(ss.data)) { e4.addElement(XMLTags.RESEARCHER_WEB_ADDRESS) .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE, ss.sSubType) .addAttribute(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT, ss.sExt) .addText(ss.sURL); lock1 = false; bSaveFile = true; bExist = true; } } } if (bExist) { counterSuccess[0]++; } } } ProjectLogger.LOGGER.info("Researches results: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); } else { ProjectLogger.LOGGER.info( "\tExist researchers webaddress for " + sUnitOfAssessment_Description + "."); counterTotal[0] = 0; counterSuccess[0] = 0; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { counterTotal[0]++; org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) { counterSuccess[0]++; } } } ProjectLogger.LOGGER.info( "Results exist: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); } } else { ProjectLogger.LOGGER .info("\tNot exist departments webaddress for " + sUnitOfAssessment_Description); counterTotal[0] = 0; counterSuccess[0] = 0; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { counterTotal[0]++; org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) { counterSuccess[0]++; } } } if (counterSuccess[0] > 0) ProjectLogger.LOGGER.info( "\tExist researchers webaddress for " + sUnitOfAssessment_Description + "."); else ProjectLogger.LOGGER.info("\tNot exist researchers webaddress for " + sUnitOfAssessment_Description + "."); ProjectLogger.LOGGER .info("Results exist: " + sInstitutionName + " - " + sUnitOfAssessment_Description + " - " + counterSuccess[0] + " / " + counterTotal[0]); } } counterSuccess[1] += counterSuccess[0]; counterTotal[1] += counterTotal[0]; if (bSaveFile) { File fField = new File(xmlFilePath); FileOutputStream fileOS = new java.io.FileOutputStream(fField, false); OutputStreamWriter writer = new java.io.OutputStreamWriter(fileOS, "UTF-8"); BufferedWriter bw = new java.io.BufferedWriter(writer); String sOut = document.asXML(); bw.write(sOut); bw.close(); ProjectLogger.LOGGER.info(xmlFile + " updated."); } } ProjectLogger.LOGGER.info("Researches results:" + counterSuccess[1] + " / " + counterTotal[1]); } catch (Exception ex) { ProjectLogger.LOGGER.error(ex.getMessage(), ex); } finally { PageFetcher.stopConnectionMonitorThread(); } } /* * Check effectivity of the recollection and recount found web pages. (of PROCESS STEP 1) */ public static void P1_checkEffectivityCollectResearcherLinks(String xmlFile) { P1_checkEffectivityCollectResearcherLinks(xmlFile, false, -1); } /** * Check effectivity of the recollection and recount found web pages. (of PROCESS STEP 1) * @param showOnlyBad * @param topPercent */ public static void P1_checkEffectivityCollectResearcherLinks(String xmlFile, boolean showOnlyBad, float topPercent) { try { org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); String sInstitutionName = ""; String sWebAddress = ""; String sUnitOfAssessment_Description = ""; String sResearchGroupDescription = ""; String sResearchers = ""; String sResearchersInitials = ""; int[] counterSuccess = new int[2]; int[] counterTotal = new int[2]; for (int i = 0; i < counterSuccess.length; i++) counterSuccess[i] = 0; for (int i = 0; i < counterTotal.length; i++) counterTotal[i] = 0; if (showOnlyBad) { ProjectLogger.LOGGER.info("Show only departments with less than " + topPercent + "%.\r\n"); } for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); sInstitutionName = e1.element(XMLTags.INSTITUTION_NAME).getText(); sWebAddress = e1.element(XMLTags.INSTITUTION_WEBADDRESS).getText(); for (Iterator i2 = e1.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) { org.dom4j.Element e2 = (org.dom4j.Element) i2.next(); sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION).getText(); //FIXME if(sUnitOfAssessment_Description.length() > 20) sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20); boolean bExistDept = false; String sURLs = ""; for (Iterator i5 = e2.elementIterator(XMLTags.DEPARTMENT_WEB_ADDRESS); i5.hasNext();) { org.dom4j.Element e5 = (org.dom4j.Element) i5.next(); sURLs += " " + e5.getText(); bExistDept = true; } // if(e2.element(XMLTags.DEPARTMENT_WEB_ADDRESS) != null) // { // bExistDept = true; // //ProjectLogger.LOGGER.info("\tExist departments webaddress for " + sUnitOfAssessment_Description); // } // else // { // bExistDept = false; // } String sOut = ""; if (!bExistDept) { sOut = "FAIL: " + sInstitutionName + "(" + sWebAddress + ") departments webaddress: " + sUnitOfAssessment_Description; } else { sOut = "SUCCESS: " + sInstitutionName + "(" + sWebAddress + ") departments webaddress: " + sUnitOfAssessment_Description + " URLS= " + sURLs; } counterTotal[0] = 0; counterSuccess[0] = 0; String researchersText = ""; String researchersMissText = ""; for (Iterator i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) { org.dom4j.Element e3 = (org.dom4j.Element) i3.next(); sResearchGroupDescription = e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText(); for (Iterator i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) { org.dom4j.Element e4 = (org.dom4j.Element) i4.next(); counterTotal[0]++; if (e4.element(XMLTags.RESEARCHER_WEB_ADDRESS) != null && e4.element(XMLTags.RESEARCHER_WEB_ADDRESS).elements().size() > 0) { researchersText += ", " + e4.elementText(XMLTags.RESEARCHER_LASTNAME) + " " + e4.elementText(XMLTags.RESEARCHER_INITIALS); counterSuccess[0]++; } else { researchersMissText += ", " + e4.elementText(XMLTags.RESEARCHER_LASTNAME) + " " + e4.elementText(XMLTags.RESEARCHER_INITIALS); } } } int percent = (counterSuccess[0] * 100) / counterTotal[0]; if (showOnlyBad) { if (percent <= topPercent) { ProjectLogger.LOGGER.info(""); ProjectLogger.LOGGER.info("BAD RESULTS: " + sOut); ProjectLogger.LOGGER.info("\tResearchers found: " + counterSuccess[0] + "/" + counterTotal[0] + "\t(" + percent + " %)"); ProjectLogger.LOGGER.info("\tFound: " + researchersText); ProjectLogger.LOGGER.info("\tMiss: " + researchersMissText); } } else { ProjectLogger.LOGGER.info(""); ProjectLogger.LOGGER.info(sOut); ProjectLogger.LOGGER.info("\tResearchers found: " + counterSuccess[0] + "/" + counterTotal[0] + "\t(" + percent + " %)"); ProjectLogger.LOGGER.info("\tFound: " + researchersText); ProjectLogger.LOGGER.info("\tMiss: " + researchersMissText); } counterTotal[1] += counterTotal[0]; counterSuccess[1] += counterSuccess[0]; } } ProjectLogger.LOGGER.info(""); ProjectLogger.LOGGER.info("TOTAL Researchers found: " + counterSuccess[1] + "/" + counterTotal[1]); } catch (Exception ex) { ProjectLogger.LOGGER.info(ex.getMessage()); } } /* * PROCESS STEP 2 (1/2) * Downloads the websites identified as researcher's personal pages in this structure: * .\[DIR_DOWNLOAD_PAGES]\CardiffUniversity\Chemistry\Redman#JE\55d8cb13\index.html, pub.html, cv.html * @param xmlFile * @param downloadPagesDir * @param nThreads */ public static void P2_step_downloadResearchesPages(String xmlFile, String downloadPagesDir, int nThreads) { DownloaderResearchersWebPagesXMLFormat.downloadAllResearchersPagesWithThreads(xmlFile, downloadPagesDir, LocalFormatType.PLAIN_DIRECTORY, nThreads, false); } /* * FIX FOR PROCESS STEP 2 (1/2) * Downloads the websites identified as researcher's personal pages in this structure of a Insitution: * .\[DIR_DOWNLOAD_PAGES]\CardiffUniversity\Chemistry\Redman#JE\55d8cb13\index.html, pub.html, cv.html * @param xmlFile * @param Institution * @param destDir */ public static void P2_redownloadInstitution(String xmlFile, String Institution, String destDir) throws Exception { org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(); org.dom4j.Document document = reader.read(xmlFile); org.dom4j.Element root = document.getRootElement(); for (Iterator i1 = root.elementIterator(XMLTags.INSTITUTION); i1.hasNext();) { org.dom4j.Element e1 = (org.dom4j.Element) i1.next(); if (e1.element(XMLTags.INSTITUTION_NAME).getText().equals(Institution)) { DownloaderResearchersWebPagesXMLFormat.downloadResearchesPages(destDir, LocalFormatType.PLAIN_DIRECTORY, e1, true); } } } /* * PROCESS STEP 2 (2/2) * Cleans downloaded webpages removing useless information, such as HTML headers, JavaScript modules, etc * .\[DIR_DOWNLOAD_PAGES]\* * @param sourceXmlFile * @param downloadPagesDir * @throws Exception */ public static void P2_step_cleanHtmlAllResearcherPages(String sourceXmlFile, String downloadPagesDir) throws Exception { try { CleanerResearchersWebpages o = new CleanerResearchersWebpages(new File(sourceXmlFile), new File(downloadPagesDir), LocalFormatType.PLAIN_DIRECTORY); o.iterate(); } catch (Exception ex) { ProjectLogger.LOGGER.info(ex.getMessage()); } } /* * PROCESS STEP 3 * Writes XML file with the content of webpages for Data Collector. * By the moment we used path files: only <infoblock id="xxxx" type=2>filepath</infoblock> * @param sourceXmlFile * @param downloadPagesDir * @param destXmlFile * @throws Exception */ public static void P3_step_exportDocumentsOnXMLFileForTextMining(String sourceXmlFile, String downloadPagesDir, String destXmlFile) throws Exception { try { ExportDocumentsOnXMLFileForTextMiningCreator o = new ExportDocumentsOnXMLFileForTextMiningCreator( new File(sourceXmlFile), new File(downloadPagesDir), new File(destXmlFile), LocalFormatType.PLAIN_DIRECTORY); o.iterate(); } catch (Exception ex) { ProjectLogger.LOGGER.info(ex.getMessage()); } } /* * PROCESS STEP 3 (VERSION 2) - Not matter Hash, only read what there are in directories * Writes XML file with the content of webpages for Data Collector. * By the moment we used path files: only <infoblock id="xxxx" type=2>filepath</infoblock> * @param sourceXmlFile * @param downloadPagesDir * @param destXmlFile * @throws Exception */ public static void P3_step_exportWebPagesOnXMLFileForTextMining_v2_nommaterhash(String sourceXmlFile, String downloadPagesDir, String destXmlFile) throws Exception { try { ExportDocumentsOnXMLFileForTextMiningCreatorV2 o = new ExportDocumentsOnXMLFileForTextMiningCreatorV2( new File(sourceXmlFile), new File(downloadPagesDir), new File(destXmlFile), LocalFormatType.PLAIN_DIRECTORY); o.iterate(); } catch (Exception ex) { ProjectLogger.LOGGER.info(ex.getMessage()); } } /* * PROCESS STEP 3 (OPTIONAL, for study patterns) * Filters webpages with xpath expression and writes XML file with the content * of webpages for Data Collector. * By the moment we used path files: only <infoblock id="xxxx" type=2>filepath</infoblock> * @param sourceXmlFile * @param downloadPagesDir * @param xpathExp * @param destXmlFile * @throws Exception */ public static void P3_step_filterWebPagesWriteWebPagesOnXMLFileForTextMining(String sourceXmlFile, String downloadPagesDir, String xpathExp, String destXmlFile) throws Exception { try { ExportDocumentsOnXMLFileForTextMiningCreatorWithFilter o = new ExportDocumentsOnXMLFileForTextMiningCreatorWithFilter( new File(sourceXmlFile), new File(downloadPagesDir), xpathExp, new File(destXmlFile), LocalFormatType.PLAIN_DIRECTORY); o.iterate(); } catch (Exception ex) { ProjectLogger.LOGGER.info(ex.getMessage()); } } }