Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package org.shareok.data.sagedata; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.text.ParseException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.io.IOUtils; import org.json.JSONException; import org.json.JSONObject; import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.shareok.data.config.DataUtil; import org.shareok.data.config.ShareokdataManager; import org.shareok.data.datahandlers.DataHandlersUtil; import org.shareok.data.documentProcessor.CsvHandler; import org.shareok.data.documentProcessor.DocumentProcessorUtil; import org.springframework.beans.BeansException; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; /** * * @author Tao Zhao */ public class SageJournalIssueDateProcessor { private static final org.apache.log4j.Logger logger = org.apache.log4j.Logger .getLogger(SageJournalIssueDateProcessor.class); private static final String[] unparseableJournals = { "Journal of Rehabilitation and Assistive Technologies Engineering" }; // These journals ONLY have a link to PDF full text and should used the first date of the month as the pub date of the issue private static List<String> NO_ARTICLE_PUB_DATE_JOURNALS_LIST = new ArrayList<>(); static { NO_ARTICLE_PUB_DATE_JOURNALS_LIST.add("Plastic Surgery Case Studies"); NO_ARTICLE_PUB_DATE_JOURNALS_LIST.add("Plastic Surgery"); } @SuppressWarnings("empty-statement") public void retrieveSageJournalVolIssueDates(Map<String, String> processedJournalsMap) { List<String> processedJournals = new ArrayList<>(); // JSONObject jsonObj = getSavedSageJournalVolIssueDateInformation(); try { Map<String, Map<String, String>> journalMap = getSavedSageJournalVolIssueDateInformation(); if (null == journalMap) { journalMap = new HashMap<>(); } Document doc = null; try { doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199") .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0) .select("tbody").get(0).select("tr"); for (Element tr : trs) { Element link = tr.select("td").get(1).select("a").get(0); String journalName = link.text(); String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href"); String[] linkInfo = journalLink.split("/"); String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + linkInfo[linkInfo.length - 1]; if (null == journalMap.get(journalName)) { Map<String, String> infoMap = new HashMap<>(); infoMap.put("homeLink", journalLink); infoMap.put("issueLink", journalIssuesLink); journalMap.put(journalName, infoMap); } else { Map<String, String> infoMap = journalMap.get(journalName); if (null == infoMap.get("homeLink")) { infoMap.put("homeLink", journalLink); } if (null == infoMap.get("issueLink")) { infoMap.put("issueLink", journalIssuesLink); } } } int kk = 0; mainLoop: for (String journal : journalMap.keySet()) { System.out.println("Print out journal " + journal + " information :"); if (null != processedJournalsMap && (journal == null ? processedJournalsMap.get(journal) == null : journal.equals(processedJournalsMap.get(journal)))) { System.out.println("Journal : has already been processed!"); continue; } // if(journal.contains("Christian Education")){ // System.out.println("Journal name : International Journal of Health Services, cannot be processed!"); //// continue; // } // if(journal.contains("Plastic Surgery")){ // System.out.println("Journal name : International Journal of Health Services, cannot be processed!"); // continue; // } Map<String, String> journalInfoMap = journalMap.get(journal); for (String key : journalInfoMap.keySet()) { if (key.equals("issueLink")) { Document loiDdoc = null; try { loiDdoc = Jsoup.connect(journalInfoMap.get(key)).userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break; } Thread.sleep(2200); if (null != loiDdoc) { Map<String, Map<String, String>> dataMap; if (null != journalMap.get(journal).get("data")) { dataMap = DataUtil.getMapFromJson(journalMap.get(journal).get("data")); } else { dataMap = new HashMap<>(); } Elements decaseDivs = loiDdoc.select("div.decade"); if (null != decaseDivs && decaseDivs.size() > 0) { for (Element decade : decaseDivs) { Elements yearsDiv = decade.select("div.years").get(0).children(); if (null != yearsDiv && yearsDiv.size() > 0) { for (Element yearEle : yearsDiv) { Elements volumesDiv = yearEle.select("div.volumes").get(0) .children(); if (null != volumesDiv && volumesDiv.size() > 0) { for (Element volumeEle : volumesDiv) { String volume = volumeEle.select("a").get(0).text().trim() .split("Volume")[1].trim(); Elements issueInfoDivEles = volumeEle .select("div.js_issue"); if (null != issueInfoDivEles && issueInfoDivEles.size() > 0) { for (Element issueInfoDiv : issueInfoDivEles) { String issueText = issueInfoDiv.select("a").get(0) .text(); issueText = issueText.split(", ")[0] .split("Issue")[1].trim(); String oldIssueDate = ""; String issueDate = ""; if (NO_ARTICLE_PUB_DATE_JOURNALS_LIST .contains(journal)) { issueDate = "01 " + issueInfoDiv .select("span.loiIssueCoverDateText") .get(0).text().trim(); oldIssueDate = issueDate; // if(issueDate.contains("Winter")){ // issueDate = issueDate.replaceAll("Winter", "October"); // } // if(issueDate.contains("Fall") || issueDate.contains("Autumn")){ // issueDate = issueDate.replaceAll("Fall", "September"); // issueDate = issueDate.replaceAll("Autumn", "September"); // } // if(issueDate.contains("Summer")){ // issueDate = issueDate.replaceAll("Summer", "April"); // } // if(issueDate.contains("Spring")){ // issueDate = issueDate.replaceAll("Spring", "January"); // } // try{ // // for date string like "01 July-October 2016" // if(issueDate.contains("-")){ // String[] dateInfo = issueDate.split("-"); // issueDate = dateInfo[0] + " " + dateInfo[1].split(" ")[1]; // } // // for date string like "01 July/October 2016" // if(issueDate.contains("/")){ // String[] dataInfo = issueDate.split("/"); // issueDate = dataInfo[0] + " " + dataInfo[1].split(" ")[1]; // } // } // catch(ArrayIndexOutOfBoundsException ex){ // System.out.println("Journal name: "+journal); // System.out.println("Volume: "+volume+", issue: "+issueText); // System.out.println("This date string cannot be parsed: "+oldIssueDate); // ex.printStackTrace(); // continue; // } try { issueDate = "01 " + issueInfoDiv.select( "span.loiIssueCoverDateText").get(0) .text().trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } catch (ParseException ex) { // if(!journal.contains("OMEGA - Journal of Death and Dying")){ // continue; // } System.out.println( "Journal name: " + journal); System.out.println("Volume: " + volume + ", issue: " + issueText); System.out.println( "This date string cannot be parsed: " + oldIssueDate); ex.printStackTrace(); continue; } } else { try { Element issueLinkEle = issueInfoDiv .select("a").get(0); String issueLink = issueLinkEle .attr("href"); Document issueDoc = null; try { issueDoc = Jsoup.connect(issueLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Elements articleDivs = issueDoc .select("div.art_title, .linkable"); String articleLink = SageDataUtil.SAGE_HTTP_PREFIX + articleDivs.get(0) .select("a.ref, .nowrap") .get(0).attr("href"); if (articleLink.contains("pdf/")) { System.out.println("journal: " + journal + " volume=" + volume + " issue=" + issueText + " has ONLY PDF links!"); try { issueDate = issueInfoDiv.select( "span.loiIssueCoverDateText") .get(0).text().trim(); oldIssueDate = issueDate; if (issueDate.contains("Winter")) { issueDate = issueDate .replaceAll("Winter", "December"); } if (issueDate.contains("Fall") || issueDate.contains( "Autumn")) { issueDate = issueDate .replaceAll("Fall", "September"); issueDate = issueDate .replaceAll("Autumn", "September"); } if (issueDate.contains("Summer")) { issueDate = issueDate .replaceAll("Summer", "June"); } if (issueDate.contains("Spring")) { issueDate = issueDate .replaceAll("Spring", "March"); } if (issueDate.contains("/")) { String[] dataInfo = issueDate .split("/"); String dateInfo1 = dataInfo[0] .trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1 .split(" "); if (dateInfo1Arr.length == 2) { date = dateInfo1Arr[0]; month1 = dateInfo1Arr[1]; } else { date = "01"; month1 = dataInfo[0].trim(); } String month2 = dataInfo[1] .split("\\s+")[0]; String year = dataInfo[1] .split("\\s+")[1]; String date1 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month1 + " " + year); String date2 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month2 + " " + year); issueDate = date1 + "::" + date2; } // The Journal of Psychiatry & Law dd MMMM-MMMM yyyy pattern else if (issueDate.contains("-")) { if (journal.equals( "OMEGA - Journal of Death and Dying")) { Document articleDoc = null; try { articleDoc = Jsoup .connect( articleLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000) .get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Element pubDateDiv = articleDoc .select("div.published-dates") .get(0); issueDate = pubDateDiv .text() .split("Issue published:")[1] .trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } else { String[] dataInfo = issueDate .split("-"); String dateInfo1 = dataInfo[0] .trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1 .split(" "); if (dateInfo1Arr.length == 2) { date = dateInfo1Arr[0] .trim(); month1 = dateInfo1Arr[1] .trim(); } else { date = "01"; month1 = dataInfo[0] .trim(); } String month2 = dataInfo[1] .split("\\s+")[0]; String year = dataInfo[1] .split("\\s+")[1]; String date1 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month1 + " " + year); String date2 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month2 + " " + year); issueDate = date1 + "::" + date2; } } else { issueDate = "01 " + issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } } catch (ParseException | ArrayIndexOutOfBoundsException ex) { System.out.println( "Journal name: " + journal); System.out.println("Volume: " + volume + ", issue: " + issueText); System.out.println( "This date string cannot be parsed: " + issueDate); ex.printStackTrace(); continue; } } else { Document articleDoc = null; try { articleDoc = Jsoup .connect(articleLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Element pubDateDiv = articleDoc .select("div.published-dates") .get(0); issueDate = pubDateDiv.text() .split("Issue published:")[1] .trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } } catch (Exception ex) { logger.error( "Cannot get the issue date for journal =" + journal + " volume=" + volume + " issue=" + issueText + " date=" + oldIssueDate, ex); continue; } } if (DataHandlersUtil.datesCompare(issueDate, "2010-01-01") < 0) { if (dataMap.size() > 0) { ObjectMapper mapper = new ObjectMapper(); String json = mapper .writeValueAsString(dataMap); journalInfoMap.put("data", json); } processedJournals.add(journal); continue mainLoop; } try { if (null != dataMap && dataMap.size() > 0 && null != dataMap.get(volume) && null != dataMap.get(volume) .get(issueText)) { continue; } else { Map<String, String> issueMap = dataMap .get(volume); if (null == issueMap) { issueMap = new HashMap<>(); issueMap.put(issueText, issueDate); dataMap.put(volume, issueMap); } else { issueMap.put(issueText, issueDate); } System.out.println("This is vol. " + volume + " and issue " + issueText + " and date " + issueDate); } } catch (Exception ex) { System.out.println( "Cannot add the pub date info into data map for vol. " + volume + " and issue " + issueText + " and date " + issueDate); } } } } } } } } } if (dataMap.size() > 0) { ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(dataMap); journalInfoMap.put("data", json); } } } } processedJournals.add(journal); if (kk > 100) { break; } kk++; } } catch (IOException ex) { ex.printStackTrace(); } ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(journalMap); String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath(); File sageFile = new File(sageJournalIssueDateInfoFilePath); if (sageFile.exists()) { String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_" + DataHandlersUtil.getCurrentTimeString() + ".json"; sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld)); } DocumentProcessorUtil.outputStringToFile(json, ShareokdataManager.getSageJournalIssueDateInfoFilePath()); System.out.println("processed journals = " + mapper.writeValueAsString(processedJournals)); } catch (Exception ex) { logger.error("Cannot process the issue dates.", ex); } } public Map<String, Map<String, String>> getSavedSageJournalVolIssueDateInformation() { try { Map<String, Map<String, String>> map = new HashMap<>(); String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath(); File f = new File(sageJournalIssueDateInfoFilePath); if (f.exists()) { InputStream is = new FileInputStream(sageJournalIssueDateInfoFilePath); String jsonTxt = IOUtils.toString(is); JSONObject json = new JSONObject(jsonTxt); return DataUtil.getMapOfStringMapFromJsonObject(json); } } catch (IOException ex) { logger.error("Cannot get saved sage journal volume issue date information", ex); } return null; } public Map<String, String> getIssueDateProcessedSageJournals( Map<String, Map<String, String>> sageJournalVolIssueDateData) { Map<String, String> processedJournals = new HashMap<>(); for (String journal : sageJournalVolIssueDateData.keySet()) { Map<String, String> journalData = sageJournalVolIssueDateData.get(journal); if (null != journalData && null != journalData.get("data")) { processedJournals.put(journal, journal); } } return processedJournals; } /** * Sage journal "OMEGA - Journal of Death and Dying" has some date information cannot be parsed: e.g. 01 January 2003-2004 * @param journalVolumeIssueData : json object loaded from sageJournalIssueDateInfo.json * @param volume : volume string * @param issue : issue string * @return : a string map of start and end dates */ public Map<String, String> getStartEndDatesForOMEGAJournalOfDeathAndDying(JSONObject journalVolumeIssueData, String volume, String issue) { Map<String, String> datesMap = new HashMap<>(); try { String issueDateStr = journalVolumeIssueData.getJSONObject("OMEGA - Journal of Death and Dying") .getJSONObject("data").getJSONObject(volume).getString(issue); String[] issueDateStrInfo = issueDateStr.split("-"); String[] info = issueDateStrInfo[0].split(" "); String endDate = info[0] + " " + info[1] + " " + issueDateStrInfo[1]; datesMap.put("startDate", issueDateStrInfo[0]); datesMap.put("endDate", endDate); return datesMap; } catch (Exception ex) { logger.error("Cannot parse the date information of journal : OMEGA - Journal of Death and Dying", ex); } return null; } /** * * @param missingIssueDatesFile : a csv file contains the dates of certain journals and issues missing * */ public void updateSageJournalIssueDatesDataFromMissingIssueDatesFile(String missingIssueDatesFile) throws JsonProcessingException, IOException { try { Map<String, Map<String, Map<String, String>>> missingDataMap = new HashMap<>(); Map<String, Map<String, String>> map = getSavedSageJournalVolIssueDateInformation(); ApplicationContext context = new ClassPathXmlApplicationContext("documentProcessorContext.xml"); CsvHandler csv = (CsvHandler) context.getBean("csvHandler"); csv.setFileName(missingIssueDatesFile); csv.readData(); Map<String, String> missingDatesData = csv.getData(); for (String key : missingDatesData.keySet()) { if (key.startsWith("journal")) { String[] info = key.split("-"); String row = info[1]; String journal = missingDatesData.get("journal-" + row); String volume = missingDatesData.get("volume-" + row); String issue = missingDatesData.get("issue-" + row); String missingData = missingDatesData.get("data-" + row); String newDate = parsingSageJournalIssueDate(missingData); Map<String, Map<String, String>> missingJournalData = missingDataMap.get(journal); if (null == missingJournalData) { missingJournalData = new HashMap<>(); missingDataMap.put(journal, missingJournalData); } Map<String, String> missingVolData = missingJournalData.get(volume); if (null == missingVolData) { missingVolData = new HashMap<>(); missingJournalData.put(volume, missingVolData); } missingVolData.put(issue, newDate); } } for (String journal : missingDataMap.keySet()) { Map<String, Map<String, String>> missingVolData = missingDataMap.get(journal); Map<String, String> journalMap = map.get(journal); if (null == journalMap) { journalMap = new HashMap<>(); journalMap.put("data", "{}"); map.put(journal, journalMap); continue; } String data = journalMap.get("data"); JSONObject dataJson = new JSONObject(data); for (String volume : missingVolData.keySet()) { Map<String, String> missingIssueMap = missingVolData.get(volume); JSONObject volDataJason = null; if (dataJson.has(volume)) { volDataJason = dataJson.getJSONObject(volume); } else { volDataJason = new JSONObject(); dataJson.put(volume, volDataJason); } for (String issue : missingIssueMap.keySet()) { volDataJason.put(issue, missingIssueMap.get(issue)); } } String json = dataJson.toString(); journalMap.put("data", json); } System.out.println("All missing issues have been added into journal data map."); ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(map); String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath(); File sageFile = new File(sageJournalIssueDateInfoFilePath); if (sageFile.exists()) { String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_" + DataHandlersUtil.getCurrentTimeString() + ".json"; sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld)); } DocumentProcessorUtil.outputStringToFile(json, ShareokdataManager.getSageJournalIssueDateInfoFilePath()); } catch (BeansException | JSONException ex) { ex.printStackTrace(); } } public Map<String, Map<String, String>> updateSageJournalLinks(Map<String, Map<String, String>> journalMap) { Document doc = null; try { doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199") .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0).select("tbody") .get(0).select("tr"); for (Element tr : trs) { Element link = tr.select("td").get(1).select("a").get(0); String journalName = link.text(); String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href"); String[] linkInfo = journalLink.split("/"); String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + linkInfo[linkInfo.length - 1]; if (null == journalMap.get(journalName)) { Map<String, String> infoMap = new HashMap<>(); infoMap.put("homeLink", journalLink); infoMap.put("issueLink", journalIssuesLink); journalMap.put(journalName, infoMap); } else { Map<String, String> infoMap = journalMap.get(journalName); if (null == infoMap.get("homeLink")) { infoMap.put("homeLink", journalLink); } if (null == infoMap.get("issueLink")) { infoMap.put("issueLink", journalIssuesLink); } } } } catch (Exception ex) { ex.printStackTrace(); } return journalMap; } public String parsingSageJournalIssueDate(String oldDate) { String originalOldDate = oldDate; String newDate = ""; try { if (!Character.isDigit(oldDate.charAt(0))) { oldDate = "01 " + oldDate; } if (oldDate.contains("/")) { String[] dataInfo = oldDate.split("/"); String dateInfo1 = dataInfo[0].trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1.split(" "); date = dateInfo1Arr[0]; month1 = dateInfo1Arr[1]; String month2 = dataInfo[1].split("\\s+")[0]; String year = dataInfo[1].split("\\s+")[1]; String date1 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month1 + " " + year); String date2 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month2 + " " + year); newDate = date1 + "::" + date2; } else if (oldDate.contains("-")) { String[] dataInfo = oldDate.split("-"); String dateInfo1 = dataInfo[0].trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1.split(" "); date = dateInfo1Arr[0].trim(); month1 = dateInfo1Arr[1].trim(); String month2 = dataInfo[1].split("\\s+")[0]; String year = dataInfo[1].split("\\s+")[1]; String date1 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month1 + " " + year); String date2 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month2 + " " + year); newDate = date1 + "::" + date2; } else if (oldDate.contains("&")) { String[] dataInfo = oldDate.split("&"); String dateInfo1 = dataInfo[0].trim(); String dateInfo2 = dataInfo[1].trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1.split(" "); date = dateInfo1Arr[0].trim(); month1 = dateInfo1Arr[1].trim(); String month2 = dateInfo2.split("\\s+")[0]; String year = dateInfo2.split("\\s+")[1]; String date1 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month1 + " " + year); String date2 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month2 + " " + year); newDate = date1 + "::" + date2; } } catch (ParseException | ArrayIndexOutOfBoundsException ex) { System.out.println("Cannot parse the date: " + originalOldDate); ex.printStackTrace(); } return newDate; } /** * Somehow some of the issue keys have comma after the issue digit number. Need to find out the reason. */ public void cleanIssueKeys() { Map<String, Map<String, String>> map = getSavedSageJournalVolIssueDateInformation(); String journalName = ""; String data = ""; try { for (String journal : map.keySet()) { journalName = journal; Map<String, String> journalMap = map.get(journal); data = journalMap.get("data"); if (DocumentProcessorUtil.isEmptyString(data)) { System.out.println("Journal " + journal + " has not data to process!"); continue; } JSONObject dataJson = new JSONObject(data); for (String volume : dataJson.keySet()) { JSONObject volJson = dataJson.getJSONObject(volume); List<String> wrongKeys = new ArrayList<>(); for (String issue : volJson.keySet()) { if (issue.contains(",")) { wrongKeys.add(issue); } } for (String wrongKey : wrongKeys) { String newKey = wrongKey.split("\\,+")[0]; String wrongKeyData = volJson.getString(wrongKey); volJson.remove(wrongKey); volJson.put(newKey, wrongKeyData); } } data = dataJson.toString(); journalMap.put("data", data); } System.out.println("The data map has been cleaned"); ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(map); String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath(); File sageFile = new File(sageJournalIssueDateInfoFilePath); if (sageFile.exists()) { String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_" + DataHandlersUtil.getCurrentTimeString() + ".json"; sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld)); } DocumentProcessorUtil.outputStringToFile(json, ShareokdataManager.getSageJournalIssueDateInfoFilePath()); } catch (NullPointerException ex) { System.out .println("This data " + data + " for journal " + journalName + " gives null pointer exception"); ex.printStackTrace(); } catch (JsonProcessingException ex) { logger.error("JSON processing exception", ex); } catch (IOException ex) { logger.error("IO exception", ex); } } // public void updateSageJournalIssueDatesData(Map<String, Map<String, String>> journalMap) throws InterruptedException, IOException{ // int kk = 0; // mainLoop: // for(String journal : journalMap.keySet()){ // System.out.println("Print out journal "+journal+" information :"); // Map<String, String> journalInfoMap = journalMap.get(journal); // for(String key : journalInfoMap.keySet()){ // if(key.equals("issueLink")){ // Document loiDdoc = null; // try{ // loiDdoc = Jsoup.connect(journalInfoMap.get(key)) // .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") // .cookie("auth", "token") // .timeout(300000) // .get(); // } // catch(HttpStatusException ex){ // ex.printStackTrace(); // break; // } // Thread.sleep(2200); // if(null != loiDdoc){ // Map<String, Map<String, String>> dataMap; // if(null != journalMap.get(journal).get("data")){ // dataMap = DataUtil.getMapFromJson(journalMap.get(journal).get("data")); // } // else{ // dataMap = new HashMap<>(); // } // Elements decaseDivs = loiDdoc.select("div.decade"); // if(null != decaseDivs && decaseDivs.size() > 0){ // for(Element decade : decaseDivs){ // Elements yearsDiv = decade.select("div.years").get(0).children(); // if(null != yearsDiv && yearsDiv.size() > 0){ // for(Element yearEle : yearsDiv){ // Elements volumesDiv = yearEle.select("div.volumes").get(0).children(); // if(null != volumesDiv && volumesDiv.size() > 0){ // for(Element volumeEle : volumesDiv){ // String volume = volumeEle.select("a").get(0).text().trim().split("Volume")[1].trim(); // Elements issueInfoDivEles = volumeEle.select("div.js_issue"); // if(null != issueInfoDivEles && issueInfoDivEles.size() > 0){ // for(Element issueInfoDiv : issueInfoDivEles){ // String issueText = issueInfoDiv.select("a").get(0).text(); // issueText = issueText.split(", ")[0].split("Issue")[1].trim(); // String oldIssueDate = ""; // String issueDate = ""; // if(NO_ARTICLE_PUB_DATE_JOURNALS_LIST.contains(journal)){ // issueDate = "01 " + issueInfoDiv.select("span.loiIssueCoverDateText").get(0).text().trim(); // oldIssueDate = issueDate; // try{ // issueDate = "01 " + issueInfoDiv.select("span.loiIssueCoverDateText").get(0).text().trim(); // oldIssueDate = issueDate; // issueDate = DataHandlersUtil.convertFullMonthDateStringFormat(issueDate); // } // catch(ParseException ex){ //// if(!journal.contains("OMEGA - Journal of Death and Dying")){ //// continue; //// } // System.out.println("Journal name: "+journal); // System.out.println("Volume: "+volume+", issue: "+issueText); // System.out.println("This date string cannot be parsed: "+oldIssueDate); // ex.printStackTrace(); // continue; // } // // } // else{ // try{ // Element issueLinkEle = issueInfoDiv.select("a").get(0); // String issueLink = issueLinkEle.attr("href"); // Document issueDoc = null; // try{ // issueDoc = Jsoup.connect(issueLink) // .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") // .cookie("auth", "token") // .timeout(300000) // .get(); // } // catch(HttpStatusException ex){ // ex.printStackTrace(); // break mainLoop; // } // Thread.sleep(2200); // Elements articleDivs = issueDoc.select("div.art_title, .linkable"); // String articleLink = SageDataUtil.SAGE_HTTP_PREFIX + articleDivs.get(0).select("a.ref, .nowrap").get(0).attr("href"); // if(articleLink.contains("pdf/")){ // System.out.println("journal: "+journal+" volume="+volume+" issue="+issueText+" has ONLY PDF links!"); // try{ // issueDate = issueInfoDiv.select("span.loiIssueCoverDateText").get(0).text().trim(); // oldIssueDate = issueDate; // if(issueDate.contains("Winter")){ // issueDate = issueDate.replaceAll("Winter", "December"); // } // if(issueDate.contains("Fall") || issueDate.contains("Autumn")){ // issueDate = issueDate.replaceAll("Fall", "September"); // issueDate = issueDate.replaceAll("Autumn", "September"); // } // if(issueDate.contains("Summer")){ // issueDate = issueDate.replaceAll("Summer", "June"); // } // if(issueDate.contains("Spring")){ // issueDate = issueDate.replaceAll("Spring", "March"); // } // if(issueDate.contains("/")){ // String[] dataInfo = issueDate.split("/"); // String dateInfo1 = dataInfo[0].trim(); // String date; // String month1; // String[] dateInfo1Arr = dateInfo1.split(" "); // if(dateInfo1Arr.length == 2){ // date = dateInfo1Arr[0]; // month1 = dateInfo1Arr[1]; // } // else{ // date = "01"; // month1 = dataInfo[0].trim(); // } // String month2 = dataInfo[1].split("\\s+")[0]; // String year = dataInfo[1].split("\\s+")[1]; // String date1 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month1 + " " + year); // String date2 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month2 + " " + year); // issueDate = date1 + "::" + date2; // } // // The Journal of Psychiatry & Law dd MMMM-MMMM yyyy pattern // else if(issueDate.contains("-")){ // String[] dataInfo = issueDate.split("-"); // String dateInfo1 = dataInfo[0].trim(); // String date; // String month1; // String[] dateInfo1Arr = dateInfo1.split(" "); // if(dateInfo1Arr.length == 2){ // date = dateInfo1Arr[0].trim(); // month1 = dateInfo1Arr[1].trim(); // } // else{ // date = "01"; // month1 = dataInfo[0].trim(); // } // String month2 = dataInfo[1].split("\\s+")[0]; // String year = dataInfo[1].split("\\s+")[1]; // String date1 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month1 + " " + year); // String date2 = DataHandlersUtil.convertFullMonthDateStringFormat(date + " " + month2 + " " + year); // issueDate = date1 + "::" + date2; // } // else{ // issueDate = "01 " + issueDate; // issueDate = DataHandlersUtil.convertFullMonthDateStringFormat(issueDate); // } // } // catch(ParseException | ArrayIndexOutOfBoundsException ex){ // System.out.println("Journal name: "+journal); // System.out.println("Volume: "+volume+", issue: "+issueText); // System.out.println("This date string cannot be parsed: "+issueDate); // ex.printStackTrace(); // continue; // } // } // else{ // Document articleDoc = null; // try{ // articleDoc = Jsoup.connect(articleLink) // .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") // .cookie("auth", "token") // .timeout(300000) // .get(); // } // catch(HttpStatusException ex){ // ex.printStackTrace(); // break mainLoop; // } // Thread.sleep(2200); // Element pubDateDiv = articleDoc.select("div.published-dates").get(0); // issueDate = pubDateDiv.text().split("Issue published:")[1].trim(); // oldIssueDate = issueDate; // issueDate = DataHandlersUtil.convertFullMonthDateStringFormat(issueDate); // } // // } // catch(Exception ex){ // logger.error("Cannot get the issue date for journal ="+journal+" volume="+volume+" issue="+issueText+" date="+oldIssueDate, ex); // continue; // } // } // if(DataHandlersUtil.datesCompare(issueDate, "2010-01-01") < 0){ // if(dataMap.size() > 0){ // ObjectMapper mapper = new ObjectMapper(); // String json = mapper.writeValueAsString(dataMap); // journalInfoMap.put("data", json); // } // continue mainLoop; // } // try{ // if(null != dataMap && dataMap.size() > 0 && null != dataMap.get(volume) && null != dataMap.get(volume).get(issueText)){ // continue; // } // else{ // Map<String, String> issueMap = dataMap.get(volume); // if(null == issueMap){ // issueMap = new HashMap<>(); // issueMap.put(issueText, issueDate); // dataMap.put(volume, issueMap); // } // else{ // issueMap.put(issueText, issueDate); // } // System.out.println("This is vol. "+volume+" and issue "+issueText+" and date "+issueDate); // } // } // catch(Exception ex){ // System.out.println("Cannot add the pub date info into data map for vol. "+volume+" and issue "+issueText+" and date "+issueDate); // } // } // } // } // } // } // } // // } // } // if(dataMap.size() > 0){ // ObjectMapper mapper = new ObjectMapper(); // String json = mapper.writeValueAsString(dataMap); // journalInfoMap.put("data", json); // } // } // // } // } // if(kk > 100){ // break; // } // kk++; // } // } }