Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.thesmartweb.vivliocrawlermaven;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.dom4j.Attribute;
import org.dom4j.Element;
import org.dom4j.Namespace;
import org.dom4j.QName;
import se.kb.oai.ore.Metadata;
import se.kb.oai.pmh.Header;
import se.kb.oai.ore.AggregatedResource;
import se.kb.oai.pmh.IdentifiersList;
import se.kb.oai.pmh.MetadataFormat;
import se.kb.oai.pmh.MetadataFormatsList;
import se.kb.oai.pmh.OaiPmhServer;
import se.kb.oai.pmh.Record;
import se.kb.oai.pmh.RecordsList;
import se.kb.oai.pmh.SetsList;
import org.dom4j.tree.*;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import org.json.simple.parser.JSONParser;
import se.kb.oai.OAIException;

/**
 * Harvests every record of an OAI-PMH repository in {@code oai_dc} format,
 * extracts the thesis fields (title, creators, subjects, abstract, date, file
 * URLs, citation and supervisor) and writes them to a pretty-printed JSON file.
 *
 * <p>An instance of this class is a plain holder for the fields of one thesis;
 * {@link #main(String[])} drives the harvest.
 *
 * @author themis
 */
public class VivlioCrawlerMavenMain {

    /** OAI-PMH endpoint that is harvested. */
    private static final String OAI_ENDPOINT = "http://vivliothmmy.ee.auth.gr/cgi/oai2";

    /** Professors file used when no path is given on the command line. */
    private static final String DEFAULT_PROFS_FILE =
            "/home/themis/NetBeansProjects/VivlioCrawlerMaven/src/main/java/com/thesmartweb/vivliocrawlermaven/profs.txt";

    /** Output JSON file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_FILE =
            "/home/themis/NetBeansProjects/VivlioCrawlerMaven/src/main/java/com/thesmartweb/vivliocrawlermaven/VivliothmmyOldRecords.json";

    /** Number of leading title characters an identifier must contain to be treated as the citation. */
    private static final int TITLE_PREFIX_LENGTH = 10;

    /** Number of leading characters of the first file URL that are kept as the thesis URL. */
    private static final int THESIS_URL_LENGTH = 32;

    /*
     * NOTE(review): the two constants below are empty strings in this copy of
     * the source, which makes every replacement that uses them a no-op. The
     * original comments say they stood for the word "and" and for place names
     * ("Thessaloniki", "Greece") - presumably non-ASCII words that were lost in
     * transit. Restore the real words from the upstream source before relying
     * on the fallback supervisor extraction.
     */
    /** Word joining two student names in a citation (the original comment says it stands for "and"). */
    private static final String AND_WORD = "";

    /** Place names stripped from a citation (the original comment mentions "Thessaloniki" and "Greece"). */
    private static final String[] PLACE_WORDS = {"", "", "", ""};

    //the names of the following variables are mostly self-explanatory
    public String title;
    public List<String> creators = new ArrayList<String>();//it will include all the students' names
    //it captures the subjects of each thesis; a HashSet is used to ensure that we capture unique subjects
    public HashSet<String> subjects = new HashSet<String>();
    public String description;//abstract of the thesis
    public String datestring;
    public List<String> thesisFiles = new ArrayList<String>();
    public String thesisURL;
    public String supervisor;
    public String citation;

    /**
     * Resets the scalar fields to empty strings.
     *
     * <p>Despite its name this is an ordinary method, not a constructor: it
     * declares a {@code void} return type, so {@code new VivlioCrawlerMavenMain()}
     * never runs it and the String fields start out as {@code null}. The harvest
     * in {@link #main(String[])} relies on those {@code null} defaults (it tests
     * {@code thesisURL == null} and {@code supervisor == null}), so the method is
     * kept as it is rather than turned into a constructor.
     */
    protected void VivlioCrawlerMavenMain() {
        this.title = "";
        this.description = "";
        this.datestring = "";
        this.thesisURL = "";
        this.supervisor = "";
        this.citation = "";
    }

    /**
     * Harvests the repository and writes the collected records as JSON.
     *
     * @param args the command line arguments; both are optional:
     *             {@code args[0]} is the path of the text file holding one
     *             professor name per line, {@code args[1]} is the path of the
     *             JSON file to write. Missing arguments fall back to the
     *             built-in default paths.
     */
    public static void main(String[] args) {
        String profsFile = (args != null && args.length > 0) ? args[0] : DEFAULT_PROFS_FILE;
        String outputFile = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_FILE;
        try {
            OaiPmhServer server = new OaiPmhServer(OAI_ENDPOINT);
            RecordsList listRecords = server.listRecords("oai_dc");//we capture all the records in oai dc format
            //a list containing all the objects; besides the report below it is
            //not used for now, but created for potential extension of the work
            List<VivlioCrawlerMavenMain> listtotal = new ArrayList<VivlioCrawlerMavenMain>();
            //the names of the professors and former professors, one per line
            List<String> profs = Files.readAllLines(Paths.get(profsFile));
            JSONArray array = new JSONArray();//it is going to be our final total json array

            boolean more = true;//stays true while the server hands out resumption tokens
            while (more) {
                for (Record rec : listRecords.asList()) {
                    VivlioCrawlerMavenMain vc = new VivlioCrawlerMavenMain();
                    Element metadata = rec.getMetadata();
                    if (metadata != null) {
                        readMetadata(vc, metadata, profs, array);
                        listtotal.add(vc);
                    }
                }
                //a resumption token means the server has further pages of records
                if (listRecords.getResumptionToken() != null) {
                    listRecords = server.listRecords(listRecords.getResumptionToken());
                } else {
                    more = false;
                }
            }

            //we print which records did not have a supervisor
            for (VivlioCrawlerMavenMain vctest : listtotal) {
                if (vctest.supervisor == null) {
                    System.out.println(vctest.title);
                    System.out.println(vctest.citation);
                }
            }

            writePrettyJson(array, outputFile);
        } catch (OAIException | IOException e) {
            System.out.println("Exception: " + e);
        }
    }

    /**
     * Copies the Dublin Core elements of one record into {@code vc}. Elements are
     * handled in document order; every identifier recognised as the citation
     * appends a JSON snapshot of {@code vc} to {@code array}.
     */
    private static void readMetadata(VivlioCrawlerMavenMain vc, Element metadata,
            List<String> profs, JSONArray array) {
        @SuppressWarnings("unchecked") // elements() returns a raw List in older dom4j releases
        List<Element> elements = metadata.elements();
        for (Element element : elements) {
            String name = element.getName();
            String value = element.getStringValue();
            if (name.equalsIgnoreCase("title")) {
                vc.title = normalizeTitle(value);
            } else if (name.equalsIgnoreCase("creator")) {
                vc.creators.add(value);//we capture the students' names
            } else if (name.equalsIgnoreCase("subject")) {
                vc.subjects.add(value);//we capture the subjects
            } else if (name.equalsIgnoreCase("description")) {
                vc.description = value;//we capture the abstract
            } else if (name.equalsIgnoreCase("date")) {
                vc.datestring = value;
            } else if (name.equalsIgnoreCase("identifier")) {
                readIdentifier(vc, value, profs, array);
            }
        }
    }

    /**
     * Trims the title, removes carriage returns and line feeds, and makes sure it
     * ends with a dot so that all titles are uniform.
     */
    private static String normalizeTitle(String raw) {
        String cleaned = raw.trim().replaceAll("(\\r|\\n)", "");
        return cleaned.endsWith(".") ? cleaned : cleaned + ".";
    }

    /**
     * Handles one identifier value: records it as a thesis file when it is an
     * http URL and, when it contains the beginning of the title, treats it as the
     * citation, derives the supervisor from it and appends the record to
     * {@code array}.
     */
    private static void readIdentifier(VivlioCrawlerMavenMain vc, String value,
            List<String> profs, JSONArray array) {
        if (value.contains("http://")) {
            vc.thesisFiles.add(value);//we capture the url of the thesis whole file
            if (vc.thesisURL == null) {
                //clamp so that a URL shorter than the expected prefix cannot throw
                vc.thesisURL = value.substring(0, Math.min(THESIS_URL_LENGTH, value.length()));
            }
        }

        //without a title (identifier listed before the title) there is nothing to match against
        if (vc.title == null || vc.title.isEmpty()) {
            return;
        }
        //if the identifier contains the title then it must be the citation;
        //titles shorter than the prefix length are compared as a whole
        String titlePrefix = vc.title.substring(0, Math.min(TITLE_PREFIX_LENGTH, vc.title.length()));
        if (!value.contains(titlePrefix)) {
            return;
        }

        vc.citation = value;
        //out of the citation we need to extract the supervisor's name
        vc.supervisor = extractSupervisor(vc, value, profs);
        array.add(toJson(vc));
    }

    /**
     * Derives the supervisor's name from a citation. The title and the students'
     * names are removed first; if what is left mentions the first two words of a
     * line of {@code profs}, that line is returned. Otherwise punctuation and the
     * year are stripped from the remainder and the result is returned.
     */
    private static String extractSupervisor(VivlioCrawlerMavenMain vc, String citation,
            List<String> profs) {
        String remainder = citation.replace(vc.title, "");//we remove the title out of the citation
        //if we have two students we remove the first occurrence of the word joining their names
        if (vc.creators.size() == 2) {
            remainder = remainder.replaceFirst(AND_WORD, "");
        }
        //we remove the students' names
        for (String creator : vc.creators) {
            remainder = remainder.replace(creator, "");
        }

        //whitespace-free form used for matching; \s+ also covers \r and \n,
        //and it does not depend on the professor so it is computed once
        String compact = remainder.replaceAll("\\s+", "");
        for (String prof : profs) {
            //we split the professor's name to surname and name because some
            //entries have first the surname and others first the name
            String[] nameParts = prof.trim().split("\\s+");
            if (nameParts.length < 2) {
                continue;//blank or single-word line: cannot be matched on two parts
            }
            if (compact.contains(nameParts[0]) && compact.contains(nameParts[1])) {
                return prof;
            }
        }

        //no known professor matched, so we perform string manipulation to extract the name
        String candidate = remainder.trim();
        for (String place : PLACE_WORDS) {
            if (candidate.contains(place)) {
                candidate = candidate.replaceFirst(place, "");
            }
        }
        for (String punctuation : new String[] {"(", ")", ",", "."}) {
            candidate = candidate.replace(punctuation, "").trim();
        }
        //we remove the year and then we should be left only with the supervisor's name
        if (vc.datestring != null && vc.datestring.length() >= 4) {
            candidate = candidate.replace(vc.datestring.substring(0, 4), "").trim();
        }
        return candidate;
    }

    /**
     * Builds the JSON object of one thesis. The creators and thesis-file lists
     * are added by reference, each as the single element of its array, and the
     * subjects are added as a copy; this is the layout the output file has
     * always had.
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject and JSONArray are raw collections
    private static JSONObject toJson(VivlioCrawlerMavenMain vc) {
        JSONObject obj = new JSONObject();
        obj.put("title", vc.title);
        obj.put("description", vc.description);

        JSONArray creatorsArray = new JSONArray();
        creatorsArray.add(vc.creators);
        obj.put("creators", creatorsArray);

        JSONArray subjectsArray = new JSONArray();
        subjectsArray.add(new ArrayList<String>(vc.subjects));
        obj.put("subjects", subjectsArray);

        obj.put("datestring", vc.datestring);

        JSONArray thesisFilesArray = new JSONArray();
        thesisFilesArray.add(vc.thesisFiles);
        obj.put("thesisFiles", thesisFilesArray);

        obj.put("thesisURL", vc.thesisURL);
        obj.put("supervisor", vc.supervisor);
        obj.put("citation", vc.citation);
        return obj;
    }

    /**
     * Wraps {@code array} in a top-level object, pretty-prints it with Gson and
     * writes it to {@code outputFile} as UTF-8. An I/O failure is reported on
     * standard output and does not propagate.
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject is a raw map
    private static void writePrettyJson(JSONArray array, String outputFile) {
        JSONObject jsonObject = new JSONObject();
        jsonObject.put("VivliothmmyOldArray", array);
        JsonParser parser = new JsonParser();
        JsonObject json = parser.parse(jsonObject.toJSONString()).getAsJsonObject();
        Gson gson = new GsonBuilder().setPrettyPrinting().create();
        String prettyJson = gson.toJson(json);
        //try-with-resources closes the file even when the write fails
        try (Writer out = Files.newBufferedWriter(Paths.get(outputFile), StandardCharsets.UTF_8)) {
            out.write(prettyJson);
        } catch (IOException e) {
            System.out.println("Exception: " + e);
        }
    }
}