Java tutorial
/* * Copyright (c) 2016 Washington State Department of Transportation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> * */ package gov.wa.wsdot.cms; import gov.wa.wsdot.cms.shared.ChannelsAndPostingsBase; import gov.wa.wsdot.cms.shared.ResourceItem; import gov.wa.wsdot.cms.utils.Migration; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.google.gson.Gson; import com.mongodb.BasicDBObject; import com.mongodb.DB; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import com.mongodb.MongoClient; /** * Dump all CMS channels with parent references. 
*/ public class App { static DocumentBuilderFactory factory; static DocumentBuilder builder; static HashMap<String, ResourceItem> nodeGuidHashMap = new HashMap<String, ResourceItem>(); static String sitePath = "/sites/default/files/"; // Location of Drupal files. static String archiveFolder = ""; // Where the files are currently stored. Needs to be dynamic. static String outputFile = ""; // This should be dynamically generated based on the templates being parsed. static String locationUri = "public://import"; // Default. Location of files to be imported. Either local or on the test web server. static HashMap<String, ChannelsAndPostingsBase> postingsHashMap = new HashMap<String, ChannelsAndPostingsBase>(); static HashMap<String, Integer> templatesCountMap = new HashMap<String, Integer>(); // Keep track of the number of each template. static HashMap<String, Integer> templatesExcludedCountMap = new HashMap<String, Integer>(); // Keep track of the number of each excluded template. static HashMap<String, Integer> templatesExpiredCountMap = new HashMap<String, Integer>(); // Keep track of the number of each excluded template. static HashMap<String, Integer> resourcesCountMap = new HashMap<String, Integer>(); // Keep track of the number of each resource. static HashMap<String, String> templatesMap = new HashMap<String, String>(); static MongoClient mongo; static DB db; static DBCollection channelCollection; static DBCollection postingCollection; static DBCollection resourcesCollection; static Set<String> highways = new HashSet<String>(); static HashMap<String, String> redirectsMap = new HashMap<String, String>(); // legacyURL -> redirectURL static boolean reportOnly; /** * The main method for the program * * @param args command-line arguments. 
* @throws ParserConfigurationException * @throws SAXException * @throws IOException */ public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { int i = 0, j; String arg; char flag; boolean vflag = false; reportOnly = false; while (i < args.length && args[i].startsWith("-")) { arg = args[i++]; // Use this for more detailed output. if (arg.equals("-verbose")) { System.out.println("Verbose mode on"); vflag = true; } // Use this to check for arguments that require more arguments. else if (arg.equals("-uri")) { if (i < args.length) { locationUri = args[i++]; } else { // e.g. http://test.wsdot.wa.gov/wsdot/migration System.err.println("-uri requires a local or network location"); } if (vflag) System.out.println("Uri = " + locationUri); } // Use this to check for a series of flag arguments. else { for (j = 1; j < arg.length(); j++) { flag = arg.charAt(j); switch (flag) { case 'r': if (vflag) System.out.println("Report only"); // Does not parse nodes or build content reportOnly = true; break; default: System.err.println("App: illegal option " + flag); break; } } } } buildTemplatesMap(); // filename is the only required parameter. if (i == args.length) { System.err.println("Usage: App [-verbose] [-xnr] [-uri aUri] filename"); System.exit(0); } else { archiveFolder = args[i]; if (vflag) System.out.println("Archive folder: " + archiveFolder); } // Factory API which allows us to get a parser which produces DOM object // trees from XML documents. factory = DocumentBuilderFactory.newInstance(); builder = factory.newDocumentBuilder(); mongo = new MongoClient("localhost"); db = mongo.getDB("public"); channelCollection = db.getCollection("channels"); postingCollection = db.getCollection("postings"); resourcesCollection = db.getCollection("resources"); // TODO automate this selection. 
Document document = builder.parse(new File(archiveFolder + File.separator + "Cha.xml")); document.getDocumentElement().normalize(); BuildChannels(document); // CMS containers only import from the Cha.xml file BuildPostings(document); // CMS postings / pages from Cha.xml and Fol.xml files for content. } /** * Add all the templates to the data structure */ private static void buildTemplatesMap() { /** * Public templates */ templatesMap.put("{DFF8BE08-33C2-4C8B-912F-75DA9E798298}", "Default"); // Location="/Templates/Standard/" templatesMap.put("{461E556C-E713-48BB-8B56-B9C9C6B8529D}", "DefaultwNoLeft"); // Location="/Templates/Standard/" templatesMap.put("{78FF8841-6880-444C-9BFA-56732F591D17}", "NewsItem"); // Location="/Templates/News/" templatesMap.put("{00857788-CFB2-4AB6-A616-44650EAED222}", "ProjectSpecifics"); // Location="/Templates/Projects/" templatesMap.put("{CAC413A0-56C7-4EB8-83C9-D2C9E704800A}", "HomePage"); // Location="/Templates/Home/" templatesMap.put("{06550835-2580-403E-8D28-64A7866E5EDD}", "NewsRoom"); // Location="/Templates/News/" templatesMap.put("{2ADE8FE3-7F45-4CED-BE89-A5BD1DC4E0E7}", "NewsArchive"); // Location="/Templates/News/" templatesMap.put("{ED4E5A97-4608-4D7F-8ECA-CC5CE6C3B0D3}", "NewsHome"); // Location="/Templates/News/" templatesMap.put("{302FD56C-9930-4A45-969C-29901D086242}", "CompletedProject"); // Location="/Templates/Projects/" templatesMap.put("{5E703BE3-B25C-4841-95AE-C96B695BC513}", "ProjectFeedback"); // Location="/Templates/Projects/" templatesMap.put("{60C905CE-E8F3-46BE-A40E-C31CA064C999}", "oldcompleted"); // Location="/Templates/Projects/" templatesMap.put("{66EA195A-46C1-4072-86AE-0480A7693696}", "ProjectTemplate"); // Location="/Templates/Projects/" templatesMap.put("{882B84F7-EE64-478A-9A2F-2EAF6BF4AFAD}", "ProjectHome"); // Location="/Templates/Projects/" templatesMap.put("{AB5D347F-4C85-4648-8204-F37C465CC240}", "ProjectHomeFacetmap"); // Location="/Templates/Projects/" 
templatesMap.put("{B5034FF2-6088-4D13-B2D5-0D33FBED6A12}", "QuarterlyReport"); // Location="/Templates/Projects/" - 22
        // Public Transportation templates
        templatesMap.put("{8FE13678-A62C-4B62-9E32-7E09625CE2D3}", "Monthly Reports Home"); // Location="/Templates/Public Transportation/"
        templatesMap.put("{D5A341D3-4CAC-40D9-BB3D-E581A0D9F93F}", "Monthly Report"); // Location="/Templates/Public Transportation/"
        // Traffic templates
        templatesMap.put("{3850D8CD-9459-4BF9-BA82-42C46448BC51}", "TrafficContent"); // Location="/Templates/Traffic/"
        templatesMap.put("{3A0C5395-E681-48AF-AE0E-B926B531B1BA}", "TextOnly"); // Location="/Templates/Traffic/"
        // Real estate templates
        templatesMap.put("{070C8446-4AC2-4084-A9A2-C3B42C686C3A}", "PropertyListFacetmap"); // Location="/Templates/RealEstate/"
        // Administrative templates
        templatesMap.put("{67E38140-7C4A-4984-A6B1-ECC3C698CB92}", "ScraperTest"); // Location="/Templates/Administrative/"
        templatesMap.put("{688DA8B1-D561-4DE2-9E35-4268570EFBC8}", "CrawlPage"); // Location="/Templates/Administrative/"
        templatesMap.put("{CCCC8256-DA97-4659-A06D-9AD1BACBD6BC}", "PostingStatus"); // Location="/Templates/Administrative/"
        templatesMap.put("{F4279E09-2113-4DF5-B98F-D49B756CE7D9}", "PlaceholderResources"); // Location="/Templates/Administrative/"
        // Aviation templates
        templatesMap.put("{3C41E93A-6917-49A4-AAE5-52EEB96D4453}", "Certificate of Status"); // Location="/Templates/Aviation/"
        // Feedback templates
        templatesMap.put("{8D71BB62-9FA1-42B4-B255-3E6D9E577F97}", "HOV Dispute"); // Location="/Templates/Feedback/"
        templatesMap.put("{A79169D4-BA75-40AB-A315-0C4B464A65FE}", "HOV Report"); // Location="/Templates/Feedback/"
        templatesMap.put("{B6854085-DB6C-4950-A771-C343BF38A7EA}", "CVISN Form"); // Location="/Templates/Feedback/"
        templatesMap.put("{E572F52B-B545-45E9-9EC1-BA5A09DAA823}", "Feedback"); // Location="/Templates/Feedback/"
        templatesMap.put("{F381A6CC-D14A-491B-BE41-FCA474C65D74}", "Web Feedback"); // Location="/Templates/Feedback/"
        templatesMap.put("{FF56C752-58E5-4692-BD9A-3A0538C560AD}", "Graffiti Feedback"); // Location="/Templates/Feedback/"
templatesMap.put("{1493A417-5C0C-457B-B5F1-5FC70EB05C13}", "Redirect"); // Location="/Templates/Standard/"
        templatesMap.put("{89BC0F62-6974-49ED-850C-BA6BB9839370}", "Training"); // Location="/Templates/Standard/"
        templatesMap.put("{A114DAFA-DFB0-48CC-B133-E8C29251E901}", "DefaultwUpdateBox"); // Location="/Templates/Standard/"
        templatesMap.put("{A3FD3A59-1EF6-49B8-8357-4728B9FFA36C}", "ResourceList"); // Location="/Templates/Standard/"
        templatesMap.put("{320598FA-2DBE-4DF8-A6C7-E0BDAD394A3C}", "Abstract"); // Location="/Templates/Publications/"
        templatesMap.put("{C7DD44E7-16E5-4EDD-A630-E5B4EC316A60}", "AbstractsList"); // Location="/Templates/Publications/"
        templatesMap.put("{46978165-408D-4F68-AEC2-1FB11FC79D59}", "Abstract"); // Location="/Templates/Research/"
        templatesMap.put("{74A5FFA4-F529-4A22-839F-494C22634805}", "AbstractsList"); // Location="/Templates/Research/"
        templatesMap.put("{2343FC1F-F44D-4EF0-AB19-88D2DF435D4D}", "FeaturedEmployeeListing"); // Location="/Templates/Employment/"
        templatesMap.put("{94967861-7B8F-4634-B72D-D6EDFA6D6905}", "HomePage"); // Location="/Templates/Employment/"
        templatesMap.put("{9F061DB7-C5C7-4615-B280-17520EAB5B5A}", "FeaturedEmployee"); // Location="/Templates/Employment/"
        templatesMap.put("{D9BC634C-2BF1-4831-9908-4BEACB2C3505}", "JobsListing"); // Location="/Templates/Employment/"
        templatesMap.put("{DB90941A-0FF2-40D7-8C99-F38F811E6B37}", "Redirect"); // Location="/Templates/Standard/Redirect/"
        templatesMap.put("{2EA65D29-2828-468C-AC7E-2DC7BDF077B3}", "CompletedProjectTemplate"); // Location="/Templates/Projects/oldcompleted/"
        templatesMap.put("{D8B8A0B1-F728-42BF-A796-78A8AD076816}", "DefaultTraining"); // Location="/Templates/Standard/Training/"
    }

    /**
     * Build a list of all channels and their parent nodes, print each
     * channel's path to standard output, and save the list to
     * {@code channelCollection}.
     * <p>
     * Every {@code Channel} element is read through the attributes of its
     * first {@code Version} child. The {@code Location} attribute has its
     * first and last character dropped, every {@code "Channels"} /
     * {@code "Channels/"} occurrence removed, and its remaining path segments
     * joined with {@code ";"}. Channels whose {@code Location} is shorter
     * than two characters are skipped.
     *
     * @param document parsed {@code Cha.xml} document
     */
    private static void BuildChannels(Document document) {
        List<ChannelsAndPostingsBase> parentReferences = new ArrayList<ChannelsAndPostingsBase>(); // Name, Parent
        NodeList channelNodeList = document.getElementsByTagName("Channel");
        int channelNodeListLength = channelNodeList.getLength();

        if (reportOnly) {
            System.out.println();
            System.out.println("=== REPORT ONLY ===");
        }

        System.out.println();
        System.out.println("Number of Channel nodes: " + channelNodeListLength);
        System.out.println();

        // Build a list of channel names and their parent nodes.
        for (int i = 0; i < channelNodeListLength; i++) {
            Element element = (Element) channelNodeList.item(i);
            String guid = element.getAttribute("GUID");

            // Look the Version attributes up once instead of once per attribute.
            NamedNodeMap version = element.getElementsByTagName("Version").item(0).getAttributes();
            int internalId = Integer.parseInt(version.getNamedItem("InternalID").getNodeValue());
            String name = version.getNamedItem("Name").getNodeValue();
            String parent = version.getNamedItem("Parent").getNodeValue();
            String rawLocation = version.getNamedItem("Location").getNodeValue();

            // Root channel doesn't have leading and trailing slashes. Ignore it.
            // (A location shorter than two characters cannot have both stripped.)
            if (rawLocation.length() < 2) {
                continue;
            }

            // Remove leading and trailing slashes.
            String location = rawLocation.substring(1, rawLocation.length() - 1);
            location = location.replaceAll("Channels/?", "");
            String[] locationArray = location.split("/");
            location = Arrays.toString(locationArray).replace("[", "").replace("]", "").replace(", ", ";");

            int level;
            if (location.isEmpty()) {
                level = 0;
                System.out.println(name); // Root channel so just print the channel name
            } else {
                level = locationArray.length;
                System.out.println(location + ";" + name);
            }

            parentReferences.add(new ChannelsAndPostingsBase(internalId, guid, name, parent, level, location, 1));
        }

        saveChannels(parentReferences, channelCollection);
    }

    /**
     * Build a list of postings, their parents nodes, content and associated resources
     *
     * @param document
     * @throws ParserConfigurationException
     * @throws SAXException
     * @throws IOException
     */
    private static void BuildPostings(Document document)
            throws ParserConfigurationException, SAXException, IOException {
        Map<String, String> channelGuids = new HashMap<String, String>(); // GUID, Name
        List<String> pagesToExclude = new ArrayList<String>();
        List<String> templatesToExclude = new ArrayList<String>();
        HashMap<String, HashMap<String, String>> controlsHashMap = null;
        HashMap<String, ResourceItem> resourcesHashMap = null;
        HashMap<String, String> genericPropertiesHashMap = null;

        /**
         * Public templates
         */
        // Ignore postings based on these templates
        templatesToExclude.add("{CAC413A0-56C7-4EB8-83C9-D2C9E704800A}"); // Location="/Templates/Home/HomePage"
        templatesToExclude.add("{06550835-2580-403E-8D28-64A7866E5EDD}");
// Location="/Templates/News/NewsRoom"
        templatesToExclude.add("{2ADE8FE3-7F45-4CED-BE89-A5BD1DC4E0E7}"); // Location="/Templates/News/NewsArchive"
        templatesToExclude.add("{ED4E5A97-4608-4D7F-8ECA-CC5CE6C3B0D3}"); // Location="/Templates/News/NewsHome"
        templatesToExclude.add("{302FD56C-9930-4A45-969C-29901D086242}"); // Location="/Templates/Projects/CompletedProject"
        templatesToExclude.add("{5E703BE3-B25C-4841-95AE-C96B695BC513}"); // Location="/Templates/Projects/ProjectFeedback"
        templatesToExclude.add("{60C905CE-E8F3-46BE-A40E-C31CA064C999}"); // Location="/Templates/Projects/oldcompleted"
        templatesToExclude.add("{882B84F7-EE64-478A-9A2F-2EAF6BF4AFAD}"); // Location="/Templates/Projects/ProjectHome"
        templatesToExclude.add("{AB5D347F-4C85-4648-8204-F37C465CC240}"); // Location="/Templates/Projects/ProjectHomeFacetmap"
        templatesToExclude.add("{B5034FF2-6088-4D13-B2D5-0D33FBED6A12}"); // Location="/Templates/Projects/QuarterlyReport"
        templatesToExclude.add("{8FE13678-A62C-4B62-9E32-7E09625CE2D3}"); // Location="/Templates/Public Transportation/Monthly Reports Home"
        templatesToExclude.add("{D5A341D3-4CAC-40D9-BB3D-E581A0D9F93F}"); // Location="/Templates/Public Transportation/Monthly Report"
        templatesToExclude.add("{3850D8CD-9459-4BF9-BA82-42C46448BC51}"); // Location="/Templates/Traffic/TrafficContent"
        templatesToExclude.add("{3A0C5395-E681-48AF-AE0E-B926B531B1BA}"); // Location="/Templates/Traffic/TextOnly"
        templatesToExclude.add("{070C8446-4AC2-4084-A9A2-C3B42C686C3A}"); // Location="/Templates/RealEstate/PropertyListFacetmap"
        templatesToExclude.add("{67E38140-7C4A-4984-A6B1-ECC3C698CB92}"); // Location="/Templates/Administrative/ScraperTest"
        templatesToExclude.add("{688DA8B1-D561-4DE2-9E35-4268570EFBC8}"); // Location="/Templates/Administrative/CrawlPage"
        templatesToExclude.add("{CCCC8256-DA97-4659-A06D-9AD1BACBD6BC}"); // Location="/Templates/Administrative/PostingStatus"
        templatesToExclude.add("{F4279E09-2113-4DF5-B98F-D49B756CE7D9}"); // Location="/Templates/Administrative/PlaceholderResources"
        templatesToExclude.add("{3C41E93A-6917-49A4-AAE5-52EEB96D4453}"); // Location="/Templates/Aviation/Certificate of Status"
        templatesToExclude.add("{8D71BB62-9FA1-42B4-B255-3E6D9E577F97}"); // Location="/Templates/Feedback/HOV Dispute"
        templatesToExclude.add("{A79169D4-BA75-40AB-A315-0C4B464A65FE}"); // Location="/Templates/Feedback/HOV Report"
        templatesToExclude.add("{B6854085-DB6C-4950-A771-C343BF38A7EA}"); // Location="/Templates/Feedback/CVISN Form"
        templatesToExclude.add("{E572F52B-B545-45E9-9EC1-BA5A09DAA823}"); // Location="/Templates/Feedback/Feedback"
        templatesToExclude.add("{F381A6CC-D14A-491B-BE41-FCA474C65D74}"); // Location="/Templates/Feedback/Web Feedback"
        templatesToExclude.add("{FF56C752-58E5-4692-BD9A-3A0538C560AD}"); // Location="/Templates/Feedback/Graffiti Feedback"
        templatesToExclude.add("{89BC0F62-6974-49ED-850C-BA6BB9839370}"); // Location="/Templates/Standard/Training"
        templatesToExclude.add("{A114DAFA-DFB0-48CC-B133-E8C29251E901}"); // Location="/Templates/Standard/DefaultwUpdateBox"
        templatesToExclude.add("{A3FD3A59-1EF6-49B8-8357-4728B9FFA36C}"); // Location="/Templates/Standard/ResourceList"
        templatesToExclude.add("{320598FA-2DBE-4DF8-A6C7-E0BDAD394A3C}"); // Location="/Templates/Publications/Abstract"
        templatesToExclude.add("{C7DD44E7-16E5-4EDD-A630-E5B4EC316A60}"); // Location="/Templates/Publications/AbstractsList"
        templatesToExclude.add("{74A5FFA4-F529-4A22-839F-494C22634805}"); // Location="/Templates/Research/AbstractsList"
        templatesToExclude.add("{2343FC1F-F44D-4EF0-AB19-88D2DF435D4D}"); // Location="/Templates/Employment/FeaturedEmployeeListing"
        templatesToExclude.add("{94967861-7B8F-4634-B72D-D6EDFA6D6905}"); // Location="/Templates/Employment/HomePage"
        templatesToExclude.add("{9F061DB7-C5C7-4615-B280-17520EAB5B5A}"); // Location="/Templates/Employment/FeaturedEmployee"
        templatesToExclude.add("{D9BC634C-2BF1-4831-9908-4BEACB2C3505}"); // Location="/Templates/Employment/JobsListing"
        templatesToExclude.add("{2EA65D29-2828-468C-AC7E-2DC7BDF077B3}"); // Location="/Templates/Projects/oldcompleted/CompletedProjectTemplate"
        templatesToExclude.add("{D8B8A0B1-F728-42BF-A796-78A8AD076816}"); // Location="/Templates/Standard/Training/DefaultTraining"

        // Build the channel guids from channel containers already in the database.
        // NOTE(review): channelGuids is filled here but not read again in this method — confirm it is still needed.
        BasicDBObject query = new BasicDBObject("isChannel", 1);
        DBCursor cursor = channelCollection.find(query);
        while (cursor.hasNext()) {
            DBObject dbObject = cursor.next();
            channelGuids.put(dbObject.get("guid").toString(), dbObject.get("name").toString());
        }

        NodeList postingNodeList = document.getElementsByTagName("Posting");
        int postingNodeListLength = postingNodeList.getLength();
        System.out.println("Number of Posting nodes: " + postingNodeListLength);

        // Build a list of posting names and their parent nodes.
        // Each posting is keyed in postingsHashMap by its Shortcut attribute, which the
        // page loop further down looks up using the Page element's GUID.
        for (int i = 0; i < postingNodeListLength; i++) {
            Node channelNode = postingNodeList.item(i);
            Element element = (Element) channelNode;
            String guid = element.getAttribute("GUID");
            int internalId = Integer.parseInt(element.getElementsByTagName("Version").item(0).getAttributes()
                    .getNamedItem("InternalID").getNodeValue());
            String name = element.getElementsByTagName("Version").item(0).getAttributes().getNamedItem("Name")
                    .getNodeValue();
            String parent = element.getElementsByTagName("Version").item(0).getAttributes().getNamedItem("Parent")
                    .getNodeValue();
            String shortcut = element.getElementsByTagName("Version").item(0).getAttributes()
                    .getNamedItem("Shortcut").getNodeValue();
            String template = element.getElementsByTagName("Version").item(0).getAttributes()
                    .getNamedItem("TemplateGuid").getNodeValue();
            String expireDate = element.getElementsByTagName("Version").item(0).getAttributes()
                    .getNamedItem("Expiredate").getNodeValue();

            // Skip templates which are summary or rollup types and contain little or no content.
            // These will be generated separately in Drupal.
            if (templatesToExclude.contains(template)) {
                pagesToExclude.add(shortcut);
                if (templatesExcludedCountMap.containsKey(template)) {
                    templatesExcludedCountMap.put(template, templatesExcludedCountMap.get(template) + 1);
                } else {
                    templatesExcludedCountMap.put(template, 1);
                }
                continue;
            } else if (!expireDate.equalsIgnoreCase("401769")) { // Year 3000. Any other date, the page is expired.
                pagesToExclude.add(shortcut);
                if (templatesExpiredCountMap.containsKey(template)) {
                    templatesExpiredCountMap.put(template, templatesExpiredCountMap.get(template) + 1);
                } else {
                    templatesExpiredCountMap.put(template, 1);
                }
                continue;
            } else {
                if (templatesCountMap.containsKey(template)) {
                    templatesCountMap.put(template, templatesCountMap.get(template) + 1);
                } else {
                    templatesCountMap.put(template, 1);
                }
            }

            try {
                // Remove leading and trailing slashes.
                String location = element.getElementsByTagName("Version").item(0).getAttributes()
                        .getNamedItem("Location").getNodeValue()
                        .substring(1, element.getElementsByTagName("Version").item(0).getAttributes()
                                .getNamedItem("Location").getNodeValue().length() - 1);
                location = location.replaceAll("Channels/?", "");
                String[] locationArray = location.split("/");
                // Path segments are joined with ">>" here; the page loop below turns ">>" back into "/".
                location = Arrays.toString(locationArray).replace("[", "").replace("]", "").replace(", ", ">>");
                int level;
                if (location.isEmpty()) {
                    level = 0;
                } else {
                    level = locationArray.length;
                }
                ChannelsAndPostingsBase posting = new ChannelsAndPostingsBase();
                posting.setUid(internalId);
                posting.setGuid(guid);
                posting.setName(name);
                posting.setParent(parent);
                posting.setLevel(level);
                posting.setLocation(location);
                posting.setIsChannel(0);
                posting.setTemplate(template);
                postingsHashMap.put(shortcut, posting);
            } catch (StringIndexOutOfBoundsException e) {
                // Root channel doesn't have leading and trailing slashes. Ignore it.
                // NOTE(review): a posting skipped here was already counted in templatesCountMap and is
                // in neither pagesToExclude nor postingsHashMap — confirm its page cannot reach the loop below.
                continue;
            }
        }

        // Load and parse the Folders and Pages XML document which contains content for the pages.
        // TODO automate this selection.
        Document folDocument = builder.parse(new File(archiveFolder + File.separator + "Fol.xml"));
        folDocument.getDocumentElement().normalize();
        NodeList pageNodeList = folDocument.getElementsByTagName("Page");
        int pageNodeListLength = pageNodeList.getLength();
        System.out.println("Number of Page nodes: " + pageNodeListLength);
        // pagesToExclude is a List, so a shortcut added more than once is subtracted more than once.
        System.out.println("Number of valid pages: " + (pageNodeListLength - pagesToExclude.size()));
        System.out.println();

        // Report-only mode skips the page crawl and content build entirely.
        if (reportOnly) {
        } else {
            for (int j = 0; j < pageNodeListLength; j++) {
                Node pageNode = pageNodeList.item(j);
                Element pageNodeElement = (Element) pageNode;
                String pageGuid = pageNodeElement.getAttribute("GUID");
                if (pagesToExclude.contains(pageGuid))
                    continue;
                String pageName = pageNodeElement.getElementsByTagName("Version").item(0).getAttributes()
                        .getNamedItem("Name").getNodeValue();
                // NOTE(review): postingsHashMap.get(pageGuid) is null when no posting was stored for this
                // page GUID, which makes the next line throw NullPointerException — confirm every
                // non-excluded Page has a matching Posting shortcut.
                postingsHashMap.get(pageGuid).setPageName(pageName);
                String location = postingsHashMap.get(pageGuid).getLocation().replace(">>", "/") + "/";
                postingsHashMap.get(pageGuid).setLegacyURL(location + pageName + ".htm");
                String createdWhen = pageNodeElement.getElementsByTagName("Version").item(0).getAttributes()
                        .getNamedItem("CreatedWhen").getNodeValue();
                postingsHashMap.get(pageGuid)
                        .setCreatedWhen(Migration.convertDays(Double.parseDouble(createdWhen)));
                // Fresh collectors for every page; traverseNodes receives them, then buildContent does.
                controlsHashMap = new HashMap<String, HashMap<String, String>>();
                resourcesHashMap = new HashMap<String, ResourceItem>();
                genericPropertiesHashMap = new HashMap<String, String>();
                traverseNodes(pageNodeList.item(j), pageGuid, controlsHashMap, resourcesHashMap,
                        genericPropertiesHashMap); // Crawl Page node.
                buildContent(pageGuid, controlsHashMap, resourcesHashMap, genericPropertiesHashMap);
            }
        }

        // Print out the template use count.
System.out.println("Pages to process (Based on template)"); System.out.println("------------------------------------"); for (Map.Entry<String, Integer> entry : templatesCountMap.entrySet()) { System.out.println(templatesMap.get(entry.getKey()) + ": " + entry.getValue()); if (templatesMap.get(entry.getKey()).equalsIgnoreCase("Redirect")) { for (Map.Entry<String, String> redirect : redirectsMap.entrySet()) { System.out.println(" " + redirect.getKey() + " -> " + redirect.getValue()); } } } System.out.println(); System.out.println("Pages to ignore (Based on template)"); System.out.println("-----------------------------------"); for (Map.Entry<String, Integer> entry : templatesExcludedCountMap.entrySet()) { System.out.println(templatesMap.get(entry.getKey()) + ": " + entry.getValue()); } System.out.println(); System.out.println("Pages to ignore (Expired)"); System.out.println("-------------------------"); for (Map.Entry<String, Integer> entry : templatesExpiredCountMap.entrySet()) { System.out.println(templatesMap.get(entry.getKey()) + ": " + entry.getValue()); } System.out.println(); System.out.println("Page resources and counts"); System.out.println("-------------------------"); Map<String, Integer> sortedResources = sortByComparator(resourcesCountMap, false); for (Map.Entry<String, Integer> entry : sortedResources.entrySet()) { System.out.println(entry.getKey() + ": " + entry.getValue()); } System.out.println(); if (reportOnly) { } else { savePostings(postingsHashMap, channelCollection); // Should be changed to a taskCollection savePostingsAsJson(postingsHashMap); } } /** * Sort the resources associated with each page * * @param unsortedResources * @param order * @return * @see <a href="http://stackoverflow.com/a/13913206">http://stackoverflow.com/a/13913206</a> */ private static Map<String, Integer> sortByComparator(HashMap<String, Integer> unsortedResources, final boolean order) { List<Entry<String, Integer>> list = new LinkedList<Entry<String, 
Integer>>(unsortedResources.entrySet()); Collections.sort(list, new Comparator<Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { if (order) { return o1.getValue().compareTo(o2.getValue()); } else { return o2.getValue().compareTo(o1.getValue()); } } }); // Convert sorted map back to a Map Map<String, Integer> sortedMap = new LinkedHashMap<String, Integer>(); for (Entry<String, Integer> entry : list) { sortedMap.put(entry.getKey(), entry.getValue()); } return sortedMap; } /** * Save the channel structure to the database * * @param parentReferences * @param collection */ private static void saveChannels(List<ChannelsAndPostingsBase> parentReferences, DBCollection collection) { // Store POJO to MongoDB collection. for (ChannelsAndPostingsBase item : parentReferences) { BasicDBObject doc = new BasicDBObject(); doc.put("uid", item.getUid()); doc.put("guid", item.getGuid()); doc.put("name", item.getName()); doc.put("parent", item.getParent()); doc.put("level", item.getLevel()); doc.put("location", item.getLocation()); doc.put("isChannel", item.getIsChannel()); doc.put("created", new Date()); BasicDBObject updateQuery = new BasicDBObject("uid", item.getUid()); collection.update(updateQuery, doc, true, false); } System.out.println(); System.out.println("All done saving channels."); System.out.println(); } /** * Save the posting structure and content to the database * * @param postingsHashMap * @param collection */ private static void savePostings(HashMap<String, ChannelsAndPostingsBase> postingsHashMap, DBCollection collection) { for (Entry<String, ChannelsAndPostingsBase> entry : postingsHashMap.entrySet()) { BasicDBObject doc = new BasicDBObject(); doc.put("uid", entry.getValue().getUid()); doc.put("guid", entry.getValue().getGuid()); doc.put("name", entry.getValue().getName()); doc.put("parent", entry.getValue().getParent()); doc.put("level", entry.getValue().getLevel()); doc.put("location", 
entry.getValue().getLocation()); doc.put("isChannel", entry.getValue().getIsChannel()); doc.put("created", new Date()); if (entry.getValue().getPageName() != null) { doc.put("pageName", entry.getValue().getPageName()); } BasicDBObject updateQuery = new BasicDBObject("uid", entry.getValue().getUid()); collection.update(updateQuery, doc, true, false); } System.out.println("All done saving postings."); } /** * Output the data as a JSON file based on the template the content relates to. * * @param postingsHashMap */ private static void savePostingsAsJson(HashMap<String, ChannelsAndPostingsBase> postingsHashMap) { /** * Step through postingsHashMap and build separate hashmaps based on the posting's template. * <p> * Save those individual json files with the template's name e.g. Default.json, ProjectSpecifics.json, etc. */ for (Map.Entry<String, Integer> template : templatesCountMap.entrySet()) { String templateName = templatesMap.get(template.getKey()); HashMap<String, ChannelsAndPostingsBase> postingsByTemplate = new HashMap<String, ChannelsAndPostingsBase>(); for (Map.Entry<String, ChannelsAndPostingsBase> posting : postingsHashMap.entrySet()) { if (posting.getValue().getTemplate().equals(template.getKey())) { postingsByTemplate.put(posting.getKey(), posting.getValue()); } } Gson gson = new Gson(); String json = gson.toJson(postingsByTemplate); try { File jsonFile = new File(archiveFolder + File.separator + templateName + ".json"); if (!jsonFile.exists()) { jsonFile.createNewFile(); } FileWriter fw = new FileWriter(jsonFile.getAbsoluteFile()); BufferedWriter bw = new BufferedWriter(fw); bw.write(json); bw.close(); System.out.println("Wrote " + templateName + " postings data to JSON file."); } catch (IOException e) { e.printStackTrace(); } } } /** * Recursive call to traverse Node datatypes. 
* * @param node node we are processing * @param pageGuid unique id of the page * @param resourcesHashMap * @param controlsHashMap * @param genericPropertiesHashMap * @param postingCollection * @param collection */ private static void traverseNodes(Node node, String pageGuid, HashMap<String, HashMap<String, String>> controlsHashMap, HashMap<String, ResourceItem> resourcesHashMap, HashMap<String, String> genericPropertiesHashMap) { NamedNodeMap attributes; NamedNodeMap blobAttributes; String placeholderDefinition; ResourceItem resourceItem; String nodeGuid; String resourceBlobId; // Now traverse the rest of the tree in depth-first order. if (node.hasChildNodes()) { // Get the children in a list. NodeList nl = node.getChildNodes(); // How many nodes? int size = nl.getLength(); for (int i = 0; i < size; i++) { if (nl.item(i).getNodeType() == Node.ELEMENT_NODE) { /** * Controls */ if (nl.item(i).getNodeName().equalsIgnoreCase("Control")) { placeholderDefinition = nl.item(i).getAttributes().getNamedItem("Name").getNodeValue(); Element propertyElement = (Element) nl.item(i); int numProperties = propertyElement.getElementsByTagName("Property").getLength(); HashMap<String, String> property = new HashMap<String, String>(); for (int j = 0; j < numProperties; j++) { attributes = propertyElement.getElementsByTagName("Property").item(j).getAttributes(); if (attributes.getNamedItem("Value") != null) { property.put(attributes.getNamedItem("InternalIDResource").getNodeValue(), attributes.getNamedItem("Value").getNodeValue()); } else { String nameAttribute = attributes.getNamedItem("Name").getNodeValue(); if (nameAttribute.matches("ControlProp(\\d+)")) { property.put(attributes.getNamedItem("InternalIDResource").getNodeValue(), attributes.getNamedItem("Name").getNodeValue()); } } } if (!property.isEmpty()) { controlsHashMap.put(placeholderDefinition, property); } /** * Resources */ } else if (nl.item(i).getNodeName().equalsIgnoreCase("Resources")) { Element propertyElement = 
(Element) nl.item(i); int numResources = propertyElement.getElementsByTagName("Resource").getLength(); for (int j = 0; j < numResources; j++) { resourceItem = new ResourceItem(); attributes = propertyElement.getElementsByTagName("Resource").item(j).getAttributes(); if (attributes.getNamedItem("IsLink").getNodeValue().equals("1")) { if (attributes.getNamedItem("URL") != null) { // // <Resource URL="http://www..." ResourceBlobId="0" IsLink="1" Name="NewResource2" InternalID="2"/> // resourceItem.setIsLink(attributes.getNamedItem("IsLink").getNodeValue()); resourceItem.setUrl(attributes.getNamedItem("URL").getNodeValue()); } else { // // <Resource ResourceBlobId="0" NodeGuid="{1027105F-...}" IsLink="1" Name="NewResource3" InternalID="3"/> // // NodeGuid references Posting GUID attribute in Cha.xml file which references the Page GUID in the Fol.xml file // via the Shortcut attribute. How do I resolve this with an SDO export? HashMap? MongoDB? // // NodeGUID references the CMS database Node table and NodeGUID column. In that table there // is a FollowGUID column which points to the Node row which has the Name of the page. // Build the channel guids from channel containers already in the database. 
String nodeGUID = attributes.getNamedItem("NodeGuid").getNodeValue(); BasicDBObject query = new BasicDBObject("NodeGUID", nodeGUID); DBCursor postingCursor = postingCollection.find(query); String guid = ""; String postingName = "default"; String channelName = ""; String location = ""; String rootRelativeURL = ""; while (postingCursor.hasNext()) { DBObject dbObject = postingCursor.next(); guid = dbObject.get("ParentGUID").toString(); postingName = dbObject.get("Name").toString(); } if (postingCursor.size() == 0) { guid = nodeGUID; } query = new BasicDBObject("guid", guid); DBCursor channelCursor = channelCollection.find(query); while (channelCursor.hasNext()) { DBObject dbObject = channelCursor.next(); channelName = dbObject.get("name").toString(); location = dbObject.get("location").toString(); } if (channelName.isEmpty() && location.isEmpty()) { rootRelativeURL = ""; } else if (location.isEmpty()) { rootRelativeURL = "/" + channelName + "/" + postingName + ".htm"; } else { rootRelativeURL = "/" + location.replace(";", "/") + "/" + channelName + "/" + postingName + ".htm"; } resourceItem.setIsLink(attributes.getNamedItem("IsLink").getNodeValue()); resourceItem.setUrl(rootRelativeURL); } resourcesHashMap.put(attributes.getNamedItem("InternalID").getNodeValue(), resourceItem); } else { Node blobInfo = propertyElement.getElementsByTagName("Resource").item(j); Element blobInfoElement = (Element) blobInfo; if (blobInfoElement.hasChildNodes()) { // // <Resource ResourceBlobId="24041" NodeGuid="{7CE36259-...}" IsLink="0" Name="btn_video_smaller1" InternalID="1146"> // <BlobInfo FileName="resF7F9C8ACD9... 
.jpg" Size="1578" Guid="{F7F9C8AC-...}" FileExtension="jpg" InternalID="24041"/> // </Resource> // blobAttributes = blobInfoElement.getElementsByTagName("BlobInfo").item(0) .getAttributes(); resourceItem.setIsLink(attributes.getNamedItem("IsLink").getNodeValue()); resourceItem.setName(attributes.getNamedItem("Name").getNodeValue()); resourceItem .setFilename(blobAttributes.getNamedItem("FileName").getNodeValue()); resourceItem.setFileExtension( blobAttributes.getNamedItem("FileExtension").getNodeValue()); // Store unique reference to resource item keyed by NodeGuid. // If NodeGuid is not already in the HashMap then store a reference to it. if (attributes.getNamedItem("NodeGuid") != null) { nodeGuid = attributes.getNamedItem("NodeGuid").getNodeValue(); if (nodeGuidHashMap.get(nodeGuid) == null) { nodeGuidHashMap.put(nodeGuid, resourceItem); } // // <Resource ResourceBlobId="104084" IsLink="0" Name="SR510toSR512WEB" InternalID="3"> // <BlobInfo FileName="resDD0CDE054B....pdf" Size="257303" Guid="{DD0CDE05-...}" FileExtension="pdf" InternalID="104084" /> // </Resource> // // If the resource is stored locally in the page there is no NodeGuid to reference. // Use the ResourceBlobId as a fallback. } else { resourceBlobId = attributes.getNamedItem("ResourceBlobId").getNodeValue(); if (nodeGuidHashMap.get(resourceBlobId) == null) { nodeGuidHashMap.put(resourceBlobId, resourceItem); } } } else { // // <Resource ResourceBlobId="24041" NodeGuid="{7CE36259-...}" IsLink="0" Name="btn_video_smaller94" InternalID="1147"/> // // Duplicate ResourceBlobId referencing existing link to resource on the page. // Store unique reference to resource item keyed on NodeGuid. 
if (attributes.getNamedItem("NodeGuid") != null) { nodeGuid = attributes.getNamedItem("NodeGuid").getNodeValue(); resourceItem.setIsLink(nodeGuidHashMap.get(nodeGuid).getIsLink()); resourceItem.setName(nodeGuidHashMap.get(nodeGuid).getName()); resourceItem.setFilename(nodeGuidHashMap.get(nodeGuid).getFilename()); resourceItem .setFileExtension(nodeGuidHashMap.get(nodeGuid).getFileExtension()); // // <Resource ResourceBlobId="104084" IsLink="0" Name="SR510toSR512WEB1" InternalID="6" /> // // No NodeGuid. Use ResourceBlobId. } else { resourceBlobId = attributes.getNamedItem("ResourceBlobId").getNodeValue(); resourceItem.setIsLink(nodeGuidHashMap.get(resourceBlobId).getIsLink()); resourceItem.setName(nodeGuidHashMap.get(resourceBlobId).getName()); resourceItem.setFilename(nodeGuidHashMap.get(resourceBlobId).getFilename()); resourceItem.setFileExtension( nodeGuidHashMap.get(resourceBlobId).getFileExtension()); } } resourcesHashMap.put(attributes.getNamedItem("InternalID").getNodeValue(), resourceItem); } } /** * GenericProperties */ } else if (nl.item(i).getNodeName().equalsIgnoreCase("GenericProperties")) { Element propertyElement = (Element) nl.item(i); int numProperties = propertyElement.getElementsByTagName("Property").getLength(); for (int j = 0; j < numProperties; j++) { attributes = propertyElement.getElementsByTagName("Property").item(j).getAttributes(); // Posting summary. Used in News Items and varioius RSS feeds for mobile apps and GovDelivery content. 
if (attributes.getNamedItem("Name").getNodeValue().equalsIgnoreCase("_Description")) { if (attributes.getNamedItem("ValueLong") != null) { postingsHashMap.get(pageGuid) .setDescription(attributes.getNamedItem("ValueLong").getNodeValue()); } else if (attributes.getNamedItem("Value") != null) { postingsHashMap.get(pageGuid) .setDescription(attributes.getNamedItem("Value").getNodeValue()); ; } else { postingsHashMap.get(pageGuid).setDescription(""); } // Otherwise, store everything else for processing in the individual template model. } else { if (attributes.getNamedItem("Value") != null) { genericPropertiesHashMap.put(attributes.getNamedItem("Name").getNodeValue(), attributes.getNamedItem("Value").getNodeValue()); } else { genericPropertiesHashMap.put(attributes.getNamedItem("Name").getNodeValue(), ""); } } } } // Recursive call to traverse nodes. traverseNodes(nl.item(i), pageGuid, controlsHashMap, resourcesHashMap, genericPropertiesHashMap); } } } } /** * Method to reassemble content from data structures. 
* * @param pageGuid * @param pageControls * @param pageResources * @param genericPropertiesHashMap */ private static void buildContent(String pageGuid, HashMap<String, HashMap<String, String>> pageControls, HashMap<String, ResourceItem> pageResources, HashMap<String, String> genericPropertiesMap) { TemplateFactory templateFactory = new TemplateFactory(); String placeholderDefinition; String propertyValue; String updatedPropertyValue = null; String regexPattern; String oldFilename; String newFilename = null; String newFilenamePath = null; String fileExtension = null; String imageExtensions[] = { "bmp", "eps", "gif", "jpeg", "jpg", "png", "tif" }; String documentExtensions[] = { "doc", "docm", "docx", "fp5", "fp7", "pdf", "pps", "ppsx", "ppt", "pptx", "psd", "txt", "xls", "xlsx" }; String projectPhaseImages[] = { "Status_Planning.gif", "Status_PlanningDesign.gif", "Status_Design.gif", "Status_DesignConstruct.gif", "Status_Construction.gif", "Status_Complete.gif" }; // Use the posting createdWhen date to build the file directory path of the images and documents. 
String createdWhen[] = postingsHashMap.get(pageGuid).getCreatedWhen().split("\\s+"); // 2011-03-24 17:00:50 String date[] = createdWhen[0].split("-"); // 2011-03-24 String year = date[0]; // 2011 String month = date[1]; // 03 String day = date[2]; // 24 String location = null; String siteStructure = null; List<String> files = new ArrayList<String>(); List<String> images = new ArrayList<String>(); String templateName = templatesMap.get(postingsHashMap.get(pageGuid).getTemplate()); location = postingsHashMap.get(pageGuid).getLocation().toLowerCase(); //siteStructure = location.replace(";", "/"); // Hold a reference of the Control Name and its content HashMap<String, String> controlsMap = new HashMap<String, String>(); for (Map.Entry<String, HashMap<String, String>> control : pageControls.entrySet()) { placeholderDefinition = control.getKey(); propertyValue = pageControls.get(placeholderDefinition).get("0"); propertyValue = Migration.sanitizeContent(propertyValue); for (Map.Entry<String, String> property : control.getValue().entrySet()) { if (!property.getKey().equals("0")) { regexPattern = "<!--\\* Resource = \"" + property.getValue() + "\" -->"; if (pageResources.get(property.getKey()).getIsLink().equals("1")) { updatedPropertyValue = propertyValue.replaceAll(regexPattern, pageResources.get(property.getKey()).getUrl()); /** * <a href="<!--* Resource = "ControlProp0" -->">http://wwwi.wsdot.wa.gov/IT/Help/Lync2013.htm </a> * * If Resource is an internal link then I could match the link text and use that for the href value. * What if the link text is not a URL but text? Buzzard. * * Currently the href will be replaced with the internal GUID e.g. "{84C6C09A-5072-441E-AE40-100A64F9A1B7}" */ } else { oldFilename = pageResources.get(property.getKey()).getFilename(); fileExtension = pageResources.get(property.getKey()).getFileExtension().toLowerCase(); newFilename = pageResources.get(property.getKey()).getName() + "." 
+ fileExtension; BasicDBObject query = new BasicDBObject("filename", newFilename); // Count individual references to resources. We don't want duplicates. if (resourcesCountMap.containsKey(newFilename)) { resourcesCountMap.put(newFilename, resourcesCountMap.get(newFilename) + 1); // Use existing resources in MongoDB if they are there. DBCursor filenameCursor = resourcesCollection.find(query); while (filenameCursor.hasNext()) { DBObject dbObject = filenameCursor.next(); newFilenamePath = dbObject.get("path").toString(); } } else { resourcesCountMap.put(newFilename, 1); if (resourcesCollection.find(query).count() > 0) { // Use existing resources in MongoDB if they are there. DBCursor filenameCursor = resourcesCollection.find(query); while (filenameCursor.hasNext()) { DBObject dbObject = filenameCursor.next(); newFilenamePath = dbObject.get("path").toString(); } } else { newFilenamePath = sitePath + year + "/" + month + "/" + day + "/" + newFilename; // New resource so store reference to it in MongoDB BasicDBObject doc = new BasicDBObject(); doc.put("filename", newFilename); doc.put("path", newFilenamePath); doc.put("created", new Date()); BasicDBObject updateQuery = new BasicDBObject("filename", newFilename); resourcesCollection.update(updateQuery, doc, true, false); // Since it's a new resource let's be sure to import it in Drupal if (Arrays.asList(imageExtensions).contains(fileExtension)) { // If the image is a project phase, skip it; these are handled by a taxonomy if (!Arrays.asList(projectPhaseImages).contains(newFilename)) { images.add(locationUri + File.separator + archiveFolder + File.separator + newFilename); } } else if (Arrays.asList(documentExtensions).contains(fileExtension)) { files.add(locationUri + File.separator + archiveFolder + File.separator + newFilename); } } } updatedPropertyValue = propertyValue.replaceAll(regexPattern, newFilenamePath); Migration.resourceCopy(archiveFolder, oldFilename, newFilename); } propertyValue = updatedPropertyValue; } } 
controlsMap.put(placeholderDefinition, propertyValue); } if (files.size() > 0) { postingsHashMap.get(pageGuid) .setDocuments(files.toString().replace("[", "").replace("]", "").replace(", ", ";")); } if (images.size() > 0) { postingsHashMap.get(pageGuid) .setImages(images.toString().replace("[", "").replace("]", "").replace(", ", ";")); } /** * PageContent needs to be assembled differently depending on the template the content is coming from. * Using a Factory pattern to create the object so we don't expose the creation logic. Creating the * objects will be fussy because each template can be slightly different including how the common * placeholders are named. * * In some cases we will just be copying from the MainContentPlaceHolderDefinition and in others we will * need to concatenate multiple placeholders into one or into different ones depending on the Drupal content * type. */ Template template = templateFactory.getTemplate(templateName); try { postingsHashMap.put(pageGuid, template.build(postingsHashMap.get(pageGuid), controlsMap, genericPropertiesMap)); if (templateName.equalsIgnoreCase("Redirect")) { redirectsMap.put(postingsHashMap.get(pageGuid).getLegacyURL(), postingsHashMap.get(pageGuid).getRedirectURL()); } } catch (NullPointerException e) { System.out.println("Don't have a template class for: " + templateName); System.out.println("Or there was a problem with this page: " + pageGuid); System.out.println(); e.printStackTrace(); System.exit(0); } } }