Java tutorial
/** * */ package uk.bl.wa.annotation; /* * #%L * warc-indexer * %% * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import java.io.File; import java.io.IOException; import java.io.StringReader; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.solr.common.util.Base64; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.JsonParseException; import org.codehaus.jackson.JsonParser; import org.codehaus.jackson.map.ObjectMapper; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import com.google.common.base.Joiner; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; /** * * This downloads the data from the ACT prototype (based on Drupal) and creates * a set of @Annotations from the appropriate taxonomy. * * @author Roger Coram, Andrew Jackson * */ public class AnnotationsFromAct { private String[] crawlFreqs = new String[] { "nevercrawl", "domaincrawl", "annual", "sixmonthly", "quarterly", "monthly", "weekly", "daily" }; private static String WARC_ACT_URL = "http://www.webarchive.org.uk/act/websites/export/daily"; private static String WARC_COLLECTIONS_URL = "http://www.webarchive.org.uk/act/taxonomy_term.xml?sort=name&direction=ASC&vocabulary=5&limit=500&page=0"; private static String WARC_COLLECTIONS_URL_JSON = "http://www.webarchive.org.uk/act/taxonomy_term.json?vocabulary=5&limit=500&page=0"; private static String WARC_SUBJECTS_URL_JSON = "http://www.webarchive.org.uk/act/taxonomy_term.json?vocabulary=2&limit=500&page=0"; private static Log LOG = LogFactory.getLog(AnnotationsFromAct.class); private String cookie; private String csrf; private static final String COLLECTION_XML = "taxonomy_term"; private static final String OK_PUBLISH = "1"; private static final String FIELD_PUBLISH = "field_publish"; private static final String FIELD_DATES = "field_dates"; private static final String FIELD_NAME = "name"; private static final String FIELD_START_DATE = "value"; private static final String FIELD_END_DATE = "value2"; // Map of all categories and subjects: private Map<Integer, JsonNode> cm = new HashMap<Integer, JsonNode>(); private Map<Integer, JsonNode> sm = new HashMap<Integer, JsonNode>(); // The annotations being built up from ACT: private Annotations ann = new Annotations(); /** * * @throws IOException * @throws JDOMException */ public AnnotationsFromAct() throws IOException, JDOMException { // Populate LOG.info("Logging into ACT..."); this.actLogin(); // Get the collections export: LOG.info("Getting collections export from ACT..."); String collectionXml = readAct(AnnotationsFromAct.WARC_COLLECTIONS_URL); LOG.info("Parsing collection XML..."); parseCollectionXml(collectionXml); // Get all Targets: LOG.info("Getting main export from ACT..."); String recordXml = readAct(AnnotationsFromAct.WARC_ACT_URL); LOG.info("Parsing record XML..."); parseRecordXml(recordXml); } protected AnnotationsFromAct(String dummy) { } /** * Performs login operation to ACT, setting Cookie and CSRF. * @throws IOException */ private void actLogin() throws IOException { Config loginConf = ConfigFactory.parseFile(new File("credentials.conf")); URL login = new URL(loginConf.getString("act.login")); LOG.info("Logging in at " + login); HttpURLConnection connection = (HttpURLConnection) login.openConnection(); StringBuilder credentials = new StringBuilder(); credentials.append(loginConf.getString("act.username")); credentials.append(":"); credentials.append(loginConf.getString("act.password")); connection.setRequestProperty("Authorization", "Basic " + Base64.byteArrayToBase64(credentials.toString().getBytes())); connection.setRequestProperty("Content-Type", "text/plain"); Scanner scanner; if (connection.getResponseCode() != 200) { scanner = new Scanner(connection.getErrorStream()); scanner.useDelimiter("\\Z"); throw new IOException(scanner.next()); } else { scanner = new Scanner(connection.getInputStream()); } scanner.useDelimiter("\\Z"); this.csrf = scanner.next(); this.cookie = connection.getHeaderField("set-cookie"); } /** * Read data from ACT to include curator-specified metadata. * @param conf * @return * @throws MalformedURLException * @throws IOException */ private String readAct(String url) throws IOException { URL act = new URL(url); HttpURLConnection connection = (HttpURLConnection) act.openConnection(); if (this.cookie != null) { connection.setRequestProperty("Cookie", this.cookie); connection.setRequestProperty("X-CSRF-TOKEN", this.csrf); } Scanner scanner; if (connection.getResponseCode() != 200) { scanner = new Scanner(connection.getErrorStream()); scanner.useDelimiter("\\Z"); throw new IOException(scanner.next()); } else { scanner = new Scanner(connection.getInputStream()); } scanner.useDelimiter("\\Z"); return scanner.next(); } /** * Parses XML from ACT, mapping collection names to date ranges. * * @throws IOException * @throws JDOMException * */ @SuppressWarnings("unchecked") private void parseCollectionXml(String xml) throws JDOMException, IOException { SAXBuilder builder = new SAXBuilder(); Document document = (Document) builder.build(new StringReader(xml)); Element rootNode = document.getRootElement(); List<Element> list = rootNode.getChildren(COLLECTION_XML); Element node = null; DateRange dateRange; String name, start, end, publish; for (int i = 0; i < list.size(); i++) { node = (Element) list.get(i); publish = node.getChildText(FIELD_PUBLISH); name = node.getChildText(FIELD_NAME); if (publish != null && publish.equals(OK_PUBLISH)) { start = node.getChild(FIELD_DATES).getChildText(FIELD_START_DATE); end = node.getChild(FIELD_DATES).getChildText(FIELD_END_DATE); dateRange = new DateRange(start, end); LOG.info("Adding collection " + name + " with dateRange " + dateRange); ann.getCollectionDateRanges().put(name, dateRange); } else { LOG.info("Skipping collection \"" + name + "\" (not ok to publish)"); } } } /** * Removes inactive Collections before optionally creating a UriCollection. * * @param collectionCategories * @param allCollections * @param subject * @return */ private UriCollection filterUriCollection(String collectionCategories, String allCollections, String subject) { UriCollection output = null; Set<String> validCollections = ann.getCollectionDateRanges().keySet(); if (collectionCategories != null && !validCollections.contains(collectionCategories)) collectionCategories = null; ArrayList<String> valid = new ArrayList<String>(); if (allCollections != null) { for (String a : allCollections.split("|")) { if (validCollections.contains(a)) valid.add(a); } if (valid.size() == 0) { allCollections = null; } else { allCollections = Joiner.on("|").join(valid); } } valid.clear(); if (subject != null) { for (String s : subject.split("|")) { if (validCollections.contains(s)) valid.add(s); } if (valid.size() == 0) { subject = null; } else { subject = Joiner.on("|").join(valid); } } if (collectionCategories != null && allCollections != null && subject != null) output = new UriCollection(collectionCategories, allCollections, subject); return output; } /** * Parses XML output from ACT into a lookup, mapping URLs to collections. * * @param xml * @throws JDOMException * @throws IOException * @throws URISyntaxException */ private void parseRecordXml(String xml) throws JDOMException, IOException { SAXBuilder builder = new SAXBuilder(); Document document = (Document) builder.build(new StringReader(xml)); Element rootNode = document.getRootElement(); List<Element> list = rootNode.getChildren("node"); Element node = null; String urls, collectionCategories, allCollections, subject, scope; for (int i = 0; i < list.size(); i++) { node = (Element) list.get(i); urls = node.getChildText("urls"); collectionCategories = node.getChildText("collectionCategories"); // Trac #2271: Erroneous data in ACT might contain pipe-separated text. if (collectionCategories != null && collectionCategories.indexOf("|") != -1) { collectionCategories = collectionCategories.split("|")[0]; } allCollections = node.getChildText("allCollections"); subject = node.getChildText("subject"); scope = node.getChildText("scope"); LOG.info("Looking at scope [" + scope + "] subject [" + subject + "] collectionCategories [" + collectionCategories + "] w/ collections [" + allCollections + "]"); // As long as one of the fields is populated we have something to do... if (collectionCategories != null || allCollections != null || subject != null) { UriCollection collection = filterUriCollection(collectionCategories, allCollections, subject); LOG.info("Filtered to " + collection); // There should be no scope beyond those created in the Constructor. if (collection != null) addCollection(scope, urls, collection); } } for (String key : ann.getCollections().keySet()) { LOG.info("Processed " + ann.getCollections().get(key).size() + " URIs for collection " + key); } } /** * * @param scope * @param urls * @param collection */ private void addCollection(String scope, String urls, UriCollection collection) { LOG.debug("Adding " + urls + " to collection " + collection.toString()); HashMap<String, UriCollection> relevantCollection = ann.getCollections().get(scope); for (String url : urls.split("\\s+")) { if (scope.equals("resource")) { /* * FIXME try { // Trac #2271: try keying on canonicalized URL. * url = canon.urlStringToKey(url); } catch( URIException u ) { * LOG.warn("Problem parsing URL: " + u.getMessage() + ": " + * url); } */ relevantCollection.put(url, collection); } else { URI uri; try { uri = new URI(url); } catch (URISyntaxException e) { LOG.warn(e.getMessage()); continue; } if (scope.equals("root")) { String prefix = uri.getScheme() + "://" + uri.getHost(); relevantCollection.put(prefix, collection); } if (scope.equals("subdomains")) { String host = uri.getHost(); relevantCollection.put(host, collection); } } } } /** * * @return */ public Annotations getAnnotations() { return ann; } /** * * @param map * @param startUrl * @throws IOException */ private void getTaxonomyViaJson(Map<Integer, JsonNode> map, String startUrl) throws IOException { // Get the collections export: String nextUrl = startUrl; String thisUrl = null; // Grab all the pages of collections: do { // Load the content: thisUrl = nextUrl; LOG.info("Getting taxnomy export from ACT... " + thisUrl); String collectionXml = readAct(thisUrl); // Map it to JsonNode tree: ObjectMapper mapper = new ObjectMapper(); JsonParser jp = mapper.getJsonFactory().createJsonParser(collectionXml); JsonNode root = jp.readValueAsTree(); // Add to the map of the categories: for (JsonNode node : root.get("list")) { Integer ci = Integer.parseInt(node.get("tid").getTextValue()); map.put(ci, node); } // Look up the next URL: nextUrl = root.path("next").getTextValue(); if (nextUrl != null) nextUrl = nextUrl.replaceFirst("\\?", "\\.json\\?"); } while (nextUrl != null); } /** * * @throws JsonParseException * @throws IOException */ private void getCollectionsViaJson() throws IOException { // Get the subjects taxonomy: this.getTaxonomyViaJson(sm, AnnotationsFromAct.WARC_SUBJECTS_URL_JSON); // Get the collections taxonomy: this.getTaxonomyViaJson(cm, AnnotationsFromAct.WARC_COLLECTIONS_URL_JSON); // Now patch up the parent-child relationships etc. for (JsonNode node : cm.values()) { // Get the parent categories: List<JsonNode> cats = this.resolveParents(node); // Turn that into a string representation: String catPath = this.getCatPath(cats); // Look to see if the root collection is marked as published: Boolean publish = cats.get(0).get("field_publish").getBooleanValue(); if (publish) { // LOG.info("Collection Path: " + catPath + " PUBLISHED"); // Add to list of collections, w/ date ranges: String name = catPath; String start = null; if (cats.get(0).get("field_dates").get("value") != null) { start = cats.get(0).get("field_dates").get("value").getTextValue(); } String end = null; if (cats.get(0).get("field_dates").get("value2") != null) { end = cats.get(0).get("field_dates").get("value2").getTextValue(); } DateRange dateRange = new DateRange(start, end); // LOG.info("Adding collection " + name + " with dateRange " // + dateRange); ann.getCollectionDateRanges().put(name, dateRange); } else { LOG.debug("Skipping unpublished collection with path: " + catPath); } } } /** * * @param cats * @return */ private String getCatPath(List<JsonNode> cats) { // Build up the full path string: StringBuilder catPath = new StringBuilder(); for (int i = 0; i < cats.size(); i++) { JsonNode cat = cats.get(i); catPath.append(cat.get("name").getTextValue()); // Append a separator if this is not the last entry: if (i < cats.size() - 1) catPath.append("|"); } return catPath.toString(); } /** * * @param c * @param cats */ private void resolveParents(JsonNode c, List<JsonNode> cats) { // Store this item: cats.add(0, c); // Loop through the parents (although there is only ever one in this // dataset): for (JsonNode parentRef : c.get("parent")) { Integer ci = parentRef.get("id").getIntValue(); JsonNode parent = cm.get(ci); resolveParents(parent, cats); } } private List<JsonNode> resolveParents(JsonNode c) { // Get the parent categories: List<JsonNode> cats = new ArrayList<JsonNode>(); // Find all the parents: this.resolveParents(c, cats); // And return: return cats; } /** * * @throws IOException */ private void getTargetsViaJson() throws IOException { String actUrl = "http://www.webarchive.org.uk/act/node.json?type=url"; int page = 0; int max_page = -1; do { page++; LOG.info("Getting page " + page + " of targets export from ACT... " + actUrl); String targets = readAct(actUrl); ObjectMapper mapper = new ObjectMapper(); JsonParser jp = mapper.getJsonFactory().createJsonParser(targets); JsonNode root = jp.readValueAsTree(); for (JsonNode node : root.get("list")) { String scope = node.get("field_scope").getTextValue(); LOG.debug("Got \"" + node.get("title").getTextValue() + "\" with scope: " + scope); String collectionCategories = null; List<String> allCollections = new ArrayList<String>(); String[] subjects = null; // Add on the categories: for (JsonNode cat : node.get("field_collection_categories")) { Integer cid = Integer.parseInt(cat.get("id").getTextValue()); JsonNode catd = cm.get(cid); if (catd == null) { LOG.warn("NULL catd for id=" + cid + " from: " + node.asText()); continue; } LOG.debug("collectionCategories: " + catd.get("name").getTextValue()); // Get the parent categories: List<JsonNode> catds = this.resolveParents(catd); // Turn that into a string representation: String catPath = this.getCatPath(catds); allCollections.add(catPath); if (collectionCategories == null) { collectionCategories = catds.get(0).get("name").getTextValue(); } } // Get the Subject: if (node.get("field_subject") != null) { Integer sid = Integer.parseInt(node.get("field_subject").get("id").getTextValue()); String subject = sm.get(sid).get("name").getTextValue(); LOG.debug("Found a SUBJECT: " + node.get("field_subject").get("id") + " > " + subject); subjects = new String[] { subject }; } UriCollection uc = new UriCollection(collectionCategories, allCollections.toArray(new String[1]), subjects); for (JsonNode url : node.get("field_url")) { LOG.debug("Got " + url.get("url").getTextValue()); // Add to the collection: addCollection(scope, url.get("url").getTextValue(), uc); } } // Look up the next page URL: actUrl = root.path("next").getTextValue(); if (actUrl != null) actUrl = actUrl.replaceFirst("\\?", "\\.json\\?"); } while (actUrl != null && (page < max_page || max_page < 0)); // Summarise the result: for (String key : ann.getCollections().keySet()) { LOG.info("Processed " + ann.getCollections().get(key).size() + " URIs for collection " + key); } } /** * * @param args * @throws IOException * @throws MalformedURLException * @throws JsonParseException * @throws JDOMException */ public static void main(String[] args) throws JsonParseException, MalformedURLException, IOException, JDOMException { // Populate LOG.info("Logging into ACT..."); AnnotationsFromAct act = new AnnotationsFromAct("dummy"); act.actLogin(); act.getCollectionsViaJson(); act.getTargetsViaJson(); String filename = "annotations.json"; LOG.info("Writing annotations to: " + filename); act.getAnnotations().toJsonFile(filename); LOG.info("...done."); } }