uk.bl.wa.annotation.AnnotationsFromAct.java Source code

Introduction

The full source code of uk.bl.wa.annotation.AnnotationsFromAct.java is listed below.
Source

/**
 * 
 */
package uk.bl.wa.annotation;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.common.util.Base64;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.map.ObjectMapper;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;

import com.google.common.base.Joiner;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

/**
 * 
 * Downloads data from the ACT prototype (which is based on Drupal) and builds
 * a set of {@link Annotations} from the appropriate taxonomy.
 * 
 * @author Roger Coram, Andrew Jackson
 * 
 */
public class AnnotationsFromAct {

    // Crawl-frequency labels. NOTE(review): not referenced anywhere else in
    // this class - confirm whether it can be removed.
    private String[] crawlFreqs = new String[] { "nevercrawl", "domaincrawl", "annual", "sixmonthly", "quarterly",
            "monthly", "weekly", "daily" };

    // ACT export endpoints. These are never reassigned, so they are declared
    // final to make that explicit.
    private static final String WARC_ACT_URL = "http://www.webarchive.org.uk/act/websites/export/daily";
    private static final String WARC_COLLECTIONS_URL = "http://www.webarchive.org.uk/act/taxonomy_term.xml?sort=name&direction=ASC&vocabulary=5&limit=500&page=0";
    private static final String WARC_COLLECTIONS_URL_JSON = "http://www.webarchive.org.uk/act/taxonomy_term.json?vocabulary=5&limit=500&page=0";
    private static final String WARC_SUBJECTS_URL_JSON = "http://www.webarchive.org.uk/act/taxonomy_term.json?vocabulary=2&limit=500&page=0";

    private static final Log LOG = LogFactory.getLog(AnnotationsFromAct.class);

    // Session state captured by actLogin() and sent with every readAct()
    // request:
    private String cookie;
    private String csrf;

    // Element names used when parsing the collections XML export:
    private static final String COLLECTION_XML = "taxonomy_term";
    private static final String OK_PUBLISH = "1";
    private static final String FIELD_PUBLISH = "field_publish";
    private static final String FIELD_DATES = "field_dates";
    private static final String FIELD_NAME = "name";
    private static final String FIELD_START_DATE = "value";
    private static final String FIELD_END_DATE = "value2";

    // Map of all categories (cm) and subjects (sm), keyed by taxonomy term
    // id:
    private final Map<Integer, JsonNode> cm = new HashMap<Integer, JsonNode>();
    private final Map<Integer, JsonNode> sm = new HashMap<Integer, JsonNode>();

    // The annotations being built up from ACT:
    private final Annotations ann = new Annotations();

    /**
     * Builds the annotations from the XML exports: logs in to ACT, loads the
     * collections export and then the main (targets) export.
     * 
     * @throws IOException
     *             if the login or either download fails
     * @throws JDOMException
     *             if either export cannot be parsed as XML
     */
    public AnnotationsFromAct() throws IOException, JDOMException {
        LOG.info("Logging into ACT...");
        actLogin();

        // Collections go first: the record parsing below filters against the
        // collection names gathered here.
        LOG.info("Getting collections export from ACT...");
        final String collectionsExport = readAct(WARC_COLLECTIONS_URL);
        LOG.info("Parsing collection XML...");
        parseCollectionXml(collectionsExport);

        // Then every Target:
        LOG.info("Getting main export from ACT...");
        final String targetsExport = readAct(WARC_ACT_URL);
        LOG.info("Parsing record XML...");
        parseRecordXml(targetsExport);
    }

    /**
     * Creates an instance without performing any of the ACT requests made by
     * the no-argument constructor. {@link #main(String[])} uses this and then
     * drives the JSON-based export itself.
     * 
     * @param dummy
     *            unused; it only distinguishes this constructor's signature
     *            from the no-argument one
     */
    protected AnnotationsFromAct(String dummy) {
    }

    /**
     * Performs login operation to ACT, setting Cookie and CSRF.
     * 
     * The login URL, username and password are read from the
     * <code>credentials.conf</code> file in the working directory. The
     * response body is stored as the CSRF token and the
     * <code>set-cookie</code> response header as the session cookie.
     * 
     * @throws IOException
     *             if the request fails, the server does not answer with HTTP
     *             200, or the response body is empty
     */
    private void actLogin() throws IOException {
        Config loginConf = ConfigFactory.parseFile(new File("credentials.conf"));
        URL login = new URL(loginConf.getString("act.login"));
        LOG.info("Logging in at " + login);

        HttpURLConnection connection = (HttpURLConnection) login.openConnection();
        String credentials = loginConf.getString("act.username") + ":" + loginConf.getString("act.password");
        // Encode with an explicit charset so the header does not depend on
        // the platform default encoding.
        connection.setRequestProperty("Authorization",
                "Basic " + Base64.byteArrayToBase64(credentials.getBytes("UTF-8")));
        connection.setRequestProperty("Content-Type", "text/plain");

        this.csrf = readResponse(connection);
        // Response headers remain available after the body has been consumed.
        this.cookie = connection.getHeaderField("set-cookie");
    }

    /**
     * Read data from ACT to include curator-specified metadata.
     * 
     * When a login has been performed, the stored cookie and CSRF token are
     * sent with the request.
     * 
     * @param url
     *            the ACT URL to fetch
     * @return the response body
     * @throws MalformedURLException
     *             if <code>url</code> is not a valid URL
     * @throws IOException
     *             if the request fails, the server does not answer with HTTP
     *             200, or the response body is empty
     */
    private String readAct(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        if (this.cookie != null) {
            connection.setRequestProperty("Cookie", this.cookie);
            connection.setRequestProperty("X-CSRF-TOKEN", this.csrf);
        }
        return readResponse(connection);
    }

    /**
     * Reads the body of an HTTP 200 response, or raises an IOException
     * describing any other response.
     * 
     * @param connection
     *            a configured connection; the request is sent by this call
     * @return the (non-empty) response body
     * @throws IOException
     *             on a non-200 status, an empty body, or a read failure
     */
    private static String readResponse(HttpURLConnection connection) throws IOException {
        int status = connection.getResponseCode();
        if (status != 200) {
            String detail = "";
            // getErrorStream() is null when the server sent no error body.
            if (connection.getErrorStream() != null) {
                detail = drain(new Scanner(connection.getErrorStream(), "UTF-8"));
            }
            throw new IOException("HTTP " + status + " from " + connection.getURL() + ": " + detail);
        }
        // NOTE(review): decodes as UTF-8 instead of the platform default so
        // results do not vary by host; assumes ACT serves UTF-8 - confirm.
        String body = drain(new Scanner(connection.getInputStream(), "UTF-8"));
        if (body.isEmpty()) {
            throw new IOException("Empty response from " + connection.getURL());
        }
        return body;
    }

    /**
     * Reads everything the scanner has to offer and always closes it (and
     * with it the underlying stream).
     */
    private static String drain(Scanner scanner) throws IOException {
        try {
            scanner.useDelimiter("\\Z");
            // next() would throw NoSuchElementException on an empty stream.
            String content = scanner.hasNext() ? scanner.next() : "";
            // Scanner swallows read errors; surface them instead of
            // returning truncated content.
            if (scanner.ioException() != null) {
                throw scanner.ioException();
            }
            return content;
        } finally {
            scanner.close();
        }
    }

    /**
     * Parses XML from ACT, mapping collection names to date ranges.
     * 
     * Only terms whose publish field equals {@link #OK_PUBLISH} are recorded;
     * the others are logged and skipped. A published term without a dates
     * element is recorded with a null start and end instead of aborting the
     * whole parse.
     * 
     * @param xml
     *            the collections export as an XML string
     * @throws IOException
     *             if the XML cannot be read
     * @throws JDOMException
     *             if the XML is not well-formed
     * 
     */
    @SuppressWarnings("unchecked")
    private void parseCollectionXml(String xml) throws JDOMException, IOException {
        SAXBuilder builder = new SAXBuilder();
        Document document = builder.build(new StringReader(xml));
        List<Element> terms = document.getRootElement().getChildren(COLLECTION_XML);

        for (Element term : terms) {
            String name = term.getChildText(FIELD_NAME);
            // Constant-first equals also covers a missing publish element.
            if (!OK_PUBLISH.equals(term.getChildText(FIELD_PUBLISH))) {
                LOG.info("Skipping collection \"" + name + "\" (not ok to publish)");
                continue;
            }
            // getChild() returns null when the element is absent.
            Element dates = term.getChild(FIELD_DATES);
            String start = (dates == null) ? null : dates.getChildText(FIELD_START_DATE);
            String end = (dates == null) ? null : dates.getChildText(FIELD_END_DATE);
            DateRange dateRange = new DateRange(start, end);
            LOG.info("Adding collection " + name + " with dateRange " + dateRange);
            ann.getCollectionDateRanges().put(name, dateRange);
        }
    }

    /**
     * Removes inactive Collections before optionally creating a UriCollection.
     * 
     * A name counts as active when it is a key of the collection date ranges
     * held by the annotations. The pipe-separated lists are split on the
     * literal <code>|</code> character: the pattern is escaped because an
     * unescaped <code>"|"</code> is a regular-expression alternation that
     * splits between every character, so no multi-character name could ever
     * match.
     * 
     * @param collectionCategories
     *            a single collection name, or null
     * @param allCollections
     *            pipe-separated collection names, or null
     * @param subject
     *            pipe-separated subject names, or null
     * @return a UriCollection built from the surviving values, or null unless
     *         all three arguments retain at least one valid entry
     */
    private UriCollection filterUriCollection(String collectionCategories, String allCollections, String subject) {
        Set<String> validCollections = ann.getCollectionDateRanges().keySet();

        if (collectionCategories != null && !validCollections.contains(collectionCategories)) {
            collectionCategories = null;
        }
        allCollections = retainValid(allCollections, validCollections);
        // NOTE(review): subjects are checked against the collection names,
        // exactly as before - confirm that this is intended.
        subject = retainValid(subject, validCollections);

        if (collectionCategories == null || allCollections == null || subject == null) {
            return null;
        }
        return new UriCollection(collectionCategories, allCollections, subject);
    }

    /**
     * Keeps only the entries of a pipe-separated list that appear in
     * validNames; returns them re-joined with pipes, or null if the input is
     * null or nothing survives.
     */
    private static String retainValid(String pipeSeparated, Set<String> validNames) {
        if (pipeSeparated == null) {
            return null;
        }
        List<String> kept = new ArrayList<String>();
        for (String candidate : pipeSeparated.split("\\|")) {
            if (validNames.contains(candidate)) {
                kept.add(candidate);
            }
        }
        return kept.isEmpty() ? null : Joiner.on("|").join(kept);
    }

    /**
     * Parses XML output from ACT into a lookup, mapping URLs to collections.
     * 
     * Each <code>node</code> element is read, its collection fields are
     * filtered through {@link #filterUriCollection(String, String, String)},
     * and the surviving collection is registered for the record's URLs under
     * its scope.
     * 
     * @param xml
     *            the main ACT export as an XML string
     * @throws JDOMException
     *             if the XML is not well-formed
     * @throws IOException
     *             if the XML cannot be read
     */
    @SuppressWarnings("unchecked")
    private void parseRecordXml(String xml) throws JDOMException, IOException {
        SAXBuilder builder = new SAXBuilder();
        Document document = builder.build(new StringReader(xml));
        List<Element> records = document.getRootElement().getChildren("node");

        for (Element record : records) {
            String urls = record.getChildText("urls");
            String collectionCategories = record.getChildText("collectionCategories");
            // Trac #2271: Erroneous data in ACT might contain pipe-separated text.
            // Keep only the text before the first pipe. A plain substring is
            // used because split("|") treats the pipe as a regex alternation
            // and returns single characters.
            if (collectionCategories != null) {
                int pipe = collectionCategories.indexOf('|');
                if (pipe != -1) {
                    collectionCategories = collectionCategories.substring(0, pipe);
                }
            }
            String allCollections = record.getChildText("allCollections");
            String subject = record.getChildText("subject");
            String scope = record.getChildText("scope");
            LOG.info("Looking at scope [" + scope + "] subject [" + subject + "] collectionCategories ["
                    + collectionCategories + "] w/ collections [" + allCollections + "]");
            // As long as one of the fields is populated we have something to do...
            if (collectionCategories == null && allCollections == null && subject == null) {
                continue;
            }
            UriCollection collection = filterUriCollection(collectionCategories, allCollections, subject);
            LOG.info("Filtered to " + collection);
            if (collection == null) {
                continue;
            }
            // getChildText() returns null for a missing element; such a
            // record cannot be registered anywhere.
            if (urls == null || scope == null) {
                LOG.warn("Skipping record without urls or scope: scope [" + scope + "] urls [" + urls + "]");
                continue;
            }
            // There should be no scope beyond those created in the Constructor.
            addCollection(scope, urls, collection);
        }
        for (String key : ann.getCollections().keySet()) {
            LOG.info("Processed " + ann.getCollections().get(key).size() + " URIs for collection " + key);
        }
    }

    /**
     * Registers a collection for every URL in a whitespace-separated list,
     * keyed according to the scope:
     * <ul>
     * <li><code>resource</code>: the URL string itself</li>
     * <li><code>root</code>: <code>scheme://host</code></li>
     * <li><code>subdomains</code>: the host only</li>
     * </ul>
     * Input that cannot be keyed (missing scope or URLs, a scope with no
     * collection map, a malformed URL, a URL without a host) is logged and
     * skipped rather than raising a NullPointerException or storing a
     * <code>null</code> / <code>"scheme://null"</code> key.
     * 
     * @param scope
     *            the scope name used to pick the target collection map
     * @param urls
     *            one or more URLs separated by whitespace
     * @param collection
     *            the collection to associate with each URL
     */
    private void addCollection(String scope, String urls, UriCollection collection) {
        if (scope == null || urls == null) {
            LOG.warn("Cannot add collection " + collection + ": scope [" + scope + "] urls [" + urls + "]");
            return;
        }
        LOG.debug("Adding " + urls + " to collection " + collection);
        HashMap<String, UriCollection> relevantCollection = ann.getCollections().get(scope);
        if (relevantCollection == null) {
            LOG.warn("Unknown scope [" + scope + "]; not adding " + urls);
            return;
        }
        // trim() so that leading whitespace does not produce an empty first
        // token.
        for (String url : urls.trim().split("\\s+")) {
            if (url.isEmpty()) {
                // urls was blank.
                continue;
            }
            if (scope.equals("resource")) {
                /*
                 * FIXME try { // Trac #2271: try keying on canonicalized URL.
                 * url = canon.urlStringToKey(url); } catch( URIException u ) {
                 * LOG.warn("Problem parsing URL: " + u.getMessage() + ": " +
                 * url); }
                 */
                relevantCollection.put(url, collection);
                continue;
            }
            URI uri;
            try {
                uri = new URI(url);
            } catch (URISyntaxException e) {
                LOG.warn("Skipping malformed URL: " + url, e);
                continue;
            }
            String host = uri.getHost();
            if (host == null) {
                // URI.getHost() is null for URIs without a parseable host.
                LOG.warn("Skipping URL without a host: " + url);
                continue;
            }
            if (scope.equals("root")) {
                relevantCollection.put(uri.getScheme() + "://" + host, collection);
            } else if (scope.equals("subdomains")) {
                relevantCollection.put(host, collection);
            }
        }
    }

    /**
     * Returns the annotations built up by this instance.
     * 
     * @return the {@link Annotations} object held by this instance (the same
     *         object, not a copy)
     */
    public Annotations getAnnotations() {
        return ann;
    }

    /**
     * Downloads every page of a taxonomy JSON export and stores each term in
     * the given map, keyed by its <code>tid</code>.
     * 
     * Paging follows the <code>next</code> link of each response, rewritten so
     * that it requests the <code>.json</code> representation, until a page
     * has no such link.
     * 
     * @param map
     *            receives the terms, keyed by term id
     * @param startUrl
     *            URL of the first page
     * @throws IOException
     *             if a page cannot be downloaded or contains no JSON
     */
    private void getTaxonomyViaJson(Map<Integer, JsonNode> map, String startUrl) throws IOException {
        // One mapper is enough for all pages.
        ObjectMapper mapper = new ObjectMapper();
        String nextUrl = startUrl;
        // Grab all the pages of the taxonomy:
        do {
            String thisUrl = nextUrl;
            LOG.info("Getting taxnomy export from ACT... " + thisUrl);
            String json = readAct(thisUrl);

            // Map it to JsonNode tree:
            JsonParser jp = mapper.getJsonFactory().createJsonParser(json);
            JsonNode root = jp.readValueAsTree();
            if (root == null) {
                throw new IOException("No JSON content returned by " + thisUrl);
            }

            // path() iterates as empty when "list" is absent, where get()
            // would hand null to the for-each.
            for (JsonNode node : root.path("list")) {
                JsonNode tid = node.get("tid");
                if (tid == null) {
                    LOG.warn("Skipping taxonomy term without a tid: " + node);
                    continue;
                }
                // asText() accepts both textual and numeric ids.
                map.put(Integer.valueOf(tid.asText()), node);
            }

            // Look up the next URL:
            nextUrl = root.path("next").getTextValue();
            if (nextUrl != null) {
                nextUrl = nextUrl.replaceFirst("\\?", "\\.json\\?");
            }
        } while (nextUrl != null);
    }

    /**
     * Loads the subjects and collections taxonomies via JSON and records a
     * date range for every collection whose top-most ancestor is marked as
     * published.
     * 
     * The collection is recorded under its full category path. Missing
     * <code>field_publish</code> or <code>field_dates</code> values are
     * treated as "not published" and "no date" respectively rather than
     * raising a NullPointerException.
     * 
     * @throws JsonParseException
     *             if a response is not valid JSON
     * @throws IOException
     *             if a download fails
     */
    private void getCollectionsViaJson() throws IOException {
        // Get the subjects taxonomy:
        this.getTaxonomyViaJson(sm, AnnotationsFromAct.WARC_SUBJECTS_URL_JSON);
        // Get the collections taxonomy:
        this.getTaxonomyViaJson(cm, AnnotationsFromAct.WARC_COLLECTIONS_URL_JSON);

        // Now patch up the parent-child relationships etc.
        for (JsonNode node : cm.values()) {
            // Ancestors first, the node itself last:
            List<JsonNode> cats = this.resolveParents(node);
            String catPath = this.getCatPath(cats);

            // Publication status and dates are taken from the root collection.
            JsonNode root = cats.get(0);
            // path() returns a "missing" node instead of null, and that node
            // reads as false / null below.
            if (!root.path("field_publish").getBooleanValue()) {
                LOG.debug("Skipping unpublished collection with path: " + catPath);
                continue;
            }
            JsonNode dates = root.path("field_dates");
            String start = dates.path("value").getTextValue();
            String end = dates.path("value2").getTextValue();
            ann.getCollectionDateRanges().put(catPath, new DateRange(start, end));
        }
    }

    /**
     * Joins the names of the given categories into a single path string,
     * using <code>|</code> between consecutive names.
     * 
     * @param cats
     *            the categories, in the order they should appear in the path
     * @return the pipe-separated path (empty for an empty list)
     */
    private String getCatPath(List<JsonNode> cats) {
        StringBuilder path = new StringBuilder();
        // Empty before the first name, a pipe before every later one.
        String separator = "";
        for (JsonNode cat : cats) {
            path.append(separator).append(cat.get("name").getTextValue());
            separator = "|";
        }
        return path.toString();
    }

    /**
     * Inserts the given category, and recursively each of its known parents,
     * at the front of the list, so that the list ends up ordered from the
     * top-most ancestor down to the category itself.
     * 
     * A parent reference whose id is not present in the category map is
     * logged and skipped instead of causing a NullPointerException, and a
     * category without a <code>parent</code> field is treated as having no
     * parents.
     * 
     * @param c
     *            the category to add
     * @param cats
     *            the list being built up
     */
    private void resolveParents(JsonNode c, List<JsonNode> cats) {
        // Store this item:
        cats.add(0, c);
        // Loop through the parents (although there is only ever one in this
        // dataset):
        for (JsonNode parentRef : c.path("parent")) {
            // asInt() accepts numeric and textual ids; getIntValue() would
            // silently return 0 for a textual one.
            JsonNode parent = cm.get(parentRef.path("id").asInt(-1));
            if (parent == null) {
                LOG.warn("Ignoring unknown parent " + parentRef + " of category " + c.path("name"));
                continue;
            }
            resolveParents(parent, cats);
        }
    }

    /**
     * Returns the given category preceded by its known ancestors.
     * 
     * @param c
     *            the category to start from
     * @return a new list, top-most ancestor first and <code>c</code> last
     */
    private List<JsonNode> resolveParents(JsonNode c) {
        List<JsonNode> cats = new ArrayList<JsonNode>();
        this.resolveParents(c, cats);
        return cats;
    }

    /**
     * Downloads every page of the targets JSON export and registers each
     * target's URLs against its collections and subject.
     * 
     * For each target the collection categories are resolved through the
     * category map into full paths, and the subject through the subject map.
     * References to unknown categories or subjects, and targets without a
     * scope, are logged and skipped instead of raising a
     * NullPointerException.
     * 
     * @throws IOException
     *             if a page cannot be downloaded or contains no JSON
     */
    private void getTargetsViaJson() throws IOException {
        String actUrl = "http://www.webarchive.org.uk/act/node.json?type=url";
        int page = 0;
        // A negative value means "no page limit".
        int maxPage = -1;
        // One mapper is enough for all pages.
        ObjectMapper mapper = new ObjectMapper();
        do {
            page++;
            LOG.info("Getting page " + page + " of targets export from ACT... " + actUrl);
            String targets = readAct(actUrl);

            JsonParser jp = mapper.getJsonFactory().createJsonParser(targets);
            JsonNode root = jp.readValueAsTree();
            if (root == null) {
                throw new IOException("No JSON content returned by " + actUrl);
            }

            // path() yields an empty iteration / null text for absent fields
            // where get() would return null and fail.
            for (JsonNode node : root.path("list")) {
                String title = node.path("title").getTextValue();
                String scope = node.path("field_scope").getTextValue();
                LOG.debug("Got \"" + title + "\" with scope: " + scope);
                if (scope == null) {
                    LOG.warn("Skipping target without a scope: " + title);
                    continue;
                }
                String collectionCategories = null;
                List<String> allCollections = new ArrayList<String>();
                String[] subjects = null;
                // Add on the categories:
                for (JsonNode cat : node.path("field_collection_categories")) {
                    // asInt() accepts textual and numeric ids; -1 never
                    // matches a stored term.
                    Integer cid = cat.path("id").asInt(-1);
                    JsonNode catd = cm.get(cid);
                    if (catd == null) {
                        LOG.warn("NULL catd for id=" + cid + " from: " + title);
                        continue;
                    }
                    LOG.debug("collectionCategories: " + catd.get("name").getTextValue());
                    // Get the parent categories:
                    List<JsonNode> catds = this.resolveParents(catd);
                    // Turn that into a string representation:
                    allCollections.add(this.getCatPath(catds));
                    // The first category's root collection names the target.
                    if (collectionCategories == null) {
                        collectionCategories = catds.get(0).get("name").getTextValue();
                    }
                }
                // Get the Subject:
                JsonNode subjectRef = node.path("field_subject").path("id");
                if (!subjectRef.isMissingNode()) {
                    JsonNode subjectTerm = sm.get(subjectRef.asInt(-1));
                    if (subjectTerm == null) {
                        LOG.warn("NULL subject for id=" + subjectRef + " from: " + title);
                    } else {
                        String subject = subjectTerm.get("name").getTextValue();
                        LOG.debug("Found a SUBJECT: " + subjectRef + " > " + subject);
                        subjects = new String[] { subject };
                    }
                }
                // A zero-length seed array keeps an empty list empty;
                // new String[1] would yield a one-element array holding null.
                UriCollection uc = new UriCollection(collectionCategories, allCollections.toArray(new String[0]),
                        subjects);
                for (JsonNode url : node.path("field_url")) {
                    String target = url.path("url").getTextValue();
                    LOG.debug("Got " + target);
                    if (target == null) {
                        continue;
                    }
                    // Add to the collection:
                    addCollection(scope, target, uc);
                }
            }

            // Look up the next page URL:
            actUrl = root.path("next").getTextValue();
            if (actUrl != null) {
                actUrl = actUrl.replaceFirst("\\?", "\\.json\\?");
            }
        } while (actUrl != null && (page < maxPage || maxPage < 0));

        // Summarise the result:
        for (String key : ann.getCollections().keySet()) {
            LOG.info("Processed " + ann.getCollections().get(key).size() + " URIs for collection " + key);
        }
    }

    /**
     * Command-line entry point: logs in to ACT, loads the collections and the
     * targets via the JSON exports, and writes the resulting annotations to
     * <code>annotations.json</code>.
     * 
     * @param args
     *            not used
     * @throws IOException
     *             if a request to ACT fails
     * @throws MalformedURLException
     *             if one of the URLs is invalid
     * @throws JsonParseException
     *             if a response is not valid JSON
     * @throws JDOMException
     *             declared for callers; see the XML parsing methods
     */
    public static void main(String[] args)
            throws JsonParseException, MalformedURLException, IOException, JDOMException {
        final String outputFile = "annotations.json";

        LOG.info("Logging into ACT...");
        // The String constructor performs no requests of its own.
        final AnnotationsFromAct exporter = new AnnotationsFromAct("dummy");
        exporter.actLogin();

        // Collections before targets: targets are resolved against them.
        exporter.getCollectionsViaJson();
        exporter.getTargetsViaJson();

        LOG.info("Writing annotations to: " + outputFile);
        exporter.getAnnotations().toJsonFile(outputFile);
        LOG.info("...done.");
    }

}