org.dspace.content.authority.LCNameAuthority.java Source code

Java tutorial

Introduction

Here is the source code for org.dspace.content.authority.LCNameAuthority.java

Source

/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.content.authority;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.log4j.Logger;

import org.dspace.content.DCPersonName;
import org.dspace.core.ConfigurationManager;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.util.EncodingUtil;

/**
 * Sample personal name authority based on Library of Congress Name Authority
 * Also serves as an example of an SRU client as authority.
 *
 * This is tuned for the data in the LC Name Authority test instance, see
 * http://alcme.oclc.org/srw/search/lcnaf
 *
 * WARNING: This is just a proof-of-concept implementation.  It would need
 * WARNING: lots of refinement to be used in production, because it is very
 * WARNING: sloppy about digging through the MARC/XML results.  No doubt
 * WARNING: it is losing a lot of valid results and information.
 * WARNING: Could also do a better job including more info (title, life dates
 * WARNING: etc) in the label instead of just the name.
 *
 * Reads these DSpace Config properties:
 *
 *      lcname.url = http://alcme.oclc.org/srw/search/lcnaf
 *
 *  TODO: make # of results to ask for (and return) configurable.
 *
 * @author Larry Stone
 * @version $Revision $
 */
public class LCNameAuthority implements ChoiceAuthority {
    private static final Logger log = Logger.getLogger(LCNameAuthority.class);

    // Base URL of the SRU service, read once from the "lcname.url" config key.
    private static String url = null;

    // NS URI for SRU responses
    private static final String NS_SRU = "http://www.loc.gov/zing/srw/";

    // NS URI for MARC/XML
    private static final String NS_MX = "http://www.loc.gov/MARC21/slim";

    // Page size used when the caller does not supply a positive limit.
    // XXX arbitrary default - should be configurable?
    private static final int DEFAULT_LIMIT = 50;

    /**
     * Creates the authority; the first construction also loads the shared
     * service URL from the DSpace configuration.
     *
     * @throws IllegalStateException if the {@code lcname.url} property is not set
     */
    public LCNameAuthority() {
        if (url == null) {
            url = ConfigurationManager.getProperty("lcname.url");

            // sanity check
            if (url == null) {
                throw new IllegalStateException("Missing DSpace configuration keys for LCName Query");
            }
        }
    }

    /**
     * Returns the "best" match for a value. This is a punt: it simply asks
     * {@link #getMatches} for the first two results.
     *
     * @param field      metadata field (not used by this implementation)
     * @param text       proposed name value
     * @param collection collection id (not used by this implementation)
     * @param locale     locale (not used by this implementation)
     * @return the matches found, never null
     */
    public Choices getBestMatch(String field, String text, int collection, String locale) {
        return getMatches(field, text, collection, 0, 2, locale);
    }

    /**
     * Match a proposed value against name authority records.
     * Value is assumed to be in "Lastname, Firstname" format.
     *
     * @param field      metadata field (not used by this implementation)
     * @param text       proposed name value
     * @param collection collection id (not used by this implementation)
     * @param start      zero-based index of the first result wanted
     * @param limit      maximum number of results; non-positive selects a default
     * @param locale     locale (not used by this implementation)
     * @return the matches found, or an error-flagged Choices on failure; never null
     */
    public Choices getMatches(String field, String text, int collection, int start, int limit, String locale) {
        Choices result = queryPerson(text, start, limit);
        if (result == null) {
            result = new Choices(true);
        }

        return result;
    }

    // punt; supposed to get the canonical display form of a metadata authority key
    // XXX FIXME implement this with a query on the authority key, cache results
    public String getLabel(String field, String key, String locale) {
        return key;
    }

    /**
     * Guts of the implementation: builds the CQL query, performs the SRU
     * request and parses the MARC/XML response.
     *
     * @param text  name to look up, in "Lastname, Firstname" format
     * @param start zero-based index of the first result wanted (negative is treated as 0)
     * @param limit maximum number of results (non-positive selects the default)
     * @return a complete Choices result, or an error-flagged Choices when the
     *         text is blank, the HTTP request fails, or the response cannot be parsed
     */
    private Choices queryPerson(String text, int start, int limit) {
        // punt if there is no query text
        if (text == null || text.trim().length() == 0) {
            return new Choices(true);
        }

        // SRU record positions are 1-based and the page size must be positive,
        // so normalize nonsensical paging arguments instead of sending them on.
        if (start < 0) {
            start = 0;
        }
        if (limit <= 0) {
            limit = DEFAULT_LIMIT;
        }

        // 1. build CQL query; name parts are escaped so that quotes or
        //    backslashes in user input cannot terminate the quoted term.
        DCPersonName pn = new DCPersonName(text);
        StringBuilder query = new StringBuilder();
        query.append("local.FirstName = \"").append(escapeCql(pn.getFirstNames()))
                .append("\" and local.FamilyName = \"").append(escapeCql(pn.getLastName())).append("\"");

        NameValuePair args[] = new NameValuePair[6];
        args[0] = new NameValuePair("operation", "searchRetrieve");
        args[1] = new NameValuePair("version", "1.1");
        args[2] = new NameValuePair("recordSchema", "info:srw/schema/1/marcxml-v1.1");
        args[3] = new NameValuePair("query", query.toString());
        args[4] = new NameValuePair("maximumRecords", String.valueOf(limit));
        args[5] = new NameValuePair("startRecord", String.valueOf(start + 1));
        HttpClient hc = new HttpClient();
        String srUrl = url + "?" + EncodingUtil.formUrlEncode(args, "UTF8");
        GetMethod get = new GetMethod(srUrl);

        log.debug("Trying SRU query, URL=" + srUrl);

        // 2. web request
        try {
            int status = hc.executeMethod(get);
            if (status != 200) {
                log.warn("SRU query returned HTTP status " + status + ", URL=" + srUrl);
                return new Choices(true);
            }

            InputStream body = get.getResponseBodyAsStream();
            if (body == null) {
                log.warn("SRU query returned no response body, URL=" + srUrl);
                return new Choices(true);
            }

            // 3. parse the response
            SAXParser sp = newHardenedParserFactory().newSAXParser();
            XMLReader xr = sp.getXMLReader();
            SRUHandler handler = new SRUHandler();

            xr.setFeature("http://xml.org/sax/features/namespaces", true);
            xr.setContentHandler(handler);
            xr.setErrorHandler(handler);
            xr.parse(new InputSource(body));

            return buildChoices(handler, start);
        } catch (HttpException e) {
            log.error("SRU query failed: ", e);
            return new Choices(true);
        } catch (IOException e) {
            log.error("SRU query failed: ", e);
            return new Choices(true);
        } catch (ParserConfigurationException e) {
            log.warn("Failed parsing SRU result: ", e);
            return new Choices(true);
        } catch (SAXException e) {
            log.warn("Failed parsing SRU result: ", e);
            return new Choices(true);
        } finally {
            get.releaseConnection();
        }
    }

    /**
     * Turns the parsed handler state into a Choices result.
     *
     * @param handler handler that has consumed a complete SRU response
     * @param start   zero-based index of the first result that was requested
     * @return the Choices describing this page of results
     */
    private static Choices buildChoices(SRUHandler handler, int start) {
        int found = handler.result.size();
        int total = handler.hits;

        if (total < 0) {
            // The response never reported numberOfRecords; fall back to what
            // was actually parsed rather than reporting a negative total.
            log.warn("SRU response did not report numberOfRecords; using the " + found
                    + " records actually parsed.");
            total = start + found;
        } else if (total != found) {
            // this probably just means more results available..
            log.warn("Discrepency in results, result.length=" + found
                    + ", yet expected results=" + total);
        }
        boolean more = total > (start + found);

        // XXX add non-auth option; perhaps the UI should do this?
        // XXX it's really a policy matter if they allow unauth result.

        int confidence;
        if (total == 0) {
            confidence = Choices.CF_NOTFOUND;
        } else if (total == 1) {
            confidence = Choices.CF_UNCERTAIN;
        } else {
            confidence = Choices.CF_AMBIGUOUS;
        }
        return new Choices(handler.result.toArray(new Choice[found]), start, total, confidence, more);
    }

    /**
     * Creates a non-validating SAX parser factory with protections against
     * external-entity (XXE) processing, since the document being parsed comes
     * from a remote server. Each protection is applied on a best-effort basis
     * so that a parser lacking one of the features still works.
     *
     * @return the configured factory
     */
    private static SAXParserFactory newHardenedParserFactory() {
        SAXParserFactory spf = SAXParserFactory.newInstance();
        spf.setValidating(false);
        trySetFeature(spf, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        trySetFeature(spf, "http://xml.org/sax/features/external-general-entities", false);
        trySetFeature(spf, "http://xml.org/sax/features/external-parameter-entities", false);
        return spf;
    }

    /**
     * Sets a parser feature, logging (rather than failing) when the
     * underlying parser does not recognize or support it.
     */
    private static void trySetFeature(SAXParserFactory spf, String feature, boolean value) {
        try {
            spf.setFeature(feature, value);
        } catch (ParserConfigurationException e) {
            log.warn("XML parser does not support feature " + feature, e);
        } catch (SAXException e) {
            log.warn("XML parser does not support feature " + feature, e);
        }
    }

    /**
     * Escapes a value for use inside a double-quoted CQL term by prefixing
     * backslashes and double quotes with a backslash.
     *
     * @param term raw value, may be null
     * @return the escaped value; the empty string when term is null
     */
    private static String escapeCql(String term) {
        if (term == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(term.length() + 8);
        for (int i = 0; i < term.length(); i++) {
            char c = term.charAt(i);
            if (c == '\\' || c == '"') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }

    /**
     * XXX FIXME TODO: Very sloppy MARC/XML parser.
     * This only reads subfields 010.a (for LCCN, to use as key),
     * 100.a (for "established personal name") and 100.d (appended to the name).
     * Maybe look at Indicator on 100 too.
     * Should probably read other 100 subfields to build a more detailed label.
     */
    private static class SRUHandler extends DefaultHandler {
        // one Choice per SRU record that carried both an LCCN and a name
        private List<Choice> result = new ArrayList<Choice>();
        // value of numberOfRecords, or -1 if the response never supplied it
        private int hits = -1;
        // text collected since the most recent start tag; null if none seen
        private StringBuilder textValue = null;
        private String name = null;
        private String lccn = null;
        // tag attribute of the most recent MARC datafield
        private String lastTag = null;
        // code attribute of the most recent MARC subfield
        private String lastCode = null;

        // NOTE:  text value MAY be presented in multiple calls, even if
        // it all one word, so be ready to splice it together.
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (length > 0) {
                if (textValue == null) {
                    textValue = new StringBuilder();
                }
                textValue.append(ch, start, length);
            }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
            String text = (textValue == null) ? null : textValue.toString();

            if (localName.equals("numberOfRecords") && namespaceURI.equals(NS_SRU)) {
                // Report a bad count as a parse failure (checked) so the
                // caller's existing SAXException handling deals with it.
                if (text == null) {
                    throw new SAXException("Empty numberOfRecords element in SRU response");
                }
                try {
                    hits = Integer.parseInt(text.trim());
                } catch (NumberFormatException e) {
                    throw new SAXException("Non-numeric numberOfRecords in SRU response: " + text, e);
                }
                if (hits > 0) {
                    name = null;
                    lccn = null;
                    log.debug("Expecting " + hits + " records in results.");
                }
            }

            // after record get next hit ready
            else if (localName.equals("record") && namespaceURI.equals(NS_SRU)) {
                if (name != null && lccn != null) {
                    // HACK: many LC name entries end with ',' ...trim it.
                    if (name.endsWith(",")) {
                        name = name.substring(0, name.length() - 1);
                    }

                    result.add(new Choice(lccn, name, name));
                } else {
                    log.warn("Got anomalous result, at least one of these null: lccn=" + lccn + ", name=" + name);
                }
                name = null;
                lccn = null;
            }

            else if (localName.equals("subfield") && namespaceURI.equals(NS_MX)) {
                if (lastTag != null && lastCode != null) {
                    if (lastTag.equals("010") && lastCode.equals("a")) {
                        // 010.a is lccn, "authority code"
                        lccn = text;
                    } else if (lastTag.equals("100") && lastCode.equals("a")) {
                        // 100.a is the personal name
                        name = text;
                    } else if (lastTag.equals("100") && lastCode.equals("d") && name != null && text != null) {
                        // 100.d is appended to the name; an empty subfield is
                        // skipped so the label never contains a literal "null"
                        name = name + "  " + text;
                    }
                }
            }
        }

        // Resets the collected text and remembers which MARC datafield /
        // subfield is being entered.
        // subclass overriding this MUST call it with super()
        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
                throws SAXException {
            textValue = null;

            if (localName.equals("datafield") && namespaceURI.equals(NS_MX)) {
                lastTag = atts.getValue("tag");
                if (lastTag == null) {
                    log.warn("MARC datafield without tag attribute!");
                }
            } else if (localName.equals("subfield") && namespaceURI.equals(NS_MX)) {
                lastCode = atts.getValue("code");
                if (lastCode == null) {
                    log.warn("MARC subfield without code attribute!");
                }
            }
        }

        // Recoverable parse errors are escalated so the query is reported as failed.
        @Override
        public void error(SAXParseException exception) throws SAXException {
            throw new SAXException(exception);
        }

        @Override
        public void fatalError(SAXParseException exception) throws SAXException {
            throw new SAXException(exception);
        }
    }
}