org.vivoweb.harvester.fetch.WOSFetch.java Source code

Java tutorial

Introduction

Here is the source code for org.vivoweb.harvester.fetch.WOSFetch.java

Source

/*******************************************************************************
 * Copyright (c) 2010-2011 VIVO Harvester Team. For full list of contributors, please see the AUTHORS file provided.
 * All rights reserved.
 * This program and the accompanying materials are made available under the terms of the new BSD license which accompanies this distribution, and is available at http://www.opensource.org/licenses/bsd-license.html
 ******************************************************************************/
package org.vivoweb.harvester.fetch;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.vivoweb.harvester.util.FileAide;
import org.vivoweb.harvester.util.InitLog;
import org.vivoweb.harvester.util.SOAPMessenger;
import org.vivoweb.harvester.util.XPathTool;
import org.vivoweb.harvester.util.args.ArgDef;
import org.vivoweb.harvester.util.args.ArgList;
import org.vivoweb.harvester.util.args.ArgParser;
import org.vivoweb.harvester.util.args.UsageException;
import org.vivoweb.harvester.util.repo.RecordHandler;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.collections.map.HashedMap;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Fetches records from the Web of Science (WOS) SOAP services and stores them in a RecordHandler.
 * <p>
 * A run authenticates against the authorization service, pages through the results of the supplied search
 * message, looks the harvested record identifiers (UT) up through the Links Article Match Retrieval (LAMR)
 * service and finally closes the session.
 * </p>
 * <p>
 * All XML messages are encoded and decoded as UTF-8. The message streams are read while the task runs, so an
 * instance is intended to be executed once.
 * </p>
 */
public class WOSFetch {
    /**
     * SLF4J Logger
     */
    private static final Logger log = LoggerFactory.getLogger(WOSFetch.class);

    /**
     * Character set used for every XML message that is sent or received
     */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Number of identifiers collected before a LAMR request is sent
     */
    private static final int LAMR_BATCH_SIZE = 50;

    /**
     * Pause between two consecutive search requests, in milliseconds
     */
    private static final long SEARCH_DELAY_MILLIS = 100L;

    /**
     * RecordHandler to put data in.
     */
    private RecordHandler outputRH;

    /**
     * URL to send the authorization and close-session messages to
     */
    private URL authUrl;

    /**
     * URL to send the search messages to
     */
    private URL searchUrl;

    /**
     * URL to send the links article match retrieval (LAMR) messages to
     */
    private URL lamrUrl;

    /**
     * Inputstream with SOAP style XML message to get authorization
     */
    private InputStream authMessage;

    /**
     * Inputstream with SOAP style XML message to close session
     */
    private InputStream closeMessage;

    /**
     * Inputstream with SOAP style XML message to perform the search
     */
    private InputStream searchFile;

    /**
     * Inputstream with the XML message template used for LAMR requests
     */
    private InputStream lamrFile;

    /**
     * The String version of the LAMR xml message template, or null when no template could be read
     */
    private String lamrMessage;

    /**
     * the set of identifiers which are then requested from the LAMR site.
     */
    private Set<String> lamrSet;

    /**
     * The authentication sessionID
     */
    private String sessionID;

    /**
     * This string is where the base64 encoded user name and password combination is to be stored
     */
    private String usernamePassword;

    /**
     * Constructor
     * @param authurl The location of the authorization site
     * @param searchurl The location of the search site
     * @param lamrurl The location of the links article match retrieval site
     * @param outputRH The record handler used for storing the harvested records
     * @param xmlSearchFile the file path to the search query message
     * @param xmlLamrFile the file path to the links article match retrieval message
     * @param userPass the user name password string to be base 64 encoded
     * @throws IOException error talking with database
     */
    public WOSFetch(URL authurl, URL searchurl, URL lamrurl, RecordHandler outputRH, String xmlSearchFile,
            String xmlLamrFile, String userPass) throws IOException {
        init(authurl, searchurl, lamrurl, outputRH, null, FileAide.getInputStream(xmlSearchFile),
                FileAide.getInputStream(xmlLamrFile), userPass);
    }

    /**
     * Command line Constructor
     * @param args commandline arguments
     * @throws IOException error creating task
     * @throws UsageException user requested usage message
     */
    private WOSFetch(String[] args) throws IOException, UsageException {
        this(getParser().parse(args));
    }

    /**
     * Arglist Constructor
     * @param args option set of parsed args
     * @throws IOException error creating task
     */
    private WOSFetch(ArgList args) throws IOException {
        // Resolve the arguments in the order they are passed to init so failures surface in a stable order.
        URL authorizationUrl = new URL(args.get("u"));
        URL searchSiteUrl = new URL(args.get("c"));
        URL lamrSiteUrl = new URL(args.get("l"));
        RecordHandler output = RecordHandler.parseConfig(args.get("o"), args.getValueMap("O"));
        // The authorization and LAMR messages are optional; a null stream selects the built-in default
        // authorization message, respectively disables the LAMR lookups.
        InputStream authStream = args.has("a") ? FileAide.getInputStream(args.get("a")) : null;
        InputStream searchStream = FileAide.getInputStream(args.get("s"));
        InputStream lamrStream = args.has("m") ? FileAide.getInputStream(args.get("m")) : null;
        init(authorizationUrl, searchSiteUrl, lamrSiteUrl, output, authStream, searchStream, lamrStream,
                args.get("p"));
    }

    /**
     * Library style Constructor
     * @param authorizationUrl The location of the authorization site
     * @param searchUrl The location of the search site
     * @param lamrhUrl The location of the links article match retrieval site
     * @param output The record handler used for storing the harvested records
     * @param xmlAuthStream the stream to the authorization message
     * @param xmlSearchStream the stream to the search query message
     * @param xmlLamrStream the stream to the links article match retrieval message
     * @param usernamePassword the user name password string to be base 64 encoded
     */
    public WOSFetch(URL authorizationUrl, URL searchUrl, URL lamrhUrl, RecordHandler output,
            InputStream xmlAuthStream, InputStream xmlSearchStream, InputStream xmlLamrStream,
            String usernamePassword) {
        init(authorizationUrl, searchUrl, lamrhUrl, output, xmlAuthStream, xmlSearchStream, xmlLamrStream,
                usernamePassword);
    }

    /**
     * The initializing method called on via the constructors.
     * @param authorizationUrl The location of the authorization site
     * @param searchUrl The location of the search site
     * @param lamrUrl The location of the links article match retrieval site
     * @param output The record handler used for storing the harvested records
     * @param xmlAuthStream the stream to the authorization message, null to use the built-in message
     * @param xmlSearchStream the stream to the search query message
     * @param xmlLamrStream the stream to the links article match retrieval message, null to skip LAMR lookups
     * @param usernamePassword the user name password string to be base 64 encoded, may be null
     */
    private void init(URL authorizationUrl, URL searchUrl, URL lamrUrl, RecordHandler output,
            InputStream xmlAuthStream, InputStream xmlSearchStream, InputStream xmlLamrStream,
            String usernamePassword) {
        String authString = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                + "<soap:Envelope xmlns:soap=\"http://schemas.xmlsoap.org/soap/envelope/\" "
                + "xmlns:ns2=\"http://auth.cxf.wokmws.thomsonreuters.com\">"
                + "<soap:Body><ns2:authenticate/></soap:Body></soap:Envelope>";

        String closeString = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                + "<soap:Envelope xmlns:soap=\"http://schemas.xmlsoap.org/soap/envelope/\" "
                + "xmlns:ns2=\"http://auth.cxf.wokmws.thomsonreuters.com\">"
                + "<soap:Body><ns2:closeSession/></soap:Body></soap:Envelope>";

        this.outputRH = output;
        this.authUrl = authorizationUrl;
        if (usernamePassword == null) {
            this.usernamePassword = null;
        } else {
            // HTTP Basic authentication expects the standard Base64 alphabet with padding; the URL-safe
            // variant replaces '+' and '/' and drops the padding, which strict servers reject.
            this.usernamePassword = new String(Base64.encodeBase64(usernamePassword.getBytes(UTF8)), UTF8);
        }
        if (xmlAuthStream == null) {
            this.authMessage = new ByteArrayInputStream(authString.getBytes(UTF8));
        } else {
            this.authMessage = xmlAuthStream;
        }
        this.closeMessage = new ByteArrayInputStream(closeString.getBytes(UTF8));
        this.searchUrl = searchUrl;
        this.searchFile = xmlSearchStream;

        this.lamrUrl = lamrUrl;
        this.lamrFile = xmlLamrStream;
        this.lamrMessage = null;
        if (this.lamrFile == null) {
            log.debug("LAMR message = null");
        } else {
            try {
                this.lamrMessage = IOUtils.toString(this.lamrFile, UTF8.name());
            } catch (IOException e) {
                // Without the template no LAMR request can be built; the search harvest can still run.
                log.error("Unable to read the LAMR message, LAMR lookups will be skipped", e);
            }
        }
        this.lamrSet = new TreeSet<String>();

        log.debug("Checking for NULL values");
        if (this.outputRH == null) {
            log.debug("Outputfile = null");
            log.error("Must provide output file!");
        } else {
            log.debug("Outputfile = " + this.outputRH.toString());
        }

        if (this.searchFile == null) {
            log.debug("Search = null");
            log.error("Must provide Search message file!");
        } else {
            log.debug("Search = " + this.searchFile.toString());
        }

        if (this.authUrl == null) {
            log.debug("URL = null");
            log.error("Must provide authorization site url!");
        } else {
            log.debug("URL = " + this.authUrl.toString());
        }

        if (this.searchUrl == null) {
            log.debug("URL = null");
            log.error("Must provide Search site url!");
        } else {
            log.debug("URL = " + this.searchUrl.toString());
        }

        if (this.sessionID == null) {
            log.debug("SessionID = null");
            this.sessionID = "";
        } else {
            log.debug("SessionID = " + this.sessionID);
        }
    }

    /**
     * Parses an XML message with a namespace aware parser.
     * @param xml the encoded XML message
     * @return the parsed document
     * @throws SAXException the message is not well formed
     * @throws IOException the message could not be read
     * @throws ParserConfigurationException no suitable parser could be created
     */
    private static Document parseXml(byte[] xml) throws SAXException, IOException, ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true); // never forget this!
        return factory.newDocumentBuilder().parse(new ByteArrayInputStream(xml));
    }

    /**
     * Parses a numeric value read from a SOAP message.
     * @param value the text to parse
     * @param label the name of the value, used in the error message
     * @return the parsed number
     * @throws IOException the value is missing or not a number
     */
    private static int parseCount(String value, String label) throws IOException {
        if (value == null) {
            throw new IOException("Missing " + label + " value in SOAP message");
        }
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            throw new IOException("Invalid " + label + " value \"" + value + "\" in SOAP message", e);
        }
    }

    /**
     * @param documentNode a DOM node to be changed into a properly indented string
     * @return The indented string containing the node and sub-nodes; if serialization fails the error is
     * logged and whatever was written up to that point (possibly nothing) is returned
     */
    private String nodeToString(Node documentNode) {
        // The writer exists before anything can fail, so the return below never dereferences null.
        StringWriter writer = new StringWriter();
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
            transformer.transform(new DOMSource(documentNode), new StreamResult(writer));
        } catch (TransformerException e) {
            log.error("Unable to serialize XML node", e);
        } catch (TransformerFactoryConfigurationError e) {
            log.error("Unable to create XML transformer", e);
        }
        return writer.toString();
    }

    /**
     * Builds the search message for the next page of results.
     * @param previousQuery a WOS soap query xml message
     * @return the message with its first <code>firstRecord</code> element advanced by the value of its first
     * <code>count</code> element
     * @throws IOException thrown if there is an issue parsing the previousQuery string
     */
    private String getnextQuery(String previousQuery) throws IOException {
        try {
            Document searchDoc = parseXml(previousQuery.getBytes(UTF8));

            Node firstnode = searchDoc.getElementsByTagName("firstRecord").item(0);
            Node countnode = searchDoc.getElementsByTagName("count").item(0);
            if (firstnode == null || countnode == null) {
                throw new IOException("Search message must contain firstRecord and count elements");
            }

            int firstrecord = Integer.parseInt(firstnode.getTextContent().trim());
            log.debug("firstrecord = " + firstrecord);
            int count = Integer.parseInt(countnode.getTextContent().trim());
            log.debug("count= " + count);

            int newFirst = firstrecord + count;
            firstnode.setTextContent(Integer.toString(newFirst));
            log.debug("new First Record= " + newFirst);

            return nodeToString(searchDoc);
        } catch (NumberFormatException e) {
            throw new IOException("Search message contains a non numeric firstRecord or count", e);
        } catch (SAXException e) {
            throw new IOException(e);
        } catch (ParserConfigurationException e) {
            throw new IOException(e);
        }
    }

    /**
     * Writes every record of a search response to the record handler and queues its identifier for LAMR.
     * <p>
     * Parsing and storage problems are logged and end the extraction of this response; the records handled
     * up to that point are still returned.
     * </p>
     * @param responseXML String containing the results from the WOS soap query
     * @return the extracted records, keyed by record id with the serialized record as value
     */
    private Map<String, String> extractSearchRecords(String responseXML) {
        Map<String, String> recordMap = new HashMap<String, String>();
        int numRecords = 0;
        try {
            Document responseDoc = parseXml(responseXML.getBytes(UTF8));
            NodeList recordList = responseDoc.getElementsByTagName("records");
            for (int index = 0; index < recordList.getLength(); index++) {
                Element currentRecord = (Element) recordList.item(index);
                Node utNode = currentRecord.getElementsByTagName("UT").item(0);
                if (utNode == null) {
                    log.warn("Skipping search record without UT element");
                    continue;
                }
                String identifier = utNode.getTextContent();
                String id = "id_-_" + identifier;
                compileLamrList(identifier);
                Element recordRoot = responseDoc.createElement("Description");
                recordRoot.setAttribute("ID", identifier);
                // The node list is live: moving the record out of the document would shrink the list while
                // it is iterated and every other record would be skipped. Wrap a deep copy instead.
                recordRoot.appendChild(currentRecord.cloneNode(true));
                String data = nodeToString(recordRoot);
                recordMap.put(id, data);
                writeRecord(id, data);
                numRecords++;
            }
        } catch (SAXException e) {
            log.error("Unable to parse search response", e);
        } catch (IOException e) {
            log.error("Unable to store search records", e);
        } catch (ParserConfigurationException e) {
            log.error("Unable to create XML parser for search response", e);
        }
        log.debug("Extracted " + numRecords + " records from search");
        return recordMap;
    }

    /**
     * Finds the map element with the id "lookup" in a LAMR message.
     * @param lamrDoc the parsed LAMR message
     * @return the lookup element, or null if the message has none
     */
    private static Element findLookupMap(Document lamrDoc) {
        NodeList mapElements = lamrDoc.getElementsByTagName("map");
        for (int index = 0; index < mapElements.getLength(); index++) {
            Element currentmap = (Element) mapElements.item(index);
            if ("lookup".equals(currentmap.getAttribute("id"))) {
                log.debug("Found element lookup");
                return currentmap;
            }
        }
        return null;
    }

    /**
     * Sends one LAMR request for all queued identifiers and stores the returned records.
     * <p>
     * The queue is emptied afterwards, whether or not the request succeeded. Problems are logged and do not
     * interrupt the harvest.
     * </p>
     */
    private void executeLamrQuery() {
        if (this.lamrSet.isEmpty()) {
            log.debug("No LAMR query sent, empty LAMR set.");
            return;
        }
        try {
            if (this.lamrMessage == null) {
                log.warn("No LAMR query sent, no LAMR message available.");
                return;
            }
            Document lamrDoc = parseXml(this.lamrMessage.getBytes(UTF8));
            Element lookUp = findLookupMap(lamrDoc);
            if (lookUp == null) {
                log.error("No \"lookup\" node in LAMR query message");
                return;
            }

            //compile lamrquery with lamrSet
            for (String currentUT : this.lamrSet) {
                Element val = lamrDoc.createElement("val");
                val.setAttribute("name", "ut");
                val.setTextContent(currentUT);
                Element docMap = lamrDoc.createElement("map");
                docMap.setAttribute("name", "doc-" + currentUT);
                docMap.appendChild(val);
                lookUp.appendChild(docMap);
            }

            //send lamrquery
            ByteArrayOutputStream lamrResponse = new ByteArrayOutputStream();
            SOAPMessenger soapfetch = new SOAPMessenger(this.lamrUrl, lamrResponse,
                    new ByteArrayInputStream(nodeToString(lamrDoc).getBytes(UTF8)), "", null);
            soapfetch.execute();

            //write records
            log.debug("Extracting LAMR Records");
            int recordsFound = writeLamrRecords(parseXml(lamrResponse.toByteArray()));
            log.debug("Found " + recordsFound + " LAMR Records");
        } catch (SAXException e) {
            log.error("Unable to parse LAMR message", e);
        } catch (IOException e) {
            log.error("Unable to execute LAMR query", e);
        } catch (ParserConfigurationException e) {
            log.error("Unable to create XML parser for LAMR message", e);
        } finally {
            this.lamrSet.clear();
        }
    }

    /**
     * Stores every map element named "WOS" of a LAMR response that carries a non-empty "ut" value.
     * <p>
     * Each record is rebuilt from the name and text of its val elements so that no name spaces of the
     * response leak into the stored record.
     * </p>
     * @param lamrRespDoc the parsed LAMR response
     * @return the number of records written
     * @throws IOException Thrown if there is an issue with the recordhandler back-end
     */
    private int writeLamrRecords(Document lamrRespDoc) throws IOException {
        int recordsFound = 0;
        // records are in map elements.
        NodeList respMapList = lamrRespDoc.getElementsByTagName("map");
        for (int index = 0; index < respMapList.getLength(); index++) {
            Element currentNode = (Element) respMapList.item(index);
            // what we are looking for is found in maps named "WOS"
            if (!"WOS".equals(currentNode.getAttribute("name"))) {
                continue;
            }
            // each WOS node has the result formatted as named val nodes.
            NodeList valList = currentNode.getElementsByTagName("val");
            String ut = "";
            for (int valIndex = 0; valIndex < valList.getLength(); valIndex++) {
                Element currentVal = (Element) valList.item(valIndex);
                if ("ut".equals(currentVal.getAttribute("name"))) {
                    ut = currentVal.getTextContent();
                    break;
                }
            }
            // Compare by content: an identity check against "" does not detect an empty identifier.
            if (ut == null || ut.length() == 0) {
                continue;
            }
            recordsFound++;
            // for output similarity have the root node be Description
            Element recordRoot = lamrRespDoc.createElement("Description");
            recordRoot.setAttribute("ID", ut);
            Element currentDup = lamrRespDoc.createElement("map");
            currentDup.setAttribute("name", "WOS");
            for (int valIndex = 0; valIndex < valList.getLength(); valIndex++) {
                Element cur = (Element) valList.item(valIndex);
                Element childNode = lamrRespDoc.createElement(cur.getTagName());
                childNode.setAttribute("name", cur.getAttribute("name"));
                childNode.setTextContent(cur.getTextContent());
                currentDup.appendChild(childNode);
            }
            recordRoot.appendChild(currentDup);

            writeRecord("id_-_LAMR_-_" + ut, nodeToString(recordRoot));
        }
        return recordsFound;
    }

    /**
     * Stores one record in the output record handler.
     * @param id The identifying name for the record. (used as a filename in the text file record handler)
     * @param data A string representing the information to place within the record.
     * @throws IOException Thrown if there is an issue with the recordhandler back-end
     */
    public void writeRecord(String id, String data) throws IOException {
        log.trace("Adding Record " + id);
        this.outputRH.addRecord(id, data, this.getClass());
    }

    /**
     * Queues a record identifier for the next LAMR request and sends the request once a full batch of
     * identifiers has been collected.
     * @param id The identifier of the record to look up. (content of the UT element)
     */
    public void compileLamrList(String id) {
        log.trace("Adding LAMR UT = " + id);
        this.lamrSet.add(id);
        if (this.lamrSet.size() >= LAMR_BATCH_SIZE) {
            executeLamrQuery();
        }
    }

    /**
     * Executes the task
     * @throws IOException error communicating with the services, or a search message or response that does
     * not contain the expected numeric values
     */
    public void execute() throws IOException {
        if (this.searchFile == null) {
            throw new IllegalArgumentException("Must provide Search message file!");
        }
        String searchQuery = IOUtils.toString(this.searchFile, UTF8.name());

        ByteArrayOutputStream authResponse = new ByteArrayOutputStream();
        {
            HashMap<String, String> reqProp = new HashMap<String, String>();
            if (this.usernamePassword != null) {
                reqProp.put("Authorization", "Basic " + this.usernamePassword);
            }
            SOAPMessenger soapfetch = new SOAPMessenger(this.authUrl, authResponse, this.authMessage, "", reqProp);
            soapfetch.execute();
        }
        String authCode = XPathTool.getXpathStreamResult(new ByteArrayInputStream(authResponse.toByteArray()),
                "//return");
        this.sessionID = authCode;

        boolean completed = false;
        try {
            harvestSearchResults(searchQuery, authCode);
            executeLamrQuery();
            completed = true;
        } finally {
            // Always release the session on the server, also when the harvest failed.
            if (completed) {
                closeSession(authCode);
            } else {
                try {
                    closeSession(authCode);
                } catch (Exception e) {
                    // Only logged, so the failure that aborted the harvest is the one that propagates.
                    log.error("Unable to close session after failed harvest", e);
                }
            }
        }
    }

    /**
     * Pages through all results of a search and stores the records of every page.
     * @param initialQuery the search message for the first page
     * @param authCode the session identifier returned by the authorization service
     * @throws IOException error communicating with the search service or invalid paging values
     */
    private void harvestSearchResults(String initialQuery, String authCode) throws IOException {
        String searchQuery = initialQuery;
        while (true) {
            byte[] queryBytes = searchQuery.getBytes(UTF8);
            ByteArrayOutputStream searchResponse = new ByteArrayOutputStream();
            {
                SOAPMessenger soapfetch = new SOAPMessenger(this.searchUrl, searchResponse,
                        new ByteArrayInputStream(queryBytes), authCode, null);
                soapfetch.execute();
            }
            byte[] responseBytes = searchResponse.toByteArray();
            String recFound = XPathTool.getXpathStreamResult(new ByteArrayInputStream(responseBytes),
                    "//recordsFound");
            String searchCount = XPathTool.getXpathStreamResult(new ByteArrayInputStream(queryBytes),
                    "//retrieveParameters/count");
            String firstrecord = XPathTool.getXpathStreamResult(new ByteArrayInputStream(queryBytes),
                    "//retrieveParameters/firstRecord");

            extractSearchRecords(new String(responseBytes, UTF8));

            log.debug("Search count = \"" + searchCount + "\"");
            log.debug("Records Found = \"" + recFound + "\"");
            int recordsFound = parseCount(recFound, "recordsFound");
            int count = parseCount(searchCount, "count");
            int nextFirstRecord = parseCount(firstrecord, "firstRecord") + count;
            // firstRecord is treated as 1-based (NOTE(review): confirm against the service documentation), so
            // the records up to nextFirstRecord - 1 have been requested so far.
            log.debug("Records left = " + Math.max(0, recordsFound - nextFirstRecord + 1));

            // The record numbered recordsFound still has to be fetched when nextFirstRecord equals it.
            if (nextFirstRecord > recordsFound) {
                return;
            }
            if (count <= 0) {
                // The next request would start at the same record again and the loop would never end.
                throw new IOException("Search message count must be positive to page through results, found "
                        + count);
            }
            searchQuery = getnextQuery(searchQuery);
            try {
                Thread.sleep(SEARCH_DELAY_MILLIS); // short pause between two requests
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller; the harvest itself carries on.
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Sends the close-session message to the authorization service.
     * @param authCode the session identifier returned by the authorization service
     * @throws IOException error communicating with the authorization service
     */
    private void closeSession(String authCode) throws IOException {
        ByteArrayOutputStream closeResponse = new ByteArrayOutputStream();
        SOAPMessenger soapfetch = new SOAPMessenger(this.authUrl, closeResponse, this.closeMessage, authCode,
                null);
        soapfetch.execute();
    }

    /**
     * Get the ArgParser for this task
     * @return the ArgParser
     */
    private static ArgParser getParser() {
        ArgParser parser = new ArgParser("WOSFetch");
        parser.addArgument(new ArgDef().setShortOption('u').setLongOpt("authurl").withParameter(true, "URL")
                .setDescription("The URL which will receive the AUTHMESSAGE.").setRequired(true));
        parser.addArgument(
                new ArgDef().setShortOption('c').setLongOpt("searchconnection").withParameter(true, "URL")
                        .setDescription("The URL which will receive the SEARCHMESSAGE.").setRequired(true));
        parser.addArgument(new ArgDef().setShortOption('l').setLongOpt("lamrconnection").withParameter(true, "URL")
                .setDescription("The URL which will receive the LAMRMESSAGE.").setRequired(true));
        parser.addArgument(
                new ArgDef().setShortOption('s').setLongOpt("searchmessage").withParameter(true, "SEARCHMESSAGE")
                        .setDescription("The SEARCHMESSAGE file path.").setRequired(true));
        parser.addArgument(
                new ArgDef().setShortOption('a').setLongOpt("authmessage").withParameter(true, "AUTHMESSAGE")
                        .setDescription("The AUTHMESSAGE file path.").setRequired(false));
        parser.addArgument(
                new ArgDef().setShortOption('m').setLongOpt("lamrmessage").withParameter(true, "LAMRMESSAGE")
                        .setDescription("The LAMRMESSAGE file path.").setRequired(false));
        parser.addArgument(new ArgDef().setShortOption('p').setLongOpt("usernamepassword")
                .withParameter(true, "USERNAMEPASSWORD")
                .setDescription("The username and password string to be encoded using base64").setRequired(false));
        parser.addArgument(new ArgDef().setShortOption('o').setLongOpt("output").withParameter(true, "OUTPUT_FILE")
                .setDescription("XML result file path").setRequired(true));
        parser.addArgument(new ArgDef().setShortOption('O').setLongOpt("outputOverride")
                .withParameterValueMap("RH_PARAM", "VALUE")
                .setDescription("override the RH_PARAM of output recordhandler using VALUE").setRequired(false));
        return parser;
    }

    /**
     * Main method
     * <p>
     * Exits the JVM with status 1 when the task ends with any exception, including a usage request.
     * </p>
     * @param args commandline arguments
     */
    public static void main(String... args) {
        Exception error = null;
        try {
            InitLog.initLogger(args, getParser());
            log.info(getParser().getAppName() + ": Start");
            new WOSFetch(args).execute();
        } catch (IllegalArgumentException e) {
            log.error(e.getMessage());
            log.debug("Stacktrace:", e);
            System.out.println(getParser().getUsage());
            error = e;
        } catch (UsageException e) {
            log.info("Printing Usage:");
            System.out.println(getParser().getUsage());
            error = e;
        } catch (Exception e) {
            log.error(e.getMessage());
            log.debug("Stacktrace:", e);
            error = e;
        } finally {
            log.info(getParser().getAppName() + ": End");
            if (error != null) {
                System.exit(1);
            }
        }
    }
}