com.twentyn.patentExtractor.Util.java Source code

Introduction

Here is the source code for com.twentyn.patentExtractor.Util.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentExtractor;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import org.apache.commons.codec.binary.Base64InputStream;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Static helpers for handling XML documents in the patent extractor: root-element based document type
 * detection, gzip + Base64 (de)compression of DOM documents, DOM-to-string conversion, and Jackson
 * (de)serializers that store a {@link Document} in its compressed text form.
 */
public class Util {
    /** Kinds of document that {@link #identifyDocType(Document)} can report. */
    public enum DocumentType {
        PATENT, APPLICATION, SEQUENCE, UNKNOWN,
    }

    /** Maps a root element name to the document type it denotes.  Read-only once the class is initialized. */
    private static final Map<String, DocumentType> NODE_NAME_TO_DOC_TYPE;

    static {
        // Built in a static initializer (rather than with an anonymous HashMap subclass) and wrapped so that
        // the shared table cannot be modified after class initialization.
        Map<String, DocumentType> byRootName = new HashMap<>();
        byRootName.put("us-patent-grant", DocumentType.PATENT);
        byRootName.put("us-patent-application", DocumentType.APPLICATION);
        byRootName.put("sequence-cwu", DocumentType.SEQUENCE);
        NODE_NAME_TO_DOC_TYPE = Collections.unmodifiableMap(byRootName);
    }

    /* Each thread lazily gets its own factory instance; presumably this is because the JAXP factories are not
     * guaranteed to be thread-safe -- confirm before sharing instances across threads. */
    private static final ThreadLocal<XPathFactory> XPATH_FACTORY = new ThreadLocal<XPathFactory>() {
        @Override
        protected XPathFactory initialValue() {
            return XPathFactory.newInstance();
        }
    };

    private static final ThreadLocal<TransformerFactory> TRANSFORMER_FACTORY = new ThreadLocal<TransformerFactory>() {
        @Override
        protected TransformerFactory initialValue() {
            return TransformerFactory.newInstance();
        }
    };

    /**
     * Returns the calling thread's {@link XPathFactory}, creating it on first use.
     *
     * @return the XPath factory bound to the current thread
     */
    public static XPathFactory getXPathFactory() {
        return XPATH_FACTORY.get();
    }

    /**
     * Returns the calling thread's {@link TransformerFactory}, creating it on first use.
     *
     * @return the transformer factory bound to the current thread
     */
    public static TransformerFactory getTransformerFactory() {
        return TRANSFORMER_FACTORY.get();
    }

    /**
     * Determines the type of a document by testing its root element name against the known names.
     *
     * @param dom the document to inspect
     * @return the matching {@link DocumentType}, or {@link DocumentType#UNKNOWN} if no known root element is found
     * @throws XPathExpressionException if an XPath lookup cannot be evaluated
     */
    public static DocumentType identifyDocType(Document dom) throws XPathExpressionException {
        XPath xpath = getXPathFactory().newXPath();
        for (Map.Entry<String, DocumentType> entry : NODE_NAME_TO_DOC_TYPE.entrySet()) {
            // The expression "/<name>" selects a node only when the root element carries that name.
            Node top = (Node) xpath.evaluate("/" + entry.getKey(), dom, XPathConstants.NODE);
            if (top != null) {
                return entry.getValue();
            }
        }
        return DocumentType.UNKNOWN;
    }

    /**
     * Serializes a document to XML text, gzips that text, and Base64-encodes the gzipped bytes.
     *
     * <p>The result is the Base64 text as bytes, and is the input format expected by
     * {@link #decompressXMLDocument(byte[])}.</p>
     *
     * @param doc the document to compress
     * @return the Base64-encoded, gzipped XML serialization of {@code doc}
     * @throws IOException if writing to or closing the compression streams fails
     * @throws TransformerConfigurationException if a transformer cannot be created
     * @throws TransformerException if the document cannot be serialized
     */
    public static byte[] compressXMLDocument(Document doc)
            throws IOException, TransformerConfigurationException, TransformerException {
        // No output properties are set, so the transformer defaults apply (OutputKeys.INDENT would control
        // whether the output is indented).
        Transformer transformer = getTransformerFactory().newTransformer();

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // The writer's charset is pinned to UTF-8 instead of the platform default: the transformer emits
        // characters to a Writer while (by default) declaring UTF-8 in the XML prolog, so the bytes must be
        // UTF-8 for the parser in decompressXMLDocument to read them back correctly.
        // Base64OutputStream is configured to encode with line length 0 and '\n' as the separator.
        // try-with-resources guarantees the gzip/Base64 streams are finished and released even if the
        // transformation fails part-way.
        try (Writer w = new BufferedWriter(new OutputStreamWriter(
                new GZIPOutputStream(new Base64OutputStream(baos, true, 0, new byte[] { '\n' })),
                StandardCharsets.UTF_8))) {
            transformer.transform(new DOMSource(doc), new StreamResult(w));
        }
        // Read the buffer only after the writer chain is closed so the gzip trailer and Base64 padding are included.
        return baos.toByteArray();
    }

    /**
     * Reverses {@link #compressXMLDocument(Document)}: Base64-decodes and gunzips the bytes, then parses the
     * resulting XML with a builder from {@link #mkDocBuilderFactory()}.
     *
     * @param bytes Base64-encoded, gzipped XML
     * @return the parsed document
     * @throws IOException if the data cannot be decoded, decompressed or read
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws SAXException if the decompressed content is not parseable XML
     * @throws TransformerConfigurationException not thrown by this implementation; kept in the signature so
     *         existing callers that catch it continue to compile
     * @throws TransformerException not thrown by this implementation; kept for the same reason
     */
    public static Document decompressXMLDocument(byte[] bytes) throws IOException, ParserConfigurationException,
            SAXException, TransformerConfigurationException, TransformerException {
        // With help from http://stackoverflow.com/questions/309424/read-convert-an-inputstream-to-a-string
        // try-with-resources closes the stream chain even when parsing throws.
        try (InputStream s = new GZIPInputStream(new Base64InputStream(new ByteArrayInputStream(bytes)))) {
            DocumentBuilder documentBuilder = mkDocBuilderFactory().newDocumentBuilder();
            return documentBuilder.parse(s);
        }
    }

    /**
     * Creates a non-validating {@link DocumentBuilderFactory} that does not load DTD grammars or external DTDs.
     *
     * @return a newly configured factory
     * @throws ParserConfigurationException if the parser implementation rejects one of the features
     */
    public static DocumentBuilderFactory mkDocBuilderFactory() throws ParserConfigurationException {
        /* Try to load the document.  Note that the factory must be configured within the context of a method call
         * for exception handling.  TODO: can we work around this w/ dependency injection? */

        // With help from http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        docFactory.setValidating(false);
        // NOTE(review): namespace awareness is switched on here, but the SAX "namespaces" feature is switched off
        // on the next line.  Which setting wins likely depends on the parser implementation -- confirm the intent.
        docFactory.setNamespaceAware(true);
        docFactory.setFeature("http://xml.org/sax/features/namespaces", false);
        docFactory.setFeature("http://xml.org/sax/features/validation", false);
        docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // NOTE(review): external general/parameter entities are not explicitly disabled; consider doing so if
        // documents from untrusted sources are ever parsed with this factory.
        return docFactory;
    }

    /**
     * Serializes a document to an XML string using a default-configured transformer.
     *
     * @param doc the document to serialize
     * @return the XML text of {@code doc}
     * @throws ParserConfigurationException not thrown by this implementation; kept in the signature so existing
     *         callers that catch it continue to compile
     * @throws TransformerConfigurationException if a transformer cannot be created
     * @throws TransformerException if the document cannot be serialized
     */
    public static String documentToString(Document doc)
            throws ParserConfigurationException, TransformerConfigurationException, TransformerException {
        StringWriter stringWriter = new StringWriter();
        Transformer transformer = getTransformerFactory().newTransformer();
        transformer.transform(new DOMSource(doc), new StreamResult(stringWriter));
        return stringWriter.toString();
    }

    /**
     * Builds a new document whose root element is named {@code documentContainer} and whose only child is a
     * deep copy of {@code n}.
     *
     * @param docBuilder builder used to create the new, empty document
     * @param documentContainer name of the root element that wraps the copied node
     * @param n the node to copy (together with its subtree) into the new document
     * @return the newly created document
     */
    public static Document nodeToDocument(DocumentBuilder docBuilder, String documentContainer, Node n) {
        /* With help from:
         * http://examples.javacodegeeks.com/core-java/xml/dom/copy-nodes-subtree-from-one-dom-document-to-another/ */
        Document newDoc = docBuilder.newDocument();
        Element rootElement = newDoc.createElement(documentContainer);
        // importNode(n, true) makes a deep copy owned by newDoc; the source node is left where it is.
        Node newNode = newDoc.importNode(n, true);
        rootElement.appendChild(newNode);
        newDoc.appendChild(rootElement);
        return newDoc;
    }

    /** Jackson serializer that writes a {@link Document} as a JSON string holding its compressed form. */
    public static class DocumentSerializer extends JsonSerializer<Document> {
        /**
         * Writes the output of {@link Util#compressXMLDocument(Document)} as a single JSON string value.
         *
         * @throws IOException if compression or JSON output fails; a {@link TransformerException} raised during
         *         compression is wrapped as the cause
         */
        @Override
        public void serialize(Document document, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
                throws IOException, JsonProcessingException {
            byte[] compressedDoc;
            try {
                compressedDoc = compressXMLDocument(document);
            } catch (TransformerException e) {
                throw new IOException("Caught TransformerException when compressing document", e);
            }
            jsonGenerator.writeString(new String(compressedDoc, StandardCharsets.UTF_8));
        }
    }

    /** Jackson deserializer that reads a {@link Document} back from the string written by {@link DocumentSerializer}. */
    public static class DocumentDeserializer extends JsonDeserializer<Document> {
        /**
         * Reads the current JSON text value and passes it to {@link Util#decompressXMLDocument(byte[])}.
         *
         * @throws IOException if the text cannot be read or decompressed; parser and transformer exceptions are
         *         wrapped as the cause
         */
        @Override
        public Document deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
                throws IOException, JsonProcessingException {
            byte[] compressedDoc = jsonParser.getText().getBytes(StandardCharsets.UTF_8);
            try {
                return decompressXMLDocument(compressedDoc);
            } catch (ParserConfigurationException | SAXException | TransformerException e) {
                // The message names the concrete exception type and the operation that actually failed
                // (decompression, not compression).
                throw new IOException(
                        "Caught " + e.getClass().getSimpleName() + " when decompressing document", e);
            }
        }
    }
}