// PatentDocument — USPTO patent XML extraction (see class javadoc below).
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.twentyn.patentExtractor; import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.commons.io.input.ReaderInputStream; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import java.io.BufferedInputStream; 
import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; /** * This class represents parts of a USPTO patent document that are relevant to 20's use cases. It can extract * information from the USPTO's XML documents and convert it to a POJO that can then be serialized as JSON. Use this * as the basis for any processing of patent text. */ public class PatentDocument { public static final Logger LOGGER = LogManager.getLogger(PatentDocument.class); // See http://www.uspto.gov/learning-and-resources/xml-resources. public static final String DTD2014 = "v4.5 2014-04-03"; public static final String DTD2013 = "v4.4 2013-05-16"; public static final String DTD2012 = "v4.3 2012-12-04"; public static final String DTD2006 = "v4.2 2006-08-23"; public static final String DTD2005 = "v4.1 2005-08-25"; public static final String DTD2004 = "v40 2004-12-02"; public static final String DTD2014_APP = "v4.4 2014-04-03"; public static final String DTD2012_APP = "v4.3 2012-12-04"; public static final String DTD2006_APP = "v4.2 2006-08-23"; public static final String DTD2005_APP = "v4.1 2005-08-25"; public static final String DTD2004_APP = "v4.0 2004-12-02"; public static final String PATH_DTD_VERSION = "/us-patent-grant/@dtd-version"; public static final String PATH_DTD_VERSION_APP = "/us-patent-application/@dtd-version"; public static final String[] PATHS_TEXT = { "//description", "//invention-title", "//abstract", }; public static final String PATH_CLAIMS = "//claims"; public static final String PATH_KEY_FILE_ID = "fileId", PATH_KEY_TITLE = "title", PATH_KEY_DATE = "date", PATH_KEY_MAIN_CLASSIFICATION = "classification", PATH_KEY_FURTHER_CLASSIFICATIONS = "further_classifications", 
PATH_KEY_SEARCHED_CLASSIFICATIONS = "referenced_classifications"; // TODO: is there a type-safe way of building an object from XPath with a map of functions? public static final HashMap<String, String> PATHS_2013 = new HashMap<String, String>() { { put(PATH_KEY_FILE_ID, "/us-patent-grant/@file"); put(PATH_KEY_TITLE, "/us-patent-grant/us-bibliographic-data-grant/invention-title"); put(PATH_KEY_DATE, "/us-patent-grant/@date-publ"); put(PATH_KEY_MAIN_CLASSIFICATION, "/us-patent-grant/us-bibliographic-data-grant/classification-national/main-classification/text()"); put(PATH_KEY_FURTHER_CLASSIFICATIONS, "/us-patent-grant/us-bibliographic-data-grant/classification-national/further-classification"); put(PATH_KEY_SEARCHED_CLASSIFICATIONS, "/us-patent-grant/us-bibliographic-data-grant/us-field-of-classification-search/classification-national[./country/text()='US']/main-classification"); } }; public static final HashMap<String, String> PATHS_2004 = new HashMap<String, String>() { { put(PATH_KEY_FILE_ID, "/us-patent-grant/@file"); put(PATH_KEY_TITLE, "/us-patent-grant/us-bibliographic-data-grant/invention-title"); put(PATH_KEY_DATE, "/us-patent-grant/@date-publ"); put(PATH_KEY_MAIN_CLASSIFICATION, "/us-patent-grant/us-bibliographic-data-grant/classification-national/main-classification/text()"); put(PATH_KEY_FURTHER_CLASSIFICATIONS, "/us-patent-grant/us-bibliographic-data-grant/classification-national/further-classification"); put(PATH_KEY_SEARCHED_CLASSIFICATIONS, "/us-patent-grant/us-bibliographic-data-grant/field-of-search/classification-national[./country/text()='US']/main-classification"); } }; public static final HashMap<String, String> PATHS_2014_APP = new HashMap<String, String>() { { put(PATH_KEY_FILE_ID, "/us-patent-application/@file"); put(PATH_KEY_TITLE, "/us-patent-application/us-bibliographic-data-application/invention-title"); put(PATH_KEY_DATE, "/us-patent-application/@date-publ"); put(PATH_KEY_MAIN_CLASSIFICATION, 
"/us-patent-application/us-bibliographic-data-application/classification-national/main-classification/text()"); put(PATH_KEY_FURTHER_CLASSIFICATIONS, "/us-patent-application/us-bibliographic-data-application/classification-national/further-classification"); put(PATH_KEY_SEARCHED_CLASSIFICATIONS, // Note: doesn't exist, but left for ease of use. "/us-patent-application/us-bibliographic-data-application/us-field-of-classification-search/classification-national[./country/text()='US']/main-classification"); } }; public static final HashMap<String, HashMap<String, String>> VERSION_MAP = new HashMap<String, HashMap<String, String>>() { { put(DTD2014, PATHS_2013); // All the 2013 paths work with the 2014 DTD. put(DTD2013, PATHS_2013); put(DTD2012, PATHS_2013); // All the 2013 paths work with the 2012 DTD. put(DTD2006, PATHS_2013); // All the 2013 paths work with the 2006 DTD. put(DTD2005, PATHS_2013); // All the 2013 paths work with the 2005 DTD. put(DTD2004, PATHS_2004); put(DTD2014_APP, PATHS_2014_APP); put(DTD2012_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2012 app DTD. put(DTD2006_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2006 app DTD. put(DTD2005_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2005 app DTD, though the classifications might be different. put(DTD2004_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2005 app DTD assuming searched classifications are always empty. 
} }; private static final Pattern GZIP_PATTERN = Pattern.compile("\\.gz$"); public static class HtmlVisitor implements NodeVisitor { // Based on https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java private static final HashSet<String> SEGMENTING_NODES = new HashSet<String>() { { addAll(Arrays.asList("p", "h1", "h2", "h3", "h4", "h5", "h6", "dt", "dd", "tr", "li", "body", "div", // HTML entities "row", "claim" // patent-specific entities )); } }; private static final Pattern SPACE_PATTERN = Pattern.compile("^\\s+$"); private StringBuilder segmentBuilder = new StringBuilder(); private List<String> textSegments = new LinkedList<>(); @Override public void head(org.jsoup.nodes.Node node, int i) { // This borrows a page from HtmlToPlainText's book. if (node instanceof TextNode) { String text = ((TextNode) node).text(); if (text != null && text.length() > 0) { segmentBuilder.append(((TextNode) node).text()); } } } @Override public void tail(org.jsoup.nodes.Node node, int i) { String nodeName = node.nodeName(); if (nodeName.equals("a")) { // Same as Jsoup's HtmlToPlainText. segmentBuilder.append(String.format(" <%s>", node.absUrl("href"))); } else if (SEGMENTING_NODES.contains(nodeName) && segmentBuilder.length() > 0) { String segmentText = segmentBuilder.toString(); // Ignore blank lines, as we'll be tagging each line separately. if (!SPACE_PATTERN.matcher(segmentText).matches()) { this.textSegments.add(segmentText); } // TODO: is it better to drop the old one than clear the existing? 
segmentBuilder.setLength(0); } } public List<String> getTextContent() { return this.textSegments; } } private static List<String> extractTextFromHTML(DocumentBuilder docBuilder, NodeList textNodes) throws ParserConfigurationException, TransformerConfigurationException, TransformerException, XPathExpressionException { List<String> allTextList = new ArrayList<>(0); if (textNodes != null) { for (int i = 0; i < textNodes.getLength(); i++) { Node n = textNodes.item(i); /* This extremely around-the-horn approach to handling text content is due to the mix of HTML and * XML in the patent body. We use Jsoup to parse the HTML entities we find in the body, and use * its extremely convenient NodeVisitor API to recursively traverse the document and extract the * text content in reasonable chunks. */ Document contentsDoc = Util.nodeToDocument(docBuilder, "body", n); String docText = Util.documentToString(contentsDoc); // With help from http://stackoverflow.com/questions/832620/stripping-html-tags-in-java org.jsoup.nodes.Document htmlDoc = Jsoup.parse(docText); HtmlVisitor visitor = new HtmlVisitor(); NodeTraversor traversor = new NodeTraversor(visitor); traversor.traverse(htmlDoc); List<String> textSegments = visitor.getTextContent(); allTextList.addAll(textSegments); } } return allTextList; } /** * Extracts the text content from text fields in a patent XML document. * * @param docBuilder A document builder to use when constructing intermediate XML/HTML documents in the extraction * process. * @param paths A list of XPath paths from which to exactract text. * @param xpath An XPath instance to use when running XPath queries. * @param doc The XML document from which to extract text. * @return A list of strings representing the textual content of the document. These could be sentences, * paragraphs, or larger text units, but should represent some sort of structure in the document's text. 
* @throws ParserConfigurationException * @throws TransformerConfigurationException * @throws TransformerException * @throws XPathExpressionException */ private static List<String> getRelevantDocumentText(DocumentBuilder docBuilder, String[] paths, XPath xpath, Document doc) throws ParserConfigurationException, TransformerConfigurationException, TransformerException, XPathExpressionException { List<String> allTextList = new ArrayList<>(0); for (String path : paths) { XPathExpression exp = xpath.compile(path); NodeList textNodes = (NodeList) exp.evaluate(doc, XPathConstants.NODESET); allTextList.addAll(extractTextFromHTML(docBuilder, textNodes)); } return allTextList; } /** * Converts an XML file into a patent document object, extracting relevant fields from the patent XML. * * @param inputPath A path to the file to be read. * @return A patent object if the XML can be read, or null otherwise. * @throws IOException Thrown on file I/O errors. * @throws ParserConfigurationException Thrown when the XML parser cannot be configured correctly. * @throws SAXException Thrown on XML parser errors. * @throws XPathExpressionException Thrown when XPath fails to handle queries against the specified document. */ // TODO: logging? // TODO: are @nullable and @non-null annotations still a thing? // TODO: prolly belongs in a factory. public static PatentDocument patentDocumentFromXMLFile(File inputPath) throws IOException, ParserConfigurationException, SAXException, TransformerConfigurationException, TransformerException, XPathExpressionException { InputStream iStream = null; iStream = new BufferedInputStream(new FileInputStream(inputPath)); if (GZIP_PATTERN.matcher(inputPath.getName()).find()) { iStream = new GZIPInputStream(iStream); } return patentDocumentFromXMLStream(iStream); } /** * Converts a string of XML into a patent document object, extracting relevant fields from the patent XML. * * @param text The XML string to parse and extract. 
* @return A patent object if the XML can be read, or null otherwise. * @throws IOException * @throws ParserConfigurationException * @throws SAXException * @throws TransformerConfigurationException * @throws TransformerException * @throws XPathExpressionException */ public static PatentDocument patentDocumentFromXMLString(String text) throws IOException, ParserConfigurationException, SAXException, TransformerConfigurationException, TransformerException, XPathExpressionException { StringReader stringReader = new StringReader(text); return patentDocumentFromXMLStream(new ReaderInputStream(stringReader)); } public static PatentDocument patentDocumentFromXMLStream(InputStream iStream) throws IOException, ParserConfigurationException, SAXException, TransformerConfigurationException, TransformerException, XPathExpressionException { // Create XPath objects for validating that this document is actually a patent. XPath xpath = Util.getXPathFactory().newXPath(); XPathExpression versionXPath = xpath.compile(PATH_DTD_VERSION); XPathExpression versionXPathApp = xpath.compile(PATH_DTD_VERSION_APP); DocumentBuilderFactory docFactory = Util.mkDocBuilderFactory(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); Document doc = docBuilder.parse(iStream); Util.DocumentType docType = Util.identifyDocType(doc); if (docType != Util.DocumentType.PATENT && docType != Util.DocumentType.APPLICATION) { LOGGER.warn("Found unexpected document type: " + docType); return null; } boolean isApplication = docType == Util.DocumentType.APPLICATION; // Yes this is in fact the way suggested by the XPath API. 
String version; if (!isApplication) { version = (String) versionXPath.evaluate(doc, XPathConstants.STRING); } else { version = (String) versionXPathApp.evaluate(doc, XPathConstants.STRING); } if (version == null || !VERSION_MAP.containsKey(version)) { LOGGER.warn(String.format("Unrecognized patent DTD version: %s", version)); return null; } HashMap<String, String> paths = VERSION_MAP.get(version); /* Create XPath objects for extracting the fields of interest based on the version information. * TODO: extract these into some sharable, thread-safe place, maybe via dependency injection. */ XPathExpression idXPath = xpath.compile(paths.get(PATH_KEY_FILE_ID)); XPathExpression dateXPath = xpath.compile(paths.get(PATH_KEY_DATE)); XPathExpression titleXPath = xpath.compile(paths.get(PATH_KEY_TITLE)); XPathExpression classificationXPath = xpath.compile(paths.get(PATH_KEY_MAIN_CLASSIFICATION)); XPathExpression furtherClassificationsXPath = xpath.compile(paths.get(PATH_KEY_FURTHER_CLASSIFICATIONS)); XPathExpression searchedClassificationsXPath = xpath.compile(paths.get(PATH_KEY_SEARCHED_CLASSIFICATIONS)); String fileId = (String) idXPath.evaluate(doc, XPathConstants.STRING); String date = (String) dateXPath.evaluate(doc, XPathConstants.STRING); NodeList titleNodes = (NodeList) titleXPath.evaluate(doc, XPathConstants.NODESET); String title = StringUtils.join(" ", extractTextFromHTML(docBuilder, titleNodes)); String classification = (String) classificationXPath.evaluate(doc, XPathConstants.STRING); NodeList furtherClassificationNodes = (NodeList) furtherClassificationsXPath.evaluate(doc, XPathConstants.NODESET); ArrayList<String> furtherClassifications = null; if (furtherClassificationNodes != null) { furtherClassifications = new ArrayList<>(furtherClassificationNodes.getLength()); for (int i = 0; i < furtherClassificationNodes.getLength(); i++) { Node n = furtherClassificationNodes.item(i); String txt = n.getTextContent(); if (txt != null) { furtherClassifications.add(i, txt); 
} } } else { furtherClassifications = new ArrayList<>(0); } NodeList otherClassificationNodes = (NodeList) searchedClassificationsXPath.evaluate(doc, XPathConstants.NODESET); ArrayList<String> otherClassifications = null; if (otherClassificationNodes != null) { otherClassifications = new ArrayList<>(otherClassificationNodes.getLength()); for (int i = 0; i < otherClassificationNodes.getLength(); i++) { Node n = otherClassificationNodes.item(i); String txt = n.getTextContent(); if (txt != null) { otherClassifications.add(i, txt); } } } else { otherClassifications = new ArrayList<>(0); } // Extract text content for salient document paths. List<String> allTextList = getRelevantDocumentText(docBuilder, PATHS_TEXT, xpath, doc); List<String> claimsTextList = getRelevantDocumentText(docBuilder, new String[] { PATH_CLAIMS }, xpath, doc); return new PatentDocument(fileId, date, title, classification, furtherClassifications, otherClassifications, allTextList, claimsTextList, isApplication); } @JsonProperty("file_id") protected String fileId; @JsonProperty("grant_date") protected String grantDate; @JsonProperty("title") protected String title; @JsonProperty("primary_classification") protected String mainClassification; @JsonProperty("further_classification") protected List<String> furtherClassifications; @JsonProperty("searched_classifications") protected List<String> searchedClassifications; @JsonProperty("text_content") protected List<String> textContent; @JsonProperty("claims") protected List<String> claimsText; @JsonProperty("isApplication") protected Boolean isApplication; // TODO: this could probably use a builder if it gets more complicated. 
protected PatentDocument(String fileId, String grantDate, String title, String mainClassification, List<String> furtherClassifications, List<String> searchedClassifications, List<String> textContent, List<String> claimsText, Boolean isApplication) { this.fileId = fileId; this.grantDate = grantDate; this.title = title; this.mainClassification = mainClassification; this.furtherClassifications = furtherClassifications; this.searchedClassifications = searchedClassifications; this.textContent = textContent; this.claimsText = claimsText; this.isApplication = isApplication; } public String getFileId() { return fileId; } public String getGrantDate() { return grantDate; } public String getTitle() { return title; } public String getMainClassification() { return mainClassification; } public List<String> getFurtherClassifications() { return furtherClassifications; } public List<String> getSearchedClassifications() { return searchedClassifications; } public List<String> getTextContent() { return textContent; } public List<String> getClaimsText() { return claimsText; } public Boolean getIsApplication() { return isApplication; } }