/*
 * Copyright 2002-2010 The Rector and Visitors of the
 * University of Virginia. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.virginia.speclab.juxta.author.model;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.IOUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;

import com.Ostermiller.util.StringTokenizer;

import edu.virginia.speclab.diff.OffsetRange;
import edu.virginia.speclab.diff.document.LocationMarker;
import edu.virginia.speclab.diff.document.NoteData;
import edu.virginia.speclab.diff.document.OffsetMap;
import edu.virginia.speclab.diff.document.PageBreakData;
import edu.virginia.speclab.diff.document.TagSet;
import edu.virginia.speclab.diff.token.JuxtaXMLNode;
import edu.virginia.speclab.exceptions.ReportedException;
import edu.virginia.speclab.juxta.author.model.manifest.BiblioData;

/**
 * Normally Juxta reads in an XML file with a very limited set of tags (call
 * this a Juxta XML file) and simply parses out the raw text, making note of
 * some line number annotations with data given in some of the tags.
 *
 * What we want to be able to do is read an arbitrary XML file with an
 * arbitrary set of tags and do something meaningful with it. This parser
 * attempts to do that. It is based around a SAX parsing strategy. It builds
 * a simple tree of XML nodes as it parses through the document. The nodes do
 * not contain the text components of the document. As the document is parsed,
 * it builds a flattened string of the text elements encountered, and annotates
 * the nodes in the tree it is building with these offsets. It also produces a
 * flat wad of XML text that can be referred to later via another pair of
 * offsets contained in each node.
 *
 * The resulting tree provides an interface for mapping text offsets in the
 * flattened, parsed text output back to nodes inside the tree, which use the
 * structure of the tree to form simple XPaths into the originating document.
 * The nodes also contain a pair of offsets into the raw XML produced by
 * reading the file, so that the raw file can be displayed with the
 * appropriate tag highlighted for the user for any particular token.
 *
 * We also build a bidirectional character-for-character map between the
 * XML source and the processed raw text.
 *
 * This class also handles submission of plaintext, and treats malformed XML
 * documents as plaintext. (In fact, that's how it knows: if the SAX parser
 * throws an exception, it treats it as plaintext.)
 *
 * @author ben
 */
public class JuxtaXMLParser extends DefaultHandler implements LexicalHandler {

    public enum DocumentType {
        PLAINTEXT, XML, UNKNOWN
    };

    private DocumentType docType;
    private File file;
    private Charset encodingCharSet;
    private CharsetDecoder decoder;
    private String juxtaVersion;
    private SAXParser parser;
    private StringBuffer flattenedText;
    private JuxtaXMLNode root;
    private JuxtaXMLNode current;
    private int flatTextPosition, xmlTextPosition;
    private TagSet excludeElements;
    private TagSet notableElements;
    private TagSet newlineElements;
    private Set<String> elementsEncountered;
    private Locator locator;
    private List<Integer> lineToOffsetMap;
    private String rawXMLText;
    private int lastXMLPosition;
    private OffsetMap offsetMap;
    private boolean insideEntity;
    private boolean insideWhitespaceRun;
    private int rawEntityLength;
    private int charactersInsideEntity;
    private List<JuxtaXMLParserTagHandler> tagHandlers;
    private List<Integer> revisionsToAccept;
    private int currentRevisionIndex;
    private JuxtaXMLParserBibliographicTagHandler biblioTagHandler;
    private JuxtaXMLParserDocumentPointerTagHandler docPointerHandler;
    private JuxtaXMLParserMilestoneTagHandler milestoneTagHandler;
    private JuxtaXMLParserAddDelTagHandler addDelTagHandler;
    private JuxtaXMLParserNoteTagHandler noteTagHandler;
    private JuxtaXMLPageBreakTagHandler pageBreakHandler;
    private HashMap<String, OffsetRange> idOffsetMap;

    public JuxtaXMLParser(File file, String juxtaVersion, Charset encodingCharSet) throws FileNotFoundException {
        this.docType = DocumentType.UNKNOWN;
        this.juxtaVersion = juxtaVersion;
        this.encodingCharSet = encodingCharSet;
        this.decoder = this.encodingCharSet.newDecoder();
        this.decoder.onMalformedInput(CodingErrorAction.REPLACE);
        this.decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        this.idOffsetMap = new HashMap<String, OffsetRange>();
        this.file = file;
        this.revisionsToAccept = new ArrayList<Integer>();
        this.currentRevisionIndex = 0;
        root = current = null;
        flattenedText = new StringBuffer();
        flatTextPosition = -1;
        xmlTextPosition = -1;
        lastXMLPosition = 0;
        excludeElements = new TagSet();
        notableElements = new TagSet();
        newlineElements = new TagSet();
        elementsEncountered = new HashSet<String>();
        lineToOffsetMap = new ArrayList<Integer>();
        insideEntity = false;
        insideWhitespaceRun = false;

        // install our default special-case tag handlers
        tagHandlers = new LinkedList<JuxtaXMLParserTagHandler>();
        biblioTagHandler = new JuxtaXMLParserBibliographicTagHandler();
        addTagHandler(biblioTagHandler);
        docPointerHandler = new JuxtaXMLParserDocumentPointerTagHandler();
        addTagHandler(docPointerHandler);
        milestoneTagHandler = new JuxtaXMLParserMilestoneTagHandler();
        addTagHandler(milestoneTagHandler);
    }

    public void setRevisionsToAccept(String revisionsList) {
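        // The list is expected to be a comma-separated string of integer revision
        // indexes, e.g. "0,2,3" (a hypothetical example). Each index refers to the
        // order in which add/del revision tags are encountered during parsing
        // (see currentRevisionIndex in startElement/endElement).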
        this.revisionsToAccept.clear();
        StringTokenizer st = new StringTokenizer(revisionsList, ",");
        while (st.hasMoreTokens()) {
            this.revisionsToAccept.add(Integer.parseInt(st.nextToken()));
        }
    }

    private boolean isDocumentRevised() {
        return (this.revisionsToAccept.size() > 0);
    }

    public OffsetRange getTagOffsetRange(final String tagId) {
        return this.idOffsetMap.get(tagId);
    }

    public List<NoteData> getNotes() {
        if (this.noteTagHandler == null) {
            return new ArrayList<NoteData>();
        }
        return this.noteTagHandler.getNotes();
    }

    public List<Revision> getRevisions() {
        if (addDelTagHandler == null) {
            return new ArrayList<Revision>();
        }
        return addDelTagHandler.getRevisions();
    }

    public List<PageBreakData> getPageBreaks() {
        if (this.pageBreakHandler != null) {
            return this.pageBreakHandler.getPageBreaks();
        }
        return new ArrayList<PageBreakData>();
    }

    public DocumentType getDocumentType() {
        return docType;
    }

    public void setDocumentType(DocumentType type) {
        docType = type;
    }

    public String getJuxtaVersion() {
        return juxtaVersion;
    }

    public void addTagHandler(JuxtaXMLParserTagHandler handler) {
        if (handler == null)
            return;
        tagHandlers.add(handler);
    }

    public JuxtaXMLParserDocumentPointerTagHandler getDocumentPointer() {
        return docPointerHandler;
    }

    private InputStream stripUtf8Bom(InputStream inputStream) throws IOException {
        PushbackInputStream pushbackInputStream = new PushbackInputStream(new BufferedInputStream(inputStream), 3);
        byte[] bom = new byte[3];
        if (pushbackInputStream.read(bom) != -1) {
            if (!(bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF)) {
                pushbackInputStream.unread(bom);
            }
        }
        return pushbackInputStream;
    }

    /**
     * Read the document into lines, determine the character offset map for each
     * line, then collapse the lines into one raw XML string with newlines.
     * @throws ReportedException
     */
    private void firstPassReadFile() throws ReportedException {
        try {
            FileInputStream fis = new FileInputStream(this.file);
            InputStreamReader is = new InputStreamReader(stripUtf8Bom(fis), this.decoder);
            List<String> lines = IOUtils.readLines(is);
            StringBuffer rawXMLTextBuffer = new StringBuffer();
            int offset = 0;
            lineToOffsetMap.add(new Integer(-1));
            for (String line : lines) {
                // Strip out macron decorators; they break the text components
                line = line.replaceAll("[~\u0304]", "");
                int length = line.length();
                lineToOffsetMap.add(new Integer(offset + length));
                offset += length + 1;
                rawXMLTextBuffer.append(line).append("\n");
            }
            rawXMLText = rawXMLTextBuffer.toString();
        } catch (IOException e) {
            throw new ReportedException(e, "Problem with I/O on file: " + file);
        }
    }

    /**
     * Return the absolute position in the rawXMLText String given by a
     * row number and column number reported (presumably) by the Locator object
     * while parsing the file.
     *
     * @param line The line number of the location, 1-based
     * @param col The column number of the location, 1-based
     * @return the absolute character offset
     */
    private int mapLineAndColumnToOffset(int line, int col) {
        if (line == -1 || col == -1)
            return -1;
        return lineToOffsetMap.get(line - 1).intValue() + col;
    }

    public File getBaseDirectory() {
        return file.getParentFile();
    }

    public String getSourceFileName() {
        return file.getName();
    }

    public OffsetMap getOffsetMap() {
        return offsetMap;
    }

    public void parse() throws ReportedException {
        firstPassReadFile();
        offsetMap = new OffsetMap(rawXMLText.length());

        // Set up special tag handling to cover behavior for
        // add, del, note and pb tags
        setupCustomTagHandling();

        try {
            if (file.getName().endsWith("xml")) {
                SAXParserFactory factory = SAXParserFactory.newInstance();
                parser = factory.newSAXParser();
                XMLReader xmlReader = parser.getXMLReader();
                xmlReader.setProperty("http://xml.org/sax/properties/lexical-handler", this);
                // ignore external DTDs
                xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
                parser.parse(new InputSource(new StringReader(rawXMLText)), this);
                if (getDocumentType() == DocumentType.UNKNOWN) {
                    // If we made it here and we don't know what the document type is,
                    // it's just an XML document
                    setDocumentType(DocumentType.XML);
                }

                // Uncomment these lines for some verbose debugging of the XML parsing
                // and the resulting processed output.
                // getRootNode().debugPrint();
                // try {
                //     printDebuggingInfo();
                // } catch (ReportedException ex) {
                //     Logger.getLogger(JuxtaXMLParser.class.getName()).log(Level.SEVERE, null, ex);
                // }
            } else {
                processAsPlaintext();
            }
        } catch (ParserConfigurationException ex) {
            throw new ReportedException(ex, "Problem with our parser configuration.");
        } catch (SAXParseException ex) {
            throw new ReportedException(ex, "XML Parsing Error at column " + ex.getColumnNumber() + ", line "
                    + ex.getLineNumber() + " :\n" + ex.getLocalizedMessage());
        } catch (SAXException ex) {
            throw new ReportedException(ex, "SAX Exception: " + ex.getLocalizedMessage());
        } catch (IOException ex) {
            throw new ReportedException(ex, ex.getLocalizedMessage());
        }
    }

    private void setupCustomTagHandling() {
        this.addDelTagHandler = new JuxtaXMLParserAddDelTagHandler();
        this.addTagHandler(addDelTagHandler);
        this.currentRevisionIndex = 0;

        // The REVISED version will start off by including all
        // revision tags. The subset that is actually accepted
        // will be handled later based on the revisionsToAccept list.
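        // (Assumption about the TagSet API, inferred from how excludeElements is
        // used here and in startElement: includeTag() appears to add a tag to this
        // exclusion set, while excludeTag() appears to remove it from the set.)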
        if (isDocumentRevised()) {
            this.excludeElements.excludeTag("del");
            this.excludeElements.excludeTag("delSpan");
            this.excludeElements.excludeTag("add");
            this.excludeElements.excludeTag("addSpan");
        } else {
            // The ORIGINAL version will include text that
            // was marked as deleted, but not text added
            this.excludeElements.excludeTag("del");
            this.excludeElements.excludeTag("delSpan");
            this.excludeElements.includeTag("add");
            this.excludeElements.includeTag("addSpan");
        }

        if (this.excludeElements.contains("note") == false) {
            this.noteTagHandler = new JuxtaXMLParserNoteTagHandler();
            addTagHandler(this.noteTagHandler);
            this.excludeElements.includeTag("note");
        }

        if (this.excludeElements.contains("pb") == false) {
            this.pageBreakHandler = new JuxtaXMLPageBreakTagHandler();
            addTagHandler(this.pageBreakHandler);
            this.excludeElements.includeTag("pb");
        }
    }

    @Override
    public void setDocumentLocator(Locator locator) {
        this.locator = locator;
    }

    public void setExcludedTagSet(TagSet excluded) {
        if (excluded == null)
            excludeElements = new TagSet();
        else
            excludeElements = excluded;
    }

    public void setNotableTagSet(TagSet notable) {
        if (notable == null)
            notableElements = new TagSet();
        else
            notableElements = notable;
    }

    public void setNewlineTagSet(TagSet newlines) {
        if (newlines == null)
            newlineElements = new TagSet();
        else
            newlineElements = newlines;
    }

    public TagSet getNewlineTagSet() {
        return newlineElements;
    }

    public BiblioData getBiblioData() {
        return biblioTagHandler.getBiblioData();
    }

    public String getFlattenedText() {
        // +++ consider caching the flattened text String once the processing is
        // complete
        return flattenedText.toString();
    }

    public String getXMLText() {
        return rawXMLText;
    }

    public JuxtaXMLNode getRootNode() {
        return root;
    }

    public List<LocationMarker> getLocationMarkerList() {
        if (getDocumentType() == DocumentType.PLAINTEXT) {
            return LocationMarkerGenerator.generateNewLineMarkers(rawXMLText);
        }
        return milestoneTagHandler.getLocationMarkerList();
    }

    public Set<String> getElementsEncountered() {
        return elementsEncountered;
    }

    @Override
    public void startDocument() throws SAXException {
        // no-op
    }

    @Override
    public void endDocument() throws SAXException {
        // Allow indexes at the very end of the buffer (at length()) to map correctly
        offsetMap.mapTargetToSource(flatTextPosition + 1, xmlTextPosition + 1);
        offsetMap.mapSourceToTarget(xmlTextPosition + 1, flatTextPosition + 1);
    }

    @Override
    public void startElement(String uri, String localName, String qname, Attributes attr) {
        if (root == null) {
            // this is the root node. exciting.
            root = new JuxtaXMLNode(qname, flatTextPosition + 1);
            current = root;
        } else {
            JuxtaXMLNode newNode = new JuxtaXMLNode(qname, flatTextPosition + 1);
            current.addChild(newNode);
            current = newNode;
        }

        // When dealing with revised modes, only keep revision
        // tags that we have been explicitly told to. Note the nifty
        // inverted logic...
        if (isDocumentRevised() && isRevisionTag(qname)) {
            if (this.revisionsToAccept.contains(Integer.valueOf(this.currentRevisionIndex))) {
                if (this.current.getName().equals("del") || this.current.getName().equals("delSpan")) {
                    current.setExcluded(true);
                }
            } else {
                if (this.current.getName().equals("add") || this.current.getName().equals("addSpan")) {
                    current.setExcluded(true);
                }
            }
        }

        // Set the start offset of the current node to the LAST xml position we read,
        // then reset lastXMLPosition to be the current position given by the locator.
        //
        // When these callbacks are called, the locator element points to the line and column number
        // of the current position, which is usually just after the thing that was just processed.
        // So for instance in this case, the locator points to the end of the opening tag.
        // The LAST time we read the location pointer and shoved it into lastXMLPosition corresponds
        // to either 1) the beginning of the document or 2) the end of the last end tag or character read.
        // This means that it already points to the offset we want to record, with the caveat that
        // the root tag will have offsets that also encompass the <?xml?> directive and anything that comes
        // between the beginning of the file and the open root tag, since we aren't ever informed of an
        // event that happens when the root tag is getting ready to be opened.
        current.setXMLStartOffset(lastXMLPosition);
        int openTagStartOffset = lastXMLPosition;
        int openTagEndOffset = lastXMLPosition = mapLineAndColumnToOffset(locator.getLineNumber(),
                locator.getColumnNumber());

        // Keep track of all of the element names that we see in the text.
        elementsEncountered.add(qname);

        // If our parent node is an excluded node, then
        // we ourselves are excluded. When we called addChild our excluded
        // flag was set to be whatever the parent node's flag was set to.
        // If we AREN'T set as excluded from that event, then we check
        // to see if the current element name is set to be excluded
        if (!current.isExcluded()) {
            current.setExcluded(excludeElements.contains(qname));
        }

        // Check to see if we are an element to be noted
        current.setIsNotable(notableElements.contains(qname));

        // are we a newline tag?
        if (!current.isExcluded() && newlineElements.contains(qname)) {
            insertCharacterFromSource('\n');
            openTagStartOffset++;
        }

        for (int i = openTagStartOffset; i < openTagEndOffset; i++)
            skipCharacterFromSource();

        for (int i = 0; i < attr.getLength(); ++i) {
            String name = attr.getQName(i);
            String value = attr.getValue(i);
            current.setAttribute(name, value);
        }
    }

    @Override
    public void endElement(String uri, String localName, String qname) {
        int closeTagStartOffset = lastXMLPosition;
        lastXMLPosition = mapLineAndColumnToOffset(locator.getLineNumber(), locator.getColumnNumber());
        int closeTagEndOffset = lastXMLPosition;

        // Is this an empty tag?
        if (current.isEmptyTag() && current.isNotable()) {
            // insert a non-standard doc character to represent its presence
            insertInventedFlatTextCharacter('\u25CA');
            // that's a "lozenge", see: http://www.fileformat.info/info/unicode/char/25ca/index.htm
        }

        current.setXMLEndOffset(lastXMLPosition);
        // plus one, because the end offset should be one past the last character included in our scheme
        current.setEndOffset(flatTextPosition + 1);

        for (int i = closeTagStartOffset; i < closeTagEndOffset; i++) {
            skipCharacterFromSource();
        }

        // track all non-excluded tags with xml:id attributes
        // in a map. This map can be used to look up ranges for
        // target nodes for tags that use them: example = note.
        if (this.current.isExcluded() == false) {
            String id = this.current.getAttribute("xml:id");
            if (id != null) {
                this.idOffsetMap.put(id, getContentRange(this.current));
            }
        }

        // Do we have any handlers for this tag?
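        // Handlers registered via addTagHandler() (by default the bibliographic,
        // document-pointer, milestone, add/del, note and page-break handlers) get
        // a processTag() callback here once a tag they are interested in closes.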
        Iterator<JuxtaXMLParserTagHandler> it = tagHandlers.iterator();
        while (it.hasNext()) {
            JuxtaXMLParserTagHandler tagHandler = it.next();
            if (tagHandler.getInterestingTagSet().contains(qname)) {
                tagHandler.processTag(this, current);
            }
        }

        current = current.getParent();

        if (isRevisionTag(qname)) {
            this.revisionsToAccept.remove(Integer.valueOf(this.currentRevisionIndex));
            this.currentRevisionIndex++;
        }
    }

    /**
     * Grab the full XML content of the specified node and find the
     * raw offsets of the node content (the text after the first '>'
     * and before the last '</').
     *
     * @param xmlNode
     * @return the offset range of the node content in the original XML
     */
    private OffsetRange getContentRange(JuxtaXMLNode xmlNode) {
        OffsetRange range = new OffsetRange();
        int start = xmlNode.getXMLStartOffset();
        int end = xmlNode.getXMLEndOffset();
        String tagTxt = getXMLText().substring(start, end);
        int relTxtStart = tagTxt.indexOf(">") + 1;
        int relTxtEnd = tagTxt.lastIndexOf("</");
        int realStart = start + relTxtStart;
        int realEnd = realStart + (relTxtEnd - relTxtStart);
        range.set(realStart, realEnd, OffsetRange.Space.ORIGINAL);
        return range;
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        int mappedOffset = mapLineAndColumnToOffset(locator.getLineNumber(), locator.getColumnNumber());
        int lastLastXMLPosition = lastXMLPosition;
        if (insideEntity) {
            // Let endEntity know how many translated characters
            // resulted from the entity. I've never seen this
            // be anything other than 1, but I'm not going to assume that
            // it won't ever be 0 or 2, for instance.
            // In this case, endEntity() is going to handle modifying the offset
            // based on how large the entity is (in terms of absolute characters in
            // the source XML).
            charactersInsideEntity = length;
        } else {
            lastXMLPosition = mappedOffset;
        }

        current.setIsEmptyTag(false);

        for (int i = start; i < start + length; i++) {
            // Skip text inside excluded tags, and compress whitespace
            char character = ch[i];
            boolean isWhitespace = Character.isWhitespace(character);
            char characterFromSourceDocument = rawXMLText.charAt(lastLastXMLPosition + (i - start));
            if (!isWhitespace || !insideWhitespaceRun) {
                if (isWhitespace) {
                    // normalize all whitespace to a space character
                    character = ' ';
                }
                // It's not clear that the !insideEntity part is necessary here
                if (characterFromSourceDocument == '&' && !insideEntity) {
                    // we need to advance quickly through all of the characters that make up this
                    // NCR. The XML parser has already turned this into a
                    // real unicode character for us, but will NOT trigger startEntity/endEntity
                    // because it's a real jerk. So we have to do it ourselves. I quote:
                    //
                    // "Note also that the boundaries of character references (which are not really
                    // entities anyway) are not reported."
                    // THANKS FOR THAT LESSON IN PEDANTRY, XML PARSER
                    int j = 0;
                    while (rawXMLText.charAt(lastLastXMLPosition + i - start + j++) != ';') {
                        skipCharacterFromSource();
                    }
                }
                if (current.isExcluded()) {
                    skipCharacterFromSource();
                } else {
                    insertCharacterFromSource(character);
                }
            } else {
                skipCharacterFromSource();
            }
            if (!current.isExcluded()) {
                insideWhitespaceRun = isWhitespace;
            }
        }
    }

    private boolean isRevisionTag(String name) {
        return (name.equals("add") || name.equals("addSpan") || name.equals("del") || name.equals("delSpan"));
    }

    public void startDTD(String name, String publicId, String systemId) throws SAXException {
        // no-op
    }

    public void endDTD() throws SAXException {
        // no-op
    }

    public void startEntity(String name) throws SAXException {
        insideEntity = true;
        // this is a hack, but the locator gets confused on custom entities and won't tell us where
        // we are in the file, and this should work... I think. The two characters are the & and ; that
        // flank the entity name.
        rawEntityLength = name.length() + 2;
    }

    public void endEntity(String name) throws SAXException {
        insideEntity = false;
        for (int i = charactersInsideEntity; i < rawEntityLength; i++)
            skipCharacterFromSource();
        lastXMLPosition = lastXMLPosition + rawEntityLength;
    }

    public void startCDATA() throws SAXException {
        // no-op (for now)
    }

    public void endCDATA() throws SAXException {
        // no-op (for now)
    }

    public void comment(char[] ch, int start, int length) throws SAXException {
        int startXMLPosition = lastXMLPosition;
        int endXMLPosition = lastXMLPosition = mapLineAndColumnToOffset(locator.getLineNumber(),
                locator.getColumnNumber());
        for (int i = startXMLPosition; i < endXMLPosition; i++)
            skipCharacterFromSource();
    }

    // Prints out (to STDOUT) a huge table showing the offset mapping between the
    // source XML and the flattened text. When the mapping is working correctly,
    // you should be able to see that in this output. This output is also useful
    // when compared with the offsets reported by the JuxtaXMLNode#debugPrint method.
    public void printDebuggingInfo() throws ReportedException {
        int lastFlatTextPosition = -1;
        int remappedIndex = 0;
        for (int i = 0; i <= rawXMLText.length(); i++) {
            char xmlChar, flatChar, revMappedXMLChar;
            int targetOffset = offsetMap.getTargetOffset(i);

            if (targetOffset == -1)
                flatChar = '-';
            else if (targetOffset >= flattenedText.length())
                flatChar = '-';
            else
                flatChar = flattenedText.charAt(targetOffset);

            if (i >= rawXMLText.length())
                xmlChar = '-';
            else
                xmlChar = rawXMLText.charAt(i);

            revMappedXMLChar = ' ';
            remappedIndex = -2;
            if (lastFlatTextPosition == targetOffset)
                flatChar = ' ';
            else if (targetOffset >= 0) {
                remappedIndex = offsetMap.getSourceOffset(targetOffset);
                revMappedXMLChar = rawXMLText.charAt(remappedIndex);
            }
            lastFlatTextPosition = targetOffset;

            if (xmlChar == '\n')
                xmlChar = '*';
            if (flatChar == '\n')
                flatChar = '*';
            if (revMappedXMLChar == '\n')
                revMappedXMLChar = '*';

            System.out.println("" + i + "\t" + xmlChar + "\t" + targetOffset + "\t" + flatChar + "\t" + remappedIndex
                    + "\t" + revMappedXMLChar);
        }
    }

    private void processAsPlaintext() {
        // Might want to consume whitespace in here at some point too,
        // but since it's plaintext we'll assume the whitespace is
        // meaningful.
        root = new JuxtaXMLNode("", 0, rawXMLText.length(), null);
        for (int i = 0; i <= rawXMLText.length(); i++) {
            offsetMap.mapSourceToTarget(i, i);
            offsetMap.mapTargetToSource(i, i);
        }
        flattenedText.append(rawXMLText);
        this.setDocumentType(DocumentType.PLAINTEXT);
    }

    // Call this to add a character to the flat text that does not exist
    // in the source text
    private void insertInventedFlatTextCharacter(char character) {
        flattenedText.append(character);
        // increase the flat text position and map the current xml position to it
        flatTextPosition++;
        offsetMap.mapTargetToSource(flatTextPosition, xmlTextPosition);
    }

    private void insertCharacterFromSource(char character) {
        flattenedText.append(character);
        flatTextPosition++;
        xmlTextPosition++;
        offsetMap.mapTargetToSource(flatTextPosition, xmlTextPosition);
        offsetMap.mapSourceToTarget(xmlTextPosition, flatTextPosition);
    }

    private void skipCharacterFromSource() {
        xmlTextPosition++;
        // protect against -1 being included at the beginning of the root tag
        offsetMap.mapSourceToTarget(xmlTextPosition, Math.max(0, flatTextPosition));
    }

    // Handy method for walking up a Document tree from a Node object contained within it
    // that was fetched via an XPath lookup. This way we can take any sort of XPath
    // construct and turn it into a simple index-based xpath that the rest of our
    // system can understand without reparsing the document into a DOM model.
    static public String getIndexBasedXPathForGeneralXPath(String xpathString, String xml) {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        try {
            factory.setNamespaceAware(false); // ignore the horrible issues of namespacing
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(new InputSource(new StringReader(xml)));
            XPathFactory xpathFactory = XPathFactory.newInstance();
            XPath xpath = xpathFactory.newXPath();
            Node root = doc.getFirstChild();
            XPathExpression expr = xpath.compile(xpathString);
            Node node = (Node) expr.evaluate(doc, XPathConstants.NODE);
            return nodeToSimpleXPath(node, root);
        } catch (SAXException ex) {
            // fall through and return null
        } catch (IOException ex) {
            // fall through and return null
        } catch (XPathExpressionException ex) {
            // fall through and return null
        } catch (ParserConfigurationException ex) {
            // fall through and return null
        }
        return null;
    }

    private static String nodeToSimpleXPath(Node node, Node root) {
        // recursion ahoy
        if (node == null)
            return "";
        // need to get my index by looping through previous siblings for nodes with the same name as me
        Node sibling = node.getPreviousSibling();
        int index = 1;
        while (sibling != null) {
            if (sibling.getNodeType() == node.getNodeType() && sibling.getNodeName().equals(node.getNodeName()))
                index++;
            sibling = sibling.getPreviousSibling();
        }

        String path = "/" + node.getNodeName();
        if (!node.isSameNode(root))
            path = path + "[" + index + "]";
        else
            return path;

        // If we've picked anything other than an element node, ignore it, and walk up
        // the tree for other things. We get this when someone asks for /blah/blah/text() or something,
        // which we don't support.
        if (node.getNodeType() != Node.ELEMENT_NODE)
            path = "";

        return (nodeToSimpleXPath(node.getParentNode(), root) + path);
    }
}
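
// A minimal usage sketch (illustration only, not taken from the Juxta code base;
// the file name and version string are hypothetical, and parse() can throw
// ReportedException):
//
//     File source = new File("sample.xml");
//     JuxtaXMLParser xmlParser = new JuxtaXMLParser(source, "1.6.2", Charset.forName("UTF-8"));
//     xmlParser.parse();
//     String flatText = xmlParser.getFlattenedText();   // text with tags stripped
//     OffsetMap offsets = xmlParser.getOffsetMap();     // flat text <-> raw XML offsets
//     JuxtaXMLNode tree = xmlParser.getRootNode();      // node tree annotated with offsets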