List of usage examples for javax.xml.xpath XPathConstants STRING
QName STRING
To view the source code for javax.xml.xpath XPathConstants STRING.
Click Source Link
The XPath 1.0 string data type.
Maps to Java String .
From source file:org.wso2.carbon.bpmn.core.types.datatypes.xml.api.XMLDocument.java
/** * Function to evaluate xPath query, and return specified return type * * @param xpathStr xpath expression to evaluate * @param returnType The desired return type of xpath evaluation. Supported retrun types : "NODESET", "NODE", "STRING", "NUMBER", "BOOLEAN" * @return result of xpath evaluation in specified return type * @throws BPMNXmlException// w w w . j a v a2 s . com * @throws XPathExpressionException */ public Object xPath(String xpathStr, String returnType) throws BPMNXmlException, XPathExpressionException { if (returnType.equals(XPathConstants.NODESET.getLocalPart())) { Utils.evaluateXPath(doc, xpathStr, XPathConstants.NODESET); } else if (returnType.equals(XPathConstants.NODE.getLocalPart())) { Utils.evaluateXPath(doc, xpathStr, XPathConstants.NODE); } else if (returnType.equals(XPathConstants.STRING.getLocalPart())) { Utils.evaluateXPath(doc, xpathStr, XPathConstants.STRING); } else if (returnType.equals(XPathConstants.NUMBER.getLocalPart())) { Utils.evaluateXPath(doc, xpathStr, XPathConstants.NUMBER); } else if (returnType.equals(XPathConstants.BOOLEAN.getLocalPart())) { Utils.evaluateXPath(doc, xpathStr, XPathConstants.BOOLEAN); } else { //Unknown return type throw new BPMNXmlException("Unknown return type : " + returnType); } return null; }
From source file:org.wso2.carbon.bpmn.core.types.datatypes.xml.Utils.java
/** * Function to evaluate xpath. This will resolve the NodeList to Node if the result contains only one node * @param doc/*from ww w . j a v a2 s. c o m*/ * @param xpathStr * @return * @throws XPathExpressionException */ public static Object evaluateXPath(Document doc, String xpathStr) throws BPMNXmlException { Object result = null; NodeList outputObjList = null; try { outputObjList = (NodeList) evaluateXPath(doc, xpathStr, XPathConstants.NODESET); if (outputObjList.getLength() == 1) { //If there is only one node if (outputObjList.item(0) instanceof Text) { return ((Text) outputObjList.item(0)).getWholeText(); } return outputObjList.item(0); } return outputObjList; } catch (XPathExpressionException eLevel1) { //provided xpath cannot be evaluated to NodeList, so it may be evaluated to string try { if (log.isDebugEnabled()) { log.debug("Since evaluating the xpath: " + xpathStr + " to NodeList failed, retrying to evaluate it to a STRING"); } return evaluateXPath(doc, xpathStr, XPathConstants.STRING); } catch (XPathExpressionException eLevel2) { if (log.isDebugEnabled()) { log.debug("Error occurred while evaluating xpath :" + xpathStr + " on xml: " + doc.toString()); } throw new BPMNXmlException( "Error occurred while evaluating xpath :" + xpathStr + " due to error in xpath", eLevel2); } } }
From source file:org.wso2.carbon.humantask.core.engine.runtime.xpath.XPathExpressionRuntime.java
/** * Evaluate XPath expression//from www. java 2s. c o m * * @param exp XPath expression string * @param evalCtx Evaluation context containing all the required context information * @return Return List of selected nodes or string */ @Override public List evaluate(String exp, EvaluationContext evalCtx) { List result; Object someRes; try { someRes = evaluate(exp, evalCtx, XPathConstants.NODESET); } catch (Exception e) { someRes = evaluate(exp, evalCtx, XPathConstants.STRING); } if (someRes instanceof List) { result = (List) someRes; if (log.isDebugEnabled()) { log.debug("Returned list of size " + result.size()); } if ((result.size() == 1) && !(result.get(0) instanceof Node)) { // Dealing with a Java class Object simpleType = result.get(0); // Dates get a separate treatment as we don't want to call toString on them String textVal; if (simpleType instanceof Date) { textVal = ISO8601DateParser.format((Date) simpleType); } else if (simpleType instanceof DurationValue) { textVal = ((DurationValue) simpleType).getStringValue(); } else { textVal = simpleType.toString(); } // Wrapping in a document Document document = DOMUtils.newDocument(); // Giving our node a parent just in case it's an LValue expression Element wrapper = document.createElement("wrapper"); Text text = document.createTextNode(textVal); wrapper.appendChild(text); document.appendChild(wrapper); result = Collections.singletonList(text); } } else if (someRes instanceof NodeList) { NodeList retVal = (NodeList) someRes; if (log.isDebugEnabled()) { log.debug("Returned node list of size " + retVal.getLength()); } result = new ArrayList(retVal.getLength()); for (int m = 0; m < retVal.getLength(); ++m) { Node val = retVal.item(m); if (val.getNodeType() == Node.DOCUMENT_NODE) { val = ((Document) val).getDocumentElement(); } result.add(val); } } else if (someRes instanceof String) { // Wrapping in a document Document document = DOMUtils.newDocument(); Element wrapper = document.createElement("wrapper"); Text text = document.createTextNode((String) someRes); wrapper.appendChild(text); document.appendChild(wrapper); result = Collections.singletonList(text); } else { result = null; } return result; }
From source file:org.wso2.carbon.humantask.core.engine.runtime.xpath.XPathExpressionRuntime.java
/** * Evaluate given XPath string and returns result as a string * * @param exp XPath expression string * @param evalCtx Evaluation context containing all the required context information * @return String/*w w w . j a v a2s. c o m*/ */ @Override public String evaluateAsString(String exp, EvaluationContext evalCtx) { return (String) evaluate(exp, evalCtx, XPathConstants.STRING); }
From source file:org.wso2.identity.scenarios.commons.SAML2SSOTestBase.java
/** * Return whether SAML Assertion has the canonicalization method * set to 'http://www.w3.org/2001/10/xml-exc-c14n#WithComments'. * * @param document/*www .j av a2 s.c o m*/ * @return true if canonicalization method equals to 'http://www.w3.org/2001/10/xml-exc-c14n#WithComments' */ private boolean isSignedWithComments(Document document) { XPath xPath = XPathFactory.newInstance().newXPath(); try { String assertionId = (String) xPath.compile("//*[local-name()='Assertion']/@ID").evaluate(document, XPathConstants.STRING); if (StringUtils.isBlank(assertionId)) { return false; } NodeList nodeList = ((NodeList) xPath .compile("//*[local-name()='Assertion']" + "/*[local-name()='Signature']" + "/*[local-name()='SignedInfo']" + "/*[local-name()='Reference'][@URI='#" + assertionId + "']" + "/*[local-name()='Transforms']" + "/*[local-name()='Transform']" + "[@Algorithm='http://www.w3.org/2001/10/xml-exc-c14n#WithComments']") .evaluate(document, XPathConstants.NODESET)); return nodeList != null && nodeList.getLength() > 0; } catch (XPathExpressionException e) { String message = "Failed to find the canonicalization algorithm of the assertion. Defaulting to: " + "http://www.w3.org/2001/10/xml-exc-c14n#"; log.warn(message); return false; } }
From source file:org.wso2.ppaas.configurator.tests.ConfiguratorTestManager.java
public String readXML(String resourcePath, String xpathExpression) { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = null; try {/*from w w w . j ava 2 s.c o m*/ String targetFile = ConfiguratorTestManager.class.getResource(PATH_SEP).getPath() + ".." + PATH_SEP + CONFIGURATOR_DIR_NAME + PATH_SEP + resourcePath; builder = factory.newDocumentBuilder(); Document doc = builder.parse(targetFile); XPathFactory xPathfactory = XPathFactory.newInstance(); XPath xpath = xPathfactory.newXPath(); XPathExpression expr = xpath.compile(xpathExpression); String value = expr.evaluate(doc, XPathConstants.STRING).toString(); log.info("Parsed value" + value); return value; } catch (ParserConfigurationException | SAXException | IOException | XPathExpressionException e) { log.error("Error in parsing xml " + e.getMessage()); } return null; }
From source file:org.xdi.service.XmlService.java
public String getNodeValue(Document xmlDocument, String xPathExpression, String attributeName) throws XPathExpressionException { XPath xPath = XPathFactory.newInstance().newXPath(); XPathExpression formXPathExpression = xPath.compile(xPathExpression); if (StringHelper.isEmpty(attributeName)) { String nodeValue = (String) formXPathExpression.evaluate(xmlDocument, XPathConstants.STRING); return nodeValue; }/* w ww . j a v a2 s . com*/ Node node = ((Node) formXPathExpression.evaluate(xmlDocument, XPathConstants.NODE)); if (node == null) { return null; } Node attributeNode = node.getAttributes().getNamedItem(attributeName); if (attributeNode == null) { return null; } return attributeNode.getNodeValue(); }
From source file:pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException { XPath xpath = XPathFactory.newInstance().newXPath(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*w ww .j av a 2s . c o m*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); DocumentBuilder builder = dbf.newDocumentBuilder(); Document domDoc = builder.parse(nlmStream); PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor(); BxDocument bxDoc = structureExtractor.extractStructure(pdfStream); Integer bxDocLen = bxDoc.asZones().size(); SmartHashMap entries = new SmartHashMap(); //abstract Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc, XPathConstants.NODE); String abstractString = XMLTools.extractTextFromNode(abstractNode); entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT); entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT); //title String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title", domDoc, XPathConstants.STRING); entries.putIf(titleString, BxZoneLabel.MET_TITLE); String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle", domDoc, XPathConstants.STRING); entries.putIf(subtitleString, BxZoneLabel.MET_TITLE); //journal title String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc, XPathConstants.STRING); if (journalTitleString == null || journalTitleString.isEmpty()) { journalTitleString = (String) xpath.evaluate( "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING); } entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO); //journal publisher String journalPublisherString = (String) xpath .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO); String journalPublisherIdString = (String) xpath.evaluate( "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO); //journal issn String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc, XPathConstants.STRING); entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO); //copyright/permissions String permissionsString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE)); entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT); //license Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc, XPathConstants.NODE); String licenseString = (String) XMLTools.extractTextFromNode(licenseNode); entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT); //article type NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc, XPathConstants.NODESET); List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes); Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group", domDoc, XPathConstants.NODE); articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode)); entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE); //received date List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE)); if (!receivedDate.isEmpty() && receivedDate.size() >= 3) { for (String date : StringTools.produceDates(receivedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //accepted date List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE)); if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) { for (String date : StringTools.produceDates(acceptedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //publication date List<String> pubdateString; if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } else { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : StringTools.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } pubdateString.clear(); if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : StringTools.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } String extLink = (String) xpath.evaluate( "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc, XPathConstants.STRING); printlnVerbose(extLink); entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA); //keywords Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc, XPathConstants.NODE); String keywordsString = XMLTools.extractTextFromNode(keywordsNode); entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS); //DOI String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc, XPathConstants.STRING); entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO); //volume String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc, XPathConstants.STRING); entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO); entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO); //issue String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc, XPathConstants.STRING); entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO); entries.putIf("journal", BxZoneLabel.MET_BIB_INFO); entries.putIf("et al", BxZoneLabel.MET_BIB_INFO); List<String> authorNames = new ArrayList<String>(); List<String> authorEmails = new ArrayList<String>(); List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! => do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = StringTools.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = bxDoc.asZones().get(zoneIdx); List<String> zoneTokens = StringTools.tokenize(StringTools .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } printlnVerbose("==========================="); for (BxPage page : bxDoc.getPages()) { for (BxZone zone : page.getZones()) { Integer zoneIdx = bxDoc.asZones().indexOf(zone); BxZone curZone = bxDoc.asZones().get(zoneIdx); String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase()); List<String> zoneTokens = StringTools.tokenize(zoneText); Boolean valueSet = false; Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<String> entryTokens = swLabelSim.get(zoneIdx).get(0).entryTokens; if (Math.max(zoneTokens.size(), entryTokens.size()) > 0 && Math.min(zoneTokens.size(), entryTokens.size()) / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7 && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("0 "); } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment - t2.alignment; if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size()); if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("1 "); } } if (!valueSet) { Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class); for (LabelTrio trio : swLabelSim.get(zoneIdx)) { if (cumulated.containsKey(trio.label)) { cumulated.put(trio.label, cumulated.get(trio.label) + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } else { cumulated.put(trio.label, trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } } Double max = Double.NEGATIVE_INFINITY; BxZoneLabel bestLabel = null; for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) { if (entry.getValue() > max) { max = entry.getValue(); bestLabel = entry.getKey(); } } if (max >= 0.5) { curZone.setLabel(bestLabel); printVerbose("2 "); valueSet = true; } } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<LabelTrio> l = swLabelSim.get(zoneIdx); BxZoneLabel best = null; int bestScore = 0; for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { if (lt.entryTokens.contains(zt)) { i++; } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } if (best != null) { curZone.setLabel(best); valueSet = true; } else { for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { for (String j : lt.entryTokens) { if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "") .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) { i++; break; } } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } } if (best != null) { curZone.setLabel(best); valueSet = true; } } if (!valueSet) { curZone.setLabel(null); } printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n"); } Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>(); Set<BxZone> unlabeledZones = new HashSet<BxZone>(); for (BxZone zone : page.getZones()) { if (zone.getLabel() == null) { unlabeledZones.add(zone); zoneLocMap.put(zone, new ZoneLocaliser(zone)); } } Integer lastNumberOfUnlabeledZones; do { lastNumberOfUnlabeledZones = unlabeledZones.size(); infereLabels(unlabeledZones, zoneLocMap); infereLabels(unlabeledZones, zoneLocMap); } while (lastNumberOfUnlabeledZones != unlabeledZones.size()); } printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>="); return bxDoc; }
From source file:pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException { XPath xpath = XPathFactory.newInstance().newXPath(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false);//w w w. j a va 2 s. c om dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); DocumentBuilder builder = dbf.newDocumentBuilder(); Document domDoc = builder.parse(nlmStream); TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader(); Reader r = new InputStreamReader(pdfStream); BxDocument bxDoc = new BxDocument().setPages(reader.read(r)); List<BxZone> zones = Lists.newArrayList(bxDoc.asZones()); Integer bxDocLen = zones.size(); SmartHashMap entries = new SmartHashMap(); //abstract Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc, XPathConstants.NODE); String abstractString = XMLTools.extractTextFromNode(abstractNode); entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT); entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT); //title String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title", domDoc, XPathConstants.STRING); entries.putIf(titleString, BxZoneLabel.MET_TITLE); String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle", domDoc, XPathConstants.STRING); entries.putIf(subtitleString, BxZoneLabel.MET_TITLE); //journal title String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc, XPathConstants.STRING); if (journalTitleString == null || journalTitleString.isEmpty()) { journalTitleString = (String) xpath.evaluate( "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING); } entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO); //journal publisher String journalPublisherString = (String) xpath .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO); String journalPublisherIdString = (String) xpath.evaluate( "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO); //journal issn String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc, XPathConstants.STRING); entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO); //copyright/permissions String permissionsString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE)); entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT); //license Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc, XPathConstants.NODE); String licenseString = (String) XMLTools.extractTextFromNode(licenseNode); entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT); //article type NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc, XPathConstants.NODESET); List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes); Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group", domDoc, XPathConstants.NODE); articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode)); entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE); //received date List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE)); if (!receivedDate.isEmpty() && receivedDate.size() >= 3) { for (String date : TextUtils.produceDates(receivedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //accepted date List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE)); if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) { for (String date : TextUtils.produceDates(acceptedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //publication date List<String> pubdateString; if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } else { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } pubdateString.clear(); if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } String extLink = (String) xpath.evaluate( "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc, XPathConstants.STRING); printlnVerbose(extLink); entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA); //keywords Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc, XPathConstants.NODE); String keywordsString = XMLTools.extractTextFromNode(keywordsNode); entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS); //DOI String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc, XPathConstants.STRING); entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO); //volume String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc, XPathConstants.STRING); entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO); entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO); //issue String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc, XPathConstants.STRING); entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO); entries.putIf("journal", BxZoneLabel.MET_BIB_INFO); entries.putIf("et al", BxZoneLabel.MET_BIB_INFO); List<String> authorNames = new ArrayList<String>(); List<String> authorEmails = new ArrayList<String>(); List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! => do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = TextUtils.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = zones.get(zoneIdx); List<String> zoneTokens = TextUtils.tokenize( TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } for (BxPage pp : bxDoc) { boolean changed = true; while (changed) { changed = false; boolean wasIntro = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); int i = zones.indexOf(z); double titleAl = 0; double authorAl = 0; List<LabelTrio> sims = swLabelSim.get(i); for (LabelTrio t : sims) { if (t.label.equals(BxZoneLabel.MET_TITLE)) { titleAl = t.alignment / t.entryTokens.size(); } if (t.label.equals(BxZoneLabel.MET_AUTHOR)) { authorAl = t.alignment / t.entryTokens.size(); } } String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); int linesCount = z.childrenCount(); int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent()); BxLine firstLine = z.getFirstChild(); if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.MET_TITLE) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) && titleAl >= 0.7 && authorAl >= 0.4) { z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR); } if (linesCount == 2 && text.contains("page") && text.contains("of") && text.contains("page number not for")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (linesCount == 1 && (text.contains("page number not for") || (text.contains("page") && text.contains("of")))) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && linesCount < 11 && (text.contains("department") || text.contains("university"))) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (linesCount < 5 && firstLine.toText().length() < 11 && firstLine.toText().startsWith("Figure") && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx > 0 && z.hasPrev() && z.hasNext() && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_DATES) || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)) && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) && z.getWidth() < 100) { if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2; double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2; double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } } if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION) || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:") || text.contains(" volume ") || text.contains("vol\\. ") || text.contains("doi"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) { z.setLabel(BxZoneLabel.MET_EDITOR); } if (pageIdx == 0 && text.startsWith("copyright:")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume") && text.contains("issue")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.MET_AUTHOR) || z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { BxPage p = z.getParent(); if (pageIdx > 0) { BxPage prevPage = p.getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 1) { BxPage nextPage = p.getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx > 1) { BxPage prevPage = p.getPrev().getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 2) { BxPage nextPage = p.getNext().getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO) || z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+") && text.length() <= 4 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (text.equals("acknowledgments")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("introduction") && z.hasPrev() && !z.getPrev().toText().toLowerCase().equals("abstract")) { wasIntro = true; } if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references") && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10 && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES) && (text.matches("[1-9][0-9]?[0-9]?\\.?") || text.matches(".*[1-2][0-9][0-9][0-9].*"))) { z.setLabel(BxZoneLabel.REFERENCES); } if ((z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (text.startsWith("doi") || text.startsWith("cite this article"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("author details")) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (firstLine.toText().toLowerCase().equals("acknowledgments") || firstLine.toText().toLowerCase().equals("acknowledgements"))) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100) && text.matches("sup-[0-9][0-9]?")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("references")) { z.setLabel(BxZoneLabel.REFERENCES); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?"))) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT) && text.contains("this article is distributed")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("journal")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("correspondence")) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && text.contains("accepted") && text.contains("published")) { z.setLabel(BxZoneLabel.MET_DATES); } if (pageIdx == 0 && linesCount < 10 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4 && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) { if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) { z.setLabel(z.getPrev().getLabel()); } } if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with") || text.contains("will be the most significant development") || text.contains("disseminating the results of biomedical") || text.contains("sir paul nurse") || text.contains("your research papers") || text.contains("available free of charge") || text.contains("peer reviewed and published") || text.contains("cited in pubmed and archived") || text.contains("you keep the copyright") || text.contains("submit your manuscript") || text.contains("submit your next manuscript") || text.contains("online submission") || text.contains("peer review") || text.contains("space constraints") || text.contains("publication on acceptance") || text.contains("inclusion in pubmed") || text.contains("freely available") || text.contains("publication history"))) { z.setLabel(BxZoneLabel.OTH_UNKNOWN); } if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest") || text.startsWith("competing interests") || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest") || z.getPrev().toText().toLowerCase().equals("conflict of interest") || z.getPrev().toText().toLowerCase().equals("competing interests")))) { z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT); } changed = changed || !orig.equals(z.getLabel()); } boolean wasAuthor = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor && ((text.contains("email") && text.contains("@")) || text.startsWith("correspondence"))) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) { wasAuthor = true; } changed = changed || !orig.equals(z.getLabel()); } } } return bxDoc; }
From source file:sf.net.experimaestro.connectors.OARLauncher.java
/** * Evaluate an XPath to a string//from w ww.ja v a 2s .c o m */ static private String evaluateXPathToString(String expression, Document document) { String value; XPath xpath = XPathFactory.newInstance().newXPath(); try { value = (String) xpath.evaluate(expression, document, XPathConstants.STRING); } catch (XPathExpressionException e) { throw new XPMRuntimeException(e, "Cannot evaluted XPath expression [%s]", xpath); } return value; }