Example usage for javax.xml.xpath XPathConstants STRING

List of usage examples for javax.xml.xpath XPathConstants STRING

Introduction

On this page you can find example usages of javax.xml.xpath.XPathConstants.STRING.

Prototype

QName STRING

To view the source code for javax.xml.xpath.XPathConstants.STRING, click the Source Link.

Document

The XPath 1.0 string data type.

Maps to a Java String.

Usage

From source file:org.wso2.carbon.bpmn.core.types.datatypes.xml.api.XMLDocument.java

/**
 * Evaluates an XPath query against this document and returns the result in the requested type.
 *
 * @param xpathStr   xpath expression to evaluate
 * @param returnType the desired return type of the xpath evaluation. Supported return types:
 *                   "NODESET", "NODE", "STRING", "NUMBER", "BOOLEAN"
 * @return result of the xpath evaluation, as produced by Utils.evaluateXPath for the requested type
 * @throws BPMNXmlException         if returnType is null or is not one of the supported names
 * @throws XPathExpressionException declared for callers; presumably raised by Utils.evaluateXPath
 *                                  when the expression cannot be evaluated
 */
public Object xPath(String xpathStr, String returnType) throws BPMNXmlException, XPathExpressionException {

    // Compare constant-first so that a null returnType is reported as an unknown type
    // instead of failing with a NullPointerException.
    if (XPathConstants.NODESET.getLocalPart().equals(returnType)) {
        return Utils.evaluateXPath(doc, xpathStr, XPathConstants.NODESET);
    }
    if (XPathConstants.NODE.getLocalPart().equals(returnType)) {
        return Utils.evaluateXPath(doc, xpathStr, XPathConstants.NODE);
    }
    if (XPathConstants.STRING.getLocalPart().equals(returnType)) {
        return Utils.evaluateXPath(doc, xpathStr, XPathConstants.STRING);
    }
    if (XPathConstants.NUMBER.getLocalPart().equals(returnType)) {
        return Utils.evaluateXPath(doc, xpathStr, XPathConstants.NUMBER);
    }
    if (XPathConstants.BOOLEAN.getLocalPart().equals(returnType)) {
        return Utils.evaluateXPath(doc, xpathStr, XPathConstants.BOOLEAN);
    }

    //Unknown return type
    throw new BPMNXmlException("Unknown return type : " + returnType);
}

From source file:org.wso2.carbon.bpmn.core.types.datatypes.xml.Utils.java

/**
 * Function to evaluate xpath. This will resolve the NodeList to Node if the result contains only one node
 * @param doc/*from ww  w . j a v a2 s. c o  m*/
 * @param xpathStr
 * @return
 * @throws XPathExpressionException
 */
public static Object evaluateXPath(Document doc, String xpathStr) throws BPMNXmlException {

    Object result = null;
    NodeList outputObjList = null;
    try {
        outputObjList = (NodeList) evaluateXPath(doc, xpathStr, XPathConstants.NODESET);
        if (outputObjList.getLength() == 1) {
            //If there is only one node
            if (outputObjList.item(0) instanceof Text) {
                return ((Text) outputObjList.item(0)).getWholeText();
            }
            return outputObjList.item(0);
        }
        return outputObjList;

    } catch (XPathExpressionException eLevel1) {
        //provided xpath cannot be evaluated to NodeList, so it may be evaluated to string
        try {

            if (log.isDebugEnabled()) {
                log.debug("Since evaluating the xpath: " + xpathStr
                        + " to NodeList failed, retrying to evaluate it to a STRING");
            }
            return evaluateXPath(doc, xpathStr, XPathConstants.STRING);

        } catch (XPathExpressionException eLevel2) {
            if (log.isDebugEnabled()) {
                log.debug("Error occurred while evaluating xpath :" + xpathStr + " on xml: " + doc.toString());
            }
            throw new BPMNXmlException(
                    "Error occurred while evaluating xpath :" + xpathStr + " due to error in xpath", eLevel2);
        }
    }
}

From source file:org.wso2.carbon.humantask.core.engine.runtime.xpath.XPathExpressionRuntime.java

/**
 * Evaluates an XPath expression and normalises the outcome into a list.
 * <p>
 * The expression is first evaluated as a NODESET; if that attempt throws, it is evaluated
 * again as a STRING. A single non-node value or a plain string is wrapped in a text node
 * that is attached to a fresh "wrapper" element, so the returned node always has a parent.
 *
 * @param exp     XPath expression string
 * @param evalCtx Evaluation context containing all the required context information
 * @return list of selected nodes, a single-element list holding the wrapping text node, or
 *         null when the evaluation result is of none of the handled types
 */
@Override
public List evaluate(String exp, EvaluationContext evalCtx) {
    Object outcome;
    try {
        outcome = evaluate(exp, evalCtx, XPathConstants.NODESET);
    } catch (Exception e) {
        outcome = evaluate(exp, evalCtx, XPathConstants.STRING);
    }

    if (outcome instanceof List) {
        List items = (List) outcome;
        if (log.isDebugEnabled()) {
            log.debug("Returned list of size " + items.size());
        }

        // Anything other than a lone non-node value is handed back untouched.
        if (items.size() != 1 || items.get(0) instanceof Node) {
            return items;
        }

        // Dealing with a Java class
        Object simpleType = items.get(0);
        // Dates and durations have dedicated formatting; toString is only the fallback.
        String textVal;
        if (simpleType instanceof Date) {
            textVal = ISO8601DateParser.format((Date) simpleType);
        } else if (simpleType instanceof DurationValue) {
            textVal = ((DurationValue) simpleType).getStringValue();
        } else {
            textVal = simpleType.toString();
        }

        // Wrap in a document; the text node gets a parent in case it's an LValue expression.
        Document holder = DOMUtils.newDocument();
        Element wrapper = holder.createElement("wrapper");
        Text textNode = holder.createTextNode(textVal);
        wrapper.appendChild(textNode);
        holder.appendChild(wrapper);
        return Collections.singletonList(textNode);
    }

    if (outcome instanceof NodeList) {
        NodeList selected = (NodeList) outcome;
        int count = selected.getLength();
        if (log.isDebugEnabled()) {
            log.debug("Returned node list of size " + count);
        }
        List nodes = new ArrayList(count);
        for (int idx = 0; idx < selected.getLength(); idx++) {
            Node current = selected.item(idx);
            // A document node is replaced by its root element.
            if (current.getNodeType() == Node.DOCUMENT_NODE) {
                current = ((Document) current).getDocumentElement();
            }
            nodes.add(current);
        }
        return nodes;
    }

    if (outcome instanceof String) {
        // Wrapping in a document
        Document holder = DOMUtils.newDocument();
        Element wrapper = holder.createElement("wrapper");
        Text textNode = holder.createTextNode((String) outcome);
        wrapper.appendChild(textNode);
        holder.appendChild(wrapper);
        return Collections.singletonList(textNode);
    }

    return null;
}

From source file:org.wso2.carbon.humantask.core.engine.runtime.xpath.XPathExpressionRuntime.java

/**
 * Evaluate given XPath string and returns result as a string
 *
 * @param exp     XPath expression string
 * @param evalCtx Evaluation context containing all the required context information
 * @return String/*w w w  . j a v  a2s.  c  o m*/
 */
@Override
public String evaluateAsString(String exp, EvaluationContext evalCtx) {
    return (String) evaluate(exp, evalCtx, XPathConstants.STRING);
}

From source file:org.wso2.identity.scenarios.commons.SAML2SSOTestBase.java

/**
 * Return whether SAML Assertion has the canonicalization method
 * set to 'http://www.w3.org/2001/10/xml-exc-c14n#WithComments'.
 *
 * @param document/*www .j av  a2  s.c o  m*/
 * @return true if canonicalization method equals to 'http://www.w3.org/2001/10/xml-exc-c14n#WithComments'
 */
private boolean isSignedWithComments(Document document) {

    XPath xPath = XPathFactory.newInstance().newXPath();
    try {
        String assertionId = (String) xPath.compile("//*[local-name()='Assertion']/@ID").evaluate(document,
                XPathConstants.STRING);

        if (StringUtils.isBlank(assertionId)) {
            return false;
        }

        NodeList nodeList = ((NodeList) xPath
                .compile("//*[local-name()='Assertion']" + "/*[local-name()='Signature']"
                        + "/*[local-name()='SignedInfo']" + "/*[local-name()='Reference'][@URI='#" + assertionId
                        + "']" + "/*[local-name()='Transforms']" + "/*[local-name()='Transform']"
                        + "[@Algorithm='http://www.w3.org/2001/10/xml-exc-c14n#WithComments']")
                .evaluate(document, XPathConstants.NODESET));
        return nodeList != null && nodeList.getLength() > 0;
    } catch (XPathExpressionException e) {
        String message = "Failed to find the canonicalization algorithm of the assertion. Defaulting to: "
                + "http://www.w3.org/2001/10/xml-exc-c14n#";
        log.warn(message);
        return false;
    }
}

From source file:org.wso2.ppaas.configurator.tests.ConfiguratorTestManager.java

/**
 * Parses an XML file located relative to this class's resource root and evaluates an XPath
 * expression against it as a string.
 *
 * @param resourcePath    path of the XML file, appended below the configurator directory
 * @param xpathExpression XPath expression to evaluate against the parsed document
 * @return the string value of the expression, or null if parsing or evaluation fails
 *         (the failure is logged)
 */
public String readXML(String resourcePath, String xpathExpression) {

    try {
        String targetFile = ConfiguratorTestManager.class.getResource(PATH_SEP).getPath() + ".." + PATH_SEP
                + CONFIGURATOR_DIR_NAME + PATH_SEP + resourcePath;
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document doc = builder.parse(targetFile);
        XPath xpath = XPathFactory.newInstance().newXPath();
        XPathExpression expr = xpath.compile(xpathExpression);
        String value = expr.evaluate(doc, XPathConstants.STRING).toString();
        log.info("Parsed value" + value);
        return value;

    } catch (ParserConfigurationException | SAXException | IOException | XPathExpressionException e) {
        // Log the exception itself, not just its message, so the stack trace is kept.
        log.error("Error in parsing xml " + e.getMessage(), e);
    }
    return null;
}

From source file:org.xdi.service.XmlService.java

/**
 * Evaluates an XPath expression against a document and returns either the string value of the
 * result or the value of a named attribute on the selected node.
 *
 * @param xmlDocument     document to evaluate the expression against
 * @param xPathExpression XPath expression to compile and evaluate
 * @param attributeName   name of the attribute to read from the selected node; when
 *                        StringHelper.isEmpty reports it as empty, the expression's string
 *                        value is returned instead
 * @return the string value of the expression, or the named attribute's value; null when no
 *         node is selected, the selected node carries no attributes, or the attribute is absent
 * @throws XPathExpressionException if the expression cannot be compiled or evaluated
 */
public String getNodeValue(Document xmlDocument, String xPathExpression, String attributeName)
        throws XPathExpressionException {
    XPath xPath = XPathFactory.newInstance().newXPath();
    XPathExpression formXPathExpression = xPath.compile(xPathExpression);

    if (StringHelper.isEmpty(attributeName)) {
        return (String) formXPathExpression.evaluate(xmlDocument, XPathConstants.STRING);
    }

    Node node = ((Node) formXPathExpression.evaluate(xmlDocument, XPathConstants.NODE));
    // getAttributes() is null for anything that is not an element (text, attribute, comment...),
    // so treat such a selection like a missing attribute rather than dereferencing null.
    if (node == null || node.getAttributes() == null) {
        return null;
    }

    Node attributeNode = node.getAttributes().getNamedItem(attributeName);
    if (attributeNode == null) {
        return null;
    }

    return attributeNode.getNodeValue();
}

From source file:pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java

public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
        throws AnalysisException, ParserConfigurationException, SAXException, IOException,
        XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);/*w ww  .j av  a  2s  . c  o  m*/
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor();
    BxDocument bxDoc = structureExtractor.extractStructure(pdfStream);
    Integer bxDocLen = bxDoc.asZones().size();

    SmartHashMap entries = new SmartHashMap();

    //abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    //title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);
    //journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    //journal publisher
    String journalPublisherString = (String) xpath
            .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    //journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    //copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    //license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    //article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    //received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : StringTools.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : StringTools.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);
    //keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    //DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    //volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    //issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
    List<String> authorAffiliations = new ArrayList<String>();
    List<String> editors = new ArrayList<String>();

    //pages
    String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
    String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
    entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    try {
        int f = Integer.valueOf(fPage);
        int l = Integer.valueOf(lPage);
        while (f < l) {
            f++;
            entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
        }
    } catch (NumberFormatException ex) {
    }

    entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

    //editors
    NodeList editorNodes = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
        String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
        editors.add(editorString);
    }
    entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR);

    NodeList authorsResult = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
        Node curNode = authorsResult.item(nodeIdx);
        //author names
        String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
        String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);
        //author affiliation
        List<String> aff = XMLTools.extractTextAsList((NodeList) xpath
                .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));

        //author correspondence
        String email;
        try {
            email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
        } catch (XPathExpressionException e) {
            email = "";
        }
        if (email.isEmpty()) {
            try {
                email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException e) {
                //yaaay, probably there is no e-mail at all! => do nothing
            }
        }
        if (!email.isEmpty()) {
            authorEmails.add(email);
        }
        if (!aff.isEmpty()) {
            authorAffiliations.addAll(aff);
        }
        authorNames.add(name + " " + surname);
    }
    entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

    //authors' affiliations
    NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
            XPathConstants.NODESET);
    authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
    entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

    //correspondence again
    NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
            domDoc, XPathConstants.NODESET);
    authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
    entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

    //author notes
    Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
            XPathConstants.NODE);
    String notesString = XMLTools.extractTextFromNode(notesNode);
    entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
    notesString = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

    //article body
    NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
    List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
    entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

    NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
    String appStrings = XMLTools.extractTextFromNodes(appNodes);
    entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

    //section titles
    NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
            XPathConstants.NODESET);
    List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
    entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

    NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
            XPathConstants.NODESET);
    List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
    entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

    //figures
    NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
            XPathConstants.NODESET);
    List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

    figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

    //tables
    List<String> tableCaptions = new ArrayList<String>();
    List<String> tableBodies = new ArrayList<String>();
    List<String> tableFootnotes = new ArrayList<String>();
    //tableNodes
    NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

    for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
        Node tableNode = tableNodes.item(nodeIdx);

        String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
        tableCaptions.add(caption);

        String body = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
        tableBodies.add(body);

        List<String> footnotes = XMLTools.extractTextAsList(
                (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
        tableFootnotes.addAll(footnotes);

        entries.putIf(caption, BxZoneLabel.BODY_TABLE);
        entries.putIf(body, BxZoneLabel.BODY_TABLE);
        entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
    }

    //financial disclosure
    String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
            .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
    entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    //conflict
    String conflictString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
    entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

    //copyright
    String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
            "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
    entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

    //acknowledgment
    String acknowledgement = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    acknowledgement = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

    //glossary
    String glossary = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
    entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

    //formula
    NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
        Node curFormulaNode = formulaNodes.item(nodeIdx);
        String label = (String) xpath.evaluate("label", curFormulaNode);
        entries.putIf(label, BxZoneLabel.BODY_EQUATION);

        NodeList curNodeChildren = curFormulaNode.getChildNodes();
        List<String> formulaParts = new ArrayList<String>();
        for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
            Node curChild = curNodeChildren.item(childIdx);
            if (curChild.getNodeName().equals("label")) {
                continue;
            }
            formulaParts.add(XMLTools.extractTextFromNode(curChild));
        }
        entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
    }

    //references
    List<String> refStrings = new ArrayList<String>();
    Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
    if (refParentNode != null) {
        for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) {
            refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx)));
        }
    }
    entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES);
    entries.put("references", BxZoneLabel.REFERENCES);

    Set<String> allBibInfos = new HashSet<String>();
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
            allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
        }
    }
    entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

    printlnVerbose("journalTitle: " + journalTitleString);
    printlnVerbose("journalPublisher: " + journalPublisherString);
    printlnVerbose("journalISSNPublisher: " + journalISSNString);

    printlnVerbose("articleType: " + articleTypeStrings);
    printlnVerbose("received: " + receivedDate);
    printlnVerbose("accepted: " + acceptedDate);
    printlnVerbose("pubdate: " + pubdateString);
    printlnVerbose("permissions: " + permissionsString);
    printlnVerbose("license: " + licenseString);

    printlnVerbose("title: " + titleString);
    printlnVerbose("abstract: " + abstractString);

    printlnVerbose("authorEmails: " + authorEmails);
    printlnVerbose("authorNames: " + authorNames);
    printlnVerbose("authorAff: " + authorAffiliations);
    printlnVerbose("authorNotes: " + notesString);
    printlnVerbose("editor: " + editors);

    printlnVerbose("keywords: " + keywordsString);
    printlnVerbose("DOI: " + doiString);
    printlnVerbose("volume: " + volumeString);
    printlnVerbose("issue: " + issueString);
    printlnVerbose("financial dis.: " + financialDisclosure);

    printlnVerbose("paragraphs: " + paragraphStrings);
    printlnVerbose("section titles: " + sectionTitles);

    printlnVerbose("tableBodies: " + tableBodies);
    printlnVerbose("tableCaptions: " + tableCaptions);
    printlnVerbose("tableFootnotes: " + tableFootnotes);

    printlnVerbose("figures: " + figureStrings);
    printlnVerbose("acknowledgement: " + acknowledgement);

    printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

    SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);
    CosineDistance cos = new CosineDistance();

    //index: (zone,entry)
    List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
    List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen);
    for (Integer i = 0; i < bxDocLen; ++i) {
        swLabelSim.add(new ArrayList<LabelTrio>());
        cosLabProb.add(new ArrayList<LabelTrio>());
    }

    //iterate over entries
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        List<String> entryTokens = StringTools.tokenize(entry.getKey());
        printlnVerbose("--------------------");
        printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
        //iterate over zones
        for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
            BxZone curZone = bxDoc.asZones().get(zoneIdx);
            List<String> zoneTokens = StringTools.tokenize(StringTools
                    .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase())));

            Double smithSim;
            Double cosSim;
            if (curZone.toText().contains("www.biomedcentral.com")) {
                //ignore
                smithSim = 0.;
                cosSim = 0.;
            } else {
                smithSim = smith.compare(entryTokens, zoneTokens);
                cosSim = cos.compare(entryTokens, zoneTokens);
            }
            printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n");
            swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
            cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim));
        }
    }

    printlnVerbose("===========================");
    for (BxPage page : bxDoc.getPages()) {
        for (BxZone zone : page.getZones()) {
            Integer zoneIdx = bxDoc.asZones().indexOf(zone);
            BxZone curZone = bxDoc.asZones().get(zoneIdx);
            String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase());
            List<String> zoneTokens = StringTools.tokenize(zoneText);
            Boolean valueSet = false;

            Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                @Override
                public int compare(LabelTrio t1, LabelTrio t2) {
                    Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size();
                    if (Math.abs(simDif) < 0.0001) {
                        return t2.entryTokens.size() - t1.entryTokens.size();
                    }
                    if (simDif > 0) {
                        return 1;
                    } else {
                        return -1;
                    }
                }
            });
            Collections.reverse(swLabelSim.get(zoneIdx));

            List<String> entryTokens = swLabelSim.get(zoneIdx).get(0).entryTokens;
            if (Math.max(zoneTokens.size(), entryTokens.size()) > 0
                    && Math.min(zoneTokens.size(), entryTokens.size())
                            / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7
                    && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) {
                curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label);
                valueSet = true;
                printVerbose("0 ");
            }

            if (!valueSet) {
                Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                    @Override
                    public int compare(LabelTrio t1, LabelTrio t2) {
                        Double simDif = t1.alignment - t2.alignment;
                        if (Math.abs(simDif) < 0.0001) {
                            return t2.entryTokens.size() - t1.entryTokens.size();
                        }
                        if (simDif > 0) {
                            return 1;
                        } else {
                            return -1;
                        }
                    }
                });
                Collections.reverse(swLabelSim.get(zoneIdx));
                printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size());
                if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) {
                    curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label);
                    valueSet = true;
                    printVerbose("1 ");
                }
            }

            if (!valueSet) {
                Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class);
                for (LabelTrio trio : swLabelSim.get(zoneIdx)) {
                    if (cumulated.containsKey(trio.label)) {
                        cumulated.put(trio.label, cumulated.get(trio.label)
                                + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size()));
                    } else {
                        cumulated.put(trio.label,
                                trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size()));
                    }
                }
                Double max = Double.NEGATIVE_INFINITY;
                BxZoneLabel bestLabel = null;
                for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) {
                    if (entry.getValue() > max) {
                        max = entry.getValue();
                        bestLabel = entry.getKey();
                    }
                }
                if (max >= 0.5) {
                    curZone.setLabel(bestLabel);
                    printVerbose("2 ");
                    valueSet = true;
                }
            }

            if (!valueSet) {
                Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                    @Override
                    public int compare(LabelTrio t1, LabelTrio t2) {
                        Double simDif = t1.alignment / t1.entryTokens.size()
                                - t2.alignment / t2.entryTokens.size();
                        if (Math.abs(simDif) < 0.001) {
                            return t2.entryTokens.size() - t1.entryTokens.size();
                        }
                        if (simDif > 0) {
                            return 1;
                        } else {
                            return -1;
                        }
                    }
                });
                Collections.reverse(swLabelSim.get(zoneIdx));
                List<LabelTrio> l = swLabelSim.get(zoneIdx);

                BxZoneLabel best = null;
                int bestScore = 0;
                for (LabelTrio lt : l) {
                    int i = 0;
                    for (String zt : zoneTokens) {
                        if (lt.entryTokens.contains(zt)) {
                            i++;
                        }
                    }
                    if (i > bestScore && i > 1) {
                        best = lt.label;
                        bestScore = i;
                    }
                }
                if (best != null) {
                    curZone.setLabel(best);
                    valueSet = true;
                } else {
                    for (LabelTrio lt : l) {
                        int i = 0;
                        for (String zt : zoneTokens) {
                            for (String j : lt.entryTokens) {
                                if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "")
                                        .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) {
                                    i++;
                                    break;
                                }
                            }
                        }
                        if (i > bestScore && i > 1) {
                            best = lt.label;
                            bestScore = i;
                        }
                    }
                }

                if (best != null) {
                    curZone.setLabel(best);
                    valueSet = true;
                }
            }
            if (!valueSet) {
                curZone.setLabel(null);
            }
            printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n");
        }
        Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>();
        Set<BxZone> unlabeledZones = new HashSet<BxZone>();
        for (BxZone zone : page.getZones()) {
            if (zone.getLabel() == null) {
                unlabeledZones.add(zone);
                zoneLocMap.put(zone, new ZoneLocaliser(zone));
            }
        }
        Integer lastNumberOfUnlabeledZones;
        do {
            lastNumberOfUnlabeledZones = unlabeledZones.size();
            infereLabels(unlabeledZones, zoneLocMap);
            infereLabels(unlabeledZones, zoneLocMap);
        } while (lastNumberOfUnlabeledZones != unlabeledZones.size());
    }
    printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>=");

    return bxDoc;
}

From source file: pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java

public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
        throws AnalysisException, ParserConfigurationException, SAXException, IOException,
        XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader();
    Reader r = new InputStreamReader(pdfStream);
    BxDocument bxDoc = new BxDocument().setPages(reader.read(r));

    List<BxZone> zones = Lists.newArrayList(bxDoc.asZones());

    Integer bxDocLen = zones.size();

    SmartHashMap entries = new SmartHashMap();

    //abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    //title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);
    //journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    //journal publisher
    String journalPublisherString = (String) xpath
            .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    //journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    //copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    //license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    //article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    //received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : TextUtils.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : TextUtils.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : TextUtils.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : TextUtils.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);
    //keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    //DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    //volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    //issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
    List<String> authorAffiliations = new ArrayList<String>();
    List<String> editors = new ArrayList<String>();

    //pages
    String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
    String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
    entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    try {
        int f = Integer.valueOf(fPage);
        int l = Integer.valueOf(lPage);
        while (f < l) {
            f++;
            entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
        }
    } catch (NumberFormatException ex) {
    }

    entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

    //editors
    NodeList editorNodes = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
        String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
        editors.add(editorString);
    }
    entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR);

    NodeList authorsResult = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
        Node curNode = authorsResult.item(nodeIdx);
        //author names
        String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
        String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);
        //author affiliation
        List<String> aff = XMLTools.extractTextAsList((NodeList) xpath
                .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));

        //author correspondence
        String email;
        try {
            email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
        } catch (XPathExpressionException e) {
            email = "";
        }
        if (email.isEmpty()) {
            try {
                email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException e) {
                //yaaay, probably there is no e-mail at all! => do nothing
            }
        }
        if (!email.isEmpty()) {
            authorEmails.add(email);
        }
        if (!aff.isEmpty()) {
            authorAffiliations.addAll(aff);
        }
        authorNames.add(name + " " + surname);
    }
    entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

    //authors' affiliations
    NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
            XPathConstants.NODESET);
    authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
    entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

    //correspondence again
    NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
            domDoc, XPathConstants.NODESET);
    authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
    entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

    //author notes
    Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
            XPathConstants.NODE);
    String notesString = XMLTools.extractTextFromNode(notesNode);
    entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
    notesString = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

    //article body
    NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
    List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
    entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

    NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
    String appStrings = XMLTools.extractTextFromNodes(appNodes);
    entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

    //section titles
    NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
            XPathConstants.NODESET);
    List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
    entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

    NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
            XPathConstants.NODESET);
    List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
    entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

    //figures
    NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
            XPathConstants.NODESET);
    List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

    figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

    //tables
    List<String> tableCaptions = new ArrayList<String>();
    List<String> tableBodies = new ArrayList<String>();
    List<String> tableFootnotes = new ArrayList<String>();
    //tableNodes
    NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

    for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
        Node tableNode = tableNodes.item(nodeIdx);

        String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
        tableCaptions.add(caption);

        String body = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
        tableBodies.add(body);

        List<String> footnotes = XMLTools.extractTextAsList(
                (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
        tableFootnotes.addAll(footnotes);

        entries.putIf(caption, BxZoneLabel.BODY_TABLE);
        entries.putIf(body, BxZoneLabel.BODY_TABLE);
        entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
    }

    //financial disclosure
    String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
            .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
    entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    //conflict
    String conflictString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
    entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

    //copyright
    String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
            "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
    entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

    //acknowledgment
    String acknowledgement = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    acknowledgement = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

    //glossary
    String glossary = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
    entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

    //formula
    NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
        Node curFormulaNode = formulaNodes.item(nodeIdx);
        String label = (String) xpath.evaluate("label", curFormulaNode);
        entries.putIf(label, BxZoneLabel.BODY_EQUATION);

        NodeList curNodeChildren = curFormulaNode.getChildNodes();
        List<String> formulaParts = new ArrayList<String>();
        for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
            Node curChild = curNodeChildren.item(childIdx);
            if (curChild.getNodeName().equals("label")) {
                continue;
            }
            formulaParts.add(XMLTools.extractTextFromNode(curChild));
        }
        entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
    }

    //references
    List<String> refStrings = new ArrayList<String>();
    Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
    if (refParentNode != null) {
        for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) {
            refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx)));
        }
    }
    entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES);
    entries.put("references", BxZoneLabel.REFERENCES);

    Set<String> allBibInfos = new HashSet<String>();
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
            allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
        }
    }
    entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

    printlnVerbose("journalTitle: " + journalTitleString);
    printlnVerbose("journalPublisher: " + journalPublisherString);
    printlnVerbose("journalISSNPublisher: " + journalISSNString);

    printlnVerbose("articleType: " + articleTypeStrings);
    printlnVerbose("received: " + receivedDate);
    printlnVerbose("accepted: " + acceptedDate);
    printlnVerbose("pubdate: " + pubdateString);
    printlnVerbose("permissions: " + permissionsString);
    printlnVerbose("license: " + licenseString);

    printlnVerbose("title: " + titleString);
    printlnVerbose("abstract: " + abstractString);

    printlnVerbose("authorEmails: " + authorEmails);
    printlnVerbose("authorNames: " + authorNames);
    printlnVerbose("authorAff: " + authorAffiliations);
    printlnVerbose("authorNotes: " + notesString);
    printlnVerbose("editor: " + editors);

    printlnVerbose("keywords: " + keywordsString);
    printlnVerbose("DOI: " + doiString);
    printlnVerbose("volume: " + volumeString);
    printlnVerbose("issue: " + issueString);
    printlnVerbose("financial dis.: " + financialDisclosure);

    printlnVerbose("paragraphs: " + paragraphStrings);
    printlnVerbose("section titles: " + sectionTitles);

    printlnVerbose("tableBodies: " + tableBodies);
    printlnVerbose("tableCaptions: " + tableCaptions);
    printlnVerbose("tableFootnotes: " + tableFootnotes);

    printlnVerbose("figures: " + figureStrings);
    printlnVerbose("acknowledgement: " + acknowledgement);

    printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

    SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);
    CosineDistance cos = new CosineDistance();

    //index: (zone,entry)
    List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
    List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen);
    for (Integer i = 0; i < bxDocLen; ++i) {
        swLabelSim.add(new ArrayList<LabelTrio>());
        cosLabProb.add(new ArrayList<LabelTrio>());
    }

    //iterate over entries
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        List<String> entryTokens = TextUtils.tokenize(entry.getKey());
        printlnVerbose("--------------------");
        printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
        //iterate over zones
        for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
            BxZone curZone = zones.get(zoneIdx);
            List<String> zoneTokens = TextUtils.tokenize(
                    TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase())));

            Double smithSim;
            Double cosSim;
            if (curZone.toText().contains("www.biomedcentral.com")) {
                //ignore
                smithSim = 0.;
                cosSim = 0.;
            } else {
                smithSim = smith.compare(entryTokens, zoneTokens);
                cosSim = cos.compare(entryTokens, zoneTokens);
            }
            printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n");
            swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
            cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim));
        }
    }

    for (BxPage pp : bxDoc) {

        boolean changed = true;
        while (changed) {

            changed = false;
            boolean wasIntro = false;

            for (BxZone z : pp) {
                BxZoneLabel orig = z.getLabel();
                int i = zones.indexOf(z);

                double titleAl = 0;
                double authorAl = 0;
                List<LabelTrio> sims = swLabelSim.get(i);
                for (LabelTrio t : sims) {
                    if (t.label.equals(BxZoneLabel.MET_TITLE)) {
                        titleAl = t.alignment / t.entryTokens.size();
                    }
                    if (t.label.equals(BxZoneLabel.MET_AUTHOR)) {
                        authorAl = t.alignment / t.entryTokens.size();
                    }
                }

                String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                int linesCount = z.childrenCount();
                int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent());
                BxLine firstLine = z.getFirstChild();

                if (pageIdx == 0
                        && (z.getLabel().equals(BxZoneLabel.MET_TITLE)
                                || z.getLabel().equals(BxZoneLabel.BODY_CONTENT))
                        && titleAl >= 0.7 && authorAl >= 0.4) {
                    z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR);
                }
                if (linesCount == 2 && text.contains("page") && text.contains("of")
                        && text.contains("page number not for")) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if (linesCount == 1 && (text.contains("page number not for")
                        || (text.contains("page") && text.contains("of")))) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && linesCount < 11 && (text.contains("department") || text.contains("university"))) {
                    z.setLabel(BxZoneLabel.MET_AFFILIATION);
                }
                if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (linesCount < 5 && firstLine.toText().length() < 11
                        && firstLine.toText().startsWith("Figure")
                        && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                    z.setLabel(BxZoneLabel.BODY_FIGURE);
                }
                if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (pageIdx > 0 && z.hasPrev() && z.hasNext()
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                                || z.getLabel().equals(BxZoneLabel.MET_DATES)
                                || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT))
                        && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                                || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE))
                        && z.getWidth() < 100) {
                    if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                            && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        z.setLabel(BxZoneLabel.BODY_TABLE);
                    }
                    if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2;
                        double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2;
                        double zMX = z.getX() + z.getWidth() / 2;
                        double zMY = z.getY() + z.getHeight() / 2;
                        if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                    if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2;
                        double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2;
                        double zMX = z.getX() + z.getWidth() / 2;
                        double zMY = z.getY() + z.getHeight() / 2;
                        if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                }
                if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION)
                        || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:")
                        || text.contains(" volume ") || text.contains("vol\\. ") || text.contains("doi"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) {
                    z.setLabel(BxZoneLabel.MET_EDITOR);
                }
                if (pageIdx == 0 && text.startsWith("copyright:")) {
                    z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume")
                        && text.contains("issue")) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.MET_AUTHOR)
                        || z.getLabel().equals(BxZoneLabel.REFERENCES)
                        || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6
                        && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                    BxPage p = z.getParent();
                    if (pageIdx > 0) {
                        BxPage prevPage = p.getPrev();
                        for (BxZone z1 : prevPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx < bxDoc.childrenCount() - 1) {
                        BxPage nextPage = p.getNext();
                        for (BxZone z1 : nextPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx > 1) {
                        BxPage prevPage = p.getPrev().getPrev();
                        for (BxZone z1 : prevPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx < bxDoc.childrenCount() - 2) {
                        BxPage nextPage = p.getNext().getNext();
                        for (BxZone z1 : nextPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                        || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO)
                        || z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+")
                        && text.length() <= 4
                        && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if (text.equals("acknowledgments")) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }
                if (text.startsWith("introduction") && z.hasPrev()
                        && !z.getPrev().toText().toLowerCase().equals("abstract")) {
                    wasIntro = true;
                }
                if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }

                if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references")
                        && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10
                        && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT)
                        && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES)
                        && (text.matches("[1-9][0-9]?[0-9]?\\.?")
                                || text.matches(".*[1-2][0-9][0-9][0-9].*"))) {
                    z.setLabel(BxZoneLabel.REFERENCES);
                }
                if ((z.getLabel().equals(BxZoneLabel.REFERENCES)
                        || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && (text.startsWith("doi") || text.startsWith("cite this article"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && firstLine.toText().toLowerCase().equals("author details")) {
                    z.setLabel(BxZoneLabel.MET_AFFILIATION);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && (firstLine.toText().toLowerCase().equals("acknowledgments")
                                || firstLine.toText().toLowerCase().equals("acknowledgements"))) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)
                        && text.matches("sup-[0-9][0-9]?")) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && firstLine.toText().toLowerCase().equals("references")) {
                    z.setLabel(BxZoneLabel.REFERENCES);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText()
                        .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.")
                        || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?"))) {
                    z.setLabel(BxZoneLabel.BODY_FIGURE);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText()
                        .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) {
                    z.setLabel(BxZoneLabel.BODY_TABLE);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)
                        && text.contains("this article is distributed")) {
                    z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && text.contains("journal")) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && text.contains("correspondence")) {
                    z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                }
                if (pageIdx == 0
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && text.contains("accepted") && text.contains("published")) {
                    z.setLabel(BxZoneLabel.MET_DATES);
                }

                if (pageIdx == 0 && linesCount < 10
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4
                        && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) {
                    if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) {
                        z.setLabel(z.getPrev().getLabel());
                    }
                }
                if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with")
                        || text.contains("will be the most significant development")
                        || text.contains("disseminating the results of biomedical")
                        || text.contains("sir paul nurse") || text.contains("your research papers")
                        || text.contains("available free of charge")
                        || text.contains("peer reviewed and published")
                        || text.contains("cited in pubmed and archived")
                        || text.contains("you keep the copyright") || text.contains("submit your manuscript")
                        || text.contains("submit your next manuscript") || text.contains("online submission")
                        || text.contains("peer review") || text.contains("space constraints")
                        || text.contains("publication on acceptance") || text.contains("inclusion in pubmed")
                        || text.contains("freely available") || text.contains("publication history"))) {
                    z.setLabel(BxZoneLabel.OTH_UNKNOWN);
                }
                if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }

                if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest")
                        || text.startsWith("competing interests")
                        || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest")
                                || z.getPrev().toText().toLowerCase().equals("conflict of interest")
                                || z.getPrev().toText().toLowerCase().equals("competing interests")))) {
                    z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT);
                }

                changed = changed || !orig.equals(z.getLabel());
            }

            boolean wasAuthor = false;
            for (BxZone z : pp) {
                BxZoneLabel orig = z.getLabel();

                String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor
                        && ((text.contains("email") && text.contains("@"))
                                || text.startsWith("correspondence"))) {
                    z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                }

                if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel())
                        || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) {
                    wasAuthor = true;
                }
                changed = changed || !orig.equals(z.getLabel());
            }

        }
    }

    return bxDoc;
}

From source file:sf.net.experimaestro.connectors.OARLauncher.java

/**
 * Evaluates an XPath expression against a DOM document and returns the result as a string.
 *
 * <p>A fresh {@link XPath} evaluator is created on every call.
 *
 * @param expression the XPath expression to evaluate
 * @param document the document used as the evaluation context
 * @return the evaluation result, requested as {@link XPathConstants#STRING}
 * @throws XPMRuntimeException if the expression cannot be evaluated; the original
 *         {@link XPathExpressionException} is passed along as the cause
 */
private static String evaluateXPathToString(String expression, Document document) {
    final XPath xpath = XPathFactory.newInstance().newXPath();
    try {
        return (String) xpath.evaluate(expression, document, XPathConstants.STRING);
    } catch (XPathExpressionException e) {
        // Report the failing expression text, not the XPath evaluator object.
        throw new XPMRuntimeException(e, "Cannot evaluted XPath expression [%s]", expression);
    }
}