Example usage for javax.xml.xpath XPathConstants NODE

Introduction

In this page you can find the example usage for javax.xml.xpath XPathConstants NODE.

Prototype

QName NODE

To view the source code for javax.xml.xpath XPathConstants NODE.

Click Source Link

Document

The XPath 1.0 NodeSet data type.

Usage

From source file:org.zaproxy.zap.extension.saml.SAMLMessage.java

/** Update XML document with any attributes that were changed */
private void updateXMLDocument() {
    XPathFactory xFactory = XPathFactory.newInstance();
    XPath xpath = xFactory.newXPath();
    for (Attribute attribute : attributeMap.values()) {
        try {/*from  www .  j  a va 2 s .  co m*/
            Node node = (Node) xpath.compile(attribute.getxPath()).evaluate(xmlDocument, XPathConstants.NODE);
            if (node != null) { // the attributes that aren't available will be giving null
                // values
                if (node instanceof Element) {
                    node.setTextContent(attribute.getValue().toString());
                } else if (node instanceof Attr) {
                    ((Attr) node).setValue(attribute.getValue().toString());
                } else {
                    node.setNodeValue(attribute.getValue().toString());
                }
            }
        } catch (XPathExpressionException e) {
            log.warn(attribute.getxPath() + " is not a valid XPath", e);
        }
    }
    if (SAMLConfiguration.getInstance().getXSWEnabled()) {
        try {
            NodeList nodeList = (NodeList) xpath.compile("/Response//Signature").evaluate(xmlDocument,
                    XPathConstants.NODESET);
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node item = nodeList.item(i);
                if (item instanceof Element) {
                    item.getParentNode().removeChild(item);
                }
            }
        } catch (XPathExpressionException e) {
            log.warn("'/Response//Signature' is not a valid XPath", e);
        }
    }
}

From source file:pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java

public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
        throws AnalysisException, ParserConfigurationException, SAXException, IOException,
        XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);/*from w  w w.  ja v  a2 s .  c  om*/
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor();
    BxDocument bxDoc = structureExtractor.extractStructure(pdfStream);
    Integer bxDocLen = bxDoc.asZones().size();

    SmartHashMap entries = new SmartHashMap();

    //abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    //title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);
    //journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    //journal publisher
    String journalPublisherString = (String) xpath
            .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    //journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    //copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    //license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    //article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    //received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : StringTools.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : StringTools.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);
    //keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    //DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    //volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    //issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
    List<String> authorAffiliations = new ArrayList<String>();
    List<String> editors = new ArrayList<String>();

    //pages
    String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
    String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
    entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    try {
        int f = Integer.valueOf(fPage);
        int l = Integer.valueOf(lPage);
        while (f < l) {
            f++;
            entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
        }
    } catch (NumberFormatException ex) {
    }

    entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

    //editors
    NodeList editorNodes = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
        String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
        editors.add(editorString);
    }
    entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR);

    NodeList authorsResult = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
        Node curNode = authorsResult.item(nodeIdx);
        //author names
        String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
        String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);
        //author affiliation
        List<String> aff = XMLTools.extractTextAsList((NodeList) xpath
                .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));

        //author correspondence
        String email;
        try {
            email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
        } catch (XPathExpressionException e) {
            email = "";
        }
        if (email.isEmpty()) {
            try {
                email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException e) {
                //yaaay, probably there is no e-mail at all! => do nothing
            }
        }
        if (!email.isEmpty()) {
            authorEmails.add(email);
        }
        if (!aff.isEmpty()) {
            authorAffiliations.addAll(aff);
        }
        authorNames.add(name + " " + surname);
    }
    entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

    //authors' affiliations
    NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
            XPathConstants.NODESET);
    authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
    entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

    //correspondence again
    NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
            domDoc, XPathConstants.NODESET);
    authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
    entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

    //author notes
    Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
            XPathConstants.NODE);
    String notesString = XMLTools.extractTextFromNode(notesNode);
    entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
    notesString = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

    //article body
    NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
    List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
    entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

    NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
    String appStrings = XMLTools.extractTextFromNodes(appNodes);
    entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

    //section titles
    NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
            XPathConstants.NODESET);
    List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
    entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

    NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
            XPathConstants.NODESET);
    List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
    entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

    //figures
    NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
            XPathConstants.NODESET);
    List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

    figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

    //tables
    List<String> tableCaptions = new ArrayList<String>();
    List<String> tableBodies = new ArrayList<String>();
    List<String> tableFootnotes = new ArrayList<String>();
    //tableNodes
    NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

    for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
        Node tableNode = tableNodes.item(nodeIdx);

        String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
        tableCaptions.add(caption);

        String body = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
        tableBodies.add(body);

        List<String> footnotes = XMLTools.extractTextAsList(
                (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
        tableFootnotes.addAll(footnotes);

        entries.putIf(caption, BxZoneLabel.BODY_TABLE);
        entries.putIf(body, BxZoneLabel.BODY_TABLE);
        entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
    }

    //financial disclosure
    String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
            .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
    entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    //conflict
    String conflictString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
    entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

    //copyright
    String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
            "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
    entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

    //acknowledgment
    String acknowledgement = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    acknowledgement = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

    //glossary
    String glossary = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
    entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

    //formula
    NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
        Node curFormulaNode = formulaNodes.item(nodeIdx);
        String label = (String) xpath.evaluate("label", curFormulaNode);
        entries.putIf(label, BxZoneLabel.BODY_EQUATION);

        NodeList curNodeChildren = curFormulaNode.getChildNodes();
        List<String> formulaParts = new ArrayList<String>();
        for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
            Node curChild = curNodeChildren.item(childIdx);
            if (curChild.getNodeName().equals("label")) {
                continue;
            }
            formulaParts.add(XMLTools.extractTextFromNode(curChild));
        }
        entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
    }

    //references
    List<String> refStrings = new ArrayList<String>();
    Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
    if (refParentNode != null) {
        for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) {
            refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx)));
        }
    }
    entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES);
    entries.put("references", BxZoneLabel.REFERENCES);

    Set<String> allBibInfos = new HashSet<String>();
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
            allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
        }
    }
    entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

    printlnVerbose("journalTitle: " + journalTitleString);
    printlnVerbose("journalPublisher: " + journalPublisherString);
    printlnVerbose("journalISSNPublisher: " + journalISSNString);

    printlnVerbose("articleType: " + articleTypeStrings);
    printlnVerbose("received: " + receivedDate);
    printlnVerbose("accepted: " + acceptedDate);
    printlnVerbose("pubdate: " + pubdateString);
    printlnVerbose("permissions: " + permissionsString);
    printlnVerbose("license: " + licenseString);

    printlnVerbose("title: " + titleString);
    printlnVerbose("abstract: " + abstractString);

    printlnVerbose("authorEmails: " + authorEmails);
    printlnVerbose("authorNames: " + authorNames);
    printlnVerbose("authorAff: " + authorAffiliations);
    printlnVerbose("authorNotes: " + notesString);
    printlnVerbose("editor: " + editors);

    printlnVerbose("keywords: " + keywordsString);
    printlnVerbose("DOI: " + doiString);
    printlnVerbose("volume: " + volumeString);
    printlnVerbose("issue: " + issueString);
    printlnVerbose("financial dis.: " + financialDisclosure);

    printlnVerbose("paragraphs: " + paragraphStrings);
    printlnVerbose("section titles: " + sectionTitles);

    printlnVerbose("tableBodies: " + tableBodies);
    printlnVerbose("tableCaptions: " + tableCaptions);
    printlnVerbose("tableFootnotes: " + tableFootnotes);

    printlnVerbose("figures: " + figureStrings);
    printlnVerbose("acknowledgement: " + acknowledgement);

    printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

    SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);
    CosineDistance cos = new CosineDistance();

    //index: (zone,entry)
    List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
    List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen);
    for (Integer i = 0; i < bxDocLen; ++i) {
        swLabelSim.add(new ArrayList<LabelTrio>());
        cosLabProb.add(new ArrayList<LabelTrio>());
    }

    //iterate over entries
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        List<String> entryTokens = StringTools.tokenize(entry.getKey());
        printlnVerbose("--------------------");
        printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
        //iterate over zones
        for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
            BxZone curZone = bxDoc.asZones().get(zoneIdx);
            List<String> zoneTokens = StringTools.tokenize(StringTools
                    .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase())));

            Double smithSim;
            Double cosSim;
            if (curZone.toText().contains("www.biomedcentral.com")) {
                //ignore
                smithSim = 0.;
                cosSim = 0.;
            } else {
                smithSim = smith.compare(entryTokens, zoneTokens);
                cosSim = cos.compare(entryTokens, zoneTokens);
            }
            printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n");
            swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
            cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim));
        }
    }

    printlnVerbose("===========================");
    for (BxPage page : bxDoc.getPages()) {
        for (BxZone zone : page.getZones()) {
            Integer zoneIdx = bxDoc.asZones().indexOf(zone);
            BxZone curZone = bxDoc.asZones().get(zoneIdx);
            String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase());
            List<String> zoneTokens = StringTools.tokenize(zoneText);
            Boolean valueSet = false;

            Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                @Override
                public int compare(LabelTrio t1, LabelTrio t2) {
                    Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size();
                    if (Math.abs(simDif) < 0.0001) {
                        return t2.entryTokens.size() - t1.entryTokens.size();
                    }
                    if (simDif > 0) {
                        return 1;
                    } else {
                        return -1;
                    }
                }
            });
            Collections.reverse(swLabelSim.get(zoneIdx));

            List<String> entryTokens = swLabelSim.get(zoneIdx).get(0).entryTokens;
            if (Math.max(zoneTokens.size(), entryTokens.size()) > 0
                    && Math.min(zoneTokens.size(), entryTokens.size())
                            / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7
                    && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) {
                curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label);
                valueSet = true;
                printVerbose("0 ");
            }

            if (!valueSet) {
                Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                    @Override
                    public int compare(LabelTrio t1, LabelTrio t2) {
                        Double simDif = t1.alignment - t2.alignment;
                        if (Math.abs(simDif) < 0.0001) {
                            return t2.entryTokens.size() - t1.entryTokens.size();
                        }
                        if (simDif > 0) {
                            return 1;
                        } else {
                            return -1;
                        }
                    }
                });
                Collections.reverse(swLabelSim.get(zoneIdx));
                printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size());
                if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) {
                    curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label);
                    valueSet = true;
                    printVerbose("1 ");
                }
            }

            if (!valueSet) {
                Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class);
                for (LabelTrio trio : swLabelSim.get(zoneIdx)) {
                    if (cumulated.containsKey(trio.label)) {
                        cumulated.put(trio.label, cumulated.get(trio.label)
                                + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size()));
                    } else {
                        cumulated.put(trio.label,
                                trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size()));
                    }
                }
                Double max = Double.NEGATIVE_INFINITY;
                BxZoneLabel bestLabel = null;
                for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) {
                    if (entry.getValue() > max) {
                        max = entry.getValue();
                        bestLabel = entry.getKey();
                    }
                }
                if (max >= 0.5) {
                    curZone.setLabel(bestLabel);
                    printVerbose("2 ");
                    valueSet = true;
                }
            }

            if (!valueSet) {
                Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                    @Override
                    public int compare(LabelTrio t1, LabelTrio t2) {
                        Double simDif = t1.alignment / t1.entryTokens.size()
                                - t2.alignment / t2.entryTokens.size();
                        if (Math.abs(simDif) < 0.001) {
                            return t2.entryTokens.size() - t1.entryTokens.size();
                        }
                        if (simDif > 0) {
                            return 1;
                        } else {
                            return -1;
                        }
                    }
                });
                Collections.reverse(swLabelSim.get(zoneIdx));
                List<LabelTrio> l = swLabelSim.get(zoneIdx);

                BxZoneLabel best = null;
                int bestScore = 0;
                for (LabelTrio lt : l) {
                    int i = 0;
                    for (String zt : zoneTokens) {
                        if (lt.entryTokens.contains(zt)) {
                            i++;
                        }
                    }
                    if (i > bestScore && i > 1) {
                        best = lt.label;
                        bestScore = i;
                    }
                }
                if (best != null) {
                    curZone.setLabel(best);
                    valueSet = true;
                } else {
                    for (LabelTrio lt : l) {
                        int i = 0;
                        for (String zt : zoneTokens) {
                            for (String j : lt.entryTokens) {
                                if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "")
                                        .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) {
                                    i++;
                                    break;
                                }
                            }
                        }
                        if (i > bestScore && i > 1) {
                            best = lt.label;
                            bestScore = i;
                        }
                    }
                }

                if (best != null) {
                    curZone.setLabel(best);
                    valueSet = true;
                }
            }
            if (!valueSet) {
                curZone.setLabel(null);
            }
            printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n");
        }
        Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>();
        Set<BxZone> unlabeledZones = new HashSet<BxZone>();
        for (BxZone zone : page.getZones()) {
            if (zone.getLabel() == null) {
                unlabeledZones.add(zone);
                zoneLocMap.put(zone, new ZoneLocaliser(zone));
            }
        }
        Integer lastNumberOfUnlabeledZones;
        do {
            lastNumberOfUnlabeledZones = unlabeledZones.size();
            infereLabels(unlabeledZones, zoneLocMap);
            infereLabels(unlabeledZones, zoneLocMap);
        } while (lastNumberOfUnlabeledZones != unlabeledZones.size());
    }
    printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>=");

    return bxDoc;
}

From source file:pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java

public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
        throws AnalysisException, ParserConfigurationException, SAXException, IOException,
        XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);/*from w  ww. jav a2 s.c  om*/
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader();
    Reader r = new InputStreamReader(pdfStream);
    BxDocument bxDoc = new BxDocument().setPages(reader.read(r));

    List<BxZone> zones = Lists.newArrayList(bxDoc.asZones());

    Integer bxDocLen = zones.size();

    SmartHashMap entries = new SmartHashMap();

    //abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    //title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);
    //journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    //journal publisher
    String journalPublisherString = (String) xpath
            .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    //journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    //copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    //license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    //article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    //received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : TextUtils.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : TextUtils.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : TextUtils.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : TextUtils.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);
    //keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    //DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    //volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    //issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
    List<String> authorAffiliations = new ArrayList<String>();
    List<String> editors = new ArrayList<String>();

    //pages
    String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
    String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
    entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    try {
        int f = Integer.valueOf(fPage);
        int l = Integer.valueOf(lPage);
        while (f < l) {
            f++;
            entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
        }
    } catch (NumberFormatException ex) {
    }

    entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

    //editors
    NodeList editorNodes = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
        String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
        editors.add(editorString);
    }
    entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR);

    NodeList authorsResult = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
        Node curNode = authorsResult.item(nodeIdx);
        //author names
        String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
        String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);
        //author affiliation
        List<String> aff = XMLTools.extractTextAsList((NodeList) xpath
                .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));

        //author correspondence
        String email;
        try {
            email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
        } catch (XPathExpressionException e) {
            email = "";
        }
        if (email.isEmpty()) {
            try {
                email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException e) {
                //yaaay, probably there is no e-mail at all! => do nothing
            }
        }
        if (!email.isEmpty()) {
            authorEmails.add(email);
        }
        if (!aff.isEmpty()) {
            authorAffiliations.addAll(aff);
        }
        authorNames.add(name + " " + surname);
    }
    entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

    //authors' affiliations
    NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
            XPathConstants.NODESET);
    authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
    entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

    //correspondence again
    NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
            domDoc, XPathConstants.NODESET);
    authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
    entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

    //author notes
    Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
            XPathConstants.NODE);
    String notesString = XMLTools.extractTextFromNode(notesNode);
    entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
    notesString = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

    //article body
    NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
    List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
    entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

    NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
    String appStrings = XMLTools.extractTextFromNodes(appNodes);
    entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

    //section titles
    NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
            XPathConstants.NODESET);
    List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
    entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

    NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
            XPathConstants.NODESET);
    List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
    entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

    //figures
    NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
            XPathConstants.NODESET);
    List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

    figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

    //tables
    List<String> tableCaptions = new ArrayList<String>();
    List<String> tableBodies = new ArrayList<String>();
    List<String> tableFootnotes = new ArrayList<String>();
    //tableNodes
    NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

    for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
        Node tableNode = tableNodes.item(nodeIdx);

        String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
        tableCaptions.add(caption);

        String body = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
        tableBodies.add(body);

        List<String> footnotes = XMLTools.extractTextAsList(
                (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
        tableFootnotes.addAll(footnotes);

        entries.putIf(caption, BxZoneLabel.BODY_TABLE);
        entries.putIf(body, BxZoneLabel.BODY_TABLE);
        entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
    }

    //financial disclosure
    String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
            .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
    entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    //conflict
    String conflictString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
    entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

    //copyright
    String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
            "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
    entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

    //acknowledgment
    String acknowledgement = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    acknowledgement = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

    //glossary
    String glossary = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
    entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

    //formula
    NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
        Node curFormulaNode = formulaNodes.item(nodeIdx);
        String label = (String) xpath.evaluate("label", curFormulaNode);
        entries.putIf(label, BxZoneLabel.BODY_EQUATION);

        NodeList curNodeChildren = curFormulaNode.getChildNodes();
        List<String> formulaParts = new ArrayList<String>();
        for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
            Node curChild = curNodeChildren.item(childIdx);
            if (curChild.getNodeName().equals("label")) {
                continue;
            }
            formulaParts.add(XMLTools.extractTextFromNode(curChild));
        }
        entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
    }

    //references
    List<String> refStrings = new ArrayList<String>();
    Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
    if (refParentNode != null) {
        for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) {
            refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx)));
        }
    }
    entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES);
    entries.put("references", BxZoneLabel.REFERENCES);

    Set<String> allBibInfos = new HashSet<String>();
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
            allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
        }
    }
    entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

    printlnVerbose("journalTitle: " + journalTitleString);
    printlnVerbose("journalPublisher: " + journalPublisherString);
    printlnVerbose("journalISSNPublisher: " + journalISSNString);

    printlnVerbose("articleType: " + articleTypeStrings);
    printlnVerbose("received: " + receivedDate);
    printlnVerbose("accepted: " + acceptedDate);
    printlnVerbose("pubdate: " + pubdateString);
    printlnVerbose("permissions: " + permissionsString);
    printlnVerbose("license: " + licenseString);

    printlnVerbose("title: " + titleString);
    printlnVerbose("abstract: " + abstractString);

    printlnVerbose("authorEmails: " + authorEmails);
    printlnVerbose("authorNames: " + authorNames);
    printlnVerbose("authorAff: " + authorAffiliations);
    printlnVerbose("authorNotes: " + notesString);
    printlnVerbose("editor: " + editors);

    printlnVerbose("keywords: " + keywordsString);
    printlnVerbose("DOI: " + doiString);
    printlnVerbose("volume: " + volumeString);
    printlnVerbose("issue: " + issueString);
    printlnVerbose("financial dis.: " + financialDisclosure);

    printlnVerbose("paragraphs: " + paragraphStrings);
    printlnVerbose("section titles: " + sectionTitles);

    printlnVerbose("tableBodies: " + tableBodies);
    printlnVerbose("tableCaptions: " + tableCaptions);
    printlnVerbose("tableFootnotes: " + tableFootnotes);

    printlnVerbose("figures: " + figureStrings);
    printlnVerbose("acknowledgement: " + acknowledgement);

    printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

    SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);
    CosineDistance cos = new CosineDistance();

    //index: (zone,entry)
    List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
    List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen);
    for (Integer i = 0; i < bxDocLen; ++i) {
        swLabelSim.add(new ArrayList<LabelTrio>());
        cosLabProb.add(new ArrayList<LabelTrio>());
    }

    //iterate over entries
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        List<String> entryTokens = TextUtils.tokenize(entry.getKey());
        printlnVerbose("--------------------");
        printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
        //iterate over zones
        for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
            BxZone curZone = zones.get(zoneIdx);
            List<String> zoneTokens = TextUtils.tokenize(
                    TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase())));

            Double smithSim;
            Double cosSim;
            if (curZone.toText().contains("www.biomedcentral.com")) {
                //ignore
                smithSim = 0.;
                cosSim = 0.;
            } else {
                smithSim = smith.compare(entryTokens, zoneTokens);
                cosSim = cos.compare(entryTokens, zoneTokens);
            }
            printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n");
            swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
            cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim));
        }
    }

    for (BxPage pp : bxDoc) {

        boolean changed = true;
        while (changed) {

            changed = false;
            boolean wasIntro = false;

            for (BxZone z : pp) {
                BxZoneLabel orig = z.getLabel();
                int i = zones.indexOf(z);

                double titleAl = 0;
                double authorAl = 0;
                List<LabelTrio> sims = swLabelSim.get(i);
                for (LabelTrio t : sims) {
                    if (t.label.equals(BxZoneLabel.MET_TITLE)) {
                        titleAl = t.alignment / t.entryTokens.size();
                    }
                    if (t.label.equals(BxZoneLabel.MET_AUTHOR)) {
                        authorAl = t.alignment / t.entryTokens.size();
                    }
                }

                String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                int linesCount = z.childrenCount();
                int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent());
                BxLine firstLine = z.getFirstChild();

                if (pageIdx == 0
                        && (z.getLabel().equals(BxZoneLabel.MET_TITLE)
                                || z.getLabel().equals(BxZoneLabel.BODY_CONTENT))
                        && titleAl >= 0.7 && authorAl >= 0.4) {
                    z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR);
                }
                if (linesCount == 2 && text.contains("page") && text.contains("of")
                        && text.contains("page number not for")) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if (linesCount == 1 && (text.contains("page number not for")
                        || (text.contains("page") && text.contains("of")))) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && linesCount < 11 && (text.contains("department") || text.contains("university"))) {
                    z.setLabel(BxZoneLabel.MET_AFFILIATION);
                }
                if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (linesCount < 5 && firstLine.toText().length() < 11
                        && firstLine.toText().startsWith("Figure")
                        && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                    z.setLabel(BxZoneLabel.BODY_FIGURE);
                }
                if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (pageIdx > 0 && z.hasPrev() && z.hasNext()
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                                || z.getLabel().equals(BxZoneLabel.MET_DATES)
                                || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT))
                        && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                                || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE))
                        && z.getWidth() < 100) {
                    if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                            && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        z.setLabel(BxZoneLabel.BODY_TABLE);
                    }
                    if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2;
                        double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2;
                        double zMX = z.getX() + z.getWidth() / 2;
                        double zMY = z.getY() + z.getHeight() / 2;
                        if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                    if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2;
                        double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2;
                        double zMX = z.getX() + z.getWidth() / 2;
                        double zMY = z.getY() + z.getHeight() / 2;
                        if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                }
                if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION)
                        || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:")
                        || text.contains(" volume ") || text.contains("vol\\. ") || text.contains("doi"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) {
                    z.setLabel(BxZoneLabel.MET_EDITOR);
                }
                if (pageIdx == 0 && text.startsWith("copyright:")) {
                    z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume")
                        && text.contains("issue")) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.MET_AUTHOR)
                        || z.getLabel().equals(BxZoneLabel.REFERENCES)
                        || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6
                        && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                    BxPage p = z.getParent();
                    if (pageIdx > 0) {
                        BxPage prevPage = p.getPrev();
                        for (BxZone z1 : prevPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx < bxDoc.childrenCount() - 1) {
                        BxPage nextPage = p.getNext();
                        for (BxZone z1 : nextPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx > 1) {
                        BxPage prevPage = p.getPrev().getPrev();
                        for (BxZone z1 : prevPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx < bxDoc.childrenCount() - 2) {
                        BxPage nextPage = p.getNext().getNext();
                        for (BxZone z1 : nextPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                        || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO)
                        || z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+")
                        && text.length() <= 4
                        && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if (text.equals("acknowledgments")) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }
                if (text.startsWith("introduction") && z.hasPrev()
                        && !z.getPrev().toText().toLowerCase().equals("abstract")) {
                    wasIntro = true;
                }
                if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }

                if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references")
                        && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10
                        && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT)
                        && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES)
                        && (text.matches("[1-9][0-9]?[0-9]?\\.?")
                                || text.matches(".*[1-2][0-9][0-9][0-9].*"))) {
                    z.setLabel(BxZoneLabel.REFERENCES);
                }
                if ((z.getLabel().equals(BxZoneLabel.REFERENCES)
                        || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && (text.startsWith("doi") || text.startsWith("cite this article"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && firstLine.toText().toLowerCase().equals("author details")) {
                    z.setLabel(BxZoneLabel.MET_AFFILIATION);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && (firstLine.toText().toLowerCase().equals("acknowledgments")
                                || firstLine.toText().toLowerCase().equals("acknowledgements"))) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)
                        && text.matches("sup-[0-9][0-9]?")) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && firstLine.toText().toLowerCase().equals("references")) {
                    z.setLabel(BxZoneLabel.REFERENCES);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText()
                        .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.")
                        || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?"))) {
                    z.setLabel(BxZoneLabel.BODY_FIGURE);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText()
                        .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) {
                    z.setLabel(BxZoneLabel.BODY_TABLE);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)
                        && text.contains("this article is distributed")) {
                    z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && text.contains("journal")) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && text.contains("correspondence")) {
                    z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                }
                if (pageIdx == 0
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && text.contains("accepted") && text.contains("published")) {
                    z.setLabel(BxZoneLabel.MET_DATES);
                }

                if (pageIdx == 0 && linesCount < 10
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4
                        && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) {
                    if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) {
                        z.setLabel(z.getPrev().getLabel());
                    }
                }
                if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with")
                        || text.contains("will be the most significant development")
                        || text.contains("disseminating the results of biomedical")
                        || text.contains("sir paul nurse") || text.contains("your research papers")
                        || text.contains("available free of charge")
                        || text.contains("peer reviewed and published")
                        || text.contains("cited in pubmed and archived")
                        || text.contains("you keep the copyright") || text.contains("submit your manuscript")
                        || text.contains("submit your next manuscript") || text.contains("online submission")
                        || text.contains("peer review") || text.contains("space constraints")
                        || text.contains("publication on acceptance") || text.contains("inclusion in pubmed")
                        || text.contains("freely available") || text.contains("publication history"))) {
                    z.setLabel(BxZoneLabel.OTH_UNKNOWN);
                }
                if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }

                if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest")
                        || text.startsWith("competing interests")
                        || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest")
                                || z.getPrev().toText().toLowerCase().equals("conflict of interest")
                                || z.getPrev().toText().toLowerCase().equals("competing interests")))) {
                    z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT);
                }

                changed = changed || !orig.equals(z.getLabel());
            }

            boolean wasAuthor = false;
            for (BxZone z : pp) {
                BxZoneLabel orig = z.getLabel();

                String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor
                        && ((text.contains("email") && text.contains("@"))
                                || text.startsWith("correspondence"))) {
                    z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                }

                if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel())
                        || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) {
                    wasAuthor = true;
                }
                changed = changed || !orig.equals(z.getLabel());
            }

        }
    }

    return bxDoc;
}

From source file:sf.net.experimaestro.manager.js.JSNode.java

@Expose(value = "get_node", scope = true)
public Object getNode(Context context, Scriptable scope, String expression) throws XPathExpressionException {
    XPathExpression xpath = XMLUtils.parseXPath(expression, JSUtils.getNamespaceContext(scope));
    Node node = (Node) xpath.evaluate(this.node, XPathConstants.NODE);
    if (node == null)
        return NOT_FOUND;
    return new JSNode(node);
}

From source file:sf.net.experimaestro.manager.js.JSNode.java

@Expose(value = "text", scope = true)
public String getText(Context context, Scriptable scope, String expression) throws XPathExpressionException {
    XPathExpression xpath = XMLUtils.parseXPath(expression, JSUtils.getNamespaceContext(scope));
    Node node = (Node) xpath.evaluate(this.node, XPathConstants.NODE);
    if (node == null)
        return "";

    String text = node.getTextContent();
    return text == null ? "" : text;
}

From source file:sh.isaac.api.util.ArtifactUtilities.java

/**
 * Make maven relative path./*w  w  w. ja  va  2s  .  com*/
 *
 * @param baseMavenURL - optional - but required if you are downloading a SNAPSHOT dependency, as this method will need to download the metadata file
 * from the repository server in order to determine the proper version component for the SNAPSHOT.
 * @param mavenUsername - optional - only used for a SNAPSHOT dependency
 * @param mavenPassword - optional - only used for a SNAPSHOT dependency
 * @param groupId the group id
 * @param artifactId the artifact id
 * @param version the version
 * @param classifier - optional
 * @param type the type
 * @return the string
 * @throws Exception the exception
 */
public static String makeMavenRelativePath(String baseMavenURL, String mavenUsername, String mavenPassword,
        String groupId, String artifactId, String version, String classifier, String type) throws Exception {
    final String temp = groupId.replaceAll("\\.", "/");
    String snapshotVersion = "";
    String versionWithoutSnapshot = version;

    if (version.endsWith("-SNAPSHOT")) {
        versionWithoutSnapshot = version.substring(0, version.lastIndexOf("-SNAPSHOT"));

        final URL metadataUrl = new URL(baseMavenURL + (baseMavenURL.endsWith("/") ? "" : "/") + temp + "/"
                + artifactId + "/" + version + "/maven-metadata.xml");

        // Need to download the maven-metadata.xml file
        final Task<File> task = new DownloadUnzipTask(mavenUsername, mavenPassword, metadataUrl, false, false,
                null);

        WorkExecutors.get().getExecutor().execute(task);

        final File metadataFile = task.get();
        final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();

        // added to avoid XXE injections
        domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);

        DocumentBuilder builder;
        Document dDoc = null;
        final XPath xPath = XPathFactory.newInstance().newXPath();

        builder = domFactory.newDocumentBuilder();
        dDoc = builder.parse(metadataFile);

        final String timestamp = ((Node) xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc,
                XPathConstants.NODE)).getTextContent();
        final String buildNumber = ((Node) xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc,
                XPathConstants.NODE)).getTextContent();

        snapshotVersion = "-" + timestamp + "-" + buildNumber;
        metadataFile.delete();

        // The download task makes a subfolder in temp for this, delete that too
        metadataFile.getParentFile().delete();
    }

    return temp + "/" + artifactId + "/" + version + "/" + artifactId + "-" + versionWithoutSnapshot
            + snapshotVersion + (StringUtils.isNotBlank(classifier) ? "-" + classifier : "") + "." + type;
}

From source file:test.framework.TestBase.java

/**
 * Return the child of the node selected by the xPath.
 * //w  w  w . jav  a 2 s .c o m
 * @param node The node.
 * @param xPath The xPath expression.
 * @return The child of the node selected by the xPath.
 * @throws TransformerException If anything fails.
 */
protected static Node selectSingleNode(final Node node, final String xpathExpression)
        throws TransformerException {
    XPathFactory factory = XPathFactory.newInstance();
    XPath xPath = factory.newXPath();
    try {
        return (Node) xPath.evaluate(xpathExpression, node, XPathConstants.NODE);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:tufts.vue.ds.XMLIngest.java

static void XPathExtract(XmlSchema schema, Document document) {

    try {/*  ww w  .  ja va 2  s.c om*/

        XPath xpath = XPathFactory.newInstance().newXPath();

        String expression = "/rss/channel/item";
        //String expression = "rss/channel/item/title";

        errout("Extracting " + expression);

        // First, obtain the element as a node.

        //tufts.DocDump.dump(document);

        Node nodeValue = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
        errout("   Node: " + nodeValue);

        // Next, obtain the element as a String.

        String stringValue = (String) xpath.evaluate(expression, document, XPathConstants.STRING);
        System.out.println(" String: " + stringValue);

        NodeList nodeSet = (NodeList) xpath.evaluate(expression, document, XPathConstants.NODESET);
        errout("NodeSet: " + Util.tag(nodeSet) + "; size=" + nodeSet.getLength());

        for (int i = 0; i < nodeSet.getLength(); i++) {
            scanNode(schema, nodeSet.item(i), null, null);
        }

        //             // Finally, obtain the element as a Number (Double).

        //             Double birthdateDouble = (Double) xpath.evaluate(expression, document, XPathConstants.NUMBER);

        //             System.out.println("Double is: " + birthdateDouble);

    } catch (XPathExpressionException e) {
        System.err.println("XPathExpressionException caught...");
        e.printStackTrace();
    } catch (Throwable t) {
        t.printStackTrace();
    }
}