List of usage examples for javax.xml.xpath XPathConstants NODE
QName NODE
To view the source code for javax.xml.xpath XPathConstants NODE, click the Source Link.
The XPath 1.0 NodeSet data type. Maps to Java org.w3c.dom.Node.
From source file:org.zaproxy.zap.extension.saml.SAMLMessage.java
/** Update XML document with any attributes that were changed */ private void updateXMLDocument() { XPathFactory xFactory = XPathFactory.newInstance(); XPath xpath = xFactory.newXPath(); for (Attribute attribute : attributeMap.values()) { try {/*from www . j a va 2 s . co m*/ Node node = (Node) xpath.compile(attribute.getxPath()).evaluate(xmlDocument, XPathConstants.NODE); if (node != null) { // the attributes that aren't available will be giving null // values if (node instanceof Element) { node.setTextContent(attribute.getValue().toString()); } else if (node instanceof Attr) { ((Attr) node).setValue(attribute.getValue().toString()); } else { node.setNodeValue(attribute.getValue().toString()); } } } catch (XPathExpressionException e) { log.warn(attribute.getxPath() + " is not a valid XPath", e); } } if (SAMLConfiguration.getInstance().getXSWEnabled()) { try { NodeList nodeList = (NodeList) xpath.compile("/Response//Signature").evaluate(xmlDocument, XPathConstants.NODESET); for (int i = 0; i < nodeList.getLength(); i++) { Node item = nodeList.item(i); if (item instanceof Element) { item.getParentNode().removeChild(item); } } } catch (XPathExpressionException e) { log.warn("'/Response//Signature' is not a valid XPath", e); } } }
From source file:pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException { XPath xpath = XPathFactory.newInstance().newXPath(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*from w w w. ja v a2 s . c om*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); DocumentBuilder builder = dbf.newDocumentBuilder(); Document domDoc = builder.parse(nlmStream); PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor(); BxDocument bxDoc = structureExtractor.extractStructure(pdfStream); Integer bxDocLen = bxDoc.asZones().size(); SmartHashMap entries = new SmartHashMap(); //abstract Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc, XPathConstants.NODE); String abstractString = XMLTools.extractTextFromNode(abstractNode); entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT); entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT); //title String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title", domDoc, XPathConstants.STRING); entries.putIf(titleString, BxZoneLabel.MET_TITLE); String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle", domDoc, XPathConstants.STRING); entries.putIf(subtitleString, BxZoneLabel.MET_TITLE); //journal title String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc, XPathConstants.STRING); if (journalTitleString == null || journalTitleString.isEmpty()) { journalTitleString = (String) xpath.evaluate( 
"/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING); } entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO); //journal publisher String journalPublisherString = (String) xpath .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO); String journalPublisherIdString = (String) xpath.evaluate( "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO); //journal issn String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc, XPathConstants.STRING); entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO); //copyright/permissions String permissionsString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE)); entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT); //license Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc, XPathConstants.NODE); String licenseString = (String) XMLTools.extractTextFromNode(licenseNode); entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT); //article type NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc, XPathConstants.NODESET); List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes); Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group", domDoc, XPathConstants.NODE); articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode)); entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE); //received date List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE)); if 
(!receivedDate.isEmpty() && receivedDate.size() >= 3) { for (String date : StringTools.produceDates(receivedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //accepted date List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE)); if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) { for (String date : StringTools.produceDates(acceptedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //publication date List<String> pubdateString; if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } else { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : StringTools.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } pubdateString.clear(); if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : StringTools.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } String extLink = (String) xpath.evaluate( "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc, XPathConstants.STRING); printlnVerbose(extLink); entries.putIf(extLink, 
BxZoneLabel.MET_ACCESS_DATA); //keywords Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc, XPathConstants.NODE); String keywordsString = XMLTools.extractTextFromNode(keywordsNode); entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS); //DOI String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc, XPathConstants.STRING); entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO); //volume String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc, XPathConstants.STRING); entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO); entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO); //issue String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc, XPathConstants.STRING); entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO); entries.putIf("journal", BxZoneLabel.MET_BIB_INFO); entries.putIf("et al", BxZoneLabel.MET_BIB_INFO); List<String> authorNames = new ArrayList<String>(); List<String> authorEmails = new ArrayList<String>(); List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { 
} entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! 
=> do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", 
domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, 
BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } 
formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + 
paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = StringTools.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = bxDoc.asZones().get(zoneIdx); List<String> zoneTokens = StringTools.tokenize(StringTools .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } printlnVerbose("==========================="); for (BxPage page : bxDoc.getPages()) { for (BxZone zone : page.getZones()) { Integer zoneIdx = bxDoc.asZones().indexOf(zone); BxZone 
curZone = bxDoc.asZones().get(zoneIdx); String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase()); List<String> zoneTokens = StringTools.tokenize(zoneText); Boolean valueSet = false; Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<String> entryTokens = swLabelSim.get(zoneIdx).get(0).entryTokens; if (Math.max(zoneTokens.size(), entryTokens.size()) > 0 && Math.min(zoneTokens.size(), entryTokens.size()) / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7 && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("0 "); } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment - t2.alignment; if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size()); if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("1 "); } } if (!valueSet) { Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class); for (LabelTrio trio : swLabelSim.get(zoneIdx)) { if (cumulated.containsKey(trio.label)) { cumulated.put(trio.label, cumulated.get(trio.label) + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } 
else { cumulated.put(trio.label, trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } } Double max = Double.NEGATIVE_INFINITY; BxZoneLabel bestLabel = null; for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) { if (entry.getValue() > max) { max = entry.getValue(); bestLabel = entry.getKey(); } } if (max >= 0.5) { curZone.setLabel(bestLabel); printVerbose("2 "); valueSet = true; } } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<LabelTrio> l = swLabelSim.get(zoneIdx); BxZoneLabel best = null; int bestScore = 0; for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { if (lt.entryTokens.contains(zt)) { i++; } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } if (best != null) { curZone.setLabel(best); valueSet = true; } else { for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { for (String j : lt.entryTokens) { if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "") .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) { i++; break; } } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } } if (best != null) { curZone.setLabel(best); valueSet = true; } } if (!valueSet) { curZone.setLabel(null); } printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n"); } Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>(); Set<BxZone> unlabeledZones = new HashSet<BxZone>(); for (BxZone zone : page.getZones()) { if (zone.getLabel() == null) { unlabeledZones.add(zone); zoneLocMap.put(zone, new ZoneLocaliser(zone)); } } Integer lastNumberOfUnlabeledZones; do { lastNumberOfUnlabeledZones = 
unlabeledZones.size(); infereLabels(unlabeledZones, zoneLocMap); infereLabels(unlabeledZones, zoneLocMap); } while (lastNumberOfUnlabeledZones != unlabeledZones.size()); } printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>="); return bxDoc; }
From source file:pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException { XPath xpath = XPathFactory.newInstance().newXPath(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*from w ww. jav a2 s.c om*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); DocumentBuilder builder = dbf.newDocumentBuilder(); Document domDoc = builder.parse(nlmStream); TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader(); Reader r = new InputStreamReader(pdfStream); BxDocument bxDoc = new BxDocument().setPages(reader.read(r)); List<BxZone> zones = Lists.newArrayList(bxDoc.asZones()); Integer bxDocLen = zones.size(); SmartHashMap entries = new SmartHashMap(); //abstract Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc, XPathConstants.NODE); String abstractString = XMLTools.extractTextFromNode(abstractNode); entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT); entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT); //title String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title", domDoc, XPathConstants.STRING); entries.putIf(titleString, BxZoneLabel.MET_TITLE); String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle", domDoc, XPathConstants.STRING); entries.putIf(subtitleString, BxZoneLabel.MET_TITLE); //journal title String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc, XPathConstants.STRING); if (journalTitleString == null || 
journalTitleString.isEmpty()) { journalTitleString = (String) xpath.evaluate( "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING); } entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO); //journal publisher String journalPublisherString = (String) xpath .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO); String journalPublisherIdString = (String) xpath.evaluate( "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO); //journal issn String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc, XPathConstants.STRING); entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO); //copyright/permissions String permissionsString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE)); entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT); //license Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc, XPathConstants.NODE); String licenseString = (String) XMLTools.extractTextFromNode(licenseNode); entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT); //article type NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc, XPathConstants.NODESET); List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes); Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group", domDoc, XPathConstants.NODE); articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode)); entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE); //received date List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( 
"/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE)); if (!receivedDate.isEmpty() && receivedDate.size() >= 3) { for (String date : TextUtils.produceDates(receivedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //accepted date List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE)); if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) { for (String date : TextUtils.produceDates(acceptedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //publication date List<String> pubdateString; if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } else { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } pubdateString.clear(); if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } String extLink = (String) xpath.evaluate( "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", 
domDoc, XPathConstants.STRING); printlnVerbose(extLink); entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA); //keywords Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc, XPathConstants.NODE); String keywordsString = XMLTools.extractTextFromNode(keywordsNode); entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS); //DOI String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc, XPathConstants.STRING); entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO); //volume String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc, XPathConstants.STRING); entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO); entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO); //issue String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc, XPathConstants.STRING); entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO); entries.putIf("journal", BxZoneLabel.MET_BIB_INFO); entries.putIf("et al", BxZoneLabel.MET_BIB_INFO); List<String> authorNames = new ArrayList<String>(); List<String> authorEmails = new ArrayList<String>(); List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; 
entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! 
=> do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", 
domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, 
BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } 
formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + 
paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = TextUtils.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = zones.get(zoneIdx); List<String> zoneTokens = TextUtils.tokenize( TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } for (BxPage pp : bxDoc) { boolean changed = true; while (changed) { changed = false; boolean wasIntro = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); int i = zones.indexOf(z); double titleAl = 0; 
double authorAl = 0; List<LabelTrio> sims = swLabelSim.get(i); for (LabelTrio t : sims) { if (t.label.equals(BxZoneLabel.MET_TITLE)) { titleAl = t.alignment / t.entryTokens.size(); } if (t.label.equals(BxZoneLabel.MET_AUTHOR)) { authorAl = t.alignment / t.entryTokens.size(); } } String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); int linesCount = z.childrenCount(); int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent()); BxLine firstLine = z.getFirstChild(); if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.MET_TITLE) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) && titleAl >= 0.7 && authorAl >= 0.4) { z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR); } if (linesCount == 2 && text.contains("page") && text.contains("of") && text.contains("page number not for")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (linesCount == 1 && (text.contains("page number not for") || (text.contains("page") && text.contains("of")))) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && linesCount < 11 && (text.contains("department") || text.contains("university"))) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (linesCount < 5 && firstLine.toText().length() < 11 && firstLine.toText().startsWith("Figure") && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx > 0 && z.hasPrev() && z.hasNext() && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_DATES) || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)) && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) && z.getWidth() < 
100) { if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2; double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2; double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } } if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION) || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:") || text.contains(" volume ") || text.contains("vol\\. 
") || text.contains("doi"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) { z.setLabel(BxZoneLabel.MET_EDITOR); } if (pageIdx == 0 && text.startsWith("copyright:")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume") && text.contains("issue")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.MET_AUTHOR) || z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { BxPage p = z.getParent(); if (pageIdx > 0) { BxPage prevPage = p.getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 1) { BxPage nextPage = p.getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx > 1) { BxPage prevPage = p.getPrev().getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 2) { BxPage nextPage = p.getNext().getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO) || 
z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+") && text.length() <= 4 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (text.equals("acknowledgments")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("introduction") && z.hasPrev() && !z.getPrev().toText().toLowerCase().equals("abstract")) { wasIntro = true; } if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references") && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10 && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES) && (text.matches("[1-9][0-9]?[0-9]?\\.?") || text.matches(".*[1-2][0-9][0-9][0-9].*"))) { z.setLabel(BxZoneLabel.REFERENCES); } if ((z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (text.startsWith("doi") || 
text.startsWith("cite this article"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("author details")) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (firstLine.toText().toLowerCase().equals("acknowledgments") || firstLine.toText().toLowerCase().equals("acknowledgements"))) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100) && text.matches("sup-[0-9][0-9]?")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("references")) { z.setLabel(BxZoneLabel.REFERENCES); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?") || firstLine.toText().matches("F[iI][gG]\\. 
[0-9IV][0-9IV]?[0-9IV]?"))) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT) && text.contains("this article is distributed")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("journal")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("correspondence")) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && text.contains("accepted") && text.contains("published")) { z.setLabel(BxZoneLabel.MET_DATES); } if (pageIdx == 0 && linesCount < 10 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4 && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) { if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) { z.setLabel(z.getPrev().getLabel()); } } if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with") || text.contains("will be the most significant development") || text.contains("disseminating the results of biomedical") || text.contains("sir paul nurse") || text.contains("your research papers") || text.contains("available free of charge") || text.contains("peer reviewed and published") || text.contains("cited in pubmed and archived") || text.contains("you keep the copyright") || text.contains("submit your manuscript") || text.contains("submit your next manuscript") || text.contains("online submission") || 
text.contains("peer review") || text.contains("space constraints") || text.contains("publication on acceptance") || text.contains("inclusion in pubmed") || text.contains("freely available") || text.contains("publication history"))) { z.setLabel(BxZoneLabel.OTH_UNKNOWN); } if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest") || text.startsWith("competing interests") || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest") || z.getPrev().toText().toLowerCase().equals("conflict of interest") || z.getPrev().toText().toLowerCase().equals("competing interests")))) { z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT); } changed = changed || !orig.equals(z.getLabel()); } boolean wasAuthor = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor && ((text.contains("email") && text.contains("@")) || text.startsWith("correspondence"))) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) { wasAuthor = true; } changed = changed || !orig.equals(z.getLabel()); } } } return bxDoc; }
From source file:sf.net.experimaestro.manager.js.JSNode.java
/**
 * Selects a single node relative to the node wrapped by this object.
 *
 * <p>The expression is parsed with the namespace context obtained from {@code scope}
 * and evaluated against {@code this.node}. The {@code context} argument is not used
 * by this method.
 *
 * @param context the script context (unused here)
 * @param scope the scope from which the namespace context is derived
 * @param expression the XPath expression to evaluate
 * @return a new {@code JSNode} wrapping the selected node, or {@code NOT_FOUND} when
 *         the expression selects nothing
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
@Expose(value = "get_node", scope = true)
public Object getNode(Context context, Scriptable scope, String expression) throws XPathExpressionException {
    final XPathExpression compiled = XMLUtils.parseXPath(expression, JSUtils.getNamespaceContext(scope));
    final Node match = (Node) compiled.evaluate(this.node, XPathConstants.NODE);
    return match != null ? new JSNode(match) : NOT_FOUND;
}
From source file:sf.net.experimaestro.manager.js.JSNode.java
/**
 * Returns the text content of the single node selected by an XPath expression,
 * evaluated relative to the node wrapped by this object.
 *
 * <p>The expression is parsed with the namespace context obtained from {@code scope}.
 * The {@code context} argument is not used by this method.
 *
 * @param context the script context (unused here)
 * @param scope the scope from which the namespace context is derived
 * @param expression the XPath expression to evaluate
 * @return the text content of the selected node; the empty string when nothing is
 *         selected or when the node reports {@code null} text content
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
@Expose(value = "text", scope = true)
public String getText(Context context, Scriptable scope, String expression) throws XPathExpressionException {
    final XPathExpression compiled = XMLUtils.parseXPath(expression, JSUtils.getNamespaceContext(scope));
    final Node match = (Node) compiled.evaluate(this.node, XPathConstants.NODE);

    // Both "no node" and "node without text" collapse to the empty string.
    final String content = (match == null) ? null : match.getTextContent();
    return content != null ? content : "";
}
From source file:sh.isaac.api.util.ArtifactUtilities.java
/** * Make maven relative path./*w w w. ja va 2s . com*/ * * @param baseMavenURL - optional - but required if you are downloading a SNAPSHOT dependency, as this method will need to download the metadata file * from the repository server in order to determine the proper version component for the SNAPSHOT. * @param mavenUsername - optional - only used for a SNAPSHOT dependency * @param mavenPassword - optional - only used for a SNAPSHOT dependency * @param groupId the group id * @param artifactId the artifact id * @param version the version * @param classifier - optional * @param type the type * @return the string * @throws Exception the exception */ public static String makeMavenRelativePath(String baseMavenURL, String mavenUsername, String mavenPassword, String groupId, String artifactId, String version, String classifier, String type) throws Exception { final String temp = groupId.replaceAll("\\.", "/"); String snapshotVersion = ""; String versionWithoutSnapshot = version; if (version.endsWith("-SNAPSHOT")) { versionWithoutSnapshot = version.substring(0, version.lastIndexOf("-SNAPSHOT")); final URL metadataUrl = new URL(baseMavenURL + (baseMavenURL.endsWith("/") ? 
"" : "/") + temp + "/" + artifactId + "/" + version + "/maven-metadata.xml"); // Need to download the maven-metadata.xml file final Task<File> task = new DownloadUnzipTask(mavenUsername, mavenPassword, metadataUrl, false, false, null); WorkExecutors.get().getExecutor().execute(task); final File metadataFile = task.get(); final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); // added to avoid XXE injections domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); DocumentBuilder builder; Document dDoc = null; final XPath xPath = XPathFactory.newInstance().newXPath(); builder = domFactory.newDocumentBuilder(); dDoc = builder.parse(metadataFile); final String timestamp = ((Node) xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc, XPathConstants.NODE)).getTextContent(); final String buildNumber = ((Node) xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc, XPathConstants.NODE)).getTextContent(); snapshotVersion = "-" + timestamp + "-" + buildNumber; metadataFile.delete(); // The download task makes a subfolder in temp for this, delete that too metadataFile.getParentFile().delete(); } return temp + "/" + artifactId + "/" + version + "/" + artifactId + "-" + versionWithoutSnapshot + snapshotVersion + (StringUtils.isNotBlank(classifier) ? "-" + classifier : "") + "." + type; }
From source file:test.framework.TestBase.java
/**
 * Returns the first node selected by an XPath expression, evaluated relative to the given node.
 *
 * @param node The context node the expression is evaluated against.
 * @param xpathExpression The XPath expression.
 * @return The selected node, or {@code null} if the expression selects nothing.
 * @throws TransformerException If the expression is invalid or cannot be evaluated; the
 *             underlying XPath failure is kept as the cause.
 */
protected static Node selectSingleNode(final Node node, final String xpathExpression)
    throws TransformerException {
    final XPath xPath = XPathFactory.newInstance().newXPath();
    try {
        return (Node) xPath.evaluate(xpathExpression, node, XPathConstants.NODE);
    } catch (javax.xml.xpath.XPathExpressionException e) {
        // Report failures through the declared checked exception instead of an
        // anonymous RuntimeException, and keep the original failure as the cause.
        throw new TransformerException("Failed to evaluate XPath expression: " + xpathExpression, e);
    }
}
From source file:tufts.vue.ds.XMLIngest.java
static void XPathExtract(XmlSchema schema, Document document) { try {/* ww w . ja va 2 s.c om*/ XPath xpath = XPathFactory.newInstance().newXPath(); String expression = "/rss/channel/item"; //String expression = "rss/channel/item/title"; errout("Extracting " + expression); // First, obtain the element as a node. //tufts.DocDump.dump(document); Node nodeValue = (Node) xpath.evaluate(expression, document, XPathConstants.NODE); errout(" Node: " + nodeValue); // Next, obtain the element as a String. String stringValue = (String) xpath.evaluate(expression, document, XPathConstants.STRING); System.out.println(" String: " + stringValue); NodeList nodeSet = (NodeList) xpath.evaluate(expression, document, XPathConstants.NODESET); errout("NodeSet: " + Util.tag(nodeSet) + "; size=" + nodeSet.getLength()); for (int i = 0; i < nodeSet.getLength(); i++) { scanNode(schema, nodeSet.item(i), null, null); } // // Finally, obtain the element as a Number (Double). // Double birthdateDouble = (Double) xpath.evaluate(expression, document, XPathConstants.NUMBER); // System.out.println("Double is: " + birthdateDouble); } catch (XPathExpressionException e) { System.err.println("XPathExpressionException caught..."); e.printStackTrace(); } catch (Throwable t) { t.printStackTrace(); } }