// Java tutorial
/**
 * This file is part of CERMINE project. Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public
 * License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * CERMINE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Affero General Public License along with CERMINE. If not, see
 * <http://www.gnu.org/licenses/>.
 */
package pl.edu.icm.cermine.pubmed;

import pl.edu.icm.cermine.tools.SmartHashMap;
import com.google.common.collect.Lists;
import java.io.*;
import java.util.Map.Entry;
import java.util.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import pl.edu.icm.cermine.content.cleaning.ContentCleaner;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.structure.model.*;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;
import pl.edu.icm.cermine.structure.transformers.TrueVizToBxDocumentReader;
import pl.edu.icm.cermine.tools.TextUtils;
import pl.edu.icm.cermine.tools.XMLTools;
import pl.edu.icm.cermine.tools.distance.CosineDistance;
import pl.edu.icm.cermine.tools.distance.SmithWatermanDistance;

/**
 * Corrects the zone labels of a TrueViz document using the article's NLM XML and a set of
 * hand-written rules.
 *
 * <p>{@link #generateTrueViz(InputStream, InputStream)} reads a TrueViz document and an NLM XML
 * document, collects text snippets from the NLM file together with the zone label each snippet
 * stands for, aligns the title and author snippets against every zone, and then repeatedly applies
 * textual/geometric rules to every zone of every page until no label changes any more.
 * {@link #main(String[])} runs this over a directory tree and writes the results next to the inputs.
 */
public class RuleBasedPubmedXMLGenerator {

    /** Phrases that mark publisher boilerplate when they occur on the last page of a document. */
    private static final String[] LAST_PAGE_BOILERPLATE = {
        "will be the most significant development",
        "disseminating the results of biomedical",
        "sir paul nurse",
        "your research papers",
        "available free of charge",
        "peer reviewed and published",
        "cited in pubmed and archived",
        "you keep the copyright",
        "submit your manuscript",
        "submit your next manuscript",
        "online submission",
        "peer review",
        "space constraints",
        "publication on acceptance",
        "inclusion in pubmed",
        "freely available",
        "publication history"
    };

    /**
     * One alignment result: the label of an NLM entry, the entry's tokens and the alignment score
     * of that entry against one zone.
     */
    private static class LabelTrio {

        private final BxZoneLabel label;
        private final Double alignment;
        private final List<String> entryTokens;

        public LabelTrio(BxZoneLabel label, List<String> tokens, Double similarity) {
            this.alignment = similarity;
            this.label = label;
            this.entryTokens = tokens;
        }

        // Identity is defined by label and alignment only; entryTokens is deliberately not part of it.
        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((label == null) ? 0 : label.hashCode());
            result = prime * result + ((alignment == null) ? 0 : alignment.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null) {
                return false;
            }
            if (getClass() != obj.getClass()) {
                return false;
            }
            LabelTrio other = (LabelTrio) obj;
            if (label != other.label) {
                return false;
            }
            if (alignment == null) {
                if (other.alignment != null) {
                    return false;
                }
            } else if (!alignment.equals(other.alignment)) {
                return false;
            }
            return true;
        }
    }

    /** When true, diagnostic output is printed to standard output. */
    private boolean verbose = false;

    private void setVerbose(boolean verbose) {
        this.verbose = verbose;
    }

    /** Prints {@code string} followed by a line break, but only in verbose mode. */
    private void printlnVerbose(String string) {
        if (verbose) {
            System.out.println(string);
        }
    }

    /** Prints {@code string} without a line break, but only in verbose mode. */
    private void printVerbose(String string) {
        if (verbose) {
            System.out.print(string);
        }
    }

    /** Evaluates {@code expression} against {@code context} and returns the result as a string. */
    private static String evalString(XPath xpath, String expression, Object context)
            throws XPathExpressionException {
        return (String) xpath.evaluate(expression, context, XPathConstants.STRING);
    }

    /** Evaluates {@code expression} against {@code context} and returns the result as a single node. */
    private static Node evalNode(XPath xpath, String expression, Object context)
            throws XPathExpressionException {
        return (Node) xpath.evaluate(expression, context, XPathConstants.NODE);
    }

    /** Evaluates {@code expression} against {@code context} and returns the result as a node set. */
    private static NodeList evalNodes(XPath xpath, String expression, Object context)
            throws XPathExpressionException {
        return (NodeList) xpath.evaluate(expression, context, XPathConstants.NODESET);
    }

    /** Returns true when {@code text} contains at least one of {@code phrases}. */
    private static boolean containsAny(String text, String[] phrases) {
        for (String phrase : phrases) {
            if (text.contains(phrase)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true when the centres of both zones are less than {@code limit} apart on the X axis
     * and on the Y axis.
     */
    private static boolean centersWithin(BxZone other, BxZone zone, double limit) {
        double otherMX = other.getX() + other.getWidth() / 2;
        double otherMY = other.getY() + other.getHeight() / 2;
        double zoneMX = zone.getX() + zone.getWidth() / 2;
        double zoneMY = zone.getY() + zone.getHeight() / 2;
        return Math.abs(otherMX - zoneMX) < limit && Math.abs(otherMY - zoneMY) < limit;
    }

    /**
     * Returns true when {@code page} holds a zone whose letters-only text equals the letters-only
     * text of {@code zone} and whose Y coordinate differs by less than 10.
     */
    private static boolean repeatsOnPage(BxPage page, BxZone zone) {
        String zoneLetters = zone.toText().replaceAll("[^a-zA-Z]", "");
        for (BxZone candidate : page) {
            if (candidate.toText().replaceAll("[^a-zA-Z]", "").equals(zoneLetters)
                    && Math.abs(candidate.getY() - zone.getY()) < 10) {
                return true;
            }
        }
        return false;
    }

    /** Closes {@code resource} if it is not null, ignoring a failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // nothing useful can be done when closing an input fails
        }
    }

    /**
     * Reads a TrueViz document and the matching NLM XML, and corrects the zone labels of the
     * TrueViz document in place.
     *
     * @param pdfStream stream that is read with {@link TrueVizToBxDocumentReader}; despite the name
     *                  it carries the TrueViz representation of the document. It is decoded with
     *                  the platform default charset. The stream is not closed here.
     * @param nlmStream stream with the NLM XML; parsed without validation, without namespace
     *                  awareness and without loading external DTDs. The stream is not closed here.
     * @return the document read from {@code pdfStream}, with its zone labels modified
     * @throws AnalysisException declared for the CERMINE calls made by this method
     * @throws ParserConfigurationException when the XML parser cannot be configured
     * @throws SAXException when the NLM XML cannot be parsed
     * @throws IOException when one of the streams cannot be read
     * @throws XPathExpressionException when an XPath expression cannot be evaluated
     * @throws TransformationException declared for the CERMINE calls made by this method
     */
    public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
            throws AnalysisException, ParserConfigurationException, SAXException, IOException,
            XPathExpressionException, TransformationException {
        XPath xpath = XPathFactory.newInstance().newXPath();
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        DocumentBuilder builder = dbf.newDocumentBuilder();
        Document domDoc = builder.parse(nlmStream);

        TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader();
        Reader r = new InputStreamReader(pdfStream);
        BxDocument bxDoc = new BxDocument().setPages(reader.read(r));
        List<BxZone> zones = Lists.newArrayList(bxDoc.asZones());
        int zoneCount = zones.size();

        // text snippet -> label it stands for
        // NOTE(review): putIf presumably skips empty values - confirm against SmartHashMap
        SmartHashMap entries = new SmartHashMap();

        // abstract
        Node abstractNode = evalNode(xpath, "/article/front/article-meta/abstract", domDoc);
        String abstractString = XMLTools.extractTextFromNode(abstractNode);
        entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
        entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

        // title
        String titleString = evalString(xpath, "/article/front/article-meta/title-group/article-title", domDoc);
        entries.putIf(titleString, BxZoneLabel.MET_TITLE);
        String subtitleString = evalString(xpath, "/article/front/article-meta/title-group/article-subtitle",
                domDoc);
        entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);

        // journal title
        String journalTitleString = evalString(xpath, "/article/front/journal-meta/journal-title", domDoc);
        if (journalTitleString == null || journalTitleString.isEmpty()) {
            journalTitleString = evalString(xpath,
                    "/article/front/journal-meta/journal-title-group/journal-title", domDoc);
        }
        entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

        // journal publisher
        String journalPublisherString = evalString(xpath, "/article/front/journal-meta/publisher/publisher-name",
                domDoc);
        entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
        String journalPublisherIdString = evalString(xpath,
                "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc);
        entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

        // journal issn
        String journalISSNString = evalString(xpath, "/article/front/journal-meta/issn", domDoc);
        entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

        // copyright/permissions
        String permissionsString = XMLTools
                .extractTextFromNode(evalNode(xpath, "/article/front/article-meta/permissions", domDoc));
        entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

        // license
        Node licenseNode = evalNode(xpath, "/article/front/article-meta/license", domDoc);
        String licenseString = XMLTools.extractTextFromNode(licenseNode);
        entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

        // article type
        NodeList articleTypeNodes = evalNodes(xpath, "/article/@article-type", domDoc);
        List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
        Node articleTypeNode = evalNode(xpath, "/article/front/article-meta/article-categories/subj-group",
                domDoc);
        articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));
        entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

        // received date; at least three parts are required before date variants are produced
        List<String> receivedDate = XMLTools.extractChildrenAsTextList(
                evalNode(xpath, "/article/front/article-meta/history/date[@date-type='received']", domDoc));
        if (receivedDate != null && receivedDate.size() >= 3) {
            for (String date : TextUtils.produceDates(receivedDate)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }

        // accepted date
        List<String> acceptedDate = XMLTools.extractChildrenAsTextList(
                evalNode(xpath, "/article/front/article-meta/history/date[@date-type='accepted']", domDoc));
        if (acceptedDate != null && acceptedDate.size() >= 3) {
            for (String date : TextUtils.produceDates(acceptedDate)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }

        // publication date: with several pub-date elements the 'epub' one is used first and the
        // 'ppub' one second; with at most one, only the 'collection' one is looked up
        boolean multiplePubDates = evalNodes(xpath, "/article/front/article-meta/pub-date", domDoc)
                .getLength() > 1;
        String firstPubDatePath = multiplePubDates
                ? "/article/front/article-meta/pub-date[@pub-type='epub']"
                : "/article/front/article-meta/pub-date[@pub-type='collection']";
        List<String> pubdateString = XMLTools.extractChildrenAsTextList(evalNode(xpath, firstPubDatePath, domDoc));
        if (pubdateString != null && pubdateString.size() >= 3) {
            for (String date : TextUtils.produceDates(pubdateString)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }
        // start from a fresh list instead of clearing the previous one: the previous list may be
        // null, and it belongs to XMLTools
        pubdateString = new ArrayList<String>();
        if (multiplePubDates) {
            pubdateString = XMLTools.extractChildrenAsTextList(
                    evalNode(xpath, "/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc));
        }
        if (pubdateString != null && pubdateString.size() >= 3) {
            for (String date : TextUtils.produceDates(pubdateString)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }

        String extLink = evalString(xpath,
                "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc);
        printlnVerbose(extLink);
        entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);

        // keywords
        Node keywordsNode = evalNode(xpath, "/article/front/article-meta/kwd-group", domDoc);
        String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
        entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

        // DOI
        String doiString = evalString(xpath, "/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc);
        entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

        // volume
        String volumeString = evalString(xpath, "/article/front/article-meta/volume", domDoc);
        entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
        entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

        // issue
        String issueString = evalString(xpath, "/article/front/article-meta/issue", domDoc);
        entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);
        entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
        entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

        List<String> authorNames = new ArrayList<String>();
        List<String> authorEmails = new ArrayList<String>();
        List<String> authorAffiliations = new ArrayList<String>();
        List<String> editors = new ArrayList<String>();

        // pages
        String fPage = evalString(xpath, "/article/front/article-meta/fpage", domDoc);
        String lPage = evalString(xpath, "/article/front/article-meta/lpage", domDoc);
        entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
        // NOTE(review): the same call appears twice; the first was possibly meant to use fPage.
        // Kept as is because changing it would change which label the first page number maps to.
        entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
        entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
        try {
            int f = Integer.parseInt(fPage);
            int l = Integer.parseInt(lPage);
            while (f < l) {
                f++;
                entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
            }
        } catch (NumberFormatException ignored) {
            // page numbers are missing or not plain integers: no per-page entries are added
        }
        entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

        // editors
        NodeList editorNodes = evalNodes(xpath,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc);
        for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
            String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
            editors.add(editorString);
        }
        entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR);

        NodeList authorsResult = evalNodes(xpath,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc);
        for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
            Node curNode = authorsResult.item(nodeIdx);
            // author names
            String name = evalString(xpath, "name/given-names", curNode);
            String surname = evalString(xpath, "name/surname", curNode);
            // author affiliation (looked up document-wide, so the same list is added per author)
            List<String> aff = XMLTools.extractTextAsList(
                    evalNodes(xpath, "/article/front/article-meta/contrib-group/aff", domDoc));

            // author correspondence
            String email;
            try {
                email = evalString(xpath, "address/email", curNode);
            } catch (XPathExpressionException ignored) {
                email = "";
            }
            if (email.isEmpty()) {
                try {
                    email = evalString(xpath, "email", curNode);
                } catch (XPathExpressionException ignored) {
                    // probably there is no e-mail at all => do nothing
                }
            }
            if (!email.isEmpty()) {
                authorEmails.add(email);
            }
            if (!aff.isEmpty()) {
                authorAffiliations.addAll(aff);
            }
            authorNames.add(name + " " + surname);
        }
        entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

        // authors' affiliations
        NodeList affNodes = evalNodes(xpath, "/article/front/article-meta/aff", domDoc);
        authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
        entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

        // correspondence again
        NodeList correspNodes = evalNodes(xpath, "/article/front/article-meta/author-notes/corresp", domDoc);
        authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
        entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

        // author notes
        Node notesNode = evalNode(xpath, "/article/front/article-meta/author-notes/corresp/fn", domDoc);
        String notesString = XMLTools.extractTextFromNode(notesNode);
        entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
        // the back-matter notes are only reported in verbose mode, they are not added to the entries
        notesString = XMLTools.extractTextFromNode(evalNode(xpath, "/article/back/notes", domDoc));

        // article body
        NodeList paragraphNodes = evalNodes(xpath, "/article/body//p", domDoc);
        List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
        entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

        NodeList appNodes = evalNodes(xpath, "/article/back/app-group//p", domDoc);
        String appStrings = XMLTools.extractTextFromNodes(appNodes);
        entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

        // section titles
        NodeList sectionTitleNodes = evalNodes(xpath, "/article/body//title", domDoc);
        List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
        entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

        NodeList appTitleNodes = evalNodes(xpath, "/article/back/app-group//title", domDoc);
        List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
        entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

        // figures
        String[] figurePaths = {
            "/article/floats-wrap//fig",
            "/article/floats-group//fig",
            "/article/back//fig",
            "/article/body//fig",
            "/article/back/app-group//fig"
        };
        List<String> figureStrings = new ArrayList<String>();
        for (String figurePath : figurePaths) {
            figureStrings.addAll(XMLTools.extractTextAsList(evalNodes(xpath, figurePath, domDoc)));
        }
        entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

        // tables
        List<String> tableCaptions = new ArrayList<String>();
        List<String> tableBodies = new ArrayList<String>();
        List<String> tableFootnotes = new ArrayList<String>();
        NodeList tableNodes = evalNodes(xpath, "/article//table-wrap", domDoc);
        for (int nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
            Node tableNode = tableNodes.item(nodeIdx);

            String caption = evalString(xpath, "caption", tableNode);
            tableCaptions.add(caption);

            String body = XMLTools.extractTextFromNode(evalNode(xpath, "table", tableNode));
            tableBodies.add(body);

            List<String> footnotes = XMLTools.extractTextAsList(evalNodes(xpath, "table-wrap-foot/fn", tableNode));
            tableFootnotes.addAll(footnotes);

            entries.putIf(caption, BxZoneLabel.BODY_TABLE);
            entries.putIf(body, BxZoneLabel.BODY_TABLE);
            entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
        }

        // financial disclosure
        String financialDisclosure = XMLTools
                .extractTextFromNode(evalNode(xpath, "/article//fn[@fn-type='financial-disclosure']", domDoc));
        entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

        // conflict
        String conflictString = XMLTools
                .extractTextFromNode(evalNode(xpath, "/article//fn[@fn-type='conflict']", domDoc));
        entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

        // copyright
        String copyrightString = XMLTools.extractTextFromNode(
                evalNode(xpath, "/article/front/article-meta/permissions/copyright-statement", domDoc));
        entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

        // acknowledgment
        String acknowledgement = XMLTools.extractTextFromNode(evalNode(xpath, "/article/back/ack", domDoc));
        entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

        acknowledgement = XMLTools.extractTextFromNode(evalNode(xpath, "/article/back/fn-group/fn", domDoc));
        entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

        // glossary
        String glossary = XMLTools.extractTextFromNode(evalNode(xpath, "/article/back/glossary", domDoc));
        entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

        // formula: the label and the remaining children are added as separate entries
        NodeList formulaNodes = evalNodes(xpath, "/article/body//disp-formula", domDoc);
        for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
            Node curFormulaNode = formulaNodes.item(nodeIdx);
            String label = xpath.evaluate("label", curFormulaNode);
            entries.putIf(label, BxZoneLabel.BODY_EQUATION);

            NodeList curNodeChildren = curFormulaNode.getChildNodes();
            List<String> formulaParts = new ArrayList<String>();
            for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
                Node curChild = curNodeChildren.item(childIdx);
                if (curChild.getNodeName().equals("label")) {
                    continue;
                }
                formulaParts.add(XMLTools.extractTextFromNode(curChild));
            }
            entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
        }

        // references
        List<String> refStrings = new ArrayList<String>();
        Node refParentNode = evalNode(xpath, "/article/back/ref-list", domDoc);
        if (refParentNode != null) {
            NodeList refNodes = refParentNode.getChildNodes();
            for (int refIdx = 0; refIdx < refNodes.getLength(); ++refIdx) {
                refStrings.add(XMLTools.extractTextFromNode(refNodes.item(refIdx)));
            }
        }
        entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES);
        entries.put("references", BxZoneLabel.REFERENCES);

        // one extra entry made of all distinct words of all bibliographic entries
        Set<String> allBibInfos = new HashSet<String>();
        for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
            if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
                allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
            }
        }
        entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

        printlnVerbose("journalTitle: " + journalTitleString);
        printlnVerbose("journalPublisher: " + journalPublisherString);
        printlnVerbose("journalISSNPublisher: " + journalISSNString);
        printlnVerbose("articleType: " + articleTypeStrings);
        printlnVerbose("received: " + receivedDate);
        printlnVerbose("accepted: " + acceptedDate);
        printlnVerbose("pubdate: " + pubdateString);
        printlnVerbose("permissions: " + permissionsString);
        printlnVerbose("license: " + licenseString);
        printlnVerbose("title: " + titleString);
        printlnVerbose("abstract: " + abstractString);
        printlnVerbose("authorEmails: " + authorEmails);
        printlnVerbose("authorNames: " + authorNames);
        printlnVerbose("authorAff: " + authorAffiliations);
        printlnVerbose("authorNotes: " + notesString);
        printlnVerbose("editor: " + editors);
        printlnVerbose("keywords: " + keywordsString);
        printlnVerbose("DOI: " + doiString);
        printlnVerbose("volume: " + volumeString);
        printlnVerbose("issue: " + issueString);
        printlnVerbose("financial dis.: " + financialDisclosure);
        printlnVerbose("paragraphs: " + paragraphStrings);
        printlnVerbose("section titles: " + sectionTitles);
        printlnVerbose("tableBodies: " + tableBodies);
        printlnVerbose("tableCaptions: " + tableCaptions);
        printlnVerbose("tableFootnotes: " + tableFootnotes);
        printlnVerbose("figures: " + figureStrings);
        printlnVerbose("acknowledgement: " + acknowledgement);
        printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

        SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);

        // index: (zone,entry)
        List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(zoneCount);
        // the tokens of a zone do not depend on the entry, so they are produced once per zone
        List<List<String>> zoneTokenLists = new ArrayList<List<String>>(zoneCount);
        for (BxZone zone : zones) {
            swLabelSim.add(new ArrayList<LabelTrio>());
            zoneTokenLists.add(TextUtils.tokenize(
                    TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(zone.toText().toLowerCase()))));
        }

        // iterate over entries
        for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
            List<String> entryTokens = TextUtils.tokenize(entry.getKey());
            printlnVerbose("--------------------");
            printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
            // iterate over zones
            for (int zoneIdx = 0; zoneIdx < zoneCount; ++zoneIdx) {
                BxZone curZone = zones.get(zoneIdx);
                Double smithSim;
                if (curZone.toText().contains("www.biomedcentral.com")) {
                    // ignore
                    smithSim = 0.;
                } else {
                    smithSim = smith.compare(entryTokens, zoneTokenLists.get(zoneIdx));
                }
                printlnVerbose(smithSim + " " + curZone.toText() + "\n\n");
                swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
            }
        }

        // the page list does not change while labels are corrected
        List<BxPage> pages = Lists.newArrayList(bxDoc);
        for (BxPage pp : bxDoc) {
            // Rules are re-applied to the page until a full pass leaves every label untouched.
            // NOTE(review): nothing bounds the number of passes; rules that keep undoing each
            // other would loop forever - confirm this cannot happen.
            boolean changed = true;
            while (changed) {
                changed = false;
                boolean wasIntro = false;
                for (BxZone z : pp) {
                    BxZoneLabel orig = z.getLabel();
                    int i = zones.indexOf(z);
                    // alignment of the title/author entry with this zone, per entry token
                    double titleAl = 0;
                    double authorAl = 0;
                    List<LabelTrio> sims = swLabelSim.get(i);
                    for (LabelTrio t : sims) {
                        if (t.label.equals(BxZoneLabel.MET_TITLE)) {
                            titleAl = t.alignment / t.entryTokens.size();
                        }
                        if (t.label.equals(BxZoneLabel.MET_AUTHOR)) {
                            authorAl = t.alignment / t.entryTokens.size();
                        }
                    }
                    String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                    int linesCount = z.childrenCount();
                    int pageIdx = pages.indexOf(z.getParent());
                    BxLine firstLine = z.getFirstChild();

                    // title and authors in one zone
                    if (pageIdx == 0
                            && (z.getLabel().equals(BxZoneLabel.MET_TITLE)
                                    || z.getLabel().equals(BxZoneLabel.BODY_CONTENT))
                            && titleAl >= 0.7 && authorAl >= 0.4) {
                        z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR);
                    }
                    // "page X of Y" markers
                    if (linesCount == 2 && text.contains("page") && text.contains("of")
                            && text.contains("page number not for")) {
                        z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                    }
                    if (linesCount == 1 && (text.contains("page number not for")
                            || (text.contains("page") && text.contains("of")))) {
                        z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                    }
                    if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                            && linesCount < 11
                            && (text.contains("department") || text.contains("university"))) {
                        z.setLabel(BxZoneLabel.MET_AFFILIATION);
                    }
                    if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) {
                        z.setLabel(BxZoneLabel.MET_BIB_INFO);
                    }
                    if (linesCount < 5 && firstLine.toText().length() < 11
                            && firstLine.toText().startsWith("Figure")
                            && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                        z.setLabel(BxZoneLabel.BODY_FIGURE);
                    }
                    if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    // narrow zones next to a table zone belong to the table
                    if (pageIdx > 0 && z.hasPrev() && z.hasNext()
                            && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                    || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                                    || z.getLabel().equals(BxZoneLabel.MET_DATES)
                                    || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT))
                            && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                                    || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE))
                            && z.getWidth() < 100) {
                        boolean prevIsTable = z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE);
                        boolean nextIsTable = z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE);
                        if (prevIsTable && nextIsTable) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                        if (prevIsTable && centersWithin(z.getPrev(), z, 200)) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                        if (nextIsTable && centersWithin(z.getNext(), z, 200)) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                    if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION)
                            || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    // String.contains takes a literal, so the dot must not be regex-escaped
                    if (pageIdx == 0 && linesCount < 10
                            && (text.startsWith("citation:") || text.contains(" volume ")
                                    || text.contains("vol. ") || text.contains("doi"))) {
                        z.setLabel(BxZoneLabel.MET_BIB_INFO);
                    }
                    if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) {
                        z.setLabel(BxZoneLabel.MET_EDITOR);
                    }
                    if (pageIdx == 0 && text.startsWith("copyright:")) {
                        z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                    }
                    if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume")
                            && text.contains("issue")) {
                        z.setLabel(BxZoneLabel.MET_BIB_INFO);
                    }
                    // short zones near the top/bottom edge that repeat at the same height on a
                    // page up to two pages away are treated as bibliographic info
                    if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.MET_AUTHOR)
                            || z.getLabel().equals(BxZoneLabel.REFERENCES)
                            || z.getLabel().equals(BxZoneLabel.MET_DATES))
                            && linesCount < 6
                            && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                        BxPage p = z.getParent();
                        if (pageIdx > 0 && repeatsOnPage(p.getPrev(), z)) {
                            z.setLabel(BxZoneLabel.MET_BIB_INFO);
                        }
                        if (pageIdx < bxDoc.childrenCount() - 1 && repeatsOnPage(p.getNext(), z)) {
                            z.setLabel(BxZoneLabel.MET_BIB_INFO);
                        }
                        if (pageIdx > 1 && repeatsOnPage(p.getPrev().getPrev(), z)) {
                            z.setLabel(BxZoneLabel.MET_BIB_INFO);
                        }
                        if (pageIdx < bxDoc.childrenCount() - 2 && repeatsOnPage(p.getNext().getNext(), z)) {
                            z.setLabel(BxZoneLabel.MET_BIB_INFO);
                        }
                    }
                    // NOTE(review): "d?" matches an optional literal letter d; "\\d" may have
                    // been intended - confirm before changing
                    if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                            || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO)
                            || z.getLabel().equals(BxZoneLabel.REFERENCES))
                            && text.matches("d?[0-9]+") && text.length() <= 4
                            && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                        z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                    }
                    if (text.equals("acknowledgments")) {
                        z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                    }
                    // nothing after the introduction heading can be part of the abstract
                    if (text.startsWith("introduction") && z.hasPrev()
                            && !z.getPrev().toText().toLowerCase().equals("abstract")) {
                        wasIntro = true;
                    }
                    if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES)
                            && !text.equals("references")
                            && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) {
                        z.setLabel(BxZoneLabel.MET_BIB_INFO);
                    }
                    if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10
                            && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev()
                            && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                            && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT)
                            && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                            && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES)
                            && (text.matches("[1-9][0-9]?[0-9]?\\.?")
                                    || text.matches(".*[1-2][0-9][0-9][0-9].*"))) {
                        z.setLabel(BxZoneLabel.REFERENCES);
                    }
                    if ((z.getLabel().equals(BxZoneLabel.REFERENCES)
                            || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && (text.startsWith("doi") || text.startsWith("cite this article"))) {
                        z.setLabel(BxZoneLabel.MET_BIB_INFO);
                    }
                    if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && firstLine.toText().toLowerCase().equals("author details")) {
                        z.setLabel(BxZoneLabel.MET_AFFILIATION);
                    }
                    if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && (firstLine.toText().toLowerCase().equals("acknowledgments")
                                    || firstLine.toText().toLowerCase().equals("acknowledgements"))) {
                        z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                    }
                    if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) {
                        z.setLabel(BxZoneLabel.BODY_CONTENT);
                    }
                    if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)
                            && text.matches("sup-[0-9][0-9]?")) {
                        z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                    }
                    if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && firstLine.toText().toLowerCase().equals("references")) {
                        z.setLabel(BxZoneLabel.REFERENCES);
                    }
                    // figure captions
                    if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            && (firstLine.toText()
                                    .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                                    || firstLine.toText()
                                            .matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                                    || firstLine.toText()
                                            .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.")
                                    || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.")
                                    || firstLine.toText()
                                            .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?")
                                    || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?"))) {
                        z.setLabel(BxZoneLabel.BODY_FIGURE);
                    }
                    // table captions
                    if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                            && (firstLine.toText()
                                    .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                                    || firstLine.toText()
                                            .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) {
                        z.setLabel(BxZoneLabel.BODY_TABLE);
                    }
                    if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)
                            && text.contains("this article is distributed")) {
                        z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                    }
                    if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                            && text.contains("journal")) {
                        z.setLabel(BxZoneLabel.MET_BIB_INFO);
                    }
                    if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                            && text.contains("correspondence")) {
                        z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                    }
                    if (pageIdx == 0
                            && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                    || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && text.contains("accepted") && text.contains("published")) {
                        z.setLabel(BxZoneLabel.MET_DATES);
                    }
                    // a first-page zone close to the previous zone, with the same first-line
                    // height, takes over the previous zone's label (keywords excepted)
                    if (pageIdx == 0 && linesCount < 10
                            && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                    || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                            && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4
                            && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) {
                        if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) {
                            z.setLabel(z.getPrev().getLabel());
                        }
                    }
                    // publisher boilerplate on the last page
                    if (pageIdx == bxDoc.childrenCount() - 1
                            && (text.startsWith("publish with") || containsAny(text, LAST_PAGE_BOILERPLATE))) {
                        z.setLabel(BxZoneLabel.OTH_UNKNOWN);
                    }
                    if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) {
                        z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                    }
                    if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest")
                            || text.startsWith("competing interests")
                            || (z.hasPrev()
                                    && (z.getPrev().toText().toLowerCase().equals("conflicts of interest")
                                            || z.getPrev().toText().toLowerCase().equals("conflict of interest")
                                            || z.getPrev().toText().toLowerCase()
                                                    .equals("competing interests")))) {
                        z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT);
                    }
                    changed = changed || !orig.equals(z.getLabel());
                }

                // an author zone with an e-mail or a correspondence prefix that follows another
                // author zone is correspondence
                boolean wasAuthor = false;
                for (BxZone z : pp) {
                    BxZoneLabel orig = z.getLabel();
                    String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                    if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor
                            && ((text.contains("email") && text.contains("@"))
                                    || text.startsWith("correspondence"))) {
                        z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                    }
                    if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel())
                            || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) {
                        wasAuthor = true;
                    }
                    changed = changed || !orig.equals(z.getLabel());
                }
            }
        }

        return bxDoc;
    }

    /**
     * Processes every {@code .pdf} file below the given directory: the matching {@code .cxml} and
     * NLM files are fed to {@link #generateTrueViz(InputStream, InputStream)} and the result is
     * written to a {@code .cxml-corr} file next to the PDF. Documents whose {@code .cxml-corr}
     * file already exists are skipped; a failure on one document is reported and does not stop
     * the run.
     *
     * @param args exactly one element: the directory to process
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: <pubmed directory>");
            System.exit(1);
        }

        File dir = new File(args[0]);
        Collection<File> files = FileUtils.listFiles(dir, new String[] { "pdf" }, true);
        int i = 0;
        for (File pdfFile : files) {
            try {
                String pdfPath = pdfFile.getPath();
                String nxmlPath = TextUtils.getNLMPath(pdfPath);
                // anchored to the end so that ".pdf" inside a directory name is left alone
                String cxmlPath = pdfPath.replaceFirst("\\.pdf$", ".cxml");
                String cpxmlPath = pdfPath.replaceFirst("\\.pdf$", ".cxml-corr");

                File cpxmlFile = new File(cpxmlPath);
                if (cpxmlFile.exists()) {
                    i++;
                    continue;
                }
                System.out.println(pdfPath);

                InputStream nxmlStream = null;
                InputStream cxmlStream = null;
                BxDocument bxDoc;
                try {
                    nxmlStream = new FileInputStream(nxmlPath);
                    cxmlStream = new FileInputStream(cxmlPath);
                    RuleBasedPubmedXMLGenerator datasetGenerator = new RuleBasedPubmedXMLGenerator();
                    datasetGenerator.setVerbose(false);
                    bxDoc = datasetGenerator.generateTrueViz(cxmlStream, nxmlStream);
                } finally {
                    closeQuietly(cxmlStream);
                    closeQuietly(nxmlStream);
                }
                i++;

                // serialise before the output file is created: a failure here must not leave an
                // empty .cxml-corr behind, because its existence makes later runs skip the document
                BxDocumentToTrueVizWriter writer = new BxDocumentToTrueVizWriter();
                String trueViz = writer.write(Lists.newArrayList(bxDoc));

                BufferedWriter out = new BufferedWriter(new FileWriter(cpxmlPath));
                try {
                    out.write(trueViz);
                } finally {
                    out.close();
                }

                System.out.println(
                        "Progress: " + i + " out of " + files.size() + " (" + (i * 100. / files.size()) + "%)");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}