Usage examples for javax.xml.parsers.DocumentBuilderFactory.setFeature(String, boolean)
public abstract void setFeature(String name, boolean value) throws ParserConfigurationException;
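All of the CERMINE examples below share the same setup: the features are switched off on the DocumentBuilderFactory before newDocumentBuilder() is called, so that parsing the NLM/TEI files triggers neither DTD validation nor external DTD downloads. The following is a minimal, self-contained sketch of that pattern distilled from those examples; the class name, the main method, and the input path taken from args[0] are placeholders rather than code from any of the source files, and the two apache.org feature URIs are Xerces-specific, so a different JAXP implementation may reject them with a ParserConfigurationException.

import java.io.File;
import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.xml.sax.SAXException;

public class SetFeatureExample {

    public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        // Feature URIs used by the examples below: skip namespace processing
        // and DTD validation, and do not load external DTDs or DTD grammars.
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // setFeature throws ParserConfigurationException when the underlying
        // parser does not recognise or support the feature name.

        DocumentBuilder documentBuilder = dbf.newDocumentBuilder();
        Document doc = documentBuilder.parse(new File(args[0])); // placeholder input file
        System.out.println("Root element: " + doc.getDocumentElement().getNodeName());
    }
}

Disabling load-external-dtd is what lets the evaluation classes below parse NLM files offline, without trying to resolve the DTD referenced in each file's DOCTYPE declaration.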
From source file:pl.edu.icm.cermine.evaluation.BwmetaGrobidFinalMetadataExtractionEvaluation.java
public void evaluate(int mode, NlmIterator iter) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException { javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance(); dbf.setValidating(false);//from w ww . jav a 2 s . c o m dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder(); SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser"); builder.setValidation(false); builder.setFeature("http://xml.org/sax/features/validation", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); List<ComparisonResult> titles = new ArrayList<ComparisonResult>(); List<ComparisonResult> authors = new ArrayList<ComparisonResult>(); List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> emails = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>(); List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>(); List<ComparisonResult> keywords = new ArrayList<ComparisonResult>(); List<ComparisonResult> journals = new ArrayList<ComparisonResult>(); List<ComparisonResult> volumes = new ArrayList<ComparisonResult>(); List<ComparisonResult> issues = new ArrayList<ComparisonResult>(); List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>(); List<ComparisonResult> years = new ArrayList<ComparisonResult>(); List<ComparisonResult> dois = new ArrayList<ComparisonResult>(); List<ComparisonResult> references = new ArrayList<ComparisonResult>(); if (mode == 1) { System.out.println("path,gro_title,gro_abstract,gro_keywords," + "gro_authors,gro_affs,gro_autaff,gro_email,gro_autemail,gro_journal,gro_volume,gro_issue," + "gro_pages,gro_year,gro_doi,gro_refs,one"); } int i = 0; for (NlmPair pair : iter) { i++; if (mode == 0) { System.out.println(""); System.out.println(">>>>>>>>> " + i); System.out.println(pair.getExtractedNlm().getPath()); } if (mode == 1) { System.out.print(pair.getOriginalNlm().getPath() + ","); } org.w3c.dom.Document originalNlm; org.w3c.dom.Document extractedNlm; try { originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm())); extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm())); } catch (SAXException ex) { i--; continue; } // Document's title MetadataSingle title = new MetadataSingle(originalNlm, "/bwmeta/element/name[not(@type)]", extractedNlm, "//teiHeader//titleStmt/title"); title.setComp(EvaluationUtils.swComparator); titles.add(title); title.print(mode, "title"); // Abstract MetadataSingle abstrakt = new MetadataSingle(originalNlm, "/bwmeta/element/description[@type='abstract']", extractedNlm, "//teiHeader//abstract/p"); abstrakt.setComp(EvaluationUtils.swComparator); abstracts.add(abstrakt); abstrakt.print(mode, "abstract"); // Keywords MetadataList keyword = new MetadataList(originalNlm, 
"/bwmeta/element/tags[@type='keyword']/tag", extractedNlm, "//teiHeader//keywords//term"); keywords.add(keyword); keyword.print(mode, "keywords"); // Authors List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm, "/bwmeta/element/contributor[@role='author']"); List<String> expectedAuthors = new ArrayList<String>(); for (Node authorNode : expectedAuthorNodes) { List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name"); if (names.isEmpty()) { continue; } for (Node n : names) { if (n.getAttributes().getNamedItem("type") != null && n.getAttributes().getNamedItem("type").getTextContent().equals("canonical")) { expectedAuthors.add(n.getTextContent());//.replaceAll("[^a-zA-Z]", "")); break; } } } List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); List<String> extractedAuthors = new ArrayList<String>(); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); extractedAuthors.add(author); } MetadataList author = new MetadataList(expectedAuthors, extractedAuthors); author.setComp(EvaluationUtils.authorComparator); authors.add(author); author.print(mode, "author"); // Affiliations Set<String> expectedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(originalNlm, "/bwmeta/element/affiliation/text")); Set<String> extractedAffiliationsSet = Sets.newHashSet(XMLTools.extractTextAsList(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/affiliation")); List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet); List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet); MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations); affiliation.setComp(EvaluationUtils.cosineComparator()); affiliations.add(affiliation); affiliation.print(mode, "affiliation"); // Author - Affiliation relation MetadataRelation authorAffiliation = new MetadataRelation(); authorAffiliation.setComp1(EvaluationUtils.authorComparator); authorAffiliation.setComp2(EvaluationUtils.cosineComparator()); List<Node> expectedAffiliationNodes = XMLTools.extractNodes(originalNlm, "/bwmeta/element/affiliation"); Map<String, String> expectedAffiliationMap = new HashMap<String, String>(); for (Node expectedAffiliationNode : expectedAffiliationNodes) { String id = expectedAffiliationNode.getAttributes().getNamedItem("id").getNodeValue(); String aff = XMLTools.extractChildrenTextFromNode(expectedAffiliationNode, "text").get(0); expectedAffiliationMap.put(id, aff); } for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = null; List<Node> names = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "name"); if (names.isEmpty()) { continue; } for (Node n : names) { if (n.getAttributes().getNamedItem("type") != null && n.getAttributes().getNamedItem("type").getTextContent().equals("canonical")) { authorName = n.getTextContent();//.replaceAll("[^a-zA-Z]", ""); break; } } if (authorName == null) continue; List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "affiliation-ref"); for (Node xref : xrefs) { String affId = xref.getAttributes().getNamedItem("ref").getNodeValue(); String aff = expectedAffiliationMap.get(affId); if (aff != null) 
authorAffiliation.addExpected(new StringRelation(authorName, aff)); } } extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("affiliation".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorAffiliation.addExtracted(new StringRelation(a, aw)); } } } authorsAffiliations.add(authorAffiliation); authorAffiliation.print(mode, "author - affiliation"); // Email addresses MetadataList email = new MetadataList(originalNlm, "/bwmeta/element/contributor[@role='author']/attribute[@key='contact-email']/value", extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/email"); email.setComp(EvaluationUtils.emailComparator); emails.add(email); email.print(mode, "email"); // Author - Email relations MetadataRelation authorEmail = new MetadataRelation(); authorEmail.setComp1(EvaluationUtils.authorComparator); authorEmail.setComp2(EvaluationUtils.emailComparator); for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = null; List<Node> names = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "name"); if (names.isEmpty()) { continue; } for (Node n : names) { if (n.getAttributes().getNamedItem("type") != null && n.getAttributes().getNamedItem("type").getTextContent().equals("canonical")) { authorName = n.getTextContent(); break; } } if (authorName == null) continue; List<Node> addresses = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "attribute"); for (Node address : addresses) { if ("contact-email".equals(address.getAttributes().getNamedItem("key").getNodeValue())) { String ema = XMLTools.extractChildrenTextFromNode(address, "value").get(0); authorEmail.addExpected(new StringRelation(authorName, ema)); } } } extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("email".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorEmail.addExtracted(new StringRelation(a, aw)); } } } authorsEmails.add(authorEmail); authorEmail.print(mode, "author - email"); // Journal title MetadataSingle journal = new MetadataSingle(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Journal']/name[@type='canonical']", extractedNlm, "//monogr/title[@level='j' and @type='main']"); journal.setComp(EvaluationUtils.journalComparator); journals.add(journal); journal.print(mode, "journal title"); // Volume MetadataSingle volume = new MetadataSingle(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Volume']/name[@type='canonical']", extractedNlm, 
"//monogr/imprint/biblScope[@unit='volume']"); volumes.add(volume); volume.print(mode, "volume"); // Issue MetadataSingle issue = new MetadataSingle(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Number']/name[@type='canonical']", extractedNlm, "//monogr/imprint/biblScope[@unit='issue']"); issues.add(issue); issue.print(mode, "issue"); // Pages range MetadataSingle fPage = new MetadataSingle(originalNlm, "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position", extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@from"); MetadataSingle lPage = new MetadataSingle(originalNlm, "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position", extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@to"); String expRange = fPage.hasExpected() ? fPage.getExpectedValue().replaceAll("-", "--") : ""; String extrRange = fPage.hasExtracted() && lPage.hasExtracted() ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue() : ""; MetadataSingle pageRange = new MetadataSingle(expRange, extrRange); pageRanges.add(pageRange); pageRange.print(mode, "pages"); // Publication date List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Year']/name[@type='canonical']"); expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate); List<Node> extractedPubDates = XMLTools.extractNodes(extractedNlm, "//teiHeader//date[@type='published']"); List<String> extractedPubDate = Lists.newArrayList(); if (!extractedPubDates.isEmpty()) { Node pubDate = extractedPubDates.get(0); String date = pubDate.getTextContent(); if (pubDate.getAttributes().getNamedItem("when") != null) { date = pubDate.getAttributes().getNamedItem("when").getTextContent(); } extractedPubDate = Lists.newArrayList(date.split("-")); extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate); } MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"), StringUtils.join(extractedPubDate, "---")); year.setComp(EvaluationUtils.yearComparator); years.add(year); year.print(mode, "year"); // DOI MetadataSingle doi = new MetadataSingle(originalNlm, "/bwmeta/element/id[@scheme='bwmeta1.id-class.DOI']/@value", extractedNlm, "//teiHeader//idno[@type='DOI']"); dois.add(doi); doi.print(mode, "DOI"); // References List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//relation[@type='reference-to']/attribute[@key='reference-text']/value"); //bwmeta List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//listBibl/biblStruct");//grobid List<String> originalRefs = new ArrayList<String>(); List<String> extractedRefs = new ArrayList<String>(); for (Node originalRefNode : originalRefNodes) { originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim()); } for (Node extractedRefNode : extractedRefNodes) { extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim()); } MetadataList refs = new MetadataList(originalRefs, extractedRefs); refs.setComp(EvaluationUtils.cosineComparator(0.6)); references.add(refs); refs.print(mode, "references"); if (mode == 1) { System.out.println("1"); } } if (mode != 1) { System.out.println("==== Summary (" + iter.size() + " docs)===="); PrecisionRecall titlePR = new PrecisionRecall().build(titles); titlePR.print("Title"); PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts); abstractPR.print("Abstract"); 
PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords); keywordsPR.print("Keywords"); PrecisionRecall authorsPR = new PrecisionRecall().build(authors); authorsPR.print("Authors"); PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations); affiliationsPR.print("Affiliations"); PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations); authorsAffiliationsPR.print("Author - affiliation"); PrecisionRecall emailsPR = new PrecisionRecall().build(emails); emailsPR.print("Emails"); PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails); authorsEmailsPR.print("Author - email"); PrecisionRecall journalPR = new PrecisionRecall().build(journals); journalPR.print("Journal"); PrecisionRecall volumePR = new PrecisionRecall().build(volumes); volumePR.print("Volume"); PrecisionRecall issuePR = new PrecisionRecall().build(issues); issuePR.print("Issue"); PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges); pageRangePR.print("Pages"); PrecisionRecall yearPR = new PrecisionRecall().build(years); yearPR.print("Year"); PrecisionRecall doiPR = new PrecisionRecall().build(dois); doiPR.print("DOI"); PrecisionRecall refsPR = new PrecisionRecall().build(references); refsPR.print("References"); List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR, abstractPR, keywordsPR, yearPR, doiPR); double avgPrecision = 0; double avgRecall = 0; double avgF1 = 0; for (PrecisionRecall result : results) { avgPrecision += result.getPrecision(); avgRecall += result.getRecall(); avgF1 += result.getF1(); } avgPrecision /= results.size(); avgRecall /= results.size(); avgF1 /= results.size(); System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision); System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall); System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1); } }
From source file:pl.edu.icm.cermine.evaluation.FinalMetadataExtractionEvaluation.java
public void evaluate(NlmIterator iter) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException { javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*w ww . java 2s. co m*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder(); SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser"); builder.setValidation(false); builder.setFeature("http://xml.org/sax/features/validation", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); PrecissonRecall issn = new PrecissonRecall(); PrecissonRecall doi = new PrecissonRecall(); PrecissonRecall volume = new PrecissonRecall(); PrecissonRecall issue = new PrecissonRecall(); PrecissonRecall pages = new PrecissonRecall(); PrecissonRecall dateYear = new PrecissonRecall(); PrecissonRecall dateFull = new PrecissonRecall(); PrecissonRecall journalTitle = new PrecissonRecall(); List<Double> abstractRates = new ArrayList<Double>(iter.size()); List<Double> titleRates = new ArrayList<Double>(iter.size()); List<Double> keywordPrecisions = new ArrayList<Double>(iter.size()); List<Double> keywordRecalls = new ArrayList<Double>(iter.size()); List<Double> authorsPrecisions = new ArrayList<Double>(iter.size()); List<Double> authorsRecalls = new ArrayList<Double>(iter.size()); List<Double> affPrecisions = new ArrayList<Double>(iter.size()); List<Double> affRecalls = new ArrayList<Double>(iter.size()); int ii = 0; for (NlmPair pair : iter) { ii++; System.out.println(""); printVerbose(">>>>>>>>> " + ii); printVerbose(pair.getExtractedNlm().getPath()); org.w3c.dom.Document originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm())); org.w3c.dom.Document extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm())); String expectedTitle = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta//article-title"); String extractedTitle = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/title-group/article-title"); List<Node> expectedAuthorsNodes = XMLTools.extractNodes(originalNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/name"); List<String> expectedAuthors = new ArrayList<String>(); for (Node authorNode : expectedAuthorsNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "given-names"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); author = author.replaceAll("[^a-zA-Z ]", ""); expectedAuthors.add(author); } List<String> extractedAuthors1 = XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/string-name"); List<String> extractedAuthors = new ArrayList<String>(); for (String author : extractedAuthors1) { extractedAuthors.add(author.replaceAll("[^a-zA-Z ]", "")); } List<String> 
expectedKeywords = XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//kwd"); List<String> extractedKeywords = XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta/kwd-group/kwd"); String expectedJournalTitle = XMLTools.extractTextFromNode(originalNlm, "/article/front/journal-meta//journal-title"); String extractedJournalTitle = XMLTools.extractTextFromNode(extractedNlm, "/article/front/journal-meta/journal-title-group/journal-title"); String expectedAbstract = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/abstract"); String extractedAbstract = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/abstract"); String expectedDoi = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/article-id[@pub-id-type='doi']"); String extractedDoi = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/article-id[@pub-id-type='doi']"); String expectedISSN = XMLTools.extractTextFromNode(originalNlm, "/article/front/journal-meta/issn[@pub-type='ppub']"); String extractedISSN = XMLTools.extractTextFromNode(extractedNlm, "/article/front/journal-meta/issn[@pub-type='ppub']"); String expectedVolume = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/volume"); String extractedVolume = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/volume"); String expectedIssue = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/issue"); String extractedIssue = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/issue"); String expectedFPage = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/fpage"); String extractedFPage = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/fpage"); String expectedLPage = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/lpage"); String extractedLPage = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/lpage"); List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta/pub-date"); expectedPubDate = removeLeadingZerosFromDate(expectedPubDate); List<String> extractedPubDate = XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta/pub-date"); extractedPubDate = removeLeadingZerosFromDate(extractedPubDate); Set<String> expectedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff")); Set<String> extractedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta//aff")); List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet); List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet); //equality measures if (!expectedVolume.isEmpty()) { if (expectedVolume.equals(extractedVolume)) { ++volume.correct; } ++volume.expected; } if (!extractedVolume.isEmpty()) { volume.extracted++; } if (!expectedIssue.isEmpty()) { if (expectedIssue.equals(extractedIssue)) { ++issue.correct; } ++issue.expected; } if (!extractedIssue.isEmpty()) { issue.extracted++; } if (!expectedISSN.isEmpty()) { if (extractedISSN.equals(expectedISSN)) { ++issn.correct; } ++issn.expected; } if (!extractedISSN.isEmpty()) { issn.extracted++; } if (!expectedDoi.isEmpty()) { if (expectedDoi.equals(extractedDoi)) { ++doi.correct; } ++doi.expected; } if (!extractedDoi.isEmpty()) { doi.extracted++; } if (!expectedFPage.isEmpty() && !expectedLPage.isEmpty()) { 
if (expectedFPage.equals(extractedFPage) && expectedLPage.equals(extractedLPage)) { ++pages.correct; } ++pages.expected; } if (!extractedFPage.isEmpty() && !extractedLPage.isEmpty()) { pages.extracted++; } if (!expectedPubDate.isEmpty()) { Boolean yearsMatch = DateComparator.yearsMatch(expectedPubDate, extractedPubDate); if (yearsMatch != null) { if (yearsMatch) { ++dateYear.correct; } ++dateYear.expected; } } if (!extractedPubDate.isEmpty()) { dateYear.extracted++; dateFull.extracted++; } //Smith-Waterman distance measures if (expectedAbstract.length() > 0) { abstractRates.add(compareStringsSW(expectedAbstract, extractedAbstract)); } else { abstractRates.add(null); } if (expectedTitle.length() > 0) { titleRates.add(compareStringsSW(expectedTitle, extractedTitle)); } else { titleRates.add(null); } if (!expectedJournalTitle.isEmpty()) { journalTitle.expected++; } if (!extractedJournalTitle.isEmpty()) { journalTitle.extracted++; if (isSubsequence(expectedJournalTitle.replaceAll("[^a-zA-Z]", "").toLowerCase(), extractedJournalTitle.replaceAll("[^a-zA-Z]", "").toLowerCase())) { journalTitle.correct++; } } //precision + recall if (expectedAuthors.size() > 0) { authorsRecalls.add(calculateRecall(expectedAuthors, extractedAuthors)); } else { authorsRecalls.add(null); } if (extractedAuthors.size() > 0) { authorsPrecisions.add(calculatePrecision(expectedAuthors, extractedAuthors)); } else { authorsPrecisions.add(null); } if (expectedKeywords.size() > 0) { keywordRecalls.add(calculateRecall(expectedKeywords, extractedKeywords)); } else { keywordRecalls.add(null); } if (extractedKeywords.size() > 0) { keywordPrecisions.add(calculatePrecision(expectedKeywords, extractedKeywords)); } else { keywordPrecisions.add(null); } if (expectedAffiliations.size() > 0) { affRecalls.add(calculateRecall(expectedAffiliations, extractedAffiliations)); } else { affRecalls.add(null); } if (extractedAffiliations.size() > 0) { affPrecisions.add(calculatePrecision(expectedAffiliations, extractedAffiliations)); } else { affPrecisions.add(null); } System.out.println(""); printVerbose(">>> Expected authors: "); for (String author : expectedAuthors) { printVerbose(author); } System.out.println(""); printVerbose(">>> Extracted authors: "); for (String author : extractedAuthors) { printVerbose(author); } System.out.println(""); printVerbose(">>> Expected keywords: "); for (String keyword : expectedKeywords) { printVerbose(keyword); } System.out.println(""); printVerbose(">>> Extracted keywords: "); for (String keyword : extractedKeywords) { printVerbose(keyword); } printVerbose(">>> Expected journal title: " + expectedJournalTitle); printVerbose(">>> Extracted journal title: " + extractedJournalTitle); printVerbose(">>> Expected article title: " + expectedTitle); printVerbose(">>> Extracted article title: " + extractedTitle); printVerbose(">>> Expected article abstract: " + expectedAbstract); printVerbose(">>> Extracted article abstract: " + extractedAbstract); printVerbose(">>> Expected doi: " + expectedDoi); printVerbose(">>> Extracted doi: " + extractedDoi); printVerbose(">>> Expected issn: " + expectedISSN); printVerbose(">>> Extracted issn: " + extractedISSN); printVerbose(">>> Expected volume: " + expectedVolume); printVerbose(">>> Extracted volume: " + extractedVolume); printVerbose(">>> Expected issue: " + expectedIssue); printVerbose(">>> Extracted issue: " + extractedIssue); printVerbose(">>> Expected pages: " + expectedFPage + " " + expectedLPage); printVerbose(">>> Extracted pages: " + extractedFPage + " " + 
extractedLPage); printVerbose(">>> Expected date: "); for (String date : expectedPubDate) { printVerbose(date); } printVerbose(">>> Extracted date: "); for (String date : extractedPubDate) { printVerbose(date); } printVerbose(">>> Expected affs: "); for (String aff : expectedAffiliations) { printVerbose(aff); } printVerbose(">>> Extracted affs: "); for (String aff : extractedAffiliations) { printVerbose(aff); } printVerbose("abstract " + abstractRates.get(abstractRates.size() - 1)); printVerbose("title " + titleRates.get(titleRates.size() - 1)); printVerbose("journal title " + journalTitle); System.out.println(""); printVerbose("authors precission " + authorsPrecisions.get(authorsPrecisions.size() - 1)); printVerbose("authors recall " + authorsRecalls.get(authorsPrecisions.size() - 1)); System.out.println(""); printVerbose("aff precission " + affPrecisions.get(affPrecisions.size() - 1)); printVerbose("aff recall " + affRecalls.get(affPrecisions.size() - 1)); System.out.println(""); printVerbose("keywords precission " + keywordPrecisions.get(keywordPrecisions.size() - 1)); printVerbose("keywords recall " + keywordRecalls.get(keywordPrecisions.size() - 1)); printVerbose("date years" + dateYear); printVerbose("doi" + doi); printVerbose("issn" + issn); printVerbose("volume" + volume); printVerbose("issue" + issue); printVerbose("pages" + pages); } Double value; System.out.println("==== Summary (" + iter.size() + " docs)===="); if ((value = calculateAverage(abstractRates)) != null) { System.out.printf("abstract avg (SW) \t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(titleRates)) != null) { System.out.printf("title avg (SW) \t\t\t%4.2f\n", 100 * value); } if ((value = journalTitle.calculatePrecission()) != null) { System.out.printf("journal title precission\t\t%4.2f\n", 100 * value); } if ((value = journalTitle.calculateRecall()) != null) { System.out.printf("journal title recall\t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(authorsPrecisions)) != null) { System.out.printf("authors precision avg (EQ)\t%4.2f\n", 100 * value); } if ((value = calculateAverage(authorsRecalls)) != null) { System.out.printf("authors recall avg (EQ)\t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(affPrecisions)) != null) { System.out.printf("aff precision avg (EQ)\t%4.2f\n", 100 * value); } if ((value = calculateAverage(affRecalls)) != null) { System.out.printf("aff recall avg (EQ)\t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(keywordPrecisions)) != null) { System.out.printf("keywords precision avg (EQ)\t%4.2f\n", 100 * value); } if ((value = calculateAverage(keywordRecalls)) != null) { System.out.printf("keywords recall avg (EQ)\t%4.2f\n", 100 * value); } if ((value = dateYear.calculatePrecission()) != null) { System.out.printf("date year precission avg\t\t%4.2f\n", 100 * value); } if ((value = dateYear.calculateRecall()) != null) { System.out.printf("date year recall avg\t\t%4.2f\n", 100 * value); } if ((value = doi.calculatePrecission()) != null) { System.out.printf("doi precission\t\t%4.2f\n", 100 * value); } if ((value = doi.calculateRecall()) != null) { System.out.printf("doi recall\t\t%4.2f\n", 100 * value); } if ((value = issn.calculatePrecission()) != null) { System.out.printf("issn precission\t\t%4.2f\n", 100 * value); } if ((value = issn.calculateRecall()) != null) { System.out.printf("issn recall\t\t%4.2f\n", 100 * value); } if ((value = volume.calculatePrecission()) != null) { System.out.printf("volume precission\t\t%4.2f\n", 100 * value); } if ((value = 
volume.calculateRecall()) != null) { System.out.printf("volume recall\t\t%4.2f\n", 100 * value); } if ((value = issue.calculatePrecission()) != null) { System.out.printf("issue precission\t\t%4.2f\n", 100 * value); } if ((value = issue.calculateRecall()) != null) { System.out.printf("issue recall\t\t%4.2f\n", 100 * value); } if ((value = pages.calculatePrecission()) != null) { System.out.printf("pages precission avg\t\t%4.2f\n", 100 * value); } if ((value = pages.calculateRecall()) != null) { System.out.printf("pages recall avg\t\t%4.2f\n", 100 * value); } }
From source file:pl.edu.icm.cermine.evaluation.GrobidFinalMetadataExtractionEvaluation.java
public void evaluate(int mode, NlmIterator iter) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException { javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance(); dbf.setValidating(false);//from w ww .ja va 2 s. c o m dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder(); SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser"); builder.setValidation(false); builder.setFeature("http://xml.org/sax/features/validation", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); List<ComparisonResult> titles = new ArrayList<ComparisonResult>(); List<ComparisonResult> authors = new ArrayList<ComparisonResult>(); List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> emails = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>(); List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>(); List<ComparisonResult> keywords = new ArrayList<ComparisonResult>(); List<ComparisonResult> journals = new ArrayList<ComparisonResult>(); List<ComparisonResult> volumes = new ArrayList<ComparisonResult>(); List<ComparisonResult> issues = new ArrayList<ComparisonResult>(); List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>(); List<ComparisonResult> years = new ArrayList<ComparisonResult>(); List<ComparisonResult> dois = new ArrayList<ComparisonResult>(); List<ComparisonResult> references = new ArrayList<ComparisonResult>(); if (mode == 1) { System.out.println("path,gro_title,gro_abstract,gro_keywords," + "gro_authors,gro_affs,gro_autaff,gro_email,gro_autemail,gro_journal,gro_volume,gro_issue," + "gro_pages,gro_year,gro_doi,gro_refs,one"); } int i = 0; for (NlmPair pair : iter) { i++; if (mode == 0) { System.out.println(""); System.out.println(">>>>>>>>> " + i); System.out.println(pair.getExtractedNlm().getPath()); } if (mode == 1) { System.out.print(pair.getOriginalNlm().getPath() + ","); } org.w3c.dom.Document originalNlm; org.w3c.dom.Document extractedNlm; try { originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm())); extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm())); } catch (SAXException ex) { i--; continue; } // Document's title MetadataSingle title = new MetadataSingle(originalNlm, "/article/front/article-meta//article-title", extractedNlm, "//teiHeader//titleStmt/title"); title.setComp(EvaluationUtils.swComparator); titles.add(title); title.print(mode, "title"); // Abstract MetadataSingle abstrakt = new MetadataSingle(originalNlm, "/article/front/article-meta/abstract", extractedNlm, "//teiHeader//abstract/p"); abstrakt.setComp(EvaluationUtils.swComparator); abstracts.add(abstrakt); abstrakt.print(mode, "abstract"); // Keywords MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd", 
extractedNlm, "//teiHeader//keywords//term"); keywords.add(keyword); keyword.print(mode, "keywords"); // Authors List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]"); List<String> expectedAuthors = new ArrayList<String>(); for (Node authorNode : expectedAuthorNodes) { List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name"); if (names.isEmpty()) { continue; } Node name = names.get(0); List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names"); List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); expectedAuthors.add(author); } List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); List<String> extractedAuthors = new ArrayList<String>(); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); extractedAuthors.add(author); } MetadataList author = new MetadataList(expectedAuthors, extractedAuthors); author.setComp(EvaluationUtils.authorComparator); authors.add(author); author.print(mode, "author"); // Affiliations Set<String> expectedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff")); Set<String> extractedAffiliationsSet = Sets.newHashSet(XMLTools.extractTextAsList(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/affiliation")); List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet); List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet); MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations); affiliation.setComp(EvaluationUtils.cosineComparator()); affiliations.add(affiliation); affiliation.print(mode, "affiliation"); // Author - Affiliation relation MetadataRelation authorAffiliation = new MetadataRelation(); authorAffiliation.setComp1(EvaluationUtils.authorComparator); authorAffiliation.setComp2(EvaluationUtils.cosineComparator()); List<Node> expectedAffiliationNodes = XMLTools.extractNodes(originalNlm, "/article/front/article-meta//aff[@id]"); Map<String, String> expectedAffiliationMap = new HashMap<String, String>(); for (Node expectedAffiliationNode : expectedAffiliationNodes) { String id = expectedAffiliationNode.getAttributes().getNamedItem("id").getNodeValue(); String aff = XMLTools.extractTextFromNode(expectedAffiliationNode); expectedAffiliationMap.put(id, aff); } for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = expectedAuthors.get(expectedAuthorNodes.indexOf(expectedAuthorNode)); List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "xref"); for (Node xref : xrefs) { if (xref.getAttributes() != null && xref.getAttributes().getNamedItem("ref-type") != null && "aff".equals(xref.getAttributes().getNamedItem("ref-type").getNodeValue())) { String affId = xref.getAttributes().getNamedItem("rid").getNodeValue(); for (String id : affId.split(" ")) { String aff = expectedAffiliationMap.get(id); if (aff != null) { authorAffiliation.addExpected(new StringRelation(authorName, aff)); } } } } } extractedAuthorNodes = 
XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("affiliation".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorAffiliation.addExtracted(new StringRelation(a, aw)); } } } authorsAffiliations.add(authorAffiliation); authorAffiliation.print(mode, "author - affiliation"); // Email addresses MetadataList email = new MetadataList(originalNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email", extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/email"); email.setComp(EvaluationUtils.emailComparator); emails.add(email); email.print(mode, "email"); // Author - Email relations MetadataRelation authorEmail = new MetadataRelation(); authorEmail.setComp1(EvaluationUtils.authorComparator); authorEmail.setComp2(EvaluationUtils.emailComparator); for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = expectedAuthors.get(expectedAuthorNodes.indexOf(expectedAuthorNode)); List<Node> addresses = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "address"); for (Node address : addresses) { for (String emailAddress : XMLTools.extractChildrenTextFromNode(address, "email")) { authorEmail.addExpected(new StringRelation(authorName, emailAddress)); } } for (String emailAddress : XMLTools.extractChildrenTextFromNode(expectedAuthorNode, "email")) { authorEmail.addExpected(new StringRelation(authorName, emailAddress)); } } extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("email".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorEmail.addExtracted(new StringRelation(a, aw)); } } } authorsEmails.add(authorEmail); authorEmail.print(mode, "author - email"); // Journal title MetadataSingle journal = new MetadataSingle(originalNlm, "/article/front/journal-meta//journal-title", extractedNlm, "//monogr/title[@level='j' and @type='main']"); journal.setComp(EvaluationUtils.journalComparator); journals.add(journal); journal.print(mode, "journal title"); // Volume MetadataSingle volume = new MetadataSingle(originalNlm, "/article/front/article-meta/volume", extractedNlm, "//monogr/imprint/biblScope[@unit='volume']"); volumes.add(volume); volume.print(mode, "volume"); // Issue MetadataSingle issue = new MetadataSingle(originalNlm, "/article/front/article-meta/issue", extractedNlm, "//monogr/imprint/biblScope[@unit='issue']"); issues.add(issue); issue.print(mode, "issue"); // Pages range MetadataSingle fPage = new MetadataSingle(originalNlm, "/article/front/article-meta/fpage", extractedNlm, 
"//monogr/imprint/biblScope[@unit='page']/@from"); MetadataSingle lPage = new MetadataSingle(originalNlm, "/article/front/article-meta/lpage", extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@to"); String expRange = fPage.hasExpected() && lPage.hasExpected() ? fPage.getExpectedValue() + "--" + lPage.getExpectedValue() : ""; String extrRange = fPage.hasExtracted() && lPage.hasExtracted() ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue() : ""; MetadataSingle pageRange = new MetadataSingle(expRange, extrRange); pageRanges.add(pageRange); pageRange.print(mode, "pages"); // Publication date List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta/pub-date"); expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate); List<Node> extractedPubDates = XMLTools.extractNodes(extractedNlm, "//teiHeader//date[@type='published']"); List<String> extractedPubDate = Lists.newArrayList(); if (!extractedPubDates.isEmpty()) { Node pubDate = extractedPubDates.get(0); String date = pubDate.getTextContent(); if (pubDate.getAttributes().getNamedItem("when") != null) { date = pubDate.getAttributes().getNamedItem("when").getTextContent(); } extractedPubDate = Lists.newArrayList(date.split("-")); extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate); } MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"), StringUtils.join(extractedPubDate, "---")); year.setComp(EvaluationUtils.yearComparator); years.add(year); year.print(mode, "year"); // DOI MetadataSingle doi = new MetadataSingle(originalNlm, "/article/front/article-meta/article-id[@pub-id-type='doi']", extractedNlm, "//teiHeader//idno[@type='DOI']"); dois.add(doi); doi.print(mode, "DOI"); // References List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref"); //nxml List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//listBibl/biblStruct");//grobid List<String> originalRefs = new ArrayList<String>(); List<String> extractedRefs = new ArrayList<String>(); for (Node originalRefNode : originalRefNodes) { originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim()); } for (Node extractedRefNode : extractedRefNodes) { extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim()); } MetadataList refs = new MetadataList(originalRefs, extractedRefs); refs.setComp(EvaluationUtils.cosineComparator(0.6)); references.add(refs); refs.print(mode, "references"); if (mode == 1) { System.out.println("1"); } } if (mode != 1) { System.out.println("==== Summary (" + iter.size() + " docs)===="); PrecisionRecall titlePR = new PrecisionRecall().build(titles); titlePR.print("Title"); PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts); abstractPR.print("Abstract"); PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords); keywordsPR.print("Keywords"); PrecisionRecall authorsPR = new PrecisionRecall().build(authors); authorsPR.print("Authors"); PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations); affiliationsPR.print("Affiliations"); PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations); authorsAffiliationsPR.print("Author - affiliation"); PrecisionRecall emailsPR = new PrecisionRecall().build(emails); emailsPR.print("Emails"); PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails); authorsEmailsPR.print("Author - email"); PrecisionRecall journalPR = new 
PrecisionRecall().build(journals); journalPR.print("Journal"); PrecisionRecall volumePR = new PrecisionRecall().build(volumes); volumePR.print("Volume"); PrecisionRecall issuePR = new PrecisionRecall().build(issues); issuePR.print("Issue"); PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges); pageRangePR.print("Pages"); PrecisionRecall yearPR = new PrecisionRecall().build(years); yearPR.print("Year"); PrecisionRecall doiPR = new PrecisionRecall().build(dois); doiPR.print("DOI"); PrecisionRecall refsPR = new PrecisionRecall().build(references); refsPR.print("References"); List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR, abstractPR, keywordsPR, yearPR, doiPR); double avgPrecision = 0; double avgRecall = 0; double avgF1 = 0; for (PrecisionRecall result : results) { avgPrecision += result.getPrecision(); avgRecall += result.getRecall(); avgF1 += result.getF1(); } avgPrecision /= results.size(); avgRecall /= results.size(); avgF1 /= results.size(); System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision); System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall); System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1); } }
From source file:pl.edu.icm.cermine.evaluation.ParsCitFinalMetadataExtractionEvaluation.java
public void evaluate(int mode, NlmIterator iter) throws AnalysisException, IOException, TransformationException,
        ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException {
    javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

    SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser");
    builder.setValidation(false);
    builder.setFeature("http://xml.org/sax/features/validation", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
    List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
    List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
    List<ComparisonResult> references = new ArrayList<ComparisonResult>();

    if (mode == 1) {
        System.out.println("path,pcit_title,pcit_abstract,pcit_keywords,"
                + "pcit_authors,pcit_affs,pcit_email,pcit_refs,one");
    }

    int i = 0;
    for (NlmPair pair : iter) {
        i++;
        if (mode == 0) {
            System.out.println("");
            System.out.println(">>>>>>>>> " + i);
            System.out.println(pair.getExtractedNlm().getPath());
        }
        if (mode == 1) {
            System.out.print(pair.getOriginalNlm().getPath() + ",");
        }

        org.w3c.dom.Document originalNlm;
        org.w3c.dom.Document extractedNlm;
        try {
            originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm()));
            extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm()));
        } catch (SAXException ex) {
            i--;
            continue;
        }

        // Title
        String expectedTitle = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta//article-title");
        List<Node> extractedTitleNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//title");
        String extractedTitle = null;
        double confidence = 0;
        for (Node extractedTitleNode : extractedTitleNodes) {
            if (extractedTitle == null) {
                extractedTitle = extractedTitleNode.getTextContent();
            }
            Node conf = extractedTitleNode.getAttributes().getNamedItem("confidence");
            if (conf != null) {
                double actConf = Double.valueOf(conf.getNodeValue());
                if (actConf > confidence) {
                    confidence = actConf;
                    extractedTitle = extractedTitleNode.getTextContent();
                }
            }
        }
        MetadataSingle title = new MetadataSingle(expectedTitle, extractedTitle);
        title.setComp(EvaluationUtils.swComparator);
        titles.add(title);
        title.print(mode, "title");

        // Abstract
        String expectedAbstract = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta/abstract");
        List<Node> extractedAbstractNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//abstract");
        String extractedAbstract = null;
        confidence = 0;
        for (Node extractedAbstractNode : extractedAbstractNodes) {
            if (extractedAbstract == null) {
                extractedAbstract = extractedAbstractNode.getTextContent();
            }
            Node conf = extractedAbstractNode.getAttributes().getNamedItem("confidence");
            if (conf != null) {
                double actConf = Double.valueOf(conf.getNodeValue());
                if (actConf > confidence) {
                    confidence = actConf;
                    extractedAbstract = extractedAbstractNode.getTextContent();
                }
            }
        }
        MetadataSingle abstrakt = new MetadataSingle(expectedAbstract, extractedAbstract);
        abstrakt.setComp(EvaluationUtils.swComparator);
        abstracts.add(abstrakt);
        abstrakt.print(mode, "abstract");

        // Keywords
        MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd",
                extractedNlm, "//algorithm[@name='ParsHed']//keyword");
        keywords.add(keyword);
        keyword.print(mode, "keywords");

        // Authors
        List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]");
        List<String> expectedAuthors = new ArrayList<String>();
        for (Node authorNode : expectedAuthorNodes) {
            List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name");
            if (names.isEmpty()) {
                continue;
            }
            Node name = names.get(0);
            List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname");
            String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
            expectedAuthors.add(author);
        }
        List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//author");
        List<String> extractedAuthors = new ArrayList<String>();
        for (Node authorNode : extractedAuthorNodes) {
            String author = XMLTools.extractTextFromNode(authorNode);
            extractedAuthors.add(author);
        }
        MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
        author.setComp(EvaluationUtils.authorComparator);
        authors.add(author);
        author.print(mode, "author");

        // Affiliations
        Set<String> expectedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff"));
        Set<String> extractedAffiliationsSet = Sets.newHashSet(
                XMLTools.extractTextAsList(extractedNlm, "//algorithm[@name='ParsHed']//affiliation"));
        List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
        List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
        MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
        affiliation.setComp(EvaluationUtils.cosineComparator());
        affiliations.add(affiliation);
        affiliation.print(mode, "affiliation");

        // Email addresses
        MetadataList email = new MetadataList(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email",
                extractedNlm, "//algorithm[@name='ParsHed']//email");
        email.setComp(EvaluationUtils.emailComparator);
        emails.add(email);
        email.print(mode, "email");

        // References
        List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref");
        List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsCit']//citationList/citation/rawString");
        List<String> originalRefs = new ArrayList<String>();
        List<String> extractedRefs = new ArrayList<String>();
        for (Node originalRefNode : originalRefNodes) {
            originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
        }
        for (Node extractedRefNode : extractedRefNodes) {
            extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
        }
        MetadataList refs = new MetadataList(originalRefs, extractedRefs);
        refs.setComp(EvaluationUtils.cosineComparator(0.6));
        references.add(refs);
        refs.print(mode, "references");

        if (mode == 1) {
            System.out.println("1");
        }
    }

    if (mode != 1) {
        System.out.println("==== Summary (" + iter.size() + " docs)====");
        PrecisionRecall titlePR = new PrecisionRecall().build(titles);
        titlePR.print("Title");
        PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
        abstractPR.print("Abstract");
        PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
        keywordsPR.print("Keywords");
        PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
        authorsPR.print("Authors");
        PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
        affiliationsPR.print("Affiliations");
        PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
        emailsPR.print("Emails");
        PrecisionRecall refsPR = new PrecisionRecall().build(references);
        refsPR.print("References");

        List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                abstractPR, keywordsPR, refsPR);
        double avgPrecision = 0;
        double avgRecall = 0;
        double avgF1 = 0;
        for (PrecisionRecall result : results) {
            avgPrecision += result.getPrecision();
            avgRecall += result.getRecall();
            avgF1 += result.getF1();
        }
        avgPrecision /= results.size();
        avgRecall /= results.size();
        avgF1 /= results.size();
        System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
        System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
        System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
    }
}
From source file:pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException,
        ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor();
    BxDocument bxDoc = structureExtractor.extractStructure(pdfStream);
    Integer bxDocLen = bxDoc.asZones().size();
    SmartHashMap entries = new SmartHashMap();

    // abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    // title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);

    // journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    // journal publisher
    String journalPublisherString = (String) xpath.evaluate(
            "/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    // journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    // copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    // license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    // article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));
    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    // received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : StringTools.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    // accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : StringTools.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    // publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate(
                "/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);

    // keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    // DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    // volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    // issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! 
=> do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); 
tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + 
journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = StringTools.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = bxDoc.asZones().get(zoneIdx); List<String> zoneTokens = StringTools.tokenize(StringTools .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } printlnVerbose("==========================="); for (BxPage page : bxDoc.getPages()) { for (BxZone zone : page.getZones()) { Integer zoneIdx = bxDoc.asZones().indexOf(zone); BxZone curZone = bxDoc.asZones().get(zoneIdx); String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase()); List<String> zoneTokens = StringTools.tokenize(zoneText); Boolean valueSet = false; Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<String> entryTokens = 
swLabelSim.get(zoneIdx).get(0).entryTokens; if (Math.max(zoneTokens.size(), entryTokens.size()) > 0 && Math.min(zoneTokens.size(), entryTokens.size()) / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7 && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("0 "); } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment - t2.alignment; if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size()); if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("1 "); } } if (!valueSet) { Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class); for (LabelTrio trio : swLabelSim.get(zoneIdx)) { if (cumulated.containsKey(trio.label)) { cumulated.put(trio.label, cumulated.get(trio.label) + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } else { cumulated.put(trio.label, trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } } Double max = Double.NEGATIVE_INFINITY; BxZoneLabel bestLabel = null; for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) { if (entry.getValue() > max) { max = entry.getValue(); bestLabel = entry.getKey(); } } if (max >= 0.5) { curZone.setLabel(bestLabel); printVerbose("2 "); valueSet = true; } } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<LabelTrio> l = swLabelSim.get(zoneIdx); BxZoneLabel best = null; int bestScore = 0; for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { if (lt.entryTokens.contains(zt)) { i++; } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } if (best != null) { curZone.setLabel(best); valueSet = true; } else { for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { for (String j : lt.entryTokens) { if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "") .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) { i++; break; } } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } } if (best != null) { curZone.setLabel(best); valueSet = true; } } if (!valueSet) { curZone.setLabel(null); } printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n"); } Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>(); Set<BxZone> unlabeledZones = new HashSet<BxZone>(); for (BxZone zone : page.getZones()) { if (zone.getLabel() == null) { unlabeledZones.add(zone); zoneLocMap.put(zone, new ZoneLocaliser(zone)); } } Integer lastNumberOfUnlabeledZones; do { lastNumberOfUnlabeledZones = unlabeledZones.size(); infereLabels(unlabeledZones, zoneLocMap); infereLabels(unlabeledZones, zoneLocMap); } while (lastNumberOfUnlabeledZones != unlabeledZones.size()); } 
printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>="); return bxDoc; }
From source file:pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException { XPath xpath = XPathFactory.newInstance().newXPath(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*ww w. ja v a 2s . com*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); DocumentBuilder builder = dbf.newDocumentBuilder(); Document domDoc = builder.parse(nlmStream); TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader(); Reader r = new InputStreamReader(pdfStream); BxDocument bxDoc = new BxDocument().setPages(reader.read(r)); List<BxZone> zones = Lists.newArrayList(bxDoc.asZones()); Integer bxDocLen = zones.size(); SmartHashMap entries = new SmartHashMap(); //abstract Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc, XPathConstants.NODE); String abstractString = XMLTools.extractTextFromNode(abstractNode); entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT); entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT); //title String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title", domDoc, XPathConstants.STRING); entries.putIf(titleString, BxZoneLabel.MET_TITLE); String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle", domDoc, XPathConstants.STRING); entries.putIf(subtitleString, BxZoneLabel.MET_TITLE); //journal title String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc, XPathConstants.STRING); if (journalTitleString == null || journalTitleString.isEmpty()) { journalTitleString = (String) xpath.evaluate( "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING); } entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO); //journal publisher String journalPublisherString = (String) xpath .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO); String journalPublisherIdString = (String) xpath.evaluate( "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO); //journal issn String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc, XPathConstants.STRING); entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO); //copyright/permissions String permissionsString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE)); entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT); //license Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc, XPathConstants.NODE); String licenseString = (String) XMLTools.extractTextFromNode(licenseNode); entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT); //article type NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc, XPathConstants.NODESET); List<String> articleTypeStrings = 
XMLTools.extractTextAsList(articleTypeNodes); Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group", domDoc, XPathConstants.NODE); articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode)); entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE); //received date List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE)); if (!receivedDate.isEmpty() && receivedDate.size() >= 3) { for (String date : TextUtils.produceDates(receivedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //accepted date List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE)); if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) { for (String date : TextUtils.produceDates(acceptedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //publication date List<String> pubdateString; if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } else { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } pubdateString.clear(); if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } String extLink = (String) xpath.evaluate( "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc, XPathConstants.STRING); printlnVerbose(extLink); entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA); //keywords Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc, XPathConstants.NODE); String keywordsString = XMLTools.extractTextFromNode(keywordsNode); entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS); //DOI String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc, XPathConstants.STRING); entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO); //volume String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc, XPathConstants.STRING); entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO); entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO); //issue String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc, XPathConstants.STRING); entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO); entries.putIf("journal", BxZoneLabel.MET_BIB_INFO); entries.putIf("et al", BxZoneLabel.MET_BIB_INFO); List<String> authorNames = new 
ArrayList<String>(); List<String> authorEmails = new ArrayList<String>(); List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! 
=> do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); 
tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + 
journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = TextUtils.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = zones.get(zoneIdx); List<String> zoneTokens = TextUtils.tokenize( TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } for (BxPage pp : bxDoc) { boolean changed = true; while (changed) { changed = false; boolean wasIntro = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); int i = zones.indexOf(z); double titleAl = 0; double authorAl = 0; List<LabelTrio> sims = swLabelSim.get(i); for (LabelTrio t : sims) { if (t.label.equals(BxZoneLabel.MET_TITLE)) { titleAl = t.alignment / t.entryTokens.size(); } if (t.label.equals(BxZoneLabel.MET_AUTHOR)) { authorAl = t.alignment / t.entryTokens.size(); } } String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); int linesCount = z.childrenCount(); int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent()); BxLine firstLine = z.getFirstChild(); if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.MET_TITLE) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) && titleAl >= 0.7 && authorAl >= 0.4) { 
z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR); } if (linesCount == 2 && text.contains("page") && text.contains("of") && text.contains("page number not for")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (linesCount == 1 && (text.contains("page number not for") || (text.contains("page") && text.contains("of")))) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && linesCount < 11 && (text.contains("department") || text.contains("university"))) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (linesCount < 5 && firstLine.toText().length() < 11 && firstLine.toText().startsWith("Figure") && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx > 0 && z.hasPrev() && z.hasNext() && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_DATES) || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)) && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) && z.getWidth() < 100) { if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2; double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2; double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } } if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION) || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:") || text.contains(" volume ") || text.contains("vol\\. 
") || text.contains("doi"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) { z.setLabel(BxZoneLabel.MET_EDITOR); } if (pageIdx == 0 && text.startsWith("copyright:")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume") && text.contains("issue")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.MET_AUTHOR) || z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { BxPage p = z.getParent(); if (pageIdx > 0) { BxPage prevPage = p.getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 1) { BxPage nextPage = p.getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx > 1) { BxPage prevPage = p.getPrev().getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 2) { BxPage nextPage = p.getNext().getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO) || z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+") && text.length() <= 4 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (text.equals("acknowledgments")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("introduction") && z.hasPrev() && !z.getPrev().toText().toLowerCase().equals("abstract")) { wasIntro = true; } if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references") && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10 && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if 
((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES) && (text.matches("[1-9][0-9]?[0-9]?\\.?") || text.matches(".*[1-2][0-9][0-9][0-9].*"))) { z.setLabel(BxZoneLabel.REFERENCES); } if ((z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (text.startsWith("doi") || text.startsWith("cite this article"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("author details")) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (firstLine.toText().toLowerCase().equals("acknowledgments") || firstLine.toText().toLowerCase().equals("acknowledgements"))) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100) && text.matches("sup-[0-9][0-9]?")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("references")) { z.setLabel(BxZoneLabel.REFERENCES); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?") || firstLine.toText().matches("F[iI][gG]\\. 
[0-9IV][0-9IV]?[0-9IV]?"))) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT) && text.contains("this article is distributed")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("journal")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("correspondence")) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && text.contains("accepted") && text.contains("published")) { z.setLabel(BxZoneLabel.MET_DATES); } if (pageIdx == 0 && linesCount < 10 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4 && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) { if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) { z.setLabel(z.getPrev().getLabel()); } } if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with") || text.contains("will be the most significant development") || text.contains("disseminating the results of biomedical") || text.contains("sir paul nurse") || text.contains("your research papers") || text.contains("available free of charge") || text.contains("peer reviewed and published") || text.contains("cited in pubmed and archived") || text.contains("you keep the copyright") || text.contains("submit your manuscript") || text.contains("submit your next manuscript") || text.contains("online submission") || text.contains("peer review") || text.contains("space constraints") || text.contains("publication on acceptance") || text.contains("inclusion in pubmed") || text.contains("freely available") || text.contains("publication history"))) { z.setLabel(BxZoneLabel.OTH_UNKNOWN); } if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest") || text.startsWith("competing interests") || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest") || z.getPrev().toText().toLowerCase().equals("conflict of interest") || z.getPrev().toText().toLowerCase().equals("competing interests")))) { z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT); } changed = changed || !orig.equals(z.getLabel()); } boolean wasAuthor = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor && ((text.contains("email") && text.contains("@")) || text.startsWith("correspondence"))) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) { wasAuthor = true; } changed = changed || !orig.equals(z.getLabel()); } } } return bxDoc; }
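Most of the rule-based generator above is CERMINE-specific zone relabeling; the XML side reduces to the same non-validating factory plus XPath queries over the parsed NLM document. The following is a hedged, standalone sketch of that pattern: parse with external DTD loading disabled, then pull the article title with the same XPath expression used above. The class name and the command-line file argument are assumptions for illustration.

import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class NlmTitleExtractor {
    public static void main(String[] args) throws Exception {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        // Skip DTD downloads so parsing works offline and does not stall on remote hosts.
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        DocumentBuilder builder = dbf.newDocumentBuilder();

        Document domDoc = builder.parse(new File(args[0])); // path to an NLM/JATS article
        XPath xpath = XPathFactory.newInstance().newXPath();
        String title = (String) xpath.evaluate(
                "/article/front/article-meta/title-group/article-title",
                domDoc, XPathConstants.STRING);
        System.out.println("article-title: " + title);
    }
}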
From source file:sernet.gs.ui.rcp.main.LoggerInitializerTest.java
private Document loadLog4jFile(String name) throws ParserConfigurationException, SAXException, IOException {
    URL customLog4jFile = getClass().getResource(name);
    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    // Do not resolve the log4j.dtd referenced by the configuration file's DOCTYPE.
    documentBuilderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
    return documentBuilder.parse(customLog4jFile.getPath());
}
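The test helper above shows a common use of setFeature: disabling load-external-dtd so a log4j XML configuration, whose DOCTYPE points at log4j.dtd, can be parsed even when the DTD is not available. A minimal standalone version, with a hypothetical class name and a file path taken from the command line, might look like this:

import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;

public class Log4jConfigReader {

    public static Document read(File log4jXml) throws Exception {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        // log4j.xml usually declares <!DOCTYPE ... "log4j.dtd">; skip resolving it.
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        DocumentBuilder builder = dbf.newDocumentBuilder();
        return builder.parse(log4jXml);
    }

    public static void main(String[] args) throws Exception {
        Document doc = read(new File(args[0])); // path to a log4j.xml file
        System.out.println(doc.getDocumentElement().getTagName()); // typically "log4j:configuration"
    }
}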
From source file:sh.isaac.api.util.ArtifactUtilities.java
/**
 * Make maven relative path.
 *
 * @param baseMavenURL - optional - but required if you are downloading a SNAPSHOT dependency, as this method will
 *            need to download the metadata file from the repository server in order to determine the proper
 *            version component for the SNAPSHOT.
 * @param mavenUsername - optional - only used for a SNAPSHOT dependency
 * @param mavenPassword - optional - only used for a SNAPSHOT dependency
 * @param groupId the group id
 * @param artifactId the artifact id
 * @param version the version
 * @param classifier - optional
 * @param type the type
 * @return the string
 * @throws Exception the exception
 */
public static String makeMavenRelativePath(String baseMavenURL, String mavenUsername, String mavenPassword,
        String groupId, String artifactId, String version, String classifier, String type) throws Exception {
    final String temp = groupId.replaceAll("\\.", "/");
    String snapshotVersion = "";
    String versionWithoutSnapshot = version;

    if (version.endsWith("-SNAPSHOT")) {
        versionWithoutSnapshot = version.substring(0, version.lastIndexOf("-SNAPSHOT"));

        final URL metadataUrl = new URL(baseMavenURL + (baseMavenURL.endsWith("/") ? "" : "/") + temp + "/"
                + artifactId + "/" + version + "/maven-metadata.xml");

        // Need to download the maven-metadata.xml file
        final Task<File> task = new DownloadUnzipTask(mavenUsername, mavenPassword, metadataUrl, false, false, null);

        WorkExecutors.get().getExecutor().execute(task);

        final File metadataFile = task.get();
        final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();

        // added to avoid XXE injections
        domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);

        DocumentBuilder builder;
        Document dDoc = null;
        final XPath xPath = XPathFactory.newInstance().newXPath();

        builder = domFactory.newDocumentBuilder();
        dDoc = builder.parse(metadataFile);

        final String timestamp = ((Node) xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc,
                XPathConstants.NODE)).getTextContent();
        final String buildNumber = ((Node) xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc,
                XPathConstants.NODE)).getTextContent();

        snapshotVersion = "-" + timestamp + "-" + buildNumber;
        metadataFile.delete();

        // The download task makes a subfolder in temp for this, delete that too
        metadataFile.getParentFile().delete();
    }

    return temp + "/" + artifactId + "/" + version + "/" + artifactId + "-" + versionWithoutSnapshot
            + snapshotVersion + (StringUtils.isNotBlank(classifier) ? "-" + classifier : "") + "." + type;
}
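Only a small part of the method above concerns setFeature: the maven-metadata.xml downloaded for a SNAPSHOT version is parsed with disallow-doctype-decl enabled to avoid XXE injection. The sketch below isolates that hardened parse and the two XPath lookups; the download step is omitted, and the SnapshotMetadataReader class name is an assumption, not part of the ISAAC code.

import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class SnapshotMetadataReader {

    /** Parses a maven-metadata.xml and returns the "-timestamp-buildNumber" snapshot suffix. */
    public static String snapshotSuffix(File mavenMetadataXml) throws Exception {
        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
        // Refuse any DOCTYPE to block XXE / external-entity tricks in downloaded metadata.
        domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        DocumentBuilder builder = domFactory.newDocumentBuilder();
        Document dDoc = builder.parse(mavenMetadataXml);

        XPath xPath = XPathFactory.newInstance().newXPath();
        String timestamp = xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc);
        String buildNumber = xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc);
        return "-" + timestamp + "-" + buildNumber;
    }

    public static void main(String[] args) throws Exception {
        // args[0]: path to an already-downloaded maven-metadata.xml
        System.out.println(snapshotSuffix(new File(args[0])));
    }
}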
From source file:uk.me.jeffsutton.pojogen.SimplePOJO.java
public Document parse(BufferedReader xml) throws IOException, SAXException, ParserConfigurationException {
    String file = "";
    try {
        String str;
        while ((str = xml.readLine()) != null) {
            file += str;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    file = file.replaceAll("<!DOCTYPE((.|\n|\r)*?)\">", "");

    // convert String into InputStream
    InputStream is = new ByteArrayInputStream(file.getBytes());

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setNamespaceAware(false);
    dbf.setIgnoringComments(true);
    dbf.setValidating(false);
    dbf.setXIncludeAware(true);
    return dbf.newDocumentBuilder().parse(is);
}
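The parse method above applies a double defence: it first strips the DOCTYPE declaration with a regular expression, then also forbids doctypes and external DTDs at the parser level. A compact, self-contained variant of that idea is sketched below; the DoctypeFreeParser class name and the inline sample document are illustrative assumptions only.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;

public class DoctypeFreeParser {

    public static Document parse(String xml) throws Exception {
        // Drop any DOCTYPE declaration up front, then forbid doctypes in the parser as well.
        String cleaned = xml.replaceAll("<!DOCTYPE((.|\n|\r)*?)\">", "");

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbf.setValidating(false);
        dbf.setIgnoringComments(true);

        try (InputStream is = new ByteArrayInputStream(cleaned.getBytes(StandardCharsets.UTF_8))) {
            return dbf.newDocumentBuilder().parse(is);
        }
    }

    public static void main(String[] args) throws Exception {
        Document doc = parse("<!DOCTYPE note SYSTEM \"note.dtd\"><note><to>Reader</to></note>");
        System.out.println(doc.getDocumentElement().getNodeName()); // prints: note
    }
}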