Usage examples for javax.xml.parsers.DocumentBuilderFactory.setFeature(String, boolean)
public abstract void setFeature(String name, boolean value) throws ParserConfigurationException;
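All of the CERMINE examples below share the same setup: the features are switched off on the DocumentBuilderFactory before newDocumentBuilder() is called, so that parsing the NLM/TEI files triggers neither DTD validation nor external DTD downloads. The following is a minimal, self-contained sketch of that pattern distilled from those examples; the class name, the main method, and the input path taken from args[0] are placeholders rather than code from any of the source files, and the two apache.org feature URIs are Xerces-specific, so a different JAXP implementation may reject them with a ParserConfigurationException.

import java.io.File;
import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.xml.sax.SAXException;

public class SetFeatureExample {

    public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        // Feature URIs used by the examples below: skip namespace processing
        // and DTD validation, and do not load external DTDs or DTD grammars.
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // setFeature throws ParserConfigurationException when the underlying
        // parser does not recognise or support the feature name.

        DocumentBuilder documentBuilder = dbf.newDocumentBuilder();
        Document doc = documentBuilder.parse(new File(args[0])); // placeholder input file
        System.out.println("Root element: " + doc.getDocumentElement().getNodeName());
    }
}

Disabling load-external-dtd is what lets the evaluation classes below parse NLM files offline, without trying to resolve the DTD referenced in each file's DOCTYPE declaration.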
From source file:pl.edu.icm.cermine.evaluation.BwmetaGrobidFinalMetadataExtractionEvaluation.java
public void evaluate(int mode, NlmIterator iter) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException { javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance(); dbf.setValidating(false);//from w ww . jav a 2 s . c o m dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder(); SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser"); builder.setValidation(false); builder.setFeature("http://xml.org/sax/features/validation", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); List<ComparisonResult> titles = new ArrayList<ComparisonResult>(); List<ComparisonResult> authors = new ArrayList<ComparisonResult>(); List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> emails = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>(); List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>(); List<ComparisonResult> keywords = new ArrayList<ComparisonResult>(); List<ComparisonResult> journals = new ArrayList<ComparisonResult>(); List<ComparisonResult> volumes = new ArrayList<ComparisonResult>(); List<ComparisonResult> issues = new ArrayList<ComparisonResult>(); List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>(); List<ComparisonResult> years = new ArrayList<ComparisonResult>(); List<ComparisonResult> dois = new ArrayList<ComparisonResult>(); List<ComparisonResult> references = new ArrayList<ComparisonResult>(); if (mode == 1) { System.out.println("path,gro_title,gro_abstract,gro_keywords," + "gro_authors,gro_affs,gro_autaff,gro_email,gro_autemail,gro_journal,gro_volume,gro_issue," + "gro_pages,gro_year,gro_doi,gro_refs,one"); } int i = 0; for (NlmPair pair : iter) { i++; if (mode == 0) { System.out.println(""); System.out.println(">>>>>>>>> " + i); System.out.println(pair.getExtractedNlm().getPath()); } if (mode == 1) { System.out.print(pair.getOriginalNlm().getPath() + ","); } org.w3c.dom.Document originalNlm; org.w3c.dom.Document extractedNlm; try { originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm())); extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm())); } catch (SAXException ex) { i--; continue; } // Document's title MetadataSingle title = new MetadataSingle(originalNlm, "/bwmeta/element/name[not(@type)]", extractedNlm, "//teiHeader//titleStmt/title"); title.setComp(EvaluationUtils.swComparator); titles.add(title); title.print(mode, "title"); // Abstract MetadataSingle abstrakt = new MetadataSingle(originalNlm, "/bwmeta/element/description[@type='abstract']", extractedNlm, "//teiHeader//abstract/p"); abstrakt.setComp(EvaluationUtils.swComparator); abstracts.add(abstrakt); abstrakt.print(mode, "abstract"); // Keywords MetadataList keyword = new MetadataList(originalNlm, 
"/bwmeta/element/tags[@type='keyword']/tag", extractedNlm, "//teiHeader//keywords//term"); keywords.add(keyword); keyword.print(mode, "keywords"); // Authors List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm, "/bwmeta/element/contributor[@role='author']"); List<String> expectedAuthors = new ArrayList<String>(); for (Node authorNode : expectedAuthorNodes) { List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name"); if (names.isEmpty()) { continue; } for (Node n : names) { if (n.getAttributes().getNamedItem("type") != null && n.getAttributes().getNamedItem("type").getTextContent().equals("canonical")) { expectedAuthors.add(n.getTextContent());//.replaceAll("[^a-zA-Z]", "")); break; } } } List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); List<String> extractedAuthors = new ArrayList<String>(); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); extractedAuthors.add(author); } MetadataList author = new MetadataList(expectedAuthors, extractedAuthors); author.setComp(EvaluationUtils.authorComparator); authors.add(author); author.print(mode, "author"); // Affiliations Set<String> expectedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(originalNlm, "/bwmeta/element/affiliation/text")); Set<String> extractedAffiliationsSet = Sets.newHashSet(XMLTools.extractTextAsList(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/affiliation")); List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet); List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet); MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations); affiliation.setComp(EvaluationUtils.cosineComparator()); affiliations.add(affiliation); affiliation.print(mode, "affiliation"); // Author - Affiliation relation MetadataRelation authorAffiliation = new MetadataRelation(); authorAffiliation.setComp1(EvaluationUtils.authorComparator); authorAffiliation.setComp2(EvaluationUtils.cosineComparator()); List<Node> expectedAffiliationNodes = XMLTools.extractNodes(originalNlm, "/bwmeta/element/affiliation"); Map<String, String> expectedAffiliationMap = new HashMap<String, String>(); for (Node expectedAffiliationNode : expectedAffiliationNodes) { String id = expectedAffiliationNode.getAttributes().getNamedItem("id").getNodeValue(); String aff = XMLTools.extractChildrenTextFromNode(expectedAffiliationNode, "text").get(0); expectedAffiliationMap.put(id, aff); } for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = null; List<Node> names = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "name"); if (names.isEmpty()) { continue; } for (Node n : names) { if (n.getAttributes().getNamedItem("type") != null && n.getAttributes().getNamedItem("type").getTextContent().equals("canonical")) { authorName = n.getTextContent();//.replaceAll("[^a-zA-Z]", ""); break; } } if (authorName == null) continue; List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "affiliation-ref"); for (Node xref : xrefs) { String affId = xref.getAttributes().getNamedItem("ref").getNodeValue(); String aff = expectedAffiliationMap.get(affId); if (aff != null) 
authorAffiliation.addExpected(new StringRelation(authorName, aff)); } } extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("affiliation".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorAffiliation.addExtracted(new StringRelation(a, aw)); } } } authorsAffiliations.add(authorAffiliation); authorAffiliation.print(mode, "author - affiliation"); // Email addresses MetadataList email = new MetadataList(originalNlm, "/bwmeta/element/contributor[@role='author']/attribute[@key='contact-email']/value", extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/email"); email.setComp(EvaluationUtils.emailComparator); emails.add(email); email.print(mode, "email"); // Author - Email relations MetadataRelation authorEmail = new MetadataRelation(); authorEmail.setComp1(EvaluationUtils.authorComparator); authorEmail.setComp2(EvaluationUtils.emailComparator); for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = null; List<Node> names = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "name"); if (names.isEmpty()) { continue; } for (Node n : names) { if (n.getAttributes().getNamedItem("type") != null && n.getAttributes().getNamedItem("type").getTextContent().equals("canonical")) { authorName = n.getTextContent(); break; } } if (authorName == null) continue; List<Node> addresses = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "attribute"); for (Node address : addresses) { if ("contact-email".equals(address.getAttributes().getNamedItem("key").getNodeValue())) { String ema = XMLTools.extractChildrenTextFromNode(address, "value").get(0); authorEmail.addExpected(new StringRelation(authorName, ema)); } } } extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("email".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorEmail.addExtracted(new StringRelation(a, aw)); } } } authorsEmails.add(authorEmail); authorEmail.print(mode, "author - email"); // Journal title MetadataSingle journal = new MetadataSingle(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Journal']/name[@type='canonical']", extractedNlm, "//monogr/title[@level='j' and @type='main']"); journal.setComp(EvaluationUtils.journalComparator); journals.add(journal); journal.print(mode, "journal title"); // Volume MetadataSingle volume = new MetadataSingle(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Volume']/name[@type='canonical']", extractedNlm, 
"//monogr/imprint/biblScope[@unit='volume']"); volumes.add(volume); volume.print(mode, "volume"); // Issue MetadataSingle issue = new MetadataSingle(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Number']/name[@type='canonical']", extractedNlm, "//monogr/imprint/biblScope[@unit='issue']"); issues.add(issue); issue.print(mode, "issue"); // Pages range MetadataSingle fPage = new MetadataSingle(originalNlm, "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position", extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@from"); MetadataSingle lPage = new MetadataSingle(originalNlm, "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position", extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@to"); String expRange = fPage.hasExpected() ? fPage.getExpectedValue().replaceAll("-", "--") : ""; String extrRange = fPage.hasExtracted() && lPage.hasExtracted() ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue() : ""; MetadataSingle pageRange = new MetadataSingle(expRange, extrRange); pageRanges.add(pageRange); pageRange.print(mode, "pages"); // Publication date List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm, "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Year']/name[@type='canonical']"); expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate); List<Node> extractedPubDates = XMLTools.extractNodes(extractedNlm, "//teiHeader//date[@type='published']"); List<String> extractedPubDate = Lists.newArrayList(); if (!extractedPubDates.isEmpty()) { Node pubDate = extractedPubDates.get(0); String date = pubDate.getTextContent(); if (pubDate.getAttributes().getNamedItem("when") != null) { date = pubDate.getAttributes().getNamedItem("when").getTextContent(); } extractedPubDate = Lists.newArrayList(date.split("-")); extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate); } MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"), StringUtils.join(extractedPubDate, "---")); year.setComp(EvaluationUtils.yearComparator); years.add(year); year.print(mode, "year"); // DOI MetadataSingle doi = new MetadataSingle(originalNlm, "/bwmeta/element/id[@scheme='bwmeta1.id-class.DOI']/@value", extractedNlm, "//teiHeader//idno[@type='DOI']"); dois.add(doi); doi.print(mode, "DOI"); // References List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//relation[@type='reference-to']/attribute[@key='reference-text']/value"); //bwmeta List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//listBibl/biblStruct");//grobid List<String> originalRefs = new ArrayList<String>(); List<String> extractedRefs = new ArrayList<String>(); for (Node originalRefNode : originalRefNodes) { originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim()); } for (Node extractedRefNode : extractedRefNodes) { extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim()); } MetadataList refs = new MetadataList(originalRefs, extractedRefs); refs.setComp(EvaluationUtils.cosineComparator(0.6)); references.add(refs); refs.print(mode, "references"); if (mode == 1) { System.out.println("1"); } } if (mode != 1) { System.out.println("==== Summary (" + iter.size() + " docs)===="); PrecisionRecall titlePR = new PrecisionRecall().build(titles); titlePR.print("Title"); PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts); abstractPR.print("Abstract"); 
PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords); keywordsPR.print("Keywords"); PrecisionRecall authorsPR = new PrecisionRecall().build(authors); authorsPR.print("Authors"); PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations); affiliationsPR.print("Affiliations"); PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations); authorsAffiliationsPR.print("Author - affiliation"); PrecisionRecall emailsPR = new PrecisionRecall().build(emails); emailsPR.print("Emails"); PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails); authorsEmailsPR.print("Author - email"); PrecisionRecall journalPR = new PrecisionRecall().build(journals); journalPR.print("Journal"); PrecisionRecall volumePR = new PrecisionRecall().build(volumes); volumePR.print("Volume"); PrecisionRecall issuePR = new PrecisionRecall().build(issues); issuePR.print("Issue"); PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges); pageRangePR.print("Pages"); PrecisionRecall yearPR = new PrecisionRecall().build(years); yearPR.print("Year"); PrecisionRecall doiPR = new PrecisionRecall().build(dois); doiPR.print("DOI"); PrecisionRecall refsPR = new PrecisionRecall().build(references); refsPR.print("References"); List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR, abstractPR, keywordsPR, yearPR, doiPR); double avgPrecision = 0; double avgRecall = 0; double avgF1 = 0; for (PrecisionRecall result : results) { avgPrecision += result.getPrecision(); avgRecall += result.getRecall(); avgF1 += result.getF1(); } avgPrecision /= results.size(); avgRecall /= results.size(); avgF1 /= results.size(); System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision); System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall); System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1); } }
From source file:pl.edu.icm.cermine.evaluation.FinalMetadataExtractionEvaluation.java
public void evaluate(NlmIterator iter) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException { javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*w ww . java 2s. co m*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder(); SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser"); builder.setValidation(false); builder.setFeature("http://xml.org/sax/features/validation", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); PrecissonRecall issn = new PrecissonRecall(); PrecissonRecall doi = new PrecissonRecall(); PrecissonRecall volume = new PrecissonRecall(); PrecissonRecall issue = new PrecissonRecall(); PrecissonRecall pages = new PrecissonRecall(); PrecissonRecall dateYear = new PrecissonRecall(); PrecissonRecall dateFull = new PrecissonRecall(); PrecissonRecall journalTitle = new PrecissonRecall(); List<Double> abstractRates = new ArrayList<Double>(iter.size()); List<Double> titleRates = new ArrayList<Double>(iter.size()); List<Double> keywordPrecisions = new ArrayList<Double>(iter.size()); List<Double> keywordRecalls = new ArrayList<Double>(iter.size()); List<Double> authorsPrecisions = new ArrayList<Double>(iter.size()); List<Double> authorsRecalls = new ArrayList<Double>(iter.size()); List<Double> affPrecisions = new ArrayList<Double>(iter.size()); List<Double> affRecalls = new ArrayList<Double>(iter.size()); int ii = 0; for (NlmPair pair : iter) { ii++; System.out.println(""); printVerbose(">>>>>>>>> " + ii); printVerbose(pair.getExtractedNlm().getPath()); org.w3c.dom.Document originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm())); org.w3c.dom.Document extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm())); String expectedTitle = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta//article-title"); String extractedTitle = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/title-group/article-title"); List<Node> expectedAuthorsNodes = XMLTools.extractNodes(originalNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/name"); List<String> expectedAuthors = new ArrayList<String>(); for (Node authorNode : expectedAuthorsNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "given-names"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); author = author.replaceAll("[^a-zA-Z ]", ""); expectedAuthors.add(author); } List<String> extractedAuthors1 = XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/string-name"); List<String> extractedAuthors = new ArrayList<String>(); for (String author : extractedAuthors1) { extractedAuthors.add(author.replaceAll("[^a-zA-Z ]", "")); } List<String> 
expectedKeywords = XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//kwd"); List<String> extractedKeywords = XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta/kwd-group/kwd"); String expectedJournalTitle = XMLTools.extractTextFromNode(originalNlm, "/article/front/journal-meta//journal-title"); String extractedJournalTitle = XMLTools.extractTextFromNode(extractedNlm, "/article/front/journal-meta/journal-title-group/journal-title"); String expectedAbstract = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/abstract"); String extractedAbstract = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/abstract"); String expectedDoi = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/article-id[@pub-id-type='doi']"); String extractedDoi = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/article-id[@pub-id-type='doi']"); String expectedISSN = XMLTools.extractTextFromNode(originalNlm, "/article/front/journal-meta/issn[@pub-type='ppub']"); String extractedISSN = XMLTools.extractTextFromNode(extractedNlm, "/article/front/journal-meta/issn[@pub-type='ppub']"); String expectedVolume = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/volume"); String extractedVolume = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/volume"); String expectedIssue = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/issue"); String extractedIssue = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/issue"); String expectedFPage = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/fpage"); String extractedFPage = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/fpage"); String expectedLPage = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/lpage"); String extractedLPage = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/lpage"); List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta/pub-date"); expectedPubDate = removeLeadingZerosFromDate(expectedPubDate); List<String> extractedPubDate = XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta/pub-date"); extractedPubDate = removeLeadingZerosFromDate(extractedPubDate); Set<String> expectedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff")); Set<String> extractedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta//aff")); List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet); List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet); //equality measures if (!expectedVolume.isEmpty()) { if (expectedVolume.equals(extractedVolume)) { ++volume.correct; } ++volume.expected; } if (!extractedVolume.isEmpty()) { volume.extracted++; } if (!expectedIssue.isEmpty()) { if (expectedIssue.equals(extractedIssue)) { ++issue.correct; } ++issue.expected; } if (!extractedIssue.isEmpty()) { issue.extracted++; } if (!expectedISSN.isEmpty()) { if (extractedISSN.equals(expectedISSN)) { ++issn.correct; } ++issn.expected; } if (!extractedISSN.isEmpty()) { issn.extracted++; } if (!expectedDoi.isEmpty()) { if (expectedDoi.equals(extractedDoi)) { ++doi.correct; } ++doi.expected; } if (!extractedDoi.isEmpty()) { doi.extracted++; } if (!expectedFPage.isEmpty() && !expectedLPage.isEmpty()) { 
if (expectedFPage.equals(extractedFPage) && expectedLPage.equals(extractedLPage)) { ++pages.correct; } ++pages.expected; } if (!extractedFPage.isEmpty() && !extractedLPage.isEmpty()) { pages.extracted++; } if (!expectedPubDate.isEmpty()) { Boolean yearsMatch = DateComparator.yearsMatch(expectedPubDate, extractedPubDate); if (yearsMatch != null) { if (yearsMatch) { ++dateYear.correct; } ++dateYear.expected; } } if (!extractedPubDate.isEmpty()) { dateYear.extracted++; dateFull.extracted++; } //Smith-Waterman distance measures if (expectedAbstract.length() > 0) { abstractRates.add(compareStringsSW(expectedAbstract, extractedAbstract)); } else { abstractRates.add(null); } if (expectedTitle.length() > 0) { titleRates.add(compareStringsSW(expectedTitle, extractedTitle)); } else { titleRates.add(null); } if (!expectedJournalTitle.isEmpty()) { journalTitle.expected++; } if (!extractedJournalTitle.isEmpty()) { journalTitle.extracted++; if (isSubsequence(expectedJournalTitle.replaceAll("[^a-zA-Z]", "").toLowerCase(), extractedJournalTitle.replaceAll("[^a-zA-Z]", "").toLowerCase())) { journalTitle.correct++; } } //precision + recall if (expectedAuthors.size() > 0) { authorsRecalls.add(calculateRecall(expectedAuthors, extractedAuthors)); } else { authorsRecalls.add(null); } if (extractedAuthors.size() > 0) { authorsPrecisions.add(calculatePrecision(expectedAuthors, extractedAuthors)); } else { authorsPrecisions.add(null); } if (expectedKeywords.size() > 0) { keywordRecalls.add(calculateRecall(expectedKeywords, extractedKeywords)); } else { keywordRecalls.add(null); } if (extractedKeywords.size() > 0) { keywordPrecisions.add(calculatePrecision(expectedKeywords, extractedKeywords)); } else { keywordPrecisions.add(null); } if (expectedAffiliations.size() > 0) { affRecalls.add(calculateRecall(expectedAffiliations, extractedAffiliations)); } else { affRecalls.add(null); } if (extractedAffiliations.size() > 0) { affPrecisions.add(calculatePrecision(expectedAffiliations, extractedAffiliations)); } else { affPrecisions.add(null); } System.out.println(""); printVerbose(">>> Expected authors: "); for (String author : expectedAuthors) { printVerbose(author); } System.out.println(""); printVerbose(">>> Extracted authors: "); for (String author : extractedAuthors) { printVerbose(author); } System.out.println(""); printVerbose(">>> Expected keywords: "); for (String keyword : expectedKeywords) { printVerbose(keyword); } System.out.println(""); printVerbose(">>> Extracted keywords: "); for (String keyword : extractedKeywords) { printVerbose(keyword); } printVerbose(">>> Expected journal title: " + expectedJournalTitle); printVerbose(">>> Extracted journal title: " + extractedJournalTitle); printVerbose(">>> Expected article title: " + expectedTitle); printVerbose(">>> Extracted article title: " + extractedTitle); printVerbose(">>> Expected article abstract: " + expectedAbstract); printVerbose(">>> Extracted article abstract: " + extractedAbstract); printVerbose(">>> Expected doi: " + expectedDoi); printVerbose(">>> Extracted doi: " + extractedDoi); printVerbose(">>> Expected issn: " + expectedISSN); printVerbose(">>> Extracted issn: " + extractedISSN); printVerbose(">>> Expected volume: " + expectedVolume); printVerbose(">>> Extracted volume: " + extractedVolume); printVerbose(">>> Expected issue: " + expectedIssue); printVerbose(">>> Extracted issue: " + extractedIssue); printVerbose(">>> Expected pages: " + expectedFPage + " " + expectedLPage); printVerbose(">>> Extracted pages: " + extractedFPage + " " + 
extractedLPage); printVerbose(">>> Expected date: "); for (String date : expectedPubDate) { printVerbose(date); } printVerbose(">>> Extracted date: "); for (String date : extractedPubDate) { printVerbose(date); } printVerbose(">>> Expected affs: "); for (String aff : expectedAffiliations) { printVerbose(aff); } printVerbose(">>> Extracted affs: "); for (String aff : extractedAffiliations) { printVerbose(aff); } printVerbose("abstract " + abstractRates.get(abstractRates.size() - 1)); printVerbose("title " + titleRates.get(titleRates.size() - 1)); printVerbose("journal title " + journalTitle); System.out.println(""); printVerbose("authors precission " + authorsPrecisions.get(authorsPrecisions.size() - 1)); printVerbose("authors recall " + authorsRecalls.get(authorsPrecisions.size() - 1)); System.out.println(""); printVerbose("aff precission " + affPrecisions.get(affPrecisions.size() - 1)); printVerbose("aff recall " + affRecalls.get(affPrecisions.size() - 1)); System.out.println(""); printVerbose("keywords precission " + keywordPrecisions.get(keywordPrecisions.size() - 1)); printVerbose("keywords recall " + keywordRecalls.get(keywordPrecisions.size() - 1)); printVerbose("date years" + dateYear); printVerbose("doi" + doi); printVerbose("issn" + issn); printVerbose("volume" + volume); printVerbose("issue" + issue); printVerbose("pages" + pages); } Double value; System.out.println("==== Summary (" + iter.size() + " docs)===="); if ((value = calculateAverage(abstractRates)) != null) { System.out.printf("abstract avg (SW) \t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(titleRates)) != null) { System.out.printf("title avg (SW) \t\t\t%4.2f\n", 100 * value); } if ((value = journalTitle.calculatePrecission()) != null) { System.out.printf("journal title precission\t\t%4.2f\n", 100 * value); } if ((value = journalTitle.calculateRecall()) != null) { System.out.printf("journal title recall\t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(authorsPrecisions)) != null) { System.out.printf("authors precision avg (EQ)\t%4.2f\n", 100 * value); } if ((value = calculateAverage(authorsRecalls)) != null) { System.out.printf("authors recall avg (EQ)\t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(affPrecisions)) != null) { System.out.printf("aff precision avg (EQ)\t%4.2f\n", 100 * value); } if ((value = calculateAverage(affRecalls)) != null) { System.out.printf("aff recall avg (EQ)\t\t%4.2f\n", 100 * value); } if ((value = calculateAverage(keywordPrecisions)) != null) { System.out.printf("keywords precision avg (EQ)\t%4.2f\n", 100 * value); } if ((value = calculateAverage(keywordRecalls)) != null) { System.out.printf("keywords recall avg (EQ)\t%4.2f\n", 100 * value); } if ((value = dateYear.calculatePrecission()) != null) { System.out.printf("date year precission avg\t\t%4.2f\n", 100 * value); } if ((value = dateYear.calculateRecall()) != null) { System.out.printf("date year recall avg\t\t%4.2f\n", 100 * value); } if ((value = doi.calculatePrecission()) != null) { System.out.printf("doi precission\t\t%4.2f\n", 100 * value); } if ((value = doi.calculateRecall()) != null) { System.out.printf("doi recall\t\t%4.2f\n", 100 * value); } if ((value = issn.calculatePrecission()) != null) { System.out.printf("issn precission\t\t%4.2f\n", 100 * value); } if ((value = issn.calculateRecall()) != null) { System.out.printf("issn recall\t\t%4.2f\n", 100 * value); } if ((value = volume.calculatePrecission()) != null) { System.out.printf("volume precission\t\t%4.2f\n", 100 * value); } if ((value = 
volume.calculateRecall()) != null) { System.out.printf("volume recall\t\t%4.2f\n", 100 * value); } if ((value = issue.calculatePrecission()) != null) { System.out.printf("issue precission\t\t%4.2f\n", 100 * value); } if ((value = issue.calculateRecall()) != null) { System.out.printf("issue recall\t\t%4.2f\n", 100 * value); } if ((value = pages.calculatePrecission()) != null) { System.out.printf("pages precission avg\t\t%4.2f\n", 100 * value); } if ((value = pages.calculateRecall()) != null) { System.out.printf("pages recall avg\t\t%4.2f\n", 100 * value); } }
From source file:pl.edu.icm.cermine.evaluation.GrobidFinalMetadataExtractionEvaluation.java
public void evaluate(int mode, NlmIterator iter) throws AnalysisException, IOException, TransformationException, ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException { javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance(); dbf.setValidating(false);//from w ww .ja va 2 s. c o m dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder(); SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser"); builder.setValidation(false); builder.setFeature("http://xml.org/sax/features/validation", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); List<ComparisonResult> titles = new ArrayList<ComparisonResult>(); List<ComparisonResult> authors = new ArrayList<ComparisonResult>(); List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>(); List<ComparisonResult> emails = new ArrayList<ComparisonResult>(); List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>(); List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>(); List<ComparisonResult> keywords = new ArrayList<ComparisonResult>(); List<ComparisonResult> journals = new ArrayList<ComparisonResult>(); List<ComparisonResult> volumes = new ArrayList<ComparisonResult>(); List<ComparisonResult> issues = new ArrayList<ComparisonResult>(); List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>(); List<ComparisonResult> years = new ArrayList<ComparisonResult>(); List<ComparisonResult> dois = new ArrayList<ComparisonResult>(); List<ComparisonResult> references = new ArrayList<ComparisonResult>(); if (mode == 1) { System.out.println("path,gro_title,gro_abstract,gro_keywords," + "gro_authors,gro_affs,gro_autaff,gro_email,gro_autemail,gro_journal,gro_volume,gro_issue," + "gro_pages,gro_year,gro_doi,gro_refs,one"); } int i = 0; for (NlmPair pair : iter) { i++; if (mode == 0) { System.out.println(""); System.out.println(">>>>>>>>> " + i); System.out.println(pair.getExtractedNlm().getPath()); } if (mode == 1) { System.out.print(pair.getOriginalNlm().getPath() + ","); } org.w3c.dom.Document originalNlm; org.w3c.dom.Document extractedNlm; try { originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm())); extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm())); } catch (SAXException ex) { i--; continue; } // Document's title MetadataSingle title = new MetadataSingle(originalNlm, "/article/front/article-meta//article-title", extractedNlm, "//teiHeader//titleStmt/title"); title.setComp(EvaluationUtils.swComparator); titles.add(title); title.print(mode, "title"); // Abstract MetadataSingle abstrakt = new MetadataSingle(originalNlm, "/article/front/article-meta/abstract", extractedNlm, "//teiHeader//abstract/p"); abstrakt.setComp(EvaluationUtils.swComparator); abstracts.add(abstrakt); abstrakt.print(mode, "abstract"); // Keywords MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd", 
extractedNlm, "//teiHeader//keywords//term"); keywords.add(keyword); keyword.print(mode, "keywords"); // Authors List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]"); List<String> expectedAuthors = new ArrayList<String>(); for (Node authorNode : expectedAuthorNodes) { List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name"); if (names.isEmpty()) { continue; } Node name = names.get(0); List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names"); List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); expectedAuthors.add(author); } List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); List<String> extractedAuthors = new ArrayList<String>(); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); extractedAuthors.add(author); } MetadataList author = new MetadataList(expectedAuthors, extractedAuthors); author.setComp(EvaluationUtils.authorComparator); authors.add(author); author.print(mode, "author"); // Affiliations Set<String> expectedAffiliationsSet = Sets .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff")); Set<String> extractedAffiliationsSet = Sets.newHashSet(XMLTools.extractTextAsList(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/affiliation")); List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet); List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet); MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations); affiliation.setComp(EvaluationUtils.cosineComparator()); affiliations.add(affiliation); affiliation.print(mode, "affiliation"); // Author - Affiliation relation MetadataRelation authorAffiliation = new MetadataRelation(); authorAffiliation.setComp1(EvaluationUtils.authorComparator); authorAffiliation.setComp2(EvaluationUtils.cosineComparator()); List<Node> expectedAffiliationNodes = XMLTools.extractNodes(originalNlm, "/article/front/article-meta//aff[@id]"); Map<String, String> expectedAffiliationMap = new HashMap<String, String>(); for (Node expectedAffiliationNode : expectedAffiliationNodes) { String id = expectedAffiliationNode.getAttributes().getNamedItem("id").getNodeValue(); String aff = XMLTools.extractTextFromNode(expectedAffiliationNode); expectedAffiliationMap.put(id, aff); } for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = expectedAuthors.get(expectedAuthorNodes.indexOf(expectedAuthorNode)); List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "xref"); for (Node xref : xrefs) { if (xref.getAttributes() != null && xref.getAttributes().getNamedItem("ref-type") != null && "aff".equals(xref.getAttributes().getNamedItem("ref-type").getNodeValue())) { String affId = xref.getAttributes().getNamedItem("rid").getNodeValue(); for (String id : affId.split(" ")) { String aff = expectedAffiliationMap.get(id); if (aff != null) { authorAffiliation.addExpected(new StringRelation(authorName, aff)); } } } } } extractedAuthorNodes = 
XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("affiliation".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorAffiliation.addExtracted(new StringRelation(a, aw)); } } } authorsAffiliations.add(authorAffiliation); authorAffiliation.print(mode, "author - affiliation"); // Email addresses MetadataList email = new MetadataList(originalNlm, "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email", extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/email"); email.setComp(EvaluationUtils.emailComparator); emails.add(email); email.print(mode, "email"); // Author - Email relations MetadataRelation authorEmail = new MetadataRelation(); authorEmail.setComp1(EvaluationUtils.authorComparator); authorEmail.setComp2(EvaluationUtils.emailComparator); for (Node expectedAuthorNode : expectedAuthorNodes) { String authorName = expectedAuthors.get(expectedAuthorNodes.indexOf(expectedAuthorNode)); List<Node> addresses = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "address"); for (Node address : addresses) { for (String emailAddress : XMLTools.extractChildrenTextFromNode(address, "email")) { authorEmail.addExpected(new StringRelation(authorName, emailAddress)); } } for (String emailAddress : XMLTools.extractChildrenTextFromNode(expectedAuthorNode, "email")) { authorEmail.addExpected(new StringRelation(authorName, emailAddress)); } } extractedAuthorNodes = XMLTools.extractNodes(extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/persName"); for (Node authorNode : extractedAuthorNodes) { List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename"); List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname"); String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "); Node n = authorNode.getParentNode(); NodeList nl = n.getChildNodes(); for (int iu = 0; iu < nl.getLength(); iu++) { Node aff = nl.item(iu); if ("email".equals(aff.getNodeName())) { String aw = XMLTools.extractTextFromNode(aff); authorEmail.addExtracted(new StringRelation(a, aw)); } } } authorsEmails.add(authorEmail); authorEmail.print(mode, "author - email"); // Journal title MetadataSingle journal = new MetadataSingle(originalNlm, "/article/front/journal-meta//journal-title", extractedNlm, "//monogr/title[@level='j' and @type='main']"); journal.setComp(EvaluationUtils.journalComparator); journals.add(journal); journal.print(mode, "journal title"); // Volume MetadataSingle volume = new MetadataSingle(originalNlm, "/article/front/article-meta/volume", extractedNlm, "//monogr/imprint/biblScope[@unit='volume']"); volumes.add(volume); volume.print(mode, "volume"); // Issue MetadataSingle issue = new MetadataSingle(originalNlm, "/article/front/article-meta/issue", extractedNlm, "//monogr/imprint/biblScope[@unit='issue']"); issues.add(issue); issue.print(mode, "issue"); // Pages range MetadataSingle fPage = new MetadataSingle(originalNlm, "/article/front/article-meta/fpage", extractedNlm, 
"//monogr/imprint/biblScope[@unit='page']/@from"); MetadataSingle lPage = new MetadataSingle(originalNlm, "/article/front/article-meta/lpage", extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@to"); String expRange = fPage.hasExpected() && lPage.hasExpected() ? fPage.getExpectedValue() + "--" + lPage.getExpectedValue() : ""; String extrRange = fPage.hasExtracted() && lPage.hasExtracted() ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue() : ""; MetadataSingle pageRange = new MetadataSingle(expRange, extrRange); pageRanges.add(pageRange); pageRange.print(mode, "pages"); // Publication date List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta/pub-date"); expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate); List<Node> extractedPubDates = XMLTools.extractNodes(extractedNlm, "//teiHeader//date[@type='published']"); List<String> extractedPubDate = Lists.newArrayList(); if (!extractedPubDates.isEmpty()) { Node pubDate = extractedPubDates.get(0); String date = pubDate.getTextContent(); if (pubDate.getAttributes().getNamedItem("when") != null) { date = pubDate.getAttributes().getNamedItem("when").getTextContent(); } extractedPubDate = Lists.newArrayList(date.split("-")); extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate); } MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"), StringUtils.join(extractedPubDate, "---")); year.setComp(EvaluationUtils.yearComparator); years.add(year); year.print(mode, "year"); // DOI MetadataSingle doi = new MetadataSingle(originalNlm, "/article/front/article-meta/article-id[@pub-id-type='doi']", extractedNlm, "//teiHeader//idno[@type='DOI']"); dois.add(doi); doi.print(mode, "DOI"); // References List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref"); //nxml List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//listBibl/biblStruct");//grobid List<String> originalRefs = new ArrayList<String>(); List<String> extractedRefs = new ArrayList<String>(); for (Node originalRefNode : originalRefNodes) { originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim()); } for (Node extractedRefNode : extractedRefNodes) { extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim()); } MetadataList refs = new MetadataList(originalRefs, extractedRefs); refs.setComp(EvaluationUtils.cosineComparator(0.6)); references.add(refs); refs.print(mode, "references"); if (mode == 1) { System.out.println("1"); } } if (mode != 1) { System.out.println("==== Summary (" + iter.size() + " docs)===="); PrecisionRecall titlePR = new PrecisionRecall().build(titles); titlePR.print("Title"); PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts); abstractPR.print("Abstract"); PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords); keywordsPR.print("Keywords"); PrecisionRecall authorsPR = new PrecisionRecall().build(authors); authorsPR.print("Authors"); PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations); affiliationsPR.print("Affiliations"); PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations); authorsAffiliationsPR.print("Author - affiliation"); PrecisionRecall emailsPR = new PrecisionRecall().build(emails); emailsPR.print("Emails"); PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails); authorsEmailsPR.print("Author - email"); PrecisionRecall journalPR = new 
PrecisionRecall().build(journals); journalPR.print("Journal"); PrecisionRecall volumePR = new PrecisionRecall().build(volumes); volumePR.print("Volume"); PrecisionRecall issuePR = new PrecisionRecall().build(issues); issuePR.print("Issue"); PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges); pageRangePR.print("Pages"); PrecisionRecall yearPR = new PrecisionRecall().build(years); yearPR.print("Year"); PrecisionRecall doiPR = new PrecisionRecall().build(dois); doiPR.print("DOI"); PrecisionRecall refsPR = new PrecisionRecall().build(references); refsPR.print("References"); List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR, abstractPR, keywordsPR, yearPR, doiPR); double avgPrecision = 0; double avgRecall = 0; double avgF1 = 0; for (PrecisionRecall result : results) { avgPrecision += result.getPrecision(); avgRecall += result.getRecall(); avgF1 += result.getF1(); } avgPrecision /= results.size(); avgRecall /= results.size(); avgF1 /= results.size(); System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision); System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall); System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1); } }
From source file:pl.edu.icm.cermine.evaluation.ParsCitFinalMetadataExtractionEvaluation.java
public void evaluate(int mode, NlmIterator iter) throws AnalysisException, IOException, TransformationException,
        ParserConfigurationException, SAXException, JDOMException, XPathExpressionException, TransformerException {
    javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

    SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser");
    builder.setValidation(false);
    builder.setFeature("http://xml.org/sax/features/validation", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
    List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
    List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
    List<ComparisonResult> references = new ArrayList<ComparisonResult>();

    if (mode == 1) {
        System.out.println("path,pcit_title,pcit_abstract,pcit_keywords,"
                + "pcit_authors,pcit_affs,pcit_email,pcit_refs,one");
    }

    int i = 0;
    for (NlmPair pair : iter) {
        i++;
        if (mode == 0) {
            System.out.println("");
            System.out.println(">>>>>>>>> " + i);
            System.out.println(pair.getExtractedNlm().getPath());
        }
        if (mode == 1) {
            System.out.print(pair.getOriginalNlm().getPath() + ",");
        }

        org.w3c.dom.Document originalNlm;
        org.w3c.dom.Document extractedNlm;
        try {
            originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm()));
            extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm()));
        } catch (SAXException ex) {
            i--;
            continue;
        }

        // Title
        String expectedTitle = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta//article-title");
        List<Node> extractedTitleNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//title");
        String extractedTitle = null;
        double confidence = 0;
        for (Node extractedTitleNode : extractedTitleNodes) {
            if (extractedTitle == null) {
                extractedTitle = extractedTitleNode.getTextContent();
            }
            Node conf = extractedTitleNode.getAttributes().getNamedItem("confidence");
            if (conf != null) {
                double actConf = Double.valueOf(conf.getNodeValue());
                if (actConf > confidence) {
                    confidence = actConf;
                    extractedTitle = extractedTitleNode.getTextContent();
                }
            }
        }
        MetadataSingle title = new MetadataSingle(expectedTitle, extractedTitle);
        title.setComp(EvaluationUtils.swComparator);
        titles.add(title);
        title.print(mode, "title");

        // Abstract
        String expectedAbstract = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta/abstract");
        List<Node> extractedAbstractNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//abstract");
        String extractedAbstract = null;
        confidence = 0;
        for (Node extractedAbstractNode : extractedAbstractNodes) {
            if (extractedAbstract == null) {
                extractedAbstract = extractedAbstractNode.getTextContent();
            }
            Node conf = extractedAbstractNode.getAttributes().getNamedItem("confidence");
            if (conf != null) {
                double actConf = Double.valueOf(conf.getNodeValue());
                if (actConf > confidence) {
                    confidence = actConf;
                    extractedAbstract = extractedAbstractNode.getTextContent();
                }
            }
        }
        MetadataSingle abstrakt = new MetadataSingle(expectedAbstract, extractedAbstract);
        abstrakt.setComp(EvaluationUtils.swComparator);
        abstracts.add(abstrakt);
        abstrakt.print(mode, "abstract");

        // Keywords
        MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd",
                extractedNlm, "//algorithm[@name='ParsHed']//keyword");
        keywords.add(keyword);
        keyword.print(mode, "keywords");

        // Authors
        List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]");
        List<String> expectedAuthors = new ArrayList<String>();
        for (Node authorNode : expectedAuthorNodes) {
            List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name");
            if (names.isEmpty()) {
                continue;
            }
            Node name = names.get(0);
            List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname");
            String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
            expectedAuthors.add(author);
        }
        List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//author");
        List<String> extractedAuthors = new ArrayList<String>();
        for (Node authorNode : extractedAuthorNodes) {
            String author = XMLTools.extractTextFromNode(authorNode);
            extractedAuthors.add(author);
        }
        MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
        author.setComp(EvaluationUtils.authorComparator);
        authors.add(author);
        author.print(mode, "author");

        // Affiliations
        Set<String> expectedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff"));
        Set<String> extractedAffiliationsSet = Sets.newHashSet(
                XMLTools.extractTextAsList(extractedNlm, "//algorithm[@name='ParsHed']//affiliation"));
        List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
        List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
        MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
        affiliation.setComp(EvaluationUtils.cosineComparator());
        affiliations.add(affiliation);
        affiliation.print(mode, "affiliation");

        // Email addresses
        MetadataList email = new MetadataList(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email",
                extractedNlm, "//algorithm[@name='ParsHed']//email");
        email.setComp(EvaluationUtils.emailComparator);
        emails.add(email);
        email.print(mode, "email");

        // References
        List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref");
        List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsCit']//citationList/citation/rawString");
        List<String> originalRefs = new ArrayList<String>();
        List<String> extractedRefs = new ArrayList<String>();
        for (Node originalRefNode : originalRefNodes) {
            originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
        }
        for (Node extractedRefNode : extractedRefNodes) {
            extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
        }
        MetadataList refs = new MetadataList(originalRefs, extractedRefs);
        refs.setComp(EvaluationUtils.cosineComparator(0.6));
        references.add(refs);
        refs.print(mode, "references");

        if (mode == 1) {
            System.out.println("1");
        }
    }

    if (mode != 1) {
        System.out.println("==== Summary (" + iter.size() + " docs)====");
        PrecisionRecall titlePR = new PrecisionRecall().build(titles);
        titlePR.print("Title");
        PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
        abstractPR.print("Abstract");
        PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
        keywordsPR.print("Keywords");
        PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
        authorsPR.print("Authors");
        PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
        affiliationsPR.print("Affiliations");
        PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
        emailsPR.print("Emails");
        PrecisionRecall refsPR = new PrecisionRecall().build(references);
        refsPR.print("References");

        List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                abstractPR, keywordsPR, refsPR);
        double avgPrecision = 0;
        double avgRecall = 0;
        double avgF1 = 0;
        for (PrecisionRecall result : results) {
            avgPrecision += result.getPrecision();
            avgRecall += result.getRecall();
            avgF1 += result.getF1();
        }
        avgPrecision /= results.size();
        avgRecall /= results.size();
        avgF1 /= results.size();
        System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
        System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
        System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
    }
}
From source file:pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException,
        ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor();
    BxDocument bxDoc = structureExtractor.extractStructure(pdfStream);
    Integer bxDocLen = bxDoc.asZones().size();
    SmartHashMap entries = new SmartHashMap();

    // abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    // title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);

    // journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    // journal publisher
    String journalPublisherString = (String) xpath.evaluate(
            "/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    // journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    // copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    // license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    // article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));
    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    // received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : StringTools.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    // accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : StringTools.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    // publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate(
                "/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);

    // keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    // DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    // volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    // issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! 
=> do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); 
tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + 
journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = StringTools.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = bxDoc.asZones().get(zoneIdx); List<String> zoneTokens = StringTools.tokenize(StringTools .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } printlnVerbose("==========================="); for (BxPage page : bxDoc.getPages()) { for (BxZone zone : page.getZones()) { Integer zoneIdx = bxDoc.asZones().indexOf(zone); BxZone curZone = bxDoc.asZones().get(zoneIdx); String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase()); List<String> zoneTokens = StringTools.tokenize(zoneText); Boolean valueSet = false; Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<String> entryTokens = 
swLabelSim.get(zoneIdx).get(0).entryTokens; if (Math.max(zoneTokens.size(), entryTokens.size()) > 0 && Math.min(zoneTokens.size(), entryTokens.size()) / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7 && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("0 "); } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment - t2.alignment; if (Math.abs(simDif) < 0.0001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size()); if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) { curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label); valueSet = true; printVerbose("1 "); } } if (!valueSet) { Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class); for (LabelTrio trio : swLabelSim.get(zoneIdx)) { if (cumulated.containsKey(trio.label)) { cumulated.put(trio.label, cumulated.get(trio.label) + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } else { cumulated.put(trio.label, trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size())); } } Double max = Double.NEGATIVE_INFINITY; BxZoneLabel bestLabel = null; for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) { if (entry.getValue() > max) { max = entry.getValue(); bestLabel = entry.getKey(); } } if (max >= 0.5) { curZone.setLabel(bestLabel); printVerbose("2 "); valueSet = true; } } if (!valueSet) { Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() { @Override public int compare(LabelTrio t1, LabelTrio t2) { Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size(); if (Math.abs(simDif) < 0.001) { return t2.entryTokens.size() - t1.entryTokens.size(); } if (simDif > 0) { return 1; } else { return -1; } } }); Collections.reverse(swLabelSim.get(zoneIdx)); List<LabelTrio> l = swLabelSim.get(zoneIdx); BxZoneLabel best = null; int bestScore = 0; for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { if (lt.entryTokens.contains(zt)) { i++; } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } if (best != null) { curZone.setLabel(best); valueSet = true; } else { for (LabelTrio lt : l) { int i = 0; for (String zt : zoneTokens) { for (String j : lt.entryTokens) { if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "") .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) { i++; break; } } } if (i > bestScore && i > 1) { best = lt.label; bestScore = i; } } } if (best != null) { curZone.setLabel(best); valueSet = true; } } if (!valueSet) { curZone.setLabel(null); } printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n"); } Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>(); Set<BxZone> unlabeledZones = new HashSet<BxZone>(); for (BxZone zone : page.getZones()) { if (zone.getLabel() == null) { unlabeledZones.add(zone); zoneLocMap.put(zone, new ZoneLocaliser(zone)); } } Integer lastNumberOfUnlabeledZones; do { lastNumberOfUnlabeledZones = unlabeledZones.size(); infereLabels(unlabeledZones, zoneLocMap); infereLabels(unlabeledZones, zoneLocMap); } while (lastNumberOfUnlabeledZones != unlabeledZones.size()); } 
printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>="); return bxDoc; }
From source file:pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream) throws AnalysisException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformationException { XPath xpath = XPathFactory.newInstance().newXPath(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false);/*ww w. ja v a 2s . com*/ dbf.setFeature("http://xml.org/sax/features/namespaces", false); dbf.setFeature("http://xml.org/sax/features/validation", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); DocumentBuilder builder = dbf.newDocumentBuilder(); Document domDoc = builder.parse(nlmStream); TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader(); Reader r = new InputStreamReader(pdfStream); BxDocument bxDoc = new BxDocument().setPages(reader.read(r)); List<BxZone> zones = Lists.newArrayList(bxDoc.asZones()); Integer bxDocLen = zones.size(); SmartHashMap entries = new SmartHashMap(); //abstract Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc, XPathConstants.NODE); String abstractString = XMLTools.extractTextFromNode(abstractNode); entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT); entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT); //title String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title", domDoc, XPathConstants.STRING); entries.putIf(titleString, BxZoneLabel.MET_TITLE); String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle", domDoc, XPathConstants.STRING); entries.putIf(subtitleString, BxZoneLabel.MET_TITLE); //journal title String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc, XPathConstants.STRING); if (journalTitleString == null || journalTitleString.isEmpty()) { journalTitleString = (String) xpath.evaluate( "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING); } entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO); //journal publisher String journalPublisherString = (String) xpath .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO); String journalPublisherIdString = (String) xpath.evaluate( "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc, XPathConstants.STRING); entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO); //journal issn String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc, XPathConstants.STRING); entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO); //copyright/permissions String permissionsString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE)); entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT); //license Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc, XPathConstants.NODE); String licenseString = (String) XMLTools.extractTextFromNode(licenseNode); entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT); //article type NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc, XPathConstants.NODESET); List<String> articleTypeStrings = 
XMLTools.extractTextAsList(articleTypeNodes); Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group", domDoc, XPathConstants.NODE); articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode)); entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE); //received date List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE)); if (!receivedDate.isEmpty() && receivedDate.size() >= 3) { for (String date : TextUtils.produceDates(receivedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //accepted date List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate( "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE)); if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) { for (String date : TextUtils.produceDates(acceptedDate)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } //publication date List<String> pubdateString; if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } else { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } pubdateString.clear(); if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET)) .getLength() > 1) { Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']", domDoc, XPathConstants.NODE); pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode); } if (pubdateString != null && pubdateString.size() >= 3) { for (String date : TextUtils.produceDates(pubdateString)) { entries.putIf(date, BxZoneLabel.MET_DATES); } } String extLink = (String) xpath.evaluate( "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc, XPathConstants.STRING); printlnVerbose(extLink); entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA); //keywords Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc, XPathConstants.NODE); String keywordsString = XMLTools.extractTextFromNode(keywordsNode); entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS); //DOI String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']", domDoc, XPathConstants.STRING); entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO); //volume String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc, XPathConstants.STRING); entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO); entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO); //issue String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc, XPathConstants.STRING); entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO); entries.putIf("journal", BxZoneLabel.MET_BIB_INFO); entries.putIf("et al", BxZoneLabel.MET_BIB_INFO); List<String> authorNames = new 
ArrayList<String>(); List<String> authorEmails = new ArrayList<String>(); List<String> authorAffiliations = new ArrayList<String>(); List<String> editors = new ArrayList<String>(); //pages String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING); String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING); entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER); try { int f = Integer.valueOf(fPage); int l = Integer.valueOf(lPage); while (f < l) { f++; entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER); } } catch (NumberFormatException ex) { } entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER); //editors NodeList editorNodes = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) { String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx)); editors.add(editorString); } entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR); NodeList authorsResult = (NodeList) xpath.evaluate( "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) { Node curNode = authorsResult.item(nodeIdx); //author names String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING); String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING); //author affiliation List<String> aff = XMLTools.extractTextAsList((NodeList) xpath .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET)); //author correspondence String email; try { email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { email = ""; } if (email.isEmpty()) { try { email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING); } catch (XPathExpressionException e) { //yaaay, probably there is no e-mail at all! 
=> do nothing } } if (!email.isEmpty()) { authorEmails.add(email); } if (!aff.isEmpty()) { authorAffiliations.addAll(aff); } authorNames.add(name + " " + surname); } entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR); //authors' affiliations NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc, XPathConstants.NODESET); authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes)); entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION); //correspondence again NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp", domDoc, XPathConstants.NODESET); authorEmails.add(XMLTools.extractTextFromNodes(correspNodes)); entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE); //author notes Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc, XPathConstants.NODE); String notesString = XMLTools.extractTextFromNode(notesNode); entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE); notesString = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE)); //article body NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET); List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes); entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT); NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET); String appStrings = XMLTools.extractTextFromNodes(appNodes); entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT); //section titles NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc, XPathConstants.NODESET); List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes); entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT); NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc, XPathConstants.NODESET); List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes); entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT); //figures NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc, XPathConstants.NODESET); List<String> figureStrings = XMLTools.extractTextAsList(figureNodes); figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET); figureStrings.addAll(XMLTools.extractTextAsList(figureNodes)); entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE); //tables List<String> tableCaptions = new ArrayList<String>(); List<String> tableBodies = new ArrayList<String>(); List<String> tableFootnotes = new ArrayList<String>(); //tableNodes NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET); for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) { Node tableNode = tableNodes.item(nodeIdx); String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING); 
tableCaptions.add(caption); String body = XMLTools .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE)); tableBodies.add(body); List<String> footnotes = XMLTools.extractTextAsList( (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET)); tableFootnotes.addAll(footnotes); entries.putIf(caption, BxZoneLabel.BODY_TABLE); entries.putIf(body, BxZoneLabel.BODY_TABLE); entries.putIf(footnotes, BxZoneLabel.BODY_TABLE); } //financial disclosure String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE)); entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT); //conflict String conflictString = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE)); entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT); //copyright String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate( "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE)); entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT); //acknowledgment String acknowledgement = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT); acknowledgement = XMLTools.extractTextFromNode( (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE)); entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT); //glossary String glossary = XMLTools .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE)); entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY); //formula NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc, XPathConstants.NODESET); for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) { Node curFormulaNode = formulaNodes.item(nodeIdx); String label = (String) xpath.evaluate("label", curFormulaNode); entries.putIf(label, BxZoneLabel.BODY_EQUATION); NodeList curNodeChildren = curFormulaNode.getChildNodes(); List<String> formulaParts = new ArrayList<String>(); for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) { Node curChild = curNodeChildren.item(childIdx); if (curChild.getNodeName().equals("label")) { continue; } formulaParts.add(XMLTools.extractTextFromNode(curChild)); } entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION); } //references List<String> refStrings = new ArrayList<String>(); Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE); if (refParentNode != null) { for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) { refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx))); } } entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES); entries.put("references", BxZoneLabel.REFERENCES); Set<String> allBibInfos = new HashSet<String>(); for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) { allBibInfos.addAll(Arrays.asList(entry.getKey().split(" "))); } } entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO); printlnVerbose("journalTitle: " + journalTitleString); printlnVerbose("journalPublisher: " + journalPublisherString); printlnVerbose("journalISSNPublisher: " + 
journalISSNString); printlnVerbose("articleType: " + articleTypeStrings); printlnVerbose("received: " + receivedDate); printlnVerbose("accepted: " + acceptedDate); printlnVerbose("pubdate: " + pubdateString); printlnVerbose("permissions: " + permissionsString); printlnVerbose("license: " + licenseString); printlnVerbose("title: " + titleString); printlnVerbose("abstract: " + abstractString); printlnVerbose("authorEmails: " + authorEmails); printlnVerbose("authorNames: " + authorNames); printlnVerbose("authorAff: " + authorAffiliations); printlnVerbose("authorNotes: " + notesString); printlnVerbose("editor: " + editors); printlnVerbose("keywords: " + keywordsString); printlnVerbose("DOI: " + doiString); printlnVerbose("volume: " + volumeString); printlnVerbose("issue: " + issueString); printlnVerbose("financial dis.: " + financialDisclosure); printlnVerbose("paragraphs: " + paragraphStrings); printlnVerbose("section titles: " + sectionTitles); printlnVerbose("tableBodies: " + tableBodies); printlnVerbose("tableCaptions: " + tableCaptions); printlnVerbose("tableFootnotes: " + tableFootnotes); printlnVerbose("figures: " + figureStrings); printlnVerbose("acknowledgement: " + acknowledgement); printlnVerbose("ref: " + refStrings.size() + " " + refStrings); SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1); CosineDistance cos = new CosineDistance(); //index: (zone,entry) List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen); List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen); for (Integer i = 0; i < bxDocLen; ++i) { swLabelSim.add(new ArrayList<LabelTrio>()); cosLabProb.add(new ArrayList<LabelTrio>()); } //iterate over entries for (Entry<String, BxZoneLabel> entry : entries.entrySet()) { List<String> entryTokens = TextUtils.tokenize(entry.getKey()); printlnVerbose("--------------------"); printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n"); //iterate over zones for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) { BxZone curZone = zones.get(zoneIdx); List<String> zoneTokens = TextUtils.tokenize( TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase()))); Double smithSim; Double cosSim; if (curZone.toText().contains("www.biomedcentral.com")) { //ignore smithSim = 0.; cosSim = 0.; } else { smithSim = smith.compare(entryTokens, zoneTokens); cosSim = cos.compare(entryTokens, zoneTokens); } printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n"); swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim)); cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim)); } } for (BxPage pp : bxDoc) { boolean changed = true; while (changed) { changed = false; boolean wasIntro = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); int i = zones.indexOf(z); double titleAl = 0; double authorAl = 0; List<LabelTrio> sims = swLabelSim.get(i); for (LabelTrio t : sims) { if (t.label.equals(BxZoneLabel.MET_TITLE)) { titleAl = t.alignment / t.entryTokens.size(); } if (t.label.equals(BxZoneLabel.MET_AUTHOR)) { authorAl = t.alignment / t.entryTokens.size(); } } String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); int linesCount = z.childrenCount(); int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent()); BxLine firstLine = z.getFirstChild(); if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.MET_TITLE) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) && titleAl >= 0.7 && authorAl >= 0.4) { 
z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR); } if (linesCount == 2 && text.contains("page") && text.contains("of") && text.contains("page number not for")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (linesCount == 1 && (text.contains("page number not for") || (text.contains("page") && text.contains("of")))) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && linesCount < 11 && (text.contains("department") || text.contains("university"))) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (linesCount < 5 && firstLine.toText().length() < 11 && firstLine.toText().startsWith("Figure") && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx > 0 && z.hasPrev() && z.hasNext() && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_DATES) || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)) && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) && z.getWidth() < 100) { if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE) && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2; double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) { double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2; double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2; double zMX = z.getX() + z.getWidth() / 2; double zMY = z.getY() + z.getHeight() / 2; if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) { z.setLabel(BxZoneLabel.BODY_TABLE); } } } if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION) || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:") || text.contains(" volume ") || text.contains("vol\\. 
") || text.contains("doi"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) { z.setLabel(BxZoneLabel.MET_EDITOR); } if (pageIdx == 0 && text.startsWith("copyright:")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume") && text.contains("issue")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.MET_AUTHOR) || z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { BxPage p = z.getParent(); if (pageIdx > 0) { BxPage prevPage = p.getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 1) { BxPage nextPage = p.getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx > 1) { BxPage prevPage = p.getPrev().getPrev(); for (BxZone z1 : prevPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } if (pageIdx < bxDoc.childrenCount() - 2) { BxPage nextPage = p.getNext().getNext(); for (BxZone z1 : nextPage) { if (z1.toText().replaceAll("[^a-zA-Z]", "") .equals(z.toText().replaceAll("[^a-zA-Z]", "")) && Math.abs(z1.getY() - z.getY()) < 10) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } } } } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN) || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO) || z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+") && text.length() <= 4 && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if (text.equals("acknowledgments")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("introduction") && z.hasPrev() && !z.getPrev().toText().toLowerCase().equals("abstract")) { wasIntro = true; } if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references") && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10 && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT) && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if 
((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES) && (text.matches("[1-9][0-9]?[0-9]?\\.?") || text.matches(".*[1-2][0-9][0-9][0-9].*"))) { z.setLabel(BxZoneLabel.REFERENCES); } if ((z.getLabel().equals(BxZoneLabel.REFERENCES) || z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (text.startsWith("doi") || text.startsWith("cite this article"))) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("author details")) { z.setLabel(BxZoneLabel.MET_AFFILIATION); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && (firstLine.toText().toLowerCase().equals("acknowledgments") || firstLine.toText().toLowerCase().equals("acknowledgements"))) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) { z.setLabel(BxZoneLabel.BODY_CONTENT); } if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100) && text.matches("sup-[0-9][0-9]?")) { z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER); } if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && firstLine.toText().toLowerCase().equals("references")) { z.setLabel(BxZoneLabel.REFERENCES); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.") || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?") || firstLine.toText().matches("F[iI][gG]\\. 
[0-9IV][0-9IV]?[0-9IV]?"))) { z.setLabel(BxZoneLabel.BODY_FIGURE); } if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText() .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*") || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) { z.setLabel(BxZoneLabel.BODY_TABLE); } if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT) && text.contains("this article is distributed")) { z.setLabel(BxZoneLabel.MET_COPYRIGHT); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("journal")) { z.setLabel(BxZoneLabel.MET_BIB_INFO); } if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA) && text.contains("correspondence")) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (pageIdx == 0 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && text.contains("accepted") && text.contains("published")) { z.setLabel(BxZoneLabel.MET_DATES); } if (pageIdx == 0 && linesCount < 10 && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4 && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) { if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) { z.setLabel(z.getPrev().getLabel()); } } if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with") || text.contains("will be the most significant development") || text.contains("disseminating the results of biomedical") || text.contains("sir paul nurse") || text.contains("your research papers") || text.contains("available free of charge") || text.contains("peer reviewed and published") || text.contains("cited in pubmed and archived") || text.contains("you keep the copyright") || text.contains("submit your manuscript") || text.contains("submit your next manuscript") || text.contains("online submission") || text.contains("peer review") || text.contains("space constraints") || text.contains("publication on acceptance") || text.contains("inclusion in pubmed") || text.contains("freely available") || text.contains("publication history"))) { z.setLabel(BxZoneLabel.OTH_UNKNOWN); } if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) { z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT); } if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest") || text.startsWith("competing interests") || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest") || z.getPrev().toText().toLowerCase().equals("conflict of interest") || z.getPrev().toText().toLowerCase().equals("competing interests")))) { z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT); } changed = changed || !orig.equals(z.getLabel()); } boolean wasAuthor = false; for (BxZone z : pp) { BxZoneLabel orig = z.getLabel(); String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase(); if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor && ((text.contains("email") && text.contains("@")) || text.startsWith("correspondence"))) { z.setLabel(BxZoneLabel.MET_CORRESPONDENCE); } if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) { wasAuthor = true; } changed = changed || !orig.equals(z.getLabel()); } } } return bxDoc; }
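Most of the rule-based generator above is CERMINE-specific zone relabeling; the XML side reduces to the same non-validating factory plus XPath queries over the parsed NLM document. The following is a hedged, standalone sketch of that pattern: parse with external DTD loading disabled, then pull the article title with the same XPath expression used above. The class name and the command-line file argument are assumptions for illustration.

import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class NlmTitleExtractor {
    public static void main(String[] args) throws Exception {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        // Skip DTD downloads so parsing works offline and does not stall on remote hosts.
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        DocumentBuilder builder = dbf.newDocumentBuilder();

        Document domDoc = builder.parse(new File(args[0])); // path to an NLM/JATS article
        XPath xpath = XPathFactory.newInstance().newXPath();
        String title = (String) xpath.evaluate(
                "/article/front/article-meta/title-group/article-title",
                domDoc, XPathConstants.STRING);
        System.out.println("article-title: " + title);
    }
}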
From source file:sernet.gs.ui.rcp.main.LoggerInitializerTest.java
private Document loadLog4jFile(String name) throws ParserConfigurationException, SAXException, IOException {
    URL customLog4jFile = getClass().getResource(name);
    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    // Do not resolve the log4j.dtd referenced by the configuration file's DOCTYPE.
    documentBuilderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
    return documentBuilder.parse(customLog4jFile.getPath());
}
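The test helper above shows a common use of setFeature: disabling load-external-dtd so a log4j XML configuration, whose DOCTYPE points at log4j.dtd, can be parsed even when the DTD is not available. A minimal standalone version, with a hypothetical class name and a file path taken from the command line, might look like this:

import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;

public class Log4jConfigReader {

    public static Document read(File log4jXml) throws Exception {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        // log4j.xml usually declares <!DOCTYPE ... "log4j.dtd">; skip resolving it.
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        DocumentBuilder builder = dbf.newDocumentBuilder();
        return builder.parse(log4jXml);
    }

    public static void main(String[] args) throws Exception {
        Document doc = read(new File(args[0])); // path to a log4j.xml file
        System.out.println(doc.getDocumentElement().getTagName()); // typically "log4j:configuration"
    }
}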
From source file:sh.isaac.api.util.ArtifactUtilities.java
/**
 * Make maven relative path.
 *
 * @param baseMavenURL - optional - but required if you are downloading a SNAPSHOT dependency, as this method will
 *            need to download the metadata file from the repository server in order to determine the proper
 *            version component for the SNAPSHOT.
 * @param mavenUsername - optional - only used for a SNAPSHOT dependency
 * @param mavenPassword - optional - only used for a SNAPSHOT dependency
 * @param groupId the group id
 * @param artifactId the artifact id
 * @param version the version
 * @param classifier - optional
 * @param type the type
 * @return the string
 * @throws Exception the exception
 */
public static String makeMavenRelativePath(String baseMavenURL, String mavenUsername, String mavenPassword,
        String groupId, String artifactId, String version, String classifier, String type) throws Exception {
    final String temp = groupId.replaceAll("\\.", "/");
    String snapshotVersion = "";
    String versionWithoutSnapshot = version;

    if (version.endsWith("-SNAPSHOT")) {
        versionWithoutSnapshot = version.substring(0, version.lastIndexOf("-SNAPSHOT"));

        final URL metadataUrl = new URL(baseMavenURL + (baseMavenURL.endsWith("/") ? "" : "/") + temp + "/"
                + artifactId + "/" + version + "/maven-metadata.xml");

        // Need to download the maven-metadata.xml file
        final Task<File> task = new DownloadUnzipTask(mavenUsername, mavenPassword, metadataUrl, false, false, null);

        WorkExecutors.get().getExecutor().execute(task);

        final File metadataFile = task.get();
        final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();

        // added to avoid XXE injections
        domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);

        DocumentBuilder builder;
        Document dDoc = null;
        final XPath xPath = XPathFactory.newInstance().newXPath();

        builder = domFactory.newDocumentBuilder();
        dDoc = builder.parse(metadataFile);

        final String timestamp = ((Node) xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc,
                XPathConstants.NODE)).getTextContent();
        final String buildNumber = ((Node) xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc,
                XPathConstants.NODE)).getTextContent();

        snapshotVersion = "-" + timestamp + "-" + buildNumber;
        metadataFile.delete();

        // The download task makes a subfolder in temp for this, delete that too
        metadataFile.getParentFile().delete();
    }

    return temp + "/" + artifactId + "/" + version + "/" + artifactId + "-" + versionWithoutSnapshot
            + snapshotVersion + (StringUtils.isNotBlank(classifier) ? "-" + classifier : "") + "." + type;
}
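Only a small part of the method above concerns setFeature: the maven-metadata.xml downloaded for a SNAPSHOT version is parsed with disallow-doctype-decl enabled to avoid XXE injection. The sketch below isolates that hardened parse and the two XPath lookups; the download step is omitted, and the SnapshotMetadataReader class name is an assumption, not part of the ISAAC code.

import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class SnapshotMetadataReader {

    /** Parses a maven-metadata.xml and returns the "-timestamp-buildNumber" snapshot suffix. */
    public static String snapshotSuffix(File mavenMetadataXml) throws Exception {
        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
        // Refuse any DOCTYPE to block XXE / external-entity tricks in downloaded metadata.
        domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        DocumentBuilder builder = domFactory.newDocumentBuilder();
        Document dDoc = builder.parse(mavenMetadataXml);

        XPath xPath = XPathFactory.newInstance().newXPath();
        String timestamp = xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc);
        String buildNumber = xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc);
        return "-" + timestamp + "-" + buildNumber;
    }

    public static void main(String[] args) throws Exception {
        // args[0]: path to an already-downloaded maven-metadata.xml
        System.out.println(snapshotSuffix(new File(args[0])));
    }
}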
From source file:uk.me.jeffsutton.pojogen.SimplePOJO.java
public Document parse(BufferedReader xml) throws IOException, SAXException, ParserConfigurationException {
    String file = "";
    try {
        String str;
        while ((str = xml.readLine()) != null) {
            file += str;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    file = file.replaceAll("<!DOCTYPE((.|\n|\r)*?)\">", "");

    // convert String into InputStream
    InputStream is = new ByteArrayInputStream(file.getBytes());

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setNamespaceAware(false);
    dbf.setIgnoringComments(true);
    dbf.setValidating(false);
    dbf.setXIncludeAware(true);
    return dbf.newDocumentBuilder().parse(is);
}
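The parse method above applies a double defence: it first strips the DOCTYPE declaration with a regular expression, then also forbids doctypes and external DTDs at the parser level. A compact, self-contained variant of that idea is sketched below; the DoctypeFreeParser class name and the inline sample document are illustrative assumptions only.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;

public class DoctypeFreeParser {

    public static Document parse(String xml) throws Exception {
        // Drop any DOCTYPE declaration up front, then forbid doctypes in the parser as well.
        String cleaned = xml.replaceAll("<!DOCTYPE((.|\n|\r)*?)\">", "");

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbf.setValidating(false);
        dbf.setIgnoringComments(true);

        try (InputStream is = new ByteArrayInputStream(cleaned.getBytes(StandardCharsets.UTF_8))) {
            return dbf.newDocumentBuilder().parse(is);
        }
    }

    public static void main(String[] args) throws Exception {
        Document doc = parse("<!DOCTYPE note SYSTEM \"note.dtd\"><note><to>Reader</to></note>");
        System.out.println(doc.getDocumentElement().getNodeName()); // prints: note
    }
}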