Example usage for javax.xml.parsers DocumentBuilderFactory setFeature

List of usage examples for javax.xml.parsers DocumentBuilderFactory setFeature

Introduction

On this page you can find example usages of javax.xml.parsers DocumentBuilderFactory setFeature.

Prototype

public abstract void setFeature(String name, boolean value) throws ParserConfigurationException;

Source Link

Document

Set a feature for this DocumentBuilderFactory and for the DocumentBuilders created by this factory.

Usage

From source file:pl.edu.icm.cermine.evaluation.BwmetaGrobidFinalMetadataExtractionEvaluation.java

/**
 * Compares metadata extracted into TEI documents against BWMeta ground truth for every pair
 * supplied by {@code iter}, printing per-document results and (unless {@code mode == 1}) a
 * precision/recall summary to standard output.
 *
 * <p>Compared fields: title, abstract, keywords, authors, affiliations, author-affiliation
 * relations, e-mails, author-e-mail relations, journal title, volume, issue, page range,
 * publication year, DOI and references. The averaged scores printed at the end cover only title,
 * authors, affiliations, e-mails, abstract, keywords, year and DOI.
 *
 * <p>Both files of a pair are parsed <em>before</em> anything is printed for that pair. A pair
 * whose XML cannot be parsed is reported on standard error and skipped, so it leaves neither a
 * dangling CSV fragment nor an orphaned document header on standard output. The summary header
 * still reports {@code iter.size()}, which includes skipped pairs.
 *
 * @param mode {@code 1} prints a CSV header line, then for each document its ground-truth path
 *             followed by whatever the individual {@code print(mode, ...)} calls emit and a
 *             closing {@code "1"}; {@code 0} prints a numbered header and the extracted file's
 *             path per document; every value other than {@code 1} prints the summary
 * @param iter the ground-truth/extracted document pairs to evaluate
 * @throws IOException if one of the files cannot be opened or read
 * @throws ParserConfigurationException if the XML parser does not support a requested feature
 * @throws XPathExpressionException if an XPath lookup fails
 */
public void evaluate(int mode, NlmIterator iter)
        throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
        SAXException, JDOMException, XPathExpressionException, TransformerException {

    // Non-validating parser with namespace processing switched off that never loads DTDs,
    // neither the grammar nor the external subset.
    javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

    List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
    List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
    List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
    List<ComparisonResult> journals = new ArrayList<ComparisonResult>();
    List<ComparisonResult> volumes = new ArrayList<ComparisonResult>();
    List<ComparisonResult> issues = new ArrayList<ComparisonResult>();
    List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>();
    List<ComparisonResult> years = new ArrayList<ComparisonResult>();
    List<ComparisonResult> dois = new ArrayList<ComparisonResult>();
    List<ComparisonResult> references = new ArrayList<ComparisonResult>();

    if (mode == 1) {
        System.out.println("path,gro_title,gro_abstract,gro_keywords,"
                + "gro_authors,gro_affs,gro_autaff,gro_email,gro_autemail,gro_journal,gro_volume,gro_issue,"
                + "gro_pages,gro_year,gro_doi,gro_refs,one");
    }

    // Counts only the documents that were parsed successfully.
    int i = 0;
    for (NlmPair pair : iter) {
        org.w3c.dom.Document originalNlm;
        org.w3c.dom.Document extractedNlm;
        try {
            originalNlm = parseAndClose(documentBuilder, new FileInputStream(pair.getOriginalNlm()));
            extractedNlm = parseAndClose(documentBuilder, new FileInputStream(pair.getExtractedNlm()));
        } catch (SAXException ex) {
            // Malformed XML on either side: skip the pair, but say so instead of hiding it.
            System.err.println("Skipping unparsable pair " + pair.getOriginalNlm().getPath() + " / "
                    + pair.getExtractedNlm().getPath() + ": " + ex.getMessage());
            continue;
        }
        i++;

        if (mode == 0) {
            System.out.println("");
            System.out.println(">>>>>>>>> " + i);
            System.out.println(pair.getExtractedNlm().getPath());
        }
        if (mode == 1) {
            System.out.print(pair.getOriginalNlm().getPath() + ",");
        }

        // Document's title
        MetadataSingle title = new MetadataSingle(originalNlm, "/bwmeta/element/name[not(@type)]", extractedNlm,
                "//teiHeader//titleStmt/title");
        title.setComp(EvaluationUtils.swComparator);
        titles.add(title);
        title.print(mode, "title");

        // Abstract
        MetadataSingle abstrakt = new MetadataSingle(originalNlm,
                "/bwmeta/element/description[@type='abstract']", extractedNlm, "//teiHeader//abstract/p");
        abstrakt.setComp(EvaluationUtils.swComparator);
        abstracts.add(abstrakt);
        abstrakt.print(mode, "abstract");

        // Keywords
        MetadataList keyword = new MetadataList(originalNlm, "/bwmeta/element/tags[@type='keyword']/tag",
                extractedNlm, "//teiHeader//keywords//term");
        keywords.add(keyword);
        keyword.print(mode, "keywords");

        // Authors
        List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                "/bwmeta/element/contributor[@role='author']");
        // Looked up once and reused by the author, affiliation and e-mail sections below.
        List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//teiHeader//sourceDesc/biblStruct//author/persName");

        List<String> expectedAuthors = new ArrayList<String>();
        for (Node authorNode : expectedAuthorNodes) {
            String canonicalName = canonicalAuthorName(authorNode);
            if (canonicalName != null) {
                expectedAuthors.add(canonicalName);
            }
        }

        List<String> extractedAuthors = new ArrayList<String>();
        for (Node authorNode : extractedAuthorNodes) {
            extractedAuthors.add(teiPersonName(authorNode));
        }

        MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
        author.setComp(EvaluationUtils.authorComparator);
        authors.add(author);
        author.print(mode, "author");

        // Affiliations (de-duplicated on both sides before comparison)
        Set<String> expectedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(originalNlm, "/bwmeta/element/affiliation/text"));
        Set<String> extractedAffiliationsSet = Sets.newHashSet(XMLTools.extractTextAsList(extractedNlm,
                "//teiHeader//sourceDesc/biblStruct//author/affiliation"));
        List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
        List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
        MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
        affiliation.setComp(EvaluationUtils.cosineComparator());
        affiliations.add(affiliation);
        affiliation.print(mode, "affiliation");

        // Author - Affiliation relation
        MetadataRelation authorAffiliation = new MetadataRelation();
        authorAffiliation.setComp1(EvaluationUtils.authorComparator);
        authorAffiliation.setComp2(EvaluationUtils.cosineComparator());

        // Ground-truth affiliation id -> affiliation text.
        List<Node> expectedAffiliationNodes = XMLTools.extractNodes(originalNlm, "/bwmeta/element/affiliation");
        Map<String, String> expectedAffiliationMap = new HashMap<String, String>();
        for (Node expectedAffiliationNode : expectedAffiliationNodes) {
            Node idAttr = expectedAffiliationNode.getAttributes().getNamedItem("id");
            List<String> texts = XMLTools.extractChildrenTextFromNode(expectedAffiliationNode, "text");
            // An affiliation without an id cannot be referenced, and one without text has
            // nothing to compare; skip both instead of failing the whole run.
            if (idAttr == null || texts.isEmpty()) {
                continue;
            }
            expectedAffiliationMap.put(idAttr.getNodeValue(), texts.get(0));
        }

        for (Node expectedAuthorNode : expectedAuthorNodes) {
            String authorName = canonicalAuthorName(expectedAuthorNode);
            if (authorName == null) {
                continue;
            }
            List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "affiliation-ref");
            for (Node xref : xrefs) {
                Node refAttr = xref.getAttributes().getNamedItem("ref");
                String aff = refAttr == null ? null : expectedAffiliationMap.get(refAttr.getNodeValue());
                if (aff != null) {
                    authorAffiliation.addExpected(new StringRelation(authorName, aff));
                }
            }
        }

        addExtractedAuthorRelations(authorAffiliation, extractedAuthorNodes, "affiliation");

        authorsAffiliations.add(authorAffiliation);
        authorAffiliation.print(mode, "author - affiliation");

        // Email addresses
        MetadataList email = new MetadataList(originalNlm,
                "/bwmeta/element/contributor[@role='author']/attribute[@key='contact-email']/value",
                extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/email");
        email.setComp(EvaluationUtils.emailComparator);
        emails.add(email);
        email.print(mode, "email");

        // Author - Email relations
        MetadataRelation authorEmail = new MetadataRelation();
        authorEmail.setComp1(EvaluationUtils.authorComparator);
        authorEmail.setComp2(EvaluationUtils.emailComparator);

        for (Node expectedAuthorNode : expectedAuthorNodes) {
            String authorName = canonicalAuthorName(expectedAuthorNode);
            if (authorName == null) {
                continue;
            }
            List<Node> addresses = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "attribute");
            for (Node address : addresses) {
                Node keyAttr = address.getAttributes().getNamedItem("key");
                if (keyAttr == null || !"contact-email".equals(keyAttr.getNodeValue())) {
                    continue;
                }
                List<String> values = XMLTools.extractChildrenTextFromNode(address, "value");
                if (!values.isEmpty()) {
                    authorEmail.addExpected(new StringRelation(authorName, values.get(0)));
                }
            }
        }

        addExtractedAuthorRelations(authorEmail, extractedAuthorNodes, "email");

        authorsEmails.add(authorEmail);
        authorEmail.print(mode, "author - email");

        // Journal title
        MetadataSingle journal = new MetadataSingle(originalNlm,
                "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Journal']/name[@type='canonical']",
                extractedNlm, "//monogr/title[@level='j' and @type='main']");
        journal.setComp(EvaluationUtils.journalComparator);
        journals.add(journal);
        journal.print(mode, "journal title");

        // Volume
        MetadataSingle volume = new MetadataSingle(originalNlm,
                "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Volume']/name[@type='canonical']",
                extractedNlm, "//monogr/imprint/biblScope[@unit='volume']");
        volumes.add(volume);
        volume.print(mode, "volume");

        // Issue
        MetadataSingle issue = new MetadataSingle(originalNlm,
                "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Number']/name[@type='canonical']",
                extractedNlm, "//monogr/imprint/biblScope[@unit='issue']");
        issues.add(issue);
        issue.print(mode, "issue");

        // Pages range: the ground truth holds the whole range in one attribute, while the
        // extracted document has separate from/to attributes. Both are normalised to
        // "first--last" before comparison.
        MetadataSingle fPage = new MetadataSingle(originalNlm,
                "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position",
                extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@from");
        MetadataSingle lPage = new MetadataSingle(originalNlm,
                "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position",
                extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@to");
        String expRange = fPage.hasExpected() ? fPage.getExpectedValue().replace("-", "--") : "";
        String extrRange = fPage.hasExtracted() && lPage.hasExtracted()
                ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue()
                : "";
        MetadataSingle pageRange = new MetadataSingle(expRange, extrRange);
        pageRanges.add(pageRange);
        pageRange.print(mode, "pages");

        // Publication date
        List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm,
                "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Year']/name[@type='canonical']");
        expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate);
        List<Node> extractedPubDates = XMLTools.extractNodes(extractedNlm,
                "//teiHeader//date[@type='published']");
        List<String> extractedPubDate = Lists.newArrayList();
        if (!extractedPubDates.isEmpty()) {
            Node pubDate = extractedPubDates.get(0);
            // Prefer the "when" attribute over the element text when it is present.
            Node whenAttr = pubDate.getAttributes().getNamedItem("when");
            String date = whenAttr != null ? whenAttr.getTextContent() : pubDate.getTextContent();
            extractedPubDate = Lists.newArrayList(date.split("-"));
            extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate);
        }

        MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"),
                StringUtils.join(extractedPubDate, "---"));
        year.setComp(EvaluationUtils.yearComparator);
        years.add(year);
        year.print(mode, "year");

        // DOI
        MetadataSingle doi = new MetadataSingle(originalNlm,
                "/bwmeta/element/id[@scheme='bwmeta1.id-class.DOI']/@value", extractedNlm,
                "//teiHeader//idno[@type='DOI']");
        dois.add(doi);
        doi.print(mode, "DOI");

        // References
        List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm,
                "//relation[@type='reference-to']/attribute[@key='reference-text']/value"); // bwmeta
        List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//listBibl/biblStruct"); // grobid

        List<String> originalRefs = new ArrayList<String>();
        List<String> extractedRefs = new ArrayList<String>();

        for (Node originalRefNode : originalRefNodes) {
            originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
        }
        for (Node extractedRefNode : extractedRefNodes) {
            extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
        }

        MetadataList refs = new MetadataList(originalRefs, extractedRefs);
        refs.setComp(EvaluationUtils.cosineComparator(0.6));

        references.add(refs);
        refs.print(mode, "references");

        if (mode == 1) {
            System.out.println("1");
        }
    }

    if (mode != 1) {
        System.out.println("==== Summary (" + iter.size() + " docs)====");

        PrecisionRecall titlePR = new PrecisionRecall().build(titles);
        titlePR.print("Title");

        PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
        abstractPR.print("Abstract");

        PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
        keywordsPR.print("Keywords");

        PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
        authorsPR.print("Authors");

        PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
        affiliationsPR.print("Affiliations");

        PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations);
        authorsAffiliationsPR.print("Author - affiliation");

        PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
        emailsPR.print("Emails");

        PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails);
        authorsEmailsPR.print("Author - email");

        PrecisionRecall journalPR = new PrecisionRecall().build(journals);
        journalPR.print("Journal");

        PrecisionRecall volumePR = new PrecisionRecall().build(volumes);
        volumePR.print("Volume");

        PrecisionRecall issuePR = new PrecisionRecall().build(issues);
        issuePR.print("Issue");

        PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges);
        pageRangePR.print("Pages");

        PrecisionRecall yearPR = new PrecisionRecall().build(years);
        yearPR.print("Year");

        PrecisionRecall doiPR = new PrecisionRecall().build(dois);
        doiPR.print("DOI");

        PrecisionRecall refsPR = new PrecisionRecall().build(references);
        refsPR.print("References");

        // Only this subset of fields contributes to the averaged scores.
        List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                abstractPR, keywordsPR, yearPR, doiPR);

        double avgPrecision = 0;
        double avgRecall = 0;
        double avgF1 = 0;
        for (PrecisionRecall result : results) {
            avgPrecision += result.getPrecision();
            avgRecall += result.getRecall();
            avgF1 += result.getF1();
        }
        avgPrecision /= results.size();
        avgRecall /= results.size();
        avgF1 /= results.size();

        System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
        System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
        System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
    }
}

/**
 * Parses {@code input} as XML and always closes the stream, whether or not parsing succeeds.
 *
 * @throws SAXException if the content is not well-formed XML
 * @throws IOException if reading the stream fails
 */
private static org.w3c.dom.Document parseAndClose(javax.xml.parsers.DocumentBuilder documentBuilder,
        FileInputStream input) throws SAXException, IOException {
    try {
        return documentBuilder.parse(input);
    } finally {
        try {
            input.close();
        } catch (IOException ignored) {
            // Failing to close a read-only stream must not mask the parse outcome.
        }
    }
}

/**
 * Returns the text of the first {@code name} child of {@code contributorNode} whose
 * {@code type} attribute equals {@code "canonical"}, or {@code null} when there is none.
 *
 * @throws XPathExpressionException if raised by the XMLTools lookup
 * @throws TransformerException if raised by the XMLTools lookup
 */
private static String canonicalAuthorName(Node contributorNode)
        throws XPathExpressionException, TransformerException {
    for (Node nameNode : XMLTools.extractChildrenNodesFromNode(contributorNode, "name")) {
        Node typeAttr = nameNode.getAttributes().getNamedItem("type");
        if (typeAttr != null && "canonical".equals(typeAttr.getTextContent())) {
            return nameNode.getTextContent();
        }
    }
    return null;
}

/**
 * Builds "forenames surnames" from the {@code forename} and {@code surname} children of a
 * {@code persName} node, with the parts of each group joined by single spaces.
 *
 * @throws XPathExpressionException if raised by the XMLTools lookup
 * @throws TransformerException if raised by the XMLTools lookup
 */
private static String teiPersonName(Node persNameNode) throws XPathExpressionException, TransformerException {
    List<String> givenNames = XMLTools.extractChildrenTextFromNode(persNameNode, "forename");
    List<String> surnames = XMLTools.extractChildrenTextFromNode(persNameNode, "surname");
    return StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
}

/**
 * For every {@code persName} node, pairs the person's name with the text of each sibling
 * element named {@code siblingName} and records the pair as an extracted relation.
 *
 * @throws XPathExpressionException if raised by the XMLTools lookup
 * @throws TransformerException if raised by the XMLTools lookup
 */
private static void addExtractedAuthorRelations(MetadataRelation relation, List<Node> persNameNodes,
        String siblingName) throws XPathExpressionException, TransformerException {
    for (Node persNameNode : persNameNodes) {
        String personName = teiPersonName(persNameNode);
        NodeList siblings = persNameNode.getParentNode().getChildNodes();
        for (int k = 0; k < siblings.getLength(); k++) {
            Node sibling = siblings.item(k);
            if (siblingName.equals(sibling.getNodeName())) {
                relation.addExtracted(new StringRelation(personName, XMLTools.extractTextFromNode(sibling)));
            }
        }
    }
}

From source file:pl.edu.icm.cermine.evaluation.FinalMetadataExtractionEvaluation.java

public void evaluate(NlmIterator iter)
        throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
        SAXException, JDOMException, XPathExpressionException, TransformerException {

    javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);/*w ww  .  java  2s. co  m*/
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

    SAXBuilder builder = new SAXBuilder("org.apache.xerces.parsers.SAXParser");
    builder.setValidation(false);
    builder.setFeature("http://xml.org/sax/features/validation", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    PrecissonRecall issn = new PrecissonRecall();
    PrecissonRecall doi = new PrecissonRecall();
    PrecissonRecall volume = new PrecissonRecall();
    PrecissonRecall issue = new PrecissonRecall();
    PrecissonRecall pages = new PrecissonRecall();
    PrecissonRecall dateYear = new PrecissonRecall();
    PrecissonRecall dateFull = new PrecissonRecall();
    PrecissonRecall journalTitle = new PrecissonRecall();

    List<Double> abstractRates = new ArrayList<Double>(iter.size());
    List<Double> titleRates = new ArrayList<Double>(iter.size());

    List<Double> keywordPrecisions = new ArrayList<Double>(iter.size());
    List<Double> keywordRecalls = new ArrayList<Double>(iter.size());

    List<Double> authorsPrecisions = new ArrayList<Double>(iter.size());
    List<Double> authorsRecalls = new ArrayList<Double>(iter.size());

    List<Double> affPrecisions = new ArrayList<Double>(iter.size());
    List<Double> affRecalls = new ArrayList<Double>(iter.size());

    int ii = 0;
    for (NlmPair pair : iter) {
        ii++;
        System.out.println("");
        printVerbose(">>>>>>>>> " + ii);

        printVerbose(pair.getExtractedNlm().getPath());

        org.w3c.dom.Document originalNlm = documentBuilder.parse(new FileInputStream(pair.getOriginalNlm()));
        org.w3c.dom.Document extractedNlm = documentBuilder.parse(new FileInputStream(pair.getExtractedNlm()));

        String expectedTitle = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta//article-title");
        String extractedTitle = XMLTools.extractTextFromNode(extractedNlm,
                "/article/front/article-meta/title-group/article-title");

        List<Node> expectedAuthorsNodes = XMLTools.extractNodes(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/name");

        List<String> expectedAuthors = new ArrayList<String>();
        for (Node authorNode : expectedAuthorsNodes) {
            List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "given-names");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname");
            String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
            author = author.replaceAll("[^a-zA-Z ]", "");
            expectedAuthors.add(author);
        }

        List<String> extractedAuthors1 = XMLTools.extractTextAsList(extractedNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']/string-name");
        List<String> extractedAuthors = new ArrayList<String>();
        for (String author : extractedAuthors1) {
            extractedAuthors.add(author.replaceAll("[^a-zA-Z ]", ""));
        }

        List<String> expectedKeywords = XMLTools.extractTextAsList(originalNlm,
                "/article/front/article-meta//kwd");
        List<String> extractedKeywords = XMLTools.extractTextAsList(extractedNlm,
                "/article/front/article-meta/kwd-group/kwd");

        String expectedJournalTitle = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/journal-meta//journal-title");
        String extractedJournalTitle = XMLTools.extractTextFromNode(extractedNlm,
                "/article/front/journal-meta/journal-title-group/journal-title");

        String expectedAbstract = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta/abstract");
        String extractedAbstract = XMLTools.extractTextFromNode(extractedNlm,
                "/article/front/article-meta/abstract");

        String expectedDoi = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta/article-id[@pub-id-type='doi']");
        String extractedDoi = XMLTools.extractTextFromNode(extractedNlm,
                "/article/front/article-meta/article-id[@pub-id-type='doi']");

        String expectedISSN = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/journal-meta/issn[@pub-type='ppub']");
        String extractedISSN = XMLTools.extractTextFromNode(extractedNlm,
                "/article/front/journal-meta/issn[@pub-type='ppub']");

        String expectedVolume = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/volume");
        String extractedVolume = XMLTools.extractTextFromNode(extractedNlm,
                "/article/front/article-meta/volume");

        String expectedIssue = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/issue");
        String extractedIssue = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/issue");

        String expectedFPage = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/fpage");
        String extractedFPage = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/fpage");

        String expectedLPage = XMLTools.extractTextFromNode(originalNlm, "/article/front/article-meta/lpage");
        String extractedLPage = XMLTools.extractTextFromNode(extractedNlm, "/article/front/article-meta/lpage");

        List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm,
                "/article/front/article-meta/pub-date");
        expectedPubDate = removeLeadingZerosFromDate(expectedPubDate);
        List<String> extractedPubDate = XMLTools.extractTextAsList(extractedNlm,
                "/article/front/article-meta/pub-date");
        extractedPubDate = removeLeadingZerosFromDate(extractedPubDate);

        Set<String> expectedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff"));
        Set<String> extractedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta//aff"));
        List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
        List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);

        //equality measures
        if (!expectedVolume.isEmpty()) {
            if (expectedVolume.equals(extractedVolume)) {
                ++volume.correct;
            }
            ++volume.expected;
        }
        if (!extractedVolume.isEmpty()) {
            volume.extracted++;
        }
        if (!expectedIssue.isEmpty()) {
            if (expectedIssue.equals(extractedIssue)) {
                ++issue.correct;
            }
            ++issue.expected;
        }
        if (!extractedIssue.isEmpty()) {
            issue.extracted++;
        }
        if (!expectedISSN.isEmpty()) {
            if (extractedISSN.equals(expectedISSN)) {
                ++issn.correct;
            }
            ++issn.expected;
        }
        if (!extractedISSN.isEmpty()) {
            issn.extracted++;
        }
        if (!expectedDoi.isEmpty()) {
            if (expectedDoi.equals(extractedDoi)) {
                ++doi.correct;
            }
            ++doi.expected;
        }
        if (!extractedDoi.isEmpty()) {
            doi.extracted++;
        }
        if (!expectedFPage.isEmpty() && !expectedLPage.isEmpty()) {
            if (expectedFPage.equals(extractedFPage) && expectedLPage.equals(extractedLPage)) {
                ++pages.correct;
            }
            ++pages.expected;
        }
        if (!extractedFPage.isEmpty() && !extractedLPage.isEmpty()) {
            pages.extracted++;
        }

        if (!expectedPubDate.isEmpty()) {
            Boolean yearsMatch = DateComparator.yearsMatch(expectedPubDate, extractedPubDate);
            if (yearsMatch != null) {
                if (yearsMatch) {
                    ++dateYear.correct;
                }
                ++dateYear.expected;
            }
        }
        if (!extractedPubDate.isEmpty()) {
            dateYear.extracted++;
            dateFull.extracted++;
        }

        //Smith-Waterman distance measures
        if (expectedAbstract.length() > 0) {
            abstractRates.add(compareStringsSW(expectedAbstract, extractedAbstract));
        } else {
            abstractRates.add(null);
        }
        if (expectedTitle.length() > 0) {
            titleRates.add(compareStringsSW(expectedTitle, extractedTitle));
        } else {
            titleRates.add(null);
        }
        if (!expectedJournalTitle.isEmpty()) {
            journalTitle.expected++;
        }
        if (!extractedJournalTitle.isEmpty()) {
            journalTitle.extracted++;
            if (isSubsequence(expectedJournalTitle.replaceAll("[^a-zA-Z]", "").toLowerCase(),
                    extractedJournalTitle.replaceAll("[^a-zA-Z]", "").toLowerCase())) {
                journalTitle.correct++;
            }
        }

        //precision + recall
        if (expectedAuthors.size() > 0) {
            authorsRecalls.add(calculateRecall(expectedAuthors, extractedAuthors));
        } else {
            authorsRecalls.add(null);
        }
        if (extractedAuthors.size() > 0) {
            authorsPrecisions.add(calculatePrecision(expectedAuthors, extractedAuthors));
        } else {
            authorsPrecisions.add(null);
        }
        if (expectedKeywords.size() > 0) {
            keywordRecalls.add(calculateRecall(expectedKeywords, extractedKeywords));
        } else {
            keywordRecalls.add(null);
        }
        if (extractedKeywords.size() > 0) {
            keywordPrecisions.add(calculatePrecision(expectedKeywords, extractedKeywords));
        } else {
            keywordPrecisions.add(null);
        }
        if (expectedAffiliations.size() > 0) {
            affRecalls.add(calculateRecall(expectedAffiliations, extractedAffiliations));
        } else {
            affRecalls.add(null);
        }
        if (extractedAffiliations.size() > 0) {
            affPrecisions.add(calculatePrecision(expectedAffiliations, extractedAffiliations));
        } else {
            affPrecisions.add(null);
        }

        System.out.println("");
        printVerbose(">>> Expected authors: ");
        for (String author : expectedAuthors) {
            printVerbose(author);
        }

        System.out.println("");
        printVerbose(">>> Extracted authors: ");
        for (String author : extractedAuthors) {
            printVerbose(author);
        }

        System.out.println("");
        printVerbose(">>> Expected keywords: ");
        for (String keyword : expectedKeywords) {
            printVerbose(keyword);
        }

        System.out.println("");
        printVerbose(">>> Extracted keywords: ");
        for (String keyword : extractedKeywords) {
            printVerbose(keyword);
        }

        printVerbose(">>> Expected journal title: " + expectedJournalTitle);
        printVerbose(">>> Extracted journal title: " + extractedJournalTitle);

        printVerbose(">>> Expected article title: " + expectedTitle);
        printVerbose(">>> Extracted article title: " + extractedTitle);

        printVerbose(">>> Expected article abstract: " + expectedAbstract);
        printVerbose(">>> Extracted article abstract: " + extractedAbstract);

        printVerbose(">>> Expected doi: " + expectedDoi);
        printVerbose(">>> Extracted doi: " + extractedDoi);

        printVerbose(">>> Expected issn: " + expectedISSN);
        printVerbose(">>> Extracted issn: " + extractedISSN);

        printVerbose(">>> Expected volume: " + expectedVolume);
        printVerbose(">>> Extracted volume: " + extractedVolume);

        printVerbose(">>> Expected issue: " + expectedIssue);
        printVerbose(">>> Extracted issue: " + extractedIssue);

        printVerbose(">>> Expected pages: " + expectedFPage + " " + expectedLPage);
        printVerbose(">>> Extracted pages: " + extractedFPage + " " + extractedLPage);

        printVerbose(">>> Expected date: ");
        for (String date : expectedPubDate) {
            printVerbose(date);
        }

        printVerbose(">>> Extracted date: ");
        for (String date : extractedPubDate) {
            printVerbose(date);
        }
        printVerbose(">>> Expected affs: ");
        for (String aff : expectedAffiliations) {
            printVerbose(aff);
        }

        printVerbose(">>> Extracted affs: ");
        for (String aff : extractedAffiliations) {
            printVerbose(aff);
        }

        printVerbose("abstract " + abstractRates.get(abstractRates.size() - 1));
        printVerbose("title " + titleRates.get(titleRates.size() - 1));
        printVerbose("journal title " + journalTitle);

        System.out.println("");
        printVerbose("authors precission " + authorsPrecisions.get(authorsPrecisions.size() - 1));
        printVerbose("authors recall " + authorsRecalls.get(authorsPrecisions.size() - 1));

        System.out.println("");
        printVerbose("aff precission " + affPrecisions.get(affPrecisions.size() - 1));
        printVerbose("aff recall " + affRecalls.get(affPrecisions.size() - 1));

        System.out.println("");
        printVerbose("keywords precission " + keywordPrecisions.get(keywordPrecisions.size() - 1));
        printVerbose("keywords recall " + keywordRecalls.get(keywordPrecisions.size() - 1));

        printVerbose("date years" + dateYear);
        printVerbose("doi" + doi);
        printVerbose("issn" + issn);
        printVerbose("volume" + volume);
        printVerbose("issue" + issue);
        printVerbose("pages" + pages);
    }

    Double value;
    System.out.println("==== Summary (" + iter.size() + " docs)====");
    if ((value = calculateAverage(abstractRates)) != null) {
        System.out.printf("abstract avg (SW) \t\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(titleRates)) != null) {
        System.out.printf("title avg (SW) \t\t\t%4.2f\n", 100 * value);
    }
    if ((value = journalTitle.calculatePrecission()) != null) {
        System.out.printf("journal title precission\t\t%4.2f\n", 100 * value);
    }
    if ((value = journalTitle.calculateRecall()) != null) {
        System.out.printf("journal title recall\t\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(authorsPrecisions)) != null) {
        System.out.printf("authors precision avg (EQ)\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(authorsRecalls)) != null) {
        System.out.printf("authors recall avg (EQ)\t\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(affPrecisions)) != null) {
        System.out.printf("aff precision avg (EQ)\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(affRecalls)) != null) {
        System.out.printf("aff recall avg (EQ)\t\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(keywordPrecisions)) != null) {
        System.out.printf("keywords precision avg (EQ)\t%4.2f\n", 100 * value);
    }
    if ((value = calculateAverage(keywordRecalls)) != null) {
        System.out.printf("keywords recall avg (EQ)\t%4.2f\n", 100 * value);
    }
    if ((value = dateYear.calculatePrecission()) != null) {
        System.out.printf("date year precission avg\t\t%4.2f\n", 100 * value);
    }
    if ((value = dateYear.calculateRecall()) != null) {
        System.out.printf("date year recall avg\t\t%4.2f\n", 100 * value);
    }
    if ((value = doi.calculatePrecission()) != null) {
        System.out.printf("doi precission\t\t%4.2f\n", 100 * value);
    }
    if ((value = doi.calculateRecall()) != null) {
        System.out.printf("doi recall\t\t%4.2f\n", 100 * value);
    }
    if ((value = issn.calculatePrecission()) != null) {
        System.out.printf("issn precission\t\t%4.2f\n", 100 * value);
    }
    if ((value = issn.calculateRecall()) != null) {
        System.out.printf("issn recall\t\t%4.2f\n", 100 * value);
    }
    if ((value = volume.calculatePrecission()) != null) {
        System.out.printf("volume precission\t\t%4.2f\n", 100 * value);
    }
    if ((value = volume.calculateRecall()) != null) {
        System.out.printf("volume recall\t\t%4.2f\n", 100 * value);
    }
    if ((value = issue.calculatePrecission()) != null) {
        System.out.printf("issue precission\t\t%4.2f\n", 100 * value);
    }
    if ((value = issue.calculateRecall()) != null) {
        System.out.printf("issue recall\t\t%4.2f\n", 100 * value);
    }
    if ((value = pages.calculatePrecission()) != null) {
        System.out.printf("pages precission avg\t\t%4.2f\n", 100 * value);
    }
    if ((value = pages.calculateRecall()) != null) {
        System.out.printf("pages recall avg\t\t%4.2f\n", 100 * value);
    }
}

From source file: pl.edu.icm.cermine.evaluation.GrobidFinalMetadataExtractionEvaluation.java

/**
 * Compares ground-truth NLM documents with GROBID (TEI) extraction output and prints the results.
 *
 * <p>For every pair supplied by {@code iter}, both files are parsed into DOM documents and the
 * following items are compared: title, abstract, keywords, authors, affiliations,
 * author-affiliation relations, e-mails, author-e-mail relations, journal title, volume, issue,
 * page range, publication year, DOI and references. Pairs whose XML cannot be parsed are skipped.
 *
 * @param mode output mode: {@code 1} prints a CSV header and one CSV row per document (no
 *             summary); {@code 0} additionally prints a running counter and the extracted file
 *             path per document; every value other than {@code 1} prints the final
 *             precision/recall summary. The value is also forwarded to the {@code print(mode, ...)}
 *             methods of the comparison objects.
 * @param iter source of (original NLM, extracted file) pairs to evaluate
 * @throws IOException if one of the files cannot be opened, read or closed
 * @throws ParserConfigurationException if the DOM parser cannot be configured with the requested
 *             features
 */
public void evaluate(int mode, NlmIterator iter)
        throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
        SAXException, JDOMException, XPathExpressionException, TransformerException {

    // Non-validating, namespace-unaware parser that never fetches DTDs.
    javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

    // Per-document comparison results, aggregated into precision/recall in the summary.
    List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
    List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
    List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
    List<ComparisonResult> journals = new ArrayList<ComparisonResult>();
    List<ComparisonResult> volumes = new ArrayList<ComparisonResult>();
    List<ComparisonResult> issues = new ArrayList<ComparisonResult>();
    List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>();
    List<ComparisonResult> years = new ArrayList<ComparisonResult>();
    List<ComparisonResult> dois = new ArrayList<ComparisonResult>();
    List<ComparisonResult> references = new ArrayList<ComparisonResult>();

    if (mode == 1) {
        // CSV header
        System.out.println("path,gro_title,gro_abstract,gro_keywords,"
                + "gro_authors,gro_affs,gro_autaff,gro_email,gro_autemail,gro_journal,gro_volume,gro_issue,"
                + "gro_pages,gro_year,gro_doi,gro_refs,one");
    }

    int i = 0;
    for (NlmPair pair : iter) {
        i++;

        if (mode == 0) {
            System.out.println("");
            System.out.println(">>>>>>>>> " + i);
            System.out.println(pair.getExtractedNlm().getPath());
        }
        if (mode == 1) {
            System.out.print(pair.getOriginalNlm().getPath() + ",");
        }

        org.w3c.dom.Document originalNlm;
        org.w3c.dom.Document extractedNlm;
        FileInputStream originalStream = null;
        FileInputStream extractedStream = null;
        try {
            originalStream = new FileInputStream(pair.getOriginalNlm());
            originalNlm = documentBuilder.parse(originalStream);
            extractedStream = new FileInputStream(pair.getExtractedNlm());
            extractedNlm = documentBuilder.parse(extractedStream);
        } catch (SAXException ex) {
            // Malformed XML: skip this pair, but do not leave a half-written CSV row behind
            // and make the skip visible instead of dropping it silently.
            if (mode == 1) {
                System.out.println();
            }
            System.err.println("Skipping " + pair.getOriginalNlm().getPath() + ": " + ex.getMessage());
            i--;
            continue;
        } finally {
            // Always release both file handles, including on parse failure.
            try {
                if (extractedStream != null) {
                    extractedStream.close();
                }
            } finally {
                if (originalStream != null) {
                    originalStream.close();
                }
            }
        }

        // Document's title
        MetadataSingle title = new MetadataSingle(originalNlm, "/article/front/article-meta//article-title",
                extractedNlm, "//teiHeader//titleStmt/title");
        title.setComp(EvaluationUtils.swComparator);
        titles.add(title);
        title.print(mode, "title");

        // Abstract
        MetadataSingle abstrakt = new MetadataSingle(originalNlm, "/article/front/article-meta/abstract",
                extractedNlm, "//teiHeader//abstract/p");
        abstrakt.setComp(EvaluationUtils.swComparator);
        abstracts.add(abstrakt);
        abstrakt.print(mode, "abstract");

        // Keywords
        MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd", extractedNlm,
                "//teiHeader//keywords//term");
        keywords.add(keyword);
        keyword.print(mode, "keywords");

        // Authors: "<given names> <surnames>" built from the first <name> child of each contributor
        List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]");

        List<String> expectedAuthors = new ArrayList<String>();
        for (Node authorNode : expectedAuthorNodes) {
            List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name");
            if (names.isEmpty()) {
                continue;
            }
            Node name = names.get(0);
            List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname");
            String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
            expectedAuthors.add(author);
        }

        List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//teiHeader//sourceDesc/biblStruct//author/persName");

        List<String> extractedAuthors = new ArrayList<String>();
        for (Node authorNode : extractedAuthorNodes) {
            List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname");
            String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
            extractedAuthors.add(author);
        }

        MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
        author.setComp(EvaluationUtils.authorComparator);
        authors.add(author);
        author.print(mode, "author");

        // Affiliations (de-duplicated through a set before comparison)
        Set<String> expectedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff"));
        Set<String> extractedAffiliationsSet = Sets.newHashSet(XMLTools.extractTextAsList(extractedNlm,
                "//teiHeader//sourceDesc/biblStruct//author/affiliation"));
        List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
        List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
        MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
        affiliation.setComp(EvaluationUtils.cosineComparator());
        affiliations.add(affiliation);
        affiliation.print(mode, "affiliation");

        // Author - Affiliation relation
        MetadataRelation authorAffiliation = new MetadataRelation();
        authorAffiliation.setComp1(EvaluationUtils.authorComparator);
        authorAffiliation.setComp2(EvaluationUtils.cosineComparator());

        // Expected side: resolve each author's <xref ref-type="aff" rid="..."> against <aff id="...">.
        List<Node> expectedAffiliationNodes = XMLTools.extractNodes(originalNlm,
                "/article/front/article-meta//aff[@id]");
        Map<String, String> expectedAffiliationMap = new HashMap<String, String>();
        for (Node expectedAffiliationNode : expectedAffiliationNodes) {
            String id = expectedAffiliationNode.getAttributes().getNamedItem("id").getNodeValue();
            String aff = XMLTools.extractTextFromNode(expectedAffiliationNode);
            expectedAffiliationMap.put(id, aff);
        }

        for (Node expectedAuthorNode : expectedAuthorNodes) {
            String authorName = expectedAuthors.get(expectedAuthorNodes.indexOf(expectedAuthorNode));
            List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "xref");
            for (Node xref : xrefs) {
                if (xref.getAttributes() != null && xref.getAttributes().getNamedItem("ref-type") != null
                        && "aff".equals(xref.getAttributes().getNamedItem("ref-type").getNodeValue())) {
                    // A single rid attribute may list several space-separated affiliation ids.
                    String affId = xref.getAttributes().getNamedItem("rid").getNodeValue();
                    for (String id : affId.split(" ")) {
                        String aff = expectedAffiliationMap.get(id);
                        if (aff != null) {
                            authorAffiliation.addExpected(new StringRelation(authorName, aff));
                        }
                    }
                }
            }
        }

        // Extracted side: <affiliation> elements that are siblings of the author's <persName>.
        extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//teiHeader//sourceDesc/biblStruct//author/persName");

        for (Node authorNode : extractedAuthorNodes) {

            List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname");
            String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");

            Node n = authorNode.getParentNode();
            NodeList nl = n.getChildNodes();
            for (int iu = 0; iu < nl.getLength(); iu++) {
                Node aff = nl.item(iu);
                if ("affiliation".equals(aff.getNodeName())) {
                    String aw = XMLTools.extractTextFromNode(aff);
                    authorAffiliation.addExtracted(new StringRelation(a, aw));
                }
            }

        }

        authorsAffiliations.add(authorAffiliation);
        authorAffiliation.print(mode, "author - affiliation");

        // Email addresses
        MetadataList email = new MetadataList(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email",
                extractedNlm, "//teiHeader//sourceDesc/biblStruct//author/email");
        email.setComp(EvaluationUtils.emailComparator);
        emails.add(email);
        email.print(mode, "email");

        // Author - Email relations
        MetadataRelation authorEmail = new MetadataRelation();
        authorEmail.setComp1(EvaluationUtils.authorComparator);
        authorEmail.setComp2(EvaluationUtils.emailComparator);

        // Expected side: e-mails found either under <address> or directly under the contributor.
        for (Node expectedAuthorNode : expectedAuthorNodes) {
            String authorName = expectedAuthors.get(expectedAuthorNodes.indexOf(expectedAuthorNode));

            List<Node> addresses = XMLTools.extractChildrenNodesFromNode(expectedAuthorNode, "address");
            for (Node address : addresses) {
                for (String emailAddress : XMLTools.extractChildrenTextFromNode(address, "email")) {
                    authorEmail.addExpected(new StringRelation(authorName, emailAddress));
                }
            }
            for (String emailAddress : XMLTools.extractChildrenTextFromNode(expectedAuthorNode, "email")) {
                authorEmail.addExpected(new StringRelation(authorName, emailAddress));
            }
        }

        // Extracted side: <email> elements that are siblings of the author's <persName>.
        extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//teiHeader//sourceDesc/biblStruct//author/persName");

        for (Node authorNode : extractedAuthorNodes) {

            List<String> givenNames = XMLTools.extractChildrenTextFromNode(authorNode, "forename");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(authorNode, "surname");
            String a = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");

            Node n = authorNode.getParentNode();
            NodeList nl = n.getChildNodes();
            for (int iu = 0; iu < nl.getLength(); iu++) {
                Node aff = nl.item(iu);
                if ("email".equals(aff.getNodeName())) {
                    String aw = XMLTools.extractTextFromNode(aff);
                    authorEmail.addExtracted(new StringRelation(a, aw));
                }
            }

        }

        authorsEmails.add(authorEmail);
        authorEmail.print(mode, "author - email");

        // Journal title
        MetadataSingle journal = new MetadataSingle(originalNlm, "/article/front/journal-meta//journal-title",
                extractedNlm, "//monogr/title[@level='j' and @type='main']");
        journal.setComp(EvaluationUtils.journalComparator);
        journals.add(journal);
        journal.print(mode, "journal title");

        // Volume
        MetadataSingle volume = new MetadataSingle(originalNlm, "/article/front/article-meta/volume",
                extractedNlm, "//monogr/imprint/biblScope[@unit='volume']");
        volumes.add(volume);
        volume.print(mode, "volume");

        // Issue
        MetadataSingle issue = new MetadataSingle(originalNlm, "/article/front/article-meta/issue",
                extractedNlm, "//monogr/imprint/biblScope[@unit='issue']");
        issues.add(issue);
        issue.print(mode, "issue");

        // Pages range: compared as "<first>--<last>", or as "" when either end is missing
        MetadataSingle fPage = new MetadataSingle(originalNlm, "/article/front/article-meta/fpage",
                extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@from");
        MetadataSingle lPage = new MetadataSingle(originalNlm, "/article/front/article-meta/lpage",
                extractedNlm, "//monogr/imprint/biblScope[@unit='page']/@to");
        String expRange = fPage.hasExpected() && lPage.hasExpected()
                ? fPage.getExpectedValue() + "--" + lPage.getExpectedValue()
                : "";
        String extrRange = fPage.hasExtracted() && lPage.hasExtracted()
                ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue()
                : "";
        MetadataSingle pageRange = new MetadataSingle(expRange, extrRange);
        pageRanges.add(pageRange);
        pageRange.print(mode, "pages");

        // Publication date
        List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm,
                "/article/front/article-meta/pub-date");
        expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate);
        List<Node> extractedPubDates = XMLTools.extractNodes(extractedNlm,
                "//teiHeader//date[@type='published']");
        List<String> extractedPubDate = Lists.newArrayList();
        if (!extractedPubDates.isEmpty()) {
            Node pubDate = extractedPubDates.get(0);
            // Prefer the "when" attribute over the element text when it is present.
            String date = pubDate.getTextContent();
            if (pubDate.getAttributes().getNamedItem("when") != null) {
                date = pubDate.getAttributes().getNamedItem("when").getTextContent();
            }
            extractedPubDate = Lists.newArrayList(date.split("-"));
            extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate);
        }

        MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"),
                StringUtils.join(extractedPubDate, "---"));
        year.setComp(EvaluationUtils.yearComparator);
        years.add(year);
        year.print(mode, "year");

        // DOI
        MetadataSingle doi = new MetadataSingle(originalNlm,
                "/article/front/article-meta/article-id[@pub-id-type='doi']", extractedNlm,
                "//teiHeader//idno[@type='DOI']");
        dois.add(doi);
        doi.print(mode, "DOI");

        // References
        List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref"); // NLM
        List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//listBibl/biblStruct"); // GROBID

        List<String> originalRefs = new ArrayList<String>();
        List<String> extractedRefs = new ArrayList<String>();

        for (Node originalRefNode : originalRefNodes) {
            originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
        }
        for (Node extractedRefNode : extractedRefNodes) {
            extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
        }

        MetadataList refs = new MetadataList(originalRefs, extractedRefs);
        refs.setComp(EvaluationUtils.cosineComparator(0.6));

        references.add(refs);
        refs.print(mode, "references");

        if (mode == 1) {
            // Last CSV column ("one") and end of the row.
            System.out.println("1");
        }
    }

    if (mode != 1) {
        System.out.println("==== Summary (" + iter.size() + " docs)====");

        PrecisionRecall titlePR = new PrecisionRecall().build(titles);
        titlePR.print("Title");

        PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
        abstractPR.print("Abstract");

        PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
        keywordsPR.print("Keywords");

        PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
        authorsPR.print("Authors");

        PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
        affiliationsPR.print("Affiliations");

        PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations);
        authorsAffiliationsPR.print("Author - affiliation");

        PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
        emailsPR.print("Emails");

        PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails);
        authorsEmailsPR.print("Author - email");

        PrecisionRecall journalPR = new PrecisionRecall().build(journals);
        journalPR.print("Journal");

        PrecisionRecall volumePR = new PrecisionRecall().build(volumes);
        volumePR.print("Volume");

        PrecisionRecall issuePR = new PrecisionRecall().build(issues);
        issuePR.print("Issue");

        PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges);
        pageRangePR.print("Pages");

        PrecisionRecall yearPR = new PrecisionRecall().build(years);
        yearPR.print("Year");

        PrecisionRecall doiPR = new PrecisionRecall().build(dois);
        doiPR.print("DOI");

        PrecisionRecall refsPR = new PrecisionRecall().build(references);
        refsPR.print("References");

        // Only this subset of categories contributes to the overall averages.
        List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                abstractPR, keywordsPR, yearPR, doiPR);

        double avgPrecision = 0;
        double avgRecall = 0;
        double avgF1 = 0;
        for (PrecisionRecall result : results) {
            avgPrecision += result.getPrecision();
            avgRecall += result.getRecall();
            avgF1 += result.getF1();
        }
        avgPrecision /= results.size();
        avgRecall /= results.size();
        avgF1 /= results.size();

        System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
        System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
        System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
    }
}

From source file: pl.edu.icm.cermine.evaluation.ParsCitFinalMetadataExtractionEvaluation.java

/**
 * Compares ground-truth NLM documents with ParsCit/ParsHed extraction output and prints the
 * results.
 *
 * <p>For every pair supplied by {@code iter}, both files are parsed into DOM documents and the
 * following items are compared: title, abstract, keywords, authors, affiliations, e-mails and
 * references. Pairs whose XML cannot be parsed are skipped.
 *
 * @param mode output mode: {@code 1} prints a CSV header and one CSV row per document (no
 *             summary); {@code 0} additionally prints a running counter and the extracted file
 *             path per document; every value other than {@code 1} prints the final
 *             precision/recall summary. The value is also forwarded to the {@code print(mode, ...)}
 *             methods of the comparison objects.
 * @param iter source of (original NLM, extracted file) pairs to evaluate
 * @throws IOException if one of the files cannot be opened, read or closed
 * @throws ParserConfigurationException if the DOM parser cannot be configured with the requested
 *             features
 */
public void evaluate(int mode, NlmIterator iter)
        throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
        SAXException, JDOMException, XPathExpressionException, TransformerException {

    // Non-validating, namespace-unaware parser that never fetches DTDs.
    javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

    // Per-document comparison results, aggregated into precision/recall in the summary.
    List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
    List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
    List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
    List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
    List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
    List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
    List<ComparisonResult> references = new ArrayList<ComparisonResult>();

    if (mode == 1) {
        // CSV header
        System.out.println("path,pcit_title,pcit_abstract,pcit_keywords,"
                + "pcit_authors,pcit_affs,pcit_email,pcit_refs,one");
    }

    int i = 0;
    for (NlmPair pair : iter) {
        i++;
        if (mode == 0) {
            System.out.println("");
            System.out.println(">>>>>>>>> " + i);
            System.out.println(pair.getExtractedNlm().getPath());
        }
        if (mode == 1) {
            System.out.print(pair.getOriginalNlm().getPath() + ",");
        }

        org.w3c.dom.Document originalNlm;
        org.w3c.dom.Document extractedNlm;
        FileInputStream originalStream = null;
        FileInputStream extractedStream = null;
        try {
            originalStream = new FileInputStream(pair.getOriginalNlm());
            originalNlm = documentBuilder.parse(originalStream);
            extractedStream = new FileInputStream(pair.getExtractedNlm());
            extractedNlm = documentBuilder.parse(extractedStream);
        } catch (SAXException ex) {
            // Malformed XML: skip this pair, but do not leave a half-written CSV row behind
            // and make the skip visible instead of dropping it silently.
            if (mode == 1) {
                System.out.println();
            }
            System.err.println("Skipping " + pair.getOriginalNlm().getPath() + ": " + ex.getMessage());
            i--;
            continue;
        } finally {
            // Always release both file handles, including on parse failure.
            try {
                if (extractedStream != null) {
                    extractedStream.close();
                }
            } finally {
                if (originalStream != null) {
                    originalStream.close();
                }
            }
        }

        // Title: ParsHed may emit several candidates; use the most confident one
        String expectedTitle = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta//article-title");
        List<Node> extractedTitleNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//title");
        String extractedTitle = pickMostConfidentText(extractedTitleNodes);

        MetadataSingle title = new MetadataSingle(expectedTitle, extractedTitle);
        title.setComp(EvaluationUtils.swComparator);
        titles.add(title);
        title.print(mode, "title");

        // Abstract: same candidate selection as for the title
        String expectedAbstract = XMLTools.extractTextFromNode(originalNlm,
                "/article/front/article-meta/abstract");
        List<Node> extractedAbstractNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//abstract");
        String extractedAbstract = pickMostConfidentText(extractedAbstractNodes);

        MetadataSingle abstrakt = new MetadataSingle(expectedAbstract, extractedAbstract);
        abstrakt.setComp(EvaluationUtils.swComparator);
        abstracts.add(abstrakt);
        abstrakt.print(mode, "abstract");

        // Keywords
        MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd", extractedNlm,
                "//algorithm[@name='ParsHed']//keyword");
        keywords.add(keyword);
        keyword.print(mode, "keywords");

        // Authors: "<given names> <surnames>" built from the first <name> child of each contributor
        List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]");

        List<String> expectedAuthors = new ArrayList<String>();
        for (Node authorNode : expectedAuthorNodes) {
            List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name");
            if (names.isEmpty()) {
                continue;
            }
            Node name = names.get(0);
            List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names");
            List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname");
            String author = StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " ");
            expectedAuthors.add(author);
        }

        List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsHed']//author");

        List<String> extractedAuthors = new ArrayList<String>();
        for (Node authorNode : extractedAuthorNodes) {
            String author = XMLTools.extractTextFromNode(authorNode);
            extractedAuthors.add(author);
        }

        MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
        author.setComp(EvaluationUtils.authorComparator);
        authors.add(author);
        author.print(mode, "author");

        // Affiliations (de-duplicated through a set before comparison)
        Set<String> expectedAffiliationsSet = Sets
                .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff"));
        Set<String> extractedAffiliationsSet = Sets.newHashSet(
                XMLTools.extractTextAsList(extractedNlm, "//algorithm[@name='ParsHed']//affiliation"));
        List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
        List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
        MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
        affiliation.setComp(EvaluationUtils.cosineComparator());
        affiliations.add(affiliation);
        affiliation.print(mode, "affiliation");

        // Email addresses
        MetadataList email = new MetadataList(originalNlm,
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email",
                extractedNlm, "//algorithm[@name='ParsHed']//email");
        email.setComp(EvaluationUtils.emailComparator);
        emails.add(email);
        email.print(mode, "email");

        // References
        List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref");
        List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm,
                "//algorithm[@name='ParsCit']//citationList/citation/rawString");

        List<String> originalRefs = new ArrayList<String>();
        List<String> extractedRefs = new ArrayList<String>();
        for (Node originalRefNode : originalRefNodes) {
            originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
        }
        for (Node extractedRefNode : extractedRefNodes) {
            extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
        }

        MetadataList refs = new MetadataList(originalRefs, extractedRefs);
        refs.setComp(EvaluationUtils.cosineComparator(0.6));

        references.add(refs);
        refs.print(mode, "references");

        if (mode == 1) {
            // Last CSV column ("one") and end of the row.
            System.out.println("1");
        }
    }

    if (mode != 1) {
        System.out.println("==== Summary (" + iter.size() + " docs)====");

        PrecisionRecall titlePR = new PrecisionRecall().build(titles);
        titlePR.print("Title");

        PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
        abstractPR.print("Abstract");

        PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
        keywordsPR.print("Keywords");

        PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
        authorsPR.print("Authors");

        PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
        affiliationsPR.print("Affiliations");

        PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
        emailsPR.print("Emails");

        PrecisionRecall refsPR = new PrecisionRecall().build(references);
        refsPR.print("References");

        List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                abstractPR, keywordsPR, refsPR);

        double avgPrecision = 0;
        double avgRecall = 0;
        double avgF1 = 0;
        for (PrecisionRecall result : results) {
            avgPrecision += result.getPrecision();
            avgRecall += result.getRecall();
            avgF1 += result.getF1();
        }
        avgPrecision /= results.size();
        avgRecall /= results.size();
        avgF1 /= results.size();

        System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
        System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
        System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
    }
}

/**
 * Returns the text of the candidate node with the highest {@code confidence} attribute.
 *
 * <p>The first node's text is the fallback; it is replaced only by a node whose confidence is
 * strictly greater than the best seen so far (starting from 0). Nodes without the attribute
 * never replace the current choice.
 *
 * @param candidates element nodes to choose from
 * @return the selected text, or {@code null} when {@code candidates} is empty
 * @throws NumberFormatException if a {@code confidence} attribute is not a valid number
 */
private static String pickMostConfidentText(List<Node> candidates) {
    String bestText = null;
    double bestConfidence = 0;
    for (Node candidate : candidates) {
        if (bestText == null) {
            bestText = candidate.getTextContent();
        }
        Node confidenceAttr = candidate.getAttributes().getNamedItem("confidence");
        if (confidenceAttr != null) {
            double candidateConfidence = Double.parseDouble(confidenceAttr.getNodeValue());
            if (candidateConfidence > bestConfidence) {
                bestConfidence = candidateConfidence;
                bestText = candidate.getTextContent();
            }
        }
    }
    return bestText;
}

From source file: pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java

/**
 * Builds a zone-labelled {@link BxDocument} for a PDF by matching the text of its zones
 * against content taken from the accompanying NLM XML.
 *
 * <p>Processing steps, in order:
 * <ol>
 *   <li>parse {@code nlmStream} into a DOM (validation, namespaces and DTD loading disabled);</li>
 *   <li>extract the geometric structure of the PDF with {@code PdfBxStructureExtractor};</li>
 *   <li>collect text entries from the XML via XPath, each associated with a {@code BxZoneLabel};</li>
 *   <li>score every (entry, zone) pair with {@code SmithWatermanDistance};</li>
 *   <li>label every zone using a cascade of four heuristics; a zone that none of them
 *       accepts gets a {@code null} label;</li>
 *   <li>per page, hand the {@code null}-labelled zones to {@code infereLabels} until the size
 *       of the unlabelled set stops changing.</li>
 * </ol>
 *
 * @param pdfStream stream of the PDF document, passed to the structure extractor
 * @param nlmStream stream of the NLM XML describing the same document
 * @return the extracted document with zone labels assigned where a match was found
 * @throws AnalysisException declared by this method; propagated from the called project code
 * @throws ParserConfigurationException if the XML parser cannot be configured
 * @throws SAXException if the XML cannot be parsed
 * @throws IOException if reading the XML stream fails
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 * @throws TransformationException declared by this method; propagated from the called project code
 */
public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
        throws AnalysisException, ParserConfigurationException, SAXException, IOException,
        XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    // The XML is only mined for text: no validation, no namespace handling, no DTD loading.
    dbf.setValidating(false);
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor();
    BxDocument bxDoc = structureExtractor.extractStructure(pdfStream);
    int bxDocLen = bxDoc.asZones().size();

    // text entry -> label expected for a zone containing that text
    SmartHashMap entries = new SmartHashMap();

    //abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    //title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);

    //journal title (two alternative locations are tried)
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    //journal publisher
    String journalPublisherString = (String) xpath
            .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    //journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    //copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    //license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    //article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    //received date (at least three parts are required before date variants are produced)
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (receivedDate.size() >= 3) {
        for (String date : StringTools.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (acceptedDate.size() >= 3) {
        for (String date : StringTools.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //publication date
    // The number of pub-date elements decides which variants are looked up; the DOM is not
    // modified in between, so the count is evaluated once.
    int pubDateCount = ((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc,
            XPathConstants.NODESET)).getLength();
    List<String> pubdateString;
    if (pubDateCount > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    // Start from a fresh empty list so the dates above are not registered a second time when
    // there is no print publication date. A new list is used instead of clear() so that a
    // null or unmodifiable list returned by the helper cannot make this line throw.
    pubdateString = new ArrayList<String>();
    if (pubDateCount > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : StringTools.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    // NOTE(review): the last step selects a child *element* named xlink:href, not the
    // attribute (@xlink:href), and no NamespaceContext is set on the XPath - confirm intent.
    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);

    //keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    //DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    //volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    //issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
    List<String> authorAffiliations = new ArrayList<String>();
    List<String> editors = new ArrayList<String>();

    //pages
    String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
    String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
    entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
    // NOTE(review): lPage is registered twice as a page number; one of the two calls was
    // possibly meant to use fPage. Kept as is because the change would alter produced labels.
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    try {
        int f = Integer.valueOf(fPage);
        int l = Integer.valueOf(lPage);
        // every page number after the first one, up to and including the last one
        while (f < l) {
            f++;
            entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
        }
    } catch (NumberFormatException ignored) {
        // fpage/lpage are missing or not plain integers: no intermediate page numbers to add
    }

    entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

    //editors
    NodeList editorNodes = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
        String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
        editors.add(editorString);
    }
    entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR);

    //authors
    NodeList authorsResult = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
            XPathConstants.NODESET);
    // The affiliation query is absolute (does not depend on the current author), so it is
    // evaluated once. Its result is still appended once per author to keep the contents of
    // authorAffiliations unchanged.
    // NOTE(review): the per-author duplication looks unintentional - confirm.
    List<String> contribGroupAffiliations = XMLTools.extractTextAsList((NodeList) xpath
            .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));
    for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
        Node curNode = authorsResult.item(nodeIdx);
        //author names
        String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
        String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);

        //author correspondence: address/email first, then a direct email child
        String email;
        try {
            email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
        } catch (XPathExpressionException ignored) {
            // treated the same as "no e-mail under address"
            email = "";
        }
        if (email.isEmpty()) {
            try {
                email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException ignored) {
                // best effort: the author simply has no e-mail recorded, email stays empty
            }
        }
        if (!email.isEmpty()) {
            authorEmails.add(email);
        }
        if (!contribGroupAffiliations.isEmpty()) {
            authorAffiliations.addAll(contribGroupAffiliations);
        }
        authorNames.add(name + " " + surname);
    }
    entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

    //authors' affiliations
    NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
            XPathConstants.NODESET);
    authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
    entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

    //correspondence again
    NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
            domDoc, XPathConstants.NODESET);
    authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
    entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

    //author notes
    Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
            XPathConstants.NODE);
    String notesString = XMLTools.extractTextFromNode(notesNode);
    entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
    // NOTE(review): the back-matter notes are only used in the verbose output below and are
    // never registered in entries - confirm whether a putIf is missing here.
    notesString = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

    //article body
    NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
    List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
    entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

    NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
    String appStrings = XMLTools.extractTextFromNodes(appNodes);
    entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

    //section titles
    NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
            XPathConstants.NODESET);
    List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
    entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

    NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
            XPathConstants.NODESET);
    List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
    entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

    //figures (collected from every location queried below)
    NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
            XPathConstants.NODESET);
    List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

    figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

    //tables
    List<String> tableCaptions = new ArrayList<String>();
    List<String> tableBodies = new ArrayList<String>();
    List<String> tableFootnotes = new ArrayList<String>();
    //tableNodes
    NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

    for (int nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
        Node tableNode = tableNodes.item(nodeIdx);

        String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
        tableCaptions.add(caption);

        String body = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
        tableBodies.add(body);

        List<String> footnotes = XMLTools.extractTextAsList(
                (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
        tableFootnotes.addAll(footnotes);

        entries.putIf(caption, BxZoneLabel.BODY_TABLE);
        entries.putIf(body, BxZoneLabel.BODY_TABLE);
        entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
    }

    //financial disclosure
    String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
            .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
    entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    //conflict
    String conflictString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
    entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

    //copyright
    String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
            "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
    entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

    //acknowledgment
    String acknowledgement = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    acknowledgement = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

    //glossary
    String glossary = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
    entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

    //formula: the label and the remaining children are registered as separate entries
    NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
        Node curFormulaNode = formulaNodes.item(nodeIdx);
        String label = xpath.evaluate("label", curFormulaNode);
        entries.putIf(label, BxZoneLabel.BODY_EQUATION);

        NodeList curNodeChildren = curFormulaNode.getChildNodes();
        List<String> formulaParts = new ArrayList<String>();
        for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
            Node curChild = curNodeChildren.item(childIdx);
            if (curChild.getNodeName().equals("label")) {
                continue;
            }
            formulaParts.add(XMLTools.extractTextFromNode(curChild));
        }
        entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
    }

    //references
    List<String> refStrings = new ArrayList<String>();
    Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
    if (refParentNode != null) {
        NodeList refChildren = refParentNode.getChildNodes();
        for (int refIdx = 0; refIdx < refChildren.getLength(); ++refIdx) {
            refStrings.add(XMLTools.extractTextFromNode(refChildren.item(refIdx)));
        }
    }
    entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES);
    entries.put("references", BxZoneLabel.REFERENCES);

    // one extra entry made of every distinct word seen in any bibliographic-info entry
    Set<String> allBibInfos = new HashSet<String>();
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
            allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
        }
    }
    entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

    printlnVerbose("journalTitle: " + journalTitleString);
    printlnVerbose("journalPublisher: " + journalPublisherString);
    printlnVerbose("journalISSNPublisher: " + journalISSNString);

    printlnVerbose("articleType: " + articleTypeStrings);
    printlnVerbose("received: " + receivedDate);
    printlnVerbose("accepted: " + acceptedDate);
    printlnVerbose("pubdate: " + pubdateString);
    printlnVerbose("permissions: " + permissionsString);
    printlnVerbose("license: " + licenseString);

    printlnVerbose("title: " + titleString);
    printlnVerbose("abstract: " + abstractString);

    printlnVerbose("authorEmails: " + authorEmails);
    printlnVerbose("authorNames: " + authorNames);
    printlnVerbose("authorAff: " + authorAffiliations);
    printlnVerbose("authorNotes: " + notesString);
    printlnVerbose("editor: " + editors);

    printlnVerbose("keywords: " + keywordsString);
    printlnVerbose("DOI: " + doiString);
    printlnVerbose("volume: " + volumeString);
    printlnVerbose("issue: " + issueString);
    printlnVerbose("financial dis.: " + financialDisclosure);

    printlnVerbose("paragraphs: " + paragraphStrings);
    printlnVerbose("section titles: " + sectionTitles);

    printlnVerbose("tableBodies: " + tableBodies);
    printlnVerbose("tableCaptions: " + tableCaptions);
    printlnVerbose("tableFootnotes: " + tableFootnotes);

    printlnVerbose("figures: " + figureStrings);
    printlnVerbose("acknowledgement: " + acknowledgement);

    printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

    SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);

    // swLabelSim.get(zoneIdx) holds one LabelTrio per entry: (label, entry tokens, similarity).
    // A cosine-similarity table used to be filled alongside it but was never read in this
    // method, so it is no longer computed.
    List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
    for (int i = 0; i < bxDocLen; ++i) {
        swLabelSim.add(new ArrayList<LabelTrio>());
    }

    //iterate over entries
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        List<String> entryTokens = StringTools.tokenize(entry.getKey());
        printlnVerbose("--------------------");
        printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
        //iterate over zones
        for (int zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
            BxZone curZone = bxDoc.asZones().get(zoneIdx);
            List<String> zoneTokens = StringTools.tokenize(StringTools
                    .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase())));

            double smithSim;
            if (curZone.toText().contains("www.biomedcentral.com")) {
                // zones mentioning this address are deliberately given zero similarity
                smithSim = 0.;
            } else {
                smithSim = smith.compare(entryTokens, zoneTokens);
            }
            printlnVerbose(smithSim + " " + curZone.toText() + "\n\n");
            swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
        }
    }

    printlnVerbose("===========================");
    for (BxPage page : bxDoc.getPages()) {
        for (BxZone zone : page.getZones()) {
            int zoneIdx = bxDoc.asZones().indexOf(zone);
            BxZone curZone = bxDoc.asZones().get(zoneIdx);
            String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase());
            List<String> zoneTokens = StringTools.tokenize(zoneText);
            // candidate entries for this zone; re-sorted in place by the heuristics below
            List<LabelTrio> candidates = swLabelSim.get(zoneIdx);
            boolean valueSet = false;

            // Heuristic 0: best candidate by alignment per entry token, accepted when zone and
            // entry have similar token counts (ratio > 0.7) and the normalised alignment > 0.7.
            Collections.sort(candidates, alignmentPerEntryTokenOrder(0.0001));
            Collections.reverse(candidates);

            LabelTrio bestNormalised = candidates.get(0);
            List<String> entryTokens = bestNormalised.entryTokens;
            if (Math.max(zoneTokens.size(), entryTokens.size()) > 0
                    && Math.min(zoneTokens.size(), entryTokens.size())
                            / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7
                    && bestNormalised.alignment / entryTokens.size() > 0.7) {
                curZone.setLabel(bestNormalised.label);
                valueSet = true;
                printVerbose("0 ");
            }

            // Heuristic 1: best candidate by raw alignment, accepted when the alignment per
            // zone token exceeds 0.5.
            if (!valueSet) {
                Collections.sort(candidates, rawAlignmentOrder());
                Collections.reverse(candidates);
                LabelTrio bestRaw = candidates.get(0);
                printlnVerbose("-->" + bestRaw.alignment / zoneTokens.size());
                if (bestRaw.alignment / zoneTokens.size() > 0.5) {
                    curZone.setLabel(bestRaw.label);
                    valueSet = true;
                    printVerbose("1 ");
                }
            }

            // Heuristic 2: sum the normalised alignments per label and accept the label with
            // the largest total when that total reaches 0.5.
            if (!valueSet) {
                Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class);
                for (LabelTrio trio : candidates) {
                    double share = trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size());
                    Double soFar = cumulated.get(trio.label);
                    cumulated.put(trio.label, soFar == null ? share : soFar + share);
                }
                double max = Double.NEGATIVE_INFINITY;
                BxZoneLabel bestLabel = null;
                for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) {
                    if (entry.getValue() > max) {
                        max = entry.getValue();
                        bestLabel = entry.getKey();
                    }
                }
                if (max >= 0.5) {
                    curZone.setLabel(bestLabel);
                    printVerbose("2 ");
                    valueSet = true;
                }
            }

            // Heuristic 3: count zone tokens that also occur in the entry; the entry with the
            // most shared tokens (at least two) wins. Exact token equality is tried first, then
            // equality after stripping characters outside [0-9a-zA-Z,;.!?].
            if (!valueSet) {
                Collections.sort(candidates, alignmentPerEntryTokenOrder(0.001));
                Collections.reverse(candidates);

                BxZoneLabel best = null;
                int bestScore = 0;
                for (LabelTrio lt : candidates) {
                    int shared = 0;
                    for (String zt : zoneTokens) {
                        if (lt.entryTokens.contains(zt)) {
                            shared++;
                        }
                    }
                    if (shared > bestScore && shared > 1) {
                        best = lt.label;
                        bestScore = shared;
                    }
                }
                if (best == null) {
                    for (LabelTrio lt : candidates) {
                        int shared = 0;
                        for (String zt : zoneTokens) {
                            String strippedZoneToken = zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "");
                            for (String et : lt.entryTokens) {
                                if (strippedZoneToken.equals(et.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) {
                                    shared++;
                                    break;
                                }
                            }
                        }
                        if (shared > bestScore && shared > 1) {
                            best = lt.label;
                            bestScore = shared;
                        }
                    }
                }

                if (best != null) {
                    curZone.setLabel(best);
                    valueSet = true;
                }
            }
            if (!valueSet) {
                // no heuristic matched: mark as unlabelled for the inference step below
                curZone.setLabel(null);
            }
            printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n");
        }

        // Collect the zones of this page that are still unlabelled.
        Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>();
        Set<BxZone> unlabeledZones = new HashSet<BxZone>();
        for (BxZone zone : page.getZones()) {
            if (zone.getLabel() == null) {
                unlabeledZones.add(zone);
                zoneLocMap.put(zone, new ZoneLocaliser(zone));
            }
        }
        // Repeat the inference until the number of unlabelled zones stops changing.
        // NOTE(review): infereLabels runs twice per round, as before - confirm this is intended.
        int lastNumberOfUnlabeledZones;
        do {
            lastNumberOfUnlabeledZones = unlabeledZones.size();
            infereLabels(unlabeledZones, zoneLocMap);
            infereLabels(unlabeledZones, zoneLocMap);
        } while (lastNumberOfUnlabeledZones != unlabeledZones.size());
    }
    printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>=");

    return bxDoc;
}

/**
 * Orders trios by alignment divided by the number of entry tokens (ascending).
 *
 * @param tolerance differences smaller than this value are treated as ties
 */
private static Comparator<LabelTrio> alignmentPerEntryTokenOrder(final double tolerance) {
    return new Comparator<LabelTrio>() {

        @Override
        public int compare(LabelTrio t1, LabelTrio t2) {
            double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size();
            return orderBySimilarityDifference(simDif, tolerance, t1, t2);
        }
    };
}

/** Orders trios by raw alignment (ascending); differences below 0.0001 are treated as ties. */
private static Comparator<LabelTrio> rawAlignmentOrder() {
    return new Comparator<LabelTrio>() {

        @Override
        public int compare(LabelTrio t1, LabelTrio t2) {
            double simDif = t1.alignment - t2.alignment;
            return orderBySimilarityDifference(simDif, 0.0001, t1, t2);
        }
    };
}

/**
 * Shared comparison rule: a tie (|simDif| below tolerance) is resolved by the difference of
 * entry token counts (t2 minus t1); otherwise the sign of simDif decides.
 */
private static int orderBySimilarityDifference(double simDif, double tolerance, LabelTrio t1,
        LabelTrio t2) {
    if (Math.abs(simDif) < tolerance) {
        return t2.entryTokens.size() - t1.entryTokens.size();
    }
    return simDif > 0 ? 1 : -1;
}

From source file:pl.edu.icm.cermine.pubmed.RuleBasedPubmedXMLGenerator.java

public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
        throws AnalysisException, ParserConfigurationException, SAXException, IOException,
        XPathExpressionException, TransformationException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setValidating(false);/*ww  w.  ja v a  2s .  com*/
    dbf.setFeature("http://xml.org/sax/features/namespaces", false);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    Document domDoc = builder.parse(nlmStream);

    TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader();
    Reader r = new InputStreamReader(pdfStream);
    BxDocument bxDoc = new BxDocument().setPages(reader.read(r));

    List<BxZone> zones = Lists.newArrayList(bxDoc.asZones());

    Integer bxDocLen = zones.size();

    SmartHashMap entries = new SmartHashMap();

    //abstract
    Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
            XPathConstants.NODE);
    String abstractString = XMLTools.extractTextFromNode(abstractNode);
    entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
    entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

    //title
    String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
            domDoc, XPathConstants.STRING);
    entries.putIf(titleString, BxZoneLabel.MET_TITLE);
    String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
            domDoc, XPathConstants.STRING);
    entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);
    //journal title
    String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
            XPathConstants.STRING);
    if (journalTitleString == null || journalTitleString.isEmpty()) {
        journalTitleString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
    }
    entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

    //journal publisher
    String journalPublisherString = (String) xpath
            .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
    entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
    String journalPublisherIdString = (String) xpath.evaluate(
            "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

    //journal issn
    String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
            XPathConstants.STRING);
    entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

    //copyright/permissions
    String permissionsString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
    entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

    //license
    Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
            XPathConstants.NODE);
    String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
    entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

    //article type
    NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
            XPathConstants.NODESET);
    List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
    Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
            domDoc, XPathConstants.NODE);
    articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

    entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

    //received date
    List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
    if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
        for (String date : TextUtils.produceDates(receivedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //accepted date
    List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
            "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
    if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
        for (String date : TextUtils.produceDates(acceptedDate)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    //publication date
    List<String> pubdateString;
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    } else {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : TextUtils.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }
    pubdateString.clear();
    if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
            .getLength() > 1) {
        Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                domDoc, XPathConstants.NODE);
        pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
    }
    if (pubdateString != null && pubdateString.size() >= 3) {
        for (String date : TextUtils.produceDates(pubdateString)) {
            entries.putIf(date, BxZoneLabel.MET_DATES);
        }
    }

    String extLink = (String) xpath.evaluate(
            "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
            XPathConstants.STRING);
    printlnVerbose(extLink);
    entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);
    //keywords
    Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
            XPathConstants.NODE);
    String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
    entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

    //DOI
    String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
            domDoc, XPathConstants.STRING);
    entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

    //volume
    String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
            XPathConstants.STRING);
    entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

    //issue
    String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
            XPathConstants.STRING);
    entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

    entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
    entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

    List<String> authorNames = new ArrayList<String>();
    List<String> authorEmails = new ArrayList<String>();
    List<String> authorAffiliations = new ArrayList<String>();
    List<String> editors = new ArrayList<String>();

    //pages
    String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
    String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
    entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
    try {
        int f = Integer.valueOf(fPage);
        int l = Integer.valueOf(lPage);
        while (f < l) {
            f++;
            entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
        }
    } catch (NumberFormatException ex) {
    }

    entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

    //editors
    NodeList editorNodes = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
        String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
        editors.add(editorString);
    }
    entries.putIf(TextUtils.joinStrings(editors), BxZoneLabel.MET_EDITOR);

    NodeList authorsResult = (NodeList) xpath.evaluate(
            "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
        Node curNode = authorsResult.item(nodeIdx);
        //author names
        String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
        String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);
        //author affiliation
        List<String> aff = XMLTools.extractTextAsList((NodeList) xpath
                .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));

        //author correspondence
        String email;
        try {
            email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
        } catch (XPathExpressionException e) {
            email = "";
        }
        if (email.isEmpty()) {
            try {
                email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException e) {
                //yaaay, probably there is no e-mail at all! => do nothing
            }
        }
        if (!email.isEmpty()) {
            authorEmails.add(email);
        }
        if (!aff.isEmpty()) {
            authorAffiliations.addAll(aff);
        }
        authorNames.add(name + " " + surname);
    }
    entries.putIf(TextUtils.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

    //authors' affiliations
    NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
            XPathConstants.NODESET);
    authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
    entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

    //correspondence again
    NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
            domDoc, XPathConstants.NODESET);
    authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
    entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

    //author notes
    Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
            XPathConstants.NODE);
    String notesString = XMLTools.extractTextFromNode(notesNode);
    entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
    notesString = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

    //article body
    NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
    List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
    entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

    NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
    String appStrings = XMLTools.extractTextFromNodes(appNodes);
    entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

    //section titles
    NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
            XPathConstants.NODESET);
    List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
    entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

    NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
            XPathConstants.NODESET);
    List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
    entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

    //figures
    NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
            XPathConstants.NODESET);
    List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

    figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
    figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

    entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

    //tables
    List<String> tableCaptions = new ArrayList<String>();
    List<String> tableBodies = new ArrayList<String>();
    List<String> tableFootnotes = new ArrayList<String>();
    //tableNodes
    NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

    for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
        Node tableNode = tableNodes.item(nodeIdx);

        String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
        tableCaptions.add(caption);

        String body = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
        tableBodies.add(body);

        List<String> footnotes = XMLTools.extractTextAsList(
                (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
        tableFootnotes.addAll(footnotes);

        entries.putIf(caption, BxZoneLabel.BODY_TABLE);
        entries.putIf(body, BxZoneLabel.BODY_TABLE);
        entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
    }

    //financial disclosure
    String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
            .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
    entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    //conflict
    String conflictString = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
    entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

    //copyright
    String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
            "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
    entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

    //acknowledgment
    String acknowledgement = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

    acknowledgement = XMLTools.extractTextFromNode(
            (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
    entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

    //glossary
    String glossary = XMLTools
            .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
    entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

    //formula
    NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
            XPathConstants.NODESET);
    for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
        Node curFormulaNode = formulaNodes.item(nodeIdx);
        String label = (String) xpath.evaluate("label", curFormulaNode);
        entries.putIf(label, BxZoneLabel.BODY_EQUATION);

        NodeList curNodeChildren = curFormulaNode.getChildNodes();
        List<String> formulaParts = new ArrayList<String>();
        for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
            Node curChild = curNodeChildren.item(childIdx);
            if (curChild.getNodeName().equals("label")) {
                continue;
            }
            formulaParts.add(XMLTools.extractTextFromNode(curChild));
        }
        entries.putIf(TextUtils.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
    }

    //references
    List<String> refStrings = new ArrayList<String>();
    Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
    if (refParentNode != null) {
        for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) {
            refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx)));
        }
    }
    entries.putIf(TextUtils.joinStrings(refStrings), BxZoneLabel.REFERENCES);
    entries.put("references", BxZoneLabel.REFERENCES);

    Set<String> allBibInfos = new HashSet<String>();
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
            allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
        }
    }
    entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

    printlnVerbose("journalTitle: " + journalTitleString);
    printlnVerbose("journalPublisher: " + journalPublisherString);
    printlnVerbose("journalISSNPublisher: " + journalISSNString);

    printlnVerbose("articleType: " + articleTypeStrings);
    printlnVerbose("received: " + receivedDate);
    printlnVerbose("accepted: " + acceptedDate);
    printlnVerbose("pubdate: " + pubdateString);
    printlnVerbose("permissions: " + permissionsString);
    printlnVerbose("license: " + licenseString);

    printlnVerbose("title: " + titleString);
    printlnVerbose("abstract: " + abstractString);

    printlnVerbose("authorEmails: " + authorEmails);
    printlnVerbose("authorNames: " + authorNames);
    printlnVerbose("authorAff: " + authorAffiliations);
    printlnVerbose("authorNotes: " + notesString);
    printlnVerbose("editor: " + editors);

    printlnVerbose("keywords: " + keywordsString);
    printlnVerbose("DOI: " + doiString);
    printlnVerbose("volume: " + volumeString);
    printlnVerbose("issue: " + issueString);
    printlnVerbose("financial dis.: " + financialDisclosure);

    printlnVerbose("paragraphs: " + paragraphStrings);
    printlnVerbose("section titles: " + sectionTitles);

    printlnVerbose("tableBodies: " + tableBodies);
    printlnVerbose("tableCaptions: " + tableCaptions);
    printlnVerbose("tableFootnotes: " + tableFootnotes);

    printlnVerbose("figures: " + figureStrings);
    printlnVerbose("acknowledgement: " + acknowledgement);

    printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

    SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);
    CosineDistance cos = new CosineDistance();

    //index: (zone,entry)
    List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
    List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen);
    for (Integer i = 0; i < bxDocLen; ++i) {
        swLabelSim.add(new ArrayList<LabelTrio>());
        cosLabProb.add(new ArrayList<LabelTrio>());
    }

    //iterate over entries
    for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
        List<String> entryTokens = TextUtils.tokenize(entry.getKey());
        printlnVerbose("--------------------");
        printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
        //iterate over zones
        for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
            BxZone curZone = zones.get(zoneIdx);
            List<String> zoneTokens = TextUtils.tokenize(
                    TextUtils.removeOrphantSpaces(TextUtils.cleanLigatures(curZone.toText().toLowerCase())));

            Double smithSim;
            Double cosSim;
            if (curZone.toText().contains("www.biomedcentral.com")) {
                //ignore
                smithSim = 0.;
                cosSim = 0.;
            } else {
                smithSim = smith.compare(entryTokens, zoneTokens);
                cosSim = cos.compare(entryTokens, zoneTokens);
            }
            printlnVerbose(smithSim + " " + zones.get(zoneIdx).toText() + "\n\n");
            swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
            cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim));
        }
    }

    for (BxPage pp : bxDoc) {

        boolean changed = true;
        while (changed) {

            changed = false;
            boolean wasIntro = false;

            for (BxZone z : pp) {
                BxZoneLabel orig = z.getLabel();
                int i = zones.indexOf(z);

                double titleAl = 0;
                double authorAl = 0;
                List<LabelTrio> sims = swLabelSim.get(i);
                for (LabelTrio t : sims) {
                    if (t.label.equals(BxZoneLabel.MET_TITLE)) {
                        titleAl = t.alignment / t.entryTokens.size();
                    }
                    if (t.label.equals(BxZoneLabel.MET_AUTHOR)) {
                        authorAl = t.alignment / t.entryTokens.size();
                    }
                }

                String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                int linesCount = z.childrenCount();
                int pageIdx = Lists.newArrayList(bxDoc).indexOf(z.getParent());
                BxLine firstLine = z.getFirstChild();

                if (pageIdx == 0
                        && (z.getLabel().equals(BxZoneLabel.MET_TITLE)
                                || z.getLabel().equals(BxZoneLabel.BODY_CONTENT))
                        && titleAl >= 0.7 && authorAl >= 0.4) {
                    z.setLabel(BxZoneLabel.MET_TITLE_AUTHOR);
                }
                if (linesCount == 2 && text.contains("page") && text.contains("of")
                        && text.contains("page number not for")) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if (linesCount == 1 && (text.contains("page number not for")
                        || (text.contains("page") && text.contains("of")))) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && linesCount < 11 && (text.contains("department") || text.contains("university"))) {
                    z.setLabel(BxZoneLabel.MET_AFFILIATION);
                }
                if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_COPYRIGHT)) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (linesCount < 5 && firstLine.toText().length() < 11
                        && firstLine.toText().startsWith("Figure")
                        && z.getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                    z.setLabel(BxZoneLabel.BODY_FIGURE);
                }
                if (pageIdx > 0 && z.getLabel().equals(BxZoneLabel.MET_TITLE)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (pageIdx > 0 && z.hasPrev() && z.hasNext()
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                                || z.getLabel().equals(BxZoneLabel.MET_DATES)
                                || z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT))
                        && (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                                || z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE))
                        && z.getWidth() < 100) {
                    if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)
                            && z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        z.setLabel(BxZoneLabel.BODY_TABLE);
                    }
                    if (z.getPrev().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        double prevMX = z.getPrev().getX() + z.getPrev().getWidth() / 2;
                        double prevMY = z.getPrev().getY() + z.getPrev().getHeight() / 2;
                        double zMX = z.getX() + z.getWidth() / 2;
                        double zMY = z.getY() + z.getHeight() / 2;
                        if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                    if (z.getNext().getLabel().equals(BxZoneLabel.BODY_TABLE)) {
                        double prevMX = z.getNext().getX() + z.getNext().getWidth() / 2;
                        double prevMY = z.getNext().getY() + z.getNext().getHeight() / 2;
                        double zMX = z.getX() + z.getWidth() / 2;
                        double zMY = z.getY() + z.getHeight() / 2;
                        if (Math.abs(prevMX - zMX) < 200 && Math.abs(prevMY - zMY) < 200) {
                            z.setLabel(BxZoneLabel.BODY_TABLE);
                        }
                    }
                }
                if (pageIdx > 1 && (z.getLabel().equals(BxZoneLabel.MET_AFFILIATION)
                        || z.getLabel().equals(BxZoneLabel.MET_ABSTRACT))) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (pageIdx == 0 && linesCount < 10 && (text.startsWith("citation:")
                        || text.contains(" volume ") || text.contains("vol\\. ") || text.contains("doi"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (pageIdx == 0 && (text.startsWith("editor:") || text.startsWith("academic editor:"))) {
                    z.setLabel(BxZoneLabel.MET_EDITOR);
                }
                if (pageIdx == 0 && text.startsWith("copyright:")) {
                    z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_DATES) && text.contains("volume")
                        && text.contains("issue")) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.MET_AUTHOR)
                        || z.getLabel().equals(BxZoneLabel.REFERENCES)
                        || z.getLabel().equals(BxZoneLabel.MET_DATES)) && linesCount < 6
                        && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                    BxPage p = z.getParent();
                    if (pageIdx > 0) {
                        BxPage prevPage = p.getPrev();
                        for (BxZone z1 : prevPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx < bxDoc.childrenCount() - 1) {
                        BxPage nextPage = p.getNext();
                        for (BxZone z1 : nextPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx > 1) {
                        BxPage prevPage = p.getPrev().getPrev();
                        for (BxZone z1 : prevPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                    if (pageIdx < bxDoc.childrenCount() - 2) {
                        BxPage nextPage = p.getNext().getNext();
                        for (BxZone z1 : nextPage) {
                            if (z1.toText().replaceAll("[^a-zA-Z]", "")
                                    .equals(z.toText().replaceAll("[^a-zA-Z]", ""))
                                    && Math.abs(z1.getY() - z.getY()) < 10) {
                                z.setLabel(BxZoneLabel.MET_BIB_INFO);
                            }
                        }
                    }
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)
                        || z.getLabel().equals(BxZoneLabel.MET_BIB_INFO)
                        || z.getLabel().equals(BxZoneLabel.REFERENCES)) && text.matches("d?[0-9]+")
                        && text.length() <= 4
                        && (z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if (text.equals("acknowledgments")) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }
                if (text.startsWith("introduction") && z.hasPrev()
                        && !z.getPrev().toText().toLowerCase().equals("abstract")) {
                    wasIntro = true;
                }
                if (wasIntro && z.getLabel().equals(BxZoneLabel.MET_ABSTRACT)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }

                if (pageIdx == 0 && z.getLabel().equals(BxZoneLabel.REFERENCES) && !text.equals("references")
                        && !(z.hasPrev() && z.getPrev().toText().toLowerCase().equals("references"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if (z.getLabel().equals(BxZoneLabel.REFERENCES) && linesCount < 10
                        && !text.matches(".*[1-2][09][0-9][0-9].*") && z.hasNext() && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        && z.getNext().getLabel().equals(BxZoneLabel.BODY_CONTENT)) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.MET_ABSTRACT)
                        && z.getX() + 10 < z.getPrev().getX() && z.getWidth() * 2 < pp.getWidth()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_ABSTRACT) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        && !text.startsWith("abstract") && z.getWidth() * 2 < pp.getWidth()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) && z.hasPrev()
                        && z.getPrev().getLabel().equals(BxZoneLabel.REFERENCES)
                        && (text.matches("[1-9][0-9]?[0-9]?\\.?")
                                || text.matches(".*[1-2][0-9][0-9][0-9].*"))) {
                    z.setLabel(BxZoneLabel.REFERENCES);
                }
                if ((z.getLabel().equals(BxZoneLabel.REFERENCES)
                        || z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && (text.startsWith("doi") || text.startsWith("cite this article"))) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && firstLine.toText().toLowerCase().equals("author details")) {
                    z.setLabel(BxZoneLabel.MET_AFFILIATION);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && (firstLine.toText().toLowerCase().equals("acknowledgments")
                                || firstLine.toText().toLowerCase().equals("acknowledgements"))) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }
                if (z.getLabel().equals(BxZoneLabel.MET_TITLE) && z.getY() * 2 > pp.getHeight()) {
                    z.setLabel(BxZoneLabel.BODY_CONTENT);
                }
                if ((z.getY() < 100 || z.getParent().getHeight() - z.getY() < 100)
                        && text.matches("sup-[0-9][0-9]?")) {
                    z.setLabel(BxZoneLabel.OTH_PAGE_NUMBER);
                }
                if ((z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                        || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && firstLine.toText().toLowerCase().equals("references")) {
                    z.setLabel(BxZoneLabel.REFERENCES);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText()
                        .matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?\\.")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?\\.")
                        || firstLine.toText().matches("F[iI][gG][uU][rR][eE] [0-9IV][0-9IV]?[0-9IV]?")
                        || firstLine.toText().matches("F[iI][gG]\\. [0-9IV][0-9IV]?[0-9IV]?"))) {
                    z.setLabel(BxZoneLabel.BODY_FIGURE);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_CONTENT) && (firstLine.toText()
                        .matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?[\\.:] [A-Z].*")
                        || firstLine.toText().matches("T[aA][bB][lL][eE] [0-9IV][0-9IV]?[0-9IV]?\\.?"))) {
                    z.setLabel(BxZoneLabel.BODY_TABLE);
                }
                if (z.getLabel().equals(BxZoneLabel.BODY_ACKNOWLEDGMENT)
                        && text.contains("this article is distributed")) {
                    z.setLabel(BxZoneLabel.MET_COPYRIGHT);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && text.contains("journal")) {
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                }

                if (pageIdx == 0 && !z.getLabel().isOfCategory(BxZoneLabelCategory.CAT_METADATA)
                        && text.contains("correspondence")) {
                    z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                }
                if (pageIdx == 0
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && text.contains("accepted") && text.contains("published")) {
                    z.setLabel(BxZoneLabel.MET_DATES);
                }

                if (pageIdx == 0 && linesCount < 10
                        && (z.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                                || z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN))
                        && z.hasPrev() && z.getY() - z.getHeight() - z.getPrev().getY() < 4
                        && Math.abs(firstLine.getHeight() - z.getPrev().getFirstChild().getHeight()) < 0.5) {
                    if (!z.getPrev().getLabel().equals(BxZoneLabel.MET_KEYWORDS)) {
                        z.setLabel(z.getPrev().getLabel());
                    }
                }
                if (pageIdx == bxDoc.childrenCount() - 1 && (text.startsWith("publish with")
                        || text.contains("will be the most significant development")
                        || text.contains("disseminating the results of biomedical")
                        || text.contains("sir paul nurse") || text.contains("your research papers")
                        || text.contains("available free of charge")
                        || text.contains("peer reviewed and published")
                        || text.contains("cited in pubmed and archived")
                        || text.contains("you keep the copyright") || text.contains("submit your manuscript")
                        || text.contains("submit your next manuscript") || text.contains("online submission")
                        || text.contains("peer review") || text.contains("space constraints")
                        || text.contains("publication on acceptance") || text.contains("inclusion in pubmed")
                        || text.contains("freely available") || text.contains("publication history"))) {
                    z.setLabel(BxZoneLabel.OTH_UNKNOWN);
                }
                if (text.startsWith("funding:") || firstLine.toText().equals("Funding")) {
                    z.setLabel(BxZoneLabel.BODY_ACKNOWLEDGMENT);
                }

                if (text.startsWith("conflicts of interest") || text.startsWith("conflict of interest")
                        || text.startsWith("competing interests")
                        || (z.hasPrev() && (z.getPrev().toText().toLowerCase().equals("conflicts of interest")
                                || z.getPrev().toText().toLowerCase().equals("conflict of interest")
                                || z.getPrev().toText().toLowerCase().equals("competing interests")))) {
                    z.setLabel(BxZoneLabel.BODY_CONFLICT_STMT);
                }

                changed = changed || !orig.equals(z.getLabel());
            }

            boolean wasAuthor = false;
            for (BxZone z : pp) {
                BxZoneLabel orig = z.getLabel();

                String text = ContentCleaner.cleanAllAndBreaks(z.toText()).toLowerCase();
                if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel()) && wasAuthor
                        && ((text.contains("email") && text.contains("@"))
                                || text.startsWith("correspondence"))) {
                    z.setLabel(BxZoneLabel.MET_CORRESPONDENCE);
                }

                if (BxZoneLabel.MET_AUTHOR.equals(z.getLabel())
                        || BxZoneLabel.MET_TITLE_AUTHOR.equals(z.getLabel())) {
                    wasAuthor = true;
                }
                changed = changed || !orig.equals(z.getLabel());
            }

        }
    }

    return bxDoc;
}

From source file:sernet.gs.ui.rcp.main.LoggerInitializerTest.java

/**
 * Loads a classpath resource and parses it into a DOM document.
 *
 * <p>Loading of external DTDs is switched off on the factory before the
 * builder is created.
 *
 * @param name resource name, resolved via {@code getClass().getResource(name)}
 * @return the parsed document
 * @throws ParserConfigurationException if the feature is not supported or no builder can be created
 * @throws SAXException if the resource is not well-formed XML
 * @throws IOException if the resource cannot be found or read
 */
private Document loadLog4jFile(String name) throws ParserConfigurationException, SAXException, IOException {

    URL customLog4jFile = getClass().getResource(name);
    if (customLog4jFile == null) {
        // Fail with a descriptive, declared exception instead of a bare NPE further down.
        throw new IOException("Resource not found on classpath: " + name);
    }

    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    documentBuilderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();

    // parse(String) expects a URI. The full external form keeps the scheme, so
    // jar: resources resolve as well; URL.getPath() drops the scheme and only
    // works for plain files.
    return documentBuilder.parse(customLog4jFile.toExternalForm());

}

From source file:sh.isaac.api.util.ArtifactUtilities.java

/**
 * Builds the repository-relative path of a Maven artifact, i.e.
 * {@code group/path/artifactId/version/artifactId-version[-classifier].type}.
 *
 * <p>For a version ending in {@code -SNAPSHOT} the {@code maven-metadata.xml} of that
 * version is downloaded from the repository, and the {@code -SNAPSHOT} suffix in the
 * file name is replaced by {@code -<timestamp>-<buildNumber>} taken from
 * {@code /metadata/versioning/snapshot}. The downloaded file and its parent folder
 * are deleted afterwards, also when reading the metadata fails.
 *
 * @param baseMavenURL - optional - but required if you are downloading a SNAPSHOT dependency, as this method will need to download the metadata file
 * from the repository server in order to determine the proper version component for the SNAPSHOT.
 * @param mavenUsername - optional - only used for a SNAPSHOT dependency
 * @param mavenPassword - optional - only used for a SNAPSHOT dependency
 * @param groupId the group id
 * @param artifactId the artifact id
 * @param version the version
 * @param classifier - optional
 * @param type the type
 * @return the path of the artifact relative to the repository root
 * @throws IllegalArgumentException if {@code version} is a SNAPSHOT and {@code baseMavenURL} is null
 * @throws IllegalStateException if the downloaded metadata lacks the snapshot timestamp or build number
 * @throws Exception if downloading or parsing the metadata fails
 */
public static String makeMavenRelativePath(String baseMavenURL, String mavenUsername, String mavenPassword,
        String groupId, String artifactId, String version, String classifier, String type) throws Exception {
    // Plain character replacement; no regex needed to turn the group id into a path.
    final String groupPath = groupId.replace('.', '/');
    String snapshotVersion = "";
    String versionWithoutSnapshot = version;

    if (version.endsWith("-SNAPSHOT")) {
        if (baseMavenURL == null) {
            throw new IllegalArgumentException(
                    "baseMavenURL is required to resolve SNAPSHOT version '" + version + "'");
        }

        versionWithoutSnapshot = version.substring(0, version.lastIndexOf("-SNAPSHOT"));

        final URL metadataUrl = new URL(baseMavenURL + (baseMavenURL.endsWith("/") ? "" : "/") + groupPath + "/"
                + artifactId + "/" + version + "/maven-metadata.xml");

        // Need to download the maven-metadata.xml file
        final Task<File> task = new DownloadUnzipTask(mavenUsername, mavenPassword, metadataUrl, false, false,
                null);

        WorkExecutors.get().getExecutor().execute(task);

        final File metadataFile = task.get();

        try {
            final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();

            // added to avoid XXE injections
            domFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);

            final DocumentBuilder builder = domFactory.newDocumentBuilder();
            final Document dDoc = builder.parse(metadataFile);
            final XPath xPath = XPathFactory.newInstance().newXPath();

            final Node timestampNode = (Node) xPath.evaluate("/metadata/versioning/snapshot/timestamp", dDoc,
                    XPathConstants.NODE);
            final Node buildNumberNode = (Node) xPath.evaluate("/metadata/versioning/snapshot/buildNumber", dDoc,
                    XPathConstants.NODE);

            // evaluate(..., NODE) yields null when the element is absent.
            if (timestampNode == null || buildNumberNode == null) {
                throw new IllegalStateException("No snapshot timestamp/buildNumber found in " + metadataUrl);
            }

            snapshotVersion = "-" + timestampNode.getTextContent() + "-" + buildNumberNode.getTextContent();
        } finally {
            // Clean up even if parsing failed, so temp files do not pile up.
            metadataFile.delete();

            // The download task makes a subfolder in temp for this, delete that too
            metadataFile.getParentFile().delete();
        }
    }

    return groupPath + "/" + artifactId + "/" + version + "/" + artifactId + "-" + versionWithoutSnapshot
            + snapshotVersion + (StringUtils.isNotBlank(classifier) ? "-" + classifier : "") + "." + type;
}

From source file:uk.me.jeffsutton.pojogen.SimplePOJO.java

/**
 * Reads an XML document from the given reader, removes any DOCTYPE declaration
 * from the text and parses the remainder into a DOM document.
 *
 * <p>The parser is configured as non-validating and not namespace aware, ignores
 * comments, does not load DTD grammars or external DTDs, and rejects any DOCTYPE
 * declaration that survives the textual removal.
 *
 * <p>The reader is consumed until end of stream but is not closed here.
 *
 * @param xml reader positioned at the start of the XML text
 * @return the parsed document
 * @throws IllegalArgumentException if {@code xml} is null
 * @throws IOException if reading from {@code xml} fails
 * @throws SAXException if the text is not well-formed XML
 * @throws ParserConfigurationException if a requested parser feature is unsupported
 */
public Document parse(BufferedReader xml) throws IOException, SAXException, ParserConfigurationException {
    if (xml == null) {
        throw new IllegalArgumentException("xml reader must not be null");
    }

    // Read errors propagate as the declared IOException; swallowing them would
    // hand a silently truncated document to the parser.
    // Lines are joined without a separator because readLine() strips terminators.
    // NOTE(review): this merges text across line breaks - confirm callers rely on
    // it before inserting '\n' here.
    StringBuilder content = new StringBuilder();
    String line;
    while ((line = xml.readLine()) != null) {
        content.append(line);
    }

    // Strip the whole DOCTYPE declaration: name and external id, an optional
    // internal subset in [...], and the closing '>'. Possessive quantifiers keep
    // the match linear and free of deep regex recursion.
    String file = content.toString()
            .replaceAll("<!DOCTYPE[^>\\[]*+(?:\\[[^\\]]*+\\])?[^>]*+>", "");

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    dbf.setFeature("http://xml.org/sax/features/validation", false);
    dbf.setNamespaceAware(false);
    dbf.setIgnoringComments(true);
    dbf.setValidating(false);
    // NOTE(review): XInclude is switched on while namespace awareness is off;
    // confirm this combination is intended.
    dbf.setXIncludeAware(true);

    // Parse from characters rather than re-encoding to bytes: the text is already
    // decoded, so a platform-charset round trip could disagree with the encoding
    // named in the XML declaration.
    return dbf.newDocumentBuilder().parse(new org.xml.sax.InputSource(new java.io.StringReader(file)));
}