pl.edu.icm.cermine.evaluation.BwmetaFinalMetadataExtractionEvaluation.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.cermine.evaluation.BwmetaFinalMetadataExtractionEvaluation.java

Source

/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.evaluation;

import pl.edu.icm.cermine.evaluation.tools.MetadataSingle;
import pl.edu.icm.cermine.evaluation.tools.MetadataList;
import pl.edu.icm.cermine.evaluation.tools.EvaluationUtils;
import pl.edu.icm.cermine.evaluation.tools.PrecisionRecall;
import pl.edu.icm.cermine.evaluation.tools.NlmPair;
import pl.edu.icm.cermine.evaluation.tools.ComparisonResult;
import pl.edu.icm.cermine.evaluation.tools.NlmIterator;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import org.apache.commons.lang.StringUtils;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import pl.edu.icm.cermine.evaluation.tools.MetadataRelation.StringRelation;
import pl.edu.icm.cermine.evaluation.tools.*;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.tools.XMLTools;

/**
 * Evaluates extracted metadata against ground truth, document by document.
 *
 * <p>For every pair of files delivered by an {@link NlmIterator}, the "original" file is queried
 * with bwmeta XPaths ({@code /bwmeta/element/...}) and the "extracted" file with NLM XPaths
 * ({@code /article/front/...}). Each metadata field (title, abstract, keywords, authors,
 * affiliations, emails, journal, volume, issue, pages, year, DOI, references, plus the
 * author-affiliation and author-email relations) is compared and the per-document results are
 * aggregated into precision/recall figures.
 *
 * @author Pawel Szostek (p.szostek@icm.edu.pl)
 * @author Dominika Tkaczyk (d.tkaczyk@icm.edu.pl)
 */
public final class BwmetaFinalMetadataExtractionEvaluation {

    /** Default mode: a numbered header is printed for every document, followed by a summary. */
    private static final int MODE_DETAILED = 0;

    /** CSV mode (command line argument {@code csv}): one row per document, no summary. */
    private static final int MODE_CSV = 1;

    /**
     * Mode selected by the command line argument {@code q}: no per-document header is printed
     * here, only the summary. How it affects the per-field output is decided by the
     * {@code print(mode, ...)} implementations of the comparison classes.
     */
    private static final int MODE_QUIET = 2;

    /**
     * Runs the evaluation over all document pairs and prints the results to standard output.
     *
     * <p>A pair whose XML cannot be parsed is reported on standard error and skipped; it does
     * not contribute to any statistics and produces no output row.
     *
     * @param mode output mode: {@code 0} prints a header per document and a final summary,
     *             {@code 1} prints CSV rows (header line, then one row per document ending with
     *             the constant column {@code 1}) and no summary, any other value prints the
     *             summary without per-document headers
     * @param iter source of (original, extracted) file pairs
     * @throws IOException if one of the files cannot be opened or read
     * @throws ParserConfigurationException if the DOM parser cannot be configured
     * @throws XPathExpressionException if one of the XPath queries cannot be evaluated
     */
    public void evaluate(int mode, NlmIterator iter)
            throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
            SAXException, JDOMException, XPathExpressionException, TransformerException {

        // Non-validating parser that neither interprets namespaces nor fetches DTDs, so that
        // documents can be read offline and queried with plain (prefix-less) XPaths.
        javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

        List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
        List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
        List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
        List<ComparisonResult> authorsAffiliations = new ArrayList<ComparisonResult>();
        List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
        List<ComparisonResult> authorsEmails = new ArrayList<ComparisonResult>();
        List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
        List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
        List<ComparisonResult> journals = new ArrayList<ComparisonResult>();
        List<ComparisonResult> volumes = new ArrayList<ComparisonResult>();
        List<ComparisonResult> issues = new ArrayList<ComparisonResult>();
        List<ComparisonResult> pageRanges = new ArrayList<ComparisonResult>();
        List<ComparisonResult> years = new ArrayList<ComparisonResult>();
        List<ComparisonResult> dois = new ArrayList<ComparisonResult>();
        List<ComparisonResult> references = new ArrayList<ComparisonResult>();

        if (mode == MODE_CSV) {
            System.out.println("path,cerm_title,cerm_abstract,cerm_keywords,"
                    + "cerm_authors,cerm_affs,cerm_autaff,cerm_email,cerm_autemail,cerm_journal,cerm_volume,cerm_issue,"
                    + "cerm_pages,cerm_year,cerm_doi,cerm_refs,one");
        }

        int i = 0;
        for (NlmPair pair : iter) {
            // Parse BEFORE printing anything: in CSV mode the path is written without a line
            // terminator, so bailing out after it would glue the next row onto a broken one.
            org.w3c.dom.Document originalNlm;
            org.w3c.dom.Document extractedNlm;
            try {
                originalNlm = parseDocument(documentBuilder, pair.getOriginalNlm());
                extractedNlm = parseDocument(documentBuilder, pair.getExtractedNlm());
            } catch (SAXException ex) {
                System.err.println("Skipping pair (" + pair.getOriginalNlm().getPath() + ", "
                        + pair.getExtractedNlm().getPath() + "): cannot parse XML: " + ex.getMessage());
                continue;
            }

            // Only successfully parsed documents are numbered.
            i++;
            if (mode == MODE_DETAILED) {
                System.out.println("");
                System.out.println(">>>>>>>>> " + i);
                System.out.println(pair.getExtractedNlm().getPath());
            }
            if (mode == MODE_CSV) {
                System.out.print(pair.getOriginalNlm().getPath() + ",");
            }

            // Document's title
            MetadataSingle title = new MetadataSingle(originalNlm, "/bwmeta/element/name[not(@type)]", extractedNlm,
                    "/article/front/article-meta//article-title");
            title.setComp(EvaluationUtils.swComparator);
            titles.add(title);
            title.print(mode, "title");

            // Abstract
            MetadataSingle abstrakt = new MetadataSingle(originalNlm,
                    "/bwmeta/element/description[@type='abstract']", extractedNlm,
                    "/article/front/article-meta/abstract");
            abstrakt.setComp(EvaluationUtils.swComparator);
            abstracts.add(abstrakt);
            abstrakt.print(mode, "abstract");

            // Keywords
            MetadataList keyword = new MetadataList(originalNlm, "/bwmeta/element/tags[@type='keyword']/tag",
                    extractedNlm, "/article/front/article-meta/kwd-group/kwd");
            keywords.add(keyword);
            keyword.print(mode, "keywords");

            // Authors
            // Expected authors: only contributors that carry a canonical name are considered.
            // Nodes and names are kept in parallel lists (same index == same author) so the
            // relation sections below can reuse them without repeating the lookup.
            List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                    "/bwmeta/element/contributor[@role='author']");
            List<Node> namedExpectedAuthorNodes = new ArrayList<Node>();
            List<String> expectedAuthors = new ArrayList<String>();
            for (Node authorNode : expectedAuthorNodes) {
                String name = canonicalName(XMLTools.extractChildrenNodesFromNode(authorNode, "name"));
                if (name != null) {
                    namedExpectedAuthorNodes.add(authorNode);
                    expectedAuthors.add(name);
                }
            }

            // Extracted authors: the first string-name of every author contribution. Again the
            // node list and the name list are kept strictly parallel.
            List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                    "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][string-name]");
            List<Node> namedExtractedAuthorNodes = new ArrayList<Node>();
            List<String> extractedAuthors = new ArrayList<String>();
            for (Node authorNode : extractedAuthorNodes) {
                List<String> names = XMLTools.extractChildrenTextFromNode(authorNode, "string-name");
                if (!names.isEmpty()) {
                    namedExtractedAuthorNodes.add(authorNode);
                    extractedAuthors.add(names.get(0));
                }
            }

            MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
            author.setComp(EvaluationUtils.authorComparator);
            authors.add(author);
            author.print(mode, "author");

            // Affiliations (de-duplicated through a set before comparison)
            Set<String> expectedAffiliationsSet = Sets
                    .newHashSet(XMLTools.extractTextAsList(originalNlm, "/bwmeta/element/affiliation/text"));
            Set<String> extractedAffiliationsSet = Sets
                    .newHashSet(XMLTools.extractTextAsList(extractedNlm, "/article/front/article-meta//aff"));
            List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
            List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
            MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
            affiliation.setComp(EvaluationUtils.cosineComparator());
            affiliations.add(affiliation);
            affiliation.print(mode, "affiliation");

            // Author - Affiliation relation
            MetadataRelation authorAffiliation = new MetadataRelation();
            authorAffiliation.setComp1(EvaluationUtils.authorComparator);
            authorAffiliation.setComp2(EvaluationUtils.cosineComparator());

            // id -> affiliation text, ground truth side. Affiliations without an id or without
            // a text child cannot be referenced / compared and are left out.
            Map<String, String> expectedAffiliationMap = new HashMap<String, String>();
            for (Node expectedAffiliationNode : XMLTools.extractNodes(originalNlm, "/bwmeta/element/affiliation")) {
                String id = attributeValue(expectedAffiliationNode, "id");
                List<String> texts = XMLTools.extractChildrenTextFromNode(expectedAffiliationNode, "text");
                if (id != null && !texts.isEmpty()) {
                    expectedAffiliationMap.put(id, texts.get(0));
                }
            }

            // id -> affiliation text, extracted side.
            Map<String, String> extractedAffiliationMap = new HashMap<String, String>();
            for (Node extractedAffiliationNode : XMLTools.extractNodes(extractedNlm,
                    "/article/front/article-meta//aff[@id]")) {
                String id = attributeValue(extractedAffiliationNode, "id");
                if (id != null) {
                    extractedAffiliationMap.put(id, XMLTools.extractTextFromNode(extractedAffiliationNode));
                }
            }

            for (int k = 0; k < namedExpectedAuthorNodes.size(); k++) {
                String authorName = expectedAuthors.get(k);
                List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(namedExpectedAuthorNodes.get(k),
                        "affiliation-ref");
                for (Node xref : xrefs) {
                    // A missing "ref" attribute yields a null key, which is simply not in the map.
                    String aff = expectedAffiliationMap.get(attributeValue(xref, "ref"));
                    if (aff != null) {
                        authorAffiliation.addExpected(new StringRelation(authorName, aff));
                    }
                }
            }

            for (int k = 0; k < namedExtractedAuthorNodes.size(); k++) {
                String authorName = extractedAuthors.get(k);
                List<Node> xrefs = XMLTools.extractChildrenNodesFromNode(namedExtractedAuthorNodes.get(k), "xref");
                for (Node xref : xrefs) {
                    if (!"aff".equals(attributeValue(xref, "ref-type"))) {
                        continue;
                    }
                    String rid = attributeValue(xref, "rid");
                    if (rid == null) {
                        continue;
                    }
                    // One xref may point at several affiliations, separated by spaces.
                    for (String id : rid.split(" ")) {
                        String aff = extractedAffiliationMap.get(id);
                        if (aff != null) {
                            authorAffiliation.addExtracted(new StringRelation(authorName, aff));
                        }
                    }
                }
            }

            authorsAffiliations.add(authorAffiliation);
            authorAffiliation.print(mode, "author - affiliation");

            // Email addresses
            MetadataList email = new MetadataList(originalNlm,
                    "/bwmeta/element/contributor[@role='author']/attribute[@key='contact-email']/value",
                    extractedNlm,
                    "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email");
            email.setComp(EvaluationUtils.emailComparator);
            emails.add(email);
            email.print(mode, "email");

            // Author - Email relations
            MetadataRelation authorEmail = new MetadataRelation();
            authorEmail.setComp1(EvaluationUtils.authorComparator);
            authorEmail.setComp2(EvaluationUtils.emailComparator);

            for (int k = 0; k < namedExpectedAuthorNodes.size(); k++) {
                String authorName = expectedAuthors.get(k);
                List<Node> attributes = XMLTools.extractChildrenNodesFromNode(namedExpectedAuthorNodes.get(k),
                        "attribute");
                for (Node attribute : attributes) {
                    if (!"contact-email".equals(attributeValue(attribute, "key"))) {
                        continue;
                    }
                    List<String> values = XMLTools.extractChildrenTextFromNode(attribute, "value");
                    if (!values.isEmpty()) {
                        authorEmail.addExpected(new StringRelation(authorName, values.get(0)));
                    }
                }
            }
            for (int k = 0; k < namedExtractedAuthorNodes.size(); k++) {
                String authorName = extractedAuthors.get(k);
                for (String emailAddress : XMLTools.extractChildrenTextFromNode(namedExtractedAuthorNodes.get(k),
                        "email")) {
                    authorEmail.addExtracted(new StringRelation(authorName, emailAddress));
                }
            }
            authorsEmails.add(authorEmail);
            authorEmail.print(mode, "author - email");

            // Journal title
            MetadataSingle journal = new MetadataSingle(originalNlm,
                    "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Journal']/name[@type='canonical']",
                    extractedNlm, "/article/front/journal-meta/journal-title-group/journal-title");
            journal.setComp(EvaluationUtils.journalComparator);
            journals.add(journal);
            journal.print(mode, "journal title");

            // Volume
            MetadataSingle volume = new MetadataSingle(originalNlm,
                    "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Volume']/name[@type='canonical']",
                    extractedNlm, "/article/front/article-meta/volume");
            volumes.add(volume);
            volume.print(mode, "volume");

            // Issue
            MetadataSingle issue = new MetadataSingle(originalNlm,
                    "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Number']/name[@type='canonical']",
                    extractedNlm, "/article/front/article-meta/issue");
            issues.add(issue);
            issue.print(mode, "issue");

            // Pages range
            // The ground truth stores the whole range in one "position" attribute ("a-b"),
            // whereas NLM has separate fpage/lpage elements; both are normalised to "a--b".
            MetadataSingle fPage = new MetadataSingle(originalNlm,
                    "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position",
                    extractedNlm, "/article/front/article-meta/fpage");
            MetadataSingle lPage = new MetadataSingle(originalNlm,
                    "/bwmeta/element/structure/current[@level='bwmeta1.level.hierarchy_Journal_Article']/@position",
                    extractedNlm, "/article/front/article-meta/lpage");
            String expRange = fPage.hasExpected() ? fPage.getExpectedValue().replaceAll("-", "--") : "";

            String extrRange = fPage.hasExtracted() && lPage.hasExtracted()
                    ? fPage.getExtractedValue() + "--" + lPage.getExtractedValue()
                    : "";
            MetadataSingle pageRange = new MetadataSingle(expRange, extrRange);
            pageRanges.add(pageRange);
            pageRange.print(mode, "pages");

            // Publication date
            List<String> expectedPubDate = XMLTools.extractTextAsList(originalNlm,
                    "/bwmeta/element/structure/ancestor[@level='bwmeta1.level.hierarchy_Journal_Year']/name[@type='canonical']");
            expectedPubDate = EvaluationUtils.removeLeadingZerosFromDate(expectedPubDate);
            List<String> extractedPubDate = XMLTools.extractTextAsList(extractedNlm,
                    "/article/front/article-meta/pub-date");
            extractedPubDate = EvaluationUtils.removeLeadingZerosFromDate(extractedPubDate);

            MetadataSingle year = new MetadataSingle(StringUtils.join(expectedPubDate, "---"),
                    StringUtils.join(extractedPubDate, "---"));
            year.setComp(EvaluationUtils.yearComparator);
            years.add(year);
            year.print(mode, "year");

            // DOI
            MetadataSingle doi = new MetadataSingle(originalNlm,
                    "/bwmeta/element/id[@scheme='bwmeta1.id-class.DOI']/@value", extractedNlm,
                    "/article/front/article-meta/article-id[@pub-id-type='doi']");
            dois.add(doi);
            doi.print(mode, "DOI");

            // References
            List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm,
                    "//relation[@type='reference-to']/attribute[@key='reference-text']/value");
            List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm, "//ref-list/ref");

            List<String> originalRefs = new ArrayList<String>();
            List<String> extractedRefs = new ArrayList<String>();
            for (Node originalRefNode : originalRefNodes) {
                originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
            }
            for (Node extractedRefNode : extractedRefNodes) {
                extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
            }

            MetadataList refs = new MetadataList(originalRefs, extractedRefs);
            refs.setComp(EvaluationUtils.cosineComparator(0.6));

            references.add(refs);
            refs.print(mode, "references");

            if (mode == MODE_CSV) {
                // Value of the constant "one" column; it also terminates the CSV row.
                System.out.println("1");
            }
        }

        if (mode != MODE_CSV) {
            System.out.println("==== Summary (" + iter.size() + " docs)====");

            PrecisionRecall titlePR = new PrecisionRecall().build(titles);
            titlePR.print("Title");

            PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
            abstractPR.print("Abstract");

            PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
            keywordsPR.print("Keywords");

            PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
            authorsPR.print("Authors");

            PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
            affiliationsPR.print("Affiliations");

            PrecisionRecall authorsAffiliationsPR = new PrecisionRecall().build(authorsAffiliations);
            authorsAffiliationsPR.print("Author - affiliation");

            PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
            emailsPR.print("Emails");

            PrecisionRecall authorsEmailsPR = new PrecisionRecall().build(authorsEmails);
            authorsEmailsPR.print("Author - email");

            PrecisionRecall journalPR = new PrecisionRecall().build(journals);
            journalPR.print("Journal");

            PrecisionRecall volumePR = new PrecisionRecall().build(volumes);
            volumePR.print("Volume");

            PrecisionRecall issuePR = new PrecisionRecall().build(issues);
            issuePR.print("Issue");

            PrecisionRecall pageRangePR = new PrecisionRecall().build(pageRanges);
            pageRangePR.print("Pages");

            PrecisionRecall yearPR = new PrecisionRecall().build(years);
            yearPR.print("Year");

            PrecisionRecall doiPR = new PrecisionRecall().build(dois);
            doiPR.print("DOI");

            PrecisionRecall refsPR = new PrecisionRecall().build(references);
            refsPR.print("References");

            // The two relation scores (author - affiliation, author - email) are printed above
            // but are not part of the averages.
            List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                    abstractPR, keywordsPR, journalPR, volumePR, issuePR, pageRangePR, yearPR, doiPR, refsPR);

            double avgPrecision = 0;
            double avgRecall = 0;
            double avgF1 = 0;
            for (PrecisionRecall result : results) {
                avgPrecision += result.getPrecision();
                avgRecall += result.getRecall();
                avgF1 += result.getF1();
            }
            avgPrecision /= results.size();
            avgRecall /= results.size();
            avgF1 /= results.size();

            System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
            System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
            System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
        }
    }

    /**
     * Parses one XML file into a DOM document and always releases the underlying file handle.
     *
     * @param documentBuilder configured parser to use
     * @param file file to read
     * @return the parsed document
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content is not parseable XML
     */
    private static org.w3c.dom.Document parseDocument(javax.xml.parsers.DocumentBuilder documentBuilder,
            java.io.File file) throws IOException, SAXException {
        FileInputStream input = new FileInputStream(file);
        try {
            return documentBuilder.parse(input);
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // The stream was only read from; a failure to close must not replace the
                // parse result or the parse exception that is already on its way out.
            }
        }
    }

    /**
     * Returns the value of the named attribute of a DOM node.
     *
     * @param node node to inspect
     * @param attributeName name of the attribute
     * @return the attribute value, or {@code null} if the node has no such attribute
     */
    private static String attributeValue(Node node, String attributeName) {
        org.w3c.dom.NamedNodeMap attributes = node.getAttributes();
        if (attributes == null) {
            return null;
        }
        Node attribute = attributes.getNamedItem(attributeName);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Picks the text of the first name node marked {@code type="canonical"}.
     *
     * @param nameNodes the {@code name} children of a contributor
     * @return the canonical name, or {@code null} if none of the nodes is canonical
     */
    private static String canonicalName(List<Node> nameNodes) {
        for (Node nameNode : nameNodes) {
            if ("canonical".equals(attributeValue(nameNode, "type"))) {
                return nameNode.getTextContent();
            }
        }
        return null;
    }

    /**
     * Command line entry point.
     *
     * <p>Expects three arguments (input directory, extension of the original files, extension
     * of the extracted files) and an optional fourth one selecting the output mode:
     * {@code csv} or {@code q}. Any other fourth argument leaves the default mode in place.
     *
     * @param args command line arguments
     */
    public static void main(String[] args)
            throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
            SAXException, JDOMException, XPathExpressionException, TransformerException {
        if (args.length != 3 && args.length != 4) {
            System.out.println("Usage: BwmetaFinalMetadataExtractionEvaluation"
                    + " <input dir> <orig extension> <extract extension> [csv|q]");
            return;
        }
        String directory = args[0];
        String origExt = args[1];
        String extrExt = args[2];

        int mode = MODE_DETAILED;
        if (args.length == 4) {
            if ("csv".equals(args[3])) {
                mode = MODE_CSV;
            } else if ("q".equals(args[3])) {
                mode = MODE_QUIET;
            }
        }

        BwmetaFinalMetadataExtractionEvaluation e = new BwmetaFinalMetadataExtractionEvaluation();
        NlmIterator iter = new NlmIterator(directory, origExt, extrExt);
        e.evaluate(mode, iter);
    }

}