pl.edu.icm.cermine.evaluation.ParsCitFinalMetadataExtractionEvaluation.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.cermine.evaluation.ParsCitFinalMetadataExtractionEvaluation.java

Source

/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.evaluation;

import pl.edu.icm.cermine.evaluation.tools.MetadataSingle;
import pl.edu.icm.cermine.evaluation.tools.MetadataList;
import pl.edu.icm.cermine.evaluation.tools.EvaluationUtils;
import pl.edu.icm.cermine.evaluation.tools.PrecisionRecall;
import pl.edu.icm.cermine.evaluation.tools.NlmPair;
import pl.edu.icm.cermine.evaluation.tools.ComparisonResult;
import pl.edu.icm.cermine.evaluation.tools.NlmIterator;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import org.apache.commons.lang.StringUtils;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import pl.edu.icm.cermine.evaluation.tools.*;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.tools.XMLTools;

/**
 * Command-line evaluation that compares metadata read from a ground-truth NLM
 * file with metadata read from an extracted XML file. The extracted file is
 * queried through XPaths addressing {@code algorithm} elements named
 * {@code ParsHed} (header metadata) and {@code ParsCit} (references).
 *
 * @author Pawel Szostek (p.szostek@icm.edu.pl)
 * @author Dominika Tkaczyk (d.tkaczyk@icm.edu.pl)
 */
public final class ParsCitFinalMetadataExtractionEvaluation {

    /** Default mode: a numbered header with the extracted file path is printed per document. */
    private static final int MODE_VERBOSE = 0;

    /** Selected by the {@code csv} argument: one CSV row per document, no summary. */
    private static final int MODE_CSV = 1;

    /**
     * Selected by the {@code q} argument: no per-document header is printed here.
     * NOTE(review): the mode is also forwarded to the {@code print} methods of the
     * comparison results; what they emit in this mode is not visible from this class.
     */
    private static final int MODE_QUIET = 2;

    /**
     * Evaluates every document pair supplied by the iterator and prints the results
     * to standard output.
     *
     * <p>Pairs whose original or extracted file is not well-formed XML are skipped
     * and reported on standard error; nothing is written to standard output for them,
     * so CSV rows are never left half-written.</p>
     *
     * @param mode output mode: 0 (verbose), 1 (CSV) or 2 (quiet)
     * @param iter source of (original, extracted) document pairs
     * @throws IOException if a document file cannot be opened or read
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws SAXException declared for backward compatibility; parse failures of
     *         individual documents are handled inside the loop
     * @throws JDOMException declared for backward compatibility with existing callers
     * @throws AnalysisException declared for backward compatibility with existing callers
     * @throws TransformationException declared for backward compatibility with existing callers
     * @throws XPathExpressionException declared for backward compatibility with existing callers
     * @throws TransformerException declared for backward compatibility with existing callers
     */
    public void evaluate(int mode, NlmIterator iter)
            throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
            SAXException, JDOMException, XPathExpressionException, TransformerException {

        // Validation and DTD loading are switched off so that documents referencing
        // external DTDs can be parsed without fetching them.
        javax.xml.parsers.DocumentBuilderFactory dbf = javax.xml.parsers.DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        javax.xml.parsers.DocumentBuilder documentBuilder = dbf.newDocumentBuilder();

        List<ComparisonResult> titles = new ArrayList<ComparisonResult>();
        List<ComparisonResult> authors = new ArrayList<ComparisonResult>();
        List<ComparisonResult> affiliations = new ArrayList<ComparisonResult>();
        List<ComparisonResult> emails = new ArrayList<ComparisonResult>();
        List<ComparisonResult> abstracts = new ArrayList<ComparisonResult>();
        List<ComparisonResult> keywords = new ArrayList<ComparisonResult>();
        List<ComparisonResult> references = new ArrayList<ComparisonResult>();

        if (mode == MODE_CSV) {
            System.out.println("path,pcit_title,pcit_abstract,pcit_keywords,"
                    + "pcit_authors,pcit_affs,pcit_email,pcit_refs,one");
        }

        // Counts only the documents that were parsed successfully.
        int i = 0;
        for (NlmPair pair : iter) {
            // Parse before printing anything, so a malformed document does not leave
            // a dangling header or an unterminated CSV row on standard output.
            org.w3c.dom.Document originalNlm;
            org.w3c.dom.Document extractedNlm;
            try {
                originalNlm = parseDocument(documentBuilder, pair.getOriginalNlm());
                extractedNlm = parseDocument(documentBuilder, pair.getExtractedNlm());
            } catch (SAXException ex) {
                System.err.println("Skipping " + pair.getOriginalNlm().getPath()
                        + " (XML parse error): " + ex.getMessage());
                continue;
            }

            i++;
            if (mode == MODE_VERBOSE) {
                System.out.println("");
                System.out.println(">>>>>>>>> " + i);
                System.out.println(pair.getExtractedNlm().getPath());
            }
            if (mode == MODE_CSV) {
                System.out.print(pair.getOriginalNlm().getPath() + ",");
            }

            // Title
            String expectedTitle = XMLTools.extractTextFromNode(originalNlm,
                    "/article/front/article-meta//article-title");
            List<Node> extractedTitleNodes = XMLTools.extractNodes(extractedNlm,
                    "//algorithm[@name='ParsHed']//title");
            String extractedTitle = selectMostConfidentText(extractedTitleNodes);

            MetadataSingle title = new MetadataSingle(expectedTitle, extractedTitle);
            title.setComp(EvaluationUtils.swComparator);
            titles.add(title);
            title.print(mode, "title");

            // Abstract
            String expectedAbstract = XMLTools.extractTextFromNode(originalNlm,
                    "/article/front/article-meta/abstract");
            List<Node> extractedAbstractNodes = XMLTools.extractNodes(extractedNlm,
                    "//algorithm[@name='ParsHed']//abstract");
            String extractedAbstract = selectMostConfidentText(extractedAbstractNodes);

            MetadataSingle abstrakt = new MetadataSingle(expectedAbstract, extractedAbstract);
            abstrakt.setComp(EvaluationUtils.swComparator);
            abstracts.add(abstrakt);
            abstrakt.print(mode, "abstract");

            // Keywords
            MetadataList keyword = new MetadataList(originalNlm, "/article/front/article-meta//kwd", extractedNlm,
                    "//algorithm[@name='ParsHed']//keyword");
            keywords.add(keyword);
            keyword.print(mode, "keywords");

            // Authors: the expected name is "<given names> <surnames>" of the first <name> child.
            List<Node> expectedAuthorNodes = XMLTools.extractNodes(originalNlm,
                    "/article/front/article-meta/contrib-group/contrib[@contrib-type='author'][name]");

            List<String> expectedAuthors = new ArrayList<String>();
            for (Node authorNode : expectedAuthorNodes) {
                List<Node> names = XMLTools.extractChildrenNodesFromNode(authorNode, "name");
                if (names.isEmpty()) {
                    continue;
                }
                Node name = names.get(0);
                List<String> givenNames = XMLTools.extractChildrenTextFromNode(name, "given-names");
                List<String> surnames = XMLTools.extractChildrenTextFromNode(name, "surname");
                expectedAuthors.add(StringUtils.join(givenNames, " ") + " " + StringUtils.join(surnames, " "));
            }

            List<Node> extractedAuthorNodes = XMLTools.extractNodes(extractedNlm,
                    "//algorithm[@name='ParsHed']//author");

            List<String> extractedAuthors = new ArrayList<String>();
            for (Node authorNode : extractedAuthorNodes) {
                extractedAuthors.add(XMLTools.extractTextFromNode(authorNode));
            }

            MetadataList author = new MetadataList(expectedAuthors, extractedAuthors);
            author.setComp(EvaluationUtils.authorComparator);
            authors.add(author);
            author.print(mode, "author");

            // Affiliations: passed through a set so that duplicated strings are compared once.
            Set<String> expectedAffiliationsSet = Sets
                    .newHashSet(XMLTools.extractTextAsList(originalNlm, "/article/front/article-meta//aff"));
            Set<String> extractedAffiliationsSet = Sets.newHashSet(
                    XMLTools.extractTextAsList(extractedNlm, "//algorithm[@name='ParsHed']//affiliation"));
            List<String> expectedAffiliations = Lists.newArrayList(expectedAffiliationsSet);
            List<String> extractedAffiliations = Lists.newArrayList(extractedAffiliationsSet);
            MetadataList affiliation = new MetadataList(expectedAffiliations, extractedAffiliations);
            affiliation.setComp(EvaluationUtils.cosineComparator());
            affiliations.add(affiliation);
            affiliation.print(mode, "affiliation");

            // Email addresses
            MetadataList email = new MetadataList(originalNlm,
                    "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']//email",
                    extractedNlm, "//algorithm[@name='ParsHed']//email");
            email.setComp(EvaluationUtils.emailComparator);
            emails.add(email);
            email.print(mode, "email");

            // References
            List<Node> originalRefNodes = XMLTools.extractNodes(originalNlm, "//ref-list/ref");
            List<Node> extractedRefNodes = XMLTools.extractNodes(extractedNlm,
                    "//algorithm[@name='ParsCit']//citationList/citation/rawString");

            List<String> originalRefs = new ArrayList<String>();
            List<String> extractedRefs = new ArrayList<String>();
            for (Node originalRefNode : originalRefNodes) {
                originalRefs.add(XMLTools.extractTextFromNode(originalRefNode).trim());
            }
            for (Node extractedRefNode : extractedRefNodes) {
                extractedRefs.add(XMLTools.extractTextFromNode(extractedRefNode).trim());
            }

            MetadataList refs = new MetadataList(originalRefs, extractedRefs);
            refs.setComp(EvaluationUtils.cosineComparator(0.6));

            references.add(refs);
            refs.print(mode, "references");

            if (mode == MODE_CSV) {
                // Terminates the CSV row; fills the trailing "one" column of the header.
                System.out.println("1");
            }
        }

        if (mode != MODE_CSV) {
            System.out.println("==== Summary (" + iter.size() + " docs)====");

            PrecisionRecall titlePR = new PrecisionRecall().build(titles);
            titlePR.print("Title");

            PrecisionRecall abstractPR = new PrecisionRecall().build(abstracts);
            abstractPR.print("Abstract");

            PrecisionRecall keywordsPR = new PrecisionRecall().build(keywords);
            keywordsPR.print("Keywords");

            PrecisionRecall authorsPR = new PrecisionRecall().build(authors);
            authorsPR.print("Authors");

            PrecisionRecall affiliationsPR = new PrecisionRecall().build(affiliations);
            affiliationsPR.print("Affiliations");

            PrecisionRecall emailsPR = new PrecisionRecall().build(emails);
            emailsPR.print("Emails");

            PrecisionRecall refsPR = new PrecisionRecall().build(references);
            refsPR.print("References");

            List<PrecisionRecall> results = Lists.newArrayList(titlePR, authorsPR, affiliationsPR, emailsPR,
                    abstractPR, keywordsPR, refsPR);

            // Unweighted mean over the seven metadata categories.
            double avgPrecision = 0;
            double avgRecall = 0;
            double avgF1 = 0;
            for (PrecisionRecall result : results) {
                avgPrecision += result.getPrecision();
                avgRecall += result.getRecall();
                avgF1 += result.getF1();
            }
            avgPrecision /= results.size();
            avgRecall /= results.size();
            avgF1 /= results.size();

            System.out.printf("Average precision\t\t%4.2f\n", 100 * avgPrecision);
            System.out.printf("Average recall\t\t%4.2f\n", 100 * avgRecall);
            System.out.printf("Average F1 score\t\t%4.2f\n", 100 * avgF1);
        }
    }

    /**
     * Parses one XML file into a DOM document and always closes the underlying stream.
     *
     * @param documentBuilder the configured parser to use
     * @param file the file to parse
     * @return the parsed document
     * @throws IOException if the file cannot be opened, read or closed
     * @throws SAXException if the file is not well-formed XML
     */
    private static org.w3c.dom.Document parseDocument(javax.xml.parsers.DocumentBuilder documentBuilder,
            java.io.File file) throws IOException, SAXException {
        FileInputStream input = new FileInputStream(file);
        try {
            return documentBuilder.parse(input);
        } finally {
            input.close();
        }
    }

    /**
     * Picks the text of the node carrying the highest positive {@code confidence}
     * attribute. When no node has a confidence above zero, the text of the first
     * node is used.
     *
     * @param nodes candidate nodes, in document order
     * @return the selected text, or {@code null} when the list is empty
     */
    private static String selectMostConfidentText(List<Node> nodes) {
        String text = null;
        double bestConfidence = 0;
        for (Node node : nodes) {
            if (text == null) {
                // Fallback for the case where no candidate carries a usable confidence.
                text = node.getTextContent();
            }
            Node conf = node.getAttributes().getNamedItem("confidence");
            if (conf != null) {
                double confidence = Double.parseDouble(conf.getNodeValue());
                if (confidence > bestConfidence) {
                    bestConfidence = confidence;
                    text = node.getTextContent();
                }
            }
        }
        return text;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code <input dir> <orig extension> <extract extension> [csv|q]};
     *        the optional fourth argument selects CSV or quiet output, any other
     *        value leaves the default verbose mode
     */
    public static void main(String[] args)
            throws AnalysisException, IOException, TransformationException, ParserConfigurationException,
            SAXException, JDOMException, XPathExpressionException, TransformerException {
        if (args.length != 3 && args.length != 4) {
            System.out.println(
                    "Usage: ParsCitFinalMetadataExtractionEvaluation <input dir> <orig extension> "
                    + "<extract extension> [csv|q]");
            return;
        }
        String directory = args[0];
        String origExt = args[1];
        String extrExt = args[2];
        int mode = MODE_VERBOSE;
        if (args.length == 4) {
            if ("csv".equals(args[3])) {
                mode = MODE_CSV;
            } else if ("q".equals(args[3])) {
                mode = MODE_QUIET;
            }
        }

        ParsCitFinalMetadataExtractionEvaluation e = new ParsCitFinalMetadataExtractionEvaluation();
        NlmIterator iter = new NlmIterator(directory, origExt, extrExt);
        e.evaluate(mode, iter);
    }

}