pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.cermine.pubmed.PubmedXMLGenerator.java

Source

/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.pubmed;

import java.io.*;
import java.util.Map.Entry;
import java.util.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import pl.edu.icm.cermine.PdfBxStructureExtractor;
import pl.edu.icm.cermine.evaluation.tools.CosineDistance;
import pl.edu.icm.cermine.evaluation.tools.SmithWatermanDistance;
import pl.edu.icm.cermine.evaluation.tools.StringTools;
import pl.edu.icm.cermine.evaluation.tools.XMLTools;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.zoneclassification.tools.ZoneLocaliser;
import pl.edu.icm.cermine.structure.model.*;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;

public class PubmedXMLGenerator {

    /**
     * Associates a zone label with the tokens of the entry it was derived from
     * and a similarity score between that entry and a zone.
     *
     * <p>Equality and hashing take only the label and the score into account;
     * the entry tokens are not part of an instance's identity.
     */
    private static class LabelTrio {

        private BxZoneLabel label;
        private Double alignment;
        private List<String> entryTokens;

        /**
         * @param label      label associated with the entry
         * @param tokens     tokens of the entry text
         * @param similarity similarity score, stored as the alignment value
         */
        public LabelTrio(BxZoneLabel label, List<String> tokens, Double similarity) {
            this.label = label;
            this.entryTokens = tokens;
            this.alignment = similarity;
        }

        /** Hash built from the label and the alignment score only. */
        @Override
        public int hashCode() {
            int labelHash = (label == null) ? 0 : label.hashCode();
            int alignmentHash = (alignment == null) ? 0 : alignment.hashCode();
            return 31 * (31 + labelHash) + alignmentHash;
        }

        /**
         * Two trios are equal when they are of the same class, refer to the
         * same label and carry equal (or both null) alignment scores.
         */
        @Override
        public boolean equals(Object obj) {
            if (obj == this) {
                return true;
            }
            if (obj == null || obj.getClass() != getClass()) {
                return false;
            }
            LabelTrio that = (LabelTrio) obj;
            if (that.label != label) {
                return false;
            }
            if (alignment == null) {
                return that.alignment == null;
            }
            return alignment.equals(that.alignment);
        }
    }

    /** When true, printVerbose/printlnVerbose write their argument to standard output; off by default. */
    private boolean verbose = false;

    /**
     * Turns the diagnostic output of this generator on or off.
     *
     * @param verbose {@code true} to make the verbose print helpers write to standard output
     */
    private void setVerbose(boolean verbose) {
        this.verbose = verbose;
    }

    /**
     * Writes the given text followed by a line break to standard output,
     * but only while verbose mode is enabled.
     *
     * @param string text to print
     */
    private void printlnVerbose(String string) {
        if (!verbose) {
            return;
        }
        System.out.println(string);
    }

    /**
     * Writes the given text to standard output without a trailing line break,
     * but only while verbose mode is enabled.
     *
     * @param string text to print
     */
    private void printVerbose(String string) {
        if (!verbose) {
            return;
        }
        System.out.print(string);
    }

    public BxDocument generateTrueViz(InputStream pdfStream, InputStream nlmStream)
            throws AnalysisException, ParserConfigurationException, SAXException, IOException,
            XPathExpressionException, TransformationException {
        XPath xpath = XPathFactory.newInstance().newXPath();
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        DocumentBuilder builder = dbf.newDocumentBuilder();
        Document domDoc = builder.parse(nlmStream);

        PdfBxStructureExtractor structureExtractor = new PdfBxStructureExtractor();
        BxDocument bxDoc = structureExtractor.extractStructure(pdfStream);
        Integer bxDocLen = bxDoc.asZones().size();

        SmartHashMap entries = new SmartHashMap();

        //abstract
        Node abstractNode = (Node) xpath.evaluate("/article/front/article-meta/abstract", domDoc,
                XPathConstants.NODE);
        String abstractString = XMLTools.extractTextFromNode(abstractNode);
        entries.putIf("Abstract " + abstractString, BxZoneLabel.MET_ABSTRACT);
        entries.putIf("Abstract", BxZoneLabel.MET_ABSTRACT);

        //title
        String titleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-title",
                domDoc, XPathConstants.STRING);
        entries.putIf(titleString, BxZoneLabel.MET_TITLE);
        String subtitleString = (String) xpath.evaluate("/article/front/article-meta/title-group/article-subtitle",
                domDoc, XPathConstants.STRING);
        entries.putIf(subtitleString, BxZoneLabel.MET_TITLE);
        //journal title
        String journalTitleString = (String) xpath.evaluate("/article/front/journal-meta/journal-title", domDoc,
                XPathConstants.STRING);
        if (journalTitleString == null || journalTitleString.isEmpty()) {
            journalTitleString = (String) xpath.evaluate(
                    "/article/front/journal-meta/journal-title-group/journal-title", domDoc, XPathConstants.STRING);
        }
        entries.putIf(journalTitleString, BxZoneLabel.MET_BIB_INFO);

        //journal publisher
        String journalPublisherString = (String) xpath
                .evaluate("/article/front/journal-meta/publisher/publisher-name", domDoc, XPathConstants.STRING);
        entries.putIf(journalPublisherString, BxZoneLabel.MET_BIB_INFO);
        String journalPublisherIdString = (String) xpath.evaluate(
                "/article/front/journal-meta/journal-id[@journal-id-type='publisher-id']", domDoc,
                XPathConstants.STRING);
        entries.putIf(journalPublisherIdString, BxZoneLabel.MET_BIB_INFO);

        //journal issn
        String journalISSNString = (String) xpath.evaluate("/article/front/journal-meta/issn", domDoc,
                XPathConstants.STRING);
        entries.putIf(journalISSNString, BxZoneLabel.MET_BIB_INFO);

        //copyright/permissions
        String permissionsString = XMLTools.extractTextFromNode(
                (Node) xpath.evaluate("/article/front/article-meta/permissions", domDoc, XPathConstants.NODE));
        entries.putIf(permissionsString, BxZoneLabel.MET_COPYRIGHT);

        //license
        Node licenseNode = (Node) xpath.evaluate("/article/front/article-meta/license", domDoc,
                XPathConstants.NODE);
        String licenseString = (String) XMLTools.extractTextFromNode(licenseNode);
        entries.putIf(licenseString, BxZoneLabel.MET_COPYRIGHT);

        //article type
        NodeList articleTypeNodes = (NodeList) xpath.evaluate("/article/@article-type", domDoc,
                XPathConstants.NODESET);
        List<String> articleTypeStrings = XMLTools.extractTextAsList(articleTypeNodes);
        Node articleTypeNode = (Node) xpath.evaluate("/article/front/article-meta/article-categories/subj-group",
                domDoc, XPathConstants.NODE);
        articleTypeStrings.add(XMLTools.extractTextFromNode(articleTypeNode));

        entries.putIf(articleTypeStrings, BxZoneLabel.MET_TYPE);

        //received date
        List<String> receivedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
                "/article/front/article-meta/history/date[@date-type='received']", domDoc, XPathConstants.NODE));
        if (!receivedDate.isEmpty() && receivedDate.size() >= 3) {
            for (String date : StringTools.produceDates(receivedDate)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }

        //accepted date
        List<String> acceptedDate = XMLTools.extractChildrenAsTextList((Node) xpath.evaluate(
                "/article/front/article-meta/history/date[@date-type='accepted']", domDoc, XPathConstants.NODE));
        if (!acceptedDate.isEmpty() && acceptedDate.size() >= 3) {
            for (String date : StringTools.produceDates(acceptedDate)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }

        //publication date
        List<String> pubdateString;
        if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
                .getLength() > 1) {
            Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='epub']",
                    domDoc, XPathConstants.NODE);
            pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
        } else {
            Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='collection']",
                    domDoc, XPathConstants.NODE);
            pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
        }
        if (pubdateString != null && pubdateString.size() >= 3) {
            for (String date : StringTools.produceDates(pubdateString)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }
        pubdateString.clear();
        if (((NodeList) xpath.evaluate("/article/front/article-meta/pub-date", domDoc, XPathConstants.NODESET))
                .getLength() > 1) {
            Node pubdateNode = (Node) xpath.evaluate("/article/front/article-meta/pub-date[@pub-type='ppub']",
                    domDoc, XPathConstants.NODE);
            pubdateString = XMLTools.extractChildrenAsTextList(pubdateNode);
        }
        if (pubdateString != null && pubdateString.size() >= 3) {
            for (String date : StringTools.produceDates(pubdateString)) {
                entries.putIf(date, BxZoneLabel.MET_DATES);
            }
        }

        String extLink = (String) xpath.evaluate(
                "/article/front/article-meta/ext-link[@ext-link-type='uri']/xlink:href", domDoc,
                XPathConstants.STRING);
        printlnVerbose(extLink);
        entries.putIf(extLink, BxZoneLabel.MET_ACCESS_DATA);
        //keywords
        Node keywordsNode = (Node) xpath.evaluate("/article/front/article-meta/kwd-group", domDoc,
                XPathConstants.NODE);
        String keywordsString = XMLTools.extractTextFromNode(keywordsNode);
        entries.putIf(keywordsString, BxZoneLabel.MET_KEYWORDS);

        //DOI
        String doiString = (String) xpath.evaluate("/article/front/article-meta/article-id[@pub-id-type='doi']",
                domDoc, XPathConstants.STRING);
        entries.putIf("DOI " + doiString, BxZoneLabel.MET_BIB_INFO);

        //volume
        String volumeString = (String) xpath.evaluate("/article/front/article-meta/volume", domDoc,
                XPathConstants.STRING);
        entries.putIf("volume " + volumeString, BxZoneLabel.MET_BIB_INFO);
        entries.putIf("vol " + volumeString, BxZoneLabel.MET_BIB_INFO);

        //issue
        String issueString = (String) xpath.evaluate("/article/front/article-meta/issue", domDoc,
                XPathConstants.STRING);
        entries.putIf("number " + issueString, BxZoneLabel.MET_BIB_INFO);

        entries.putIf("journal", BxZoneLabel.MET_BIB_INFO);
        entries.putIf("et al", BxZoneLabel.MET_BIB_INFO);

        List<String> authorNames = new ArrayList<String>();
        List<String> authorEmails = new ArrayList<String>();
        List<String> authorAffiliations = new ArrayList<String>();
        List<String> editors = new ArrayList<String>();

        //pages
        String fPage = (String) xpath.evaluate("/article/front/article-meta/fpage", domDoc, XPathConstants.STRING);
        String lPage = (String) xpath.evaluate("/article/front/article-meta/lpage", domDoc, XPathConstants.STRING);
        entries.putIf("pages " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf("pp " + fPage + " " + lPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf(fPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf(lPage, BxZoneLabel.MET_BIB_INFO);
        entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
        entries.putIf(lPage, BxZoneLabel.OTH_PAGE_NUMBER);
        try {
            int f = Integer.valueOf(fPage);
            int l = Integer.valueOf(lPage);
            while (f < l) {
                f++;
                entries.putIf(String.valueOf(f), BxZoneLabel.OTH_PAGE_NUMBER);
            }
        } catch (NumberFormatException ex) {
        }

        entries.putIf("page of", BxZoneLabel.OTH_PAGE_NUMBER);

        //editors
        NodeList editorNodes = (NodeList) xpath.evaluate(
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='editor']", domDoc,
                XPathConstants.NODESET);
        for (int nodeIdx = 0; nodeIdx < editorNodes.getLength(); ++nodeIdx) {
            String editorString = XMLTools.extractTextFromNode(editorNodes.item(nodeIdx));
            editors.add(editorString);
        }
        entries.putIf(StringTools.joinStrings(editors), BxZoneLabel.MET_EDITOR);

        NodeList authorsResult = (NodeList) xpath.evaluate(
                "/article/front/article-meta/contrib-group/contrib[@contrib-type='author']", domDoc,
                XPathConstants.NODESET);
        for (int nodeIdx = 0; nodeIdx < authorsResult.getLength(); ++nodeIdx) {
            Node curNode = authorsResult.item(nodeIdx);
            //author names
            String name = (String) xpath.evaluate("name/given-names", curNode, XPathConstants.STRING);
            String surname = (String) xpath.evaluate("name/surname", curNode, XPathConstants.STRING);
            //author affiliation
            List<String> aff = XMLTools.extractTextAsList((NodeList) xpath
                    .evaluate("/article/front/article-meta/contrib-group/aff", domDoc, XPathConstants.NODESET));

            //author correspondence
            String email;
            try {
                email = (String) xpath.evaluate("address/email", curNode, XPathConstants.STRING);
            } catch (XPathExpressionException e) {
                email = "";
            }
            if (email.isEmpty()) {
                try {
                    email = (String) xpath.evaluate("email", curNode, XPathConstants.STRING);
                } catch (XPathExpressionException e) {
                    //yaaay, probably there is no e-mail at all! => do nothing
                }
            }
            if (!email.isEmpty()) {
                authorEmails.add(email);
            }
            if (!aff.isEmpty()) {
                authorAffiliations.addAll(aff);
            }
            authorNames.add(name + " " + surname);
        }
        entries.putIf(StringTools.joinStrings(authorNames), BxZoneLabel.MET_AUTHOR);

        //authors' affiliations
        NodeList affNodes = (NodeList) xpath.evaluate("/article/front/article-meta/aff", domDoc,
                XPathConstants.NODESET);
        authorAffiliations.addAll(XMLTools.extractTextAsList(affNodes));
        entries.putIf(authorAffiliations, BxZoneLabel.MET_AFFILIATION);

        //correspondence again
        NodeList correspNodes = (NodeList) xpath.evaluate("/article/front/article-meta/author-notes/corresp",
                domDoc, XPathConstants.NODESET);
        authorEmails.add(XMLTools.extractTextFromNodes(correspNodes));
        entries.putIf(authorEmails, BxZoneLabel.MET_CORRESPONDENCE);

        //author notes
        Node notesNode = (Node) xpath.evaluate("/article/front/article-meta/author-notes/corresp/fn", domDoc,
                XPathConstants.NODE);
        String notesString = XMLTools.extractTextFromNode(notesNode);
        entries.putIf(notesString, BxZoneLabel.MET_CORRESPONDENCE);
        notesString = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("/article/back/notes", domDoc, XPathConstants.NODE));

        //article body
        NodeList paragraphNodes = (NodeList) xpath.evaluate("/article/body//p", domDoc, XPathConstants.NODESET);
        List<String> paragraphStrings = XMLTools.extractTextAsList(paragraphNodes);
        entries.putIf(paragraphStrings, BxZoneLabel.BODY_CONTENT);

        NodeList appNodes = (NodeList) xpath.evaluate("/article/back/app-group//p", domDoc, XPathConstants.NODESET);
        String appStrings = XMLTools.extractTextFromNodes(appNodes);
        entries.putIf(appStrings, BxZoneLabel.BODY_CONTENT);

        //section titles
        NodeList sectionTitleNodes = (NodeList) xpath.evaluate("/article/body//title", domDoc,
                XPathConstants.NODESET);
        List<String> sectionTitles = XMLTools.extractTextAsList(sectionTitleNodes);
        entries.putIf(sectionTitles, BxZoneLabel.BODY_CONTENT);

        NodeList appTitleNodes = (NodeList) xpath.evaluate("/article/back/app-group//title", domDoc,
                XPathConstants.NODESET);
        List<String> appTitles = XMLTools.extractTextAsList(appTitleNodes);
        entries.putIf(appTitles, BxZoneLabel.BODY_CONTENT);

        //figures
        NodeList figureNodes = (NodeList) xpath.evaluate("/article/floats-wrap//fig", domDoc,
                XPathConstants.NODESET);
        List<String> figureStrings = XMLTools.extractTextAsList(figureNodes);

        figureNodes = (NodeList) xpath.evaluate("/article/floats-group//fig", domDoc, XPathConstants.NODESET);
        figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

        figureNodes = (NodeList) xpath.evaluate("/article/back//fig", domDoc, XPathConstants.NODESET);
        figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

        figureNodes = (NodeList) xpath.evaluate("/article/body//fig", domDoc, XPathConstants.NODESET);
        figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

        figureNodes = (NodeList) xpath.evaluate("/article/back/app-group//fig", domDoc, XPathConstants.NODESET);
        figureStrings.addAll(XMLTools.extractTextAsList(figureNodes));

        entries.putIf(figureStrings, BxZoneLabel.BODY_FIGURE);

        //tables
        List<String> tableCaptions = new ArrayList<String>();
        List<String> tableBodies = new ArrayList<String>();
        List<String> tableFootnotes = new ArrayList<String>();
        //tableNodes
        NodeList tableNodes = (NodeList) xpath.evaluate("/article//table-wrap", domDoc, XPathConstants.NODESET);

        for (Integer nodeIdx = 0; nodeIdx < tableNodes.getLength(); ++nodeIdx) {
            Node tableNode = tableNodes.item(nodeIdx);

            String caption = (String) xpath.evaluate("caption", tableNode, XPathConstants.STRING);
            tableCaptions.add(caption);

            String body = XMLTools
                    .extractTextFromNode((Node) xpath.evaluate("table", tableNode, XPathConstants.NODE));
            tableBodies.add(body);

            List<String> footnotes = XMLTools.extractTextAsList(
                    (NodeList) xpath.evaluate("table-wrap-foot/fn", tableNode, XPathConstants.NODESET));
            tableFootnotes.addAll(footnotes);

            entries.putIf(caption, BxZoneLabel.BODY_TABLE);
            entries.putIf(body, BxZoneLabel.BODY_TABLE);
            entries.putIf(footnotes, BxZoneLabel.BODY_TABLE);
        }

        //financial disclosure
        String financialDisclosure = XMLTools.extractTextFromNode((Node) xpath
                .evaluate("/article//fn[@fn-type='financial-disclosure']", domDoc, XPathConstants.NODE));
        entries.putIf(financialDisclosure, BxZoneLabel.BODY_ACKNOWLEDGMENT);

        //conflict
        String conflictString = XMLTools.extractTextFromNode(
                (Node) xpath.evaluate("/article//fn[@fn-type='conflict']", domDoc, XPathConstants.NODE));
        entries.putIf(conflictString, BxZoneLabel.BODY_CONFLICT_STMT);

        //copyright
        String copyrightString = XMLTools.extractTextFromNode((Node) xpath.evaluate(
                "/article/front/article-meta/permissions/copyright-statement", domDoc, XPathConstants.NODE));
        entries.putIf(copyrightString, BxZoneLabel.MET_COPYRIGHT);

        //acknowledgment
        String acknowledgement = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("/article/back/ack", domDoc, XPathConstants.NODE));
        entries.putIf(acknowledgement, BxZoneLabel.BODY_ACKNOWLEDGMENT);

        acknowledgement = XMLTools.extractTextFromNode(
                (Node) xpath.evaluate("/article/back/fn-group/fn", domDoc, XPathConstants.NODE));
        entries.putIf(acknowledgement, BxZoneLabel.BODY_CONFLICT_STMT);

        //glossary
        String glossary = XMLTools
                .extractTextFromNode((Node) xpath.evaluate("/article/back/glossary", domDoc, XPathConstants.NODE));
        entries.putIf(glossary, BxZoneLabel.BODY_GLOSSARY);

        //formula
        NodeList formulaNodes = (NodeList) xpath.evaluate("/article/body//disp-formula", domDoc,
                XPathConstants.NODESET);
        for (int nodeIdx = 0; nodeIdx < formulaNodes.getLength(); ++nodeIdx) {
            Node curFormulaNode = formulaNodes.item(nodeIdx);
            String label = (String) xpath.evaluate("label", curFormulaNode);
            entries.putIf(label, BxZoneLabel.BODY_EQUATION);

            NodeList curNodeChildren = curFormulaNode.getChildNodes();
            List<String> formulaParts = new ArrayList<String>();
            for (int childIdx = 0; childIdx < curNodeChildren.getLength(); ++childIdx) {
                Node curChild = curNodeChildren.item(childIdx);
                if (curChild.getNodeName().equals("label")) {
                    continue;
                }
                formulaParts.add(XMLTools.extractTextFromNode(curChild));
            }
            entries.putIf(StringTools.joinStrings(formulaParts), BxZoneLabel.BODY_EQUATION);
        }

        //references
        List<String> refStrings = new ArrayList<String>();
        Node refParentNode = (Node) xpath.evaluate("/article/back/ref-list", domDoc, XPathConstants.NODE);
        if (refParentNode != null) {
            for (Integer refIdx = 0; refIdx < refParentNode.getChildNodes().getLength(); ++refIdx) {
                refStrings.add(XMLTools.extractTextFromNode(refParentNode.getChildNodes().item(refIdx)));
            }
        }
        entries.putIf(StringTools.joinStrings(refStrings), BxZoneLabel.REFERENCES);
        entries.put("references", BxZoneLabel.REFERENCES);

        Set<String> allBibInfos = new HashSet<String>();
        for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
            if (BxZoneLabel.MET_BIB_INFO.equals(entry.getValue())) {
                allBibInfos.addAll(Arrays.asList(entry.getKey().split(" ")));
            }
        }
        entries.put(StringUtils.join(allBibInfos, " "), BxZoneLabel.MET_BIB_INFO);

        printlnVerbose("journalTitle: " + journalTitleString);
        printlnVerbose("journalPublisher: " + journalPublisherString);
        printlnVerbose("journalISSNPublisher: " + journalISSNString);

        printlnVerbose("articleType: " + articleTypeStrings);
        printlnVerbose("received: " + receivedDate);
        printlnVerbose("accepted: " + acceptedDate);
        printlnVerbose("pubdate: " + pubdateString);
        printlnVerbose("permissions: " + permissionsString);
        printlnVerbose("license: " + licenseString);

        printlnVerbose("title: " + titleString);
        printlnVerbose("abstract: " + abstractString);

        printlnVerbose("authorEmails: " + authorEmails);
        printlnVerbose("authorNames: " + authorNames);
        printlnVerbose("authorAff: " + authorAffiliations);
        printlnVerbose("authorNotes: " + notesString);
        printlnVerbose("editor: " + editors);

        printlnVerbose("keywords: " + keywordsString);
        printlnVerbose("DOI: " + doiString);
        printlnVerbose("volume: " + volumeString);
        printlnVerbose("issue: " + issueString);
        printlnVerbose("financial dis.: " + financialDisclosure);

        printlnVerbose("paragraphs: " + paragraphStrings);
        printlnVerbose("section titles: " + sectionTitles);

        printlnVerbose("tableBodies: " + tableBodies);
        printlnVerbose("tableCaptions: " + tableCaptions);
        printlnVerbose("tableFootnotes: " + tableFootnotes);

        printlnVerbose("figures: " + figureStrings);
        printlnVerbose("acknowledgement: " + acknowledgement);

        printlnVerbose("ref: " + refStrings.size() + " " + refStrings);

        SmithWatermanDistance smith = new SmithWatermanDistance(.1, 0.1);
        CosineDistance cos = new CosineDistance();

        //index: (zone,entry)
        List<List<LabelTrio>> swLabelSim = new ArrayList<List<LabelTrio>>(bxDocLen);
        List<List<LabelTrio>> cosLabProb = new ArrayList<List<LabelTrio>>(bxDocLen);
        for (Integer i = 0; i < bxDocLen; ++i) {
            swLabelSim.add(new ArrayList<LabelTrio>());
            cosLabProb.add(new ArrayList<LabelTrio>());
        }

        //iterate over entries
        for (Entry<String, BxZoneLabel> entry : entries.entrySet()) {
            List<String> entryTokens = StringTools.tokenize(entry.getKey());
            printlnVerbose("--------------------");
            printlnVerbose(entry.getValue() + " " + entry.getKey() + "\n");
            //iterate over zones
            for (Integer zoneIdx = 0; zoneIdx < bxDocLen; ++zoneIdx) {
                BxZone curZone = bxDoc.asZones().get(zoneIdx);
                List<String> zoneTokens = StringTools.tokenize(StringTools
                        .removeOrphantSpaces(StringTools.cleanLigatures(curZone.toText().toLowerCase())));

                Double smithSim;
                Double cosSim;
                if (curZone.toText().contains("www.biomedcentral.com")) {
                    //ignore
                    smithSim = 0.;
                    cosSim = 0.;
                } else {
                    smithSim = smith.compare(entryTokens, zoneTokens);
                    cosSim = cos.compare(entryTokens, zoneTokens);
                }
                printlnVerbose(smithSim + " " + bxDoc.asZones().get(zoneIdx).toText() + "\n\n");
                swLabelSim.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, smithSim));
                cosLabProb.get(zoneIdx).add(new LabelTrio(entry.getValue(), entryTokens, cosSim));
            }
        }

        printlnVerbose("===========================");
        for (BxPage page : bxDoc.getPages()) {
            for (BxZone zone : page.getZones()) {
                Integer zoneIdx = bxDoc.asZones().indexOf(zone);
                BxZone curZone = bxDoc.asZones().get(zoneIdx);
                String zoneText = StringTools.removeOrphantSpaces(curZone.toText().toLowerCase());
                List<String> zoneTokens = StringTools.tokenize(zoneText);
                Boolean valueSet = false;

                Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                    @Override
                    public int compare(LabelTrio t1, LabelTrio t2) {
                        Double simDif = t1.alignment / t1.entryTokens.size() - t2.alignment / t2.entryTokens.size();
                        if (Math.abs(simDif) < 0.0001) {
                            return t2.entryTokens.size() - t1.entryTokens.size();
                        }
                        if (simDif > 0) {
                            return 1;
                        } else {
                            return -1;
                        }
                    }
                });
                Collections.reverse(swLabelSim.get(zoneIdx));

                List<String> entryTokens = swLabelSim.get(zoneIdx).get(0).entryTokens;
                if (Math.max(zoneTokens.size(), entryTokens.size()) > 0
                        && Math.min(zoneTokens.size(), entryTokens.size())
                                / Math.max(zoneTokens.size(), (double) entryTokens.size()) > 0.7
                        && swLabelSim.get(zoneIdx).get(0).alignment / entryTokens.size() > 0.7) {
                    curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label);
                    valueSet = true;
                    printVerbose("0 ");
                }

                if (!valueSet) {
                    Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                        @Override
                        public int compare(LabelTrio t1, LabelTrio t2) {
                            Double simDif = t1.alignment - t2.alignment;
                            if (Math.abs(simDif) < 0.0001) {
                                return t2.entryTokens.size() - t1.entryTokens.size();
                            }
                            if (simDif > 0) {
                                return 1;
                            } else {
                                return -1;
                            }
                        }
                    });
                    Collections.reverse(swLabelSim.get(zoneIdx));
                    printlnVerbose("-->" + swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size());
                    if (swLabelSim.get(zoneIdx).get(0).alignment / zoneTokens.size() > 0.5) {
                        curZone.setLabel(swLabelSim.get(zoneIdx).get(0).label);
                        valueSet = true;
                        printVerbose("1 ");
                    }
                }

                if (!valueSet) {
                    Map<BxZoneLabel, Double> cumulated = new EnumMap<BxZoneLabel, Double>(BxZoneLabel.class);
                    for (LabelTrio trio : swLabelSim.get(zoneIdx)) {
                        if (cumulated.containsKey(trio.label)) {
                            cumulated.put(trio.label, cumulated.get(trio.label)
                                    + trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size()));
                        } else {
                            cumulated.put(trio.label,
                                    trio.alignment / Math.max(zoneTokens.size(), trio.entryTokens.size()));
                        }
                    }
                    Double max = Double.NEGATIVE_INFINITY;
                    BxZoneLabel bestLabel = null;
                    for (Entry<BxZoneLabel, Double> entry : cumulated.entrySet()) {
                        if (entry.getValue() > max) {
                            max = entry.getValue();
                            bestLabel = entry.getKey();
                        }
                    }
                    if (max >= 0.5) {
                        curZone.setLabel(bestLabel);
                        printVerbose("2 ");
                        valueSet = true;
                    }
                }

                if (!valueSet) {
                    Collections.sort(swLabelSim.get(zoneIdx), new Comparator<LabelTrio>() {

                        @Override
                        public int compare(LabelTrio t1, LabelTrio t2) {
                            Double simDif = t1.alignment / t1.entryTokens.size()
                                    - t2.alignment / t2.entryTokens.size();
                            if (Math.abs(simDif) < 0.001) {
                                return t2.entryTokens.size() - t1.entryTokens.size();
                            }
                            if (simDif > 0) {
                                return 1;
                            } else {
                                return -1;
                            }
                        }
                    });
                    Collections.reverse(swLabelSim.get(zoneIdx));
                    List<LabelTrio> l = swLabelSim.get(zoneIdx);

                    BxZoneLabel best = null;
                    int bestScore = 0;
                    for (LabelTrio lt : l) {
                        int i = 0;
                        for (String zt : zoneTokens) {
                            if (lt.entryTokens.contains(zt)) {
                                i++;
                            }
                        }
                        if (i > bestScore && i > 1) {
                            best = lt.label;
                            bestScore = i;
                        }
                    }
                    if (best != null) {
                        curZone.setLabel(best);
                        valueSet = true;
                    } else {
                        for (LabelTrio lt : l) {
                            int i = 0;
                            for (String zt : zoneTokens) {
                                for (String j : lt.entryTokens) {
                                    if (zt.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", "")
                                            .equals(j.replaceAll("[^0-9a-zA-Z,;\\.!\\?]", ""))) {
                                        i++;
                                        break;
                                    }
                                }
                            }
                            if (i > bestScore && i > 1) {
                                best = lt.label;
                                bestScore = i;
                            }
                        }
                    }

                    if (best != null) {
                        curZone.setLabel(best);
                        valueSet = true;
                    }
                }
                if (!valueSet) {
                    curZone.setLabel(null);
                }
                printlnVerbose(zone.getLabel() + " " + zone.toText() + "\n");
            }
            Map<BxZone, ZoneLocaliser> zoneLocMap = new HashMap<BxZone, ZoneLocaliser>();
            Set<BxZone> unlabeledZones = new HashSet<BxZone>();
            for (BxZone zone : page.getZones()) {
                if (zone.getLabel() == null) {
                    unlabeledZones.add(zone);
                    zoneLocMap.put(zone, new ZoneLocaliser(zone));
                }
            }
            Integer lastNumberOfUnlabeledZones;
            do {
                lastNumberOfUnlabeledZones = unlabeledZones.size();
                infereLabels(unlabeledZones, zoneLocMap);
                infereLabels(unlabeledZones, zoneLocMap);
            } while (lastNumberOfUnlabeledZones != unlabeledZones.size());
        }
        printlnVerbose("=>=>=>=>=>=>=>=>=>=>=>=>=>=");

        return bxDoc;
    }

    /**
     * Performs one pass of neighbour-based label inference over the given zones.
     *
     * For every zone in {@code unlabeledZones} whose label is still {@code null},
     * three pairs of neighbours are tried in order: the localiser's left/right
     * zones, the localiser's lower/upper zones, and the zone's previous/next
     * zones. The first pair in which both neighbours exist and carry the same
     * non-null label determines the zone's label.
     *
     * A pair of neighbours that are both unlabeled is not treated as agreement:
     * such a zone stays in {@code unlabeledZones} so that a later pass can still
     * label it once its neighbours have been labeled.
     *
     * @param unlabeledZones zones awaiting a label; zones labeled by this pass
     *                       are removed from the set
     * @param zoneLocMap     localiser for each unlabeled zone; entries of zones
     *                       labeled by this pass are removed from the map
     */
    private void infereLabels(Set<BxZone> unlabeledZones, Map<BxZone, ZoneLocaliser> zoneLocMap) {
        // Collected separately because the set cannot be modified while iterating over it.
        Set<BxZone> toBeRemoved = new HashSet<BxZone>();
        for (BxZone zone : unlabeledZones) {
            if (zone.getLabel() != null) {
                continue;
            }
            ZoneLocaliser loc = zoneLocMap.get(zone);
            BxZoneLabel inferred = null;

            if (loc.getLeftZone() != null && loc.getRightZone() != null
                    && loc.getLeftZone().getLabel() == loc.getRightZone().getLabel()) {
                // May still be null when both neighbours are unlabeled; then keep looking.
                inferred = loc.getLeftZone().getLabel();
            }
            if (inferred == null
                    && loc.getLowerZone() != null && loc.getUpperZone() != null
                    && loc.getLowerZone().getLabel() == loc.getUpperZone().getLabel()) {
                inferred = loc.getLowerZone().getLabel();
            }
            if (inferred == null
                    && zone.hasNext() && zone.hasPrev()
                    && zone.getPrev().getLabel() == zone.getNext().getLabel()) {
                inferred = zone.getPrev().getLabel();
            }

            if (inferred != null) {
                zone.setLabel(inferred);
                printVerbose("3 ");
                toBeRemoved.add(zone);
            }
        }
        zoneLocMap.keySet().removeAll(toBeRemoved);
        unlabeledZones.removeAll(toBeRemoved);
    }

    /**
     * Command-line entry point. Recursively scans the directory given as the
     * only argument for files with the {@code pdf} extension and, for each of
     * them, generates a labeled document from the PDF and its NLM file and
     * writes the result next to the TrueViz path, with the label coverage
     * percentage embedded in the file name ({@code .<coverage>.cxml}).
     *
     * A PDF is skipped when the file at its TrueViz path already exists. For
     * every processed PDF one line is printed: the PDF path, the percentage of
     * zones with a label, the number of distinct metadata labels found, and a
     * count of key labels present (references, affiliation, author,
     * bibliographic info, title).
     *
     * A failure on one file is reported with a stack trace and does not stop
     * the processing of the remaining files.
     *
     * @param args exactly one element: the path of the directory to scan
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: <pubmed directory>");
            System.exit(1);
        }

        File dir = new File(args[0]);
        for (File pdfFile : FileUtils.listFiles(dir, new String[] { "pdf" }, true)) {
            try {
                String pdfPath = pdfFile.getPath();
                String nxmlPath = StringTools.getNLMPath(pdfPath);

                File xmlFile = new File(StringTools.getTrueVizPath(nxmlPath));
                if (xmlFile.exists()) {
                    continue;
                }

                System.out.print(pdfPath + " ");

                // Both input streams are closed in finally blocks so that a long
                // directory walk does not accumulate open file descriptors, even
                // when opening the second file or the generation itself fails.
                BxDocument bxDoc;
                InputStream pdfStream = new FileInputStream(pdfPath);
                try {
                    InputStream nxmlStream = new FileInputStream(nxmlPath);
                    try {
                        PubmedXMLGenerator datasetGenerator = new PubmedXMLGenerator();
                        datasetGenerator.setVerbose(false);
                        bxDoc = datasetGenerator.generateTrueViz(pdfStream, nxmlStream);
                    } finally {
                        nxmlStream.close();
                    }
                } finally {
                    pdfStream.close();
                }

                int keys = 0;
                Set<BxZoneLabel> set = EnumSet.noneOf(BxZoneLabel.class);
                int total = 0;
                int known = 0;
                for (BxZone z : bxDoc.asZones()) {
                    total++;
                    if (z.getLabel() != null) {
                        known++;
                        if (z.getLabel().isOfCategoryOrGeneral(BxZoneLabelCategory.CAT_METADATA)) {
                            set.add(z.getLabel());
                        }
                        if (BxZoneLabel.REFERENCES.equals(z.getLabel())) {
                            // Counted once, no matter how many reference zones exist.
                            keys = 1;
                        }
                    }
                }

                if (set.contains(BxZoneLabel.MET_AFFILIATION)) {
                    keys++;
                }
                if (set.contains(BxZoneLabel.MET_AUTHOR)) {
                    keys++;
                }
                if (set.contains(BxZoneLabel.MET_BIB_INFO)) {
                    keys++;
                }
                if (set.contains(BxZoneLabel.MET_TITLE)) {
                    keys++;
                }
                // Integer percentage of zones that ended up with a label.
                int coverage = 0;
                if (total > 0) {
                    coverage = known * 100 / total;
                }
                System.out.print(coverage + " " + set.size() + " " + keys);

                BufferedWriter out = new BufferedWriter(new FileWriter(
                        StringTools.getTrueVizPath(nxmlPath).replace(".xml", "." + coverage + ".cxml")));
                try {
                    BxDocumentToTrueVizWriter writer = new BxDocumentToTrueVizWriter();
                    out.write(writer.write(bxDoc.getPages()));
                } finally {
                    // Closing the buffered writer flushes it and closes the underlying file.
                    out.close();
                }

                System.out.println(" done");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

}