ubic.pubmedgate.resolve.EvaluationRDFModel.java Source code

Introduction

Here is the source code for ubic.pubmedgate.resolve.EvaluationRDFModel.java
Source

/*
 * The WhiteText project
 * 
 * Copyright (c) 2012 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package ubic.pubmedgate.resolve;

import java.io.FileInputStream;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.BAMSandAllen.JenaUtil;
import ubic.BAMSandAllen.Util;
import ubic.BAMSandAllen.Vocabulary;
import ubic.basecode.dataStructure.CountingMap;
import ubic.basecode.dataStructure.StringToStringSetMap;
import ubic.basecode.dataStructure.params.ParamKeeper;
import ubic.pubmedgate.Config;
import ubic.pubmedgate.GateInterface;
import ubic.pubmedgate.organism.SpeciesUtil;
import ubic.pubmedgate.resolve.RDFResolvers.RDFResolver;
import ubic.pubmedgate.resolve.mentionEditors.BracketRemoverMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.CytoPrefixMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.DirectionRemoverMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.DirectionSplittingMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.HemisphereStripMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.MentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.NDotExpanderMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.NucleusOfTheRemoverMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.OfTheRemoverMentionEditor;
import ubic.pubmedgate.resolve.mentionEditors.RegionSuffixRemover;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.datatypes.xsd.XSDDateTime;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.rdf.model.impl.StatementImpl;

/**
 * Extension of {@link ResolutionRDFModel} that evaluates mention-to-concept resolutions against the evaluation
 * statements stored in the model (accept, reject, specific-to-general) and writes the resulting metrics to
 * spreadsheets via {@link ParamKeeper}.
 */
public class EvaluationRDFModel extends ResolutionRDFModel {
    // Logger for this class; declared protected and non-final in the original, so left untouched.
    protected static Log log = LogFactory.getLog(EvaluationRDFModel.class);

    /**
     * Creates an evaluation model on top of an existing Jena model by delegating to the superclass constructor
     * with the same signature.
     *
     * @param model the Jena model handed to {@link ResolutionRDFModel}
     * @param reason handed to the superclass unchanged; presumably toggles reasoning/inference -- confirm in
     *        ResolutionRDFModel
     * @throws Exception if the superclass constructor fails
     */
    public EvaluationRDFModel(Model model, boolean reason) throws Exception {
        super(model, reason);
    }

    /**
     * Creates an evaluation model using the default construction of {@link ResolutionRDFModel}.
     *
     * @throws Exception if the superclass constructor fails
     */
    public EvaluationRDFModel() throws Exception {
        super();
    }

    /**
     * Creates an evaluation model by delegating to the superclass constructor taking a single flag.
     *
     * @param empty handed to the superclass unchanged; presumably requests an empty model -- confirm in
     *        ResolutionRDFModel
     * @throws Exception if the superclass constructor fails
     */
    public EvaluationRDFModel(boolean empty) throws Exception {
        super(empty);
    }

    /**
     * Creates an evaluation model by delegating to the superclass constructor taking a file name.
     *
     * @param filename handed to the superclass unchanged; presumably the RDF file to load -- confirm in
     *        ResolutionRDFModel
     * @throws Exception if the superclass constructor fails
     */
    public EvaluationRDFModel(String filename) throws Exception {
        super(filename);
    }

    /**
     * Convenience overload that evaluates the concept mapping for a single linking property.
     *
     * @param property the one property to evaluate
     * @return the metrics returned by {@link #evaluateConceptMapping(Collection)} for a one-element set
     */
    public Map<String, String> evaluateConceptMapping(Property property) {
        // wrap the single property so the collection-based overload can be reused
        final Set<Property> singleProperty = new HashSet<Property>();
        singleProperty.add(property);

        return evaluateConceptMapping(singleProperty);
    }

    /**
     * Maps every subject returned by {@link #getPMIDtoDateMap()} to the year of its publication date.
     *
     * @return subject resource to {@link Calendar#YEAR} of its date
     */
    public Map<Resource, Integer> getPMIDtoYearMap() {
        Map<Resource, Integer> yearBySubject = new HashMap<Resource, Integer>();
        for (Map.Entry<Resource, Calendar> dated : getPMIDtoDateMap().entrySet()) {
            Calendar published = dated.getValue();
            yearBySubject.put(dated.getKey(), published.get(Calendar.YEAR));
        }
        return yearBySubject;
    }

    /**
     * Same as {@link #getPMIDtoYearMap()} but keyed by a string: the part of the subject's URI that follows the
     * last ':' character (the whole URI when it contains no ':').
     *
     * @return URI suffix to {@link Calendar#YEAR} of the publication date
     */
    public Map<String, Integer> getPMIDStringtoYearMap() {
        Map<String, Integer> yearById = new HashMap<String, Integer>();
        for (Map.Entry<Resource, Calendar> dated : getPMIDtoDateMap().entrySet()) {
            String fullUri = dated.getKey().getURI();
            // keep only what follows the final colon
            String idPart = fullUri.substring(fullUri.lastIndexOf(':') + 1);
            int year = dated.getValue().get(Calendar.YEAR);
            yearById.put(idPart, year);
        }
        return yearById;
    }

    /**
     * Collects the publication date of every subject that has a {@code Vocabulary.publication_date} literal.
     * The literal's lexical form is parsed as an {@code xsd:dateTime}.
     *
     * @return subject resource to its publication date as a {@link Calendar}
     */
    public Map<Resource, Calendar> getPMIDtoDateMap() {
        StmtIterator dateStatements = model.listStatements(null, Vocabulary.publication_date, (Literal) (null));
        Map<Resource, Calendar> dateBySubject = new HashMap<Resource, Calendar>();
        for (; dateStatements.hasNext();) {
            Statement dateStatement = dateStatements.nextStatement();
            String lexicalForm = dateStatement.getLiteral().getLexicalForm();
            XSDDateTime parsed = (XSDDateTime) XSDDatatype.XSDdateTime.parse(lexicalForm);
            dateBySubject.put(dateStatement.getSubject(), parsed.asCalendar());
        }
        return dateBySubject;
    }

    /**
     * Evaluates the case-insensitive exact string match property over the matched mentions. The returned metrics
     * are discarded, so the only visible output is what the evaluation logs (including rejected mappings).
     */
    public void findRejectedExact() {
        Collection<Property> exactMatchOnly = new LinkedList<Property>();
        exactMatchOnly.add(Vocabulary.string_match_ignorecase);

        Set<Resource> matched = getMatchedMentions();
        evaluateConceptMapping(exactMatchOnly, matched);
    }

    /**
     * Evaluates the concept mapping for the given properties over every mention returned by
     * {@code getMentions()}.
     *
     * @param properties the linking properties to evaluate
     * @return the metrics from {@link #evaluateConceptMapping(Collection, Set)}
     */
    public Map<String, String> evaluateConceptMapping(Collection<Property> properties) {
        Set<Resource> everyMention = getMentions();
        return evaluateConceptMapping(properties, everyMention);
    }

    /**
     * Given a set of properties linking mentions to terms (and from there to concepts), computes evaluation
     * metrics for the supplied mentions.
     * <p>
     * For each mention the linked terms are narrowed to those connected by at least one of {@code properties};
     * every concept of those terms is then looked up in the accept, reject, result and specific-to-general
     * evaluation statements of the model. Rejected and unevaluated pairs are logged.
     *
     * @param properties properties linking mentions to terms
     * @param mentions the mentions to evaluate
     * @return the counters as strings, keyed by "accept", "reject", "rejectAbs", "rejectFreq", "specToGen",
     *         "noEvaluation", "totalMentionToConceptLinks" and "mentionsWithOneAccept", plus "properties"
     *         holding {@code properties.toString()}
     */
    public Map<String, String> evaluateConceptMapping(Collection<Property> properties, Set<Resource> mentions) {
        int acceptCount = 0;
        int rejectCount = 0;
        int rejectedOccurrences = 0;
        int rejectedAbstracts = 0;
        int specificToGeneralCount = 0;
        int unevaluatedCount = 0;
        int linkCount = 0;
        int mentionsAcceptedAtLeastOnce = 0;

        log.info("Evaluating concept mappings for " + mentions.size() + " mentions");

        int processed = 0;
        for (Resource mention : mentions) {
            // progress is reported for the 1st, 101st, 201st ... mention, using the already incremented counter
            boolean reportProgress = (processed % 100 == 0);
            processed++;
            if (reportProgress) {
                log.info("Count: " + processed + " mentions");
            }

            // keep only the terms that are reachable through one of the requested properties
            Set<Resource> selectedTerms = new HashSet<Resource>();
            for (Resource candidateTerm : getLinkedNeuroTerms(mention)) {
                for (Property linkProperty : properties) {
                    if (model.contains(new StatementImpl(mention, linkProperty, candidateTerm))) {
                        selectedTerms.add(candidateTerm);
                    }
                }
            }

            Set<Resource> concepts = getConceptsFromTerms(selectedTerms);
            linkCount += concepts.size();

            boolean acceptedSomewhere = false;
            for (Resource concept : concepts) {
                boolean accepted = model.contains(new StatementImpl(mention, Vocabulary.evaluation_accept, concept));
                boolean rejected = model.contains(new StatementImpl(mention, Vocabulary.evaluation_reject, concept));
                boolean evaluated = model.contains(new StatementImpl(mention, Vocabulary.evaluation_result, concept));
                boolean specificToGeneral = model.contains(
                        new StatementImpl(mention, Vocabulary.evaluation_specific_to_general, concept));

                if (accepted) {
                    acceptCount += 1;
                    acceptedSomewhere = true;
                }
                if (rejected) {
                    log.info("Rejected mention:" + JenaUtil.getLabel(mention) + "->" + JenaUtil.getLabel(concept));
                    rejectCount += 1;
                    // weight the rejection by how often, and in how many abstracts, the mention occurs
                    rejectedOccurrences += mention.getProperty(Vocabulary.number_of_occurances).getInt();
                    rejectedAbstracts += mention.getProperty(Vocabulary.number_of_abstracts).getInt();
                }
                if (specificToGeneral) {
                    specificToGeneralCount += 1;
                }
                if (!evaluated) {
                    unevaluatedCount += 1;
                    log.info("No evaluation result: " + JenaUtil.getLabel(mention) + " -> "
                            + JenaUtil.getLabel(concept) + " URI: " + mention.toString() + " -> "
                            + concept.toString());
                }
            }
            if (acceptedSomewhere) {
                mentionsAcceptedAtLeastOnce += 1;
            }
        }

        String propertiesText = properties.toString();
        log.info("Properties:" + propertiesText);
        log.info("accept:" + acceptCount);
        log.info("reject:" + rejectCount);
        log.info("specToGen:" + specificToGeneralCount);
        log.info("noEvaluation:" + unevaluatedCount);
        log.info("totalMentionToConceptLinks:" + linkCount);

        Map<String, String> metrics = new HashMap<String, String>();
        metrics.put("properties", propertiesText);
        metrics.put("accept", Integer.toString(acceptCount));
        metrics.put("reject", Integer.toString(rejectCount));
        metrics.put("rejectAbs", Integer.toString(rejectedAbstracts));
        metrics.put("rejectFreq", Integer.toString(rejectedOccurrences));
        metrics.put("specToGen", Integer.toString(specificToGeneralCount));
        metrics.put("noEvaluation", Integer.toString(unevaluatedCount));
        metrics.put("totalMentionToConceptLinks", Integer.toString(linkCount));
        metrics.put("mentionsWithOneAccept", Integer.toString(mentionsAcceptedAtLeastOnce));
        return metrics;
    }

    /**
     * Loads manual matches and all evaluations into a fresh model, creates the matches and writes one
     * evaluation row per property.
     *
     * @deprecated not used anymore
     * @throws Exception if any of the delegated steps fails
     */
    @Deprecated
    public static void createAllResoloversStats() throws Exception {
        EvaluationRDFModel freshModel = new EvaluationRDFModel();

        freshModel.loadManualMatches();
        freshModel.loadManualEvaluations();
        freshModel.loadAutomaticEvaluations();

        // order is important here! - load only one resolver here!
        freshModel.createMatches3();

        // with all matches loaded, which ones contribute?
        freshModel.writePropertiesEval();
    }

    /**
     * Evaluates every resolver on its own, then all resolvers together, and writes one row per run to
     * "AllResolversInd.xls" in the configured results folder.
     *
     * @param model the Jena model each evaluation model is built on
     * @param reason handed to the {@link EvaluationRDFModel} constructor
     * @param loadManualMatches whether manual matches are loaded before evaluating
     * @throws Exception if any of the delegated steps fails
     */
    public static void createStats(Model model, boolean reason, boolean loadManualMatches) throws Exception {
        // starts with a blank slate; this first instance is only used to obtain the resolver list
        EvaluationRDFModel seedModel = new EvaluationRDFModel(model, reason);
        if (loadManualMatches) {
            seedModel.loadManualMatches();
        }
        seedModel.loadManualEvaluations();
        seedModel.loadAutomaticEvaluations();

        ParamKeeper summary = new ParamKeeper();

        for (RDFResolver resolver : seedModel.getAllResolvers()) {
            EvaluationRDFModel perResolverModel = new EvaluationRDFModel(model, reason);
            // keeps the last result of the run, i.e. the one with all mention editors applied
            Map<String, String> lastRow = oneResolover(perResolverModel, resolver, loadManualMatches);
            summary.addParamInstance(lastRow);
        }

        // add in a line for all resolvers together (slow)
        Map<String, String> combinedRow = getAllResolversStats(loadManualMatches, model, reason);
        summary.addParamInstance(combinedRow);

        String resultsFolder = Config.config.getString("whitetext.resolve.results.folder");
        summary.writeExcel(resultsFolder + "AllResolversInd.xls");
    }

    /**
     * Evaluates the mention editors with all resolvers applied together. Builds its own evaluation model
     * because the editor evaluation requires a blank slate.
     *
     * @param loadManualMatches whether manual matches are loaded before evaluating
     * @param model the Jena model the evaluation model is built on
     * @param reason handed to the {@link EvaluationRDFModel} constructor
     * @return the result of {@link #createSpreadSheetForEditors(Collection)} for all resolvers
     * @throws Exception if any of the delegated steps fails
     */
    public static Map<String, String> getAllResolversStats(boolean loadManualMatches, Model model, boolean reason)
            throws Exception {
        EvaluationRDFModel blankModel = new EvaluationRDFModel(model, reason);

        // evaluations
        if (loadManualMatches) {
            blankModel.loadManualMatches();
        }
        blankModel.loadManualEvaluations();
        blankModel.loadAutomaticEvaluations();

        return blankModel.createSpreadSheetForEditors(blankModel.getAllResolvers());
    }

    /**
     * Logs the superclass statistics followed by the number of evaluation statements in the model: the total
     * ({@code evaluation_result}) and the accept, reject and specific-to-general subsets.
     *
     * @throws Exception if the superclass statistics fail
     */
    @Override
    public void getStats() throws Exception {
        super.getStats();
        log.info("Evaluation statements:"
                + model.listStatements(null, Vocabulary.evaluation_result, (RDFNode) null).toSet().size());
        // the accept count must query evaluation_accept; querying evaluation_result here merely repeated the
        // total reported on the previous line
        log.info("  accepts:"
                + model.listStatements(null, Vocabulary.evaluation_accept, (RDFNode) null).toSet().size());
        log.info("  rejects:"
                + model.listStatements(null, Vocabulary.evaluation_reject, (RDFNode) null).toSet().size());
        log.info("  spec2gen:" + model
                .listStatements(null, Vocabulary.evaluation_specific_to_general, (RDFNode) null).toSet().size());
    }

    /**
     * Runs a single resolver on the given (new) model.
     *
     * @param evaluationModel the model to load evaluations into and run the resolver on
     * @param resolver the only resolver to evaluate
     * @param loadManualMatches whether manual matches are loaded before evaluating
     * @return the results for the resolver using all mention editors
     * @throws Exception if any of the delegated steps fails
     */
    public static Map<String, String> oneResolover(EvaluationRDFModel evaluationModel, RDFResolver resolver,
            boolean loadManualMatches) throws Exception {

        // evaluations
        if (loadManualMatches) {
            evaluationModel.loadManualMatches();
        }
        evaluationModel.loadManualEvaluations();
        evaluationModel.loadAutomaticEvaluations();

        // the editor evaluation works on collections, so wrap the single resolver
        Collection<RDFResolver> justThisResolver = new HashSet<RDFResolver>();
        justThisResolver.add(resolver);

        return evaluationModel.createSpreadSheetForEditors(justThisResolver);
    }

    /**
     * Convenience overload that gathers all statistics for a single property.
     *
     * @param property the one property to evaluate
     * @return the result of {@link #getAllStats(Set)} for a one-element set
     * @throws Exception if gathering the statistics fails
     */
    public Map<String, String> getAllStats(Property property) throws Exception {
        final Set<Property> singleProperty = new HashSet<Property>();
        singleProperty.add(property);

        return getAllStats(singleProperty);
    }

    /**
     * Merges the concept-mapping evaluation for the given properties with the map from {@code getStatsToMap()}.
     * Where both contain the same key, the value from {@code getStatsToMap()} wins.
     *
     * @param properties the linking properties to evaluate
     * @return the combined statistics
     * @throws Exception if gathering the statistics fails
     */
    public Map<String, String> getAllStats(Set<Property> properties) throws Exception {
        Map<String, String> combined = new HashMap<String, String>(evaluateConceptMapping(properties));
        combined.putAll(getStatsToMap());
        return combined;
    }

    /**
     * Evaluates the mention editors for a specific set of resolvers. It has to start from an empty model with no
     * resolutions. The resolvers are first run without editors, then the editors are added one at a time and the
     * resolvers re-run on the still unmatched mentions; every step becomes a spreadsheet row. The spreadsheet is
     * named after the resolvers' property local names, each followed by '.', ending in "xls".
     *
     * @param resolvers the resolvers to evaluate
     * @return the last set of results (all editors)
     * @throws Exception if any of the delegated steps fails
     */
    public Map<String, String> createSpreadSheetForEditors(Collection<RDFResolver> resolvers) throws Exception {
        StringBuilder sheetName = new StringBuilder();
        Set<Property> properties = new HashSet<Property>();
        for (RDFResolver resolver : resolvers) {
            Property linkProperty = resolver.getProperty();
            properties.add(linkProperty);
            sheetName.append(linkProperty.getLocalName()).append(".");
        }
        sheetName.append("xls");

        ParamKeeper rows = new ParamKeeper();

        // baseline: resolvers without any mention editor
        runResolvers(resolvers, getMentions());
        rows.addParamInstance(getAllStats(properties));

        // should be refactored
        MentionEditor[] editorsInOrder = {
                new DirectionSplittingMentionEditor(),
                new HemisphereStripMentionEditor(),
                new BracketRemoverMentionEditor(),
                new NDotExpanderMentionEditor(),
                new OfTheRemoverMentionEditor(),
                new RegionSuffixRemover(),
                // new BroadmannPrefixAdderMentionEditor(),
                // the editors below lose information
                new CytoPrefixMentionEditor(),
                new DirectionRemoverMentionEditor(),
                new NucleusOfTheRemoverMentionEditor(),
                // NOTE(review): DirectionRemoverMentionEditor appears a second time -- confirm it is intentional
                new DirectionRemoverMentionEditor() };

        // add mention editors one by one, and evaluate every time to see how they perform
        Map<String, String> latestRow = null;
        for (MentionEditor editor : editorsInOrder) {
            addMentionEditorToResolvers(resolvers, editor);
            runResolvers(resolvers, getUnMatchedMentions());

            latestRow = getAllStats(properties);
            latestRow.put("added editor", editor.getName());
            rows.addParamInstance(latestRow);
        }

        String resultsFolder = Config.config.getString("whitetext.resolve.results.folder");
        rows.writeExcel(resultsFolder + sheetName);
        return latestRow;
    }

    /**
     * Writes per-species resolution and evaluation statistics to "species.evaled.xls" in the configured results
     * folder: one row per unfiltered species, one row for the common species combined and one row that ignores
     * species altogether.
     *
     * @param annotationSet handed to the {@link SpeciesCounter} constructor
     * @throws Exception if any of the delegated steps fails
     */
    public void speciesStats(String annotationSet) throws Exception {
        SpeciesCounter speciesCounter = new SpeciesCounter(annotationSet);
        Set<Property> properties = getAllProperties();

        // go through unmatched and matched getting a counting set of species
        boolean frequency = false;
        log.info("Getting normal species stats");
        Set<Resource> matchedMentions = getMatchedMentions();
        CountingMap<String> matched = speciesCounter.getSpeciesCounts(matchedMentions, frequency, this);
        Set<Resource> unMatchedMentions = getUnMatchedMentions();
        CountingMap<String> unMatched = speciesCounter.getSpeciesCounts(unMatchedMentions, frequency, this);
        frequency = true;
        log.info("Getting frequency species stats");
        CountingMap<String> matchedFreq = speciesCounter.getSpeciesCounts(matchedMentions, frequency, this);
        CountingMap<String> unMatchedFreq = speciesCounter.getSpeciesCounts(unMatchedMentions, frequency, this);
        log.info("matched size:" + matched.size());
        log.info("unmatched size:" + unMatched.size());

        Map<String, Set<Resource>> mentionMap = speciesCounter.getSpeciesMentions(getMentions(), this);

        // add stats for mouse, rat, human, Rhesus and macaque
        Set<String> common = speciesCounter.getCommonSpeciesLinStrings();

        Set<Resource> mentionsForCommonSpecies = new HashSet<Resource>();
        for (String speciesString : common) {
            Set<Resource> mentionsOfCommonSpecies = mentionMap.get(speciesString);
            // a species can be present without any linked mention (see the per-species loop below), in which
            // case the map has no entry and addAll(null) would throw a NullPointerException
            if (mentionsOfCommonSpecies != null) {
                mentionsForCommonSpecies.addAll(mentionsOfCommonSpecies);
            }
        }
        mentionMap.put("Common", mentionsForCommonSpecies);

        log.info("Done getting species to mention map");

        ParamKeeper keeper = new ParamKeeper();
        GateInterface p2g = new GateInterface();
        SpeciesUtil.SpeciesCountResult countResult = SpeciesUtil.getSpeciesStrings(p2g, p2g.getCorp());
        StringToStringSetMap speciesStrings = countResult.strings;

        // hack to add common species data
        speciesStrings.put("Common", "Common");
        Map<String, String> params;

        // create a row for each species
        for (String speciesString : speciesStrings.keySet()) {
            log.info(speciesString);
            if (speciesCounter.isFiltered(speciesString)) {
                log.info("Skipping:" + speciesString);
                continue;
            }
            params = new HashMap<String, String>();
            params.put("Species", speciesString);
            params.put("SpeciesText", speciesStrings.get(speciesString).toString());
            // NOTE(review): these four lookups dereference the count directly; if CountingMap.get returns null
            // for a species that has no counted mentions this throws -- confirm against CountingMap
            params.put("MatchedPerAbs", matched.get(speciesString).toString());
            params.put("UnMatchedPerAbs", unMatched.get(speciesString).toString());
            params.put("MatchedFreq", matchedFreq.get(speciesString).toString());
            params.put("UnMatchedFreq", unMatchedFreq.get(speciesString).toString());
            Set<Resource> mentionsForSpecies = mentionMap.get(speciesString);
            // it's possible an abstract has a species but no mentions
            if (mentionsForSpecies != null) {
                params.put("mentionsLinkedToSpecies", mentionsForSpecies.size() + "");
                // split into unmatched and matched
                params.put("UniqueMatched", Util.intersectSize(matchedMentions, mentionsForSpecies) + "");
                params.put("UniqueUnMatched", Util.intersectSize(unMatchedMentions, mentionsForSpecies) + "");
                params.putAll(evaluateConceptMapping(properties, mentionsForSpecies));
            }
            keeper.addParamInstance(params);
        }

        // row for the common species combined
        params = new HashMap<String, String>();

        params.put("Species", "rat, mouse, human, rhesus, macaca f.");
        params.put("mentionsLinkedToSpecies", mentionsForCommonSpecies.size() + "");

        // split into unmatched and matched
        Set<Resource> commonMatched = (Set<Resource>) Util.intersect(matchedMentions, mentionsForCommonSpecies);
        Set<Resource> commonUnMatched = (Set<Resource>) Util.intersect(unMatchedMentions, mentionsForCommonSpecies);

        params.put("UniqueMatched", commonMatched.size() + "");
        params.put("UniqueUnMatched", commonUnMatched.size() + "");

        params.put("MatchedPerAbs", "" + getPMIDCount(commonMatched));
        params.put("UnMatchedPerAbs", "" + getPMIDCount(commonUnMatched));
        params.put("MatchedFreq", "" + sumMentionFrequencies(commonMatched));
        params.put("UnMatchedFreq", "" + sumMentionFrequencies(commonUnMatched));

        params.putAll(evaluateConceptMapping(properties, mentionsForCommonSpecies));
        keeper.addParamInstance(params);

        // add stats for all for comparison
        params = new HashMap<String, String>();
        params.put("Species", "Ignoring species");
        params.put("SpeciesText", "Ignoring species");
        params.put("MatchedPerAbs", "" + getPMIDCount(matchedMentions));
        params.put("UnMatchedPerAbs", "" + getPMIDCount(unMatchedMentions));
        params.put("MatchedFreq", "" + sumMentionFrequencies(matchedMentions));
        params.put("UnMatchedFreq", "" + sumMentionFrequencies(unMatchedMentions));
        params.put("UniqueMatched", matchedMentions.size() + "");
        params.put("UniqueUnMatched", unMatchedMentions.size() + "");
        params.putAll(evaluateConceptMapping(properties, getMentions()));

        keeper.addParamInstance(params);

        String filename = Config.config.getString("whitetext.resolve.results.folder") + "species.evaled.xls";
        keeper.writeExcel(filename);

    }

    /**
     * Copies a map into a new {@link HashMap} whose values are the {@code toString()} of the original values.
     *
     * @param input the map to convert; its values must not be null
     * @return a new map with the same keys and stringified values
     */
    public static Map<String, String> convert(Map<String, ?> input) {
        Map<String, String> stringified = new HashMap<String, String>();
        for (Map.Entry<String, ?> entry : input.entrySet()) {
            Object value = entry.getValue();
            stringified.put(entry.getKey(), value.toString());
        }
        return stringified;
    }

    /**
     * Writes one spreadsheet row of statistics per property to "AllResolvers.xls" in the configured results
     * folder. Assumes resolution links are already made.
     *
     * @throws Exception if gathering the statistics or writing the spreadsheet fails
     */
    public void writePropertiesEval() throws Exception {
        ParamKeeper sheet = new ParamKeeper();

        // one row per property, labelled with the property's local name
        for (Property property : getAllProperties()) {
            Map<String, String> row = new HashMap<String, String>(getAllStats(property));
            row.put("Property", property.getLocalName());
            sheet.addParamInstance(row);
        }

        String resultsFolder = Config.config.getString("whitetext.resolve.results.folder");
        sheet.writeExcel(resultsFolder + "AllResolvers.xls");
    }

    /**
     * Loads the lexicon RDF file named by the "resolve.Lexicon.RDF" configuration key and runs
     * {@link #createStats(Model, boolean, boolean)} on it, without loading manual matches.
     *
     * @param args ignored
     * @throws Exception if the file cannot be read or the evaluation fails
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): an earlier comment said manual matches are "assumed to be true", yet the flag is false
        boolean loadManualMatches = false;

        // double check
        // must start with a blank lexicon file!!
        String file = "resolve.Lexicon.RDF";
        // file = "/home/lfrench/WhiteText/rdf/ResolutionRDF.allComp.allResolvers.malletOnly.rdf";
        // if ( !loadManualMatches ) file += ".noManualMatches";
        file = Config.config.getString(file);

        Model modelLoad = ModelFactory.createDefaultModel();
        // close the stream even when parsing fails; the original never closed it
        FileInputStream lexiconStream = new FileInputStream(file);
        try {
            modelLoad.read(lexiconStream, null);
        } finally {
            lexiconStream.close();
        }

        // getAllResolversStats( loadManualMatches );

        // iterates mention editors for each, then all resolvers
        boolean reason = true;
        createStats(modelLoad, reason, loadManualMatches);
        // getAllResolversStats( loadManualMatches, modelLoad, reason );

        // explicit exit kept from the original; status 0 because reaching this line means the run succeeded
        // (failures propagate as exceptions and already yield a non-zero status)
        System.exit(0);

    }
}