/*
 * The WhiteText project
 *
 * Copyright (c) 2012 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.pubmedgate.resolve;

import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ubic.BAMSandAllen.Util;
import ubic.BAMSandAllen.Vocabulary;
import ubic.BAMSandAllen.AllenDataLoaders.AllenMajorMatrices;
import ubic.BAMSandAllen.BAMSDataLoaders.BAMSDataLoader;
import ubic.basecode.dataStructure.CountingMap;
import ubic.basecode.dataStructure.StringToStringSetMap;
import ubic.basecode.io.excel.ExcelUtil;
import ubic.basecode.ontology.OntologyLoader;
import ubic.pubmedgate.Config;
import ubic.pubmedgate.ConnectionsDocument;
import ubic.pubmedgate.GateInterface;
import ubic.pubmedgate.NeuroNamesLoader;
import ubic.pubmedgate.NeuroNamesMouseAndRatLoader;
import ubic.pubmedgate.organism.SpeciesLoader;
import ubic.pubmedgate.resolve.depreciated.BIRNLexResolver;
import ubic.pubmedgate.resolve.depreciated.ResolveBrianRegions;
import ubic.pubmedgate.resolve.focusedAnalysis.SynonymSchema;
import au.com.bytecode.opencsv.CSVReader;

import com.hp.hpl.jena.ontology.OntClass;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
import com.hp.hpl.jena.vocabulary.RDF;
import com.hp.hpl.jena.vocabulary.RDFS;
import com.sun.org.apache.xerces.internal.parsers.XIncludeAwareParserConfiguration;

public class MakeLexiconRDFModel {
    protected static Log log = LogFactory.getLog(MakeLexiconRDFModel.class);

    Model model;

    public MakeLexiconRDFModel() {
        model = ModelFactory.createDefaultModel();
        model.setNsPrefix("lexiconLinks", Vocabulary.getLexiconURI());
        model.setNsPrefix("TermSpace", Vocabulary.getLexiconSpaceURI());
    }

    public void setModel(Model model) {
        this.model = model;
    }

    public void writeOut() throws Exception {
        writeOut(Config.config.getString("resolve.Lexicon.RDF"));
    }

    public void writeOut(String filename) throws Exception {
        model.write(new FileWriter(filename));
    }

    /*
     * very rough way to load in brede database entries, testing only
     */
    public void addBredeNodes() throws Exception {
        // http://neuro.imm.dtu.dk/services/brededatabase/worois.xml
        // InputSource src = new InputSource( new FileInputStream( xmlFile ) );
        XIncludeAwareParserConfiguration config = new XIncludeAwareParserConfiguration();
        DOMParser prsr = new org.apache.xerces.parsers.DOMParser();
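        // Descriptive note (added): the Xerces feature set below turns off loading
        // of the external DTD, so parsing the Brede XML does not require resolving
        // the DTD it references; the document itself is still fetched from
        // Vocabulary.getBredeURI() below.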
        prsr.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        prsr.parse(Vocabulary.getBredeURI());
        Document XMLdocument = prsr.getDocument();
        NodeList l = XMLdocument.getElementsByTagName("Roi");
        for (int i = 0; i < l.getLength(); i++) {
            Node node = l.item(i);
            NodeList l2 = node.getChildNodes();
            String woroi;
            Resource mainConcept = null;
            for (int ii = 0; ii < l2.getLength(); ii++) {
                Node childNode = l2.item(ii);
                if (childNode.getNodeName().equals("woroi")) {
                    // make the concept
                    woroi = AllenMajorMatrices.unmarshallText(childNode);
                    mainConcept = model.createResource(Vocabulary.getBredeURI() + woroi);
                    mainConcept.addProperty(RDF.type, Vocabulary.bredeName);
                    // log.info( woroi );
                }
                if (childNode.getNodeName().equals("name") || childNode.getNodeName().equals("variation")) {
                    String name = AllenMajorMatrices.unmarshallText(childNode);
                    name = name.toLowerCase();
                    if (name.startsWith("left")) {
                        name = name.replaceFirst("left ", "");
                    }
                    if (name.startsWith("right")) {
                        name = name.replaceFirst("right ", "");
                    }
                    Resource r = Vocabulary.makeNeurotermNode(name, model);
                    // link to concept
                    if (childNode.getNodeName().equals("name")) {
                        // add its label if it's the name tag
                        mainConcept.addLiteral(RDFS.label, name);
                        mainConcept.addProperty(Vocabulary.has_label_term, r);
                    }
                    if (childNode.getNodeName().equals("variation")) {
                        mainConcept.addProperty(Vocabulary.has_synonym_term, r);
                    }
                }
            }
        }
    }

    public void addNN2007Nodes() throws Exception {
        NeuroNamesMouseAndRatLoader NNMR = new NeuroNamesMouseAndRatLoader();
        NNMR.addToModel(model);
    }

    @Deprecated
    public void addBIRNLexNodes() throws Exception {
        BIRNLexResolver birnLex = new BIRNLexResolver();
        birnLex.addToModel(model);
    }

    public void addNIFSTDNodes() throws Exception {
        NIFSTDRegionExtractor nifstd = new NIFSTDRegionExtractor();
        nifstd.addToModel(model);
    }

    public void addNIFNodes() throws Exception {
    }

    public void addABCDNodes() throws Exception {
        String file = Config.config.getString("whitetext.lexicon.output") + "AvianBrainConnectivityDatabase.txt";
        CSVReader reader = new CSVReader(new FileReader(file), '\t');
        String[] line;
        Resource mainConcept = null;
        while ((line = reader.readNext()) != null) {
            // which is syn and which is main label?
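            // Descriptive note (added): each tab-separated line appears to hold the
            // region URI in column 0 and a name in column 1; as the comments here
            // say, the file does not indicate which names are primary labels and
            // which are synonyms.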
            // log.info( line[0] + "->" + line[1] );
            String uri = line[0];
            String label = line[1];
            mainConcept = model.createResource(uri);
            mainConcept.addProperty(RDF.type, Vocabulary.ABCDName);
            Resource r = Vocabulary.makeNeurotermNode(label, model);
            // will use latest name for label - need better data from ABCD
            mainConcept.addLiteral(RDFS.label, label);
            mainConcept.addProperty(Vocabulary.has_synonym_term, r);
        }
    }

    public void addABANodes() throws Exception {
        OntModel ABA = OntologyLoader.loadMemoryModel(Vocabulary.getABAURI());
        ExtendedIterator it = ABA.listClasses();
        Resource mainConcept = null;
        // iterate over every class; the original for-loop called it.next() before
        // checking hasNext() and skipped the final class
        while (it.hasNext()) {
            OntClass i = (OntClass) it.next();
            String label = i.getLabel(null);
            if (label == null) {
                log.info("Skipping:" + i.getLabel(null) + " = " + i.getURI());
                continue;
            }
            mainConcept = model.createResource(i.getURI());
            mainConcept.addProperty(RDF.type, Vocabulary.ABAName);
            Resource r = Vocabulary.makeNeurotermNode(label, model);
            mainConcept.addLiteral(RDFS.label, label);
            mainConcept.addProperty(Vocabulary.has_label_term, r);
            log.info(i.getLabel(null) + " = " + i.getURI());
        }
    }

    public void addNN2010Nodes() throws Exception {
        // was NN2002
        // make NN nodes, with ID as URL and name as label
        // NeuroNames Ontology of Mammalian Neuroanatomy: NN2010
        NeuroNamesLoader NN2010 = new NeuroNamesLoader();
        // load main labels
        NN2010.addClassicalStructuresToModel(model);
        NN2010.addAncillaryStructuresToModel(model);
        NN2010.addSynonymsToModel(model);
    }

    public void addBAMSNodes() {
        BAMSDataLoader bamsLoader = new BAMSDataLoader();
        bamsLoader.addToModel(model);
    }

    public void extendBAMSNodes() throws Exception {
        BAMSDataLoader bamsLoader = new BAMSDataLoader();
        // open up spreadsheet
        String filename = Config.config.getString("whitetext.resolve.addedBAMSSynonyms");
        log.info(filename);
        HSSFSheet sheet = ExcelUtil.getSheetFromFile(filename, "Sheet0");
        SynonymSchema schema = new SynonymSchema();
        int row = 1;
        int acceptCount = 0;
        while (true) {
            row++;
            String BAMSName = ExcelUtil.getValue(sheet, row, schema.getPosition("BAMSName"));
            String termString = ExcelUtil.getValue(sheet, row, schema.getPosition("Term"));
            if (BAMSName == null) {
                break;
            }
            // log.info( BAMSName );
            String acceptValue = ExcelUtil.getValue(sheet, row, schema.getPosition("Accept"));
            boolean accept;
            if (acceptValue != null) {
                accept = acceptValue.equals("X");
            } else {
                accept = false;
            }
            if (accept) {
                acceptCount++;
                // get URI for BAMS node
                OntClass hitRegion = null;
                for (OntClass region : bamsLoader.getAllBrianRegions()) {
                    // any synonyms in BAMS? don't think there are any
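                    // Descriptive note (added): this is a linear scan over all BAMS
                    // regions, matching the spreadsheet entry by exact label.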
                    String label = region.getLabel(null);
                    // compare via BAMSName (known non-null here) to avoid an NPE on
                    // unlabelled regions
                    if (BAMSName.equals(label)) {
                        hitRegion = region;
                        break;
                    }
                }
                if (hitRegion == null) {
                    log.info("Can't find:" + BAMSName);
                    continue;
                }
                // create synonym node
                // create a link between them
                Resource r = model.createResource(hitRegion.getURI());
                r.addProperty(Vocabulary.has_manual_link, Vocabulary.makeNeurotermNode(termString, model));
            }
        }
        log.info("Accept:" + acceptCount + " of " + row);
        // create nodes
    }

    public void addManualLinks() throws Exception {
        LoadManualMappings manuals = new LoadManualMappings();
        manuals.addToModel(model);
    }

    public void addDoubleMentionsToModel(String annotationSetNormal, String annotationSetUnseen, String dataStore) {
        GateInterface gateInt = new GateInterface(dataStore);
        // two sets of documents, unseen with predicted annotations and normal with annotated spans
        List<ConnectionsDocument> normalDocs = gateInt.getDocuments();
        List<ConnectionsDocument> unSeenDocs = GateInterface.getDocuments(gateInt.getUnseenCorp());
        ResolveBrianRegions regionGetterUnseen = new ResolveBrianRegions(unSeenDocs, annotationSetUnseen);
        ResolveBrianRegions regionGetterNormal = new ResolveBrianRegions(normalDocs, annotationSetNormal);
        Collection<String> allRegions = (Collection<String>) Util.union(regionGetterNormal.getAllBrainRegionText(),
                regionGetterUnseen.getAllBrainRegionText());
        log.info("Brain regions:" + allRegions.size());
        CountingMap<String> uniqueAnnotationTextUnSeen = regionGetterUnseen.getAllBrainRegionTextCounted();
        CountingMap<String> uniqueAnnotationTextNormal = regionGetterNormal.getAllBrainRegionTextCounted();
        StringToStringSetMap pmidMapUnseen = regionGetterUnseen.getAllBrainRegionTextToPMID();
        StringToStringSetMap pmidMapNormal = regionGetterNormal.getAllBrainRegionTextToPMID();
        log.info("PMID MAP unseen:" + pmidMapUnseen.getExpandedSize());
        log.info("PMID MAP normal:" + pmidMapNormal.getExpandedSize());
        for (String region : allRegions) {
            Resource r = Vocabulary.makeMentionNode(region, model);
            Integer occurancesUnseen = uniqueAnnotationTextUnSeen.get(region);
            if (occurancesUnseen == null) occurancesUnseen = 0;
            Integer occurancesNormal = uniqueAnnotationTextNormal.get(region);
            if (occurancesNormal == null) occurancesNormal = 0;
            r.addLiteral(Vocabulary.number_of_occurances, occurancesUnseen + occurancesNormal);
            Set<String> pmidsUnseen = pmidMapUnseen.get(region);
            if (pmidsUnseen == null) pmidsUnseen = new HashSet<String>();
            Set<String> pmidsNormal = pmidMapNormal.get(region);
            if (pmidsNormal == null) pmidsNormal = new HashSet<String>();
            Set<String> pmids = (Set<String>) Util.union(pmidsNormal, pmidsUnseen);
            r.addLiteral(Vocabulary.number_of_abstracts, pmids.size());
            r.addLiteral(Vocabulary.annotation_set, annotationSetNormal + "|" + annotationSetUnseen);
            for (String pmid : pmids) {
                r.addProperty(Vocabulary.in_PMID, model.createResource(Vocabulary.getpubmedURIPrefix() + pmid));
            }
        }
    }

    public void addDatesToModel(String dataStore, boolean useUnseenCorp) throws Exception {
        List<ConnectionsDocument> docs = getDataStoreDocs(dataStore, useUnseenCorp);
        GateInterface gateInt = new GateInterface(dataStore);
        for (ConnectionsDocument doc : docs) {
            String PMID = doc.getPMID();
            // for ( String PMID : PMIDs ) {
            // ConnectionsDocument doc = gateInt.getByPMID( PMID );
            Date date = doc.getPubDate();
            // check if it's in the RDF?
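            // Descriptive note (added): the publication date is stored as a typed
            // literal on the PubMed document resource created below.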
            Resource docResource = model.createResource(Vocabulary.getpubmedURIPrefix() + PMID);
            Calendar pubCal = Calendar.getInstance();
            pubCal.setTime(date);
            docResource.addLiteral(Vocabulary.publication_date, pubCal);
            log.info(docResource.getURI() + "->" + pubCal.toString());
        }
        log.info(docs.size() + " abstracts labelled with publication dates");
    }

    public void addSpeciesToModel(List<ConnectionsDocument> docs) throws Exception {
        SpeciesLoader filterLoader = new SpeciesLoader();
        for (ConnectionsDocument doc : docs) {
            filterLoader.addToModel(doc, model);
        }
        log.info(docs.size() + " abstracts labelled for species");
    }

    public void addSpeciesToModel(String dataStore, boolean useUnseenCorp) throws Exception {
        List<ConnectionsDocument> docs = getDataStoreDocs(dataStore, useUnseenCorp);
        addSpeciesToModel(docs);
    }

    private List<ConnectionsDocument> getDataStoreDocs(String dataStore, boolean useUnseenCorp) {
        GateInterface gateInt = new GateInterface(dataStore);
        List<ConnectionsDocument> docs;
        if (!useUnseenCorp) {
            docs = gateInt.getDocuments();
        } else {
            docs = GateInterface.getDocuments(gateInt.getUnseenCorp());
        }
        return docs;
    }

    public void addMentionsToModel(String annotationSet, String dataStore, boolean useUnseenCorp) {
        List<ConnectionsDocument> docs = getDataStoreDocs(dataStore, useUnseenCorp);
        ResolveBrianRegions regionGetter = new ResolveBrianRegions(docs, annotationSet);
        log.info("Brain regions:" + regionGetter.getAllBrainRegionText().size());
        CountingMap<String> uniqueAnnotationText = regionGetter.getAllBrainRegionTextCounted();
        StringToStringSetMap pmidMap = regionGetter.getAllBrainRegionTextToPMID();
        for (String region : uniqueAnnotationText.keySet()) {
            Resource r = Vocabulary.makeMentionNode(region, model);
            r.addLiteral(Vocabulary.number_of_occurances, uniqueAnnotationText.get(region));
            Set<String> pmids = pmidMap.get(region);
            r.addLiteral(Vocabulary.number_of_abstracts, pmids.size());
            r.addLiteral(Vocabulary.annotation_set, annotationSet);
            for (String pmid : pmids) {
                r.addProperty(Vocabulary.in_PMID, model.createResource(Vocabulary.getpubmedURIPrefix() + pmid));
            }
        }
    }

    public void removeMentionsFromModel(String annotationSet, String dataStore) {
        GateInterface gateInt = new GateInterface(dataStore);
        List<ConnectionsDocument> docs = gateInt.getDocuments();
        ResolveBrianRegions regionGetter = new ResolveBrianRegions(docs, annotationSet);
        CountingMap<String> uniqueAnnotationText = regionGetter.getAllBrainRegionTextCounted();
        for (String region : uniqueAnnotationText.keySet()) {
            Resource r = Vocabulary.makeMentionNode(region, model);
            StmtIterator itr = model.listStatements(r, null, (RDFNode) null);
            model.remove(itr);
        }
    }

    public static void corpusLookTemp(String set, String store) {
        GateInterface gateInt = new GateInterface(store);
        List<ConnectionsDocument> docs = gateInt.getDocuments();
        ResolveBrianRegions regionGetter = new ResolveBrianRegions(docs, set);
        CountingMap<String> uniqueAnnotationText = regionGetter.getAllBrainRegionTextCounted();
        log.info(set + " size:" + uniqueAnnotationText.size());
        log.info(set + " sum:" + uniqueAnnotationText.summation());
    }

    public void readIn() throws Exception {
        model.read(new FileInputStream(Config.config.getString("resolve.Lexicon.RDF")), null);
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        String storeLoc = Config.config.getString("whitetext.datastore.location");
        // corpusLookTemp( "UnionMerge", Config.config.getString( "whitetext.datastore.location" ) );
        // corpusLookTemp( "UnionMerge", "/home/leon/Desktop/GATEDataStoreAfterTracts/store/" );
// corpusLookTemp( "Mallet", "/home/leon/Desktop/fromKrusty/GATEDataStoreAfterTracts/store/" ); // System.exit( 1 ); boolean useUnseen = true; MakeLexiconRDFModel maker; maker = new MakeLexiconRDFModel(); maker.readIn(); // maker.addABCDNodes(); maker.addDatesToModel(storeLoc, useUnseen); maker.writeOut(); System.exit(1); useUnseen = false; maker.addSpeciesToModel(storeLoc, useUnseen); maker.writeOut(); System.exit(1); // addSpeciesToModel( String annotationSet, String dataStore, boolean useUnseenCorp ) { ResolutionRDFModel resModel2 = new ResolutionRDFModel(maker.model, true); // resModel2.getStats(); // resModel2.loadManualMatches(); // resModel2.createMatches3(); // resModel2.getStats(); resModel2 = new ResolutionRDFModel(Config.config.getString("resolve.Lexicon.resolution.RDF")); resModel2.getStats(); System.exit(1); maker = new MakeLexiconRDFModel(); maker.readIn(); maker.removeMentionsFromModel("UnionMerge", storeLoc); boolean useUnseenCorp = true; // maker.addMentionsToModel( "Mallet", storeLoc, useUnseenCorp ); // maker.addMentionsToModel( "Mallet", storeLoc, useUnseenCorp ); maker.addDoubleMentionsToModel("UnionMerge", "Mallet", storeLoc); maker.writeOut(Config.config.getString("resolve.Lexicon.RDF.allComp")); ResolutionRDFModel resModel = new ResolutionRDFModel(maker.model, true); resModel.getStats(); System.exit(1); // from scratch - birnlex is broken maker.addNN2010Nodes(); maker.addNN2007Nodes(); // mouse and rat // // maker.addBIRNLexNodes(); // // maker.addManualLinks(); maker.addBredeNodes(); // maker.addBAMSNodes(); // // TODO adding BIRNLex nodes is broken, need to update ontology code in basecode!! maker.readIn(); // maker.addMentionsToModel( "UnionMerge", Config.config.getString( "whitetext.datastore.location" ) ); maker.addABANodes(); // maker.writeOut(); // link terms using bag of words or stemming - see ResolutionRDFModel } public Model getModel() { return model; } }