ubic.GEOMMTx.MeSHMapperTest.java Source code

Java tutorial

Introduction

Here is the source code for ubic.GEOMMTx.MeSHMapperTest.java

Source

/*
 * The GEOMMTx project
 * 
 * Copyright (c) 2007 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.GEOMMTx;

import gov.nih.nlm.nls.nlp.textfeatures.Candidate;
import gov.nih.nlm.nls.nlp.textfeatures.Phrase;
import gov.nih.nlm.nls.nlp.textfeatures.UMLS_SemanticTypePointer;

import java.io.File;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import ubic.GEOMMTx.evaluation.CUISUIPair;
import ubic.GEOMMTx.evaluation.EvaluatePhraseToCUISpreadsheet;
import ubic.GEOMMTx.util.SetupParameters;
import ubic.basecode.dataStructure.CountingMap;
import ubic.basecode.io.excel.ExcelUtil;
import au.com.bytecode.opencsv.CSVReader;

/**
 * Exploratory tests that run free text through MMTx and collect the MeSH codes attached to the concepts it finds.
 * <p>
 * The tests do not assert on the predictions; they accumulate counters (predictions made, inputs with no
 * prediction, overlap with a curated answer set) and write them to the log.
 * 
 * @author leon
 * @version $Id$
 */
public class MeSHMapperTest {

    protected static Log log = LogFactory.getLog(MeSHMapperTest.class);
    static String[] PHRASE_PARAMS = new String[] { "--no_acros_abbrs", "--term_processing", "--ignore_word_order",
            "--allow_concept_gaps" };

    /** Prefix put in front of CUI and SUI values before they are looked up in {@link #rejectedCUISUIPairs}. */
    private static final String BASE_UMLS = "http://www.purl.org/umls/umls#";

    // Shared fixtures, built once in setupOnce().
    private static GetUMLSCodes mapper;
    private static MMTxRunner mmtx;
    private static Set<CUISUIPair> rejectedCUISUIPairs;
    private static Map<String, Set<UMLSSourceCode>> sourceMap;

    // Per-test filters, re-created in setup().
    private Set<String> acceptedSemanticTypes;
    private Set<String> rejectedConcepts;
    private CountingMap<String> semTypeMap;

    // Per-test statistics, cleared by resetCounters().
    private int CUISUIrejects = 0;
    private int answerCount;
    private int atLeastOne;
    private int matchedCount;
    private int predictedCount;
    private int zeroCalls;

    /**
     * Builds the shared fixtures: the MMTx runner, the CUI to source-code map and the rejected CUI/SUI pairs.
     */
    @BeforeClass
    public static void setupOnce() {
        mmtx = new MMTxRunner(null, 0, SetupParameters.getStringArray("geommtx.annotator.mmtxOptions"));
        mapper = new GetUMLSCodes(true);
        sourceMap = mapper.getUMLSCodeMap();
        EvaluatePhraseToCUISpreadsheet evalSheet = new EvaluatePhraseToCUISpreadsheet();
        rejectedCUISUIPairs = evalSheet.getRejectedSUIs();
    }

    /**
     * Gives every test empty filter sets, an empty semantic type tally and zeroed counters.
     */
    @Before
    public void setup() {
        acceptedSemanticTypes = new HashSet<String>();
        rejectedConcepts = new HashSet<String>();
        semTypeMap = new CountingMap<String>();
        resetCounters();
    }

    /**
     * Reads the comma-separated classpath resource {@code /LMD-FEATURES.txt} and predicts MeSH terms for column 2
     * (logged under the label "name") and then for column 3 (label "desc") of every row, without semantic type
     * filtering. Column 0 is used as the row identifier in the log output.
     * 
     * @throws Exception if the resource is missing, cannot be read, or MMTx fails
     */
    @Test
    public void testPredictions() throws Exception {

        // Fail with a readable message instead of a bare NullPointerException when the resource is absent.
        try (CSVReader reader = new CSVReader(new InputStreamReader(Objects.requireNonNull(
                this.getClass().getResourceAsStream("/LMD-FEATURES.txt"),
                "Test resource /LMD-FEATURES.txt was not found on the classpath")), ',', '"')) {
            List<String[]> lines = reader.readAll();

            // First pass: the name column.
            for (String[] line : lines) {
                String ID = line[0];
                String name = line[2];

                Set<String> predictions = getMeSHIDs(name, false);
                for (String prediction : predictions) {
                    log.info(ID + "|name|" + prediction);
                }

                if (predictions.isEmpty()) {
                    zeroCalls++;
                }
                predictedCount += predictions.size();
            }
            logPredictionCounters();

            // Second pass: the description column; blank descriptions count as inputs without predictions.
            for (String[] line : lines) {
                String ID = line[0];
                String desc = line[3];
                if (desc.trim().equals("")) {
                    zeroCalls++;
                    continue;
                }

                Set<String> predictions = getMeSHIDs(desc, false);
                for (String prediction : predictions) {
                    log.debug(ID + "|desc|" + prediction);
                }

                if (predictions.isEmpty()) {
                    zeroCalls++;
                }
                predictedCount += predictions.size();
            }
            logPredictionCounters();
        }
    }

    /**
     * Compares predictions against the curated MeSH answers in the first rows of {@code SPOMIMtoMeSH.xls}. Only
     * three semantic types are accepted and three generic concept names are rejected. Rows whose first column is
     * "SP" are skipped.
     * 
     * @throws Exception if the spreadsheet cannot be read or MMTx fails
     */
    @Test
    public void testOmimMesh() throws Exception {

        Set<String> semanticTypes = new HashSet<String>();
        semanticTypes.add("Congenital Abnormality");
        semanticTypes.add("Disease or Syndrome");
        semanticTypes.add("Neoplastic Process");
        setAcceptedSemanticTypes(semanticTypes);

        Set<String> conceptsToReject = new HashSet<String>();
        conceptsToReject.add("Disease");
        conceptsToReject.add("Syndrome");
        conceptsToReject.add("Carcinoma");
        setRejectedConcepts(conceptsToReject);

        log.debug(getMeSHIDs("Juvenile Myoclonic Epilepsy", true));

        // File(parent, child) joins the two parts with a single separator.
        String sheetPath = new File(SetupParameters.getString("geommtx.home"),
                "src/test/resources/SPOMIMtoMeSH.xls").getPath();
        HSSFSheet sheet = ExcelUtil.getSheetFromFile(sheetPath, "Sheet1");

        int howMany = 0;
        int stopAfter = 10;
        for (int i = 0; i < stopAfter; i++) {
            // skip swissprot; constant-first equals tolerates an empty cell
            if ("SP".equals(ExcelUtil.getValue(sheet, i, 0))) {
                continue;
            }

            log.debug(ExcelUtil.getValue(sheet, i, 1));

            // curated answers are the space separated tokens of column E (pos 4)
            Set<String> answers = new HashSet<String>();
            String curated = ExcelUtil.getValue(sheet, i, 4);
            if (curated != null) {
                StringTokenizer meshBreaker = new StringTokenizer(curated, " ");
                while (meshBreaker.hasMoreTokens()) {
                    answers.add(meshBreaker.nextToken());
                }
            }

            // the input text/description comes from column C (pos 2)
            String text = ExcelUtil.getValue(sheet, i, 2);
            if (text != null && text.trim().length() != 0) {
                howMany++;
                evaluate(text, answers);
            }
        }
        log.debug("Descriptions seen:" + howMany);
        printStats();
    }

    /**
     * Predicts MeSH codes for {@code text} using semantic type filtering and folds the comparison with the
     * curated {@code answers} into the counters. Predictions are reduced to their code field before comparing,
     * because {@link #getMeSHIDs(String, boolean)} returns composite {@code phrase|concept|code|score} strings
     * that could never equal a bare answer token.
     * 
     * @param text free text to annotate
     * @param answers curated answer tokens for the text; not modified
     * @throws GEOMMTXException if MMTx fails
     */
    private void evaluate(String text, Set<String> answers) throws GEOMMTXException {
        Set<String> predictedCodes = new HashSet<String>();
        for (String prediction : getMeSHIDs(text, true)) {
            predictedCodes.add(extractMeSHCode(prediction));
        }

        if (predictedCodes.isEmpty()) {
            zeroCalls++;
        }
        predictedCount += predictedCodes.size();
        answerCount += answers.size();

        // Intersect on a copy so the caller's set is left untouched.
        Set<String> matched = new HashSet<String>(answers);
        matched.retainAll(predictedCodes);
        matchedCount += matched.size();
        if (!matched.isEmpty()) {
            atLeastOne++;
        }
    }

    /**
     * Pulls the code field out of a {@code phrase|concept|code|score} string. The string is parsed from the
     * right, so pipe characters inside the phrase or concept part do not matter.
     * 
     * @param prediction a string produced by {@link #getMeSHIDs(String, boolean)}
     * @return the text between the last two pipes, or the whole input if it contains no pipe
     */
    private static String extractMeSHCode(String prediction) {
        int scoreSeparator = prediction.lastIndexOf('|');
        if (scoreSeparator < 0) {
            return prediction;
        }
        int codeSeparator = prediction.lastIndexOf('|', scoreSeparator - 1);
        return prediction.substring(codeSeparator + 1, scoreSeparator);
    }

    /**
     * Runs MMTx over {@code text} and returns one entry for every MeSH source code of every candidate concept
     * that survives the filters. A candidate is dropped when its concept name is in {@code rejectedConcepts},
     * when its CUI/SUI pair is in {@code rejectedCUISUIPairs}, or, if {@code useSemTypes} is set, when none of
     * its semantic types is in {@code acceptedSemanticTypes}.
     * 
     * @param text free text to annotate; parentheses, slashes and line breaks are replaced by spaces first
     * @param useSemTypes whether to require an accepted semantic type
     * @return strings of the form {@code phrase|concept|code|score}
     * @throws GEOMMTXException if MMTx fails
     */
    private Set<String> getMeSHIDs(String text, boolean useSemTypes) throws GEOMMTXException {
        text = text.replace(')', ' ');
        text = text.replace('(', ' ');
        text = text.replace('/', ' ');
        text = text.replace('\r', ' ');
        text = text.replace('\n', ' ');

        Set<String> result = new HashSet<String>();
        for (Phrase p : mmtx.getPhrases(text)) {

            for (Candidate c : mmtx.getConcepts(p)) {

                if (rejectedConcepts.contains(c.getUMLS_ConceptName())) {
                    continue;
                }

                if (rejectedCUISUIPairs.contains(new CUISUIPair(BASE_UMLS + c.getCUI(), BASE_UMLS + c.getSUI()))) {
                    CUISUIrejects++;
                    continue;
                }

                // Without semantic type filtering every candidate passes. The loop still visits every type
                // so that semTypeMap tallies all of them.
                boolean hasAcceptedSemanticType = !useSemTypes;
                for (UMLS_SemanticTypePointer semType : c.getSemanticTypes()) {
                    semTypeMap.increment(semType.getName());
                    if (acceptedSemanticTypes.contains(semType.getName())) {
                        hasAcceptedSemanticType = true;
                    }
                }

                // concept does not have accepted semantic type
                if (!hasAcceptedSemanticType) {
                    continue;
                }

                // only concepts with a source vocabulary listing can contribute
                Set<UMLSSourceCode> codes = sourceMap.get(c.getCUI());
                if (codes == null) {
                    continue;
                }
                for (UMLSSourceCode code : codes) {
                    // keep it only if the source is MeSH
                    if (code.getSource().startsWith("MSH")) {
                        result.add(p.getOriginalString() + "|" + c.getConcept().toString() + "|"
                                + code.getCode() + "|" + c.getFinalScore());
                    }
                }
            }
        }

        return result;
    }

    /**
     * Writes the running counters used by {@link #testPredictions()} to the debug log.
     */
    private void logPredictionCounters() {
        log.debug("Rejected anot:" + CUISUIrejects);
        log.debug("predictedCount:" + predictedCount);
        log.debug("zeroCalls:" + zeroCalls);
    }

    /**
     * Writes the evaluation counters to the info log.
     */
    private void printStats() {
        log.info("Predicted:" + predictedCount);
        log.info("Number of inputs that had no predicted MeSH terms:" + zeroCalls);
        log.info("Gold standard:" + answerCount);
        log.info("Intersection:" + matchedCount);
        log.info("Number where predicted one:" + atLeastOne);
    }

    /**
     * Zeroes every statistic, including the zero-call and CUI/SUI reject counters.
     */
    private void resetCounters() {
        predictedCount = 0;
        answerCount = 0;
        matchedCount = 0;
        atLeastOne = 0;
        zeroCalls = 0;
        CUISUIrejects = 0;
    }

    private void setAcceptedSemanticTypes(Set<String> types) {
        acceptedSemanticTypes = types;
    }

    private void setRejectedConcepts(Set<String> concepts) {
        rejectedConcepts = concepts;
    }
}