Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.ctakes.ytex.kernel.metric; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import net.sf.ehcache.Cache; import net.sf.ehcache.CacheManager; import net.sf.ehcache.Element; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.ctakes.ytex.kernel.ImputedFeatureEvaluator; import org.apache.ctakes.ytex.kernel.InfoContentEvaluator; import org.apache.ctakes.ytex.kernel.IntrinsicInfoContentEvaluator; import org.apache.ctakes.ytex.kernel.OrderedPair; import org.apache.ctakes.ytex.kernel.SimSvcContextHolder; import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao; import org.apache.ctakes.ytex.kernel.dao.ConceptDao; import org.apache.ctakes.ytex.kernel.model.ConcRel; import org.apache.ctakes.ytex.kernel.model.ConceptGraph; import org.apache.ctakes.ytex.kernel.model.FeatureRank; import org.apache.ctakes.ytex.kernel.pagerank.PageRankService; import org.springframework.transaction.PlatformTransactionManager; import org.springframework.transaction.TransactionStatus; import org.springframework.transaction.support.TransactionCallback; import org.springframework.transaction.support.TransactionTemplate; import com.google.common.collect.ImmutableMap; /** * compute concept similarity * * @author vijay * */ public class ConceptSimilarityServiceImpl implements ConceptSimilarityService { private static final Log log = LogFactory.getLog(ConceptSimilarityServiceImpl.class); private static String formatPaths(List<LCSPath> lcsPaths) { StringBuilder b = new StringBuilder(); Iterator<LCSPath> lcsPathIter = lcsPaths.iterator(); while (lcsPathIter.hasNext()) { LCSPath lcsPath = lcsPathIter.next(); String lcs = lcsPath.getLcs(); b.append(lcs); b.append("="); b.append(lcsPath.toString()); if (lcsPathIter.hasNext()) b.append("|"); } return b.toString(); } @SuppressWarnings("static-access") public static void main(String args[]) throws IOException { Options options = new Options(); options.addOption(OptionBuilder.withArgName("concepts").hasArg().withDescription( "concept pairs or a file containing concept pairs. To specify pairs on command line, separate concepts by comma, concept pairs by semicolon. For file, separate concepts by comma or tab, each concept pair on a new line.") .isRequired(true).create("concepts")); options.addOption(OptionBuilder.withArgName("metrics").hasArg().withDescription( "comma-separated list of metrics. Valid metrics: " + Arrays.asList(SimilarityMetricEnum.values())) .isRequired(true).create("metrics")); options.addOption(OptionBuilder.withArgName("out").hasArg() .withDescription("file to write oputput to. if not specified, output sent to stdout.") .create("out")); options.addOption(OptionBuilder.withArgName("lcs") .withDescription("output lcs and path for each concept pair").create("lcs")); try { CommandLineParser parser = new GnuParser(); CommandLine line = parser.parse(options, args); String concepts = line.getOptionValue("concepts"); String metrics = line.getOptionValue("metrics"); String out = line.getOptionValue("out"); boolean lcs = line.hasOption("lcs"); PrintStream os = null; try { if (out != null) { os = new PrintStream(new BufferedOutputStream(new FileOutputStream(out))); } else { os = System.out; } List<ConceptPair> conceptPairs = parseConcepts(concepts); List<SimilarityMetricEnum> metricList = parseMetrics(metrics); ConceptSimilarityService simSvc = SimSvcContextHolder.getApplicationContext() .getBean(ConceptSimilarityService.class); List<SimilarityInfo> simInfos = lcs ? new ArrayList<SimilarityInfo>(conceptPairs.size()) : null; List<ConceptPairSimilarity> conceptSimMap = simSvc.similarity(conceptPairs, metricList, null, lcs); printSimilarities(conceptPairs, conceptSimMap, metricList, simInfos, lcs, os); // try { // Thread.sleep(60*1000); // } catch (InterruptedException e) { // e.printStackTrace(); // } } finally { if (out != null) { try { os.close(); } catch (Exception e) { } } } } catch (ParseException pe) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("java " + ConceptSimilarityServiceImpl.class.getName() + " get concept similiarity", options); } } private static List<ConceptPair> parseConcepts(String concepts) throws IOException { BufferedReader r = null; try { List<ConceptPair> conceptPairs = new ArrayList<ConceptPair>(); File f = new File(concepts); if (f.exists()) { r = new BufferedReader(new FileReader(f)); } else { r = new BufferedReader(new StringReader(concepts)); } String line = null; while ((line = r.readLine()) != null) { // for command line, split pairs by semicolon String lines[] = line.split(";"); for (String subline : lines) { String pair[] = subline.split(",|\\t"); if (pair.length != 2) { System.err.println("cannot parse concept pair: " + subline); } else { conceptPairs.add(new ConceptPair(pair[0], pair[1])); } } } return conceptPairs; } finally { if (r != null) r.close(); } } private static List<SimilarityMetricEnum> parseMetrics(String metrics) { String ms[] = metrics.split(","); List<SimilarityMetricEnum> metricSet = new ArrayList<SimilarityMetricEnum>(); for (String metric : ms) { SimilarityMetricEnum m = SimilarityMetricEnum.valueOf(metric); if (m == null) System.err.println("invalid metric: " + ms); else metricSet.add(m); } return metricSet; } private static void printSimilarities(List<ConceptPair> conceptPairs, List<ConceptPairSimilarity> conceptSimList, List<SimilarityMetricEnum> metricList, List<SimilarityInfo> simInfos, boolean lcs, PrintStream os) { // print header os.print("Concept 1\tConcept 2"); for (SimilarityMetricEnum metric : metricList) { os.print("\t"); os.print(metric); } if (lcs) { os.print("\tlcs(s)\tcorpus lcs\tintrinsic lcs\tpaths"); } os.println(); // print content for (ConceptPairSimilarity csim : conceptSimList) { ConceptPair p = csim.getConceptPair(); os.print(p.getConcept1()); os.print("\t"); os.print(p.getConcept2()); for (Double sim : csim.getSimilarities()) { os.print("\t"); if (sim != null) os.print(String.format("%6f", sim)); else os.print(0d); } if (lcs) { SimilarityInfo simInfo = csim.getSimilarityInfo(); os.print("\t"); Iterator<String> lcsIter = simInfo.getLcses().iterator(); while (lcsIter.hasNext()) { os.print(lcsIter.next()); if (lcsIter.hasNext()) os.print('|'); } os.print("\t"); os.print(simInfo.getCorpusLcs() == null ? "" : simInfo.getCorpusLcs()); os.print("\t"); os.print(simInfo.getIntrinsicLcs() == null ? "" : simInfo.getIntrinsicLcs()); os.print("\t"); os.print(formatPaths(simInfo.getLcsPaths())); } os.println(); } } private CacheManager cacheManager; private ConceptGraph cg = null; private ClassifierEvaluationDao classifierEvaluationDao; private ConceptDao conceptDao; private String conceptGraphName; private String conceptSetName; // /** // * information concept cache // */ // private Map<String, Double> corpusICMap = null; private String corpusName; private Map<String, BitSet> cuiTuiMap; // private Map<String, ConceptInfo> conceptInfoMap = null; // private ConceptInfo[] conceptInfoCache; /** * cache to hold lcs's */ private Cache lcsCache; private String lcsImputedType = ImputedFeatureEvaluator.MeasureType.INFOGAIN.getName(); private PageRankService pageRankService; private boolean preload = true; private Map<String, Double> corpusICMap; private Map<SimilarityMetricEnum, SimilarityMetric> similarityMetricMap = null; private PlatformTransactionManager transactionManager; private List<String> tuiList; private void addCuiTuiToMap(Map<String, Set<String>> cuiTuiMap, Map<String, String> tuiMap, String cui, String tui) { // get 'the' tui string if (tuiMap.containsKey(tui)) tui = tuiMap.get(tui); else tuiMap.put(tui, tui); Set<String> tuis = cuiTuiMap.get(cui); if (tuis == null) { tuis = new HashSet<String>(); cuiTuiMap.put(cui, tuis); } tuis.add(tui); } @Override public Object[] getBestLCS(Set<String> lcses, boolean intrinsicIC, Map<String, Double> conceptFilter) { Map<String, Double> lcsICMap = new HashMap<String, Double>(lcses.size()); // if (isPreload()) { // look in conceptInfoMap for info content for (String lcs : lcses) { lcsICMap.put(lcs, getIC(lcs, intrinsicIC)); // } // } else { // // load info content on demand // Map<String, FeatureRank> frMap = getICOnDemand(lcses, // intrinsicIC); // for (Map.Entry<String, FeatureRank> frMapEntry : // frMap.entrySet()) { // lcsICMap.put(frMapEntry.getKey(), frMapEntry.getValue() // .getEvaluation()); // } } if (conceptFilter != null) { double currentBest = -1; Set<String> bestLcses = new HashSet<String>(); for (String lcs : lcses) { if (conceptFilter.containsKey(lcs)) { double lcsEval = conceptFilter.get(lcs); if (currentBest == -1 || lcsEval > currentBest) { bestLcses.clear(); bestLcses.add(lcs); currentBest = lcsEval; } else if (currentBest == lcsEval) { bestLcses.add(lcs); } } } if (currentBest < 0) currentBest = 0d; if (bestLcses.size() > 0) { return this.getBestLCS(bestLcses, lcsICMap); } else { // no lcses made the cut return null; } } else { // unfiltered - get the lowest ic return this.getBestLCS(lcses, lcsICMap); } } public Object[] getBestLCS(Set<String> lcses, Map<String, Double> icMap) { double ic = -1; String bestLCS = null; for (String lcs : lcses) { Double ictmp = icMap.get(lcs); if (ictmp != null && ic < ictmp.doubleValue()) { ic = ictmp; bestLCS = lcs; } } if (ic < 0) ic = 0d; return new Object[] { bestLCS, ic }; } // /** // * return lin measure. optionally filter lin measure so that only concepts // * that have an lcs that is relevant to the classification task have a // * non-zero lin measure. // * // * relevant concepts are those whose evaluation wrt the label exceeds a // * threshold. // * // * @param concept1 // * @param concept2 // * @param label // * if not null, then filter lcses. // * @param lcsMinEvaluation // * if gt; 0, then filter lcses. this is the threshold. // * @return 0 - no lcs, or no lcs that meets the threshold. // */ // @Override // public double filteredLin(String concept1, String concept2, // Map<String, Double> conceptFilter) { // double ic1 = getIC(concept1); // double ic2 = getIC(concept2); // // lin not defined if one of the concepts doesn't exist in the corpus // if (ic1 == 0 || ic2 == 0) // return 0; // double denom = getIC(concept1) + getIC(concept2); // if (denom != 0) { // ConcRel cr1 = cg.getConceptMap().get(concept1); // ConcRel cr2 = cg.getConceptMap().get(concept2); // if (cr1 != null && cr2 != null) { // Set<String> lcses = new HashSet<String>(); // int dist = getLCSFromCache(cr1, cr2, lcses); // if (dist > 0) { // double ic = getBestIC(lcses, conceptFilter); // return 2 * ic / denom; // } // } // } // return 0; // } // /** // * get the information content for the concept with the highest evaluation // * greater than a specified threshold. // * // * If threshold 0, get the lowest IC of all the lcs's. // * // * @param lcses // * the least common subsumers of a pair of concepts // * @param label // * label against which feature was evaluated // * @param lcsMinEvaluation // * threshold that the feature has to exceed. 0 for no filtering. // * @return 0 if no lcs that makes the cut. else find the lcs(es) with the // * maximal evaluation, and return getIC on these lcses. // * // * @see #getIC(Iterable) // */ // private double getBestIC(Set<String> lcses, // Map<String, Double> conceptFilter) { // if (conceptFilter != null) { // double currentBest = -1; // Set<String> bestLcses = new HashSet<String>(); // for (String lcs : lcses) { // if (conceptFilter.containsKey(lcs)) { // double lcsEval = conceptFilter.get(lcs); // if (currentBest == -1 || lcsEval > currentBest) { // bestLcses.clear(); // bestLcses.add(lcs); // currentBest = lcsEval; // } else if (currentBest == lcsEval) { // bestLcses.add(lcs); // } // } // } // if (bestLcses.size() > 0) { // return this.getIC(bestLcses); // } // } else { // // unfiltered - get the lowest ic // return this.getIC(lcses); // } // return 0; // } // private ConceptInfo getPreloadedConceptInfo(String conceptId) { // ConcRel cr = cg.getConceptMap().get(conceptId); // if (cr != null) { // return this.conceptInfoCache[cr.getNodeIndex()]; // } // return null; // } public CacheManager getCacheManager() { return cacheManager; } public ClassifierEvaluationDao getClassifierEvaluationDao() { return classifierEvaluationDao; } public ConceptDao getConceptDao() { return conceptDao; } // private String createKey(String c1, String c2) { // if (c1.compareTo(c2) < 0) { // return new StringBuilder(c1).append("-").append(c2).toString(); // } else { // return new StringBuilder(c2).append("-").append(c1).toString(); // } // } @Override public ConceptGraph getConceptGraph() { return cg; } public String getConceptGraphName() { return conceptGraphName; } public String getConceptSetName() { return conceptSetName; } public String getCorpusName() { return corpusName; } @Override public Map<String, BitSet> getCuiTuiMap() { return cuiTuiMap; } @Override public int getDepth(String concept) { // if (isPreload()) { // // preloaded all concept info - depth should be there // ConceptInfo ci = this.getPreloadedConceptInfo(concept); // if (ci != null) // return (int) ci.getDepth(); // } else { // // get the feature ranks for the intrinsic infocontent - // // rank = depth // Map<String, FeatureRank> frMap = getICOnDemand(new HashSet<String>( // Arrays.asList(concept)), true); // if (frMap.containsKey(concept)) // return frMap.get(concept).getRank(); // } ConcRel cr = this.cg.getConceptMap().get(concept); if (cr != null) return cr.getDepth(); return 0; } @Override public double getIC(String concept, boolean intrinsicICMap) { double ic = 0d; if (intrinsicICMap) { ConcRel cr = this.cg.getConceptMap().get(concept); if (cr != null) ic = cr.getIntrinsicInfoContent(); } else { Double icC = null; if (isPreload()) { // we preloaded all ic - just look in the cache icC = this.corpusICMap.get(concept); } else { // we need to load the ic from the database on demand Map<String, FeatureRank> frMap = getICOnDemand(new HashSet<String>(Arrays.asList(concept)), false); if (frMap.containsKey(concept)) return frMap.get(concept).getEvaluation(); } if (icC != null) ic = icC; } return ic; // if (isPreload()) { // ConceptInfo ci = this.getPreloadedConceptInfo(concept); // if (ci != null) // return intrinsicICMap ? ci.getIntrinsicIC() : ci.getCorpusIC(); // } else { // Map<String, FeatureRank> frMap = getICOnDemand(new HashSet<String>( // Arrays.asList(concept)), intrinsicICMap); // if (frMap.containsKey(concept)) // return frMap.get(concept).getEvaluation(); // } // return 0d; } private Map<String, FeatureRank> getICOnDemand(Set<String> lcses, boolean intrinsicIC) { if (lcses == null || lcses.isEmpty()) return new HashMap<String, FeatureRank>(0); Map<String, FeatureRank> lcsICMap; lcsICMap = this.classifierEvaluationDao.getFeatureRanks(lcses, intrinsicIC ? null : this.corpusName, intrinsicIC ? null : this.conceptSetName, null, intrinsicIC ? IntrinsicInfoContentEvaluator.INTRINSIC_INFOCONTENT : InfoContentEvaluator.INFOCONTENT, null, 0d, this.getConceptGraphName()); return lcsICMap; } // /** // * get the concept with the lowest Information Content of all the LCSs. // * Functionality copied from umls interface. // * // * @todo make this configurable/add a parameter - avg/min/max/median? // * @param lcses // * @return // */ // public double getIC(Iterable<String> lcses) { // double ic = 0; // for (String lcs : lcses) { // double ictmp = getIC(lcs); // if (ic < ictmp) // ic = ictmp; // } // return ic; // } // // public double getIC(String concept1) { // Double dRetVal = corpusICMap.get(concept1); // if (dRetVal != null) // return (double) dRetVal; // else // return 0; // } public int getLCS(String concept1, String concept2, Set<String> lcses, List<LCSPath> lcsPaths) { int lcsDist = 0; ConcRel cr1 = getConceptGraph().getConceptMap().get(concept1); ConcRel cr2 = getConceptGraph().getConceptMap().get(concept2); if (cr1 != null && cr2 != null) { lcses.clear(); if (lcsPaths == null) { // no need to get paths which we don't cache - look in the cache lcsDist = getLCSFromCache(cr1, cr2, lcses); } else { lcsPaths.clear(); // need to get paths - compute the lcses and their paths lcsDist = lcs(concept1, concept2, lcsPaths); for (LCSPath lcsPath : lcsPaths) { lcses.add(lcsPath.getLcs()); } } } else { if (log.isDebugEnabled()) { if (cr1 == null) log.debug("could not find concept:" + concept1); if (cr2 == null) log.debug("could not find concept:" + concept2); } } return lcsDist; } public Cache getLcsCache() { return lcsCache; } @SuppressWarnings("unchecked") private int getLCSFromCache(ConcRel cr1, ConcRel cr2, Set<String> lcses) { StringBuilder cacheKeyBuilder = new StringBuilder(this.conceptGraphName); cacheKeyBuilder.append( cr1.getConceptID().compareTo(cr2.getConceptID()) < 0 ? cr1.getConceptID() : cr2.getConceptID()); cacheKeyBuilder.append( cr1.getConceptID().compareTo(cr2.getConceptID()) >= 0 ? cr2.getConceptID() : cr1.getConceptID()); String cacheKey = cacheKeyBuilder.toString(); Element e = this.lcsCache != null ? this.lcsCache.get(cacheKey) : null; if (e != null) { // hit the cache - unpack the lcs if (e.getObjectValue() != null) { Object[] val = (Object[]) e.getObjectValue(); lcses.addAll((Set<String>) val[1]); return (Integer) val[0]; } else { return -1; } } else { // missed the cache - save the lcs Object[] val = null; Set<ConcRel> lcsCRSet = new HashSet<ConcRel>(2); int dist = ConcRel.getLeastCommonConcept(cr1, cr2, lcsCRSet, null); if (dist >= 0) { val = new Object[2]; val[0] = dist; for (ConcRel cr : lcsCRSet) { lcses.add(cr.getConceptID()); } val[1] = lcses; } if (this.lcsCache != null) { e = new Element(cacheKey, val); this.lcsCache.put(e); } return dist; } } public String getLcsImputedType() { return lcsImputedType; } public PageRankService getPageRankService() { return pageRankService; } public Map<SimilarityMetricEnum, SimilarityMetric> getSimilarityMetricMap() { return similarityMetricMap; } public PlatformTransactionManager getTransactionManager() { return transactionManager; } @Override public List<String> getTuiList() { return this.tuiList; } public void init() { log.info("begin initialization for concept graph: " + conceptGraphName); cg = conceptDao.getConceptGraph(conceptGraphName); if (cg == null) { log.warn("concept graph null, name: " + conceptGraphName); } else { initSimilarityMetricMap(); if (isPreload()) { try { TransactionTemplate t = new TransactionTemplate(this.transactionManager); t.setPropagationBehavior(TransactionTemplate.PROPAGATION_REQUIRES_NEW); t.execute(new TransactionCallback<Object>() { @Override public Object doInTransaction(TransactionStatus arg0) { initInfoContent(); initCuiTuiMapFromCorpus(); return null; } }); } catch (Exception e) { log.info("could not initialize cui-tui map: " + e.getMessage() + ". This is expected if you do not have umls installed in your db."); } } } log.info("end initialization for concept graph: " + conceptGraphName); } /** * load cui-tui for the specified corpus from the MRSTY table */ public void initCuiTuiMapFromCorpus() { // don't duplicate tui strings to save memory SortedMap<String, String> tuiMap = new TreeMap<String, String>(); Map<String, Set<String>> tmpTuiCuiMap = new HashMap<String, Set<String>>(); List<Object[]> listCuiTui = this.classifierEvaluationDao.getCorpusCuiTuis(this.getCorpusName(), this.getConceptGraphName(), this.getConceptSetName()); for (Object[] cuiTui : listCuiTui) { String cui = (String) cuiTui[0]; String tui = (String) cuiTui[1]; addCuiTuiToMap(tmpTuiCuiMap, tuiMap, cui, tui); } // map of tui - bitset index SortedMap<String, Integer> mapTuiIndex = new TreeMap<String, Integer>(); // list of tuis corresponding to bitset indices List<String> tmpTuiList = new ArrayList<String>(tuiMap.size()); int index = 0; for (String tui : tuiMap.keySet()) { mapTuiIndex.put(tui, index++); tmpTuiList.add(tui); } this.tuiList = Collections.unmodifiableList(tmpTuiList); // convert list of cuis into bitsets // Map<String, BitSet> tmpCuiTuiBitsetMap = new HashMap<String, // BitSet>(); ImmutableMap.Builder<String, BitSet> cuiTuiBitsetMapBuilder = new ImmutableMap.Builder<String, BitSet>(); for (Map.Entry<String, Set<String>> cuiTuiMapEntry : tmpTuiCuiMap.entrySet()) { // tmpCuiTuiBitsetMap.put(cuiTuiMapEntry.getKey(), // tuiListToBitset(cuiTuiMapEntry.getValue(), mapTuiIndex)); cuiTuiBitsetMapBuilder.put(cuiTuiMapEntry.getKey(), tuiListToBitset(cuiTuiMapEntry.getValue(), mapTuiIndex)); } // this.cuiTuiMap = Collections.unmodifiableMap(tmpCuiTuiBitsetMap); this.cuiTuiMap = cuiTuiBitsetMapBuilder.build(); } /** * initialize information content caches TODO replace strings with concept * ids from conceptGraph to save memory */ private void initInfoContent() { // log.info("loading intrinsic infocontent for concept graph: " // + conceptGraphName); // List<ConceptInfo> listConceptInfo = classifierEvaluationDao // .getIntrinsicInfoContent(conceptGraphName); // if (listConceptInfo.isEmpty()) { // log.warn("intrinsic info content not available! most similarity measures will not work"); // } // this.conceptInfoCache = new ConceptInfo[cg.getConceptMap().size()]; // for (ConceptInfo ci : listConceptInfo) { // ConcRel cr = cg.getConceptMap().get(ci.getConceptId()); // if (cr != null) { // // save a little memory by reusing the string // ci.setConceptId(cr.getConceptID()); // conceptInfoCache[cr.getNodeIndex()] = ci; // } // } // fill intrinsicIC // Map<String, FeatureRank> intrinsicICMap = classifierEvaluationDao // .getIntrinsicInfoContent(conceptGraphName); // for (Map.Entry<String, FeatureRank> icMapEntry : intrinsicICMap // .entrySet()) { // FeatureRank r = icMapEntry.getValue(); // ConcRel cr = cg.getConceptMap().get(r.getFeatureName()); // if (cr != null) { // ConceptInfo ci = new ConceptInfo(); // ci.setConceptId(cr.getConceptID()); // ci.setDepth(r.getRank()); // ci.setIntrinsicIC(r.getEvaluation()); // conceptInfoMap.put(ci.getConceptId(), ci); // } // } // fill corpusIC log.info("loading corpus infocontent for corpusName=" + corpusName + ", conceptGraphName=" + conceptGraphName + ", conceptSetName=" + conceptSetName); Map<String, Double> corpusICMap = classifierEvaluationDao.getInfoContent(corpusName, conceptGraphName, this.conceptSetName); if (corpusICMap == null || corpusICMap.isEmpty()) { log.warn("IC not found"); } ImmutableMap.Builder<String, Double> mb = new ImmutableMap.Builder<String, Double>(); for (Map.Entry<String, Double> corpusICEntry : corpusICMap.entrySet()) { ConcRel cr = cg.getConceptMap().get(corpusICEntry.getKey()); if (cr != null) { mb.put(cr.getConceptID(), corpusICEntry.getValue()); } } this.corpusICMap = mb.build(); // ConceptInfo ci = this.conceptInfoCache[cr.getNodeIndex()]; // if (ci == null) { // // this shouldn't happen! there should be intrinsic ic for // // this concept // ci = new ConceptInfo(); // ci.setConceptId(cr.getConceptID()); // this.conceptInfoCache[cr.getNodeIndex()] = ci; // } // ci.setCorpusIC(corpusICEntry.getValue()); // } // } } /** * initialize the metrics */ private void initSimilarityMetricMap() { log.info("initializing similarity measures"); // Double maxIC = this.classifierEvaluationDao.getMaxFeatureEvaluation( // null, null, null, // IntrinsicInfoContentEvaluator.INTRINSIC_INFOCONTENT, 0, 0, // conceptGraphName); // Integer maxDepth = this.classifierEvaluationDao // .getMaxDepth(conceptGraphName); double maxIC = this.cg.getIntrinsicICMax(); int maxDepth = this.cg.getDepthMax(); this.similarityMetricMap = new HashMap<SimilarityMetricEnum, SimilarityMetric>( SimilarityMetricEnum.values().length); if (maxDepth > 0) { this.similarityMetricMap.put(SimilarityMetricEnum.LCH, new LCHMetric(this, maxDepth)); this.similarityMetricMap.put(SimilarityMetricEnum.LIN, new LinMetric(this, false)); this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_LIN, new LinMetric(this, true)); this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_LCH, new IntrinsicLCHMetric(this, maxIC)); this.similarityMetricMap.put(SimilarityMetricEnum.PATH, new PathMetric(this)); this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_PATH, new IntrinsicPathMetric(this, maxIC)); this.similarityMetricMap.put(SimilarityMetricEnum.RADA, new RadaMetric(this, maxDepth)); this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_RADA, new IntrinsicRadaMetric(this, maxIC)); this.similarityMetricMap.put(SimilarityMetricEnum.SOKAL, new SokalSneathMetric(this)); this.similarityMetricMap.put(SimilarityMetricEnum.JACCARD, new JaccardMetric(this)); this.similarityMetricMap.put(SimilarityMetricEnum.WUPALMER, new WuPalmerMetric(this)); } else { this.similarityMetricMap.put(SimilarityMetricEnum.PAGERANK, new PageRankMetric(this, this.getPageRankService())); } } public boolean isPreload() { return preload; } public int lcs(String concept1, String concept2, List<LCSPath> lcsPaths) { ConcRel cr1 = cg.getConceptMap().get(concept1); ConcRel cr2 = cg.getConceptMap().get(concept2); int dist = -1; if (cr1 != null && cr2 != null) { Set<ConcRel> crlcses = new HashSet<ConcRel>(); Map<ConcRel, LCSPath> crpaths = new HashMap<ConcRel, LCSPath>(); dist = ConcRel.getLeastCommonConcept(cr1, cr2, crlcses, crpaths); lcsPaths.addAll(crpaths.values()); } return dist; } // /* // * (non-Javadoc) // * // * @see // org.apache.ctakes.ytex.kernel.ConceptSimilarity#lch(java.lang.String, // * java.lang.String) // */ // public double lch(String concept1, String concept2) { // double dm = 2 * cg.getDepthMax() + 1.0; // ConcRel cr1 = cg.getConceptMap().get(concept1); // ConcRel cr2 = cg.getConceptMap().get(concept2); // if (cr1 != null && cr2 != null) { // Set<String> lcses = new HashSet<String>(); // int lcsDist = getLCSFromCache(cr1, cr2, lcses); // // leacock is defined as -log([path length]/(2*[depth]) // double lch = -Math.log(((double) lcsDist + 1.0) / dm); // // scale to depth // return lch / Math.log(dm); // } else { // if (log.isDebugEnabled()) { // if (cr1 == null) // log.debug("could not find concept:" + concept1); // if (cr2 == null) // log.debug("could not find concept:" + concept2); // } // return 0; // } // } /** * For the given label and cutoff, get the corresponding concepts whose * propagated ig meets the threshold. Used by lin kernel to find concepts * that actually have a non-trivial similarity * * @param label * label * @param rankCutoff * cutoff * @param conceptFilter * set to fill with concepts * @return double minimum evaluation */ @Override public double loadConceptFilter(String label, int rankCutoff, Map<String, Double> conceptFilter) { List<FeatureRank> imputedConcepts = this.classifierEvaluationDao.getImputedFeaturesByPropagatedCutoff( corpusName, conceptSetName, label, lcsImputedType + ImputedFeatureEvaluator.SUFFIX_IMPUTED, conceptGraphName, lcsImputedType + ImputedFeatureEvaluator.SUFFIX_PROP, rankCutoff); double minEval = 1d; for (FeatureRank r : imputedConcepts) { conceptFilter.put(r.getFeatureName(), r.getEvaluation()); if (minEval >= r.getEvaluation()) minEval = r.getEvaluation(); } return minEval; } // public double lin(String concept1, String concept2) { // return filteredLin(concept1, concept2, null); // } public void setCacheManager(CacheManager cacheManager) { this.cacheManager = cacheManager; } public void setClassifierEvaluationDao(ClassifierEvaluationDao classifierEvaluationDao) { this.classifierEvaluationDao = classifierEvaluationDao; } public void setConceptDao(ConceptDao conceptDao) { this.conceptDao = conceptDao; } public void setConceptGraphName(String conceptGraphName) { this.conceptGraphName = conceptGraphName; } public void setConceptSetName(String conceptSetName) { this.conceptSetName = conceptSetName; } public void setCorpusName(String corpusName) { this.corpusName = corpusName; } public void setLcsCache(Cache lcsCache) { this.lcsCache = lcsCache; } public void setLcsImputedType(String lcsImputedType) { this.lcsImputedType = lcsImputedType; } // double minEval = 1d; // List<FeatureRank> listPropagatedConcepts = classifierEvaluationDao // .getTopFeatures(corpusName, conceptSetName, label, // ImputedFeatureEvaluator.MeasureType.INFOGAIN.toString() // + ImputedFeatureEvaluator.SUFFIX_PROP, 0, 0, // conceptGraphName, rankCutoff); // for (FeatureRank r : listPropagatedConcepts) { // ConcRel cr = cg.getConceptMap().get(r.getFeatureName()); // if (cr != null) { // addSubtree(conceptFilterSet, cr); // } // if (r.getEvaluation() < minEval) // minEval = r.getEvaluation(); // } // return minEval; // } // // /** // * add all children of parent to conceptSet. Limit only to children that // * actually appear in the corpus // * // * @param conceptSet // * set of concepts to add ids to // * @param parent // * parent which will be added to the conceptSet // * @param corpusICSet // * set of concepts and hypernyms contained in corpus // */ // private void addSubtree(Map<String, Double> conceptSet, ConcRel parent) { // if (!conceptSet.containsKey(parent.getConceptID()) // && conceptFreq.containsKey(parent.getConceptID())) { // conceptSet.put(parent.getConceptID(), 0d); // for (ConcRel child : parent.getChildren()) { // addSubtree(conceptSet, child); // } // } // } public void setPageRankService(PageRankService pageRankService) { this.pageRankService = pageRankService; } public void setPreload(boolean preload) { this.preload = preload; } public void setSimilarityMetricMap(Map<SimilarityMetricEnum, SimilarityMetric> similarityMetricMap) { this.similarityMetricMap = similarityMetricMap; } public void setTransactionManager(PlatformTransactionManager transactionManager) { this.transactionManager = transactionManager; } @Override public List<ConceptPairSimilarity> similarity(List<ConceptPair> conceptPairs, List<SimilarityMetricEnum> metrics, Map<String, Double> conceptFilter, boolean lcs) { List<ConceptPairSimilarity> conceptSimMap = new ArrayList<ConceptPairSimilarity>(conceptPairs.size()); for (ConceptPair conceptPair : conceptPairs) { conceptSimMap.add( similarity(metrics, conceptPair.getConcept1(), conceptPair.getConcept2(), conceptFilter, lcs)); } return conceptSimMap; } /** * */ @Override public ConceptPairSimilarity similarity(List<SimilarityMetricEnum> metrics, String concept1, String concept2, Map<String, Double> conceptFilter, boolean lcs) { // allocate simInfo if this isn't provided SimilarityInfo simInfo = new SimilarityInfo(); if (lcs) simInfo.setLcsPaths(new ArrayList<LCSPath>(1)); // allocate result map List<Double> similarities = new ArrayList<Double>(metrics.size()); if (cg != null) { // iterate over metrics, compute, stuff in map for (SimilarityMetricEnum metric : metrics) { double sim = this.similarityMetricMap.get(metric).similarity(concept1, concept2, conceptFilter, simInfo); similarities.add(sim); } } ConceptPairSimilarity csim = new ConceptPairSimilarity(); csim.setConceptPair(new ConceptPair(concept1, concept2)); csim.setSimilarities(similarities); csim.setSimilarityInfo(simInfo); return csim; } /** * convert the list of tuis into a bitset * * @param tuis * @param mapTuiIndex * @return */ private BitSet tuiListToBitset(Set<String> tuis, SortedMap<String, Integer> mapTuiIndex) { BitSet bs = new BitSet(mapTuiIndex.size()); for (String tui : tuis) { bs.set(mapTuiIndex.get(tui)); } return bs; } }