// Java tutorial
/******************************************************************************* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ /** * */ package de.tudarmstadt.ukp.dkpro.wsd.si.uby; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.collections15.Transformer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import de.tudarmstadt.ukp.dkpro.wsd.UnorderedPair; import de.tudarmstadt.ukp.dkpro.wsd.si.POS; import de.tudarmstadt.ukp.dkpro.wsd.si.SenseAlignment; import de.tudarmstadt.ukp.dkpro.wsd.si.SenseDictionary; import de.tudarmstadt.ukp.dkpro.wsd.si.SenseInventoryBase; import de.tudarmstadt.ukp.dkpro.wsd.si.SenseInventoryException; import de.tudarmstadt.ukp.dkpro.wsd.si.SenseTaxonomy; import de.tudarmstadt.ukp.lmf.api.Uby; import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry; import de.tudarmstadt.ukp.lmf.model.core.Lexicon; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.model.core.TextRepresentation; import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech; import de.tudarmstadt.ukp.lmf.model.enums.ESenseAxisType; import de.tudarmstadt.ukp.lmf.model.meta.Frequency; import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis; 
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef; import de.tudarmstadt.ukp.lmf.model.semantics.SenseExample; import de.tudarmstadt.ukp.lmf.model.semantics.SenseRelation; import de.tudarmstadt.ukp.lmf.model.semantics.Synset; import de.tudarmstadt.ukp.lmf.model.semantics.SynsetRelation; import de.tudarmstadt.ukp.lmf.transform.DBConfig; import edu.uci.ics.jung.graph.UndirectedGraph; /** * A sense inevntory for UBY * * @author Tristan Miller <miller@ukp.informatik.tu-darmstadt.de> * */ public class UbySenseInventory extends SenseInventoryBase implements SenseTaxonomy, SenseDictionary, SenseAlignment { protected Uby uby; protected Lexicon lexicon; protected boolean allowMultiLingualAlignments = false; private final static SiPosToUbyPos siPosToUbyPos = new SiPosToUbyPos(); private final static UbyPosToSiPos ubyPosToSiPos = new UbyPosToSiPos(); private final Log logger = LogFactory.getLog(getClass()); // Variables and cache for sense descriptions private String senseDescriptionFormat = "%w; %d"; private final Map<String, CachedSense> senses = new HashMap<String, CachedSense>(); /** * Returns the underlying Uby object. * * @return */ public Uby getUnderlyingResource() { return uby; } public UbySenseInventory(DBConfig dbConfig) throws SenseInventoryException { try { uby = new Uby(dbConfig); } catch (IllegalArgumentException e) { throw new SenseInventoryException(e); } } /** * Sets the format of the string to be returned by the {@link * getSenseDescription()} method. The following printf-style format * specifiers are recognized: * * <dl> * <dt>%d</dt> * <dd>the sense's definition</dd> * <dt>%w</dt> * <dd>the sense's lemmas</dd> * <dt>%e</dt> * <dd>the sense's example sentences</dd> * </dl> * * A null format string is equivalent to "%d". * * @param format * A format string as described in the format string syntax. 
*/ public void setSenseDescriptionFormat(String format) { if (format == null) { senseDescriptionFormat = "%d"; } else { senseDescriptionFormat = format; } } /** * Filter all queries by the given lexicon * * @param lexiconName * The name of the lexicon to filter on, or null if no filter * should be applied. * @throws SenseInventoryException */ public void setLexicon(String lexiconName) throws SenseInventoryException { if (lexiconName == null) { this.lexicon = null; return; } Lexicon lexicon; try { lexicon = uby.getLexiconByName(lexiconName); } catch (IllegalArgumentException e) { throw new SenseInventoryException(e); } if (this.lexicon != null) { // Flush the sense cache senses.clear(); } this.lexicon = lexicon; } /** * Determines whether {@link getSenseAlignments} should also return * alignments to senses in other languages. * * @param allow */ public void setAllowMultilingualAlignments(boolean allow) { this.allowMultiLingualAlignments = allow; flushSenseAlignmentCache(); } private void flushSenseAlignmentCache() { if (senses == null) { return; } for (CachedSense s : senses.values()) { s.alignments = null; } } /** * * @param url * Host_to_the_database/database_name * @param jdbc_driver_class * The jdbc driver class using to access database * @param db_vendor * @param user * Password for accessing the database * @param password * Database name * @param showSQL * If true all SQL queries are printed on the console * @throws SenseInventoryException */ public UbySenseInventory(String url, String jdbc_driver_class, String db_vendor, String user, String password, boolean showSQL) throws SenseInventoryException { this(new DBConfig(url, jdbc_driver_class, db_vendor, user, password, showSQL)); } @SuppressWarnings("unused") private UbySenseInventory() { } @Override public Map<String, List<String>> getSenseInventory() throws SenseInventoryException { throw new UnsupportedOperationException(); } @Override public List<String> getSenses(String sod) throws SenseInventoryException 
{ return getSenses(sod, null); } /** * Get a list of Uby lexical entries for a given lemma and part of speech. * Because our POS tags are more coarse-grained than the ones used by Uby, * we need to call Uby's getLexicalEntries() multiple times and merge the * results. * * @param lemma * @param pos * @return */ protected List<LexicalEntry> getLexicalEntriesByPOS(String lemma, POS pos) { if (pos == null) { return uby.getLexicalEntries(lemma, null, lexicon); } List<LexicalEntry> entries = new ArrayList<LexicalEntry>(); for (EPartOfSpeech ubyPOS : siPosToUbyPos.transform(pos)) { entries.addAll(uby.getLexicalEntries(lemma, ubyPOS, lexicon)); } return entries; } @Override public List<String> getSenses(String sod, POS pos) throws SenseInventoryException, UnsupportedOperationException { List<LexicalEntry> entries = getLexicalEntriesByPOS(sod.replace('_', ' '), pos); List<String> senses = new ArrayList<String>(); for (LexicalEntry lexicalEntry : entries) { for (Sense sense : lexicalEntry.getSenses()) { senses.add(sense.getId()); } } return senses; } @Override public String getMostFrequentSense(String sod) throws SenseInventoryException, UnsupportedOperationException { return getMostFrequentSense(sod, null); } @Override public String getMostFrequentSense(String sod, POS pos) throws SenseInventoryException, UnsupportedOperationException { // TODO: Implement a cache as this operation is expensive List<LexicalEntry> entries = getLexicalEntriesByPOS(sod.replace('_', ' '), pos); Sense mostFrequentSense = null; int maxFrequency = Integer.MIN_VALUE; for (LexicalEntry lexicalEntry : entries) { for (Sense sense : lexicalEntry.getSenses()) { int senseFrequency = 0; boolean foundFrequencies = false; // Sum frequencies over all corpora and generators for (Frequency frequency : sense.getFrequencies()) { senseFrequency += frequency.getFrequency(); foundFrequencies = true; } if (foundFrequencies && senseFrequency > maxFrequency) { maxFrequency = senseFrequency; mostFrequentSense = sense; } 
} } if (mostFrequentSense != null) { return mostFrequentSense.getId(); } else { return null; } } @Override public String getSenseDescription(String senseId) throws SenseInventoryException { CachedSense sense = getSense(senseId); String description = senseDescriptionFormat.replace("%d", sense.getDefinition()); description = description.replace("%e", sense.getExamples().toString()); description = description.replace("%w", sense.getSynonyms().toString()); return description; } @Override public POS getPos(String senseId) throws SenseInventoryException { CachedSense sense = getSense(senseId); return sense.getPos(); } @Override public int getUseCount(String senseId) throws SenseInventoryException { throw new UnsupportedOperationException(); } @Override public String getSenseInventoryName() { if (lexicon == null) { return "Uby"; } else { return "Uby_" + lexicon.getName(); } } @Override public UndirectedGraph<String, UnorderedPair<String>> getUndirectedGraph() throws SenseInventoryException, UnsupportedOperationException { throw new UnsupportedOperationException(); } @Override public void setUndirectedGraph(UndirectedGraph<String, UnorderedPair<String>> graph) throws SenseInventoryException, UnsupportedOperationException { throw new UnsupportedOperationException(); } @Override public Set<String> getSenseNeighbours(String senseId) throws SenseInventoryException, UnsupportedOperationException { return getSense(senseId).getNeighbours(); } /** * Transforms a POS enum to a Uby POS * * @author Tristan Miller <miller@ukp.informatik.tu-darmstadt.de> * */ public static class SiPosToUbyPos implements Transformer<POS, EPartOfSpeech[]> { protected final EPartOfSpeech UbyNounPOS[] = { EPartOfSpeech.noun, EPartOfSpeech.nounCommon, EPartOfSpeech.nounProper, EPartOfSpeech.nounProperFirstName, EPartOfSpeech.nounProperLastName }; protected final EPartOfSpeech UbyVerbPOS[] = { EPartOfSpeech.verb, EPartOfSpeech.verbAuxiliary, EPartOfSpeech.verbMain, EPartOfSpeech.verbModal }; protected final 
EPartOfSpeech UbyAdjectivePOS[] = { EPartOfSpeech.adjective }; protected final EPartOfSpeech UbyAdverbPOS[] = { EPartOfSpeech.adverb }; @Override public EPartOfSpeech[] transform(POS pos) { if (pos == null) { return null; } switch (pos) { case NOUN: return UbyNounPOS; case VERB: return UbyVerbPOS; case ADJ: return UbyAdjectivePOS; case ADV: return UbyAdverbPOS; } return null; } } /** * Transforms a Uby POS to a POS enum * * @author Tristan Miller <miller@ukp.informatik.tu-darmstadt.de> * */ public static class UbyPosToSiPos implements Transformer<EPartOfSpeech, POS> { @Override public POS transform(EPartOfSpeech pos) { if (pos == null) { return null; } switch (pos) { case noun: case nounCommon: case nounProper: case nounProperFirstName: case nounProperLastName: return POS.NOUN; case verb: case verbAuxiliary: case verbMain: case verbModal: return POS.VERB; case adjective: return POS.ADJ; case adverb: return POS.ADV; default: return null; } } } @Override public Set<String> getSenseExamples(String senseId) throws SenseInventoryException { return getSense(senseId).getExamples(); } @Override public Set<String> getSenseWords(String senseId) throws SenseInventoryException { return getSense(senseId).getSynonyms(); } @Override public String getSenseDefinition(String senseId) throws SenseInventoryException { return getSense(senseId).getDefinition(); } /** * Given a Uby sense ID, return the sense ID used by the underlying lexicon * * @param senseId * @return * @throws SenseInventoryException */ public String getLexiconSenseId(String senseId) throws SenseInventoryException { return getSense(senseId).getLexiconSenseId(); } /** * Given a Uby sense ID, return the synset ID used by the underlying lexicon * * @param senseId * @return * @throws SenseInventoryException */ public String getLexiconSynsetId(String senseId) throws SenseInventoryException { return getSense(senseId).getLexiconSynsetId(); } /** * Returns a set of alignments for the given sense * * @param senseId * The ID of 
the sense whose alignments should be found * @return A (possibly empty) set of sense IDs for aligned senses * * @throws SenseInventoryException */ @Override public Set<String> getSenseAlignments(String senseId) throws SenseInventoryException { return getSense(senseId).getAlignments(); } private CachedSense getSense(String senseId) throws SenseInventoryException { CachedSense s = senses.get(senseId); if (s == null) { s = new CachedSense(senseId); senses.put(senseId, s); } return s; } private class CachedSense implements CachedDictionarySense, CachedTaxonomySense, CachedAlignedSense { private final String id; private final Sense sense; private final String definition; private final Synset synset; private final POS pos; private final int useCount; private Set<String> examples; private Set<String> words; private Set<String> neighbours; private Set<String> alignments; private String lexiconSenseId = null; private String lexiconSynsetId = null; private boolean foundLexiconSenseId = false; private boolean foundLexiconSynsetId = false; @SuppressWarnings("unused") private CachedSense() { id = null; sense = null; definition = null; synset = null; pos = null; useCount = 0; } public int getUseCount() { return useCount; } public POS getPos() { return pos; } public String getLexiconSenseId() { if (foundLexiconSenseId) { return lexiconSenseId; } foundLexiconSenseId = true; List<MonolingualExternalRef> externalReferences = sense.getMonolingualExternalRefs(); if (externalReferences == null || externalReferences.isEmpty()) { return null; } if (externalReferences.size() > 1) { logger.warn("Sense " + id + " has more than one external reference"); } MonolingualExternalRef externalReference = externalReferences.get(0); if (externalReference == null) { return null; } lexiconSenseId = externalReference.getExternalReference(); return lexiconSenseId; } public String getLexiconSynsetId() { if (foundLexiconSynsetId) { return lexiconSynsetId; } foundLexiconSynsetId = true; if (synset == null) 
{ return null; } List<MonolingualExternalRef> externalReferences = synset.getMonolingualExternalRefs(); if (externalReferences == null || externalReferences.isEmpty()) { return null; } if (externalReferences.size() > 1) { logger.warn("Synset for sense " + id + " has more than one external reference"); } MonolingualExternalRef externalReference = externalReferences.get(0); if (externalReference == null) { return null; } lexiconSynsetId = externalReference.getExternalReference(); return lexiconSynsetId; } public CachedSense(String senseId) throws SenseInventoryException { id = senseId; try { sense = uby.getSenseById(senseId); synset = sense.getSynset(); pos = ubyPosToSiPos.transform(sense.getLexicalEntry().getPartOfSpeech()); // Sum frequencies over all corpora and generators int senseFrequency = 0; for (Frequency frequency : sense.getFrequencies()) { senseFrequency += frequency.getFrequency(); } useCount = senseFrequency; } catch (IllegalArgumentException e) { throw new SenseInventoryException(e); } definition = constructDefinition(); } private String constructDefinition() { String definition = sense.getDefinitionText(); if (definition != null && definition.length() > 0) { return definition; } if (synset == null) { return ""; } definition = synset.getDefinitionText(); if (definition != null && definition.length() > 0) { return definition; } return ""; } /** * Returns a set of alignments for the given sense * * @throws SenseInventoryException */ @Override public Set<String> getAlignments() throws SenseInventoryException { if (alignments != null) { return alignments; } List<SenseAxis> alignedSenses = uby.getSenseAxesBySense(sense); alignments = new HashSet<String>(alignedSenses.size()); for (SenseAxis axis : alignedSenses) { if (allowMultiLingualAlignments == false && axis.getSenseAxisType() != ESenseAxisType.monolingualSenseAlignment) { continue; } Sense alignedSense = sense.equals(axis.getSenseOne()) ? 
axis.getSenseTwo() : axis.getSenseOne(); alignments.add(alignedSense.getId()); } return alignments; } @Override public Set<String> getNeighbours() throws SenseInventoryException { if (neighbours != null) { return neighbours; } neighbours = new HashSet<String>(); List<SenseRelation> senseRelations = sense.getSenseRelations(); if (senseRelations != null && !senseRelations.isEmpty()) { for (SenseRelation senseRelation : senseRelations) { Sense target = senseRelation.getTarget(); if (target != null) { neighbours.add(target.getId()); } } } if (synset != null) { List<SynsetRelation> synsetRelations = synset.getSynsetRelations(); if (synsetRelations != null && !synsetRelations.isEmpty()) { for (SynsetRelation synsetRelation : synsetRelations) { Synset target = synsetRelation.getTarget(); if (target != null) { for (Sense neighbour : target.getSenses()) { neighbours.add(neighbour.getId()); } } } } } return neighbours; } @Override public Set<String> getSynonyms() throws SenseInventoryException { if (words != null) { return words; } words = new HashSet<String>(); // Add this sense's lemma to the set words.add(sense.getLexicalEntry().getLemmaForm()); // First thing to check: Are we part of a synset? If so, we can // find the synonyms through the synset. 
if (synset != null) { for (Sense s : synset.getSenses()) { words.add(s.getLexicalEntry().getLemmaForm()); } } // Second thing to check: See if we have any sense relations of type // "SYNONYM": for (SenseRelation senseRelation : sense.getSenseRelations()) { if (senseRelation.getRelName().equals("SYNONYM") && senseRelation.getFormRepresentation() != null) { words.add(senseRelation.getFormRepresentation().getWrittenForm()); } } return words; } @Override public Set<String> getExamples() throws SenseInventoryException { if (examples != null) { return examples; } examples = new HashSet<String>(); List<SenseExample> exampleList = sense.getSenseExamples(); for (SenseExample example : exampleList) { List<TextRepresentation> textRepresentations = example.getTextRepresentations(); for (TextRepresentation textRepresentation : textRepresentations) { examples.add(textRepresentation.getWrittenText()); } } return examples; } @Override public String getDefinition() throws SenseInventoryException { return definition; } } }