Java tutorial
/* * Copyright Paolo Dragone 2014. * Copyright Alessandro Ronca 2014. * * This file is part of Wiktionary Ontology. * * Wiktionary Ontology is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Wiktionary Ontology is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Wiktionary Ontology. If not, see <http://www.gnu.org/licenses/>. */ package org.dragoneronca.nlp.wol.graph_building; import com.google.common.collect.Sets; import org.apache.commons.configuration.PropertiesConfiguration; import org.dragoneronca.nlp.wol.WolConfiguration; import org.dragoneronca.nlp.wol.domain.RangedSenseIteratorFactory; import org.dragoneronca.nlp.wol.domain.RangedSenseScanner; import org.dragoneronca.nlp.wol.domain.WolDomainContext; import org.dragoneronca.nlp.wol.domain.entities.SemanticEdge; import org.dragoneronca.nlp.wol.domain.entities.Sense; import org.dragoneronca.nlp.wol.domain.entities.TaggedTerm; import org.dragoneronca.nlp.wol.domain.entities.TermSemanticEdge; import org.dragoneronca.nlp.wol.tagger.WolTagger; import org.dragoneronca.util.nlp.Language; import org.dragoneronca.util.nlp.POS; import javax.persistence.EntityManager; import java.util.*; /** * This class is able to build a WolGraph with only the edges pointing to the first sense of a * term. * <p/> * The order between senses is the one defined by Wiktionary. 
*
 * @author Paolo Dragone
 * @author Alessandro Ronca
 */
public class FirstSenseGraphBuilder implements Runnable {

    private static final WolTagger TAGGER = WolTagger.getInstance();

    /** Probability assigned to every edge created by this builder. */
    private static final double MAX_PROBABILITY = 1.0;

    /** Bunch size handed to the ranged scanner and to the sense iterator. */
    private static final int SENSES_BUNCH_SIZE;

    /** Fallback used when the "environment" configuration does not define the bunch size. */
    private static final int DEFAULT_SENSES_BUNCH_SIZE = 100000;

    /** Orders senses by ascending sense number, so the minimum is the first sense. */
    private static final Comparator<Sense> BY_SENSE_NUMBER = new Comparator<Sense>() {
        @Override
        public int compare(Sense sense1, Sense sense2) {
            return Integer.compare(sense1.getNumber(), sense2.getNumber());
        }
    };

    static {
        PropertiesConfiguration properties =
                WolConfiguration.getInstance().getConfiguration("environment");
        SENSES_BUNCH_SIZE = properties.getInt(
                "first_sense_graph_builder.senses_bunch_size", DEFAULT_SENSES_BUNCH_SIZE);
    }

    private final EntityManager entityManager;

    /** Set on the first call to {@link #run()}; later calls are no-ops. */
    private boolean executed = false;

    /**
     * It constructs a <tt>FirstSenseGraphBuilder</tt> object, taking the entity manager from
     * the {@link WolDomainContext} singleton.
     */
    public FirstSenseGraphBuilder() {
        this.entityManager = WolDomainContext.getInstance().getEntityManager();
    }

    /**
     * The main function to start the graph building algorithm.
     *
     * @param args no parameters.
     */
    public static void main(String[] args) {
        WolConfiguration.getInstance();
        FirstSenseGraphBuilder graphBuilder = new FirstSenseGraphBuilder();
        graphBuilder.run();
    }

    /**
     * Builds the first-sense edges. Only the first invocation on a given instance does any
     * work; subsequent invocations return immediately.
     * <p/>
     * For every alphabetic range produced by the {@link RangedSenseScanner}, all senses
     * returned by the domain context are visited and linked to the first sense of each gloss
     * term whose lemma falls inside that range.
     */
    @Override
    public void run() {
        if (executed) {
            return;
        }
        executed = true;

        RangedSenseScanner rangedSenseScanner = new RangedSenseScanner(SENSES_BUNCH_SIZE);
        while (rangedSenseScanner.hasNext()) {
            RangedSenseIteratorFactory rangedSenseIteratorFactory = rangedSenseScanner.next();
            RangedSenseScanner.AlphabeticRange alphabeticRange =
                    rangedSenseIteratorFactory.getRange();

            // A fresh iterator over the senses is requested for every range.
            Iterator<Sense> senseIterator =
                    WolDomainContext.getInstance().senseIterator(SENSES_BUNCH_SIZE);
            while (senseIterator.hasNext()) {
                linkSenseFromTerms(senseIterator.next(), alphabeticRange,
                        rangedSenseIteratorFactory);
            }
        }
    }

    /**
     * Tags the gloss of {@code sense} and, for each tagged term whose lemma is inside
     * {@code alphabeticRange}, persists an edge from {@code sense} to the lowest-numbered
     * candidate sense of that term.
     *
     * @param sense                      the origin sense whose gloss is processed.
     * @param alphabeticRange            the range of lemmas handled in the current pass.
     * @param rangedSenseIteratorFactory the factory used to look up candidate target senses.
     */
    private void linkSenseFromTerms(Sense sense,
                                    RangedSenseScanner.AlphabeticRange alphabeticRange,
                                    RangedSenseIteratorFactory rangedSenseIteratorFactory) {
        TAGGER.tagSenses(Collections.singletonList(sense).iterator());

        for (TaggedTerm taggedTerm : sense.getTaggedGloss()) {
            String lemma = taggedTerm.getLemma();
            if (!alphabeticRange.isInRange(lemma)) {
                continue;
            }

            Set<Sense> targetSenses = getTargetSenses(sense, Language.EN, taggedTerm.getPOS(),
                    lemma, rangedSenseIteratorFactory);
            if (targetSenses.isEmpty()) {
                // A term without candidate senses must not stop the remaining gloss terms
                // from being linked, so skip just this term.
                continue;
            }

            Sense firstSense = Collections.min(targetSenses, BY_SENSE_NUMBER);

            TermSemanticEdge semanticEdge =
                    new TermSemanticEdge(firstSense, MAX_PROBABILITY, taggedTerm);
            semanticEdge.setOriginSense(sense);

            Set<SemanticEdge> outEdges = sense.getOutEdges();
            if (outEdges == null) {
                outEdges = new HashSet<>();
                sense.setOutEdges(outEdges);
            }
            outEdges.add(semanticEdge);

            // NOTE(review): no transaction handling is visible in this class; presumably it
            // is managed by the entity manager's owner - confirm.
            entityManager.persist(semanticEdge);
        }
    }

    /**
     * Collects the candidate senses of {@code targetWord} (underscores replaced by spaces) for
     * the given language and part of speech. When the origin sense has a context and at least
     * one candidate shares it, only the candidates with that context are returned; otherwise
     * all candidates are returned.
     *
     * @param originSense                the sense whose gloss contains the term.
     * @param lang                       the language of the target word.
     * @param pos                        the part of speech of the target word.
     * @param targetWord                 the lemma to look up.
     * @param rangedSenseIteratorFactory the factory providing the candidate senses.
     * @return the candidate senses, possibly empty, never {@code null}.
     */
    private Set<Sense> getTargetSenses(Sense originSense, Language lang, POS pos,
                                       String targetWord,
                                       RangedSenseIteratorFactory rangedSenseIteratorFactory) {
        Iterator<Sense> senseIterator = rangedSenseIteratorFactory.senseIterator(
                targetWord.replace('_', ' '), lang, pos);
        Set<Sense> referredSenses = Sets.newHashSet(senseIterator);

        String originContext = originSense.getContext();
        if (originContext != null) {
            Set<Sense> sameContextReferredSenses =
                    retainByContext(originContext, referredSenses);
            if (!sameContextReferredSenses.isEmpty()) {
                return sameContextReferredSenses;
            }
        }
        return referredSenses;
    }

    /**
     * Returns a new set holding the senses whose context is non-null and equal to
     * {@code context}. The input set is not modified.
     *
     * @param context the context to match.
     * @param senses  the senses to filter.
     * @return the matching senses, possibly empty.
     */
    private Set<Sense> retainByContext(String context, Set<Sense> senses) {
        Set<Sense> retainedSenses = new HashSet<>();
        for (Sense sense : senses) {
            String senseContext = sense.getContext();
            if (senseContext != null && senseContext.equals(context)) {
                retainedSenses.add(sense);
            }
        }
        return retainedSenses;
    }
}