Java tutorial
/*
 * Copyright 2011 DBpedia Spotlight Development Team
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
 */

package org.dbpedia.spotlight.spot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dbpedia.spotlight.exceptions.InitializationException;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.spot.cooccurrence.ClassifierFactory;
import org.dbpedia.spotlight.spot.cooccurrence.classification.SpotClass;
import org.dbpedia.spotlight.spot.cooccurrence.classification.SpotClassification;
import org.dbpedia.spotlight.spot.cooccurrence.classification.SpotClassifier;
import org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProviderSQL;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPOS;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPattern;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterTermsize;
import org.dbpedia.spotlight.tagging.TaggedToken;
import org.dbpedia.spotlight.tagging.TaggedTokenProvider;

import java.util.LinkedList;
import java.util.List;

/**
 * Spot selector based on co-occurrence data using two classifiers for unigram
 * and ngram candidates.
 *
 * @author Joachim Daiber
 */
public class CoOccurrenceBasedSelector implements TaggedSpotSelector {

    private final Log LOG = LogFactory.getLog(this.getClass());

    /** Sentence used to smoke-test the classifiers at construction time. */
    private static final String TEST_SENTENCE = "Bill Gates is a software developer from Berlin.";

    /** Multi-token surface form contained in {@link #TEST_SENTENCE}. */
    private static final String TEST_NGRAM = "Bill Gates";

    /** Single-token surface form contained in {@link #TEST_SENTENCE}. */
    private static final String TEST_UNIGRAM = "Berlin";

    /**
     * Creates a spot selector based on n-gram co-occurrence. A SpotterConfiguration object must be
     * passed as a parameter since the selector must use and initialize an occurrence
     * data provider and a factory for classifiers.
     *
     * @see org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProvider
     * @see ClassifierFactory
     *
     * @param spotterConfiguration SpotterConfiguration object with classifier paths and JDBC
     *                             description of occurrence data provider.
     * @throws InitializationException Either the OccurrenceDataProvider or the ClassifierFactory
     *                                 could not be initialized.
     */
    public CoOccurrenceBasedSelector(SpotterConfiguration spotterConfiguration) throws InitializationException {

        LOG.info("Initializing spot occurrence data provider.");
        OccurrenceDataProviderSQL.initialize(spotterConfiguration);
        LOG.info("Done.");

        LOG.info("Initializing spot candidate classifiers.");
        // The instance is intentionally discarded: the classifiers are later fetched through the
        // static ClassifierFactory.getClassifierInstance*() accessors, so the constructor is
        // invoked here only for its initialization effect.
        new ClassifierFactory(
                spotterConfiguration.getCoOcSelectorClassifierUnigram(),
                spotterConfiguration.getCoOcSelectorClassifierNGram(),
                spotterConfiguration.getCoOcSelectorDatasource(),
                OccurrenceDataProviderSQL.getInstance());
        LOG.info("Done.");
    }

    /**
     * Creates a spot selector based on n-gram co-occurrence and immediately runs a test
     * classification with both classifiers, so that an unusable model is reported at
     * construction time.
     *
     * @see org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProvider
     * @see ClassifierFactory
     *
     * @param spotterConfiguration SpotterConfiguration object with classifier paths and JDBC
     *                             description of occurrence data provider.
     * @param taggedTokenProvider  TaggedTokenProvider used to create a tagged text to test the
     *                             classifiers.
     * @throws InitializationException Either the OccurrenceDataProvider or the ClassifierFactory
     *                                 could not be initialized, or the test classification failed.
     */
    public CoOccurrenceBasedSelector(SpotterConfiguration spotterConfiguration,
                                     TaggedTokenProvider taggedTokenProvider) throws InitializationException {

        this(spotterConfiguration);

        //TODO Instead of doing a test classification here, we should properly check if the
        //     serialized model suits the WEKA instances that are produced from SurfaceFormOccurrences.
        LOG.info("Testing classifiers for co-occurrence based spot selector.");

        SpotClassifier unigramClassifier = ClassifierFactory.getClassifierInstanceUnigram();
        SpotClassifier ngramClassifier = ClassifierFactory.getClassifierInstanceNGram();

        Text taggedText = new TaggedText(TEST_SENTENCE, taggedTokenProvider);

        // Offsets are derived from the sentence itself. The previously hard-coded offset for
        // "Berlin" was 41, one character past its real start (40).
        SurfaceFormOccurrence ngramOccurrence = new SurfaceFormOccurrence(
                new SurfaceForm(TEST_NGRAM), taggedText,
                TEST_SENTENCE.indexOf(TEST_NGRAM), Provenance.Undefined(), -1);
        SurfaceFormOccurrence unigramOccurrence = new SurfaceFormOccurrence(
                new SurfaceForm(TEST_UNIGRAM), taggedText,
                TEST_SENTENCE.indexOf(TEST_UNIGRAM), Provenance.Undefined(), -1);

        try {
            unigramClassifier.classify(unigramOccurrence);
            ngramClassifier.classify(ngramOccurrence);
        } catch (Exception e) {
            throw new InitializationException(
                    "An error occurred while classifying a test spot using the co-occurrence " +
                    "based spot selector. This is most probably caused by an outdated spot selector model. Please " +
                    "check the spot selector models defined 'org.dbpedia.spotlight.spot.cooccurrence.classifier.*'.", e);
        }

        LOG.info("Done.");
    }

    /**
     * Filter the list of surface form occurrences, removing all occurrences that are considered
     * common.
     *
     * <p>Occurrences with a blank surface form are skipped. Occurrences whose context is not a
     * {@link TaggedText} are kept unfiltered. Occurrences for which classification throws are
     * logged and dropped.
     *
     * @param surfaceFormOccurrences spotted surface form occurrences
     * @return List of non-common surface form occurrences
     */
    public List<SurfaceFormOccurrence> select(List<SurfaceFormOccurrence> surfaceFormOccurrences) {

        List<SurfaceFormOccurrence> selectedOccurrences = new LinkedList<SurfaceFormOccurrence>();

        FilterPOS filterPOS = new FilterPOS();
        FilterTermsize unigramFilter = new FilterTermsize(FilterTermsize.Termsize.unigram);
        FilterPattern filterPattern = new FilterPattern();

        SpotClassifier unigramClassifier = ClassifierFactory.getClassifierInstanceUnigram();
        SpotClassifier ngramClassifier = ClassifierFactory.getClassifierInstanceNGram();

        assert unigramClassifier != null;
        assert ngramClassifier != null;

        // Drop decisions are only ever emitted at debug level, so the (string-building) bookkeeping
        // is skipped entirely when debug logging is off.
        final boolean recordDecisions = LOG.isDebugEnabled();
        List<String> decisions = new LinkedList<String>();

        for (SurfaceFormOccurrence surfaceFormOccurrence : surfaceFormOccurrences) {

            if (surfaceFormOccurrence.surfaceForm().name().trim().length() == 0) {
                LOG.warn("I have an occurrence with empty surface form. :-O Ignoring.");
                LOG.error(surfaceFormOccurrence);
                continue;
            }

            if (!(surfaceFormOccurrence.context() instanceof TaggedText)) {
                //FIXME added this to avoid breaking, but code below will never run if we don't pass the taggedtext
                LOG.error(String.format("SurfaceFormOccurrence did not contain TaggedText. Cannot apply %s", this.getClass()));
                selectedOccurrences.add(surfaceFormOccurrence);
                continue;
            }

            if (unigramFilter.applies(surfaceFormOccurrence)) {

                // Unigram (n = 1)

                if (!filterPOS.applies(surfaceFormOccurrence)) {

                    // The surface form is on the POS blacklist, i.e. a single adjective, verb, etc.
                    // Uppercase adjectives (e.g. "Canadian tv star") are kept anyway.
                    if (isUppercaseAdjective(surfaceFormOccurrence)) {
                        selectedOccurrences.add(surfaceFormOccurrence);
                    } else if (recordDecisions) {
                        decisions.add("Dropped by POS filter: " + surfaceFormOccurrence);
                    }

                } else if (!filterPattern.applies(surfaceFormOccurrence)) {

                    if (recordDecisions) {
                        decisions.add("Dropped by Pattern filter: " + surfaceFormOccurrence);
                    }

                } else {

                    try {
                        SpotClassification spotClassification = unigramClassifier.classify(surfaceFormOccurrence);

                        if (spotClassification.getCandidateClass() == SpotClass.valid) {
                            selectedOccurrences.add(surfaceFormOccurrence);
                        } else if (recordDecisions) {
                            decisions.add("Dropped by UnigramClassifier (Confidence: "
                                    + spotClassification.getConfidence() + "): " + surfaceFormOccurrence);
                        }
                    } catch (Exception e) {
                        // Pass the throwable so the stack trace is not lost.
                        LOG.error("Exception when classifying unigram candidate: " + e, e);
                    }
                }

            } else {

                // n > 1

                SpotClassification spotClassification;
                try {
                    spotClassification = ngramClassifier.classify(surfaceFormOccurrence);
                } catch (Exception e) {
                    // Pass the throwable so the stack trace is not lost.
                    LOG.error("Exception when classifying ngram candidate: " + e, e);
                    continue;
                }

                if (spotClassification.getCandidateClass() == SpotClass.valid) {
                    selectedOccurrences.add(surfaceFormOccurrence);
                } else if (recordDecisions) {
                    decisions.add("Dropped by NGramClassifier: " + surfaceFormOccurrence);
                }
            }
        }

        if (recordDecisions) {
            for (String decision : decisions) {
                LOG.debug(decision);
            }
        }

        return selectedOccurrences;
    }

    /**
     * Checks whether the occurrence starts with an uppercase character and the POS tag of its
     * first tagged token starts with "j".
     *
     * <p>The caller must have verified that the occurrence's context is a {@link TaggedText} and
     * that its surface form name is not empty.
     *
     * @param surfaceFormOccurrence unigram occurrence with a TaggedText context
     * @return true if the occurrence is uppercase and its first token's tag starts with "j";
     *         false otherwise, including when no tagged token is available for the occurrence
     */
    private boolean isUppercaseAdjective(SurfaceFormOccurrence surfaceFormOccurrence) {
        if (!Character.isUpperCase(surfaceFormOccurrence.surfaceForm().name().charAt(0))) {
            return false;
        }

        List<TaggedToken> taggedTokens = ((TaggedText) surfaceFormOccurrence.context())
                .taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence);

        // Without this guard, an occurrence without tagged tokens would abort the whole selection.
        if (taggedTokens == null || taggedTokens.isEmpty()) {
            return false;
        }

        String posTag = taggedTokens.get(0).getPOSTag();
        return posTag != null && posTag.startsWith("j");
    }
}