Java tutorial
/* * Copyright (C) 2015 Artificial Intelligence * Laboratory @ University of Udine. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package it.uniud.ailab.dcore; import it.uniud.ailab.dcore.DistilledOutput.DetectedGram; import it.uniud.ailab.dcore.DistilledOutput.InferredConcept; import it.uniud.ailab.dcore.annotation.Annotator; import it.uniud.ailab.dcore.annotation.annotations.InferenceAnnotation; import it.uniud.ailab.dcore.annotation.annotations.UriAnnotation; import it.uniud.ailab.dcore.annotation.annotators.GenericEvaluatorAnnotator; import static it.uniud.ailab.dcore.annotation.annotators.GenericWikipediaAnnotator.WIKIURI; import it.uniud.ailab.dcore.annotation.annotators.WikipediaInferenceAnnotator; import it.uniud.ailab.dcore.persistence.Gram; import it.uniud.ailab.dcore.persistence.Keyphrase; import static it.uniud.ailab.dcore.utils.StageUtils.getStageName; import java.util.Collection; import java.util.Collections; import java.util.Locale; import org.springframework.beans.factory.annotation.Required; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; /** * The information extractor object. This is the class that runs the different * annotation pipelines. * * @author Marco Basaldella * @author Dario De Nart */ public class Distiller { // all these fields will be injected via setter method /** * The first step of the actual pipeline: this annotator should decide in * which language the document is written. */ private Annotator languageDetector; /** * The annotation pipelines. There should be one for language. */ private Map<Locale, Pipeline> pipelines = new HashMap<>(); /** * The language of the document. */ private Locale documentLocale = null; /** * The Object that will contain the text all its annotations. */ private Blackboard blackboard; /** * The verbose mode flag. If the flag is set to true, Distiller will * print information on the work he's doing on stdout. Default is false. */ private boolean verbose = false; /** * Sets the language detector. * * @param languageDetector the language detector. */ @Required public void setLanguageDetector(Annotator languageDetector) { this.languageDetector = languageDetector; } /** * Set the annotation pipelines. * * @param pipelines the annotation pipelines. */ @Required public void setPipelines(HashMap<Locale, Pipeline> pipelines) { this.pipelines = pipelines; } /** * Adds a pipeline to the pipelines map. * * @param locale the language that the pipeline will process * @param pipeline the pipeline to add */ public void addPipeline(Locale locale, Pipeline pipeline) { pipelines.put(locale, pipeline); } /** * Sets the locale in which the text extraction will be performed. The value * should be null for auto-detection of the locale of the IETF formatted * language tag if manual locale setting is desired. For example, passing * "en-US" will set the locale to English. An empty locale equals to the * "auto" parameter. * * @param locale the locale to use while processing the text. */ public void setLocale(Locale locale) throws IllegalArgumentException { this.documentLocale = locale; } /** * Gets the blackboard. * * @return the blackboard. */ public Blackboard getBlackboard() { return blackboard; } /** * Sets the verbose mode of the Distiller. * * @param verbose true to display information of the distillation process; * false for silent distillation. */ public void setVerbose(boolean verbose) { this.verbose = verbose; } /** * Gets the verbose mode of the Distiller. * @return TRUE if the Distillation is set to VERBOSE mode. */ public boolean getVerbose() { return verbose; } /** * Perform the extraction of keyphrases of a specified string, and returns * the blackboard filled with document and annotations. * * @param text the text to distill. * @return the blackboard filled with the processed text */ public Blackboard distillToBlackboard(String text) { blackboard = new Blackboard(); blackboard.createDocument(text); if (documentLocale == null) // if no language has been set, automatically detect it. { if (languageDetector != null) { languageDetector.annotate(blackboard, blackboard.getStructure()); } else // but if there's no language and no language detector, // throw an exception. { throw new DistillerException( "I can't decide the language of the document: no language is specified and no language detector is set."); } } else // set the pre-determined language { blackboard.getStructure().setLanguage(documentLocale); } Pipeline pipeline = pipelines.get(blackboard.getStructure().getLanguage()); if (pipeline == null) { throw new DistillerException( "No pipeline for the language " + blackboard.getStructure().getLanguage().getLanguage()); } for (Stage stage : pipeline.getStages()) { if (verbose) { System.out.println(String.format("Running %s...", getStageName(stage))); } stage.run(blackboard); } if (verbose) { System.out.println("Extraction complete!"); System.out.println(); } return blackboard; } /** * Perform the extraction of keyphrases of a specified string, and returns a * developer-friendly object that allows quick access to the extracted * information. * * @param text the text to extract * @return the distilled output */ public DistilledOutput distill(String text) { DistilledOutput output = new DistilledOutput(); output.setOriginalText(text); distillToBlackboard(text); output.setDetectedLanguage(blackboard.getStructure().getLanguage().getLanguage()); // Copy the grams, sorted by descending score output.initializeGrams(blackboard.getKeyphrases().size()); Collection<Gram> grams = blackboard.getKeyphrases(); Map<Keyphrase, Double> scoredGrams = new HashMap<>(); for (Gram g : grams) { Keyphrase k = (Keyphrase) g; scoredGrams.put(k, k.getFeature(GenericEvaluatorAnnotator.SCORE)); } List<Map.Entry<Keyphrase, Double>> sortedGrams = scoredGrams.entrySet().stream() .sorted(Collections.reverseOrder(Map.Entry.comparingByValue())).collect(Collectors.toList()); for (int i = 0; i < output.getGrams().length; i++) { DetectedGram gram = output.getGrams()[i]; Keyphrase originalGram = sortedGrams.get(i).getKey(); gram.setSurface(originalGram.getSurface()); gram.setKeyphraseness(originalGram .getFeature(it.uniud.ailab.dcore.annotation.annotators.GenericEvaluatorAnnotator.SCORE)); UriAnnotation wikiAnn = (UriAnnotation) originalGram.getAnnotation(WIKIURI); if (wikiAnn != null) { gram.setConceptName(wikiAnn.getSurface()); gram.setConceptPath(wikiAnn.getUri().toASCIIString()); } } output.initializeRelatedConcepts(blackboard.getAnnotations(WikipediaInferenceAnnotator.RELATED).size()); for (int i = 0; i < output.getRelatedConcepts().length; i++) { InferredConcept related = output.getRelatedConcepts()[i]; InferenceAnnotation originalRelatedConcept = (InferenceAnnotation) blackboard .getAnnotations(WikipediaInferenceAnnotator.RELATED).get(i); related.setConcept(originalRelatedConcept.getConcept()); related.setConceptPath(originalRelatedConcept.getUri().toASCIIString()); related.setScore(originalRelatedConcept.getScore()); } output.initializeHypernyms(blackboard.getAnnotations(WikipediaInferenceAnnotator.HYPERNYMS).size()); for (int i = 0; i < output.getHypernyms().length; i++) { InferredConcept hypernym = output.getHypernyms()[i]; InferenceAnnotation originalHypernym = (InferenceAnnotation) blackboard .getAnnotations(WikipediaInferenceAnnotator.HYPERNYMS).get(i); hypernym.setConcept(originalHypernym.getConcept()); hypernym.setConceptPath(originalHypernym.getUri().toASCIIString()); hypernym.setScore(originalHypernym.getScore()); } output.setExtractionCompleted(true); return output; } }