Java tutorial
/*
 * Copyright (C) 2015 Artificial Intelligence
 * Laboratory @ University of Udine.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
package it.uniud.ailab.dcore.annotation.annotators;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import it.uniud.ailab.dcore.persistence.DocumentComponent;
import it.uniud.ailab.dcore.Blackboard;
import it.uniud.ailab.dcore.annotation.annotations.FeatureAnnotation;
import it.uniud.ailab.dcore.persistence.Keyphrase;
import it.uniud.ailab.dcore.persistence.Sentence;
import it.uniud.ailab.dcore.persistence.Token;
import it.uniud.ailab.dcore.utils.FileSystem;
import java.io.InputStreamReader;
import java.util.Map;

/**
 * The default n-gram generator algorithm. This class generates relevant n-grams
 * checking, for every group of words, if their POS pattern can be found in a
 * database of pre-made sequences. For example, "Software Engineering" should be
 * tagged with "Noun Noun", so if the POS pattern "Noun Noun" is in the database
 * the n-gram "Software Engineering" will be selected.
 *
 * The database is a simple JSON file which is embedded with the project but, if
 * the user wants, it is possible to let the algorithm use a custom database by
 * registering it with {@link #setPosDatabasePaths(java.util.Map)} or
 * {@link #addPosDatabasePaths(java.util.Locale, java.lang.String)}. This may
 * be useful to test other POS patterns or to add support for new languages.
 *
 * Contextual to the generation, the algorithm assigns the first feature to the
 * n-gram, which is the "Noun Value", which is proportional to the number of
 * nouns in the gram.
 *
 * @author Dario De Nart
 * @author Marco Basaldella
 */
public class SimpleNGramGeneratorAnnotator implements GenericNGramGeneratorAnnotator {

    // <editor-fold desc="private fields">
    /**
     * Logger shared by all the instances of the annotator.
     */
    private static final Logger LOG
            = Logger.getLogger(SimpleNGramGeneratorAnnotator.class.getName());

    /**
     * The default maximum size of n-grams.
     */
    private static final int DEFAULT_MAX_NGRAM_SIZE = 3;

    /**
     * The languages that the n-gram generator will process and their POS
     * pattern database paths.
     */
    private Map<Locale, String> posDatabasePaths;

    /**
     * The POS patterns of the most recently loaded database, mapped to the
     * noun count declared for each pattern.
     */
    private final Map<String, Integer> validPOSPatterns;

    /**
     * The maximum size of n-grams to detect.
     */
    private int maxGramSize;
    // </editor-fold>

    // <editor-fold desc="constructor">
    /**
     * Initializes the nGram generator with the default maximum n-gram size
     * and the English and Italian POS pattern databases bundled on the
     * classpath.
     */
    public SimpleNGramGeneratorAnnotator() {
        validPOSPatterns = new HashMap<>();
        posDatabasePaths = new HashMap<>();
        maxGramSize = DEFAULT_MAX_NGRAM_SIZE;

        posDatabasePaths.put(Locale.ENGLISH,
                getClass().getClassLoader()
                        .getResource("ailab/posPatterns/en-penn.json").getFile());
        posDatabasePaths.put(Locale.ITALIAN,
                getClass().getClassLoader()
                        .getResource("ailab/posPatterns/it-tanl.json").getFile());
    }
    // </editor-fold>

    /**
     * Sets the database paths of the POS patterns. The map is stored by
     * reference, so later calls to
     * {@link #addPosDatabasePaths(java.util.Locale, java.lang.String)} will
     * modify the map passed here.
     *
     * @param posDatabasePaths the database paths
     */
    public void setPosDatabasePaths(Map<Locale, String> posDatabasePaths) {
        this.posDatabasePaths = posDatabasePaths;
    }

    /**
     * Adds to the database paths of the POS patterns a file path for a
     * specified language.
     *
     * @param locale the language of the new path
     * @param path the path of the POS pattern file for the language
     */
    public void addPosDatabasePaths(Locale locale, String path) {
        posDatabasePaths.put(locale, path);
    }

    /**
     * Sets the maximum size of an n-gram. If the size is not specified, the
     * maximum n-gram size is initialized to DEFAULT_MAX_NGRAM_SIZE.
     *
     * Note that every successful database load performed by
     * {@link #annotate} overwrites this value with the {@code maxGramSize}
     * declared in the database (or with the default when the database does
     * not declare a valid one).
     *
     * @param maxNgramSize the maximum size of an n-gram
     */
    public void setMaxGramSize(int maxNgramSize) {
        this.maxGramSize = maxNgramSize;
    }

    // <editor-fold desc="worker methods">
    /**
     * Generates the n-grams of a
     * {@link it.uniud.ailab.dcore.persistence.DocumentComponent} by using the
     * POS patterns and other annotations produced by the Annotators. A NGram
     * generator should offer support for a set of languages, so the Engine can
     * decide what generator should use if there are more than one.
     *
     * If the pattern database cannot be read or parsed the failure is logged
     * and the generation still runs with whatever patterns were loaded
     * before (possibly none).
     *
     * @param blackboard the blackboard that receives the generated n-grams.
     * @param component the component to analyze.
     */
    @Override
    public void annotate(Blackboard blackboard, DocumentComponent component) {
        // TODO: handle exceptions better
        try {
            loadDatabase(component.getLanguage());
        } catch (IOException | ParseException ex) {
            LOG.log(Level.SEVERE,
                    "Unable to load the POS pattern database for language "
                    + component.getLanguage(), ex);
        }

        // do the actual nGram generation.
        spotNGrams(blackboard, component);
    }

    /**
     * Performs the actual work, by checking in a document if there are valid
     * nGram sequences that can be used as keyphrase.
     *
     * @param blackboard the blackboard that receives the generated n-grams.
     * @param component the DocumentComponent to analyze.
     */
    private void spotNGrams(Blackboard blackboard, DocumentComponent component) {

        // a component with children is not a sentence: traverse the
        // document tree recursively
        if (component.hasComponents()) {
            for (DocumentComponent child : component.getComponents()) {
                spotNGrams(blackboard, child);
            }
            return;
        }

        // a leaf is treated as a sentence: spot its nGrams
        String sentenceText = component.getText();
        Sentence sentence = (Sentence) component;
        List<Token> words = sentence.getTokens();
        int wordCount = words.size();

        // build the token start and end substring indexes in the input
        // string; -1 marks a token whose text cannot be located
        int[] startIndexes = new int[wordCount];
        int[] endIndexes = new int[wordCount];

        // track how much text we've already scanned
        int searchWordFrom = 0;
        for (int i = 0; i < wordCount; i++) {
            String tokenText = words.get(i).getText();
            int start = sentenceText.indexOf(tokenText, searchWordFrom);
            if (start < 0) {
                // the token text does not literally appear in the sentence:
                // leave the scan position untouched so that the following
                // tokens are still searched from the right place
                startIndexes[i] = -1;
                endIndexes[i] = -1;
                continue;
            }
            startIndexes[i] = start;
            endIndexes[i] = start + tokenText.length();
            searchWordFrom = endIndexes[i];
        }

        // for every word, consider the windows of 1, 2, ... maxGramSize
        // words ending on it, compare their PoS sequence with the valid
        // known PoS patterns and save the nGram if it matches.
        for (int i = 0; i < wordCount; i++) {
            for (int size = 0; size < maxGramSize && size <= i; size++) {

                int firstWord = i - size;
                // each gram owns its token list, so that no two grams
                // share (and no later iteration alters) the same list
                List<Token> gramTokens
                        = new ArrayList<>(words.subList(firstWord, i + 1));

                // if the pattern is in the database AND contains a noun,
                // then it's a nGram that could be a keyphrase.
                int nounValue = checkGramNounValue(gramTokens);
                if (nounValue <= 0) {
                    continue;
                }

                // the identifier is the stem of the words
                StringBuilder stems = new StringBuilder();
                for (int k = 0; k < gramTokens.size(); k++) {
                    if (k > 0) {
                        stems.append(' ');
                    }
                    stems.append(gramTokens.get(k).getStem());
                }
                String identifier = stems.toString().toLowerCase();

                String surface = surfaceOf(sentenceText, gramTokens,
                        startIndexes[firstWord], endIndexes[i]);

                Keyphrase g = new Keyphrase(identifier, gramTokens, surface);
                g.putFeature(new FeatureAnnotation(NOUNVALUE,
                        ((float) nounValue) / (float) g.getTokens().size()));
                blackboard.addGram(component, g);
            }
        }
    }
    // </editor-fold>

    // <editor-fold desc="support methods">
    /**
     * Extracts the surface form of a gram from the sentence text. When the
     * first or the last token of the gram could not be located in the
     * sentence, the surface falls back to the token texts joined by a space.
     *
     * @param sentenceText the text of the sentence containing the gram
     * @param gramTokens the tokens of the gram
     * @param startIndex the start offset of the first token, or -1 if unknown
     * @param endIndex the end offset of the last token, or -1 if unknown
     * @return the surface form of the gram
     */
    private String surfaceOf(String sentenceText, List<Token> gramTokens,
            int startIndex, int endIndex) {

        if (startIndex >= 0 && endIndex >= startIndex) {
            return sentenceText.substring(startIndex, endIndex);
        }

        StringBuilder joined = new StringBuilder();
        for (int k = 0; k < gramTokens.size(); k++) {
            if (k > 0) {
                joined.append(' ');
            }
            joined.append(gramTokens.get(k).getText());
        }
        return joined.toString();
    }

    /**
     * Checks if a list of POS-tagged tokens contains a n-gram that could be a
     * valid keyphrase and returns their noun value.
     *
     * @param candidate the list of tokens candidate to be a keyphrase
     * @return the noun value of the n-gram if it's in the database, -1 else.
     */
    private int checkGramNounValue(List<Token> candidate) {

        // build the lookup key by joining the POS tags with a slash
        StringBuilder taggedString = new StringBuilder();
        for (int i = 0; i < candidate.size(); i++) {
            if (i > 0) {
                taggedString.append('/');
            }
            taggedString.append(candidate.get(i).getPoS());
        }

        // check if the pattern is contained in the database
        Integer nounValue = validPOSPatterns.get(taggedString.toString());
        return nounValue == null ? -1 : nounValue;
    }

    /**
     * Loads the nGram database for the requested language from the path
     * registered for it. The previously loaded patterns and the maximum gram
     * size are replaced only once the whole database has been read
     * successfully; on failure they are left untouched.
     *
     * @param lang the language to search in the database
     * @throws IOException if no database is registered for the language, or
     * the database file is nonexistent or non accessible
     * @throws ParseException if the database file is malformed
     * @throws NullPointerException if the language requested is not in the
     * database
     */
    private void loadDatabase(Locale lang) throws IOException, ParseException {

        String databasePath = posDatabasePaths.get(lang);
        if (databasePath == null) {
            throw new IOException(
                    "No POS pattern database registered for language " + lang);
        }

        // Get the POS pattern file and parse it, making sure the file is
        // closed whatever happens.
        JSONObject fileblock;
        try (BufferedReader reader = new BufferedReader(
                FileSystem.getInputStreamReaderFromPath(databasePath))) {
            fileblock = (JSONObject) (new JSONParser()).parse(reader);
        }

        JSONArray pagesBlock = (JSONArray) fileblock.get("languages");
        if (pagesBlock == null) {
            throw new NullPointerException(
                    "No \"languages\" array found in file " + databasePath);
        }

        // Find the required language in the specified file. Only a block
        // whose language actually matches is retained.
        JSONObject languageBlock = null;
        for (Object entry : pagesBlock) {
            JSONObject candidate = (JSONObject) entry;
            if (lang.getLanguage().equals(candidate.get("language"))) {
                languageBlock = candidate;
                break;
            }
        }

        // If the language is not supported by the database, stop the execution.
        if (languageBlock == null) {
            throw new NullPointerException(
                    "Language " + lang.getLanguage()
                    + " not found in file " + posDatabasePaths.get(lang));
        }

        // the field may be badly formatted or non-existent: in that case
        // fall back to the default
        int declaredMaxGramSize = DEFAULT_MAX_NGRAM_SIZE;
        Object rawMaxGramSize = languageBlock.get("maxGramSize");
        if (rawMaxGramSize != null) {
            try {
                declaredMaxGramSize = Integer.parseInt(rawMaxGramSize.toString());
            } catch (NumberFormatException ignored) {
                declaredMaxGramSize = DEFAULT_MAX_NGRAM_SIZE;
            }
        }

        // read the patterns in a fresh table first, so that a failure
        // half-way does not leave a partially updated state
        JSONArray patternBlock = (JSONArray) languageBlock.get("patterns");
        Map<String, Integer> loadedPatterns = new HashMap<>();
        for (Object entry : patternBlock) {
            JSONObject pattern = (JSONObject) entry;
            String posPattern = (String) pattern.get("pattern");
            Number nounCount = (Number) pattern.get("nounCount");
            loadedPatterns.put(posPattern, nounCount.intValue());
        }

        // everything went fine: commit the new configuration, dropping the
        // patterns of any previously loaded language
        maxGramSize = declaredMaxGramSize;
        validPOSPatterns.clear();
        validPOSPatterns.putAll(loadedPatterns);
    }
    // </editor-fold>
}