// Java tutorial
package kea.main;

/*
 *    KEAKeyphraseExtractor.java
 *    Copyright (C) 2001-2006 Eibe Frank, Olena Medelyan
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;

import edu.unc.ils.mrc.hive.api.SKOSScheme;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;

import kea.filters.KEAFilter;
import kea.filters.KEAPhraseFilter;
import kea.stemmers.SremovalStemmer;
import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;
import kea.stopwords.StopwordsEnglish;
import kea.util.Counter;
import kea.vocab.Vocabulary;
import kea.vocab.VocabularyH2;
import kea.vocab.VocabularySesame;

/**
 * Extracts keyphrases from the documents in a given directory. Assumes that the
 * file names for the documents end with ".txt". Puts extracted keyphrases into
 * corresponding files ending with ".key" (if those are not already present).
 * Optionally an encoding for the documents/keyphrases can be defined (e.g. for
 * Chinese text). Documents for which ".key" exists, are used for evaluation.
 *
 * Valid options are:
 * <p>
 *
 * -l "directory name"<br>
 * Specifies name of directory.
 * <p>
 *
 * -m "model name"<br>
 * Specifies name of model.
 * <p>
 *
 * -v "vocabulary name"<br>
 * Specifies name of vocabulary.
 * <p>
 *
 * -f "vocabulary format"<br>
 * Specifies format of vocabulary (text or skos).
 * <p>
 *
 * -i "document language" <br>
 * Specifies document language (en, es, de, fr).
 * <p>
 *
 * -e "encoding"<br>
 * Specifies encoding.
 * <p>
 *
 * -n <br>
 * Specifies number of phrases to be output (default: 5).
 * <p>
 *
 * -t "name of class implementing stemmer"<br>
 * Sets stemmer to use (default: SremovalStemmer).
 * <p>
 *
 * -s "name of class implementing stopwords"<br>
 * Sets stopwords to use. No stopwords object is created unless this option
 * or one of the <code>setStopwords</code> methods is used.
 * <p>
 *
 * -d<br>
 * Turns debugging mode on.
 * <p>
 *
 * -b<br>
 * Build global dictionaries from the test set.
 * <p>
 *
 * -a<br>
 * Also write stemmed phrase and score into ".key" file.
 * <p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class KEAKeyphraseExtractor implements OptionHandler {

    /** Stopwords path */
    String m_stopwordsPath;

    /** Name of directory */
    String m_dirName = null;

    /** Name of model */
    String m_modelName = null;

    /** Name of vocabulary */
    String m_vocabulary = null;

    /** Vocabulary format */
    String m_vocabularyFormat = null;

    /** Document language */
    String m_documentLanguage = "en";

    /** Encoding */
    String m_encoding = "default";

    /** Debugging mode? */
    boolean m_debug = false;

    /** The KEA filter object */
    private KEAFilter m_KEAFilter = null;

    /**
     * The number of phrases to extract. The field starts at 10; note that
     * {@link #setOptions(String[])} resets it to 5 when "-n" is not given.
     */
    int m_numPhrases = 10;

    /** The stemmer to be used */
    private Stemmer m_Stemmer = new SremovalStemmer();

    /** The list of stop words to be used (null until one is set). */
    private Stopwords m_Stopwords;

    private SKOSScheme m_Scheme;

    /** Also write stemmed phrase and score into .key file. */
    boolean m_AdditionalInfo = false;

    /** Build global dictionaries from the test set. */
    boolean m_buildGlobal = false;

    /** The scheme handed to the constructor (may be null). */
    private SKOSScheme schema;

    /** The vocabulary built from the scheme; null when it could not be built. */
    private Vocabulary vocabulary;

    /**
     * Creates an extractor with a fresh KEA filter and the vocabulary format
     * set to "skos". When a scheme is given, an H2-backed vocabulary is
     * created from the directory that contains the scheme's RDF file.
     *
     * @param schema
     *            the scheme to take the vocabulary from; may be null (as in
     *            {@link #main(String[])}), in which case no vocabulary is
     *            created
     */
    public KEAKeyphraseExtractor(SKOSScheme schema) {
        this.m_KEAFilter = new KEAFilter();
        this.schema = schema;
        m_vocabularyFormat = "skos";
        // Without a scheme there is nothing to build the vocabulary from.
        // Previously this case surfaced as a caught NullPointerException
        // whose stack trace was printed on every command-line run.
        if (schema != null) {
            try {
                String h2path = new File(schema.getRdfPath()).getParentFile().getAbsolutePath();
                this.vocabulary = new VocabularyH2(schema.getName(), h2path, m_documentLanguage,
                        schema.getManager());
            } catch (Exception e) {
                // Best effort: the extractor is still constructed, but
                // without a vocabulary.
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads the thesaurus into the KEA filter using the current stemmer,
     * stopwords and vocabulary, and sets the filter's vocabulary name to the
     * lower-cased scheme name.
     */
    public void loadThesaurus() {
        System.out.println("SCHEMA LOADED IN KEYPHRASE EXTRACTOR " + schema.getLongName());
        this.m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, this.vocabulary);
        this.m_KEAFilter.setVocabulary(schema.getName().toLowerCase());
    }

    /**
     * Get the value of AdditionalInfo.
     *
     * @return Value of AdditionalInfo.
     */
    public boolean getAdditionalInfo() {
        return m_AdditionalInfo;
    }

    /**
     * Set the value of AdditionalInfo.
     *
     * @param newAdditionalInfo
     *            Value to assign to AdditionalInfo.
     */
    public void setAdditionalInfo(boolean newAdditionalInfo) {
        m_AdditionalInfo = newAdditionalInfo;
    }

    /**
     * Get the value of BuildGlobal.
     *
     * @return Value of BuildGlobal.
     */
    public boolean getBuildGlobal() {
        return m_buildGlobal;
    }

    /**
     * Set the value of BuildGlobal.
     *
     * @param newBuildGlobal
     *            Value to assign to BuildGlobal.
     */
    public void setBuildGlobal(boolean newBuildGlobal) {
        m_buildGlobal = newBuildGlobal;
    }

    /**
     * Get the value of numPhrases.
     *
     * @return Value of numPhrases.
     */
    public int getNumPhrases() {
        return m_numPhrases;
    }

    /**
     * Get the Stemmer value.
     *
     * @return the Stemmer value.
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Set the Stemmer value.
     *
     * @param newStemmer
     *            The new Stemmer value.
     */
    public void setStemmer(Stemmer newStemmer) {
        this.m_Stemmer = newStemmer;
    }

    /**
     * Get the Stopwords value.
     *
     * @return the Stopwords value (null if none has been set).
     */
    public Stopwords getStopwords() {
        return m_Stopwords;
    }

    /**
     * Forwards the minimum number of occurrences to the KEA filter.
     *
     * @param newMinNumOccur
     *            Value passed to the filter's setMinNumOccur.
     */
    public void setMinNumOccur(int newMinNumOccur) {
        this.m_KEAFilter.setMinNumOccur(newMinNumOccur);
    }

    /**
     * Set the Stopwords value to a new StopwordsEnglish created from the
     * given path.
     *
     * @param stopwordsPath
     *            Path handed to the StopwordsEnglish constructor.
     */
    public void setStopwords(String stopwordsPath) {
        this.m_Stopwords = new StopwordsEnglish(stopwordsPath);
    }

    /**
     * Set the Stopwords value.
     *
     * @param newStopwords
     *            The new Stopwords value.
     */
    public void setStopwords(Stopwords newStopwords) {
        this.m_Stopwords = newStopwords;
    }

    /**
     * Set the value of numPhrases.
     *
     * @param newnumPhrases
     *            Value to assign to numPhrases.
     */
    public void setNumPhrases(int newnumPhrases) {
        m_numPhrases = newnumPhrases;
    }

    /**
     * Get the value of debug.
     *
     * @return Value of debug.
     */
    public boolean getDebug() {
        return m_debug;
    }

    /**
     * Set the value of debug.
     *
     * @param newdebug
     *            Value to assign to debug.
     */
    public void setDebug(boolean newdebug) {
        m_debug = newdebug;
    }

    /**
     * Get the value of encoding.
     *
     * @return Value of encoding.
     */
    public String getEncoding() {
        return m_encoding;
    }

    /**
     * Set the value of encoding.
     *
     * @param newencoding
     *            Value to assign to encoding.
     */
    public void setEncoding(String newencoding) {
        m_encoding = newencoding;
    }

    /**
     * Get the value of vocabulary name.
     *
     * @return Value of vocabulary name.
     */
    public String getVocabulary() {
        return m_vocabulary;
    }

    /**
     * Set the value of vocabulary name.
     *
     * @param newvocabulary
     *            Value to assign to vocabulary name.
     */
    public void setVocabulary(String newvocabulary) {
        m_vocabulary = newvocabulary;
    }

    /**
     * Get the value of vocabulary format.
     *
     * @return Value of vocabulary format.
     */
    public String getVocabularyFormat() {
        return m_vocabularyFormat;
    }

    /**
     * Set the value of vocabulary format.
     *
     * @param newvocabularyFormat
     *            Value to assign to vocabularyFormat .
     */
    public void setVocabularyFormat(String newvocabularyFormat) {
        m_vocabularyFormat = newvocabularyFormat;
    }

    /**
     * Get the value of document language.
     *
     * @return Value of document language.
     */
    public String getDocumentLanguage() {
        return m_documentLanguage;
    }

    /**
     * Set the value of document language.
     *
     * @param newdocumentLanguage
     *            Value to assign to document language.
     */
    public void setDocumentLanguage(String newdocumentLanguage) {
        m_documentLanguage = newdocumentLanguage;
    }

    /**
     * Get the value of modelName.
     *
     * @return Value of modelName.
     */
    public String getModelName() {
        return m_modelName;
    }

    /**
     * Set the value of modelName.
     *
     * @param newmodelName
     *            Value to assign to modelName.
     */
    public void setModelName(String newmodelName) {
        m_modelName = newmodelName;
    }

    /**
     * Get the value of dirName.
     *
     * @return Value of dirName.
     */
    public String getDirName() {
        return m_dirName;
    }

    /**
     * Set the value of dirName.
     *
     * @param newdirName
     *            Value to assign to dirName.
     */
    public void setDirName(String newdirName) {
        m_dirName = newdirName;
    }

    /**
     * Parses a given list of options controlling the behaviour of this object.
     * Valid options are:
     * <p>
     *
     * -l "directory name"<br>
     * Specifies name of directory.
     * <p>
     *
     * -m "model name"<br>
     * Specifies name of model.
     * <p>
     *
     * -v "vocabulary name"<br>
     * Specifies vocabulary name.
     * <p>
     *
     * -f "vocabulary format"<br>
     * Specifies vocabulary format.
     * <p>
     *
     * -i "document language" <br>
     * Specifies document language.
     * <p>
     *
     * -e "encoding"<br>
     * Specifies encoding.
     * <p>
     *
     * -n<br>
     * Specifies number of phrases to be output (default: 5).
     * <p>
     *
     * -t "stemmer class"<br>
     * Stemmer class, either relative to <code>kea.stemmers</code> or
     * fully qualified.
     * <p>
     *
     * -s "stopwords class"<br>
     * Stopwords class, either relative to <code>kea.stopwords</code> or
     * fully qualified.
     * <p>
     *
     * -d<br>
     * Turns debugging mode on.
     * <p>
     *
     * -b<br>
     * Builds global dictionaries for computing TFxIDF from the test collection.
     * <p>
     *
     * -a<br>
     * Also write stemmed phrase and score into ".key" file.
     * <p>
     *
     * @param options
     *            the list of options as an array of strings
     * @exception Exception
     *                if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {

        String dirName = Utils.getOption('l', options);
        if (dirName.length() > 0) {
            setDirName(dirName);
        } else {
            setDirName(null);
            throw new Exception("Name of directory required argument.");
        }

        String modelName = Utils.getOption('m', options);
        if (modelName.length() > 0) {
            setModelName(modelName);
        } else {
            setModelName(null);
            throw new Exception("Name of model required argument.");
        }

        String vocabularyName = Utils.getOption('v', options);
        if (vocabularyName.length() > 0) {
            setVocabulary(vocabularyName);
        } else {
            setVocabulary(null);
            throw new Exception("Name of vocabulary required argument.");
        }

        // The option is always consumed, even when the vocabulary is "none",
        // so that it does not trip checkForRemainingOptions below.
        String vocabularyFormat = Utils.getOption('f', options);
        if (!getVocabulary().equals("none")) {
            if (vocabularyFormat.length() > 0) {
                if (vocabularyFormat.equals("skos") || vocabularyFormat.equals("text")) {
                    setVocabularyFormat(vocabularyFormat);
                } else {
                    throw new Exception(
                            "Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
                }
            } else {
                setVocabularyFormat(null);
                throw new Exception(
                        "If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
            }
        } else {
            setVocabularyFormat(null);
        }

        String encoding = Utils.getOption('e', options);
        if (encoding.length() > 0) {
            setEncoding(encoding);
        } else {
            setEncoding("default");
        }

        String documentLanguage = Utils.getOption('i', options);
        if (documentLanguage.length() > 0) {
            setDocumentLanguage(documentLanguage);
        } else {
            setDocumentLanguage("en");
        }

        String numPhrases = Utils.getOption('n', options);
        if (numPhrases.length() > 0) {
            setNumPhrases(Integer.parseInt(numPhrases));
        } else {
            setNumPhrases(5);
        }

        String stemmerString = Utils.getOption('t', options);
        if (stemmerString.length() > 0) {
            setStemmer((Stemmer) instantiate("kea.stemmers.", stemmerString));
        }

        String stopwordsString = Utils.getOption('s', options);
        if (stopwordsString.length() > 0) {
            setStopwords((Stopwords) instantiate("kea.stopwords.", stopwordsString));
        }

        setDebug(Utils.getFlag('d', options));
        setBuildGlobal(Utils.getFlag('b', options));
        setAdditionalInfo(Utils.getFlag('a', options));
        Utils.checkForRemainingOptions(options);
    }

    /**
     * Instantiates a class given by name through its no-argument constructor.
     * The name is first resolved relative to the default package (the
     * historical behaviour); if no such class exists it is tried as given, so
     * that the fully-qualified names produced by {@link #getOptions()} can be
     * fed back into {@link #setOptions(String[])}.
     *
     * @param defaultPackage
     *            package prefix including the trailing dot
     * @param className
     *            simple or fully-qualified class name
     * @return a new instance of the resolved class
     * @exception Exception
     *                if the class cannot be found or instantiated
     */
    private static Object instantiate(String defaultPackage, String className) throws Exception {
        Class resolved;
        try {
            resolved = Class.forName(defaultPackage.concat(className));
        } catch (ClassNotFoundException notInDefaultPackage) {
            resolved = Class.forName(className);
        }
        return resolved.newInstance();
    }

    /**
     * Gets the current option settings. The returned array always has 21
     * entries; unused trailing entries are empty strings. The "-t" and "-s"
     * entries are omitted when no stemmer or stopwords object is set, instead
     * of failing with a NullPointerException.
     *
     * @return an array of strings suitable for passing to setOptions
     */
    public String[] getOptions() {

        String[] options = new String[21];
        int current = 0;

        options[current++] = "-l";
        options[current++] = "" + (getDirName());
        options[current++] = "-m";
        options[current++] = "" + (getModelName());
        options[current++] = "-v";
        options[current++] = "" + (getVocabulary());
        options[current++] = "-f";
        options[current++] = "" + (getVocabularyFormat());
        options[current++] = "-e";
        options[current++] = "" + (getEncoding());
        options[current++] = "-i";
        options[current++] = "" + (getDocumentLanguage());
        options[current++] = "-n";
        options[current++] = "" + (getNumPhrases());
        if (getStemmer() != null) {
            options[current++] = "-t";
            options[current++] = "" + (getStemmer().getClass().getName());
        }
        // Stopwords are null unless explicitly configured.
        if (getStopwords() != null) {
            options[current++] = "-s";
            options[current++] = "" + (getStopwords().getClass().getName());
        }

        if (getDebug()) {
            options[current++] = "-d";
        }
        if (getBuildGlobal()) {
            options[current++] = "-b";
        }
        if (getAdditionalInfo()) {
            options[current++] = "-a";
        }

        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options
     */
    public Enumeration listOptions() {

        Vector newVector = new Vector(13);

        newVector.addElement(new Option("\tSpecifies name of directory.", "l", 1, "-l <directory name>"));
        newVector.addElement(new Option("\tSpecifies name of model.", "m", 1, "-m <model name>"));
        newVector.addElement(new Option("\tSpecifies vocabulary name.", "v", 1, "-v <vocabulary name>"));
        newVector.addElement(new Option("\tSpecifies vocabulary format.", "f", 1, "-f <vocabulary format>"));
        newVector.addElement(new Option("\tSpecifies encoding.", "e", 1, "-e <encoding>"));
        newVector.addElement(new Option("\tSpecifies document language (en (default), es, de, fr).", "i", 1,
                "-i <document language>"));
        newVector.addElement(new Option("\tSpecifies number of phrases to be output (default: 5).", "n", 1, "-n"));
        newVector.addElement(new Option("\tSet the stemmer to use (default: SremovalStemmer).", "t", 1,
                "-t <name of stemmer class>"));
        newVector.addElement(new Option("\tSet the stopwords class to use (default: EnglishStopwords).", "s", 1,
                "-s <name of stopwords class>"));
        newVector.addElement(new Option("\tTurns debugging mode on.", "d", 0, "-d"));
        newVector.addElement(new Option(
                "\tBuilds global dictionaries for computing TFIDF from the test collection.", "b", 0, "-b"));
        newVector.addElement(new Option("\tAlso write stemmed phrase and score into \".key\" file.", "a", 0, "-a"));

        return newVector.elements();
    }

    /**
     * Collects the stems of the file names: for every ".txt" entry in the
     * configured directory, the name without the extension is used as key and
     * mapped to a Double of 0.
     *
     * @return table keyed by file-name stem
     * @exception Exception
     *                if the directory cannot be listed
     */
    public Hashtable collectStems() throws Exception {

        String[] files;
        try {
            files = new File(m_dirName).list();
        } catch (Exception e) {
            // Keep the cause (e.g. a null directory name or a security
            // restriction) instead of discarding it.
            throw new Exception("Problem opening directory " + m_dirName, e);
        }
        // File.list() returns null when the path is not a readable directory.
        if (files == null) {
            throw new Exception("Problem opening directory " + m_dirName);
        }

        Hashtable stems = new Hashtable();
        for (int i = 0; i < files.length; i++) {
            if (files[i].endsWith(".txt")) {
                String stem = files[i].substring(0, files[i].length() - 4);
                if (!stems.containsKey(stem)) {
                    stems.put(stem, new Double(0));
                }
            }
        }
        return stems;
    }

    /**
     * Extracts keyphrases for every stem: reads "stem.txt" (and "stem.key" if
     * it can be read), feeds the document through the KEA filter, keeps the
     * top-ranked phrases and, when no ".key" file exists yet, writes them to a
     * new one. Finally prints the average number of phrases flagged as
     * matching existing ones.
     *
     * @param stems
     *            table whose keys are the file-name stems to process
     * @exception Exception
     *                if there are no stems or processing fails
     */
    public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

        Vector stats = new Vector();

        // Check whether there is actually any data
        // = if there any files in the directory
        if (stems.size() == 0) {
            throw new Exception("Couldn't find any data!");
        }

        this.m_KEAFilter.setNumPhrases(m_numPhrases);
        this.m_KEAFilter.setVocabulary(m_vocabulary);
        this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
        this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
        this.m_KEAFilter.setStemmer(m_Stemmer);
        this.m_KEAFilter.setStopwords(m_Stopwords);

        // Null-safe: the vocabulary name is null unless set explicitly.
        // Otherwise the thesaurus is expected to have been loaded already
        // (see loadThesaurus()).
        if ("none".equals(getVocabulary())) {
            this.m_KEAFilter.m_NODEfeature = false;
        }

        FastVector atts = new FastVector(3);
        atts.addElement(new Attribute("doc", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        atts.addElement(new Attribute("filename", (String) null));
        Instances data = new Instances("keyphrase_training_data", atts, 0);

        if (this.m_KEAFilter.m_Dictionary == null) {
            buildGlobalDictionaries(stems);
        }

        System.out.println("-- Extracting Keyphrases... ");

        // Extract keyphrases
        Enumeration elem = stems.keys();

        // Enumeration over all files in the directory (now in the hash):
        while (elem.hasMoreElements()) {
            String str = (String) elem.nextElement();

            double[] newInst = new double[2];
            try {
                String text = readFile(new File(m_dirName + "/" + str + ".txt"));
                newInst[0] = (double) data.attribute(0).addStringValue(text);
            } catch (Exception e) {
                if (m_debug) {
                    System.err.println("Can't read document " + str + ".txt");
                }
                newInst[0] = Instance.missingValue();
            }
            try {
                // Keyphrases in the str.key file. Kea assumes that these
                // keyphrases were assigned by the author and evaluates the
                // extracted keyphrases against these.
                String existingKeys = readFile(new File(m_dirName + "/" + str + ".key"));
                newInst[1] = (double) data.attribute(1).addStringValue(existingKeys);
            } catch (Exception e) {
                if (m_debug) {
                    System.err.println("No existing keyphrases for stem " + str + ".");
                }
                newInst[1] = Instance.missingValue();
            }

            data.add(new Instance(1.0, newInst));
            this.m_KEAFilter.input(data.instance(0), vocabulary);
            data = data.stringFreeStructure();

            if (m_debug) {
                System.err.println("-- Document: " + str);
            }

            Instance[] topRankedInstances = new Instance[m_numPhrases];
            Instance inst;

            // Iterating over all extracted keyphrases (inst)
            while ((inst = this.m_KEAFilter.output()) != null) {
                int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;
                if (index < m_numPhrases) {
                    topRankedInstances[index] = inst;
                }
            }

            if (m_debug) {
                System.err.println("-- Keyphrases and feature values:");
            }

            FileOutputStream out = null;
            PrintWriter printer = null;
            double numExtracted = 0, numCorrect = 0;
            try {
                // Only write a .key file if none exists yet.
                File key = new File(m_dirName + "/" + str + ".key");
                if (!key.exists()) {
                    out = new FileOutputStream(m_dirName + "/" + str + ".key");
                    if (!m_encoding.equals("default")) {
                        printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
                    } else {
                        printer = new PrintWriter(out);
                    }
                }

                for (int i = 0; i < m_numPhrases; i++) {
                    Instance ranked = topRankedInstances[i];
                    if (ranked == null) {
                        continue;
                    }
                    int lastAttribute = ranked.numAttributes() - 1;
                    if (!ranked.isMissing(lastAttribute)) {
                        numExtracted += 1.0;
                    }
                    if ((int) ranked.value(lastAttribute) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(ranked.stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));
                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(ranked.stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    ranked.value(this.m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        System.err.println(ranked);
                    }
                }
                if (printer != null) {
                    printer.flush();
                }
            } finally {
                // Release the .key file even if writing or scoring failed.
                if (printer != null) {
                    printer.close();
                }
                if (out != null) {
                    out.close();
                }
            }

            if (numExtracted > 0) {
                if (m_debug) {
                    System.err.println("-- " + numCorrect + " correct");
                }
                stats.addElement(new Double(numCorrect));
            }
        }

        double[] st = new double[stats.size()];
        for (int i = 0; i < stats.size(); i++) {
            st[i] = ((Double) stats.elementAt(i)).doubleValue();
        }
        double avg = Utils.mean(st);
        double stdDev = Math.sqrt(Utils.variance(st));

        System.out.println("Avg. number of matching keyphrases compared to existing ones : "
                + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
        System.out.println("Based on " + stats.size() + " documents");
        // m_KEAFilter.batchFinished();
    }

    /**
     * Reads a whole file into a string, decoding with the configured encoding
     * (or the platform default when the encoding is "default"). The
     * underlying stream is always closed.
     *
     * @param file
     *            the file to read
     * @return the decoded file content
     * @exception Exception
     *                if the file cannot be opened, decoded or read
     */
    private String readFile(File file) throws Exception {
        FileInputStream stream = new FileInputStream(file);
        try {
            InputStreamReader reader;
            if (!m_encoding.equals("default")) {
                reader = new InputStreamReader(stream, m_encoding);
            } else {
                reader = new InputStreamReader(stream);
            }
            StringBuffer text = new StringBuffer();
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                text.append(buffer, 0, read);
            }
            return text.toString();
        } finally {
            stream.close();
        }
    }

    /**
     * Rebuilds the filter's dictionary from the ".txt" documents of the given
     * stems: each phrase found in a document gets a new Counter on first
     * sight and is incremented for every further document containing it.
     *
     * @param stems
     *            table whose keys are the file-name stems to read
     * @exception Exception
     *                if a document cannot be read or processed
     */
    private void buildGlobalDictionaries(Hashtable stems) throws Exception {

        System.err.println("--- Building global dictionaries from the test collection.. ");

        // Build dictionary of n-grams with associated
        // document frequencies
        this.m_KEAFilter.m_Dictionary = new HashMap();

        Enumeration elem = stems.keys();

        // Enumeration over all files in the directory (now in the hash):
        while (elem.hasMoreElements()) {
            String str = (String) elem.nextElement();
            String text = readFile(new File(m_dirName + "/" + str + ".txt"));

            KEAPhraseFilter kpf = new KEAPhraseFilter();
            HashMap hash = this.m_KEAFilter.getPhrasesForDictionary(kpf.tokenize(text), this.vocabulary);

            Iterator it = hash.keySet().iterator();
            while (it.hasNext()) {
                String phrase = (String) it.next();
                Counter counter = (Counter) this.m_KEAFilter.m_Dictionary.get(phrase);
                if (counter == null) {
                    this.m_KEAFilter.m_Dictionary.put(phrase, new Counter());
                } else {
                    counter.increment();
                }
            }
        }
    }

    /**
     * Loads the extraction model from the file named by the model name and
     * replaces the current KEA filter with it. If global dictionaries are to
     * be built from the test set, the loaded filter's dictionary is cleared
     * so that it is rebuilt during extraction.
     *
     * @exception Exception
     *                if the model file cannot be opened or deserialized
     */
    public void loadModel() throws Exception {

        FileInputStream fileStream = new FileInputStream(m_modelName);
        try {
            System.out.println("This is the model that has been loaded -------------->" + m_modelName);
            // NOTE(review): native Java deserialization - only load model
            // files from trusted sources.
            ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(fileStream));
            this.m_KEAFilter = (KEAFilter) in.readObject();

            // If TFxIDF values are to be computed from the test corpus
            if (m_buildGlobal == true) {
                if (m_debug) {
                    System.err.println("-- The global dictionaries will be built from this test collection..");
                }
                this.m_KEAFilter.m_Dictionary = null;
            }
        } finally {
            // Closing the file stream releases the handle even when the
            // object stream could not be created or read.
            fileStream.close();
        }
    }

    /**
     * The main method: parses the options, loads the model and extracts
     * keyphrases from all documents in the selected directory. On any failure
     * the error and the list of options are printed to standard error.
     *
     * @param ops
     *            command-line options, see {@link #setOptions(String[])}
     */
    public static void main(String[] ops) {

        KEAKeyphraseExtractor kmb = new KEAKeyphraseExtractor(null);
        try {
            // Checking and Setting Options selected by the user:
            kmb.setOptions(ops);
            System.err.print("Extracting keyphrases with options: ");

            // Reading Options, which were set above and output them:
            String[] optionSettings = kmb.getOptions();
            for (int i = 0; i < optionSettings.length; i++) {
                System.err.print(optionSettings[i] + " ");
            }
            System.err.println();

            // Loading selected Model:
            System.err.println("-- Loading the Model... ");
            kmb.loadModel();

            // Extracting Keyphrases from all files in the selected directory
            // stem == the name of the file without ".txt"
            kmb.extractKeyphrases(kmb.collectStems());

        } catch (Exception e) {
            e.printStackTrace();
            System.err.println(e.getMessage());
            System.err.println("\nOptions:\n");
            Enumeration en = kmb.listOptions();
            while (en.hasMoreElements()) {
                Option option = (Option) en.nextElement();
                System.err.println(option.synopsis());
                System.err.println(option.description());
            }
        }
    }
}