maui.main.MauiModelBuilder.java Source code

Introduction

Here is the source code for maui.main.MauiModelBuilder.java
Source

package maui.main;

/*
 *    MauiModelBuilder.java
 *    Copyright (C) 2001-2009 Eibe Frank, Olena Medelyan
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Vector;

import org.wikipedia.miner.model.Wikipedia;

import org.apache.commons.io.FileUtils;

import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import maui.filters.MauiFilter;
import maui.stemmers.*;
import maui.stopwords.*;
import maui.vocab.Vocabulary;

/**
 * Builds a topic indexing model from the documents in a given
 * directory.  Assumes that the file names for the documents end with
 * ".txt".  Assumes that files containing corresponding
 * author-assigned keyphrases end with ".key". Optionally an encoding
 * for the documents/keyphrases can be defined (e.g. for Chinese
 * text).
 *
 * Valid options are:<p>
 *
 * -l "directory name"<br>
 * Specifies name of directory.<p>
 *
 * -m "model name"<br>
 * Specifies name of model.<p>
 *
 * -e "encoding"<br>
 * Specifies encoding.<p>
 * 
 *  -w "WikipediaDatabase@WikipediaServer" <br>
 * Specifies wikipedia data.<p>
 * 
 * -v "vocabulary name" <br>
 * Specifies vocabulary name (e.g. agrovoc or none).<p>
 * 
 * -f "vocabulary format" <br>
 * Specifies vocabulary format (txt or skos).<p>
 *
 * -i "document language" <br>
 * Specifies document language (en, es, de, fr).<p>
 *
 * -d<br>
 * Turns debugging mode on.<p>
 * 
 * -x "length"<br>
 * Sets maximum phrase length (default: 3).<p>
 *
 * -y "length"<br>
 * Sets minimum phrase length (default: 1).<p>
 *
 * -o "number"<br>
 * Sets the minimum number of times a phrase needs to occur (default: 2). <p>
 *
 * -s "stopwords class"<br>
 * Sets the name of the class implementing the stop words (default: StopwordsEnglish).<p>
 *
 * -t "stemmer class "<br>
 * Sets stemmer to use (default: PorterStemmer). <p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz), Olena Medelyan (olena@cs.waikato.ac.nz)
 * @version 1.0
 */
public class MauiModelBuilder implements OptionHandler {

    /** Name of directory */
    public String inputDirectoryName = null;

    /** Name of model */
    public String modelName = null;

    /** Vocabulary name */
    public String vocabularyName = "none";

    /** Format of the vocabulary {skos,text} */
    public String vocabularyFormat = null;

    /** Directory where vocabularies are stored **/
    public String vocabularyDirectory = "data/vocabularies";

    /** Document language {en,es,de,fr,...} */
    public String documentLanguage = "en";

    /** Document encoding */
    public String documentEncoding = "default";

    /** Debugging mode? */
    public boolean debugMode = false;

    /** Maximum length of phrases */
    public int maxPhraseLength = 5;

    /** Minimum length of phrases */
    public int minPhraseLength = 1;

    /** Minimum number of occurences of a phrase */
    public int minNumOccur = 1;

    /** Wikipedia object */
    public Wikipedia wikipedia = null;

    /** Classifier */
    private Classifier classifier = null;

    /** Name of the server with the mysql Wikipedia data */
    private String wikipediaServer = "localhost";

    /** Name of the database with Wikipedia data */
    private String wikipediaDatabase = "database";

    /** Name of the directory with Wikipedia data in files */
    private String wikipediaDataDirectory = null;

    /** Should Wikipedia data be cached first? */
    private boolean cacheWikipediaData = false;

    /** Minimum keyphraseness of a string */
    private double minKeyphraseness = 0.01;

    /** Minimum sense probability or commonness */
    private double minSenseProbability = 0.005;

    /** Minimum number of the context articles */
    private int contextSize = 5;

    /** Use basic features  
     * TFxIDF & First Occurrence */
    boolean useBasicFeatures = true;

    /** Use domain keyphraseness feature */
    boolean useKeyphrasenessFeature = true;

    /** Use frequency features
     * TF & IDF additionally */
    boolean useFrequencyFeatures = true;

    /** Use occurrence position features
     * LastOccurrence & Spread */
    boolean usePositionsFeatures = true;

    /** Use thesaurus features
     * Node degree  */
    boolean useNodeDegreeFeature = true;

    /** Use length feature */
    boolean useLengthFeature = true;

    /** Use basic Wikipedia features 
     *  Wikipedia keyphraseness & Total Wikipedia keyphraseness */
    boolean useBasicWikipediaFeatures = false;

    /** Use all Wikipedia features 
     * Inverse Wikipedia frequency & Semantic relatedness*/
    boolean useAllWikipediaFeatures = false;

    /** Maui filter object */
    private MauiFilter mauiFilter = null;

    /** Stemmer to be used */
    Stemmer stemmer = new PorterStemmer();

    /** Llist of stopwords to be used */
    public Stopwords stopwords = new StopwordsEnglish("data/stopwords/stopwords_en.txt");

    private Vocabulary vocabulary = null;

    public void loadThesaurus(Stemmer st, Stopwords sw, String vocabularyDirectory) {
        if (vocabulary != null)
            return;

        try {

            if (debugMode) {
                System.err.println("--- Loading the vocabulary...");
            }
            vocabulary = new Vocabulary(vocabularyName, vocabularyFormat, vocabularyDirectory);
            vocabulary.setStemmer(st);
            vocabulary.setStopwords(sw);
            vocabulary.setDebug(debugMode);
            vocabulary.setLanguage(documentLanguage);
            vocabulary.initialize();

        } catch (Exception e) {
            System.err.println("Failed to load thesaurus!");
            e.printStackTrace();
        }

    }

    public boolean getDebug() {
        return debugMode;
    }

    public void setBasicFeatures(boolean useBasicFeatures) {
        this.useBasicFeatures = useBasicFeatures;
    }

    public void setKeyphrasenessFeature(boolean useKeyphrasenessFeature) {
        this.useKeyphrasenessFeature = useKeyphrasenessFeature;
    }

    public void setFrequencyFeatures(boolean useFrequencyFeatures) {
        this.useFrequencyFeatures = useFrequencyFeatures;
    }

    public void setPositionsFeatures(boolean usePositionsFeatures) {
        this.usePositionsFeatures = usePositionsFeatures;
    }

    public void setNodeDegreeFeature(boolean useNodeDegreeFeature) {
        this.useNodeDegreeFeature = useNodeDegreeFeature;
    }

    public void setLengthFeature(boolean useLengthFeature) {
        this.useLengthFeature = useLengthFeature;
    }

    public void setBasicWikipediaFeatures(boolean useBasicWikipediaFeatures) {
        this.useBasicWikipediaFeatures = useBasicWikipediaFeatures;
    }

    public void setAllWikipediaFeatures(boolean useAllWikipediaFeatures) {
        this.useAllWikipediaFeatures = useAllWikipediaFeatures;
    }

    public void setContextSize(int contextSize) {
        this.contextSize = contextSize;
    }

    public void setMinSenseProbability(double minSenseProbability) {
        this.minSenseProbability = minSenseProbability;
    }

    public void setMinKeyphraseness(double minKeyphraseness) {
        this.minKeyphraseness = minKeyphraseness;
    }

    /**
     * Parses a given list of options controlling the behaviour of this object.
     * Valid options are:<p>
     *
     * -l "directory name" <br>
     * Specifies name of directory.<p>
     *
     * -m "model name" <br>
     * Specifies name of model.<p>
     *
     * -v "vocabulary name" <br>
     * Specifies vocabulary name.<p>
     * 
     * -f "vocabulary format" <br>
     * Specifies vocabulary format.<p>
     *    
     * -i "document language" <br>
     * Specifies document language.<p>
     * 
     * -e "encoding" <br>
     * Specifies encoding.<p>
     * 
     *  -w "WikipediaDatabase@WikipediaServer" <br>
     * Specifies wikipedia data.<p>
     * 
     * -d<br>
     * Turns debugging mode on.<p>
     *
     * -x "length"<br>
     * Sets maximum phrase length (default: 3).<p>
     *
     * -y "length"<br>
     * Sets minimum phrase length (default: 3).<p>
     *
     * -o "number"<br>
     * The minimum number of times a phrase needs to occur (default: 2). <p>
     *
     * -s "name of class implementing list of stop words"<br>
     * Sets list of stop words to used (default: StopwordsEnglish).<p>
     *
     * -t "name of class implementing stemmer"<br>
     * Sets stemmer to use (default: IteratedLovinsStemmer). <p>
     *
     * @param options the list of options as an array of strings
     * @exception Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {

        String dirName = Utils.getOption('l', options);
        if (dirName.length() > 0) {
            inputDirectoryName = dirName;
        } else {
            inputDirectoryName = null;
            throw new Exception("Name of directory required argument.");
        }

        String modelName = Utils.getOption('m', options);
        if (modelName.length() > 0) {
            this.modelName = modelName;
        } else {
            this.modelName = null;
            throw new Exception("Name of model required argument.");
        }

        String vocabularyName = Utils.getOption('v', options);
        if (vocabularyName.length() > 0) {
            this.vocabularyName = vocabularyName;
        }

        String vocabularyFormat = Utils.getOption('f', options);

        if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
            if (vocabularyFormat.length() > 0) {
                if (vocabularyFormat.equals("skos") || vocabularyFormat.equals("text")) {
                    this.vocabularyFormat = vocabularyFormat;
                } else {
                    throw new Exception(
                            "Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
                }
            } else {
                throw new Exception(
                        "If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
            }
        }

        String encoding = Utils.getOption('e', options);
        if (encoding.length() > 0)
            this.documentEncoding = encoding;

        String wikipediaConnection = Utils.getOption('w', options);
        if (wikipediaConnection.length() > 0) {
            int at = wikipediaConnection.indexOf("@");
            wikipediaDatabase = wikipediaConnection.substring(0, at);
            wikipediaServer = wikipediaConnection.substring(at + 1);
        }

        String documentLanguage = Utils.getOption('i', options);
        if (documentLanguage.length() > 0)
            this.documentLanguage = documentLanguage;

        String maxPhraseLengthString = Utils.getOption('x', options);
        if (maxPhraseLengthString.length() > 0)
            this.maxPhraseLength = Integer.parseInt(maxPhraseLengthString);

        String minPhraseLengthString = Utils.getOption('y', options);
        if (minPhraseLengthString.length() > 0)
            this.minPhraseLength = Integer.parseInt(minPhraseLengthString);

        String minNumOccurString = Utils.getOption('o', options);
        if (minNumOccurString.length() > 0)
            this.minNumOccur = Integer.parseInt(minNumOccurString);

        String stopwordsString = Utils.getOption('s', options);
        if (stopwordsString.length() > 0) {
            stopwordsString = "maui.stopwords.".concat(stopwordsString);
            this.stopwords = (Stopwords) Class.forName(stopwordsString).newInstance();
        }

        String stemmerString = Utils.getOption('t', options);
        if (stemmerString.length() > 0) {
            stemmerString = "maui.stemmers.".concat(stemmerString);
            this.stemmer = (Stemmer) Class.forName(stemmerString).newInstance();
        }
        debugMode = Utils.getFlag('d', options);
        Utils.checkForRemainingOptions(options);
    }

    /**
     * Gets the current option settings.
     *
     * @return an array of strings suitable for passing to setOptions
     */
    public String[] getOptions() {

        String[] options = new String[23];
        int current = 0;

        options[current++] = "-l";
        options[current++] = "" + (this.inputDirectoryName);
        options[current++] = "-m";
        options[current++] = "" + (this.modelName);
        options[current++] = "-v";
        options[current++] = "" + (this.vocabularyName);
        options[current++] = "-f";
        options[current++] = "" + (this.vocabularyFormat);
        options[current++] = "-e";
        options[current++] = "" + (this.documentEncoding);
        options[current++] = "-i";
        options[current++] = "" + (this.documentLanguage);

        if (this.debugMode) {
            options[current++] = "-d";
        }
        options[current++] = "-x";
        options[current++] = "" + (this.maxPhraseLength);
        options[current++] = "-y";
        options[current++] = "" + (this.minPhraseLength);
        options[current++] = "-o";
        options[current++] = "" + (this.minNumOccur);
        options[current++] = "-s";
        options[current++] = "" + (stopwords.getClass().getName());
        options[current++] = "-t";
        options[current++] = "" + (stemmer.getClass().getName());

        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options
     */
    public Enumeration<Option> listOptions() {

        Vector<Option> newVector = new Vector<Option>(12);

        newVector.addElement(new Option("\tSpecifies name of directory.", "l", 1, "-l <directory name>"));
        newVector.addElement(new Option("\tSpecifies name of model.", "m", 1, "-m <model name>"));
        newVector.addElement(new Option("\tSpecifies vocabulary name.", "v", 1, "-v <vocabulary name>"));
        newVector.addElement(new Option("\tSpecifies vocabulary format (text or skos or none).", "f", 1,
                "-f <vocabulary format>"));
        newVector.addElement(new Option("\tSpecifies document language (en (default), es, de, fr).", "i", 1,
                "-i <document language>"));
        newVector.addElement(new Option("\tSpecifies encoding.", "e", 1, "-e <encoding>"));
        newVector.addElement(new Option("\tSpecifies wikipedia database and server.", "w", 1,
                "-w <wikipediaDatabase@wikipediaServer>"));
        newVector.addElement(new Option("\tTurns debugging mode on.", "d", 0, "-d"));
        newVector.addElement(new Option("\tSets the maximum phrase length (default: 5).", "x", 1, "-x <length>"));
        newVector.addElement(new Option("\tSets the minimum phrase length (default: 1).", "y", 1, "-y <length>"));
        newVector.addElement(new Option("\tSet the minimum number of occurences (default: 2).", "o", 1, "-o"));
        newVector.addElement(new Option("\tSets the list of stopwords to use (default: StopwordsEnglish).", "s", 1,
                "-s <name of stopwords class>"));
        newVector.addElement(new Option("\tSet the stemmer to use (default: SremovalStemmer).", "t", 1,
                "-t <name of stemmer class>"));

        return newVector.elements();
    }

    /**
     * Collects the file names
     */
    public HashSet<String> collectStems() throws Exception {

        HashSet<String> stems = new HashSet<String>();

        try {
            File dir = new File(inputDirectoryName);

            for (String file : dir.list()) {
                if (file.endsWith(".txt")) {
                    String stem = file.substring(0, file.length() - 4);

                    File keys = new File(inputDirectoryName + "/" + stem + ".key");
                    if (keys.exists()) {
                        stems.add(stem);
                    }
                }
            }
        } catch (Exception e) {
            throw new Exception("Problem reading directory " + inputDirectoryName);
        }
        return stems;
    }

    /**
     * Builds the model from the training data
     */
    public void buildModel(HashSet<String> fileNames) throws Exception {

        // Check whether there is actually any data
        if (fileNames.size() == 0) {
            throw new Exception("Couldn't find any data in " + inputDirectoryName);
        }

        System.err.println("-- Building the model... ");

        FastVector atts = new FastVector(3);
        atts.addElement(new Attribute("filename", (FastVector) null));
        atts.addElement(new Attribute("document", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        Instances data = new Instances("keyphrase_training_data", atts, 0);

        // Build model
        mauiFilter = new MauiFilter();

        mauiFilter.setDebug(debugMode);
        mauiFilter.setMaxPhraseLength(maxPhraseLength);
        mauiFilter.setMinPhraseLength(minPhraseLength);
        mauiFilter.setMinNumOccur(minNumOccur);
        mauiFilter.setStemmer(stemmer);
        mauiFilter.setDocumentLanguage(documentLanguage);
        mauiFilter.setVocabularyName(vocabularyName);
        mauiFilter.setVocabularyFormat(vocabularyFormat);
        mauiFilter.setStopwords(stopwords);

        if (wikipedia != null) {
            mauiFilter.setWikipedia(wikipedia);
        } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
            mauiFilter.setWikipedia(wikipedia);
        } else {
            mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
        }

        if (classifier != null) {
            mauiFilter.setClassifier(classifier);
        }

        mauiFilter.setInputFormat(data);

        // set features configurations
        mauiFilter.setBasicFeatures(useBasicFeatures);
        mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
        mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
        mauiFilter.setPositionsFeatures(usePositionsFeatures);
        mauiFilter.setLengthFeature(useLengthFeature);
        mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
        mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
        mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);
        mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);

        mauiFilter.setClassifier(classifier);

        mauiFilter.setContextSize(contextSize);
        mauiFilter.setMinKeyphraseness(minKeyphraseness);
        mauiFilter.setMinSenseProbability(minSenseProbability);

        if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
            loadThesaurus(stemmer, stopwords, vocabularyDirectory);
            mauiFilter.setVocabulary(vocabulary);
        }

        System.err.println("-- Reading the input documents... ");

        for (String fileName : fileNames) {

            double[] newInst = new double[3];

            newInst[0] = (double) data.attribute(0).addStringValue(fileName);

            File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
            File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

            try {

                String documentText;
                if (!documentEncoding.equals("default")) {
                    documentText = FileUtils.readFileToString(documentTextFile, documentEncoding);
                } else {
                    documentText = FileUtils.readFileToString(documentTextFile);
                }

                // Adding the text of the document to the instance
                newInst[1] = (double) data.attribute(1).addStringValue(documentText);

            } catch (Exception e) {

                System.err.println("Problem with reading " + documentTextFile);
                e.printStackTrace();
                newInst[1] = Instance.missingValue();
            }

            try {

                String documentTopics;
                if (!documentEncoding.equals("default")) {
                    documentTopics = FileUtils.readFileToString(documentTopicsFile, documentEncoding);
                } else {
                    documentTopics = FileUtils.readFileToString(documentTopicsFile);
                }

                // Adding the topics to the file
                newInst[2] = (double) data.attribute(2).addStringValue(documentTopics);

            } catch (Exception e) {

                System.err.println("Problem with reading " + documentTopicsFile);
                e.printStackTrace();
                newInst[2] = Instance.missingValue();
            }

            data.add(new Instance(1.0, newInst));

            mauiFilter.input(data.instance(0));
            data = data.stringFreeStructure();
        }
        mauiFilter.batchFinished();

        while ((mauiFilter.output()) != null) {
        }
        ;
    }

    /** 
     * Saves the extraction model to the file.
     */
    public void saveModel() throws Exception {

        BufferedOutputStream bufferedOut = new BufferedOutputStream(new FileOutputStream(modelName));
        ObjectOutputStream out = new ObjectOutputStream(bufferedOut);
        out.writeObject(mauiFilter);
        out.flush();
        out.close();
    }

    /**
     * The main method.  
     */
    public static void main(String[] ops) {

        MauiModelBuilder modelBuilder = new MauiModelBuilder();

        try {

            modelBuilder.setOptions(ops);

            // Output what options are used
            if (modelBuilder.getDebug() == true) {
                System.err.print("Building model with options: ");
                String[] optionSettings = modelBuilder.getOptions();
                for (String optionSetting : optionSettings) {
                    System.err.print(optionSetting + " ");
                }
                System.err.println();
            }

            HashSet<String> fileNames = modelBuilder.collectStems();
            modelBuilder.buildModel(fileNames);

            if (modelBuilder.getDebug() == true) {
                System.err.print("Model built. Saving the model...");
            }

            modelBuilder.saveModel();

            System.err.print("Done!");

        } catch (Exception e) {

            // Output information on how to use this class
            e.printStackTrace();
            System.err.println(e.getMessage());
            System.err.println("\nOptions:\n");
            Enumeration<Option> en = modelBuilder.listOptions();
            while (en.hasMoreElements()) {
                Option option = (Option) en.nextElement();
                System.err.println(option.synopsis());
                System.err.println(option.description());
            }
        }
    }
}