Java tutorial: KEAFilter.java, a Weka filter from the KEA keyphrase extraction system that converts documents into instances for keyphrase classification.
/*
 *    KEAFilter.java
 *    Copyright (C) 2000, 2001 Eibe Frank
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package kea;

import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesSimple;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

/**
 * This filter converts the incoming data into data appropriate for
 * keyphrase classification. It assumes that the dataset contains two
 * string attributes. The first attribute should contain the text of a
 * document. The second attribute should contain the keyphrases
 * associated with that document (if present).
 *
 * The filter converts every instance (i.e. document) into a set of
 * instances, one for each word-based n-gram in the document. The
 * string attribute representing the document is replaced by some
 * numeric features, the estimated probability of each n-gram being a
 * keyphrase, and the rank of this phrase in the document according to
 * the probability. Each new instance also has a class value
 * associated with it. The class is "true" if the n-gram is a true
 * keyphrase, and "false" otherwise. Of course, if the input document
 * doesn't come with author-assigned keyphrases, the class values for
 * that document will be missing.
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
@SuppressWarnings({ "serial", "rawtypes", "unchecked", "cast", "unused" })
public class KEAFilter extends Filter implements OptionHandler {

  /** Index of the attribute containing the documents */
  private int m_DocumentAtt = 0;

  /** Index of the attribute containing the keyphrases */
  private int m_KeyphrasesAtt = 1;

  /** The maximum length of phrases */
  private int m_MaxPhraseLength = 3;

  /** The minimum length of phrases */
  private int m_MinPhraseLength = 1;

  /** Is the keyphrase frequency attribute being used? */
  private boolean m_KFused = false;

  /** Flag for debugging mode */
  private boolean m_Debug = false;

  /** Determines whether internal periods are allowed */
  private boolean m_DisallowInternalPeriods = false;

  /** The minimum number of occurrences of a phrase */
  private int m_MinNumOccur = 2;

  /** The number of features describing a phrase */
  private int m_NumFeatures = 2;

  /* Indices of attributes in m_ClassifierData */
  private int m_TfidfIndex = 0;
  private int m_FirstOccurIndex = 1;
  private int m_KeyFreqIndex = 2;

  /** The punctuation filter used by this filter */
  private KEAPhraseFilter m_PunctFilter = null;

  /** The numbers filter used by this filter */
  private NumbersFilter m_NumbersFilter = null;

  /** The actual classifier used to compute probabilities */
  private Classifier m_Classifier = null;

  /** The dictionary containing the document frequencies */
  private HashMap m_Dictionary = null;

  /** The dictionary containing the keyphrases */
  private HashMap m_KeyphraseDictionary = null;

  /** The number of documents in the global frequencies corpus */
  private int m_NumDocs = 0;

  /** Template for the classifier data */
  private Instances m_ClassifierData = null;

  /** The stemmer to be used */
  private Stemmer m_Stemmer = new IteratedLovinsStemmer();

  /** The list of stop words to be used */
  private Stopwords m_Stopwords = new StopwordsEnglish();

  /** Determines whether the check for proper nouns is performed */
  private boolean m_CheckForProperNouns = true;

  /**
   * Get the CheckForProperNouns value.
   *
   * @return the CheckForProperNouns value.
   */
  public boolean getCheckForProperNouns() {
    return m_CheckForProperNouns;
  }

  /**
   * Set the CheckForProperNouns value.
   *
   * @param newCheckForProperNouns the new CheckForProperNouns value.
   */
  public void setCheckForProperNouns(boolean newCheckForProperNouns) {
    this.m_CheckForProperNouns = newCheckForProperNouns;
  }

  /**
   * Get the Stopwords value.
   *
   * @return the Stopwords value.
   */
  public Stopwords getStopwords() {
    return m_Stopwords;
  }

  /**
   * Set the Stopwords value.
   *
   * @param newStopwords the new Stopwords value.
   */
  public void setStopwords(Stopwords newStopwords) {
    this.m_Stopwords = newStopwords;
  }

  /**
   * Get the Stemmer value.
   *
   * @return the Stemmer value.
   */
  public Stemmer getStemmer() {
    return m_Stemmer;
  }

  /**
   * Set the Stemmer value.
   *
   * @param newStemmer the new Stemmer value.
   */
  public void setStemmer(Stemmer newStemmer) {
    this.m_Stemmer = newStemmer;
  }

  /**
   * Get the value of MinNumOccur.
   *
   * @return Value of MinNumOccur.
   */
  public int getMinNumOccur() {
    return m_MinNumOccur;
  }

  /**
   * Set the value of MinNumOccur.
   *
   * @param newMinNumOccur Value to assign to MinNumOccur.
   */
  public void setMinNumOccur(int newMinNumOccur) {
    m_MinNumOccur = newMinNumOccur;
  }

  /**
   * Get the value of MaxPhraseLength.
   *
   * @return Value of MaxPhraseLength.
   */
  public int getMaxPhraseLength() {
    return m_MaxPhraseLength;
  }

  /**
   * Set the value of MaxPhraseLength.
   *
   * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
   */
  public void setMaxPhraseLength(int newMaxPhraseLength) {
    m_MaxPhraseLength = newMaxPhraseLength;
  }

  /**
   * Get the value of MinPhraseLength.
   *
   * @return Value of MinPhraseLength.
   */
  public int getMinPhraseLength() {
    return m_MinPhraseLength;
  }

  /**
   * Set the value of MinPhraseLength.
   *
   * @param newMinPhraseLength Value to assign to MinPhraseLength.
   */
  public void setMinPhraseLength(int newMinPhraseLength) {
    m_MinPhraseLength = newMinPhraseLength;
  }
  /**
   * Returns the index of the stemmed phrases in the output ARFF file.
   */
  public int getStemmedPhraseIndex() {
    return m_DocumentAtt;
  }

  /**
   * Returns the index of the unstemmed phrases in the output ARFF file.
   */
  public int getUnstemmedPhraseIndex() {
    return m_DocumentAtt + 1;
  }

  /**
   * Returns the index of the phrases' probabilities in the output ARFF file.
   */
  public int getProbabilityIndex() {
    int index = m_DocumentAtt + 4;

    if (m_Debug) {
      if (m_KFused) {
        index++;
      }
    }
    return index;
  }

  /**
   * Returns the index of the phrases' ranks in the output ARFF file.
   */
  public int getRankIndex() {
    return getProbabilityIndex() + 1;
  }

  /**
   * Get the value of DocumentAtt.
   *
   * @return Value of DocumentAtt.
   */
  public int getDocumentAtt() {
    return m_DocumentAtt;
  }

  /**
   * Set the value of DocumentAtt.
   *
   * @param newDocumentAtt Value to assign to DocumentAtt.
   */
  public void setDocumentAtt(int newDocumentAtt) {
    m_DocumentAtt = newDocumentAtt;
  }

  /**
   * Get the value of KeyphrasesAtt.
   *
   * @return Value of KeyphrasesAtt.
   */
  public int getKeyphrasesAtt() {
    return m_KeyphrasesAtt;
  }

  /**
   * Set the value of KeyphrasesAtt.
   *
   * @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
   */
  public void setKeyphrasesAtt(int newKeyphrasesAtt) {
    m_KeyphrasesAtt = newKeyphrasesAtt;
  }

  /**
   * Get the value of Debug.
   *
   * @return Value of Debug.
   */
  public boolean getDebug() {
    return m_Debug;
  }

  /**
   * Set the value of Debug.
   *
   * @param newDebug Value to assign to Debug.
   */
  public void setDebug(boolean newDebug) {
    m_Debug = newDebug;
  }

  /**
   * Sets whether the keyphrase frequency attribute is used.
   */
  public void setKFused(boolean flag) {
    m_KFused = flag;
    if (flag) {
      m_NumFeatures = 3;
    } else {
      m_NumFeatures = 2;
    }
  }

  /**
   * Gets whether the keyphrase frequency attribute is used.
   */
  public boolean getKFused() {
    return m_KFused;
  }

  /**
   * Gets whether phrases with internal periods are disallowed.
   *
   * @return true if phrases with internal periods are disallowed
   */
  public boolean getDisallowInternalPeriods() {
    return m_DisallowInternalPeriods;
  }

  /**
   * Sets whether phrases with internal periods are disallowed.
   *
   * @param disallow the new setting
   */
  public void setDisallowInternalPeriods(boolean disallow) {
    m_DisallowInternalPeriods = disallow;
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -K<br>
   * Specifies whether the keyphrase frequency statistic is used.<p>
   *
   * -M length<br>
   * Sets the maximum phrase length (default: 3).<p>
   *
   * -L length<br>
   * Sets the minimum phrase length (default: 1).<p>
   *
   * -D<br>
   * Turns debugging mode on.<p>
   *
   * -I index<br>
   * Sets the index of the attribute containing the documents (default: 0).<p>
   *
   * -J index<br>
   * Sets the index of the attribute containing the keyphrases (default: 1).<p>
   *
   * -P<br>
   * Disallow internal periods.<p>
   *
   * -O number<br>
   * The minimum number of times a phrase needs to occur (default: 2).<p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    setKFused(Utils.getFlag('K', options));
    setDebug(Utils.getFlag('D', options));
    String docAttIndexString = Utils.getOption('I', options);
    if (docAttIndexString.length() > 0) {
      setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
    } else {
      setDocumentAtt(0);
    }
    String keyphraseAttIndexString = Utils.getOption('J', options);
    if (keyphraseAttIndexString.length() > 0) {
      setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
    } else {
      setKeyphrasesAtt(1);
    }
    String maxPhraseLengthString = Utils.getOption('M', options);
    if (maxPhraseLengthString.length() > 0) {
      setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
    } else {
      setMaxPhraseLength(3);
    }
    String minPhraseLengthString = Utils.getOption('L', options);
    if (minPhraseLengthString.length() > 0) {
      setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
    } else {
      setMinPhraseLength(1);
    }
    String minNumOccurString = Utils.getOption('O', options);
    if (minNumOccurString.length() > 0) {
      setMinNumOccur(Integer.parseInt(minNumOccurString));
    } else {
      setMinNumOccur(2);
    }
    setDisallowInternalPeriods(Utils.getFlag('P', options));
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    String[] options = new String[13];
    int current = 0;

    if (getKFused()) {
      options[current++] = "-K";
    }
    if (getDebug()) {
      options[current++] = "-D";
    }
    options[current++] = "-I";
    options[current++] = "" + (getDocumentAtt() + 1);
    options[current++] = "-J";
    options[current++] = "" + (getKeyphrasesAtt() + 1);
    options[current++] = "-M";
    options[current++] = "" + (getMaxPhraseLength());
    options[current++] = "-L";
    options[current++] = "" + (getMinPhraseLength());
    options[current++] = "-O";
    options[current++] = "" + (getMinNumOccur());
    if (getDisallowInternalPeriods()) {
      options[current++] = "-P";
    }
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(8);

    newVector.addElement(new Option(
        "\tSpecifies whether the keyphrase frequency statistic is used.",
        "K", 0, "-K"));
    newVector.addElement(new Option(
        "\tSets the maximum phrase length (default: 3).",
        "M", 1, "-M <length>"));
    newVector.addElement(new Option(
        "\tSets the minimum phrase length (default: 1).",
        "L", 1, "-L <length>"));
    newVector.addElement(new Option(
        "\tTurns debugging mode on.",
        "D", 0, "-D"));
    newVector.addElement(new Option(
        "\tSets the index of the document attribute (default: 0).",
        "I", 1, "-I <index>"));
    newVector.addElement(new Option(
        "\tSets the index of the keyphrase attribute (default: 1).",
        "J", 1, "-J <index>"));
    newVector.addElement(new Option(
        "\tDisallow internal periods.",
        "P", 0, "-P"));
    newVector.addElement(new Option(
        "\tSets the minimum number of occurrences (default: 2).",
        "O", 1, "-O <number>"));
    return newVector.elements();
  }

  /**
   * Returns a string describing this filter.
   *
   * @return a description of the filter suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Converts incoming data into data appropriate for "
        + "keyphrase classification.";
  }
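  // Illustrative example (not part of the original source): the defaults above
  // correspond to the option string "-I 1 -J 2 -M 3 -L 1 -O 2". Configuring
  // the filter programmatically via setOptions() is equivalent to calling the
  // individual setters, e.g.:
  //
  //   KEAFilter filter = new KEAFilter();
  //   filter.setOptions(new String[] {"-K", "-M", "3", "-L", "1", "-O", "2"});
  //
  // Note that -I and -J are 1-based on the command line but stored 0-based.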
  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input
   * instance structure (any instances contained in the object are
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {
    if (instanceInfo.classIndex() >= 0) {
      throw new Exception("Don't know what to do if the class index is set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString()
        || !instanceInfo.attribute(m_DocumentAtt).isString()) {
      throw new Exception("Keyphrase attribute and document attribute "
          + "need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    m_NumbersFilter = new NumbersFilter();
    m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
    super.setInputFormat(m_NumbersFilter.getOutputFormat());
    return false;
  }

  /**
   * Input an instance for filtering. Ordinarily the instance is processed
   * and made available for output immediately. Some filters require all
   * instances to be read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be
   * collected with output().
   * @exception Exception if the input instance was not of the correct
   * format or if there was a problem with the filtering.
   */
  public boolean input(Instance instance) throws Exception {
    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }
    if (m_Debug) {
      System.err.println("-- Reading instance");
    }
    m_PunctFilter.input(instance);
    m_PunctFilter.batchFinished();
    instance = m_PunctFilter.output();
    m_NumbersFilter.input(instance);
    m_NumbersFilter.batchFinished();
    instance = m_NumbersFilter.output();
    if (m_Dictionary == null) {
      bufferInput(instance);
      return false;
    } else {
      FastVector vector = convertInstance(instance, false);
      Enumeration enumeration = vector.elements();
      while (enumeration.hasMoreElements()) {
        Instance inst = (Instance) enumeration.nextElement();
        push(inst);
      }
      return true;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished.
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @exception Exception if no input structure has been defined
   */
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_Dictionary == null) {
      buildGlobalDictionaries();
      buildClassifier();
      convertPendingInstances();
    }
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }
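  // Typical driving loop (illustrative, not part of the original source):
  // during the first batch every instance is buffered, and the dictionaries
  // and classifier are only built once batchFinished() is called:
  //
  //   for (int i = 0; i < data.numInstances(); i++) {
  //     filter.input(data.instance(i));
  //   }
  //   filter.batchFinished();
  //   Instance converted;
  //   while ((converted = filter.output()) != null) {
  //     // one output instance per candidate phrase
  //   }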
  /**
   * Builds the global dictionaries.
   */
  private void buildGlobalDictionaries() throws Exception {
    if (m_Debug) {
      System.err.println("--- Building global dictionaries");
    }

    // Build dictionary of n-grams with associated
    // document frequencies
    m_Dictionary = new HashMap();
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      String str = getInputFormat().instance(i).stringValue(m_DocumentAtt);
      HashMap hash = getPhrasesForDictionary(str);
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String) it.next();
        Counter counter = (Counter) m_Dictionary.get(phrase);
        if (counter == null) {
          m_Dictionary.put(phrase, new Counter());
        } else {
          counter.increment();
        }
      }
    }

    if (m_KFused) {

      // Build dictionary of n-grams that occur as keyphrases
      // with associated keyphrase frequencies
      m_KeyphraseDictionary = new HashMap();
      for (int i = 0; i < getInputFormat().numInstances(); i++) {
        String str = getInputFormat().instance(i).stringValue(m_KeyphrasesAtt);
        HashMap hash = getGivenKeyphrases(str, false);
        if (hash != null) {
          Iterator it = hash.keySet().iterator();
          while (it.hasNext()) {
            String phrase = (String) it.next();
            Counter counter = (Counter) m_KeyphraseDictionary.get(phrase);
            if (counter == null) {
              m_KeyphraseDictionary.put(phrase, new Counter());
            } else {
              counter.increment();
            }
          }
        }
      }
    } else {
      m_KeyphraseDictionary = null;
    }

    // Set the number of documents in the global corpus
    m_NumDocs = getInputFormat().numInstances();
  }

  /**
   * Builds the classifier.
   */
  private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_KFused) {
          atts.addElement(new Attribute("Keyphrase_frequency"));
        }
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
      System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);

      // Get the keyphrases for the document
      String keyphrases = current.stringValue(m_KeyphrasesAtt);
      HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

      // Get the phrases for the document
      HashMap hash = new HashMap();
      int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

      // Compute the feature values for each phrase and
      // add the instance to the data for the classifier
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, true,
            hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(current.weight(), vals);
        m_ClassifierData.add(inst);
      }
    }

    if (m_Debug) {
      System.err.println("--- Building classifier");
    }

    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);
    if (m_Debug) {
      System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
  }
  /**
   * Computes the feature values for a given phrase.
   */
  private double[] featVals(String phrase, FastVector phraseInfo,
      boolean training, HashMap hashKeysEval,
      HashMap hashKeyphrases, int length) {

    // Compute feature values
    Counter counterLocal = (Counter) phraseInfo.elementAt(1);
    double[] newInst = new double[m_NumFeatures + 1];

    // Compute TFxIDF
    Counter counterGlobal = (Counter) m_Dictionary.get(phrase);
    double localVal = counterLocal.value(), globalVal = 0;
    if (counterGlobal != null) {
      globalVal = counterGlobal.value();
      if (training) {
        globalVal = globalVal - 1;
      }
    }

    // Just divide by the length to get an approximation of the probability
    // that a phrase in the document is our phrase
    newInst[m_TfidfIndex] = (localVal / ((double) length))
        * (-Math.log((globalVal + 1) / ((double) m_NumDocs + 1)));

    // Compute first occurrence
    Counter counterFirst = (Counter) phraseInfo.elementAt(0);
    newInst[m_FirstOccurIndex] = (double) counterFirst.value()
        / (double) length;

    // Is the keyphrase frequency attribute being used?
    if (m_KFused) {
      Counter keyphraseC = (Counter) m_KeyphraseDictionary.get(phrase);
      if ((training) && (hashKeyphrases != null)
          && (hashKeyphrases.containsKey(phrase))) {
        newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
      } else {
        if (keyphraseC != null) {
          newInst[m_KeyFreqIndex] = keyphraseC.value();
        } else {
          newInst[m_KeyFreqIndex] = 0;
        }
      }
    }

    // Compute class value
    String phraseInEvalFormat = evalFormat((String) phraseInfo.elementAt(2));
    if (hashKeysEval == null) { // no author-assigned keyphrases
      newInst[m_NumFeatures] = Instance.missingValue();
    } else if (!hashKeysEval.containsKey(phraseInEvalFormat)) {
      newInst[m_NumFeatures] = 0; // Not a keyphrase
    } else {
      hashKeysEval.remove(phraseInEvalFormat);
      newInst[m_NumFeatures] = 1; // Keyphrase
    }
    return newInst;
  }

  /**
   * Sets the output format and converts pending input instances.
   */
  private void convertPendingInstances() throws Exception {
    if (m_Debug) {
      System.err.println("--- Converting pending instances");
    }

    // Create output format for filter
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("N-gram", (FastVector) null));
        atts.addElement(new Attribute("N-gram-original", (FastVector) null));
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_Debug) {
          if (m_KFused) {
            atts.addElement(new Attribute("Keyphrase_frequency"));
          }
        }
        atts.addElement(new Attribute("Probability"));
        atts.addElement(new Attribute("Rank"));
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      } else {
        atts.addElement(getInputFormat().attribute(i));
      }
    }
    Instances outFormat = new Instances("KEAdata", atts, 0);
    setOutputFormat(outFormat);

    // Convert pending input instances into output data
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);
      FastVector vector = convertInstance(current, true);
      Enumeration enumeration = vector.elements();
      while (enumeration.hasMoreElements()) {
        Instance inst = (Instance) enumeration.nextElement();
        push(inst);
      }
    }
  }
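  // Worked example (illustrative, not part of the original source): suppose a
  // phrase occurs 3 times in a 100-word training document and appears in 2 of
  // the 50 corpus documents. During training its own document is discounted
  // (globalVal = 2 - 1 = 1), so featVals() computes
  //
  //   TFxIDF = (3 / 100) * -log((1 + 1) / (50 + 1)) ≈ 0.03 * 3.24 ≈ 0.097
  //
  // and First_occurrence is the word position of the first occurrence divided
  // by 100. Note that Math.log is the natural logarithm here.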
  /**
   * Converts an instance.
   */
  private FastVector convertInstance(Instance instance, boolean training)
      throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
      System.err.println("-- Converting instance");
    }

    // Get the keyphrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
      String keyphrases = instance.stringValue(m_KeyphrasesAtt);
      hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
      if (m_KFused) {
        numFeatures = numFeatures + 1;
      }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
      String phrase = (String) it.next();
      FastVector phraseInfo = (FastVector) hash.get(phrase);
      double[] vals = featVals(phrase, phraseInfo, training,
          hashKeysEval, hashKeyphrases, length);
      Instance inst = new Instance(instance.weight(), vals);
      inst.setDataset(m_ClassifierData);

      // Get probability of the phrase being a keyphrase
      double[] probs = m_Classifier.distributionForInstance(inst);
      double prob = probs[1];

      // Compute attribute values for final instance
      double[] newInst = new double[instance.numAttributes() + numFeatures];
      int pos = 0;
      for (int i = 0; i < instance.numAttributes(); i++) {
        if (i == m_DocumentAtt) {

          // Add phrase
          int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
          newInst[pos++] = index;

          // Add original version
          index = outputFormatPeek().attribute(pos)
              .addStringValue((String) phraseInfo.elementAt(2));
          newInst[pos++] = index;

          // Add TFxIDF
          newInst[pos++] = inst.value(m_TfidfIndex);

          // Add distance
          newInst[pos++] = inst.value(m_FirstOccurIndex);

          // Add other features
          if (m_Debug) {
            if (m_KFused) {
              newInst[pos++] = inst.value(m_KeyFreqIndex);
            }
          }

          // Add probability
          probsAttIndex = pos;
          newInst[pos++] = prob;

          // Set rank to missing (computed below)
          newInst[pos++] = Instance.missingValue();
        } else if (i == m_KeyphrasesAtt) {
          newInst[pos++] = inst.classValue();
        } else {
          newInst[pos++] = instance.value(i);
        }
      }
      Instance ins = new Instance(instance.weight(), newInst);
      ins.setDataset(outputFormatPeek());
      vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
      Iterator phrases = hashKeysEval.keySet().iterator();
      while (phrases.hasNext()) {
        String phrase = (String) phrases.next();
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
          if (i == m_DocumentAtt) {

            // Add phrase
            int index = outputFormatPeek().attribute(pos)
                .addStringValue(phrase);
            newInst[pos++] = (double) index;

            // Add original version
            index = outputFormatPeek().attribute(pos)
                .addStringValue((String) hashKeysEval.get(phrase));
            newInst[pos++] = (double) index;

            // Add TFxIDF
            newInst[pos++] = Instance.missingValue();

            // Add distance
            newInst[pos++] = Instance.missingValue();

            // Add other features
            if (m_Debug) {
              if (m_KFused) {
                newInst[pos++] = Instance.missingValue();
              }
            }

            // Add probability and rank
            newInst[pos++] = -Double.MAX_VALUE;
            newInst[pos++] = Instance.missingValue();
          } else if (i == m_KeyphrasesAtt) {
            newInst[pos++] = 1; // Keyphrase
          } else {
            newInst[pos++] = instance.value(i);
          }
        }
        Instance inst = new Instance(instance.weight(), newInst);
        inst.setDataset(outputFormatPeek());
        vector.addElement(inst);
      }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
      vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their TFxIDF value (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute the rank of each phrase. Check for subphrases that are
    // ranked lower than their superphrases and set their rank to
    // Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
      Instance currentInstance = (Instance) vector.elementAt(i);

      // Short cut: if the phrase is very unlikely, make its rank very low
      // and continue
      if (Utils.grOrEq(vals[i], 1.0)) {
        currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        continue;
      }

      // Otherwise look for a superphrase, starting with the first phrase
      // in the list that has the same probability, TFxIDF value, and
      // distance as the current phrase. We do this to catch all
      // superphrases that tie with the current phrase on these values.
      int startInd = i;
      while (startInd < vals.length) {
        Instance inst = (Instance) vector.elementAt(startInd);
        if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
            || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
            || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
          break;
        }
        startInd++;
      }
      String val = currentInstance.stringValue(phraseAttIndex);
      boolean foundSuperphrase = false;
      for (int j = startInd - 1; j >= 0; j--) {
        if (j != i) {
          Instance candidate = (Instance) vector.elementAt(j);
          String potSuperphrase = candidate.stringValue(phraseAttIndex);
          if (val.length() <= potSuperphrase.length()) {
            if (KEAFilter.contains(val, potSuperphrase)) {
              foundSuperphrase = true;
              break;
            }
          }
        }
      }
      if (foundSuperphrase) {
        currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
      } else {
        currentInstance.setValue(probsAttIndex + 1, rank++);
      }
    }
    return vector;
  }
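  // Example of the subphrase rule above (illustrative): if both "neural
  // networks" and "networks" are candidates and the subphrase "networks"
  // does not score strictly better than its superphrase, it is treated as
  // redundant and its rank is pushed to Integer.MAX_VALUE, so only the
  // superphrase receives a real rank.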
  /**
   * Checks whether one phrase is a subphrase of another phrase.
   */
  private static boolean contains(String sub, String sup) {
    int i = 0;
    while (i + sub.length() - 1 < sup.length()) {
      int j;
      for (j = 0; j < sub.length(); j++) {
        if (sub.charAt(j) != sup.charAt(i + j)) {
          break;
        }
      }
      if (j == sub.length()) {
        if ((i + j) < sup.length()) {
          if (sup.charAt(i + j) == ' ') {
            return true;
          } else {
            return false;
          }
        } else {
          return true;
        }
      }

      // Skip forward to the next space
      do {
        i++;
      } while ((i < sup.length()) && (sup.charAt(i) != ' '));
      i++;
    }
    return false;
  }

  /**
   * Returns a hashtable filled with the stemmed n-grams occurring in
   * the given string (as keys) and the number of times each one occurs.
   */
  private HashMap getPhrasesForDictionary(String str) {
    String[] buffer = new String[m_MaxPhraseLength];
    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
        String word = wordTok.nextToken();

        // Store word in buffer
        for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
          buffer[i] = buffer[i + 1];
        }
        buffer[m_MaxPhraseLength - 1] = word;

        // How many words are buffered?
        numSeen++;
        if (numSeen > m_MaxPhraseLength) {
          numSeen = m_MaxPhraseLength;
        }

        // Don't consider phrases that end with a stop word
        if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
          continue;
        }

        // Loop through the buffer and add phrases to the hashtable
        StringBuffer phraseBuffer = new StringBuffer();
        for (int i = 1; i <= numSeen; i++) {
          if (i > 1) {
            phraseBuffer.insert(0, ' ');
          }
          phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

          // Don't consider phrases that begin with a stop word
          if ((i > 1) && (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
            continue;
          }

          // Only consider phrases with the minimum length
          if (i >= m_MinPhraseLength) {

            // Stem string
            String orig = phraseBuffer.toString();
            String internal = internalFormat(orig);
            Counter count = (Counter) hash.get(internal);
            if (count == null) {
              hash.put(internal, new Counter());
            } else {
              count.increment();
            }

            // Add components if the phrase is a single word before
            // conversion into internal format (i.e. error-correcting)
            /*
            if ((orig.indexOf(' ') == -1) && (internal.indexOf(' ') != -1)) {
              StringTokenizer tokW = new StringTokenizer(internal, " ");
              while (tokW.hasMoreTokens()) {
                String comp = (String) tokW.nextToken();
                Counter countW = (Counter) hash.get(comp);
                if (countW == null) {
                  hash.put(comp, new Counter());
                } else {
                  countW.increment();
                }
              }
            }
            */
          }
        }
      }
    }
    return hash;
  }
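  // Illustrative trace (not part of the original source): with
  // m_MaxPhraseLength = 3, feeding the words "automatic keyphrase extraction"
  // through the sliding buffer above generates the candidate n-grams
  //
  //   "automatic", "keyphrase", "automatic keyphrase", "extraction",
  //   "keyphrase extraction", "automatic keyphrase extraction"
  //
  // before stemming, subject to the stop word and minimum-length checks.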
  /**
   * Expects an empty hashtable. Fills the hashtable with the stemmed
   * n-grams occurring in the given string (as keys). Stores the
   * position, the number of occurrences, and the most commonly
   * occurring original version of each n-gram.
   *
   * N-grams that occur fewer than m_MinNumOccur times are not used.
   *
   * Returns the total number of words (!) in the string.
   */
  private int getPhrases(HashMap hash, String str) {
    String[] buffer = new String[m_MaxPhraseLength];

    StringTokenizer tok = new StringTokenizer(str, "\n");
    int pos = 1;
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
        String word = wordTok.nextToken();

        // Store word in buffer
        for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
          buffer[i] = buffer[i + 1];
        }
        buffer[m_MaxPhraseLength - 1] = word;

        // How many words are buffered?
        numSeen++;
        if (numSeen > m_MaxPhraseLength) {
          numSeen = m_MaxPhraseLength;
        }

        // Don't consider phrases that end with a stop word
        if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
          pos++;
          continue;
        }

        // Loop through the buffer and add phrases to the hashtable
        StringBuffer phraseBuffer = new StringBuffer();
        for (int i = 1; i <= numSeen; i++) {
          if (i > 1) {
            phraseBuffer.insert(0, ' ');
          }
          phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

          // Don't consider phrases that begin with a stop word
          if ((i > 1) && (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
            continue;
          }

          // Only consider phrases with the minimum length
          if (i >= m_MinPhraseLength) {

            // Stem string
            String phrStr = phraseBuffer.toString();
            String internal = internalFormat(phrStr);
            FastVector vec = (FastVector) hash.get(internal);
            if (vec == null) {
              vec = new FastVector(3);

              // HashMap for storing all versions
              HashMap secHash = new HashMap();
              secHash.put(phrStr, new Counter());

              // Update hashtable with all the info
              vec.addElement(new Counter(pos + 1 - i));
              vec.addElement(new Counter());
              vec.addElement(secHash);
              hash.put(internal, vec);
            } else {

              // Update number of occurrences
              ((Counter) ((FastVector) vec).elementAt(1)).increment();

              // Update hashtable storing the different versions
              HashMap secHash = (HashMap) vec.elementAt(2);
              Counter count = (Counter) secHash.get(phrStr);
              if (count == null) {
                secHash.put(phrStr, new Counter());
              } else {
                count.increment();
              }
            }
          }
        }
        pos++;
      }
    }

    // Replace the secondary hashtables with the most commonly occurring
    // version (the canonical form) of each phrase. Delete all phrases
    // that are proper nouns.
    Iterator phrases = hash.keySet().iterator();
    while (phrases.hasNext()) {
      String phrase = (String) phrases.next();
      FastVector info = (FastVector) hash.get(phrase);

      // Occurring fewer than m_MinNumOccur times?
      if (((Counter) ((FastVector) info).elementAt(1)).value() < m_MinNumOccur) {
        phrases.remove();
        continue;
      }

      // Get canonical form
      String canForm = canonicalForm((HashMap) info.elementAt(2));
      if (canForm == null) {
        phrases.remove();
      } else {
        info.setElementAt(canForm, 2);
      }
    }
    return pos;
  }

  /**
   * Creates the canonical form of a phrase.
   */
  private String canonicalForm(HashMap secHash) {
    int max = 0;
    String bestVersion = null;
    boolean allFullyHyphenated = true;
    int num = 0;
    Iterator versions = secHash.keySet().iterator();
    while (versions.hasNext()) {
      num++;
      String version = (String) versions.next();

      // Are all the words joined up?
      if (!isFullyHyphenated(version)) {
        allFullyHyphenated = false;
      }

      // Check how often this version occurs
      Counter count = (Counter) secHash.get(version);
      if (count.value() > max) {
        max = count.value();
        bestVersion = version;
      }
    }
    if ((getCheckForProperNouns()) && (num == 1) && properNoun(bestVersion)) {
      return null;
    } else {
      if (isFullyHyphenated(bestVersion) && !allFullyHyphenated) {
        bestVersion = bestVersion.replace('-', ' ');
      }
      return bestVersion;
    }
  }

  /**
   * Checks whether the given phrase is fully hyphenated.
   */
  private boolean isFullyHyphenated(String str) {
    return (str.indexOf(' ') == -1);
  }

  /**
   * Checks whether the given string is a proper noun.
   *
   * @return true if it is a potential proper noun
   */
  private static boolean properNoun(String str) {

    // Is it more than one word?
    if (str.indexOf(' ') != -1) {
      return false;
    }

    // Does it start with an upper-case character?
    if (Character.isLowerCase(str.charAt(0))) {
      return false;
    }

    // Is there at least one character that's not upper-case?
    for (int i = 1; i < str.length(); i++) {
      if (!Character.isUpperCase(str.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Generates the evaluation format of a phrase.
   */
  private String evalFormat(String str) {
    return m_Stemmer.stemString(str);
  }

  /**
   * Generates the internal format of a phrase.
   */
  private String internalFormat(String str) {

    // Remove some non-alphanumeric characters
    str = str.replace('-', ' ');
    str = str.replace('/', ' ');
    str = str.replace('&', ' ');

    // Stem string
    return m_Stemmer.stemString(str);
  }

  /**
   * Gets all the phrases in the given string and puts them into the
   * hashtable, mapping each stemmed phrase to its original version.
   */
  private HashMap getGivenKeyphrases(String str, boolean forEval) {
    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String orig = tok.nextToken();
      orig = orig.trim();
      if (orig.length() > 0) {
        String modified;
        if (!forEval) {
          modified = internalFormat(orig);
        } else {
          modified = evalFormat(orig);
        }
        if (!hash.containsKey(modified)) {
          hash.put(modified, orig);
        } else {
          if (forEval) {
            System.err.println("WARNING: stem of author-assigned keyphrase "
                + orig + " matches an existing stem (skipping it)!");
          }
        }
      }
    }
    if (hash.size() == 0) {
      return null;
    } else {
      return hash;
    }
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter: use -h for help
   */
  public static void main(String[] argv) {
    try {
      if (Utils.getFlag('b', argv)) {
        Filter.batchFilterFile(new KEAFilter(), argv);
      } else {
        Filter.filterFile(new KEAFilter(), argv);
      }
    } catch (Exception ex) {
      System.out.println(ex.getMessage());
    }
  }
}
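The listing above is the complete filter. As a quick sanity check, here is a minimal driver showing how the filter might be applied. It is a sketch, not part of KEA: the KEAFilterExample class, the attribute names, and the example document are made up for illustration, and it assumes the kea package classes are on the classpath alongside the old Weka 3.x API used above (FastVector, Instance). A realistic run would feed a whole corpus of training documents, since phrases occurring fewer than m_MinNumOccur times are discarded.

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;

import kea.KEAFilter;

public class KEAFilterExample {

  public static void main(String[] args) throws Exception {

    // Two string attributes: document text and newline-separated keyphrases
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keadata", atts, 1);

    // One training document with author-assigned keyphrases
    double[] vals = new double[2];
    vals[0] = data.attribute(0).addStringValue(
        "machine learning methods for automatic keyphrase extraction");
    vals[1] = data.attribute(1).addStringValue(
        "machine learning\nkeyphrase extraction");
    data.add(new Instance(1.0, vals));

    // Run the batch filter: one output instance per candidate phrase,
    // carrying TFxIDF, first occurrence, probability, and rank
    KEAFilter filter = new KEAFilter();
    filter.setInputFormat(data);
    Instances result = Filter.useFilter(data, filter);
    System.out.println(result);
  }
}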