com.openkm.kea.filter.KEAFilter.java Source code

Here is the source code for com.openkm.kea.filter.KEAFilter.java.

Source

/**
 *  OpenKM, Open Document Management System (http://www.openkm.com)
 *  Copyright (c) 2006-2012  Paco Avila & Josep Llort
 *
 *  No bytes were intentionally harmed during the development of this application.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.kea.filter;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import weka.classifiers.Classifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.meta.RegressionByDiscretization;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
import com.openkm.kea.stemmers.SremovalStemmer;
import com.openkm.kea.stemmers.Stemmer;
import com.openkm.kea.stopwords.Stopwords;
import com.openkm.kea.stopwords.StopwordsEnglish;
import com.openkm.kea.util.Counter;
import com.openkm.kea.vocab.Vocabulary;
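
// A minimal usage sketch (not part of the original file; attribute names and
// sample text are illustrative). It runs the filter in free-text mode
// ("none" vocabulary), so no thesaurus needs to be loaded first:
//
//   FastVector atts = new FastVector(2);
//   atts.addElement(new Attribute("document", (FastVector) null));   // string attribute
//   atts.addElement(new Attribute("keyphrases", (FastVector) null)); // string attribute
//   Instances data = new Instances("keaData", atts, 0);
//
//   double[] vals = new double[2];
//   vals[0] = data.attribute(0).addStringValue("text of the document ...");
//   vals[1] = data.attribute(1).addStringValue("first keyphrase\nsecond keyphrase");
//   data.add(new Instance(1.0, vals));
//
//   Stopwords sw = new StopwordsEnglish(/* stopwords file, if the constructor requires one */);
//   KEAFilter filter = new KEAFilter(sw);
//   filter.setVocabulary("none");
//   filter.setInputFormat(data);
//   for (int i = 0; i < data.numInstances(); i++) {
//       filter.input(data.instance(i));
//   }
//   filter.batchFinished();
//   Instance converted;
//   while ((converted = filter.output()) != null) {
//       // each output instance is one candidate phrase with its feature values
//   }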

/**
 * This filter converts the incoming data into data appropriate for
 * keyphrase classification. It assumes that the dataset contains two
 * string attributes. The first attribute should contain the text of a
 * document. The second attribute should contain the keyphrases
 * associated with that document (if present). 
 *
 * The filter converts every instance (i.e. document) into a set of
 * instances, one for each word-based n-gram in the document. The
 * string attribute representing the document is replaced by some
 * numeric features, the estimated probability of each n-gram being a
 * keyphrase, and the rank of this phrase in the document according to
 * the probability. Each new instance also has a class value
 * associated with it. The class is "true" if the n-gram is a true
 * keyphrase, and "false" otherwise. Of course, if the input document
 * doesn't come with author-assigned keyphrases, the class values for
 * that document will be missing.  
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz), Olena Medelyan (olena@cs.waikato.ac.nz)
 * @version 2.0
 */
public class KEAFilter extends Filter implements OptionHandler {
    private static Logger log = LoggerFactory.getLogger(KEAFilter.class);

    /** Serial version UID. */
    private static final long serialVersionUID = 1L;

    /** Index of attribute containing the documents */
    private int m_DocumentAtt = 0;

    /** Index of attribute containing the keyphrases */
    private int m_KeyphrasesAtt = 1;

    /** The maximum length of phrases */
    private int m_MaxPhraseLength = 5;

    /** The minimum length of phrases */
    private int m_MinPhraseLength = 1;

    /** The number of phrases to extract. */
    private int m_numPhrases = 10;

    /** Experimental! 
     * Number of human indexers (times a keyphrase appears in the keyphrase set) */
    // adjust manually for >1 indexer
    private int m_Indexers = 1;

    /** Should non-descriptors be replaced by corresponding descriptors? */
    private boolean m_DESCRreplace = true;

    /** Is the node degree (number of related terms in candidate set) being used? */
    public boolean m_NODEfeature = true;

    /** Is the length of a phrase in words being used?*/
    private boolean m_LENGTHfeature = true;

    /** Experimental feature!
     * If m_STDEVused = true, should the standard deviation of position of phrase occurrences be considered? 
     * If set to true, the indices of features need to be adjusted in the code manually!
     * 
     */
    private boolean m_STDEVfeature = false;

    /** Experimental feature!
     * Is keyphrase frequency attribute being used? 
     * If set to true, adjust the indices in the code!*/
    private boolean m_KFused = false;

    // end. Don't use these features with m_KFused or adjust indices below.

    /** Flag for debugging mode */
    private boolean m_Debug = false;

    /** Determines whether internal periods are allowed */
    private boolean m_DisallowInternalPeriods = false;

    /** The minimum number of occurrences of a phrase */
    private int m_MinNumOccur = 2;

    /** The number of features describing a phrase */
    private int m_NumFeatures = 2;

    /** Indices of attributes in m_ClassifierData */
    private int m_TfidfIndex = 0;
    private int m_FirstOccurIndex = 1;

    /** Indices of attributes for new features */

    private int m_LengthIndex = 2;// adjust!!
    private int m_NodeIndex = 3; // decrease if removing the above value
    private int m_STDEVIndex = 4; // adjust!!
    private int m_KeyFreqIndex = 3;

    /** The punctuation filter used by this filter */
    private KEAPhraseFilter m_PunctFilter = null;

    /** The numbers filter used by this filter */
    private NumbersFilter m_NumbersFilter = null;

    /** The actual classifier used to compute probabilities */
    private Classifier m_Classifier = null;

    /** The dictionary containing the document frequencies */
    public HashMap<String, Counter> m_Dictionary = null;

    /** The dictionary containing the keyphrases */
    private HashMap<String, Counter> m_KeyphraseDictionary = null;

    /** The number of documents in the global frequencies corpus */
    private int m_NumDocs = 0;

    /** Template for the classifier data */
    private Instances m_ClassifierData = null;

    /** The default stemmer to be used */
    private Stemmer m_Stemmer = new SremovalStemmer();

    /** The list of stop words to be used */
    private Stopwords m_Stopwords;

    /** The default language to be used */
    private String m_documentLanguage = "en";

    public KEAFilter(Stopwords m_Stopwords) {
        this.m_Stopwords = m_Stopwords;
    }

    /**
     * The Vocabulary object.
     * Originally static; changed to non-static so we can have multiple filters running.
     */
    public Vocabulary m_Vocabulary;

    /**
     * New method to set the Vocabulary to null
     */
    public void clearVocabulary() {
        m_Vocabulary = null;
    }

    /** The Vocabulary name */
    private String m_vocabulary = "agrovoc";

    /** The Vocabulary format */
    private String m_vocabularyFormat = "skos";

    /**
     * Get the vocabulary name.
     * @return the vocabulary name.
     */
    public String getVocabulary() {
        return m_vocabulary;
    }

    /**
     * Set the vocabulary name.
     * @param newM_Vocabulary the new vocabulary name.
     */
    public void setVocabulary(String newM_Vocabulary) {
        this.m_vocabulary = newM_Vocabulary;
    }

    /**
     * Get the vocabulary format.
     * @return the vocabulary format.
     */
    public String getVocabularyFormat() {
        return m_vocabularyFormat;
    }

    /**
     * Set the vocabulary format.
     * @param newM_VocabularyFormat the new vocabulary format.
     */
    public void setVocabularyFormat(String newM_VocabularyFormat) {
        this.m_vocabularyFormat = newM_VocabularyFormat;
    }

    /**
     * Get the document language.
     * @return the document language.
     */
    public String getDocumentLanguage() {
        return m_documentLanguage;
    }

    /**
     * Set the document language.
     * @param newM_documentLanguage the new document language.
     */
    public void setDocumentLanguage(String newM_documentLanguage) {
        this.m_documentLanguage = newM_documentLanguage;
    }

    /** Determines whether check for proper nouns is performed */
    private boolean m_CheckForProperNouns = true;

    /**
     * Get whether the proper-noun check is performed.
     * @return true if candidate phrases are checked for proper nouns.
     */
    public boolean getCheckForProperNouns() {
        return m_CheckForProperNouns;
    }

    /**
     * Set whether the proper-noun check is performed.
     * @param newM_CheckProperNouns the new proper-noun check setting.
     */
    public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
        this.m_CheckForProperNouns = newM_CheckProperNouns;
    }

    /**
     * Get the stopwords list.
     * @return the stopwords list.
     */
    public Stopwords getStopwords() {
        return m_Stopwords;
    }

    /**
     * Set the stopwords list.
     * @param newM_Stopwords the new stopwords list.
     */
    public void setStopwords(Stopwords newM_Stopwords) {
        this.m_Stopwords = newM_Stopwords;
    }

    /**
     * Get the Stemmer value.
     * @return the Stemmer value.
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Set the Stemmer value.
     * @param newStemmer The new Stemmer value.
     */
    public void setStemmer(Stemmer newStemmer) {
        this.m_Stemmer = newStemmer;
    }

    /**
     * Get the value of MinNumOccur.
     *
     * @return Value of MinNumOccur.
     */
    public int getMinNumOccur() {
        return m_MinNumOccur;
    }

    /**
     * Set the value of MinNumOccur.
     *
     * @param newMinNumOccur Value to assign to MinNumOccur.
     */
    public void setMinNumOccur(int newMinNumOccur) {
        m_MinNumOccur = newMinNumOccur;
    }

    /**
     * Get the value of MaxPhraseLength.
     *
     * @return Value of MaxPhraseLength.
     */
    public int getMaxPhraseLength() {
        return m_MaxPhraseLength;
    }

    /**
     * Set the value of MaxPhraseLength.
     *
     * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
     */
    public void setMaxPhraseLength(int newMaxPhraseLength) {
        m_MaxPhraseLength = newMaxPhraseLength;
    }

    /**
     * Get the value of MinPhraseLength.
     *
     * @return Value of MinPhraseLength.
     */
    public int getMinPhraseLength() {
        return m_MinPhraseLength;
    }

    /**
     * Set the value of MinPhraseLength.
     *
     * @param newMinPhraseLength Value to assign to MinPhraseLength.
     */
    public void setMinPhraseLength(int newMinPhraseLength) {
        m_MinPhraseLength = newMinPhraseLength;
    }

    /**
     * Get the value of numPhrases.
     *
     * @return Value of numPhrases.
     */
    public int getNumPhrases() {
        return m_numPhrases;
    }

    /**
     * Set the value of numPhrases.
     *
     * @param newnumPhrases Value to assign to numPhrases.
     */
    public void setNumPhrases(int newnumPhrases) {
        m_numPhrases = newnumPhrases;
    }

    /**
     * Returns the index of the stemmed phrases in the output ARFF file.
     */
    public int getStemmedPhraseIndex() {
        return m_DocumentAtt;
    }

    /**
     * Returns the index of the unstemmed phrases in the output ARFF file.
     */
    public int getUnstemmedPhraseIndex() {
        return m_DocumentAtt + 1;
    }

    /**
     * Returns the index of the phrases' probabilities in the output ARFF file.
     */
    public int getProbabilityIndex() {
        int index = m_DocumentAtt + 4;

        if (m_Debug) {
            if (m_KFused) {
                index++;
            }
        }
        if (m_STDEVfeature) {
            index++;
        }
        if (m_NODEfeature) {
            index++;
        }
        if (m_LENGTHfeature) {
            index++;
        }

        return index;
    }

    /**
     * Returns the index of the phrases' ranks in the output ARFF file.
     */
    public int getRankIndex() {
        return getProbabilityIndex() + 1;
    }
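
    // For orientation, the attributes produced for each phrase in the output
    // are laid out as follows (optional features appear only when the
    // corresponding flags are set; see convertPendingInstances()):
    //
    //   m_DocumentAtt + 0 : N-gram (stemmed)
    //   m_DocumentAtt + 1 : N-gram-original
    //   m_DocumentAtt + 2 : TFxIDF
    //   m_DocumentAtt + 3 : First_occurrence
    //   [Keyphrase_frequency, Standard_deviation, Relations_number, Phrase_length]
    //   followed by Probability and Rank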

    /**
     * Get the value of DocumentAtt.
     *
     * @return Value of DocumentAtt.
     */
    public int getDocumentAtt() {
        return m_DocumentAtt;
    }

    /**
     * Set the value of DocumentAtt.
     *
     * @param newDocumentAtt Value to assign to DocumentAtt.
     */
    public void setDocumentAtt(int newDocumentAtt) {
        m_DocumentAtt = newDocumentAtt;
    }

    /**
     * Get the value of KeyphraseAtt.
     *
     * @return Value of KeyphraseAtt.
     */
    public int getKeyphrasesAtt() {
        return m_KeyphrasesAtt;
    }

    /**
     * Set the value of KeyphrasesAtt.
     *
     * @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
     */
    public void setKeyphrasesAtt(int newKeyphrasesAtt) {
        m_KeyphrasesAtt = newKeyphrasesAtt;
    }

    /**
     * Get the value of Debug.
     *
     * @return Value of Debug.
     */
    public boolean getDebug() {
        return m_Debug;
    }

    /**
     * Set the value of Debug.
     *
     * @param newDebug Value to assign to Debug.
     */
    public void setDebug(boolean newDebug) {
        m_Debug = newDebug;
    }

    /**
     * Sets whether keyphrase frequency attribute is used.
     */
    public void setKFused(boolean flag) {
        m_KFused = flag;
        if (flag) {
            m_NumFeatures++;
        }
    }

    /**
     * Increments the feature count for each optional feature that is enabled.
     */
    public void setNumFeature() {
        if (m_STDEVfeature) {
            m_NumFeatures++;
        }
        if (m_NODEfeature) {
            m_NumFeatures++;
        }
        if (m_LENGTHfeature) {
            m_NumFeatures++;
        }
    }

    /**
     * Gets whether keyphrase frequency attribute is used.
     */
    public boolean getKFused() {
        return m_KFused;
    }

    /**
     * Gets whether internal periods are disallowed in candidate phrases.
     *
     * @return true if phrases containing internal periods are excluded
     */
    public boolean getDisallowInternalPeriods() {
        return m_DisallowInternalPeriods;
    }

    /**
     * Sets whether internal periods are disallowed in candidate phrases.
     *
     * @param disallow true to exclude phrases containing internal periods
     */
    public void setDisallowInternalPeriods(boolean disallow) {
        m_DisallowInternalPeriods = disallow;
    }

    public void loadThesaurus(Stemmer st, Stopwords sw) {
        m_Vocabulary = new Vocabulary(m_vocabulary, m_vocabularyFormat, m_documentLanguage);

        m_Vocabulary.setStemmer(st);
        m_Vocabulary.setStopwords(sw);
        m_Vocabulary.initialize();
        try {

            if (m_DESCRreplace) {
                m_Vocabulary.buildUSE();
            }
            if (m_NODEfeature) {
                m_Vocabulary.buildREL();
            }
        } catch (Exception e) {
            log.error("Error building vocabulary structures", e);
        }
    }
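
    // Hedged sketch: typical setup when a controlled vocabulary is used
    // (the vocabulary name and format values below are illustrative):
    //
    //   filter.setVocabulary("agrovoc");
    //   filter.setVocabularyFormat("skos");
    //   filter.setDocumentLanguage("en");
    //   filter.loadThesaurus(filter.getStemmer(), filter.getStopwords());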

    /**
     * Parses a given list of options controlling the behaviour of this object.
     * Valid options are:<p>
     *
     * -K<br>
     * Specifies whether keyphrase frequency statistic is used.<p>
     *
     * -R<br>
     * Specifies whether Vocabulary relation statistic is used.<p>
     *
     * -M length<br>
     * Sets the maximum phrase length (default: 5).<p>
     *
     * -L length<br>
     * Sets the minimum phrase length (default: 1).<p>
     *
     * -D<br>
     * Turns debugging mode on.<p>
     *
     * -I index<br>
     * Sets the index of the attribute containing the documents (default: 0).<p>
     *
     * -J index<br>
     * Sets the index of the attribute containing the keyphrases (default: 1).<p>
     *
     * -P<br>
     * Disallow internal periods <p>
     *
     * -O number<br>
     * The minimum number of times a phrase needs to occur (default: 2). <p>
     *
     * @param options the list of options as an array of strings
     * @exception Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {

        setKFused(Utils.getFlag('K', options));
        setDebug(Utils.getFlag('D', options));
        String docAttIndexString = Utils.getOption('I', options);
        if (docAttIndexString.length() > 0) {
            setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
        } else {
            setDocumentAtt(0);
        }
        String keyphraseAttIndexString = Utils.getOption('J', options);
        if (keyphraseAttIndexString.length() > 0) {
            setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
        } else {
            setKeyphrasesAtt(1);
        }
        String maxPhraseLengthString = Utils.getOption('M', options);
        if (maxPhraseLengthString.length() > 0) {
            setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
        } else {
            setMaxPhraseLength(5);
        }
        String minPhraseLengthString = Utils.getOption('L', options);
        if (minPhraseLengthString.length() > 0) {
            setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
        } else {
            setMinPhraseLength(1);
        }
        String minNumOccurString = Utils.getOption('O', options);
        if (minNumOccurString.length() > 0) {
            setMinNumOccur(Integer.parseInt(minNumOccurString));
        } else {
            setMinNumOccur(2);
        }
        setDisallowInternalPeriods(Utils.getFlag('P', options));
    }
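
    // Illustrative call (the option values are examples only): enable the
    // keyphrase-frequency feature, allow phrases of one to five words, and
    // require at least two occurrences:
    //
    //   filter.setOptions(new String[] {"-K", "-M", "5", "-L", "1", "-O", "2"});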

    /**
     * Gets the current settings of the filter.
     *
     * @return an array of strings suitable for passing to setOptions
     */
    public String[] getOptions() {

        String[] options = new String[13];
        int current = 0;

        if (getKFused()) {
            options[current++] = "-K";
        }
        if (getDebug()) {
            options[current++] = "-D";
        }
        options[current++] = "-I";
        options[current++] = "" + (getDocumentAtt() + 1);
        options[current++] = "-J";
        options[current++] = "" + (getKeyphrasesAtt() + 1);
        options[current++] = "-M";
        options[current++] = "" + (getMaxPhraseLength());
        options[current++] = "-L";
        options[current++] = "" + (getMinPhraseLength());
        options[current++] = "-O";
        options[current++] = "" + (getMinNumOccur());

        if (getDisallowInternalPeriods()) {
            options[current++] = "-P";
        }

        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    /**
     * Returns an enumeration describing the available options
     *
     * @return an enumeration of all the available options
     */
    public Enumeration<Option> listOptions() {

        Vector<Option> newVector = new Vector<Option>(7);

        newVector
                .addElement(new Option("\tSpecifies whether keyphrase frequency statistic is used.", "K", 0, "-K"));
        newVector.addElement(new Option("\tSets the maximum phrase length (default: 3).", "M", 1, "-M <length>"));
        newVector.addElement(new Option("\tSets the minimum phrase length (default: 1).", "L", 1, "-L <length>"));
        newVector.addElement(new Option("\tTurns debugging mode on.", "D", 0, "-D"));
        newVector.addElement(new Option("\tSets the index of the document attribute (default: 0).", "I", 1, "-I"));
        newVector.addElement(new Option("\tSets the index of the keyphrase attribute (default: 1).", "J", 1, "-J"));
        newVector.addElement(new Option("\tDisallow internal periods.", "P", 0, "-P"));
        newVector.addElement(new Option("\tSet the minimum number of occurences (default: 2).", "O", 1, "-O"));

        return newVector.elements();
    }

    /**
     * Returns a string describing this filter
     *
     * @return a description of the filter suitable for
     * displaying in the explorer/experimenter gui
     */
    public String globalInfo() {
        return "Converts incoming data into data appropriate for " + "keyphrase classification.";
    }

    /**
     * Sets the format of the input instances.
     *
     * @param instanceInfo an Instances object containing the input
     * instance structure (any instances contained in the object are
     * ignored - only the structure is required).
     * @return true if the outputFormat may be collected immediately 
     */
    public boolean setInputFormat(Instances instanceInfo) throws Exception {

        if (instanceInfo.classIndex() >= 0) {
            throw new Exception("Don't know what do to if class index set!");
        }
        if (!instanceInfo.attribute(m_KeyphrasesAtt).isString()
                || !instanceInfo.attribute(m_DocumentAtt).isString()) {
            throw new Exception("Keyphrase attribute and document attribute " + "need to be string attributes.");
        }
        m_PunctFilter = new KEAPhraseFilter();
        int[] arr = new int[1];
        arr[0] = m_DocumentAtt;
        m_PunctFilter.setAttributeIndicesArray(arr);
        m_PunctFilter.setInputFormat(instanceInfo);
        m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());

        if (m_vocabulary.equals("none")) {
            m_NumbersFilter = new NumbersFilter();
            m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
            super.setInputFormat(m_NumbersFilter.getOutputFormat());
        } else {
            super.setInputFormat(m_PunctFilter.getOutputFormat());
        }

        return false;

    }

    /**
     * Returns the Capabilities of this filter.
     *
     * @return            the capabilities of this object
     * @see               Capabilities
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();

        // attributes
        result.enableAllAttributes();
        result.enable(Capability.MISSING_VALUES);

        // class
        result.enable(Capability.NOMINAL_CLASS);
        result.enable(Capability.NO_CLASS);
        result.enableAllClasses();

        // result.or(new LinearRegression().getCapabilities());

        return result;
    }

    /**
     * Input an instance for filtering. Ordinarily the instance is processed
     * and made available for output immediately. Some filters require all
     * instances be read before producing output.
     *
     * @param instance the input instance
     * @return true if the filtered instance may now be
     * collected with output().
     * @exception Exception if the input instance was not of the correct 
     * format or if there was a problem with the filtering.
     */
    @SuppressWarnings("unchecked")
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new Exception("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }

        if (m_Debug) {
            log.info("-- Reading instance");
        }

        m_PunctFilter.input(instance);
        m_PunctFilter.batchFinished();
        instance = m_PunctFilter.output();

        if (m_vocabulary.equals("none")) {
            m_NumbersFilter.input(instance);
            m_NumbersFilter.batchFinished();
            instance = m_NumbersFilter.output();
        }

        if (m_Dictionary == null) {
            bufferInput(instance);
            return false;
        } else {
            FastVector vector = convertInstance(instance, false);
            Enumeration<Instance> en = vector.elements();
            while (en.hasMoreElements()) {
                Instance inst = en.nextElement();
                push(inst);
            }
            return true;
        }

    }

    /**
     * Signify that this batch of input to the filter is finished. 
     * If the filter requires all instances prior to filtering,
     * output() may now be called to retrieve the filtered instances.
     *
     * @return true if there are instances pending output
     * @exception Exception if no input structure has been defined
     */
    public boolean batchFinished() throws Exception {

        if (getInputFormat() == null) {
            throw new Exception("No input instance format defined");
        }

        if (m_Dictionary == null) {
            buildGlobalDictionaries();
            buildClassifier();
            convertPendingInstances();
        }
        flushInput();
        m_NewBatch = true;
        return (numPendingOutput() != 0);
    }

    /**
     * Builds the global dictionaries.
     */
    public void buildGlobalDictionaries() throws Exception {
        if (m_Debug) {
            log.info("--- Building global dictionaries");
        }

        // Build dictionary of n-grams with associated
        // document frequencies
        m_Dictionary = new HashMap<String, Counter>();
        for (int i = 0; i < getInputFormat().numInstances(); i++) {
            String str = getInputFormat().instance(i).stringValue(m_DocumentAtt);
            HashMap<String, Counter> hash = getPhrasesForDictionary(str);
            Iterator<String> it = hash.keySet().iterator();
            while (it.hasNext()) {
                String phrase = it.next();
                Counter counter = (Counter) m_Dictionary.get(phrase);
                if (counter == null) {
                    m_Dictionary.put(phrase, new Counter());
                } else {
                    counter.increment();
                }
            }
        }

        if (m_KFused) {
            if (m_Debug) {
                log.info("KF_used feature");
            }

            // Build dictionary of n-grams that occur as keyphrases
            // with associated keyphrase frequencies
            m_KeyphraseDictionary = new HashMap<String, Counter>();
            for (int i = 0; i < getInputFormat().numInstances(); i++) {
                String str = getInputFormat().instance(i).stringValue(m_KeyphrasesAtt);
                HashMap<String, Counter> hash = getGivenKeyphrases(str, false);
                if (hash != null) {
                    Iterator<String> it = hash.keySet().iterator();
                    while (it.hasNext()) {
                        String phrase = it.next();
                        Counter counter = m_KeyphraseDictionary.get(phrase);
                        if (counter == null) {
                            m_KeyphraseDictionary.put(phrase, new Counter());
                        } else {
                            counter.increment();
                        }
                    }
                }
            }
        } else {
            m_KeyphraseDictionary = null;
        }

        // Set the number of documents in the global corpus
        m_NumDocs = getInputFormat().numInstances();
    }

    /**
     * Builds the classifier.
     */
    // aly: The main function, where everything important happens
    private void buildClassifier() throws Exception {
        // Generate input format for classifier
        FastVector atts = new FastVector();
        for (int i = 0; i < getInputFormat().numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                atts.addElement(new Attribute("TFxIDF"));
                atts.addElement(new Attribute("First_occurrence"));
                if (m_KFused) {
                    atts.addElement(new Attribute("Keyphrase_frequency"));
                }
                if (m_STDEVfeature) {
                    atts.addElement(new Attribute("Standard_deviation"));
                }
                if (m_NODEfeature) {
                    atts.addElement(new Attribute("Relations_number"));
                }
                if (m_LENGTHfeature) {
                    atts.addElement(new Attribute("Phrase_length"));
                }
            } else if (i == m_KeyphrasesAtt) {
                FastVector vals = new FastVector(2);
                vals.addElement("False");
                vals.addElement("True");
                //atts.addElement(new Attribute("Keyphrase?", vals));
                atts.addElement(new Attribute("Keyphrase?"));
            }
        }
        m_ClassifierData = new Instances("ClassifierData", atts, 0);
        m_ClassifierData.setClassIndex(m_NumFeatures);

        if (m_Debug) {
            log.info("--- Converting instances for classifier");
        }
        // Convert pending input instances into data for classifier
        for (int i = 0; i < getInputFormat().numInstances(); i++) {
            Instance current = getInputFormat().instance(i);

            // Get the key phrases for the document
            String keyphrases = current.stringValue(m_KeyphrasesAtt);
            HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases, false);
            HashMap<String, Counter> hashKeysEval = getGivenKeyphrases(keyphrases, true);

            // Get the phrases for the document
            HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
            int length = getPhrases(hash, current.stringValue(m_DocumentAtt));
            // hash = getComposits(hash);

            // Compute the feature values for each phrase and
            // add the instance to the data for the classifier

            Iterator<String> it = hash.keySet().iterator();
            while (it.hasNext()) {
                String phrase = it.next();
                FastVector phraseInfo = (FastVector) hash.get(phrase);

                double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length, hash);
                //log.info(vals);
                Instance inst = new Instance(current.weight(), vals);
                // System.err.println(phrase + "\t" + inst.toString());
                m_ClassifierData.add(inst);
            }
        }

        if (m_Debug) {
            log.info("--- Building classifier");
        }

        // Build classifier

        // Uncomment if you want to use a different classifier
        // Caution: Other places in the code will have to be adjusted!!
        /* I. Naive Bayes:
         FilteredClassifier fclass = new FilteredClassifier();
         fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
         fclass.setFilter(new Discretize());
         m_Classifier = fclass;
         */

        //NaiveBayes nb = new NaiveBayes();
        //nb.setUseSupervisedDiscretization(true);
        //m_Classifier = nb;

        /* II. Linear Regression:
         LinearRegression lr = new LinearRegression();   
         lr.setAttributeSelectionMethod(new 
         weka.core.SelectedTag(1, LinearRegression.TAGS_SELECTION));
         lr.setEliminateColinearAttributes(false);
         lr.setDebug(false);
             
         m_Classifier = lr;*/

        /* III. Bagging with REPTrees:
         Bagging bagging = new Bagging();

         String[] ops_bagging = {
             "-P", "100",
             "-S", "1",
             "-I", "50"};
         */

        /* REPTree (results are worse!):
         REPTree rept = new REPTree();
         rept.setNoPruning(true);
         String[] ops_rept = {
             "-M", "2",
             "-V", "0.0010",
             "-N", "3",
             "-S", "1",
             "-L", "1"};

         rept.setOptions(ops_rept);
         bagging.setClassifier(rept);
         */

        //   bagging.setOptions(ops_bagging);
        //FilteredClassifier fclass = new FilteredClassifier();      
        //fclass.setClassifier(new REPTree());
        //fclass.setFilter(new Discretize());
        //bagging.setClassifier(fclass);
        //   m_Classifier = bagging;

        RegressionByDiscretization rvd = new RegressionByDiscretization();
        FilteredClassifier fclass = new FilteredClassifier();
        fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
        fclass.setFilter(new Discretize());

        rvd.setClassifier(fclass);
        rvd.setNumBins(m_Indexers + 1);
        m_Classifier = rvd;

        // log.info(m_ClassifierData);   
        //System.exit(1);
        m_Classifier.buildClassifier(m_ClassifierData);

        if (m_Debug) {
            log.info("" + m_Classifier);
        }

        // Save space
        m_ClassifierData = new Instances(m_ClassifierData, 0);
    }

    /** 
     * Computes the feature values for a given phrase.
     */
    private double[] featVals(String id, FastVector phraseInfo, boolean training,
            HashMap<String, Counter> hashKeysEval, HashMap<String, Counter> hashKeyphrases, int length,
            HashMap<String, FastVector> hash) {

        // Compute feature values
        Counter counterLocal = (Counter) phraseInfo.elementAt(1);
        double[] newInst = new double[m_NumFeatures + 1];

        // Compute TFxIDF
        Counter counterGlobal = (Counter) m_Dictionary.get(id);
        double localVal = counterLocal.value(), globalVal = 0;
        if (counterGlobal != null) {
            globalVal = counterGlobal.value();
            if (training) {
                globalVal = globalVal - 1;
            }
        }

        // Just divide by length to get an approximation of the probability
        // that a phrase in the document is our phrase
        // newInst[m_TfidfIndex] = (localVal / ((double)length));
        newInst[m_TfidfIndex] = (localVal / ((double) length))
                * (-Math.log((globalVal + 1) / ((double) m_NumDocs + 1)));
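
        // Worked example with illustrative numbers: a phrase seen 3 times in a
        // 100-word document, with an (adjusted) global frequency of 4 out of
        // m_NumDocs = 200 documents, scores
        //   (3 / 100) * -ln((4 + 1) / (200 + 1)) ~= 0.03 * 3.69 ~= 0.11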

        // Compute first occurrence
        Counter counterFirst = (Counter) phraseInfo.elementAt(0);
        newInst[m_FirstOccurIndex] = (double) counterFirst.value() / (double) length;

        // Is keyphrase frequency attribute being used?
        if (m_KFused) {
            Counter keyphraseC = (Counter) m_KeyphraseDictionary.get(id);
            if ((training) && (hashKeyphrases != null) && (hashKeyphrases.containsKey(id))) {
                newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
            } else {
                if (keyphraseC != null) {
                    newInst[m_KeyFreqIndex] = keyphraseC.value();
                } else {
                    newInst[m_KeyFreqIndex] = 0;
                }
            }
        }

        // Is term appearance attribute being used?
        if (m_STDEVfeature) {
            FastVector app = (FastVector) phraseInfo.elementAt(3);

            double[] vals = new double[app.size()];
            for (int i = 0; i < vals.length; i++) {
                vals[i] = ((Counter) app.elementAt(i)).value() / (double) length;
            }

            double mean = Utils.mean(vals);
            double summ = 0.0;
            for (int i = 0; i < vals.length; i++) {
                double a = vals[i];
                //log.info("Appearence " + i + " is at " + a);
                summ += (a - mean) * (a - mean);
            }
            double stdev = Math.sqrt(summ / (double) app.size());

            newInst[m_STDEVIndex] = stdev;

            /* Using a thesaurus-based feature instead of the STDEV feature (experiment):
             if (m_Vocabulary.getRelated(id, "compositeOf") != null) {
                 //log.info(m_Vocabulary.getOrig(id) + " is a composite!");
                 newInst[m_STDEVIndex] = 1.0;
             } else {
                 newInst[m_STDEVIndex] = 0.0;
             }
             */

        }

        // Is node degree attribute being used?   
        if (m_NODEfeature) {

            Vector<String> idsRT = m_Vocabulary.getRelated(id);

            int intern = 0;
            if (idsRT != null) {
                for (int d = 0; d < idsRT.size(); d++) {
                    if (hash.get(idsRT.elementAt(d)) != null) {
                        intern++;
                    }
                }
            }
            // log.info("Node feature for " + m_Vocabulary.getOrig(id) + " = " + intern);

            newInst[m_NodeIndex] = (double) intern;

        }

        // Is term length attribute being used?
        if (m_LENGTHfeature) {
            String original;
            if (m_vocabulary.equals("none")) {
                original = id;
            } else {
                original = m_Vocabulary.getOrig(id);
            }
            if (original == null) {
                log.info("problem with id " + id);
                newInst[m_LengthIndex] = 1.0;
            } else {
                String[] words = split(original, " ");
                newInst[m_LengthIndex] = (double) words.length;
            }

        }

        // Compute class value

        if (hashKeysEval == null) { // no author-assigned keyphrases
            newInst[m_NumFeatures] = Instance.missingValue();
        } else if (!hashKeysEval.containsKey(id)) {

            newInst[m_NumFeatures] = 0; // Not a keyphrase

            // Experiment with giving phrases related to manually chosen one
            // higher values than to unrelated ones
            /*
             Vector related = (Vector) m_Vocabulary.getRelated(id);
             // if this id is related to one of the keyphrases, set its class value to 0.5
             if (related != null) {
                 Enumeration en = related.elements();
                 while (en.hasMoreElements()) {
                     String relID = (String) en.nextElement();
                     if (hashKeysEval.containsKey(relID)) {
                         newInst[m_NumFeatures] = 1; // Keyphrase
                     }
                 }
             }
             */

        } else {
            //hashKeysEval.remove(id);
            //newInst[m_NumFeatures] = 1; // Keyphrase

            // Learning from multiple-indexer's data
            // log.info(m_Indexers);
            // log.info("Calculating class value with m_Indexers = " + m_Indexers);

            double c = (double) ((Counter) hashKeysEval.get(id)).value() / m_Indexers;
            newInst[m_NumFeatures] = c; // Keyphrase

            // Or simple learning from 1 indexer:
            // newInst[m_NumFeatures] = 1.0; // Keyphrase
        }
        return newInst;
    }

    /**
     * Sets output format and converts pending input instances.
     */
    @SuppressWarnings("unchecked")
    private void convertPendingInstances() throws Exception {

        if (m_Debug) {
            log.info("--- Converting pending instances");
        }

        // Create output format for filter
        FastVector atts = new FastVector();
        for (int i = 0; i < getInputFormat().numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // string attributes
                atts.addElement(new Attribute("N-gram", (FastVector) null));
                atts.addElement(new Attribute("N-gram-original", (FastVector) null));
                // numeric attributes
                atts.addElement(new Attribute("TFxIDF"));
                atts.addElement(new Attribute("First_occurrence"));
                // optional attributes
                if (m_Debug) {
                    if (m_KFused) {
                        atts.addElement(new Attribute("Keyphrase_frequency"));
                    }
                }
                if (m_STDEVfeature) {
                    //FastVector rvals = new FastVector(2);
                    //rvals.addElement("False");
                    //rvals.addElement("True");
                    atts.addElement(new Attribute("Standard_deviation"));
                }
                if (m_NODEfeature) {
                    atts.addElement(new Attribute("Relations_number"));
                }
                if (m_LENGTHfeature) {
                    atts.addElement(new Attribute("Phrase_length"));
                }

                atts.addElement(new Attribute("Probability"));
                atts.addElement(new Attribute("Rank"));
            } else if (i == m_KeyphrasesAtt) {
                FastVector vals = new FastVector(2);
                vals.addElement("False");
                vals.addElement("True");
                //atts.addElement(new Attribute("Keyphrase?", vals));
                atts.addElement(new Attribute("Keyphrase?"));
            } else {
                atts.addElement(getInputFormat().attribute(i));
            }
        }
        Instances outFormat = new Instances("KEAdata", atts, 0);
        setOutputFormat(outFormat);

        // Convert pending input instances into output data
        for (int i = 0; i < getInputFormat().numInstances(); i++) {
            Instance current = getInputFormat().instance(i);
            FastVector vector = convertInstance(current, true);
            Enumeration<Instance> en = vector.elements();
            while (en.hasMoreElements()) {
                Instance inst = en.nextElement();
                push(inst);
            }
        }
    }

    /**
     * Converts an instance.
     */
    private FastVector convertInstance(Instance instance, boolean training) throws Exception {

        FastVector vector = new FastVector();

        if (m_Debug) {
            log.info("-- Converting instance");
        }

        // Get the key phrases for the document
        HashMap<String, Counter> hashKeyphrases = null;
        HashMap<String, Counter> hashKeysEval = null;
        if (!instance.isMissing(m_KeyphrasesAtt)) {
            String keyphrases = instance.stringValue(m_KeyphrasesAtt);
            hashKeyphrases = getGivenKeyphrases(keyphrases, false);
            hashKeysEval = getGivenKeyphrases(keyphrases, true);
        }

        // Get the phrases for the document
        HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
        int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
        //   hash = getComposits(hash);

        /* Experimental:
         To compute how many of the manual keyphrases appear in the documents:

         log.info("Doc phrases found " + hash.size());
         log.info("Manual keyphrases: ");
         Iterator iter = hashKeyphrases.keySet().iterator();
         int count = 0;
         while (iter.hasNext()) {
             String id = (String) iter.next();
             if (hash.containsKey(id)) {
                 count++;
             }
         }

         double max_recall = (double) count / (double) hashKeyphrases.size();

         m_max_recall += max_recall;
         doc++;
         double avg_m_max_recall = m_max_recall / (double) doc;

         String file = instance.stringValue(2);
         log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
         log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
         */

        // Compute number of extra attributes
        int numFeatures = 5;
        if (m_Debug) {
            if (m_KFused) {
                numFeatures = numFeatures + 1;
            }
        }
        if (m_STDEVfeature) {
            numFeatures = numFeatures + 1;
        }
        if (m_NODEfeature) {
            numFeatures = numFeatures + 1;
        }
        if (m_LENGTHfeature) {
            numFeatures = numFeatures + 1;
        }

        // Set indices of key attributes
        //int phraseAttIndex = m_DocumentAtt;
        int tfidfAttIndex = m_DocumentAtt + 2;
        int distAttIndex = m_DocumentAtt + 3;
        int probsAttIndex = m_DocumentAtt + numFeatures - 1;
        //int classAttIndex = numFeatures;

        // Go through the phrases and convert them into instances
        Iterator<String> it = hash.keySet().iterator();
        while (it.hasNext()) {
            String id = it.next();
            FastVector phraseInfo = (FastVector) hash.get(id);

            double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);

            Instance inst = new Instance(instance.weight(), vals);

            inst.setDataset(m_ClassifierData);

            // Get probability of a phrase being key phrase
            double[] probs = m_Classifier.distributionForInstance(inst);

            // If simple Naive Bayes used, change here to
            //double prob = probs[1];
            double prob = probs[0];

            // Compute attribute values for final instance
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // output of values for a given phrase:

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(id);
                    newInst[pos++] = index;

                    // Add original version
                    String orig = (String) phraseInfo.elementAt(2);

                    if (orig != null) {
                        index = outputFormatPeek().attribute(pos).addStringValue(orig);
                    } else {
                        index = outputFormatPeek().attribute(pos).addStringValue(id);
                    }
                    newInst[pos++] = index;

                    // Add TFxIDF
                    newInst[pos++] = inst.value(m_TfidfIndex);

                    // Add distance
                    newInst[pos++] = inst.value(m_FirstOccurIndex);

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = inst.value(m_KeyFreqIndex);
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = inst.value(m_STDEVIndex);
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = inst.value(m_NodeIndex);
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = inst.value(m_LengthIndex);
                    }

                    // Add probability 
                    probsAttIndex = pos;
                    newInst[pos++] = prob;

                    // Set rank to missing (computed below)
                    newInst[pos++] = Instance.missingValue();

                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = inst.classValue();
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance ins = new Instance(instance.weight(), newInst);
            ins.setDataset(outputFormatPeek());
            vector.addElement(ins);
        }

        // Add dummy instances for keyphrases that don't occur
        // in the document
        if (hashKeysEval != null) {
            Iterator<String> phrases = hashKeysEval.keySet().iterator();
            while (phrases.hasNext()) {
                String phrase = phrases.next();
                double[] newInst = new double[instance.numAttributes() + numFeatures];
                int pos = 0;
                for (int i = 0; i < instance.numAttributes(); i++) {
                    if (i == m_DocumentAtt) {
                        // log.info("Here: " + phrase);
                        // Add phrase
                        int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                        newInst[pos++] = (double) index;

                        // Add original version
                        index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                        newInst[pos++] = (double) index;

                        // Add TFxIDF
                        newInst[pos++] = Instance.missingValue();

                        // Add distance
                        newInst[pos++] = Instance.missingValue();

                        // Add other features
                        if (m_Debug) {
                            if (m_KFused) {
                                newInst[pos++] = Instance.missingValue();
                            }
                        }
                        if (m_STDEVfeature) {
                            newInst[pos++] = Instance.missingValue();
                        }
                        if (m_NODEfeature) {
                            newInst[pos++] = Instance.missingValue();
                        }
                        if (m_LENGTHfeature) {
                            newInst[pos++] = Instance.missingValue();
                        }

                        // Add probability and rank
                        newInst[pos++] = -Double.MAX_VALUE;
                        newInst[pos++] = Instance.missingValue();
                    } else if (i == m_KeyphrasesAtt) {
                        newInst[pos++] = 1; // Keyphrase
                    } else {
                        newInst[pos++] = instance.value(i);
                    }
                }

                // Create the dummy instance once all attribute values are set
                // (creating it inside the attribute loop would push partially
                // filled duplicates).
                Instance inst = new Instance(instance.weight(), newInst);
                inst.setDataset(outputFormatPeek());
                vector.addElement(inst);
            }
        }

        // Sort phrases according to their distance (stable sort)
        double[] vals = new double[vector.size()];
        for (int i = 0; i < vals.length; i++) {
            vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
        }
        FastVector newVector = new FastVector(vector.size());
        int[] sortedIndices = Utils.stableSort(vals);
        for (int i = 0; i < vals.length; i++) {
            newVector.addElement(vector.elementAt(sortedIndices[i]));
        }
        vector = newVector;

        // Sort phrases according to their tfxidf value (stable sort)
        for (int i = 0; i < vals.length; i++) {
            vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
        }
        newVector = new FastVector(vector.size());
        sortedIndices = Utils.stableSort(vals);
        for (int i = 0; i < vals.length; i++) {
            newVector.addElement(vector.elementAt(sortedIndices[i]));
        }
        vector = newVector;

        // Sort phrases according to their probability (stable sort)
        for (int i = 0; i < vals.length; i++) {
            vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
        }
        newVector = new FastVector(vector.size());
        sortedIndices = Utils.stableSort(vals);
        for (int i = 0; i < vals.length; i++) {
            newVector.addElement(vector.elementAt(sortedIndices[i]));
        }
        vector = newVector;
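
        // Net effect of the three stable sorts above: phrases end up ordered
        // primarily by descending probability; ties fall back to descending
        // TFxIDF, and remaining ties to ascending first-occurrence position.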

        // Compute the rank of each phrase. Phrases judged very unlikely get
        // their rank set to Integer.MAX_VALUE. (The superphrase scan below
        // looks for phrases with identical scores, but its result is
        // currently unused.)
        int rank = 1;
        for (int i = 0; i < vals.length; i++) {
            Instance currentInstance = (Instance) vector.elementAt(i);
            // Short cut: if phrase very unlikely make rank very low and continue
            if (Utils.grOrEq(vals[i], 1.0)) {
                currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
                continue;
            }

            // Otherwise look for super phrase starting with first phrase
            // in list that has same probability, TFxIDF value, and distance as
            // current phrase. We do this to catch all superphrases
            // that have same probability, TFxIDF value and distance as current phrase.
            int startInd = i;
            while (startInd < vals.length) {
                Instance inst = (Instance) vector.elementAt(startInd);
                if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                        || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                        || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                    break;
                }
                startInd++;
            }
            currentInstance.setValue(probsAttIndex + 1, rank++);

        }
        return vector;
    }
    /*
     private HashMap getComposits(HashMap dict) {
         HashMap dictClone = (HashMap) dict.clone();
         Iterator it1 = dictClone.keySet().iterator();
         while (it1.hasNext()) {
             String id1 = (String) it1.next();
             String term1 = m_Vocabulary.getOrig(id1);
             Iterator it2 = dictClone.keySet().iterator();
             while (it2.hasNext()) {
                 String id2 = (String) it2.next();
                 String term2 = m_Vocabulary.getOrig(id2);

                 String composite = term1 + " " + term2;
                 String idNew = m_Vocabulary.getID(composite);

                 if (term1 != term2 && idNew != null) {
                     FastVector vec = (FastVector) dict.get(idNew);
                     if (vec == null) {
                         log.info("Found " + m_Vocabulary.getOrig(idNew) + " (" + term1 + ", " + term2 + ")");
                         // Specifying the size of the vector
                         // according to additional selected features:
                         vec = new FastVector(2);

                         // Update hashtable with all the info
                         vec.addElement(new Counter(0)); // 0
                         vec.addElement(new Counter());  // 1
                         vec.addElement(m_Vocabulary.getOrig(idNew)); // 2
                         dict.put(idNew, vec);
                     } else {
                         // Update number of occurrences
                         ((Counter) ((FastVector) vec).elementAt(1)).increment();
                     }
                 }
             }
         }
         return dict;
     }
     */

    /**
     * Returns a hashtable filled with the stemmed n-grams occurring in
     * the given string (as keys) and the number of times each one occurs.
     */
    public HashMap<String, Counter> getPhrasesForDictionary(String str) {

        String[] buffer = new String[m_MaxPhraseLength];
        HashMap<String, Counter> hash = new HashMap<String, Counter>();

        StringTokenizer tok = new StringTokenizer(str, "\n");
        while (tok.hasMoreTokens()) {
            String phrase = tok.nextToken();
            //  log.info("Sentence " + phrase);
            int numSeen = 0;
            StringTokenizer wordTok = new StringTokenizer(phrase, " ");
            while (wordTok.hasMoreTokens()) {
                String word = wordTok.nextToken();
                // log.info(word);
                // Store word in buffer
                for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
                    buffer[i] = buffer[i + 1];
                }
                buffer[m_MaxPhraseLength - 1] = word;

                // How many are buffered?
                numSeen++;
                if (numSeen > m_MaxPhraseLength) {
                    numSeen = m_MaxPhraseLength;
                }

                // Don't consider phrases that end with a stop word
                if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
                    continue;
                }

                // Loop through buffer and add phrases to hashtable
                StringBuffer phraseBuffer = new StringBuffer();
                for (int i = 1; i <= numSeen; i++) {
                    if (i > 1) {
                        phraseBuffer.insert(0, ' ');
                    }
                    phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

                    // Don't consider phrases that begin with a stop word
                    if ((i > 1) && (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
                        continue;
                    }

                    // Only consider phrases with minimum length
                    if (i >= m_MinPhraseLength) {

                        // Match against the Vocabulary
                        String orig = phraseBuffer.toString();

                        // Create internal representation:
                        // either a stemmed version or a pseudo phrase:
                        String pseudo = pseudoPhrase(orig);
                        // log.info("Checking " + orig + " -- " + pseudo);

                        String id;
                        if (m_vocabulary.equals("none")) {
                            //   String pseudo = pseudoPhrase(orig);
                            id = pseudo;
                        } else {
                            id = (String) m_Vocabulary.getID(orig);
                        }

                        if (id != null) {
                            Counter count = (Counter) hash.get(id);
                            if (count == null) {
                                hash.put(id, new Counter());
                            } else {
                                count.increment();
                            }
                            //   log.info(orig + "\t" + id);
                        }
                    }
                }
            }
        }
        return hash;
    }
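
    /*
     * Illustrative sketch (not part of the original class): driving the
     * dictionary-building pass. The names "filter" and "text" are
     * hypothetical. With m_vocabulary set to "none", the keys are pseudo
     * phrases, so word-order and inflection variants of the same phrase
     * collapse onto a single key:
     *
     *   KEAFilter filter = new KEAFilter(new StopwordsEnglish());
     *   String text = "neural networks\nneural networks learn fast";
     *   HashMap<String, Counter> freqs = filter.getPhrasesForDictionary(text);
     *   for (String key : freqs.keySet()) {
     *       System.out.println(key + " -> " + freqs.get(key).value());
     *   }
     */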

    /**
     * Expects an empty hashtable. Fills the hashtable
     * with the stemmed n-grams occurring in the given string
     * (as keys). Stores the position, the number of occurrences,
     * and the most commonly occurring original version of
     * each n-gram.
     *
     * N-grams that occur fewer than m_MinNumOccur times are discarded.
     *
     * Returns the total number of words (!) in the string.
     */
    private int getPhrases(HashMap<String, FastVector> hash, String str) {

        //FileOutputStream out = new FileOutputStream("candidates_kea41.txt");      
        //PrintWriter printer = new PrintWriter(new OutputStreamWriter(out)); 

        // hash = table to store all the information about phrases extracted from "str"
        // str  = the content of the document, separated by newlines in sentences

        String[] buffer = new String[m_MaxPhraseLength];

        // Extracting strings of a predefined length from "str":

        StringTokenizer tok = new StringTokenizer(str, "\n");
        int pos = 1;

        while (tok.hasMoreTokens()) {
            String phrase = tok.nextToken();
            int numSeen = 0;
            StringTokenizer wordTok = new StringTokenizer(phrase, " ");
            while (wordTok.hasMoreTokens()) {
                String word = wordTok.nextToken();

                // Store word in buffer
                for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
                    buffer[i] = buffer[i + 1];
                }
                buffer[m_MaxPhraseLength - 1] = word;

                // How many are buffered?
                numSeen++;
                if (numSeen > m_MaxPhraseLength) {
                    numSeen = m_MaxPhraseLength;
                }

                // Don't consider phrases that end with a stop word
                if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
                    pos++;
                    continue;
                }

                // Loop through buffer and add phrases to hashtable
                StringBuffer phraseBuffer = new StringBuffer();
                for (int i = 1; i <= numSeen; i++) {
                    if (i > 1) {
                        phraseBuffer.insert(0, ' ');
                    }
                    phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

                    // Don't consider phrases that begin with a stop word
                    if ((i > 1) && (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
                        continue;
                    }

                    // Final restriction:
                    // Only consider phrases with minimum length
                    if (i >= m_MinPhraseLength) {

                        // orig = each detected phrase in its original spelling
                        String orig = phraseBuffer.toString();

                        // Create internal representation:
                        // either a stemmed version or a pseudo phrase: 

                        String id;
                        if (m_vocabulary.equals("none")) {
                            String pseudo = pseudoPhrase(orig);
                            id = pseudo;
                        } else {
                            // Match against the Vocabulary
                            id = (String) m_Vocabulary.getID(orig);
                        }

                        //    log.info(orig + "\t" + pseudo + " \t " + id);

                        if (id != null) {

                            // if Vocabulary is used, derive the correct spelling
                            // of the descriptor, else use one of the spellings as in the document
                            if (!m_vocabulary.equals("none")) {
                                orig = m_Vocabulary.getOrig(id);
                            }

                            // Get the vector of the current phrase from the hash table.
                            // If it was already extracted from "str", the values will be
                            // updated in next steps, if not a new vector will be created.

                            FastVector vec = (FastVector) hash.get(id);

                            if (vec == null) {

                                // Specifying the size of the vector
                                // According to additional selected features:

                                if (m_STDEVfeature) {
                                    vec = new FastVector(3);
                                } else {
                                    vec = new FastVector(2);
                                }

                                // Update hashtable with all the info
                                vec.addElement(new Counter(pos + 1 - i)); //0
                                vec.addElement(new Counter()); //1
                                vec.addElement(orig); //2

                                if (m_STDEVfeature) {
                                    FastVector app = new FastVector();
                                    app.addElement(new Counter(pos + 1 - i));
                                    vec.addElement(app);
                                }
                                hash.put(id, vec);
                            } else {

                                // If the phrase already was identified,
                                // update its values in the old vector

                                // Update number of occurrences
                                ((Counter) ((FastVector) vec).elementAt(1)).increment();

                                if (m_STDEVfeature) {
                                    // Record this occurrence position; the
                                    // positions vector already sits at index 3,
                                    // so it must not be appended to vec again
                                    FastVector app = (FastVector) vec.elementAt(3);
                                    app.addElement(new Counter(pos + 1 - i));
                                }

                            }
                        }
                    }
                }
                pos++;
            }
        }

        // Prune the candidate set: drop every phrase that occurs
        // fewer than m_MinNumOccur times.
        Iterator<String> phrases = hash.keySet().iterator();

        while (phrases.hasNext()) {
            String phrase = phrases.next();
            FastVector info = (FastVector) hash.get(phrase);

            if (((Counter) info.elementAt(1)).value() < m_MinNumOccur) {
                phrases.remove();
            }
        }
        return pos;
    }
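
    /*
     * Layout of the per-phrase record built by getPhrases() above
     * (illustrative summary, not part of the original code):
     *
     *   index 0: Counter    - position of the first occurrence, in words
     *   index 1: Counter    - number of occurrences (a new Counter starts at 1)
     *   index 2: String     - the spelling used for output
     *   index 3: FastVector - Counters for every occurrence position
     *                         (present only when m_STDEVfeature is set)
     *
     * Reading a record back, as the pruning loop does for the count
     * ("someId" is a hypothetical key):
     *
     *   FastVector info = hash.get("someId");
     *   int firstPos = ((Counter) info.elementAt(0)).value();
     *   int count    = ((Counter) info.elementAt(1)).value();
     */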

    /**
     * Splits a string at a given separator character into an array (ALY).
     */
    private static String[] split(String str, String separator) {

        ArrayList<String> lst = new ArrayList<String>();
        String word = "";

        for (int i = 0; i < str.length(); i++) {
            int j = i + 1;
            String letter = str.substring(i, j);
            if (!letter.equalsIgnoreCase(separator)) {
                word = word + str.charAt(i);
            } else {
                lst.add(word);
                word = "";
            }
        }
        if (!word.isEmpty()) {
            lst.add(word);
        }
        return lst.toArray(new String[lst.size()]);
    }
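
    /*
     * Illustrative usage of split() (hypothetical values). Unlike
     * String.split(), the separator is a single character matched
     * case-insensitively, and no regular expressions are involved:
     *
     *   String[] parts = split("alpha/beta/gamma", "/");
     *   // parts is {"alpha", "beta", "gamma"}
     */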

    /**
     * Gets all the phrases in the given string and puts them into the
     * hashtable.  Also stores the original version of the stemmed
     * phrase in the hash table.  
     */
    private HashMap<String, Counter> getGivenKeyphrases(String str, boolean forEval) {

        HashMap<String, Counter> hash = new HashMap<String, Counter>();
        // m_Indexers = 1;

        StringTokenizer tok = new StringTokenizer(str, "\n");
        while (tok.hasMoreTokens()) {
            String orig = tok.nextToken();
            orig = orig.trim();

            // This is often the case with MeSH terms, where a term is
            // accompanied by another specifying term,
            // e.g. Monocytes/*immunology/microbiology;
            // we ignore everything after the "/" symbol.
            if (orig.matches(".+?/.+?")) {
                String[] elements = orig.split("/");
                orig = elements[0];
            }

            orig = pseudoPhrase(orig);
            if (orig.length() > 0) {

                String id;
                if (m_vocabulary.equals("none")) {
                    id = orig;
                } else {
                    id = (String) m_Vocabulary.getID(orig);
                }
                if (id != null) {
                    //log.info("\t" + id);
                    if (!hash.containsKey(id)) {
                        hash.put(id, new Counter());
                    } else {
                        hash.get(id).increment();
                        if (forEval && m_Debug) {
                            log.info("Skipping the phrase " + orig
                                    + ", which appears twice in the author-assigned keyphrase set.");
                        }
                    }
                }
            }
        }
        if (hash.size() == 0) {
            return null;
        } else {
            return hash;
        }
    }
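
    /*
     * Illustrative sketch (hypothetical input): author-assigned keyphrases
     * arrive one per line and are normalized through pseudoPhrase() before
     * the lookup, so spelling variants of the same phrase share one key.
     * MeSH-style qualifiers are truncated at the first "/":
     *
     *   String keys = "Neural Networks\nMonocytes/*immunology";
     *   HashMap<String, Counter> h = getGivenKeyphrases(keys, false);
     *   // "Monocytes/*immunology" is matched as "Monocytes" only
     */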

    /** 
     * Generates the pseudo phrase from a string.
     * A pseudo phrase is a version of a phrase
     * that contains only non-stopwords,
     * which are stemmed and sorted into alphabetical order. 
     */
    public String pseudoPhrase(String str) {
        //log.error(str + "\t");
        String[] pseudophrase;
        String[] words;
        String str_nostop;
        String stemmed;

        str = str.toLowerCase();

        // This is often the case with MeSH terms, where a term is
        // accompanied by another specifying term,
        // e.g. Monocytes/*immunology/microbiology;
        // we ignore everything after the "/" symbol.
        if (str.matches(".+?/.+?")) {
            String[] elements = str.split("/");
            str = elements[0];
        }

        // Removes scope notes in brackets;
        // should be replaced with a cleaner solution
        if (str.matches(".+?\\(.+?")) {
            String[] elements = str.split("\\(");
            str = elements[0];
        }
        // If the phrase contains an apostrophe, keep only the part after it
        if (str.matches(".+?\\'.+?")) {
            String[] elements = str.split("\\'");
            str = elements[1];
        }

        // Remove some non-alphanumeric characters

        // str = str.replace('/', ' ');
        str = str.replace('-', ' ');
        str = str.replace('&', ' ');

        str = str.replaceAll("\\*", "");
        str = str.replaceAll("\\, ", " ");
        str = str.replaceAll("\\. ", " ");
        str = str.replaceAll("\\:", "");

        str = str.trim();

        // Stem string
        words = str.split(" ");
        str_nostop = "";
        for (int i = 0; i < words.length; i++) {
            if (!m_Stopwords.isStopword(words[i])) {
                if (str_nostop.equals("")) {
                    str_nostop = words[i];
                } else {
                    str_nostop = str_nostop + " " + words[i];
                }
            }
        }
        stemmed = m_Stemmer.stemString(str_nostop);

        //log.info(stemmed + "\t" + str_nostop + "\t"+ str);
        pseudophrase = sort(stemmed.split(" "));
        // log.info(join(pseudophrase));
        return join(pseudophrase);
    }
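
    /*
     * Example transformation (illustrative; the stemmed forms depend on
     * the configured Stemmer, so the exact output may differ):
     *
     *   pseudoPhrase("Text-Mining of Documents")
     *     lowercased:          "text-mining of documents"
     *     '-' replaced:        "text mining of documents"
     *     stopwords removed:   "text mining documents"
     *     stemmed (e.g.):      "text mine document"
     *     sorted and joined:   "document mine text"
     */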

    /** 
     * Joins an array of strings to a single string.
     */
    private static String join(String[] str) {
        String result = "";
        for (int i = 0; i < str.length; i++) {
            if (!result.isEmpty()) {
                result = result + " " + str[i];
            } else {
                result = str[i];
            }
        }
        return result;
    }
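
    /*
     * Illustrative round trip with split() above (hypothetical values):
     *
     *   join(new String[] {"document", "mine", "text"})   // "document mine text"
     */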

    /** 
     * overloaded swap method: exchange 2 locations in an array of Strings.
     */
    public static void swap(int loc1, int loc2, String[] a) {
        String temp = a[loc1];
        a[loc1] = a[loc2];
        a[loc2] = temp;
    } // end swap

    /**
     * Sorts an array of Strings into alphabetic order
     *
     */
    public static String[] sort(String[] a) {

        // firstAt holds the index of the alphabetically first
        // remaining string during each pass of the selection sort
        int i, j, firstAt;

        for (i = 0; i < a.length - 1; i++) {
            firstAt = i;
            for (j = i + 1; j < a.length; j++) {
                // modify to preserve ordering of a String that starts with
                // upper case preceding the otherwise identical String that
                // has only lower case letters
                if (a[j].toUpperCase().compareTo(a[firstAt].toUpperCase()) < 0) {
                    // reset firstAt
                    firstAt = j;
                }
                // if identical when converted to all same case
                if (a[j].toUpperCase().compareTo(a[firstAt].toUpperCase()) == 0) {
                    // but a[j] precedes when not converted
                    if (a[j].compareTo(a[firstAt]) < 0) {
                        // reset firstAt
                        firstAt = j;
                    }
                }
            }
            if (firstAt != i) {
                swap(i, firstAt, a);
            }
        }
        return a;
    } // end sort
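
    /*
     * Illustrative usage of sort() (hypothetical values): comparison is
     * case-insensitive, but among strings that differ only in case the
     * upper-case variant is placed first:
     *
     *   String[] a = {"pear", "Apple", "apple"};
     *   sort(a);   // a is now {"Apple", "apple", "pear"}
     */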

    /**
     * Main method for testing this class.
     *
     * @param argv should contain arguments to the filter: use -h for help
     */
    public static void main(String[] argv) {

        try {
            if (Utils.getFlag('b', argv)) {
                Filter.batchFilterFile(new KEAFilter(new StopwordsEnglish()), argv);
            } else {
                Filter.filterFile(new KEAFilter(new StopwordsEnglish()), argv);
            }
        } catch (Exception ex) {
            log.error(ex.getMessage(), ex);
        }
    }
}