Java tutorial: building word-vector features with a custom Weka StringToWordVector filter
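The listing below is Weka's StringToWordVector filter (GPL, University of Waikato), repackaged under com.esda.util. Before the full listing, here is a minimal sketch of how such a filter is typically driven from user code. It assumes a Weka 3.x jar on the classpath; the file name reviews.arff and its attribute layout are hypothetical:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;

public class StringToWordVectorDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset that contains at least one string attribute.
        Instances data = DataSource.read("reviews.arff");

        com.esda.util.StringToWordVector filter = new com.esda.util.StringToWordVector();
        filter.setOutputWordCounts(true); // -C: emit counts instead of 0/1 presence
        filter.setLowerCaseTokens(true);  // -L: downcase tokens before dictionary lookup
        filter.setWordsToKeep(1000);      // -W: approximate dictionary size

        // The dictionary is built from this first batch (typically the training data).
        filter.setInputFormat(data);
        Instances vectors = Filter.useFilter(data, filter);
        System.out.println(vectors.numAttributes() + " attributes in the word-vector output");
    }
}

Applying the same configured filter object to a second batch reuses the dictionary built from the first batch, which keeps training and test feature spaces consistent.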
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * StringToWordVector.java * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */ package com.esda.util; import java.io.File; import java.io.Serializable; import java.util.Enumeration; import java.util.Hashtable; import java.util.Iterator; import java.util.TreeMap; import java.util.Vector; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.Range; import weka.core.RevisionHandler; import weka.core.RevisionUtils; import weka.core.SelectedTag; import weka.core.SparseInstance; import weka.core.Stopwords; import weka.core.Tag; import weka.core.Utils; import weka.core.stemmers.NullStemmer; import weka.core.stemmers.Stemmer; import weka.core.tokenizers.Tokenizer; import weka.core.tokenizers.WordTokenizer; import weka.filters.Filter; import weka.filters.UnsupervisedFilter; /** * <!-- globalinfo-start --> Converts String attributes into a set of attributes * representing word occurrence (depending on the tokenizer) information from * the text contained in the strings. The set of words (attributes) is * determined by the first batch filtered (typically training data). * <p/> * <!-- globalinfo-end --> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -C * Output word counts rather than boolean word presence. * </pre> * * <pre> * -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes) * </pre> * * <pre> * -V * Invert matching sense of column indexes. * </pre> * * <pre> * -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "") * </pre> * * <pre> * -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded. * (default: 1000) * </pre> * * <pre> * -prune-rate <rate as a percentage of dataset> * Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary. * -W prunes after creating a full dictionary. You may not have enough memory for this approach. * (default: no periodic pruning) * </pre> * * <pre> * -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). * </pre> * * <pre> * -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij is the frequency of word i in jth document(instance) * </pre> * * <pre> * -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize). * </pre> * * <pre> * -L * Convert all tokens to lowercase before adding to the dictionary.
* </pre> * * <pre> * -S * Ignore words that are in the stoplist. * </pre> * * <pre> * -stemmer <spec> * The stemming algorithm (classname plus parameters) to use. * </pre> * * <pre> * -M <int> * The minimum term frequency (default = 1). * </pre> * * <pre> * -O * If this is set, the maximum number of words and the * minimum term frequency are not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set). * </pre> * * <pre> * -stopwords <file> * A file containing stopwords to override the default ones. * Using this option automatically sets the flag ('-S') to use the * stoplist if the file exists. * Format: one stopword per line, lines starting with '#' * are interpreted as comments and ignored. * </pre> * * <pre> * -tokenizer <spec> * The tokenizing algorithm (classname plus parameters) to use. * (default: weka.core.tokenizers.WordTokenizer) * </pre> * * <!-- options-end --> * * @author Len Trigg (len@reeltwo.com) * @author Stuart Inglis (stuart@reeltwo.com) * @author Gordon Paynter (gordon.paynter@ucr.edu) * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz) * @version $Revision: 9004 $ * @see Stopwords */ @SuppressWarnings("all") public class StringToWordVector extends Filter implements UnsupervisedFilter, OptionHandler { /** for serialization. */ static final long serialVersionUID = 8249106275278565424L; /** Range of columns to convert to word vectors. */ protected Range m_SelectedRange = new Range("first-last"); /** Contains a mapping of valid words to attribute indexes. */ private TreeMap m_Dictionary = new TreeMap(); /** * True if output instances should contain word frequency rather than * boolean 0 or 1. */ private boolean m_OutputCounts = false; /** A String prefix for the attribute names. */ private String m_Prefix = ""; /** * Contains the number of documents (instances) a particular word appears * in. The counts are stored with the same indexing as given by * m_Dictionary. */ private int[] m_DocsCounts; /** * Contains the number of documents (instances) in the input format from * which the dictionary is created. It is used in IDF transform. */ private int m_NumInstances = -1; /** * Contains the average length of documents (among the first batch of * instances aka training data). This is used in length normalization of * documents which will be normalized to average document length. */ private double m_AvgDocLength = -1; /** * The default number of words (per class if there is a class attribute * assigned) to attempt to keep. */ private int m_WordsToKeep = 1000; /** * The percentage at which to periodically prune the dictionary. */ private double m_PeriodicPruningRate = -1; /** * True if word frequencies should be transformed into log(1+fi) where fi is * the frequency of word i. */ private boolean m_TFTransform; /** The normalization to apply. */ protected int m_filterType = FILTER_NONE; /** normalization: No normalization. */ public static final int FILTER_NONE = 0; /** normalization: Normalize all data. */ public static final int FILTER_NORMALIZE_ALL = 1; /** normalization: Normalize test data only. */ public static final int FILTER_NORMALIZE_TEST_ONLY = 2; /** * Specifies whether a document's (instance's) word frequencies are to be * normalized. They are normalized to the average length of the documents * specified as the input format.
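* <p> For example, with FILTER_NORMALIZE_ALL the word-vector part of each document is rescaled so that its Euclidean length equals the average document length observed in the first batch.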
*/ public static final Tag[] TAGS_FILTER = { new Tag(FILTER_NONE, "No normalization"), new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"), new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only"), }; /** * True if word frequencies should be transformed into * fij*log(numOfDocs/numOfDocsWithWordi). */ private boolean m_IDFTransform; /** True if all tokens should be downcased. */ private boolean m_lowerCaseTokens; /** True if tokens that are on a stoplist are to be ignored. */ private boolean m_useStoplist; /** the stemming algorithm. */ private Stemmer m_Stemmer = new NullStemmer(); /** the minimum (per-class) word frequency. */ private int m_minTermFreq = 1; /** whether to operate on a per-class basis. */ private boolean m_doNotOperateOnPerClassBasis = false; /** * a file containing stopwords to use instead of the default Rainbow * ones. */ private File m_Stopwords = new File(System.getProperty("user.dir")); /** the tokenizer algorithm to use. */ private Tokenizer m_Tokenizer = new WordTokenizer(); /** * Default constructor. Targets 1000 words in the output. */ public StringToWordVector() { } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result = new Vector(); result.addElement(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C")); result.addElement(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n" + "\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>")); result.addElement(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V")); result.addElement(new Option("\tSpecify a prefix for the created attribute names.\n" + "\t(default: \"\")", "P", 1, "-P <attribute name prefix>")); result.addElement( new Option( "\tSpecify approximate number of word fields to create.\n" + "\tSurplus words will be discarded.\n" + "\t(default: 1000)", "W", 1, "-W <number of words to keep>")); result.addElement(new Option( "\tSpecify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.\n" + "\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n" + "\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <rate as a percentage of dataset>"));
result.addElement(new Option("\tTransform the word frequencies into log(1+fij)\n" + "\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T")); result.addElement(new Option("\tTransform each word frequency into:\n" + "\tfij*log(num of Documents/num of documents containing word i)\n" + "\t where fij is the frequency of word i in jth document(instance)", "I", 0, "-I")); result.addElement(new Option( "\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n" + "\tto average length of training documents " + "(default 0=don\'t normalize).", "N", 1, "-N")); result.addElement(new Option("\tConvert all tokens to lowercase before " + "adding to the dictionary.", "L", 0, "-L")); result.addElement(new Option("\tIgnore words that are in the stoplist.", "S", 0, "-S")); result.addElement(new Option("\tThe stemming algorithm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); result.addElement(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>")); result.addElement(new Option("\tIf this is set, the maximum number of words and the \n" + "\tminimum term frequency are not enforced on a per-class \n" + "\tbasis but based on the documents in all the classes \n" + "\t(even if a class attribute is set).", "O", 0, "-O")); result.addElement(new Option("\tA file containing stopwords to override the default ones.\n" + "\tUsing this option automatically sets the flag ('-S') to use the\n" + "\tstoplist if the file exists.\n" + "\tFormat: one stopword per line, lines starting with '#'\n" + "\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>")); result.addElement(new Option("\tThe tokenizing algorithm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); return result.elements(); } /** * Parses a given list of options. * <p/> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -C * Output word counts rather than boolean word presence. * </pre> * * <pre> * -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes) * </pre> * * <pre> * -V * Invert matching sense of column indexes. * </pre> * * <pre> * -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "") * </pre> * * <pre> * -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded. * (default: 1000) * </pre> * * <pre> * -prune-rate <rate as a percentage of dataset> * Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary. * -W prunes after creating a full dictionary. You may not have enough memory for this approach. * (default: no periodic pruning) * </pre> * * <pre> * -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). * </pre> * * <pre> * -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij is the frequency of word i in jth document(instance) * </pre> * * <pre> * -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize).
* </pre> * * <pre> * -L * Convert all tokens to lowercase before adding to the dictionary. * </pre> * * <pre> * -S * Ignore words that are in the stoplist. * </pre> * * <pre> * -stemmer <spec> * The stemming algorithm (classname plus parameters) to use. * </pre> * * <pre> * -M <int> * The minimum term frequency (default = 1). * </pre> * * <pre> * -O * If this is set, the maximum number of words and the * minimum term frequency are not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set). * </pre> * * <pre> * -stopwords <file> * A file containing stopwords to override the default ones. * Using this option automatically sets the flag ('-S') to use the * stoplist if the file exists. * Format: one stopword per line, lines starting with '#' * are interpreted as comments and ignored. * </pre> * * <pre> * -tokenizer <spec> * The tokenizing algorithm (classname plus parameters) to use. * (default: weka.core.tokenizers.WordTokenizer) * </pre> * * <!-- options-end --> * * @param options * the list of options as an array of strings * @throws Exception * if an option is not supported */ public void setOptions(String[] options) throws Exception { String value; value = Utils.getOption('R', options); if (value.length() != 0) setSelectedRange(value); else setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); value = Utils.getOption('P', options); if (value.length() != 0) setAttributeNamePrefix(value); else setAttributeNamePrefix(""); value = Utils.getOption('W', options); if (value.length() != 0) setWordsToKeep(Integer.valueOf(value).intValue()); else setWordsToKeep(1000); value = Utils.getOption("prune-rate", options); if (value.length() > 0) setPeriodicPruning(Double.parseDouble(value)); else setPeriodicPruning(-1); value = Utils.getOption('M', options); if (value.length() != 0) setMinTermFreq(Integer.valueOf(value).intValue()); else setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options)); setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options)); String nString = Utils.getOption('N', options); if (nString.length() != 0) setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER)); else setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER)); setLowerCaseTokens(Utils.getFlag('L', options)); setUseStoplist(Utils.getFlag('S', options)); String stemmerString = Utils.getOption("stemmer", options); if (stemmerString.length() == 0) { setStemmer(null); } else { String[] stemmerSpec = Utils.splitOptions(stemmerString); if (stemmerSpec.length == 0) throw new Exception("Invalid stemmer specification string"); String stemmerName = stemmerSpec[0]; stemmerSpec[0] = ""; Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance(); if (stemmer instanceof OptionHandler) ((OptionHandler) stemmer).setOptions(stemmerSpec); setStemmer(stemmer); } value = Utils.getOption("stopwords", options); if (value.length() != 0) setStopwords(new File(value)); else setStopwords(null); String tokenizerString = Utils.getOption("tokenizer", options); if (tokenizerString.length() == 0) { setTokenizer(new WordTokenizer()); } else { String[] tokenizerSpec = Utils.splitOptions(tokenizerString); if (tokenizerSpec.length == 0) throw new Exception("Invalid tokenizer specification string"); String tokenizerName = tokenizerSpec[0]; tokenizerSpec[0] = ""; Tokenizer tokenizer = (Tokenizer)
Class.forName(tokenizerName).newInstance(); if (tokenizer instanceof OptionHandler) ((OptionHandler) tokenizer).setOptions(tokenizerSpec); setTokenizer(tokenizer); } } /** * Gets the current settings of the filter. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; result = new Vector(); result.add("-R"); result.add(getSelectedRange().getRanges()); if (getInvertSelection()) result.add("-V"); if (!"".equals(getAttributeNamePrefix())) { result.add("-P"); result.add(getAttributeNamePrefix()); } result.add("-W"); result.add(String.valueOf(getWordsToKeep())); result.add("-prune-rate"); result.add(String.valueOf(getPeriodicPruning())); if (getOutputWordCounts()) result.add("-C"); if (getTFTransform()) result.add("-T"); if (getIDFTransform()) result.add("-I"); result.add("-N"); result.add("" + m_filterType); if (getLowerCaseTokens()) result.add("-L"); if (getUseStoplist()) result.add("-S"); if (getStemmer() != null) { result.add("-stemmer"); String spec = getStemmer().getClass().getName(); if (getStemmer() instanceof OptionHandler) spec += " " + Utils.joinOptions(((OptionHandler) getStemmer()).getOptions()); result.add(spec.trim()); } result.add("-M"); result.add(String.valueOf(getMinTermFreq())); if (getDoNotOperateOnPerClassBasis()) result.add("-O"); if (!getStopwords().isDirectory()) { result.add("-stopwords"); result.add(getStopwords().getAbsolutePath()); } result.add("-tokenizer"); String spec = getTokenizer().getClass().getName(); if (getTokenizer() instanceof OptionHandler) spec += " " + Utils.joinOptions(((OptionHandler) getTokenizer()).getOptions()); result.add(spec.trim()); return (String[]) result.toArray(new String[result.size()]); } /** * Constructor that allows specification of the target number of words in * the output. * * @param wordsToKeep * the number of words in the output vector (per class if * assigned). */ public StringToWordVector(int wordsToKeep) { m_WordsToKeep = wordsToKeep; } /** * Used to store word counts for dictionary selection based on a threshold. */ private class Count implements Serializable, RevisionHandler { /** for serialization. */ static final long serialVersionUID = 2157223818584474321L; /** the counts. */ public int count, docCount; /** * the constructor. * * @param c * the count */ public Count(int c) { count = c; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 9004 $"); } } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Sets the format of the input instances. * * @param instanceInfo * an Instances object containing the input instance structure * (any instances contained in the object are ignored - only the * structure is required). 
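* <p> Note: the output format cannot be determined until the dictionary has been built from the first batch, so this method always returns false; the structure becomes available only after batchFinished().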
* @return true if the outputFormat may be collected immediately * @throws Exception * if the input format can't be set successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_SelectedRange.setUpper(instanceInfo.numAttributes() - 1); m_AvgDocLength = -1; m_NumInstances = -1; return false; } /** * Input an instance for filtering. Filter requires all training instances * be read before producing output. * * @param instance * the input instance. * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException * if no input structure has been defined. */ public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (isFirstBatchDone()) { FastVector fv = new FastVector(); int firstCopy = convertInstancewoDocNorm(instance, fv); Instance inst = (Instance) fv.elementAt(0); if (m_filterType != FILTER_NONE) { normalizeInstance(inst, firstCopy); } push(inst); return true; } else { bufferInput(instance); return false; } } /** * Signify that this batch of input to the filter is finished. If the filter * requires all instances prior to filtering, output() may now be called to * retrieve the filtered instances. * * @return true if there are instances pending output. * @throws IllegalStateException * if no input structure has been defined. */ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } // We only need to do something in this method // if the first batch hasn't been processed. Otherwise // input() has already done all the work. if (!isFirstBatchDone()) { // Determine the dictionary from the first batch (training data) determineDictionary(); // Convert all instances w/o normalization FastVector fv = new FastVector(); int firstCopy = 0; for (int i = 0; i < m_NumInstances; i++) { firstCopy = convertInstancewoDocNorm(getInputFormat().instance(i), fv); } // Need to compute average document length if necessary if (m_filterType != FILTER_NONE) { m_AvgDocLength = 0; for (int i = 0; i < fv.size(); i++) { Instance inst = (Instance) fv.elementAt(i); double docLength = 0; for (int j = 0; j < inst.numValues(); j++) { if (inst.index(j) >= firstCopy) { docLength += inst.valueSparse(j) * inst.valueSparse(j); } } m_AvgDocLength += Math.sqrt(docLength); } m_AvgDocLength /= m_NumInstances; } // Perform normalization if necessary. if (m_filterType == FILTER_NORMALIZE_ALL) { for (int i = 0; i < fv.size(); i++) { normalizeInstance((Instance) fv.elementAt(i), firstCopy); } } // Push all instances into the output queue for (int i = 0; i < fv.size(); i++) { push((Instance) fv.elementAt(i)); } } // Flush the input flushInput(); m_NewBatch = true; m_FirstBatchDone = true; return (numPendingOutput() != 0); } /** * Returns a string describing this filter. * * @return a description of the filter suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Converts String attributes into a set of attributes representing " + "word occurrence (depending on the tokenizer) information from the " + "text contained in the strings. The set of words (attributes) is " + "determined by the first batch filtered (typically training data)."; } /** * Gets whether output instances contain 0 or 1 indicating word presence, or * word counts. 
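* <p> Note: the TF and IDF transforms below operate on whatever value is stored for a word, so they are most informative when word counts (rather than boolean presence) are output.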
* * @return true if word counts should be output. */ public boolean getOutputWordCounts() { return m_OutputCounts; } /** * Sets whether output instances contain 0 or 1 indicating word presence, or * word counts. * * @param outputWordCounts * true if word counts should be output. */ public void setOutputWordCounts(boolean outputWordCounts) { m_OutputCounts = outputWordCounts; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String outputWordCountsTipText() { return "Output word counts rather than boolean 0 or 1 " + "(indicating presence or absence of a word)."; } /** * Get the value of m_SelectedRange. * * @return Value of m_SelectedRange. */ public Range getSelectedRange() { return m_SelectedRange; } /** * Set the value of m_SelectedRange. * * @param newSelectedRange * Value to assign to m_SelectedRange. */ public void setSelectedRange(String newSelectedRange) { m_SelectedRange = new Range(newSelectedRange); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String attributeIndicesTipText() { return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values. Specify an inclusive" + " range with \"-\". E.g: \"first-3,5,6-10,last\"."; } /** * Gets the current range selection. * * @return a string containing a comma separated list of ranges */ public String getAttributeIndices() { return m_SelectedRange.getRanges(); } /** * Sets which attributes are to be worked on. * * @param rangeList * a string representing the list of attributes. Since the string * will typically come from a user, attributes are indexed from * 1. <br> * eg: first-3,5,6-last * @throws IllegalArgumentException * if an invalid range list is supplied */ public void setAttributeIndices(String rangeList) { m_SelectedRange.setRanges(rangeList); } /** * Sets which attributes are to be processed. * * @param attributes * an array containing indexes of attributes to process. Since * the array will typically come from a program, attributes are * indexed from 0. * @throws IllegalArgumentException * if an invalid set of ranges is supplied */ public void setAttributeIndicesArray(int[] attributes) { setAttributeIndices(Range.indicesToRangeList(attributes)); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String invertSelectionTipText() { return "Set attribute selection mode. If false, only selected" + " attributes in the range will be worked on; if" + " true, only non-selected attributes will be processed."; } /** * Gets whether the supplied columns are to be processed or skipped. * * @return true if the supplied columns will be kept */ public boolean getInvertSelection() { return m_SelectedRange.getInvert(); } /** * Sets whether selected columns should be processed or skipped. * * @param invert * the new invert setting */ public void setInvertSelection(boolean invert) { m_SelectedRange.setInvert(invert); } /** * Get the attribute name prefix. * * @return The current attribute name prefix. */ public String getAttributeNamePrefix() { return m_Prefix; } /** * Set the attribute name prefix. * * @param newPrefix * String to use as the attribute name prefix.
*/ public void setAttributeNamePrefix(String newPrefix) { m_Prefix = newPrefix; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String attributeNamePrefixTipText() { return "Prefix for the created attribute names. " + "(default: \"\")"; } /** * Gets the number of words (per class if there is a class attribute * assigned) to attempt to keep. * * @return the target number of words in the output vector (per class if * assigned). */ public int getWordsToKeep() { return m_WordsToKeep; } /** * Sets the number of words (per class if there is a class attribute * assigned) to attempt to keep. * * @param newWordsToKeep * the target number of words in the output vector (per class if * assigned). */ public void setWordsToKeep(int newWordsToKeep) { m_WordsToKeep = newWordsToKeep; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String wordsToKeepTipText() { return "The number of words (per class if there is a class attribute " + "assigned) to attempt to keep."; } /** * Gets the rate at which the dictionary is periodically pruned, as a * percentage of the dataset size. * * @return the rate at which the dictionary is periodically pruned */ public double getPeriodicPruning() { return m_PeriodicPruningRate; } /** * Sets the rate at which the dictionary is periodically pruned, as a * percentage of the dataset size. * * @param newPeriodicPruning * the rate at which the dictionary is periodically pruned */ public void setPeriodicPruning(double newPeriodicPruning) { m_PeriodicPruningRate = newPeriodicPruning; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String periodicPruningTipText() { return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. " + "wordsToKeep prunes after creating a full dictionary. You may not have enough " + "memory for this approach."; } /** * Gets whether the word frequencies should be transformed into * log(1+fij) where fij is the frequency of word i in document(instance) j. * * @return true if word frequencies are to be transformed. */ public boolean getTFTransform() { return this.m_TFTransform; } /** * Sets whether the word frequencies should be transformed into * log(1+fij) where fij is the frequency of word i in document(instance) j. * * @param TFTransform * true if word frequencies are to be transformed. */ public void setTFTransform(boolean TFTransform) { this.m_TFTransform = TFTransform; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String TFTransformTipText() { return "Sets whether the word frequencies should be transformed into:\n " + " log(1+fij) \n" + " where fij is the frequency of word i in document (instance) j."; } /** * Gets whether the word frequencies in a document should be transformed * into: <br> * fij*log(num of Docs/num of Docs with word i) <br> * where fij is the frequency of word i in document(instance) j. * * @return true if the word frequencies are to be transformed.
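* <p> For example, a word occurring twice in a document and appearing in 10 of 100 training documents is scored 2*log(100/10), roughly 4.61 (the implementation uses the natural logarithm).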
*/ public boolean getIDFTransform() { return this.m_IDFTransform; } /** * Sets whether the word frequencies in a document should be transformed * into: <br> * fij*log(num of Docs/num of Docs with word i) <br> * where fij is the frequency of word i in document(instance) j. * * @param IDFTransform * true if the word frequencies are to be transformed */ public void setIDFTransform(boolean IDFTransform) { this.m_IDFTransform = IDFTransform; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String IDFTransformTipText() { return "Sets whether the word frequencies in a document should be " + "transformed into: \n" + " fij*log(num of Docs/num of Docs with word i) \n" + " where fij is the frequency of word i in document (instance) j."; } /** * Gets whether the word frequencies for a document (instance) should be * normalized or not. * * @return the normalization type as a SelectedTag. */ public SelectedTag getNormalizeDocLength() { return new SelectedTag(m_filterType, TAGS_FILTER); } /** * Sets whether the word frequencies for a document (instance) should be * normalized or not. * * @param newType * the new type. */ public void setNormalizeDocLength(SelectedTag newType) { if (newType.getTags() == TAGS_FILTER) { m_filterType = newType.getSelectedTag().getID(); } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String normalizeDocLengthTipText() { return "Sets whether the word frequencies for a document (instance) " + "should be normalized or not."; } /** * Gets whether the tokens are to be downcased or not. * * @return true if the tokens are to be downcased. */ public boolean getLowerCaseTokens() { return this.m_lowerCaseTokens; } /** * Sets whether the tokens are to be downcased or not. (Doesn't affect * non-alphabetic characters in tokens). * * @param downCaseTokens * should be true if only lower case tokens are to be formed. */ public void setLowerCaseTokens(boolean downCaseTokens) { this.m_lowerCaseTokens = downCaseTokens; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String doNotOperateOnPerClassBasisTipText() { return "If this is set, the maximum number of words and the " + "minimum term frequency are not enforced on a per-class " + "basis but based on the documents in all the classes " + "(even if a class attribute is set)."; } /** * Get the DoNotOperateOnPerClassBasis value. * * @return the DoNotOperateOnPerClassBasis value. */ public boolean getDoNotOperateOnPerClassBasis() { return m_doNotOperateOnPerClassBasis; } /** * Set the DoNotOperateOnPerClassBasis value. * * @param newDoNotOperateOnPerClassBasis * The new DoNotOperateOnPerClassBasis value. */ public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) { this.m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String minTermFreqTipText() { return "Sets the minimum term frequency. This is enforced " + "on a per-class basis."; } /** * Get the MinTermFreq value. * * @return the MinTermFreq value. */ public int getMinTermFreq() { return m_minTermFreq; } /** * Set the MinTermFreq value.
* * @param newMinTermFreq * The new MinTermFreq value. */ public void setMinTermFreq(int newMinTermFreq) { this.m_minTermFreq = newMinTermFreq; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String lowerCaseTokensTipText() { return "If set then all the word tokens are converted to lower case " + "before being added to the dictionary."; } /** * Gets whether the words on the stoplist are to be ignored (the default * stoplist is weka.core.Stopwords). * * @return true if the words on the stoplist are to be ignored. */ public boolean getUseStoplist() { return m_useStoplist; } /** * Sets whether the words that are on a stoplist are to be ignored (the * default stoplist is weka.core.Stopwords). * * @param useStoplist * true if the tokens that are on a stoplist are to be ignored. */ public void setUseStoplist(boolean useStoplist) { m_useStoplist = useStoplist; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useStoplistTipText() { return "Ignores all the words that are on the stoplist, if set to true."; } /** * Sets the stemming algorithm to use; null means no stemming at all (i.e., the * NullStemmer is used). * * @param value * the configured stemming algorithm, or null * @see NullStemmer */ public void setStemmer(Stemmer value) { if (value != null) m_Stemmer = value; else m_Stemmer = new NullStemmer(); } /** * Returns the current stemming algorithm, null if none is used. * * @return the current stemming algorithm, null if none set */ public Stemmer getStemmer() { return m_Stemmer; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String stemmerTipText() { return "The stemming algorithm to use on the words."; } /** * Sets the file containing the stopwords; null or a directory unsets the * stopwords. If the file exists, it automatically turns on the flag to use * the stoplist. * * @param value * the file containing the stopwords */ public void setStopwords(File value) { if (value == null) value = new File(System.getProperty("user.dir")); m_Stopwords = value; if (value.exists() && value.isFile()) setUseStoplist(true); } /** * Returns the file used for obtaining the stopwords; if the file represents * a directory then the default ones are used. * * @return the file containing the stopwords */ public File getStopwords() { return m_Stopwords; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String stopwordsTipText() { return "The file containing the stopwords (if this is a directory then the default ones are used)."; } /** * Sets the tokenizer algorithm to use. * * @param value * the configured tokenizing algorithm */ public void setTokenizer(Tokenizer value) { m_Tokenizer = value; } /** * Returns the current tokenizer algorithm. * * @return the current tokenizer algorithm */ public Tokenizer getTokenizer() { return m_Tokenizer; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String tokenizerTipText() { return "The tokenizing algorithm to use on the strings."; } /** * Sorts an array.
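* (An in-place Shell sort in ascending order; determineDictionary() uses it to find the frequency cut-off that keeps roughly the requested number of words.)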
* * @param array * the array to sort */ private static void sortArray(int[] array) { int i, j, h, N = array.length - 1; for (h = 1; h <= N / 9; h = 3 * h + 1) ; for (; h > 0; h /= 3) { for (i = h + 1; i <= N; i++) { int v = array[i]; j = i; while (j > h && array[j - h] > v) { array[j] = array[j - h]; j -= h; } array[j] = v; } } } /** * determines the selected range. */ private void determineSelectedRange() { Instances inputFormat = getInputFormat(); // Calculate the default set of fields to convert if (m_SelectedRange == null) { StringBuffer fields = new StringBuffer(); for (int j = 0; j < inputFormat.numAttributes(); j++) { if (inputFormat.attribute(j).type() == Attribute.STRING) fields.append((j + 1) + ","); } m_SelectedRange = new Range(fields.toString()); } m_SelectedRange.setUpper(inputFormat.numAttributes() - 1); // Prevent the user from converting non-string fields StringBuffer fields = new StringBuffer(); for (int j = 0; j < inputFormat.numAttributes(); j++) { if (m_SelectedRange.isInRange(j) && inputFormat.attribute(j).type() == Attribute.STRING) fields.append((j + 1) + ","); } m_SelectedRange.setRanges(fields.toString()); m_SelectedRange.setUpper(inputFormat.numAttributes() - 1); // System.err.println("Selected Range: " + // getSelectedRange().getRanges()); } /** * determines the dictionary. */ private void determineDictionary() { // initialize stopwords Stopwords stopwords = new Stopwords(); if (getUseStoplist()) { try { if (getStopwords().exists() && !getStopwords().isDirectory()) stopwords.read(getStopwords()); } catch (Exception e) { e.printStackTrace(); } } // Operate on a per-class basis if class attribute is set int classInd = getInputFormat().classIndex(); int values = 1; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { values = getInputFormat().attribute(classInd).numValues(); } // TreeMap dictionaryArr [] = new TreeMap[values]; TreeMap[] dictionaryArr = new TreeMap[values]; for (int i = 0; i < values; i++) { dictionaryArr[i] = new TreeMap(); } // Make sure we know which fields to convert determineSelectedRange(); // Tokenize all training text into an orderedMap of "words". 
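// For each instance below: tokenize every selected string attribute, optionally
// lower-case each token, drop stopwords, stem what remains, and update term
// counts and per-document counts in the dictionary for the instance's class
// (one dictionary per class value unless -O is set).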
long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances()); for (int i = 0; i < getInputFormat().numInstances(); i++) { Instance instance = getInputFormat().instance(i); int vInd = 0; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { vInd = (int) instance.classValue(); } // Iterate through all relevant string attributes of the current // instance Hashtable h = new Hashtable(); for (int j = 0; j < instance.numAttributes(); j++) { if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { // Get tokenizer m_Tokenizer.tokenize(instance.stringValue(j)); // Iterate through tokens, perform stemming, and remove // stopwords // (if required) while (m_Tokenizer.hasMoreElements()) { String word = ((String) m_Tokenizer.nextElement()).intern(); if (this.m_lowerCaseTokens == true) word = word.toLowerCase(); String[] wordsArr = word.split(" "); StringBuilder stemmedStr = new StringBuilder(); for (String wordStr : wordsArr) { if (!this.m_useStoplist || !stopwords.is(wordStr)) { stemmedStr.append(m_Stemmer.stem(wordStr)); stemmedStr.append(" "); } } word = stemmedStr.toString().trim(); if (!(h.containsKey(word))) h.put(word, new Integer(0)); Count count = (Count) dictionaryArr[vInd].get(word); if (count == null) { dictionaryArr[vInd].put(word, new Count(1)); } else { count.count++; } } } } // updating the docCount for the words that have occurred in this // instance(document). Enumeration e = h.keys(); while (e.hasMoreElements()) { String word = (String) e.nextElement(); Count c = (Count) dictionaryArr[vInd].get(word); if (c != null) { c.docCount++; } else System.err.println( "Warning: A word should definitely be in the " + "dictionary. Please check the code"); } if (pruneRate > 0) { if (i % pruneRate == 0 && i > 0) { for (int z = 0; z < values; z++) { Vector d = new Vector(1000); Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); if (count.count <= 1) { d.add(word); } } Iterator iter = d.iterator(); while (iter.hasNext()) { String word = (String) iter.next(); dictionaryArr[z].remove(word); } } } } } // Figure out the minimum required word frequency int totalsize = 0; int prune[] = new int[values]; for (int z = 0; z < values; z++) { totalsize += dictionaryArr[z].size(); int array[] = new int[dictionaryArr[z].size()]; int pos = 0; Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); array[pos] = count.count; pos++; } // sort the array sortArray(array); if (array.length < m_WordsToKeep) { // if there aren't enough words, set the threshold to // minFreq prune[z] = m_minTermFreq; } else { // otherwise set it to be at least minFreq prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]); } } // Convert the dictionary into an attribute index // and create one attribute per word FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes()); // Add the non-converted attributes int classIndex = -1; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().classIndex() == i) { classIndex = attributes.size(); } attributes.addElement(getInputFormat().attribute(i).copy()); } } // Add the word vector attributes (eliminating duplicates that occur in multiple classes)
TreeMap newDictionary = new TreeMap(); int index = attributes.size(); for (int z = 0; z < values; z++) { Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); if (count.count >= prune[z]) { if (newDictionary.get(word) == null) { newDictionary.put(word, new Integer(index++)); attributes.addElement(new Attribute(m_Prefix + word)); } } } } // Compute document frequencies m_DocsCounts = new int[attributes.size()]; Iterator it = newDictionary.keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); int idx = ((Integer) newDictionary.get(word)).intValue(); int docsCount = 0; for (int j = 0; j < values; j++) { Count c = (Count) dictionaryArr[j].get(word); if (c != null) docsCount += c.docCount; } m_DocsCounts[idx] = docsCount; } // Trim vector and set instance variables attributes.trimToSize(); m_Dictionary = newDictionary; m_NumInstances = getInputFormat().numInstances(); // Set the filter's output format Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); } /** * Converts the instance w/o normalization. * * @param instance * the instance to convert * @param v * the vector to which the converted instance is appended * @return the index of the first word attribute (attributes before it are * copied from the input) */ private int convertInstancewoDocNorm(Instance instance, FastVector v) { // Convert the instance into a sorted set of indexes TreeMap contained = new TreeMap(); // Copy all non-converted attributes from input to output int firstCopy = 0; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().attribute(i).type() != Attribute.STRING && getInputFormat().attribute(i).type() != Attribute.RELATIONAL) { // Add simple nominal and numeric attributes directly if (instance.value(i) != 0.0) { contained.put(new Integer(firstCopy), new Double(instance.value(i))); } } else { if (instance.isMissing(i)) { contained.put(new Integer(firstCopy), new Double(Double.NaN)); } else if (getInputFormat().attribute(i).type() == Attribute.STRING) { // If this is a string attribute, we have to first add // this value to the range of possible values, then add // its new internal index. if (outputFormatPeek().attribute(firstCopy).numValues() == 0) { // Note that the first string value in a // SparseInstance doesn't get printed.
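// (A dummy value is therefore added first, so no real string value occupies index 0.)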
outputFormatPeek().attribute(firstCopy) .addStringValue("Hack to defeat SparseInstance bug"); } int newIndex = outputFormatPeek().attribute(firstCopy) .addStringValue(instance.stringValue(i)); contained.put(new Integer(firstCopy), new Double(newIndex)); } else { // relational if (outputFormatPeek().attribute(firstCopy).numValues() == 0) { Instances relationalHeader = outputFormatPeek().attribute(firstCopy).relation(); // hack to defeat sparse instances bug outputFormatPeek().attribute(firstCopy).addRelation(relationalHeader); } int newIndex = outputFormatPeek().attribute(firstCopy) .addRelation(instance.relationalValue(i)); contained.put(new Integer(firstCopy), new Double(newIndex)); } } firstCopy++; } } for (int j = 0; j < instance.numAttributes(); j++) { // if ((getInputFormat().attribute(j).type() == Attribute.STRING) if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { m_Tokenizer.tokenize(instance.stringValue(j)); while (m_Tokenizer.hasMoreElements()) { String word = (String) m_Tokenizer.nextElement(); if (this.m_lowerCaseTokens == true) word = word.toLowerCase(); word = m_Stemmer.stem(word); Integer index = (Integer) m_Dictionary.get(word); if (index != null) { if (m_OutputCounts) { // Separate if here rather than // two lines down to avoid // hashtable lookup Double count = (Double) contained.get(index); if (count != null) { contained.put(index, new Double(count.doubleValue() + 1.0)); } else { contained.put(index, new Double(1)); } } else { contained.put(index, new Double(1)); } } } } } // Doing TFTransform if (m_TFTransform == true) { Iterator it = contained.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { Integer index = (Integer) it.next(); if (index.intValue() >= firstCopy) { double val = ((Double) contained.get(index)).doubleValue(); val = Math.log(val + 1); contained.put(index, new Double(val)); } } } // Doing IDFTransform if (m_IDFTransform == true) { Iterator it = contained.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { Integer index = (Integer) it.next(); if (index.intValue() >= firstCopy) { double val = ((Double) contained.get(index)).doubleValue(); val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]); contained.put(index, new Double(val)); } } } // Convert the set to structures needed to create a sparse instance. double[] values = new double[contained.size()]; int[] indices = new int[contained.size()]; Iterator it = contained.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { Integer index = (Integer) it.next(); Double value = (Double) contained.get(index); values[i] = value.doubleValue(); indices[i] = index.intValue(); } Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes()); inst.setDataset(outputFormatPeek()); v.addElement(inst); return firstCopy; } /** * Normalizes given instance to average doc length (only the newly * constructed attributes). * * @param inst * the instance to normalize * @param firstCopy * @throws Exception * if avg. 
doc length not set */ private void normalizeInstance(Instance inst, int firstCopy) throws Exception { double docLength = 0; if (m_AvgDocLength < 0) { throw new Exception("Average document length not set."); } // Compute length of document vector for (int j = 0; j < inst.numValues(); j++) { if (inst.index(j) >= firstCopy) { docLength += inst.valueSparse(j) * inst.valueSparse(j); } } docLength = Math.sqrt(docLength); // Normalize document vector for (int j = 0; j < inst.numValues(); j++) { if (inst.index(j) >= firstCopy) { double val = inst.valueSparse(j) * m_AvgDocLength / docLength; inst.setValueSparse(j, val); if (val == 0) { System.err.println("setting value " + inst.index(j) + " to zero."); j--; } } } } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 9004 $"); } /** * Main method for testing this class. * * @param argv * should contain arguments to the filter: use -h for help */ public static void main(String[] argv) { runFilter(new StringToWordVector(), argv); } }
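The listing ends with a main method that delegates to Filter.runFilter, so the filter can also be applied from the command line using Weka's standard batch options; a typical invocation (jar path and file names hypothetical) would be:

java -cp weka.jar:. com.esda.util.StringToWordVector -i reviews.arff -o vectors.arff -C -L -W 1000

Here -i and -o name the input and output ARFF files and are handled by the Weka filter framework, while -C, -L and -W are the filter options documented in the class Javadoc above.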