Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * NGramTokenizer.java * Copyright (C) 2007-2012 University of Waikato */ package weka.core.tokenizers; import java.util.Collections; import java.util.Enumeration; import java.util.LinkedList; import java.util.Vector; import weka.core.Option; import weka.core.RevisionUtils; import weka.core.Utils; /** * <!-- globalinfo-start --> Splits a string into an n-gram with min and max * grams. * <p/> * <!-- globalinfo-end --> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -delimiters <value> * The delimiters to use * (default ' \r\n\t.,;:'"()?!'). * </pre> * * <pre> * -max <int> * The max size of the Ngram (default = 3). * </pre> * * <pre> * -min <int> * The min size of the Ngram (default = 1). * </pre> * * <!-- options-end --> * * @author Sebastian Germesin (sebastian.germesin@dfki.de) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision$ */ public class NGramTokenizer extends CharacterDelimitedTokenizer { /** for serialization */ private static final long serialVersionUID = -2181896254171647219L; /** the maximum number of N */ protected int m_NMax = 3; /** the minimum number of N */ protected int m_NMin = 1; /** the current length of the N-grams */ protected int m_N; /** the number of strings available */ protected int m_MaxPosition; /** the current position for returning elements */ protected int m_CurrentPosition; /** all the available grams */ protected String[] m_SplitString; /** * Returns a string describing the stemmer * * @return a description suitable for displaying in the explorer/experimenter * gui */ @Override public String globalInfo() { return "Splits a string into an n-gram with min and max grams."; } /** * Returns an enumeration of all the available options.. * * @return an enumeration of all available options. */ @Override public Enumeration<Option> listOptions() { Vector<Option> result = new Vector<Option>(); result.addElement(new Option("\tThe max size of the Ngram (default = 3).", "max", 1, "-max <int>")); result.addElement(new Option("\tThe min size of the Ngram (default = 1).", "min", 1, "-min <int>")); result.addAll(Collections.list(super.listOptions())); return result.elements(); } /** * Gets the current option settings for the OptionHandler. * * @return the list of current option settings as an array of strings */ @Override public String[] getOptions() { Vector<String> result = new Vector<String>(); result.add("-max"); result.add("" + getNGramMaxSize()); result.add("-min"); result.add("" + getNGramMinSize()); Collections.addAll(result, super.getOptions()); return result.toArray(new String[result.size()]); } /** * Parses a given list of options. * <p/> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -delimiters <value> * The delimiters to use * (default ' \r\n\t.,;:'"()?!'). * </pre> * * <pre> * -max <int> * The max size of the Ngram (default = 3). * </pre> * * <pre> * -min <int> * The min size of the Ngram (default = 1). * </pre> * * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String value; value = Utils.getOption("max", options); if (value.length() != 0) { setNGramMaxSize(Integer.parseInt(value)); } else { setNGramMaxSize(3); } value = Utils.getOption("min", options); if (value.length() != 0) { setNGramMinSize(Integer.parseInt(value)); } else { setNGramMinSize(1); } super.setOptions(options); } /** * Gets the max N of the NGram. * * @return the size (N) of the NGram. */ public int getNGramMaxSize() { return m_NMax; } /** * Sets the max size of the Ngram. * * @param value the size of the NGram. */ public void setNGramMaxSize(int value) { if (value < 1) { m_NMax = 1; } else { m_NMax = value; } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String NGramMaxSizeTipText() { return "The max N of the NGram."; } /** * Sets the min size of the Ngram. * * @param value the size of the NGram. */ public void setNGramMinSize(int value) { if (value < 1) { m_NMin = 1; } else { m_NMin = value; } } /** * Gets the min N of the NGram. * * @return the size (N) of the NGram. */ public int getNGramMinSize() { return m_NMin; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String NGramMinSizeTipText() { return "The min N of the NGram."; } /** * returns true if there's more elements available * * @return true if there are more elements available */ @Override public boolean hasMoreElements() { // return (m_CurrentPosition < m_MaxPosition // && m_N - 1 + m_CurrentPosition < m_MaxPosition && m_N >= m_NMin); // return (m_N >= m_NMin); } /** * Returns N-grams and also (N-1)-grams and .... and 1-grams. * * @return the next element */ @Override public String nextElement() { String retValue = ""; // for (int i = 0; i < m_N && i + m_CurrentPosition < m_MaxPosition; i++) { // retValue += " " + m_SplitString[m_CurrentPosition + i]; // } for (int i = 0; i < m_N; i++) { retValue += " " + m_SplitString[m_CurrentPosition + i]; } m_CurrentPosition++; if (m_CurrentPosition + m_N - 1 == m_MaxPosition) { m_CurrentPosition = 0; m_N--; } return retValue.trim(); } /** * filters out empty strings in m_SplitString and replaces m_SplitString with * the cleaned version. * * @see #m_SplitString */ protected void filterOutEmptyStrings() { String[] newSplit; LinkedList<String> clean = new LinkedList<String>(); for (int i = 0; i < m_SplitString.length; i++) { if (!m_SplitString[i].equals("")) { clean.add(m_SplitString[i]); } } newSplit = new String[clean.size()]; for (int i = 0; i < clean.size(); i++) { newSplit[i] = clean.get(i); } m_SplitString = newSplit; } /** * Sets the string to tokenize. Tokenization happens immediately. * * @param s the string to tokenize */ @Override public void tokenize(String s) { m_N = m_NMax; m_SplitString = s.split("[" + getDelimiters() + "]"); filterOutEmptyStrings(); m_CurrentPosition = 0; m_MaxPosition = m_SplitString.length; if (m_SplitString.length < m_NMax) { m_N = m_SplitString.length; } } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } /** * Runs the tokenizer with the given options and strings to tokenize. The * tokens are printed to stdout. * * @param args the commandline options and strings to tokenize */ public static void main(String[] args) { runTokenizer(new NGramTokenizer(), args); } }