source: branches/MetisMQI/src/main/java/weka/core/tokenizers/NGramTokenizer.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 7.9 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * NGramTokenizer.java
19 * Copyright (C) 2007 University of Waikato
20 */
21
22package weka.core.tokenizers;
23
24import weka.core.Option;
25import weka.core.RevisionUtils;
26import weka.core.Utils;
27
28import java.util.Enumeration;
29import java.util.LinkedList;
30import java.util.Vector;
31
32/**
33 <!-- globalinfo-start -->
34 * Splits a string into an n-gram with min and max grams.
35 * <p/>
36 <!-- globalinfo-end -->
37 *
38 <!-- options-start -->
39 * Valid options are: <p/>
40 *
41 * <pre> -delimiters &lt;value&gt;
42 *  The delimiters to use
43 *  (default ' \r\n\t.,;:'"()?!').</pre>
44 *
45 * <pre> -max &lt;int&gt;
46 *  The max size of the Ngram (default = 3).</pre>
47 *
48 * <pre> -min &lt;int&gt;
49 *  The min size of the Ngram (default = 1).</pre>
50 *
51 <!-- options-end -->
52 *
53 * @author  Sebastian Germesin (sebastian.germesin@dfki.de)
54 * @author  FracPete (fracpete at waikato dot ac dot nz)
55 * @version $Revision: 5953 $
56 */
57public class NGramTokenizer
58  extends CharacterDelimitedTokenizer {
59
60  /** for serialization */
61  private static final long serialVersionUID = -2181896254171647219L;
62
63  /** the maximum number of N */
64  protected int m_NMax = 3;
65 
66  /** the minimum number of N */
67  protected int m_NMin = 1;
68 
69  /** the current length of the N-grams */
70  protected int m_N;
71 
72  /** the number of strings available */
73  protected int m_MaxPosition;
74 
75  /** the current position for returning elements */
76  protected int m_CurrentPosition;
77 
78  /** all the available grams */
79  protected String[] m_SplitString;
80 
81  /**
82   * Returns a string describing the stemmer
83   *
84   * @return            a description suitable for displaying in the
85   *                    explorer/experimenter gui
86   */
87  public String globalInfo() {
88    return "Splits a string into an n-gram with min and max grams.";
89  }
90 
91  /**
92   * Returns an enumeration of all the available options..
93   *
94   * @return            an enumeration of all available options.
95   */
96  public Enumeration listOptions() {
97    Vector<Option>      result;
98    Enumeration enm;
99   
100    result = new Vector<Option>();
101   
102    enm = super.listOptions();
103    while (enm.hasMoreElements())
104      result.addElement((Option)enm.nextElement());
105
106    result.addElement(new Option(
107        "\tThe max size of the Ngram (default = 3).",
108        "max", 1, "-max <int>"));
109
110    result.addElement(new Option(
111        "\tThe min size of the Ngram (default = 1).",
112        "min", 1, "-min <int>"));
113   
114    return result.elements();
115  }
116 
117  /**
118   * Gets the current option settings for the OptionHandler.
119   *
120   * @return            the list of current option settings as an array of
121   *                    strings
122   */
123  public String[] getOptions() {
124    Vector<String>      result;
125    String[]            options;
126    int                 i;
127   
128    result = new Vector<String>();
129   
130    options = super.getOptions();
131    for (i = 0; i < options.length; i++)
132      result.add(options[i]);
133   
134    result.add("-max");
135    result.add("" + getNGramMaxSize());
136
137    result.add("-min");
138    result.add("" + getNGramMinSize());
139
140    return result.toArray(new String[result.size()]);
141  }
142
143  /**
144   * Parses a given list of options. <p/>
145   *
146   <!-- options-start -->
147   * Valid options are: <p/>
148   *
149   * <pre> -delimiters &lt;value&gt;
150   *  The delimiters to use
151   *  (default ' \r\n\t.,;:'"()?!').</pre>
152   *
153   * <pre> -max &lt;int&gt;
154   *  The max size of the Ngram (default = 3).</pre>
155   *
156   * <pre> -min &lt;int&gt;
157   *  The min size of the Ngram (default = 1).</pre>
158   *
159   <!-- options-end -->
160   *
161   * @param options     the list of options as an array of strings
162   * @throws Exception  if an option is not supported
163   */
164  public void setOptions(String[] options) throws Exception {
165    String      value;
166   
167    super.setOptions(options);
168
169    value = Utils.getOption("max", options);
170    if (value.length() != 0)
171      setNGramMaxSize(Integer.parseInt(value));
172    else
173      setNGramMaxSize(3);
174
175    value = Utils.getOption("min", options);
176    if (value.length() != 0)
177      setNGramMinSize(Integer.parseInt(value));
178    else
179      setNGramMinSize(1);
180  }
181 
182  /**
183   * Gets the max N of the NGram.
184   *
185   * @return            the size (N) of the NGram.
186   */
187  public int getNGramMaxSize() {
188    return m_NMax;
189  }
190
191  /**
192   * Sets the max size of the Ngram.
193   *
194   * @param value       the size of the NGram.
195   */
196  public void setNGramMaxSize(int value) {
197    if (value < 1)
198      m_NMax = 1;
199    else
200      m_NMax = value;
201  }
202
203  /**
204   * Returns the tip text for this property.
205   *
206   * @return            tip text for this property suitable for
207   *                    displaying in the explorer/experimenter gui
208   */
209  public String NGramMaxSizeTipText() {
210    return "The max N of the NGram.";
211  }
212
213  /**
214   * Sets the min size of the Ngram.
215   *
216   * @param value       the size of the NGram.
217   */
218  public void setNGramMinSize(int value) {
219    if (value < 1)
220      m_NMin = 1;
221    else
222      m_NMin = value;
223  }
224
225  /**
226   * Gets the min N of the NGram.
227   *
228   * @return            the size (N) of the NGram.
229   */
230  public int getNGramMinSize() {
231    return m_NMin;
232  }
233
234  /**
235   * Returns the tip text for this property.
236   *
237   * @return            tip text for this property suitable for
238   *                    displaying in the explorer/experimenter gui
239   */
240  public String NGramMinSizeTipText() {
241    return "The min N of the NGram.";
242  }
243
244  /**
245   * returns true if there's more elements available
246   *
247   * @return            true if there are more elements available
248   */
249  public boolean hasMoreElements() {
250    return (m_CurrentPosition < m_MaxPosition && 
251        m_N - 1 + m_CurrentPosition < m_MaxPosition && 
252        m_N >= m_NMin);
253  }
254 
255  /**
256   * Returns N-grams and also (N-1)-grams and .... and 1-grams.
257   *
258   * @return            the next element
259   */
260  public Object nextElement() {
261    String retValue = "";
262   
263    for (int i = 0; i < m_N && i + m_CurrentPosition < m_MaxPosition; i++)
264      retValue += " " + m_SplitString[m_CurrentPosition + i];
265   
266    m_CurrentPosition++;
267   
268    if (m_CurrentPosition + m_N - 1 == m_MaxPosition) {
269      m_CurrentPosition = 0;
270      m_N--;
271    }
272
273    return retValue.trim();
274  }
275
276  /**
277   * filters out empty strings in m_SplitString and
278   * replaces m_SplitString with the cleaned version.
279   *
280   * @see #m_SplitString
281   */
282  protected void filterOutEmptyStrings() {
283    String[] newSplit;
284    LinkedList<String> clean = new LinkedList<String>();
285
286    for (int i = 0; i < m_SplitString.length; i++) {
287      if (!m_SplitString[i].equals(""))
288        clean.add(m_SplitString[i]);
289    }
290
291    newSplit = new String[clean.size()];
292    for (int i = 0; i < clean.size(); i++) 
293      newSplit[i] = clean.get(i);
294
295    m_SplitString = newSplit;
296  }
297 
298  /**
299   * Sets the string to tokenize. Tokenization happens immediately.
300   *
301   * @param s           the string to tokenize
302   */
303  public void tokenize(String s) {
304    m_N           = m_NMax;
305    m_SplitString = s.split("[" + getDelimiters() + "]");
306   
307    filterOutEmptyStrings();
308
309    m_CurrentPosition = 0;
310    m_MaxPosition     = m_SplitString.length;
311  }
312 
313  /**
314   * Returns the revision string.
315   *
316   * @return            the revision
317   */
318  public String getRevision() {
319    return RevisionUtils.extract("$Revision: 5953 $");
320  }
321
322  /**
323   * Runs the tokenizer with the given options and strings to tokenize.
324   * The tokens are printed to stdout.
325   *
326   * @param args        the commandline options and strings to tokenize
327   */
328  public static void main(String[] args) {
329    runTokenizer(new NGramTokenizer(), args);
330  }
331}
332
Note: See TracBrowser for help on using the repository browser.