source: branches/MetisMQI/src/main/java/weka/core/stemmers/SnowballStemmer.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 12.8 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * SnowballStemmer.java
19 * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.core.stemmers;
24
25import weka.core.ClassDiscovery;
26import weka.core.Option;
27import weka.core.OptionHandler;
28import weka.core.RevisionUtils;
29import weka.core.Utils;
30import weka.gui.GenericObjectEditor;
31
32import java.lang.reflect.Method;
33import java.util.Enumeration;
34import java.util.Vector;
35
36/**
37 <!-- globalinfo-start -->
38 * A wrapper class for the Snowball stemmers. Only available if the Snowball classes are in the classpath.<br/>
39 * If the class discovery is not dynamic, i.e., the property 'UseDynamic' in the props file 'weka/gui/GenericPropertiesCreator.props' is 'false', then the property 'org.tartarus.snowball.SnowballProgram' in the 'weka/gui/GenericObjectEditor.props' file has to be uncommented as well. If necessary you have to discover and fill in the snowball stemmers manually. You can use the 'weka.core.ClassDiscovery' for this:<br/>
40 *   java weka.core.ClassDiscovery org.tartarus.snowball.SnowballProgram org.tartarus.snowball.ext<br/>
41 * <br/>
42 * For more information visit these web sites:<br/>
43 *   http://weka.wikispaces.com/Stemmers<br/>
44 *   http://snowball.tartarus.org/<br/>
45 * <p/>
46 <!-- globalinfo-end -->
47 *
48 <!-- options-start -->
49 * Valid options are: <p/>
50 *
51 * <pre> -S &lt;name&gt;
52 *  The name of the snowball stemmer (default 'porter').
53 *  available stemmers:
54 *     danish, dutch, english, finnish, french, german, italian,
55 *     norwegian, porter, portuguese, russian, spanish, swedish
56 * </pre>
57 *
58 <!-- options-end -->
59 *
60 * @author    FracPete (fracpete at waikato dot ac dot nz)
61 * @version   $Revision: 5953 $
62 */
63public class SnowballStemmer 
64  implements Stemmer, OptionHandler {
65 
66  /** for serialization. */
67  static final long serialVersionUID = -6111170431963015178L;
68 
69  /** the package name for snowball. */
70  public final static String PACKAGE = "org.tartarus.snowball";
71 
72  /** the package name where the stemmers are located. */
73  public final static String PACKAGE_EXT = PACKAGE + ".ext";
74
75  /** the snowball program, all stemmers are derived from. */
76  protected final static String SNOWBALL_PROGRAM = PACKAGE + ".SnowballProgram";
77 
78  /** whether the snowball stemmers are in the Classpath. */
79  protected static boolean m_Present = false;
80
81  /** contains the all the found stemmers (language names). */
82  protected static Vector<String> m_Stemmers;
83
84  /** the current stemmer. */
85  protected Object m_Stemmer;
86
87  /** the stem method. */
88  protected transient Method m_StemMethod;
89
90  /** the setCurrent method. */
91  protected transient Method m_SetCurrentMethod;
92
93  /** the getCurrent method. */
94  protected transient Method m_GetCurrentMethod;
95   
96  /** check for Snowball statically (needs only to be done once) */
97  static {
98    checkForSnowball();
99  }
100
101  /**
102   * initializes the stemmer ("porter").
103   */
104  public SnowballStemmer() {
105    this("porter");
106    initStemmers();
107  }
108
109  /**
110   * initializes the stemmer with the given stemmer.
111   *
112   * @param name        the name of the stemmer
113   */
114  public SnowballStemmer(String name) {
115    super();
116     
117    setStemmer(name);
118  }
119
120  /**
121   * checks whether Snowball is present in the classpath.
122   */
123  private static void checkForSnowball() {
124    try {
125      Class.forName(SNOWBALL_PROGRAM);
126      m_Present = true;
127    }
128    catch (Exception e) {
129      m_Present = false;
130    }
131  }
132
133  /**
134   * Returns a string describing the stemmer.
135   *
136   * @return a description suitable for
137   *         displaying in the explorer/experimenter gui
138   */
139  public String globalInfo() {
140    return 
141        "A wrapper class for the Snowball stemmers. Only available if the "
142      + "Snowball classes are in the classpath.\n"
143      + "If the class discovery is not dynamic, i.e., the property 'UseDynamic' "
144      + "in the props file 'weka/gui/GenericPropertiesCreator.props' is 'false', "
145      + "then the property 'org.tartarus.snowball.SnowballProgram' in the "
146      + "'weka/gui/GenericObjectEditor.props' file has to be uncommented "
147      + "as well. If necessary you have to discover and fill in the snowball "
148      + "stemmers manually. You can use the 'weka.core.ClassDiscovery' for this:\n"
149      + "  java weka.core.ClassDiscovery org.tartarus.snowball.SnowballProgram org.tartarus.snowball.ext\n"
150      + "\n"
151      + "For more information visit these web sites:\n"
152      + "  http://weka.wikispaces.com/Stemmers\n"
153      + "  http://snowball.tartarus.org/\n";
154  }
155 
156  /**
157   * Returns an enumeration describing the available options.
158   *
159   * @return an enumeration of all the available options.
160   */
161  public Enumeration listOptions() {
162    Vector<Option>        result;
163   
164    result = new Vector<Option>();
165   
166    result.addElement(new Option(
167        "\tThe name of the snowball stemmer (default 'porter').\n"
168        + "\tavailable stemmers:\n" 
169        + getStemmerList(65, "\t   "),
170        "S", 1, "-S <name>"));
171   
172    return result.elements();
173  }
174 
175  /**
176   * Parses the options. <p/>
177   *
178   <!-- options-start -->
179   * Valid options are: <p/>
180   *
181   * <pre> -S &lt;name&gt;
182   *  The name of the snowball stemmer (default 'porter').
183   *  available stemmers:
184   *     danish, dutch, english, finnish, french, german, italian,
185   *     norwegian, porter, portuguese, russian, spanish, swedish
186   * </pre>
187   *
188   <!-- options-end -->
189   *
190   * @param options     the options to parse
191   * @throws Exception  if parsing fails
192   */
193  public void setOptions(String[] options) throws Exception {
194    String      tmpStr;
195   
196    tmpStr = Utils.getOption('S', options);
197    if (tmpStr.length() != 0)
198      setStemmer(tmpStr);
199    else
200      setStemmer("porter");
201  }
202 
203  /**
204   * Gets the current settings of the classifier.
205   *
206   * @return an array of strings suitable for passing to setOptions
207   */
208  public String[] getOptions() {
209    Vector<String>        result;
210   
211    result  = new Vector<String>();
212   
213    if (getStemmer() != null) {
214      result.add("-S");
215      result.add("" + getStemmer());
216    }
217   
218    return (String[]) result.toArray(new String[result.size()]);
219  }
220
221  /**
222   * extracts the stemmer name form the classname.
223   *
224   * @param classname     the full classname of the stemmer
225   * @return              the name of the stemmer
226   */
227  private static String getStemmerName(String classname) {
228    return classname.replaceAll(".*\\.", "").replaceAll("Stemmer$", "");
229  }
230
231  /**
232   * returns the full classname of the stemmer.
233   *
234   * @param name          the name of the stemmer
235   * @return              the full classname of the stemmer
236   * @see                 #PACKAGE_EXT
237   */
238  private static String getStemmerClassname(String name) {
239    return PACKAGE_EXT + "." + name + "Stemmer";
240  }
241
242  /**
243   * retrieves the language names of the availabel stemmers.
244   */
245  private static void initStemmers() {
246    Vector        classnames;
247    int           i;
248   
249    if (m_Stemmers != null)
250      return;
251   
252    m_Stemmers = new Vector<String>();
253   
254    if (!m_Present)
255      return;
256
257    classnames = GenericObjectEditor.getClassnames(SNOWBALL_PROGRAM);
258    // try dynamic discovery if not in props file
259    if (classnames.size() == 0) {
260      classnames = ClassDiscovery.find(SNOWBALL_PROGRAM, PACKAGE_EXT);
261      for (i = 0; i < classnames.size(); i++)
262        m_Stemmers.add(getStemmerName(classnames.get(i).toString()));
263    }
264  }
265
266  /**
267   * returns whether Snowball is present or not, i.e. whether the classes are
268   * in the classpath or not
269   *
270   * @return whether Snowball is available
271   */
272  public static boolean isPresent() {
273    return m_Present;
274  }
275
276  /**
277   * returns an enumeration over all currently stored stemmer names.
278   *
279   * @return all available stemmers
280   */
281  public static Enumeration listStemmers() {
282    initStemmers();
283   
284    return m_Stemmers.elements();
285  }
286
287  /**
288   * generates a comma list of the available stemmers.
289   *
290   * @param lineLength    the max line length, before a linefeed is inserted
291   *                      (0 is unlimited)
292   * @param indention     the indention of a line
293   * @return              the generated list
294   */
295  private static String getStemmerList(int lineLength, String indention) {
296    String        result;
297    Enumeration   enm;
298    String        name;
299    String        line;
300   
301    result = "";
302    line   = "";
303    enm    = listStemmers();
304    while (enm.hasMoreElements()) {
305      name = enm.nextElement().toString();
306      if (line.length() > 0)
307        line += ", ";
308      if ( (lineLength > 0) && (line.length() + name.length() > lineLength) ) {
309        result += indention + line + "\n";
310        line    = "";
311      }
312      line += name;
313    }
314
315    if (line.length() > 0)
316      result += indention + line + "\n";
317   
318    return result;
319  }
320
321  /**
322   * returns the name of the current stemmer, null if none is set.
323   *
324   * @return the name of the stemmer
325   */
326  public String getStemmer() {
327    initStemmers();
328   
329    if (m_Stemmer == null)
330      return null;
331    else
332      return getStemmerName(m_Stemmer.getClass().getName());
333  }
334
335  /**
336   * sets the stemmer with the given name, e.g., "porter".
337   *
338   * @param name        the name of the stemmer, e.g., "porter"
339   */
340  public void setStemmer(String name) {
341    Class<?>       snowballClass;
342    Class[]     argClasses;
343   
344    initStemmers();
345   
346    if (m_Stemmers.contains(name)) {
347      try {
348        snowballClass = Class.forName(getStemmerClassname(name));
349        m_Stemmer     = snowballClass.newInstance();
350
351        // methods
352        argClasses         = new Class[0];
353        m_StemMethod       = snowballClass.getMethod("stem", argClasses);
354       
355        argClasses         = new Class[1];
356        argClasses[0]      = String.class;
357        m_SetCurrentMethod = snowballClass.getMethod("setCurrent", argClasses);
358       
359        argClasses         = new Class[0];
360        m_GetCurrentMethod = snowballClass.getMethod("getCurrent", argClasses);
361      }
362      catch (Exception e) {
363        System.out.println(
364              "Error initializing stemmer '" + name + "'!"
365            + e.getMessage());
366        m_Stemmer = null;
367      }
368    }
369    else {
370      System.err.println("Stemmer '" + name + "' unknown!");
371      m_Stemmer = null;
372    }
373  }
374
375  /**
376   * Returns the tip text for this property.
377   *
378   * @return tip text for this property suitable for
379   * displaying in the explorer/experimenter gui
380   */
381  public String stemmerTipText() {
382    return "The Snowball stemmer to use, available: " + getStemmerList(0, "");
383  }
384
385  /**
386   * Returns the word in its stemmed form.
387   *
388   * @param word      the unstemmed word
389   * @return          the stemmed word
390   */
391  public String stem(String word) {
392    String      result;
393    Object[]    args;
394   
395    if (m_Stemmer == null) {
396      result = new String(word);
397    }
398    else {
399      // after de-serialization, the methods are null and need to be
400      // re-initialized
401      if (m_SetCurrentMethod == null)
402        setStemmer(getStemmer());
403     
404      try {
405        // set word
406        args    = new Object[1];
407        args[0] = word;
408        m_SetCurrentMethod.invoke(m_Stemmer, args);
409
410        // stem word
411        args = new Object[0];
412        m_StemMethod.invoke(m_Stemmer, args);
413
414        // get word
415        args   = new Object[0];
416        result = (String) m_GetCurrentMethod.invoke(m_Stemmer, args);
417      }
418      catch (Exception e) {
419        e.printStackTrace();
420        result = word;
421      }
422    }
423     
424    return result;
425  }
426
427  /**
428   * returns a string representation of the stemmer.
429   *
430   * @return a string representation of the stemmer
431   */
432  public String toString() {
433    String      result;
434
435    result  = getClass().getName();
436    result += " " + Utils.joinOptions(getOptions());
437
438    return result.trim();
439  }
440 
441  /**
442   * Returns the revision string.
443   *
444   * @return            the revision
445   */
446  public String getRevision() {
447    return RevisionUtils.extract("$Revision: 5953 $");
448  }
449
450  /**
451   * Runs the stemmer with the given options.
452   *
453   * @param args      the options
454   */
455  public static void main(String[] args) {
456    try {
457      Stemming.useStemmer(new SnowballStemmer(), args);
458    }
459    catch (Exception e) {
460      e.printStackTrace();
461    }
462  }
463}
Note: See TracBrowser for help on using the repository browser.