| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * SnowballStemmer.java |
|---|
| 19 | * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand |
|---|
| 20 | * |
|---|
| 21 | */ |
|---|
| 22 | |
|---|
| 23 | package weka.core.stemmers; |
|---|
| 24 | |
|---|
| 25 | import weka.core.ClassDiscovery; |
|---|
| 26 | import weka.core.Option; |
|---|
| 27 | import weka.core.OptionHandler; |
|---|
| 28 | import weka.core.RevisionUtils; |
|---|
| 29 | import weka.core.Utils; |
|---|
| 30 | import weka.gui.GenericObjectEditor; |
|---|
| 31 | |
|---|
| 32 | import java.lang.reflect.Method; |
|---|
| 33 | import java.util.Enumeration; |
|---|
| 34 | import java.util.Vector; |
|---|
| 35 | |
|---|
| 36 | /** |
|---|
| 37 | <!-- globalinfo-start --> |
|---|
| 38 | * A wrapper class for the Snowball stemmers. Only available if the Snowball classes are in the classpath.<br/> |
|---|
| 39 | * If the class discovery is not dynamic, i.e., the property 'UseDynamic' in the props file 'weka/gui/GenericPropertiesCreator.props' is 'false', then the property 'org.tartarus.snowball.SnowballProgram' in the 'weka/gui/GenericObjectEditor.props' file has to be uncommented as well. If necessary you have to discover and fill in the snowball stemmers manually. You can use the 'weka.core.ClassDiscovery' for this:<br/> |
|---|
| 40 | * java weka.core.ClassDiscovery org.tartarus.snowball.SnowballProgram org.tartarus.snowball.ext<br/> |
|---|
| 41 | * <br/> |
|---|
| 42 | * For more information visit these web sites:<br/> |
|---|
| 43 | * http://weka.wikispaces.com/Stemmers<br/> |
|---|
| 44 | * http://snowball.tartarus.org/<br/> |
|---|
| 45 | * <p/> |
|---|
| 46 | <!-- globalinfo-end --> |
|---|
| 47 | * |
|---|
| 48 | <!-- options-start --> |
|---|
| 49 | * Valid options are: <p/> |
|---|
| 50 | * |
|---|
| 51 | * <pre> -S &lt;name&gt; |
|---|
| 52 | * The name of the snowball stemmer (default 'porter'). |
|---|
| 53 | * available stemmers: |
|---|
| 54 | * danish, dutch, english, finnish, french, german, italian, |
|---|
| 55 | * norwegian, porter, portuguese, russian, spanish, swedish |
|---|
| 56 | * </pre> |
|---|
| 57 | * |
|---|
| 58 | <!-- options-end --> |
|---|
| 59 | * |
|---|
| 60 | * @author FracPete (fracpete at waikato dot ac dot nz) |
|---|
| 61 | * @version $Revision: 5953 $ |
|---|
| 62 | */ |
|---|
| 63 | public class SnowballStemmer |
|---|
| 64 | implements Stemmer, OptionHandler { |
|---|
| 65 | |
|---|
| 66 | /** for serialization. */ |
|---|
| 67 | static final long serialVersionUID = -6111170431963015178L; |
|---|
| 68 | |
|---|
| 69 | /** the package name for snowball. */ |
|---|
| 70 | public final static String PACKAGE = "org.tartarus.snowball"; |
|---|
| 71 | |
|---|
| 72 | /** the package name where the stemmers are located. */ |
|---|
| 73 | public final static String PACKAGE_EXT = PACKAGE + ".ext"; |
|---|
| 74 | |
|---|
| 75 | /** the snowball program, all stemmers are derived from. */ |
|---|
| 76 | protected final static String SNOWBALL_PROGRAM = PACKAGE + ".SnowballProgram"; |
|---|
| 77 | |
|---|
| 78 | /** whether the snowball stemmers are in the Classpath. */ |
|---|
| 79 | protected static boolean m_Present = false; |
|---|
| 80 | |
|---|
| 81 | /** contains the all the found stemmers (language names). */ |
|---|
| 82 | protected static Vector<String> m_Stemmers; |
|---|
| 83 | |
|---|
| 84 | /** the current stemmer. */ |
|---|
| 85 | protected Object m_Stemmer; |
|---|
| 86 | |
|---|
| 87 | /** the stem method. */ |
|---|
| 88 | protected transient Method m_StemMethod; |
|---|
| 89 | |
|---|
| 90 | /** the setCurrent method. */ |
|---|
| 91 | protected transient Method m_SetCurrentMethod; |
|---|
| 92 | |
|---|
| 93 | /** the getCurrent method. */ |
|---|
| 94 | protected transient Method m_GetCurrentMethod; |
|---|
| 95 | |
|---|
| 96 | /** check for Snowball statically (needs only to be done once) */ |
|---|
| 97 | static { |
|---|
| 98 | checkForSnowball(); |
|---|
| 99 | } |
|---|
| 100 | |
|---|
| 101 | /** |
|---|
| 102 | * initializes the stemmer ("porter"). |
|---|
| 103 | */ |
|---|
| 104 | public SnowballStemmer() { |
|---|
| 105 | this("porter"); |
|---|
| 106 | initStemmers(); |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | /** |
|---|
| 110 | * initializes the stemmer with the given stemmer. |
|---|
| 111 | * |
|---|
| 112 | * @param name the name of the stemmer |
|---|
| 113 | */ |
|---|
| 114 | public SnowballStemmer(String name) { |
|---|
| 115 | super(); |
|---|
| 116 | |
|---|
| 117 | setStemmer(name); |
|---|
| 118 | } |
|---|
| 119 | |
|---|
| 120 | /** |
|---|
| 121 | * checks whether Snowball is present in the classpath. |
|---|
| 122 | */ |
|---|
| 123 | private static void checkForSnowball() { |
|---|
| 124 | try { |
|---|
| 125 | Class.forName(SNOWBALL_PROGRAM); |
|---|
| 126 | m_Present = true; |
|---|
| 127 | } |
|---|
| 128 | catch (Exception e) { |
|---|
| 129 | m_Present = false; |
|---|
| 130 | } |
|---|
| 131 | } |
|---|
| 132 | |
|---|
| 133 | /** |
|---|
| 134 | * Returns a string describing the stemmer. |
|---|
| 135 | * |
|---|
| 136 | * @return a description suitable for |
|---|
| 137 | * displaying in the explorer/experimenter gui |
|---|
| 138 | */ |
|---|
| 139 | public String globalInfo() { |
|---|
| 140 | return |
|---|
| 141 | "A wrapper class for the Snowball stemmers. Only available if the " |
|---|
| 142 | + "Snowball classes are in the classpath.\n" |
|---|
| 143 | + "If the class discovery is not dynamic, i.e., the property 'UseDynamic' " |
|---|
| 144 | + "in the props file 'weka/gui/GenericPropertiesCreator.props' is 'false', " |
|---|
| 145 | + "then the property 'org.tartarus.snowball.SnowballProgram' in the " |
|---|
| 146 | + "'weka/gui/GenericObjectEditor.props' file has to be uncommented " |
|---|
| 147 | + "as well. If necessary you have to discover and fill in the snowball " |
|---|
| 148 | + "stemmers manually. You can use the 'weka.core.ClassDiscovery' for this:\n" |
|---|
| 149 | + " java weka.core.ClassDiscovery org.tartarus.snowball.SnowballProgram org.tartarus.snowball.ext\n" |
|---|
| 150 | + "\n" |
|---|
| 151 | + "For more information visit these web sites:\n" |
|---|
| 152 | + " http://weka.wikispaces.com/Stemmers\n" |
|---|
| 153 | + " http://snowball.tartarus.org/\n"; |
|---|
| 154 | } |
|---|
| 155 | |
|---|
| 156 | /** |
|---|
| 157 | * Returns an enumeration describing the available options. |
|---|
| 158 | * |
|---|
| 159 | * @return an enumeration of all the available options. |
|---|
| 160 | */ |
|---|
| 161 | public Enumeration listOptions() { |
|---|
| 162 | Vector<Option> result; |
|---|
| 163 | |
|---|
| 164 | result = new Vector<Option>(); |
|---|
| 165 | |
|---|
| 166 | result.addElement(new Option( |
|---|
| 167 | "\tThe name of the snowball stemmer (default 'porter').\n" |
|---|
| 168 | + "\tavailable stemmers:\n" |
|---|
| 169 | + getStemmerList(65, "\t "), |
|---|
| 170 | "S", 1, "-S <name>")); |
|---|
| 171 | |
|---|
| 172 | return result.elements(); |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | /** |
|---|
| 176 | * Parses the options. <p/> |
|---|
| 177 | * |
|---|
| 178 | <!-- options-start --> |
|---|
| 179 | * Valid options are: <p/> |
|---|
| 180 | * |
|---|
| 181 | * <pre> -S <name> |
|---|
| 182 | * The name of the snowball stemmer (default 'porter'). |
|---|
| 183 | * available stemmers: |
|---|
| 184 | * danish, dutch, english, finnish, french, german, italian, |
|---|
| 185 | * norwegian, porter, portuguese, russian, spanish, swedish |
|---|
| 186 | * </pre> |
|---|
| 187 | * |
|---|
| 188 | <!-- options-end --> |
|---|
| 189 | * |
|---|
| 190 | * @param options the options to parse |
|---|
| 191 | * @throws Exception if parsing fails |
|---|
| 192 | */ |
|---|
| 193 | public void setOptions(String[] options) throws Exception { |
|---|
| 194 | String tmpStr; |
|---|
| 195 | |
|---|
| 196 | tmpStr = Utils.getOption('S', options); |
|---|
| 197 | if (tmpStr.length() != 0) |
|---|
| 198 | setStemmer(tmpStr); |
|---|
| 199 | else |
|---|
| 200 | setStemmer("porter"); |
|---|
| 201 | } |
|---|
| 202 | |
|---|
| 203 | /** |
|---|
| 204 | * Gets the current settings of the classifier. |
|---|
| 205 | * |
|---|
| 206 | * @return an array of strings suitable for passing to setOptions |
|---|
| 207 | */ |
|---|
| 208 | public String[] getOptions() { |
|---|
| 209 | Vector<String> result; |
|---|
| 210 | |
|---|
| 211 | result = new Vector<String>(); |
|---|
| 212 | |
|---|
| 213 | if (getStemmer() != null) { |
|---|
| 214 | result.add("-S"); |
|---|
| 215 | result.add("" + getStemmer()); |
|---|
| 216 | } |
|---|
| 217 | |
|---|
| 218 | return (String[]) result.toArray(new String[result.size()]); |
|---|
| 219 | } |
|---|
| 220 | |
|---|
| 221 | /** |
|---|
| 222 | * extracts the stemmer name form the classname. |
|---|
| 223 | * |
|---|
| 224 | * @param classname the full classname of the stemmer |
|---|
| 225 | * @return the name of the stemmer |
|---|
| 226 | */ |
|---|
| 227 | private static String getStemmerName(String classname) { |
|---|
| 228 | return classname.replaceAll(".*\\.", "").replaceAll("Stemmer$", ""); |
|---|
| 229 | } |
|---|
| 230 | |
|---|
| 231 | /** |
|---|
| 232 | * returns the full classname of the stemmer. |
|---|
| 233 | * |
|---|
| 234 | * @param name the name of the stemmer |
|---|
| 235 | * @return the full classname of the stemmer |
|---|
| 236 | * @see #PACKAGE_EXT |
|---|
| 237 | */ |
|---|
| 238 | private static String getStemmerClassname(String name) { |
|---|
| 239 | return PACKAGE_EXT + "." + name + "Stemmer"; |
|---|
| 240 | } |
|---|
| 241 | |
|---|
| 242 | /** |
|---|
| 243 | * retrieves the language names of the availabel stemmers. |
|---|
| 244 | */ |
|---|
| 245 | private static void initStemmers() { |
|---|
| 246 | Vector classnames; |
|---|
| 247 | int i; |
|---|
| 248 | |
|---|
| 249 | if (m_Stemmers != null) |
|---|
| 250 | return; |
|---|
| 251 | |
|---|
| 252 | m_Stemmers = new Vector<String>(); |
|---|
| 253 | |
|---|
| 254 | if (!m_Present) |
|---|
| 255 | return; |
|---|
| 256 | |
|---|
| 257 | classnames = GenericObjectEditor.getClassnames(SNOWBALL_PROGRAM); |
|---|
| 258 | // try dynamic discovery if not in props file |
|---|
| 259 | if (classnames.size() == 0) { |
|---|
| 260 | classnames = ClassDiscovery.find(SNOWBALL_PROGRAM, PACKAGE_EXT); |
|---|
| 261 | for (i = 0; i < classnames.size(); i++) |
|---|
| 262 | m_Stemmers.add(getStemmerName(classnames.get(i).toString())); |
|---|
| 263 | } |
|---|
| 264 | } |
|---|
| 265 | |
|---|
| 266 | /** |
|---|
| 267 | * returns whether Snowball is present or not, i.e. whether the classes are |
|---|
| 268 | * in the classpath or not |
|---|
| 269 | * |
|---|
| 270 | * @return whether Snowball is available |
|---|
| 271 | */ |
|---|
| 272 | public static boolean isPresent() { |
|---|
| 273 | return m_Present; |
|---|
| 274 | } |
|---|
| 275 | |
|---|
| 276 | /** |
|---|
| 277 | * returns an enumeration over all currently stored stemmer names. |
|---|
| 278 | * |
|---|
| 279 | * @return all available stemmers |
|---|
| 280 | */ |
|---|
| 281 | public static Enumeration listStemmers() { |
|---|
| 282 | initStemmers(); |
|---|
| 283 | |
|---|
| 284 | return m_Stemmers.elements(); |
|---|
| 285 | } |
|---|
| 286 | |
|---|
| 287 | /** |
|---|
| 288 | * generates a comma list of the available stemmers. |
|---|
| 289 | * |
|---|
| 290 | * @param lineLength the max line length, before a linefeed is inserted |
|---|
| 291 | * (0 is unlimited) |
|---|
| 292 | * @param indention the indention of a line |
|---|
| 293 | * @return the generated list |
|---|
| 294 | */ |
|---|
| 295 | private static String getStemmerList(int lineLength, String indention) { |
|---|
| 296 | String result; |
|---|
| 297 | Enumeration enm; |
|---|
| 298 | String name; |
|---|
| 299 | String line; |
|---|
| 300 | |
|---|
| 301 | result = ""; |
|---|
| 302 | line = ""; |
|---|
| 303 | enm = listStemmers(); |
|---|
| 304 | while (enm.hasMoreElements()) { |
|---|
| 305 | name = enm.nextElement().toString(); |
|---|
| 306 | if (line.length() > 0) |
|---|
| 307 | line += ", "; |
|---|
| 308 | if ( (lineLength > 0) && (line.length() + name.length() > lineLength) ) { |
|---|
| 309 | result += indention + line + "\n"; |
|---|
| 310 | line = ""; |
|---|
| 311 | } |
|---|
| 312 | line += name; |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | if (line.length() > 0) |
|---|
| 316 | result += indention + line + "\n"; |
|---|
| 317 | |
|---|
| 318 | return result; |
|---|
| 319 | } |
|---|
| 320 | |
|---|
| 321 | /** |
|---|
| 322 | * returns the name of the current stemmer, null if none is set. |
|---|
| 323 | * |
|---|
| 324 | * @return the name of the stemmer |
|---|
| 325 | */ |
|---|
| 326 | public String getStemmer() { |
|---|
| 327 | initStemmers(); |
|---|
| 328 | |
|---|
| 329 | if (m_Stemmer == null) |
|---|
| 330 | return null; |
|---|
| 331 | else |
|---|
| 332 | return getStemmerName(m_Stemmer.getClass().getName()); |
|---|
| 333 | } |
|---|
| 334 | |
|---|
| 335 | /** |
|---|
| 336 | * sets the stemmer with the given name, e.g., "porter". |
|---|
| 337 | * |
|---|
| 338 | * @param name the name of the stemmer, e.g., "porter" |
|---|
| 339 | */ |
|---|
| 340 | public void setStemmer(String name) { |
|---|
| 341 | Class<?> snowballClass; |
|---|
| 342 | Class[] argClasses; |
|---|
| 343 | |
|---|
| 344 | initStemmers(); |
|---|
| 345 | |
|---|
| 346 | if (m_Stemmers.contains(name)) { |
|---|
| 347 | try { |
|---|
| 348 | snowballClass = Class.forName(getStemmerClassname(name)); |
|---|
| 349 | m_Stemmer = snowballClass.newInstance(); |
|---|
| 350 | |
|---|
| 351 | // methods |
|---|
| 352 | argClasses = new Class[0]; |
|---|
| 353 | m_StemMethod = snowballClass.getMethod("stem", argClasses); |
|---|
| 354 | |
|---|
| 355 | argClasses = new Class[1]; |
|---|
| 356 | argClasses[0] = String.class; |
|---|
| 357 | m_SetCurrentMethod = snowballClass.getMethod("setCurrent", argClasses); |
|---|
| 358 | |
|---|
| 359 | argClasses = new Class[0]; |
|---|
| 360 | m_GetCurrentMethod = snowballClass.getMethod("getCurrent", argClasses); |
|---|
| 361 | } |
|---|
| 362 | catch (Exception e) { |
|---|
| 363 | System.out.println( |
|---|
| 364 | "Error initializing stemmer '" + name + "'!" |
|---|
| 365 | + e.getMessage()); |
|---|
| 366 | m_Stemmer = null; |
|---|
| 367 | } |
|---|
| 368 | } |
|---|
| 369 | else { |
|---|
| 370 | System.err.println("Stemmer '" + name + "' unknown!"); |
|---|
| 371 | m_Stemmer = null; |
|---|
| 372 | } |
|---|
| 373 | } |
|---|
| 374 | |
|---|
| 375 | /** |
|---|
| 376 | * Returns the tip text for this property. |
|---|
| 377 | * |
|---|
| 378 | * @return tip text for this property suitable for |
|---|
| 379 | * displaying in the explorer/experimenter gui |
|---|
| 380 | */ |
|---|
| 381 | public String stemmerTipText() { |
|---|
| 382 | return "The Snowball stemmer to use, available: " + getStemmerList(0, ""); |
|---|
| 383 | } |
|---|
| 384 | |
|---|
| 385 | /** |
|---|
| 386 | * Returns the word in its stemmed form. |
|---|
| 387 | * |
|---|
| 388 | * @param word the unstemmed word |
|---|
| 389 | * @return the stemmed word |
|---|
| 390 | */ |
|---|
| 391 | public String stem(String word) { |
|---|
| 392 | String result; |
|---|
| 393 | Object[] args; |
|---|
| 394 | |
|---|
| 395 | if (m_Stemmer == null) { |
|---|
| 396 | result = new String(word); |
|---|
| 397 | } |
|---|
| 398 | else { |
|---|
| 399 | // after de-serialization, the methods are null and need to be |
|---|
| 400 | // re-initialized |
|---|
| 401 | if (m_SetCurrentMethod == null) |
|---|
| 402 | setStemmer(getStemmer()); |
|---|
| 403 | |
|---|
| 404 | try { |
|---|
| 405 | // set word |
|---|
| 406 | args = new Object[1]; |
|---|
| 407 | args[0] = word; |
|---|
| 408 | m_SetCurrentMethod.invoke(m_Stemmer, args); |
|---|
| 409 | |
|---|
| 410 | // stem word |
|---|
| 411 | args = new Object[0]; |
|---|
| 412 | m_StemMethod.invoke(m_Stemmer, args); |
|---|
| 413 | |
|---|
| 414 | // get word |
|---|
| 415 | args = new Object[0]; |
|---|
| 416 | result = (String) m_GetCurrentMethod.invoke(m_Stemmer, args); |
|---|
| 417 | } |
|---|
| 418 | catch (Exception e) { |
|---|
| 419 | e.printStackTrace(); |
|---|
| 420 | result = word; |
|---|
| 421 | } |
|---|
| 422 | } |
|---|
| 423 | |
|---|
| 424 | return result; |
|---|
| 425 | } |
|---|
| 426 | |
|---|
| 427 | /** |
|---|
| 428 | * returns a string representation of the stemmer. |
|---|
| 429 | * |
|---|
| 430 | * @return a string representation of the stemmer |
|---|
| 431 | */ |
|---|
| 432 | public String toString() { |
|---|
| 433 | String result; |
|---|
| 434 | |
|---|
| 435 | result = getClass().getName(); |
|---|
| 436 | result += " " + Utils.joinOptions(getOptions()); |
|---|
| 437 | |
|---|
| 438 | return result.trim(); |
|---|
| 439 | } |
|---|
| 440 | |
|---|
| 441 | /** |
|---|
| 442 | * Returns the revision string. |
|---|
| 443 | * |
|---|
| 444 | * @return the revision |
|---|
| 445 | */ |
|---|
| 446 | public String getRevision() { |
|---|
| 447 | return RevisionUtils.extract("$Revision: 5953 $"); |
|---|
| 448 | } |
|---|
| 449 | |
|---|
| 450 | /** |
|---|
| 451 | * Runs the stemmer with the given options. |
|---|
| 452 | * |
|---|
| 453 | * @param args the options |
|---|
| 454 | */ |
|---|
| 455 | public static void main(String[] args) { |
|---|
| 456 | try { |
|---|
| 457 | Stemming.useStemmer(new SnowballStemmer(), args); |
|---|
| 458 | } |
|---|
| 459 | catch (Exception e) { |
|---|
| 460 | e.printStackTrace(); |
|---|
| 461 | } |
|---|
| 462 | } |
|---|
| 463 | } |
|---|