1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * LovinsStemmer.java |
---|
19 | * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.core.stemmers; |
---|
24 | |
---|
25 | import weka.core.RevisionUtils; |
---|
26 | import weka.core.TechnicalInformation; |
---|
27 | import weka.core.TechnicalInformation.Type; |
---|
28 | import weka.core.TechnicalInformation.Field; |
---|
29 | import weka.core.TechnicalInformationHandler; |
---|
30 | |
---|
31 | import java.util.HashMap; |
---|
32 | |
---|
33 | /** |
---|
34 | <!-- globalinfo-start --> |
---|
35 | * A stemmer based on the Lovins stemmer, described here:<br/> |
---|
36 | * <br/> |
---|
37 | * Julie Beth Lovins (1968). Development of a stemming algorithm. Mechanical Translation and Computational Linguistics. 11:22-31. |
---|
38 | * <p/> |
---|
39 | <!-- globalinfo-end --> |
---|
40 | * |
---|
41 | <!-- technical-bibtex-start --> |
---|
42 | * BibTeX: |
---|
43 | * <pre> |
---|
44 | * @article{Lovins1968, |
---|
45 | * author = {Julie Beth Lovins}, |
---|
46 | * journal = {Mechanical Translation and Computational Linguistics}, |
---|
47 | * pages = {22-31}, |
---|
48 | * title = {Development of a stemming algorithm}, |
---|
49 | * volume = {11}, |
---|
50 | * year = {1968} |
---|
51 | * } |
---|
52 | * </pre> |
---|
53 | * <p/> |
---|
54 | <!-- technical-bibtex-end --> |
---|
55 | * |
---|
56 | * @author Eibe Frank (eibe at cs dot waikato dot ac dot nz) |
---|
57 | * @version $Revision: 5953 $ |
---|
58 | */ |
---|
59 | public class LovinsStemmer |
---|
60 | implements Stemmer, TechnicalInformationHandler { |
---|
61 | |
---|
62 | /** for serialization */ |
---|
63 | static final long serialVersionUID = -6113024782588197L; |
---|
64 | |
---|
65 | /** Enters C version compatibility mode if set to true (emulates |
---|
66 | features of the original C implementation that are inconsistent |
---|
67 | with the algorithm as described in Lovins's paper) */ |
---|
68 | private static boolean m_CompMode = false; |
---|
69 | |
---|
70 | /** The hash tables containing the list of endings. */ |
---|
71 | private static HashMap<String,String> m_l11 = null; |
---|
72 | private static HashMap<String,String> m_l10 = null; |
---|
73 | private static HashMap<String,String> m_l9 = null; |
---|
74 | private static HashMap<String,String> m_l8 = null; |
---|
75 | private static HashMap<String,String> m_l7 = null; |
---|
76 | private static HashMap<String,String> m_l6 = null; |
---|
77 | private static HashMap<String,String> m_l5 = null; |
---|
78 | private static HashMap<String,String> m_l4 = null; |
---|
79 | private static HashMap<String,String> m_l3 = null; |
---|
80 | private static HashMap<String,String> m_l2 = null; |
---|
81 | private static HashMap<String,String> m_l1 = null; |
---|
82 | |
---|
83 | static { |
---|
84 | |
---|
85 | m_l11 = new HashMap<String,String>(); |
---|
86 | m_l11.put("alistically", "B"); |
---|
87 | m_l11.put("arizability", "A"); |
---|
88 | m_l11.put("izationally", "B"); |
---|
89 | m_l10 = new HashMap<String,String>(); |
---|
90 | m_l10.put("antialness", "A"); |
---|
91 | m_l10.put("arisations", "A"); |
---|
92 | m_l10.put("arizations", "A"); |
---|
93 | m_l10.put("entialness", "A"); |
---|
94 | m_l9 = new HashMap<String,String>(); |
---|
95 | m_l9.put("allically", "C"); |
---|
96 | m_l9.put("antaneous", "A"); |
---|
97 | m_l9.put("antiality", "A"); |
---|
98 | m_l9.put("arisation", "A"); |
---|
99 | m_l9.put("arization", "A"); |
---|
100 | m_l9.put("ationally", "B"); |
---|
101 | m_l9.put("ativeness", "A"); |
---|
102 | m_l9.put("eableness", "E"); |
---|
103 | m_l9.put("entations", "A"); |
---|
104 | m_l9.put("entiality", "A"); |
---|
105 | m_l9.put("entialize", "A"); |
---|
106 | m_l9.put("entiation", "A"); |
---|
107 | m_l9.put("ionalness", "A"); |
---|
108 | m_l9.put("istically", "A"); |
---|
109 | m_l9.put("itousness", "A"); |
---|
110 | m_l9.put("izability", "A"); |
---|
111 | m_l9.put("izational", "A"); |
---|
112 | m_l8 = new HashMap<String,String>(); |
---|
113 | m_l8.put("ableness", "A"); |
---|
114 | m_l8.put("arizable", "A"); |
---|
115 | m_l8.put("entation", "A"); |
---|
116 | m_l8.put("entially", "A"); |
---|
117 | m_l8.put("eousness", "A"); |
---|
118 | m_l8.put("ibleness", "A"); |
---|
119 | m_l8.put("icalness", "A"); |
---|
120 | m_l8.put("ionalism", "A"); |
---|
121 | m_l8.put("ionality", "A"); |
---|
122 | m_l8.put("ionalize", "A"); |
---|
123 | m_l8.put("iousness", "A"); |
---|
124 | m_l8.put("izations", "A"); |
---|
125 | m_l8.put("lessness", "A"); |
---|
126 | m_l7 = new HashMap<String,String>(); |
---|
127 | m_l7.put("ability", "A"); |
---|
128 | m_l7.put("aically", "A"); |
---|
129 | m_l7.put("alistic", "B"); |
---|
130 | m_l7.put("alities", "A"); |
---|
131 | m_l7.put("ariness", "E"); |
---|
132 | m_l7.put("aristic", "A"); |
---|
133 | m_l7.put("arizing", "A"); |
---|
134 | m_l7.put("ateness", "A"); |
---|
135 | m_l7.put("atingly", "A"); |
---|
136 | m_l7.put("ational", "B"); |
---|
137 | m_l7.put("atively", "A"); |
---|
138 | m_l7.put("ativism", "A"); |
---|
139 | m_l7.put("elihood", "E"); |
---|
140 | m_l7.put("encible", "A"); |
---|
141 | m_l7.put("entally", "A"); |
---|
142 | m_l7.put("entials", "A"); |
---|
143 | m_l7.put("entiate", "A"); |
---|
144 | m_l7.put("entness", "A"); |
---|
145 | m_l7.put("fulness", "A"); |
---|
146 | m_l7.put("ibility", "A"); |
---|
147 | m_l7.put("icalism", "A"); |
---|
148 | m_l7.put("icalist", "A"); |
---|
149 | m_l7.put("icality", "A"); |
---|
150 | m_l7.put("icalize", "A"); |
---|
151 | m_l7.put("ication", "G"); |
---|
152 | m_l7.put("icianry", "A"); |
---|
153 | m_l7.put("ination", "A"); |
---|
154 | m_l7.put("ingness", "A"); |
---|
155 | m_l7.put("ionally", "A"); |
---|
156 | m_l7.put("isation", "A"); |
---|
157 | m_l7.put("ishness", "A"); |
---|
158 | m_l7.put("istical", "A"); |
---|
159 | m_l7.put("iteness", "A"); |
---|
160 | m_l7.put("iveness", "A"); |
---|
161 | m_l7.put("ivistic", "A"); |
---|
162 | m_l7.put("ivities", "A"); |
---|
163 | m_l7.put("ization", "F"); |
---|
164 | m_l7.put("izement", "A"); |
---|
165 | m_l7.put("oidally", "A"); |
---|
166 | m_l7.put("ousness", "A"); |
---|
167 | m_l6 = new HashMap<String,String>(); |
---|
168 | m_l6.put("aceous", "A"); |
---|
169 | m_l6.put("acious", "B"); |
---|
170 | m_l6.put("action", "G"); |
---|
171 | m_l6.put("alness", "A"); |
---|
172 | m_l6.put("ancial", "A"); |
---|
173 | m_l6.put("ancies", "A"); |
---|
174 | m_l6.put("ancing", "B"); |
---|
175 | m_l6.put("ariser", "A"); |
---|
176 | m_l6.put("arized", "A"); |
---|
177 | m_l6.put("arizer", "A"); |
---|
178 | m_l6.put("atable", "A"); |
---|
179 | m_l6.put("ations", "B"); |
---|
180 | m_l6.put("atives", "A"); |
---|
181 | m_l6.put("eature", "Z"); |
---|
182 | m_l6.put("efully", "A"); |
---|
183 | m_l6.put("encies", "A"); |
---|
184 | m_l6.put("encing", "A"); |
---|
185 | m_l6.put("ential", "A"); |
---|
186 | m_l6.put("enting", "C"); |
---|
187 | m_l6.put("entist", "A"); |
---|
188 | m_l6.put("eously", "A"); |
---|
189 | m_l6.put("ialist", "A"); |
---|
190 | m_l6.put("iality", "A"); |
---|
191 | m_l6.put("ialize", "A"); |
---|
192 | m_l6.put("ically", "A"); |
---|
193 | m_l6.put("icance", "A"); |
---|
194 | m_l6.put("icians", "A"); |
---|
195 | m_l6.put("icists", "A"); |
---|
196 | m_l6.put("ifully", "A"); |
---|
197 | m_l6.put("ionals", "A"); |
---|
198 | m_l6.put("ionate", "D"); |
---|
199 | m_l6.put("ioning", "A"); |
---|
200 | m_l6.put("ionist", "A"); |
---|
201 | m_l6.put("iously", "A"); |
---|
202 | m_l6.put("istics", "A"); |
---|
203 | m_l6.put("izable", "E"); |
---|
204 | m_l6.put("lessly", "A"); |
---|
205 | m_l6.put("nesses", "A"); |
---|
206 | m_l6.put("oidism", "A"); |
---|
207 | m_l5 = new HashMap<String,String>(); |
---|
208 | m_l5.put("acies", "A"); |
---|
209 | m_l5.put("acity", "A"); |
---|
210 | m_l5.put("aging", "B"); |
---|
211 | m_l5.put("aical", "A"); |
---|
212 | if (!m_CompMode) { |
---|
213 | m_l5.put("alist", "A"); |
---|
214 | } |
---|
215 | m_l5.put("alism", "B"); |
---|
216 | m_l5.put("ality", "A"); |
---|
217 | m_l5.put("alize", "A"); |
---|
218 | m_l5.put("allic", "b"); |
---|
219 | m_l5.put("anced", "B"); |
---|
220 | m_l5.put("ances", "B"); |
---|
221 | m_l5.put("antic", "C"); |
---|
222 | m_l5.put("arial", "A"); |
---|
223 | m_l5.put("aries", "A"); |
---|
224 | m_l5.put("arily", "A"); |
---|
225 | m_l5.put("arity", "B"); |
---|
226 | m_l5.put("arize", "A"); |
---|
227 | m_l5.put("aroid", "A"); |
---|
228 | m_l5.put("ately", "A"); |
---|
229 | m_l5.put("ating", "I"); |
---|
230 | m_l5.put("ation", "B"); |
---|
231 | m_l5.put("ative", "A"); |
---|
232 | m_l5.put("ators", "A"); |
---|
233 | m_l5.put("atory", "A"); |
---|
234 | m_l5.put("ature", "E"); |
---|
235 | m_l5.put("early", "Y"); |
---|
236 | m_l5.put("ehood", "A"); |
---|
237 | m_l5.put("eless", "A"); |
---|
238 | if (!m_CompMode) { |
---|
239 | m_l5.put("elily", "A"); |
---|
240 | } else { |
---|
241 | m_l5.put("elity", "A"); |
---|
242 | } |
---|
243 | m_l5.put("ement", "A"); |
---|
244 | m_l5.put("enced", "A"); |
---|
245 | m_l5.put("ences", "A"); |
---|
246 | m_l5.put("eness", "E"); |
---|
247 | m_l5.put("ening", "E"); |
---|
248 | m_l5.put("ental", "A"); |
---|
249 | m_l5.put("ented", "C"); |
---|
250 | m_l5.put("ently", "A"); |
---|
251 | m_l5.put("fully", "A"); |
---|
252 | m_l5.put("ially", "A"); |
---|
253 | m_l5.put("icant", "A"); |
---|
254 | m_l5.put("ician", "A"); |
---|
255 | m_l5.put("icide", "A"); |
---|
256 | m_l5.put("icism", "A"); |
---|
257 | m_l5.put("icist", "A"); |
---|
258 | m_l5.put("icity", "A"); |
---|
259 | m_l5.put("idine", "I"); |
---|
260 | m_l5.put("iedly", "A"); |
---|
261 | m_l5.put("ihood", "A"); |
---|
262 | m_l5.put("inate", "A"); |
---|
263 | m_l5.put("iness", "A"); |
---|
264 | m_l5.put("ingly", "B"); |
---|
265 | m_l5.put("inism", "J"); |
---|
266 | m_l5.put("inity", "c"); |
---|
267 | m_l5.put("ional", "A"); |
---|
268 | m_l5.put("ioned", "A"); |
---|
269 | m_l5.put("ished", "A"); |
---|
270 | m_l5.put("istic", "A"); |
---|
271 | m_l5.put("ities", "A"); |
---|
272 | m_l5.put("itous", "A"); |
---|
273 | m_l5.put("ively", "A"); |
---|
274 | m_l5.put("ivity", "A"); |
---|
275 | m_l5.put("izers", "F"); |
---|
276 | m_l5.put("izing", "F"); |
---|
277 | m_l5.put("oidal", "A"); |
---|
278 | m_l5.put("oides", "A"); |
---|
279 | m_l5.put("otide", "A"); |
---|
280 | m_l5.put("ously", "A"); |
---|
281 | m_l4 = new HashMap<String,String>(); |
---|
282 | m_l4.put("able", "A"); |
---|
283 | m_l4.put("ably", "A"); |
---|
284 | m_l4.put("ages", "B"); |
---|
285 | m_l4.put("ally", "B"); |
---|
286 | m_l4.put("ance", "B"); |
---|
287 | m_l4.put("ancy", "B"); |
---|
288 | m_l4.put("ants", "B"); |
---|
289 | m_l4.put("aric", "A"); |
---|
290 | m_l4.put("arly", "K"); |
---|
291 | m_l4.put("ated", "I"); |
---|
292 | m_l4.put("ates", "A"); |
---|
293 | m_l4.put("atic", "B"); |
---|
294 | m_l4.put("ator", "A"); |
---|
295 | m_l4.put("ealy", "Y"); |
---|
296 | m_l4.put("edly", "E"); |
---|
297 | m_l4.put("eful", "A"); |
---|
298 | m_l4.put("eity", "A"); |
---|
299 | m_l4.put("ence", "A"); |
---|
300 | m_l4.put("ency", "A"); |
---|
301 | m_l4.put("ened", "E"); |
---|
302 | m_l4.put("enly", "E"); |
---|
303 | m_l4.put("eous", "A"); |
---|
304 | m_l4.put("hood", "A"); |
---|
305 | m_l4.put("ials", "A"); |
---|
306 | m_l4.put("ians", "A"); |
---|
307 | m_l4.put("ible", "A"); |
---|
308 | m_l4.put("ibly", "A"); |
---|
309 | m_l4.put("ical", "A"); |
---|
310 | m_l4.put("ides", "L"); |
---|
311 | m_l4.put("iers", "A"); |
---|
312 | m_l4.put("iful", "A"); |
---|
313 | m_l4.put("ines", "M"); |
---|
314 | m_l4.put("ings", "N"); |
---|
315 | m_l4.put("ions", "B"); |
---|
316 | m_l4.put("ious", "A"); |
---|
317 | m_l4.put("isms", "B"); |
---|
318 | m_l4.put("ists", "A"); |
---|
319 | m_l4.put("itic", "H"); |
---|
320 | m_l4.put("ized", "F"); |
---|
321 | m_l4.put("izer", "F"); |
---|
322 | m_l4.put("less", "A"); |
---|
323 | m_l4.put("lily", "A"); |
---|
324 | m_l4.put("ness", "A"); |
---|
325 | m_l4.put("ogen", "A"); |
---|
326 | m_l4.put("ward", "A"); |
---|
327 | m_l4.put("wise", "A"); |
---|
328 | m_l4.put("ying", "B"); |
---|
329 | m_l4.put("yish", "A"); |
---|
330 | m_l3 = new HashMap<String,String>(); |
---|
331 | m_l3.put("acy", "A"); |
---|
332 | m_l3.put("age", "B"); |
---|
333 | m_l3.put("aic", "A"); |
---|
334 | m_l3.put("als", "b"); |
---|
335 | m_l3.put("ant", "B"); |
---|
336 | m_l3.put("ars", "O"); |
---|
337 | m_l3.put("ary", "F"); |
---|
338 | m_l3.put("ata", "A"); |
---|
339 | m_l3.put("ate", "A"); |
---|
340 | m_l3.put("eal", "Y"); |
---|
341 | m_l3.put("ear", "Y"); |
---|
342 | m_l3.put("ely", "E"); |
---|
343 | m_l3.put("ene", "E"); |
---|
344 | m_l3.put("ent", "C"); |
---|
345 | m_l3.put("ery", "E"); |
---|
346 | m_l3.put("ese", "A"); |
---|
347 | m_l3.put("ful", "A"); |
---|
348 | m_l3.put("ial", "A"); |
---|
349 | m_l3.put("ian", "A"); |
---|
350 | m_l3.put("ics", "A"); |
---|
351 | m_l3.put("ide", "L"); |
---|
352 | m_l3.put("ied", "A"); |
---|
353 | m_l3.put("ier", "A"); |
---|
354 | m_l3.put("ies", "P"); |
---|
355 | m_l3.put("ily", "A"); |
---|
356 | m_l3.put("ine", "M"); |
---|
357 | m_l3.put("ing", "N"); |
---|
358 | m_l3.put("ion", "Q"); |
---|
359 | m_l3.put("ish", "C"); |
---|
360 | m_l3.put("ism", "B"); |
---|
361 | m_l3.put("ist", "A"); |
---|
362 | m_l3.put("ite", "a"); |
---|
363 | m_l3.put("ity", "A"); |
---|
364 | m_l3.put("ium", "A"); |
---|
365 | m_l3.put("ive", "A"); |
---|
366 | m_l3.put("ize", "F"); |
---|
367 | m_l3.put("oid", "A"); |
---|
368 | m_l3.put("one", "R"); |
---|
369 | m_l3.put("ous", "A"); |
---|
370 | m_l2 = new HashMap<String,String>(); |
---|
371 | m_l2.put("ae", "A"); |
---|
372 | m_l2.put("al", "b"); |
---|
373 | m_l2.put("ar", "X"); |
---|
374 | m_l2.put("as", "B"); |
---|
375 | m_l2.put("ed", "E"); |
---|
376 | m_l2.put("en", "F"); |
---|
377 | m_l2.put("es", "E"); |
---|
378 | m_l2.put("ia", "A"); |
---|
379 | m_l2.put("ic", "A"); |
---|
380 | m_l2.put("is", "A"); |
---|
381 | m_l2.put("ly", "B"); |
---|
382 | m_l2.put("on", "S"); |
---|
383 | m_l2.put("or", "T"); |
---|
384 | m_l2.put("um", "U"); |
---|
385 | m_l2.put("us", "V"); |
---|
386 | m_l2.put("yl", "R"); |
---|
387 | m_l2.put("s\'", "A"); |
---|
388 | m_l2.put("\'s", "A"); |
---|
389 | m_l1 = new HashMap<String,String>(); |
---|
390 | m_l1.put("a", "A"); |
---|
391 | m_l1.put("e", "A"); |
---|
392 | m_l1.put("i", "A"); |
---|
393 | m_l1.put("o", "A"); |
---|
394 | m_l1.put("s", "W"); |
---|
395 | m_l1.put("y", "B"); |
---|
396 | } |
---|
397 | |
---|
398 | /** |
---|
399 | * Returns a string describing the stemmer |
---|
400 | * @return a description suitable for |
---|
401 | * displaying in the explorer/experimenter gui |
---|
402 | */ |
---|
403 | public String globalInfo() { |
---|
404 | return |
---|
405 | "A stemmer based on the Lovins stemmer, described here:\n\n" |
---|
406 | + getTechnicalInformation().toString(); |
---|
407 | } |
---|
408 | |
---|
409 | /** |
---|
410 | * Returns an instance of a TechnicalInformation object, containing |
---|
411 | * detailed information about the technical background of this class, |
---|
412 | * e.g., paper reference or book this class is based on. |
---|
413 | * |
---|
414 | * @return the technical information about this class |
---|
415 | */ |
---|
416 | public TechnicalInformation getTechnicalInformation() { |
---|
417 | TechnicalInformation result; |
---|
418 | |
---|
419 | result = new TechnicalInformation(Type.ARTICLE); |
---|
420 | result.setValue(Field.AUTHOR, "Julie Beth Lovins"); |
---|
421 | result.setValue(Field.YEAR, "1968"); |
---|
422 | result.setValue(Field.TITLE, "Development of a stemming algorithm"); |
---|
423 | result.setValue(Field.JOURNAL, "Mechanical Translation and Computational Linguistics"); |
---|
424 | result.setValue(Field.VOLUME, "11"); |
---|
425 | result.setValue(Field.PAGES, "22-31"); |
---|
426 | |
---|
427 | return result; |
---|
428 | } |
---|
429 | |
---|
430 | /** |
---|
431 | * Finds and removes ending from given word. |
---|
432 | * |
---|
433 | * @param word the word to work on |
---|
434 | * @return the processed word |
---|
435 | */ |
---|
436 | private String removeEnding(String word) { |
---|
437 | |
---|
438 | int length = word.length(); |
---|
439 | int el = 11; |
---|
440 | |
---|
441 | while (el > 0) { |
---|
442 | if (length - el > 1) { |
---|
443 | String ending = word.substring(length - el); |
---|
444 | String conditionCode = null; |
---|
445 | switch (el) { |
---|
446 | case 11: conditionCode = (String)m_l11.get(ending); |
---|
447 | break; |
---|
448 | case 10: conditionCode = (String)m_l10.get(ending); |
---|
449 | break; |
---|
450 | case 9: conditionCode = (String)m_l9.get(ending); |
---|
451 | break; |
---|
452 | case 8: conditionCode = (String)m_l8.get(ending); |
---|
453 | break; |
---|
454 | case 7: conditionCode = (String)m_l7.get(ending); |
---|
455 | break; |
---|
456 | case 6: conditionCode = (String)m_l6.get(ending); |
---|
457 | break; |
---|
458 | case 5: conditionCode = (String)m_l5.get(ending); |
---|
459 | break; |
---|
460 | case 4: conditionCode = (String)m_l4.get(ending); |
---|
461 | break; |
---|
462 | case 3: conditionCode = (String)m_l3.get(ending); |
---|
463 | break; |
---|
464 | case 2: conditionCode = (String)m_l2.get(ending); |
---|
465 | break; |
---|
466 | case 1: conditionCode = (String)m_l1.get(ending); |
---|
467 | break; |
---|
468 | default: |
---|
469 | } |
---|
470 | if (conditionCode != null) { |
---|
471 | switch (conditionCode.charAt(0)) { |
---|
472 | case 'A': |
---|
473 | return word.substring(0, length - el); |
---|
474 | case 'B': |
---|
475 | if (length - el > 2) { |
---|
476 | return word.substring(0, length - el); |
---|
477 | } |
---|
478 | break; |
---|
479 | case 'C': |
---|
480 | if (length - el > 3) { |
---|
481 | return word.substring(0, length - el); |
---|
482 | } |
---|
483 | break; |
---|
484 | case 'D': |
---|
485 | if (length - el > 4) { |
---|
486 | return word.substring(0, length - el); |
---|
487 | } |
---|
488 | break; |
---|
489 | case 'E': |
---|
490 | if (word.charAt(length - el - 1) != 'e') { |
---|
491 | return word.substring(0, length - el); |
---|
492 | } |
---|
493 | break; |
---|
494 | case 'F': |
---|
495 | if ((length - el > 2) && |
---|
496 | (word.charAt(length - el - 1) != 'e')) { |
---|
497 | return word.substring(0, length - el); |
---|
498 | } |
---|
499 | break; |
---|
500 | case 'G': |
---|
501 | if ((length - el > 2) && |
---|
502 | (word.charAt(length - el - 1) == 'f')) { |
---|
503 | return word.substring(0, length - el); |
---|
504 | } |
---|
505 | break; |
---|
506 | case 'H': |
---|
507 | if ((word.charAt(length - el - 1) == 't') || |
---|
508 | ((word.charAt(length - el - 1) == 'l') && |
---|
509 | (word.charAt(length - el - 2) == 'l'))) { |
---|
510 | return word.substring(0, length - el); |
---|
511 | } |
---|
512 | break; |
---|
513 | case 'I': |
---|
514 | if ((word.charAt(length - el - 1) != 'o') && |
---|
515 | (word.charAt(length - el - 1) != 'e')) { |
---|
516 | return word.substring(0, length - el); |
---|
517 | } |
---|
518 | break; |
---|
519 | case 'J': |
---|
520 | if ((word.charAt(length - el - 1) != 'a') && |
---|
521 | (word.charAt(length - el - 1) != 'e')) { |
---|
522 | return word.substring(0, length - el); |
---|
523 | } |
---|
524 | break; |
---|
525 | case 'K': |
---|
526 | if ((length - el > 2) && |
---|
527 | ((word.charAt(length - el - 1) == 'l') || |
---|
528 | (word.charAt(length - el - 1) == 'i') || |
---|
529 | ((word.charAt(length - el - 1) == 'e') && |
---|
530 | (word.charAt(length - el - 3) == 'u')))) { |
---|
531 | return word.substring(0, length - el); |
---|
532 | } |
---|
533 | break; |
---|
534 | case 'L': |
---|
535 | if ((word.charAt(length - el - 1) != 'u') && |
---|
536 | (word.charAt(length - el - 1) != 'x') && |
---|
537 | ((word.charAt(length - el - 1) != 's') || |
---|
538 | (word.charAt(length - el - 2) == 'o'))) { |
---|
539 | return word.substring(0, length - el); |
---|
540 | } |
---|
541 | break; |
---|
542 | case 'M': |
---|
543 | if ((word.charAt(length - el - 1) != 'a') && |
---|
544 | (word.charAt(length - el - 1) != 'c') && |
---|
545 | (word.charAt(length - el - 1) != 'e') && |
---|
546 | (word.charAt(length - el - 1) != 'm')) { |
---|
547 | return word.substring(0, length - el); |
---|
548 | } |
---|
549 | break; |
---|
550 | case 'N': |
---|
551 | if ((length - el > 3) || |
---|
552 | ((length - el == 3) && |
---|
553 | ((word.charAt(length - el - 3) != 's')))) { |
---|
554 | return word.substring(0, length - el); |
---|
555 | } |
---|
556 | break; |
---|
557 | case 'O': |
---|
558 | if ((word.charAt(length - el - 1) == 'l') || |
---|
559 | (word.charAt(length - el - 1) == 'i')) { |
---|
560 | return word.substring(0, length - el); |
---|
561 | } |
---|
562 | break; |
---|
563 | case 'P': |
---|
564 | if (word.charAt(length - el - 1) != 'c') { |
---|
565 | return word.substring(0, length - el); |
---|
566 | } |
---|
567 | break; |
---|
568 | case 'Q': |
---|
569 | if ((length - el > 2) && |
---|
570 | (word.charAt(length - el - 1) != 'l') && |
---|
571 | (word.charAt(length - el - 1) != 'n')) { |
---|
572 | return word.substring(0, length - el); |
---|
573 | } |
---|
574 | break; |
---|
575 | case 'R': |
---|
576 | if ((word.charAt(length - el - 1) == 'n') || |
---|
577 | (word.charAt(length - el - 1) == 'r')) { |
---|
578 | return word.substring(0, length - el); |
---|
579 | } |
---|
580 | break; |
---|
581 | case 'S': |
---|
582 | if (((word.charAt(length - el - 1) == 'r') && |
---|
583 | (word.charAt(length - el - 2) == 'd')) || |
---|
584 | ((word.charAt(length - el - 1) == 't') && |
---|
585 | (word.charAt(length - el - 2) != 't'))) { |
---|
586 | return word.substring(0, length - el); |
---|
587 | } |
---|
588 | break; |
---|
589 | case 'T': |
---|
590 | if ((word.charAt(length - el - 1) == 's') || |
---|
591 | ((word.charAt(length - el - 1) == 't') && |
---|
592 | (word.charAt(length - el - 2) != 'o'))) { |
---|
593 | return word.substring(0, length - el); |
---|
594 | } |
---|
595 | break; |
---|
596 | case 'U': |
---|
597 | if ((word.charAt(length - el - 1) == 'l') || |
---|
598 | (word.charAt(length - el - 1) == 'm') || |
---|
599 | (word.charAt(length - el - 1) == 'n') || |
---|
600 | (word.charAt(length - el - 1) == 'r')) { |
---|
601 | return word.substring(0, length - el); |
---|
602 | } |
---|
603 | break; |
---|
604 | case 'V': |
---|
605 | if (word.charAt(length - el - 1) == 'c') { |
---|
606 | return word.substring(0, length - el); |
---|
607 | } |
---|
608 | break; |
---|
609 | case 'W': |
---|
610 | if ((word.charAt(length - el - 1) != 's') && |
---|
611 | (word.charAt(length - el - 1) != 'u')) { |
---|
612 | return word.substring(0, length - el); |
---|
613 | } |
---|
614 | break; |
---|
615 | case 'X': |
---|
616 | if ((word.charAt(length - el - 1) == 'l') || |
---|
617 | (word.charAt(length - el - 1) == 'i') || |
---|
618 | ((length - el > 2) && |
---|
619 | (word.charAt(length - el - 1) == 'e') && |
---|
620 | (word.charAt(length - el - 3) == 'u'))) { |
---|
621 | return word.substring(0, length - el); |
---|
622 | } |
---|
623 | break; |
---|
624 | case 'Y': |
---|
625 | if ((word.charAt(length - el - 1) == 'n') && |
---|
626 | (word.charAt(length - el - 2) == 'i')) { |
---|
627 | return word.substring(0, length - el); |
---|
628 | } |
---|
629 | break; |
---|
630 | case 'Z': |
---|
631 | if (word.charAt(length - el - 1) != 'f') { |
---|
632 | return word.substring(0, length - el); |
---|
633 | } |
---|
634 | break; |
---|
635 | case 'a': |
---|
636 | if ((word.charAt(length - el - 1) == 'd') || |
---|
637 | (word.charAt(length - el - 1) == 'f') || |
---|
638 | (((word.charAt(length - el - 1) == 'h') && |
---|
639 | (word.charAt(length - el - 2) == 'p'))) || |
---|
640 | (((word.charAt(length - el - 1) == 'h') && |
---|
641 | (word.charAt(length - el - 2) == 't'))) || |
---|
642 | (word.charAt(length - el - 1) == 'l') || |
---|
643 | (((word.charAt(length - el - 1) == 'r') && |
---|
644 | (word.charAt(length - el - 2) == 'e'))) || |
---|
645 | (((word.charAt(length - el - 1) == 'r') && |
---|
646 | (word.charAt(length - el - 2) == 'o'))) || |
---|
647 | (((word.charAt(length - el - 1) == 's') && |
---|
648 | (word.charAt(length - el - 2) == 'e'))) || |
---|
649 | (word.charAt(length - el - 1) == 't')) { |
---|
650 | return word.substring(0, length - el); |
---|
651 | } |
---|
652 | break; |
---|
653 | case 'b': |
---|
654 | if (m_CompMode) { |
---|
655 | if (((length - el == 3 ) && |
---|
656 | (!((word.charAt(length - el - 1) == 't') && |
---|
657 | (word.charAt(length - el - 2) == 'e') && |
---|
658 | (word.charAt(length - el - 3) == 'm')))) || |
---|
659 | ((length - el > 3) && |
---|
660 | (!((word.charAt(length - el - 1) == 't') && |
---|
661 | (word.charAt(length - el - 2) == 's') && |
---|
662 | (word.charAt(length - el - 3) == 'y') && |
---|
663 | (word.charAt(length - el - 4) == 'r'))))) { |
---|
664 | return word.substring(0, length - el); |
---|
665 | } |
---|
666 | } else { |
---|
667 | if ((length - el > 2) && |
---|
668 | (!((word.charAt(length - el - 1) == 't') && |
---|
669 | (word.charAt(length - el - 2) == 'e') && |
---|
670 | (word.charAt(length - el - 3) == 'm'))) && |
---|
671 | ((length - el < 4) || |
---|
672 | (!((word.charAt(length - el - 1) == 't') && |
---|
673 | (word.charAt(length - el - 2) == 's') && |
---|
674 | (word.charAt(length - el - 3) == 'y') && |
---|
675 | (word.charAt(length - el - 4) == 'r'))))) { |
---|
676 | return word.substring(0, length - el); |
---|
677 | } |
---|
678 | } |
---|
679 | break; |
---|
680 | case 'c': |
---|
681 | if (word.charAt(length - el - 1) == 'l') { |
---|
682 | return word.substring(0, length - el); |
---|
683 | } |
---|
684 | break; |
---|
685 | default: |
---|
686 | throw new IllegalArgumentException("Fatal error."); |
---|
687 | } |
---|
688 | } |
---|
689 | } |
---|
690 | el--; |
---|
691 | } |
---|
692 | return word; |
---|
693 | } |
---|
694 | |
---|
695 | /** |
---|
696 | * Recodes ending of given word. |
---|
697 | * |
---|
698 | * @param word the word to work on |
---|
699 | * @return the processed word |
---|
700 | */ |
---|
701 | private String recodeEnding(String word) { |
---|
702 | |
---|
703 | int lastPos = word.length() - 1; |
---|
704 | |
---|
705 | // Rule 1 |
---|
706 | if (word.endsWith("bb") || |
---|
707 | word.endsWith("dd") || |
---|
708 | word.endsWith("gg") || |
---|
709 | word.endsWith("ll") || |
---|
710 | word.endsWith("mm") || |
---|
711 | word.endsWith("nn") || |
---|
712 | word.endsWith("pp") || |
---|
713 | word.endsWith("rr") || |
---|
714 | word.endsWith("ss") || |
---|
715 | word.endsWith("tt")) { |
---|
716 | word = word.substring(0, lastPos); |
---|
717 | lastPos--; |
---|
718 | } |
---|
719 | |
---|
720 | // Rule 2 |
---|
721 | if (word.endsWith("iev")) { |
---|
722 | word = word.substring(0, lastPos - 2).concat("ief"); |
---|
723 | } |
---|
724 | |
---|
725 | // Rule 3 |
---|
726 | if (word.endsWith("uct")) { |
---|
727 | word = word.substring(0, lastPos - 2).concat("uc"); |
---|
728 | lastPos--; |
---|
729 | } |
---|
730 | |
---|
731 | // Rule 4 |
---|
732 | if (word.endsWith("umpt")) { |
---|
733 | word = word.substring(0, lastPos - 3).concat("um"); |
---|
734 | lastPos -= 2; |
---|
735 | } |
---|
736 | |
---|
737 | // Rule 5 |
---|
738 | if (word.endsWith("rpt")) { |
---|
739 | word = word.substring(0, lastPos - 2).concat("rb"); |
---|
740 | lastPos--; |
---|
741 | } |
---|
742 | |
---|
743 | // Rule 6 |
---|
744 | if (word.endsWith("urs")) { |
---|
745 | word = word.substring(0, lastPos - 2).concat("ur"); |
---|
746 | lastPos--; |
---|
747 | } |
---|
748 | |
---|
749 | // Rule 7 |
---|
750 | if (word.endsWith("istr")) { |
---|
751 | word = word.substring(0, lastPos - 3).concat("ister"); |
---|
752 | lastPos++; |
---|
753 | } |
---|
754 | |
---|
755 | // Rule 7a |
---|
756 | if (word.endsWith("metr")) { |
---|
757 | word = word.substring(0, lastPos - 3).concat("meter"); |
---|
758 | lastPos++; |
---|
759 | } |
---|
760 | |
---|
761 | // Rule 8 |
---|
762 | if (word.endsWith("olv")) { |
---|
763 | word = word.substring(0, lastPos - 2).concat("olut"); |
---|
764 | lastPos++; |
---|
765 | } |
---|
766 | |
---|
767 | // Rule 9 |
---|
768 | if (word.endsWith("ul")) { |
---|
769 | if ((lastPos - 2 < 0) || |
---|
770 | ((word.charAt(lastPos - 2) != 'a') && |
---|
771 | (word.charAt(lastPos - 2) != 'i') && |
---|
772 | (word.charAt(lastPos - 2) != 'o'))) { |
---|
773 | word = word.substring(0, lastPos - 1).concat("l"); |
---|
774 | lastPos--; |
---|
775 | } |
---|
776 | } |
---|
777 | |
---|
778 | // Rule 10 |
---|
779 | if (word.endsWith("bex")) { |
---|
780 | word = word.substring(0, lastPos - 2).concat("bic"); |
---|
781 | } |
---|
782 | |
---|
783 | // Rule 11 |
---|
784 | if (word.endsWith("dex")) { |
---|
785 | word = word.substring(0, lastPos - 2).concat("dic"); |
---|
786 | } |
---|
787 | |
---|
788 | // Rule 12 |
---|
789 | if (word.endsWith("pex")) { |
---|
790 | word = word.substring(0, lastPos - 2).concat("pic"); |
---|
791 | } |
---|
792 | |
---|
793 | // Rule 13 |
---|
794 | if (word.endsWith("tex")) { |
---|
795 | word = word.substring(0, lastPos - 2).concat("tic"); |
---|
796 | } |
---|
797 | |
---|
798 | // Rule 14 |
---|
799 | if (word.endsWith("ax")) { |
---|
800 | word = word.substring(0, lastPos - 1).concat("ac"); |
---|
801 | } |
---|
802 | |
---|
803 | // Rule 15 |
---|
804 | if (word.endsWith("ex")) { |
---|
805 | word = word.substring(0, lastPos - 1).concat("ec"); |
---|
806 | } |
---|
807 | |
---|
808 | // Rule 16 |
---|
809 | if (word.endsWith("ix")) { |
---|
810 | word = word.substring(0, lastPos - 1).concat("ic"); |
---|
811 | } |
---|
812 | |
---|
813 | // Rule 17 |
---|
814 | if (word.endsWith("lux")) { |
---|
815 | word = word.substring(0, lastPos - 2).concat("luc"); |
---|
816 | } |
---|
817 | |
---|
818 | // Rule 18 |
---|
819 | if (word.endsWith("uad")) { |
---|
820 | word = word.substring(0, lastPos - 2).concat("uas"); |
---|
821 | } |
---|
822 | |
---|
823 | // Rule 19 |
---|
824 | if (word.endsWith("vad")) { |
---|
825 | word = word.substring(0, lastPos - 2).concat("vas"); |
---|
826 | } |
---|
827 | |
---|
828 | // Rule 20 |
---|
829 | if (word.endsWith("cid")) { |
---|
830 | word = word.substring(0, lastPos - 2).concat("cis"); |
---|
831 | } |
---|
832 | |
---|
833 | // Rule 21 |
---|
834 | if (word.endsWith("lid")) { |
---|
835 | word = word.substring(0, lastPos - 2).concat("lis"); |
---|
836 | } |
---|
837 | |
---|
838 | // Rule 22 |
---|
839 | if (word.endsWith("erid")) { |
---|
840 | word = word.substring(0, lastPos - 3).concat("eris"); |
---|
841 | } |
---|
842 | |
---|
843 | // Rule 23 |
---|
844 | if (word.endsWith("pand")) { |
---|
845 | word = word.substring(0, lastPos - 3).concat("pans"); |
---|
846 | } |
---|
847 | |
---|
848 | // Rule 24 |
---|
849 | if (word.endsWith("end")) { |
---|
850 | if ((lastPos - 3 < 0) || |
---|
851 | (word.charAt(lastPos - 3) != 's')) { |
---|
852 | word = word.substring(0, lastPos - 2).concat("ens"); |
---|
853 | } |
---|
854 | } |
---|
855 | |
---|
856 | // Rule 25 |
---|
857 | if (word.endsWith("ond")) { |
---|
858 | word = word.substring(0, lastPos - 2).concat("ons"); |
---|
859 | } |
---|
860 | |
---|
861 | // Rule 26 |
---|
862 | if (word.endsWith("lud")) { |
---|
863 | word = word.substring(0, lastPos - 2).concat("lus"); |
---|
864 | } |
---|
865 | |
---|
866 | // Rule 27 |
---|
867 | if (word.endsWith("rud")) { |
---|
868 | word = word.substring(0, lastPos - 2).concat("rus"); |
---|
869 | } |
---|
870 | |
---|
871 | // Rule 28 |
---|
872 | if (word.endsWith("her")) { |
---|
873 | if ((lastPos - 3 < 0) || |
---|
874 | ((word.charAt(lastPos - 3) != 'p') && |
---|
875 | (word.charAt(lastPos - 3) != 't'))) { |
---|
876 | word = word.substring(0, lastPos - 2).concat("hes"); |
---|
877 | } |
---|
878 | } |
---|
879 | |
---|
880 | // Rule 29 |
---|
881 | if (word.endsWith("mit")) { |
---|
882 | word = word.substring(0, lastPos - 2).concat("mis"); |
---|
883 | } |
---|
884 | |
---|
885 | // Rule 30 |
---|
886 | if (word.endsWith("end")) { |
---|
887 | if ((lastPos - 3 < 0) || |
---|
888 | (word.charAt(lastPos - 3) != 'm')) { |
---|
889 | word = word.substring(0, lastPos - 2).concat("ens"); |
---|
890 | } |
---|
891 | } |
---|
892 | |
---|
893 | // Rule 31 |
---|
894 | if (word.endsWith("ert")) { |
---|
895 | word = word.substring(0, lastPos - 2).concat("ers"); |
---|
896 | } |
---|
897 | |
---|
898 | // Rule 32 |
---|
899 | if (word.endsWith("et")) { |
---|
900 | if ((lastPos - 2 < 0) || |
---|
901 | (word.charAt(lastPos - 2) != 'n')) { |
---|
902 | word = word.substring(0, lastPos - 1).concat("es"); |
---|
903 | } |
---|
904 | } |
---|
905 | |
---|
906 | // Rule 33 |
---|
907 | if (word.endsWith("yt")) { |
---|
908 | word = word.substring(0, lastPos - 1).concat("ys"); |
---|
909 | } |
---|
910 | |
---|
911 | // Rule 34 |
---|
912 | if (word.endsWith("yz")) { |
---|
913 | word = word.substring(0, lastPos - 1).concat("ys"); |
---|
914 | } |
---|
915 | |
---|
916 | return word; |
---|
917 | } |
---|
918 | |
---|
919 | /** |
---|
920 | * Returns the stemmed version of the given word. |
---|
921 | * Word is converted to lower case before stemming. |
---|
922 | * |
---|
923 | * @param word a string consisting of a single word |
---|
924 | * @return the stemmed word |
---|
925 | */ |
---|
926 | public String stem(String word) { |
---|
927 | |
---|
928 | if (word.length() > 2) { |
---|
929 | return recodeEnding(removeEnding(word.toLowerCase())); |
---|
930 | } else { |
---|
931 | return word.toLowerCase(); |
---|
932 | } |
---|
933 | } |
---|
934 | |
---|
935 | /** |
---|
936 | * Stems everything in the given string. String |
---|
937 | * is converted to lower case before stemming. |
---|
938 | * |
---|
939 | * @param str the string to stem |
---|
940 | * @return the processed string |
---|
941 | */ |
---|
942 | public String stemString(String str) { |
---|
943 | |
---|
944 | StringBuffer result = new StringBuffer(); |
---|
945 | int start = -1; |
---|
946 | for (int j = 0; j < str.length(); j++) { |
---|
947 | char c = str.charAt(j); |
---|
948 | if (Character.isLetterOrDigit(c)) { |
---|
949 | if (start == -1) { |
---|
950 | start = j; |
---|
951 | } |
---|
952 | } else if (c == '\'') { |
---|
953 | if (start == -1) { |
---|
954 | result.append(c); |
---|
955 | } |
---|
956 | } else { |
---|
957 | if (start != -1) { |
---|
958 | result.append(stem(str.substring(start, j))); |
---|
959 | start = -1; |
---|
960 | } |
---|
961 | result.append(c); |
---|
962 | } |
---|
963 | } |
---|
964 | if (start != -1) { |
---|
965 | result.append(stem(str.substring(start, str.length()))); |
---|
966 | } |
---|
967 | return result.toString(); |
---|
968 | } |
---|
969 | |
---|
970 | /** |
---|
971 | * returns a string representation of the stemmer |
---|
972 | * |
---|
973 | * @return a string representation of the stemmer |
---|
974 | */ |
---|
975 | public String toString() { |
---|
976 | return getClass().getName(); |
---|
977 | } |
---|
978 | |
---|
979 | /** |
---|
980 | * Returns the revision string. |
---|
981 | * |
---|
982 | * @return the revision |
---|
983 | */ |
---|
984 | public String getRevision() { |
---|
985 | return RevisionUtils.extract("$Revision: 5953 $"); |
---|
986 | } |
---|
987 | |
---|
988 | /** |
---|
989 | * Runs the stemmer with the given options |
---|
990 | * |
---|
991 | * @param args the options |
---|
992 | */ |
---|
993 | public static void main(String[] args) { |
---|
994 | try { |
---|
995 | Stemming.useStemmer(new LovinsStemmer(), args); |
---|
996 | } |
---|
997 | catch (Exception e) { |
---|
998 | e.printStackTrace(); |
---|
999 | } |
---|
1000 | } |
---|
1001 | } |
---|
1002 | |
---|