Context Navigation

LovinsStemmer.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 29.8 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* LovinsStemmer.java
19	* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.core.stemmers;
24
25	import weka.core.RevisionUtils;
26	import weka.core.TechnicalInformation;
27	import weka.core.TechnicalInformation.Type;
28	import weka.core.TechnicalInformation.Field;
29	import weka.core.TechnicalInformationHandler;
30
31	import java.util.HashMap;
32
33	/**
34	<!-- globalinfo-start -->
35	* A stemmer based on the Lovins stemmer, described here:<br/>
36	* <br/>
37	* Julie Beth Lovins (1968). Development of a stemming algorithm. Mechanical Translation and Computational Linguistics. 11:22-31.
38	* <p/>
39	<!-- globalinfo-end -->
40	*
41	<!-- technical-bibtex-start -->
42	* BibTeX:
43	* <pre>
44	* @article{Lovins1968,
45	* author = {Julie Beth Lovins},
46	* journal = {Mechanical Translation and Computational Linguistics},
47	* pages = {22-31},
48	* title = {Development of a stemming algorithm},
49	* volume = {11},
50	* year = {1968}
51	* }
52	* </pre>
53	* <p/>
54	<!-- technical-bibtex-end -->
55	*
56	* @author Eibe Frank (eibe at cs dot waikato dot ac dot nz)
57	* @version $Revision: 5953 $
58	*/
59	public class LovinsStemmer
60	implements Stemmer, TechnicalInformationHandler {
61
/** for serialization */
static final long serialVersionUID = -6113024782588197L;

/**
 * Enters C version compatibility mode if set to true (emulates
 * features of the original C implementation that are inconsistent
 * with the algorithm as described in Lovins's paper).
 * The flag is read while the ending tables are built and when condition
 * code 'b' is evaluated; nothing in this class ever sets it to true.
 */
private static boolean m_CompMode = false;

/**
 * The hash tables containing the list of endings. The table named
 * m_l&lt;N&gt; holds the endings that are N characters long. Each ending
 * maps to a one-character condition code that removeEnding evaluates
 * before it strips the ending from a word.
 */
private static HashMap<String,String> m_l11 = null;
private static HashMap<String,String> m_l10 = null;
private static HashMap<String,String> m_l9 = null;
private static HashMap<String,String> m_l8 = null;
private static HashMap<String,String> m_l7 = null;
private static HashMap<String,String> m_l6 = null;
private static HashMap<String,String> m_l5 = null;
private static HashMap<String,String> m_l4 = null;
private static HashMap<String,String> m_l3 = null;
private static HashMap<String,String> m_l2 = null;
private static HashMap<String,String> m_l1 = null;
82
83	static {
84
85	m_l11 = new HashMap<String,String>();
86	m_l11.put("alistically", "B");
87	m_l11.put("arizability", "A");
88	m_l11.put("izationally", "B");
89	m_l10 = new HashMap<String,String>();
90	m_l10.put("antialness", "A");
91	m_l10.put("arisations", "A");
92	m_l10.put("arizations", "A");
93	m_l10.put("entialness", "A");
94	m_l9 = new HashMap<String,String>();
95	m_l9.put("allically", "C");
96	m_l9.put("antaneous", "A");
97	m_l9.put("antiality", "A");
98	m_l9.put("arisation", "A");
99	m_l9.put("arization", "A");
100	m_l9.put("ationally", "B");
101	m_l9.put("ativeness", "A");
102	m_l9.put("eableness", "E");
103	m_l9.put("entations", "A");
104	m_l9.put("entiality", "A");
105	m_l9.put("entialize", "A");
106	m_l9.put("entiation", "A");
107	m_l9.put("ionalness", "A");
108	m_l9.put("istically", "A");
109	m_l9.put("itousness", "A");
110	m_l9.put("izability", "A");
111	m_l9.put("izational", "A");
112	m_l8 = new HashMap<String,String>();
113	m_l8.put("ableness", "A");
114	m_l8.put("arizable", "A");
115	m_l8.put("entation", "A");
116	m_l8.put("entially", "A");
117	m_l8.put("eousness", "A");
118	m_l8.put("ibleness", "A");
119	m_l8.put("icalness", "A");
120	m_l8.put("ionalism", "A");
121	m_l8.put("ionality", "A");
122	m_l8.put("ionalize", "A");
123	m_l8.put("iousness", "A");
124	m_l8.put("izations", "A");
125	m_l8.put("lessness", "A");
126	m_l7 = new HashMap<String,String>();
127	m_l7.put("ability", "A");
128	m_l7.put("aically", "A");
129	m_l7.put("alistic", "B");
130	m_l7.put("alities", "A");
131	m_l7.put("ariness", "E");
132	m_l7.put("aristic", "A");
133	m_l7.put("arizing", "A");
134	m_l7.put("ateness", "A");
135	m_l7.put("atingly", "A");
136	m_l7.put("ational", "B");
137	m_l7.put("atively", "A");
138	m_l7.put("ativism", "A");
139	m_l7.put("elihood", "E");
140	m_l7.put("encible", "A");
141	m_l7.put("entally", "A");
142	m_l7.put("entials", "A");
143	m_l7.put("entiate", "A");
144	m_l7.put("entness", "A");
145	m_l7.put("fulness", "A");
146	m_l7.put("ibility", "A");
147	m_l7.put("icalism", "A");
148	m_l7.put("icalist", "A");
149	m_l7.put("icality", "A");
150	m_l7.put("icalize", "A");
151	m_l7.put("ication", "G");
152	m_l7.put("icianry", "A");
153	m_l7.put("ination", "A");
154	m_l7.put("ingness", "A");
155	m_l7.put("ionally", "A");
156	m_l7.put("isation", "A");
157	m_l7.put("ishness", "A");
158	m_l7.put("istical", "A");
159	m_l7.put("iteness", "A");
160	m_l7.put("iveness", "A");
161	m_l7.put("ivistic", "A");
162	m_l7.put("ivities", "A");
163	m_l7.put("ization", "F");
164	m_l7.put("izement", "A");
165	m_l7.put("oidally", "A");
166	m_l7.put("ousness", "A");
167	m_l6 = new HashMap<String,String>();
168	m_l6.put("aceous", "A");
169	m_l6.put("acious", "B");
170	m_l6.put("action", "G");
171	m_l6.put("alness", "A");
172	m_l6.put("ancial", "A");
173	m_l6.put("ancies", "A");
174	m_l6.put("ancing", "B");
175	m_l6.put("ariser", "A");
176	m_l6.put("arized", "A");
177	m_l6.put("arizer", "A");
178	m_l6.put("atable", "A");
179	m_l6.put("ations", "B");
180	m_l6.put("atives", "A");
181	m_l6.put("eature", "Z");
182	m_l6.put("efully", "A");
183	m_l6.put("encies", "A");
184	m_l6.put("encing", "A");
185	m_l6.put("ential", "A");
186	m_l6.put("enting", "C");
187	m_l6.put("entist", "A");
188	m_l6.put("eously", "A");
189	m_l6.put("ialist", "A");
190	m_l6.put("iality", "A");
191	m_l6.put("ialize", "A");
192	m_l6.put("ically", "A");
193	m_l6.put("icance", "A");
194	m_l6.put("icians", "A");
195	m_l6.put("icists", "A");
196	m_l6.put("ifully", "A");
197	m_l6.put("ionals", "A");
198	m_l6.put("ionate", "D");
199	m_l6.put("ioning", "A");
200	m_l6.put("ionist", "A");
201	m_l6.put("iously", "A");
202	m_l6.put("istics", "A");
203	m_l6.put("izable", "E");
204	m_l6.put("lessly", "A");
205	m_l6.put("nesses", "A");
206	m_l6.put("oidism", "A");
207	m_l5 = new HashMap<String,String>();
208	m_l5.put("acies", "A");
209	m_l5.put("acity", "A");
210	m_l5.put("aging", "B");
211	m_l5.put("aical", "A");
212	if (!m_CompMode) {
213	m_l5.put("alist", "A");
214	}
215	m_l5.put("alism", "B");
216	m_l5.put("ality", "A");
217	m_l5.put("alize", "A");
218	m_l5.put("allic", "b");
219	m_l5.put("anced", "B");
220	m_l5.put("ances", "B");
221	m_l5.put("antic", "C");
222	m_l5.put("arial", "A");
223	m_l5.put("aries", "A");
224	m_l5.put("arily", "A");
225	m_l5.put("arity", "B");
226	m_l5.put("arize", "A");
227	m_l5.put("aroid", "A");
228	m_l5.put("ately", "A");
229	m_l5.put("ating", "I");
230	m_l5.put("ation", "B");
231	m_l5.put("ative", "A");
232	m_l5.put("ators", "A");
233	m_l5.put("atory", "A");
234	m_l5.put("ature", "E");
235	m_l5.put("early", "Y");
236	m_l5.put("ehood", "A");
237	m_l5.put("eless", "A");
238	if (!m_CompMode) {
239	m_l5.put("elily", "A");
240	} else {
241	m_l5.put("elity", "A");
242	}
243	m_l5.put("ement", "A");
244	m_l5.put("enced", "A");
245	m_l5.put("ences", "A");
246	m_l5.put("eness", "E");
247	m_l5.put("ening", "E");
248	m_l5.put("ental", "A");
249	m_l5.put("ented", "C");
250	m_l5.put("ently", "A");
251	m_l5.put("fully", "A");
252	m_l5.put("ially", "A");
253	m_l5.put("icant", "A");
254	m_l5.put("ician", "A");
255	m_l5.put("icide", "A");
256	m_l5.put("icism", "A");
257	m_l5.put("icist", "A");
258	m_l5.put("icity", "A");
259	m_l5.put("idine", "I");
260	m_l5.put("iedly", "A");
261	m_l5.put("ihood", "A");
262	m_l5.put("inate", "A");
263	m_l5.put("iness", "A");
264	m_l5.put("ingly", "B");
265	m_l5.put("inism", "J");
266	m_l5.put("inity", "c");
267	m_l5.put("ional", "A");
268	m_l5.put("ioned", "A");
269	m_l5.put("ished", "A");
270	m_l5.put("istic", "A");
271	m_l5.put("ities", "A");
272	m_l5.put("itous", "A");
273	m_l5.put("ively", "A");
274	m_l5.put("ivity", "A");
275	m_l5.put("izers", "F");
276	m_l5.put("izing", "F");
277	m_l5.put("oidal", "A");
278	m_l5.put("oides", "A");
279	m_l5.put("otide", "A");
280	m_l5.put("ously", "A");
281	m_l4 = new HashMap<String,String>();
282	m_l4.put("able", "A");
283	m_l4.put("ably", "A");
284	m_l4.put("ages", "B");
285	m_l4.put("ally", "B");
286	m_l4.put("ance", "B");
287	m_l4.put("ancy", "B");
288	m_l4.put("ants", "B");
289	m_l4.put("aric", "A");
290	m_l4.put("arly", "K");
291	m_l4.put("ated", "I");
292	m_l4.put("ates", "A");
293	m_l4.put("atic", "B");
294	m_l4.put("ator", "A");
295	m_l4.put("ealy", "Y");
296	m_l4.put("edly", "E");
297	m_l4.put("eful", "A");
298	m_l4.put("eity", "A");
299	m_l4.put("ence", "A");
300	m_l4.put("ency", "A");
301	m_l4.put("ened", "E");
302	m_l4.put("enly", "E");
303	m_l4.put("eous", "A");
304	m_l4.put("hood", "A");
305	m_l4.put("ials", "A");
306	m_l4.put("ians", "A");
307	m_l4.put("ible", "A");
308	m_l4.put("ibly", "A");
309	m_l4.put("ical", "A");
310	m_l4.put("ides", "L");
311	m_l4.put("iers", "A");
312	m_l4.put("iful", "A");
313	m_l4.put("ines", "M");
314	m_l4.put("ings", "N");
315	m_l4.put("ions", "B");
316	m_l4.put("ious", "A");
317	m_l4.put("isms", "B");
318	m_l4.put("ists", "A");
319	m_l4.put("itic", "H");
320	m_l4.put("ized", "F");
321	m_l4.put("izer", "F");
322	m_l4.put("less", "A");
323	m_l4.put("lily", "A");
324	m_l4.put("ness", "A");
325	m_l4.put("ogen", "A");
326	m_l4.put("ward", "A");
327	m_l4.put("wise", "A");
328	m_l4.put("ying", "B");
329	m_l4.put("yish", "A");
330	m_l3 = new HashMap<String,String>();
331	m_l3.put("acy", "A");
332	m_l3.put("age", "B");
333	m_l3.put("aic", "A");
334	m_l3.put("als", "b");
335	m_l3.put("ant", "B");
336	m_l3.put("ars", "O");
337	m_l3.put("ary", "F");
338	m_l3.put("ata", "A");
339	m_l3.put("ate", "A");
340	m_l3.put("eal", "Y");
341	m_l3.put("ear", "Y");
342	m_l3.put("ely", "E");
343	m_l3.put("ene", "E");
344	m_l3.put("ent", "C");
345	m_l3.put("ery", "E");
346	m_l3.put("ese", "A");
347	m_l3.put("ful", "A");
348	m_l3.put("ial", "A");
349	m_l3.put("ian", "A");
350	m_l3.put("ics", "A");
351	m_l3.put("ide", "L");
352	m_l3.put("ied", "A");
353	m_l3.put("ier", "A");
354	m_l3.put("ies", "P");
355	m_l3.put("ily", "A");
356	m_l3.put("ine", "M");
357	m_l3.put("ing", "N");
358	m_l3.put("ion", "Q");
359	m_l3.put("ish", "C");
360	m_l3.put("ism", "B");
361	m_l3.put("ist", "A");
362	m_l3.put("ite", "a");
363	m_l3.put("ity", "A");
364	m_l3.put("ium", "A");
365	m_l3.put("ive", "A");
366	m_l3.put("ize", "F");
367	m_l3.put("oid", "A");
368	m_l3.put("one", "R");
369	m_l3.put("ous", "A");
370	m_l2 = new HashMap<String,String>();
371	m_l2.put("ae", "A");
372	m_l2.put("al", "b");
373	m_l2.put("ar", "X");
374	m_l2.put("as", "B");
375	m_l2.put("ed", "E");
376	m_l2.put("en", "F");
377	m_l2.put("es", "E");
378	m_l2.put("ia", "A");
379	m_l2.put("ic", "A");
380	m_l2.put("is", "A");
381	m_l2.put("ly", "B");
382	m_l2.put("on", "S");
383	m_l2.put("or", "T");
384	m_l2.put("um", "U");
385	m_l2.put("us", "V");
386	m_l2.put("yl", "R");
387	m_l2.put("s\'", "A");
388	m_l2.put("\'s", "A");
389	m_l1 = new HashMap<String,String>();
390	m_l1.put("a", "A");
391	m_l1.put("e", "A");
392	m_l1.put("i", "A");
393	m_l1.put("o", "A");
394	m_l1.put("s", "W");
395	m_l1.put("y", "B");
396	}
397
398	/**
399	* Returns a string describing the stemmer
400	* @return a description suitable for
401	* displaying in the explorer/experimenter gui
402	*/
403	public String globalInfo() {
404	return
405	"A stemmer based on the Lovins stemmer, described here:\n\n"
406	+ getTechnicalInformation().toString();
407	}
408
409	/**
410	* Returns an instance of a TechnicalInformation object, containing
411	* detailed information about the technical background of this class,
412	* e.g., paper reference or book this class is based on.
413	*
414	* @return the technical information about this class
415	*/
416	public TechnicalInformation getTechnicalInformation() {
417	TechnicalInformation result;
418
419	result = new TechnicalInformation(Type.ARTICLE);
420	result.setValue(Field.AUTHOR, "Julie Beth Lovins");
421	result.setValue(Field.YEAR, "1968");
422	result.setValue(Field.TITLE, "Development of a stemming algorithm");
423	result.setValue(Field.JOURNAL, "Mechanical Translation and Computational Linguistics");
424	result.setValue(Field.VOLUME, "11");
425	result.setValue(Field.PAGES, "22-31");
426
427	return result;
428	}
429
430	/**
431	* Finds and removes ending from given word.
432	*
433	* @param word the word to work on
434	* @return the processed word
435	*/
436	private String removeEnding(String word) {
437
438	int length = word.length();
439	int el = 11;
440
441	while (el > 0) {
442	if (length - el > 1) {
443	String ending = word.substring(length - el);
444	String conditionCode = null;
445	switch (el) {
446	case 11: conditionCode = (String)m_l11.get(ending);
447	break;
448	case 10: conditionCode = (String)m_l10.get(ending);
449	break;
450	case 9: conditionCode = (String)m_l9.get(ending);
451	break;
452	case 8: conditionCode = (String)m_l8.get(ending);
453	break;
454	case 7: conditionCode = (String)m_l7.get(ending);
455	break;
456	case 6: conditionCode = (String)m_l6.get(ending);
457	break;
458	case 5: conditionCode = (String)m_l5.get(ending);
459	break;
460	case 4: conditionCode = (String)m_l4.get(ending);
461	break;
462	case 3: conditionCode = (String)m_l3.get(ending);
463	break;
464	case 2: conditionCode = (String)m_l2.get(ending);
465	break;
466	case 1: conditionCode = (String)m_l1.get(ending);
467	break;
468	default:
469	}
470	if (conditionCode != null) {
471	switch (conditionCode.charAt(0)) {
472	case 'A':
473	return word.substring(0, length - el);
474	case 'B':
475	if (length - el > 2) {
476	return word.substring(0, length - el);
477	}
478	break;
479	case 'C':
480	if (length - el > 3) {
481	return word.substring(0, length - el);
482	}
483	break;
484	case 'D':
485	if (length - el > 4) {
486	return word.substring(0, length - el);
487	}
488	break;
489	case 'E':
490	if (word.charAt(length - el - 1) != 'e') {
491	return word.substring(0, length - el);
492	}
493	break;
494	case 'F':
495	if ((length - el > 2) &&
496	(word.charAt(length - el - 1) != 'e')) {
497	return word.substring(0, length - el);
498	}
499	break;
500	case 'G':
501	if ((length - el > 2) &&
502	(word.charAt(length - el - 1) == 'f')) {
503	return word.substring(0, length - el);
504	}
505	break;
506	case 'H':
507	if ((word.charAt(length - el - 1) == 't') \|\|
508	((word.charAt(length - el - 1) == 'l') &&
509	(word.charAt(length - el - 2) == 'l'))) {
510	return word.substring(0, length - el);
511	}
512	break;
513	case 'I':
514	if ((word.charAt(length - el - 1) != 'o') &&
515	(word.charAt(length - el - 1) != 'e')) {
516	return word.substring(0, length - el);
517	}
518	break;
519	case 'J':
520	if ((word.charAt(length - el - 1) != 'a') &&
521	(word.charAt(length - el - 1) != 'e')) {
522	return word.substring(0, length - el);
523	}
524	break;
525	case 'K':
526	if ((length - el > 2) &&
527	((word.charAt(length - el - 1) == 'l') \|\|
528	(word.charAt(length - el - 1) == 'i') \|\|
529	((word.charAt(length - el - 1) == 'e') &&
530	(word.charAt(length - el - 3) == 'u')))) {
531	return word.substring(0, length - el);
532	}
533	break;
534	case 'L':
535	if ((word.charAt(length - el - 1) != 'u') &&
536	(word.charAt(length - el - 1) != 'x') &&
537	((word.charAt(length - el - 1) != 's') \|\|
538	(word.charAt(length - el - 2) == 'o'))) {
539	return word.substring(0, length - el);
540	}
541	break;
542	case 'M':
543	if ((word.charAt(length - el - 1) != 'a') &&
544	(word.charAt(length - el - 1) != 'c') &&
545	(word.charAt(length - el - 1) != 'e') &&
546	(word.charAt(length - el - 1) != 'm')) {
547	return word.substring(0, length - el);
548	}
549	break;
550	case 'N':
551	if ((length - el > 3) \|\|
552	((length - el == 3) &&
553	((word.charAt(length - el - 3) != 's')))) {
554	return word.substring(0, length - el);
555	}
556	break;
557	case 'O':
558	if ((word.charAt(length - el - 1) == 'l') \|\|
559	(word.charAt(length - el - 1) == 'i')) {
560	return word.substring(0, length - el);
561	}
562	break;
563	case 'P':
564	if (word.charAt(length - el - 1) != 'c') {
565	return word.substring(0, length - el);
566	}
567	break;
568	case 'Q':
569	if ((length - el > 2) &&
570	(word.charAt(length - el - 1) != 'l') &&
571	(word.charAt(length - el - 1) != 'n')) {
572	return word.substring(0, length - el);
573	}
574	break;
575	case 'R':
576	if ((word.charAt(length - el - 1) == 'n') \|\|
577	(word.charAt(length - el - 1) == 'r')) {
578	return word.substring(0, length - el);
579	}
580	break;
581	case 'S':
582	if (((word.charAt(length - el - 1) == 'r') &&
583	(word.charAt(length - el - 2) == 'd')) \|\|
584	((word.charAt(length - el - 1) == 't') &&
585	(word.charAt(length - el - 2) != 't'))) {
586	return word.substring(0, length - el);
587	}
588	break;
589	case 'T':
590	if ((word.charAt(length - el - 1) == 's') \|\|
591	((word.charAt(length - el - 1) == 't') &&
592	(word.charAt(length - el - 2) != 'o'))) {
593	return word.substring(0, length - el);
594	}
595	break;
596	case 'U':
597	if ((word.charAt(length - el - 1) == 'l') \|\|
598	(word.charAt(length - el - 1) == 'm') \|\|
599	(word.charAt(length - el - 1) == 'n') \|\|
600	(word.charAt(length - el - 1) == 'r')) {
601	return word.substring(0, length - el);
602	}
603	break;
604	case 'V':
605	if (word.charAt(length - el - 1) == 'c') {
606	return word.substring(0, length - el);
607	}
608	break;
609	case 'W':
610	if ((word.charAt(length - el - 1) != 's') &&
611	(word.charAt(length - el - 1) != 'u')) {
612	return word.substring(0, length - el);
613	}
614	break;
615	case 'X':
616	if ((word.charAt(length - el - 1) == 'l') \|\|
617	(word.charAt(length - el - 1) == 'i') \|\|
618	((length - el > 2) &&
619	(word.charAt(length - el - 1) == 'e') &&
620	(word.charAt(length - el - 3) == 'u'))) {
621	return word.substring(0, length - el);
622	}
623	break;
624	case 'Y':
625	if ((word.charAt(length - el - 1) == 'n') &&
626	(word.charAt(length - el - 2) == 'i')) {
627	return word.substring(0, length - el);
628	}
629	break;
630	case 'Z':
631	if (word.charAt(length - el - 1) != 'f') {
632	return word.substring(0, length - el);
633	}
634	break;
635	case 'a':
636	if ((word.charAt(length - el - 1) == 'd') \|\|
637	(word.charAt(length - el - 1) == 'f') \|\|
638	(((word.charAt(length - el - 1) == 'h') &&
639	(word.charAt(length - el - 2) == 'p'))) \|\|
640	(((word.charAt(length - el - 1) == 'h') &&
641	(word.charAt(length - el - 2) == 't'))) \|\|
642	(word.charAt(length - el - 1) == 'l') \|\|
643	(((word.charAt(length - el - 1) == 'r') &&
644	(word.charAt(length - el - 2) == 'e'))) \|\|
645	(((word.charAt(length - el - 1) == 'r') &&
646	(word.charAt(length - el - 2) == 'o'))) \|\|
647	(((word.charAt(length - el - 1) == 's') &&
648	(word.charAt(length - el - 2) == 'e'))) \|\|
649	(word.charAt(length - el - 1) == 't')) {
650	return word.substring(0, length - el);
651	}
652	break;
653	case 'b':
654	if (m_CompMode) {
655	if (((length - el == 3 ) &&
656	(!((word.charAt(length - el - 1) == 't') &&
657	(word.charAt(length - el - 2) == 'e') &&
658	(word.charAt(length - el - 3) == 'm')))) \|\|
659	((length - el > 3) &&
660	(!((word.charAt(length - el - 1) == 't') &&
661	(word.charAt(length - el - 2) == 's') &&
662	(word.charAt(length - el - 3) == 'y') &&
663	(word.charAt(length - el - 4) == 'r'))))) {
664	return word.substring(0, length - el);
665	}
666	} else {
667	if ((length - el > 2) &&
668	(!((word.charAt(length - el - 1) == 't') &&
669	(word.charAt(length - el - 2) == 'e') &&
670	(word.charAt(length - el - 3) == 'm'))) &&
671	((length - el < 4) \|\|
672	(!((word.charAt(length - el - 1) == 't') &&
673	(word.charAt(length - el - 2) == 's') &&
674	(word.charAt(length - el - 3) == 'y') &&
675	(word.charAt(length - el - 4) == 'r'))))) {
676	return word.substring(0, length - el);
677	}
678	}
679	break;
680	case 'c':
681	if (word.charAt(length - el - 1) == 'l') {
682	return word.substring(0, length - el);
683	}
684	break;
685	default:
686	throw new IllegalArgumentException("Fatal error.");
687	}
688	}
689	}
690	el--;
691	}
692	return word;
693	}
694
695	/**
696	* Recodes ending of given word.
697	*
698	* @param word the word to work on
699	* @return the processed word
700	*/
701	private String recodeEnding(String word) {
702
703	int lastPos = word.length() - 1;
704
705	// Rule 1
706	if (word.endsWith("bb") \|\|
707	word.endsWith("dd") \|\|
708	word.endsWith("gg") \|\|
709	word.endsWith("ll") \|\|
710	word.endsWith("mm") \|\|
711	word.endsWith("nn") \|\|
712	word.endsWith("pp") \|\|
713	word.endsWith("rr") \|\|
714	word.endsWith("ss") \|\|
715	word.endsWith("tt")) {
716	word = word.substring(0, lastPos);
717	lastPos--;
718	}
719
720	// Rule 2
721	if (word.endsWith("iev")) {
722	word = word.substring(0, lastPos - 2).concat("ief");
723	}
724
725	// Rule 3
726	if (word.endsWith("uct")) {
727	word = word.substring(0, lastPos - 2).concat("uc");
728	lastPos--;
729	}
730
731	// Rule 4
732	if (word.endsWith("umpt")) {
733	word = word.substring(0, lastPos - 3).concat("um");
734	lastPos -= 2;
735	}
736
737	// Rule 5
738	if (word.endsWith("rpt")) {
739	word = word.substring(0, lastPos - 2).concat("rb");
740	lastPos--;
741	}
742
743	// Rule 6
744	if (word.endsWith("urs")) {
745	word = word.substring(0, lastPos - 2).concat("ur");
746	lastPos--;
747	}
748
749	// Rule 7
750	if (word.endsWith("istr")) {
751	word = word.substring(0, lastPos - 3).concat("ister");
752	lastPos++;
753	}
754
755	// Rule 7a
756	if (word.endsWith("metr")) {
757	word = word.substring(0, lastPos - 3).concat("meter");
758	lastPos++;
759	}
760
761	// Rule 8
762	if (word.endsWith("olv")) {
763	word = word.substring(0, lastPos - 2).concat("olut");
764	lastPos++;
765	}
766
767	// Rule 9
768	if (word.endsWith("ul")) {
769	if ((lastPos - 2 < 0) \|\|
770	((word.charAt(lastPos - 2) != 'a') &&
771	(word.charAt(lastPos - 2) != 'i') &&
772	(word.charAt(lastPos - 2) != 'o'))) {
773	word = word.substring(0, lastPos - 1).concat("l");
774	lastPos--;
775	}
776	}
777
778	// Rule 10
779	if (word.endsWith("bex")) {
780	word = word.substring(0, lastPos - 2).concat("bic");
781	}
782
783	// Rule 11
784	if (word.endsWith("dex")) {
785	word = word.substring(0, lastPos - 2).concat("dic");
786	}
787
788	// Rule 12
789	if (word.endsWith("pex")) {
790	word = word.substring(0, lastPos - 2).concat("pic");
791	}
792
793	// Rule 13
794	if (word.endsWith("tex")) {
795	word = word.substring(0, lastPos - 2).concat("tic");
796	}
797
798	// Rule 14
799	if (word.endsWith("ax")) {
800	word = word.substring(0, lastPos - 1).concat("ac");
801	}
802
803	// Rule 15
804	if (word.endsWith("ex")) {
805	word = word.substring(0, lastPos - 1).concat("ec");
806	}
807
808	// Rule 16
809	if (word.endsWith("ix")) {
810	word = word.substring(0, lastPos - 1).concat("ic");
811	}
812
813	// Rule 17
814	if (word.endsWith("lux")) {
815	word = word.substring(0, lastPos - 2).concat("luc");
816	}
817
818	// Rule 18
819	if (word.endsWith("uad")) {
820	word = word.substring(0, lastPos - 2).concat("uas");
821	}
822
823	// Rule 19
824	if (word.endsWith("vad")) {
825	word = word.substring(0, lastPos - 2).concat("vas");
826	}
827
828	// Rule 20
829	if (word.endsWith("cid")) {
830	word = word.substring(0, lastPos - 2).concat("cis");
831	}
832
833	// Rule 21
834	if (word.endsWith("lid")) {
835	word = word.substring(0, lastPos - 2).concat("lis");
836	}
837
838	// Rule 22
839	if (word.endsWith("erid")) {
840	word = word.substring(0, lastPos - 3).concat("eris");
841	}
842
843	// Rule 23
844	if (word.endsWith("pand")) {
845	word = word.substring(0, lastPos - 3).concat("pans");
846	}
847
848	// Rule 24
849	if (word.endsWith("end")) {
850	if ((lastPos - 3 < 0) \|\|
851	(word.charAt(lastPos - 3) != 's')) {
852	word = word.substring(0, lastPos - 2).concat("ens");
853	}
854	}
855
856	// Rule 25
857	if (word.endsWith("ond")) {
858	word = word.substring(0, lastPos - 2).concat("ons");
859	}
860
861	// Rule 26
862	if (word.endsWith("lud")) {
863	word = word.substring(0, lastPos - 2).concat("lus");
864	}
865
866	// Rule 27
867	if (word.endsWith("rud")) {
868	word = word.substring(0, lastPos - 2).concat("rus");
869	}
870
871	// Rule 28
872	if (word.endsWith("her")) {
873	if ((lastPos - 3 < 0) \|\|
874	((word.charAt(lastPos - 3) != 'p') &&
875	(word.charAt(lastPos - 3) != 't'))) {
876	word = word.substring(0, lastPos - 2).concat("hes");
877	}
878	}
879
880	// Rule 29
881	if (word.endsWith("mit")) {
882	word = word.substring(0, lastPos - 2).concat("mis");
883	}
884
885	// Rule 30
886	if (word.endsWith("end")) {
887	if ((lastPos - 3 < 0) \|\|
888	(word.charAt(lastPos - 3) != 'm')) {
889	word = word.substring(0, lastPos - 2).concat("ens");
890	}
891	}
892
893	// Rule 31
894	if (word.endsWith("ert")) {
895	word = word.substring(0, lastPos - 2).concat("ers");
896	}
897
898	// Rule 32
899	if (word.endsWith("et")) {
900	if ((lastPos - 2 < 0) \|\|
901	(word.charAt(lastPos - 2) != 'n')) {
902	word = word.substring(0, lastPos - 1).concat("es");
903	}
904	}
905
906	// Rule 33
907	if (word.endsWith("yt")) {
908	word = word.substring(0, lastPos - 1).concat("ys");
909	}
910
911	// Rule 34
912	if (word.endsWith("yz")) {
913	word = word.substring(0, lastPos - 1).concat("ys");
914	}
915
916	return word;
917	}
918
919	/**
920	* Returns the stemmed version of the given word.
921	* Word is converted to lower case before stemming.
922	*
923	* @param word a string consisting of a single word
924	* @return the stemmed word
925	*/
926	public String stem(String word) {
927
928	if (word.length() > 2) {
929	return recodeEnding(removeEnding(word.toLowerCase()));
930	} else {
931	return word.toLowerCase();
932	}
933	}
934
935	/**
936	* Stems everything in the given string. String
937	* is converted to lower case before stemming.
938	*
939	* @param str the string to stem
940	* @return the processed string
941	*/
942	public String stemString(String str) {
943
944	StringBuffer result = new StringBuffer();
945	int start = -1;
946	for (int j = 0; j < str.length(); j++) {
947	char c = str.charAt(j);
948	if (Character.isLetterOrDigit(c)) {
949	if (start == -1) {
950	start = j;
951	}
952	} else if (c == '\'') {
953	if (start == -1) {
954	result.append(c);
955	}
956	} else {
957	if (start != -1) {
958	result.append(stem(str.substring(start, j)));
959	start = -1;
960	}
961	result.append(c);
962	}
963	}
964	if (start != -1) {
965	result.append(stem(str.substring(start, str.length())));
966	}
967	return result.toString();
968	}
969
970	/**
971	* returns a string representation of the stemmer
972	*
973	* @return a string representation of the stemmer
974	*/
975	public String toString() {
976	return getClass().getName();
977	}
978
979	/**
980	* Returns the revision string.
981	*
982	* @return the revision
983	*/
984	public String getRevision() {
985	return RevisionUtils.extract("$Revision: 5953 $");
986	}
987
988	/**
989	* Runs the stemmer with the given options
990	*
991	* @param args the options
992	*/
993	public static void main(String[] args) {
994	try {
995	Stemming.useStemmer(new LovinsStemmer(), args);
996	}
997	catch (Exception e) {
998	e.printStackTrace();
999	}
1000	}
1001	}
1002

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/core/stemmers/LovinsStemmer.java

Download in other formats: