Context Navigation

AlphabeticTokenizer.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 3.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* AlphabeticTokenizer.java
19	* Copyright (C) 2003, 2007 University of Waikato, Hamilton, New Zealand
20	*/
21
22	package weka.core.tokenizers;
23
24	import weka.core.RevisionUtils;
25
26	import java.util.NoSuchElementException;
27
28	/**
29	<!-- globalinfo-start -->
30	* Alphabetic string tokenizer, tokens are to be formed only from contiguous alphabetic sequences.
31	* <p/>
32	<!-- globalinfo-end -->
33	*
34	* @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
35	* @author FracPete (fracpete at waikato dot ac dot nz)
36	* @version $Revision: 5953 $
37	*/
38	public class AlphabeticTokenizer
39	extends Tokenizer {
40
41	/** for serialization */
42	private static final long serialVersionUID = 6705199562609861697L;
43
44	/** the characters of the string */
45	protected char[] m_Str;
46
47	/** the current position */
48	protected int m_CurrentPos;
49
50	/**
51	* Returns a string describing the stemmer
52	*
53	* @return a description suitable for displaying in the
54	* explorer/experimenter gui
55	*/
56	public String globalInfo() {
57	return
58	"Alphabetic string tokenizer, tokens are to be formed only from "
59	+ "contiguous alphabetic sequences.";
60	}
61
62	/**
63	* returns whether there are more elements still
64	*
65	* @return true if there are still more elements
66	*/
67	public boolean hasMoreElements() {
68	int beginpos = m_CurrentPos;
69
70	while ( (beginpos < m_Str.length) &&
71	((m_Str[beginpos] < 'a') \|\| (m_Str[beginpos] > 'z')) &&
72	((m_Str[beginpos] < 'A') \|\| (m_Str[beginpos] > 'Z')) ) {
73	beginpos++;
74	}
75	m_CurrentPos = beginpos;
76
77	if ( (beginpos < m_Str.length) &&
78	(((m_Str[beginpos] >= 'a') && (m_Str[beginpos] <= 'z')) \|\|
79	((m_Str[beginpos] >= 'A') && (m_Str[beginpos] <= 'Z'))) ) {
80	return true;
81	}
82	else {
83	return false;
84	}
85	}
86
87	/**
88	* returns the next element
89	*
90	* @return the next element
91	*/
92	public Object nextElement() {
93	int beginpos, endpos;
94
95	beginpos = m_CurrentPos;
96
97	while ( (beginpos < m_Str.length) &&
98	((m_Str[beginpos] < 'a') && (m_Str[beginpos] > 'z')) &&
99	((m_Str[beginpos] < 'A') && (m_Str[beginpos] > 'Z')) ) {
100	beginpos++;
101	}
102	m_CurrentPos = endpos = beginpos;
103
104	if (beginpos >= m_Str.length)
105	throw new NoSuchElementException("No more tokens present");
106
107	while ((endpos < m_Str.length) &&
108	( ((m_Str[endpos] >= 'a') && (m_Str[endpos]<='z')) \|\|
109	((m_Str[endpos] >= 'A') && (m_Str[endpos]<='Z'))) ) {
110	endpos++;
111	}
112
113	String s = new String(m_Str, beginpos, endpos - m_CurrentPos);
114	m_CurrentPos = endpos;
115
116	return s;
117	}
118
119	/**
120	* Sets the string to tokenize. Tokenization happens immediately.
121	*
122	* @param s the string to tokenize
123	*/
124	public void tokenize(String s) {
125	m_CurrentPos = 0;
126	m_Str = new char[s.length()];
127	s.getChars(0, s.length(), m_Str, 0);
128	}
129
130	/**
131	* Returns the revision string.
132	*
133	* @return the revision
134	*/
135	public String getRevision() {
136	return RevisionUtils.extract("$Revision: 5953 $");
137	}
138
139	/**
140	* Runs the tokenizer with the given options and strings to tokenize.
141	* The tokens are printed to stdout.
142	*
143	* @param args the commandline options and strings to tokenize
144	*/
145	public static void main(String[] args) {
146	runTokenizer(new AlphabeticTokenizer(), args);
147	}
148	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/core/tokenizers/AlphabeticTokenizer.java

Download in other formats: