| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * AlphabeticTokenizer.java |
|---|
| 19 | * Copyright (C) 2003, 2007 University of Waikato, Hamilton, New Zealand |
|---|
| 20 | */ |
|---|
| 21 | |
|---|
| 22 | package weka.core.tokenizers; |
|---|
| 23 | |
|---|
| 24 | import weka.core.RevisionUtils; |
|---|
| 25 | |
|---|
| 26 | import java.util.NoSuchElementException; |
|---|
| 27 | |
|---|
| 28 | /** |
|---|
| 29 | <!-- globalinfo-start --> |
|---|
| 30 | * Alphabetic string tokenizer, tokens are to be formed only from contiguous alphabetic sequences. |
|---|
| 31 | * <p/> |
|---|
| 32 | <!-- globalinfo-end --> |
|---|
| 33 | * |
|---|
| 34 | * @author Asrhaf M. Kibriya (amk14@cs.waikato.ac.nz) |
|---|
| 35 | * @author FracPete (fracpete at waikato dot ac dot nz) |
|---|
| 36 | * @version $Revision: 5953 $ |
|---|
| 37 | */ |
|---|
| 38 | public class AlphabeticTokenizer |
|---|
| 39 | extends Tokenizer { |
|---|
| 40 | |
|---|
| 41 | /** for serialization */ |
|---|
| 42 | private static final long serialVersionUID = 6705199562609861697L; |
|---|
| 43 | |
|---|
| 44 | /** the characters of the string */ |
|---|
| 45 | protected char[] m_Str; |
|---|
| 46 | |
|---|
| 47 | /** the current position */ |
|---|
| 48 | protected int m_CurrentPos; |
|---|
| 49 | |
|---|
| 50 | /** |
|---|
| 51 | * Returns a string describing the stemmer |
|---|
| 52 | * |
|---|
| 53 | * @return a description suitable for displaying in the |
|---|
| 54 | * explorer/experimenter gui |
|---|
| 55 | */ |
|---|
| 56 | public String globalInfo() { |
|---|
| 57 | return |
|---|
| 58 | "Alphabetic string tokenizer, tokens are to be formed only from " |
|---|
| 59 | + "contiguous alphabetic sequences."; |
|---|
| 60 | } |
|---|
| 61 | |
|---|
| 62 | /** |
|---|
| 63 | * returns whether there are more elements still |
|---|
| 64 | * |
|---|
| 65 | * @return true if there are still more elements |
|---|
| 66 | */ |
|---|
| 67 | public boolean hasMoreElements() { |
|---|
| 68 | int beginpos = m_CurrentPos; |
|---|
| 69 | |
|---|
| 70 | while ( (beginpos < m_Str.length) && |
|---|
| 71 | ((m_Str[beginpos] < 'a') || (m_Str[beginpos] > 'z')) && |
|---|
| 72 | ((m_Str[beginpos] < 'A') || (m_Str[beginpos] > 'Z')) ) { |
|---|
| 73 | beginpos++; |
|---|
| 74 | } |
|---|
| 75 | m_CurrentPos = beginpos; |
|---|
| 76 | |
|---|
| 77 | if ( (beginpos < m_Str.length) && |
|---|
| 78 | (((m_Str[beginpos] >= 'a') && (m_Str[beginpos] <= 'z')) || |
|---|
| 79 | ((m_Str[beginpos] >= 'A') && (m_Str[beginpos] <= 'Z'))) ) { |
|---|
| 80 | return true; |
|---|
| 81 | } |
|---|
| 82 | else { |
|---|
| 83 | return false; |
|---|
| 84 | } |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | /** |
|---|
| 88 | * returns the next element |
|---|
| 89 | * |
|---|
| 90 | * @return the next element |
|---|
| 91 | */ |
|---|
| 92 | public Object nextElement() { |
|---|
| 93 | int beginpos, endpos; |
|---|
| 94 | |
|---|
| 95 | beginpos = m_CurrentPos; |
|---|
| 96 | |
|---|
| 97 | while ( (beginpos < m_Str.length) && |
|---|
| 98 | ((m_Str[beginpos] < 'a') && (m_Str[beginpos] > 'z')) && |
|---|
| 99 | ((m_Str[beginpos] < 'A') && (m_Str[beginpos] > 'Z')) ) { |
|---|
| 100 | beginpos++; |
|---|
| 101 | } |
|---|
| 102 | m_CurrentPos = endpos = beginpos; |
|---|
| 103 | |
|---|
| 104 | if (beginpos >= m_Str.length) |
|---|
| 105 | throw new NoSuchElementException("No more tokens present"); |
|---|
| 106 | |
|---|
| 107 | while ((endpos < m_Str.length) && |
|---|
| 108 | ( ((m_Str[endpos] >= 'a') && (m_Str[endpos]<='z')) || |
|---|
| 109 | ((m_Str[endpos] >= 'A') && (m_Str[endpos]<='Z'))) ) { |
|---|
| 110 | endpos++; |
|---|
| 111 | } |
|---|
| 112 | |
|---|
| 113 | String s = new String(m_Str, beginpos, endpos - m_CurrentPos); |
|---|
| 114 | m_CurrentPos = endpos; |
|---|
| 115 | |
|---|
| 116 | return s; |
|---|
| 117 | } |
|---|
| 118 | |
|---|
| 119 | /** |
|---|
| 120 | * Sets the string to tokenize. Tokenization happens immediately. |
|---|
| 121 | * |
|---|
| 122 | * @param s the string to tokenize |
|---|
| 123 | */ |
|---|
| 124 | public void tokenize(String s) { |
|---|
| 125 | m_CurrentPos = 0; |
|---|
| 126 | m_Str = new char[s.length()]; |
|---|
| 127 | s.getChars(0, s.length(), m_Str, 0); |
|---|
| 128 | } |
|---|
| 129 | |
|---|
| 130 | /** |
|---|
| 131 | * Returns the revision string. |
|---|
| 132 | * |
|---|
| 133 | * @return the revision |
|---|
| 134 | */ |
|---|
| 135 | public String getRevision() { |
|---|
| 136 | return RevisionUtils.extract("$Revision: 5953 $"); |
|---|
| 137 | } |
|---|
| 138 | |
|---|
| 139 | /** |
|---|
| 140 | * Runs the tokenizer with the given options and strings to tokenize. |
|---|
| 141 | * The tokens are printed to stdout. |
|---|
| 142 | * |
|---|
| 143 | * @param args the commandline options and strings to tokenize |
|---|
| 144 | */ |
|---|
| 145 | public static void main(String[] args) { |
|---|
| 146 | runTokenizer(new AlphabeticTokenizer(), args); |
|---|
| 147 | } |
|---|
| 148 | } |
|---|