| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * WordTokenizer.java |
|---|
| 19 | * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand |
|---|
| 20 | */ |
|---|
| 21 | |
|---|
| 22 | package weka.core.tokenizers; |
|---|
| 23 | |
|---|
| 24 | import weka.core.RevisionUtils; |
|---|
| 25 | |
|---|
| 26 | import java.util.StringTokenizer; |
|---|
| 27 | |
|---|
| 28 | /** |
|---|
| 29 | <!-- globalinfo-start --> |
|---|
| 30 | * A simple tokenizer that is using the java.util.StringTokenizer class to tokenize the strings. |
|---|
| 31 | * <p/> |
|---|
| 32 | <!-- globalinfo-end --> |
|---|
| 33 | * |
|---|
| 34 | <!-- options-start --> |
|---|
| 35 | * Valid options are: <p/> |
|---|
| 36 | * |
|---|
| 37 | * <pre> -delimiters <value> |
|---|
| 38 | * The delimiters to use |
|---|
| 39 | * (default ' \r\n\t.,;:'"()?!').</pre> |
|---|
| 40 | * |
|---|
| 41 | <!-- options-end --> |
|---|
| 42 | * |
|---|
| 43 | * @author FracPete (fracpete at waikato dot ac dot nz) |
|---|
| 44 | * @version $Revision: 5953 $ |
|---|
| 45 | */ |
|---|
| 46 | public class WordTokenizer |
|---|
| 47 | extends CharacterDelimitedTokenizer { |
|---|
| 48 | |
|---|
| 49 | /** for serialization */ |
|---|
| 50 | private static final long serialVersionUID = -930893034037880773L; |
|---|
| 51 | |
|---|
| 52 | /** the actual tokenizer */ |
|---|
| 53 | protected transient StringTokenizer m_Tokenizer; |
|---|
| 54 | |
|---|
| 55 | /** |
|---|
| 56 | * Returns a string describing the stemmer |
|---|
| 57 | * |
|---|
| 58 | * @return a description suitable for displaying in the |
|---|
| 59 | * explorer/experimenter gui |
|---|
| 60 | */ |
|---|
| 61 | public String globalInfo() { |
|---|
| 62 | return |
|---|
| 63 | "A simple tokenizer that is using the java.util.StringTokenizer " |
|---|
| 64 | + "class to tokenize the strings."; |
|---|
| 65 | } |
|---|
| 66 | |
|---|
| 67 | /** |
|---|
| 68 | * Tests if this enumeration contains more elements. |
|---|
| 69 | * |
|---|
| 70 | * @return true if and only if this enumeration object contains |
|---|
| 71 | * at least one more element to provide; false otherwise. |
|---|
| 72 | */ |
|---|
| 73 | public boolean hasMoreElements() { |
|---|
| 74 | return m_Tokenizer.hasMoreElements(); |
|---|
| 75 | } |
|---|
| 76 | |
|---|
| 77 | /** |
|---|
| 78 | * Returns the next element of this enumeration if this enumeration object |
|---|
| 79 | * has at least one more element to provide. |
|---|
| 80 | * |
|---|
| 81 | * @return the next element of this enumeration. |
|---|
| 82 | */ |
|---|
| 83 | public Object nextElement() { |
|---|
| 84 | return m_Tokenizer.nextElement(); |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | /** |
|---|
| 88 | * Sets the string to tokenize. Tokenization happens immediately. |
|---|
| 89 | * |
|---|
| 90 | * @param s the string to tokenize |
|---|
| 91 | */ |
|---|
| 92 | public void tokenize(String s) { |
|---|
| 93 | m_Tokenizer = new StringTokenizer(s, getDelimiters()); |
|---|
| 94 | } |
|---|
| 95 | |
|---|
| 96 | /** |
|---|
| 97 | * Returns the revision string. |
|---|
| 98 | * |
|---|
| 99 | * @return the revision |
|---|
| 100 | */ |
|---|
| 101 | public String getRevision() { |
|---|
| 102 | return RevisionUtils.extract("$Revision: 5953 $"); |
|---|
| 103 | } |
|---|
| 104 | |
|---|
| 105 | /** |
|---|
| 106 | * Runs the tokenizer with the given options and strings to tokenize. |
|---|
| 107 | * The tokens are printed to stdout. |
|---|
| 108 | * |
|---|
| 109 | * @param args the commandline options and strings to tokenize |
|---|
| 110 | */ |
|---|
| 111 | public static void main(String[] args) { |
|---|
| 112 | runTokenizer(new WordTokenizer(), args); |
|---|
| 113 | } |
|---|
| 114 | } |
|---|
| 115 | |
|---|