| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * Tokenizer.java |
|---|
| 19 | * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand |
|---|
| 20 | */ |
|---|
| 21 | |
|---|
| 22 | package weka.core.tokenizers; |
|---|
| 23 | |
|---|
| 24 | import weka.core.OptionHandler; |
|---|
| 25 | import weka.core.RevisionHandler; |
|---|
| 26 | |
|---|
| 27 | import java.io.BufferedReader; |
|---|
| 28 | import java.io.InputStreamReader; |
|---|
| 29 | import java.io.Serializable; |
|---|
| 30 | import java.util.Enumeration; |
|---|
| 31 | import java.util.Vector; |
|---|
| 32 | |
|---|
| 33 | /** |
|---|
| 34 | * A superclass for all tokenizer algorithms. |
|---|
| 35 | * |
|---|
| 36 | * @author FracPete (fracpete at waikato dot ac dot nz) |
|---|
| 37 | * @version $Revision: 5953 $ |
|---|
| 38 | */ |
|---|
| 39 | public abstract class Tokenizer |
|---|
| 40 | implements Enumeration, OptionHandler, Serializable, RevisionHandler { |
|---|
| 41 | |
|---|
| 42 | /** |
|---|
| 43 | * Returns a string describing the stemmer |
|---|
| 44 | * |
|---|
| 45 | * @return a description suitable for displaying in the |
|---|
| 46 | * explorer/experimenter gui |
|---|
| 47 | */ |
|---|
| 48 | public abstract String globalInfo(); |
|---|
| 49 | |
|---|
| 50 | /** |
|---|
| 51 | * Returns an enumeration of all the available options.. |
|---|
| 52 | * |
|---|
| 53 | * @return an enumeration of all available options. |
|---|
| 54 | */ |
|---|
| 55 | public Enumeration listOptions() { |
|---|
| 56 | return (new Vector()).elements(); |
|---|
| 57 | } |
|---|
| 58 | |
|---|
| 59 | /** |
|---|
| 60 | * Gets the current option settings for the OptionHandler. |
|---|
| 61 | * |
|---|
| 62 | * @return the list of current option settings as an array of |
|---|
| 63 | * strings |
|---|
| 64 | */ |
|---|
| 65 | public String[] getOptions() { |
|---|
| 66 | return new String[0]; |
|---|
| 67 | } |
|---|
| 68 | |
|---|
| 69 | /** |
|---|
| 70 | * Sets the OptionHandler's options using the given list. All options |
|---|
| 71 | * will be set (or reset) during this call (i.e. incremental setting |
|---|
| 72 | * of options is not possible). |
|---|
| 73 | * |
|---|
| 74 | * @param options the list of options as an array of strings |
|---|
| 75 | * @throws Exception if an option is not supported |
|---|
| 76 | */ |
|---|
| 77 | public void setOptions(String[] options) throws Exception { |
|---|
| 78 | // nothing in this class |
|---|
| 79 | } |
|---|
| 80 | |
|---|
| 81 | /** |
|---|
| 82 | * Tests if this enumeration contains more elements. |
|---|
| 83 | * |
|---|
| 84 | * @return true if and only if this enumeration object contains |
|---|
| 85 | * at least one more element to provide; false otherwise. |
|---|
| 86 | */ |
|---|
| 87 | public abstract boolean hasMoreElements(); |
|---|
| 88 | |
|---|
| 89 | /** |
|---|
| 90 | * Returns the next element of this enumeration if this enumeration object |
|---|
| 91 | * has at least one more element to provide. |
|---|
| 92 | * |
|---|
| 93 | * @return the next element of this enumeration. |
|---|
| 94 | */ |
|---|
| 95 | public abstract Object nextElement(); |
|---|
| 96 | |
|---|
| 97 | /** |
|---|
| 98 | * Sets the string to tokenize. Tokenization happens immediately. |
|---|
| 99 | * |
|---|
| 100 | * @param s the string to tokenize |
|---|
| 101 | */ |
|---|
| 102 | public abstract void tokenize(String s); |
|---|
| 103 | |
|---|
| 104 | /** |
|---|
| 105 | * initializes the given tokenizer with the given options and runs the |
|---|
| 106 | * tokenizer over all the remaining strings in the options array. If no |
|---|
| 107 | * strings remained in the option string then data is read from stdin, line |
|---|
| 108 | * by line. |
|---|
| 109 | * |
|---|
| 110 | * @param tokenizer the tokenizer to use |
|---|
| 111 | * @param options the options for the tokenizer |
|---|
| 112 | * @return the tokenized strings |
|---|
| 113 | * @throws Exception if setting of options or tokenization fails |
|---|
| 114 | */ |
|---|
| 115 | public static String[] tokenize(Tokenizer tokenizer, String[] options) throws Exception { |
|---|
| 116 | Vector<String> result; |
|---|
| 117 | Vector<String> tmpResult; |
|---|
| 118 | Vector<String> data; |
|---|
| 119 | int i; |
|---|
| 120 | boolean processed; |
|---|
| 121 | BufferedReader reader; |
|---|
| 122 | String line; |
|---|
| 123 | |
|---|
| 124 | result = new Vector<String>(); |
|---|
| 125 | |
|---|
| 126 | // init tokenizer |
|---|
| 127 | tokenizer.setOptions(options); |
|---|
| 128 | |
|---|
| 129 | // for storing the data to process |
|---|
| 130 | data = new Vector<String>(); |
|---|
| 131 | |
|---|
| 132 | // run over all un-processed strings in the options array |
|---|
| 133 | processed = false; |
|---|
| 134 | for (i = 0; i < options.length; i++) { |
|---|
| 135 | if (options[i].length() != 0) { |
|---|
| 136 | processed = true; |
|---|
| 137 | data.add(options[i]); |
|---|
| 138 | } |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | // if no strings in option string then read from stdin |
|---|
| 142 | if (!processed) { |
|---|
| 143 | reader = new BufferedReader(new InputStreamReader(System.in)); |
|---|
| 144 | while ((line = reader.readLine()) != null) { |
|---|
| 145 | data.add(line); |
|---|
| 146 | } |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | // process data |
|---|
| 150 | for (i = 0; i < data.size(); i++) { |
|---|
| 151 | tmpResult = new Vector<String>(); |
|---|
| 152 | tokenizer.tokenize(data.get(i)); |
|---|
| 153 | while (tokenizer.hasMoreElements()) |
|---|
| 154 | tmpResult.add((String) tokenizer.nextElement()); |
|---|
| 155 | // add to result |
|---|
| 156 | result.addAll(tmpResult); |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | return result.toArray(new String[result.size()]); |
|---|
| 160 | } |
|---|
| 161 | |
|---|
| 162 | /** |
|---|
| 163 | * initializes the given tokenizer with the given options and runs the |
|---|
| 164 | * tokenizer over all the remaining strings in the options array. The |
|---|
| 165 | * generated tokens are then printed to stdout. If no strings remained |
|---|
| 166 | * in the option string then data is read from stdin, line by line. |
|---|
| 167 | * |
|---|
| 168 | * @param tokenizer the tokenizer to use |
|---|
| 169 | * @param options the options for the tokenizer |
|---|
| 170 | */ |
|---|
| 171 | public static void runTokenizer(Tokenizer tokenizer, String[] options) { |
|---|
| 172 | String[] result; |
|---|
| 173 | int i; |
|---|
| 174 | |
|---|
| 175 | try { |
|---|
| 176 | result = tokenize(tokenizer, options); |
|---|
| 177 | for (i = 0; i < result.length; i++) |
|---|
| 178 | System.out.println(result[i]); |
|---|
| 179 | } |
|---|
| 180 | catch (Exception e) { |
|---|
| 181 | e.printStackTrace(); |
|---|
| 182 | } |
|---|
| 183 | } |
|---|
| 184 | } |
|---|