source: tags/MetisMQIDemo/src/main/java/weka/core/tokenizers/Tokenizer.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 5.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * Tokenizer.java
19 * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.core.tokenizers;
23
24import weka.core.OptionHandler;
25import weka.core.RevisionHandler;
26
27import java.io.BufferedReader;
28import java.io.InputStreamReader;
29import java.io.Serializable;
30import java.util.Enumeration;
31import java.util.Vector;
32
33/**
34 * A superclass for all tokenizer algorithms.
35 *
36 * @author  FracPete (fracpete at waikato dot ac dot nz)
37 * @version $Revision: 5953 $
38 */
39public abstract class Tokenizer
40  implements Enumeration, OptionHandler, Serializable, RevisionHandler {
41 
42  /**
43   * Returns a string describing the stemmer
44   *
45   * @return            a description suitable for displaying in the
46   *                    explorer/experimenter gui
47   */
48  public abstract String globalInfo();
49   
50  /**
51   * Returns an enumeration of all the available options..
52   *
53   * @return            an enumeration of all available options.
54   */
55  public Enumeration listOptions() {
56    return (new Vector()).elements();
57  }
58 
59  /**
60   * Gets the current option settings for the OptionHandler.
61   *
62   * @return            the list of current option settings as an array of
63   *                    strings
64   */
65  public String[] getOptions() {
66    return new String[0];
67  }
68
69  /**
70   * Sets the OptionHandler's options using the given list. All options
71   * will be set (or reset) during this call (i.e. incremental setting
72   * of options is not possible).
73   *
74   * @param options     the list of options as an array of strings
75   * @throws Exception  if an option is not supported
76   */
77  public void setOptions(String[] options) throws Exception {
78    // nothing in this class
79  }
80
81  /**
82   * Tests if this enumeration contains more elements.
83   *
84   * @return            true if and only if this enumeration object contains
85   *                    at least one more element to provide; false otherwise.
86   */
87  public abstract boolean hasMoreElements();
88
89  /**
90   * Returns the next element of this enumeration if this enumeration object
91   * has at least one more element to provide.
92   *
93   * @return            the next element of this enumeration.
94   */
95  public abstract Object nextElement();
96 
97  /**
98   * Sets the string to tokenize. Tokenization happens immediately.
99   *
100   * @param s           the string to tokenize
101   */
102  public abstract void tokenize(String s);
103 
104  /**
105   * initializes the given tokenizer with the given options and runs the
106   * tokenizer over all the remaining strings in the options array. If no
107   * strings remained in the option string then data is read from stdin, line
108   * by line.
109   *
110   * @param tokenizer   the tokenizer to use
111   * @param options     the options for the tokenizer
112   * @return            the tokenized strings
113   * @throws Exception  if setting of options or tokenization fails
114   */
115  public static String[] tokenize(Tokenizer tokenizer, String[] options) throws Exception {
116    Vector<String>      result;
117    Vector<String>      tmpResult;
118    Vector<String>      data;
119    int                 i;
120    boolean             processed;
121    BufferedReader      reader;
122    String              line;
123   
124    result = new Vector<String>();
125   
126    // init tokenizer
127    tokenizer.setOptions(options);
128
129    // for storing the data to process
130    data = new Vector<String>();
131   
132    // run over all un-processed strings in the options array
133    processed = false;
134    for (i = 0; i < options.length; i++) {
135      if (options[i].length() != 0) {
136        processed = true;
137        data.add(options[i]);
138      }
139    }
140   
141    // if no strings in option string then read from stdin
142    if (!processed) {
143      reader = new BufferedReader(new InputStreamReader(System.in));
144      while ((line = reader.readLine()) != null) {
145        data.add(line);
146      }
147    }
148
149    // process data
150    for (i = 0; i < data.size(); i++) {
151      tmpResult = new Vector<String>();
152      tokenizer.tokenize(data.get(i));
153      while (tokenizer.hasMoreElements())
154        tmpResult.add((String) tokenizer.nextElement());
155      // add to result
156      result.addAll(tmpResult);
157    }
158   
159    return result.toArray(new String[result.size()]);
160  }
161 
162  /**
163   * initializes the given tokenizer with the given options and runs the
164   * tokenizer over all the remaining strings in the options array. The
165   * generated tokens are then printed to stdout. If no strings remained
166   * in the option string then data is read from stdin, line by line.
167   *
168   * @param tokenizer   the tokenizer to use
169   * @param options     the options for the tokenizer
170   */
171  public static void runTokenizer(Tokenizer tokenizer, String[] options) {
172    String[]    result;
173    int         i;
174
175    try {
176      result = tokenize(tokenizer, options);
177      for (i = 0; i < result.length; i++)
178        System.out.println(result[i]);
179    }
180    catch (Exception e) {
181      e.printStackTrace();
182    }
183  }
184}
Note: See TracBrowser for help on using the repository browser.