| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * CharacterDelimitedTokenizer.java |
|---|
| 19 | * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand |
|---|
| 20 | */ |
|---|
| 21 | |
|---|
| 22 | package weka.core.tokenizers; |
|---|
| 23 | |
|---|
| 24 | import weka.core.Option; |
|---|
| 25 | import weka.core.RevisionUtils; |
|---|
| 26 | import weka.core.Utils; |
|---|
| 27 | |
|---|
| 28 | import java.util.Enumeration; |
|---|
| 29 | import java.util.Vector; |
|---|
| 30 | |
|---|
| 31 | /** |
|---|
| 32 | * Abstract superclass for tokenizers that take characters as delimiters. |
|---|
| 33 | * |
|---|
| 34 | * @author fracpete (fracpete at waikato dot ac dot nz) |
|---|
| 35 | * @version $Revision: 5953 $ |
|---|
| 36 | */ |
|---|
| 37 | public abstract class CharacterDelimitedTokenizer |
|---|
| 38 | extends Tokenizer { |
|---|
| 39 | |
|---|
| 40 | /** Delimiters used in tokenization */ |
|---|
| 41 | protected String m_Delimiters = " \r\n\t.,;:'\"()?!"; |
|---|
| 42 | |
|---|
| 43 | /** |
|---|
| 44 | * Returns an enumeration of all the available options.. |
|---|
| 45 | * |
|---|
| 46 | * @return an enumeration of all available options. |
|---|
| 47 | */ |
|---|
| 48 | public Enumeration listOptions() { |
|---|
| 49 | Vector<Option> result; |
|---|
| 50 | |
|---|
| 51 | result = new Vector<Option>(); |
|---|
| 52 | |
|---|
| 53 | result.addElement(new Option( |
|---|
| 54 | "\tThe delimiters to use\n" |
|---|
| 55 | + "\t(default ' \\r\\n\\t.,;:'\"()?!').", |
|---|
| 56 | "delimiters", 1, "-delimiters <value>")); |
|---|
| 57 | |
|---|
| 58 | return result.elements(); |
|---|
| 59 | } |
|---|
| 60 | |
|---|
| 61 | /** |
|---|
| 62 | * Gets the current option settings for the OptionHandler. |
|---|
| 63 | * |
|---|
| 64 | * @return the list of current option settings as an array of |
|---|
| 65 | * strings |
|---|
| 66 | */ |
|---|
| 67 | public String[] getOptions() { |
|---|
| 68 | Vector<String> result; |
|---|
| 69 | |
|---|
| 70 | result = new Vector<String>(); |
|---|
| 71 | |
|---|
| 72 | result.add("-delimiters"); |
|---|
| 73 | result.add(getDelimiters()); |
|---|
| 74 | |
|---|
| 75 | return result.toArray(new String[result.size()]); |
|---|
| 76 | } |
|---|
| 77 | |
|---|
| 78 | /** |
|---|
| 79 | * Sets the OptionHandler's options using the given list. All options |
|---|
| 80 | * will be set (or reset) during this call (i.e. incremental setting |
|---|
| 81 | * of options is not possible). |
|---|
| 82 | * |
|---|
| 83 | * @param options the list of options as an array of strings |
|---|
| 84 | * @throws Exception if an option is not supported |
|---|
| 85 | */ |
|---|
| 86 | public void setOptions(String[] options) throws Exception { |
|---|
| 87 | String tmpStr; |
|---|
| 88 | |
|---|
| 89 | tmpStr = Utils.getOption("delimiters", options); |
|---|
| 90 | if (tmpStr.length() != 0) |
|---|
| 91 | setDelimiters(tmpStr); |
|---|
| 92 | else |
|---|
| 93 | setDelimiters(" \r\n\t.,;:'\"()?!"); |
|---|
| 94 | } |
|---|
| 95 | |
|---|
| 96 | /** |
|---|
| 97 | * Get the value of delimiters (not backquoted). |
|---|
| 98 | * |
|---|
| 99 | * @return Value of delimiters. |
|---|
| 100 | */ |
|---|
| 101 | public String getDelimiters() { |
|---|
| 102 | return m_Delimiters; |
|---|
| 103 | } |
|---|
| 104 | |
|---|
| 105 | /** |
|---|
| 106 | * Set the value of delimiters. For convenienve, the strings |
|---|
| 107 | * "\r", "\n", "\t", "\'", "\\" get automatically translated into their |
|---|
| 108 | * character representations '\r', '\n', '\t', '\'', '\\'. This means, one |
|---|
| 109 | * can either use <code>setDelimiters("\r\n\t\\");</code> or |
|---|
| 110 | * <code>setDelimiters("\\r\\n\\t\\\\");</code>. |
|---|
| 111 | * |
|---|
| 112 | * @param value Value to assign to delimiters. |
|---|
| 113 | * @see Utils#unbackQuoteChars(String) |
|---|
| 114 | */ |
|---|
| 115 | public void setDelimiters(String value) { |
|---|
| 116 | m_Delimiters = Utils.unbackQuoteChars(value); |
|---|
| 117 | } |
|---|
| 118 | |
|---|
| 119 | /** |
|---|
| 120 | * Returns the tip text for this property |
|---|
| 121 | * |
|---|
| 122 | * @return tip text for this property suitable for |
|---|
| 123 | * displaying in the explorer/experimenter gui |
|---|
| 124 | */ |
|---|
| 125 | public String delimitersTipText() { |
|---|
| 126 | return "Set of delimiter characters to use in tokenizing (\\r, \\n and \\t can be used for carriage-return, line-feed and tab)"; |
|---|
| 127 | } |
|---|
| 128 | } |
|---|