source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/NumericToNominal.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 12.4 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * NumericToNominal.java
19 * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.filters.unsupervised.attribute;
23
24import weka.core.Attribute;
25import weka.core.Capabilities;
26import weka.core.FastVector;
27import weka.core.Instance; 
28import weka.core.DenseInstance;
29import weka.core.Instances;
30import weka.core.Option;
31import weka.core.Range;
32import weka.core.RevisionUtils;
33import weka.core.SparseInstance;
34import weka.core.Utils;
35import weka.core.Capabilities.Capability;
36import weka.filters.SimpleBatchFilter;
37
38import java.util.Collections;
39import java.util.Enumeration;
40import java.util.HashSet;
41import java.util.Vector;
42
43/**
44 <!-- globalinfo-start -->
45 * A filter for turning numeric attributes into nominal ones. Unlike discretization, it just takes all numeric values and adds them to the list of nominal values of that attribute. Useful after CSV imports, to enforce certain attributes to become nominal, e.g., the class attribute, containing values from 1 to 5.
46 * <p/>
47 <!-- globalinfo-end -->
48 *
49 <!-- options-start -->
50 * Valid options are: <p/>
51 *
52 * <pre> -R &lt;col1,col2-col4,...&gt;
53 *  Specifies list of columns to Discretize. First and last are valid indexes.
54 *  (default: first-last)</pre>
55 *
56 * <pre> -V
57 *  Invert matching sense of column indexes.</pre>
58 *
59 <!-- options-end -->
60 *
61 * @author  fracpete (fracpete at waikato dot ac dot nz)
62 * @version $Revision: 5987 $
63 */
64public class NumericToNominal
65  extends SimpleBatchFilter {
66
67  /** for serialization */
68  private static final long serialVersionUID = -6614630932899796239L;
69
70  /** the maximum number of decimals to use */
71  protected final static int MAX_DECIMALS = 6;
72 
73  /** Stores which columns to turn into nominals */
74  protected Range m_Cols = new Range("first-last");
75
76  /** The default columns to turn into nominals */
77  protected String m_DefaultCols = "first-last";
78
79  /**
80   * Returns a string describing this filter
81   *
82   * @return            a description of the filter suitable for
83   *                    displaying in the explorer/experimenter gui
84   */
85  public String globalInfo() {
86    return 
87        "A filter for turning numeric attributes into nominal ones. Unlike "
88      + "discretization, it just takes all numeric values and adds them to "
89      + "the list of nominal values of that attribute. Useful after CSV "
90      + "imports, to enforce certain attributes to become nominal, e.g., "
91      + "the class attribute, containing values from 1 to 5.";
92  }
93
94  /**
95   * Gets an enumeration describing the available options.
96   *
97   * @return            an enumeration of all the available options.
98   */
99  public Enumeration listOptions() {
100    Vector result = new Vector();
101
102    result.addElement(new Option(
103        "\tSpecifies list of columns to Discretize. First"
104        + " and last are valid indexes.\n"
105        + "\t(default: first-last)",
106        "R", 1, "-R <col1,col2-col4,...>"));
107
108    result.addElement(new Option(
109        "\tInvert matching sense of column indexes.",
110        "V", 0, "-V"));
111
112    return result.elements();
113  }
114
115  /**
116   * Parses a given list of options. <p/>
117   *
118   <!-- options-start -->
119   * Valid options are: <p/>
120   *
121   * <pre> -R &lt;col1,col2-col4,...&gt;
122   *  Specifies list of columns to Discretize. First and last are valid indexes.
123   *  (default: first-last)</pre>
124   *
125   * <pre> -V
126   *  Invert matching sense of column indexes.</pre>
127   *
128   <!-- options-end -->
129   *
130   * @param options the list of options as an array of strings
131   * @throws Exception if an option is not supported
132   */
133  public void setOptions(String[] options) throws Exception {
134    String      tmpStr;
135
136    super.setOptions(options);
137   
138    setInvertSelection(Utils.getFlag('V', options));
139
140    tmpStr = Utils.getOption('R', options);
141    if (tmpStr.length() != 0)
142      setAttributeIndices(tmpStr);
143    else
144      setAttributeIndices(m_DefaultCols);
145
146    if (getInputFormat() != null)
147      setInputFormat(getInputFormat());
148  }
149
150  /**
151   * Gets the current settings of the filter.
152   *
153   * @return an array of strings suitable for passing to setOptions
154   */
155  public String[] getOptions() {
156    int       i;
157    Vector    result;
158    String[]  options;
159
160    result = new Vector();
161    options = super.getOptions();
162    for (i = 0; i < options.length; i++)
163      result.add(options[i]);
164
165    if (!getAttributeIndices().equals("")) {
166      result.add("-R");
167      result.add(getAttributeIndices());
168    }
169
170    if (getInvertSelection())
171      result.add("-V");
172
173    return (String[]) result.toArray(new String[result.size()]);         
174  }
175
176  /**
177   * Returns the tip text for this property
178   *
179   * @return            tip text for this property suitable for
180   *                    displaying in the explorer/experimenter gui
181   */
182  public String invertSelectionTipText() {
183    return 
184        "Set attribute selection mode. If false, only selected"
185      + " (numeric) attributes in the range will be 'nominalized'; if"
186      + " true, only non-selected attributes will be 'nominalized'.";
187  }
188
189  /**
190   * Gets whether the supplied columns are to be worked on or the others.
191   *
192   * @return            true if the supplied columns will be worked on
193   */
194  public boolean getInvertSelection() {
195    return m_Cols.getInvert();
196  }
197
198  /**
199   * Sets whether selected columns should be worked on or all the others apart
200   * from these. If true all the other columns are considered for
201   * "nominalization".
202   *
203   * @param value       the new invert setting
204   */
205  public void setInvertSelection(boolean value) {
206    m_Cols.setInvert(value);
207  }
208
209  /**
210   * Returns the tip text for this property
211   *
212   * @return            tip text for this property suitable for
213   *                    displaying in the explorer/experimenter gui
214   */
215  public String attributeIndicesTipText() {
216    return "Specify range of attributes to act on."
217      + " This is a comma separated list of attribute indices, with"
218      + " \"first\" and \"last\" valid values. Specify an inclusive"
219      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
220  }
221
222  /**
223   * Gets the current range selection
224   *
225   * @return            a string containing a comma separated list of ranges
226   */
227  public String getAttributeIndices() {
228    return m_Cols.getRanges();
229  }
230
231  /**
232   * Sets which attributes are to be "nominalized" (only numeric
233   * attributes among the selection will be transformed).
234   *
235   * @param value       a string representing the list of attributes. Since
236   *                    the string will typically come from a user, attributes
237   *                    are indexed from 1. <br> eg: first-3,5,6-last
238   * @throws IllegalArgumentException if an invalid range list is supplied
239   */
240  public void setAttributeIndices(String value) {
241    m_Cols.setRanges(value);
242  }
243
244  /**
245   * Sets which attributes are to be transoformed to nominal. (only numeric
246   * attributes among the selection will be transformed).
247   *
248   * @param value       an array containing indexes of attributes to nominalize.
249   *                    Since the array will typically come from a program,
250   *                    attributes are indexed from 0.
251   * @throws IllegalArgumentException if an invalid set of ranges is supplied
252   */
253  public void setAttributeIndicesArray(int[] value) {
254    setAttributeIndices(Range.indicesToRangeList(value));
255  }
256
257  /**
258   * Returns the Capabilities of this filter.
259   *
260   * @return            the capabilities of this object
261   * @see               Capabilities
262   */
263  public Capabilities getCapabilities() {
264    Capabilities result = super.getCapabilities();
265    result.disableAll();
266
267    // attributes
268    result.enableAllAttributes();
269    result.enable(Capability.MISSING_VALUES);
270   
271    // class
272    result.enableAllClasses();
273    result.enable(Capability.MISSING_CLASS_VALUES);
274    result.enable(Capability.NO_CLASS);
275   
276    return result;
277  }
278
279  /**
280   * Determines the output format based on the input format and returns
281   * this. In case the output format cannot be returned immediately, i.e.,
282   * immediateOutputFormat() returns false, then this method will be called
283   * from batchFinished().
284   *
285   * @param inputFormat     the input format to base the output format on
286   * @return                the output format
287   * @throws Exception      in case the determination goes wrong
288   * @see   #hasImmediateOutputFormat()
289   * @see   #batchFinished()
290   */
291  protected Instances determineOutputFormat(Instances inputFormat)
292      throws Exception {
293   
294    Instances   data;
295    Instances   result;
296    FastVector  atts;
297    FastVector  values;
298    HashSet     hash;
299    int         i;
300    int         n;
301    boolean     isDate;
302    Instance    inst;
303    Vector      sorted;
304
305    m_Cols.setUpper(inputFormat.numAttributes() - 1);
306    data = new Instances(inputFormat);
307    atts = new FastVector();
308    for (i = 0; i < data.numAttributes(); i++) {
309      if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) {
310        atts.addElement(data.attribute(i));
311        continue;
312      }
313     
314      // date attribute?
315      isDate = (data.attribute(i).type() == Attribute.DATE);
316     
317      // determine all available attribtues in dataset
318      hash   = new HashSet();
319      for (n = 0; n < data.numInstances(); n++) {
320        inst = data.instance(n);
321        if (inst.isMissing(i))
322          continue;
323       
324        if (isDate)
325          hash.add(inst.stringValue(i));
326        else
327          hash.add(new Double(inst.value(i)));
328      }
329     
330      // sort values
331      sorted = new Vector();
332      for (Object o: hash)
333        sorted.add(o);
334      Collections.sort(sorted);
335     
336      // create attribute from sorted values
337      values = new FastVector();
338      for (Object o: sorted) {
339        if (isDate)
340          values.addElement(
341              o.toString());
342        else
343          values.addElement(
344              Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS));
345      }
346      atts.addElement(new Attribute(data.attribute(i).name(), values));
347    }
348   
349    result = new Instances(inputFormat.relationName(), atts, 0);
350    result.setClassIndex(inputFormat.classIndex());
351   
352    return result;
353  }
354
355  /**
356   * Processes the given data (may change the provided dataset) and returns
357   * the modified version. This method is called in batchFinished().
358   *
359   * @param instances   the data to process
360   * @return            the modified data
361   * @throws Exception  in case the processing goes wrong
362   * @see               #batchFinished()
363   */
364  protected Instances process(Instances instances) throws Exception {
365    Instances   result;
366    int         i;
367    int         n;
368    double[]    values;
369    String      value;
370    Instance    inst;
371    Instance    newInst;
372   
373    // we need the complete input data!
374    if (!isFirstBatchDone())
375      setOutputFormat(determineOutputFormat(getInputFormat()));
376   
377    result = new Instances(getOutputFormat());
378   
379    for (i = 0; i < instances.numInstances(); i++) {
380      inst   = instances.instance(i);
381      values = inst.toDoubleArray();
382     
383      for (n = 0; n < values.length; n++) {
384        if (    !m_Cols.isInRange(n)
385             || !instances.attribute(n).isNumeric() 
386             || inst.isMissing(n) )
387          continue;
388
389        // get index of value
390        if (instances.attribute(n).type() == Attribute.DATE)
391          value = inst.stringValue(n);
392        else
393          value = Utils.doubleToString(inst.value(n), MAX_DECIMALS);
394       
395        values[n] = result.attribute(n).indexOfValue(value);
396      }
397     
398      // generate new instance
399      if (inst instanceof SparseInstance)
400        newInst = new SparseInstance(inst.weight(), values);
401      else
402        newInst = new DenseInstance(inst.weight(), values);
403     
404      // copy possible string, relational values
405      newInst.setDataset(getOutputFormat());
406      copyValues(newInst, false, inst.dataset(), getOutputFormat());
407     
408      result.add(newInst);
409    }
410   
411    return result;
412  }
413 
414  /**
415   * Returns the revision string.
416   *
417   * @return            the revision
418   */
419  public String getRevision() {
420    return RevisionUtils.extract("$Revision: 5987 $");
421  }
422
423  /**
424   * Runs the filter with the given parameters. Use -h to list options.
425   *
426   * @param args        the commandline options
427   */
428  public static void main(String[] args) {
429    runFilter(new NumericToNominal(), args);
430  }
431}
Note: See TracBrowser for help on using the repository browser.