source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/NumericCleaner.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 21.9 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * NumericCleaner.java
19 * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.filters.unsupervised.attribute;
23
24import weka.core.Capabilities;
25import weka.core.Instance; 
26import weka.core.DenseInstance;
27import weka.core.Instances;
28import weka.core.Option;
29import weka.core.Range;
30import weka.core.RevisionUtils;
31import weka.core.Utils;
32import weka.core.Capabilities.Capability;
33import weka.filters.SimpleStreamFilter;
34
35import java.util.Enumeration;
36import java.util.Vector;
37
38
39/**
40 <!-- globalinfo-start -->
41 * A filter that 'cleanses' the numeric data from values that are too small, too big or very close to a certain value (e.g., 0) and sets these values to a pre-defined default.
42 * <p/>
43 <!-- globalinfo-end -->
44 *
45 <!-- options-start -->
46 * Valid options are: <p/>
47 *
48 * <pre> -D
49 *  Turns on output of debugging information.</pre>
50 *
51 * <pre> -min &lt;double&gt;
52 *  The minimum threshold. (default -Double.MAX_VALUE)</pre>
53 *
54 * <pre> -min-default &lt;double&gt;
55 *  The replacement for values smaller than the minimum threshold.
56 *  (default -Double.MAX_VALUE)</pre>
57 *
58 * <pre> -max &lt;double&gt;
59 *  The maximum threshold. (default Double.MAX_VALUE)</pre>
60 *
61 * <pre> -max-default &lt;double&gt;
62 *  The replacement for values larger than the maximum threshold.
63 *  (default Double.MAX_VALUE)</pre>
64 *
65 * <pre> -closeto &lt;double&gt;
66 *  The number values are checked for closeness. (default 0)</pre>
67 *
68 * <pre> -closeto-default &lt;double&gt;
69 *  The replacement for values that are close to '-closeto'.
70 *  (default 0)</pre>
71 *
72 * <pre> -closeto-tolerance &lt;double&gt;
73 *  The tolerance below which numbers are considered being close to
74 *  each other. (default 1E-6)</pre>
75 *
76 * <pre> -decimals &lt;int&gt;
77 *  The number of decimals to round to, -1 means no rounding at all.
78 *  (default -1)</pre>
79 *
80 * <pre> -R &lt;col1,col2,...&gt;
81 *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
82 *  (default first-last)</pre>
83 *
84 * <pre> -V
85 *  Inverts the matching sense.</pre>
86 *
87 * <pre> -include-class
88 *  Whether to include the class in the cleansing.
89 *  The class column will always be skipped, if this flag is not
90 *  present. (default no)</pre>
91 *
92 <!-- options-end -->
93 *
94 * @author  fracpete (fracpete at waikato dot ac dot nz)
95 * @version $Revision: 5987 $
96 */
97public class NumericCleaner
98  extends SimpleStreamFilter {
99
100  /** for serialization */
101  private static final long serialVersionUID = -352890679895066592L;
102
103  /** the minimum threshold */
104  protected double m_MinThreshold = -Double.MAX_VALUE;
105
106  /** the minimum default replacement value */
107  protected double m_MinDefault = -Double.MAX_VALUE;
108
109  /** the maximum threshold */
110  protected double m_MaxThreshold = Double.MAX_VALUE;
111
112  /** the maximum default replacement value */
113  protected double m_MaxDefault = Double.MAX_VALUE;
114
115  /** the number the values are checked for closeness to */
116  protected double m_CloseTo = 0;
117
118  /** the default replacement value for numbers "close-to" */
119  protected double m_CloseToDefault = 0;
120
121  /** the tolerance distance, below which numbers are considered being "close-to" */
122  protected double m_CloseToTolerance = 1E-6;
123
124  /** Stores which columns to cleanse */
125  protected Range m_Cols = new Range("first-last");
126
127  /** whether to include the class attribute */
128  protected boolean m_IncludeClass = false;
129 
130  /** the number of decimals to round to (-1 means no rounding) */
131  protected int m_Decimals = -1;
132 
133  /**
134   * Returns a string describing this filter.
135   *
136   * @return      a description of the filter suitable for
137   *              displaying in the explorer/experimenter gui
138   */
139  public String globalInfo() {
140    return 
141        "A filter that 'cleanses' the numeric data from values that are too "
142      + "small, too big or very close to a certain value (e.g., 0) and sets "
143      + "these values to a pre-defined default.";
144  }
145
146  /**
147   * Returns an enumeration describing the available options.
148   *
149   * @return an enumeration of all the available options.
150   */
151  public Enumeration listOptions() {
152    Vector        result;
153    Enumeration   enm;
154
155    result = new Vector();
156
157    enm = super.listOptions();
158    while (enm.hasMoreElements())
159      result.addElement(enm.nextElement());
160
161    result.addElement(new Option(
162        "\tThe minimum threshold. (default -Double.MAX_VALUE)",
163        "min", 1, "-min <double>"));
164   
165    result.addElement(new Option(
166        "\tThe replacement for values smaller than the minimum threshold.\n"
167        + "\t(default -Double.MAX_VALUE)",
168        "min-default", 1, "-min-default <double>"));
169
170    result.addElement(new Option(
171        "\tThe maximum threshold. (default Double.MAX_VALUE)",
172        "max", 1, "-max <double>"));
173   
174    result.addElement(new Option(
175        "\tThe replacement for values larger than the maximum threshold.\n"
176        + "\t(default Double.MAX_VALUE)",
177        "max-default", 1, "-max-default <double>"));
178
179    result.addElement(new Option(
180        "\tThe number values are checked for closeness. (default 0)",
181        "closeto", 1, "-closeto <double>"));
182   
183    result.addElement(new Option(
184        "\tThe replacement for values that are close to '-closeto'.\n"
185        + "\t(default 0)",
186        "closeto-default", 1, "-closeto-default <double>"));
187   
188    result.addElement(new Option(
189        "\tThe tolerance below which numbers are considered being close to \n"
190        + "\tto each other. (default 1E-6)",
191        "closeto-tolerance", 1, "-closeto-tolerance <double>"));
192
193    result.addElement(new Option(
194        "\tThe number of decimals to round to, -1 means no rounding at all.\n"
195        + "\t(default -1)",
196        "decimals", 1, "-decimals <int>"));
197   
198    result.addElement(new Option(
199        "\tThe list of columns to cleanse, e.g., first-last or first-3,5-last.\n"
200        + "\t(default first-last)",
201        "R", 1, "-R <col1,col2,...>"));
202
203    result.addElement(new Option(
204        "\tInverts the matching sense.",
205        "V", 0, "-V"));
206
207    result.addElement(new Option(
208        "\tWhether to include the class in the cleansing.\n"
209        + "\tThe class column will always be skipped, if this flag is not\n"
210        + "\tpresent. (default no)",
211        "include-class", 0, "-include-class"));
212
213    return result.elements();
214  }       
215
216  /**
217   * Gets the current settings of the filter.
218   *
219   * @return an array of strings suitable for passing to setOptions
220   */
221  public String[] getOptions() {
222    int       i;
223    Vector    result;
224    String[]  options;
225
226    result = new Vector();
227    options = super.getOptions();
228    for (i = 0; i < options.length; i++)
229      result.add(options[i]);
230
231    result.add("-min"); 
232    result.add("" + m_MinThreshold);
233
234    result.add("-min-default"); 
235    result.add("" + m_MinDefault);
236
237    result.add("-max"); 
238    result.add("" + m_MaxThreshold);
239
240    result.add("-max-default"); 
241    result.add("" + m_MaxDefault);
242
243    result.add("-closeto"); 
244    result.add("" + m_CloseTo);
245
246    result.add("-closeto-default"); 
247    result.add("" + m_CloseToDefault);
248   
249    result.add("-closeto-tolerance"); 
250    result.add("" + m_CloseToTolerance);
251
252    result.add("-R"); 
253    result.add("" + m_Cols.getRanges());
254
255    if (m_Cols.getInvert())
256      result.add("-V");
257   
258    if (m_IncludeClass)
259      result.add("-include-class"); 
260
261    result.add("-decimals"); 
262    result.add("" + getDecimals());
263
264    return (String[]) result.toArray(new String[result.size()]);         
265  }       
266
267  /**
268   * Parses a given list of options. <p/>
269   *
270   <!-- options-start -->
271   * Valid options are: <p/>
272   *
273   * <pre> -D
274   *  Turns on output of debugging information.</pre>
275   *
276   * <pre> -min &lt;double&gt;
277   *  The minimum threshold. (default -Double.MAX_VALUE)</pre>
278   *
279   * <pre> -min-default &lt;double&gt;
280   *  The replacement for values smaller than the minimum threshold.
281   *  (default -Double.MAX_VALUE)</pre>
282   *
283   * <pre> -max &lt;double&gt;
284   *  The maximum threshold. (default Double.MAX_VALUE)</pre>
285   *
286   * <pre> -max-default &lt;double&gt;
287   *  The replacement for values larger than the maximum threshold.
288   *  (default Double.MAX_VALUE)</pre>
289   *
290   * <pre> -closeto &lt;double&gt;
291   *  The number values are checked for closeness. (default 0)</pre>
292   *
293   * <pre> -closeto-default &lt;double&gt;
294   *  The replacement for values that are close to '-closeto'.
295   *  (default 0)</pre>
296   *
297   * <pre> -closeto-tolerance &lt;double&gt;
298   *  The tolerance below which numbers are considered being close to
299   *  to each other. (default 1E-6)</pre>
300   *
301   * <pre> -decimals &lt;int&gt;
302   *  The number of decimals to round to, -1 means no rounding at all.
303   *  (default -1)</pre>
304   *
305   * <pre> -R &lt;col1,col2,...&gt;
306   *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
307   *  (default first-last)</pre>
308   *
309   * <pre> -V
310   *  Inverts the matching sense.</pre>
311   *
312   * <pre> -include-class
313   *  Whether to include the class in the cleansing.
314   *  The class column will always be skipped, if this flag is not
315   *  present. (default no)</pre>
316   *
317   <!-- options-end -->
318   *
319   * @param options the list of options as an array of strings
320   * @throws Exception if an option is not supported
321   */
322  public void setOptions(String[] options) throws Exception {
323    String      tmpStr;
324
325    tmpStr = Utils.getOption("min", options);
326    if (tmpStr.length() != 0)
327      setMinThreshold(Double.parseDouble(tmpStr));
328    else
329      setMinThreshold(-Double.MAX_VALUE);
330   
331    tmpStr = Utils.getOption("min-default", options);
332    if (tmpStr.length() != 0)
333      setMinDefault(Double.parseDouble(tmpStr));
334    else
335      setMinDefault(-Double.MAX_VALUE);
336   
337    tmpStr = Utils.getOption("max", options);
338    if (tmpStr.length() != 0)
339      setMaxThreshold(Double.parseDouble(tmpStr));
340    else
341      setMaxThreshold(Double.MAX_VALUE);
342   
343    tmpStr = Utils.getOption("max-default", options);
344    if (tmpStr.length() != 0)
345      setMaxDefault(Double.parseDouble(tmpStr));
346    else
347      setMaxDefault(Double.MAX_VALUE);
348   
349    tmpStr = Utils.getOption("closeto", options);
350    if (tmpStr.length() != 0)
351      setCloseTo(Double.parseDouble(tmpStr));
352    else
353      setCloseTo(0);
354   
355    tmpStr = Utils.getOption("closeto-default", options);
356    if (tmpStr.length() != 0)
357      setCloseToDefault(Double.parseDouble(tmpStr));
358    else
359      setCloseToDefault(0);
360   
361    tmpStr = Utils.getOption("closeto-tolerance", options);
362    if (tmpStr.length() != 0)
363      setCloseToTolerance(Double.parseDouble(tmpStr));
364    else
365      setCloseToTolerance(1E-6);
366   
367    tmpStr = Utils.getOption("R", options);
368    if (tmpStr.length() != 0)
369      setAttributeIndices(tmpStr);
370    else
371      setAttributeIndices("first-last");
372   
373    setInvertSelection(Utils.getFlag("V", options));
374   
375    setIncludeClass(Utils.getFlag("include-class", options));
376
377    tmpStr = Utils.getOption("decimals", options);
378    if (tmpStr.length() != 0)
379      setDecimals(Integer.parseInt(tmpStr));
380    else
381      setDecimals(-1);
382   
383    super.setOptions(options);
384  }       
385
386  /**
387   * Returns the Capabilities of this filter.
388   *
389   * @return            the capabilities of this object
390   * @see               Capabilities
391   */
392  public Capabilities getCapabilities() {
393    Capabilities result = super.getCapabilities();
394    result.disableAll();
395
396    // attributes
397    result.enableAllAttributes();
398    result.enable(Capability.MISSING_VALUES);
399   
400    // class
401    result.enableAllClasses();
402    result.enable(Capability.MISSING_CLASS_VALUES);
403    result.enable(Capability.NO_CLASS);
404   
405    return result;
406  }
407 
408  /**
409   * Determines the output format based on the input format and returns
410   * this. In case the output format cannot be returned immediately, i.e.,
411   * immediateOutputFormat() returns false, then this method will be called
412   * from batchFinished().
413   *
414   * @param inputFormat     the input format to base the output format on
415   * @return                the output format
416   * @throws Exception      in case the determination goes wrong
417   * @see   #hasImmediateOutputFormat()
418   * @see   #batchFinished()
419   */
420  protected Instances determineOutputFormat(Instances inputFormat)
421      throws Exception {
422
423    m_Cols.setUpper(inputFormat.numAttributes() - 1);
424   
425    return new Instances(inputFormat);
426  }
427
428  /**
429   * processes the given instance (may change the provided instance) and
430   * returns the modified version.
431   *
432   * @param instance    the instance to process
433   * @return            the modified data
434   * @throws Exception  in case the processing goes wrong
435   */
436  protected Instance process(Instance instance) throws Exception {
437    Instance            result;
438    int                 i;
439    double              val;
440    double              factor;
441   
442    result = (Instance) instance.copy();
443   
444    if (m_Decimals > -1)
445      factor = StrictMath.pow(10, m_Decimals);
446    else
447      factor = 1;
448   
449    for (i = 0; i < result.numAttributes(); i++) {
450      // only numeric attributes
451      if (!result.attribute(i).isNumeric())
452        continue;
453
454      // out of range?
455      if (!m_Cols.isInRange(i))
456        continue;
457     
458      // skip class?
459      if ( (result.classIndex() == i) && (!m_IncludeClass) )
460        continue;
461     
462      // too small?
463      if (result.value(i) < m_MinThreshold) {
464        if (getDebug())
465          System.out.println("Too small: " + result.value(i) + " -> " + m_MinDefault);
466        result.setValue(i, m_MinDefault);
467      }
468      // too big?
469      else if (result.value(i) > m_MaxThreshold) {
470        if (getDebug())
471          System.out.println("Too big: " + result.value(i) + " -> " + m_MaxDefault);
472        result.setValue(i, m_MaxDefault);
473      }
474      // too close?
475      else if (    (result.value(i) - m_CloseTo < m_CloseToTolerance) 
476                && (m_CloseTo - result.value(i) < m_CloseToTolerance) 
477                && (result.value(i) != m_CloseTo) ) {
478        if (getDebug())
479          System.out.println("Too close: " + result.value(i) + " -> " + m_CloseToDefault);
480        result.setValue(i, m_CloseToDefault);
481      }
482     
483      // decimals?
484      if (m_Decimals > -1) {
485        val = result.value(i);
486        val = StrictMath.round(val * factor) / factor;
487        result.setValue(i, val);
488      }
489    }
490
491    return result;
492  }
493
494  /**
495   * Returns the tip text for this property
496   *
497   * @return            tip text for this property suitable for
498   *                    displaying in the explorer/experimenter gui
499   */
500  public String minThresholdTipText() {
501    return "The minimum threshold below values are replaced by a default.";
502  }
503
504  /**
505   * Get the minimum threshold.
506   *
507   * @return            the minimum threshold.
508   */
509  public double getMinThreshold() {
510    return m_MinThreshold;
511  }
512
513  /**
514   * Set the minimum threshold.
515   *
516   * @param value       the minimum threshold to use.
517   */
518  public void setMinThreshold(double value) {
519    m_MinThreshold = value;
520  }
521
522  /**
523   * Returns the tip text for this property
524   *
525   * @return            tip text for this property suitable for
526   *                    displaying in the explorer/experimenter gui
527   */
528  public String minDefaultTipText() {
529    return "The default value to replace values that are below the minimum threshold.";
530  }
531
532  /**
533   * Get the minimum default.
534   *
535   * @return            the minimum default.
536   */
537  public double getMinDefault() {
538    return m_MinDefault;
539  }
540
541  /**
542   * Set the minimum default.
543   *
544   * @param value       the minimum default to use.
545   */
546  public void setMinDefault(double value) {
547    m_MinDefault = value;
548  }
549
550  /**
551   * Returns the tip text for this property
552   *
553   * @return            tip text for this property suitable for
554   *                    displaying in the explorer/experimenter gui
555   */
556  public String maxThresholdTipText() {
557    return "The maximum threshold above values are replaced by a default.";
558  }
559
560  /**
561   * Get the maximum threshold.
562   *
563   * @return            the maximum threshold.
564   */
565  public double getMaxThreshold() {
566    return m_MaxThreshold;
567  }
568
569  /**
570   * Set the maximum threshold.
571   *
572   * @param value       the maximum threshold to use.
573   */
574  public void setMaxThreshold(double value) {
575    m_MaxThreshold = value;
576  }
577
578  /**
579   * Returns the tip text for this property
580   *
581   * @return            tip text for this property suitable for
582   *                    displaying in the explorer/experimenter gui
583   */
584  public String maxDefaultTipText() {
585    return "The default value to replace values that are above the maximum threshold.";
586  }
587
588  /**
589   * Get the maximum default.
590   *
591   * @return            the maximum default.
592   */
593  public double getMaxDefault() {
594    return m_MaxDefault;
595  }
596
597  /**
598   * Set the naximum default.
599   *
600   * @param value       the maximum default to use.
601   */
602  public void setMaxDefault(double value) {
603    m_MaxDefault = value;
604  }
605
606  /**
607   * Returns the tip text for this property
608   *
609   * @return            tip text for this property suitable for
610   *                    displaying in the explorer/experimenter gui
611   */
612  public String closeToTipText() {
613    return 
614        "The number values are checked for whether they are too close to "
615      + "and get replaced by a default.";
616  }
617
618  /**
619   * Get the "close to" number.
620   *
621   * @return            the "close to" number.
622   */
623  public double getCloseTo() {
624    return m_CloseTo;
625  }
626
627  /**
628   * Set the "close to" number.
629   *
630   * @param value       the number to use for checking closeness.
631   */
632  public void setCloseTo(double value) {
633    m_CloseTo = value;
634  }
635
636  /**
637   * Returns the tip text for this property
638   *
639   * @return            tip text for this property suitable for
640   *                    displaying in the explorer/experimenter gui
641   */
642  public String closeToDefaultTipText() {
643    return "The default value to replace values with that are too close.";
644  }
645
646  /**
647   * Get the "close to" default.
648   *
649   * @return            the "close to" default.
650   */
651  public double getCloseToDefault() {
652    return m_CloseToDefault;
653  }
654
655  /**
656   * Set the "close to" default.
657   *
658   * @param value       the "close to" default to use.
659   */
660  public void setCloseToDefault(double value) {
661    m_CloseToDefault = value;
662  }
663
664  /**
665   * Returns the tip text for this property
666   *
667   * @return            tip text for this property suitable for
668   *                    displaying in the explorer/experimenter gui
669   */
670  public String closeToToleranceTipText() {
671    return "The value below which values are considered close to.";
672  }
673
674  /**
675   * Get the "close to" Tolerance.
676   *
677   * @return            the "close to" Tolerance.
678   */
679  public double getCloseToTolerance() {
680    return m_CloseToTolerance;
681  }
682
683  /**
684   * Set the "close to" Tolerance.
685   *
686   * @param value       the "close to" Tolerance to use.
687   */
688  public void setCloseToTolerance(double value) {
689    m_CloseToTolerance = value;
690  }
691
692  /**
693   * Returns the tip text for this property
694   *
695   * @return            tip text for this property suitable for
696   *                    displaying in the explorer/experimenter gui
697   */
698  public String attributeIndicesTipText() {
699    return "The selection of columns to use in the cleansing processs, first and last are valid indices.";
700  }
701
702  /**
703   * Gets the selection of the columns, e.g., first-last or first-3,5-last
704   *
705   * @return            the selected indices
706   */
707  public String getAttributeIndices() {
708    return m_Cols.getRanges();
709  }
710
711  /**
712   * Sets the columns to use, e.g., first-last or first-3,5-last
713   *
714   * @param value       the columns to use
715   */
716  public void setAttributeIndices(String value) {
717    m_Cols.setRanges(value);
718  }
719
720  /**
721   * Returns the tip text for this property
722   *
723   * @return            tip text for this property suitable for
724   *                    displaying in the explorer/experimenter gui
725   */
726  public String invertSelectionTipText() {
727    return "If enabled the selection of the columns is inverted.";
728  }
729
730  /**
731   * Gets whether the selection of the columns is inverted
732   *
733   * @return            true if the selection is inverted
734   */
735  public boolean getInvertSelection() {
736    return m_Cols.getInvert();
737  }
738
739  /**
740   * Sets whether the selection of the indices is inverted or not
741   *
742   * @param value       the new invert setting
743   */
744  public void setInvertSelection(boolean value) {
745    m_Cols.setInvert(value);
746  }
747
748  /**
749   * Returns the tip text for this property
750   *
751   * @return            tip text for this property suitable for
752   *                    displaying in the explorer/experimenter gui
753   */
754  public String includeClassTipText() {
755    return "If disabled, the class attribute will be always left out of the cleaning process.";
756  }
757
758  /**
759   * Gets whether the class is included in the cleaning process or always
760   * skipped.
761   *
762   * @return            true if the class can be considered for cleaning.
763   */
764  public boolean getIncludeClass() {
765    return m_IncludeClass;
766  }
767
768  /**
769   * Sets whether the class can be cleaned, too.
770   *
771   * @param value       true if the class can be cleansed, too
772   */
773  public void setIncludeClass(boolean value) {
774    m_IncludeClass = value;
775  }
776
777  /**
778   * Returns the tip text for this property
779   *
780   * @return            tip text for this property suitable for
781   *                    displaying in the explorer/experimenter gui
782   */
783  public String decimalsTipText() {
784    return "The number of decimals to round to, -1 means no rounding at all.";
785  }
786
787  /**
788   * Get the number of decimals to round to.
789   *
790   * @return            the number of decimals.
791   */
792  public int getDecimals() {
793    return m_Decimals;
794  }
795
796  /**
797   * Set the number of decimals to round to.
798   *
799   * @param value       the number of decimals.
800   */
801  public void setDecimals(int value) {
802    m_Decimals = value;
803  }
804 
805  /**
806   * Returns the revision string.
807   *
808   * @return            the revision
809   */
810  public String getRevision() {
811    return RevisionUtils.extract("$Revision: 5987 $");
812  }
813
814  /**
815   * Runs the filter from commandline, use "-h" to see all options.
816   *
817   * @param args the commandline options for the filter
818   */
819  public static void main(String[] args) {
820    runFilter(new NumericCleaner(), args);
821  }
822}
Note: See TracBrowser for help on using the repository browser.