Context Navigation

NumericCleaner.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 21.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* NumericCleaner.java
19	* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20	*/
21
22	package weka.filters.unsupervised.attribute;
23
24	import weka.core.Capabilities;
25	import weka.core.Instance;
26	import weka.core.DenseInstance;
27	import weka.core.Instances;
28	import weka.core.Option;
29	import weka.core.Range;
30	import weka.core.RevisionUtils;
31	import weka.core.Utils;
32	import weka.core.Capabilities.Capability;
33	import weka.filters.SimpleStreamFilter;
34
35	import java.util.Enumeration;
36	import java.util.Vector;
37
38
39	/**
40	<!-- globalinfo-start -->
41	* A filter that 'cleanses' the numeric data from values that are too small, too big or very close to a certain value (e.g., 0) and sets these values to a pre-defined default.
42	* <p/>
43	<!-- globalinfo-end -->
44	*
45	<!-- options-start -->
46	* Valid options are: <p/>
47	*
48	* <pre> -D
49	* Turns on output of debugging information.</pre>
50	*
51	* <pre> -min <double>
52	* The minimum threshold. (default -Double.MAX_VALUE)</pre>
53	*
54	* <pre> -min-default <double>
55	* The replacement for values smaller than the minimum threshold.
56	* (default -Double.MAX_VALUE)</pre>
57	*
58	* <pre> -max <double>
59	* The maximum threshold. (default Double.MAX_VALUE)</pre>
60	*
61	* <pre> -max-default <double>
62	* The replacement for values larger than the maximum threshold.
63	* (default Double.MAX_VALUE)</pre>
64	*
65	* <pre> -closeto <double>
66	* The number values are checked for closeness. (default 0)</pre>
67	*
68	* <pre> -closeto-default <double>
69	* The replacement for values that are close to '-closeto'.
70	* (default 0)</pre>
71	*
72	* <pre> -closeto-tolerance <double>
73	* The tolerance below which numbers are considered being close to
74	* to each other. (default 1E-6)</pre>
75	*
76	* <pre> -decimals <int>
77	* The number of decimals to round to, -1 means no rounding at all.
78	* (default -1)</pre>
79	*
80	* <pre> -R <col1,col2,...>
81	* The list of columns to cleanse, e.g., first-last or first-3,5-last.
82	* (default first-last)</pre>
83	*
84	* <pre> -V
85	* Inverts the matching sense.</pre>
86	*
87	* <pre> -include-class
88	* Whether to include the class in the cleansing.
89	* The class column will always be skipped, if this flag is not
90	* present. (default no)</pre>
91	*
92	<!-- options-end -->
93	*
94	* @author fracpete (fracpete at waikato dot ac dot nz)
95	* @version $Revision: 5987 $
96	*/
97	public class NumericCleaner
98	extends SimpleStreamFilter {
99
100	/** for serialization */
101	private static final long serialVersionUID = -352890679895066592L;
102
103	/** the minimum threshold */
104	protected double m_MinThreshold = -Double.MAX_VALUE;
105
106	/** the minimum default replacement value */
107	protected double m_MinDefault = -Double.MAX_VALUE;
108
109	/** the maximum threshold */
110	protected double m_MaxThreshold = Double.MAX_VALUE;
111
112	/** the maximum default replacement value */
113	protected double m_MaxDefault = Double.MAX_VALUE;
114
115	/** the number the values are checked for closeness to */
116	protected double m_CloseTo = 0;
117
118	/** the default replacement value for numbers "close-to" */
119	protected double m_CloseToDefault = 0;
120
121	/** the tolerance distance, below which numbers are considered being "close-to" */
122	protected double m_CloseToTolerance = 1E-6;
123
124	/** Stores which columns to cleanse */
125	protected Range m_Cols = new Range("first-last");
126
127	/** whether to include the class attribute */
128	protected boolean m_IncludeClass = false;
129
130	/** the number of decimals to round to (-1 means no rounding) */
131	protected int m_Decimals = -1;
132
133	/**
134	* Returns a string describing this filter.
135	*
136	* @return a description of the filter suitable for
137	* displaying in the explorer/experimenter gui
138	*/
139	public String globalInfo() {
140	return
141	"A filter that 'cleanses' the numeric data from values that are too "
142	+ "small, too big or very close to a certain value (e.g., 0) and sets "
143	+ "these values to a pre-defined default.";
144	}
145
146	/**
147	* Returns an enumeration describing the available options.
148	*
149	* @return an enumeration of all the available options.
150	*/
151	public Enumeration listOptions() {
152	Vector result;
153	Enumeration enm;
154
155	result = new Vector();
156
157	enm = super.listOptions();
158	while (enm.hasMoreElements())
159	result.addElement(enm.nextElement());
160
161	result.addElement(new Option(
162	"\tThe minimum threshold. (default -Double.MAX_VALUE)",
163	"min", 1, "-min <double>"));
164
165	result.addElement(new Option(
166	"\tThe replacement for values smaller than the minimum threshold.\n"
167	+ "\t(default -Double.MAX_VALUE)",
168	"min-default", 1, "-min-default <double>"));
169
170	result.addElement(new Option(
171	"\tThe maximum threshold. (default Double.MAX_VALUE)",
172	"max", 1, "-max <double>"));
173
174	result.addElement(new Option(
175	"\tThe replacement for values larger than the maximum threshold.\n"
176	+ "\t(default Double.MAX_VALUE)",
177	"max-default", 1, "-max-default <double>"));
178
179	result.addElement(new Option(
180	"\tThe number values are checked for closeness. (default 0)",
181	"closeto", 1, "-closeto <double>"));
182
183	result.addElement(new Option(
184	"\tThe replacement for values that are close to '-closeto'.\n"
185	+ "\t(default 0)",
186	"closeto-default", 1, "-closeto-default <double>"));
187
188	result.addElement(new Option(
189	"\tThe tolerance below which numbers are considered being close to \n"
190	+ "\tto each other. (default 1E-6)",
191	"closeto-tolerance", 1, "-closeto-tolerance <double>"));
192
193	result.addElement(new Option(
194	"\tThe number of decimals to round to, -1 means no rounding at all.\n"
195	+ "\t(default -1)",
196	"decimals", 1, "-decimals <int>"));
197
198	result.addElement(new Option(
199	"\tThe list of columns to cleanse, e.g., first-last or first-3,5-last.\n"
200	+ "\t(default first-last)",
201	"R", 1, "-R <col1,col2,...>"));
202
203	result.addElement(new Option(
204	"\tInverts the matching sense.",
205	"V", 0, "-V"));
206
207	result.addElement(new Option(
208	"\tWhether to include the class in the cleansing.\n"
209	+ "\tThe class column will always be skipped, if this flag is not\n"
210	+ "\tpresent. (default no)",
211	"include-class", 0, "-include-class"));
212
213	return result.elements();
214	}
215
216	/**
217	* Gets the current settings of the filter.
218	*
219	* @return an array of strings suitable for passing to setOptions
220	*/
221	public String[] getOptions() {
222	int i;
223	Vector result;
224	String[] options;
225
226	result = new Vector();
227	options = super.getOptions();
228	for (i = 0; i < options.length; i++)
229	result.add(options[i]);
230
231	result.add("-min");
232	result.add("" + m_MinThreshold);
233
234	result.add("-min-default");
235	result.add("" + m_MinDefault);
236
237	result.add("-max");
238	result.add("" + m_MaxThreshold);
239
240	result.add("-max-default");
241	result.add("" + m_MaxDefault);
242
243	result.add("-closeto");
244	result.add("" + m_CloseTo);
245
246	result.add("-closeto-default");
247	result.add("" + m_CloseToDefault);
248
249	result.add("-closeto-tolerance");
250	result.add("" + m_CloseToTolerance);
251
252	result.add("-R");
253	result.add("" + m_Cols.getRanges());
254
255	if (m_Cols.getInvert())
256	result.add("-V");
257
258	if (m_IncludeClass)
259	result.add("-include-class");
260
261	result.add("-decimals");
262	result.add("" + getDecimals());
263
264	return (String[]) result.toArray(new String[result.size()]);
265	}
266
267	/**
268	* Parses a given list of options. <p/>
269	*
270	<!-- options-start -->
271	* Valid options are: <p/>
272	*
273	* <pre> -D
274	* Turns on output of debugging information.</pre>
275	*
276	* <pre> -min <double>
277	* The minimum threshold. (default -Double.MAX_VALUE)</pre>
278	*
279	* <pre> -min-default <double>
280	* The replacement for values smaller than the minimum threshold.
281	* (default -Double.MAX_VALUE)</pre>
282	*
283	* <pre> -max <double>
284	* The maximum threshold. (default Double.MAX_VALUE)</pre>
285	*
286	* <pre> -max-default <double>
287	* The replacement for values larger than the maximum threshold.
288	* (default Double.MAX_VALUE)</pre>
289	*
290	* <pre> -closeto <double>
291	* The number values are checked for closeness. (default 0)</pre>
292	*
293	* <pre> -closeto-default <double>
294	* The replacement for values that are close to '-closeto'.
295	* (default 0)</pre>
296	*
297	* <pre> -closeto-tolerance <double>
298	* The tolerance below which numbers are considered being close to
299	* to each other. (default 1E-6)</pre>
300	*
301	* <pre> -decimals <int>
302	* The number of decimals to round to, -1 means no rounding at all.
303	* (default -1)</pre>
304	*
305	* <pre> -R <col1,col2,...>
306	* The list of columns to cleanse, e.g., first-last or first-3,5-last.
307	* (default first-last)</pre>
308	*
309	* <pre> -V
310	* Inverts the matching sense.</pre>
311	*
312	* <pre> -include-class
313	* Whether to include the class in the cleansing.
314	* The class column will always be skipped, if this flag is not
315	* present. (default no)</pre>
316	*
317	<!-- options-end -->
318	*
319	* @param options the list of options as an array of strings
320	* @throws Exception if an option is not supported
321	*/
322	public void setOptions(String[] options) throws Exception {
323	String tmpStr;
324
325	tmpStr = Utils.getOption("min", options);
326	if (tmpStr.length() != 0)
327	setMinThreshold(Double.parseDouble(tmpStr));
328	else
329	setMinThreshold(-Double.MAX_VALUE);
330
331	tmpStr = Utils.getOption("min-default", options);
332	if (tmpStr.length() != 0)
333	setMinDefault(Double.parseDouble(tmpStr));
334	else
335	setMinDefault(-Double.MAX_VALUE);
336
337	tmpStr = Utils.getOption("max", options);
338	if (tmpStr.length() != 0)
339	setMaxThreshold(Double.parseDouble(tmpStr));
340	else
341	setMaxThreshold(Double.MAX_VALUE);
342
343	tmpStr = Utils.getOption("max-default", options);
344	if (tmpStr.length() != 0)
345	setMaxDefault(Double.parseDouble(tmpStr));
346	else
347	setMaxDefault(Double.MAX_VALUE);
348
349	tmpStr = Utils.getOption("closeto", options);
350	if (tmpStr.length() != 0)
351	setCloseTo(Double.parseDouble(tmpStr));
352	else
353	setCloseTo(0);
354
355	tmpStr = Utils.getOption("closeto-default", options);
356	if (tmpStr.length() != 0)
357	setCloseToDefault(Double.parseDouble(tmpStr));
358	else
359	setCloseToDefault(0);
360
361	tmpStr = Utils.getOption("closeto-tolerance", options);
362	if (tmpStr.length() != 0)
363	setCloseToTolerance(Double.parseDouble(tmpStr));
364	else
365	setCloseToTolerance(1E-6);
366
367	tmpStr = Utils.getOption("R", options);
368	if (tmpStr.length() != 0)
369	setAttributeIndices(tmpStr);
370	else
371	setAttributeIndices("first-last");
372
373	setInvertSelection(Utils.getFlag("V", options));
374
375	setIncludeClass(Utils.getFlag("include-class", options));
376
377	tmpStr = Utils.getOption("decimals", options);
378	if (tmpStr.length() != 0)
379	setDecimals(Integer.parseInt(tmpStr));
380	else
381	setDecimals(-1);
382
383	super.setOptions(options);
384	}
385
386	/**
387	* Returns the Capabilities of this filter.
388	*
389	* @return the capabilities of this object
390	* @see Capabilities
391	*/
392	public Capabilities getCapabilities() {
393	Capabilities result = super.getCapabilities();
394	result.disableAll();
395
396	// attributes
397	result.enableAllAttributes();
398	result.enable(Capability.MISSING_VALUES);
399
400	// class
401	result.enableAllClasses();
402	result.enable(Capability.MISSING_CLASS_VALUES);
403	result.enable(Capability.NO_CLASS);
404
405	return result;
406	}
407
408	/**
409	* Determines the output format based on the input format and returns
410	* this. In case the output format cannot be returned immediately, i.e.,
411	* immediateOutputFormat() returns false, then this method will be called
412	* from batchFinished().
413	*
414	* @param inputFormat the input format to base the output format on
415	* @return the output format
416	* @throws Exception in case the determination goes wrong
417	* @see #hasImmediateOutputFormat()
418	* @see #batchFinished()
419	*/
420	protected Instances determineOutputFormat(Instances inputFormat)
421	throws Exception {
422
423	m_Cols.setUpper(inputFormat.numAttributes() - 1);
424
425	return new Instances(inputFormat);
426	}
427
428	/**
429	* processes the given instance (may change the provided instance) and
430	* returns the modified version.
431	*
432	* @param instance the instance to process
433	* @return the modified data
434	* @throws Exception in case the processing goes wrong
435	*/
436	protected Instance process(Instance instance) throws Exception {
437	Instance result;
438	int i;
439	double val;
440	double factor;
441
442	result = (Instance) instance.copy();
443
444	if (m_Decimals > -1)
445	factor = StrictMath.pow(10, m_Decimals);
446	else
447	factor = 1;
448
449	for (i = 0; i < result.numAttributes(); i++) {
450	// only numeric attributes
451	if (!result.attribute(i).isNumeric())
452	continue;
453
454	// out of range?
455	if (!m_Cols.isInRange(i))
456	continue;
457
458	// skip class?
459	if ( (result.classIndex() == i) && (!m_IncludeClass) )
460	continue;
461
462	// too small?
463	if (result.value(i) < m_MinThreshold) {
464	if (getDebug())
465	System.out.println("Too small: " + result.value(i) + " -> " + m_MinDefault);
466	result.setValue(i, m_MinDefault);
467	}
468	// too big?
469	else if (result.value(i) > m_MaxThreshold) {
470	if (getDebug())
471	System.out.println("Too big: " + result.value(i) + " -> " + m_MaxDefault);
472	result.setValue(i, m_MaxDefault);
473	}
474	// too close?
475	else if ( (result.value(i) - m_CloseTo < m_CloseToTolerance)
476	&& (m_CloseTo - result.value(i) < m_CloseToTolerance)
477	&& (result.value(i) != m_CloseTo) ) {
478	if (getDebug())
479	System.out.println("Too close: " + result.value(i) + " -> " + m_CloseToDefault);
480	result.setValue(i, m_CloseToDefault);
481	}
482
483	// decimals?
484	if (m_Decimals > -1) {
485	val = result.value(i);
486	val = StrictMath.round(val * factor) / factor;
487	result.setValue(i, val);
488	}
489	}
490
491	return result;
492	}
493
494	/**
495	* Returns the tip text for this property
496	*
497	* @return tip text for this property suitable for
498	* displaying in the explorer/experimenter gui
499	*/
500	public String minThresholdTipText() {
501	return "The minimum threshold below values are replaced by a default.";
502	}
503
504	/**
505	* Get the minimum threshold.
506	*
507	* @return the minimum threshold.
508	*/
509	public double getMinThreshold() {
510	return m_MinThreshold;
511	}
512
513	/**
514	* Set the minimum threshold.
515	*
516	* @param value the minimum threshold to use.
517	*/
518	public void setMinThreshold(double value) {
519	m_MinThreshold = value;
520	}
521
522	/**
523	* Returns the tip text for this property
524	*
525	* @return tip text for this property suitable for
526	* displaying in the explorer/experimenter gui
527	*/
528	public String minDefaultTipText() {
529	return "The default value to replace values that are below the minimum threshold.";
530	}
531
532	/**
533	* Get the minimum default.
534	*
535	* @return the minimum default.
536	*/
537	public double getMinDefault() {
538	return m_MinDefault;
539	}
540
541	/**
542	* Set the minimum default.
543	*
544	* @param value the minimum default to use.
545	*/
546	public void setMinDefault(double value) {
547	m_MinDefault = value;
548	}
549
550	/**
551	* Returns the tip text for this property
552	*
553	* @return tip text for this property suitable for
554	* displaying in the explorer/experimenter gui
555	*/
556	public String maxThresholdTipText() {
557	return "The maximum threshold above values are replaced by a default.";
558	}
559
560	/**
561	* Get the maximum threshold.
562	*
563	* @return the maximum threshold.
564	*/
565	public double getMaxThreshold() {
566	return m_MaxThreshold;
567	}
568
569	/**
570	* Set the maximum threshold.
571	*
572	* @param value the maximum threshold to use.
573	*/
574	public void setMaxThreshold(double value) {
575	m_MaxThreshold = value;
576	}
577
578	/**
579	* Returns the tip text for this property
580	*
581	* @return tip text for this property suitable for
582	* displaying in the explorer/experimenter gui
583	*/
584	public String maxDefaultTipText() {
585	return "The default value to replace values that are above the maximum threshold.";
586	}
587
588	/**
589	* Get the maximum default.
590	*
591	* @return the maximum default.
592	*/
593	public double getMaxDefault() {
594	return m_MaxDefault;
595	}
596
597	/**
598	* Set the naximum default.
599	*
600	* @param value the maximum default to use.
601	*/
602	public void setMaxDefault(double value) {
603	m_MaxDefault = value;
604	}
605
606	/**
607	* Returns the tip text for this property
608	*
609	* @return tip text for this property suitable for
610	* displaying in the explorer/experimenter gui
611	*/
612	public String closeToTipText() {
613	return
614	"The number values are checked for whether they are too close to "
615	+ "and get replaced by a default.";
616	}
617
618	/**
619	* Get the "close to" number.
620	*
621	* @return the "close to" number.
622	*/
623	public double getCloseTo() {
624	return m_CloseTo;
625	}
626
627	/**
628	* Set the "close to" number.
629	*
630	* @param value the number to use for checking closeness.
631	*/
632	public void setCloseTo(double value) {
633	m_CloseTo = value;
634	}
635
636	/**
637	* Returns the tip text for this property
638	*
639	* @return tip text for this property suitable for
640	* displaying in the explorer/experimenter gui
641	*/
642	public String closeToDefaultTipText() {
643	return "The default value to replace values with that are too close.";
644	}
645
646	/**
647	* Get the "close to" default.
648	*
649	* @return the "close to" default.
650	*/
651	public double getCloseToDefault() {
652	return m_CloseToDefault;
653	}
654
655	/**
656	* Set the "close to" default.
657	*
658	* @param value the "close to" default to use.
659	*/
660	public void setCloseToDefault(double value) {
661	m_CloseToDefault = value;
662	}
663
664	/**
665	* Returns the tip text for this property
666	*
667	* @return tip text for this property suitable for
668	* displaying in the explorer/experimenter gui
669	*/
670	public String closeToToleranceTipText() {
671	return "The value below which values are considered close to.";
672	}
673
674	/**
675	* Get the "close to" Tolerance.
676	*
677	* @return the "close to" Tolerance.
678	*/
679	public double getCloseToTolerance() {
680	return m_CloseToTolerance;
681	}
682
683	/**
684	* Set the "close to" Tolerance.
685	*
686	* @param value the "close to" Tolerance to use.
687	*/
688	public void setCloseToTolerance(double value) {
689	m_CloseToTolerance = value;
690	}
691
692	/**
693	* Returns the tip text for this property
694	*
695	* @return tip text for this property suitable for
696	* displaying in the explorer/experimenter gui
697	*/
698	public String attributeIndicesTipText() {
699	return "The selection of columns to use in the cleansing processs, first and last are valid indices.";
700	}
701
702	/**
703	* Gets the selection of the columns, e.g., first-last or first-3,5-last
704	*
705	* @return the selected indices
706	*/
707	public String getAttributeIndices() {
708	return m_Cols.getRanges();
709	}
710
711	/**
712	* Sets the columns to use, e.g., first-last or first-3,5-last
713	*
714	* @param value the columns to use
715	*/
716	public void setAttributeIndices(String value) {
717	m_Cols.setRanges(value);
718	}
719
720	/**
721	* Returns the tip text for this property
722	*
723	* @return tip text for this property suitable for
724	* displaying in the explorer/experimenter gui
725	*/
726	public String invertSelectionTipText() {
727	return "If enabled the selection of the columns is inverted.";
728	}
729
730	/**
731	* Gets whether the selection of the columns is inverted
732	*
733	* @return true if the selection is inverted
734	*/
735	public boolean getInvertSelection() {
736	return m_Cols.getInvert();
737	}
738
739	/**
740	* Sets whether the selection of the indices is inverted or not
741	*
742	* @param value the new invert setting
743	*/
744	public void setInvertSelection(boolean value) {
745	m_Cols.setInvert(value);
746	}
747
748	/**
749	* Returns the tip text for this property
750	*
751	* @return tip text for this property suitable for
752	* displaying in the explorer/experimenter gui
753	*/
754	public String includeClassTipText() {
755	return "If disabled, the class attribute will be always left out of the cleaning process.";
756	}
757
758	/**
759	* Gets whether the class is included in the cleaning process or always
760	* skipped.
761	*
762	* @return true if the class can be considered for cleaning.
763	*/
764	public boolean getIncludeClass() {
765	return m_IncludeClass;
766	}
767
768	/**
769	* Sets whether the class can be cleaned, too.
770	*
771	* @param value true if the class can be cleansed, too
772	*/
773	public void setIncludeClass(boolean value) {
774	m_IncludeClass = value;
775	}
776
777	/**
778	* Returns the tip text for this property
779	*
780	* @return tip text for this property suitable for
781	* displaying in the explorer/experimenter gui
782	*/
783	public String decimalsTipText() {
784	return "The number of decimals to round to, -1 means no rounding at all.";
785	}
786
787	/**
788	* Get the number of decimals to round to.
789	*
790	* @return the number of decimals.
791	*/
792	public int getDecimals() {
793	return m_Decimals;
794	}
795
796	/**
797	* Set the number of decimals to round to.
798	*
799	* @param value the number of decimals.
800	*/
801	public void setDecimals(int value) {
802	m_Decimals = value;
803	}
804
805	/**
806	* Returns the revision string.
807	*
808	* @return the revision
809	*/
810	public String getRevision() {
811	return RevisionUtils.extract("$Revision: 5987 $");
812	}
813
814	/**
815	* Runs the filter from commandline, use "-h" to see all options.
816	*
817	* @param args the commandline options for the filter
818	*/
819	public static void main(String[] args) {
820	runFilter(new NumericCleaner(), args);
821	}
822	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/NumericCleaner.java

Download in other formats: