1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * Discretize.java |
---|
19 | * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | |
---|
24 | package weka.filters.unsupervised.attribute; |
---|
25 | |
---|
26 | import weka.core.Attribute; |
---|
27 | import weka.core.Capabilities; |
---|
28 | import weka.core.FastVector; |
---|
29 | import weka.core.Instance; |
---|
30 | import weka.core.DenseInstance; |
---|
31 | import weka.core.Instances; |
---|
32 | import weka.core.Option; |
---|
33 | import weka.core.Range; |
---|
34 | import weka.core.RevisionUtils; |
---|
35 | import weka.core.SparseInstance; |
---|
36 | import weka.core.Utils; |
---|
37 | import weka.core.WeightedInstancesHandler; |
---|
38 | import weka.core.Capabilities.Capability; |
---|
39 | import weka.filters.UnsupervisedFilter; |
---|
40 | |
---|
41 | import java.util.Enumeration; |
---|
42 | import java.util.Vector; |
---|
43 | |
---|
44 | /** |
---|
45 | <!-- globalinfo-start --> |
---|
46 | * An instance filter that discretizes a range of numeric attributes in the dataset into nominal attributes. Discretization is by simple binning. Skips the class attribute if set. |
---|
47 | * <p/> |
---|
48 | <!-- globalinfo-end --> |
---|
49 | * |
---|
50 | <!-- options-start --> |
---|
51 | * Valid options are: <p/> |
---|
52 | * |
---|
53 | * <pre> -unset-class-temporarily |
---|
54 | * Unsets the class index temporarily before the filter is |
---|
55 | * applied to the data. |
---|
56 | * (default: no)</pre> |
---|
57 | * |
---|
58 | * <pre> -B <num> |
---|
59 | * Specifies the (maximum) number of bins to divide numeric attributes into. |
---|
60 | * (default = 10)</pre> |
---|
61 | * |
---|
62 | * <pre> -M <num> |
---|
63 | * Specifies the desired weight of instances per bin for |
---|
64 | * equal-frequency binning. If this is set to a positive |
---|
65 | * number then the -B option will be ignored. |
---|
66 | * (default = -1)</pre> |
---|
67 | * |
---|
68 | * <pre> -F |
---|
69 | * Use equal-frequency instead of equal-width discretization.</pre> |
---|
70 | * |
---|
71 | * <pre> -O |
---|
72 | * Optimize number of bins using leave-one-out estimate |
---|
73 | * of estimated entropy (for equal-width discretization). |
---|
74 | * If this is set then the -B option will be ignored.</pre> |
---|
75 | * |
---|
76 | * <pre> -R <col1,col2-col4,...> |
---|
77 | * Specifies list of columns to Discretize. First and last are valid indexes. |
---|
78 | * (default: first-last)</pre> |
---|
79 | * |
---|
80 | * <pre> -V |
---|
81 | * Invert matching sense of column indexes.</pre> |
---|
82 | * |
---|
83 | * <pre> -D |
---|
84 | * Output binary attributes for discretized attributes.</pre> |
---|
85 | * |
---|
86 | <!-- options-end --> |
---|
87 | * |
---|
88 | * @author Len Trigg (trigg@cs.waikato.ac.nz) |
---|
89 | * @author Eibe Frank (eibe@cs.waikato.ac.nz) |
---|
90 | * @version $Revision: 5987 $ |
---|
91 | */ |
---|
92 | public class Discretize |
---|
93 | extends PotentialClassIgnorer |
---|
94 | implements UnsupervisedFilter, WeightedInstancesHandler { |
---|
95 | |
---|
96 | /** for serialization */ |
---|
97 | static final long serialVersionUID = -1358531742174527279L; |
---|
98 | |
---|
99 | /** Stores which columns to Discretize */ |
---|
100 | protected Range m_DiscretizeCols = new Range(); |
---|
101 | |
---|
102 | /** The number of bins to divide the attribute into */ |
---|
103 | protected int m_NumBins = 10; |
---|
104 | |
---|
105 | /** The desired weight of instances per bin */ |
---|
106 | protected double m_DesiredWeightOfInstancesPerInterval = -1; |
---|
107 | |
---|
108 | /** Store the current cutpoints */ |
---|
109 | protected double [][] m_CutPoints = null; |
---|
110 | |
---|
111 | /** Output binary attributes for discretized attributes. */ |
---|
112 | protected boolean m_MakeBinary = false; |
---|
113 | |
---|
114 | /** Find the number of bins using cross-validated entropy. */ |
---|
115 | protected boolean m_FindNumBins = false; |
---|
116 | |
---|
117 | /** Use equal-frequency binning if unsupervised discretization turned on */ |
---|
118 | protected boolean m_UseEqualFrequency = false; |
---|
119 | |
---|
120 | /** The default columns to discretize */ |
---|
121 | protected String m_DefaultCols; |
---|
122 | |
---|
123 | /** Constructor - initialises the filter */ |
---|
124 | public Discretize() { |
---|
125 | |
---|
126 | m_DefaultCols = "first-last"; |
---|
127 | setAttributeIndices("first-last"); |
---|
128 | } |
---|
129 | |
---|
130 | /** |
---|
131 | * Another constructor, sets the attribute indices immediately |
---|
132 | * |
---|
133 | * @param cols the attribute indices |
---|
134 | */ |
---|
135 | public Discretize(String cols) { |
---|
136 | |
---|
137 | m_DefaultCols = cols; |
---|
138 | setAttributeIndices(cols); |
---|
139 | } |
---|
140 | |
---|
141 | /** |
---|
142 | * Gets an enumeration describing the available options. |
---|
143 | * |
---|
144 | * @return an enumeration of all the available options. |
---|
145 | */ |
---|
146 | public Enumeration listOptions() { |
---|
147 | Vector result = new Vector(); |
---|
148 | Enumeration enm = super.listOptions(); |
---|
149 | while (enm.hasMoreElements()) |
---|
150 | result.add(enm.nextElement()); |
---|
151 | |
---|
152 | result.addElement(new Option( |
---|
153 | "\tSpecifies the (maximum) number of bins to divide numeric" |
---|
154 | + " attributes into.\n" |
---|
155 | + "\t(default = 10)", |
---|
156 | "B", 1, "-B <num>")); |
---|
157 | |
---|
158 | result.addElement(new Option( |
---|
159 | "\tSpecifies the desired weight of instances per bin for\n" |
---|
160 | + "\tequal-frequency binning. If this is set to a positive\n" |
---|
161 | + "\tnumber then the -B option will be ignored.\n" |
---|
162 | + "\t(default = -1)", |
---|
163 | "M", 1, "-M <num>")); |
---|
164 | |
---|
165 | result.addElement(new Option( |
---|
166 | "\tUse equal-frequency instead of equal-width discretization.", |
---|
167 | "F", 0, "-F")); |
---|
168 | |
---|
169 | result.addElement(new Option( |
---|
170 | "\tOptimize number of bins using leave-one-out estimate\n"+ |
---|
171 | "\tof estimated entropy (for equal-width discretization).\n"+ |
---|
172 | "\tIf this is set then the -B option will be ignored.", |
---|
173 | "O", 0, "-O")); |
---|
174 | |
---|
175 | result.addElement(new Option( |
---|
176 | "\tSpecifies list of columns to Discretize. First" |
---|
177 | + " and last are valid indexes.\n" |
---|
178 | + "\t(default: first-last)", |
---|
179 | "R", 1, "-R <col1,col2-col4,...>")); |
---|
180 | |
---|
181 | result.addElement(new Option( |
---|
182 | "\tInvert matching sense of column indexes.", |
---|
183 | "V", 0, "-V")); |
---|
184 | |
---|
185 | result.addElement(new Option( |
---|
186 | "\tOutput binary attributes for discretized attributes.", |
---|
187 | "D", 0, "-D")); |
---|
188 | |
---|
189 | return result.elements(); |
---|
190 | } |
---|
191 | |
---|
192 | |
---|
193 | /** |
---|
194 | * Parses a given list of options. <p/> |
---|
195 | * |
---|
196 | <!-- options-start --> |
---|
197 | * Valid options are: <p/> |
---|
198 | * |
---|
199 | * <pre> -unset-class-temporarily |
---|
200 | * Unsets the class index temporarily before the filter is |
---|
201 | * applied to the data. |
---|
202 | * (default: no)</pre> |
---|
203 | * |
---|
204 | * <pre> -B <num> |
---|
205 | * Specifies the (maximum) number of bins to divide numeric attributes into. |
---|
206 | * (default = 10)</pre> |
---|
207 | * |
---|
208 | * <pre> -M <num> |
---|
209 | * Specifies the desired weight of instances per bin for |
---|
210 | * equal-frequency binning. If this is set to a positive |
---|
211 | * number then the -B option will be ignored. |
---|
212 | * (default = -1)</pre> |
---|
213 | * |
---|
214 | * <pre> -F |
---|
215 | * Use equal-frequency instead of equal-width discretization.</pre> |
---|
216 | * |
---|
217 | * <pre> -O |
---|
218 | * Optimize number of bins using leave-one-out estimate |
---|
219 | * of estimated entropy (for equal-width discretization). |
---|
220 | * If this is set then the -B option will be ignored.</pre> |
---|
221 | * |
---|
222 | * <pre> -R <col1,col2-col4,...> |
---|
223 | * Specifies list of columns to Discretize. First and last are valid indexes. |
---|
224 | * (default: first-last)</pre> |
---|
225 | * |
---|
226 | * <pre> -V |
---|
227 | * Invert matching sense of column indexes.</pre> |
---|
228 | * |
---|
229 | * <pre> -D |
---|
230 | * Output binary attributes for discretized attributes.</pre> |
---|
231 | * |
---|
232 | <!-- options-end --> |
---|
233 | * |
---|
234 | * @param options the list of options as an array of strings |
---|
235 | * @throws Exception if an option is not supported |
---|
236 | */ |
---|
237 | public void setOptions(String[] options) throws Exception { |
---|
238 | |
---|
239 | super.setOptions(options); |
---|
240 | |
---|
241 | setMakeBinary(Utils.getFlag('D', options)); |
---|
242 | setUseEqualFrequency(Utils.getFlag('F', options)); |
---|
243 | setFindNumBins(Utils.getFlag('O', options)); |
---|
244 | setInvertSelection(Utils.getFlag('V', options)); |
---|
245 | |
---|
246 | String weight = Utils.getOption('M', options); |
---|
247 | if (weight.length() != 0) { |
---|
248 | setDesiredWeightOfInstancesPerInterval((new Double(weight)).doubleValue()); |
---|
249 | } else { |
---|
250 | setDesiredWeightOfInstancesPerInterval(-1); |
---|
251 | } |
---|
252 | |
---|
253 | String numBins = Utils.getOption('B', options); |
---|
254 | if (numBins.length() != 0) { |
---|
255 | setBins(Integer.parseInt(numBins)); |
---|
256 | } else { |
---|
257 | setBins(10); |
---|
258 | } |
---|
259 | |
---|
260 | String convertList = Utils.getOption('R', options); |
---|
261 | if (convertList.length() != 0) { |
---|
262 | setAttributeIndices(convertList); |
---|
263 | } else { |
---|
264 | setAttributeIndices(m_DefaultCols); |
---|
265 | } |
---|
266 | |
---|
267 | if (getInputFormat() != null) { |
---|
268 | setInputFormat(getInputFormat()); |
---|
269 | } |
---|
270 | } |
---|
271 | |
---|
272 | /** |
---|
273 | * Gets the current settings of the filter. |
---|
274 | * |
---|
275 | * @return an array of strings suitable for passing to setOptions |
---|
276 | */ |
---|
277 | public String [] getOptions() { |
---|
278 | Vector result; |
---|
279 | String[] options; |
---|
280 | int i; |
---|
281 | |
---|
282 | result = new Vector(); |
---|
283 | |
---|
284 | options = super.getOptions(); |
---|
285 | for (i = 0; i < options.length; i++) |
---|
286 | result.add(options[i]); |
---|
287 | |
---|
288 | if (getMakeBinary()) |
---|
289 | result.add("-D"); |
---|
290 | |
---|
291 | if (getUseEqualFrequency()) |
---|
292 | result.add("-F"); |
---|
293 | |
---|
294 | if (getFindNumBins()) |
---|
295 | result.add("-O"); |
---|
296 | |
---|
297 | if (getInvertSelection()) |
---|
298 | result.add("-V"); |
---|
299 | |
---|
300 | result.add("-B"); |
---|
301 | result.add("" + getBins()); |
---|
302 | |
---|
303 | result.add("-M"); |
---|
304 | result.add("" + getDesiredWeightOfInstancesPerInterval()); |
---|
305 | |
---|
306 | if (!getAttributeIndices().equals("")) { |
---|
307 | result.add("-R"); |
---|
308 | result.add(getAttributeIndices()); |
---|
309 | } |
---|
310 | |
---|
311 | return (String[]) result.toArray(new String[result.size()]); |
---|
312 | } |
---|
313 | |
---|
314 | /** |
---|
315 | * Returns the Capabilities of this filter. |
---|
316 | * |
---|
317 | * @return the capabilities of this object |
---|
318 | * @see Capabilities |
---|
319 | */ |
---|
320 | public Capabilities getCapabilities() { |
---|
321 | Capabilities result = super.getCapabilities(); |
---|
322 | result.disableAll(); |
---|
323 | |
---|
324 | // attributes |
---|
325 | result.enableAllAttributes(); |
---|
326 | result.enable(Capability.MISSING_VALUES); |
---|
327 | |
---|
328 | // class |
---|
329 | result.enableAllClasses(); |
---|
330 | result.enable(Capability.MISSING_CLASS_VALUES); |
---|
331 | if (!getMakeBinary()) |
---|
332 | result.enable(Capability.NO_CLASS); |
---|
333 | |
---|
334 | return result; |
---|
335 | } |
---|
336 | |
---|
337 | /** |
---|
338 | * Sets the format of the input instances. |
---|
339 | * |
---|
340 | * @param instanceInfo an Instances object containing the input instance |
---|
341 | * structure (any instances contained in the object are ignored - only the |
---|
342 | * structure is required). |
---|
343 | * @return true if the outputFormat may be collected immediately |
---|
344 | * @throws Exception if the input format can't be set successfully |
---|
345 | */ |
---|
346 | public boolean setInputFormat(Instances instanceInfo) throws Exception { |
---|
347 | |
---|
348 | if (m_MakeBinary && m_IgnoreClass) { |
---|
349 | throw new IllegalArgumentException("Can't ignore class when " + |
---|
350 | "changing the number of attributes!"); |
---|
351 | } |
---|
352 | |
---|
353 | super.setInputFormat(instanceInfo); |
---|
354 | |
---|
355 | m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1); |
---|
356 | m_CutPoints = null; |
---|
357 | |
---|
358 | if (getFindNumBins() && getUseEqualFrequency()) { |
---|
359 | throw new IllegalArgumentException("Bin number optimization in conjunction "+ |
---|
360 | "with equal-frequency binning not implemented."); |
---|
361 | } |
---|
362 | |
---|
363 | // If we implement loading cutfiles, then load |
---|
364 | //them here and set the output format |
---|
365 | return false; |
---|
366 | } |
---|
367 | |
---|
368 | /** |
---|
369 | * Input an instance for filtering. Ordinarily the instance is processed |
---|
370 | * and made available for output immediately. Some filters require all |
---|
371 | * instances be read before producing output. |
---|
372 | * |
---|
373 | * @param instance the input instance |
---|
374 | * @return true if the filtered instance may now be |
---|
375 | * collected with output(). |
---|
376 | * @throws IllegalStateException if no input format has been defined. |
---|
377 | */ |
---|
378 | public boolean input(Instance instance) { |
---|
379 | |
---|
380 | if (getInputFormat() == null) { |
---|
381 | throw new IllegalStateException("No input instance format defined"); |
---|
382 | } |
---|
383 | if (m_NewBatch) { |
---|
384 | resetQueue(); |
---|
385 | m_NewBatch = false; |
---|
386 | } |
---|
387 | |
---|
388 | if (m_CutPoints != null) { |
---|
389 | convertInstance(instance); |
---|
390 | return true; |
---|
391 | } |
---|
392 | |
---|
393 | bufferInput(instance); |
---|
394 | return false; |
---|
395 | } |
---|
396 | |
---|
397 | /** |
---|
398 | * Signifies that this batch of input to the filter is finished. If the |
---|
399 | * filter requires all instances prior to filtering, output() may now |
---|
400 | * be called to retrieve the filtered instances. |
---|
401 | * |
---|
402 | * @return true if there are instances pending output |
---|
403 | * @throws IllegalStateException if no input structure has been defined |
---|
404 | */ |
---|
405 | public boolean batchFinished() { |
---|
406 | |
---|
407 | if (getInputFormat() == null) { |
---|
408 | throw new IllegalStateException("No input instance format defined"); |
---|
409 | } |
---|
410 | if (m_CutPoints == null) { |
---|
411 | calculateCutPoints(); |
---|
412 | |
---|
413 | setOutputFormat(); |
---|
414 | |
---|
415 | // If we implement saving cutfiles, save the cuts here |
---|
416 | |
---|
417 | // Convert pending input instances |
---|
418 | for(int i = 0; i < getInputFormat().numInstances(); i++) { |
---|
419 | convertInstance(getInputFormat().instance(i)); |
---|
420 | } |
---|
421 | } |
---|
422 | flushInput(); |
---|
423 | |
---|
424 | m_NewBatch = true; |
---|
425 | return (numPendingOutput() != 0); |
---|
426 | } |
---|
427 | |
---|
428 | /** |
---|
429 | * Returns a string describing this filter |
---|
430 | * |
---|
431 | * @return a description of the filter suitable for |
---|
432 | * displaying in the explorer/experimenter gui |
---|
433 | */ |
---|
434 | public String globalInfo() { |
---|
435 | |
---|
436 | return "An instance filter that discretizes a range of numeric" |
---|
437 | + " attributes in the dataset into nominal attributes." |
---|
438 | + " Discretization is by simple binning. Skips the class" |
---|
439 | + " attribute if set."; |
---|
440 | } |
---|
441 | |
---|
442 | /** |
---|
443 | * Returns the tip text for this property |
---|
444 | * |
---|
445 | * @return tip text for this property suitable for |
---|
446 | * displaying in the explorer/experimenter gui |
---|
447 | */ |
---|
448 | public String findNumBinsTipText() { |
---|
449 | |
---|
450 | return "Optimize number of equal-width bins using leave-one-out. Doesn't " + |
---|
451 | "work for equal-frequency binning"; |
---|
452 | } |
---|
453 | |
---|
454 | /** |
---|
455 | * Get the value of FindNumBins. |
---|
456 | * |
---|
457 | * @return Value of FindNumBins. |
---|
458 | */ |
---|
459 | public boolean getFindNumBins() { |
---|
460 | |
---|
461 | return m_FindNumBins; |
---|
462 | } |
---|
463 | |
---|
464 | /** |
---|
465 | * Set the value of FindNumBins. |
---|
466 | * |
---|
467 | * @param newFindNumBins Value to assign to FindNumBins. |
---|
468 | */ |
---|
469 | public void setFindNumBins(boolean newFindNumBins) { |
---|
470 | |
---|
471 | m_FindNumBins = newFindNumBins; |
---|
472 | } |
---|
473 | |
---|
474 | /** |
---|
475 | * Returns the tip text for this property |
---|
476 | * |
---|
477 | * @return tip text for this property suitable for |
---|
478 | * displaying in the explorer/experimenter gui |
---|
479 | */ |
---|
480 | public String makeBinaryTipText() { |
---|
481 | |
---|
482 | return "Make resulting attributes binary."; |
---|
483 | } |
---|
484 | |
---|
485 | /** |
---|
486 | * Gets whether binary attributes should be made for discretized ones. |
---|
487 | * |
---|
488 | * @return true if attributes will be binarized |
---|
489 | */ |
---|
490 | public boolean getMakeBinary() { |
---|
491 | |
---|
492 | return m_MakeBinary; |
---|
493 | } |
---|
494 | |
---|
495 | /** |
---|
496 | * Sets whether binary attributes should be made for discretized ones. |
---|
497 | * |
---|
498 | * @param makeBinary if binary attributes are to be made |
---|
499 | */ |
---|
500 | public void setMakeBinary(boolean makeBinary) { |
---|
501 | |
---|
502 | m_MakeBinary = makeBinary; |
---|
503 | } |
---|
504 | |
---|
505 | /** |
---|
506 | * Returns the tip text for this property |
---|
507 | * |
---|
508 | * @return tip text for this property suitable for |
---|
509 | * displaying in the explorer/experimenter gui |
---|
510 | */ |
---|
511 | public String desiredWeightOfInstancesPerIntervalTipText() { |
---|
512 | |
---|
513 | return "Sets the desired weight of instances per interval for " + |
---|
514 | "equal-frequency binning."; |
---|
515 | } |
---|
516 | |
---|
517 | /** |
---|
518 | * Get the DesiredWeightOfInstancesPerInterval value. |
---|
519 | * @return the DesiredWeightOfInstancesPerInterval value. |
---|
520 | */ |
---|
521 | public double getDesiredWeightOfInstancesPerInterval() { |
---|
522 | |
---|
523 | return m_DesiredWeightOfInstancesPerInterval; |
---|
524 | } |
---|
525 | |
---|
526 | /** |
---|
527 | * Set the DesiredWeightOfInstancesPerInterval value. |
---|
528 | * @param newDesiredNumber The new DesiredNumber value. |
---|
529 | */ |
---|
530 | public void setDesiredWeightOfInstancesPerInterval(double newDesiredNumber) { |
---|
531 | |
---|
532 | m_DesiredWeightOfInstancesPerInterval = newDesiredNumber; |
---|
533 | } |
---|
534 | |
---|
535 | /** |
---|
536 | * Returns the tip text for this property |
---|
537 | * |
---|
538 | * @return tip text for this property suitable for |
---|
539 | * displaying in the explorer/experimenter gui |
---|
540 | */ |
---|
541 | public String useEqualFrequencyTipText() { |
---|
542 | |
---|
543 | return "If set to true, equal-frequency binning will be used instead of" + |
---|
544 | " equal-width binning."; |
---|
545 | } |
---|
546 | |
---|
547 | /** |
---|
548 | * Get the value of UseEqualFrequency. |
---|
549 | * |
---|
550 | * @return Value of UseEqualFrequency. |
---|
551 | */ |
---|
552 | public boolean getUseEqualFrequency() { |
---|
553 | |
---|
554 | return m_UseEqualFrequency; |
---|
555 | } |
---|
556 | |
---|
557 | /** |
---|
558 | * Set the value of UseEqualFrequency. |
---|
559 | * |
---|
560 | * @param newUseEqualFrequency Value to assign to UseEqualFrequency. |
---|
561 | */ |
---|
562 | public void setUseEqualFrequency(boolean newUseEqualFrequency) { |
---|
563 | |
---|
564 | m_UseEqualFrequency = newUseEqualFrequency; |
---|
565 | } |
---|
566 | |
---|
567 | /** |
---|
568 | * Returns the tip text for this property |
---|
569 | * |
---|
570 | * @return tip text for this property suitable for |
---|
571 | * displaying in the explorer/experimenter gui |
---|
572 | */ |
---|
573 | public String binsTipText() { |
---|
574 | |
---|
575 | return "Number of bins."; |
---|
576 | } |
---|
577 | |
---|
578 | /** |
---|
579 | * Gets the number of bins numeric attributes will be divided into |
---|
580 | * |
---|
581 | * @return the number of bins. |
---|
582 | */ |
---|
583 | public int getBins() { |
---|
584 | |
---|
585 | return m_NumBins; |
---|
586 | } |
---|
587 | |
---|
588 | /** |
---|
589 | * Sets the number of bins to divide each selected numeric attribute into |
---|
590 | * |
---|
591 | * @param numBins the number of bins |
---|
592 | */ |
---|
593 | public void setBins(int numBins) { |
---|
594 | |
---|
595 | m_NumBins = numBins; |
---|
596 | } |
---|
597 | |
---|
598 | /** |
---|
599 | * Returns the tip text for this property |
---|
600 | * |
---|
601 | * @return tip text for this property suitable for |
---|
602 | * displaying in the explorer/experimenter gui |
---|
603 | */ |
---|
604 | public String invertSelectionTipText() { |
---|
605 | |
---|
606 | return "Set attribute selection mode. If false, only selected" |
---|
607 | + " (numeric) attributes in the range will be discretized; if" |
---|
608 | + " true, only non-selected attributes will be discretized."; |
---|
609 | } |
---|
610 | |
---|
611 | /** |
---|
612 | * Gets whether the supplied columns are to be removed or kept |
---|
613 | * |
---|
614 | * @return true if the supplied columns will be kept |
---|
615 | */ |
---|
616 | public boolean getInvertSelection() { |
---|
617 | |
---|
618 | return m_DiscretizeCols.getInvert(); |
---|
619 | } |
---|
620 | |
---|
621 | /** |
---|
622 | * Sets whether selected columns should be removed or kept. If true the |
---|
623 | * selected columns are kept and unselected columns are deleted. If false |
---|
624 | * selected columns are deleted and unselected columns are kept. |
---|
625 | * |
---|
626 | * @param invert the new invert setting |
---|
627 | */ |
---|
628 | public void setInvertSelection(boolean invert) { |
---|
629 | |
---|
630 | m_DiscretizeCols.setInvert(invert); |
---|
631 | } |
---|
632 | |
---|
633 | /** |
---|
634 | * Returns the tip text for this property |
---|
635 | * |
---|
636 | * @return tip text for this property suitable for |
---|
637 | * displaying in the explorer/experimenter gui |
---|
638 | */ |
---|
639 | public String attributeIndicesTipText() { |
---|
640 | return "Specify range of attributes to act on." |
---|
641 | + " This is a comma separated list of attribute indices, with" |
---|
642 | + " \"first\" and \"last\" valid values. Specify an inclusive" |
---|
643 | + " range with \"-\". E.g: \"first-3,5,6-10,last\"."; |
---|
644 | } |
---|
645 | |
---|
646 | /** |
---|
647 | * Gets the current range selection |
---|
648 | * |
---|
649 | * @return a string containing a comma separated list of ranges |
---|
650 | */ |
---|
651 | public String getAttributeIndices() { |
---|
652 | |
---|
653 | return m_DiscretizeCols.getRanges(); |
---|
654 | } |
---|
655 | |
---|
656 | /** |
---|
657 | * Sets which attributes are to be Discretized (only numeric |
---|
658 | * attributes among the selection will be Discretized). |
---|
659 | * |
---|
660 | * @param rangeList a string representing the list of attributes. Since |
---|
661 | * the string will typically come from a user, attributes are indexed from |
---|
662 | * 1. <br> |
---|
663 | * eg: first-3,5,6-last |
---|
664 | * @throws IllegalArgumentException if an invalid range list is supplied |
---|
665 | */ |
---|
666 | public void setAttributeIndices(String rangeList) { |
---|
667 | |
---|
668 | m_DiscretizeCols.setRanges(rangeList); |
---|
669 | } |
---|
670 | |
---|
671 | /** |
---|
672 | * Sets which attributes are to be Discretized (only numeric |
---|
673 | * attributes among the selection will be Discretized). |
---|
674 | * |
---|
675 | * @param attributes an array containing indexes of attributes to Discretize. |
---|
676 | * Since the array will typically come from a program, attributes are indexed |
---|
677 | * from 0. |
---|
678 | * @throws IllegalArgumentException if an invalid set of ranges |
---|
679 | * is supplied |
---|
680 | */ |
---|
681 | public void setAttributeIndicesArray(int [] attributes) { |
---|
682 | |
---|
683 | setAttributeIndices(Range.indicesToRangeList(attributes)); |
---|
684 | } |
---|
685 | |
---|
686 | /** |
---|
687 | * Gets the cut points for an attribute |
---|
688 | * |
---|
689 | * @param attributeIndex the index (from 0) of the attribute to get the cut points of |
---|
690 | * @return an array containing the cutpoints (or null if the |
---|
691 | * attribute requested has been discretized into only one interval.) |
---|
692 | */ |
---|
693 | public double [] getCutPoints(int attributeIndex) { |
---|
694 | |
---|
695 | if (m_CutPoints == null) { |
---|
696 | return null; |
---|
697 | } |
---|
698 | return m_CutPoints[attributeIndex]; |
---|
699 | } |
---|
700 | |
---|
701 | /** Generate the cutpoints for each attribute */ |
---|
702 | protected void calculateCutPoints() { |
---|
703 | |
---|
704 | m_CutPoints = new double [getInputFormat().numAttributes()] []; |
---|
705 | for(int i = getInputFormat().numAttributes() - 1; i >= 0; i--) { |
---|
706 | if ((m_DiscretizeCols.isInRange(i)) && |
---|
707 | (getInputFormat().attribute(i).isNumeric()) && |
---|
708 | (getInputFormat().classIndex() != i)) { |
---|
709 | if (m_FindNumBins) { |
---|
710 | findNumBins(i); |
---|
711 | } else if (!m_UseEqualFrequency) { |
---|
712 | calculateCutPointsByEqualWidthBinning(i); |
---|
713 | } else { |
---|
714 | calculateCutPointsByEqualFrequencyBinning(i); |
---|
715 | } |
---|
716 | } |
---|
717 | } |
---|
718 | } |
---|
719 | |
---|
720 | /** |
---|
721 | * Set cutpoints for a single attribute. |
---|
722 | * |
---|
723 | * @param index the index of the attribute to set cutpoints for |
---|
724 | */ |
---|
725 | protected void calculateCutPointsByEqualWidthBinning(int index) { |
---|
726 | |
---|
727 | // Scan for max and min values |
---|
728 | double max = 0, min = 1, currentVal; |
---|
729 | Instance currentInstance; |
---|
730 | for(int i = 0; i < getInputFormat().numInstances(); i++) { |
---|
731 | currentInstance = getInputFormat().instance(i); |
---|
732 | if (!currentInstance.isMissing(index)) { |
---|
733 | currentVal = currentInstance.value(index); |
---|
734 | if (max < min) { |
---|
735 | max = min = currentVal; |
---|
736 | } |
---|
737 | if (currentVal > max) { |
---|
738 | max = currentVal; |
---|
739 | } |
---|
740 | if (currentVal < min) { |
---|
741 | min = currentVal; |
---|
742 | } |
---|
743 | } |
---|
744 | } |
---|
745 | double binWidth = (max - min) / m_NumBins; |
---|
746 | double [] cutPoints = null; |
---|
747 | if ((m_NumBins > 1) && (binWidth > 0)) { |
---|
748 | cutPoints = new double [m_NumBins - 1]; |
---|
749 | for(int i = 1; i < m_NumBins; i++) { |
---|
750 | cutPoints[i - 1] = min + binWidth * i; |
---|
751 | } |
---|
752 | } |
---|
753 | m_CutPoints[index] = cutPoints; |
---|
754 | } |
---|
755 | |
---|
756 | /** |
---|
757 | * Set cutpoints for a single attribute. |
---|
758 | * |
---|
759 | * @param index the index of the attribute to set cutpoints for |
---|
760 | */ |
---|
761 | protected void calculateCutPointsByEqualFrequencyBinning(int index) { |
---|
762 | |
---|
763 | // Copy data so that it can be sorted |
---|
764 | Instances data = new Instances(getInputFormat()); |
---|
765 | |
---|
766 | // Sort input data |
---|
767 | data.sort(index); |
---|
768 | |
---|
769 | // Compute weight of instances without missing values |
---|
770 | double sumOfWeights = 0; |
---|
771 | for (int i = 0; i < data.numInstances(); i++) { |
---|
772 | if (data.instance(i).isMissing(index)) { |
---|
773 | break; |
---|
774 | } else { |
---|
775 | sumOfWeights += data.instance(i).weight(); |
---|
776 | } |
---|
777 | } |
---|
778 | double freq; |
---|
779 | double[] cutPoints = new double[m_NumBins - 1]; |
---|
780 | if (getDesiredWeightOfInstancesPerInterval() > 0) { |
---|
781 | freq = getDesiredWeightOfInstancesPerInterval(); |
---|
782 | cutPoints = new double[(int)(sumOfWeights / freq)]; |
---|
783 | } else { |
---|
784 | freq = sumOfWeights / m_NumBins; |
---|
785 | cutPoints = new double[m_NumBins - 1]; |
---|
786 | } |
---|
787 | |
---|
788 | // Compute break points |
---|
789 | double counter = 0, last = 0; |
---|
790 | int cpindex = 0, lastIndex = -1; |
---|
791 | for (int i = 0; i < data.numInstances() - 1; i++) { |
---|
792 | |
---|
793 | // Stop if value missing |
---|
794 | if (data.instance(i).isMissing(index)) { |
---|
795 | break; |
---|
796 | } |
---|
797 | counter += data.instance(i).weight(); |
---|
798 | sumOfWeights -= data.instance(i).weight(); |
---|
799 | |
---|
800 | // Do we have a potential breakpoint? |
---|
801 | if (data.instance(i).value(index) < |
---|
802 | data.instance(i + 1).value(index)) { |
---|
803 | |
---|
804 | // Have we passed the ideal size? |
---|
805 | if (counter >= freq) { |
---|
806 | |
---|
807 | // Is this break point worse than the last one? |
---|
808 | if (((freq - last) < (counter - freq)) && (lastIndex != -1)) { |
---|
809 | cutPoints[cpindex] = (data.instance(lastIndex).value(index) + |
---|
810 | data.instance(lastIndex + 1).value(index)) / 2; |
---|
811 | counter -= last; |
---|
812 | last = counter; |
---|
813 | lastIndex = i; |
---|
814 | } else { |
---|
815 | cutPoints[cpindex] = (data.instance(i).value(index) + |
---|
816 | data.instance(i + 1).value(index)) / 2; |
---|
817 | counter = 0; |
---|
818 | last = 0; |
---|
819 | lastIndex = -1; |
---|
820 | } |
---|
821 | cpindex++; |
---|
822 | freq = (sumOfWeights + counter) / ((cutPoints.length + 1) - cpindex); |
---|
823 | } else { |
---|
824 | lastIndex = i; |
---|
825 | last = counter; |
---|
826 | } |
---|
827 | } |
---|
828 | } |
---|
829 | |
---|
830 | // Check whether there was another possibility for a cut point |
---|
831 | if ((cpindex < cutPoints.length) && (lastIndex != -1)) { |
---|
832 | cutPoints[cpindex] = (data.instance(lastIndex).value(index) + |
---|
833 | data.instance(lastIndex + 1).value(index)) / 2; |
---|
834 | cpindex++; |
---|
835 | } |
---|
836 | |
---|
837 | // Did we find any cutpoints? |
---|
838 | if (cpindex == 0) { |
---|
839 | m_CutPoints[index] = null; |
---|
840 | } else { |
---|
841 | double[] cp = new double[cpindex]; |
---|
842 | for (int i = 0; i < cpindex; i++) { |
---|
843 | cp[i] = cutPoints[i]; |
---|
844 | } |
---|
845 | m_CutPoints[index] = cp; |
---|
846 | } |
---|
847 | } |
---|
848 | |
---|
849 | /** |
---|
850 | * Optimizes the number of bins using leave-one-out cross-validation. |
---|
851 | * |
---|
852 | * @param index the attribute index |
---|
853 | */ |
---|
854 | protected void findNumBins(int index) { |
---|
855 | |
---|
856 | double min = Double.MAX_VALUE, max = -Double.MAX_VALUE, binWidth = 0, |
---|
857 | entropy, bestEntropy = Double.MAX_VALUE, currentVal; |
---|
858 | double[] distribution; |
---|
859 | int bestNumBins = 1; |
---|
860 | Instance currentInstance; |
---|
861 | |
---|
862 | // Find minimum and maximum |
---|
863 | for (int i = 0; i < getInputFormat().numInstances(); i++) { |
---|
864 | currentInstance = getInputFormat().instance(i); |
---|
865 | if (!currentInstance.isMissing(index)) { |
---|
866 | currentVal = currentInstance.value(index); |
---|
867 | if (currentVal > max) { |
---|
868 | max = currentVal; |
---|
869 | } |
---|
870 | if (currentVal < min) { |
---|
871 | min = currentVal; |
---|
872 | } |
---|
873 | } |
---|
874 | } |
---|
875 | |
---|
876 | // Find best number of bins |
---|
877 | for (int i = 0; i < m_NumBins; i++) { |
---|
878 | distribution = new double[i + 1]; |
---|
879 | binWidth = (max - min) / (i + 1); |
---|
880 | |
---|
881 | // Compute distribution |
---|
882 | for (int j = 0; j < getInputFormat().numInstances(); j++) { |
---|
883 | currentInstance = getInputFormat().instance(j); |
---|
884 | if (!currentInstance.isMissing(index)) { |
---|
885 | for (int k = 0; k < i + 1; k++) { |
---|
886 | if (currentInstance.value(index) <= |
---|
887 | (min + (((double)k + 1) * binWidth))) { |
---|
888 | distribution[k] += currentInstance.weight(); |
---|
889 | break; |
---|
890 | } |
---|
891 | } |
---|
892 | } |
---|
893 | } |
---|
894 | |
---|
895 | // Compute cross-validated entropy |
---|
896 | entropy = 0; |
---|
897 | for (int k = 0; k < i + 1; k++) { |
---|
898 | if (distribution[k] < 2) { |
---|
899 | entropy = Double.MAX_VALUE; |
---|
900 | break; |
---|
901 | } |
---|
902 | entropy -= distribution[k] * Math.log((distribution[k] - 1) / |
---|
903 | binWidth); |
---|
904 | } |
---|
905 | |
---|
906 | // Best entropy so far? |
---|
907 | if (entropy < bestEntropy) { |
---|
908 | bestEntropy = entropy; |
---|
909 | bestNumBins = i + 1; |
---|
910 | } |
---|
911 | } |
---|
912 | |
---|
913 | // Compute cut points |
---|
914 | double [] cutPoints = null; |
---|
915 | if ((bestNumBins > 1) && (binWidth > 0)) { |
---|
916 | cutPoints = new double [bestNumBins - 1]; |
---|
917 | for(int i = 1; i < bestNumBins; i++) { |
---|
918 | cutPoints[i - 1] = min + binWidth * i; |
---|
919 | } |
---|
920 | } |
---|
921 | m_CutPoints[index] = cutPoints; |
---|
922 | } |
---|
923 | |
---|
924 | /** |
---|
925 | * Set the output format. Takes the currently defined cutpoints and |
---|
926 | * m_InputFormat and calls setOutputFormat(Instances) appropriately. |
---|
927 | */ |
---|
928 | protected void setOutputFormat() { |
---|
929 | |
---|
930 | if (m_CutPoints == null) { |
---|
931 | setOutputFormat(null); |
---|
932 | return; |
---|
933 | } |
---|
934 | FastVector attributes = new FastVector(getInputFormat().numAttributes()); |
---|
935 | int classIndex = getInputFormat().classIndex(); |
---|
936 | for(int i = 0; i < getInputFormat().numAttributes(); i++) { |
---|
937 | if ((m_DiscretizeCols.isInRange(i)) |
---|
938 | && (getInputFormat().attribute(i).isNumeric()) |
---|
939 | && (getInputFormat().classIndex() != i)) { |
---|
940 | if (!m_MakeBinary) { |
---|
941 | FastVector attribValues = new FastVector(1); |
---|
942 | if (m_CutPoints[i] == null) { |
---|
943 | attribValues.addElement("'All'"); |
---|
944 | } else { |
---|
945 | for(int j = 0; j <= m_CutPoints[i].length; j++) { |
---|
946 | if (j == 0) { |
---|
947 | attribValues.addElement("'(-inf-" |
---|
948 | + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'"); |
---|
949 | } else if (j == m_CutPoints[i].length) { |
---|
950 | attribValues.addElement("'(" |
---|
951 | + Utils.doubleToString(m_CutPoints[i][j - 1], 6) |
---|
952 | + "-inf)'"); |
---|
953 | } else { |
---|
954 | attribValues.addElement("'(" |
---|
955 | + Utils.doubleToString(m_CutPoints[i][j - 1], 6) + "-" |
---|
956 | + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'"); |
---|
957 | } |
---|
958 | } |
---|
959 | } |
---|
960 | attributes.addElement(new Attribute(getInputFormat(). |
---|
961 | attribute(i).name(), |
---|
962 | attribValues)); |
---|
963 | } else { |
---|
964 | if (m_CutPoints[i] == null) { |
---|
965 | FastVector attribValues = new FastVector(1); |
---|
966 | attribValues.addElement("'All'"); |
---|
967 | attributes.addElement(new Attribute(getInputFormat(). |
---|
968 | attribute(i).name(), |
---|
969 | attribValues)); |
---|
970 | } else { |
---|
971 | if (i < getInputFormat().classIndex()) { |
---|
972 | classIndex += m_CutPoints[i].length - 1; |
---|
973 | } |
---|
974 | for(int j = 0; j < m_CutPoints[i].length; j++) { |
---|
975 | FastVector attribValues = new FastVector(2); |
---|
976 | attribValues.addElement("'(-inf-" |
---|
977 | + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'"); |
---|
978 | attribValues.addElement("'(" |
---|
979 | + Utils.doubleToString(m_CutPoints[i][j], 6) + "-inf)'"); |
---|
980 | attributes.addElement(new Attribute(getInputFormat(). |
---|
981 | attribute(i).name(), |
---|
982 | attribValues)); |
---|
983 | } |
---|
984 | } |
---|
985 | } |
---|
986 | } else { |
---|
987 | attributes.addElement(getInputFormat().attribute(i).copy()); |
---|
988 | } |
---|
989 | } |
---|
990 | Instances outputFormat = |
---|
991 | new Instances(getInputFormat().relationName(), attributes, 0); |
---|
992 | outputFormat.setClassIndex(classIndex); |
---|
993 | setOutputFormat(outputFormat); |
---|
994 | } |
---|
995 | |
---|
996 | /** |
---|
997 | * Convert a single instance over. The converted instance is added to |
---|
998 | * the end of the output queue. |
---|
999 | * |
---|
1000 | * @param instance the instance to convert |
---|
1001 | */ |
---|
1002 | protected void convertInstance(Instance instance) { |
---|
1003 | |
---|
1004 | int index = 0; |
---|
1005 | double [] vals = new double [outputFormatPeek().numAttributes()]; |
---|
1006 | // Copy and convert the values |
---|
1007 | for(int i = 0; i < getInputFormat().numAttributes(); i++) { |
---|
1008 | if (m_DiscretizeCols.isInRange(i) && |
---|
1009 | getInputFormat().attribute(i).isNumeric() && |
---|
1010 | (getInputFormat().classIndex() != i)) { |
---|
1011 | int j; |
---|
1012 | double currentVal = instance.value(i); |
---|
1013 | if (m_CutPoints[i] == null) { |
---|
1014 | if (instance.isMissing(i)) { |
---|
1015 | vals[index] = Utils.missingValue(); |
---|
1016 | } else { |
---|
1017 | vals[index] = 0; |
---|
1018 | } |
---|
1019 | index++; |
---|
1020 | } else { |
---|
1021 | if (!m_MakeBinary) { |
---|
1022 | if (instance.isMissing(i)) { |
---|
1023 | vals[index] = Utils.missingValue(); |
---|
1024 | } else { |
---|
1025 | for (j = 0; j < m_CutPoints[i].length; j++) { |
---|
1026 | if (currentVal <= m_CutPoints[i][j]) { |
---|
1027 | break; |
---|
1028 | } |
---|
1029 | } |
---|
1030 | vals[index] = j; |
---|
1031 | } |
---|
1032 | index++; |
---|
1033 | } else { |
---|
1034 | for (j = 0; j < m_CutPoints[i].length; j++) { |
---|
1035 | if (instance.isMissing(i)) { |
---|
1036 | vals[index] = Utils.missingValue(); |
---|
1037 | } else if (currentVal <= m_CutPoints[i][j]) { |
---|
1038 | vals[index] = 0; |
---|
1039 | } else { |
---|
1040 | vals[index] = 1; |
---|
1041 | } |
---|
1042 | index++; |
---|
1043 | } |
---|
1044 | } |
---|
1045 | } |
---|
1046 | } else { |
---|
1047 | vals[index] = instance.value(i); |
---|
1048 | index++; |
---|
1049 | } |
---|
1050 | } |
---|
1051 | |
---|
1052 | Instance inst = null; |
---|
1053 | if (instance instanceof SparseInstance) { |
---|
1054 | inst = new SparseInstance(instance.weight(), vals); |
---|
1055 | } else { |
---|
1056 | inst = new DenseInstance(instance.weight(), vals); |
---|
1057 | } |
---|
1058 | inst.setDataset(getOutputFormat()); |
---|
1059 | copyValues(inst, false, instance.dataset(), getOutputFormat()); |
---|
1060 | inst.setDataset(getOutputFormat()); |
---|
1061 | push(inst); |
---|
1062 | } |
---|
1063 | |
---|
1064 | /** |
---|
1065 | * Returns the revision string. |
---|
1066 | * |
---|
1067 | * @return the revision |
---|
1068 | */ |
---|
1069 | public String getRevision() { |
---|
1070 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
1071 | } |
---|
1072 | |
---|
1073 | /** |
---|
1074 | * Main method for testing this class. |
---|
1075 | * |
---|
1076 | * @param argv should contain arguments to the filter: use -h for help |
---|
1077 | */ |
---|
1078 | public static void main(String [] argv) { |
---|
1079 | runFilter(new Discretize(), argv); |
---|
1080 | } |
---|
1081 | } |
---|