source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/Discretize.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 29.9 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Discretize.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23
24package weka.filters.unsupervised.attribute;
25
26import weka.core.Attribute;
27import weka.core.Capabilities;
28import weka.core.FastVector;
29import weka.core.Instance; 
30import weka.core.DenseInstance;
31import weka.core.Instances;
32import weka.core.Option;
33import weka.core.Range;
34import weka.core.RevisionUtils;
35import weka.core.SparseInstance;
36import weka.core.Utils;
37import weka.core.WeightedInstancesHandler;
38import weka.core.Capabilities.Capability;
39import weka.filters.UnsupervisedFilter;
40
41import java.util.Enumeration;
42import java.util.Vector;
43
44/**
45 <!-- globalinfo-start -->
46 * An instance filter that discretizes a range of numeric attributes in the dataset into nominal attributes. Discretization is by simple binning. Skips the class attribute if set.
47 * <p/>
48 <!-- globalinfo-end -->
49 *
50 <!-- options-start -->
51 * Valid options are: <p/>
52 *
53 * <pre> -unset-class-temporarily
54 *  Unsets the class index temporarily before the filter is
55 *  applied to the data.
56 *  (default: no)</pre>
57 *
58 * <pre> -B &lt;num&gt;
59 *  Specifies the (maximum) number of bins to divide numeric attributes into.
60 *  (default = 10)</pre>
61 *
62 * <pre> -M &lt;num&gt;
63 *  Specifies the desired weight of instances per bin for
64 *  equal-frequency binning. If this is set to a positive
65 *  number then the -B option will be ignored.
66 *  (default = -1)</pre>
67 *
68 * <pre> -F
69 *  Use equal-frequency instead of equal-width discretization.</pre>
70 *
71 * <pre> -O
72 *  Optimize number of bins using leave-one-out estimate
73 *  of estimated entropy (for equal-width discretization).
74 *  If this is set then the -B option will be ignored.</pre>
75 *
76 * <pre> -R &lt;col1,col2-col4,...&gt;
77 *  Specifies list of columns to Discretize. First and last are valid indexes.
78 *  (default: first-last)</pre>
79 *
80 * <pre> -V
81 *  Invert matching sense of column indexes.</pre>
82 *
83 * <pre> -D
84 *  Output binary attributes for discretized attributes.</pre>
85 *
86 <!-- options-end -->
87 *
88 * @author Len Trigg (trigg@cs.waikato.ac.nz)
89 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
90 * @version $Revision: 5987 $
91 */
92public class Discretize 
93  extends PotentialClassIgnorer
94  implements UnsupervisedFilter, WeightedInstancesHandler {
95 
96  /** for serialization */
97  static final long serialVersionUID = -1358531742174527279L;
98
99  /** Stores which columns to Discretize */
100  protected Range m_DiscretizeCols = new Range();
101
102  /** The number of bins to divide the attribute into */
103  protected int m_NumBins = 10;
104
105  /** The desired weight of instances per bin */
106  protected double m_DesiredWeightOfInstancesPerInterval = -1;
107
108  /** Store the current cutpoints */
109  protected double [][] m_CutPoints = null;
110
111  /** Output binary attributes for discretized attributes. */
112  protected boolean m_MakeBinary = false;
113
114  /** Find the number of bins using cross-validated entropy. */
115  protected boolean m_FindNumBins = false;
116
117  /** Use equal-frequency binning if unsupervised discretization turned on */
118  protected boolean m_UseEqualFrequency = false;
119
120  /** The default columns to discretize */
121  protected String m_DefaultCols;
122
123  /** Constructor - initialises the filter */
124  public Discretize() {
125
126    m_DefaultCols = "first-last";
127    setAttributeIndices("first-last");
128  }
129
130  /**
131   * Another constructor, sets the attribute indices immediately
132   *
133   * @param cols the attribute indices
134   */
135  public Discretize(String cols) {
136
137    m_DefaultCols = cols;
138    setAttributeIndices(cols);
139  }
140
141  /**
142   * Gets an enumeration describing the available options.
143   *
144   * @return an enumeration of all the available options.
145   */
146  public Enumeration listOptions() {
147    Vector result = new Vector();
148    Enumeration enm = super.listOptions();
149    while (enm.hasMoreElements())
150      result.add(enm.nextElement());
151     
152    result.addElement(new Option(
153        "\tSpecifies the (maximum) number of bins to divide numeric"
154        + " attributes into.\n"
155        + "\t(default = 10)",
156        "B", 1, "-B <num>"));
157   
158    result.addElement(new Option(
159        "\tSpecifies the desired weight of instances per bin for\n"
160        + "\tequal-frequency binning. If this is set to a positive\n"
161        + "\tnumber then the -B option will be ignored.\n"
162        + "\t(default = -1)",
163        "M", 1, "-M <num>"));
164   
165    result.addElement(new Option(
166        "\tUse equal-frequency instead of equal-width discretization.",
167        "F", 0, "-F"));
168   
169    result.addElement(new Option(
170        "\tOptimize number of bins using leave-one-out estimate\n"+
171        "\tof estimated entropy (for equal-width discretization).\n"+
172        "\tIf this is set then the -B option will be ignored.",
173        "O", 0, "-O"));
174   
175    result.addElement(new Option(
176        "\tSpecifies list of columns to Discretize. First"
177        + " and last are valid indexes.\n"
178        + "\t(default: first-last)",
179        "R", 1, "-R <col1,col2-col4,...>"));
180   
181    result.addElement(new Option(
182        "\tInvert matching sense of column indexes.",
183        "V", 0, "-V"));
184   
185    result.addElement(new Option(
186        "\tOutput binary attributes for discretized attributes.",
187        "D", 0, "-D"));
188
189    return result.elements();
190  }
191
192
193  /**
194   * Parses a given list of options. <p/>
195   *
196   <!-- options-start -->
197   * Valid options are: <p/>
198   *
199   * <pre> -unset-class-temporarily
200   *  Unsets the class index temporarily before the filter is
201   *  applied to the data.
202   *  (default: no)</pre>
203   *
204   * <pre> -B &lt;num&gt;
205   *  Specifies the (maximum) number of bins to divide numeric attributes into.
206   *  (default = 10)</pre>
207   *
208   * <pre> -M &lt;num&gt;
209   *  Specifies the desired weight of instances per bin for
210   *  equal-frequency binning. If this is set to a positive
211   *  number then the -B option will be ignored.
212   *  (default = -1)</pre>
213   *
214   * <pre> -F
215   *  Use equal-frequency instead of equal-width discretization.</pre>
216   *
217   * <pre> -O
218   *  Optimize number of bins using leave-one-out estimate
219   *  of estimated entropy (for equal-width discretization).
220   *  If this is set then the -B option will be ignored.</pre>
221   *
222   * <pre> -R &lt;col1,col2-col4,...&gt;
223   *  Specifies list of columns to Discretize. First and last are valid indexes.
224   *  (default: first-last)</pre>
225   *
226   * <pre> -V
227   *  Invert matching sense of column indexes.</pre>
228   *
229   * <pre> -D
230   *  Output binary attributes for discretized attributes.</pre>
231   *
232   <!-- options-end -->
233   *
234   * @param options the list of options as an array of strings
235   * @throws Exception if an option is not supported
236   */
237  public void setOptions(String[] options) throws Exception {
238
239    super.setOptions(options);
240
241    setMakeBinary(Utils.getFlag('D', options));
242    setUseEqualFrequency(Utils.getFlag('F', options));
243    setFindNumBins(Utils.getFlag('O', options));
244    setInvertSelection(Utils.getFlag('V', options));
245
246    String weight = Utils.getOption('M', options);
247    if (weight.length() != 0) {
248      setDesiredWeightOfInstancesPerInterval((new Double(weight)).doubleValue());
249    } else {
250      setDesiredWeightOfInstancesPerInterval(-1);
251    }
252
253    String numBins = Utils.getOption('B', options);
254    if (numBins.length() != 0) {
255      setBins(Integer.parseInt(numBins));
256    } else {
257      setBins(10);
258    }
259   
260    String convertList = Utils.getOption('R', options);
261    if (convertList.length() != 0) {
262      setAttributeIndices(convertList);
263    } else {
264      setAttributeIndices(m_DefaultCols);
265    }
266
267    if (getInputFormat() != null) {
268      setInputFormat(getInputFormat());
269    }
270  }
271
272  /**
273   * Gets the current settings of the filter.
274   *
275   * @return an array of strings suitable for passing to setOptions
276   */
277  public String [] getOptions() {
278    Vector        result;
279    String[]      options;
280    int           i;
281
282    result = new Vector();
283
284    options = super.getOptions();
285    for (i = 0; i < options.length; i++)
286      result.add(options[i]);
287
288    if (getMakeBinary())
289      result.add("-D");
290   
291    if (getUseEqualFrequency())
292      result.add("-F");
293   
294    if (getFindNumBins())
295      result.add("-O");
296   
297    if (getInvertSelection())
298      result.add("-V");
299   
300    result.add("-B");
301    result.add("" + getBins());
302   
303    result.add("-M");
304    result.add("" + getDesiredWeightOfInstancesPerInterval());
305   
306    if (!getAttributeIndices().equals("")) {
307      result.add("-R");
308      result.add(getAttributeIndices());
309    }
310
311    return (String[]) result.toArray(new String[result.size()]);
312  }
313
314  /**
315   * Returns the Capabilities of this filter.
316   *
317   * @return            the capabilities of this object
318   * @see               Capabilities
319   */
320  public Capabilities getCapabilities() {
321    Capabilities result = super.getCapabilities();
322    result.disableAll();
323
324    // attributes
325    result.enableAllAttributes();
326    result.enable(Capability.MISSING_VALUES);
327   
328    // class
329    result.enableAllClasses();
330    result.enable(Capability.MISSING_CLASS_VALUES);
331    if (!getMakeBinary())
332      result.enable(Capability.NO_CLASS);
333   
334    return result;
335  }
336
337  /**
338   * Sets the format of the input instances.
339   *
340   * @param instanceInfo an Instances object containing the input instance
341   * structure (any instances contained in the object are ignored - only the
342   * structure is required).
343   * @return true if the outputFormat may be collected immediately
344   * @throws Exception if the input format can't be set successfully
345   */
346  public boolean setInputFormat(Instances instanceInfo) throws Exception {
347
348    if (m_MakeBinary && m_IgnoreClass) {
349      throw new IllegalArgumentException("Can't ignore class when " +
350                                         "changing the number of attributes!");
351    }
352
353    super.setInputFormat(instanceInfo);
354
355    m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1);
356    m_CutPoints = null;
357   
358    if (getFindNumBins() && getUseEqualFrequency()) {
359      throw new IllegalArgumentException("Bin number optimization in conjunction "+
360                                         "with equal-frequency binning not implemented.");
361    }
362
363    // If we implement loading cutfiles, then load
364    //them here and set the output format
365    return false;
366  }
367
368  /**
369   * Input an instance for filtering. Ordinarily the instance is processed
370   * and made available for output immediately. Some filters require all
371   * instances be read before producing output.
372   *
373   * @param instance the input instance
374   * @return true if the filtered instance may now be
375   * collected with output().
376   * @throws IllegalStateException if no input format has been defined.
377   */
378  public boolean input(Instance instance) {
379
380    if (getInputFormat() == null) {
381      throw new IllegalStateException("No input instance format defined");
382    }
383    if (m_NewBatch) {
384      resetQueue();
385      m_NewBatch = false;
386    }
387   
388    if (m_CutPoints != null) {
389      convertInstance(instance);
390      return true;
391    }
392
393    bufferInput(instance);
394    return false;
395  }
396
397  /**
398   * Signifies that this batch of input to the filter is finished. If the
399   * filter requires all instances prior to filtering, output() may now
400   * be called to retrieve the filtered instances.
401   *
402   * @return true if there are instances pending output
403   * @throws IllegalStateException if no input structure has been defined
404   */
405  public boolean batchFinished() {
406
407    if (getInputFormat() == null) {
408      throw new IllegalStateException("No input instance format defined");
409    }
410    if (m_CutPoints == null) {
411      calculateCutPoints();
412
413      setOutputFormat();
414
415      // If we implement saving cutfiles, save the cuts here
416
417      // Convert pending input instances
418      for(int i = 0; i < getInputFormat().numInstances(); i++) {
419        convertInstance(getInputFormat().instance(i));
420      }
421    } 
422    flushInput();
423
424    m_NewBatch = true;
425    return (numPendingOutput() != 0);
426  }
427
428  /**
429   * Returns a string describing this filter
430   *
431   * @return a description of the filter suitable for
432   * displaying in the explorer/experimenter gui
433   */
434  public String globalInfo() {
435
436    return "An instance filter that discretizes a range of numeric"
437      + " attributes in the dataset into nominal attributes."
438      + " Discretization is by simple binning. Skips the class"
439      + " attribute if set.";
440  }
441 
442  /**
443   * Returns the tip text for this property
444   *
445   * @return tip text for this property suitable for
446   * displaying in the explorer/experimenter gui
447   */
448  public String findNumBinsTipText() {
449
450    return "Optimize number of equal-width bins using leave-one-out. Doesn't " +
451      "work for equal-frequency binning";
452  }
453
454  /**
455   * Get the value of FindNumBins.
456   *
457   * @return Value of FindNumBins.
458   */
459  public boolean getFindNumBins() {
460   
461    return m_FindNumBins;
462  }
463 
464  /**
465   * Set the value of FindNumBins.
466   *
467   * @param newFindNumBins Value to assign to FindNumBins.
468   */
469  public void setFindNumBins(boolean newFindNumBins) {
470   
471    m_FindNumBins = newFindNumBins;
472  }
473 
474  /**
475   * Returns the tip text for this property
476   *
477   * @return tip text for this property suitable for
478   * displaying in the explorer/experimenter gui
479   */
480  public String makeBinaryTipText() {
481
482    return "Make resulting attributes binary.";
483  }
484
485  /**
486   * Gets whether binary attributes should be made for discretized ones.
487   *
488   * @return true if attributes will be binarized
489   */
490  public boolean getMakeBinary() {
491
492    return m_MakeBinary;
493  }
494
495  /**
496   * Sets whether binary attributes should be made for discretized ones.
497   *
498   * @param makeBinary if binary attributes are to be made
499   */
500  public void setMakeBinary(boolean makeBinary) {
501
502    m_MakeBinary = makeBinary;
503  }
504 
505  /**
506   * Returns the tip text for this property
507   *
508   * @return tip text for this property suitable for
509   * displaying in the explorer/experimenter gui
510   */
511  public String desiredWeightOfInstancesPerIntervalTipText() {
512
513    return "Sets the desired weight of instances per interval for " +
514      "equal-frequency binning.";
515  }
516 
517  /**
518   * Get the DesiredWeightOfInstancesPerInterval value.
519   * @return the DesiredWeightOfInstancesPerInterval value.
520   */
521  public double getDesiredWeightOfInstancesPerInterval() {
522
523    return m_DesiredWeightOfInstancesPerInterval;
524  }
525
526  /**
527   * Set the DesiredWeightOfInstancesPerInterval value.
528   * @param newDesiredNumber The new DesiredNumber value.
529   */
530  public void setDesiredWeightOfInstancesPerInterval(double newDesiredNumber) {
531   
532    m_DesiredWeightOfInstancesPerInterval = newDesiredNumber;
533  }
534 
535  /**
536   * Returns the tip text for this property
537   *
538   * @return tip text for this property suitable for
539   * displaying in the explorer/experimenter gui
540   */
541  public String useEqualFrequencyTipText() {
542
543    return "If set to true, equal-frequency binning will be used instead of" +
544      " equal-width binning.";
545  }
546 
547  /**
548   * Get the value of UseEqualFrequency.
549   *
550   * @return Value of UseEqualFrequency.
551   */
552  public boolean getUseEqualFrequency() {
553   
554    return m_UseEqualFrequency;
555  }
556 
557  /**
558   * Set the value of UseEqualFrequency.
559   *
560   * @param newUseEqualFrequency Value to assign to UseEqualFrequency.
561   */
562  public void setUseEqualFrequency(boolean newUseEqualFrequency) {
563   
564    m_UseEqualFrequency = newUseEqualFrequency;
565  }
566
567  /**
568   * Returns the tip text for this property
569   *
570   * @return tip text for this property suitable for
571   * displaying in the explorer/experimenter gui
572   */
573  public String binsTipText() {
574
575    return "Number of bins.";
576  }
577
578  /**
579   * Gets the number of bins numeric attributes will be divided into
580   *
581   * @return the number of bins.
582   */
583  public int getBins() {
584
585    return m_NumBins;
586  }
587
588  /**
589   * Sets the number of bins to divide each selected numeric attribute into
590   *
591   * @param numBins the number of bins
592   */
593  public void setBins(int numBins) {
594
595    m_NumBins = numBins;
596  }
597
598  /**
599   * Returns the tip text for this property
600   *
601   * @return tip text for this property suitable for
602   * displaying in the explorer/experimenter gui
603   */
604  public String invertSelectionTipText() {
605
606    return "Set attribute selection mode. If false, only selected"
607      + " (numeric) attributes in the range will be discretized; if"
608      + " true, only non-selected attributes will be discretized.";
609  }
610
611  /**
612   * Gets whether the supplied columns are to be removed or kept
613   *
614   * @return true if the supplied columns will be kept
615   */
616  public boolean getInvertSelection() {
617
618    return m_DiscretizeCols.getInvert();
619  }
620
621  /**
622   * Sets whether selected columns should be removed or kept. If true the
623   * selected columns are kept and unselected columns are deleted. If false
624   * selected columns are deleted and unselected columns are kept.
625   *
626   * @param invert the new invert setting
627   */
628  public void setInvertSelection(boolean invert) {
629
630    m_DiscretizeCols.setInvert(invert);
631  }
632
633  /**
634   * Returns the tip text for this property
635   *
636   * @return tip text for this property suitable for
637   * displaying in the explorer/experimenter gui
638   */
639  public String attributeIndicesTipText() {
640    return "Specify range of attributes to act on."
641      + " This is a comma separated list of attribute indices, with"
642      + " \"first\" and \"last\" valid values. Specify an inclusive"
643      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
644  }
645
646  /**
647   * Gets the current range selection
648   *
649   * @return a string containing a comma separated list of ranges
650   */
651  public String getAttributeIndices() {
652
653    return m_DiscretizeCols.getRanges();
654  }
655
656  /**
657   * Sets which attributes are to be Discretized (only numeric
658   * attributes among the selection will be Discretized).
659   *
660   * @param rangeList a string representing the list of attributes. Since
661   * the string will typically come from a user, attributes are indexed from
662   * 1. <br>
663   * eg: first-3,5,6-last
664   * @throws IllegalArgumentException if an invalid range list is supplied
665   */
666  public void setAttributeIndices(String rangeList) {
667
668    m_DiscretizeCols.setRanges(rangeList);
669  }
670
671  /**
672   * Sets which attributes are to be Discretized (only numeric
673   * attributes among the selection will be Discretized).
674   *
675   * @param attributes an array containing indexes of attributes to Discretize.
676   * Since the array will typically come from a program, attributes are indexed
677   * from 0.
678   * @throws IllegalArgumentException if an invalid set of ranges
679   * is supplied
680   */
681  public void setAttributeIndicesArray(int [] attributes) {
682
683    setAttributeIndices(Range.indicesToRangeList(attributes));
684  }
685
686  /**
687   * Gets the cut points for an attribute
688   *
689   * @param attributeIndex the index (from 0) of the attribute to get the cut points of
690   * @return an array containing the cutpoints (or null if the
691   * attribute requested has been discretized into only one interval.)
692   */
693  public double [] getCutPoints(int attributeIndex) {
694
695    if (m_CutPoints == null) {
696      return null;
697    }
698    return m_CutPoints[attributeIndex];
699  }
700
701  /** Generate the cutpoints for each attribute */
702  protected void calculateCutPoints() {
703
704    m_CutPoints = new double [getInputFormat().numAttributes()] [];
705    for(int i = getInputFormat().numAttributes() - 1; i >= 0; i--) {
706      if ((m_DiscretizeCols.isInRange(i)) && 
707          (getInputFormat().attribute(i).isNumeric()) &&
708          (getInputFormat().classIndex() != i)) {
709        if (m_FindNumBins) {
710          findNumBins(i);
711        } else if (!m_UseEqualFrequency) {
712          calculateCutPointsByEqualWidthBinning(i);
713        } else {
714          calculateCutPointsByEqualFrequencyBinning(i);
715        }
716      }
717    }
718  }
719 
720  /**
721   * Set cutpoints for a single attribute.
722   *
723   * @param index the index of the attribute to set cutpoints for
724   */
725  protected void calculateCutPointsByEqualWidthBinning(int index) {
726
727    // Scan for max and min values
728    double max = 0, min = 1, currentVal;
729    Instance currentInstance;
730    for(int i = 0; i < getInputFormat().numInstances(); i++) {
731      currentInstance = getInputFormat().instance(i);
732      if (!currentInstance.isMissing(index)) {
733        currentVal = currentInstance.value(index);
734        if (max < min) {
735          max = min = currentVal;
736        }
737        if (currentVal > max) {
738          max = currentVal;
739        }
740        if (currentVal < min) {
741          min = currentVal;
742        }
743      }
744    }
745    double binWidth = (max - min) / m_NumBins;
746    double [] cutPoints = null;
747    if ((m_NumBins > 1) && (binWidth > 0)) {
748      cutPoints = new double [m_NumBins - 1];
749      for(int i = 1; i < m_NumBins; i++) {
750        cutPoints[i - 1] = min + binWidth * i;
751      }
752    }
753    m_CutPoints[index] = cutPoints;
754  }
755 
756  /**
757   * Set cutpoints for a single attribute.
758   *
759   * @param index the index of the attribute to set cutpoints for
760   */
761  protected void calculateCutPointsByEqualFrequencyBinning(int index) {
762
763    // Copy data so that it can be sorted
764    Instances data = new Instances(getInputFormat());
765
766    // Sort input data
767    data.sort(index);
768
769    // Compute weight of instances without missing values
770    double sumOfWeights = 0;
771    for (int i = 0; i < data.numInstances(); i++) {
772      if (data.instance(i).isMissing(index)) {
773        break;
774      } else {
775        sumOfWeights += data.instance(i).weight();
776      }
777    }
778    double freq;
779    double[] cutPoints = new double[m_NumBins - 1];
780    if (getDesiredWeightOfInstancesPerInterval() > 0) {
781      freq = getDesiredWeightOfInstancesPerInterval();
782      cutPoints = new double[(int)(sumOfWeights / freq)];
783    } else {
784      freq = sumOfWeights / m_NumBins;
785      cutPoints = new double[m_NumBins - 1];
786    }
787
788    // Compute break points
789    double counter = 0, last = 0;
790    int cpindex = 0, lastIndex = -1;
791    for (int i = 0; i < data.numInstances() - 1; i++) {
792
793      // Stop if value missing
794      if (data.instance(i).isMissing(index)) {
795        break;
796      }
797      counter += data.instance(i).weight();
798      sumOfWeights -= data.instance(i).weight();
799
800      // Do we have a potential breakpoint?
801      if (data.instance(i).value(index) < 
802          data.instance(i + 1).value(index)) {
803
804        // Have we passed the ideal size?
805        if (counter >= freq) {
806
807          // Is this break point worse than the last one?
808          if (((freq - last) < (counter - freq)) && (lastIndex != -1)) {
809            cutPoints[cpindex] = (data.instance(lastIndex).value(index) +
810                                  data.instance(lastIndex + 1).value(index)) / 2;
811            counter -= last;
812            last = counter;
813            lastIndex = i;
814          } else {
815            cutPoints[cpindex] = (data.instance(i).value(index) +
816                                  data.instance(i + 1).value(index)) / 2;
817            counter = 0;
818            last = 0;
819            lastIndex = -1;
820          }
821          cpindex++;
822          freq = (sumOfWeights + counter) / ((cutPoints.length + 1) - cpindex);
823        } else {
824          lastIndex = i;
825          last = counter;
826        }
827      }
828    }
829
830    // Check whether there was another possibility for a cut point
831    if ((cpindex < cutPoints.length) && (lastIndex != -1)) {
832      cutPoints[cpindex] = (data.instance(lastIndex).value(index) +
833                            data.instance(lastIndex + 1).value(index)) / 2;     
834      cpindex++;
835    }
836
837    // Did we find any cutpoints?
838    if (cpindex == 0) {
839      m_CutPoints[index] = null;
840    } else {
841      double[] cp = new double[cpindex];
842      for (int i = 0; i < cpindex; i++) {
843        cp[i] = cutPoints[i];
844      }
845      m_CutPoints[index] = cp;
846    }
847  }
848
849  /**
850   * Optimizes the number of bins using leave-one-out cross-validation.
851   *
852   * @param index the attribute index
853   */
854  protected void findNumBins(int index) {
855
856    double min = Double.MAX_VALUE, max = -Double.MAX_VALUE, binWidth = 0, 
857      entropy, bestEntropy = Double.MAX_VALUE, currentVal;
858    double[] distribution;
859    int bestNumBins  = 1;
860    Instance currentInstance;
861
862    // Find minimum and maximum
863    for (int i = 0; i < getInputFormat().numInstances(); i++) {
864      currentInstance = getInputFormat().instance(i);
865      if (!currentInstance.isMissing(index)) {
866        currentVal = currentInstance.value(index);
867        if (currentVal > max) {
868          max = currentVal;
869        }
870        if (currentVal < min) {
871          min = currentVal;
872        }
873      }
874    }
875
876    // Find best number of bins
877    for (int i = 0; i < m_NumBins; i++) {
878      distribution = new double[i + 1];
879      binWidth = (max - min) / (i + 1);
880
881      // Compute distribution
882      for (int j = 0; j < getInputFormat().numInstances(); j++) {
883        currentInstance = getInputFormat().instance(j);
884        if (!currentInstance.isMissing(index)) {
885          for (int k = 0; k < i + 1; k++) {
886            if (currentInstance.value(index) <= 
887                (min + (((double)k + 1) * binWidth))) {
888              distribution[k] += currentInstance.weight();
889              break;
890            }
891          }
892        }
893      }
894
895      // Compute cross-validated entropy
896      entropy = 0;
897      for (int k = 0; k < i + 1; k++) {
898        if (distribution[k] < 2) {
899          entropy = Double.MAX_VALUE;
900          break;
901        }
902        entropy -= distribution[k] * Math.log((distribution[k] - 1) / 
903                                              binWidth);
904      }
905
906      // Best entropy so far?
907      if (entropy < bestEntropy) {
908        bestEntropy = entropy;
909        bestNumBins = i + 1;
910      }
911    }
912
913    // Compute cut points
914    double [] cutPoints = null;
915    if ((bestNumBins > 1) && (binWidth > 0)) {
916      cutPoints = new double [bestNumBins - 1];
917      for(int i = 1; i < bestNumBins; i++) {
918        cutPoints[i - 1] = min + binWidth * i;
919      }
920    }
921    m_CutPoints[index] = cutPoints;
922   }
923
924  /**
925   * Set the output format. Takes the currently defined cutpoints and
926   * m_InputFormat and calls setOutputFormat(Instances) appropriately.
927   */
928  protected void setOutputFormat() {
929
930    if (m_CutPoints == null) {
931      setOutputFormat(null);
932      return;
933    }
934    FastVector attributes = new FastVector(getInputFormat().numAttributes());
935    int classIndex = getInputFormat().classIndex();
936    for(int i = 0; i < getInputFormat().numAttributes(); i++) {
937      if ((m_DiscretizeCols.isInRange(i)) 
938          && (getInputFormat().attribute(i).isNumeric())
939          && (getInputFormat().classIndex() != i)) {
940        if (!m_MakeBinary) {
941          FastVector attribValues = new FastVector(1);
942          if (m_CutPoints[i] == null) {
943            attribValues.addElement("'All'");
944          } else {
945            for(int j = 0; j <= m_CutPoints[i].length; j++) {
946              if (j == 0) {
947                attribValues.addElement("'(-inf-"
948                        + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
949              } else if (j == m_CutPoints[i].length) {
950                attribValues.addElement("'("
951                        + Utils.doubleToString(m_CutPoints[i][j - 1], 6) 
952                                        + "-inf)'");
953              } else {
954                attribValues.addElement("'("
955                        + Utils.doubleToString(m_CutPoints[i][j - 1], 6) + "-"
956                        + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
957              }
958            }
959          }
960          attributes.addElement(new Attribute(getInputFormat().
961                                              attribute(i).name(),
962                                              attribValues));
963        } else {
964          if (m_CutPoints[i] == null) {
965            FastVector attribValues = new FastVector(1);
966            attribValues.addElement("'All'");
967            attributes.addElement(new Attribute(getInputFormat().
968                                                attribute(i).name(),
969                                                attribValues));
970          } else {
971            if (i < getInputFormat().classIndex()) {
972              classIndex += m_CutPoints[i].length - 1;
973            }
974            for(int j = 0; j < m_CutPoints[i].length; j++) {
975              FastVector attribValues = new FastVector(2);
976              attribValues.addElement("'(-inf-"
977                      + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
978              attribValues.addElement("'("
979                      + Utils.doubleToString(m_CutPoints[i][j], 6) + "-inf)'");
980              attributes.addElement(new Attribute(getInputFormat().
981                                                  attribute(i).name(),
982                                                  attribValues));
983            }
984          }
985        }
986      } else {
987        attributes.addElement(getInputFormat().attribute(i).copy());
988      }
989    }
990    Instances outputFormat = 
991      new Instances(getInputFormat().relationName(), attributes, 0);
992    outputFormat.setClassIndex(classIndex);
993    setOutputFormat(outputFormat);
994  }
995
996  /**
997   * Convert a single instance over. The converted instance is added to
998   * the end of the output queue.
999   *
1000   * @param instance the instance to convert
1001   */
1002  protected void convertInstance(Instance instance) {
1003
1004    int index = 0;
1005    double [] vals = new double [outputFormatPeek().numAttributes()];
1006    // Copy and convert the values
1007    for(int i = 0; i < getInputFormat().numAttributes(); i++) {
1008      if (m_DiscretizeCols.isInRange(i) && 
1009          getInputFormat().attribute(i).isNumeric() &&
1010          (getInputFormat().classIndex() != i)) {
1011        int j;
1012        double currentVal = instance.value(i);
1013        if (m_CutPoints[i] == null) {
1014          if (instance.isMissing(i)) {
1015            vals[index] = Utils.missingValue();
1016          } else {
1017            vals[index] = 0;
1018          }
1019          index++;
1020        } else {
1021          if (!m_MakeBinary) {
1022            if (instance.isMissing(i)) {
1023              vals[index] = Utils.missingValue();
1024            } else {
1025              for (j = 0; j < m_CutPoints[i].length; j++) {
1026                if (currentVal <= m_CutPoints[i][j]) {
1027                  break;
1028                }
1029              }
1030              vals[index] = j;
1031            }
1032            index++;
1033          } else {
1034            for (j = 0; j < m_CutPoints[i].length; j++) {
1035              if (instance.isMissing(i)) {
1036                vals[index] = Utils.missingValue();
1037              } else if (currentVal <= m_CutPoints[i][j]) {
1038                vals[index] = 0;
1039              } else {
1040                vals[index] = 1;
1041              }
1042              index++;
1043            }
1044          }   
1045        }
1046      } else {
1047        vals[index] = instance.value(i);
1048        index++;
1049      }
1050    }
1051   
1052    Instance inst = null;
1053    if (instance instanceof SparseInstance) {
1054      inst = new SparseInstance(instance.weight(), vals);
1055    } else {
1056      inst = new DenseInstance(instance.weight(), vals);
1057    }
1058    inst.setDataset(getOutputFormat());
1059    copyValues(inst, false, instance.dataset(), getOutputFormat());
1060    inst.setDataset(getOutputFormat());
1061    push(inst);
1062  }
1063 
1064  /**
1065   * Returns the revision string.
1066   *
1067   * @return            the revision
1068   */
1069  public String getRevision() {
1070    return RevisionUtils.extract("$Revision: 5987 $");
1071  }
1072
1073  /**
1074   * Main method for testing this class.
1075   *
1076   * @param argv should contain arguments to the filter: use -h for help
1077   */
1078  public static void main(String [] argv) {
1079    runFilter(new Discretize(), argv);
1080  }
1081}
Note: See TracBrowser for help on using the repository browser.