source: src/main/java/weka/attributeSelection/InfoGainAttributeEval.java @ 22

Last change on this file since 22 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 13.6 KB
RevLine 
[4]1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    InfoGainAttributeEval.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.core.Capabilities;
26import weka.core.ContingencyTables;
27import weka.core.Instance;
28import weka.core.Instances;
29import weka.core.Option;
30import weka.core.OptionHandler;
31import weka.core.RevisionUtils;
32import weka.core.Utils;
33import weka.core.Capabilities.Capability;
34import weka.filters.Filter;
35import weka.filters.supervised.attribute.Discretize;
36import weka.filters.unsupervised.attribute.NumericToBinary;
37
38import java.util.Enumeration;
39import java.util.Vector;
40
41/**
42 <!-- globalinfo-start -->
43 * InfoGainAttributeEval :<br/>
44 * <br/>
45 * Evaluates the worth of an attribute by measuring the information gain with respect to the class.<br/>
46 * <br/>
47 * InfoGain(Class,Attribute) = H(Class) - H(Class | Attribute).<br/>
48 * <p/>
49 <!-- globalinfo-end -->
50 *
51 <!-- options-start -->
52 * Valid options are: <p/>
53 *
54 * <pre> -M
55 *  treat missing values as a seperate value.</pre>
56 *
57 * <pre> -B
58 *  just binarize numeric attributes instead
59 *  of properly discretizing them.</pre>
60 *
61 <!-- options-end -->
62 *
63 * @author Mark Hall (mhall@cs.waikato.ac.nz)
64 * @version $Revision: 5447 $
65 * @see Discretize
66 * @see NumericToBinary
67 */
68public class InfoGainAttributeEval
69  extends ASEvaluation
70  implements AttributeEvaluator, OptionHandler {
71 
72  /** for serialization */
73  static final long serialVersionUID = -1949849512589218930L;
74
75  /** Treat missing values as a seperate value */
76  private boolean m_missing_merge;
77
78  /** Just binarize numeric attributes */
79  private boolean m_Binarize;
80
81  /** The info gain for each attribute */
82  private double[] m_InfoGains;
83
84  /**
85   * Returns a string describing this attribute evaluator
86   * @return a description of the evaluator suitable for
87   * displaying in the explorer/experimenter gui
88   */
89  public String globalInfo() {
90    return "InfoGainAttributeEval :\n\nEvaluates the worth of an attribute "
91      +"by measuring the information gain with respect to the class.\n\n"
92      +"InfoGain(Class,Attribute) = H(Class) - H(Class | Attribute).\n";
93  }
94
95  /**
96   * Constructor
97   */
98  public InfoGainAttributeEval () {
99    resetOptions();
100  }
101
102  /**
103   * Returns an enumeration describing the available options.
104   * @return an enumeration of all the available options.
105   **/
106  public Enumeration listOptions () {
107    Vector newVector = new Vector(2);
108    newVector.addElement(new Option("\ttreat missing values as a seperate " 
109                                    + "value.", "M", 0, "-M"));
110    newVector.addElement(new Option("\tjust binarize numeric attributes instead \n" 
111                                    +"\tof properly discretizing them.", "B", 0, 
112                                    "-B"));
113    return  newVector.elements();
114  }
115
116
117  /**
118   * Parses a given list of options. <p/>
119   *
120   <!-- options-start -->
121   * Valid options are: <p/>
122   *
123   * <pre> -M
124   *  treat missing values as a seperate value.</pre>
125   *
126   * <pre> -B
127   *  just binarize numeric attributes instead
128   *  of properly discretizing them.</pre>
129   *
130   <!-- options-end -->
131   *
132   * @param options the list of options as an array of strings
133   * @throws Exception if an option is not supported
134   */
135  public void setOptions (String[] options)
136    throws Exception {
137
138    resetOptions();
139    setMissingMerge(!(Utils.getFlag('M', options)));
140    setBinarizeNumericAttributes(Utils.getFlag('B', options));
141  }
142
143
144  /**
145   * Gets the current settings of WrapperSubsetEval.
146   *
147   * @return an array of strings suitable for passing to setOptions()
148   */
149  public String[] getOptions () {
150    String[] options = new String[2];
151    int current = 0;
152
153    if (!getMissingMerge()) {
154      options[current++] = "-M";
155    }
156    if (getBinarizeNumericAttributes()) {
157      options[current++] = "-B";
158    }
159
160    while (current < options.length) {
161      options[current++] = "";
162    }
163
164    return  options;
165  }
166
167  /**
168   * Returns the tip text for this property
169   * @return tip text for this property suitable for
170   * displaying in the explorer/experimenter gui
171   */
172  public String binarizeNumericAttributesTipText() {
173    return "Just binarize numeric attributes instead of properly discretizing them.";
174  }
175
176  /**
177   * Binarize numeric attributes.
178   *
179   * @param b true=binarize numeric attributes
180   */
181  public void setBinarizeNumericAttributes (boolean b) {
182    m_Binarize = b;
183  }
184
185
186  /**
187   * get whether numeric attributes are just being binarized.
188   *
189   * @return true if missing values are being distributed.
190   */
191  public boolean getBinarizeNumericAttributes () {
192    return  m_Binarize;
193  }
194
195  /**
196   * Returns the tip text for this property
197   * @return tip text for this property suitable for
198   * displaying in the explorer/experimenter gui
199   */
200  public String missingMergeTipText() {
201    return "Distribute counts for missing values. Counts are distributed "
202      +"across other values in proportion to their frequency. Otherwise, "
203      +"missing is treated as a separate value.";
204  }
205
206  /**
207   * distribute the counts for missing values across observed values
208   *
209   * @param b true=distribute missing values.
210   */
211  public void setMissingMerge (boolean b) {
212    m_missing_merge = b;
213  }
214
215
216  /**
217   * get whether missing values are being distributed or not
218   *
219   * @return true if missing values are being distributed.
220   */
221  public boolean getMissingMerge () {
222    return  m_missing_merge;
223  }
224
225  /**
226   * Returns the capabilities of this evaluator.
227   *
228   * @return            the capabilities of this evaluator
229   * @see               Capabilities
230   */
231  public Capabilities getCapabilities() {
232    Capabilities result = super.getCapabilities();
233    result.disableAll();
234   
235    // attributes
236    result.enable(Capability.NOMINAL_ATTRIBUTES);
237    result.enable(Capability.NUMERIC_ATTRIBUTES);
238    result.enable(Capability.DATE_ATTRIBUTES);
239    result.enable(Capability.MISSING_VALUES);
240   
241    // class
242    result.enable(Capability.NOMINAL_CLASS);
243    result.enable(Capability.MISSING_CLASS_VALUES);
244   
245    return result;
246  }
247
248  /**
249   * Initializes an information gain attribute evaluator.
250   * Discretizes all attributes that are numeric.
251   *
252   * @param data set of instances serving as training data
253   * @throws Exception if the evaluator has not been
254   * generated successfully
255   */
256  public void buildEvaluator (Instances data)
257    throws Exception {
258   
259    // can evaluator handle data?
260    getCapabilities().testWithFail(data);
261
262    int classIndex = data.classIndex();
263    int numInstances = data.numInstances();
264   
265    if (!m_Binarize) {
266      Discretize disTransform = new Discretize();
267      disTransform.setUseBetterEncoding(true);
268      disTransform.setInputFormat(data);
269      data = Filter.useFilter(data, disTransform);
270    } else {
271      NumericToBinary binTransform = new NumericToBinary();
272      binTransform.setInputFormat(data);
273      data = Filter.useFilter(data, binTransform);
274    }     
275    int numClasses = data.attribute(classIndex).numValues();
276
277    // Reserve space and initialize counters
278    double[][][] counts = new double[data.numAttributes()][][];
279    for (int k = 0; k < data.numAttributes(); k++) {
280      if (k != classIndex) {
281        int numValues = data.attribute(k).numValues();
282        counts[k] = new double[numValues + 1][numClasses + 1];
283      }
284    }
285
286    // Initialize counters
287    double[] temp = new double[numClasses + 1];
288    for (int k = 0; k < numInstances; k++) {
289      Instance inst = data.instance(k);
290      if (inst.classIsMissing()) {
291        temp[numClasses] += inst.weight();
292      } else {
293        temp[(int)inst.classValue()] += inst.weight();
294      }
295    }
296    for (int k = 0; k < counts.length; k++) {
297      if (k != classIndex) {
298        for (int i = 0; i < temp.length; i++) {
299          counts[k][0][i] = temp[i];
300        }
301      }
302    }
303
304    // Get counts
305    for (int k = 0; k < numInstances; k++) {
306      Instance inst = data.instance(k);
307      for (int i = 0; i < inst.numValues(); i++) {
308        if (inst.index(i) != classIndex) {
309          if (inst.isMissingSparse(i) || inst.classIsMissing()) {
310            if (!inst.isMissingSparse(i)) {
311              counts[inst.index(i)][(int)inst.valueSparse(i)][numClasses] += 
312                inst.weight();
313              counts[inst.index(i)][0][numClasses] -= inst.weight();
314            } else if (!inst.classIsMissing()) {
315              counts[inst.index(i)][data.attribute(inst.index(i)).numValues()]
316                [(int)inst.classValue()] += inst.weight();
317              counts[inst.index(i)][0][(int)inst.classValue()] -= 
318                inst.weight();
319            } else {
320              counts[inst.index(i)][data.attribute(inst.index(i)).numValues()]
321                [numClasses] += inst.weight();
322              counts[inst.index(i)][0][numClasses] -= inst.weight();
323            }
324          } else {
325            counts[inst.index(i)][(int)inst.valueSparse(i)]
326              [(int)inst.classValue()] += inst.weight();
327            counts[inst.index(i)][0][(int)inst.classValue()] -= inst.weight();
328          }
329        }
330      }
331    }
332
333    // distribute missing counts if required
334    if (m_missing_merge) {
335     
336      for (int k = 0; k < data.numAttributes(); k++) {
337        if (k != classIndex) {
338          int numValues = data.attribute(k).numValues();
339
340          // Compute marginals
341          double[] rowSums = new double[numValues];
342          double[] columnSums = new double[numClasses];
343          double sum = 0;
344          for (int i = 0; i < numValues; i++) {
345            for (int j = 0; j < numClasses; j++) {
346              rowSums[i] += counts[k][i][j];
347              columnSums[j] += counts[k][i][j];
348            }
349            sum += rowSums[i];
350          }
351
352          if (Utils.gr(sum, 0)) {
353            double[][] additions = new double[numValues][numClasses];
354
355            // Compute what needs to be added to each row
356            for (int i = 0; i < numValues; i++) {
357              for (int j = 0; j  < numClasses; j++) {
358                additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
359              }
360            }
361           
362            // Compute what needs to be added to each column
363            for (int i = 0; i < numClasses; i++) {
364              for (int j = 0; j  < numValues; j++) {
365                additions[j][i] += (columnSums[i] / sum) * 
366                  counts[k][j][numClasses];
367              }
368            }
369           
370            // Compute what needs to be added to each cell
371            for (int i = 0; i < numClasses; i++) {
372              for (int j = 0; j  < numValues; j++) {
373                additions[j][i] += (counts[k][j][i] / sum) * 
374                  counts[k][numValues][numClasses];
375              }
376            }
377           
378            // Make new contingency table
379            double[][] newTable = new double[numValues][numClasses];
380            for (int i = 0; i < numValues; i++) {
381              for (int j = 0; j < numClasses; j++) {
382                newTable[i][j] = counts[k][i][j] + additions[i][j];
383              }
384            }
385            counts[k] = newTable;
386          }
387        }
388      }
389    }
390
391    // Compute info gains
392    m_InfoGains = new double[data.numAttributes()];
393    for (int i = 0; i < data.numAttributes(); i++) {
394      if (i != classIndex) {
395        m_InfoGains[i] = 
396          (ContingencyTables.entropyOverColumns(counts[i]) 
397           - ContingencyTables.entropyConditionedOnRows(counts[i]));
398      }
399    }
400  }
401
402  /**
403   * Reset options to their default values
404   */
405  protected void resetOptions () {
406    m_InfoGains = null;
407    m_missing_merge = true;
408    m_Binarize = false;
409  }
410
411
412  /**
413   * evaluates an individual attribute by measuring the amount
414   * of information gained about the class given the attribute.
415   *
416   * @param attribute the index of the attribute to be evaluated
417   * @return the info gain
418   * @throws Exception if the attribute could not be evaluated
419   */
420  public double evaluateAttribute (int attribute)
421    throws Exception {
422
423    return m_InfoGains[attribute];
424  }
425
426  /**
427   * Describe the attribute evaluator
428   * @return a description of the attribute evaluator as a string
429   */
430  public String toString () {
431    StringBuffer text = new StringBuffer();
432
433    if (m_InfoGains == null) {
434      text.append("Information Gain attribute evaluator has not been built");
435    }
436    else {
437      text.append("\tInformation Gain Ranking Filter");
438      if (!m_missing_merge) {
439        text.append("\n\tMissing values treated as seperate");
440      }
441      if (m_Binarize) {
442        text.append("\n\tNumeric attributes are just binarized");
443      }
444    }
445   
446    text.append("\n");
447    return  text.toString();
448  }
449 
450  /**
451   * Returns the revision string.
452   *
453   * @return            the revision
454   */
455  public String getRevision() {
456    return RevisionUtils.extract("$Revision: 5447 $");
457  }
458 
459  // ============
460  // Test method.
461  // ============
462  /**
463   * Main method for testing this class.
464   *
465   * @param args the options
466   */
467  public static void main (String[] args) {
468    runEvaluator(new InfoGainAttributeEval(), args);
469  }
470}
Note: See TracBrowser for help on using the repository browser.