source: src/main/java/weka/attributeSelection/GainRatioAttributeEval.java @ 27

Last change on this file since 27 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 11.0 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    GainRatioAttributeEval.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.core.Capabilities;
26import weka.core.ContingencyTables;
27import weka.core.Instance;
28import weka.core.Instances;
29import weka.core.Option;
30import weka.core.OptionHandler;
31import weka.core.RevisionUtils;
32import weka.core.Utils;
33import weka.core.Capabilities.Capability;
34import weka.filters.Filter;
35import weka.filters.supervised.attribute.Discretize;
36
37import java.util.Enumeration;
38import java.util.Vector;
39
40/**
41 <!-- globalinfo-start -->
42 * GainRatioAttributeEval :<br/>
43 * <br/>
44 * Evaluates the worth of an attribute by measuring the gain ratio with respect to the class.<br/>
45 * <br/>
46 * GainR(Class, Attribute) = (H(Class) - H(Class | Attribute)) / H(Attribute).<br/>
47 * <p/>
48 <!-- globalinfo-end -->
49 *
50 <!-- options-start -->
51 * Valid options are: <p/>
52 *
53 * <pre> -M
54 *  treat missing values as a seperate value.</pre>
55 *
56 <!-- options-end -->
57 *
58 * @author Mark Hall (mhall@cs.waikato.ac.nz)
59 * @version $Revision: 5447 $
60 * @see Discretize
61 */
62public class GainRatioAttributeEval
63  extends ASEvaluation
64  implements AttributeEvaluator, OptionHandler {
65 
66  /** for serialization */
67  static final long serialVersionUID = -8504656625598579926L;
68
69  /** The training instances */
70  private Instances m_trainInstances;
71
72  /** The class index */
73  private int m_classIndex;
74
75  /** The number of attributes */
76  private int m_numAttribs;
77
78  /** The number of instances */
79  private int m_numInstances;
80
81  /** The number of classes */
82  private int m_numClasses;
83
84  /** Merge missing values */
85  private boolean m_missing_merge;
86
87  /**
88   * Returns a string describing this attribute evaluator
89   * @return a description of the evaluator suitable for
90   * displaying in the explorer/experimenter gui
91   */
92  public String globalInfo() {
93    return "GainRatioAttributeEval :\n\nEvaluates the worth of an attribute "
94      +"by measuring the gain ratio with respect to the class.\n\n"
95      +"GainR(Class, Attribute) = (H(Class) - H(Class | Attribute)) / "
96      +"H(Attribute).\n";
97  }
98
99  /**
100   * Constructor
101   */
102  public GainRatioAttributeEval () {
103    resetOptions();
104  }
105
106
107  /**
108   * Returns an enumeration describing the available options.
109   * @return an enumeration of all the available options.
110   **/
111  public Enumeration listOptions () {
112    Vector newVector = new Vector(1);
113    newVector.addElement(new Option("\ttreat missing values as a seperate " 
114                                    + "value.", "M", 0, "-M"));
115    return  newVector.elements();
116  }
117
118
119  /**
120   * Parses a given list of options. <p/>
121   *
122   <!-- options-start -->
123   * Valid options are: <p/>
124   *
125   * <pre> -M
126   *  treat missing values as a seperate value.</pre>
127   *
128   <!-- options-end -->
129   *
130   * @param options the list of options as an array of strings
131   * @throws Exception if an option is not supported
132   **/
133  public void setOptions (String[] options)
134    throws Exception {
135    resetOptions();
136    setMissingMerge(!(Utils.getFlag('M', options)));
137  }
138 
139  /**
140   * Returns the tip text for this property
141   * @return tip text for this property suitable for
142   * displaying in the explorer/experimenter gui
143   */
144  public String missingMergeTipText() {
145    return "Distribute counts for missing values. Counts are distributed "
146      +"across other values in proportion to their frequency. Otherwise, "
147      +"missing is treated as a separate value.";
148  }
149
150  /**
151   * distribute the counts for missing values across observed values
152   *
153   * @param b true=distribute missing values.
154   */
155  public void setMissingMerge (boolean b) {
156    m_missing_merge = b;
157  }
158
159
160  /**
161   * get whether missing values are being distributed or not
162   *
163   * @return true if missing values are being distributed.
164   */
165  public boolean getMissingMerge () {
166    return  m_missing_merge;
167  }
168
169
170  /**
171   * Gets the current settings of WrapperSubsetEval.
172   * @return an array of strings suitable for passing to setOptions()
173   */
174  public String[] getOptions () {
175    String[] options = new String[1];
176    int current = 0;
177
178    if (!getMissingMerge()) {
179      options[current++] = "-M";
180    }
181
182    while (current < options.length) {
183      options[current++] = "";
184    }
185
186    return  options;
187  }
188
189  /**
190   * Returns the capabilities of this evaluator.
191   *
192   * @return            the capabilities of this evaluator
193   * @see               Capabilities
194   */
195  public Capabilities getCapabilities() {
196    Capabilities result = super.getCapabilities();
197    result.disableAll();
198   
199    // attributes
200    result.enable(Capability.NOMINAL_ATTRIBUTES);
201    result.enable(Capability.NUMERIC_ATTRIBUTES);
202    result.enable(Capability.DATE_ATTRIBUTES);
203    result.enable(Capability.MISSING_VALUES);
204   
205    // class
206    result.enable(Capability.NOMINAL_CLASS);
207    result.enable(Capability.MISSING_CLASS_VALUES);
208   
209    return result;
210  }
211
212  /**
213   * Initializes a gain ratio attribute evaluator.
214   * Discretizes all attributes that are numeric.
215   *
216   * @param data set of instances serving as training data
217   * @throws Exception if the evaluator has not been
218   * generated successfully
219   */
220  public void buildEvaluator (Instances data)
221    throws Exception {
222   
223    // can evaluator handle data?
224    getCapabilities().testWithFail(data);
225
226    m_trainInstances = data;
227    m_classIndex = m_trainInstances.classIndex();
228    m_numAttribs = m_trainInstances.numAttributes();
229    m_numInstances = m_trainInstances.numInstances();
230    Discretize disTransform = new Discretize();
231    disTransform.setUseBetterEncoding(true);
232    disTransform.setInputFormat(m_trainInstances);
233    m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
234    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
235  }
236
237
238  /**
239   * reset options to default values
240   */
241  protected void resetOptions () {
242    m_trainInstances = null;
243    m_missing_merge = true;
244  }
245
246
247  /**
248   * evaluates an individual attribute by measuring the gain ratio
249   * of the class given the attribute.
250   *
251   * @param attribute the index of the attribute to be evaluated
252   * @return the gain ratio
253   * @throws Exception if the attribute could not be evaluated
254   */
255  public double evaluateAttribute (int attribute)
256    throws Exception {
257    int i, j, ii, jj;
258    int ni, nj;
259    double sum = 0.0;
260    ni = m_trainInstances.attribute(attribute).numValues() + 1;
261    nj = m_numClasses + 1;
262    double[] sumi, sumj;
263    Instance inst;
264    double temp = 0.0;
265    sumi = new double[ni];
266    sumj = new double[nj];
267    double[][] counts = new double[ni][nj];
268    sumi = new double[ni];
269    sumj = new double[nj];
270
271    for (i = 0; i < ni; i++) {
272      sumi[i] = 0.0;
273
274      for (j = 0; j < nj; j++) {
275        sumj[j] = 0.0;
276        counts[i][j] = 0.0;
277      }
278    }
279
280    // Fill the contingency table
281    for (i = 0; i < m_numInstances; i++) {
282      inst = m_trainInstances.instance(i);
283
284      if (inst.isMissing(attribute)) {
285        ii = ni - 1;
286      }
287      else {
288        ii = (int)inst.value(attribute);
289      }
290
291      if (inst.isMissing(m_classIndex)) {
292        jj = nj - 1;
293      }
294      else {
295        jj = (int)inst.value(m_classIndex);
296      }
297
298      counts[ii][jj]++;
299    }
300
301    // get the row totals
302    for (i = 0; i < ni; i++) {
303      sumi[i] = 0.0;
304
305      for (j = 0; j < nj; j++) {
306        sumi[i] += counts[i][j];
307        sum += counts[i][j];
308      }
309    }
310
311    // get the column totals
312    for (j = 0; j < nj; j++) {
313      sumj[j] = 0.0;
314
315      for (i = 0; i < ni; i++) {
316        sumj[j] += counts[i][j];
317      }
318    }
319
320    // distribute missing counts
321    if (m_missing_merge && 
322        (sumi[ni-1] < m_numInstances) && 
323        (sumj[nj-1] < m_numInstances)) {
324      double[] i_copy = new double[sumi.length];
325      double[] j_copy = new double[sumj.length];
326      double[][] counts_copy = new double[sumi.length][sumj.length];
327
328      for (i = 0; i < ni; i++) {
329        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
330      }
331
332      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
333      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
334      double total_missing = (sumi[ni - 1] + sumj[nj - 1] - 
335                              counts[ni - 1][nj - 1]);
336
337      // do the missing i's
338      if (sumi[ni - 1] > 0.0) {
339        for (j = 0; j < nj - 1; j++) {
340          if (counts[ni - 1][j] > 0.0) {
341            for (i = 0; i < ni - 1; i++) {
342              temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
343              counts[i][j] += temp;
344              sumi[i] += temp;
345            }
346
347            counts[ni - 1][j] = 0.0;
348          }
349        }
350      }
351
352      sumi[ni - 1] = 0.0;
353
354      // do the missing j's
355      if (sumj[nj - 1] > 0.0) {
356        for (i = 0; i < ni - 1; i++) {
357          if (counts[i][nj - 1] > 0.0) {
358            for (j = 0; j < nj - 1; j++) {
359              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
360              counts[i][j] += temp;
361              sumj[j] += temp;
362            }
363
364            counts[i][nj - 1] = 0.0;
365          }
366        }
367      }
368
369      sumj[nj - 1] = 0.0;
370
371      // do the both missing
372      if (counts[ni - 1][nj - 1] > 0.0  && total_missing != sum) {
373        for (i = 0; i < ni - 1; i++) {
374          for (j = 0; j < nj - 1; j++) {
375            temp = (counts_copy[i][j]/(sum - total_missing)) * 
376              counts_copy[ni - 1][nj - 1];
377            counts[i][j] += temp;
378            sumi[i] += temp;
379            sumj[j] += temp;
380          }
381        }
382
383        counts[ni - 1][nj - 1] = 0.0;
384      }
385    }
386
387    return  ContingencyTables.gainRatio(counts);
388  }
389
390
391  /**
392   * Return a description of the evaluator
393   * @return description as a string
394   */
395  public String toString () {
396    StringBuffer text = new StringBuffer();
397
398    if (m_trainInstances == null) {
399      text.append("\tGain Ratio evaluator has not been built");
400    }
401    else {
402      text.append("\tGain Ratio feature evaluator");
403
404      if (!m_missing_merge) {
405        text.append("\n\tMissing values treated as seperate");
406      }
407    }
408
409    text.append("\n");
410    return  text.toString();
411  }
412 
413  /**
414   * Returns the revision string.
415   *
416   * @return            the revision
417   */
418  public String getRevision() {
419    return RevisionUtils.extract("$Revision: 5447 $");
420  }
421
422  /**
423   * Main method.
424   *
425   * @param args the options
426   * -t training file
427   */
428  public static void main (String[] args) {
429    runEvaluator(new GainRatioAttributeEval(), args);
430  }
431}
Note: See TracBrowser for help on using the repository browser.