source: src/main/java/weka/attributeSelection/SymmetricalUncertAttributeEval.java @ 6

Last change on this file since 6 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 11.0 KB
Line 
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    SymmetricalUncertAttributeEval.java
 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 *
 */
22
23package weka.attributeSelection;
24
25import weka.core.Capabilities;
26import weka.core.ContingencyTables;
27import weka.core.Instance;
28import weka.core.Instances;
29import weka.core.Option;
30import weka.core.OptionHandler;
31import weka.core.RevisionUtils;
32import weka.core.Utils;
33import weka.core.Capabilities.Capability;
34import weka.filters.Filter;
35import weka.filters.supervised.attribute.Discretize;
36
37import java.util.Enumeration;
38import java.util.Vector;
39
40/**
41 <!-- globalinfo-start -->
42 * SymmetricalUncertAttributeEval :<br/>
43 * <br/>
44 * Evaluates the worth of an attribute by measuring the symmetrical uncertainty with respect to the class. <br/>
45 * <br/>
46 *  SymmU(Class, Attribute) = 2 * (H(Class) - H(Class | Attribute)) / H(Class) + H(Attribute).<br/>
47 * <p/>
48 <!-- globalinfo-end -->
49 *
50 <!-- options-start -->
51 * Valid options are: <p/>
52 *
53 * <pre> -M
54 *  treat missing values as a seperate value.</pre>
55 *
56 <!-- options-end -->
57 *
58 * @author Mark Hall (mhall@cs.waikato.ac.nz)
59 * @version $Revision: 5447 $
60 * @see Discretize
61 */
62public class SymmetricalUncertAttributeEval
63  extends ASEvaluation
64  implements AttributeEvaluator, OptionHandler {
65 
66  /** for serialization */
67  static final long serialVersionUID = -8096505776132296416L;
68
69  /** The training instances */
70  private Instances m_trainInstances;
71
72  /** The class index */
73  private int m_classIndex;
74
75  /** The number of attributes */
76  private int m_numAttribs;
77
78  /** The number of instances */
79  private int m_numInstances;
80
81  /** The number of classes */
82  private int m_numClasses;
83
84  /** Treat missing values as a seperate value */
85  private boolean m_missing_merge;
86
87  /**
88   * Returns a string describing this attribute evaluator
89   * @return a description of the evaluator suitable for
90   * displaying in the explorer/experimenter gui
91   */
92  public String globalInfo() {
93    return "SymmetricalUncertAttributeEval :\n\nEvaluates the worth of an attribute "
94      +"by measuring the symmetrical uncertainty with respect to the class. "
95      +"\n\n SymmU(Class, Attribute) = 2 * (H(Class) - H(Class | Attribute)) "
96      +"/ H(Class) + H(Attribute).\n";
97  }
98 
99  /**
100   * Constructor
101   */
102  public SymmetricalUncertAttributeEval () {
103    resetOptions();
104  }
105
106
107  /**
108   * Returns an enumeration describing the available options.
109   * @return an enumeration of all the available options.
110   **/
111  public Enumeration listOptions () {
112    Vector newVector = new Vector(1);
113    newVector.addElement(new Option("\ttreat missing values as a seperate " 
114                                    + "value.", "M", 0, "-M"));
115    return  newVector.elements();
116  }
117
118
119  /**
120   * Parses a given list of options. <p/>
121   *
122   <!-- options-start -->
123   * Valid options are: <p/>
124   *
125   * <pre> -M
126   *  treat missing values as a seperate value.</pre>
127   *
128   <!-- options-end -->
129   *
130   * @param options the list of options as an array of strings
131   * @throws Exception if an option is not supported
132   **/
133  public void setOptions (String[] options)
134    throws Exception {
135    resetOptions();
136    setMissingMerge(!(Utils.getFlag('M', options)));
137  }
138
139  /**
140   * Returns the tip text for this property
141   * @return tip text for this property suitable for
142   * displaying in the explorer/experimenter gui
143   */
144  public String missingMergeTipText() {
145    return "Distribute counts for missing values. Counts are distributed "
146      +"across other values in proportion to their frequency. Otherwise, "
147      +"missing is treated as a separate value.";
148  }
149
150  /**
151   * distribute the counts for missing values across observed values
152   *
153   * @param b true=distribute missing values.
154   */
155  public void setMissingMerge (boolean b) {
156    m_missing_merge = b;
157  }
158
159
160  /**
161   * get whether missing values are being distributed or not
162   *
163   * @return true if missing values are being distributed.
164   */
165  public boolean getMissingMerge () {
166    return  m_missing_merge;
167  }
168
169
170  /**
171   * Gets the current settings of WrapperSubsetEval.
172   * @return an array of strings suitable for passing to setOptions()
173   */
174  public String[] getOptions () {
175    String[] options = new String[1];
176    int current = 0;
177
178    if (!getMissingMerge()) {
179      options[current++] = "-M";
180    }
181
182    while (current < options.length) {
183      options[current++] = "";
184    }
185
186    return  options;
187  }
188
189  /**
190   * Returns the capabilities of this evaluator.
191   *
192   * @return            the capabilities of this evaluator
193   * @see               Capabilities
194   */
195  public Capabilities getCapabilities() {
196    Capabilities result = super.getCapabilities();
197    result.disableAll();
198   
199    // attributes
200    result.enable(Capability.NOMINAL_ATTRIBUTES);
201    result.enable(Capability.NUMERIC_ATTRIBUTES);
202    result.enable(Capability.DATE_ATTRIBUTES);
203    result.enable(Capability.MISSING_VALUES);
204   
205    // class
206    result.enable(Capability.NOMINAL_CLASS);
207    result.enable(Capability.MISSING_CLASS_VALUES);
208   
209    return result;
210  }
211
212  /**
213   * Initializes a symmetrical uncertainty attribute evaluator.
214   * Discretizes all attributes that are numeric.
215   *
216   * @param data set of instances serving as training data
217   * @throws Exception if the evaluator has not been
218   * generated successfully
219   */
220  public void buildEvaluator (Instances data)
221    throws Exception {
222
223    // can evaluator handle data?
224    getCapabilities().testWithFail(data);
225
226    m_trainInstances = data;
227    m_classIndex = m_trainInstances.classIndex();
228    m_numAttribs = m_trainInstances.numAttributes();
229    m_numInstances = m_trainInstances.numInstances();
230    Discretize disTransform = new Discretize();
231    disTransform.setUseBetterEncoding(true);
232    disTransform.setInputFormat(m_trainInstances);
233    m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
234    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
235  }
236
237
238  /**
239   * set options to default values
240   */
241  protected void resetOptions () {
242    m_trainInstances = null;
243    m_missing_merge = true;
244  }
245
246
247  /**
248   * evaluates an individual attribute by measuring the symmetrical
249   * uncertainty between it and the class.
250   *
251   * @param attribute the index of the attribute to be evaluated
252   * @return the uncertainty
253   * @throws Exception if the attribute could not be evaluated
254   */
255  public double evaluateAttribute (int attribute)
256    throws Exception {
257    int i, j, ii, jj;
258    int nnj, nni, ni, nj;
259    double sum = 0.0;
260    ni = m_trainInstances.attribute(attribute).numValues() + 1;
261    nj = m_numClasses + 1;
262    double[] sumi, sumj;
263    Instance inst;
264    double temp = 0.0;
265    sumi = new double[ni];
266    sumj = new double[nj];
267    double[][] counts = new double[ni][nj];
268    sumi = new double[ni];
269    sumj = new double[nj];
270
271    for (i = 0; i < ni; i++) {
272      sumi[i] = 0.0;
273
274      for (j = 0; j < nj; j++) {
275        sumj[j] = 0.0;
276        counts[i][j] = 0.0;
277      }
278    }
279
280    // Fill the contingency table
281    for (i = 0; i < m_numInstances; i++) {
282      inst = m_trainInstances.instance(i);
283
284      if (inst.isMissing(attribute)) {
285        ii = ni - 1;
286      }
287      else {
288        ii = (int)inst.value(attribute);
289      }
290
291      if (inst.isMissing(m_classIndex)) {
292        jj = nj - 1;
293      }
294      else {
295        jj = (int)inst.value(m_classIndex);
296      }
297
298      counts[ii][jj]++;
299    }
300
301    // get the row totals
302    for (i = 0; i < ni; i++) {
303      sumi[i] = 0.0;
304
305      for (j = 0; j < nj; j++) {
306        sumi[i] += counts[i][j];
307        sum += counts[i][j];
308      }
309    }
310
311    // get the column totals
312    for (j = 0; j < nj; j++) {
313      sumj[j] = 0.0;
314
315      for (i = 0; i < ni; i++) {
316        sumj[j] += counts[i][j];
317      }
318    }
319
320    // distribute missing counts
321    if (m_missing_merge && 
322        (sumi[ni-1] < m_numInstances) && 
323        (sumj[nj-1] < m_numInstances)) {
324      double[] i_copy = new double[sumi.length];
325      double[] j_copy = new double[sumj.length];
326      double[][] counts_copy = new double[sumi.length][sumj.length];
327
328      for (i = 0; i < ni; i++) {
329        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
330      }
331
332      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
333      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
334      double total_missing = (sumi[ni - 1] + sumj[nj - 1] 
335                              - counts[ni - 1][nj - 1]);
336
337      // do the missing i's
338      if (sumi[ni - 1] > 0.0) {
339        for (j = 0; j < nj - 1; j++) {
340          if (counts[ni - 1][j] > 0.0) {
341            for (i = 0; i < ni - 1; i++) {
342              temp = ((i_copy[i]/(sum - i_copy[ni - 1])) * 
343                      counts[ni - 1][j]);
344              counts[i][j] += temp;
345              sumi[i] += temp;
346            }
347
348            counts[ni - 1][j] = 0.0;
349          }
350        }
351      }
352
353      sumi[ni - 1] = 0.0;
354
355      // do the missing j's
356      if (sumj[nj - 1] > 0.0) {
357        for (i = 0; i < ni - 1; i++) {
358          if (counts[i][nj - 1] > 0.0) {
359            for (j = 0; j < nj - 1; j++) {
360              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
361              counts[i][j] += temp;
362              sumj[j] += temp;
363            }
364
365            counts[i][nj - 1] = 0.0;
366          }
367        }
368      }
369
370      sumj[nj - 1] = 0.0;
371
372      // do the both missing
373      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
374        for (i = 0; i < ni - 1; i++) {
375          for (j = 0; j < nj - 1; j++) {
376            temp = (counts_copy[i][j]/(sum - total_missing)) * 
377              counts_copy[ni - 1][nj - 1];
378            counts[i][j] += temp;
379            sumi[i] += temp;
380            sumj[j] += temp;
381          }
382        }
383
384        counts[ni - 1][nj - 1] = 0.0;
385      }
386    }
387
388    return  ContingencyTables.symmetricalUncertainty(counts);
389  }
390
391
392  /**
393   * Return a description of the evaluator
394   * @return description as a string
395   */
396  public String toString () {
397    StringBuffer text = new StringBuffer();
398
399    if (m_trainInstances == null) {
400      text.append("\tSymmetrical Uncertainty evaluator has not been built");
401    }
402    else {
403      text.append("\tSymmetrical Uncertainty Ranking Filter");
404      if (!m_missing_merge) {
405        text.append("\n\tMissing values treated as seperate");
406      }
407    }
408
409    text.append("\n");
410    return  text.toString();
411  }
412 
413  /**
414   * Returns the revision string.
415   *
416   * @return            the revision
417   */
418  public String getRevision() {
419    return RevisionUtils.extract("$Revision: 5447 $");
420  }
421
422  // ============
423  // Test method.
424  // ============
425  /**
426   * Main method for testing this class.
427   *
428   * @param argv should contain the following arguments:
429   * -t training file
430   */
431  public static void main (String[] argv) {
432    runEvaluator(new SymmetricalUncertAttributeEval(), argv);
433  }
434}
Note: See TracBrowser for help on using the repository browser.