source: src/main/java/weka/attributeSelection/SignificanceAttributeEval.java @ 6

Last change on this file since 6 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 16.4 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    SignificanceAttributeEval.java
19 *    Copyright (C) 2009 Adrian Pino
20 *    Copyright (C) 2009 University of Waikato, Hamilton, NZ
21 *
22 */
23package weka.attributeSelection;
24
25import java.util.ArrayList;
26import java.util.Enumeration;
27import java.util.List;
28import java.util.Vector;
29
30import weka.core.Capabilities;
31import weka.core.Instance;
32import weka.core.Instances;
33import weka.core.Option;
34import weka.core.OptionHandler;
35import weka.core.RevisionUtils;
36import weka.core.TechnicalInformation;
37import weka.core.TechnicalInformationHandler;
38import weka.core.Utils;
39import weka.core.Capabilities.Capability;
40import weka.core.TechnicalInformation.Field;
41import weka.core.TechnicalInformation.Type;
42import weka.filters.Filter;
43import weka.filters.supervised.attribute.Discretize;
44
45/**
46 <!-- globalinfo-start -->
47 * Significance :<br/>
48 * <br/>
49 * Evaluates the worth of an attribute by computing the Probabilistic Significance as a two-way function.<br/>
50 * (attribute-classes and classes-attribute association)<br/>
51 * <br/>
52 * For more information see:<br/>
53 * <br/>
54 * Amir Ahmad, Lipika Dey (2004). A feature selection technique for classificatory analysis.
55 * <p/>
56 <!-- globalinfo-end -->
57 *
58 <!-- options-start -->
59 * Valid options are: <p/>
60 *
61 * <pre> -M
62 *  treat missing values as a separate value.</pre>
63 *
64 <!-- options-end -->
65 *
66 <!-- technical-bibtex-start -->
67 * BibTeX:
68 * <pre>
69 * &#64;phdthesis{Ahmad2004,
70 *    author = {Amir Ahmad and Lipika Dey},
71 *    month = {October},
72 *    publisher = {ELSEVIER},
73 *    title = {A feature selection technique for classificatory analysis},
74 *    year = {2004}
75 * }
76 * </pre>
77 * <p/>
78 <!-- technical-bibtex-end -->
79 *
80 * @author Adrian Pino (apinoa@facinf.uho.edu.cu)
81 * @version $Revision: 5447 $
82 */
83public class SignificanceAttributeEval
84extends ASEvaluation
85implements AttributeEvaluator, OptionHandler, TechnicalInformationHandler {
86
87  /** for serialization */
88  static final long serialVersionUID = -8504656625598579926L;
89
90  /** The training instances */
91  private Instances m_trainInstances;
92
93  /** The class index */
94  private int m_classIndex;
95
96  /** The number of attributes */
97  private int m_numAttribs;
98
99  /** The number of instances */
100  private int m_numInstances;
101
102  /** The number of classes */
103  private int m_numClasses;
104
105  /** Merge missing values */
106  private boolean m_missing_merge;
107
108  /**
109   * Returns a string describing this attribute evaluator
110   * @return a description of the evaluator suitable for
111   * displaying in the explorer/experimenter gui
112   */
113  public String globalInfo() {
114    return "Significance :\n\nEvaluates the worth of an attribute "
115    +"by computing the Probabilistic Significance as a two-way function.\n"
116    +"(atributte-classes and classes-atribute association)\n\n"
117    + "For more information see:\n\n"
118    + getTechnicalInformation().toString();
119  }
120
121  /**
122   * Returns an instance of a TechnicalInformation object, containing
123   * detailed information about the technical background of this class,
124   * e.g., paper reference or book this class is based on.
125   *
126   * @return the technical information about this class
127   */
128  public TechnicalInformation getTechnicalInformation() {
129    TechnicalInformation        result;
130
131    result = new TechnicalInformation(Type.PHDTHESIS);
132    result.setValue(Field.AUTHOR, "Amir Ahmad and Lipika Dey");
133    result.setValue(Field.YEAR, "2004");
134    result.setValue(Field.MONTH, "October");
135    result.setValue(Field.TITLE, "A feature selection technique for classificatory analysis");
136    result.setValue(Field.PUBLISHER, "ELSEVIER");
137
138    return result;
139  }
140
141
142  /**
143   * Constructor
144   */
145  public SignificanceAttributeEval () {
146    resetOptions();
147  }
148
149
150  /**
151   * Returns an enumeration describing the available options.
152   * @return an enumeration of all the available options.
153   **/
154  public Enumeration listOptions () {
155    Vector newVector = new Vector(1);
156    newVector.addElement(new Option("\ttreat missing values as a separate "
157        + "value.", "M", 0, "-M"));
158    return  newVector.elements();
159  }
160
161
162  /**
163   * Parses a given list of options. <p/>
164   *
165   <!-- options-start -->
166   * Valid options are: <p/>
167   *
168   * <pre> -M
169   *  treat missing values as a separate value.</pre>
170   *
171   <!-- options-end -->
172   *
173   * @param options the list of options as an array of strings
174   * @throws Exception if an option is not supported
175   **/
176  public void setOptions (String[] options)
177  throws Exception {
178    resetOptions();
179    setMissingMerge(!(Utils.getFlag('M', options)));
180  }
181
182  /**
183   * Returns the tip text for this property
184   * @return tip text for this property suitable for
185   * displaying in the explorer/experimenter gui
186   */
187  public String missingMergeTipText() {
188    return "Distribute counts for missing values. Counts are distributed "
189    +"across other values in proportion to their frequency. Otherwise, "
190    +"missing is treated as a separate value.";
191  }
192
193  /**
194   * distribute the counts for missing values across observed values
195   *
196   * @param b true=distribute missing values.
197   */
198  public void setMissingMerge (boolean b) {
199    m_missing_merge = b;
200  }
201
202
203  /**
204   * get whether missing values are being distributed or not
205   *
206   * @return true if missing values are being distributed.
207   */
208  public boolean getMissingMerge () {
209    return  m_missing_merge;
210  }
211
212
213  /**
214   * Gets the current settings of WrapperSubsetEval.
215   * @return an array of strings suitable for passing to setOptions()
216   */
217  public String[] getOptions () {
218    String[] options = new String[1];
219    int current = 0;
220
221    if (!getMissingMerge()) {
222      options[current++] = "-M";
223    }
224
225    while (current < options.length) {
226      options[current++] = "";
227    }
228
229    return  options;
230  }
231
232  /**
233   * Returns the capabilities of this evaluator.
234   *
235   * @return the capabilities of this evaluator
236   * @see    Capabilities
237   */
238  public Capabilities getCapabilities() {
239    Capabilities result = super.getCapabilities();
240    result.disableAll();
241
242    // attributes
243    result.enable(Capability.NOMINAL_ATTRIBUTES);
244    result.enable(Capability.NUMERIC_ATTRIBUTES);
245    result.enable(Capability.DATE_ATTRIBUTES);
246    result.enable(Capability.MISSING_VALUES);
247
248    // class
249    result.enable(Capability.NOMINAL_CLASS);
250    result.enable(Capability.MISSING_CLASS_VALUES);
251
252    return result;
253  }
254
255  /**
256   * Initializes the Significance attribute evaluator.
257   * Discretizes all attributes that are numeric.
258   *
259   * @param data set of instances serving as training data
260   * @throws Exception if the evaluator has not been
261   * generated successfully
262   */
263  public void buildEvaluator (Instances data)
264  throws Exception {
265
266    // can evaluator handle data?
267    getCapabilities().testWithFail(data);
268
269    m_trainInstances = data;
270    m_classIndex = m_trainInstances.classIndex();
271    m_numAttribs = m_trainInstances.numAttributes();
272    m_numInstances = m_trainInstances.numInstances();
273    Discretize disTransform = new Discretize();
274    disTransform.setUseBetterEncoding(true);
275    disTransform.setInputFormat(m_trainInstances);
276    m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
277    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
278  }
279
280
281  /**
282   * reset options to default values
283   */
284  protected void resetOptions () {
285    m_trainInstances = null;
286    m_missing_merge = true;
287  }
288
289
290  /**
291   * evaluates an individual attribute by measuring the Significance
292   *
293   * @param attribute the index of the attribute to be evaluated
294   * @return the Significance of the attribute in the data base
295   * @throws Exception if the attribute could not be evaluated
296   */
297  public double evaluateAttribute (int attribute)
298  throws Exception {
299    int i, j, ii, jj;
300    int ni, nj;
301    double sum = 0.0;
302    ni = m_trainInstances.attribute(attribute).numValues() + 1;
303    nj = m_numClasses + 1;
304    double[] sumi, sumj;
305    Instance inst;
306    double temp = 0.0;
307    sumi = new double[ni];
308    sumj = new double[nj];
309    double[][] counts = new double[ni][nj];
310
311    for (i = 0; i < ni; i++) {
312      sumi[i] = 0.0;
313
314      for (j = 0; j < nj; j++) {
315        sumj[j] = 0.0;
316        counts[i][j] = 0.0;
317      }
318    }
319
320    // Fill the contingency table
321    for (i = 0; i < m_numInstances; i++) {
322      inst = m_trainInstances.instance(i);
323
324      if (inst.isMissing(attribute)) {
325        ii = ni - 1;
326      }
327      else {
328        ii = (int)inst.value(attribute);
329      }
330
331      if (inst.isMissing(m_classIndex)) {
332        jj = nj - 1;
333      }
334      else {
335        jj = (int)inst.value(m_classIndex);
336      }
337
338      counts[ii][jj]++;
339    }
340
341    // get the row totals
342    for (i = 0; i < ni; i++) {
343      sumi[i] = 0.0;
344
345      for (j = 0; j < nj; j++) {
346        sumi[i] += counts[i][j];
347        sum += counts[i][j];
348      }
349    }
350
351    // get the column totals
352    for (j = 0; j < nj; j++) {
353      sumj[j] = 0.0;
354
355      for (i = 0; i < ni; i++) {
356        sumj[j] += counts[i][j];
357      }
358    }
359
360
361    // distribute missing counts
362    if (m_missing_merge &&
363        (sumi[ni-1] < m_numInstances) &&
364        (sumj[nj-1] < m_numInstances)) {
365      double[] i_copy = new double[sumi.length];
366      double[] j_copy = new double[sumj.length];
367      double[][] counts_copy = new double[sumi.length][sumj.length];
368
369      for (i = 0; i < ni; i++) {
370        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
371      }
372
373      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
374      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
375      double total_missing = (sumi[ni - 1] + sumj[nj - 1] -
376          counts[ni - 1][nj - 1]);
377
378      // do the missing i's
379      if (sumi[ni - 1] > 0.0) {
380        for (j = 0; j < nj - 1; j++) {
381          if (counts[ni - 1][j] > 0.0) {
382            for (i = 0; i < ni - 1; i++) {
383              temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
384              counts[i][j] += temp;
385              sumi[i] += temp;
386            }
387
388            counts[ni - 1][j] = 0.0;
389          }
390        }
391      }
392
393      sumi[ni - 1] = 0.0;
394
395      // do the missing j's
396      if (sumj[nj - 1] > 0.0) {
397        for (i = 0; i < ni - 1; i++) {
398          if (counts[i][nj - 1] > 0.0) {
399            for (j = 0; j < nj - 1; j++) {
400              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
401              counts[i][j] += temp;
402              sumj[j] += temp;
403            }
404
405            counts[i][nj - 1] = 0.0;
406          }
407        }
408      }
409
410      sumj[nj - 1] = 0.0;
411
412      // do the both missing
413      if (counts[ni - 1][nj - 1] > 0.0  && total_missing != sum) {
414        for (i = 0; i < ni - 1; i++) {
415          for (j = 0; j < nj - 1; j++) {
416            temp = (counts_copy[i][j]/(sum - total_missing)) *
417            counts_copy[ni - 1][nj - 1];
418            counts[i][j] += temp;
419            sumi[i] += temp;
420            sumj[j] += temp;
421          }
422        }
423
424        counts[ni - 1][nj - 1] = 0.0;
425      }
426    }
427
428    /**Working on the ContingencyTables****/
429    double discriminatingPower = associationAttributeClasses(counts);
430    double separability = associationClassesAttribute(counts);
431    /*...*/
432
433
434    return  discriminatingPower + separability / 2;
435  }
436
437  /**
438   * evaluates an individual attribute by measuring the attribute-classes
439   * association
440   *
441   * @param counts the Contingency table where are the frecuency counts values
442   * @return the discriminating power of the attribute
443   */
444  public double associationAttributeClasses(double[][] counts){
445
446    List<Integer> supportSet = new ArrayList<Integer>();
447    List<Integer> not_supportSet = new ArrayList<Integer>();
448
449    double discriminatingPower = 0;
450
451
452    int numValues = counts.length;
453    int numClasses = counts[0].length;
454
455    int total = 0;
456
457    double[] sumRows = new double[numValues];
458    double[] sumCols = new double[numClasses];
459
460    // get the row totals
461    for (int i = 0; i < numValues; i++) {
462      sumRows[i] = 0.0;
463
464      for (int j = 0; j < numClasses; j++) {
465        sumRows[i] += counts[i][j];
466        total += counts[i][j];
467      }
468    }
469
470    // get the column totals
471    for (int j = 0; j < numClasses; j++) {
472      sumCols[j] = 0.0;
473
474      for (int i = 0; i < numValues; i++) {
475        sumCols[j] += counts[i][j];
476      }
477    }
478
479    for (int i = 0; i < numClasses; i++) {
480      for (int j = 0; j < numValues; j++) {
481
482        //Computing Conditional Probability P(Clasei | Valuej)
483        double numerator1 = counts[j][i];
484        double denominator1 = sumRows[j];
485        double result1;
486
487        if(denominator1 != 0)
488          result1 = numerator1/denominator1;
489        else
490          result1 = 0;
491
492        //Computing Conditional Probability P(Clasei | ^Valuej)
493        double numerator2 = sumCols[i] - counts[j][i];
494        double denominator2 = total - sumRows[j];
495        double result2;
496
497        if(denominator2 != 0)
498          result2 = numerator2/denominator2;
499        else
500          result2 = 0;
501
502
503        if(result1 > result2){
504          supportSet.add (i);
505          discriminatingPower +=result1;
506        }
507        else{
508          not_supportSet.add (i);
509          discriminatingPower +=result2;
510        }
511      }
512
513    }
514
515    return discriminatingPower/numValues - 1.0;
516  }
517
518  /**
519   * evaluates an individual attribute by measuring the classes-attribute
520   * association
521   *
522   * @param counts the Contingency table where are the frecuency counts values
523   * @return the separability power of the classes
524   */
525  public double associationClassesAttribute(double[][] counts){
526
527    List<Integer> supportSet = new ArrayList<Integer>();
528    List<Integer> not_supportSet = new ArrayList<Integer>();
529
530    double separability = 0;
531
532
533    int numValues = counts.length;
534    int numClasses = counts[0].length;
535
536    int total = 0;
537
538    double[] sumRows = new double[numValues];
539    double[] sumCols = new double[numClasses];
540
541    // get the row totals
542    for (int i = 0; i < numValues; i++) {
543      sumRows[i] = 0.0;
544
545      for (int j = 0; j < numClasses; j++) {
546        sumRows[i] += counts[i][j];
547        total += counts[i][j];
548      }
549    }
550
551    // get the column totals
552    for (int j = 0; j < numClasses; j++) {
553      sumCols[j] = 0.0;
554
555      for (int i = 0; i < numValues; i++) {
556        sumCols[j] += counts[i][j];
557      }
558    }
559
560    for (int i = 0; i < numValues; i++) {
561      for (int j = 0; j < numClasses; j++) {
562
563        //Computing Conditional Probability P(Valuei | Clasej)
564        double numerator1 = counts[i][j];
565        double denominator1 = sumCols[j];
566        double result1;
567
568        if(denominator1 != 0)
569          result1 = numerator1/denominator1;
570        else
571          result1 = 0;
572
573        //Computing Conditional Probability P(Valuei | ^Clasej)
574        double numerator2 = sumRows[i] - counts[i][j];
575        double denominator2 = total - sumCols[j];
576        double result2;
577
578        if(denominator2 != 0)
579          result2 = numerator2/denominator2;
580        else
581          result2 = 0;
582
583
584        if(result1 > result2){
585          supportSet.add (i);
586          separability +=result1;
587        }
588        else{
589          not_supportSet.add (i);
590          separability +=result2;
591        }
592      }
593
594    }
595
596    return separability/numClasses - 1.0;
597  }
598
599
600  /**
601   * Return a description of the evaluator
602   * @return description as a string
603   */
604  public String toString () {
605    StringBuffer text = new StringBuffer();
606
607    if (m_trainInstances == null) {
608      text.append("\tSignificance evaluator has not been built");
609    }
610    else {
611      text.append("\tSignificance feature evaluator");
612
613      if (!m_missing_merge) {
614        text.append("\n\tMissing values treated as seperate");
615      }
616    }
617
618    text.append("\n");
619    return  text.toString();
620  }
621
622  /**
623   * Returns the revision string.
624   *
625   * @return            the revision
626   */
627  public String getRevision() {
628    return RevisionUtils.extract("$Revision: 5447 $");
629  }
630
631  /**
632   * Main method for testing this class.
633   *
634   * @param args the options
635   */
636  public static void main (String[] args) {
637    runEvaluator(new SignificanceAttributeEval(), args);
638  }
639}
640
Note: See TracBrowser for help on using the repository browser.