source: src/main/java/weka/attributeSelection/SymmetricalUncertAttributeSetEval.java @ 19

Last change on this file since 19 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 21.4 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17 /*
18 *    RELEASE INFORMATION (December 27, 2004)
19 *   
20 *    FCBF algorithm:
21 *      Template obtained from Weka
22 *      Developed for Weka by Zheng Alan Zhao   
23 *      December 27, 2004
24 *
25 *    FCBF algorithm is a feature selection method based on Symmetrical Uncertainty Measurement for
26 *    relevance redundancy analysis. The details of FCBF algorithm are in:
27 *
28 <!-- technical-plaintext-start -->
29 * Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003.
30 <!-- technical-plaintext-end -->
31 *   
32 *   
33 *   
34 *    CONTACT INFORMATION
35 *   
36 *    For algorithm implementation:
37 *    Zheng Zhao: zhaozheng at asu.edu
38 *     
39 *    For the algorithm:
40 *    Lei Yu: leiyu at asu.edu
41 *    Huan Liu: hliu at asu.edu
42 *     
43 *    Data Mining and Machine Learning Lab
44 *    Computer Science and Engineering Department
45 *    Fulton School of Engineering
46 *    Arizona State University
47 *    Tempe, AZ 85287
48 *
49 *    SymmetricalUncertAttributeSetEval.java
50 *
51 *    Copyright (C) 2004 Data Mining and Machine Learning Lab,
52 *                       Computer Science and Engineering Department,
53 *                       Fulton School of Engineering,
54 *                       Arizona State University
55 *
56 */
57
58package weka.attributeSelection;
59
60import weka.core.Capabilities;
61import weka.core.ContingencyTables;
62import weka.core.Instance;
63import weka.core.Instances;
64import weka.core.Option;
65import weka.core.OptionHandler;
66import weka.core.RevisionUtils;
67import weka.core.TechnicalInformation;
68import weka.core.TechnicalInformationHandler;
69import weka.core.Utils;
70import weka.core.Capabilities.Capability;
71import weka.core.TechnicalInformation.Field;
72import weka.core.TechnicalInformation.Type;
73import weka.filters.Filter;
74import weka.filters.supervised.attribute.Discretize;
75
76import java.util.Enumeration;
77import java.util.Vector;
78
79/**
80 <!-- globalinfo-start -->
81 * SymmetricalUncertAttributeSetEval :<br/>
82 * <br/>
83 * Evaluates the worth of a set attributes by measuring the symmetrical uncertainty with respect to another set of attributes. <br/>
84 * <br/>
85 *  SymmU(AttributeSet2, AttributeSet1) = 2 * (H(AttributeSet2) - H(AttributeSet1 | AttributeSet2)) / H(AttributeSet2) + H(AttributeSet1).<br/>
86 * <br/>
87 * For more information see:<br/>
88 * <br/>
89 * Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003.
90 * <p/>
91 <!-- globalinfo-end -->
92 *
93 <!-- technical-bibtex-start -->
94 * BibTeX:
95 * <pre>
96 * &#64;inproceedings{Yu2003,
97 *    author = {Lei Yu and Huan Liu},
98 *    booktitle = {Proceedings of the Twentieth International Conference on Machine Learning},
99 *    pages = {856-863},
100 *    publisher = {AAAI Press},
101 *    title = {Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution},
102 *    year = {2003}
103 * }
104 * </pre>
105 * <p/>
106 <!-- technical-bibtex-end -->
107 *
108 <!-- options-start -->
109 * Valid options are: <p/>
110 *
111 * <pre> -M
112 *  treat missing values as a seperate value.</pre>
113 *
114 <!-- options-end -->
115 *
116 * @author Zheng Zhao: zhaozheng at asu.edu
117 * @version $Revision: 5447 $
118 * @see Discretize
119 */
120public class SymmetricalUncertAttributeSetEval
121  extends AttributeSetEvaluator
122  implements OptionHandler, TechnicalInformationHandler {
123 
124  /** for serialization */
125  static final long serialVersionUID = 8351377335495873202L;
126
127  /** The training instances */
128  private Instances m_trainInstances;
129
130  /** The class index */
131  private int m_classIndex;
132
133  /** The number of attributes */
134  private int m_numAttribs;
135
136  /** The number of instances */
137  private int m_numInstances;
138
139  /** The number of classes */
140  private int m_numClasses;
141
142  /** Treat missing values as a seperate value */
143  private boolean m_missing_merge;
144
145  /**
146   * Returns a string describing this attribute evaluator
147   * @return a description of the evaluator suitable for
148   * displaying in the explorer/experimenter gui
149   */
150  public String globalInfo() {
151    return "SymmetricalUncertAttributeSetEval :\n\nEvaluates the worth of a set attributes "
152      +"by measuring the symmetrical uncertainty with respect to another set of attributes. "
153      +"\n\n SymmU(AttributeSet2, AttributeSet1) = 2 * (H(AttributeSet2) - H(AttributeSet1 | AttributeSet2)) "
154      +"/ H(AttributeSet2) + H(AttributeSet1).\n\n"
155      + "For more information see:\n\n"
156      + getTechnicalInformation().toString();
157  }
158
159  /**
160   * Returns an instance of a TechnicalInformation object, containing
161   * detailed information about the technical background of this class,
162   * e.g., paper reference or book this class is based on.
163   *
164   * @return the technical information about this class
165   */
166  public TechnicalInformation getTechnicalInformation() {
167    TechnicalInformation        result;
168   
169    result = new TechnicalInformation(Type.INPROCEEDINGS);
170    result.setValue(Field.AUTHOR, "Lei Yu and Huan Liu");
171    result.setValue(Field.TITLE, "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution");
172    result.setValue(Field.BOOKTITLE, "Proceedings of the Twentieth International Conference on Machine Learning");
173    result.setValue(Field.YEAR, "2003");
174    result.setValue(Field.PAGES, "856-863");
175    result.setValue(Field.PUBLISHER, "AAAI Press");
176   
177    return result;
178  }
179
180  /**
181   * Constructor
182   */
183  public SymmetricalUncertAttributeSetEval () {
184    resetOptions();
185  }
186
187
188  /**
189   * Returns an enumeration describing the available options.
190   * @return an enumeration of all the available options.
191   **/
192  public Enumeration listOptions () {
193    Vector newVector = new Vector(1);
194    newVector.addElement(new Option("\ttreat missing values as a seperate "
195                                    + "value.", "M", 0, "-M"));
196    return  newVector.elements();
197  }
198
199
200  /**
201   * Parses a given list of options. <p/>
202   *
203   <!-- options-start -->
204   * Valid options are: <p/>
205   *
206   * <pre> -M
207   *  treat missing values as a seperate value.</pre>
208   *
209   <!-- options-end -->
210   *
211   * @param options the list of options as an array of strings
212   * @throws Exception if an option is not supported
213   */
214  public void setOptions (String[] options)
215    throws Exception {
216    resetOptions();
217    setMissingMerge(!(Utils.getFlag('M', options)));
218  }
219
220  /**
221   * Returns the tip text for this property
222   * @return tip text for this property suitable for
223   * displaying in the explorer/experimenter gui
224   */
225  public String missingMergeTipText() {
226    return "Distribute counts for missing values. Counts are distributed "
227      +"across other values in proportion to their frequency. Otherwise, "
228      +"missing is treated as a separate value.";
229  }
230
231  /**
232   * distribute the counts for missing values across observed values
233   *
234   * @param b true=distribute missing values.
235   */
236  public void setMissingMerge (boolean b) {
237    m_missing_merge = b;
238  }
239
240
241  /**
242   * get whether missing values are being distributed or not
243   *
244   * @return true if missing values are being distributed.
245   */
246  public boolean getMissingMerge () {
247    return  m_missing_merge;
248  }
249
250
251  /**
252   * Gets the current settings of WrapperSubsetEval.
253   * @return an array of strings suitable for passing to setOptions()
254   */
255  public String[] getOptions () {
256    String[] options = new String[1];
257    int current = 0;
258
259    if (!getMissingMerge()) {
260      options[current++] = "-M";
261    }
262
263    while (current < options.length) {
264      options[current++] = "";
265    }
266
267    return  options;
268  }
269
270  /**
271   * Returns the capabilities of this evaluator.
272   *
273   * @return            the capabilities of this evaluator
274   * @see               Capabilities
275   */
276  public Capabilities getCapabilities() {
277    Capabilities result = super.getCapabilities();
278    result.disableAll();
279   
280    // attributes
281    result.enable(Capability.NOMINAL_ATTRIBUTES);
282    result.enable(Capability.NUMERIC_ATTRIBUTES);
283    result.enable(Capability.DATE_ATTRIBUTES);
284    result.enable(Capability.MISSING_VALUES);
285   
286    // class
287    result.enable(Capability.NOMINAL_CLASS);
288    result.enable(Capability.MISSING_CLASS_VALUES);
289   
290    return result;
291  }
292
293  /**
294   * Initializes a symmetrical uncertainty attribute evaluator.
295   * Discretizes all attributes that are numeric.
296   *
297   * @param data set of instances serving as training data
298   * @throws Exception if the evaluator has not been
299   * generated successfully
300   */
301  public void buildEvaluator (Instances data)
302    throws Exception {
303
304    // can evaluator handle data?
305    getCapabilities().testWithFail(data);
306
307    m_trainInstances = data;
308    m_classIndex = m_trainInstances.classIndex();
309    m_numAttribs = m_trainInstances.numAttributes();
310    m_numInstances = m_trainInstances.numInstances();
311    Discretize disTransform = new Discretize();
312    disTransform.setUseBetterEncoding(true);
313    disTransform.setInputFormat(m_trainInstances);
314    m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
315    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
316  }
317
318
319  /**
320   * set options to default values
321   */
322  protected void resetOptions () {
323    m_trainInstances = null;
324    m_missing_merge = true;
325  }
326
327  /**
328   * evaluates an individual attribute by measuring the symmetrical
329   * uncertainty between it and the class.
330   *
331   * @param attribute the index of the attribute to be evaluated
332   * @return the uncertainty
333   * @throws Exception if the attribute could not be evaluated
334   */
335  public double evaluateAttribute (int attribute)
336    throws Exception {
337    int i, j, ii, jj;
338    int ni, nj;
339    double sum = 0.0;
340    ni = m_trainInstances.attribute(attribute).numValues() + 1;
341    nj = m_numClasses + 1;
342    double[] sumi, sumj;
343    Instance inst;
344    double temp = 0.0;
345    sumi = new double[ni];
346    sumj = new double[nj];
347    double[][] counts = new double[ni][nj];
348    sumi = new double[ni];
349    sumj = new double[nj];
350
351    for (i = 0; i < ni; i++) {
352      sumi[i] = 0.0;
353
354      for (j = 0; j < nj; j++) {
355        sumj[j] = 0.0;
356        counts[i][j] = 0.0;
357      }
358    }
359
360    // Fill the contingency table
361    for (i = 0; i < m_numInstances; i++) {
362      inst = m_trainInstances.instance(i);
363
364      if (inst.isMissing(attribute)) {
365        ii = ni - 1;
366      }
367      else {
368        ii = (int)inst.value(attribute);
369      }
370
371      if (inst.isMissing(m_classIndex)) {
372        jj = nj - 1;
373      }
374      else {
375        jj = (int)inst.value(m_classIndex);
376      }
377
378      counts[ii][jj]++;
379    }
380
381    // get the row totals
382    for (i = 0; i < ni; i++) {
383      sumi[i] = 0.0;
384
385      for (j = 0; j < nj; j++) {
386        //there are how many happen of a special feature value
387        sumi[i] += counts[i][j];
388        sum += counts[i][j];
389      }
390    }
391
392    // get the column totals
393    for (j = 0; j < nj; j++) {
394      sumj[j] = 0.0;
395
396      for (i = 0; i < ni; i++) {
397        //a class value include how many instance.
398        sumj[j] += counts[i][j];
399      }
400    }
401
402    // distribute missing counts
403    if (m_missing_merge &&
404        (sumi[ni-1] < m_numInstances) &&
405        (sumj[nj-1] < m_numInstances)) {
406      double[] i_copy = new double[sumi.length];
407      double[] j_copy = new double[sumj.length];
408      double[][] counts_copy = new double[sumi.length][sumj.length];
409
410      for (i = 0; i < ni; i++) {
411        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
412      }
413
414      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
415      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
416      double total_missing = (sumi[ni - 1] + sumj[nj - 1]
417                              - counts[ni - 1][nj - 1]);
418
419      // do the missing i's
420      if (sumi[ni - 1] > 0.0) { //sumi[ni - 1]: missing value contains how many values.
421        for (j = 0; j < nj - 1; j++) {
422          if (counts[ni - 1][j] > 0.0) {
423            for (i = 0; i < ni - 1; i++) {
424              temp = ((i_copy[i]/(sum - i_copy[ni - 1])) *
425                      counts[ni - 1][j]);
426              counts[i][j] += temp; //according to the probability of value i we distribute account of the missing degree of a class lable to it
427              sumi[i] += temp;
428            }
429
430            counts[ni - 1][j] = 0.0;
431          }
432        }
433      }
434
435      sumi[ni - 1] = 0.0;
436
437      // do the missing j's
438      if (sumj[nj - 1] > 0.0) {
439        for (i = 0; i < ni - 1; i++) {
440          if (counts[i][nj - 1] > 0.0) {
441            for (j = 0; j < nj - 1; j++) {
442              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
443              counts[i][j] += temp;
444              sumj[j] += temp;
445            }
446
447            counts[i][nj - 1] = 0.0;
448          }
449        }
450      }
451
452      sumj[nj - 1] = 0.0;
453
454      // do the both missing
455      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
456        for (i = 0; i < ni - 1; i++) {
457          for (j = 0; j < nj - 1; j++) {
458            temp = (counts_copy[i][j]/(sum - total_missing)) *
459              counts_copy[ni - 1][nj - 1];
460            counts[i][j] += temp;
461            sumi[i] += temp;
462            sumj[j] += temp;
463          }
464        }
465
466        counts[ni - 1][nj - 1] = 0.0;
467      }
468    }
469
470    return  ContingencyTables.symmetricalUncertainty(counts);
471  }
472
473  /**
474   * calculate symmetrical uncertainty between sets of attributes
475   *
476   * @param attributes the indexes of the attributes
477   * @param classAttributes the indexes of the attributes whose combination will
478   *                         be used as class label
479   * @return the uncertainty
480   * @throws Exception if the attribute could not be evaluated
481   */
482  public double evaluateAttribute (int[] attributes, int[] classAttributes)
483    throws Exception {
484
485    int i, j;     //variable for looping.
486    int p;     //variable for looping.
487    int ii, jj;   //specifying the position in the contingency table.
488    int nnj, nni; //counting base for attributes[].
489    int ni, nj;   //the nubmer of rows and columns in the ContingencyTables.
490
491    double sum = 0.0;
492    boolean b_missing_attribute = false;
493    boolean b_missing_classAtrribute = false;
494
495    if(attributes.length==0)
496    {
497      throw new Exception("the parameter attributes[] is empty;SEQ:W-FS-Eval-SUAS-001");
498    }
499    if(classAttributes.length==0)
500    {
501      throw new Exception("the parameter classAttributes[] is empty;SEQ:W-FS-Eval-SUAS-002");
502    }
503
504    /*calculate the number of the rows in ContingencyTable*/
505    ni = m_trainInstances.attribute(attributes[0]).numValues();
506    if (ni == 0)
507    {
508      throw new Exception("an attribute is empty;SEQ:W-FS-Eval-SUAS-003;"+1);
509    }
510
511    for (i = 1;i<attributes.length;i++)
512    {
513      if (m_trainInstances.attribute(attributes[i]).numValues() == 0)
514      {
515        throw new Exception("an attribute is empty;SEQ:W-FS-Eval-SUAS-003;" +
516                            (i+1));
517      }
518      ni = ni*m_trainInstances.attribute(attributes[i]).numValues();
519    }
520    ni = ni+1;
521
522    /*calculate the number of the colums in the ContingencyTable*/
523    nj = m_trainInstances.attribute(classAttributes[0]).numValues();
524    if (nj == 0)
525    {
526      throw new Exception("the a classAttribute is empty;SEQ:W-FS-Eval-SUAS-004;"+1);
527    }
528
529    for (i = 1;i<classAttributes.length;i++)
530    {
531      if (m_trainInstances.attribute(classAttributes[i]).numValues() == 0)
532      {
533        throw new Exception("the a classAttribute is empty;SEQ:W-FS-Eval-SUAS-004;" +
534                            (i+1));
535      }
536      nj = nj*m_trainInstances.attribute(classAttributes[i]).numValues();
537    }
538    nj = nj+1;
539
540    double[] sumi, sumj;
541    Instance inst;
542    double temp = 0.0;
543    sumi = new double[ni];
544    sumj = new double[nj];
545    double[][] counts = new double[ni][nj];
546    sumi = new double[ni];
547    sumj = new double[nj];
548
549    for (i = 0; i < ni; i++) {
550      sumi[i] = 0.0;
551
552      for (j = 0; j < nj; j++) {
553        sumj[j] = 0.0;
554        counts[i][j] = 0.0;
555      }
556    }
557
558    // Fill the contingency table
559    for (i = 0; i < m_numInstances; i++) {
560      inst = m_trainInstances.instance(i);
561
562      b_missing_attribute = false;
563      b_missing_classAtrribute = false;
564
565      /*get row position in contingency table*/
566      nni = 1;
567      ii = 0;
568      for (p=attributes.length-1; p>=0; p--)
569      {
570        if (inst.isMissing(attributes[p])) {
571           b_missing_attribute = true;
572        }
573        ii = ((int)inst.value(attributes[p])*nni)+ii;
574        if (p<attributes.length-1){
575          nni = nni * (m_trainInstances.attribute(attributes[p]).numValues());
576        }
577        else {
578          nni = m_trainInstances.attribute(attributes[p]).numValues();
579        }
580      }
581      if (b_missing_attribute) {
582        ii = ni-1;
583      }
584
585      /*get colum position in contingency table*/
586      nnj = 1;
587      jj = 0;
588      for (p=classAttributes.length-1; p>=0; p--)
589      {
590        if (inst.isMissing(classAttributes[p])) {
591           b_missing_classAtrribute = true;
592        }
593        jj = ((int)inst.value(classAttributes[p])*nnj)+jj;
594        if (p<attributes.length-1){
595          nnj = nnj * (m_trainInstances.attribute(classAttributes[p]).numValues());
596        }
597        else {
598          nnj = m_trainInstances.attribute(classAttributes[p]).numValues();
599        }
600      }
601      if (b_missing_classAtrribute) {
602        jj = nj-1;
603      }
604
605      counts[ii][jj]++;
606    }
607
608    // get the row totals
609    for (i = 0; i < ni; i++) {
610      sumi[i] = 0.0;
611
612      for (j = 0; j < nj; j++) {
613        //there are how many happen of a special feature value
614        sumi[i] += counts[i][j];
615        sum += counts[i][j];
616      }
617    }
618
619    // get the column totals
620    for (j = 0; j < nj; j++) {
621      sumj[j] = 0.0;
622
623      for (i = 0; i < ni; i++) {
624        //a class value include how many instance.
625        sumj[j] += counts[i][j];
626      }
627    }
628
629    // distribute missing counts
630    if (m_missing_merge &&
631        (sumi[ni-1] < m_numInstances) &&
632        (sumj[nj-1] < m_numInstances)) {
633      double[] i_copy = new double[sumi.length];
634      double[] j_copy = new double[sumj.length];
635      double[][] counts_copy = new double[sumi.length][sumj.length];
636
637      for (i = 0; i < ni; i++) {
638        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
639      }
640
641      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
642      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
643      double total_missing = (sumi[ni - 1] + sumj[nj - 1]
644                              - counts[ni - 1][nj - 1]);
645
646      // do the missing i's
647      if (sumi[ni - 1] > 0.0) { //sumi[ni - 1]: missing value contains how many values.
648        for (j = 0; j < nj - 1; j++) {
649          if (counts[ni - 1][j] > 0.0) {
650            for (i = 0; i < ni - 1; i++) {
651              temp = ((i_copy[i]/(sum - i_copy[ni - 1])) *
652                      counts[ni - 1][j]);
653              counts[i][j] += temp; //according to the probability of value i we distribute account of the missing degree of a class lable to it
654              sumi[i] += temp;
655            }
656
657            counts[ni - 1][j] = 0.0;
658          }
659        }
660      }
661
662      sumi[ni - 1] = 0.0;
663
664      // do the missing j's
665      if (sumj[nj - 1] > 0.0) {
666        for (i = 0; i < ni - 1; i++) {
667          if (counts[i][nj - 1] > 0.0) {
668            for (j = 0; j < nj - 1; j++) {
669              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
670              counts[i][j] += temp;
671              sumj[j] += temp;
672            }
673
674            counts[i][nj - 1] = 0.0;
675          }
676        }
677      }
678
679      sumj[nj - 1] = 0.0;
680
681      // do the both missing
682      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
683        for (i = 0; i < ni - 1; i++) {
684          for (j = 0; j < nj - 1; j++) {
685            temp = (counts_copy[i][j]/(sum - total_missing)) *
686              counts_copy[ni - 1][nj - 1];
687            counts[i][j] += temp;
688            sumi[i] += temp;
689            sumj[j] += temp;
690          }
691        }
692
693        counts[ni - 1][nj - 1] = 0.0;
694      }
695    }
696
697    return  ContingencyTables.symmetricalUncertainty(counts);
698  }
699
700
701  /**
702   * Return a description of the evaluator
703   * @return description as a string
704   */
705  public String toString () {
706    StringBuffer text = new StringBuffer();
707
708    if (m_trainInstances == null) {
709      text.append("\tSymmetrical Uncertainty evaluator has not been built");
710    }
711    else {
712      text.append("\tSymmetrical Uncertainty Ranking Filter");
713      if (!m_missing_merge) {
714        text.append("\n\tMissing values treated as seperate");
715      }
716    }
717
718    text.append("\n");
719    return  text.toString();
720  }
721 
722  /**
723   * Returns the revision string.
724   *
725   * @return            the revision
726   */
727  public String getRevision() {
728    return RevisionUtils.extract("$Revision: 5447 $");
729  }
730
731  // ============
732  // Test method.
733  // ============
734  /**
735   * Main method for testing this class.
736   *
737   * @param argv should contain the following arguments:
738   * -t training file
739   */
740  public static void main (String[] argv) {
741    runEvaluator(new SymmetricalUncertAttributeSetEval(), argv);
742  }
743}
Note: See TracBrowser for help on using the repository browser.