source: src/main/java/weka/attributeSelection/ConsistencySubsetEval.java @ 5

Last change on this file since 5 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 14.1 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    ConsistencySubsetEval.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package  weka.attributeSelection;
24
25import weka.core.Capabilities;
26import weka.core.Instance;
27import weka.core.Instances;
28import weka.core.RevisionHandler;
29import weka.core.RevisionUtils;
30import weka.core.TechnicalInformation;
31import weka.core.TechnicalInformationHandler;
32import weka.core.Utils;
33import weka.core.Capabilities.Capability;
34import weka.core.TechnicalInformation.Field;
35import weka.core.TechnicalInformation.Type;
36import weka.filters.Filter;
37import weka.filters.supervised.attribute.Discretize;
38
39import java.io.Serializable;
40import java.util.BitSet;
41import java.util.Enumeration;
42import java.util.Hashtable;
43
44/**
45 <!-- globalinfo-start -->
46 * ConsistencySubsetEval :<br/>
47 * <br/>
48 * Evaluates the worth of a subset of attributes by the level of consistency in the class values when the training instances are projected onto the subset of attributes. <br/>
49 * <br/>
50 * Consistency of any subset can never be lower than that of the full set of attributes, hence the usual practice is to use this subset evaluator in conjunction with a Random or Exhaustive search which looks for the smallest subset with consistency equal to that of the full set of attributes.<br/>
51 * <br/>
52 * For more information see:<br/>
53 * <br/>
54 * H. Liu, R. Setiono: A probabilistic approach to feature selection - A filter solution. In: 13th International Conference on Machine Learning, 319-327, 1996.
55 * <p/>
56 <!-- globalinfo-end -->
57 *
58 <!-- technical-bibtex-start -->
59 * BibTeX:
60 * <pre>
61 * &#64;inproceedings{Liu1996,
62 *    author = {H. Liu and R. Setiono},
63 *    booktitle = {13th International Conference on Machine Learning},
64 *    pages = {319-327},
65 *    title = {A probabilistic approach to feature selection - A filter solution},
66 *    year = {1996}
67 * }
68 * </pre>
69 * <p/>
70 <!-- technical-bibtex-end -->
71 *
72 * @author Mark Hall (mhall@cs.waikato.ac.nz)
73 * @version $Revision: 5447 $
74 * @see Discretize
75 */
76public class ConsistencySubsetEval 
77  extends ASEvaluation
78  implements SubsetEvaluator,
79             TechnicalInformationHandler {
80 
81  /** for serialization */
82  static final long serialVersionUID = -2880323763295270402L;
83 
84  /** training instances */
85  private Instances m_trainInstances;
86
87  /** class index */
88  private int m_classIndex;
89
90  /** number of attributes in the training data */
91  private int m_numAttribs;
92
93  /** number of instances in the training data */
94  private int m_numInstances;
95
96  /** Discretise numeric attributes */
97  private Discretize m_disTransform;
98
99  /** Hash table for evaluating feature subsets */
100  private Hashtable m_table;
101
102  /**
103   * Class providing keys to the hash table.
104   */
105  public class hashKey 
106    implements Serializable, RevisionHandler {
107   
108    /** for serialization */
109    static final long serialVersionUID = 6144138512017017408L;
110   
111    /** Array of attribute values for an instance */
112    private double [] attributes;
113   
114    /** True for an index if the corresponding attribute value is missing. */
115    private boolean [] missing;
116
117    /** The key */
118    private int key;
119
120    /**
121     * Constructor for a hashKey
122     *
123     * @param t an instance from which to generate a key
124     * @param numAtts the number of attributes
125     * @throws Exception if something goes wrong
126     */
127    public hashKey(Instance t, int numAtts) throws Exception {
128
129      int i;
130      int cindex = t.classIndex();
131
132      key = -999;
133      attributes = new double [numAtts];
134      missing = new boolean [numAtts];
135      for (i=0;i<numAtts;i++) {
136        if (i == cindex) {
137          missing[i] = true;
138        } else {
139          if ((missing[i] = t.isMissing(i)) == false) {
140            attributes[i] = t.value(i);
141          }
142        }
143      }
144    }
145
146    /**
147     * Convert a hash entry to a string
148     *
149     * @param t the set of instances
150     * @param maxColWidth width to make the fields
151     * @return the hash entry as string
152     */
153    public String toString(Instances t, int maxColWidth) {
154
155      int i;
156      int cindex = t.classIndex();
157      StringBuffer text = new StringBuffer();
158     
159      for (i=0;i<attributes.length;i++) {
160        if (i != cindex) {
161          if (missing[i]) {
162            text.append("?");
163            for (int j=0;j<maxColWidth;j++) {
164              text.append(" ");
165            }
166          } else {
167            String ss = t.attribute(i).value((int)attributes[i]);
168            StringBuffer sb = new StringBuffer(ss);
169           
170            for (int j=0;j < (maxColWidth-ss.length()+1); j++) {
171                sb.append(" ");
172            }
173            text.append(sb);
174          }
175        }
176      }
177      return text.toString();
178    }
179
180    /**
181     * Constructor for a hashKey
182     *
183     * @param t an array of feature values
184     */
185    public hashKey(double [] t) {
186
187      int i;
188      int l = t.length;
189
190      key = -999;
191      attributes = new double [l];
192      missing = new boolean [l];
193      for (i=0;i<l;i++) {
194        if (t[i] == Double.MAX_VALUE) {
195          missing[i] = true;
196        } else {
197          missing[i] = false;
198          attributes[i] = t[i];
199        }
200      }
201    }
202   
203    /**
204     * Calculates a hash code
205     *
206     * @return the hash code as an integer
207     */
208    public int hashCode() {
209
210      int hv = 0;
211     
212      if (key != -999)
213        return key;
214      for (int i=0;i<attributes.length;i++) {
215        if (missing[i]) {
216          hv += (i*13);
217        } else {
218          hv += (i * 5 * (attributes[i]+1));
219        }
220      }
221      if (key == -999) {
222        key = hv;
223      }
224      return hv;
225    }
226
227    /**
228     * Tests if two instances are equal
229     *
230     * @param b a key to compare with
231     * @return true if the objects are equal
232     */
233    public boolean equals(Object b) {
234     
235      if ((b == null) || !(b.getClass().equals(this.getClass()))) {
236        return false;
237      }
238      boolean ok = true;
239      boolean l;
240      if (b instanceof hashKey) {
241        hashKey n = (hashKey)b;
242        for (int i=0;i<attributes.length;i++) {
243          l = n.missing[i];
244          if (missing[i] || l) {
245            if ((missing[i] && !l) || (!missing[i] && l)) {
246              ok = false;
247              break;
248            }
249          } else {
250            if (attributes[i] != n.attributes[i]) {
251              ok = false;
252              break;
253            }
254          }
255        }
256      } else {
257        return false;
258      }
259      return ok;
260    }
261   
262    /**
263     * Prints the hash code
264     */
265    public void print_hash_code() {
266     
267      System.out.println("Hash val: "+hashCode());
268    }
269   
270    /**
271     * Returns the revision string.
272     *
273     * @return          the revision
274     */
275    public String getRevision() {
276      return RevisionUtils.extract("$Revision: 5447 $");
277    }
278  }
279
280  /**
281   * Returns a string describing this search method
282   * @return a description of the search suitable for
283   * displaying in the explorer/experimenter gui
284   */
285  public String globalInfo() {
286    return "ConsistencySubsetEval :\n\nEvaluates the worth of a subset of "
287      +"attributes by the level of consistency in the class values when the "
288      +"training instances are projected onto the subset of attributes. "
289      +"\n\nConsistency of any subset can never be lower than that of the "
290      +"full set of attributes, hence the usual practice is to use this "
291      +"subset evaluator in conjunction with a Random or Exhaustive search "
292      +"which looks for the smallest subset with consistency equal to that "
293      +"of the full set of attributes.\n\n"
294      + "For more information see:\n\n"
295      + getTechnicalInformation().toString();
296  }
297
298  /**
299   * Returns an instance of a TechnicalInformation object, containing
300   * detailed information about the technical background of this class,
301   * e.g., paper reference or book this class is based on.
302   *
303   * @return the technical information about this class
304   */
305  public TechnicalInformation getTechnicalInformation() {
306    TechnicalInformation        result;
307   
308    result = new TechnicalInformation(Type.INPROCEEDINGS);
309    result.setValue(Field.AUTHOR, "H. Liu and R. Setiono");
310    result.setValue(Field.TITLE, "A probabilistic approach to feature selection - A filter solution");
311    result.setValue(Field.BOOKTITLE, "13th International Conference on Machine Learning");
312    result.setValue(Field.YEAR, "1996");
313    result.setValue(Field.PAGES, "319-327");
314   
315    return result;
316  }
317
318  /**
319   * Constructor. Calls restOptions to set default options
320   **/
321  public ConsistencySubsetEval () {
322    resetOptions();
323  }
324
325  /**
326   * reset to defaults
327   */
328  private void resetOptions () {
329    m_trainInstances = null;
330  }
331
332  /**
333   * Returns the capabilities of this evaluator.
334   *
335   * @return            the capabilities of this evaluator
336   * @see               Capabilities
337   */
338  public Capabilities getCapabilities() {
339    Capabilities result = super.getCapabilities();
340    result.disableAll();
341   
342    // attributes
343    result.enable(Capability.NOMINAL_ATTRIBUTES);
344    result.enable(Capability.NUMERIC_ATTRIBUTES);
345    result.enable(Capability.DATE_ATTRIBUTES);
346    result.enable(Capability.MISSING_VALUES);
347   
348    // class
349    result.enable(Capability.NOMINAL_CLASS);
350    result.enable(Capability.MISSING_CLASS_VALUES);
351   
352    return result;
353  }
354
355  /**
356   * Generates a attribute evaluator. Has to initialize all fields of the
357   * evaluator that are not being set via options.
358   *
359   * @param data set of instances serving as training data
360   * @throws Exception if the evaluator has not been
361   * generated successfully
362   */
363  public void buildEvaluator (Instances data) throws Exception {
364   
365    // can evaluator handle data?
366    getCapabilities().testWithFail(data);
367
368    m_trainInstances = new Instances(data);
369    m_trainInstances.deleteWithMissingClass();
370    m_classIndex = m_trainInstances.classIndex();
371    m_numAttribs = m_trainInstances.numAttributes();
372    m_numInstances = m_trainInstances.numInstances();
373
374    m_disTransform = new Discretize();
375    m_disTransform.setUseBetterEncoding(true);
376    m_disTransform.setInputFormat(m_trainInstances);
377    m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
378  }
379
380  /**
381   * Evaluates a subset of attributes
382   *
383   * @param subset a bitset representing the attribute subset to be
384   * evaluated
385   * @throws Exception if the subset could not be evaluated
386   */
387  public double evaluateSubset (BitSet subset) throws Exception {
388    int [] fs;
389    int i;
390    int count = 0;
391
392    for (i=0;i<m_numAttribs;i++) {
393      if (subset.get(i)) {
394        count++;
395      }
396    }
397
398    double [] instArray = new double[count];
399    int index = 0;
400    fs = new int[count];
401    for (i=0;i<m_numAttribs;i++) {
402      if (subset.get(i)) {
403        fs[index++] = i;
404      }
405    }
406   
407    // create new hash table
408    m_table = new Hashtable((int)(m_numInstances * 1.5));
409   
410    for (i=0;i<m_numInstances;i++) {
411      Instance inst = m_trainInstances.instance(i);
412      for (int j=0;j<fs.length;j++) {
413        if (fs[j] == m_classIndex) {
414          throw new Exception("A subset should not contain the class!");
415        }
416        if (inst.isMissing(fs[j])) {
417          instArray[j] = Double.MAX_VALUE;
418        } else {
419          instArray[j] = inst.value(fs[j]);
420        }
421      }
422      insertIntoTable(inst, instArray);
423    }
424
425    return consistencyCount();
426  }
427
428  /**
429   * calculates the level of consistency in a dataset using a subset of
430   * features. The consistency of a hash table entry is the total number
431   * of instances hashed to that location minus the number of instances in
432   * the largest class hashed to that location. The total consistency is
433   * 1.0 minus the sum of the individual consistencies divided by the
434   * total number of instances.
435   * @return the consistency of the hash table as a value between 0 and 1.
436   */
437  private double consistencyCount() {
438    Enumeration e = m_table.keys();
439    double [] classDist;
440    double count = 0.0;
441   
442    while (e.hasMoreElements()) {
443      hashKey tt = (hashKey)e.nextElement();
444      classDist = (double []) m_table.get(tt);
445      count += Utils.sum(classDist);
446      int max = Utils.maxIndex(classDist);
447      count -= classDist[max];
448    }
449
450    count /= (double)m_numInstances;
451    return (1.0 - count);
452  }
453
454  /**
455   * Inserts an instance into the hash table
456   *
457   * @param inst instance to be inserted
458   * @param instA the instance to be inserted as an array of attribute
459   * values.
460   * @throws Exception if the instance can't be inserted
461   */
462  private void insertIntoTable(Instance inst, double [] instA)
463       throws Exception {
464
465    double [] tempClassDist2;
466    double [] newDist;
467    hashKey thekey;
468
469    thekey = new hashKey(instA);
470
471    // see if this one is already in the table
472    tempClassDist2 = (double []) m_table.get(thekey);
473    if (tempClassDist2 == null) {
474      newDist = new double [m_trainInstances.classAttribute().numValues()];
475      newDist[(int)inst.classValue()] = inst.weight();
476     
477      // add to the table
478      m_table.put(thekey, newDist);
479    } else { 
480      // update the distribution for this instance
481      tempClassDist2[(int)inst.classValue()]+=inst.weight();
482     
483      // update the table
484      m_table.put(thekey, tempClassDist2);
485    }
486  }
487
488  /**
489   * returns a description of the evaluator
490   * @return a description of the evaluator as a String.
491   */
492  public String toString() {
493    StringBuffer text = new StringBuffer();
494
495    if (m_trainInstances == null) {
496      text.append("\tConsistency subset evaluator has not been built yet\n");
497    }
498    else {
499      text.append("\tConsistency Subset Evaluator\n");
500    }
501
502    return text.toString();
503  }
504 
505  /**
506   * Returns the revision string.
507   *
508   * @return            the revision
509   */
510  public String getRevision() {
511    return RevisionUtils.extract("$Revision: 5447 $");
512  }
513
514  /**
515   * Main method for testing this class.
516   *
517   * @param args the options
518   */
519  public static void main (String[] args) {
520    runEvaluator(new ConsistencySubsetEval(), args);
521  }
522}
Note: See TracBrowser for help on using the repository browser.