source: src/main/java/weka/attributeSelection/ClassifierSubsetEval.java @ 6

Last change on this file since 6 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 20.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    ClassifierSubsetEval.java
19 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.classifiers.Classifier;
26import weka.classifiers.AbstractClassifier;
27import weka.classifiers.Evaluation;
28import weka.classifiers.rules.ZeroR;
29import weka.core.Capabilities;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.core.Option;
33import weka.core.OptionHandler;
34import weka.core.RevisionUtils;
35import weka.core.Utils;
36import weka.core.Capabilities.Capability;
37import weka.filters.Filter;
38import weka.filters.unsupervised.attribute.Remove;
39
40import java.io.File;
41import java.util.BitSet;
42import java.util.Enumeration;
43import java.util.Vector;
44
45
46/**
47 <!-- globalinfo-start -->
48 * Classifier subset evaluator:<br/>
49 * <br/>
50 * Evaluates attribute subsets on training data or a seperate hold out testing set. Uses a classifier to estimate the 'merit' of a set of attributes.
51 * <p/>
52 <!-- globalinfo-end -->
53 *
54 <!-- options-start -->
55 * Valid options are: <p/>
56 *
57 * <pre> -B &lt;classifier&gt;
58 *  class name of the classifier to use for accuracy estimation.
59 *  Place any classifier options LAST on the command line
60 *  following a "--". eg.:
61 *   -B weka.classifiers.bayes.NaiveBayes ... -- -K
62 *  (default: weka.classifiers.rules.ZeroR)</pre>
63 *
64 * <pre> -T
65 *  Use the training data to estimate accuracy.</pre>
66 *
67 * <pre> -H &lt;filename&gt;
68 *  Name of the hold out/test set to
69 *  estimate accuracy on.</pre>
70 *
71 * <pre>
72 * Options specific to scheme weka.classifiers.rules.ZeroR:
73 * </pre>
74 *
75 * <pre> -D
76 *  If set, classifier is run in debug mode and
77 *  may output additional info to the console</pre>
78 *
79 <!-- options-end -->
80 *
81 * @author Mark Hall (mhall@cs.waikato.ac.nz)
82 * @version $Revision: 5928 $
83 */
84public class ClassifierSubsetEval 
85  extends HoldOutSubsetEvaluator
86  implements OptionHandler, ErrorBasedMeritEvaluator {
87 
88  /** for serialization */
89  static final long serialVersionUID = 7532217899385278710L;
90
91  /** training instances */
92  private Instances m_trainingInstances;
93
94  /** class index */
95  private int m_classIndex;
96
97  /** number of attributes in the training data */
98  private int m_numAttribs;
99 
100  /** number of training instances */
101  private int m_numInstances;
102
103  /** holds the classifier to use for error estimates */
104  private Classifier m_Classifier = new ZeroR();
105
106  /** holds the evaluation object to use for evaluating the classifier */
107  private Evaluation m_Evaluation;
108
109  /** the file that containts hold out/test instances */
110  private File m_holdOutFile = new File("Click to set hold out or "
111                                        +"test instances");
112
113  /** the instances to test on */
114  private Instances m_holdOutInstances = null;
115
116  /** evaluate on training data rather than seperate hold out/test set */
117  private boolean m_useTraining = true;
118
119  /**
120   * Returns a string describing this attribute evaluator
121   * @return a description of the evaluator suitable for
122   * displaying in the explorer/experimenter gui
123   */
124  public String globalInfo() {
125    return 
126        "Classifier subset evaluator:\n\nEvaluates attribute subsets on training data or a seperate "
127      + "hold out testing set. Uses a classifier to estimate the 'merit' of a set of attributes.";
128  }
129
130  /**
131   * Returns an enumeration describing the available options.
132   *
133   * @return an enumeration of all the available options.
134   **/
135  public Enumeration listOptions () {
136    Vector newVector = new Vector(3);
137   
138    newVector.addElement(new Option(
139        "\tclass name of the classifier to use for accuracy estimation.\n"
140        + "\tPlace any classifier options LAST on the command line\n"
141        + "\tfollowing a \"--\". eg.:\n"
142        + "\t\t-B weka.classifiers.bayes.NaiveBayes ... -- -K\n"
143        + "\t(default: weka.classifiers.rules.ZeroR)", 
144        "B", 1, "-B <classifier>"));
145   
146    newVector.addElement(new Option(
147        "\tUse the training data to estimate"
148        +" accuracy.",
149        "T",0,"-T"));
150   
151    newVector.addElement(new Option(
152        "\tName of the hold out/test set to "
153        +"\n\testimate accuracy on.",
154        "H", 1,"-H <filename>"));
155
156    if ((m_Classifier != null) && 
157        (m_Classifier instanceof OptionHandler)) {
158      newVector.addElement(new Option("", "", 0, "\nOptions specific to " 
159                                      + "scheme " 
160                                      + m_Classifier.getClass().getName() 
161                                      + ":"));
162      Enumeration enu = ((OptionHandler)m_Classifier).listOptions();
163
164      while (enu.hasMoreElements()) {
165        newVector.addElement(enu.nextElement());
166      }
167    }
168
169    return  newVector.elements();
170  }
171
172  /**
173   * Parses a given list of options. <p/>
174   *
175   <!-- options-start -->
176   * Valid options are: <p/>
177   *
178   * <pre> -B &lt;classifier&gt;
179   *  class name of the classifier to use for accuracy estimation.
180   *  Place any classifier options LAST on the command line
181   *  following a "--". eg.:
182   *   -B weka.classifiers.bayes.NaiveBayes ... -- -K
183   *  (default: weka.classifiers.rules.ZeroR)</pre>
184   *
185   * <pre> -T
186   *  Use the training data to estimate accuracy.</pre>
187   *
188   * <pre> -H &lt;filename&gt;
189   *  Name of the hold out/test set to
190   *  estimate accuracy on.</pre>
191   *
192   * <pre>
193   * Options specific to scheme weka.classifiers.rules.ZeroR:
194   * </pre>
195   *
196   * <pre> -D
197   *  If set, classifier is run in debug mode and
198   *  may output additional info to the console</pre>
199   *
200   <!-- options-end -->
201   *
202   * @param options the list of options as an array of strings
203   * @throws Exception if an option is not supported
204   */
205  public void setOptions (String[] options)
206    throws Exception {
207    String optionString;
208    resetOptions();
209
210    optionString = Utils.getOption('B', options);
211    if (optionString.length() == 0)
212      optionString = ZeroR.class.getName();
213    setClassifier(AbstractClassifier.forName(optionString,
214                                     Utils.partitionOptions(options)));
215
216    optionString = Utils.getOption('H',options);
217    if (optionString.length() != 0) {
218      setHoldOutFile(new File(optionString));
219    }
220
221    setUseTraining(Utils.getFlag('T',options));
222  }
223
224    /**
225   * Returns the tip text for this property
226   * @return tip text for this property suitable for
227   * displaying in the explorer/experimenter gui
228   */
229  public String classifierTipText() {
230    return "Classifier to use for estimating the accuracy of subsets";
231  }
232
233  /**
234   * Set the classifier to use for accuracy estimation
235   *
236   * @param newClassifier the Classifier to use.
237   */
238  public void setClassifier (Classifier newClassifier) {
239    m_Classifier = newClassifier;
240  }
241
242
243  /**
244   * Get the classifier used as the base learner.
245   *
246   * @return the classifier used as the classifier
247   */
248  public Classifier getClassifier () {
249    return  m_Classifier;
250  }
251
252  /**
253   * Returns the tip text for this property
254   * @return tip text for this property suitable for
255   * displaying in the explorer/experimenter gui
256   */
257  public String holdOutFileTipText() {
258    return "File containing hold out/test instances.";
259  }
260
261  /**
262   * Gets the file that holds hold out/test instances.
263   * @return File that contains hold out instances
264   */
265  public File getHoldOutFile() {
266    return m_holdOutFile;
267  }
268
269
270  /**
271   * Set the file that contains hold out/test instances
272   * @param h the hold out file
273   */
274  public void setHoldOutFile(File h) {
275    m_holdOutFile = h;
276  }
277
278  /**
279   * Returns the tip text for this property
280   * @return tip text for this property suitable for
281   * displaying in the explorer/experimenter gui
282   */
283  public String useTrainingTipText() {
284    return "Use training data instead of hold out/test instances.";
285  }
286
287  /**
288   * Get if training data is to be used instead of hold out/test data
289   * @return true if training data is to be used instead of hold out data
290   */
291  public boolean getUseTraining() {
292    return m_useTraining;
293  }
294
295  /**
296   * Set if training data is to be used instead of hold out/test data
297   * @param t true if training data is to be used instead of hold out data
298   */
299  public void setUseTraining(boolean t) {
300    m_useTraining = t;
301  }
302
303  /**
304   * Gets the current settings of ClassifierSubsetEval
305   *
306   * @return an array of strings suitable for passing to setOptions()
307   */
308  public String[] getOptions () {
309    String[] classifierOptions = new String[0];
310
311    if ((m_Classifier != null) && 
312        (m_Classifier instanceof OptionHandler)) {
313      classifierOptions = ((OptionHandler)m_Classifier).getOptions();
314    }
315
316    String[] options = new String[6 + classifierOptions.length];
317    int current = 0;
318
319    if (getClassifier() != null) {
320      options[current++] = "-B";
321      options[current++] = getClassifier().getClass().getName();
322    }
323
324    if (getUseTraining()) {
325      options[current++] = "-T";
326    }
327    options[current++] = "-H"; options[current++] = getHoldOutFile().getPath();
328
329    if (classifierOptions.length > 0) {
330      options[current++] = "--";
331      System.arraycopy(classifierOptions, 0, options, current, 
332          classifierOptions.length);
333      current += classifierOptions.length;
334    }
335
336    while (current < options.length) {
337        options[current++] = "";
338    }
339
340    return  options;
341  }
342
343  /**
344   * Returns the capabilities of this evaluator.
345   *
346   * @return            the capabilities of this evaluator
347   * @see               Capabilities
348   */
349  public Capabilities getCapabilities() {
350    Capabilities        result;
351   
352    if (getClassifier() == null) {
353      result = super.getCapabilities();
354      result.disableAll();
355    } else {
356      result = getClassifier().getCapabilities();
357    }
358   
359    // set dependencies
360    for (Capability cap: Capability.values())
361      result.enableDependency(cap);
362   
363    return result;
364  }
365
366  /**
367   * Generates a attribute evaluator. Has to initialize all fields of the
368   * evaluator that are not being set via options.
369   *
370   * @param data set of instances serving as training data
371   * @throws Exception if the evaluator has not been
372   * generated successfully
373   */
374  public void buildEvaluator (Instances data)
375    throws Exception {
376   
377    // can evaluator handle data?
378    getCapabilities().testWithFail(data);
379
380    m_trainingInstances = data;
381    m_classIndex = m_trainingInstances.classIndex();
382    m_numAttribs = m_trainingInstances.numAttributes();
383    m_numInstances = m_trainingInstances.numInstances();
384
385    // load the testing data
386    if (!m_useTraining && 
387        (!getHoldOutFile().getPath().startsWith("Click to set"))) {
388      java.io.Reader r = new java.io.BufferedReader(
389                         new java.io.FileReader(getHoldOutFile().getPath()));
390        m_holdOutInstances = new Instances(r);
391        m_holdOutInstances.setClassIndex(m_trainingInstances.classIndex());
392        if (m_trainingInstances.equalHeaders(m_holdOutInstances) == false) {
393          throw new Exception("Hold out/test set is not compatable with "
394                              +"training data.\n" 
395                              + m_trainingInstances.equalHeadersMsg(m_holdOutInstances));
396        }
397    }
398  }
399
400  /**
401   * Evaluates a subset of attributes
402   *
403   * @param subset a bitset representing the attribute subset to be
404   * evaluated
405   * @return the error rate
406   * @throws Exception if the subset could not be evaluated
407   */
408  public double evaluateSubset (BitSet subset)
409    throws Exception {
410    int i,j;
411    double errorRate = 0;
412    int numAttributes = 0;
413    Instances trainCopy=null;
414    Instances testCopy=null;
415
416    Remove delTransform = new Remove();
417    delTransform.setInvertSelection(true);
418    // copy the training instances
419    trainCopy = new Instances(m_trainingInstances);
420   
421    if (!m_useTraining) {
422      if (m_holdOutInstances == null) {
423        throw new Exception("Must specify a set of hold out/test instances "
424                            +"with -H");
425      } 
426      // copy the test instances
427      testCopy = new Instances(m_holdOutInstances);
428    }
429   
430    // count attributes set in the BitSet
431    for (i = 0; i < m_numAttribs; i++) {
432      if (subset.get(i)) {
433        numAttributes++;
434      }
435    }
436   
437    // set up an array of attribute indexes for the filter (+1 for the class)
438    int[] featArray = new int[numAttributes + 1];
439   
440    for (i = 0, j = 0; i < m_numAttribs; i++) {
441      if (subset.get(i)) {
442        featArray[j++] = i;
443      }
444    }
445   
446    featArray[j] = m_classIndex;
447    delTransform.setAttributeIndicesArray(featArray);
448    delTransform.setInputFormat(trainCopy);
449    trainCopy = Filter.useFilter(trainCopy, delTransform);
450    if (!m_useTraining) {
451      testCopy = Filter.useFilter(testCopy, delTransform);
452    }
453
454    // build the classifier
455    m_Classifier.buildClassifier(trainCopy);
456
457    m_Evaluation = new Evaluation(trainCopy);
458    if (!m_useTraining) {
459      m_Evaluation.evaluateModel(m_Classifier, testCopy);
460    } else {
461      m_Evaluation.evaluateModel(m_Classifier, trainCopy);
462    }
463
464    if (m_trainingInstances.classAttribute().isNominal()) {
465      errorRate = m_Evaluation.errorRate();
466    } else {
467      errorRate = m_Evaluation.meanAbsoluteError();
468    }
469
470    m_Evaluation = null;
471    // return the negative of the error rate as search methods  need to
472    // maximize something
473    return -errorRate;
474  }
475
476  /**
477   * Evaluates a subset of attributes with respect to a set of instances.
478   * Calling this function overides any test/hold out instancs set from
479   * setHoldOutFile.
480   * @param subset a bitset representing the attribute subset to be
481   * evaluated
482   * @param holdOut a set of instances (possibly seperate and distinct
483   * from those use to build/train the evaluator) with which to
484   * evaluate the merit of the subset
485   * @return the "merit" of the subset on the holdOut data
486   * @throws Exception if the subset cannot be evaluated
487   */
488  public double evaluateSubset(BitSet subset, Instances holdOut) 
489    throws Exception {
490    int i,j;
491    double errorRate;
492    int numAttributes = 0;
493    Instances trainCopy=null;
494    Instances testCopy=null;
495
496    if (m_trainingInstances.equalHeaders(holdOut) == false) {
497      throw new Exception("evaluateSubset : Incompatable instance types.\n"
498          + m_trainingInstances.equalHeadersMsg(holdOut));
499    }
500
501    Remove delTransform = new Remove();
502    delTransform.setInvertSelection(true);
503    // copy the training instances
504    trainCopy = new Instances(m_trainingInstances);
505   
506    testCopy = new Instances(holdOut);
507
508    // count attributes set in the BitSet
509    for (i = 0; i < m_numAttribs; i++) {
510      if (subset.get(i)) {
511        numAttributes++;
512      }
513    }
514   
515    // set up an array of attribute indexes for the filter (+1 for the class)
516    int[] featArray = new int[numAttributes + 1];
517   
518    for (i = 0, j = 0; i < m_numAttribs; i++) {
519      if (subset.get(i)) {
520        featArray[j++] = i;
521      }
522    }
523   
524    featArray[j] = m_classIndex;
525    delTransform.setAttributeIndicesArray(featArray);
526    delTransform.setInputFormat(trainCopy);
527    trainCopy = Filter.useFilter(trainCopy, delTransform);
528    testCopy = Filter.useFilter(testCopy, delTransform);
529
530    // build the classifier
531    m_Classifier.buildClassifier(trainCopy);
532
533    m_Evaluation = new Evaluation(trainCopy);
534    m_Evaluation.evaluateModel(m_Classifier, testCopy);
535
536    if (m_trainingInstances.classAttribute().isNominal()) {
537      errorRate = m_Evaluation.errorRate();
538    } else {
539      errorRate = m_Evaluation.meanAbsoluteError();
540    }
541
542    m_Evaluation = null;
543    // return the negative of the error as search methods need to
544    // maximize something
545   return -errorRate;
546  }
547
548  /**
549   * Evaluates a subset of attributes with respect to a single instance.
550   * Calling this function overides any hold out/test instances set
551   * through setHoldOutFile.
552   * @param subset a bitset representing the attribute subset to be
553   * evaluated
554   * @param holdOut a single instance (possibly not one of those used to
555   * build/train the evaluator) with which to evaluate the merit of the subset
556   * @param retrain true if the classifier should be retrained with respect
557   * to the new subset before testing on the holdOut instance.
558   * @return the "merit" of the subset on the holdOut instance
559   * @throws Exception if the subset cannot be evaluated
560   */
561  public double evaluateSubset(BitSet subset, Instance holdOut,
562                               boolean retrain) 
563    throws Exception {
564    int i,j;
565    double error;
566    int numAttributes = 0;
567    Instances trainCopy=null;
568    Instance testCopy=null;
569
570    if (m_trainingInstances.equalHeaders(holdOut.dataset()) == false) {
571      throw new Exception("evaluateSubset : Incompatable instance types.\n"
572          + m_trainingInstances.equalHeadersMsg(holdOut.dataset()));
573    }
574
575    Remove delTransform = new Remove();
576    delTransform.setInvertSelection(true);
577    // copy the training instances
578    trainCopy = new Instances(m_trainingInstances);
579   
580    testCopy = (Instance)holdOut.copy();
581
582    // count attributes set in the BitSet
583    for (i = 0; i < m_numAttribs; i++) {
584      if (subset.get(i)) {
585        numAttributes++;
586      }
587    }
588   
589    // set up an array of attribute indexes for the filter (+1 for the class)
590    int[] featArray = new int[numAttributes + 1];
591   
592    for (i = 0, j = 0; i < m_numAttribs; i++) {
593      if (subset.get(i)) {
594        featArray[j++] = i;
595      }
596    }
597    featArray[j] = m_classIndex;
598    delTransform.setAttributeIndicesArray(featArray);
599    delTransform.setInputFormat(trainCopy);
600
601    if (retrain) {
602      trainCopy = Filter.useFilter(trainCopy, delTransform);
603      // build the classifier
604      m_Classifier.buildClassifier(trainCopy);
605    }
606
607    delTransform.input(testCopy);
608    testCopy = delTransform.output();
609
610    double pred;
611    double [] distrib;
612    distrib = m_Classifier.distributionForInstance(testCopy);
613    if (m_trainingInstances.classAttribute().isNominal()) {
614      pred = distrib[(int)testCopy.classValue()];
615    } else {
616      pred = distrib[0];
617    }
618
619    if (m_trainingInstances.classAttribute().isNominal()) {
620      error = 1.0 - pred;
621    } else {
622      error = testCopy.classValue() - pred;
623    }
624
625    // return the negative of the error as search methods need to
626    // maximize something
627    return -error;
628  }
629
630  /**
631   * Returns a string describing classifierSubsetEval
632   *
633   * @return the description as a string
634   */
635  public String toString() {
636    StringBuffer text = new StringBuffer();
637   
638    if (m_trainingInstances == null) {
639      text.append("\tClassifier subset evaluator has not been built yet\n");
640    }
641    else {
642      text.append("\tClassifier Subset Evaluator\n");
643      text.append("\tLearning scheme: " 
644                  + getClassifier().getClass().getName() + "\n");
645      text.append("\tScheme options: ");
646      String[] classifierOptions = new String[0];
647
648      if (m_Classifier instanceof OptionHandler) {
649        classifierOptions = ((OptionHandler)m_Classifier).getOptions();
650
651        for (int i = 0; i < classifierOptions.length; i++) {
652          text.append(classifierOptions[i] + " ");
653        }
654      }
655
656      text.append("\n");
657      text.append("\tHold out/test set: ");
658      if (!m_useTraining) {
659        if (getHoldOutFile().getPath().startsWith("Click to set")) {
660          text.append("none\n");
661        } else {
662          text.append(getHoldOutFile().getPath()+'\n');
663        }
664      } else {
665        text.append("Training data\n");
666      }
667      if (m_trainingInstances.attribute(m_classIndex).isNumeric()) {
668        text.append("\tAccuracy estimation: MAE\n");
669      } else {
670        text.append("\tAccuracy estimation: classification error\n");
671      }
672    }
673    return text.toString();
674  }
675 
676  /**
677   * reset to defaults
678   */
679  protected void resetOptions () {
680    m_trainingInstances = null;
681    m_Evaluation = null;
682    m_Classifier = new ZeroR();
683    m_holdOutFile = new File("Click to set hold out or test instances");
684    m_holdOutInstances = null;
685    m_useTraining = false;
686  }
687 
688  /**
689   * Returns the revision string.
690   *
691   * @return            the revision
692   */
693  public String getRevision() {
694    return RevisionUtils.extract("$Revision: 5928 $");
695  }
696 
697  /**
698   * Main method for testing this class.
699   *
700   * @param args the options
701   */
702  public static void main (String[] args) {
703    runEvaluator(new ClassifierSubsetEval(), args);
704  }
705}
Note: See TracBrowser for help on using the repository browser.