source: src/main/java/weka/classifiers/meta/MetaCost.java @ 27

Last change on this file since 27 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 20.8 KB
RevLine 
[4]1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    MetaCost.java
19 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.classifiers.meta;
24
25import weka.classifiers.Classifier;
26import weka.classifiers.AbstractClassifier;
27import weka.classifiers.CostMatrix;
28import weka.classifiers.RandomizableSingleClassifierEnhancer;
29import weka.core.Capabilities;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.core.Option;
33import weka.core.OptionHandler;
34import weka.core.RevisionUtils;
35import weka.core.SelectedTag;
36import weka.core.Tag;
37import weka.core.TechnicalInformation;
38import weka.core.TechnicalInformationHandler;
39import weka.core.Utils;
40import weka.core.Capabilities.Capability;
41import weka.core.TechnicalInformation.Field;
42import weka.core.TechnicalInformation.Type;
43
44import java.io.BufferedReader;
45import java.io.File;
46import java.io.FileReader;
47import java.io.StringReader;
48import java.io.StringWriter;
49import java.util.Enumeration;
50import java.util.Vector;
51
52
53/**
54 <!-- globalinfo-start -->
55 * This metaclassifier makes its base classifier cost-sensitive using the method specified in<br/>
56 * <br/>
57 * Pedro Domingos: MetaCost: A general method for making classifiers cost-sensitive. In: Fifth International Conference on Knowledge Discovery and Data Mining, 155-164, 1999.<br/>
58 * <br/>
59 * This classifier should produce similar results to one created by passing the base learner to Bagging, which is in turn passed to a CostSensitiveClassifier operating on minimum expected cost. The difference is that MetaCost produces a single cost-sensitive classifier of the base learner, giving the benefits of fast classification and interpretable output (if the base learner itself is interpretable). This implementation  uses all bagging iterations when reclassifying training data (the MetaCost paper reports a marginal improvement when only those iterations containing each training instance are used in reclassifying that instance).
60 * <p/>
61 <!-- globalinfo-end -->
62 *
63 <!-- technical-bibtex-start -->
64 * BibTeX:
65 * <pre>
66 * &#64;inproceedings{Domingos1999,
67 *    author = {Pedro Domingos},
68 *    booktitle = {Fifth International Conference on Knowledge Discovery and Data Mining},
69 *    pages = {155-164},
70 *    title = {MetaCost: A general method for making classifiers cost-sensitive},
71 *    year = {1999}
72 * }
73 * </pre>
74 * <p/>
75 <!-- technical-bibtex-end -->
76 *
77 <!-- options-start -->
78 * Valid options are: <p/>
79 *
80 * <pre> -I &lt;num&gt;
81 *  Number of bagging iterations.
82 *  (default 10)</pre>
83 *
84 * <pre> -C &lt;cost file name&gt;
85 *  File name of a cost matrix to use. If this is not supplied,
86 *  a cost matrix will be loaded on demand. The name of the
87 *  on-demand file is the relation name of the training data
88 *  plus ".cost", and the path to the on-demand file is
89 *  specified with the -N option.</pre>
90 *
91 * <pre> -N &lt;directory&gt;
92 *  Name of a directory to search for cost files when loading
93 *  costs on demand (default current directory).</pre>
94 *
95 * <pre> -cost-matrix &lt;matrix&gt;
96 *  The cost matrix in Matlab single line format.</pre>
97 *
98 * <pre> -P
99 *  Size of each bag, as a percentage of the
100 *  training set size. (default 100)</pre>
101 *
102 * <pre> -S &lt;num&gt;
103 *  Random number seed.
104 *  (default 1)</pre>
105 *
106 * <pre> -D
107 *  If set, classifier is run in debug mode and
108 *  may output additional info to the console</pre>
109 *
110 * <pre> -W
111 *  Full name of base classifier.
112 *  (default: weka.classifiers.rules.ZeroR)</pre>
113 *
114 * <pre>
115 * Options specific to classifier weka.classifiers.rules.ZeroR:
116 * </pre>
117 *
118 * <pre> -D
119 *  If set, classifier is run in debug mode and
120 *  may output additional info to the console</pre>
121 *
122 <!-- options-end -->
123 *
124 * Options after -- are passed to the designated classifier.<p>
125 *
126 * @author Len Trigg (len@reeltwo.com)
127 * @version $Revision: 5928 $
128 */
129public class MetaCost 
130  extends RandomizableSingleClassifierEnhancer
131  implements TechnicalInformationHandler {
132
133  /** for serialization */
134  static final long serialVersionUID = 1205317833344726855L;
135 
136  /** load cost matrix on demand */
137  public static final int MATRIX_ON_DEMAND = 1;
138  /** use explicit matrix */
139  public static final int MATRIX_SUPPLIED = 2;
140  /** Specify possible sources of the cost matrix */
141  public static final Tag [] TAGS_MATRIX_SOURCE = {
142    new Tag(MATRIX_ON_DEMAND, "Load cost matrix on demand"),
143    new Tag(MATRIX_SUPPLIED, "Use explicit cost matrix")
144  };
145
146  /** Indicates the current cost matrix source */
147  protected int m_MatrixSource = MATRIX_ON_DEMAND;
148
149  /**
150   * The directory used when loading cost files on demand, null indicates
151   * current directory
152   */
153  protected File m_OnDemandDirectory = new File(System.getProperty("user.dir"));
154
155  /** The name of the cost file, for command line options */
156  protected String m_CostFile;
157
158  /** The cost matrix */
159  protected CostMatrix m_CostMatrix = new CostMatrix(1);
160
161  /** The number of iterations. */
162  protected int m_NumIterations = 10;
163
164  /** The size of each bag sample, as a percentage of the training size */
165  protected int m_BagSizePercent = 100;
166   
167  /**
168   * Returns a string describing classifier
169   * @return a description suitable for
170   * displaying in the explorer/experimenter gui
171   */
172  public String globalInfo() {
173 
174    return  "This metaclassifier makes its base classifier cost-sensitive using the "
175      + "method specified in\n\n"
176      + getTechnicalInformation().toString() + "\n\n"
177      + "This classifier should produce similar results to one created by "
178      + "passing the base learner to Bagging, which is in turn passed to a "
179      + "CostSensitiveClassifier operating on minimum expected cost. The difference "
180      + "is that MetaCost produces a single cost-sensitive classifier of the "
181      + "base learner, giving the benefits of fast classification and interpretable "
182      + "output (if the base learner itself is interpretable). This implementation  "
183      + "uses all bagging iterations when reclassifying training data (the MetaCost "
184      + "paper reports a marginal improvement when only those iterations containing "
185      + "each training instance are used in reclassifying that instance).";
186 
187  }
188
189  /**
190   * Returns an instance of a TechnicalInformation object, containing
191   * detailed information about the technical background of this class,
192   * e.g., paper reference or book this class is based on.
193   *
194   * @return the technical information about this class
195   */
196  public TechnicalInformation getTechnicalInformation() {
197    TechnicalInformation        result;
198   
199    result = new TechnicalInformation(Type.INPROCEEDINGS);
200    result.setValue(Field.AUTHOR, "Pedro Domingos");
201    result.setValue(Field.TITLE, "MetaCost: A general method for making classifiers cost-sensitive");
202    result.setValue(Field.BOOKTITLE, "Fifth International Conference on Knowledge Discovery and Data Mining");
203    result.setValue(Field.YEAR, "1999");
204    result.setValue(Field.PAGES, "155-164");
205   
206    return result;
207  }
208
209  /**
210   * Returns an enumeration describing the available options.
211   *
212   * @return an enumeration of all the available options.
213   */
214  public Enumeration listOptions() {
215
216    Vector newVector = new Vector(6);
217
218    newVector.addElement(new Option(
219              "\tNumber of bagging iterations.\n"
220              + "\t(default 10)",
221              "I", 1, "-I <num>"));
222    newVector.addElement(new Option(
223              "\tFile name of a cost matrix to use. If this is not supplied,\n"
224              +"\ta cost matrix will be loaded on demand. The name of the\n"
225              +"\ton-demand file is the relation name of the training data\n"
226              +"\tplus \".cost\", and the path to the on-demand file is\n"
227              +"\tspecified with the -N option.",
228              "C", 1, "-C <cost file name>"));
229    newVector.addElement(new Option(
230              "\tName of a directory to search for cost files when loading\n"
231              +"\tcosts on demand (default current directory).",
232              "N", 1, "-N <directory>"));
233    newVector.addElement(new Option(
234              "\tThe cost matrix in Matlab single line format.",
235              "cost-matrix", 1, "-cost-matrix <matrix>"));
236    newVector.addElement(new Option(
237              "\tSize of each bag, as a percentage of the\n" 
238              + "\ttraining set size. (default 100)",
239              "P", 1, "-P"));
240
241    Enumeration enu = super.listOptions();
242    while (enu.hasMoreElements()) {
243      newVector.addElement(enu.nextElement());
244    }
245    return newVector.elements();
246  }
247
248  /**
249   * Parses a given list of options. <p/>
250   *
251   <!-- options-start -->
252   * Valid options are: <p/>
253   *
254   * <pre> -I &lt;num&gt;
255   *  Number of bagging iterations.
256   *  (default 10)</pre>
257   *
258   * <pre> -C &lt;cost file name&gt;
259   *  File name of a cost matrix to use. If this is not supplied,
260   *  a cost matrix will be loaded on demand. The name of the
261   *  on-demand file is the relation name of the training data
262   *  plus ".cost", and the path to the on-demand file is
263   *  specified with the -N option.</pre>
264   *
265   * <pre> -N &lt;directory&gt;
266   *  Name of a directory to search for cost files when loading
267   *  costs on demand (default current directory).</pre>
268   *
269   * <pre> -cost-matrix &lt;matrix&gt;
270   *  The cost matrix in Matlab single line format.</pre>
271   *
272   * <pre> -P
273   *  Size of each bag, as a percentage of the
274   *  training set size. (default 100)</pre>
275   *
276   * <pre> -S &lt;num&gt;
277   *  Random number seed.
278   *  (default 1)</pre>
279   *
280   * <pre> -D
281   *  If set, classifier is run in debug mode and
282   *  may output additional info to the console</pre>
283   *
284   * <pre> -W
285   *  Full name of base classifier.
286   *  (default: weka.classifiers.rules.ZeroR)</pre>
287   *
288   * <pre>
289   * Options specific to classifier weka.classifiers.rules.ZeroR:
290   * </pre>
291   *
292   * <pre> -D
293   *  If set, classifier is run in debug mode and
294   *  may output additional info to the console</pre>
295   *
296   <!-- options-end -->
297   *
298   * Options after -- are passed to the designated classifier.<p>
299   *
300   * @param options the list of options as an array of strings
301   * @throws Exception if an option is not supported
302   */
303  public void setOptions(String[] options) throws Exception {
304
305    String bagIterations = Utils.getOption('I', options);
306    if (bagIterations.length() != 0) {
307      setNumIterations(Integer.parseInt(bagIterations));
308    } else {
309      setNumIterations(10);
310    }
311
312    String bagSize = Utils.getOption('P', options);
313    if (bagSize.length() != 0) {
314      setBagSizePercent(Integer.parseInt(bagSize));
315    } else {
316      setBagSizePercent(100);
317    }
318
319    String costFile = Utils.getOption('C', options);
320    if (costFile.length() != 0) {
321      setCostMatrix(new CostMatrix(new BufferedReader(
322                                   new FileReader(costFile))));
323      setCostMatrixSource(new SelectedTag(MATRIX_SUPPLIED,
324                                          TAGS_MATRIX_SOURCE));
325      m_CostFile = costFile;
326    } else {
327      setCostMatrixSource(new SelectedTag(MATRIX_ON_DEMAND, 
328                                          TAGS_MATRIX_SOURCE));
329    }
330   
331    String demandDir = Utils.getOption('N', options);
332    if (demandDir.length() != 0) {
333      setOnDemandDirectory(new File(demandDir));
334    }
335
336    String cost_matrix= Utils.getOption("cost-matrix", options);
337    if (cost_matrix.length() != 0) {
338      StringWriter writer = new StringWriter();
339      CostMatrix.parseMatlab(cost_matrix).write(writer);
340      setCostMatrix(new CostMatrix(new StringReader(writer.toString())));
341      setCostMatrixSource(new SelectedTag(MATRIX_SUPPLIED,
342                                          TAGS_MATRIX_SOURCE));
343    }
344   
345    super.setOptions(options);
346  }
347
348  /**
349   * Gets the current settings of the Classifier.
350   *
351   * @return an array of strings suitable for passing to setOptions
352   */
353  public String [] getOptions() {
354
355
356    String [] superOptions = super.getOptions();
357    String [] options;
358
359    options = new String [superOptions.length + 6];
360    int current = 0;
361
362    if (m_MatrixSource == MATRIX_SUPPLIED) {
363      if (m_CostFile != null) {
364        options[current++] = "-C";
365        options[current++] = "" + m_CostFile;
366      }
367      else {
368        options[current++] = "-cost-matrix";
369        options[current++] = getCostMatrix().toMatlab();
370      }
371    } else {
372      options[current++] = "-N";
373      options[current++] = "" + getOnDemandDirectory();
374    }
375    options[current++] = "-I"; options[current++] = "" + getNumIterations();
376    options[current++] = "-P"; options[current++] = "" + getBagSizePercent();
377
378    System.arraycopy(superOptions, 0, options, current, 
379                     superOptions.length);
380    return options;
381  }
382 
383  /**
384   * Returns the tip text for this property
385   * @return tip text for this property suitable for
386   * displaying in the explorer/experimenter gui
387   */
388  public String costMatrixSourceTipText() {
389    return "Gets the source location method of the cost matrix. Will "
390      + "be one of MATRIX_ON_DEMAND or MATRIX_SUPPLIED.";
391  }
392
393  /**
394   * Gets the source location method of the cost matrix. Will be one of
395   * MATRIX_ON_DEMAND or MATRIX_SUPPLIED.
396   *
397   * @return the cost matrix source.
398   */
399  public SelectedTag getCostMatrixSource() {
400
401    return new SelectedTag(m_MatrixSource, TAGS_MATRIX_SOURCE);
402  }
403 
404  /**
405   * Sets the source location of the cost matrix. Values other than
406   * MATRIX_ON_DEMAND or MATRIX_SUPPLIED will be ignored.
407   *
408   * @param newMethod the cost matrix location method.
409   */
410  public void setCostMatrixSource(SelectedTag newMethod) {
411   
412    if (newMethod.getTags() == TAGS_MATRIX_SOURCE) {
413      m_MatrixSource = newMethod.getSelectedTag().getID();
414    }
415  }
416 
417  /**
418   * Returns the tip text for this property
419   * @return tip text for this property suitable for
420   * displaying in the explorer/experimenter gui
421   */
422  public String onDemandDirectoryTipText() {
423    return "Name of directory to search for cost files when loading "
424      + "costs on demand.";
425  }
426
427  /**
428   * Returns the directory that will be searched for cost files when
429   * loading on demand.
430   *
431   * @return The cost file search directory.
432   */
433  public File getOnDemandDirectory() {
434
435    return m_OnDemandDirectory;
436  }
437
438  /**
439   * Sets the directory that will be searched for cost files when
440   * loading on demand.
441   *
442   * @param newDir The cost file search directory.
443   */
444  public void setOnDemandDirectory(File newDir) {
445
446    if (newDir.isDirectory()) {
447      m_OnDemandDirectory = newDir;
448    } else {
449      m_OnDemandDirectory = new File(newDir.getParent());
450    }
451    m_MatrixSource = MATRIX_ON_DEMAND;
452  }
453 
454  /**
455   * Returns the tip text for this property
456   * @return tip text for this property suitable for
457   * displaying in the explorer/experimenter gui
458   */
459  public String bagSizePercentTipText() {
460    return "The size of each bag, as a percentage of the training set "
461      + "size.";
462  }
463
464  /**
465   * Gets the size of each bag, as a percentage of the training set size.
466   *
467   * @return the bag size, as a percentage.
468   */
469  public int getBagSizePercent() {
470
471    return m_BagSizePercent;
472  }
473 
474  /**
475   * Sets the size of each bag, as a percentage of the training set size.
476   *
477   * @param newBagSizePercent the bag size, as a percentage.
478   */
479  public void setBagSizePercent(int newBagSizePercent) {
480
481    m_BagSizePercent = newBagSizePercent;
482  }
483 
484  /**
485   * Returns the tip text for this property
486   * @return tip text for this property suitable for
487   * displaying in the explorer/experimenter gui
488   */
489  public String numIterationsTipText() {
490    return "The number of bagging iterations.";
491  }
492 
493  /**
494   * Sets the number of bagging iterations
495   *
496   * @param numIterations the number of iterations to use
497   */
498  public void setNumIterations(int numIterations) {
499
500    m_NumIterations = numIterations;
501  }
502
503  /**
504   * Gets the number of bagging iterations
505   *
506   * @return the maximum number of bagging iterations
507   */
508  public int getNumIterations() {
509   
510    return m_NumIterations;
511  }
512 
513  /**
514   * Returns the tip text for this property
515   * @return tip text for this property suitable for
516   * displaying in the explorer/experimenter gui
517   */
518  public String costMatrixTipText() {
519    return "A misclassification cost matrix.";
520  }
521
522  /**
523   * Gets the misclassification cost matrix.
524   *
525   * @return the cost matrix
526   */
527  public CostMatrix getCostMatrix() {
528   
529    return m_CostMatrix;
530  }
531 
532  /**
533   * Sets the misclassification cost matrix.
534   *
535   * @param newCostMatrix the cost matrix
536   */
537  public void setCostMatrix(CostMatrix newCostMatrix) {
538   
539    m_CostMatrix = newCostMatrix;
540    m_MatrixSource = MATRIX_SUPPLIED;
541  }
542
543  /**
544   * Returns default capabilities of the classifier.
545   *
546   * @return      the capabilities of this classifier
547   */
548  public Capabilities getCapabilities() {
549    Capabilities result = super.getCapabilities();
550
551    // class
552    result.disableAllClasses();
553    result.disableAllClassDependencies();
554    result.enable(Capability.NOMINAL_CLASS);
555   
556    return result;
557  }
558
559  /**
560   * Builds the model of the base learner.
561   *
562   * @param data the training data
563   * @throws Exception if the classifier could not be built successfully
564   */
565  public void buildClassifier(Instances data) throws Exception {
566
567    // can classifier handle the data?
568    getCapabilities().testWithFail(data);
569
570    // remove instances with missing class
571    data = new Instances(data);
572    data.deleteWithMissingClass();
573   
574    if (m_MatrixSource == MATRIX_ON_DEMAND) {
575      String costName = data.relationName() + CostMatrix.FILE_EXTENSION;
576      File costFile = new File(getOnDemandDirectory(), costName);
577      if (!costFile.exists()) {
578        throw new Exception("On-demand cost file doesn't exist: " + costFile);
579      }
580      setCostMatrix(new CostMatrix(new BufferedReader(
581                                   new FileReader(costFile))));
582    }
583
584    // Set up the bagger
585    Bagging bagger = new Bagging();
586    bagger.setClassifier(getClassifier());
587    bagger.setSeed(getSeed());
588    bagger.setNumIterations(getNumIterations());
589    bagger.setBagSizePercent(getBagSizePercent());
590    bagger.buildClassifier(data);
591   
592    // Use the bagger to reassign class values according to minimum expected
593    // cost
594    Instances newData = new Instances(data);
595    for (int i = 0; i < newData.numInstances(); i++) {
596      Instance current = newData.instance(i);
597      double [] pred = bagger.distributionForInstance(current);
598      int minCostPred = Utils.minIndex(m_CostMatrix.expectedCosts(pred));
599      current.setClassValue(minCostPred);
600    }
601
602    // Build a classifier using the reassigned data
603    m_Classifier.buildClassifier(newData);
604  }
605
606  /**
607   * Classifies a given instance after filtering.
608   *
609   * @param instance the instance to be classified
610   * @return the class distribution for the given instance
611   * @throws Exception if instance could not be classified
612   * successfully
613   */
614  public double[] distributionForInstance(Instance instance) throws Exception {
615    return m_Classifier.distributionForInstance(instance);
616  }
617
618  /**
619   * Gets the classifier specification string, which contains the
620   * class name of the classifier and any options to the classifier
621   *
622   * @return the classifier string.
623   */
624  protected String getClassifierSpec() {
625   
626    Classifier c = getClassifier();
627    return c.getClass().getName() + " "
628      + Utils.joinOptions(((OptionHandler)c).getOptions());
629  }
630
631  /**
632   * Output a representation of this classifier
633   *
634   * @return a string representaiton of the classifier
635   */
636  public String toString() {
637
638    if (m_Classifier == null) {
639      return "MetaCost: No model built yet.";
640    }
641
642    String result = "MetaCost cost sensitive classifier induction";
643    result += "\nOptions: " + Utils.joinOptions(getOptions());
644    result += "\nBase learner: " + getClassifierSpec()
645      + "\n\nClassifier Model\n"
646      + m_Classifier.toString()
647      + "\n\nCost Matrix\n"
648      + m_CostMatrix.toString();
649   
650    return result;
651  }
652 
653  /**
654   * Returns the revision string.
655   *
656   * @return            the revision
657   */
658  public String getRevision() {
659    return RevisionUtils.extract("$Revision: 5928 $");
660  }
661
662  /**
663   * Main method for testing this class.
664   *
665   * @param argv should contain the following arguments:
666   * -t training file [-T test file] [-c class index]
667   */
668  public static void main(String [] argv) {
669    runClassifier(new MetaCost(), argv);
670  }
671}
Note: See TracBrowser for help on using the repository browser.