source: src/main/java/weka/attributeSelection/CostSensitiveASEvaluation.java @ 26

Last change on this file since 26 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 17.6 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    CostSensitiveASEvaluation.java
19 *    Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package  weka.attributeSelection;
24
25import weka.core.Capabilities;
26import weka.core.Instance;
27import weka.core.Instances;
28import weka.core.Option;
29import weka.core.OptionHandler;
30import weka.core.Utils;
31import weka.core.Capabilities.Capability;
32import weka.core.SelectedTag;
33import weka.core.Tag;
34import weka.classifiers.CostMatrix;
35import weka.core.WeightedInstancesHandler;
36import weka.core.RevisionUtils;
37
38import java.io.BufferedReader;
39import java.io.File;
40import java.io.FileReader;
41import java.io.StringReader;
42import java.io.StringWriter;
43import java.io.Serializable;
44import java.util.Enumeration;
45import java.util.Random;
46import java.util.Vector;
47import java.util.ArrayList;
48
49/**
50 * Abstract base class for cost-sensitive subset and attribute evaluators.
51 *
52 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
53 * @version $Revision: 5987 $
54 */
55public abstract class CostSensitiveASEvaluation
56  extends ASEvaluation
57  implements OptionHandler, Serializable {
58
59  /** for serialization */
60  static final long serialVersionUID = -7045833833363396977L;
61
62  /** load cost matrix on demand */
63  public static final int MATRIX_ON_DEMAND = 1;
64  /** use explicit cost matrix */
65  public static final int MATRIX_SUPPLIED = 2;
66  /** Specify possible sources of the cost matrix */
67  public static final Tag [] TAGS_MATRIX_SOURCE = {
68    new Tag(MATRIX_ON_DEMAND, "Load cost matrix on demand"),
69    new Tag(MATRIX_SUPPLIED, "Use explicit cost matrix")
70  };
71
72  /** Indicates the current cost matrix source */
73  protected int m_MatrixSource = MATRIX_ON_DEMAND;
74
75  /**
76   * The directory used when loading cost files on demand, null indicates
77   * current directory
78   */
79  protected File m_OnDemandDirectory = new File(System.getProperty("user.dir"));
80
81  /** The name of the cost file, for command line options */
82  protected String m_CostFile;
83
84  /** The cost matrix */
85  protected CostMatrix m_CostMatrix = new CostMatrix(1);
86
87  /** The base evaluator to use */
88  protected ASEvaluation m_evaluator;
89
90  /** random number seed */
91  protected int m_seed = 1;
92
93  /**
94   * Returns an enumeration describing the available options.
95   *
96   * @return an enumeration of all the available options.
97   */
98  public Enumeration listOptions() {
99
100    Vector newVector = new Vector(4);
101
102    newVector.addElement(new Option(
103                                    "\tFile name of a cost matrix to use. If this is not supplied,\n"
104                                    +"\ta cost matrix will be loaded on demand. The name of the\n"
105                                    +"\ton-demand file is the relation name of the training data\n"
106                                    +"\tplus \".cost\", and the path to the on-demand file is\n"
107                                    +"\tspecified with the -N option.",
108                                    "C", 1, "-C <cost file name>"));
109    newVector.addElement(new Option(
110                                    "\tName of a directory to search for cost files when loading\n"
111                                    +"\tcosts on demand (default current directory).",
112                                    "N", 1, "-N <directory>"));
113    newVector.addElement(new Option(
114                                    "\tThe cost matrix in Matlab single line format.",
115                                    "cost-matrix", 1, "-cost-matrix <matrix>"));
116    newVector.addElement(new Option(
117                                    "\tThe seed to use for random number generation.",
118                                    "S", 1, "-S <integer>"));
119
120    newVector.addElement(new Option(
121                                    "\tFull name of base evaluator. Options after -- are "
122                                    +"passed to the evaluator.\n"
123                                    + "\t(default: " + defaultEvaluatorString() +")",
124                                    "W", 1, "-W"));
125
126    if (m_evaluator instanceof OptionHandler) {
127      newVector.addElement(new Option(
128                                      "",
129                                      "", 0, "\nOptions specific to evaluator "
130                                      + m_evaluator.getClass().getName() + ":"));
131      Enumeration enu = ((OptionHandler)m_evaluator).listOptions();
132      while (enu.hasMoreElements()) {
133        newVector.addElement(enu.nextElement());
134      }
135    }
136
137
138    return newVector.elements();
139  }
140
141  /**
142   * Parses a given list of options. <p/>
143   *
144   * Valid options are: <p/>
145   *
146   * <pre> -C &lt;cost file name&gt;
147   *  File name of a cost matrix to use. If this is not supplied,
148   *  a cost matrix will be loaded on demand. The name of the
149   *  on-demand file is the relation name of the training data
150   *  plus ".cost", and the path to the on-demand file is
151   *  specified with the -N option.</pre>
152   *
153   * <pre> -N &lt;directory&gt;
154   *  Name of a directory to search for cost files when loading
155   *  costs on demand (default current directory).</pre>
156   *
157   * <pre> -cost-matrix &lt;matrix&gt;
158   *  The cost matrix in Matlab single line format.</pre>
159   *
160   * <pre> -S &lt;integer&gt;
161   *  The seed to use for random number generation.</pre>
162   *
163   * <pre> -W
164   *  Full name of base evaluator.
165   *  (default: weka.attributeSelection.CfsSubsetEval)</pre>
166   *
167   * Options after -- are passed to the designated evaluator.<p>
168   *
169   * @param options the list of options as an array of strings
170   * @throws Exception if an option is not supported
171   */
172  public void setOptions(String[] options) throws Exception {
173    String costFile = Utils.getOption('C', options);
174    if (costFile.length() != 0) {
175      try {
176        setCostMatrix(new CostMatrix(new BufferedReader(
177                                                        new FileReader(costFile))));
178      } catch (Exception ex) {
179        // now flag as possible old format cost matrix. Delay cost matrix
180        // loading until buildClassifer is called
181        setCostMatrix(null);
182      }
183      setCostMatrixSource(new SelectedTag(MATRIX_SUPPLIED,
184                                          TAGS_MATRIX_SOURCE));
185      m_CostFile = costFile;
186    } else {
187      setCostMatrixSource(new SelectedTag(MATRIX_ON_DEMAND, 
188                                          TAGS_MATRIX_SOURCE));
189    }
190   
191    String demandDir = Utils.getOption('N', options);
192    if (demandDir.length() != 0) {
193      setOnDemandDirectory(new File(demandDir));
194    }
195
196    String cost_matrix = Utils.getOption("cost-matrix", options);
197    if (cost_matrix.length() != 0) {
198      StringWriter writer = new StringWriter();
199      CostMatrix.parseMatlab(cost_matrix).write(writer);
200      setCostMatrix(new CostMatrix(new StringReader(writer.toString())));
201      setCostMatrixSource(new SelectedTag(MATRIX_SUPPLIED,
202                                          TAGS_MATRIX_SOURCE));
203    }
204
205    String seed = Utils.getOption('S', options);
206    if (seed.length() != 0) {
207      setSeed(Integer.parseInt(seed));
208    } else {
209      setSeed(1);
210    }
211
212    String evaluatorName = Utils.getOption('W', options);
213   
214    if (evaluatorName.length() > 0) { 
215     
216      // This is just to set the evaluator in case the option
217      // parsing fails.
218      setEvaluator(ASEvaluation.forName(evaluatorName, null));
219      setEvaluator(ASEvaluation.forName(evaluatorName,
220                                        Utils.partitionOptions(options)));
221    } else {
222     
223      // This is just to set the classifier in case the option
224      // parsing fails.
225      setEvaluator(ASEvaluation.forName(defaultEvaluatorString(), null));
226      setEvaluator(ASEvaluation.forName(defaultEvaluatorString(),
227                                        Utils.partitionOptions(options)));
228    }
229  }
230
231  /**
232   * Gets the current settings of the subset evaluator.
233   *
234   * @return an array of strings suitable for passing to setOptions
235   */
236  public String[] getOptions() {
237    ArrayList<String> options = new ArrayList<String>();
238
239    if (m_MatrixSource == MATRIX_SUPPLIED) {
240      if (m_CostFile != null) {
241        options.add("-C");
242        options.add("" + m_CostFile);
243      }
244      else {
245        options.add("-cost-matrix");
246        options.add(getCostMatrix().toMatlab());
247      }
248    } else {
249      options.add("-N");
250      options.add("" + getOnDemandDirectory());
251    }
252
253    options.add("-S");
254    options.add("" + getSeed());
255
256    options.add("-W");
257    options.add(m_evaluator.getClass().getName());
258
259    if (m_evaluator instanceof OptionHandler) {
260      String[] evaluatorOptions = ((OptionHandler)m_evaluator).getOptions();
261      if (evaluatorOptions.length > 0) {
262        options.add("--");
263        for (int i = 0; i < evaluatorOptions.length; i++) {
264          options.add(evaluatorOptions[i]);
265        }
266      }
267    }
268
269    return options.toArray(new String[0]);
270  }
271
272  /**
273   * @return a description of the classifier suitable for
274   * displaying in the explorer/experimenter gui
275   */
276  public String globalInfo() {
277
278    return "A meta subset evaluator that makes its base subset evaluator cost-sensitive. ";
279  }
280
281  /**
282   * Return the name of the default evaluator.
283   *
284   * @return the name of the default evaluator
285   */
286  public String defaultEvaluatorString() {
287    return "weka.attributeSelection.CfsSubsetEval";
288  }
289
290  /**
291   * @return tip text for this property suitable for
292   * displaying in the explorer/experimenter gui
293   */
294  public String costMatrixSourceTipText() {
295
296    return "Sets where to get the cost matrix. The two options are"
297      + "to use the supplied explicit cost matrix (the setting of the "
298      + "costMatrix property), or to load a cost matrix from a file when "
299      + "required (this file will be loaded from the directory set by the "
300      + "onDemandDirectory property and will be named relation_name" 
301      + CostMatrix.FILE_EXTENSION + ").";
302  }
303
304  /**
305   * Gets the source location method of the cost matrix. Will be one of
306   * MATRIX_ON_DEMAND or MATRIX_SUPPLIED.
307   *
308   * @return the cost matrix source.
309   */
310  public SelectedTag getCostMatrixSource() {
311
312    return new SelectedTag(m_MatrixSource, TAGS_MATRIX_SOURCE);
313  }
314
315  /**
316   * Sets the source location of the cost matrix. Values other than
317   * MATRIX_ON_DEMAND or MATRIX_SUPPLIED will be ignored.
318   *
319   * @param newMethod the cost matrix location method.
320   */
321  public void setCostMatrixSource(SelectedTag newMethod) {
322   
323    if (newMethod.getTags() == TAGS_MATRIX_SOURCE) {
324      m_MatrixSource = newMethod.getSelectedTag().getID();
325    }
326  }
327
328  /**
329   * @return tip text for this property suitable for
330   * displaying in the explorer/experimenter gui
331   */
332  public String onDemandDirectoryTipText() {
333
334    return "Sets the directory where cost files are loaded from. This option "
335      + "is used when the costMatrixSource is set to \"On Demand\".";
336  }
337
338  /**
339   * Returns the directory that will be searched for cost files when
340   * loading on demand.
341   *
342   * @return The cost file search directory.
343   */
344  public File getOnDemandDirectory() {
345
346    return m_OnDemandDirectory;
347  }
348
349  /**
350   * Sets the directory that will be searched for cost files when
351   * loading on demand.
352   *
353   * @param newDir The cost file search directory.
354   */
355  public void setOnDemandDirectory(File newDir) {
356
357    if (newDir.isDirectory()) {
358      m_OnDemandDirectory = newDir;
359    } else {
360      m_OnDemandDirectory = new File(newDir.getParent());
361    }
362    m_MatrixSource = MATRIX_ON_DEMAND;
363  }
364
365  /**
366   * Gets the evaluator specification string, which contains the class name of
367   * the evaluator and any options to the evaluator
368   *
369   * @return the evaluator string.
370   */
371  protected String getEvaluatorSpec() {
372   
373    ASEvaluation ase = getEvaluator();
374    if (ase instanceof OptionHandler) {
375      return ase.getClass().getName() + " "
376        + Utils.joinOptions(((OptionHandler)ase).getOptions());
377    }
378    return ase.getClass().getName();
379  }
380
381  /**
382   * @return tip text for this property suitable for
383   * displaying in the explorer/experimenter gui
384   */
385  public String costMatrixTipText() {
386    return "Sets the cost matrix explicitly. This matrix is used if the "
387      + "costMatrixSource property is set to \"Supplied\".";
388  }
389
390  /**
391   * Gets the misclassification cost matrix.
392   *
393   * @return the cost matrix
394   */
395  public CostMatrix getCostMatrix() {
396   
397    return m_CostMatrix;
398  }
399 
400  /**
401   * Sets the misclassification cost matrix.
402   *
403   * @param newCostMatrix the cost matrix
404   */
405  public void setCostMatrix(CostMatrix newCostMatrix) {
406   
407    m_CostMatrix = newCostMatrix;
408    m_MatrixSource = MATRIX_SUPPLIED;
409  }
410
411  /**
412   * Returns the tip text for this property
413   * @return tip text for this property suitable for
414   * displaying in the explorer/experimenter gui
415   */
416  public String seedTipText() {
417    return "The random number seed to be used.";
418  }
419
420  /**
421   * Set the seed for random number generation.
422   *
423   * @param seed the seed
424   */
425  public void setSeed(int seed) {
426
427    m_seed = seed;
428  }
429
430  /**
431   * Gets the seed for the random number generations.
432   *
433   * @return the seed for the random number generation
434   */
435  public int getSeed() {
436   
437    return m_seed;
438  }
439
440  /**
441   * Returns the tip text for this property
442   * @return tip text for this property suitable for
443   * displaying in the explorer/experimenter gui
444   */
445  public String evaluatorTipText() {
446    return "The base evaluator to be used.";
447  }
448
449  /**
450   * Set the base evaluator.
451   *
452   * @param newEvaluator the evaluator to use.
453   * @throws IllegalArgumentException if the evaluator is of the wrong type
454   */
455  public void setEvaluator(ASEvaluation newEvaluator) throws IllegalArgumentException {
456
457    m_evaluator = newEvaluator;
458  }
459
460  /**
461   * Get the evaluator used as the base evaluator.
462   *
463   * @return the evaluator used as the base evaluator
464   */
465  public ASEvaluation getEvaluator() {
466
467    return m_evaluator;
468  }
469
470  /**
471   * Returns default capabilities of the classifier.
472   *
473   * @return      the capabilities of this classifier
474   */
475  public Capabilities getCapabilities() {
476    Capabilities result;
477
478    if (getEvaluator() != null) {
479      result = getEvaluator().getCapabilities();
480    } else {
481      result = new Capabilities(this);
482      result.disableAll();
483    }
484
485    // class
486    result.disableAllClasses();
487    result.disableAllClassDependencies();
488    result.enable(Capability.NOMINAL_CLASS);
489   
490    return result;
491  }
492
493  /**
494   * Generates a attribute evaluator. Has to initialize all fields of the
495   * evaluator that are not being set via options.
496   *
497   * @param data set of instances serving as training data
498   * @exception Exception if the evaluator has not been
499   * generated successfully
500   */
501  public void buildEvaluator(Instances data) throws Exception {
502    // can evaluator handle the data?
503    getCapabilities().testWithFail(data);
504
505    // remove instances with missing class
506    data = new Instances(data);
507    data.deleteWithMissingClass();
508
509    if (m_evaluator == null) {
510      throw new Exception("No base evaluator has been set!");
511    }
512
513    if (m_MatrixSource == MATRIX_ON_DEMAND) {
514      String costName = data.relationName() + CostMatrix.FILE_EXTENSION;
515      File costFile = new File(getOnDemandDirectory(), costName);
516      if (!costFile.exists()) {
517        throw new Exception("On-demand cost file doesn't exist: " + costFile);
518      }
519      setCostMatrix(new CostMatrix(new BufferedReader(
520                                                      new FileReader(costFile))));
521    } else if (m_CostMatrix == null) {
522      // try loading an old format cost file
523      m_CostMatrix = new CostMatrix(data.numClasses());
524      m_CostMatrix.readOldFormat(new BufferedReader(
525                                                    new FileReader(m_CostFile)));
526    }
527   
528    Random random = null;
529    if (!(m_evaluator instanceof WeightedInstancesHandler)) {
530      random = new Random(m_seed);
531    }
532    data = m_CostMatrix.applyCostMatrix(data, random);
533    m_evaluator.buildEvaluator(data);
534  }
535
536  /**
537   * Provides a chance for a attribute evaluator to do any special
538   * post processing of the selected attribute set.
539   *
540   * @param attributeSet the set of attributes found by the search
541   * @return a possibly ranked list of postprocessed attributes
542   * @exception Exception if postprocessing fails for some reason
543   */
544  public int [] postProcess(int [] attributeSet) 
545    throws Exception {
546    return m_evaluator.postProcess(attributeSet);
547  }
548
549  /**
550   * Output a representation of this evaluator
551   *
552   * @return a string representation of the classifier
553   */
554  public String toString() {
555
556    if (m_evaluator == null) {
557      return "CostSensitiveASEvaluation: No model built yet.";
558    }
559 
560    String result = (m_evaluator instanceof AttributeEvaluator)
561      ? "CostSensitiveAttributeEval using "
562      : "CostSensitiveSubsetEval using ";
563
564    result += "\n\n" + getEvaluatorSpec()
565      + "\n\nEvaluator\n"
566      + m_evaluator.toString()
567      + "\n\nCost Matrix\n"
568      + m_CostMatrix.toString();
569   
570    return result;
571  }
572
573  /**
574   * Returns the revision string.
575   *
576   * @return            the revision
577   */
578  public String getRevision() {
579    return RevisionUtils.extract("$Revision: 5987 $");
580  }
581}
Note: See TracBrowser for help on using the repository browser.