/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Dagging.java
 * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.meta;

import weka.classifiers.Classifier;
import weka.classifiers.AbstractClassifier;
import weka.classifiers.RandomizableSingleClassifierEnhancer;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;

import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * This meta classifier creates a number of disjoint, stratified folds out of the data and feeds each chunk of data to a copy of the supplied base classifier. Predictions are made via averaging, since all the generated base classifiers are put into the Vote meta classifier. <br/>
 * Useful for base classifiers that are quadratic or worse in time behavior, regarding number of instances in the training data. <br/>
 * <br/>
 * For more information, see: <br/>
 * Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models. In: Fourteenth international Conference on Machine Learning, San Francisco, CA, 367-375, 1997.
 * <p/>
 <!-- globalinfo-end -->
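 *
 * A minimal programmatic usage sketch (the dataset path and the fold count
 * below are illustrative, not defaults of this class):
 * <pre>
 * Instances data = new Instances(new java.io.FileReader("/some/data.arff"));
 * data.setClassIndex(data.numAttributes() - 1);
 *
 * Dagging dagging = new Dagging();
 * dagging.setNumFolds(4);
 * dagging.buildClassifier(data);
 * double[] dist = dagging.distributionForInstance(data.instance(0));
 * </pre>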
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Ting1997,
 *    address = {San Francisco, CA},
 *    author = {Ting, K. M. and Witten, I. H.},
 *    booktitle = {Fourteenth international Conference on Machine Learning},
 *    editor = {D. H. Fisher},
 *    pages = {367-375},
 *    publisher = {Morgan Kaufmann Publishers},
 *    title = {Stacking Bagged and Dagged Models},
 *    year = {1997}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -F &lt;folds&gt;
 *  The number of folds for splitting the training set into
 *  smaller chunks for the base classifier.
 *  (default 10)</pre>
 *
 * <pre> -verbose
 *  Whether to print some more information during building the
 *  classifier.
 *  (default is off)</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -W
 *  Full name of base classifier.
 *  (default: weka.classifiers.functions.SMO)</pre>
 *
 * <pre>
 * Options specific to classifier weka.classifiers.functions.SMO:
 * </pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -no-checks
 *  Turns off all checks - use with caution!
 *  Turning them off assumes that data is purely numeric, doesn't
 *  contain any missing values, and has a nominal class. Turning them
 *  off also means that no header information will be stored if the
 *  machine is linear. Finally, it also assumes that no instance has
 *  a weight equal to 0.
 *  (default: checks on)</pre>
 *
 * <pre> -C &lt;double&gt;
 *  The complexity constant C. (default 1)</pre>
 *
 * <pre> -N
 *  Whether to 0=normalize/1=standardize/2=neither. (default 0=normalize)</pre>
 *
 * <pre> -L &lt;double&gt;
 *  The tolerance parameter. (default 1.0e-3)</pre>
 *
 * <pre> -P &lt;double&gt;
 *  The epsilon for round-off error. (default 1.0e-12)</pre>
 *
 * <pre> -M
 *  Fit logistic models to SVM outputs. </pre>
 *
 * <pre> -V &lt;double&gt;
 *  The number of folds for the internal
 *  cross-validation. (default -1, use training data)</pre>
 *
 * <pre> -W &lt;double&gt;
 *  The random number seed. (default 1)</pre>
 *
 * <pre> -K &lt;classname and parameters&gt;
 *  The Kernel to use.
 *  (default: weka.classifiers.functions.supportVector.PolyKernel)</pre>
 *
 * <pre>
 * Options specific to kernel weka.classifiers.functions.supportVector.PolyKernel:
 * </pre>
 *
 * <pre> -D
 *  Enables debugging output (if available) to be printed.
 *  (default: off)</pre>
 *
 * <pre> -no-checks
 *  Turns off all checks - use with caution!
 *  (default: checks on)</pre>
 *
 * <pre> -C &lt;num&gt;
 *  The size of the cache (a prime number), 0 for full cache and
 *  -1 to turn it off.
 *  (default: 250007)</pre>
 *
 * <pre> -E &lt;num&gt;
 *  The Exponent to use.
 *  (default: 1.0)</pre>
 *
 * <pre> -L
 *  Use lower-order terms.
 *  (default: no)</pre>
 *
 <!-- options-end -->
 *
 * Options after -- are passed to the designated classifier.<p/>
 *
 * @author Bernhard Pfahringer (bernhard at cs dot waikato dot ac dot nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 5928 $
 * @see       Vote
 */
public class Dagging
  extends RandomizableSingleClassifierEnhancer
  implements TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 4560165876570074309L;

  /** the number of folds to use to split the training data */
  protected int m_NumFolds = 10;

  /** the classifier used for voting */
  protected Vote m_Vote = null;

  /** whether to output some progress information during building */
  protected boolean m_Verbose = false;

  /**
   * Returns a string describing the classifier.
   *
   * @return a description suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return 
     "This meta classifier creates a number of disjoint, stratified folds out "
     + "of the data and feeds each chunk of data to a copy of the supplied "
     + "base classifier. Predictions are made via averaging, since all the "
     + "generated base classifiers are put into the Vote meta classifier. \n"
     + "Useful for base classifiers that are quadratic or worse in time "
     + "behavior, regarding number of instances in the training data. \n"
     + "\n"
     + "For more information, see: \n"
     + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation        result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Ting, K. M. and Witten, I. H.");
    result.setValue(Field.TITLE, "Stacking Bagged and Dagged Models");
    result.setValue(Field.BOOKTITLE, "Fourteenth international Conference on Machine Learning");
    result.setValue(Field.EDITOR, "D. H. Fisher");
    result.setValue(Field.YEAR, "1997");
    result.setValue(Field.PAGES, "367-375");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann Publishers");
    result.setValue(Field.ADDRESS, "San Francisco, CA");

    return result;
  }

  /**
   * Constructor.
   */
  public Dagging() {
    m_Classifier = new weka.classifiers.functions.SMO();
  }

  /**
   * String describing default classifier.
   *
   * @return the default classifier classname
   */
  protected String defaultClassifierString() {
    return weka.classifiers.functions.SMO.class.getName();
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector result = new Vector();

    result.addElement(new Option(
        "\tThe number of folds for splitting the training set into\n"
        + "\tsmaller chunks for the base classifier.\n"
        + "\t(default 10)",
        "F", 1, "-F <folds>"));

    result.addElement(new Option(
        "\tWhether to print some more information during building the\n"
        + "\tclassifier.\n"
        + "\t(default is off)",
        "verbose", 0, "-verbose"));

    Enumeration en = super.listOptions();
    while (en.hasMoreElements())
      result.addElement(en.nextElement());

    return result.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -F &lt;folds&gt;
   *  The number of folds for splitting the training set into
   *  smaller chunks for the base classifier.
   *  (default 10)</pre>
   *
   * <pre> -verbose
   *  Whether to print some more information during building the
   *  classifier.
   *  (default is off)</pre>
   *
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 1)</pre>
   *
   * <pre> -D
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console</pre>
   *
   * <pre> -W
   *  Full name of base classifier.
   *  (default: weka.classifiers.functions.SMO)</pre>
   *
   * <pre>
   * Options specific to classifier weka.classifiers.functions.SMO:
   * </pre>
   *
   * <pre> -D
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console</pre>
   *
   * <pre> -no-checks
   *  Turns off all checks - use with caution!
   *  Turning them off assumes that data is purely numeric, doesn't
   *  contain any missing values, and has a nominal class. Turning them
   *  off also means that no header information will be stored if the
   *  machine is linear. Finally, it also assumes that no instance has
   *  a weight equal to 0.
   *  (default: checks on)</pre>
   *
   * <pre> -C &lt;double&gt;
   *  The complexity constant C. (default 1)</pre>
   *
   * <pre> -N
   *  Whether to 0=normalize/1=standardize/2=neither. (default 0=normalize)</pre>
   *
   * <pre> -L &lt;double&gt;
   *  The tolerance parameter. (default 1.0e-3)</pre>
   *
   * <pre> -P &lt;double&gt;
   *  The epsilon for round-off error. (default 1.0e-12)</pre>
   *
   * <pre> -M
   *  Fit logistic models to SVM outputs. </pre>
   *
   * <pre> -V &lt;double&gt;
   *  The number of folds for the internal
   *  cross-validation. (default -1, use training data)</pre>
   *
   * <pre> -W &lt;double&gt;
   *  The random number seed. (default 1)</pre>
   *
   * <pre> -K &lt;classname and parameters&gt;
   *  The Kernel to use.
   *  (default: weka.classifiers.functions.supportVector.PolyKernel)</pre>
   *
   * <pre>
   * Options specific to kernel weka.classifiers.functions.supportVector.PolyKernel:
   * </pre>
   *
   * <pre> -D
   *  Enables debugging output (if available) to be printed.
   *  (default: off)</pre>
   *
   * <pre> -no-checks
   *  Turns off all checks - use with caution!
   *  (default: checks on)</pre>
   *
   * <pre> -C &lt;num&gt;
   *  The size of the cache (a prime number), 0 for full cache and
   *  -1 to turn it off.
   *  (default: 250007)</pre>
   *
   * <pre> -E &lt;num&gt;
   *  The Exponent to use.
   *  (default: 1.0)</pre>
   *
   * <pre> -L
   *  Use lower-order terms.
   *  (default: no)</pre>
   *
   <!-- options-end -->
   *
   * Options after -- are passed to the designated classifier.<p/>
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String        tmpStr;

    tmpStr = Utils.getOption('F', options);
    if (tmpStr.length() != 0)
      setNumFolds(Integer.parseInt(tmpStr));
    else
      setNumFolds(10);

    setVerbose(Utils.getFlag("verbose", options));

    super.setOptions(options);
  }

  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector        result;
    String[]      options;
    int           i;

    result  = new Vector();

    result.add("-F");
    result.add("" + getNumFolds());

    if (getVerbose())
      result.add("-verbose");

    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    return (String[]) result.toArray(new String[result.size()]);
  }

  /**
   * Gets the number of folds to use for splitting the training set.
   *
   * @return the number of folds
   */
  public int getNumFolds() {
    return m_NumFolds;
  }

  /**
   * Sets the number of folds to use for splitting the training set.
   *
   * @param value     the new number of folds
   */
  public void setNumFolds(int value) {
    if (value > 0)
      m_NumFolds = value;
    else
      System.err.println(
          "At least 1 fold is necessary (provided: " + value + ")!");
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String numFoldsTipText() {
    return "The number of folds to use for splitting the training set into smaller chunks for the base classifier.";
  }

  /**
   * Set the verbose state.
   *
   * @param value the verbose state
   */
  public void setVerbose(boolean value) {
    m_Verbose = value;
  }

  /**
   * Gets the verbose state
   *
   * @return the verbose state
   */
  public boolean getVerbose() {
    return m_Verbose;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String verboseTipText() {
469    return "Whether to ouput some additional information during building.";
  }

  /**
   * Builds the dagged ensemble of base classifiers.
   *
   * @param data the training data to be used for generating the
   * ensemble
   * @throws Exception if the classifier could not be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {
    Classifier[]        base;
    int                 i;
    Instances           train;

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    m_Vote = new Vote();
    base   = new Classifier[getNumFolds()];

    // stratify data
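    // randomize first so the subsequent stratification is not biased by the
    // original instance order; stratify(k) then reorders the data so that
    // each of the k folds gets approximately the original class distribution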
    if (getNumFolds() > 1) {
      data.randomize(data.getRandomNumberGenerator(getSeed()));
      data.stratify(getNumFolds());
    }

    // generate <folds> classifiers
    for (i = 0; i < getNumFolds(); i++) {
      base[i] = makeCopy(getClassifier());

      // generate training data
      if (getNumFolds() > 1) {
        // some progress information
        if (getVerbose())
          System.out.print(".");

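        // testCV(k, i) returns the *test* split of the i-th of k folds;
        // across i = 0..k-1 these splits are disjoint and together cover
        // the data, giving one small training chunk per base classifier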
        train = data.testCV(getNumFolds(), i);
      }
      else {
        train = data;
      }

      // train classifier
      base[i].buildClassifier(train);
    }

    // init vote
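    // Vote's default combination rule averages the class probability
    // distributions of the base classifiers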
    m_Vote.setClassifiers(base);

    if (getVerbose())
      System.out.println();
  }

  /**
   * Calculates the class membership probabilities for the given test
   * instance.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if distribution can't be computed successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    return m_Vote.distributionForInstance(instance);
  }

  /**
   * Returns description of the classifier.
   *
   * @return description of the classifier as a string
   */
  public String toString() {
    if (m_Vote == null)
      return this.getClass().getName().replaceAll(".*\\.", "") 
             + ": No model built yet.";
    else
      return m_Vote.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return            the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5928 $");
  }

  /**
   * Main method for testing this class.
   *
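   * <p>Example command line (the dataset path is illustrative):
   * <pre> java weka.classifiers.meta.Dagging -t /some/data.arff -F 10 </pre>
   *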
   * @param args the options
   */
  public static void main(String[] args) {
    runClassifier(new Dagging(), args);
  }
}