source: src/main/java/weka/classifiers/rules/PART.java @ 7

Last change on this file since 7 was 4, checked in by gnappo, 14 years ago

Weka import.

File size: 20.3 KB
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    PART.java
 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.rules;

import weka.classifiers.Classifier;
import weka.classifiers.AbstractClassifier;
import weka.classifiers.rules.part.MakeDecList;
import weka.classifiers.trees.j48.BinC45ModelSelection;
import weka.classifiers.trees.j48.C45ModelSelection;
import weka.classifiers.trees.j48.ModelSelection;
import weka.core.AdditionalMeasureProducer;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Summarizable;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;

import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Class for generating a PART decision list. Uses separate-and-conquer. Builds a partial C4.5 decision tree in each iteration and makes the "best" leaf into a rule.<br/>
 * <br/>
 * For more information, see:<br/>
 * <br/>
 * Eibe Frank, Ian H. Witten: Generating Accurate Rule Sets Without Global Optimization. In: Fifteenth International Conference on Machine Learning, 144-151, 1998.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Frank1998,
 *    author = {Eibe Frank and Ian H. Witten},
 *    booktitle = {Fifteenth International Conference on Machine Learning},
 *    editor = {J. Shavlik},
 *    pages = {144-151},
 *    publisher = {Morgan Kaufmann},
 *    title = {Generating Accurate Rule Sets Without Global Optimization},
 *    year = {1998},
 *    PS = {http://www.cs.waikato.ac.nz/\~eibe/pubs/ML98-57.ps.gz}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -C &lt;pruning confidence&gt;
 *  Set confidence threshold for pruning.
 *  (default 0.25)</pre>
 *
 * <pre> -M &lt;minimum number of objects&gt;
 *  Set minimum number of objects per leaf.
 *  (default 2)</pre>
 *
 * <pre> -R
 *  Use reduced error pruning.</pre>
 *
 * <pre> -N &lt;number of folds&gt;
 *  Set number of folds for reduced error
 *  pruning. One fold is used as pruning set.
 *  (default 3)</pre>
 *
 * <pre> -B
 *  Use binary splits only.</pre>
 *
 * <pre> -U
 *  Generate unpruned decision list.</pre>
 *
 * <pre> -J
 *  Do not use MDL correction for info gain on numeric attributes.</pre>
 *
 * <pre> -Q &lt;seed&gt;
 *  Seed for random data shuffling (default 1).</pre>
 *
 <!-- options-end -->
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 6089 $
 */
public class PART
  extends AbstractClassifier
  implements OptionHandler, WeightedInstancesHandler, Summarizable,
             AdditionalMeasureProducer, TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 8121455039782598361L;

  /** The decision list */
  private MakeDecList m_root;

  /** Confidence level */
  private float m_CF = 0.25f;

  /** Minimum number of objects */
  private int m_minNumObj = 2;

  /** Use MDL correction? */
  private boolean m_useMDLcorrection = true;

  /** Use reduced error pruning? */
  private boolean m_reducedErrorPruning = false;

  /** Number of folds for reduced error pruning. */
  private int m_numFolds = 3;

  /** Binary splits on nominal attributes? */
  private boolean m_binarySplits = false;

  /** Generate unpruned list? */
  private boolean m_unpruned = false;

  /** The seed for random number generation. */
  private int m_Seed = 1;

  /**
   * Returns a string describing classifier
   * @return a description suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {

    return  "Class for generating a PART decision list. Uses "
      + "separate-and-conquer. Builds a partial C4.5 decision tree "
      + "in each iteration and makes the \"best\" leaf into a rule.\n\n"
      + "For more information, see:\n\n"
      + getTechnicalInformation().toString();
  }
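
  /*
   * Illustrative usage sketch, not part of the original class: trains a PART
   * decision list and queries predictions from it. The file name "weather.arff"
   * and the assumption that the class attribute is the last one are
   * hypothetical; adapt them to your data.
   *
   *   import weka.classifiers.rules.PART;
   *   import weka.core.Instance;
   *   import weka.core.Instances;
   *   import weka.core.converters.ConverterUtils.DataSource;
   *
   *   public class PARTUsageExample {
   *     public static void main(String[] args) throws Exception {
   *       // load data and declare the class attribute (assumed last here)
   *       Instances data = DataSource.read("weather.arff");
   *       data.setClassIndex(data.numAttributes() - 1);
   *
   *       // configure and build the rule learner
   *       PART part = new PART();
   *       part.setMinNumObj(2);            // minimum instances per rule
   *       part.setConfidenceFactor(0.25f); // C4.5-style pruning confidence
   *       part.buildClassifier(data);
   *
   *       // query the model
   *       Instance first = data.instance(0);
   *       double predicted = part.classifyInstance(first);
   *       double[] dist = part.distributionForInstance(first);
   *       System.out.println("Predicted class index: " + predicted);
   *       System.out.println("Class distribution: " + java.util.Arrays.toString(dist));
   *       System.out.println(part);        // prints the decision list
   *     }
   *   }
   */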

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation        result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Eibe Frank and Ian H. Witten");
    result.setValue(Field.TITLE, "Generating Accurate Rule Sets Without Global Optimization");
    result.setValue(Field.BOOKTITLE, "Fifteenth International Conference on Machine Learning");
    result.setValue(Field.EDITOR, "J. Shavlik");
    result.setValue(Field.YEAR, "1998");
    result.setValue(Field.PAGES, "144-151");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
    result.setValue(Field.PS, "http://www.cs.waikato.ac.nz/~eibe/pubs/ML98-57.ps.gz");

    return result;
  }

  /**
   * Returns default capabilities of the classifier.
   *
   * @return      the capabilities of this classifier
   */
  public Capabilities getCapabilities() {
    Capabilities      result;

    if (m_unpruned)
      result = new MakeDecList(null, m_minNumObj).getCapabilities();
    else if (m_reducedErrorPruning)
      result = new MakeDecList(null, m_numFolds, m_minNumObj, m_Seed).getCapabilities();
    else
      result = new MakeDecList(null, m_CF, m_minNumObj).getCapabilities();

    return result;
  }

  /**
   * Generates the classifier.
   *
   * @param instances the data to train with
   * @throws Exception if classifier can't be built successfully
   */
  public void buildClassifier(Instances instances)
       throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    ModelSelection modSelection;

    if (m_binarySplits)
      modSelection = new BinC45ModelSelection(m_minNumObj, instances, m_useMDLcorrection);
    else
      modSelection = new C45ModelSelection(m_minNumObj, instances, m_useMDLcorrection);
    if (m_unpruned)
      m_root = new MakeDecList(modSelection, m_minNumObj);
    else if (m_reducedErrorPruning)
      m_root = new MakeDecList(modSelection, m_numFolds, m_minNumObj, m_Seed);
    else
      m_root = new MakeDecList(modSelection, m_CF, m_minNumObj);
    m_root.buildClassifier(instances);
    if (m_binarySplits) {
      ((BinC45ModelSelection)modSelection).cleanup();
    } else {
      ((C45ModelSelection)modSelection).cleanup();
    }
  }
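
  // Illustrative command-line sketch, not part of the original class: the same
  // learner can be trained from a shell via Weka's standard evaluation options
  // (-t for the training file, -T for an optional test file). The file names
  // "train.arff" and "test.arff" are placeholders.
  //
  //   java weka.classifiers.rules.PART -t train.arff -T test.arff -C 0.25 -M 2
  //   java weka.classifiers.rules.PART -t train.arff -R -N 5 -Q 42
  //
  // The first call builds a C4.5-pruned list (confidence 0.25, at least 2
  // instances per rule); the second uses reduced-error pruning with 5 folds
  // and a shuffling seed of 42, matching the branches in buildClassifier above.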

  /**
   * Classifies an instance.
   *
   * @param instance the instance to classify
   * @return the classification
   * @throws Exception if instance can't be classified successfully
   */
  public double classifyInstance(Instance instance)
       throws Exception {

    return m_root.classifyInstance(instance);
  }

  /**
   * Returns class probabilities for an instance.
   *
   * @param instance the instance to get the distribution for
   * @return the class probabilities
   * @throws Exception if the distribution can't be computed successfully
   */
  public final double [] distributionForInstance(Instance instance)
       throws Exception {

    return m_root.distributionForInstance(instance);
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * Valid options are: <p>
   *
   * -C confidence <br>
   * Set confidence threshold for pruning. (Default: 0.25) <p>
   *
   * -M number <br>
   * Set minimum number of instances per leaf. (Default: 2) <p>
   *
   * -R <br>
   * Use reduced error pruning. <p>
   *
   * -N number <br>
   * Set number of folds for reduced error pruning. One fold is
   * used as the pruning set. (Default: 3) <p>
   *
   * -B <br>
   * Use binary splits for nominal attributes. <p>
   *
   * -U <br>
   * Generate unpruned decision list. <p>
   *
   * -Q <br>
   * The seed for reduced-error pruning. <p>
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(8);

    newVector.
        addElement(new Option("\tSet confidence threshold for pruning.\n" +
                              "\t(default 0.25)",
                              "C", 1, "-C <pruning confidence>"));
    newVector.
        addElement(new Option("\tSet minimum number of objects per leaf.\n" +
                              "\t(default 2)",
                              "M", 1, "-M <minimum number of objects>"));
    newVector.
        addElement(new Option("\tUse reduced error pruning.",
                              "R", 0, "-R"));
    newVector.
        addElement(new Option("\tSet number of folds for reduced error\n" +
                              "\tpruning. One fold is used as pruning set.\n" +
                              "\t(default 3)",
                              "N", 1, "-N <number of folds>"));
    newVector.
        addElement(new Option("\tUse binary splits only.",
                              "B", 0, "-B"));
    newVector.
        addElement(new Option("\tGenerate unpruned decision list.",
                              "U", 0, "-U"));
    newVector.
      addElement(new Option("\tDo not use MDL correction for info gain on numeric attributes.",
                            "J", 0, "-J"));
    newVector.
      addElement(new Option("\tSeed for random data shuffling (default 1).",
                            "Q", 1, "-Q <seed>"));

    return newVector.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -C &lt;pruning confidence&gt;
   *  Set confidence threshold for pruning.
   *  (default 0.25)</pre>
   *
   * <pre> -M &lt;minimum number of objects&gt;
   *  Set minimum number of objects per leaf.
   *  (default 2)</pre>
   *
   * <pre> -R
   *  Use reduced error pruning.</pre>
   *
   * <pre> -N &lt;number of folds&gt;
   *  Set number of folds for reduced error
   *  pruning. One fold is used as pruning set.
   *  (default 3)</pre>
   *
   * <pre> -B
   *  Use binary splits only.</pre>
   *
   * <pre> -U
   *  Generate unpruned decision list.</pre>
   *
   * <pre> -J
   *  Do not use MDL correction for info gain on numeric attributes.</pre>
   *
   * <pre> -Q &lt;seed&gt;
   *  Seed for random data shuffling (default 1).</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    // Pruning options
    m_unpruned = Utils.getFlag('U', options);
    m_reducedErrorPruning = Utils.getFlag('R', options);
    m_binarySplits = Utils.getFlag('B', options);
    m_useMDLcorrection = !Utils.getFlag('J', options);
    String confidenceString = Utils.getOption('C', options);
    if (confidenceString.length() != 0) {
      if (m_reducedErrorPruning) {
        throw new Exception("Setting CF doesn't make sense " +
                            "for reduced error pruning.");
      } else {
        m_CF = (new Float(confidenceString)).floatValue();
        if ((m_CF <= 0) || (m_CF >= 1)) {
          throw new Exception("CF has to be greater than zero and smaller than one!");
        }
      }
    } else {
      m_CF = 0.25f;
    }
    String numFoldsString = Utils.getOption('N', options);
    if (numFoldsString.length() != 0) {
      if (!m_reducedErrorPruning) {
        throw new Exception("Setting the number of folds" +
                            " does only make sense for" +
                            " reduced error pruning.");
      } else {
        m_numFolds = Integer.parseInt(numFoldsString);
      }
    } else {
      m_numFolds = 3;
    }

    // Other options
    String minNumString = Utils.getOption('M', options);
    if (minNumString.length() != 0) {
      m_minNumObj = Integer.parseInt(minNumString);
    } else {
      m_minNumObj = 2;
    }
    String seedString = Utils.getOption('Q', options);
    if (seedString.length() != 0) {
      m_Seed = Integer.parseInt(seedString);
    } else {
      m_Seed = 1;
    }
  }
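
  // Illustrative sketch, not part of the original class: options can also be
  // set programmatically from a single string using weka.core.Utils.
  //
  //   PART part = new PART();
  //   part.setOptions(weka.core.Utils.splitOptions("-R -N 5 -Q 42"));
  //   System.out.println(weka.core.Utils.joinOptions(part.getOptions()));
  //
  // Note that -C is rejected when -R is set, and -N is rejected without -R,
  // mirroring the checks in setOptions above.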

  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [12];
    int current = 0;

    if (m_unpruned) {
      options[current++] = "-U";
    }
    if (m_reducedErrorPruning) {
      options[current++] = "-R";
    }
    if (m_binarySplits) {
      options[current++] = "-B";
    }
    options[current++] = "-M"; options[current++] = "" + m_minNumObj;
    if (!m_reducedErrorPruning) {
      options[current++] = "-C"; options[current++] = "" + m_CF;
    }
    if (m_reducedErrorPruning) {
      options[current++] = "-N"; options[current++] = "" + m_numFolds;
    }
    options[current++] = "-Q"; options[current++] = "" + m_Seed;
    if (!m_useMDLcorrection) {
      options[current++] = "-J";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns a description of the classifier
   *
   * @return a string representation of the classifier
   */
  public String toString() {

    if (m_root == null) {
      return "No classifier built";
    }
    return "PART decision list\n------------------\n\n" + m_root.toString();
  }

  /**
   * Returns a superconcise version of the model
   *
   * @return a concise version of the model
   */
  public String toSummaryString() {

    return "Number of rules: " + m_root.numRules() + "\n";
  }

  /**
   * Return the number of rules.
   * @return the number of rules
   */
  public double measureNumRules() {
    return m_root.numRules();
  }

  /**
   * Returns an enumeration of the additional measure names
   * @return an enumeration of the measure names
   */
  public Enumeration enumerateMeasures() {
    Vector newVector = new Vector(1);
    newVector.addElement("measureNumRules");
    return newVector.elements();
  }

  /**
   * Returns the value of the named measure
   * @param additionalMeasureName the name of the measure to query for its value
   * @return the value of the named measure
   * @throws IllegalArgumentException if the named measure is not supported
   */
  public double getMeasure(String additionalMeasureName) {
    if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0) {
      return measureNumRules();
    } else {
      throw new IllegalArgumentException(additionalMeasureName
                          + " not supported (PART)");
    }
  }
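
  // Illustrative sketch, not part of the original class: the additional
  // measure exposed above can be queried by name once a model has been built.
  //
  //   part.buildClassifier(data);
  //   double rules = part.getMeasure("measureNumRules"); // same as measureNumRules()
  //
  // Any other measure name throws an IllegalArgumentException, as implemented
  // in getMeasure above.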

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String confidenceFactorTipText() {
    return "The confidence factor used for pruning (smaller values incur "
      + "more pruning).";
  }

  /**
   * Get the value of CF.
   *
   * @return Value of CF.
   */
  public float getConfidenceFactor() {

    return m_CF;
  }

  /**
   * Set the value of CF.
   *
   * @param v  Value to assign to CF.
   */
  public void setConfidenceFactor(float v) {

    m_CF = v;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String minNumObjTipText() {
    return "The minimum number of instances per rule.";
  }

  /**
   * Get the value of minNumObj.
   *
   * @return Value of minNumObj.
   */
  public int getMinNumObj() {

    return m_minNumObj;
  }

  /**
   * Set the value of minNumObj.
   *
   * @param v  Value to assign to minNumObj.
   */
  public void setMinNumObj(int v) {

    m_minNumObj = v;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String reducedErrorPruningTipText() {
    return "Whether reduced-error pruning is used instead of C4.5 pruning.";
  }

  /**
   * Get the value of reducedErrorPruning.
   *
   * @return Value of reducedErrorPruning.
   */
  public boolean getReducedErrorPruning() {

    return m_reducedErrorPruning;
  }

  /**
   * Set the value of reducedErrorPruning.
   *
   * @param v  Value to assign to reducedErrorPruning.
   */
  public void setReducedErrorPruning(boolean v) {

    m_reducedErrorPruning = v;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String unprunedTipText() {
    return "Whether pruning is performed.";
  }

  /**
   * Get the value of unpruned.
   *
   * @return Value of unpruned.
   */
  public boolean getUnpruned() {

    return m_unpruned;
  }

  /**
   * Set the value of unpruned.
   *
   * @param newunpruned Value to assign to unpruned.
   */
  public void setUnpruned(boolean newunpruned) {

    m_unpruned = newunpruned;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String useMDLcorrectionTipText() {
    return "Whether MDL correction is used when finding splits on numeric attributes.";
  }

  /**
   * Get the value of useMDLcorrection.
   *
   * @return Value of useMDLcorrection.
   */
  public boolean getUseMDLcorrection() {

    return m_useMDLcorrection;
  }

  /**
   * Set the value of useMDLcorrection.
   *
   * @param newuseMDLcorrection Value to assign to useMDLcorrection.
   */
  public void setUseMDLcorrection(boolean newuseMDLcorrection) {

    m_useMDLcorrection = newuseMDLcorrection;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numFoldsTipText() {
    return "Determines the amount of data used for reduced-error pruning. "
      + "One fold is used for pruning, the rest for growing the rules.";
  }

  /**
   * Get the value of numFolds.
   *
   * @return Value of numFolds.
   */
  public int getNumFolds() {

    return m_numFolds;
  }

  /**
   * Set the value of numFolds.
   *
   * @param v  Value to assign to numFolds.
   */
  public void setNumFolds(int v) {

    m_numFolds = v;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "The seed used for randomizing the data " +
      "when reduced-error pruning is used.";
  }

  /**
   * Get the value of Seed.
   *
   * @return Value of Seed.
   */
  public int getSeed() {

    return m_Seed;
  }

  /**
   * Set the value of Seed.
   *
   * @param newSeed Value to assign to Seed.
   */
  public void setSeed(int newSeed) {

    m_Seed = newSeed;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String binarySplitsTipText() {
    return "Whether to use binary splits on nominal attributes when "
      + "building the partial trees.";
  }

  /**
   * Get the value of binarySplits.
   *
   * @return Value of binarySplits.
   */
  public boolean getBinarySplits() {

    return m_binarySplits;
  }

  /**
   * Set the value of binarySplits.
   *
   * @param v  Value to assign to binarySplits.
   */
  public void setBinarySplits(boolean v) {

    m_binarySplits = v;
  }

  /**
   * Returns the revision string.
   *
   * @return            the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 6089 $");
  }

  /**
   * Main method for testing this class.
   *
   * @param argv command line options
   */
  public static void main(String [] argv){
    runClassifier(new PART(), argv);
  }
}