source: src/main/java/weka/classifiers/rules/JRip.java @ 25

Last change on this file since 25 was 4, checked in by gnappo, 14 years ago

Import of weka.

File size: 63.6 KB
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    JRip.java
 *    Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
 */

package weka.classifiers.rules;

import weka.classifiers.Classifier;
import weka.classifiers.AbstractClassifier;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Copyable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.ClassOrder;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * This class implements a propositional rule learner, Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was proposed by William W. Cohen as an optimized version of IREP. <br/>
 * <br/>
 * The algorithm is briefly described as follows: <br/>
 * <br/>
 * Initialize RS = {}, and for each class from the less prevalent one to the more frequent one, DO: <br/>
 * <br/>
 * 1. Building stage:<br/>
 * Repeat 1.1 and 1.2 until the description length (DL) of the ruleset and examples is 64 bits greater than the smallest DL met so far, or there are no positive examples, or the error rate &gt;= 50%. <br/>
 * <br/>
 * 1.1. Grow phase:<br/>
 * Grow one rule by greedily adding antecedents (or conditions) to the rule until the rule is perfect (i.e. 100% accurate).  The procedure tries every possible value of each attribute and selects the condition with the highest information gain: p(log(p/t)-log(P/T)).<br/>
 * <br/>
 * 1.2. Prune phase:<br/>
 * Incrementally prune each rule and allow the pruning of any final sequence of the antecedents. The pruning metric is (p-n)/(p+n) -- but since this is equivalent to 2p/(p+n) - 1, this implementation simply uses p/(p+n) (actually (p+1)/(p+n+2), so that it is 0.5 when p+n is 0).<br/>
 * <br/>
 * 2. Optimization stage:<br/>
 * After generating the initial ruleset {Ri}, generate and prune two variants of each rule Ri from randomized data using procedures 1.1 and 1.2. One variant is generated from an empty rule, while the other is generated by greedily adding antecedents to the original rule. Moreover, the pruning metric used here is (TP+TN)/(P+N). Then the smallest possible DL for each variant and the original rule is computed. The variant with the minimal DL is selected as the final representative of Ri in the ruleset. After all the rules in {Ri} have been examined, if there are still residual positives, more rules are generated based on the residual positives using the Building stage again. <br/>
 * 3. Delete the rules from the ruleset that would increase the DL of the whole ruleset if they were in it, and add the resultant ruleset to RS. <br/>
 * ENDDO<br/>
 * <br/>
 * Note that there seem to be 2 bugs in the original ripper program that would affect the ruleset size and accuracy slightly.  This implementation avoids these bugs and thus is a little bit different from Cohen's original implementation. Even after fixing the bugs, since the order of classes with the same frequency is not defined in ripper, there still seems to be some trivial difference between this implementation and the original ripper, especially for the audiology data in the UCI repository, where there are lots of classes with few instances.<br/>
 * <br/>
 * For details, please see:<br/>
 * <br/>
 * William W. Cohen: Fast Effective Rule Induction. In: Twelfth International Conference on Machine Learning, 115-123, 1995.<br/>
 * <br/>
 * PS.  We have compared this implementation with the original ripper implementation with respect to accuracy, ruleset size and running time on both the artificial data "ab+bcd+defg" and UCI datasets.  In all these aspects it seems to be quite comparable to the original ripper implementation.  However, we didn't consider memory consumption optimization in this implementation.<br/>
 * <br/>
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Cohen1995,
 *    author = {William W. Cohen},
 *    booktitle = {Twelfth International Conference on Machine Learning},
 *    pages = {115-123},
 *    publisher = {Morgan Kaufmann},
 *    title = {Fast Effective Rule Induction},
 *    year = {1995}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -F &lt;number of folds&gt;
 *  Set number of folds for REP
 *  One fold is used as pruning set.
 *  (default 3)</pre>
 *
 * <pre> -N &lt;min. weights&gt;
 *  Set the minimal weights of instances
 *  within a split.
 *  (default 2.0)</pre>
 *
 * <pre> -O &lt;number of runs&gt;
 *  Set the number of runs of
 *  optimizations. (Default: 2)</pre>
 *
 * <pre> -D
 *  Set whether to turn on debug mode
 *  (Default: false)</pre>
 *
 * <pre> -S &lt;seed&gt;
 *  The seed of randomization
 *  (Default: 1)</pre>
 *
 * <pre> -E
 *  Whether NOT to check the error rate &gt;= 0.5
 *  in the stopping criterion (default: check)</pre>
 *
 * <pre> -P
 *  Whether NOT to use pruning
 *  (default: use pruning)</pre>
 *
 <!-- options-end -->
 *
 * @author Xin Xu (xx5@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 6041 $
 */
public class JRip
  extends AbstractClassifier
  implements AdditionalMeasureProducer,
             WeightedInstancesHandler,
             TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = -6589312996832147161L;

  /** The limit of description length surplus in ruleset generation */
  private static double MAX_DL_SURPLUS = 64.0;
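  // Note: this is the "64 bits" surplus referred to in the class
  // documentation; during the building stage, rule generation for a class
  // stops once the DL of the ruleset and examples exceeds the smallest DL
  // met so far by more than this amount.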

  /** The class attribute of the data */
  private Attribute m_Class;

  /** The ruleset */
  private FastVector m_Ruleset;

  /** The predicted class distribution */
  private FastVector m_Distributions;

  /** Runs of optimizations */
  private int m_Optimizations = 2;

  /** Random object used in this class */
  private Random m_Random = null;

  /** # of all the possible conditions in a rule */
  private double m_Total = 0;

  /** The seed to perform randomization */
  private long m_Seed = 1;

  /** The number of folds to split data into Grow and Prune for IREP */
  private int m_Folds = 3;

  /** The minimal number of instance weights within a split */
  private double m_MinNo = 2.0;

  /** Whether in a debug mode */
  private boolean m_Debug = false;

  /** Whether to check the error rate >= 0.5 in the stopping criterion */
  private boolean m_CheckErr = true;

  /** Whether to use pruning, i.e. whether the data is clean or not */
  private boolean m_UsePruning = true;

  /** The filter used to randomize the class order */
  private Filter m_Filter = null;

  /** The RuleStats for the ruleset of each class value */
  private FastVector m_RulesetStats;

  /**
   * Returns a string describing the classifier
   * @return a description suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {

    return "This class implements a propositional rule learner, Repeated Incremental "
      + "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
      + "W. Cohen as an optimized version of IREP. \n\n"
      + "The algorithm is briefly described as follows: \n\n"
      + "Initialize RS = {}, and for each class from the less prevalent one to "
      + "the more frequent one, DO: \n\n"
      + "1. Building stage:\nRepeat 1.1 and 1.2 until the description length (DL) "
      + "of the ruleset and examples is 64 bits greater than the smallest DL "
      + "met so far, or there are no positive examples, or the error rate >= 50%. "
      + "\n\n"
      + "1.1. Grow phase:\n"
      + "Grow one rule by greedily adding antecedents (or conditions) to "
      + "the rule until the rule is perfect (i.e. 100% accurate).  The "
      + "procedure tries every possible value of each attribute and selects "
      + "the condition with the highest information gain: p(log(p/t)-log(P/T))."
      + "\n\n"
      + "1.2. Prune phase:\n"
      + "Incrementally prune each rule and allow the pruning of any "
      + "final sequence of the antecedents. "
      + "The pruning metric is (p-n)/(p+n) -- but since this is equivalent to "
      + "2p/(p+n) - 1, this implementation simply uses p/(p+n) "
      + "(actually (p+1)/(p+n+2), so that it is 0.5 when p+n is 0).\n\n"
      + "2. Optimization stage:\nAfter generating the initial ruleset {Ri}, "
      + "generate and prune two variants of each rule Ri from randomized data "
      + "using procedures 1.1 and 1.2. One variant is generated from an "
      + "empty rule, while the other is generated by greedily adding antecedents "
      + "to the original rule. Moreover, the pruning metric used here is "
      + "(TP+TN)/(P+N). "
      + "Then the smallest possible DL for each variant and the original rule "
      + "is computed.  The variant with the minimal DL is selected as the final "
      + "representative of Ri in the ruleset. "
      + "After all the rules in {Ri} have been examined, if there are still "
      + "residual positives, more rules are generated based on the residual "
      + "positives using the Building stage again. \n"
      + "3. Delete the rules from the ruleset that would increase the DL of the "
      + "whole ruleset if they were in it, and add the resultant ruleset to RS. \n"
      + "ENDDO\n\n"
      + "Note that there seem to be 2 bugs in the original ripper program that would "
      + "affect the ruleset size and accuracy slightly.  This implementation avoids "
      + "these bugs and thus is a little bit different from Cohen's original "
      + "implementation. Even after fixing the bugs, since the order of classes with "
      + "the same frequency is not defined in ripper, there still seems to be "
      + "some trivial difference between this implementation and the original ripper, "
      + "especially for the audiology data in the UCI repository, where there are lots of "
      + "classes with few instances.\n\n"
      + "For details, please see:\n\n"
      + getTechnicalInformation().toString() + "\n\n"
      + "PS.  We have compared this implementation with the original ripper "
      + "implementation with respect to accuracy, ruleset size and running time "
      + "on both the artificial data \"ab+bcd+defg\" and UCI datasets.  In all these "
      + "aspects it seems to be quite comparable to the original ripper "
      + "implementation.  However, we didn't consider memory consumption "
      + "optimization in this implementation.\n\n";
  }
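
  /**
   * Illustrative sketch only (this helper is not part of the original JRip
   * code): a worked form of the grow-phase information gain
   * p*(log(p/t) - log(P/T)) described above, using the same +1 smoothing
   * as the splitData implementations further down, where the rule accuracy
   * is (p+1)/(t+1) and defAcRt is the smoothed default accuracy of the
   * growing data.
   */
  private static double exampleGrowInfoGain(double p, double t,
                                            double defAcRt) {
    double accuRate = (p + 1.0) / (t + 1.0); // smoothed accuracy of the test
    // Weight the log-ratio by p, the correctly covered weight, mirroring
    // fstAccu*(Utils.log2(fstAccuRate)-Utils.log2(defAcRt)) below.
    return p * (Utils.log2(accuRate) - Utils.log2(defAcRt));
  }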

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation        result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "William W. Cohen");
    result.setValue(Field.TITLE, "Fast Effective Rule Induction");
    result.setValue(Field.BOOKTITLE, "Twelfth International Conference on Machine Learning");
    result.setValue(Field.YEAR, "1995");
    result.setValue(Field.PAGES, "115-123");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");

    return result;
  }

  /**
   * Returns an enumeration describing the available options.
   * Valid options are: <p>
   *
   * -F number <br>
   * The number of folds for reduced error pruning. One fold is
   * used as the pruning set. (Default: 3) <p>
   *
   * -N number <br>
   * The minimal weights of instances within a split.
   * (Default: 2) <p>
   *
   * -O number <br>
   * Set the number of runs of optimizations. (Default: 2)<p>
   *
   * -D <br>
   * Whether to turn on debug mode. <p>
   *
   * -S number <br>
   * The seed of randomization used in Ripper. (Default: 1)<p>
   *
   * -E <br>
   * Whether NOT to check the error rate >= 0.5 in the stopping criterion.
   * (default: check)<p>
   *
   * -P <br>
   * Whether NOT to use pruning. (default: use pruning)<p>
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(7);
    newVector.addElement(new Option("\tSet number of folds for REP\n" +
                                    "\tOne fold is used as pruning set.\n" +
                                    "\t(default 3)","F", 1, "-F <number of folds>"));
    newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
                                    "\twithin a split.\n" +
                                    "\t(default 2.0)","N", 1, "-N <min. weights>"));
    newVector.addElement(new Option("\tSet the number of runs of\n"+
                                    "\toptimizations. (Default: 2)", "O",
                                    1,"-O <number of runs>"));

    newVector.addElement(new Option("\tSet whether to turn on debug mode\n"+
                                    "\t(Default: false)", "D",
                                    0,"-D"));

    newVector.addElement(new Option("\tThe seed of randomization\n"+
                                    "\t(Default: 1)", "S",
                                    1,"-S <seed>"));

    newVector.addElement(new Option("\tWhether NOT to check the error rate >= 0.5\n"
                                    +"\tin the stopping criterion\n"
                                    +"\t(default: check)", "E",
                                    0, "-E"));

    newVector.addElement(new Option("\tWhether NOT to use pruning\n"
                                    +"\t(default: use pruning)", "P",
                                    0, "-P"));
    return newVector.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -F &lt;number of folds&gt;
   *  Set number of folds for REP
   *  One fold is used as pruning set.
   *  (default 3)</pre>
   *
   * <pre> -N &lt;min. weights&gt;
   *  Set the minimal weights of instances
   *  within a split.
   *  (default 2.0)</pre>
   *
   * <pre> -O &lt;number of runs&gt;
   *  Set the number of runs of
   *  optimizations. (Default: 2)</pre>
   *
   * <pre> -D
   *  Set whether to turn on debug mode
   *  (Default: false)</pre>
   *
   * <pre> -S &lt;seed&gt;
   *  The seed of randomization
   *  (Default: 1)</pre>
   *
   * <pre> -E
   *  Whether NOT to check the error rate &gt;= 0.5
   *  in the stopping criterion (default: check)</pre>
   *
   * <pre> -P
   *  Whether NOT to use pruning
   *  (default: use pruning)</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String numFoldsString = Utils.getOption('F', options);
    if (numFoldsString.length() != 0)
      m_Folds = Integer.parseInt(numFoldsString);
    else
      m_Folds = 3;

    String minNoString = Utils.getOption('N', options);
    if (minNoString.length() != 0)
      m_MinNo = Double.parseDouble(minNoString);
    else
      m_MinNo = 2.0;

    String seedString = Utils.getOption('S', options);
    if (seedString.length() != 0)
      m_Seed = Long.parseLong(seedString);
    else
      m_Seed = 1;

    String runString = Utils.getOption('O', options);
    if (runString.length() != 0)
      m_Optimizations = Integer.parseInt(runString);
    else
      m_Optimizations = 2;

    m_Debug = Utils.getFlag('D', options);
    m_CheckErr = !Utils.getFlag('E', options);
    m_UsePruning = !Utils.getFlag('P', options);
  }
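
  // Example (a sketch, not part of the original file): the parsing above
  // corresponds to an option string such as
  //   -F 4 -N 2.0 -O 3 -S 42 -E
  // i.e. 4 REP folds, a minimum split weight of 2.0, 3 optimization runs,
  // seed 42, and the error-rate >= 0.5 stopping check disabled.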

  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [11];
    int current = 0;
    options[current++] = "-F"; options[current++] = "" + m_Folds;
    options[current++] = "-N"; options[current++] = "" + m_MinNo;
    options[current++] = "-O"; options[current++] = "" + m_Optimizations;
    options[current++] = "-S"; options[current++] = "" + m_Seed;

    if(m_Debug)
      options[current++] = "-D";

    if(!m_CheckErr)
      options[current++] = "-E";

    if(!m_UsePruning)
      options[current++] = "-P";

    while(current < options.length)
      options[current++] = "";

    return options;
  }

  /**
   * Returns an enumeration of the additional measure names
   * @return an enumeration of the measure names
   */
  public Enumeration enumerateMeasures() {
    Vector newVector = new Vector(1);
    newVector.addElement("measureNumRules");
    return newVector.elements();
  }

  /**
   * Returns the value of the named measure
   * @param additionalMeasureName the name of the measure to query for its value
   * @return the value of the named measure
   * @throws IllegalArgumentException if the named measure is not supported
   */
  public double getMeasure(String additionalMeasureName) {
    if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0)
      return m_Ruleset.size();
    else
      throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String foldsTipText() {
    return "Determines the amount of data used for pruning. One fold is used for "
      + "pruning, the rest for growing the rules.";
  }

  /**
   * Sets the number of folds to use
   *
   * @param fold the number of folds
   */
  public void setFolds(int fold) {
    m_Folds = fold;
  }

  /**
   * Gets the number of folds
   *
   * @return the number of folds
   */
  public int getFolds(){
    return m_Folds;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String minNoTipText() {
    return "The minimum total weight of the instances in a rule.";
  }

  /**
   * Sets the minimum total weight of the instances in a rule
   *
   * @param m the minimum total weight of the instances in a rule
   */
  public void setMinNo(double m) {
    m_MinNo = m;
  }

  /**
   * Gets the minimum total weight of the instances in a rule
   *
   * @return the minimum total weight of the instances in a rule
   */
  public double getMinNo(){
    return m_MinNo;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "The seed used for randomizing the data.";
  }

  /**
   * Sets the seed value to use in randomizing the data
   *
   * @param s the new seed value
   */
  public void setSeed(long s) {
    m_Seed = s;
  }

  /**
   * Gets the current seed value to use in randomizing the data
   *
   * @return the seed value
   */
  public long getSeed(){
    return m_Seed;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String optimizationsTipText() {
    return "The number of optimization runs.";
  }

  /**
   * Sets the number of optimization runs
   *
   * @param run the number of optimization runs
   */
  public void setOptimizations(int run) {
    m_Optimizations = run;
  }

  /**
   * Gets the number of optimization runs
   *
   * @return the number of optimization runs
   */
  public int getOptimizations() {
    return m_Optimizations;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String debugTipText() {
    return "Whether debug information is output to the console.";
  }

  /**
   * Sets whether debug information is output to the console
   *
   * @param d whether debug information is output to the console
   */
  public void setDebug(boolean d) {
    m_Debug = d;
  }

  /**
   * Gets whether debug information is output to the console
   *
   * @return whether debug information is output to the console
   */
  public boolean getDebug(){
    return m_Debug;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String checkErrorRateTipText() {
    return "Whether the check for error rate >= 1/2 is included" +
      " in the stopping criterion.";
  }

  /**
   * Sets whether the check for error rate is included in the stopping criterion
   *
   * @param d whether the check for error rate is included in the stopping criterion
   */
  public void setCheckErrorRate(boolean d) {
    m_CheckErr = d;
  }

  /**
   * Gets whether the check for error rate is included in the stopping criterion
   *
   * @return true if the check for error rate is included in the stopping criterion
   */
  public boolean getCheckErrorRate(){
    return m_CheckErr;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String usePruningTipText() {
    return "Whether pruning is performed.";
  }

  /**
   * Sets whether pruning is performed
   *
   * @param d Whether pruning is performed
   */
  public void setUsePruning(boolean d) {
    m_UsePruning = d;
  }

  /**
   * Gets whether pruning is performed
   *
   * @return true if pruning is performed
   */
  public boolean getUsePruning(){
    return m_UsePruning;
  }

  /**
   * Get the ruleset generated by Ripper
   *
   * @return the ruleset
   */
  public FastVector getRuleset(){ return m_Ruleset; }

  /**
   * Get the statistics of the ruleset in the given position
   *
   * @param pos the position of the stats, assumed to be valid
   * @return the statistics of the ruleset in the given position
   */
  public RuleStats getRuleStats(int pos) {
    return (RuleStats)m_RulesetStats.elementAt(pos);
  }

  /**
   * A single antecedent in the rule, which is composed of an attribute and
   * the corresponding value.  There are two derived classes, namely NumericAntd
   * and NominalAntd, in which the attributes are numeric and nominal respectively.
   */
  private abstract class Antd
    implements WeightedInstancesHandler, Copyable, Serializable, RevisionHandler {

    /** for serialization */
    private static final long serialVersionUID = -8929754772994154334L;

    /** The attribute of the antecedent */
    protected Attribute att;

    /** The attribute value of the antecedent.
       For a numeric attribute, the value is either 0 (1st bag) or 1 (2nd bag) */
    protected double value;

    /** The maximum infoGain achieved by this antecedent test
     * on the growing data */
    protected double maxInfoGain;

    /** The accuracy rate of this antecedent test on the growing data */
    protected double accuRate;

    /** The coverage of this antecedent on the growing data */
    protected double cover;

    /** The weight of instances accurately covered by this antecedent
     * on the growing data */
    protected double accu;

    /**
     * Constructor
     */
    public Antd(Attribute a){
      att=a;
      value=Double.NaN;
      maxInfoGain = 0;
      accuRate = Double.NaN;
      cover = Double.NaN;
      accu = Double.NaN;
    }

    /* The abstract members for inheritance */
    public abstract Instances[] splitData(Instances data, double defAcRt,
                                          double cla);
    public abstract boolean covers(Instance inst);
    public abstract String toString();

    /**
     * Implements Copyable
     *
     * @return a copy of this object
     */
    public abstract Object copy();

    /* Get functions of this antecedent */
    public Attribute getAttr(){ return att; }
    public double getAttrValue(){ return value; }
    public double getMaxInfoGain(){ return maxInfoGain; }
    public double getAccuRate(){ return accuRate; }
    public double getAccu(){ return accu; }
    public double getCover(){ return cover; }

    /**
     * Returns the revision string.
     *
     * @return          the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 6041 $");
    }
  }

  /**
   * The antecedent with a numeric attribute
   */
  private class NumericAntd
    extends Antd {

    /** for serialization */
    static final long serialVersionUID = 5699457269983735442L;

    /** The split point for this numeric antecedent */
    private double splitPoint;

    /**
     * Constructor
     */
    public NumericAntd(Attribute a){
      super(a);
      splitPoint = Double.NaN;
    }

    /**
     * Get the split point of this numeric antecedent
     *
     * @return the split point of this numeric antecedent
     */
    public double getSplitPoint(){
      return splitPoint;
    }

    /**
     * Implements Copyable
     *
     * @return a copy of this object
     */
    public Object copy(){
      NumericAntd na = new NumericAntd(getAttr());
      na.value = this.value;
      na.splitPoint = this.splitPoint;
      return na;
    }

    /**
     * Implements the splitData function.
     * This procedure splits the data into two bags according
     * to the information gain of the numeric attribute value.
     * The maximum infoGain is also calculated.
     *
     * @param insts the data to be split
     * @param defAcRt the default accuracy rate of the data
     * @param cl the class label to be predicted
     * @return the array of data after the split
     */
    public Instances[] splitData(Instances insts, double defAcRt,
                                 double cl){
      Instances data = insts;
      int total=data.numInstances(); // Total number of instances without
                                     // missing values for att

      int split=1;                  // Current split position
      int prev=0;                   // Previous split position
      int finalSplit=split;         // Final split position
      maxInfoGain = 0;
      value = 0;

      double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0;

      data.sort(att);
      // Find the last instance without a missing value (the data is sorted,
      // so instances with missing values come last); everything before it
      // initially belongs to the second bag
      for(int x=0; x<data.numInstances(); x++){
        Instance inst = data.instance(x);
        if(inst.isMissing(att)){
          total = x;
          break;
        }

        sndCover += inst.weight();
        if(Utils.eq(inst.classValue(), cl))
          sndAccu += inst.weight();
      }

      if(total == 0) return null; // Data all missing for the attribute
      splitPoint = data.instance(total-1).value(att);

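      // The sweep below moves instances, one distinct attribute value at a
      // time, from the second bag into the first bag, re-evaluating the
      // smoothed accuracy and information gain of both bags at each
      // candidate split point.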
      for(; split <= total; split++){
        if((split == total) ||
           (data.instance(split).value(att) > // Can't split within
            data.instance(prev).value(att))){ // same value

          for(int y=prev; y<split; y++){
            Instance inst = data.instance(y);
            fstCover += inst.weight();
            if(Utils.eq(data.instance(y).classValue(), cl)){
              fstAccu += inst.weight();  // First bag positive# ++
            }
          }

          double fstAccuRate = (fstAccu+1.0)/(fstCover+1.0),
            sndAccuRate = (sndAccu+1.0)/(sndCover+1.0);

          /* Which bag has the higher information gain? */
          boolean isFirst;
          double fstInfoGain, sndInfoGain;
          double accRate, infoGain, coverage, accurate;

          fstInfoGain =
            //Utils.eq(defAcRt, 1.0) ?
            //fstAccu/(double)numConds :
            fstAccu*(Utils.log2(fstAccuRate)-Utils.log2(defAcRt));

          sndInfoGain =
            //Utils.eq(defAcRt, 1.0) ?
            //sndAccu/(double)numConds :
            sndAccu*(Utils.log2(sndAccuRate)-Utils.log2(defAcRt));

          if(fstInfoGain > sndInfoGain){
            isFirst = true;
            infoGain = fstInfoGain;
            accRate = fstAccuRate;
            accurate = fstAccu;
            coverage = fstCover;
          }
          else{
            isFirst = false;
            infoGain = sndInfoGain;
            accRate = sndAccuRate;
            accurate = sndAccu;
            coverage = sndCover;
          }

          /* Check whether this is the maximum infoGain so far */
          if(infoGain > maxInfoGain){
            splitPoint = data.instance(prev).value(att);
            value = (isFirst) ? 0 : 1;
            accuRate = accRate;
            accu = accurate;
            cover = coverage;
            maxInfoGain = infoGain;
            finalSplit = (isFirst) ? split : prev;
          }

          for(int y=prev; y<split; y++){
            Instance inst = data.instance(y);
            sndCover -= inst.weight();
            if(Utils.eq(data.instance(y).classValue(), cl)){
              sndAccu -= inst.weight();  // Second bag positive# --
            }
          }
          prev=split;
        }
      }

      /* Split the data */
      Instances[] splitData = new Instances[2];
      splitData[0] = new Instances(data, 0, finalSplit);
      splitData[1] = new Instances(data, finalSplit, total-finalSplit);

      return splitData;
    }

    /**
     * Whether the instance is covered by this antecedent
     *
     * @param inst the instance in question
     * @return the boolean value indicating whether the instance is covered
     *         by this antecedent
     */
    public boolean covers(Instance inst){
      boolean isCover=true;
      if(!inst.isMissing(att)){
        if((int)value == 0){ // First bag
          if(inst.value(att) > splitPoint)
            isCover=false;
        }
        else if(inst.value(att) < splitPoint) // Second bag
          isCover=false;
      }
      else
        isCover = false;

      return isCover;
    }

    /**
     * Prints this antecedent
     *
     * @return a textual description of this antecedent
     */
    public String toString() {
      String symbol = ((int)value == 0) ? " <= " : " >= ";
      return (att.name() + symbol + Utils.doubleToString(splitPoint, 6));
    }

    /**
     * Returns the revision string.
     *
     * @return          the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 6041 $");
    }
  }


  /**
   * The antecedent with a nominal attribute
   */
  private class NominalAntd
    extends Antd {

    /** for serialization */
    static final long serialVersionUID = -9102297038837585135L;

    /* The parameters of infoGain calculated for each attribute value
     * in the growing data */
    private double[] accurate;
    private double[] coverage;

    /**
     * Constructor
     */
    public NominalAntd(Attribute a){
      super(a);
      int bag = att.numValues();
      accurate = new double[bag];
      coverage = new double[bag];
    }

    /**
     * Implements Copyable
     *
     * @return a copy of this object
     */
    public Object copy(){
      Antd antec = new NominalAntd(getAttr());
      antec.value = this.value;
      return antec;
    }

    /**
     * Implements the splitData function.
     * This procedure splits the data into bags according
     * to the nominal attribute value.
     * The infoGain for each bag is also calculated.
     *
     * @param data the data to be split
     * @param defAcRt the default accuracy rate of the data
     * @param cl the class label to be predicted
     * @return the array of data after the split
     */
    public Instances[] splitData(Instances data, double defAcRt,
                                 double cl){
      int bag = att.numValues();
      Instances[] splitData = new Instances[bag];

      for(int x=0; x<bag; x++){
        splitData[x] = new Instances(data, data.numInstances());
        accurate[x] = 0;
        coverage[x] = 0;
      }

      for(int x=0; x<data.numInstances(); x++){
        Instance inst=data.instance(x);
        if(!inst.isMissing(att)){
          int v = (int)inst.value(att);
          splitData[v].add(inst);
          coverage[v] += inst.weight();
          if((int)inst.classValue() == (int)cl)
            accurate[v] += inst.weight();
        }
      }

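      // For each bag compute the smoothed accuracy p/t, with
      // p = accurate[x]+1 and t = coverage[x]+1, and the information gain
      // accurate[x]*(log2(p/t) - log2(defAcRt)); the best bag is kept.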
      for(int x=0; x<bag; x++){
        double t = coverage[x]+1.0;
        double p = accurate[x] + 1.0;
        double infoGain =
          //Utils.eq(defAcRt, 1.0) ?
          //accurate[x]/(double)numConds :
          accurate[x]*(Utils.log2(p/t)-Utils.log2(defAcRt));

        if(infoGain > maxInfoGain){
          maxInfoGain = infoGain;
          cover = coverage[x];
          accu = accurate[x];
          accuRate = p/t;
          value = (double)x;
        }
      }

      return splitData;
    }

    /**
     * Whether the instance is covered by this antecedent
     *
     * @param inst the instance in question
     * @return the boolean value indicating whether the instance is
     *         covered by this antecedent
     */
    public boolean covers(Instance inst){
      boolean isCover=false;
      if(!inst.isMissing(att)){
        if((int)inst.value(att) == (int)value)
          isCover=true;
      }
      return isCover;
    }

    /**
     * Prints this antecedent
     *
     * @return a textual description of this antecedent
     */
    public String toString() {
      return (att.name() + " = " +att.value((int)value));
    }

    /**
     * Returns the revision string.
     *
     * @return          the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 6041 $");
    }
  }


  /**
   * This class implements a single rule that predicts the specified class.
   *
   * A rule consists of antecedents "AND"ed together and the consequent
   * (class value) for the classification.
   * In this class, the Information Gain (p*[log(p/t) - log(P/T)]) is used to
   * select an antecedent and Reduced Error Pruning (REP) with the metric
   * of accuracy rate p/(p+n) or (TP+TN)/(P+N) is used to prune the rule.
   */
  protected class RipperRule
    extends Rule {

    /** for serialization */
    static final long serialVersionUID = -2410020717305262952L;

    /** The internal representation of the class label to be predicted */
    private double m_Consequent = -1;

    /** The vector of antecedents of this rule */
    protected FastVector m_Antds = null;

    /** Constructor */
    public RipperRule(){
      m_Antds = new FastVector();
    }

    /**
     * Sets the internal representation of the class label to be predicted
     *
     * @param cl the internal representation of the class label to be predicted
     */
    public void setConsequent(double cl) {
      m_Consequent = cl;
    }

    /**
     * Gets the internal representation of the class label to be predicted
     *
     * @return the internal representation of the class label to be predicted
     */
    public double getConsequent() {
      return m_Consequent;
    }

    /**
     * Get a shallow copy of this rule
     *
     * @return the copy
     */
    public Object copy(){
      RipperRule copy = new RipperRule();
      copy.setConsequent(getConsequent());
      copy.m_Antds = (FastVector)this.m_Antds.copyElements();
      return copy;
    }

    /**
     * Whether the instance is covered by this rule
     *
     * @param datum the instance in question
     * @return the boolean value indicating whether the instance
     *         is covered by this rule
     */
    public boolean covers(Instance datum){
      boolean isCover=true;

      for(int i=0; i<m_Antds.size(); i++){
        Antd antd = (Antd)m_Antds.elementAt(i);
        if(!antd.covers(datum)){
          isCover = false;
          break;
        }
      }

      return isCover;
    }

    /**
     * Whether this rule has antecedents, i.e. whether it is not a default rule
     *
     * @return the boolean value indicating whether the rule has antecedents
     */
    public boolean hasAntds(){
      if (m_Antds == null)
        return false;
      else
        return (m_Antds.size() > 0);
    }

    /**
     * The number of antecedents of the rule
     *
     * @return the size of this rule
     */
    public double size(){ return (double)m_Antds.size(); }


    /**
     * Private function to compute the default number of accurate instances
     * in the specified data for the consequent of the rule
     *
     * @param data the data in question
     * @return the default accuracy number
     */
    private double computeDefAccu(Instances data){
      double defAccu=0;
      for(int i=0; i<data.numInstances(); i++){
        Instance inst = data.instance(i);
        if((int)inst.classValue() == (int)m_Consequent)
          defAccu += inst.weight();
      }
      return defAccu;
    }


    /**
     * Build one rule using the growing data
     *
     * @param data the growing data used to build the rule
     * @throws Exception if the consequent is not set yet
     */
    public void grow(Instances data) throws Exception {
      if(m_Consequent == -1)
        throw new Exception("Consequent not set yet.");

      Instances growData = data;
      double sumOfWeights = growData.sumOfWeights();
      if(!Utils.gr(sumOfWeights, 0.0))
        return;

      /* Compute the default accuracy rate of the growing data */
      double defAccu = computeDefAccu(growData);
      double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0);

      /* Keep a record of which attributes have already been used */
      boolean[] used=new boolean [growData.numAttributes()];
      for (int k=0; k<used.length; k++)
        used[k]=false;
      int numUnused=used.length;

      // If there are already antecedents existing
      for(int j=0; j < m_Antds.size(); j++){
        Antd antdj = (Antd)m_Antds.elementAt(j);
        if(!antdj.getAttr().isNumeric()){
          used[antdj.getAttr().index()]=true;
          numUnused--;
        }
      }

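      // Greedy covering loop: keep adding the single antecedent with the
      // highest information gain until the rule is pure (defAcRt reaches
      // 1.0), no growing instances remain, or no unused attributes are left.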
      double maxInfoGain;
      while (Utils.gr(growData.numInstances(), 0.0) &&
             (numUnused > 0) &&
             Utils.sm(defAcRt, 1.0)){

        // We require that infoGain be positive
        /*if(numAntds == originalSize)
          maxInfoGain = 0.0; // At least one condition allowed
          else
          maxInfoGain = Utils.eq(defAcRt, 1.0) ?
          defAccu/(double)numAntds : 0.0; */
        maxInfoGain = 0.0;

        /* Build a list of antecedents */
        Antd oneAntd=null;
        Instances coverData = null;
        Enumeration enumAttr=growData.enumerateAttributes();

        /* Build one condition based on all attributes not used yet */
        while (enumAttr.hasMoreElements()){
          Attribute att= (Attribute)(enumAttr.nextElement());

          if(m_Debug)
            System.err.println("\nOne condition: size = "
                + growData.sumOfWeights());

          Antd antd =null;
          if(att.isNumeric())
            antd = new NumericAntd(att);
          else
            antd = new NominalAntd(att);

          if(!used[att.index()]){
            /* Compute the best information gain for each attribute,
               it's stored in the antecedent formed by this attribute.
               This procedure returns the data covered by the antecedent */
            Instances coveredData = computeInfoGain(growData, defAcRt,
                antd);
            if(coveredData != null){
              double infoGain = antd.getMaxInfoGain();
              if(m_Debug)
                System.err.println("Test of \'"+antd.toString()+
                    "\': infoGain = "+
                    infoGain + " | Accuracy = " +
                    antd.getAccuRate()+
                    "="+antd.getAccu()
                    +"/"+antd.getCover()+
                    " def. accuracy: "+defAcRt);

              if(infoGain > maxInfoGain){
                oneAntd=antd;
                coverData = coveredData;
                maxInfoGain = infoGain;
              }
            }
          }
        }

        if(oneAntd == null) break; // Cannot find antds
        if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break; // Too low coverage

        // Numeric attributes can be used more than once
        if(!oneAntd.getAttr().isNumeric()){
          used[oneAntd.getAttr().index()]=true;
          numUnused--;
        }

        m_Antds.addElement(oneAntd);
        growData = coverData; // Grow data size is shrinking
        defAcRt = oneAntd.getAccuRate();
      }
    }


    /**
     * Compute the best information gain for the specified antecedent
     *
     * @param instances the data based on which the infoGain is computed
     * @param defAcRt the default accuracy rate of the data
     * @param antd the specific antecedent
     * @return the data covered by the antecedent
     */
    private Instances computeInfoGain(Instances instances, double defAcRt,
                                      Antd antd){
      Instances data = instances;

      /* Split the data into bags.
         The information gain of each bag is also calculated in this procedure */
      Instances[] splitData = antd.splitData(data, defAcRt,
                                             m_Consequent);

      /* Get the bag of data to be used for the next antecedents */
      if(splitData != null)
        return splitData[(int)antd.getAttrValue()];
      else return null;
    }

    /**
     * Prune all the possible final sequences of the rule using the
     * pruning data.  The measure used to prune the rule is based on
     * the flag given.
     *
     * @param pruneData the pruning data used to prune the rule
     * @param useWhole flag to indicate whether to use the error rate of
     *                 the whole pruning data instead of the data covered
     */
    public void prune(Instances pruneData, boolean useWhole){
      Instances data = pruneData;

      double total = data.sumOfWeights();
      if(!Utils.gr(total, 0.0))
        return;

      /* The default accurate # and rate on the pruning data */
      double defAccu=computeDefAccu(data);

      if(m_Debug)
        System.err.println("Pruning with " + defAccu +
                           " positive data out of " + total +
                           " instances");

      int size=m_Antds.size();
      if(size == 0) return; // Default rule before pruning

      double[] worthRt = new double[size];
      double[] coverage = new double[size];
      double[] worthValue = new double[size];
      for(int w=0; w<size; w++){
        worthRt[w]=coverage[w]=worthValue[w]=0.0;
      }

      /* Calculate accuracy parameters for all the antecedents in this rule */
      double tn = 0.0; // True negatives if useWhole
      for(int x=0; x<size; x++){
        Antd antd=(Antd)m_Antds.elementAt(x);
        Instances newData = data;
        data = new Instances(newData, 0); // Make data empty

        for(int y=0; y<newData.numInstances(); y++){
          Instance ins=newData.instance(y);

          if(antd.covers(ins)){   // Covered by this antecedent
            coverage[x] += ins.weight();
            data.add(ins);                 // Add to data for further pruning
            if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
              worthValue[x] += ins.weight();
          }
          else if(useWhole){ // Not covered
            if((int)ins.classValue() != (int)m_Consequent)
              tn += ins.weight();
          }
        }

        if(useWhole){
          worthValue[x] += tn;
          worthRt[x] = worthValue[x] / total;
        }
        else // Note if coverage is 0, accuracy is 0.5
          worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0);
      }

      double maxValue = (defAccu+1.0)/(total+2.0);
      int maxIndex = -1;
      for(int i=0; i<worthValue.length; i++){
        if(m_Debug){
          double denom = useWhole ? total : coverage[i];
          System.err.println(i+"(useAccuracy? "+!useWhole+"): "
                             + worthRt[i] +
                             "="+worthValue[i]+
                             "/"+denom);
        }
        if(worthRt[i] > maxValue){ // Prefer the shorter rule
          maxValue = worthRt[i];
          maxIndex = i;
        }
      }

      /* Prune the antecedents according to the accuracy parameters */
      for(int z=size-1;z>maxIndex;z--)
        m_Antds.removeElementAt(z);
    }
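
    /**
     * Illustrative sketch only (this hypothetical helper is not part of the
     * original code): the pruning worth used above when useWhole is false,
     * i.e. the smoothed accuracy rate (p+1)/(p+n+2) on the pruning data,
     * where p is the weight covered correctly and n the weight covered
     * incorrectly.
     */
    private double examplePruneWorth(double p, double n) {
      return (p + 1.0) / (p + n + 2.0); // equals 0.5 when p+n is 0
    }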

    /**
     * Prints this rule
     *
     * @param classAttr the class attribute in the data
     * @return a textual description of this rule
     */
    public String toString(Attribute classAttr) {
      StringBuffer text =  new StringBuffer();
      if(m_Antds.size() > 0){
        for(int j=0; j< (m_Antds.size()-1); j++)
          text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and ");
        text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")");
      }
      text.append(" => " + classAttr.name() +
                  "=" + classAttr.value((int)m_Consequent));

      return text.toString();
    }

    /**
     * Returns the revision string.
     *
     * @return          the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 6041 $");
    }
  }

  /**
   * Returns default capabilities of the classifier.
   *
   * @return      the capabilities of this classifier
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    // instances
    result.setMinimumNumberInstances(m_Folds);

    return result;
  }
1454   
1455  /**
1456   * Builds Ripper in the order of class frequencies.  For each class
1457   * it's built in two stages: building and optimization
1458   *
1459   * @param instances the training data
1460   * @throws Exception if classifier can't be built successfully
1461   */
1462  public void buildClassifier(Instances instances) throws Exception {
1463   
1464    // can classifier handle the data?
1465    getCapabilities().testWithFail(instances);
1466
1467    // remove instances with missing class
1468    instances = new Instances(instances);
1469    instances.deleteWithMissingClass();
1470   
1471    m_Random = instances.getRandomNumberGenerator(m_Seed); 
1472    m_Total = RuleStats.numAllConditions(instances);
1473    if(m_Debug)
1474      System.err.println("Number of all possible conditions = "+m_Total);
1475   
1476    Instances data = null;
1477    m_Filter = new ClassOrder();
1478    ((ClassOrder)m_Filter).setSeed(m_Random.nextInt()); 
1479    ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
1480    m_Filter.setInputFormat(instances);
1481    data = Filter.useFilter(instances, m_Filter);
1482       
1483    if(data == null)
1484      throw new Exception(" Unable to randomize the class orders.");
1485   
1486    m_Class = data.classAttribute();   
1487    m_Ruleset = new FastVector();
1488    m_RulesetStats = new FastVector();
1489    m_Distributions = new FastVector();
1490
1491    // Sort by classes frequency
1492    double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
1493    if(m_Debug){
1494      System.err.println("Sorted classes:");
1495      for(int x=0; x < m_Class.numValues(); x++)
1496        System.err.println(x+": "+m_Class.value(x) + " has " +
1497                           orderedClasses[x] + " instances.");
1498    }
1499    // Iterate from less prevalent class to more frequent one
1500  oneClass:     
1501    for(int y=0; y < data.numClasses()-1; y++){ // For each class             
1502           
1503      double classIndex = (double)y;
1504      if(m_Debug){
1505        int ci = (int)classIndex;
1506        System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): "
1507                           + orderedClasses[y] + "instances\n"+
1508                           "=====================================\n");
1509      }
1510               
1511      if(Utils.eq(orderedClasses[y],0.0)) // No data for this class
1512        continue oneClass;             
1513           
1514      // The expected FP/err is the proportion of the class
1515      double all = 0;
1516      for(int i=y; i<orderedClasses.length; i++)
1517        all += orderedClasses[i];
1518      double expFPRate = orderedClasses[y] / all;           
1519               
1520      double classYWeights = 0, totalWeights = 0;
1521      for(int j=0; j < data.numInstances(); j++){
1522          Instance datum = data.instance(j);
1523          totalWeights += datum.weight();
1524          if((int)datum.classValue() == y){
1525              classYWeights += datum.weight();
1526          }               
1527      } 
1528         
1529      // DL of default rule, no theory DL, only data DL
      double defDL;
      if(classYWeights > 0)
        defDL = RuleStats.dataDL(expFPRate,
                                 0.0,
                                 totalWeights,
                                 0.0,
                                 classYWeights);
      else
        continue oneClass; // Subsumed by previous rules

      if(Double.isNaN(defDL) || Double.isInfinite(defDL))
        throw new Exception("Should never happen: "+
                            "defDL NaN or infinite!");
      if(m_Debug)
        System.err.println("The default DL = "+defDL);

      data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
    }

    // Set the default rule
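    // The default rule has no antecedents, so it fires for every
    // instance not covered by the learned rules; its consequent is the
    // last class index, i.e. the most frequent class under FREQ_ASCEND.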
    RipperRule defRule = new RipperRule();
    defRule.setConsequent((double)(data.numClasses()-1));
    m_Ruleset.addElement(defRule);

    RuleStats defRuleStat = new RuleStats();
    defRuleStat.setData(data);
    defRuleStat.setNumAllConds(m_Total);
    defRuleStat.addAndUpdate(defRule);
    m_RulesetStats.addElement(defRuleStat);

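    // Cache one normalized class distribution per rule.
    // distributionsByOriginalIndex() maps each distribution back to the
    // original class indices, undoing the ClassOrder permutation.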
    for(int z=0; z < m_RulesetStats.size(); z++){
      RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
      for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
        double[] classDist = oneClass.getDistributions(xyz);
        if(classDist != null){ // Check before normalizing: normalize(null) would throw
          Utils.normalize(classDist);
          m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
        }
      }
    }

    // free up memory
    for (int i = 0; i < m_RulesetStats.size(); i++)
      ((RuleStats) m_RulesetStats.elementAt(i)).cleanUp();
  }

  /**
   * Classify the test instance with the rule learner and provide
   * the class distributions
   *
   * @param datum the instance to be classified
   * @return the distribution
   */
  public double[] distributionForInstance(Instance datum){
    try{
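      // Rules are checked in learned order; the first rule that covers
      // the instance supplies its cached distribution.  The default rule
      // has no antecedents, so some rule always fires.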
      for(int i=0; i < m_Ruleset.size(); i++){
        RipperRule rule = (RipperRule)m_Ruleset.elementAt(i);
        if(rule.covers(datum))
          return (double[])m_Distributions.elementAt(i);
      }
    }catch(Exception e){
      System.err.println(e.getMessage());
      e.printStackTrace();
    }

    System.err.println("Should never happen!");
    return new double[datum.classAttribute().numValues()];
  }

  /**
   * Build a ruleset for the given class according to the given data
   *
   * @param expFPRate the expected FP/(FP+FN) used in DL calculation
   * @param data the given data
   * @param classIndex the given class index
   * @param defDL the default DL in the data
   * @return the data that is not covered by the learned ruleset
   * @throws Exception if the ruleset cannot be built properly
   */
  protected Instances rulesetForOneClass(double expFPRate,
                                         Instances data,
                                         double classIndex,
                                         double defDL)
    throws Exception {

    Instances newData = data, growData, pruneData;
    boolean stop = false;
    FastVector ruleset = new FastVector();

    double dl = defDL, minDL = defDL;
    RuleStats rstats = null;
    double[] rst;

    // Check whether data have positive examples
    boolean defHasPositive = true; // No longer used
    boolean hasPositive = defHasPositive;

    /********************** Building stage ***********************/
    if(m_Debug)
      System.err.println("\n*** Building stage ***");

    while((!stop) && hasPositive){ // Generate new rules until
      // stopping criteria met
      RipperRule oneRule;
      if(m_UsePruning){
        /* Split data into Grow and Prune */

        // We should have stratified the data, but ripper seems
        // to have a bug that makes it fail to do so.  In order
        // to simulate it more precisely, we do the same thing.
        //newData.randomize(m_Random);
        newData = RuleStats.stratify(newData, m_Folds, m_Random);
        Instances[] part = RuleStats.partition(newData, m_Folds);
        growData = part[0];
        pruneData = part[1];
        //growData=newData.trainCV(m_Folds, m_Folds-1);
        //pruneData=newData.testCV(m_Folds, m_Folds-1);
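        // RuleStats.partition() gives the first (m_Folds-1)/m_Folds of the
        // stratified data to growData and the final 1/m_Folds to pruneData,
        // i.e. a 2/3 grow / 1/3 prune split with the default of 3 folds.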

        oneRule = new RipperRule();
        oneRule.setConsequent(classIndex);  // Must set first

        if(m_Debug)
          System.err.println("\nGrowing a rule ...");
        oneRule.grow(growData);             // Build the rule
        if(m_Debug)
          System.err.println("One rule found before pruning:"+
                             oneRule.toString(m_Class));

        if(m_Debug)
          System.err.println("\nPruning the rule ...");
        oneRule.prune(pruneData, false);    // Prune the rule
        if(m_Debug)
          System.err.println("One rule found after pruning:"+
                             oneRule.toString(m_Class));
      }
      else{
        oneRule = new RipperRule();
        oneRule.setConsequent(classIndex);  // Must set first
        if(m_Debug)
          System.err.println("\nNo pruning: growing a rule ...");
        oneRule.grow(newData);              // Build the rule
        if(m_Debug)
          System.err.println("No pruning: one rule found:\n"+
                             oneRule.toString(m_Class));
      }

      // Compute the DL of this ruleset
      if(rstats == null){ // First rule
        rstats = new RuleStats();
        rstats.setNumAllConds(m_Total);
        rstats.setData(newData);
      }

      rstats.addAndUpdate(oneRule);
      int last = rstats.getRuleset().size()-1; // Index of last rule
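      // relativeDL() returns the new rule's theory DL plus the change in
      // the (minimized) data DL that results from keeping it, so dl tracks
      // the total description length of the growing ruleset.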
      dl += rstats.relativeDL(last, expFPRate, m_CheckErr);

      if(Double.isNaN(dl) || Double.isInfinite(dl))
        throw new Exception("Should never happen: dl in "+
                            "building stage NaN or infinite!");
      if(m_Debug)
        System.err.println("Before optimization("+last+
                           "): the dl = "+dl+" | best: "+minDL);

      if(dl < minDL)
        minDL = dl;  // The best dl so far

      rst = rstats.getSimpleStats(last);
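      // getSimpleStats() layout: [0] covered weight, [1] uncovered weight,
      // [2] true positives, [3] true negatives, [4] false positives,
      // [5] false negatives.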
      if(m_Debug)
        System.err.println("The rule covers: "+rst[0]+
                           " | pos = " + rst[2] +
                           " | neg = " + rst[4]+
                           "\nThe rule doesn't cover: "+rst[1]+
                           " | pos = " + rst[5]);

      stop = checkStop(rst, minDL, dl);

      if(!stop){
        ruleset.addElement(oneRule);          // Accepted
        newData = rstats.getFiltered(last)[1];// Data not covered
        hasPositive = Utils.gr(rst[5], 0.0);  // Positives remaining?
        if(m_Debug)
          System.err.println("One rule added: has positive? "
                             +hasPositive);
      }
      else{
        if(m_Debug)
          System.err.println("Quit rule");
        rstats.removeLast(); // Remove last rule so it can be re-used
      }
    }// while !stop

    /******************** Optimization stage *******************/
    RuleStats finalRulesetStat = null;
    if(m_UsePruning){
      for(int z=0; z < m_Optimizations; z++){
        if(m_Debug)
          System.err.println("\n*** Optimization: run #"
                             +z+" ***");

        newData = data;
        finalRulesetStat = new RuleStats();
        finalRulesetStat.setData(newData);
        finalRulesetStat.setNumAllConds(m_Total);
        int position = 0;
        stop = false;
        boolean isResidual = false;
        hasPositive = defHasPositive;
        dl = minDL = defDL;

      oneRule:
        while(!stop && hasPositive){

          isResidual = (position >= ruleset.size()); // Cover residual positive examples
          // Re-do shuffling and stratification
          //newData.randomize(m_Random);
          newData = RuleStats.stratify(newData, m_Folds, m_Random);
          Instances[] part = RuleStats.partition(newData, m_Folds);
          growData = part[0];
          pruneData = part[1];
          //growData=newData.trainCV(m_Folds, m_Folds-1);
          //pruneData=newData.testCV(m_Folds, m_Folds-1);
          RipperRule finalRule;

          if(m_Debug)
            System.err.println("\nRule #"+position +
                               " | isResidual? " + isResidual +
                               " | data size: " + newData.sumOfWeights());

          if(isResidual){
            RipperRule newRule = new RipperRule();
            newRule.setConsequent(classIndex);
            if(m_Debug)
              System.err.println("\nGrowing and pruning"+
                                 " a new rule ...");
            newRule.grow(growData);
            newRule.prune(pruneData, false);
            finalRule = newRule;
            if(m_Debug)
              System.err.println("\nNew rule found: "+
                                 newRule.toString(m_Class));
          }
          else{
            RipperRule oldRule = (RipperRule)ruleset.elementAt(position);
            boolean covers = false;
            // Test coverage of the next old rule
            for(int i=0; i<newData.numInstances(); i++)
              if(oldRule.covers(newData.instance(i))){
                covers = true;
                break;
              }

            if(!covers){ // Null coverage, no variants can be generated
              finalRulesetStat.addAndUpdate(oldRule);
              position++;
              continue oneRule;
            }

            // 2 variants
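            // "Replace" is regrown from an empty rule; "Revision" greedily
            // extends a copy of the old rule.  Both are pruned with
            // useWhole == true, i.e. the pruning metric is computed over
            // all of the pruning data, not just the instances covered.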
            if(m_Debug)
              System.err.println("\nGrowing and pruning"+
                                 " Replace ...");
            RipperRule replace = new RipperRule();
            replace.setConsequent(classIndex);
            replace.grow(growData);

            // Remove the pruning data covered by the following
            // rules, then simply compute the error rate of the
            // current rule to prune it.  According to Ripper,
            // it's equivalent to computing the error of the
            // whole ruleset -- is it true?
            pruneData = RuleStats.rmCoveredBySuccessives(pruneData, ruleset, position);
            replace.prune(pruneData, true);

            if(m_Debug)
              System.err.println("\nGrowing and pruning"+
                                 " Revision ...");
            RipperRule revision = (RipperRule)oldRule.copy();

            // For revision, restrict the grow data to the instances
            // covered by the old rule before growing it further
            Instances newGrowData = new Instances(growData, 0);
            for(int b=0; b<growData.numInstances(); b++){
              Instance inst = growData.instance(b);
              if(revision.covers(inst))
                newGrowData.add(inst);
            }
            revision.grow(newGrowData);
            revision.prune(pruneData, true);

            double[][] prevRuleStats = new double[position][6];
            for(int c=0; c < position; c++)
              prevRuleStats[c] = finalRulesetStat.getSimpleStats(c);

            // Now compare the relative DL of the variants
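            // countData(position, newData, prevRuleStats) reuses the cached
            // statistics of rules 0..position-1 instead of re-filtering the
            // data through them from scratch.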
            FastVector tempRules = (FastVector)ruleset.copyElements();
            tempRules.setElementAt(replace, position);

            RuleStats repStat = new RuleStats(data, tempRules);
            repStat.setNumAllConds(m_Total);
            repStat.countData(position, newData, prevRuleStats);
            //repStat.countData();
            rst = repStat.getSimpleStats(position);
            if(m_Debug)
              System.err.println("Replace rule covers: "+rst[0]+
                                 " | pos = " + rst[2] +
                                 " | neg = " + rst[4]+
                                 "\nThe rule doesn't cover: "+rst[1]+
                                 " | pos = " + rst[5]);

            double repDL = repStat.relativeDL(position, expFPRate,
                                              m_CheckErr);
            if(m_Debug)
              System.err.println("\nReplace: "+
                                 replace.toString(m_Class)
                                 +" |dl = "+repDL);

            if(Double.isNaN(repDL) || Double.isInfinite(repDL))
              throw new Exception("Should never happen: repDL "+
                                  "in optimization stage NaN or "+
                                  "infinite!");

            tempRules.setElementAt(revision, position);
            RuleStats revStat = new RuleStats(data, tempRules);
            revStat.setNumAllConds(m_Total);
            revStat.countData(position, newData, prevRuleStats);
            //revStat.countData();
            double revDL = revStat.relativeDL(position, expFPRate,
                                              m_CheckErr);

            if(m_Debug)
              System.err.println("Revision: "
                                 + revision.toString(m_Class)
                                 +" |dl = "+revDL);

            if(Double.isNaN(revDL) || Double.isInfinite(revDL))
              throw new Exception("Should never happen: revDL "+
                                  "in optimization stage NaN or "+
                                  "infinite!");

            rstats = new RuleStats(data, ruleset);
            rstats.setNumAllConds(m_Total);
            rstats.countData(position, newData, prevRuleStats);
            //rstats.countData();
            double oldDL = rstats.relativeDL(position, expFPRate,
                                             m_CheckErr);

            if(Double.isNaN(oldDL) || Double.isInfinite(oldDL))
              throw new Exception("Should never happen: oldDL "+
                                  "in optimization stage NaN or "+
                                  "infinite!");
            if(m_Debug)
              System.err.println("Old rule: "+
                                 oldRule.toString(m_Class)
                                 +" |dl = "+oldDL);

            if(m_Debug)
              System.err.println("\nrepDL: "+repDL+
                                 "\nrevDL: "+revDL+
                                 "\noldDL: "+oldDL);

            if((oldDL <= revDL) && (oldDL <= repDL))
              finalRule = oldRule; // Old rule is best
            else if(revDL <= repDL)
              finalRule = revision; // Revision is best
            else
              finalRule = replace; // Replace is best
          }

          finalRulesetStat.addAndUpdate(finalRule);
          rst = finalRulesetStat.getSimpleStats(position);

          if(isResidual){

            dl += finalRulesetStat.relativeDL(position,
                                              expFPRate,
                                              m_CheckErr);
            if(m_Debug)
              System.err.println("After optimization: the dl = "
                                 +dl+" | best: "+minDL);

            if(dl < minDL)
              minDL = dl;  // The best dl so far

            stop = checkStop(rst, minDL, dl);
            if(!stop)
              ruleset.addElement(finalRule); // Accepted
            else{
              finalRulesetStat.removeLast(); // Remove last rule so it can be re-used
              position--;
            }
          }
          else
            ruleset.setElementAt(finalRule, position); // Accepted

          if(m_Debug){
            System.err.println("The rule covers: "+rst[0]+
                               " | pos = " + rst[2] +
                               " | neg = " + rst[4]+
                               "\nThe rule doesn't cover: "+rst[1]+
                               " | pos = " + rst[5]);
            System.err.println("\nRuleset so far: ");
            for(int x=0; x<ruleset.size(); x++)
              System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
            System.err.println();
          }

          // Data not covered
          if(finalRulesetStat.getRulesetSize() > 0) // If any rules
            newData = finalRulesetStat.getFiltered(position)[1];
          hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining?
          position++;
        } // while !stop && hasPositive

        if(ruleset.size() > (position+1)){ // Rules not yet gone through in this run
          for(int k=position+1; k<ruleset.size(); k++)
            finalRulesetStat.addAndUpdate((Rule)ruleset.elementAt(k));
        }
        if(m_Debug)
          System.err.println("\nDeleting rules to decrease"+
                             " DL of the whole ruleset ...");
        finalRulesetStat.reduceDL(expFPRate, m_CheckErr);
        if(m_Debug){
          int del = ruleset.size() -
            finalRulesetStat.getRulesetSize();
          System.err.println(del+" rules are deleted"+
                             " after the DL reduction procedure");
        }
        ruleset = finalRulesetStat.getRuleset();
        rstats = finalRulesetStat;

      } // For each run of optimization
    } // if pruning is used

    // Concatenate the ruleset for this class to the whole ruleset
    if(m_Debug){
      System.err.println("\nFinal ruleset: ");
      for(int x=0; x<ruleset.size(); x++)
        System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
      System.err.println();
    }

    m_Ruleset.appendElements(ruleset);
    m_RulesetStats.addElement(rstats);

    if(ruleset.size() > 0) // If any rules for this class
      return rstats.getFiltered(ruleset.size()-1)[1]; // Data not covered
    else
      return data;
  }

  /**
   * Check whether the stopping criterion is met
   *
   * @param rst the statistics of the ruleset
   * @param minDL the minimum description length so far
   * @param dl the current description length of the ruleset
   * @return true if the stopping criterion is met, false otherwise
   */
  private boolean checkStop(double[] rst, double minDL, double dl){
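    // Stop if the DL overshoots the best DL seen so far by more than
    // MAX_DL_SURPLUS bits, if the rule covers no positives, or (when
    // m_CheckErr is set) if the rule's error rate on the instances it
    // covers reaches 50%.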
    if(dl > minDL+MAX_DL_SURPLUS){
      if(m_Debug)
        System.err.println("DL too large: "+dl+" | "+minDL);
      return true;
    }
    else if(!Utils.gr(rst[2], 0.0)){ // No positives covered
      if(m_Debug)
        System.err.println("Too few positives.");
      return true;
    }
    else if((rst[4]/rst[0]) >= 0.5){ // Error rate on covered instances
      if(m_CheckErr){
        if(m_Debug)
          System.err.println("Error too large: "+
                             rst[4] + "/" + rst[0]);
        return true;
      }
      else
        return false;
    }
    else{ // Don't stop
      if(m_Debug)
        System.err.println("Continue.");
      return false;
    }
  }

  /**
   * Prints all the rules of the rule learner.
   *
   * @return a textual description of the classifier
   */
  public String toString() {
    if (m_Ruleset == null)
      return "JRIP: No model built yet.";

    StringBuffer sb = new StringBuffer("JRIP rules:\n"+
                                       "===========\n\n");
    for(int j=0; j<m_RulesetStats.size(); j++){
      RuleStats rs = (RuleStats)m_RulesetStats.elementAt(j);
      FastVector rules = rs.getRuleset();
      for(int k=0; k<rules.size(); k++){
        double[] simStats = rs.getSimpleStats(k);
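        // Each rule is printed as "rule (covered weight/covered negatives)",
        // i.e. getSimpleStats() entries [0] and [4].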
        sb.append(((RipperRule)rules.elementAt(k)).toString(m_Class)
                  + " ("+simStats[0]+"/"+simStats[4]+")\n");
      }
    }
    if(m_Debug){
      System.err.println("Inside m_Ruleset");
      for(int i=0; i<m_Ruleset.size(); i++)
        System.err.println(((RipperRule)m_Ruleset.elementAt(i)).toString(m_Class));
    }
    sb.append("\nNumber of Rules : "
              + m_Ruleset.size() + "\n");
    return sb.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 6041 $");
  }

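  /*
   * A minimal sketch of command-line usage (assuming a dataset file
   * iris.arff on the local path):
   *
   *   java weka.classifiers.rules.JRip -t iris.arff
   *
   * -t names the training file; the standard Weka evaluation options apply.
   */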
  /**
   * Main method.
   *
   * @param args the options for the classifier
   */
  public static void main(String[] args) {
    runClassifier(new JRip(), args);
  }
}