/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * JRip.java * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand */ package weka.classifiers.rules; import weka.classifiers.Classifier; import weka.classifiers.AbstractClassifier; import weka.core.AdditionalMeasureProducer; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.Copyable; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.RevisionHandler; import weka.core.RevisionUtils; import weka.core.TechnicalInformation; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.core.WeightedInstancesHandler; import weka.core.Capabilities.Capability; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.filters.Filter; import weka.filters.supervised.attribute.ClassOrder; import java.io.Serializable; import java.util.Enumeration; import java.util.Random; import java.util.Vector; /** * This class implements a propositional rule learner, Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was proposed by William W. Cohen as an optimized version of IREP.
*
* The algorithm is briefly described as follows:
*
* Initialize RS = {}, and for each class from the least prevalent one to the most frequent one, DO:
*
* 1. Building stage:
* Repeat 1.1 and 1.2 until the description length (DL) of the ruleset and examples is 64 bits greater than the smallest DL met so far, or there are no positive examples, or the error rate >= 50%.
*
* 1.1. Grow phase:
* Grow one rule by greedily adding antecedents (or conditions) to the rule until the rule is perfect (i.e. 100% accurate). The procedure tries every possible value of each attribute and selects the condition with highest information gain: p(log(p/t)-log(P/T)).
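*
* As an illustration, a minimal sketch of this gain criterion (assuming
* base-2 logs as in FOIL's gain; foilGain, p, t, P and T are hypothetical
* names, not members of this class):
* <pre>
* // p/t: positive/total counts covered by the candidate condition,
* // P/T: those covered by the current rule; larger gain is better
* static double foilGain(double p, double t, double P, double T) {
*   return p * (Utils.log2(p / t) - Utils.log2(P / T));
* }
* </pre>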
*
* 1.2. Prune phase:
* Incrementally prune each rule and allow the pruning of any final sequences of the antecedents. The pruning metric is (p-n)/(p+n) -- but this is equivalent to 2p/(p+n) - 1, so in this implementation we simply use p/(p+n) (actually (p+1)/(p+n+2), so that if p+n is 0, the metric is 0.5).
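*
* A minimal sketch of this smoothed pruning metric (pruneMetric is a
* hypothetical helper, not a member of this class):
* <pre>
* // p = positives, n = negatives covered by the candidate pruned rule;
* // the +1/+2 (Laplace-style) smoothing yields 0.5 when p + n == 0
* static double pruneMetric(double p, double n) {
*   return (p + 1.0) / (p + n + 2.0);
* }
* </pre>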
*
* 2. Optimization stage:
* After generating the initial ruleset {Ri}, generate and prune two variants of each rule Ri from randomized data using procedures 1.1 and 1.2. One variant is generated from an empty rule, while the other is generated by greedily adding antecedents to the original rule. Moreover, the pruning metric used here is (TP+TN)/(P+N). Then the smallest possible DL for each variant and for the original rule is computed, and the variant with the minimal DL is selected as the final representative of Ri in the ruleset. After all the rules in {Ri} have been examined, if there are still residual positives, more rules are generated from the residual positives using the building stage again.
* 3. Delete any rule from the ruleset that would increase the DL of the whole ruleset if it were included, and add the resultant ruleset to RS. (The whole loop is sketched in pseudocode below.)
* ENDDO
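*
* The loop above, condensed into pseudocode (a summary of the description,
* not the literal control flow of this class):
* <pre>
* RS = {}
* for each class c, from least to most prevalent:
*   repeat                                     // 1. building stage
*     grow one rule on the growing data, then prune it on the pruning data
*   until DL &gt; minDL + 64 bits, no positives remain, or error rate &gt;= 50%
*   repeat for the chosen number of optimization runs:   // 2. optimization
*     replace each rule Ri by the DL-minimal of {original, both variants}
*   delete rules that would increase the total DL        // 3.
*   add the resulting ruleset to RS
* </pre>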
*
* Note that there appear to be two bugs in the original ripper program that slightly affect the ruleset size and accuracy. This implementation avoids these bugs and thus differs a little from Cohen's original implementation. Even after fixing the bugs, since the order of classes with the same frequency is not defined in ripper, there still seem to be some trivial differences between this implementation and the original ripper, especially for the audiology data in the UCI repository, which contains many classes with few instances.
*
* For details, please see:
*
* William W. Cohen: Fast Effective Rule Induction. In: Twelfth International Conference on Machine Learning, 115-123, 1995.
*
* PS. We have compared this implementation with the original ripper implementation in terms of accuracy, ruleset size and running time on both the artificial data "ab+bcd+defg" and UCI datasets. In all these respects it appears quite comparable to the original ripper implementation. However, we did not consider optimizing memory consumption in this implementation.
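*
* A minimal usage sketch of this classifier through the standard Weka API
* (the ARFF file name is a placeholder):
* <pre>
* import weka.core.Instances;
* import weka.core.converters.ConverterUtils.DataSource;
*
* Instances data = DataSource.read("my-data.arff");
* data.setClassIndex(data.numAttributes() - 1); // class = last attribute
* JRip rip = new JRip();
* rip.setOptions(weka.core.Utils.splitOptions("-F 3 -N 2.0 -O 2 -S 1"));
* rip.buildClassifier(data);
* System.out.println(rip);                      // print the learned ruleset
* </pre>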
*
 *
 * BibTeX:
 * <pre>
 * @inproceedings{Cohen1995,
 *    author = {William W. Cohen},
 *    booktitle = {Twelfth International Conference on Machine Learning},
 *    pages = {115-123},
 *    publisher = {Morgan Kaufmann},
 *    title = {Fast Effective Rule Induction},
 *    year = {1995}
 * }
 * </pre>
 *
 * Valid options are:
 *
 * <pre> -F &lt;number of folds&gt;
 *  Set number of folds for REP
 *  One fold is used as pruning set.
 *  (default 3)</pre>
 *
 * <pre> -N &lt;min. weights&gt;
 *  Set the minimal weights of instances
 *  within a split.
 *  (default 2.0)</pre>
 *
 * <pre> -O &lt;number of runs&gt;
 *  Set the number of runs of
 *  optimizations. (Default: 2)</pre>
 *
 * <pre> -D
 *  Set whether to turn on
 *  debug mode (Default: false)</pre>
 *
 * <pre> -S &lt;seed&gt;
 *  The seed of randomization
 *  (Default: 1)</pre>
 *
 * <pre> -E
 *  Whether NOT to check the error rate &gt;= 0.5
 *  in stopping criteria (default: check)</pre>
 *
 * <pre> -P
 *  Whether NOT to use pruning
 *  (default: use pruning)</pre>
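 *
 * For example, a typical command-line invocation (a sketch using Weka's
 * standard -t training-file option; "data.arff" is a placeholder):
 * <pre>
 * java weka.classifiers.rules.JRip -t data.arff -F 3 -N 2.0 -O 2 -S 1
 * </pre>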
* * * @author Xin Xu (xx5@cs.waikato.ac.nz) * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version $Revision: 6041 $ */ public class JRip extends AbstractClassifier implements AdditionalMeasureProducer, WeightedInstancesHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -6589312996832147161L; /** The limit of description length surplus in ruleset generation */ private static double MAX_DL_SURPLUS = 64.0; /** The class attribute of the data*/ private Attribute m_Class; /** The ruleset */ private FastVector m_Ruleset; /** The predicted class distribution */ private FastVector m_Distributions; /** Runs of optimizations */ private int m_Optimizations = 2; /** Random object used in this class */ private Random m_Random = null; /** # of all the possible conditions in a rule */ private double m_Total = 0; /** The seed to perform randomization */ private long m_Seed = 1; /** The number of folds to split data into Grow and Prune for IREP */ private int m_Folds = 3; /** The minimal number of instance weights within a split*/ private double m_MinNo = 2.0; /** Whether in a debug mode */ private boolean m_Debug = false; /** Whether check the error rate >= 0.5 in stopping criteria */ private boolean m_CheckErr = true; /** Whether use pruning, i.e. the data is clean or not */ private boolean m_UsePruning = true; /** The filter used to randomize the class order */ private Filter m_Filter = null; /** The RuleStats for the ruleset of each class value */ private FastVector m_RulesetStats; /** * Returns a string describing classifier * @return a description suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "This class implements a propositional rule learner, Repeated Incremental " + "Pruning to Produce Error Reduction (RIPPER), which was proposed by William " + "W. Cohen as an optimized version of IREP. \n\n" + "The algorithm is briefly described as follows: \n\n" + "Initialize RS = {}, and for each class from the less prevalent one to " + "the more frequent one, DO: \n\n" + "1. Building stage:\nRepeat 1.1 and 1.2 until the descrition length (DL) " + "of the ruleset and examples is 64 bits greater than the smallest DL " + "met so far, or there are no positive examples, or the error rate >= 50%. " + "\n\n" + "1.1. Grow phase:\n" + "Grow one rule by greedily adding antecedents (or conditions) to " + "the rule until the rule is perfect (i.e. 100% accurate). The " + "procedure tries every possible value of each attribute and selects " + "the condition with highest information gain: p(log(p/t)-log(P/T))." + "\n\n" + "1.2. Prune phase:\n" + "Incrementally prune each rule and allow the pruning of any " + "final sequences of the antecedents;" + "The pruning metric is (p-n)/(p+n) -- but it's actually " + "2p/(p+n) -1, so in this implementation we simply use p/(p+n) " + "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n" + "2. Optimization stage:\n after generating the initial ruleset {Ri}, " + "generate and prune two variants of each rule Ri from randomized data " + "using procedure 1.1 and 1.2. But one variant is generated from an " + "empty rule while the other is generated by greedily adding antecedents " + "to the original rule. Moreover, the pruning metric used here is " + "(TP+TN)/(P+N)." + "Then the smallest possible DL for each variant and the original rule " + "is computed. The variant with the minimal DL is selected as the final " + "representative of Ri in the ruleset." 
+ "After all the rules in {Ri} have been examined and if there are still " + "residual positives, more rules are generated based on the residual " + "positives using Building Stage again. \n" + "3. Delete the rules from the ruleset that would increase the DL of the " + "whole ruleset if it were in it. and add resultant ruleset to RS. \n" + "ENDDO\n\n" + "Note that there seem to be 2 bugs in the original ripper program that would " + "affect the ruleset size and accuracy slightly. This implementation avoids " + "these bugs and thus is a little bit different from Cohen's original " + "implementation. Even after fixing the bugs, since the order of classes with " + "the same frequency is not defined in ripper, there still seems to be " + "some trivial difference between this implementation and the original ripper, " + "especially for audiology data in UCI repository, where there are lots of " + "classes of few instances.\n\n" + "Details please see:\n\n" + getTechnicalInformation().toString() + "\n\n" + "PS. We have compared this implementation with the original ripper " + "implementation in aspects of accuracy, ruleset size and running time " + "on both artificial data \"ab+bcd+defg\" and UCI datasets. In all these " + "aspects it seems to be quite comparable to the original ripper " + "implementation. However, we didn't consider memory consumption " + "optimization in this implementation.\n\n"; } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "William W. Cohen"); result.setValue(Field.TITLE, "Fast Effective Rule Induction"); result.setValue(Field.BOOKTITLE, "Twelfth International Conference on Machine Learning"); result.setValue(Field.YEAR, "1995"); result.setValue(Field.PAGES, "115-123"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann"); return result; } /** * Returns an enumeration describing the available options * Valid options are:

 * <pre> -F number
 *  The number of folds for reduced error pruning. One fold is
 *  used as the pruning set. (Default: 3)</pre>
 *
 * <pre> -N number
 *  The minimal weights of instances within a split.
 *  (Default: 2)</pre>
 *
 * <pre> -O number
 *  Set the number of runs of optimizations. (Default: 2)</pre>
 *
 * <pre> -D
 *  Whether to turn on debug mode</pre>
 *
 * <pre> -S number
 *  The seed of randomization used in Ripper. (Default: 1)</pre>
 *
 * <pre> -E
 *  Whether NOT to check the error rate &gt;= 0.5 in stopping
 *  criteria. (default: check)</pre>
 *
 * <pre> -P
 *  Whether NOT to use pruning. (default: use pruning)</pre>
 *
* * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(3); newVector.addElement(new Option("\tSet number of folds for REP\n" + "\tOne fold is used as pruning set.\n" + "\t(default 3)","F", 1, "-F ")); newVector.addElement(new Option("\tSet the minimal weights of instances\n" + "\twithin a split.\n" + "\t(default 2.0)","N", 1, "-N ")); newVector.addElement(new Option("\tSet the number of runs of\n"+ "\toptimizations. (Default: 2)", "O", 1,"-O ")); newVector.addElement(new Option("\tSet whether turn on the\n"+ "\tdebug mode (Default: false)", "D", 0,"-D")); newVector.addElement(new Option("\tThe seed of randomization\n"+ "\t(Default: 1)", "S", 1,"-S ")); newVector.addElement(new Option("\tWhether NOT check the error rate>=0.5\n" +"\tin stopping criteria " +"\t(default: check)", "E", 0, "-E")); newVector.addElement(new Option("\tWhether NOT use pruning\n" +"\t(default: use pruning)", "P", 0, "-P")); return newVector.elements(); } /** * Parses a given list of options.

 * Valid options are:
 *
 * <pre> -F &lt;number of folds&gt;
 *  Set number of folds for REP
 *  One fold is used as pruning set.
 *  (default 3)</pre>
 *
 * <pre> -N &lt;min. weights&gt;
 *  Set the minimal weights of instances
 *  within a split.
 *  (default 2.0)</pre>
 *
 * <pre> -O &lt;number of runs&gt;
 *  Set the number of runs of
 *  optimizations. (Default: 2)</pre>
 *
 * <pre> -D
 *  Set whether to turn on
 *  debug mode (Default: false)</pre>
 *
 * <pre> -S &lt;seed&gt;
 *  The seed of randomization
 *  (Default: 1)</pre>
 *
 * <pre> -E
 *  Whether NOT to check the error rate &gt;= 0.5
 *  in stopping criteria (default: check)</pre>
 *
 * <pre> -P
 *  Whether NOT to use pruning
 *  (default: use pruning)</pre>
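 *
 * For instance, these options can be set programmatically (a sketch; note
 * that -E and -P are flags, so passing them turns the error-rate check and
 * pruning OFF, respectively):
 * <pre>
 * JRip rip = new JRip();
 * rip.setOptions(weka.core.Utils.splitOptions("-F 4 -S 42 -E -P"));
 * </pre>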
* * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String numFoldsString = Utils.getOption('F', options); if (numFoldsString.length() != 0) m_Folds = Integer.parseInt(numFoldsString); else m_Folds = 3; String minNoString = Utils.getOption('N', options); if (minNoString.length() != 0) m_MinNo = Double.parseDouble(minNoString); else m_MinNo = 2.0; String seedString = Utils.getOption('S', options); if (seedString.length() != 0) m_Seed = Long.parseLong(seedString); else m_Seed = 1; String runString = Utils.getOption('O', options); if (runString.length() != 0) m_Optimizations = Integer.parseInt(runString); else m_Optimizations = 2; m_Debug = Utils.getFlag('D', options); m_CheckErr = !Utils.getFlag('E', options); m_UsePruning = !Utils.getFlag('P', options); } /** * Gets the current settings of the Classifier. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [11]; int current = 0; options[current++] = "-F"; options[current++] = "" + m_Folds; options[current++] = "-N"; options[current++] = "" + m_MinNo; options[current++] = "-O"; options[current++] = "" + m_Optimizations; options[current++] = "-S"; options[current++] = "" + m_Seed; if(m_Debug) options[current++] = "-D"; if(!m_CheckErr) options[current++] = "-E"; if(!m_UsePruning) options[current++] = "-P"; while(current < options.length) options[current++] = ""; return options; } /** * Returns an enumeration of the additional measure names * @return an enumeration of the measure names */ public Enumeration enumerateMeasures() { Vector newVector = new Vector(1); newVector.addElement("measureNumRules"); return newVector.elements(); } /** * Returns the value of the named measure * @param additionalMeasureName the name of the measure to query for its value * @return the value of the named measure * @throws IllegalArgumentException if the named measure is not supported */ public double getMeasure(String additionalMeasureName) { if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0) return m_Ruleset.size(); else throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)"); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String foldsTipText() { return "Determines the amount of data used for pruning. 
One fold is used for " + "pruning, the rest for growing the rules."; } /** * Sets the number of folds to use * * @param fold the number of folds */ public void setFolds(int fold) { m_Folds = fold; } /** * Gets the number of folds * * @return the number of folds */ public int getFolds(){ return m_Folds; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String minNoTipText() { return "The minimum total weight of the instances in a rule."; } /** * Sets the minimum total weight of the instances in a rule * * @param m the minimum total weight of the instances in a rule */ public void setMinNo(double m) { m_MinNo = m; } /** * Gets the minimum total weight of the instances in a rule * * @return the minimum total weight of the instances in a rule */ public double getMinNo(){ return m_MinNo; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String seedTipText() { return "The seed used for randomizing the data."; } /** * Sets the seed value to use in randomizing the data * * @param s the new seed value */ public void setSeed(long s) { m_Seed = s; } /** * Gets the current seed value to use in randomizing the data * * @return the seed value */ public long getSeed(){ return m_Seed; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String optimizationsTipText() { return "The number of optimization runs."; } /** * Sets the number of optimization runs * * @param run the number of optimization runs */ public void setOptimizations(int run) { m_Optimizations = run; } /** * Gets the the number of optimization runs * * @return the number of optimization runs */ public int getOptimizations() { return m_Optimizations; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String debugTipText() { return "Whether debug information is output to the console."; } /** * Sets whether debug information is output to the console * * @param d whether debug information is output to the console */ public void setDebug(boolean d) { m_Debug = d; } /** * Gets whether debug information is output to the console * * @return whether debug information is output to the console */ public boolean getDebug(){ return m_Debug; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String checkErrorRateTipText() { return "Whether check for error rate >= 1/2 is included" + " in stopping criterion."; } /** * Sets whether to check for error rate is in stopping criterion * * @param d whether to check for error rate is in stopping criterion */ public void setCheckErrorRate(boolean d) { m_CheckErr = d; } /** * Gets whether to check for error rate is in stopping criterion * * @return true if checking for error rate is in stopping criterion */ public boolean getCheckErrorRate(){ return m_CheckErr; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String usePruningTipText() { return "Whether pruning is performed."; } /** * Sets whether pruning is performed * * @param d Whether pruning is performed */ public void setUsePruning(boolean d) { m_UsePruning 
= d; } /** * Gets whether pruning is performed * * @return true if pruning is performed */ public boolean getUsePruning(){ return m_UsePruning; } /** * Get the ruleset generated by Ripper * * @return the ruleset */ public FastVector getRuleset(){ return m_Ruleset; } /** * Get the statistics of the ruleset in the given position * * @param pos the position of the stats, assuming correct * @return the statistics of the ruleset in the given position */ public RuleStats getRuleStats(int pos) { return (RuleStats)m_RulesetStats.elementAt(pos); } /** * The single antecedent in the rule, which is composed of an attribute and * the corresponding value. There are two inherited classes, namely NumericAntd * and NominalAntd in which the attributes are numeric and nominal respectively. */ private abstract class Antd implements WeightedInstancesHandler, Copyable, Serializable, RevisionHandler { /** for serialization */ private static final long serialVersionUID = -8929754772994154334L; /** The attribute of the antecedent */ protected Attribute att; /** The attribute value of the antecedent. For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */ protected double value; /** The maximum infoGain achieved by this antecedent test * in the growing data */ protected double maxInfoGain; /** The accurate rate of this antecedent test on the growing data */ protected double accuRate; /** The coverage of this antecedent in the growing data */ protected double cover; /** The accurate data for this antecedent in the growing data */ protected double accu; /** * Constructor */ public Antd(Attribute a){ att=a; value=Double.NaN; maxInfoGain = 0; accuRate = Double.NaN; cover = Double.NaN; accu = Double.NaN; } /* The abstract members for inheritance */ public abstract Instances[] splitData(Instances data, double defAcRt, double cla); public abstract boolean covers(Instance inst); public abstract String toString(); /** * Implements Copyable * * @return a copy of this object */ public abstract Object copy(); /* Get functions of this antecedent */ public Attribute getAttr(){ return att; } public double getAttrValue(){ return value; } public double getMaxInfoGain(){ return maxInfoGain; } public double getAccuRate(){ return accuRate; } public double getAccu(){ return accu; } public double getCover(){ return cover; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 6041 $"); } } /** * The antecedent with numeric attribute */ private class NumericAntd extends Antd { /** for serialization */ static final long serialVersionUID = 5699457269983735442L; /** The split point for this numeric antecedent */ private double splitPoint; /** * Constructor */ public NumericAntd(Attribute a){ super(a); splitPoint = Double.NaN; } /** * Get split point of this numeric antecedent * * @return the split point of this numeric antecedent */ public double getSplitPoint(){ return splitPoint; } /** * Implements Copyable * * @return a copy of this object */ public Object copy(){ NumericAntd na = new NumericAntd(getAttr()); na.value = this.value; na.splitPoint = this.splitPoint; return na; } /** * Implements the splitData function. * This procedure is to split the data into two bags according * to the information gain of the numeric attribute value * The maximum infoGain is also calculated. 
* * @param insts the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ public Instances[] splitData(Instances insts, double defAcRt, double cl){ Instances data = insts; int total=data.numInstances();// Total number of instances without // missing value for att int split=1; // Current split position int prev=0; // Previous split position int finalSplit=split; // Final split position maxInfoGain = 0; value = 0; double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0; data.sort(att); // Find the las instance without missing value for(int x=0; x // Can't split within data.instance(prev).value(att))){ // same value for(int y=prev; y sndInfoGain){ isFirst = true; infoGain = fstInfoGain; accRate = fstAccuRate; accurate = fstAccu; coverage = fstCover; } else{ isFirst = false; infoGain = sndInfoGain; accRate = sndAccuRate; accurate = sndAccu; coverage = sndCover; } /* Check whether so far the max infoGain */ if(infoGain > maxInfoGain){ splitPoint = data.instance(prev).value(att); value = (isFirst) ? 0 : 1; accuRate = accRate; accu = accurate; cover = coverage; maxInfoGain = infoGain; finalSplit = (isFirst) ? split : prev; } for(int y=prev; y splitPoint) isCover=false; } else if(inst.value(att) < splitPoint) // Second bag isCover=false; } else isCover = false; return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { String symbol = ((int)value == 0) ? " <= " : " >= "; return (att.name() + symbol + Utils.doubleToString(splitPoint, 6)); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 6041 $"); } } /** * The antecedent with nominal attribute */ private class NominalAntd extends Antd { /** for serialization */ static final long serialVersionUID = -9102297038837585135L; /* The parameters of infoGain calculated for each attribute value * in the growing data */ private double[] accurate; private double[] coverage; /** * Constructor */ public NominalAntd(Attribute a){ super(a); int bag = att.numValues(); accurate = new double[bag]; coverage = new double[bag]; } /** * Implements Copyable * * @return a copy of this object */ public Object copy(){ Antd antec = new NominalAntd(getAttr()); antec.value = this.value; return antec; } /** * Implements the splitData function. * This procedure is to split the data into bags according * to the nominal attribute value * The infoGain for each bag is also calculated. 
* * @param data the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ public Instances[] splitData(Instances data, double defAcRt, double cl){ int bag = att.numValues(); Instances[] splitData = new Instances[bag]; for(int x=0; x maxInfoGain){ maxInfoGain = infoGain; cover = coverage[x]; accu = accurate[x]; accuRate = p/t; value = (double)x; } } return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is * covered by this antecedent */ public boolean covers(Instance inst){ boolean isCover=false; if(!inst.isMissing(att)){ if((int)inst.value(att) == (int)value) isCover=true; } return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { return (att.name() + " = " +att.value((int)value)); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 6041 $"); } } /** * This class implements a single rule that predicts specified class. * * A rule consists of antecedents "AND"ed together and the consequent * (class value) for the classification. * In this class, the Information Gain (p*[log(p/t) - log(P/T)]) is used to * select an antecedent and Reduced Error Prunning (REP) with the metric * of accuracy rate p/(p+n) or (TP+TN)/(P+N) is used to prune the rule. */ protected class RipperRule extends Rule { /** for serialization */ static final long serialVersionUID = -2410020717305262952L; /** The internal representation of the class label to be predicted */ private double m_Consequent = -1; /** The vector of antecedents of this rule*/ protected FastVector m_Antds = null; /** Constructor */ public RipperRule(){ m_Antds = new FastVector(); } /** * Sets the internal representation of the class label to be predicted * * @param cl the internal representation of the class label to be predicted */ public void setConsequent(double cl) { m_Consequent = cl; } /** * Gets the internal representation of the class label to be predicted * * @return the internal representation of the class label to be predicted */ public double getConsequent() { return m_Consequent; } /** * Get a shallow copy of this rule * * @return the copy */ public Object copy(){ RipperRule copy = new RipperRule(); copy.setConsequent(getConsequent()); copy.m_Antds = (FastVector)this.m_Antds.copyElements(); return copy; } /** * Whether the instance covered by this rule * * @param datum the instance in question * @return the boolean value indicating whether the instance * is covered by this rule */ public boolean covers(Instance datum){ boolean isCover=true; for(int i=0; i 0); } /** * the number of antecedents of the rule * * @return the size of this rule */ public double size(){ return (double)m_Antds.size(); } /** * Private function to compute default number of accurate instances * in the specified data for the consequent of the rule * * @param data the data in question * @return the default accuracy number */ private double computeDefAccu(Instances data){ double defAccu=0; for(int i=0; i 0) && Utils.sm(defAcRt, 1.0) ){ // We require that infoGain be positive /*if(numAntds == originalSize) maxInfoGain = 0.0; // At least one condition allowed else maxInfoGain = Utils.eq(defAcRt, 1.0) ? 
defAccu/(double)numAntds : 0.0; */ maxInfoGain = 0.0; /* Build a list of antecedents */ Antd oneAntd=null; Instances coverData = null; Enumeration enumAttr=growData.enumerateAttributes(); /* Build one condition based on all attributes not used yet*/ while (enumAttr.hasMoreElements()){ Attribute att= (Attribute)(enumAttr.nextElement()); if(m_Debug) System.err.println("\nOne condition: size = " + growData.sumOfWeights()); Antd antd =null; if(att.isNumeric()) antd = new NumericAntd(att); else antd = new NominalAntd(att); if(!used[att.index()]){ /* Compute the best information gain for each attribute, it's stored in the antecedent formed by this attribute. This procedure returns the data covered by the antecedent*/ Instances coveredData = computeInfoGain(growData, defAcRt, antd); if(coveredData != null){ double infoGain = antd.getMaxInfoGain(); if(m_Debug) System.err.println("Test of \'"+antd.toString()+ "\': infoGain = "+ infoGain + " | Accuracy = " + antd.getAccuRate()+ "="+antd.getAccu() +"/"+antd.getCover()+ " def. accuracy: "+defAcRt); if(infoGain > maxInfoGain){ oneAntd=antd; coverData = coveredData; maxInfoGain = infoGain; } } } } if(oneAntd == null) break; // Cannot find antds if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break;// Too low coverage //Numeric attributes can be used more than once if(!oneAntd.getAttr().isNumeric()){ used[oneAntd.getAttr().index()]=true; numUnused--; } m_Antds.addElement(oneAntd); growData = coverData;// Grow data size is shrinking defAcRt = oneAntd.getAccuRate(); } } /** * Compute the best information gain for the specified antecedent * * @param instances the data based on which the infoGain is computed * @param defAcRt the default accuracy rate of data * @param antd the specific antecedent * @param numConds the number of antecedents in the rule so far * @return the data covered by the antecedent */ private Instances computeInfoGain(Instances instances, double defAcRt, Antd antd){ Instances data = instances; /* Split the data into bags. The information gain of each bag is also calculated in this procedure */ Instances[] splitData = antd.splitData(data, defAcRt, m_Consequent); /* Get the bag of data to be used for next antecedents */ if(splitData != null) return splitData[(int)antd.getAttrValue()]; else return null; } /** * Prune all the possible final sequences of the rule using the * pruning data. The measure used to prune the rule is based on * flag given. 
* * @param pruneData the pruning data used to prune the rule * @param useWhole flag to indicate whether use the error rate of * the whole pruning data instead of the data covered */ public void prune(Instances pruneData, boolean useWhole){ Instances data = pruneData; double total = data.sumOfWeights(); if(!Utils.gr(total, 0.0)) return; /* The default accurate # and rate on pruning data */ double defAccu=computeDefAccu(data); if(m_Debug) System.err.println("Pruning with " + defAccu + " positive data out of " + total + " instances"); int size=m_Antds.size(); if(size == 0) return; // Default rule before pruning double[] worthRt = new double[size]; double[] coverage = new double[size]; double[] worthValue = new double[size]; for(int w=0; w maxValue){ // Prefer to the maxValue = worthRt[i]; // shorter rule maxIndex = i; } } /* Prune the antecedents according to the accuracy parameters */ for(int z=size-1;z>maxIndex;z--) m_Antds.removeElementAt(z); } /** * Prints this rule * * @param classAttr the class attribute in the data * @return a textual description of this rule */ public String toString(Attribute classAttr) { StringBuffer text = new StringBuffer(); if(m_Antds.size() > 0){ for(int j=0; j< (m_Antds.size()-1); j++) text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and "); text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")"); } text.append(" => " + classAttr.name() + "=" + classAttr.value((int)m_Consequent)); return text.toString(); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 6041 $"); } } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); // instances result.setMinimumNumberInstances(m_Folds); return result; } /** * Builds Ripper in the order of class frequencies. For each class * it's built in two stages: building and optimization * * @param instances the training data * @throws Exception if classifier can't be built successfully */ public void buildClassifier(Instances instances) throws Exception { // can classifier handle the data? 
getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_Random = instances.getRandomNumberGenerator(m_Seed); m_Total = RuleStats.numAllConditions(instances); if(m_Debug) System.err.println("Number of all possible conditions = "+m_Total); Instances data = null; m_Filter = new ClassOrder(); ((ClassOrder)m_Filter).setSeed(m_Random.nextInt()); ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND); m_Filter.setInputFormat(instances); data = Filter.useFilter(instances, m_Filter); if(data == null) throw new Exception(" Unable to randomize the class orders."); m_Class = data.classAttribute(); m_Ruleset = new FastVector(); m_RulesetStats = new FastVector(); m_Distributions = new FastVector(); // Sort by classes frequency double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts(); if(m_Debug){ System.err.println("Sorted classes:"); for(int x=0; x < m_Class.numValues(); x++) System.err.println(x+": "+m_Class.value(x) + " has " + orderedClasses[x] + " instances."); } // Iterate from less prevalent class to more frequent one oneClass: for(int y=0; y < data.numClasses()-1; y++){ // For each class double classIndex = (double)y; if(m_Debug){ int ci = (int)classIndex; System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): " + orderedClasses[y] + "instances\n"+ "=====================================\n"); } if(Utils.eq(orderedClasses[y],0.0)) // No data for this class continue oneClass; // The expected FP/err is the proportion of the class double all = 0; for(int i=y; i 0) defDL = RuleStats.dataDL(expFPRate, 0.0, totalWeights, 0.0, classYWeights); else continue oneClass; // Subsumed by previous rules if(Double.isNaN(defDL) || Double.isInfinite(defDL)) throw new Exception("Should never happen: "+ "defDL NaN or infinite!"); if(m_Debug) System.err.println("The default DL = "+defDL); data = rulesetForOneClass(expFPRate, data, classIndex, defDL); } // Set the default rule RipperRule defRule = new RipperRule(); defRule.setConsequent((double)(data.numClasses()-1)); m_Ruleset.addElement(defRule); RuleStats defRuleStat = new RuleStats(); defRuleStat.setData(data); defRuleStat.setNumAllConds(m_Total); defRuleStat.addAndUpdate(defRule); m_RulesetStats.addElement(defRuleStat); for(int z=0; z < m_RulesetStats.size(); z++){ RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z); for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){ double[] classDist = oneClass.getDistributions(xyz); Utils.normalize(classDist); if(classDist != null) m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist)); } } // free up memory for (int i = 0; i < m_RulesetStats.size(); i++) ((RuleStats) m_RulesetStats.elementAt(i)).cleanUp(); } /** * Classify the test instance with the rule learner and provide * the class distributions * * @param datum the instance to be classified * @return the distribution */ public double[] distributionForInstance(Instance datum){ try{ for(int i=0; i < m_Ruleset.size(); i++){ RipperRule rule = (RipperRule)m_Ruleset.elementAt(i); if(rule.covers(datum)) return (double[])m_Distributions.elementAt(i); } }catch(Exception e){ System.err.println(e.getMessage()); e.printStackTrace(); } System.err.println("Should never happen!"); return new double[datum.classAttribute().numValues()]; } /** Build a ruleset for the given class according to the given data * * @param expFPRate the expected FP/(FP+FN) used in DL calculation * @param data the given data * @param classIndex 
the given class index * @param defDL the default DL in the data * @throws Exception if the ruleset can be built properly */ protected Instances rulesetForOneClass(double expFPRate, Instances data, double classIndex, double defDL) throws Exception { Instances newData = data, growData, pruneData; boolean stop = false; FastVector ruleset = new FastVector(); double dl = defDL, minDL = defDL; RuleStats rstats = null; double[] rst; // Check whether data have positive examples boolean defHasPositive = true; // No longer used boolean hasPositive = defHasPositive; /********************** Building stage ***********************/ if(m_Debug) System.err.println("\n*** Building stage ***"); while((!stop) && hasPositive){ // Generate new rules until // stopping criteria met RipperRule oneRule; if(m_UsePruning){ /* Split data into Grow and Prune*/ // We should have stratified the data, but ripper seems // to have a bug that makes it not to do so. In order // to simulate it more precisely, we do the same thing. //newData.randomize(m_Random); newData = RuleStats.stratify(newData, m_Folds, m_Random); Instances[] part = RuleStats.partition(newData, m_Folds); growData=part[0]; pruneData=part[1]; //growData=newData.trainCV(m_Folds, m_Folds-1); //pruneData=newData.testCV(m_Folds, m_Folds-1); oneRule = new RipperRule(); oneRule.setConsequent(classIndex); // Must set first if(m_Debug) System.err.println("\nGrowing a rule ..."); oneRule.grow(growData); // Build the rule if(m_Debug) System.err.println("One rule found before pruning:"+ oneRule.toString(m_Class)); if(m_Debug) System.err.println("\nPruning the rule ..."); oneRule.prune(pruneData, false); // Prune the rule if(m_Debug) System.err.println("One rule found after pruning:"+ oneRule.toString(m_Class)); } else{ oneRule = new RipperRule(); oneRule.setConsequent(classIndex); // Must set first if(m_Debug) System.err.println("\nNo pruning: growing a rule ..."); oneRule.grow(newData); // Build the rule if(m_Debug) System.err.println("No pruning: one rule found:\n"+ oneRule.toString(m_Class)); } // Compute the DL of this ruleset if(rstats == null){ // First rule rstats = new RuleStats(); rstats.setNumAllConds(m_Total); rstats.setData(newData); } rstats.addAndUpdate(oneRule); int last = rstats.getRuleset().size()-1; // Index of last rule dl += rstats.relativeDL(last, expFPRate, m_CheckErr); if(Double.isNaN(dl) || Double.isInfinite(dl)) throw new Exception("Should never happen: dl in "+ "building stage NaN or infinite!"); if(m_Debug) System.err.println("Before optimization("+last+ "): the dl = "+dl+" | best: "+minDL); if(dl < minDL) minDL = dl; // The best dl so far rst = rstats.getSimpleStats(last); if(m_Debug) System.err.println("The rule covers: "+rst[0]+ " | pos = " + rst[2] + " | neg = " + rst[4]+ "\nThe rule doesn't cover: "+rst[1]+ " | pos = " + rst[5]); stop = checkStop(rst, minDL, dl); if(!stop){ ruleset.addElement(oneRule); // Accepted newData = rstats.getFiltered(last)[1];// Data not covered hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining? if(m_Debug) System.err.println("One rule added: has positive? 
" +hasPositive); } else{ if(m_Debug) System.err.println("Quit rule"); rstats.removeLast(); // Remove last to be re-used } }// while !stop /******************** Optimization stage *******************/ RuleStats finalRulesetStat = null; if(m_UsePruning){ for(int z=0; z < m_Optimizations; z++){ if(m_Debug) System.err.println("\n*** Optimization: run #" +z+" ***"); newData = data; finalRulesetStat = new RuleStats(); finalRulesetStat.setData(newData); finalRulesetStat.setNumAllConds(m_Total); int position=0; stop = false; boolean isResidual = false; hasPositive = defHasPositive; dl = minDL = defDL; oneRule: while(!stop && hasPositive){ isResidual = (position>=ruleset.size()); // Cover residual positive examples // Re-do shuffling and stratification //newData.randomize(m_Random); newData = RuleStats.stratify(newData, m_Folds, m_Random); Instances[] part = RuleStats.partition(newData, m_Folds); growData=part[0]; pruneData=part[1]; //growData=newData.trainCV(m_Folds, m_Folds-1); //pruneData=newData.testCV(m_Folds, m_Folds-1); RipperRule finalRule; if(m_Debug) System.err.println("\nRule #"+position + "| isResidual?" + isResidual+ "| data size: "+newData.sumOfWeights()); if(isResidual){ RipperRule newRule = new RipperRule(); newRule.setConsequent(classIndex); if(m_Debug) System.err.println("\nGrowing and pruning"+ " a new rule ..."); newRule.grow(growData); newRule.prune(pruneData, false); finalRule = newRule; if(m_Debug) System.err.println("\nNew rule found: "+ newRule.toString(m_Class)); } else{ RipperRule oldRule = (RipperRule)ruleset.elementAt(position); boolean covers = false; // Test coverage of the next old rule for(int i=0; i 0)// If any rules newData = finalRulesetStat.getFiltered(position)[1]; hasPositive = Utils.gr(rst[5], 0.0); //Positives remaining? position++; } // while !stop && hasPositive if(ruleset.size() > (position+1)){ // Hasn't gone through yet for(int k=position+1; k 0)// If any rules for this class return rstats.getFiltered(ruleset.size()-1)[1]; // Data not else // covered return data; } /** * Check whether the stopping criterion meets * * @param rst the statistic of the ruleset * @param minDL the min description length so far * @param dl the current description length of the ruleset * @return true if stop criterion meets, false otherwise */ private boolean checkStop(double[] rst, double minDL, double dl){ if(dl > minDL+MAX_DL_SURPLUS){ if(m_Debug) System.err.println("DL too large: "+dl+" | "+minDL); return true; } else if(!Utils.gr(rst[2], 0.0)){// Covered positives if(m_Debug) System.err.println("Too few positives."); return true; } else if((rst[4]/rst[0]) >= 0.5){// Err rate if(m_CheckErr){ if(m_Debug) System.err.println("Error too large: "+ rst[4] + "/" + rst[0]); return true; } else return false; } else{// Not stops if(m_Debug) System.err.println("Continue."); return false; } } /** * Prints the all the rules of the rule learner. * * @return a textual description of the classifier */ public String toString() { if (m_Ruleset == null) return "JRIP: No model built yet."; StringBuffer sb = new StringBuffer("JRIP rules:\n"+ "===========\n\n"); for(int j=0; j