/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* JRip.java
* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
*/
package weka.classifiers.rules;
import weka.classifiers.Classifier;
import weka.classifiers.AbstractClassifier;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Copyable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.ClassOrder;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
/**
* This class implements a propositional rule learner, Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was proposed by William W. Cohen as an optimized version of IREP.
*
* The algorithm is briefly described as follows:
*
* Initialize RS = {}, and for each class from the less prevalent one to the more frequent one, DO:
*
* 1. Building stage:
 * Repeat 1.1 and 1.2 until the description length (DL) of the ruleset and examples is 64 bits greater than the smallest DL met so far, or there are no positive examples, or the error rate >= 50%.
*
* 1.1. Grow phase:
* Grow one rule by greedily adding antecedents (or conditions) to the rule until the rule is perfect (i.e. 100% accurate). The procedure tries every possible value of each attribute and selects the condition with highest information gain: p(log(p/t)-log(P/T)).
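 * For example (an illustrative calculation, assuming base-2 logs as in the DL computation): a condition covering t = 10 instances of which p = 8 are positive, drawn from growing data with P = 50 positives among T = 100 instances, gains 8*(log(8/10)-log(50/100)), roughly 5.4 bits.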
*
* 1.2. Prune phase:
 * Incrementally prune each rule and allow the pruning of any final sequences of the antecedents; the pruning metric is (p-n)/(p+n) -- but it's actually 2p/(p+n) - 1, so in this implementation we simply use p/(p+n) (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).
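 * For instance (an illustrative calculation), a pruning candidate covering p = 3 positives and n = 1 negative scores (3+1)/(3+1+2) = 2/3 under this smoothed metric.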
*
* 2. Optimization stage:
 * After generating the initial ruleset {Ri}, generate and prune two variants of each rule Ri from randomized data using procedures 1.1 and 1.2. One variant is generated from an empty rule, while the other is generated by greedily adding antecedents to the original rule. Moreover, the pruning metric used here is (TP+TN)/(P+N). Then the smallest possible DL for each variant and the original rule is computed. The variant with the minimal DL is selected as the final representative of Ri in the ruleset. After all the rules in {Ri} have been examined, if there are still residual positives, more rules are generated based on the residual positives using the building stage again.
 * 3. Delete the rules from the ruleset that would increase the DL of the whole ruleset if they were in it, and add the resultant ruleset to RS.
* ENDDO
*
 * Note that there seem to be 2 bugs in the original ripper program that would affect the ruleset size and accuracy slightly. This implementation avoids these bugs and thus is a little bit different from Cohen's original implementation. Even after fixing the bugs, since the order of classes with the same frequency is not defined in ripper, there still seems to be some trivial difference between this implementation and the original ripper, especially for the audiology data in the UCI repository, where there are lots of classes with few instances.
*
 * For details please see:
*
* William W. Cohen: Fast Effective Rule Induction. In: Twelfth International Conference on Machine Learning, 115-123, 1995.
*
 * PS. We have compared this implementation with the original ripper implementation with respect to accuracy, ruleset size and running time on both the artificial data "ab+bcd+defg" and UCI datasets. In all these aspects it seems to be quite comparable to the original ripper implementation. However, we didn't consider memory consumption optimization in this implementation.
*
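 * A typical programmatic use looks as follows (a minimal sketch; the ARFF file name is hypothetical and the class attribute is assumed to be the last one):
 * <pre>
 * Instances data = new Instances(new java.io.FileReader("mydata.arff"));
 * data.setClassIndex(data.numAttributes() - 1);
 * JRip rip = new JRip();      // defaults: 3 folds, 2 optimization runs
 * rip.buildClassifier(data);  // learn the ruleset
 * System.out.println(rip);    // print the learned rules
 * </pre>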
*
 * <pre>
 * &#64;inproceedings{Cohen1995,
 *    author = {William W. Cohen},
 *    booktitle = {Twelfth International Conference on Machine Learning},
 *    pages = {115-123},
 *    publisher = {Morgan Kaufmann},
 *    title = {Fast Effective Rule Induction},
 *    year = {1995}
 * }
 * </pre>
 *
 * Valid options are:
 *
 * <pre> -F <number of folds>
 *  Set number of folds for REP
 *  One fold is used as pruning set.
 *  (default 3)</pre>
 *
 * <pre> -N <min. weights>
 *  Set the minimal weights of instances
 *  within a split.
 *  (default 2.0)</pre>
 *
 * <pre> -O <number of runs>
 *  Set the number of runs of
 *  optimizations. (Default: 2)</pre>
 *
 * <pre> -D
 *  Set whether turn on the
 *  debug mode (Default: false)</pre>
 *
 * <pre> -S <seed>
 *  The seed of randomization
 *  (Default: 1)</pre>
 *
 * <pre> -E
 *  Whether NOT check the error rate>=0.5
 *  in stopping criteria (default: check)</pre>
 * <pre> -P
 *  Whether NOT use pruning
 *  (default: use pruning)</pre>
 *
 * @author Xin Xu (xx5@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 6041 $
 */
public class JRip
  extends AbstractClassifier
  implements AdditionalMeasureProducer,
             WeightedInstancesHandler,
             TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = -6589312996832147161L;

  /** The limit of description length surplus in ruleset generation */
  private static double MAX_DL_SURPLUS = 64.0;

  /** The class attribute of the data */
  private Attribute m_Class;

  /** The ruleset */
  private FastVector m_Ruleset;

  /** The predicted class distribution */
  private FastVector m_Distributions;

  /** Runs of optimizations */
  private int m_Optimizations = 2;

  /** Random object used in this class */
  private Random m_Random = null;

  /** The number of all the possible conditions in a rule */
  private double m_Total = 0;

  /** The seed to perform randomization */
  private long m_Seed = 1;

  /** The number of folds to split data into Grow and Prune for IREP */
  private int m_Folds = 3;

  /** The minimal number of instance weights within a split */
  private double m_MinNo = 2.0;

  /** Whether in debug mode */
  private boolean m_Debug = false;

  /** Whether to check the error rate >= 0.5 in the stopping criteria */
  private boolean m_CheckErr = true;

  /** Whether to use pruning, i.e. whether the data is clean or not */
  private boolean m_UsePruning = true;

  /** The filter used to randomize the class order */
  private Filter m_Filter = null;

  /** The RuleStats for the ruleset of each class value */
  private FastVector m_RulesetStats;

  /**
   * Returns a string describing the classifier.
   *
   * @return a description suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "This class implements a propositional rule learner, Repeated Incremental "
      + "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
      + "W. Cohen as an optimized version of IREP. \n\n"
      + "The algorithm is briefly described as follows: \n\n"
      + "Initialize RS = {}, and for each class from the less prevalent one to "
      + "the more frequent one, DO: \n\n"
      + "1. Building stage:\n"
      + "Repeat 1.1 and 1.2 until the description length (DL) of the ruleset "
      + "and examples is 64 bits greater than the smallest DL met so far, "
      + "or there are no positive examples, or the error rate >= 50%.\n\n"
      + "1.1. Grow phase:\n"
      + "Grow one rule by greedily adding antecedents (or conditions) to "
      + "the rule until the rule is perfect (i.e. 100% accurate). The "
      + "procedure tries every possible value of each attribute and selects "
      + "the condition with highest information gain: p(log(p/t)-log(P/T)).\n\n"
      + "1.2. Prune phase:\n"
      + "Incrementally prune each rule and allow the pruning of any "
      + "final sequences of the antecedents; the pruning metric is "
      + "(p-n)/(p+n) -- but it's actually 2p/(p+n) - 1, so in this "
      + "implementation we simply use p/(p+n) "
      + "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n"
      + "2. Optimization stage:\n"
      + "After generating the initial ruleset {Ri}, generate and prune two "
      + "variants of each rule Ri from randomized data using procedures 1.1 "
      + "and 1.2. One variant is generated from an empty rule, while the "
      + "other is generated by greedily adding antecedents to the original "
      + "rule. Moreover, the pruning metric used here is (TP+TN)/(P+N). "
      + "Then the smallest possible DL for each variant and the original rule "
      + "is computed. The variant with the minimal DL is selected as the final "
      + "representative of Ri in the ruleset. "
      + "After all the rules in {Ri} have been examined, if there are still "
      + "residual positives, more rules are generated based on the residual "
      + "positives using the building stage again.\n"
      + "3. Delete the rules from the ruleset that would increase the DL of "
      + "the whole ruleset if they were in it, and add the resultant ruleset "
      + "to RS.\n"
      + "ENDDO\n\n"
      + "Note that there seem to be 2 bugs in the original ripper program that "
      + "would affect the ruleset size and accuracy slightly. This "
      + "implementation avoids these bugs and thus is a little bit different "
      + "from Cohen's original implementation. Even after fixing the bugs, "
      + "since the order of classes with the same frequency is not defined in "
      + "ripper, there still seems to be some trivial difference between this "
      + "implementation and the original ripper, especially for the audiology "
      + "data in the UCI repository, where there are lots of classes with few "
      + "instances.\n\n"
      + "For details please see:\n\n"
      + getTechnicalInformation().toString() + "\n\n"
      + "PS. We have compared this implementation with the original ripper "
      + "implementation with respect to accuracy, ruleset size and running "
      + "time on both the artificial data \"ab+bcd+defg\" and UCI datasets. "
      + "In all these aspects it seems to be quite comparable to the original "
      + "ripper implementation. However, we didn't consider memory "
      + "consumption optimization in this implementation.\n\n";
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;
    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "William W. Cohen");
    result.setValue(Field.TITLE, "Fast Effective Rule Induction");
    result.setValue(Field.BOOKTITLE, "Twelfth International Conference on Machine Learning");
    result.setValue(Field.YEAR, "1995");
    result.setValue(Field.PAGES, "115-123");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
    return result;
  }

  /**
   * Returns an enumeration describing the available options.
   * Valid options are:
*
* -F number
* The number of folds for reduced error pruning. One fold is
* used as the pruning set. (Default: 3)
*
* -N number
* The minimal weights of instances within a split.
* (Default: 2)
*
* -O number
* Set the number of runs of optimizations. (Default: 2)
*
* -D
 * Whether to turn on debug mode.
*
* -S number
 * The seed of randomization used in Ripper. (Default: 1)
*
* -E
 * Whether NOT to check the error rate >= 0.5 in the stopping criteria.
 * (default: check)
*
* -P
 * Whether NOT to use pruning. (default: use pruning)
*
* @return an enumeration of all the available options
*/
public Enumeration listOptions() {
Vector newVector = new Vector(3);
newVector.addElement(new Option("\tSet number of folds for REP\n" +
"\tOne fold is used as pruning set.\n" +
"\t(default 3)","F", 1, "-F -F <number of folds>
* Set number of folds for REP
* One fold is used as pruning set.
* (default 3)
*
* -N <min. weights>
* Set the minimal weights of instances
* within a split.
* (default 2.0)
*
* -O <number of runs>
* Set the number of runs of
* optimizations. (Default: 2)
*
* -D
* Set whether turn on the
* debug mode (Default: false)
*
* -S <seed>
* The seed of randomization
* (Default: 1)
*
* -E
* Whether NOT check the error rate>=0.5
* in stopping criteria (default: check)
*
* -P
* Whether NOT use pruning
* (default: use pruning)
*
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
String numFoldsString = Utils.getOption('F', options);
if (numFoldsString.length() != 0)
m_Folds = Integer.parseInt(numFoldsString);
else
m_Folds = 3;
String minNoString = Utils.getOption('N', options);
if (minNoString.length() != 0)
m_MinNo = Double.parseDouble(minNoString);
else
m_MinNo = 2.0;
String seedString = Utils.getOption('S', options);
if (seedString.length() != 0)
m_Seed = Long.parseLong(seedString);
else
m_Seed = 1;
String runString = Utils.getOption('O', options);
if (runString.length() != 0)
m_Optimizations = Integer.parseInt(runString);
else
m_Optimizations = 2;
m_Debug = Utils.getFlag('D', options);
m_CheckErr = !Utils.getFlag('E', options);
m_UsePruning = !Utils.getFlag('P', options);
}
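// Usage sketch: the default configuration above corresponds to the single
// option string below (weka.core.Utils.splitOptions tokenizes it):
//   JRip rip = new JRip();
//   rip.setOptions(Utils.splitOptions("-F 3 -N 2.0 -O 2 -S 1"));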
/**
* Gets the current settings of the Classifier.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
String [] options = new String [11];
int current = 0;
options[current++] = "-F"; options[current++] = "" + m_Folds;
options[current++] = "-N"; options[current++] = "" + m_MinNo;
options[current++] = "-O"; options[current++] = "" + m_Optimizations;
options[current++] = "-S"; options[current++] = "" + m_Seed;
if(m_Debug)
options[current++] = "-D";
if(!m_CheckErr)
options[current++] = "-E";
if(!m_UsePruning)
options[current++] = "-P";
while(current < options.length)
options[current++] = "";
return options;
}
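// With the default settings, getOptions() returns
//   {"-F", "3", "-N", "2.0", "-O", "2", "-S", "1", "", "", ""}
// (unused slots are padded with empty strings).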
/**
* Returns an enumeration of the additional measure names
* @return an enumeration of the measure names
*/
public Enumeration enumerateMeasures() {
Vector newVector = new Vector(1);
newVector.addElement("measureNumRules");
return newVector.elements();
}
/**
* Returns the value of the named measure
* @param additionalMeasureName the name of the measure to query for its value
* @return the value of the named measure
* @throws IllegalArgumentException if the named measure is not supported
*/
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0)
return m_Ruleset.size();
else
throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
}
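// Usage sketch (rip denotes a hypothetical, already built JRip instance):
//   double numRules = rip.getMeasure("measureNumRules");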
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String foldsTipText() {
return "Determines the amount of data used for pruning. One fold is used for "
+ "pruning, the rest for growing the rules.";
}
/**
* Sets the number of folds to use
*
* @param fold the number of folds
*/
public void setFolds(int fold) {
m_Folds = fold;
}
/**
* Gets the number of folds
*
* @return the number of folds
*/
public int getFolds(){
return m_Folds;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String minNoTipText() {
return "The minimum total weight of the instances in a rule.";
}
/**
* Sets the minimum total weight of the instances in a rule
*
* @param m the minimum total weight of the instances in a rule
*/
public void setMinNo(double m) {
m_MinNo = m;
}
/**
* Gets the minimum total weight of the instances in a rule
*
* @return the minimum total weight of the instances in a rule
*/
public double getMinNo(){
return m_MinNo;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String seedTipText() {
return "The seed used for randomizing the data.";
}
/**
* Sets the seed value to use in randomizing the data
*
* @param s the new seed value
*/
public void setSeed(long s) {
m_Seed = s;
}
/**
* Gets the current seed value to use in randomizing the data
*
* @return the seed value
*/
public long getSeed(){
return m_Seed;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String optimizationsTipText() {
return "The number of optimization runs.";
}
/**
* Sets the number of optimization runs
*
* @param run the number of optimization runs
*/
public void setOptimizations(int run) {
m_Optimizations = run;
}
/**
 * Gets the number of optimization runs
*
* @return the number of optimization runs
*/
public int getOptimizations() {
return m_Optimizations;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String debugTipText() {
return "Whether debug information is output to the console.";
}
/**
* Sets whether debug information is output to the console
*
* @param d whether debug information is output to the console
*/
public void setDebug(boolean d) {
m_Debug = d;
}
/**
* Gets whether debug information is output to the console
*
* @return whether debug information is output to the console
*/
public boolean getDebug(){
return m_Debug;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String checkErrorRateTipText() {
return "Whether check for error rate >= 1/2 is included" +
" in stopping criterion.";
}
/**
 * Sets whether the error rate >= 1/2 check is included in the stopping criterion
 *
 * @param d whether to include the error rate check in the stopping criterion
*/
public void setCheckErrorRate(boolean d) {
m_CheckErr = d;
}
/**
 * Gets whether the error rate >= 1/2 check is included in the stopping criterion
 *
 * @return true if the error rate check is included in the stopping criterion
*/
public boolean getCheckErrorRate(){
return m_CheckErr;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String usePruningTipText() {
return "Whether pruning is performed.";
}
/**
* Sets whether pruning is performed
*
* @param d Whether pruning is performed
*/
public void setUsePruning(boolean d) {
m_UsePruning = d;
}
/**
* Gets whether pruning is performed
*
* @return true if pruning is performed
*/
public boolean getUsePruning(){
return m_UsePruning;
}
/**
* Get the ruleset generated by Ripper
*
* @return the ruleset
*/
public FastVector getRuleset(){ return m_Ruleset; }
/**
* Get the statistics of the ruleset in the given position
*
 * @param pos the position of the stats, assumed to be valid
* @return the statistics of the ruleset in the given position
*/
public RuleStats getRuleStats(int pos) {
return (RuleStats)m_RulesetStats.elementAt(pos);
}
/**
 * The single antecedent in the rule, which is composed of an attribute and
 * the corresponding value. There are two derived classes, NumericAntd and
 * NominalAntd, for numeric and nominal attributes respectively.
*/
private abstract class Antd
implements WeightedInstancesHandler, Copyable, Serializable, RevisionHandler {
/** for serialization */
private static final long serialVersionUID = -8929754772994154334L;
/** The attribute of the antecedent */
protected Attribute att;
/** The attribute value of the antecedent.
For a numeric attribute, the value is either 0 (1st bag) or 1 (2nd bag) */
protected double value;
/** The maximum infoGain achieved by this antecedent test
* in the growing data */
protected double maxInfoGain;
/** The accuracy rate of this antecedent test on the growing data */
protected double accuRate;
/** The coverage of this antecedent in the growing data */
protected double cover;
/** The amount of data accurately covered by this antecedent in the growing data */
protected double accu;
/**
* Constructor
*/
public Antd(Attribute a){
att=a;
value=Double.NaN;
maxInfoGain = 0;
accuRate = Double.NaN;
cover = Double.NaN;
accu = Double.NaN;
}
/* The abstract members for inheritance */
public abstract Instances[] splitData(Instances data, double defAcRt,
double cla);
public abstract boolean covers(Instance inst);
public abstract String toString();
/**
* Implements Copyable
*
* @return a copy of this object
*/
public abstract Object copy();
/* Get functions of this antecedent */
public Attribute getAttr(){ return att; }
public double getAttrValue(){ return value; }
public double getMaxInfoGain(){ return maxInfoGain; }
public double getAccuRate(){ return accuRate; }
public double getAccu(){ return accu; }
public double getCover(){ return cover; }
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 6041 $");
}
}
/**
* The antecedent with numeric attribute
*/
private class NumericAntd extends Antd {
/** for serialization */
static final long serialVersionUID = 5699457269983735442L;
/** The split point for this numeric antecedent */
private double splitPoint;
/**
* Constructor
*/
public NumericAntd(Attribute a){
super(a);
splitPoint = Double.NaN;
}
/**
* Get split point of this numeric antecedent
*
* @return the split point of this numeric antecedent
*/
public double getSplitPoint(){
return splitPoint;
}
/**
* Implements Copyable
*
* @return a copy of this object
*/
public Object copy(){
NumericAntd na = new NumericAntd(getAttr());
na.value = this.value;
na.splitPoint = this.splitPoint;
return na;
}
/**
* Implements the splitData function.
 * This procedure splits the data into two bags according
 * to the information gain of the numeric attribute value.
* The maximum infoGain is also calculated.
*
* @param insts the data to be split
* @param defAcRt the default accuracy rate for data
* @param cl the class label to be predicted
* @return the array of data after split
*/
public Instances[] splitData(Instances insts, double defAcRt,
double cl){
Instances data = insts;
int total=data.numInstances();// Total number of instances without
// missing value for att
int split=1; // Current split position
int prev=0; // Previous split position
int finalSplit=split; // Final split position
maxInfoGain = 0;
value = 0;
double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0;
data.sort(att);
// Find the last instance without missing value
for(int x=0; x