/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * RuleStats.java
 * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
 */

package weka.classifiers.rules;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;

/**
 * This class implements the statistics functions used in the
 * propositional rule learner, from simpler ones, such as counts of
 * true/false positives/negatives and filtering data based on the ruleset,
 * to more sophisticated ones such as MDL calculation and rule
 * variant generation for each rule in the ruleset.
 *
 * Obviously the statistics functions listed above need the specific
 * data and the specific ruleset, which are given when an object of
 * this class is instantiated.
*
* @author Xin Xu (xx5@cs.waikato.ac.nz)
* @version $Revision: 4608 $
*/
public class RuleStats
implements Serializable, RevisionHandler {
/** for serialization */
static final long serialVersionUID = -5708153367675298624L;
/** The data on which the stats calculation is based */
private Instances m_Data;
/** The specific ruleset in question */
private FastVector m_Ruleset;
/** The simple stats of each rule */
private FastVector m_SimpleStats;
/** The set of instances filtered by the ruleset */
private FastVector m_Filtered;
/** The total number of possible conditions that could
* appear in a rule */
private double m_Total;
/** The redundancy factor in theory description length */
private static double REDUNDANCY_FACTOR = 0.5;
/** The theory weight in the MDL calculation */
private double MDL_THEORY_WEIGHT = 1.0;
/** The class distributions predicted by each rule */
private FastVector m_Distributions;
/** Default constructor */
public RuleStats(){
m_Data = null;
m_Ruleset = null;
m_SimpleStats = null;
m_Filtered = null;
m_Distributions = null;
m_Total = -1;
}
/**
* Constructor that provides ruleset and data
*
* @param data the data
* @param rules the ruleset
*/
public RuleStats(Instances data, FastVector rules){
this();
m_Data = data;
m_Ruleset = rules;
}
/**
* Frees up memory after classifier has been built.
*/
public void cleanUp() {
m_Data = null;
m_Filtered = null;
}
/**
 * Set the number of all conditions that could appear
 * in a rule in this RuleStats object. If the given number
 * is smaller than 0 (typically -1), it is calculated
 * from the data.
*
* @param total the set number
*/
public void setNumAllConds(double total){
if(total < 0)
m_Total = numAllConditions(m_Data);
else
m_Total = total;
}
/**
* Set the data of the stats, overwriting the old one if any
*
* @param data the data to be set
*/
public void setData(Instances data){
m_Data = data;
}
/**
* Get the data of the stats
*
* @return the data
*/
public Instances getData(){
return m_Data;
}
/**
* Set the ruleset of the stats, overwriting the old one if any
*
* @param rules the set of rules to be set
*/
public void setRuleset(FastVector rules){
m_Ruleset = rules;
}
/**
* Get the ruleset of the stats
*
* @return the set of rules
*/
public FastVector getRuleset(){
return m_Ruleset;
}
/**
* Get the size of the ruleset in the stats
*
* @return the size of ruleset
*/
public int getRulesetSize(){
return m_Ruleset.size();
}
/**
* Get the simple stats of one rule, including 6 parameters:
 * 0: coverage; 1: uncoverage; 2: true positives; 3: true negatives;
 * 4: false positives; 5: false negatives
*
* @param index the index of the rule
* @return the stats
*/
public double[] getSimpleStats(int index){
if((m_SimpleStats != null) && (index < m_SimpleStats.size()))
return (double[])m_SimpleStats.elementAt(index);
return null;
}
/**
* Get the data after filtering the given rule
*
* @param index the index of the rule
* @return the data covered and uncovered by the rule
*/
public Instances[] getFiltered(int index){
if((m_Filtered != null) && (index < m_Filtered.size()))
return (Instances[])m_Filtered.elementAt(index);
return null;
}
/**
 * Get the class distribution predicted by the rule in
 * the given position
*
* @param index the position index of the rule
* @return the class distributions
*/
public double[] getDistributions(int index){
if((m_Distributions != null) && (index < m_Distributions.size()))
return (double[])m_Distributions.elementAt(index);
return null;
}
/**
 * Set the weight of the theory in the MDL calculation
*
* @param weight the weight to be set
*/
public void setMDLTheoryWeight(double weight){
MDL_THEORY_WEIGHT = weight;
}
/**
* Compute the number of all possible conditions that could
* appear in a rule of a given data. For nominal attributes,
* it's the number of values that could appear; for numeric
* attributes, it's the number of values * 2, i.e. <= and >=
* are counted as different possible conditions.
*
* @param data the given data
* @return number of all conditions of the data
*/
public static double numAllConditions(Instances data){
double total = 0;
Enumeration attEnum = data.enumerateAttributes();
while(attEnum.hasMoreElements()){
Attribute att= (Attribute)attEnum.nextElement();
if(att.isNominal())
total += (double)att.numValues();
else
total += 2.0 * (double)data.numDistinctValues(att);
}
return total;
}
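  // Illustrative example (hypothetical dataset, not part of the original
  // class): for data with one nominal attribute of 3 values and one numeric
  // attribute with 10 distinct values, numAllConditions() returns
  // 3 + 2 * 10 = 23 possible conditions.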
/**
* Filter the data according to the ruleset and compute the basic
 * stats: coverage/uncoverage, true/false positives/negatives of
* each rule
*/
public void countData(){
if((m_Filtered != null) ||
(m_Ruleset == null) ||
(m_Data == null))
return;
int size = m_Ruleset.size();
m_Filtered = new FastVector(size);
m_SimpleStats = new FastVector(size);
m_Distributions = new FastVector(size);
Instances data = new Instances(m_Data);
for(int i=0; i < size; i++){
double[] stats = new double[6]; // 6 statistics parameters
double[] classCounts = new double[m_Data.classAttribute().numValues()];
Instances[] filtered = computeSimpleStats(i, data, stats, classCounts);
m_Filtered.addElement(filtered);
m_SimpleStats.addElement(stats);
m_Distributions.addElement(classCounts);
data = filtered[1]; // Data not covered
}
}
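  // Usage sketch (illustrative only; "data" and "rules" are assumed to be the
  // training Instances and a FastVector of Rule objects built elsewhere):
  //
  //   RuleStats stats = new RuleStats(data, rules);
  //   stats.setNumAllConds(-1);              // derive m_Total from the data
  //   stats.countData();                     // filter data, compute the stats
  //   double[] s = stats.getSimpleStats(0);  // e.g. s[0] = coverage of rule 0
  //   double[] dist = stats.getDistributions(0);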
/**
 * The description length of the theory of the rule in the given
 * position, computed as:
 *     0.5 * [||k|| + S(t, k, k/t)]
 * where k is the number of antecedents of the rule; t is the total
 * number of possible antecedents that could appear in a rule; ||k|| is
 * the universal prior for k, log2*(k); and S(t, k, p) =
 * -k*log2(p) - (t-k)*log2(1-p) is the subset encoding length.
 *
 * Details see Quinlan: "MDL and categorical theories (Continued)", ML95
*
* @param index the index of the given rule (assuming correct)
* @return the theory DL, weighted if weight != 1.0
*/
public double theoryDL(int index){
double k = ((Rule)m_Ruleset.elementAt(index)).size();
if(k == 0)
return 0.0;
double tdl = Utils.log2(k);
if(k > 1) // Approximation
tdl += 2.0 * Utils.log2(tdl); // of log2 star
tdl += subsetDL(m_Total, k, k/m_Total);
//System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
}
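  // Worked example (approximate figures, assuming MDL_THEORY_WEIGHT = 1.0):
  // for a rule with k = 2 antecedents and m_Total = 23 possible conditions,
  // tdl = log2(2) + 2*log2(log2(2)) + subsetDL(23, 2, 2/23)
  //     = 1 + 0 + (-2*log2(2/23) - 21*log2(21/23)) ≈ 1 + 9.8 = 10.8,
  // so theoryDL ≈ 0.5 * 10.8 ≈ 5.4 bits.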
/**
* The description length of data given the parameters of the data
* based on the ruleset.
* Details see Quinlan: "MDL and categorical theories (Continued)",ML95
*
* @param expFPOverErr expected FP/(FP+FN)
* @param cover coverage
* @param uncover uncoverage
* @param fp False Positive
* @param fn False Negative
* @return the description length
*/
public static double dataDL(double expFPOverErr, double cover,
double uncover, double fp, double fn){
double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
double coverBits, uncoverBits; // What's the error?
double expErr; // Expected FP or FN
if(Utils.gr(cover, uncover)){
expErr = expFPOverErr*(fp+fn);
coverBits = subsetDL(cover, fp, expErr/cover);
uncoverBits = Utils.gr(uncover, 0.0) ?
subsetDL(uncover, fn, fn/uncover) : 0.0;
}
else{
expErr = (1.0-expFPOverErr)*(fp+fn);
coverBits = Utils.gr(cover, 0.0) ?
subsetDL(cover, fp, fp/cover) : 0.0;
uncoverBits = subsetDL(uncover, fn, expErr/uncover);
}
/*
System.err.println("!!!cover: " + cover + "|uncover" + uncover +
"|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
"|FPRate: "+expFPOverErr + "|expErr: "+expErr+
"|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
*/
return (totalBits + coverBits + uncoverBits);
}
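  // Worked example (approximate figures, hypothetical numbers): with
  // expFPOverErr = 0.5, cover = 40, uncover = 60, fp = 5 and fn = 10, the
  // else-branch applies (cover <= uncover), so expErr = 0.5 * 15 = 7.5,
  // coverBits = subsetDL(40, 5, 5/40) ≈ 21.7,
  // uncoverBits = subsetDL(60, 10, 7.5/60) ≈ 39.6 and
  // totalBits = log2(101) ≈ 6.7, giving a data DL of roughly 68 bits.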
/**
 * Calculate the potential to decrease the DL of the ruleset,
 * i.e. the possible DL that could be decreased by deleting
 * the rule whose index and simple statistics are given.
 * If there is no potential (i.e. the potential is smaller than 0
 * and the error rate is below 0.5), it returns NaN.
 *
 * The way this procedure works is copied from the original RIPPER
 * implementation and is quite bizarre: it does not update the
 * following rules' stats recursively when testing each rule, which
 * means it assumes that after deletion no data are covered by the
 * following rules (or it regards the deleted rule as the last rule).
 * Reasonable assumption?
*
* @param index the index of the rule in m_Ruleset to be deleted
* @param expFPOverErr expected FP/(FP+FN)
* @param rulesetStat the simple statistics of the ruleset, updated
* if the rule should be deleted
* @param ruleStat the simple statistics of the rule to be deleted
* @param checkErr whether check if error rate >= 0.5
* @return the potential DL that could be decreased
*/
public double potential(int index, double expFPOverErr,
double[] rulesetStat, double[] ruleStat,
boolean checkErr){
//System.out.println("!!!inside potential: ");
// Restore the stats if deleted
double pcov = rulesetStat[0] - ruleStat[0];
double puncov = rulesetStat[1] + ruleStat[0];
double pfp = rulesetStat[4] - ruleStat[4];
double pfn = rulesetStat[5] + ruleStat[2];
double dataDLWith = dataDL(expFPOverErr, rulesetStat[0],
rulesetStat[1], rulesetStat[4],
rulesetStat[5]);
double theoryDLWith = theoryDL(index);
double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);
double potential = dataDLWith + theoryDLWith - dataDLWithout;
double err = ruleStat[4] / ruleStat[0];
/*System.out.println("!!!"+dataDLWith +" | "+
theoryDLWith + " | "
+dataDLWithout+"|"+ruleStat[4] + " / " + ruleStat[0]);
*/
boolean overErr = Utils.grOrEq(err, 0.5);
if(!checkErr)
overErr = false;
if(Utils.grOrEq(potential, 0.0) || overErr){
// If deleted, update ruleset stats. Other stats do not matter
rulesetStat[0] = pcov;
rulesetStat[1] = puncov;
rulesetStat[4] = pfp;
rulesetStat[5] = pfn;
return potential;
}
else
return Double.NaN;
}
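  // Illustrative example (hypothetical numbers): if the ruleset stats are
  // {cover=100, uncover=50, tp=80, tn=40, fp=20, fn=10} and the rule's stats
  // are {cover=30, ..., tp=25, ..., fp=5, ...}, the stats after deletion are
  // pcov = 100-30 = 70, puncov = 50+30 = 80, pfp = 20-5 = 15 and
  // pfn = 10+25 = 35; the ruleset stats are actually updated to these values
  // only if the potential DL saving is non-negative (or, when checkErr is
  // true, the rule's error rate is at least 0.5).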
/**
 * The description length (DL) of the ruleset relative to the case
 * where the rule in the given position is deleted, obtained as:
 *     (minimal data DL if the rule exists) + (theory DL of the rule)
 *     - (minimal data DL if the rule is deleted)
 * Note that the minimal possible DL of the ruleset is calculated
 * (i.e. some other rules may also be deleted) instead of the DL of
 * the current ruleset.
 *
* @param index the given position of the rule in question
* (assuming correct)
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether check if error rate >= 0.5
* @return the relative DL
*/
public double relativeDL(int index, double expFPRate, boolean checkErr){
return (minDataDLIfExists(index, expFPRate, checkErr)
+ theoryDL(index) -
minDataDLIfDeleted(index, expFPRate, checkErr));
}
/**
 * Try to reduce the DL of the ruleset by testing the removal of the
 * rules one by one in reverse order, updating all the stats.
 *
 * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether check if error rate >= 0.5
*/
public void reduceDL(double expFPRate, boolean checkErr){
boolean needUpdate = false;
double[] rulesetStat = new double[6];
    for(int j = 0; j < m_SimpleStats.size(); j++){
      // ... accumulate the simple stats of the whole ruleset
    }
    // ... then test the removal of each rule in reverse order via potential(),
    // updating the stats and recomputing them where necessary
  }

  /**
   * Count data from the position index in the ruleset
   * assuming that given data are not covered by the rules
   * in position 0...(index-1), and the statistics of these
   * rules are provided.
   * This procedure is typically useful when a temporary
   * object of RuleStats is constructed in order to efficiently
   * calculate the relative DL of the rule in position index,
   * thus all other stuff is not needed.
   *
* @param index the given position
* @param uncovered the data not covered by rules before index
* @param prevRuleStats the provided stats of previous rules
*/
public void countData(int index, Instances uncovered,
double[][] prevRuleStats){
if((m_Filtered != null) ||
(m_Ruleset == null))
return;
int size = m_Ruleset.size();
m_Filtered = new FastVector(size);
m_SimpleStats = new FastVector(size);
Instances[] data = new Instances[2];
data[1] = uncovered;
for(int i=0; i < index; i++){
m_SimpleStats.addElement(prevRuleStats[i]);
if(i+1 == index)
m_Filtered.addElement(data);
else
      m_Filtered.addElement(new Object()); // dummy placeholder
}
for(int j=index; j < size; j++){
double[] stats = new double[6]; // 6 statistics parameters
Instances[] filtered = computeSimpleStats(j, data[1], stats, null);
m_Filtered.addElement(filtered);
m_SimpleStats.addElement(stats);
data = filtered; // Data not covered
}
}
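  // Usage sketch (illustrative only; uncoveredData, prevStats, total and
  // expFPRate are hypothetical values computed elsewhere): a temporary
  // RuleStats can be used to obtain the relative DL of rule i cheaply:
  //
  //   RuleStats tmp = new RuleStats(data, ruleset);
  //   tmp.setNumAllConds(total);
  //   tmp.countData(i, uncoveredData, prevStats);
  //   double rdl = tmp.relativeDL(i, expFPRate, true);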
/**
 * Find all the instances in the dataset covered/not covered by
 * the rule at the given index; the corresponding simple statistics
 * and predicted class distributions are stored in the given arrays,
 * which can be obtained by getSimpleStats() and getDistributions().
 *
 * @param index the given index, assuming correct
 * @param insts the dataset to be covered by the rule
 * @param stats the given double array to hold stats, side-effected
 * @param dist the given array to hold class distributions, side-effected;
 * if null, the distribution is not computed
* @return the instances covered and not covered by the rule
*/
private Instances[] computeSimpleStats(int index, Instances insts,
double[] stats, double[] dist){
Rule rule = (Rule)m_Ruleset.elementAt(index);
Instances[] data = new Instances[2];
data[0] = new Instances(insts, insts.numInstances());
data[1] = new Instances(insts, insts.numInstances());
    for(int i = 0; i < insts.numInstances(); i++){
      // ... add each instance to data[0] (covered) or data[1] (not covered)
      // depending on whether the rule covers it, and update the stats and
      // class distribution accordingly
    }
    return data;
  }

  /**
   * Compute the minimal data description length of the ruleset
   * if the rule in the given position is deleted:
   *     min_data_DL_if_deleted = data_DL_if_deleted - potential
*
* @param index the index of the rule in question
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether check if error rate >= 0.5
* @return the minDataDL
*/
public double minDataDLIfDeleted(int index, double expFPRate,
boolean checkErr){
//System.out.println("!!!Enter without: ");
double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
int more = m_Ruleset.size() - 1 - index; // How many rules after?
FastVector indexPlus = new FastVector(more); // Their stats
// 0...(index-1) are OK
    for(int j = 0; j < index; j++){
      // ... covered stats of the rules before the given index are cumulative
    }
    // ... then recount the stats of the rules after the given index on the
    // remaining data, subtract their potential, and return the data DL of
    // the resulting ruleset