Context Navigation

source: src/main/java/weka/classifiers/rules/RuleStats.java @ 16

Last change on this file since 16 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 29.2 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* RuleStats.java
	19	* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
	20	*/
	21
	22	package weka.classifiers.rules;
	23
	24	import weka.core.Attribute;
	25	import weka.core.FastVector;
	26	import weka.core.Instance;
	27	import weka.core.Instances;
	28	import weka.core.RevisionHandler;
	29	import weka.core.RevisionUtils;
	30	import weka.core.Utils;
	31
	32	import java.io.Serializable;
	33	import java.util.Enumeration;
	34	import java.util.Random;
	35
	36	/**
	37	* This class implements the statistics functions used in the
	38	* propositional rule learner, from the simpler ones like count of
	39	* true/false positive/negatives, filter data based on the ruleset, etc.
	40	* to the more sophisticated ones such as MDL calculation and rule
	41	* variants generation for each rule in the ruleset. <p>
	42	*
	43	* Obviously the statistics functions listed above need the specific
	44	* data and the specific ruleset, which are given in order to instantiate
	45	* an object of this class. <p>
	46	*
	47	* @author Xin Xu (xx5@cs.waikato.ac.nz)
	48	* @version $Revision: 4608 $
	49	*/
	50	public class RuleStats
	51	implements Serializable, RevisionHandler {
	52
	53	/** for serialization */
	54	static final long serialVersionUID = -5708153367675298624L;
	55
	56	/** The data on which the stats calculation is based */
	57	private Instances m_Data;
	58
	59	/** The specific ruleset in question */
	60	private FastVector m_Ruleset;
	61
	62	/** The simple stats of each rule */
	63	private FastVector m_SimpleStats;
	64
	65	/** The set of instances filtered by the ruleset */
	66	private FastVector m_Filtered;
	67
	68	/** The total number of possible conditions that could
	69	* appear in a rule */
	70	private double m_Total;
	71
	72	/** The redundancy factor in theory description length */
	73	private static double REDUNDANCY_FACTOR = 0.5;
	74
	75	/** The theory weight in the MDL calculation */
	76	private double MDL_THEORY_WEIGHT = 1.0;
	77
	78	/** The class distributions predicted by each rule */
	79	private FastVector m_Distributions;
	80
	/**
	 * Creates an empty stats object: no data, no ruleset, no cached
	 * statistics, and the number of possible conditions set to -1.
	 */
	public RuleStats() {
		m_Total = -1;
		m_Distributions = null;
		m_Filtered = null;
		m_SimpleStats = null;
		m_Ruleset = null;
		m_Data = null;
	}
	90
	91
	/**
	 * Creates a stats object for the given data and ruleset. All cached
	 * statistics start out empty, exactly as with the default constructor.
	 *
	 * @param data the data
	 * @param rules the ruleset
	 */
	public RuleStats(Instances data, FastVector rules) {
		this();
		m_Ruleset = rules;
		m_Data = data;
	}
	103
	104	/**
	105	* Frees up memory after classifier has been built.
	106	*/
	107	public void cleanUp() {
	108	m_Data = null;
	109	m_Filtered = null;
	110	}
	111
	112	/**
	113	* Set the number of all conditions that could appear
	114	* in a rule in this RuleStats object, if the number set
	115	* is smaller than 0 (typically -1), then it calcualtes
	116	* based on the data store
	117	*
	118	* @param total the set number
	119	*/
	120	public void setNumAllConds(double total){
	121	if(total < 0)
	122	m_Total = numAllConditions(m_Data);
	123	else
	124	m_Total = total;
	125	}
	126
	127	/**
	128	* Set the data of the stats, overwriting the old one if any
	129	*
	130	* @param data the data to be set
	131	*/
	132	public void setData(Instances data){
	133	m_Data = data;
	134	}
	135
	136	/**
	137	* Get the data of the stats
	138	*
	139	* @return the data
	140	*/
	141	public Instances getData(){
	142	return m_Data;
	143	}
	144
	145
	146	/**
	147	* Set the ruleset of the stats, overwriting the old one if any
	148	*
	149	* @param rules the set of rules to be set
	150	*/
	151	public void setRuleset(FastVector rules){
	152	m_Ruleset = rules;
	153	}
	154
	155
	156	/**
	157	* Get the ruleset of the stats
	158	*
	159	* @return the set of rules
	160	*/
	161	public FastVector getRuleset(){
	162	return m_Ruleset;
	163	}
	164
	165	/**
	166	* Get the size of the ruleset in the stats
	167	*
	168	* @return the size of ruleset
	169	*/
	170	public int getRulesetSize(){
	171	return m_Ruleset.size();
	172	}
	173
	174	/**
	175	* Get the simple stats of one rule, including 6 parameters:
	176	* 0: coverage; 1:uncoverage; 2: true positive; 3: true negatives;
	177	* 4: false positives; 5: false negatives
	178	*
	179	* @param index the index of the rule
	180	* @return the stats
	181	*/
	182	public double[] getSimpleStats(int index){
	183	if((m_SimpleStats != null) && (index < m_SimpleStats.size()))
	184	return (double[])m_SimpleStats.elementAt(index);
	185
	186	return null;
	187	}
	188
	189
	190	/**
	191	* Get the data after filtering the given rule
	192	*
	193	* @param index the index of the rule
	194	* @return the data covered and uncovered by the rule
	195	*/
	196	public Instances[] getFiltered(int index){
	197
	198	if((m_Filtered != null) && (index < m_Filtered.size()))
	199	return (Instances[])m_Filtered.elementAt(index);
	200
	201	return null;
	202	}
	203
	204	/**
	205	* Get the class distribution predicted by the rule in
	206	* given position
	207	*
	208	* @param index the position index of the rule
	209	* @return the class distributions
	210	*/
	211	public double[] getDistributions(int index){
	212
	213	if((m_Distributions != null) && (index < m_Distributions.size()))
	214	return (double[])m_Distributions.elementAt(index);
	215
	216	return null;
	217	}
	218
	219	/**
	220	* Set the weight of theory in MDL calcualtion
	221	*
	222	* @param weight the weight to be set
	223	*/
	224	public void setMDLTheoryWeight(double weight){
	225	MDL_THEORY_WEIGHT = weight;
	226	}
	227
	228	/**
	229	* Compute the number of all possible conditions that could
	230	* appear in a rule of a given data. For nominal attributes,
	231	* it's the number of values that could appear; for numeric
	232	* attributes, it's the number of values * 2, i.e. <= and >=
	233	* are counted as different possible conditions.
	234	*
	235	* @param data the given data
	236	* @return number of all conditions of the data
	237	*/
	238	public static double numAllConditions(Instances data){
	239	double total = 0;
	240	Enumeration attEnum = data.enumerateAttributes();
	241	while(attEnum.hasMoreElements()){
	242	Attribute att= (Attribute)attEnum.nextElement();
	243	if(att.isNominal())
	244	total += (double)att.numValues();
	245	else
	246	total += 2.0 * (double)data.numDistinctValues(att);
	247	}
	248	return total;
	249	}
	250
	251
	252	/**
	253	* Filter the data according to the ruleset and compute the basic
	254	* stats: coverage/uncoverage, true/false positive/negatives of
	255	* each rule
	256	*/
	257	public void countData(){
	258	if((m_Filtered != null) \|\|
	259	(m_Ruleset == null) \|\|
	260	(m_Data == null))
	261	return;
	262
	263	int size = m_Ruleset.size();
	264	m_Filtered = new FastVector(size);
	265	m_SimpleStats = new FastVector(size);
	266	m_Distributions = new FastVector(size);
	267	Instances data = new Instances(m_Data);
	268
	269	for(int i=0; i < size; i++){
	270	double[] stats = new double[6]; // 6 statistics parameters
	271	double[] classCounts = new double[m_Data.classAttribute().numValues()];
	272	Instances[] filtered = computeSimpleStats(i, data, stats, classCounts);
	273	m_Filtered.addElement(filtered);
	274	m_SimpleStats.addElement(stats);
	275	m_Distributions.addElement(classCounts);
	276	data = filtered[1]; // Data not covered
	277	}
	278	}
	279
	280	/**
	281	* Count data from the position index in the ruleset
	282	* assuming that given data are not covered by the rules
	283	* in position 0...(index-1), and the statistics of these
	284	* rules are provided.<br>
	285	* This procedure is typically useful when a temporary
	286	* object of RuleStats is constructed in order to efficiently
	287	* calculate the relative DL of rule in position index,
	288	* thus all other stuff is not needed.
	289	*
	290	* @param index the given position
	291	* @param uncovered the data not covered by rules before index
	292	* @param prevRuleStats the provided stats of previous rules
	293	*/
	294	public void countData(int index, Instances uncovered,
	295	double[][] prevRuleStats){
	296	if((m_Filtered != null) \|\|
	297	(m_Ruleset == null))
	298	return;
	299
	300	int size = m_Ruleset.size();
	301	m_Filtered = new FastVector(size);
	302	m_SimpleStats = new FastVector(size);
	303	Instances[] data = new Instances[2];
	304	data[1] = uncovered;
	305
	306	for(int i=0; i < index; i++){
	307	m_SimpleStats.addElement(prevRuleStats[i]);
	308	if(i+1 == index)
	309	m_Filtered.addElement(data);
	310	else
	311	m_Filtered.addElement(new Object()); // Stuff sth.
	312	}
	313
	314	for(int j=index; j < size; j++){
	315	double[] stats = new double[6]; // 6 statistics parameters
	316	Instances[] filtered = computeSimpleStats(j, data[1], stats, null);
	317	m_Filtered.addElement(filtered);
	318	m_SimpleStats.addElement(stats);
	319	data = filtered; // Data not covered
	320	}
	321	}
	322
	323	/**
	324	* Find all the instances in the dataset covered/not covered by
	325	* the rule in given index, and the correponding simple statistics
	326	* and predicted class distributions are stored in the given double array,
	327	* which can be obtained by getSimpleStats() and getDistributions().<br>
	328	*
	329	* @param index the given index, assuming correct
	330	* @param insts the dataset to be covered by the rule
	331	* @param stats the given double array to hold stats, side-effected
	332	* @param dist the given array to hold class distributions, side-effected
	333	* if null, the distribution is not necessary
	334	* @return the instances covered and not covered by the rule
	335	*/
	336	private Instances[] computeSimpleStats(int index, Instances insts,
	337	double[] stats, double[] dist){
	338	Rule rule = (Rule)m_Ruleset.elementAt(index);
	339
	340	Instances[] data = new Instances[2];
	341	data[0] = new Instances(insts, insts.numInstances());
	342	data[1] = new Instances(insts, insts.numInstances());
	343
	344	for(int i=0; i<insts.numInstances(); i++){
	345	Instance datum = insts.instance(i);
	346	double weight = datum.weight();
	347	if(rule.covers(datum)){
	348	data[0].add(datum); // Covered by this rule
	349	stats[0] += weight; // Coverage
	350	if((int)datum.classValue() == (int)rule.getConsequent())
	351	stats[2] += weight; // True positives
	352	else
	353	stats[4] += weight; // False positives
	354	if(dist != null)
	355	dist[(int)datum.classValue()] += weight;
	356	}
	357	else{
	358	data[1].add(datum); // Not covered by this rule
	359	stats[1] += weight;
	360	if((int)datum.classValue() != (int)rule.getConsequent())
	361	stats[3] += weight; // True negatives
	362	else
	363	stats[5] += weight; // False negatives
	364	}
	365	}
	366
	367	return data;
	368	}
	369
	370
	371	/**
	372	* Add a rule to the ruleset and update the stats
	373	*
	374	* @param lastRule the rule to be added
	375	*/
	376	public void addAndUpdate(Rule lastRule){
	377	if(m_Ruleset == null)
	378	m_Ruleset = new FastVector();
	379	m_Ruleset.addElement(lastRule);
	380
	381	Instances data = (m_Filtered == null) ?
	382	m_Data : ((Instances[])m_Filtered.lastElement())[1];
	383	double[] stats = new double[6];
	384	double[] classCounts = new double[m_Data.classAttribute().numValues()];
	385	Instances[] filtered =
	386	computeSimpleStats(m_Ruleset.size()-1, data, stats, classCounts);
	387
	388	if(m_Filtered == null)
	389	m_Filtered = new FastVector();
	390	m_Filtered.addElement(filtered);
	391
	392	if(m_SimpleStats == null)
	393	m_SimpleStats = new FastVector();
	394	m_SimpleStats.addElement(stats);
	395
	396	if(m_Distributions == null)
	397	m_Distributions = new FastVector();
	398	m_Distributions.addElement(classCounts);
	399	}
	400
	401
	402	/**
	403	* Subset description length: <br>
	404	* S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)
	405	*
	406	* Details see Quilan: "MDL and categorical theories (Continued)",ML95
	407	*
	408	* @param t the number of elements in a known set
	409	* @param k the number of elements in a subset
	410	* @param p the expected proportion of subset known by recipient
	411	* @return the subset description length
	412	*/
	413	public static double subsetDL(double t, double k, double p){
	414	double rt = Utils.gr(p, 0.0) ? (- k*Utils.log2(p)) : 0.0;
	415	rt -= (t-k)*Utils.log2(1-p);
	416	return rt;
	417	}
	418
	419
	420	/**
	421	* The description length of the theory for a given rule. Computed as:<br>
	422	* 0.5* [\|\|k\|\|+ S(t, k, k/t)]<br>
	423	* where k is the number of antecedents of the rule; t is the total
	424	* possible antecedents that could appear in a rule; \|\|K\|\| is the
	425	* universal prior for k , log2(k) and S(t,k,p) = -klog2(p)-(n-k)log2(1-p)
	426	* is the subset encoding length.<p>
	427	*
	428	* Details see Quilan: "MDL and categorical theories (Continued)",ML95
	429	*
	430	* @param index the index of the given rule (assuming correct)
	431	* @return the theory DL, weighted if weight != 1.0
	432	*/
	433	public double theoryDL(int index){
	434
	435	double k = ((Rule)m_Ruleset.elementAt(index)).size();
	436
	437	if(k == 0)
	438	return 0.0;
	439
	440	double tdl = Utils.log2(k);
	441	if(k > 1) // Approximation
	442	tdl += 2.0 * Utils.log2(tdl); // of log2 star
	443	tdl += subsetDL(m_Total, k, k/m_Total);
	444	//System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
	445	return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
	446	}
	447
	448
	449	/**
	450	* The description length of data given the parameters of the data
	451	* based on the ruleset. <p>
	452	* Details see Quinlan: "MDL and categorical theories (Continued)",ML95<p>
	453	*
	454	* @param expFPOverErr expected FP/(FP+FN)
	455	* @param cover coverage
	456	* @param uncover uncoverage
	457	* @param fp False Positive
	458	* @param fn False Negative
	459	* @return the description length
	460	*/
	461	public static double dataDL(double expFPOverErr, double cover,
	462	double uncover, double fp, double fn){
	463	double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
	464	double coverBits, uncoverBits; // What's the error?
	465	double expErr; // Expected FP or FN
	466
	467	if(Utils.gr(cover, uncover)){
	468	expErr = expFPOverErr*(fp+fn);
	469	coverBits = subsetDL(cover, fp, expErr/cover);
	470	uncoverBits = Utils.gr(uncover, 0.0) ?
	471	subsetDL(uncover, fn, fn/uncover) : 0.0;
	472	}
	473	else{
	474	expErr = (1.0-expFPOverErr)*(fp+fn);
	475	coverBits = Utils.gr(cover, 0.0) ?
	476	subsetDL(cover, fp, fp/cover) : 0.0;
	477	uncoverBits = subsetDL(uncover, fn, expErr/uncover);
	478	}
	479
	480	/*
	481	System.err.println("!!!cover: " + cover + "\|uncover" + uncover +
	482	"\|coverBits: "+coverBits+"\|uncBits: "+ uncoverBits+
	483	"\|FPRate: "+expFPOverErr + "\|expErr: "+expErr+
	484	"\|fp: "+fp+"\|fn: "+fn+"\|total: "+totalBits);
	485	*/
	486	return (totalBits + coverBits + uncoverBits);
	487	}
	488
	489
	490	/**
	491	* Calculate the potential to decrease DL of the ruleset,
	492	* i.e. the possible DL that could be decreased by deleting
	493	* the rule whose index and simple statstics are given.
	494	* If there's no potentials (i.e. smOrEq 0 && error rate < 0.5),
	495	* it returns NaN. <p>
	496	*
	497	* The way this procedure does is copied from original RIPPER
	498	* implementation and is quite bizzare because it
	499	* does not update the following rules' stats recursively
	500	* any more when testing each rule, which means it assumes
	501	* after deletion no data covered by the following rules (or
	502	* regards the deleted rule as the last rule). Reasonable
	503	* assumption?<p>
	504	*
	505	* @param index the index of the rule in m_Ruleset to be deleted
	506	* @param expFPOverErr expected FP/(FP+FN)
	507	* @param rulesetStat the simple statistics of the ruleset, updated
	508	* if the rule should be deleted
	509	* @param ruleStat the simple statistics of the rule to be deleted
	510	* @param checkErr whether check if error rate >= 0.5
	511	* @return the potential DL that could be decreased
	512	*/
	513	public double potential(int index, double expFPOverErr,
	514	double[] rulesetStat, double[] ruleStat,
	515	boolean checkErr){
	516	//System.out.println("!!!inside potential: ");
	517	// Restore the stats if deleted
	518	double pcov = rulesetStat[0] - ruleStat[0];
	519	double puncov = rulesetStat[1] + ruleStat[0];
	520	double pfp = rulesetStat[4] - ruleStat[4];
	521	double pfn = rulesetStat[5] + ruleStat[2];
	522
	523	double dataDLWith = dataDL(expFPOverErr, rulesetStat[0],
	524	rulesetStat[1], rulesetStat[4],
	525	rulesetStat[5]);
	526	double theoryDLWith = theoryDL(index);
	527	double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);
	528
	529	double potential = dataDLWith + theoryDLWith - dataDLWithout;
	530	double err = ruleStat[4] / ruleStat[0];
	531	/*System.out.println("!!!"+dataDLWith +" \| "+
	532	theoryDLWith + " \| "
	533	+dataDLWithout+"\|"+ruleStat[4] + " / " + ruleStat[0]);
	534	*/
	535	boolean overErr = Utils.grOrEq(err, 0.5);
	536	if(!checkErr)
	537	overErr = false;
	538
	539	if(Utils.grOrEq(potential, 0.0) \|\| overErr){
	540	// If deleted, update ruleset stats. Other stats do not matter
	541	rulesetStat[0] = pcov;
	542	rulesetStat[1] = puncov;
	543	rulesetStat[4] = pfp;
	544	rulesetStat[5] = pfn;
	545	return potential;
	546	}
	547	else
	548	return Double.NaN;
	549	}
	550
	551
	552	/**
	553	* Compute the minimal data description length of the ruleset
	554	* if the rule in the given position is deleted.<br>
	555	* The min_data_DL_if_deleted = data_DL_if_deleted - potential
	556	*
	557	* @param index the index of the rule in question
	558	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
	559	* @param checkErr whether check if error rate >= 0.5
	560	* @return the minDataDL
	561	*/
	562	public double minDataDLIfDeleted(int index, double expFPRate,
	563	boolean checkErr){
	564	//System.out.println("!!!Enter without: ");
	565	double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
	566	int more = m_Ruleset.size() - 1 - index; // How many rules after?
	567	FastVector indexPlus = new FastVector(more); // Their stats
	568
	569	// 0...(index-1) are OK
	570	for(int j=0; j<index; j++){
	571	// Covered stats are cumulative
	572	rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
	573	rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
	574	rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
	575	}
	576
	577	// Recount data from index+1
	578	Instances data = (index == 0) ?
	579	m_Data : ((Instances[])m_Filtered.elementAt(index-1))[1];
	580	//System.out.println("!!!without: " + data.sumOfWeights());
	581
	582	for(int j=(index+1); j<m_Ruleset.size(); j++){
	583	double[] stats = new double[6];
	584	Instances[] split = computeSimpleStats(j, data, stats, null);
	585	indexPlus.addElement(stats);
	586	rulesetStat[0] += stats[0];
	587	rulesetStat[2] += stats[2];
	588	rulesetStat[4] += stats[4];
	589	data = split[1];
	590	}
	591	// Uncovered stats are those of the last rule
	592	if(more > 0){
	593	rulesetStat[1] = ((double[])indexPlus.lastElement())[1];
	594	rulesetStat[3] = ((double[])indexPlus.lastElement())[3];
	595	rulesetStat[5] = ((double[])indexPlus.lastElement())[5];
	596	}
	597	else if(index > 0){
	598	rulesetStat[1] =
	599	((double[])m_SimpleStats.elementAt(index-1))[1];
	600	rulesetStat[3] =
	601	((double[])m_SimpleStats.elementAt(index-1))[3];
	602	rulesetStat[5] =
	603	((double[])m_SimpleStats.elementAt(index-1))[5];
	604	}
	605	else{ // Null coverage
	606	rulesetStat[1] = ((double[])m_SimpleStats.elementAt(0))[0] +
	607	((double[])m_SimpleStats.elementAt(0))[1];
	608	rulesetStat[3] = ((double[])m_SimpleStats.elementAt(0))[3] +
	609	((double[])m_SimpleStats.elementAt(0))[4];
	610	rulesetStat[5] = ((double[])m_SimpleStats.elementAt(0))[2] +
	611	((double[])m_SimpleStats.elementAt(0))[5];
	612	}
	613
	614	// Potential
	615	double potential = 0;
	616	for(int k=index+1; k<m_Ruleset.size(); k++){
	617	double[] ruleStat = (double[])indexPlus.elementAt(k-index-1);
	618	double ifDeleted = potential(k, expFPRate, rulesetStat,
	619	ruleStat, checkErr);
	620	if(!Double.isNaN(ifDeleted))
	621	potential += ifDeleted;
	622	}
	623
	624	// Data DL of the ruleset without the rule
	625	// Note that ruleset stats has already been updated to reflect
	626	// deletion if any potential
	627	double dataDLWithout = dataDL(expFPRate, rulesetStat[0],
	628	rulesetStat[1], rulesetStat[4],
	629	rulesetStat[5]);
	630	//System.out.println("!!!without: "+dataDLWithout + " \|potential: "+
	631	// potential);
	632	// Why subtract potential again? To reflect change of theory DL??
	633	return (dataDLWithout - potential);
	634	}
	635
	636
	637	/**
	638	* Compute the minimal data description length of the ruleset
	639	* if the rule in the given position is NOT deleted.<br>
	640	* The min_data_DL_if_n_deleted = data_DL_if_n_deleted - potential
	641	*
	642	* @param index the index of the rule in question
	643	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
	644	* @param checkErr whether check if error rate >= 0.5
	645	* @return the minDataDL
	646	*/
	647	public double minDataDLIfExists(int index, double expFPRate,
	648	boolean checkErr){
	649	// System.out.println("!!!Enter with: ");
	650	double[] rulesetStat = new double[6]; // Stats of ruleset if rule exists
	651	for(int j=0; j<m_SimpleStats.size(); j++){
	652	// Covered stats are cumulative
	653	rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
	654	rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
	655	rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
	656	if(j == m_SimpleStats.size()-1){ // Last rule
	657	rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
	658	rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
	659	rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
	660	}
	661	}
	662
	663	// Potential
	664	double potential = 0;
	665	for(int k=index+1; k<m_SimpleStats.size(); k++){
	666	double[] ruleStat = (double[])getSimpleStats(k);
	667	double ifDeleted = potential(k, expFPRate, rulesetStat,
	668	ruleStat, checkErr);
	669	if(!Double.isNaN(ifDeleted))
	670	potential += ifDeleted;
	671	}
	672
	673	// Data DL of the ruleset without the rule
	674	// Note that ruleset stats has already been updated to reflect deletion
	675	// if any potential
	676	double dataDLWith = dataDL(expFPRate, rulesetStat[0],
	677	rulesetStat[1], rulesetStat[4],
	678	rulesetStat[5]);
	679	//System.out.println("!!!with: "+dataDLWith + " \|potential: "+
	680	// potential);
	681	return (dataDLWith - potential);
	682	}
	683
	684
	685	/**
	686	* The description length (DL) of the ruleset relative to if the
	687	* rule in the given position is deleted, which is obtained by: <br>
	688	* MDL if the rule exists - MDL if the rule does not exist <br>
	689	* Note the minimal possible DL of the ruleset is calculated(i.e. some
	690	* other rules may also be deleted) instead of the DL of the current
	691	* ruleset.<p>
	692	*
	693	* @param index the given position of the rule in question
	694	* (assuming correct)
	695	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
	696	* @param checkErr whether check if error rate >= 0.5
	697	* @return the relative DL
	698	*/
	699	public double relativeDL(int index, double expFPRate, boolean checkErr){
	700
	701	return (minDataDLIfExists(index, expFPRate, checkErr)
	702	+ theoryDL(index) -
	703	minDataDLIfDeleted(index, expFPRate, checkErr));
	704	}
	705
	706
	707	/**
	708	* Try to reduce the DL of the ruleset by testing removing the rules
	709	* one by one in reverse order and update all the stats
	710	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
	711	* @param checkErr whether check if error rate >= 0.5
	712	*/
	713	public void reduceDL(double expFPRate, boolean checkErr){
	714
	715	boolean needUpdate = false;
	716	double[] rulesetStat = new double[6];
	717	for(int j=0; j<m_SimpleStats.size(); j++){
	718	// Covered stats are cumulative
	719	rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
	720	rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
	721	rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
	722	if(j == m_SimpleStats.size()-1){ // Last rule
	723	rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
	724	rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
	725	rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
	726	}
	727	}
	728
	729	// Potential
	730	for(int k=m_SimpleStats.size()-1; k>=0; k--){
	731
	732	double[] ruleStat = (double[])m_SimpleStats.elementAt(k);
	733
	734	// rulesetStat updated
	735	double ifDeleted = potential(k, expFPRate, rulesetStat,
	736	ruleStat, checkErr);
	737	if(!Double.isNaN(ifDeleted)){
	738	/*System.err.println("!!!deleted ("+k+"): save "+ifDeleted
	739	+" \| "+rulesetStat[0]
	740	+" \| "+rulesetStat[1]
	741	+" \| "+rulesetStat[4]
	742	+" \| "+rulesetStat[5]);
	743	*/
	744
	745	if(k == (m_SimpleStats.size()-1))
	746	removeLast();
	747	else{
	748	m_Ruleset.removeElementAt(k);
	749	needUpdate = true;
	750	}
	751	}
	752	}
	753
	754	if(needUpdate){
	755	m_Filtered = null;
	756	m_SimpleStats = null;
	757	countData();
	758	}
	759	}
	760
	761	/**
	762	* Remove the last rule in the ruleset as well as it's stats.
	763	* It might be useful when the last rule was added for testing
	764	* purpose and then the test failed
	765	*/
	766	public void removeLast(){
	767	int last = m_Ruleset.size()-1;
	768	m_Ruleset.removeElementAt(last);
	769	m_Filtered.removeElementAt(last);
	770	m_SimpleStats.removeElementAt(last);
	771	if(m_Distributions != null)
	772	m_Distributions.removeElementAt(last);
	773	}
	774
	775	/**
	776	* Static utility function to count the data covered by the
	777	* rules after the given index in the given rules, and then
	778	* remove them. It returns the data not covered by the
	779	* successive rules.
	780	*
	781	* @param data the data to be processed
	782	* @param rules the ruleset
	783	* @param index the given index
	784	* @return the data after processing
	785	*/
	786	public static Instances rmCoveredBySuccessives(Instances data, FastVector rules, int index){
	787	Instances rt = new Instances(data, 0);
	788
	789	for(int i=0; i < data.numInstances(); i++){
	790	Instance datum = data.instance(i);
	791	boolean covered = false;
	792
	793	for(int j=index+1; j<rules.size();j++){
	794	Rule rule = (Rule)rules.elementAt(j);
	795	if(rule.covers(datum)){
	796	covered = true;
	797	break;
	798	}
	799	}
	800
	801	if(!covered)
	802	rt.add(datum);
	803	}
	804	return rt;
	805	}
	806
	807	/**
	808	* Stratify the given data into the given number of bags based on the class
	809	* values. It differs from the <code>Instances.stratify(int fold)</code>
	810	* that before stratification it sorts the instances according to the
	811	* class order in the header file. It assumes no missing values in the class.
	812	*
	813	* @param data the given data
	814	* @param folds the given number of folds
	815	* @param rand the random object used to randomize the instances
	816	* @return the stratified instances
	817	*/
	818	public static final Instances stratify(Instances data, int folds, Random rand){
	819	if(!data.classAttribute().isNominal())
	820	return data;
	821
	822	Instances result = new Instances(data, 0);
	823	Instances[] bagsByClasses = new Instances[data.numClasses()];
	824
	825	for(int i=0; i < bagsByClasses.length; i++)
	826	bagsByClasses[i] = new Instances(data, 0);
	827
	828	// Sort by class
	829	for(int j=0; j < data.numInstances(); j++){
	830	Instance datum = data.instance(j);
	831	bagsByClasses[(int)datum.classValue()].add(datum);
	832	}
	833
	834	// Randomize each class
	835	for(int j=0; j < bagsByClasses.length; j++)
	836	bagsByClasses[j].randomize(rand);
	837
	838	for(int k=0; k < folds; k++){
	839	int offset = k, bag = 0;
	840	oneFold:
	841	while (true){
	842	while(offset >= bagsByClasses[bag].numInstances()){
	843	offset -= bagsByClasses[bag].numInstances();
	844	if (++bag >= bagsByClasses.length)// Next bag
	845	break oneFold;
	846	}
	847
	848	result.add(bagsByClasses[bag].instance(offset));
	849	offset += folds;
	850	}
	851	}
	852
	853	return result;
	854	}
	855
	856	/**
	857	* Compute the combined DL of the ruleset in this class, i.e. theory
	858	* DL and data DL. Note this procedure computes the combined DL
	859	* according to the current status of the ruleset in this class
	860	*
	861	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
	862	* @param predicted the default classification if ruleset covers null
	863	* @return the combined class
	864	*/
	865	public double combinedDL(double expFPRate, double predicted){
	866	double rt = 0;
	867
	868	if(getRulesetSize() > 0) {
	869	double[] stats = (double[])m_SimpleStats.lastElement();
	870	for(int j=getRulesetSize()-2; j >= 0; j--){
	871	stats[0] += getSimpleStats(j)[0];
	872	stats[2] += getSimpleStats(j)[2];
	873	stats[4] += getSimpleStats(j)[4];
	874	}
	875	rt += dataDL(expFPRate, stats[0], stats[1],
	876	stats[4], stats[5]); // Data DL
	877	}
	878	else{ // Null coverage ruleset
	879	double fn = 0.0;
	880	for(int j=0; j < m_Data.numInstances(); j++)
	881	if((int)m_Data.instance(j).classValue() == (int)predicted)
	882	fn += m_Data.instance(j).weight();
	883	rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
	884	}
	885
	886	for(int i=0; i<getRulesetSize(); i++) // Theory DL
	887	rt += theoryDL(i);
	888
	889	return rt;
	890	}
	891
	892	/**
	893	* Patition the data into 2, first of which has (numFolds-1)/numFolds of
	894	* the data and the second has 1/numFolds of the data
	895	*
	896	*
	897	* @param data the given data
	898	* @param numFolds the given number of folds
	899	* @return the patitioned instances
	900	*/
	901	public static final Instances[] partition(Instances data, int numFolds){
	902	Instances[] rt = new Instances[2];
	903	int splits = data.numInstances() * (numFolds - 1) / numFolds;
	904
	905	rt[0] = new Instances(data, 0, splits);
	906	rt[1] = new Instances(data, splits, data.numInstances()-splits);
	907
	908	return rt;
	909	}
	910
	911	/**
	912	* Returns the revision string.
	913	*
	914	* @return the revision
	915	*/
	916	public String getRevision() {
	917	return RevisionUtils.extract("$Revision: 4608 $");
	918	}
	919	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: