Context Navigation

source: src/main/java/weka/classifiers/rules/RuleStats.java @ 9

Last change on this file since 9 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 29.2 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* RuleStats.java
19	* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
20	*/
21
22	package weka.classifiers.rules;
23
24	import weka.core.Attribute;
25	import weka.core.FastVector;
26	import weka.core.Instance;
27	import weka.core.Instances;
28	import weka.core.RevisionHandler;
29	import weka.core.RevisionUtils;
30	import weka.core.Utils;
31
32	import java.io.Serializable;
33	import java.util.Enumeration;
34	import java.util.Random;
35
36	/**
37	* This class implements the statistics functions used in the
38	* propositional rule learner, from the simpler ones like count of
39	* true/false positive/negatives, filter data based on the ruleset, etc.
40	* to the more sophisticated ones such as MDL calculation and rule
41	* variants generation for each rule in the ruleset. <p>
42	*
43	* Obviously the statistics functions listed above need the specific
44	* data and the specific ruleset, which are given in order to instantiate
45	* an object of this class. <p>
46	*
47	* @author Xin Xu (xx5@cs.waikato.ac.nz)
48	* @version $Revision: 4608 $
49	*/
50	public class RuleStats
51	implements Serializable, RevisionHandler {
52
53	/** for serialization */
54	static final long serialVersionUID = -5708153367675298624L;
55
56	/** The data on which the stats calculation is based */
57	private Instances m_Data;
58
59	/** The specific ruleset in question */
60	private FastVector m_Ruleset;
61
62	/** The simple stats of each rule */
63	private FastVector m_SimpleStats;
64
65	/** The set of instances filtered by the ruleset */
66	private FastVector m_Filtered;
67
68	/** The total number of possible conditions that could
69	* appear in a rule */
70	private double m_Total;
71
72	/** The redundancy factor in theory description length */
73	private static double REDUNDANCY_FACTOR = 0.5;
74
75	/** The theory weight in the MDL calculation */
76	private double MDL_THEORY_WEIGHT = 1.0;
77
78	/** The class distributions predicted by each rule */
79	private FastVector m_Distributions;
80
/**
 * Default constructor: creates an empty object with no data, no ruleset
 * and no computed statistics.
 */
public RuleStats() {
  // No condition count has been set yet.
  m_Total = -1;

  m_Distributions = null;
  m_Filtered = null;
  m_SimpleStats = null;
  m_Ruleset = null;
  m_Data = null;
}
90
91
/**
 * Constructor that provides the ruleset and the data. Both references
 * are stored as given (no copy is made).
 *
 * @param data the data
 * @param rules the ruleset
 */
public RuleStats(Instances data, FastVector rules) {
  this();
  this.m_Ruleset = rules;
  this.m_Data = data;
}
103
104	/**
105	* Frees up memory after classifier has been built.
106	*/
107	public void cleanUp() {
108	m_Data = null;
109	m_Filtered = null;
110	}
111
112	/**
113	* Set the number of all conditions that could appear
114	* in a rule in this RuleStats object, if the number set
115	* is smaller than 0 (typically -1), then it calcualtes
116	* based on the data store
117	*
118	* @param total the set number
119	*/
120	public void setNumAllConds(double total){
121	if(total < 0)
122	m_Total = numAllConditions(m_Data);
123	else
124	m_Total = total;
125	}
126
127	/**
128	* Set the data of the stats, overwriting the old one if any
129	*
130	* @param data the data to be set
131	*/
132	public void setData(Instances data){
133	m_Data = data;
134	}
135
136	/**
137	* Get the data of the stats
138	*
139	* @return the data
140	*/
141	public Instances getData(){
142	return m_Data;
143	}
144
145
146	/**
147	* Set the ruleset of the stats, overwriting the old one if any
148	*
149	* @param rules the set of rules to be set
150	*/
151	public void setRuleset(FastVector rules){
152	m_Ruleset = rules;
153	}
154
155
156	/**
157	* Get the ruleset of the stats
158	*
159	* @return the set of rules
160	*/
161	public FastVector getRuleset(){
162	return m_Ruleset;
163	}
164
165	/**
166	* Get the size of the ruleset in the stats
167	*
168	* @return the size of ruleset
169	*/
170	public int getRulesetSize(){
171	return m_Ruleset.size();
172	}
173
174	/**
175	* Get the simple stats of one rule, including 6 parameters:
176	* 0: coverage; 1:uncoverage; 2: true positive; 3: true negatives;
177	* 4: false positives; 5: false negatives
178	*
179	* @param index the index of the rule
180	* @return the stats
181	*/
182	public double[] getSimpleStats(int index){
183	if((m_SimpleStats != null) && (index < m_SimpleStats.size()))
184	return (double[])m_SimpleStats.elementAt(index);
185
186	return null;
187	}
188
189
190	/**
191	* Get the data after filtering the given rule
192	*
193	* @param index the index of the rule
194	* @return the data covered and uncovered by the rule
195	*/
196	public Instances[] getFiltered(int index){
197
198	if((m_Filtered != null) && (index < m_Filtered.size()))
199	return (Instances[])m_Filtered.elementAt(index);
200
201	return null;
202	}
203
204	/**
205	* Get the class distribution predicted by the rule in
206	* given position
207	*
208	* @param index the position index of the rule
209	* @return the class distributions
210	*/
211	public double[] getDistributions(int index){
212
213	if((m_Distributions != null) && (index < m_Distributions.size()))
214	return (double[])m_Distributions.elementAt(index);
215
216	return null;
217	}
218
219	/**
220	* Set the weight of theory in MDL calcualtion
221	*
222	* @param weight the weight to be set
223	*/
224	public void setMDLTheoryWeight(double weight){
225	MDL_THEORY_WEIGHT = weight;
226	}
227
228	/**
229	* Compute the number of all possible conditions that could
230	* appear in a rule of a given data. For nominal attributes,
231	* it's the number of values that could appear; for numeric
232	* attributes, it's the number of values * 2, i.e. <= and >=
233	* are counted as different possible conditions.
234	*
235	* @param data the given data
236	* @return number of all conditions of the data
237	*/
238	public static double numAllConditions(Instances data){
239	double total = 0;
240	Enumeration attEnum = data.enumerateAttributes();
241	while(attEnum.hasMoreElements()){
242	Attribute att= (Attribute)attEnum.nextElement();
243	if(att.isNominal())
244	total += (double)att.numValues();
245	else
246	total += 2.0 * (double)data.numDistinctValues(att);
247	}
248	return total;
249	}
250
251
252	/**
253	* Filter the data according to the ruleset and compute the basic
254	* stats: coverage/uncoverage, true/false positive/negatives of
255	* each rule
256	*/
257	public void countData(){
258	if((m_Filtered != null) \|\|
259	(m_Ruleset == null) \|\|
260	(m_Data == null))
261	return;
262
263	int size = m_Ruleset.size();
264	m_Filtered = new FastVector(size);
265	m_SimpleStats = new FastVector(size);
266	m_Distributions = new FastVector(size);
267	Instances data = new Instances(m_Data);
268
269	for(int i=0; i < size; i++){
270	double[] stats = new double[6]; // 6 statistics parameters
271	double[] classCounts = new double[m_Data.classAttribute().numValues()];
272	Instances[] filtered = computeSimpleStats(i, data, stats, classCounts);
273	m_Filtered.addElement(filtered);
274	m_SimpleStats.addElement(stats);
275	m_Distributions.addElement(classCounts);
276	data = filtered[1]; // Data not covered
277	}
278	}
279
280	/**
281	* Count data from the position index in the ruleset
282	* assuming that given data are not covered by the rules
283	* in position 0...(index-1), and the statistics of these
284	* rules are provided.<br>
285	* This procedure is typically useful when a temporary
286	* object of RuleStats is constructed in order to efficiently
287	* calculate the relative DL of rule in position index,
288	* thus all other stuff is not needed.
289	*
290	* @param index the given position
291	* @param uncovered the data not covered by rules before index
292	* @param prevRuleStats the provided stats of previous rules
293	*/
294	public void countData(int index, Instances uncovered,
295	double[][] prevRuleStats){
296	if((m_Filtered != null) \|\|
297	(m_Ruleset == null))
298	return;
299
300	int size = m_Ruleset.size();
301	m_Filtered = new FastVector(size);
302	m_SimpleStats = new FastVector(size);
303	Instances[] data = new Instances[2];
304	data[1] = uncovered;
305
306	for(int i=0; i < index; i++){
307	m_SimpleStats.addElement(prevRuleStats[i]);
308	if(i+1 == index)
309	m_Filtered.addElement(data);
310	else
311	m_Filtered.addElement(new Object()); // Stuff sth.
312	}
313
314	for(int j=index; j < size; j++){
315	double[] stats = new double[6]; // 6 statistics parameters
316	Instances[] filtered = computeSimpleStats(j, data[1], stats, null);
317	m_Filtered.addElement(filtered);
318	m_SimpleStats.addElement(stats);
319	data = filtered; // Data not covered
320	}
321	}
322
323	/**
324	* Find all the instances in the dataset covered/not covered by
325	* the rule in given index, and the correponding simple statistics
326	* and predicted class distributions are stored in the given double array,
327	* which can be obtained by getSimpleStats() and getDistributions().<br>
328	*
329	* @param index the given index, assuming correct
330	* @param insts the dataset to be covered by the rule
331	* @param stats the given double array to hold stats, side-effected
332	* @param dist the given array to hold class distributions, side-effected
333	* if null, the distribution is not necessary
334	* @return the instances covered and not covered by the rule
335	*/
336	private Instances[] computeSimpleStats(int index, Instances insts,
337	double[] stats, double[] dist){
338	Rule rule = (Rule)m_Ruleset.elementAt(index);
339
340	Instances[] data = new Instances[2];
341	data[0] = new Instances(insts, insts.numInstances());
342	data[1] = new Instances(insts, insts.numInstances());
343
344	for(int i=0; i<insts.numInstances(); i++){
345	Instance datum = insts.instance(i);
346	double weight = datum.weight();
347	if(rule.covers(datum)){
348	data[0].add(datum); // Covered by this rule
349	stats[0] += weight; // Coverage
350	if((int)datum.classValue() == (int)rule.getConsequent())
351	stats[2] += weight; // True positives
352	else
353	stats[4] += weight; // False positives
354	if(dist != null)
355	dist[(int)datum.classValue()] += weight;
356	}
357	else{
358	data[1].add(datum); // Not covered by this rule
359	stats[1] += weight;
360	if((int)datum.classValue() != (int)rule.getConsequent())
361	stats[3] += weight; // True negatives
362	else
363	stats[5] += weight; // False negatives
364	}
365	}
366
367	return data;
368	}
369
370
371	/**
372	* Add a rule to the ruleset and update the stats
373	*
374	* @param lastRule the rule to be added
375	*/
376	public void addAndUpdate(Rule lastRule){
377	if(m_Ruleset == null)
378	m_Ruleset = new FastVector();
379	m_Ruleset.addElement(lastRule);
380
381	Instances data = (m_Filtered == null) ?
382	m_Data : ((Instances[])m_Filtered.lastElement())[1];
383	double[] stats = new double[6];
384	double[] classCounts = new double[m_Data.classAttribute().numValues()];
385	Instances[] filtered =
386	computeSimpleStats(m_Ruleset.size()-1, data, stats, classCounts);
387
388	if(m_Filtered == null)
389	m_Filtered = new FastVector();
390	m_Filtered.addElement(filtered);
391
392	if(m_SimpleStats == null)
393	m_SimpleStats = new FastVector();
394	m_SimpleStats.addElement(stats);
395
396	if(m_Distributions == null)
397	m_Distributions = new FastVector();
398	m_Distributions.addElement(classCounts);
399	}
400
401
402	/**
403	* Subset description length: <br>
404	* S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)
405	*
406	* Details see Quilan: "MDL and categorical theories (Continued)",ML95
407	*
408	* @param t the number of elements in a known set
409	* @param k the number of elements in a subset
410	* @param p the expected proportion of subset known by recipient
411	* @return the subset description length
412	*/
413	public static double subsetDL(double t, double k, double p){
414	double rt = Utils.gr(p, 0.0) ? (- k*Utils.log2(p)) : 0.0;
415	rt -= (t-k)*Utils.log2(1-p);
416	return rt;
417	}
418
419
420	/**
421	* The description length of the theory for a given rule. Computed as:<br>
422	* 0.5* [\|\|k\|\|+ S(t, k, k/t)]<br>
423	* where k is the number of antecedents of the rule; t is the total
424	* possible antecedents that could appear in a rule; \|\|K\|\| is the
425	* universal prior for k , log2(k) and S(t,k,p) = -klog2(p)-(n-k)log2(1-p)
426	* is the subset encoding length.<p>
427	*
428	* Details see Quilan: "MDL and categorical theories (Continued)",ML95
429	*
430	* @param index the index of the given rule (assuming correct)
431	* @return the theory DL, weighted if weight != 1.0
432	*/
433	public double theoryDL(int index){
434
435	double k = ((Rule)m_Ruleset.elementAt(index)).size();
436
437	if(k == 0)
438	return 0.0;
439
440	double tdl = Utils.log2(k);
441	if(k > 1) // Approximation
442	tdl += 2.0 * Utils.log2(tdl); // of log2 star
443	tdl += subsetDL(m_Total, k, k/m_Total);
444	//System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
445	return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
446	}
447
448
449	/**
450	* The description length of data given the parameters of the data
451	* based on the ruleset. <p>
452	* Details see Quinlan: "MDL and categorical theories (Continued)",ML95<p>
453	*
454	* @param expFPOverErr expected FP/(FP+FN)
455	* @param cover coverage
456	* @param uncover uncoverage
457	* @param fp False Positive
458	* @param fn False Negative
459	* @return the description length
460	*/
461	public static double dataDL(double expFPOverErr, double cover,
462	double uncover, double fp, double fn){
463	double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
464	double coverBits, uncoverBits; // What's the error?
465	double expErr; // Expected FP or FN
466
467	if(Utils.gr(cover, uncover)){
468	expErr = expFPOverErr*(fp+fn);
469	coverBits = subsetDL(cover, fp, expErr/cover);
470	uncoverBits = Utils.gr(uncover, 0.0) ?
471	subsetDL(uncover, fn, fn/uncover) : 0.0;
472	}
473	else{
474	expErr = (1.0-expFPOverErr)*(fp+fn);
475	coverBits = Utils.gr(cover, 0.0) ?
476	subsetDL(cover, fp, fp/cover) : 0.0;
477	uncoverBits = subsetDL(uncover, fn, expErr/uncover);
478	}
479
480	/*
481	System.err.println("!!!cover: " + cover + "\|uncover" + uncover +
482	"\|coverBits: "+coverBits+"\|uncBits: "+ uncoverBits+
483	"\|FPRate: "+expFPOverErr + "\|expErr: "+expErr+
484	"\|fp: "+fp+"\|fn: "+fn+"\|total: "+totalBits);
485	*/
486	return (totalBits + coverBits + uncoverBits);
487	}
488
489
490	/**
491	* Calculate the potential to decrease DL of the ruleset,
492	* i.e. the possible DL that could be decreased by deleting
493	* the rule whose index and simple statstics are given.
494	* If there's no potentials (i.e. smOrEq 0 && error rate < 0.5),
495	* it returns NaN. <p>
496	*
497	* The way this procedure does is copied from original RIPPER
498	* implementation and is quite bizzare because it
499	* does not update the following rules' stats recursively
500	* any more when testing each rule, which means it assumes
501	* after deletion no data covered by the following rules (or
502	* regards the deleted rule as the last rule). Reasonable
503	* assumption?<p>
504	*
505	* @param index the index of the rule in m_Ruleset to be deleted
506	* @param expFPOverErr expected FP/(FP+FN)
507	* @param rulesetStat the simple statistics of the ruleset, updated
508	* if the rule should be deleted
509	* @param ruleStat the simple statistics of the rule to be deleted
510	* @param checkErr whether check if error rate >= 0.5
511	* @return the potential DL that could be decreased
512	*/
513	public double potential(int index, double expFPOverErr,
514	double[] rulesetStat, double[] ruleStat,
515	boolean checkErr){
516	//System.out.println("!!!inside potential: ");
517	// Restore the stats if deleted
518	double pcov = rulesetStat[0] - ruleStat[0];
519	double puncov = rulesetStat[1] + ruleStat[0];
520	double pfp = rulesetStat[4] - ruleStat[4];
521	double pfn = rulesetStat[5] + ruleStat[2];
522
523	double dataDLWith = dataDL(expFPOverErr, rulesetStat[0],
524	rulesetStat[1], rulesetStat[4],
525	rulesetStat[5]);
526	double theoryDLWith = theoryDL(index);
527	double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);
528
529	double potential = dataDLWith + theoryDLWith - dataDLWithout;
530	double err = ruleStat[4] / ruleStat[0];
531	/*System.out.println("!!!"+dataDLWith +" \| "+
532	theoryDLWith + " \| "
533	+dataDLWithout+"\|"+ruleStat[4] + " / " + ruleStat[0]);
534	*/
535	boolean overErr = Utils.grOrEq(err, 0.5);
536	if(!checkErr)
537	overErr = false;
538
539	if(Utils.grOrEq(potential, 0.0) \|\| overErr){
540	// If deleted, update ruleset stats. Other stats do not matter
541	rulesetStat[0] = pcov;
542	rulesetStat[1] = puncov;
543	rulesetStat[4] = pfp;
544	rulesetStat[5] = pfn;
545	return potential;
546	}
547	else
548	return Double.NaN;
549	}
550
551
552	/**
553	* Compute the minimal data description length of the ruleset
554	* if the rule in the given position is deleted.<br>
555	* The min_data_DL_if_deleted = data_DL_if_deleted - potential
556	*
557	* @param index the index of the rule in question
558	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
559	* @param checkErr whether check if error rate >= 0.5
560	* @return the minDataDL
561	*/
562	public double minDataDLIfDeleted(int index, double expFPRate,
563	boolean checkErr){
564	//System.out.println("!!!Enter without: ");
565	double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
566	int more = m_Ruleset.size() - 1 - index; // How many rules after?
567	FastVector indexPlus = new FastVector(more); // Their stats
568
569	// 0...(index-1) are OK
570	for(int j=0; j<index; j++){
571	// Covered stats are cumulative
572	rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
573	rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
574	rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
575	}
576
577	// Recount data from index+1
578	Instances data = (index == 0) ?
579	m_Data : ((Instances[])m_Filtered.elementAt(index-1))[1];
580	//System.out.println("!!!without: " + data.sumOfWeights());
581
582	for(int j=(index+1); j<m_Ruleset.size(); j++){
583	double[] stats = new double[6];
584	Instances[] split = computeSimpleStats(j, data, stats, null);
585	indexPlus.addElement(stats);
586	rulesetStat[0] += stats[0];
587	rulesetStat[2] += stats[2];
588	rulesetStat[4] += stats[4];
589	data = split[1];
590	}
591	// Uncovered stats are those of the last rule
592	if(more > 0){
593	rulesetStat[1] = ((double[])indexPlus.lastElement())[1];
594	rulesetStat[3] = ((double[])indexPlus.lastElement())[3];
595	rulesetStat[5] = ((double[])indexPlus.lastElement())[5];
596	}
597	else if(index > 0){
598	rulesetStat[1] =
599	((double[])m_SimpleStats.elementAt(index-1))[1];
600	rulesetStat[3] =
601	((double[])m_SimpleStats.elementAt(index-1))[3];
602	rulesetStat[5] =
603	((double[])m_SimpleStats.elementAt(index-1))[5];
604	}
605	else{ // Null coverage
606	rulesetStat[1] = ((double[])m_SimpleStats.elementAt(0))[0] +
607	((double[])m_SimpleStats.elementAt(0))[1];
608	rulesetStat[3] = ((double[])m_SimpleStats.elementAt(0))[3] +
609	((double[])m_SimpleStats.elementAt(0))[4];
610	rulesetStat[5] = ((double[])m_SimpleStats.elementAt(0))[2] +
611	((double[])m_SimpleStats.elementAt(0))[5];
612	}
613
614	// Potential
615	double potential = 0;
616	for(int k=index+1; k<m_Ruleset.size(); k++){
617	double[] ruleStat = (double[])indexPlus.elementAt(k-index-1);
618	double ifDeleted = potential(k, expFPRate, rulesetStat,
619	ruleStat, checkErr);
620	if(!Double.isNaN(ifDeleted))
621	potential += ifDeleted;
622	}
623
624	// Data DL of the ruleset without the rule
625	// Note that ruleset stats has already been updated to reflect
626	// deletion if any potential
627	double dataDLWithout = dataDL(expFPRate, rulesetStat[0],
628	rulesetStat[1], rulesetStat[4],
629	rulesetStat[5]);
630	//System.out.println("!!!without: "+dataDLWithout + " \|potential: "+
631	// potential);
632	// Why subtract potential again? To reflect change of theory DL??
633	return (dataDLWithout - potential);
634	}
635
636
637	/**
638	* Compute the minimal data description length of the ruleset
639	* if the rule in the given position is NOT deleted.<br>
640	* The min_data_DL_if_n_deleted = data_DL_if_n_deleted - potential
641	*
642	* @param index the index of the rule in question
643	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
644	* @param checkErr whether check if error rate >= 0.5
645	* @return the minDataDL
646	*/
647	public double minDataDLIfExists(int index, double expFPRate,
648	boolean checkErr){
649	// System.out.println("!!!Enter with: ");
650	double[] rulesetStat = new double[6]; // Stats of ruleset if rule exists
651	for(int j=0; j<m_SimpleStats.size(); j++){
652	// Covered stats are cumulative
653	rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
654	rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
655	rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
656	if(j == m_SimpleStats.size()-1){ // Last rule
657	rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
658	rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
659	rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
660	}
661	}
662
663	// Potential
664	double potential = 0;
665	for(int k=index+1; k<m_SimpleStats.size(); k++){
666	double[] ruleStat = (double[])getSimpleStats(k);
667	double ifDeleted = potential(k, expFPRate, rulesetStat,
668	ruleStat, checkErr);
669	if(!Double.isNaN(ifDeleted))
670	potential += ifDeleted;
671	}
672
673	// Data DL of the ruleset without the rule
674	// Note that ruleset stats has already been updated to reflect deletion
675	// if any potential
676	double dataDLWith = dataDL(expFPRate, rulesetStat[0],
677	rulesetStat[1], rulesetStat[4],
678	rulesetStat[5]);
679	//System.out.println("!!!with: "+dataDLWith + " \|potential: "+
680	// potential);
681	return (dataDLWith - potential);
682	}
683
684
685	/**
686	* The description length (DL) of the ruleset relative to if the
687	* rule in the given position is deleted, which is obtained by: <br>
688	* MDL if the rule exists - MDL if the rule does not exist <br>
689	* Note the minimal possible DL of the ruleset is calculated(i.e. some
690	* other rules may also be deleted) instead of the DL of the current
691	* ruleset.<p>
692	*
693	* @param index the given position of the rule in question
694	* (assuming correct)
695	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
696	* @param checkErr whether check if error rate >= 0.5
697	* @return the relative DL
698	*/
699	public double relativeDL(int index, double expFPRate, boolean checkErr){
700
701	return (minDataDLIfExists(index, expFPRate, checkErr)
702	+ theoryDL(index) -
703	minDataDLIfDeleted(index, expFPRate, checkErr));
704	}
705
706
707	/**
708	* Try to reduce the DL of the ruleset by testing removing the rules
709	* one by one in reverse order and update all the stats
710	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
711	* @param checkErr whether check if error rate >= 0.5
712	*/
713	public void reduceDL(double expFPRate, boolean checkErr){
714
715	boolean needUpdate = false;
716	double[] rulesetStat = new double[6];
717	for(int j=0; j<m_SimpleStats.size(); j++){
718	// Covered stats are cumulative
719	rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
720	rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
721	rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
722	if(j == m_SimpleStats.size()-1){ // Last rule
723	rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
724	rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
725	rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
726	}
727	}
728
729	// Potential
730	for(int k=m_SimpleStats.size()-1; k>=0; k--){
731
732	double[] ruleStat = (double[])m_SimpleStats.elementAt(k);
733
734	// rulesetStat updated
735	double ifDeleted = potential(k, expFPRate, rulesetStat,
736	ruleStat, checkErr);
737	if(!Double.isNaN(ifDeleted)){
738	/*System.err.println("!!!deleted ("+k+"): save "+ifDeleted
739	+" \| "+rulesetStat[0]
740	+" \| "+rulesetStat[1]
741	+" \| "+rulesetStat[4]
742	+" \| "+rulesetStat[5]);
743	*/
744
745	if(k == (m_SimpleStats.size()-1))
746	removeLast();
747	else{
748	m_Ruleset.removeElementAt(k);
749	needUpdate = true;
750	}
751	}
752	}
753
754	if(needUpdate){
755	m_Filtered = null;
756	m_SimpleStats = null;
757	countData();
758	}
759	}
760
761	/**
762	* Remove the last rule in the ruleset as well as it's stats.
763	* It might be useful when the last rule was added for testing
764	* purpose and then the test failed
765	*/
766	public void removeLast(){
767	int last = m_Ruleset.size()-1;
768	m_Ruleset.removeElementAt(last);
769	m_Filtered.removeElementAt(last);
770	m_SimpleStats.removeElementAt(last);
771	if(m_Distributions != null)
772	m_Distributions.removeElementAt(last);
773	}
774
775	/**
776	* Static utility function to count the data covered by the
777	* rules after the given index in the given rules, and then
778	* remove them. It returns the data not covered by the
779	* successive rules.
780	*
781	* @param data the data to be processed
782	* @param rules the ruleset
783	* @param index the given index
784	* @return the data after processing
785	*/
786	public static Instances rmCoveredBySuccessives(Instances data, FastVector rules, int index){
787	Instances rt = new Instances(data, 0);
788
789	for(int i=0; i < data.numInstances(); i++){
790	Instance datum = data.instance(i);
791	boolean covered = false;
792
793	for(int j=index+1; j<rules.size();j++){
794	Rule rule = (Rule)rules.elementAt(j);
795	if(rule.covers(datum)){
796	covered = true;
797	break;
798	}
799	}
800
801	if(!covered)
802	rt.add(datum);
803	}
804	return rt;
805	}
806
807	/**
808	* Stratify the given data into the given number of bags based on the class
809	* values. It differs from the <code>Instances.stratify(int fold)</code>
810	* that before stratification it sorts the instances according to the
811	* class order in the header file. It assumes no missing values in the class.
812	*
813	* @param data the given data
814	* @param folds the given number of folds
815	* @param rand the random object used to randomize the instances
816	* @return the stratified instances
817	*/
818	public static final Instances stratify(Instances data, int folds, Random rand){
819	if(!data.classAttribute().isNominal())
820	return data;
821
822	Instances result = new Instances(data, 0);
823	Instances[] bagsByClasses = new Instances[data.numClasses()];
824
825	for(int i=0; i < bagsByClasses.length; i++)
826	bagsByClasses[i] = new Instances(data, 0);
827
828	// Sort by class
829	for(int j=0; j < data.numInstances(); j++){
830	Instance datum = data.instance(j);
831	bagsByClasses[(int)datum.classValue()].add(datum);
832	}
833
834	// Randomize each class
835	for(int j=0; j < bagsByClasses.length; j++)
836	bagsByClasses[j].randomize(rand);
837
838	for(int k=0; k < folds; k++){
839	int offset = k, bag = 0;
840	oneFold:
841	while (true){
842	while(offset >= bagsByClasses[bag].numInstances()){
843	offset -= bagsByClasses[bag].numInstances();
844	if (++bag >= bagsByClasses.length)// Next bag
845	break oneFold;
846	}
847
848	result.add(bagsByClasses[bag].instance(offset));
849	offset += folds;
850	}
851	}
852
853	return result;
854	}
855
856	/**
857	* Compute the combined DL of the ruleset in this class, i.e. theory
858	* DL and data DL. Note this procedure computes the combined DL
859	* according to the current status of the ruleset in this class
860	*
861	* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
862	* @param predicted the default classification if ruleset covers null
863	* @return the combined class
864	*/
865	public double combinedDL(double expFPRate, double predicted){
866	double rt = 0;
867
868	if(getRulesetSize() > 0) {
869	double[] stats = (double[])m_SimpleStats.lastElement();
870	for(int j=getRulesetSize()-2; j >= 0; j--){
871	stats[0] += getSimpleStats(j)[0];
872	stats[2] += getSimpleStats(j)[2];
873	stats[4] += getSimpleStats(j)[4];
874	}
875	rt += dataDL(expFPRate, stats[0], stats[1],
876	stats[4], stats[5]); // Data DL
877	}
878	else{ // Null coverage ruleset
879	double fn = 0.0;
880	for(int j=0; j < m_Data.numInstances(); j++)
881	if((int)m_Data.instance(j).classValue() == (int)predicted)
882	fn += m_Data.instance(j).weight();
883	rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
884	}
885
886	for(int i=0; i<getRulesetSize(); i++) // Theory DL
887	rt += theoryDL(i);
888
889	return rt;
890	}
891
892	/**
893	* Patition the data into 2, first of which has (numFolds-1)/numFolds of
894	* the data and the second has 1/numFolds of the data
895	*
896	*
897	* @param data the given data
898	* @param numFolds the given number of folds
899	* @return the patitioned instances
900	*/
901	public static final Instances[] partition(Instances data, int numFolds){
902	Instances[] rt = new Instances[2];
903	int splits = data.numInstances() * (numFolds - 1) / numFolds;
904
905	rt[0] = new Instances(data, 0, splits);
906	rt[1] = new Instances(data, splits, data.numInstances()-splits);
907
908	return rt;
909	}
910
911	/**
912	* Returns the revision string.
913	*
914	* @return the revision
915	*/
916	public String getRevision() {
917	return RevisionUtils.extract("$Revision: 4608 $");
918	}
919	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: