Context Navigation

source: src/main/java/weka/associations/RuleGeneration.java @ 23

Last change on this file since 23 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 12.8 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* RuleGeneration.java
19	* Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.associations;
24
25	import weka.core.FastVector;
26	import weka.core.Instances;
27	import weka.core.RevisionHandler;
28	import weka.core.RevisionUtils;
29	import weka.core.Statistics;
30	import weka.core.Utils;
31
32	import java.io.Serializable;
33	import java.util.Hashtable;
34	import java.util.TreeSet;
35
36	/**
37	* Class implementing the rule generation procedure of the predictive apriori algorithm.
38	*
39	* Reference: T. Scheffer (2001). <i>Finding Association Rules That Trade Support
40	* Optimally against Confidence</i>. Proc of the 5th European Conf.
41	* on Principles and Practice of Knowledge Discovery in Databases (PKDD'01),
42	* pp. 424-435. Freiburg, Germany: Springer-Verlag. <p>
43	*
44	* The implementation follows the paper except for adding a rule to the output of the
45	* <i>n</i> best rules. A rule is added if:
46	* the expected predictive accuracy of this rule is among the <i>n</i> best and it is
47	* not subsumed by a rule with at least the same expected predictive accuracy
48	* (out of an unpublished manuscript from T. Scheffer).
49	*
50	* @author Stefan Mutter (mutter@cs.waikato.ac.nz)
51	* @version $Revision: 1.4 $ */
52	public class RuleGeneration
53	implements Serializable, RevisionHandler {
54
55	/** for serialization */
56	private static final long serialVersionUID = -8927041669872491432L;
57
58	/** The items stored as an array of integers. */
59	protected int[] m_items;
60
61	/** Counter for how many transactions contain this item set. */
62	protected int m_counter;
63
64	/** The total number of transactions */
65	protected int m_totalTransactions;
66
67	/** Flag indicating whether the list of the best rules has changed. */
68	protected boolean m_change = false;
69
70	/** The minimum expected predictive accuracy that is needed to be a candidate for the list of the best rules. */
71	protected double m_expectation;
72
73	/** Threshold. If the support of the premise is at least this value, the binomial distribution is approximated by a normal one. */
74	protected static final int MAX_N = 300;
75
76	/** The minimum support a rule needs to be a candidate for the list of the best rules. */
77	protected int m_minRuleCount;
78
79	/** Sorted array of the mid points of the intervals used for prior estimation. */
80	protected double[] m_midPoints;
81
82	/** Hashtable containing the estimated prior probabilities. */
83	protected Hashtable m_priors;
84
85	/** The list of the actual <i>n</i> best rules. */
86	protected TreeSet m_best;
87
88	/** Integer indicating the generation time of a rule. */
89	protected int m_count;
90
91	/** The instances. */
92	protected Instances m_instances;
93
94
/**
 * Creates a rule generator for the given item set. The item set forms the
 * premise of all rules generated later on.
 * The item array, the support counter and the total number of transactions
 * are taken over from the item set; the item array is shared, not copied.
 *
 * @param itemSet item set for which rules should be generated
 */
public RuleGeneration(ItemSet itemSet) {

  this.m_items = itemSet.m_items;
  this.m_counter = itemSet.m_counter;
  this.m_totalTransactions = itemSet.m_totalTransactions;
}
106
107
108	/**
109	* calculates the probability using a binomial distribution.
110	* If the support of the premise is too large this distribution
111	* is approximated by a normal distribution.
112	* @param accuracy the accuracy value
113	* @param ruleCount the support of the whole rule
114	* @param premiseCount the support of the premise
115	* @return the probability value
116	*/
117	public static final double binomialDistribution(double accuracy, double ruleCount, double premiseCount){
118
119	double mu, sigma;
120
121	if(premiseCount < MAX_N)
122	return Math.pow(2,(Utils.log2(Math.pow(accuracy,ruleCount))+Utils.log2(Math.pow((1.0-accuracy),(premiseCount-ruleCount)))+PriorEstimation.logbinomialCoefficient((int)premiseCount,(int)ruleCount)));
123	else{
124	mu = premiseCount * accuracy;
125	sigma = Math.sqrt((premiseCount * (1.0 - accuracy))*accuracy);
126	return Statistics.normalProbability(((ruleCount+0.5)-mu)/(sigma*Math.sqrt(2)));
127	}
128	}
129
130	/**
131	* calculates the expected predctive accuracy of a rule
132	* @param ruleCount the support of the rule
133	* @param premiseCount the premise support of the rule
134	* @param midPoints array with all mid points
135	* @param priors hashtable containing the prior probabilities
136	* @return the expected predictive accuracy
137	*/
138	public static final double expectation(double ruleCount, int premiseCount,double[] midPoints, Hashtable priors){
139
140	double numerator = 0, denominator = 0;
141	for(int i = 0;i < midPoints.length; i++){
142	Double actualPrior = (Double)priors.get(new Double(midPoints[i]));
143	if(actualPrior != null){
144	if(actualPrior.doubleValue() != 0){
145	double addend = actualPrior.doubleValue() * binomialDistribution(midPoints[i], ruleCount, (double)premiseCount);
146	denominator += addend;
147	numerator += addend*midPoints[i];
148	}
149	}
150	}
151	if(denominator <= 0 \|\| Double.isNaN(denominator))
152	System.out.println("RuleItem denominator: "+denominator);
153	if(numerator <= 0 \|\| Double.isNaN(numerator))
154	System.out.println("RuleItem numerator: "+numerator);
155	return numerator/denominator;
156	}
157
158	/**
159	* Generates all rules for an item set. The item set is the premise.
160	* @param numRules the number of association rules the use wants to mine.
161	* This number equals the size <i>n</i> of the list of the
162	* best rules.
163	* @param midPoints the mid points of the intervals
164	* @param priors Hashtable that contains the prior probabilities
165	* @param expectation the minimum value of the expected predictive accuracy
166	* that is needed to get into the list of the best rules
167	* @param instances the instances for which association rules are generated
168	* @param best the list of the <i>n</i> best rules.
169	* The list is implemented as a TreeSet
170	* @param genTime the maximum time of generation
171	* @return all the rules with minimum confidence for the given item set
172	*/
173	public TreeSet generateRules(int numRules, double[] midPoints, Hashtable priors, double expectation, Instances instances,TreeSet best,int genTime) {
174
175	boolean redundant = false;
176	FastVector consequences = new FastVector(), consequencesMinusOne = new FastVector();
177	ItemSet premise;
178	int s = 0;
179	RuleItem current = null, old;
180
181	Hashtable hashtable;
182
183	m_change = false;
184	m_midPoints = midPoints;
185	m_priors = priors;
186	m_best = best;
187	m_expectation = expectation;
188	m_count = genTime;
189	m_instances = instances;
190
191	//create rule body
192	premise =null;
193	premise = new ItemSet(m_totalTransactions);
194	premise.m_items = new int[m_items.length];
195	System.arraycopy(m_items, 0, premise.m_items, 0, m_items.length);
196	premise.m_counter = m_counter;
197
198
199	do{
200	m_minRuleCount = 1;
201	while(expectation((double)m_minRuleCount,premise.m_counter,m_midPoints,m_priors) <= m_expectation){
202	m_minRuleCount++;
203	if(m_minRuleCount > premise.m_counter)
204	return m_best;
205	}
206	redundant = false;
207	for(int i = 0; i < instances.numAttributes();i++){
208	if(i == 0){
209	for(int j = 0; j < m_items.length;j++)
210	if(m_items[j] == -1)
211	consequences = singleConsequence(instances, j,consequences);
212	if(premise == null \|\| consequences.size() == 0)
213	return m_best;
214	}
215	FastVector allRuleItems = new FastVector();
216	int index = 0;
217	do {
218	int h = 0;
219	while(h < consequences.size()){
220	RuleItem dummie = new RuleItem();
221	current = dummie.generateRuleItem(premise,(ItemSet)consequences.elementAt(h),instances,m_count,m_minRuleCount,m_midPoints,m_priors);
222	if(current != null){
223	allRuleItems.addElement(current);
224	h++;
225	}
226	else
227	consequences.removeElementAt(h);
228	}
229	if(index == i)
230	break;
231	consequencesMinusOne = consequences;
232	consequences = ItemSet.mergeAllItemSets(consequencesMinusOne, index, instances.numInstances());
233	hashtable = ItemSet.getHashtable(consequencesMinusOne, consequencesMinusOne.size());
234	consequences = ItemSet.pruneItemSets(consequences, hashtable);
235	index++;
236	} while (consequences.size() > 0);
237	for(int h = 0;h < allRuleItems.size();h++){
238	current = (RuleItem)allRuleItems.elementAt(h);
239	m_count++;
240	if(m_best.size() < numRules){
241	m_change =true;
242	redundant = removeRedundant(current);
243	}
244	else{
245	if(current.accuracy() > m_expectation){
246	m_expectation = ((RuleItem)(m_best.first())).accuracy();
247	boolean remove = m_best.remove(m_best.first());
248	m_change = true;
249	redundant = removeRedundant(current);
250	m_expectation = ((RuleItem)(m_best.first())).accuracy();
251	while(expectation((double)m_minRuleCount, (current.premise()).m_counter,m_midPoints,m_priors) < m_expectation){
252	m_minRuleCount++;
253	if(m_minRuleCount > (current.premise()).m_counter)
254	break;
255	}
256	}
257	}
258	}
259
260	}
261	}while(redundant);
262	return m_best;
263	}
264
265	/**
266	* Methods that decides whether or not rule a subsumes rule b.
267	* The defintion of subsumption is:
268	* Rule a subsumes rule b, if a subsumes b
269	* AND
270	* a has got least the same expected predictive accuracy as b.
271	* @param a an association rule stored as a RuleItem
272	* @param b an association rule stored as a RuleItem
273	* @return true if rule a subsumes rule b or false otherwise.
274	*/
275	public static boolean aSubsumesB(RuleItem a, RuleItem b){
276
277	if(a.m_accuracy < b.m_accuracy)
278	return false;
279	for(int k = 0; k < a.premise().m_items.length;k++){
280	if(a.premise().m_items[k] != b.premise().m_items[k]){
281	if((a.premise().m_items[k] != -1 && b.premise().m_items[k] != -1) \|\| b.premise().m_items[k] == -1)
282	return false;
283	}
284	if(a.consequence().m_items[k] != b.consequence().m_items[k]){
285	if((a.consequence().m_items[k] != -1 && b.consequence().m_items[k] != -1) \|\| a.consequence().m_items[k] == -1)
286	return false;
287	}
288	}
289	return true;
290
291	}
292
293	/**
294	* generates a consequence of length 1 for an association rule.
295	* @param instances the instances under consideration
296	* @param attNum an item that does not occur in the premise
297	* @param consequences FastVector that possibly already contains other consequences of length 1
298	* @return FastVector with consequences of length 1
299	*/
300	public static FastVector singleConsequence(Instances instances, int attNum, FastVector consequences){
301
302	ItemSet consequence;
303
304	for (int i = 0; i < instances.numAttributes(); i++) {
305	if( i == attNum){
306	for (int j = 0; j < instances.attribute(i).numValues(); j++) {
307	consequence = new ItemSet(instances.numInstances());
308	consequence.m_items = new int[instances.numAttributes()];
309	for (int k = 0; k < instances.numAttributes(); k++)
310	consequence.m_items[k] = -1;
311	consequence.m_items[i] = j;
312	consequences.addElement(consequence);
313	}
314	}
315	}
316	return consequences;
317
318	}
319
320	/**
321	* Method that removes redundant rules out of the list of the best rules.
322	* A rule is in that list if:
323	* the expected predictive accuracy of this rule is among the best and it is
324	* not subsumed by a rule with at least the same expected predictive accuracy
325	* @param toInsert the rule that should be inserted into the list
326	* @return true if the method has changed the list, false otherwise
327	*/
328	public boolean removeRedundant(RuleItem toInsert){
329
330	boolean redundant = false, fSubsumesT = false, tSubsumesF = false;
331	RuleItem first;
332	int subsumes = 0;
333	Object [] best = m_best.toArray();
334	for(int i=0; i < best.length; i++){
335	first = (RuleItem)best[i];
336	fSubsumesT = aSubsumesB(first,toInsert);
337	tSubsumesF = aSubsumesB(toInsert, first);
338	if(fSubsumesT){
339	subsumes = 1;
340	break;
341	}
342	else{
343	if(tSubsumesF){
344	boolean remove = m_best.remove(first);
345	subsumes = 2;
346	redundant =true;
347	}
348	}
349	}
350	if(subsumes == 0 \|\| subsumes == 2)
351	m_best.add(toInsert);
352	return redundant;
353	}
354
355	/**
356	* Gets the actual maximum value of the generation time
357	* @return the actual maximum value of the generation time
358	*/
359	public int count(){
360
361	return m_count;
362	}
363
364	/**
365	* Gets if the list fo the best rules has been changed
366	* @return whether or not the list fo the best rules has been changed
367	*/
368	public boolean change(){
369
370	return m_change;
371	}
372
373	/**
374	* Returns the revision string.
375	*
376	* @return the revision
377	*/
378	public String getRevision() {
379	return RevisionUtils.extract("$Revision: 1.4 $");
380	}
381	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: