Context Navigation

source: src/main/java/weka/associations/RuleGeneration.java @ 21

Last change on this file since 21 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 12.8 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* RuleGeneration.java
	19	* Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22
	23	package weka.associations;
	24
	25	import weka.core.FastVector;
	26	import weka.core.Instances;
	27	import weka.core.RevisionHandler;
	28	import weka.core.RevisionUtils;
	29	import weka.core.Statistics;
	30	import weka.core.Utils;
	31
	32	import java.io.Serializable;
	33	import java.util.Hashtable;
	34	import java.util.TreeSet;
	35
	36	/**
	37	* Class implementing the rule generation procedure of the predictive apriori algorithm.
	38	*
	39	* Reference: T. Scheffer (2001). <i>Finding Association Rules That Trade Support
	40	* Optimally against Confidence</i>. Proc of the 5th European Conf.
	41	* on Principles and Practice of Knowledge Discovery in Databases (PKDD'01),
	42	* pp. 424-435. Freiburg, Germany: Springer-Verlag. <p>
	43	*
	44	* The implementation follows the paper expect for adding a rule to the output of the
	45	* <i>n</i> best rules. A rule is added if:
	46	* the expected predictive accuracy of this rule is among the <i>n</i> best and it is
	47	* not subsumed by a rule with at least the same expected predictive accuracy
	48	* (out of an unpublished manuscript from T. Scheffer).
	49	*
	50	* @author Stefan Mutter (mutter@cs.waikato.ac.nz)
	51	* @version $Revision: 1.4 $ */
	52	public class RuleGeneration
	53	implements Serializable, RevisionHandler {
	54
	55	/** for serialization */
	56	private static final long serialVersionUID = -8927041669872491432L;
	57
	58	/** The items stored as an array of of integer. */
	59	protected int[] m_items;
	60
	61	/** Counter for how many transactions contain this item set. */
	62	protected int m_counter;
	63
	64	/** The total number of transactions */
	65	protected int m_totalTransactions;
	66
	67	/** Flag indicating whether the list fo the best rules has changed. */
	68	protected boolean m_change = false;
	69
	70	/** The minimum expected predictive accuracy that is needed to be a candidate for the list of the best rules. */
	71	protected double m_expectation;
	72
	73	/** Threshold. If the support of the premise is higher the binomial distrubution is approximated by a normal one. */
	74	protected static final int MAX_N = 300;
	75
	76	/** The minimum support a rule needs to be a candidate for the list of the best rules. */
	77	protected int m_minRuleCount;
	78
	79	/** Sorted array of the mied points of the intervals used for prior estimation. */
	80	protected double[] m_midPoints;
	81
	82	/** Hashtable conatining the estimated prior probabilities. */
	83	protected Hashtable m_priors;
	84
	85	/** The list of the actual <i>n</i> best rules. */
	86	protected TreeSet m_best;
	87
	88	/** Integer indicating the generation time of a rule. */
	89	protected int m_count;
	90
	91	/** The instances. */
	92	protected Instances m_instances;
	93
	94
	95	/**
	96	* Constructor
	97	* @param itemSet item set for that rules should be generated.
	98	* The item set will form the premise of the rules.
	99	*/
	100	public RuleGeneration(ItemSet itemSet){
	101
	102	m_totalTransactions = itemSet.m_totalTransactions;
	103	m_counter = itemSet.m_counter;
	104	m_items = itemSet.m_items;
	105	}
	106
	107
	108	/**
	109	* calculates the probability using a binomial distribution.
	110	* If the support of the premise is too large this distribution
	111	* is approximated by a normal distribution.
	112	* @param accuracy the accuracy value
	113	* @param ruleCount the support of the whole rule
	114	* @param premiseCount the support of the premise
	115	* @return the probability value
	116	*/
	117	public static final double binomialDistribution(double accuracy, double ruleCount, double premiseCount){
	118
	119	double mu, sigma;
	120
	121	if(premiseCount < MAX_N)
	122	return Math.pow(2,(Utils.log2(Math.pow(accuracy,ruleCount))+Utils.log2(Math.pow((1.0-accuracy),(premiseCount-ruleCount)))+PriorEstimation.logbinomialCoefficient((int)premiseCount,(int)ruleCount)));
	123	else{
	124	mu = premiseCount * accuracy;
	125	sigma = Math.sqrt((premiseCount * (1.0 - accuracy))*accuracy);
	126	return Statistics.normalProbability(((ruleCount+0.5)-mu)/(sigma*Math.sqrt(2)));
	127	}
	128	}
	129
	130	/**
	131	* calculates the expected predctive accuracy of a rule
	132	* @param ruleCount the support of the rule
	133	* @param premiseCount the premise support of the rule
	134	* @param midPoints array with all mid points
	135	* @param priors hashtable containing the prior probabilities
	136	* @return the expected predictive accuracy
	137	*/
	138	public static final double expectation(double ruleCount, int premiseCount,double[] midPoints, Hashtable priors){
	139
	140	double numerator = 0, denominator = 0;
	141	for(int i = 0;i < midPoints.length; i++){
	142	Double actualPrior = (Double)priors.get(new Double(midPoints[i]));
	143	if(actualPrior != null){
	144	if(actualPrior.doubleValue() != 0){
	145	double addend = actualPrior.doubleValue() * binomialDistribution(midPoints[i], ruleCount, (double)premiseCount);
	146	denominator += addend;
	147	numerator += addend*midPoints[i];
	148	}
	149	}
	150	}
	151	if(denominator <= 0 \|\| Double.isNaN(denominator))
	152	System.out.println("RuleItem denominator: "+denominator);
	153	if(numerator <= 0 \|\| Double.isNaN(numerator))
	154	System.out.println("RuleItem numerator: "+numerator);
	155	return numerator/denominator;
	156	}
	157
	158	/**
	159	* Generates all rules for an item set. The item set is the premise.
	160	* @param numRules the number of association rules the use wants to mine.
	161	* This number equals the size <i>n</i> of the list of the
	162	* best rules.
	163	* @param midPoints the mid points of the intervals
	164	* @param priors Hashtable that contains the prior probabilities
	165	* @param expectation the minimum value of the expected predictive accuracy
	166	* that is needed to get into the list of the best rules
	167	* @param instances the instances for which association rules are generated
	168	* @param best the list of the <i>n</i> best rules.
	169	* The list is implemented as a TreeSet
	170	* @param genTime the maximum time of generation
	171	* @return all the rules with minimum confidence for the given item set
	172	*/
	173	public TreeSet generateRules(int numRules, double[] midPoints, Hashtable priors, double expectation, Instances instances,TreeSet best,int genTime) {
	174
	175	boolean redundant = false;
	176	FastVector consequences = new FastVector(), consequencesMinusOne = new FastVector();
	177	ItemSet premise;
	178	int s = 0;
	179	RuleItem current = null, old;
	180
	181	Hashtable hashtable;
	182
	183	m_change = false;
	184	m_midPoints = midPoints;
	185	m_priors = priors;
	186	m_best = best;
	187	m_expectation = expectation;
	188	m_count = genTime;
	189	m_instances = instances;
	190
	191	//create rule body
	192	premise =null;
	193	premise = new ItemSet(m_totalTransactions);
	194	premise.m_items = new int[m_items.length];
	195	System.arraycopy(m_items, 0, premise.m_items, 0, m_items.length);
	196	premise.m_counter = m_counter;
	197
	198
	199	do{
	200	m_minRuleCount = 1;
	201	while(expectation((double)m_minRuleCount,premise.m_counter,m_midPoints,m_priors) <= m_expectation){
	202	m_minRuleCount++;
	203	if(m_minRuleCount > premise.m_counter)
	204	return m_best;
	205	}
	206	redundant = false;
	207	for(int i = 0; i < instances.numAttributes();i++){
	208	if(i == 0){
	209	for(int j = 0; j < m_items.length;j++)
	210	if(m_items[j] == -1)
	211	consequences = singleConsequence(instances, j,consequences);
	212	if(premise == null \|\| consequences.size() == 0)
	213	return m_best;
	214	}
	215	FastVector allRuleItems = new FastVector();
	216	int index = 0;
	217	do {
	218	int h = 0;
	219	while(h < consequences.size()){
	220	RuleItem dummie = new RuleItem();
	221	current = dummie.generateRuleItem(premise,(ItemSet)consequences.elementAt(h),instances,m_count,m_minRuleCount,m_midPoints,m_priors);
	222	if(current != null){
	223	allRuleItems.addElement(current);
	224	h++;
	225	}
	226	else
	227	consequences.removeElementAt(h);
	228	}
	229	if(index == i)
	230	break;
	231	consequencesMinusOne = consequences;
	232	consequences = ItemSet.mergeAllItemSets(consequencesMinusOne, index, instances.numInstances());
	233	hashtable = ItemSet.getHashtable(consequencesMinusOne, consequencesMinusOne.size());
	234	consequences = ItemSet.pruneItemSets(consequences, hashtable);
	235	index++;
	236	} while (consequences.size() > 0);
	237	for(int h = 0;h < allRuleItems.size();h++){
	238	current = (RuleItem)allRuleItems.elementAt(h);
	239	m_count++;
	240	if(m_best.size() < numRules){
	241	m_change =true;
	242	redundant = removeRedundant(current);
	243	}
	244	else{
	245	if(current.accuracy() > m_expectation){
	246	m_expectation = ((RuleItem)(m_best.first())).accuracy();
	247	boolean remove = m_best.remove(m_best.first());
	248	m_change = true;
	249	redundant = removeRedundant(current);
	250	m_expectation = ((RuleItem)(m_best.first())).accuracy();
	251	while(expectation((double)m_minRuleCount, (current.premise()).m_counter,m_midPoints,m_priors) < m_expectation){
	252	m_minRuleCount++;
	253	if(m_minRuleCount > (current.premise()).m_counter)
	254	break;
	255	}
	256	}
	257	}
	258	}
	259
	260	}
	261	}while(redundant);
	262	return m_best;
	263	}
	264
	265	/**
	266	* Methods that decides whether or not rule a subsumes rule b.
	267	* The defintion of subsumption is:
	268	* Rule a subsumes rule b, if a subsumes b
	269	* AND
	270	* a has got least the same expected predictive accuracy as b.
	271	* @param a an association rule stored as a RuleItem
	272	* @param b an association rule stored as a RuleItem
	273	* @return true if rule a subsumes rule b or false otherwise.
	274	*/
	275	public static boolean aSubsumesB(RuleItem a, RuleItem b){
	276
	277	if(a.m_accuracy < b.m_accuracy)
	278	return false;
	279	for(int k = 0; k < a.premise().m_items.length;k++){
	280	if(a.premise().m_items[k] != b.premise().m_items[k]){
	281	if((a.premise().m_items[k] != -1 && b.premise().m_items[k] != -1) \|\| b.premise().m_items[k] == -1)
	282	return false;
	283	}
	284	if(a.consequence().m_items[k] != b.consequence().m_items[k]){
	285	if((a.consequence().m_items[k] != -1 && b.consequence().m_items[k] != -1) \|\| a.consequence().m_items[k] == -1)
	286	return false;
	287	}
	288	}
	289	return true;
	290
	291	}
	292
	293	/**
	294	* generates a consequence of length 1 for an association rule.
	295	* @param instances the instances under consideration
	296	* @param attNum an item that does not occur in the premise
	297	* @param consequences FastVector that possibly already contains other consequences of length 1
	298	* @return FastVector with consequences of length 1
	299	*/
	300	public static FastVector singleConsequence(Instances instances, int attNum, FastVector consequences){
	301
	302	ItemSet consequence;
	303
	304	for (int i = 0; i < instances.numAttributes(); i++) {
	305	if( i == attNum){
	306	for (int j = 0; j < instances.attribute(i).numValues(); j++) {
	307	consequence = new ItemSet(instances.numInstances());
	308	consequence.m_items = new int[instances.numAttributes()];
	309	for (int k = 0; k < instances.numAttributes(); k++)
	310	consequence.m_items[k] = -1;
	311	consequence.m_items[i] = j;
	312	consequences.addElement(consequence);
	313	}
	314	}
	315	}
	316	return consequences;
	317
	318	}
	319
	320	/**
	321	* Method that removes redundant rules out of the list of the best rules.
	322	* A rule is in that list if:
	323	* the expected predictive accuracy of this rule is among the best and it is
	324	* not subsumed by a rule with at least the same expected predictive accuracy
	325	* @param toInsert the rule that should be inserted into the list
	326	* @return true if the method has changed the list, false otherwise
	327	*/
	328	public boolean removeRedundant(RuleItem toInsert){
	329
	330	boolean redundant = false, fSubsumesT = false, tSubsumesF = false;
	331	RuleItem first;
	332	int subsumes = 0;
	333	Object [] best = m_best.toArray();
	334	for(int i=0; i < best.length; i++){
	335	first = (RuleItem)best[i];
	336	fSubsumesT = aSubsumesB(first,toInsert);
	337	tSubsumesF = aSubsumesB(toInsert, first);
	338	if(fSubsumesT){
	339	subsumes = 1;
	340	break;
	341	}
	342	else{
	343	if(tSubsumesF){
	344	boolean remove = m_best.remove(first);
	345	subsumes = 2;
	346	redundant =true;
	347	}
	348	}
	349	}
	350	if(subsumes == 0 \|\| subsumes == 2)
	351	m_best.add(toInsert);
	352	return redundant;
	353	}
	354
	355	/**
	356	* Gets the actual maximum value of the generation time
	357	* @return the actual maximum value of the generation time
	358	*/
	359	public int count(){
	360
	361	return m_count;
	362	}
	363
	364	/**
	365	* Gets if the list fo the best rules has been changed
	366	* @return whether or not the list fo the best rules has been changed
	367	*/
	368	public boolean change(){
	369
	370	return m_change;
	371	}
	372
	373	/**
	374	* Returns the revision string.
	375	*
	376	* @return the revision
	377	*/
	378	public String getRevision() {
	379	return RevisionUtils.extract("$Revision: 1.4 $");
	380	}
	381	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: