Context Navigation

AprioriItemSet.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 17.8 KB

Rev	Line
[29]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* AprioriItemSet.java
	19	* Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22
	23	package weka.associations;
	24
	25	import weka.core.ContingencyTables;
	26	import weka.core.FastVector;
	27	import weka.core.Instances;
	28	import weka.core.RevisionHandler;
	29	import weka.core.RevisionUtils;
	30
	31	import java.io.Serializable;
	32	import java.util.Enumeration;
	33	import java.util.Hashtable;
	34
	35
	36	/**
	37	* Class for storing a set of items. Item sets are stored in a lexicographic
	38	* order, which is determined by the header information of the set of instances
	39	* used for generating the set of items. All methods in this class assume that
	40	* item sets are stored in lexicographic order.
	41	* The class provides methods that are used in the Apriori algorithm to construct
	42	* association rules.
	43	*
	44	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
	45	* @author Stefan Mutter (mutter@cs.waikato.ac.nz)
	46	* @version $Revision: 5130 $
	47	*/
	48	public class AprioriItemSet
	49	extends ItemSet
	50	implements Serializable, RevisionHandler {
	51
	52	/** for serialization */
	53	static final long serialVersionUID = 7684467755712672058L;
	54
	55	/**
	56	* Constructor
	57	*
	58	* @param totalTrans the total number of transactions in the data
	59	*/
	60	public AprioriItemSet(int totalTrans) {
	61	super(totalTrans);
	62	}
	63
	64
	65	/**
	66	* Outputs the confidence for a rule.
	67	*
	68	* @param premise the premise of the rule
	69	* @param consequence the consequence of the rule
	70	* @return the confidence on the training data
	71	*/
	72	public static double confidenceForRule(AprioriItemSet premise,
	73	AprioriItemSet consequence) {
	74
	75	return (double)consequence.m_counter/(double)premise.m_counter;
	76	}
	77
	78	/**
	79	* Outputs the lift for a rule. Lift is defined as:<br>
	80	* confidence / prob(consequence)
	81	*
	82	* @param premise the premise of the rule
	83	* @param consequence the consequence of the rule
	84	* @param consequenceCount how many times the consequence occurs independent
	85	* of the premise
	86	* @return the lift on the training data
	87	*/
	88	public double liftForRule(AprioriItemSet premise,
	89	AprioriItemSet consequence,
	90	int consequenceCount) {
	91	double confidence = confidenceForRule(premise, consequence);
	92
	93	return confidence / ((double)consequenceCount /
	94	(double)m_totalTransactions);
	95	}
	96
	97	/**
	98	* Outputs the leverage for a rule. Leverage is defined as: <br>
	99	* prob(premise & consequence) - (prob(premise) * prob(consequence))
	100	*
	101	* @param premise the premise of the rule
	102	* @param consequence the consequence of the rule
	103	* @param premiseCount how many times the premise occurs independent
	104	* of the consequent
	105	* @param consequenceCount how many times the consequence occurs independent
	106	* of the premise
	107	* @return the leverage on the training data
	108	*/
	109	public double leverageForRule(AprioriItemSet premise,
	110	AprioriItemSet consequence,
	111	int premiseCount,
	112	int consequenceCount) {
	113	double coverageForItemSet = (double)consequence.m_counter /
	114	(double)m_totalTransactions;
	115	double expectedCoverageIfIndependent =
	116	((double)premiseCount / (double)m_totalTransactions) *
	117	((double)consequenceCount / (double)m_totalTransactions);
	118	double lev = coverageForItemSet - expectedCoverageIfIndependent;
	119	return lev;
	120	}
	121
	122	/**
	123	* Outputs the conviction for a rule. Conviction is defined as: <br>
	124	* prob(premise) * prob(!consequence) / prob(premise & !consequence)
	125	*
	126	* @param premise the premise of the rule
	127	* @param consequence the consequence of the rule
	128	* @param premiseCount how many times the premise occurs independent
	129	* of the consequent
	130	* @param consequenceCount how many times the consequence occurs independent
	131	* of the premise
	132	* @return the conviction on the training data
	133	*/
	134	public double convictionForRule(AprioriItemSet premise,
	135	AprioriItemSet consequence,
	136	int premiseCount,
	137	int consequenceCount) {
	138	double num =
	139	(double)premiseCount * (double)(m_totalTransactions - consequenceCount) /
	140	(double)m_totalTransactions;
	141	double denom =
	142	((premiseCount - consequence.m_counter)+1);
	143
	144	if (num < 0 \|\| denom < 0) {
	145	System.err.println("*** "+num+" "+denom);
	146	System.err.println("premis count: "+premiseCount+" consequence count "+consequenceCount+" total trans "+m_totalTransactions);
	147	}
	148	return num / denom;
	149	}
	150
	151
	152
	153	/**
	154	* Generates all rules for an item set.
	155	*
	156	* @param minConfidence the minimum confidence the rules have to have
	157	* @param hashtables containing all(!) previously generated
	158	* item sets
	159	* @param numItemsInSet the size of the item set for which the rules
	160	* are to be generated
	161	* @return all the rules with minimum confidence for the given item set
	162	*/
	163	public FastVector[] generateRules(double minConfidence,
	164	FastVector hashtables,
	165	int numItemsInSet) {
	166
	167	FastVector premises = new FastVector(),consequences = new FastVector(),
	168	conf = new FastVector();
	169	FastVector[] rules = new FastVector[3], moreResults;
	170	AprioriItemSet premise, consequence;
	171	Hashtable hashtable = (Hashtable)hashtables.elementAt(numItemsInSet - 2);
	172
	173	// Generate all rules with one item in the consequence.
	174	for (int i = 0; i < m_items.length; i++)
	175	if (m_items[i] != -1) {
	176	premise = new AprioriItemSet(m_totalTransactions);
	177	consequence = new AprioriItemSet(m_totalTransactions);
	178	premise.m_items = new int[m_items.length];
	179	consequence.m_items = new int[m_items.length];
	180	consequence.m_counter = m_counter;
	181
	182	for (int j = 0; j < m_items.length; j++)
	183	consequence.m_items[j] = -1;
	184	System.arraycopy(m_items, 0, premise.m_items, 0, m_items.length);
	185	premise.m_items[i] = -1;
	186
	187	consequence.m_items[i] = m_items[i];
	188	premise.m_counter = ((Integer)hashtable.get(premise)).intValue();
	189	premises.addElement(premise);
	190	consequences.addElement(consequence);
	191	conf.addElement(new Double(confidenceForRule(premise, consequence)));
	192	}
	193	rules[0] = premises;
	194	rules[1] = consequences;
	195	rules[2] = conf;
	196	pruneRules(rules, minConfidence);
	197
	198	// Generate all the other rules
	199	moreResults = moreComplexRules(rules, numItemsInSet, 1, minConfidence,
	200	hashtables);
	201	if (moreResults != null)
	202	for (int i = 0; i < moreResults[0].size(); i++) {
	203	rules[0].addElement(moreResults[0].elementAt(i));
	204	rules[1].addElement(moreResults[1].elementAt(i));
	205	rules[2].addElement(moreResults[2].elementAt(i));
	206	}
	207	return rules;
	208	}
	209
	210
	211
	212	/**
	213	* Generates all significant rules for an item set.
	214	*
	215	* @param minMetric the minimum metric (confidence, lift, leverage,
	216	* improvement) the rules have to have
	217	* @param metricType (confidence=0, lift, leverage, improvement)
	218	* @param hashtables containing all(!) previously generated
	219	* item sets
	220	* @param numItemsInSet the size of the item set for which the rules
	221	* are to be generated
	222	* @param numTransactions
	223	* @param significanceLevel the significance level for testing the rules
	224	* @return all the rules with minimum metric for the given item set
	225	* @exception Exception if something goes wrong
	226	*/
	227	public final FastVector[] generateRulesBruteForce(double minMetric,
	228	int metricType,
	229	FastVector hashtables,
	230	int numItemsInSet,
	231	int numTransactions,
	232	double significanceLevel)
	233	throws Exception {
	234
	235	FastVector premises = new FastVector(),consequences = new FastVector(),
	236	conf = new FastVector(), lift = new FastVector(), lev = new FastVector(),
	237	conv = new FastVector();
	238	FastVector[] rules = new FastVector[6];
	239	AprioriItemSet premise, consequence;
	240	Hashtable hashtableForPremise, hashtableForConsequence;
	241	int numItemsInPremise, help, max, consequenceUnconditionedCounter;
	242	double[][] contingencyTable = new double[2][2];
	243	double metric, chiSquared;
	244
	245	// Generate all possible rules for this item set and test their
	246	// significance.
	247	max = (int)Math.pow(2, numItemsInSet);
	248	for (int j = 1; j < max; j++) {
	249	numItemsInPremise = 0;
	250	help = j;
	251	while (help > 0) {
	252	if (help % 2 == 1)
	253	numItemsInPremise++;
	254	help /= 2;
	255	}
	256	if (numItemsInPremise < numItemsInSet) {
	257	hashtableForPremise =
	258	(Hashtable)hashtables.elementAt(numItemsInPremise-1);
	259	hashtableForConsequence =
	260	(Hashtable)hashtables.elementAt(numItemsInSet-numItemsInPremise-1);
	261	premise = new AprioriItemSet(m_totalTransactions);
	262	consequence = new AprioriItemSet(m_totalTransactions);
	263	premise.m_items = new int[m_items.length];
	264
	265	consequence.m_items = new int[m_items.length];
	266	consequence.m_counter = m_counter;
	267	help = j;
	268	for (int i = 0; i < m_items.length; i++)
	269	if (m_items[i] != -1) {
	270	if (help % 2 == 1) {
	271	premise.m_items[i] = m_items[i];
	272	consequence.m_items[i] = -1;
	273	} else {
	274	premise.m_items[i] = -1;
	275	consequence.m_items[i] = m_items[i];
	276	}
	277	help /= 2;
	278	} else {
	279	premise.m_items[i] = -1;
	280	consequence.m_items[i] = -1;
	281	}
	282	premise.m_counter = ((Integer)hashtableForPremise.get(premise)).intValue();
	283	consequenceUnconditionedCounter =
	284	((Integer)hashtableForConsequence.get(consequence)).intValue();
	285
	286	if (metricType == 0) {
	287	contingencyTable[0][0] = (double)(consequence.m_counter);
	288	contingencyTable[0][1] = (double)(premise.m_counter - consequence.m_counter);
	289	contingencyTable[1][0] = (double)(consequenceUnconditionedCounter -
	290	consequence.m_counter);
	291	contingencyTable[1][1] = (double)(numTransactions - premise.m_counter -
	292	consequenceUnconditionedCounter +
	293	consequence.m_counter);
	294	chiSquared = ContingencyTables.chiSquared(contingencyTable, false);
	295
	296	metric = confidenceForRule(premise, consequence);
	297
	298	if ((!(metric < minMetric)) &&
	299	(!(chiSquared > significanceLevel))) {
	300	premises.addElement(premise);
	301	consequences.addElement(consequence);
	302	conf.addElement(new Double(metric));
	303	lift.addElement(new Double(liftForRule(premise, consequence,
	304	consequenceUnconditionedCounter)));
	305	lev.addElement(new Double(leverageForRule(premise, consequence,
	306	premise.m_counter,
	307	consequenceUnconditionedCounter)));
	308	conv.addElement(new Double(convictionForRule(premise, consequence,
	309	premise.m_counter,
	310	consequenceUnconditionedCounter)));
	311	}
	312	} else {
	313	double tempConf = confidenceForRule(premise, consequence);
	314	double tempLift = liftForRule(premise, consequence,
	315	consequenceUnconditionedCounter);
	316	double tempLev = leverageForRule(premise, consequence,
	317	premise.m_counter,
	318	consequenceUnconditionedCounter);
	319	double tempConv = convictionForRule(premise, consequence,
	320	premise.m_counter,
	321	consequenceUnconditionedCounter);
	322	switch(metricType) {
	323	case 1:
	324	metric = tempLift;
	325	break;
	326	case 2:
	327	metric = tempLev;
	328	break;
	329	case 3:
	330	metric = tempConv;
	331	break;
	332	default:
	333	throw new Exception("ItemSet: Unknown metric type!");
	334	}
	335	if (!(metric < minMetric)) {
	336	premises.addElement(premise);
	337	consequences.addElement(consequence);
	338	conf.addElement(new Double(tempConf));
	339	lift.addElement(new Double(tempLift));
	340	lev.addElement(new Double(tempLev));
	341	conv.addElement(new Double(tempConv));
	342	}
	343	}
	344	}
	345	}
	346	rules[0] = premises;
	347	rules[1] = consequences;
	348	rules[2] = conf;
	349	rules[3] = lift;
	350	rules[4] = lev;
	351	rules[5] = conv;
	352	return rules;
	353	}
	354
	355	/**
	356	* Subtracts an item set from another one.
	357	*
	358	* @param toSubtract the item set to be subtracted from this one.
	359	* @return an item set that only contains items form this item sets that
	360	* are not contained by toSubtract
	361	*/
	362	public final AprioriItemSet subtract(AprioriItemSet toSubtract) {
	363
	364	AprioriItemSet result = new AprioriItemSet(m_totalTransactions);
	365
	366	result.m_items = new int[m_items.length];
	367
	368	for (int i = 0; i < m_items.length; i++)
	369	if (toSubtract.m_items[i] == -1)
	370	result.m_items[i] = m_items[i];
	371	else
	372	result.m_items[i] = -1;
	373	result.m_counter = 0;
	374	return result;
	375	}
	376
	377
	378	/**
	379	* Generates rules with more than one item in the consequence.
	380	*
	381	* @param rules all the rules having (k-1)-item sets as consequences
	382	* @param numItemsInSet the size of the item set for which the rules
	383	* are to be generated
	384	* @param numItemsInConsequence the value of (k-1)
	385	* @param minConfidence the minimum confidence a rule has to have
	386	* @param hashtables the hashtables containing all(!) previously generated
	387	* item sets
	388	* @return all the rules having (k)-item sets as consequences
	389	*/
	390	private final FastVector[] moreComplexRules(FastVector[] rules,
	391	int numItemsInSet,
	392	int numItemsInConsequence,
	393	double minConfidence,
	394	FastVector hashtables) {
	395
	396	AprioriItemSet newPremise;
	397	FastVector[] result, moreResults;
	398	FastVector newConsequences, newPremises = new FastVector(),
	399	newConf = new FastVector();
	400	Hashtable hashtable;
	401
	402	if (numItemsInSet > numItemsInConsequence + 1) {
	403	hashtable =
	404	(Hashtable)hashtables.elementAt(numItemsInSet - numItemsInConsequence - 2);
	405	newConsequences = mergeAllItemSets(rules[1],
	406	numItemsInConsequence - 1,
	407	m_totalTransactions);
	408	Enumeration enu = newConsequences.elements();
	409	while (enu.hasMoreElements()) {
	410	AprioriItemSet current = (AprioriItemSet)enu.nextElement();
	411	current.m_counter = m_counter;
	412	newPremise = subtract(current);
	413	newPremise.m_counter = ((Integer)hashtable.get(newPremise)).intValue();
	414	newPremises.addElement(newPremise);
	415	newConf.addElement(new Double(confidenceForRule(newPremise, current)));
	416	}
	417	result = new FastVector[3];
	418	result[0] = newPremises;
	419	result[1] = newConsequences;
	420	result[2] = newConf;
	421	pruneRules(result, minConfidence);
	422	moreResults = moreComplexRules(result,numItemsInSet,numItemsInConsequence+1,
	423	minConfidence, hashtables);
	424	if (moreResults != null)
	425	for (int i = 0; i < moreResults[0].size(); i++) {
	426	result[0].addElement(moreResults[0].elementAt(i));
	427	result[1].addElement(moreResults[1].elementAt(i));
	428	result[2].addElement(moreResults[2].elementAt(i));
	429	}
	430	return result;
	431	} else
	432	return null;
	433	}
	434
	435
	436	/**
	437	* Returns the contents of an item set as a string.
	438	*
	439	* @param instances contains the relevant header information
	440	* @return string describing the item set
	441	*/
	442	public final String toString(Instances instances) {
	443
	444	return super.toString(instances);
	445	}
	446
	447	/**
	448	* Converts the header info of the given set of instances into a set
	449	* of item sets (singletons). The ordering of values in the header file
	450	* determines the lexicographic order.
	451	*
	452	* @param instances the set of instances whose header info is to be used
	453	* @return a set of item sets, each containing a single item
	454	* @exception Exception if singletons can't be generated successfully
	455	*/
	456	public static FastVector singletons(Instances instances,
	457	boolean treatZeroAsMissing) throws Exception {
	458
	459	FastVector setOfItemSets = new FastVector();
	460	ItemSet current;
	461
	462	for (int i = 0; i < instances.numAttributes(); i++) {
	463	if (instances.attribute(i).isNumeric())
	464	throw new Exception("Can't handle numeric attributes!");
	465	int j = (treatZeroAsMissing) ? 1 : 0;
	466	for (; j < instances.attribute(i).numValues(); j++) {
	467	current = new AprioriItemSet(instances.numInstances());
	468	current.setTreatZeroAsMissing(treatZeroAsMissing);
	469	current.m_items = new int[instances.numAttributes()];
	470	for (int k = 0; k < instances.numAttributes(); k++)
	471	current.m_items[k] = -1;
	472	current.m_items[i] = j;
	473	setOfItemSets.addElement(current);
	474	}
	475	}
	476	return setOfItemSets;
	477	}
	478
	479	/**
	480	* Merges all item sets in the set of (k-1)-item sets
	481	* to create the (k)-item sets and updates the counters.
	482	*
	483	* @param itemSets the set of (k-1)-item sets
	484	* @param size the value of (k-1)
	485	* @param totalTrans the total number of transactions in the data
	486	* @return the generated (k)-item sets
	487	*/
	488	public static FastVector mergeAllItemSets(FastVector itemSets, int size,
	489	int totalTrans) {
	490
	491	FastVector newVector = new FastVector();
	492	ItemSet result;
	493	int numFound, k;
	494
	495	for (int i = 0; i < itemSets.size(); i++) {
	496	ItemSet first = (ItemSet)itemSets.elementAt(i);
	497	out:
	498	for (int j = i+1; j < itemSets.size(); j++) {
	499	ItemSet second = (ItemSet)itemSets.elementAt(j);
	500	result = new AprioriItemSet(totalTrans);
	501	result.m_items = new int[first.m_items.length];
	502
	503	// Find and copy common prefix of size 'size'
	504	numFound = 0;
	505	k = 0;
	506	while (numFound < size) {
	507	if (first.m_items[k] == second.m_items[k]) {
	508	if (first.m_items[k] != -1)
	509	numFound++;
	510	result.m_items[k] = first.m_items[k];
	511	} else
	512	break out;
	513	k++;
	514	}
	515
	516	// Check difference
	517	while (k < first.m_items.length) {
	518	if ((first.m_items[k] != -1) && (second.m_items[k] != -1))
	519	break;
	520	else {
	521	if (first.m_items[k] != -1)
	522	result.m_items[k] = first.m_items[k];
	523	else
	524	result.m_items[k] = second.m_items[k];
	525	}
	526	k++;
	527	}
	528	if (k == first.m_items.length) {
	529	result.m_counter = 0;
	530	newVector.addElement(result);
	531	}
	532	}
	533	}
	534	return newVector;
	535	}
	536
	537	/**
	538	* Returns the revision string.
	539	*
	540	* @return the revision
	541	*/
	542	public String getRevision() {
	543	return RevisionUtils.extract("$Revision: 5130 $");
	544	}
	545	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/associations/AprioriItemSet.java

Download in other formats: