source: src/main/java/weka/associations/RuleGeneration.java @ 21

Last change on this file since 21 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 12.8 KB
RevLine 
[4]1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RuleGeneration.java
19 *    Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.associations;
24
25import weka.core.FastVector;
26import weka.core.Instances;
27import weka.core.RevisionHandler;
28import weka.core.RevisionUtils;
29import weka.core.Statistics;
30import weka.core.Utils;
31
32import java.io.Serializable;
33import java.util.Hashtable;
34import java.util.TreeSet;
35
36/**
37 * Class implementing the rule generation procedure of the predictive apriori algorithm.
38 *
39 * Reference: T. Scheffer (2001). <i>Finding Association Rules That Trade Support
40 * Optimally against Confidence</i>. Proc of the 5th European Conf.
41 * on Principles and Practice of Knowledge Discovery in Databases (PKDD'01),
42 * pp. 424-435. Freiburg, Germany: Springer-Verlag. <p>
43 *
44 * The implementation follows the paper expect for adding a rule to the output of the
45 * <i>n</i> best rules. A rule is added if:
46 * the expected predictive accuracy of this rule is among the <i>n</i> best and it is
47 * not subsumed by a rule with at least the same expected predictive accuracy
48 * (out of an unpublished manuscript from T. Scheffer).
49 *
50 * @author Stefan Mutter (mutter@cs.waikato.ac.nz)
51 * @version $Revision: 1.4 $ */
52public class RuleGeneration
53  implements Serializable, RevisionHandler {
54
55  /** for serialization */
56  private static final long serialVersionUID = -8927041669872491432L;
57
58  /** The items stored as an array of of integer. */
59  protected int[] m_items;
60
61  /** Counter for how many transactions contain this item set. */
62  protected int m_counter;
63
64  /** The total number of transactions */
65  protected int m_totalTransactions;
66
67  /** Flag indicating whether the list fo the best rules has changed. */
68  protected boolean m_change = false;
69
70  /** The minimum expected predictive accuracy that is needed to be a candidate for the list of the best rules. */
71  protected double m_expectation;
72
73  /** Threshold. If the support of the premise is higher the binomial distrubution is approximated by a normal one. */
74  protected static final int MAX_N = 300;
75
76  /** The minimum support a rule needs to be a candidate for the list of the best rules. */
77  protected int m_minRuleCount;
78
79  /** Sorted array of the mied points of the intervals used for prior estimation. */
80  protected double[] m_midPoints;
81
82  /** Hashtable conatining the estimated prior probabilities. */
83  protected Hashtable m_priors;
84
85  /** The list of the actual <i>n</i> best rules. */
86  protected TreeSet m_best;
87
88  /** Integer indicating the generation time of a rule. */
89  protected int m_count;
90
91  /** The instances. */
92  protected Instances m_instances;
93
94
95  /**
96   * Constructor
97   * @param itemSet item set for that rules should be generated.
98   * The item set will form the premise of the rules.
99   */
100  public RuleGeneration(ItemSet itemSet){
101
102    m_totalTransactions = itemSet.m_totalTransactions;
103    m_counter = itemSet.m_counter;
104    m_items = itemSet.m_items;
105  }
106
107
108  /**
109   * calculates the probability using a binomial distribution.
110   * If the support of the premise is too large this distribution
111   * is approximated by a normal distribution.
112   * @param accuracy the accuracy value
113   * @param ruleCount the support of the whole rule
114   * @param premiseCount the support of the premise
115   * @return the probability value
116   */ 
117  public static final double binomialDistribution(double accuracy, double ruleCount, double premiseCount){
118
119    double mu, sigma;
120
121    if(premiseCount < MAX_N) 
122      return Math.pow(2,(Utils.log2(Math.pow(accuracy,ruleCount))+Utils.log2(Math.pow((1.0-accuracy),(premiseCount-ruleCount)))+PriorEstimation.logbinomialCoefficient((int)premiseCount,(int)ruleCount)));
123    else{
124      mu = premiseCount * accuracy;
125      sigma = Math.sqrt((premiseCount * (1.0 - accuracy))*accuracy);
126      return Statistics.normalProbability(((ruleCount+0.5)-mu)/(sigma*Math.sqrt(2)));
127    }
128  }
129
130  /**
131   * calculates the expected predctive accuracy of a rule
132   * @param ruleCount the support of the rule
133   * @param premiseCount the premise support of the rule
134   * @param midPoints array with all mid points
135   * @param priors hashtable containing the prior probabilities
136   * @return the expected predictive accuracy
137   */ 
138  public static final double expectation(double ruleCount, int premiseCount,double[] midPoints, Hashtable priors){
139
140    double numerator = 0, denominator = 0;
141    for(int i = 0;i < midPoints.length; i++){
142      Double actualPrior = (Double)priors.get(new Double(midPoints[i]));
143      if(actualPrior != null){
144        if(actualPrior.doubleValue() != 0){
145          double addend = actualPrior.doubleValue() * binomialDistribution(midPoints[i], ruleCount, (double)premiseCount);
146          denominator += addend;
147          numerator += addend*midPoints[i];
148        }
149      }
150    }
151    if(denominator <= 0 || Double.isNaN(denominator))
152      System.out.println("RuleItem denominator: "+denominator);
153    if(numerator <= 0 || Double.isNaN(numerator))
154      System.out.println("RuleItem numerator: "+numerator);
155    return numerator/denominator;
156  }
157
158  /**
159   * Generates all rules for an item set. The item set is the premise.
160   * @param numRules the number of association rules the use wants to mine.
161   * This number equals the size <i>n</i> of the list of the
162   * best rules.
163   * @param midPoints the mid points of the intervals
164   * @param priors Hashtable that contains the prior probabilities
165   * @param expectation the minimum value of the expected predictive accuracy
166   * that is needed to get into the list of the best rules
167   * @param instances the instances for which association rules are generated
168   * @param best the list of the <i>n</i> best rules.
169   * The list is implemented as a TreeSet
170   * @param genTime the maximum time of generation
171   * @return all the rules with minimum confidence for the given item set
172   */
173  public TreeSet generateRules(int numRules, double[] midPoints, Hashtable priors, double expectation, Instances instances,TreeSet best,int genTime) {
174
175    boolean redundant = false;
176    FastVector consequences = new FastVector(), consequencesMinusOne = new FastVector();
177    ItemSet premise;
178    int s = 0;
179    RuleItem current = null, old;
180
181    Hashtable hashtable;
182
183    m_change = false;
184    m_midPoints = midPoints;
185    m_priors = priors;
186    m_best = best;
187    m_expectation = expectation;
188    m_count = genTime;
189    m_instances = instances;
190
191    //create rule body
192    premise =null;
193    premise = new ItemSet(m_totalTransactions);
194    premise.m_items = new int[m_items.length];
195    System.arraycopy(m_items, 0, premise.m_items, 0, m_items.length);
196    premise.m_counter = m_counter;
197
198
199    do{ 
200      m_minRuleCount = 1;
201      while(expectation((double)m_minRuleCount,premise.m_counter,m_midPoints,m_priors) <= m_expectation){
202        m_minRuleCount++;
203        if(m_minRuleCount > premise.m_counter)
204          return m_best;
205      }
206      redundant = false;
207      for(int i = 0; i < instances.numAttributes();i++){       
208        if(i == 0){   
209          for(int j = 0; j < m_items.length;j++)
210            if(m_items[j] == -1)
211              consequences = singleConsequence(instances, j,consequences);           
212          if(premise == null || consequences.size() == 0)
213            return m_best;
214        }
215        FastVector allRuleItems = new FastVector();
216        int index = 0;
217        do {
218          int h = 0;
219          while(h < consequences.size()){
220            RuleItem dummie = new RuleItem();
221            current = dummie.generateRuleItem(premise,(ItemSet)consequences.elementAt(h),instances,m_count,m_minRuleCount,m_midPoints,m_priors);
222            if(current != null){
223              allRuleItems.addElement(current);
224              h++;
225            }
226            else
227              consequences.removeElementAt(h);
228          }
229          if(index == i)
230            break;
231          consequencesMinusOne = consequences;
232          consequences = ItemSet.mergeAllItemSets(consequencesMinusOne, index, instances.numInstances());
233          hashtable = ItemSet.getHashtable(consequencesMinusOne, consequencesMinusOne.size());
234          consequences = ItemSet.pruneItemSets(consequences, hashtable);
235          index++;
236        } while (consequences.size() > 0); 
237        for(int h = 0;h < allRuleItems.size();h++){
238          current = (RuleItem)allRuleItems.elementAt(h);
239          m_count++;
240          if(m_best.size() < numRules){
241            m_change =true;
242            redundant = removeRedundant(current);
243          }
244          else{
245            if(current.accuracy() > m_expectation){
246              m_expectation = ((RuleItem)(m_best.first())).accuracy();
247              boolean remove = m_best.remove(m_best.first());
248              m_change = true;
249              redundant = removeRedundant(current);
250              m_expectation = ((RuleItem)(m_best.first())).accuracy();
251              while(expectation((double)m_minRuleCount, (current.premise()).m_counter,m_midPoints,m_priors) < m_expectation){
252                m_minRuleCount++;
253                if(m_minRuleCount > (current.premise()).m_counter)
254                  break;
255              }
256            } 
257          }
258        }   
259
260      }
261    }while(redundant); 
262    return m_best;
263  }
264
265  /**
266   * Methods that decides whether or not rule a subsumes rule b.
267   * The defintion of subsumption is:
268   * Rule a subsumes rule b, if a subsumes b
269   * AND
270   * a has got least the same expected predictive accuracy as b.
271   * @param a an association rule stored as a RuleItem
272   * @param b an association rule stored as a RuleItem
273   * @return true if rule a subsumes rule b or false otherwise.
274   */ 
275  public static boolean aSubsumesB(RuleItem a, RuleItem b){
276
277    if(a.m_accuracy < b.m_accuracy)
278      return false;
279    for(int k = 0; k < a.premise().m_items.length;k++){
280      if(a.premise().m_items[k] != b.premise().m_items[k]){
281        if((a.premise().m_items[k] != -1 && b.premise().m_items[k] != -1) || b.premise().m_items[k] == -1)
282          return false;
283      }
284      if(a.consequence().m_items[k] != b.consequence().m_items[k]){
285        if((a.consequence().m_items[k] != -1 && b.consequence().m_items[k] != -1) || a.consequence().m_items[k] == -1)
286          return false;
287      }
288    }
289    return true;
290
291  }
292
293  /**
294   * generates a consequence of length 1 for an association rule.
295   * @param instances the instances under consideration
296   * @param attNum an item that does not occur in the premise
297   * @param consequences FastVector that possibly already contains other consequences of length 1
298   * @return FastVector with consequences of length 1
299   */ 
300  public static FastVector singleConsequence(Instances instances, int attNum, FastVector consequences){
301
302    ItemSet consequence;
303
304    for (int i = 0; i < instances.numAttributes(); i++) {
305      if( i == attNum){
306        for (int j = 0; j < instances.attribute(i).numValues(); j++) {
307          consequence = new ItemSet(instances.numInstances());
308          consequence.m_items = new int[instances.numAttributes()];
309          for (int k = 0; k < instances.numAttributes(); k++) 
310            consequence.m_items[k] = -1;
311          consequence.m_items[i] = j;
312          consequences.addElement(consequence);
313        }
314      }
315    }
316    return consequences;
317
318  }
319
320  /**
321   * Method that removes redundant rules out of the list of the best rules.
322   * A rule is in that list if:
323   * the expected predictive accuracy of this rule is among the best and it is
324   * not subsumed by a rule with at least the same expected predictive accuracy
325   * @param toInsert the rule that should be inserted into the list
326   * @return true if the method has changed the list, false otherwise
327   */ 
328  public boolean removeRedundant(RuleItem toInsert){
329
330    boolean redundant = false, fSubsumesT = false, tSubsumesF = false;
331    RuleItem first;
332    int subsumes = 0;
333    Object [] best = m_best.toArray();
334    for(int i=0; i < best.length; i++){
335      first = (RuleItem)best[i];
336      fSubsumesT = aSubsumesB(first,toInsert);
337      tSubsumesF = aSubsumesB(toInsert, first);
338      if(fSubsumesT){
339        subsumes = 1;
340        break;
341      }
342      else{
343        if(tSubsumesF){
344          boolean remove = m_best.remove(first);
345          subsumes = 2;
346          redundant =true;
347        }
348      }
349    }
350    if(subsumes == 0 || subsumes == 2)
351      m_best.add(toInsert);
352    return redundant;
353  }
354
355  /**
356   * Gets the actual maximum value of the generation time
357   * @return the actual maximum value of the generation time
358   */
359  public int count(){
360
361    return m_count;
362  }
363
364  /**
365   * Gets if the list fo the best rules has been changed
366   * @return whether or not the list fo the best rules has been changed
367   */
368  public boolean change(){
369
370    return m_change;
371  }
372 
373  /**
374   * Returns the revision string.
375   *
376   * @return            the revision
377   */
378  public String getRevision() {
379    return RevisionUtils.extract("$Revision: 1.4 $");
380  }
381}
Note: See TracBrowser for help on using the repository browser.