source: src/main/java/weka/associations/FPGrowth.java @ 19

Last change on this file since 19 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 68.7 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    FPGrowth.java
19 *    Copyright (C) 2009 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.associations;
24
25import java.io.Serializable;
26import java.util.ArrayList;
27import java.util.Collection;
28import java.util.Collections;
29import java.util.Comparator;
30import java.util.Enumeration;
31import java.util.HashMap;
32import java.util.Iterator;
33import java.util.LinkedList;
34import java.util.List;
35import java.util.Map;
36import java.util.Set;
37import java.util.Vector;
38
39import weka.core.Attribute;
40import weka.core.Capabilities;
41import weka.core.Instance;
42import weka.core.Instances;
43import weka.core.Option;
44import weka.core.OptionHandler;
45import weka.core.RevisionUtils;
46import weka.core.SelectedTag;
47import weka.core.SparseInstance;
48import weka.core.Tag;
49import weka.core.TechnicalInformation;
50import weka.core.TechnicalInformationHandler;
51import weka.core.Utils;
52import weka.core.Capabilities.Capability;
53import weka.core.TechnicalInformation.Field;
54import weka.core.TechnicalInformation.Type;
55
56/**
57 <!-- globalinfo-start -->
58 * Class implementing the FP-growth algorithm for finding large item sets without candidate generation. Iteratively reduces the minimum support until it finds the required number of rules with the given minimum metric. For more information see:<br/>
59 * <br/>
60 * J. Han, J.Pei, Y. Yin: Mining frequent patterns without candidate generation. In: Proceedings of the 2000 ACM-SIGMID International Conference on Management of Data, 1-12, 2000.
61 * <p/>
62 <!-- globalinfo-end -->
63 *
64 <!-- technical-bibtex-start -->
65 * BibTeX:
66 * <pre>
67 * &#64;inproceedings{Han2000,
68 *    author = {J. Han and J.Pei and Y. Yin},
69 *    booktitle = {Proceedings of the 2000 ACM-SIGMID International Conference on Management of Data},
70 *    pages = {1-12},
71 *    title = {Mining frequent patterns without candidate generation},
72 *    year = {2000}
73 * }
74 * </pre>
75 * <p/>
76 <!-- technical-bibtex-end -->
77 *
78 <!-- options-start -->
79 * Valid options are: <p/>
80 *
81 * <pre> -P &lt;attribute index of positive value&gt;
82 *  Set the index of the attribute value to consider as 'positive'
83 *  for binary attributes in normal dense instances. Index 2 is always
84 *  used for sparse instances. (default = 2)</pre>
85 *
86 * <pre> -I &lt;max items&gt;
87 *  The maximum number of items to include in large items sets (and rules). (default = -1, i.e. no limit.)</pre>
88 *
89 * <pre> -N &lt;require number of rules&gt;
90 *  The required number of rules. (default = 10)</pre>
91 *
92 * <pre> -T &lt;0=confidence | 1=lift | 2=leverage | 3=Conviction&gt;
93 *  The metric by which to rank rules. (default = confidence)</pre>
94 *
95 * <pre> -C &lt;minimum metric score of a rule&gt;
96 *  The minimum metric score of a rule. (default = 0.9)</pre>
97 *
98 * <pre> -U &lt;upper bound for minimum support&gt;
99 *  Upper bound for minimum support. (default = 1.0)</pre>
100 *
101 * <pre> -M &lt;lower bound for minimum support&gt;
102 *  The lower bound for the minimum support. (default = 0.1)</pre>
103 *
104 * <pre> -D &lt;delta for minimum support&gt;
105 *  The delta by which the minimum support is decreased in
106 *  each iteration. (default = 0.05)</pre>
107 *
108 * <pre> -S
109 *  Find all rules that meet the lower bound on
110 *  minimum support and the minimum metric constraint.
111 *  Turning this mode on will disable the iterative support reduction
112 *  procedure to find the specified number of rules.</pre>
113 *
114 <!-- options-end -->
115 *
116 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
117 * @version $Revision: 6158 $
118 */
119public class FPGrowth extends AbstractAssociator
120  implements OptionHandler, TechnicalInformationHandler {
121 
122  /** For serialization */
123  private static final long serialVersionUID = 3620717108603442911L;
124
125  /**
126   * Inner class that handles a single binary item
127   */
128  public static class BinaryItem implements Serializable, Comparable<BinaryItem> {
129   
130    /** For serialization */
131    private static final long serialVersionUID = -3372941834914147669L;
132   
133    /** The frequency of the item */
134    protected int m_frequency;
135   
136    /** The attribute that the item corresponds to */
137    protected Attribute m_attribute;
138   
139    /** The index of the value considered to be positive */
140    protected int m_valueIndex;
141   
142    public BinaryItem(Attribute att, int valueIndex) throws Exception {
143      if (att.isNumeric() || (att.isNominal() && att.numValues() > 2)) {
144        throw new Exception("BinaryItem must be constructed using a nominal attribute" +
145                        " with at most 2 values!");
146      }
147      m_attribute = att;
148      if (m_attribute.numValues() == 1) {
149        m_valueIndex = 0; // unary attribute (? used to indicate absence from a basket)
150      } else {
151        m_valueIndex = valueIndex;
152      }
153    }
154   
155    /**
156     * Increase the frequency of this item.
157     *
158     * @param f the amount to increase the frequency by.
159     */
160    public void increaseFrequency(int f) {
161      m_frequency += f;
162    }
163   
164    /**
165     * Decrease the frequency of this item.
166     *
167     * @param f the amount by which to decrease the frequency.
168     */
169    public void decreaseFrequency(int f) {
170      m_frequency -= f;
171    }
172   
173    /**
174     * Increment the frequency of this item.
175     */
176    public void increaseFrequency() {
177      m_frequency++;
178    }
179   
180    /**
181     * Decrement the frequency of this item.
182     */
183    public void decreaseFrequency() {
184      m_frequency--;
185    }
186   
187    /**
188     * Get the frequency of this item.
189     *
190     * @return the frequency.
191     */
192    public int getFrequency() {
193      return m_frequency;
194    }
195   
196    /**
197     * Get the attribute that this item corresponds to.
198     *
199     * @return the corresponding attribute.
200     */
201    public Attribute getAttribute() {
202      return m_attribute;
203    }
204   
205    /**
206     * Get the value index for this item.
207     *
208     * @return the value index.
209     */
210    public int getValueIndex() {
211      return m_valueIndex;
212    }
213   
214    /**
215     * A string representation of this item.
216     *
217     * @return a string representation of this item.
218     */
219    public String toString() {
220      return toString(false);
221    }
222   
223    /**
224     * A string representation of this item.
225     *
226     * @param freq true if the frequency should be included.
227     * @return a string representation of this item.
228     */
229    public String toString(boolean freq) {
230      String result = m_attribute.name() + "=" + m_attribute.value(m_valueIndex);
231      if (freq) {
232        result += ":" + m_frequency;
233      }
234      return result;
235    }
236   
237    /**
238     * Ensures that items will be sorted in descending order of frequency.
239     * Ties are ordered by attribute name.
240     *
241     * @param comp the BinaryItem to compare against.
242     */
243    public int compareTo(BinaryItem comp) {
244      if (m_frequency == comp.getFrequency()) {
245        // sort by name
246        return -1 * m_attribute.name().compareTo(comp.getAttribute().name());
247      }
248      if (comp.getFrequency() < m_frequency) {
249        return -1;
250      }
251      return 1;
252    }
253   
254    public boolean equals(Object compareTo) {
255      if (!(compareTo instanceof BinaryItem)) {
256        return false;
257      }
258     
259      BinaryItem b = (BinaryItem)compareTo;
260      if (m_attribute.equals(b.getAttribute()) && m_frequency == b.getFrequency()) {
261        return true;
262      }
263     
264      return false;
265    }
266   
267    public int hashCode() {
268      return (m_attribute.name().hashCode() ^ 
269          m_attribute.numValues()) * m_frequency;
270    }
271  }
272 
273  /**
274   * Class for maintaining a frequent item set.
275   */
276  protected static class FrequentBinaryItemSet 
277    implements Serializable, Cloneable {
278   
279    /** For serialization */
280    private static final long serialVersionUID = -6543815873565829448L;
281
282    /** The list of items in the item set */
283    protected ArrayList<BinaryItem> m_items = new ArrayList<BinaryItem>();
284   
285    /** the support of this item set **/
286    protected int m_support;
287   
288    /**
289     * Constructor
290     *
291     * @param items the items that make up the frequent item set.
292     * @param support the support of this item set.
293     */
294    public FrequentBinaryItemSet(ArrayList<BinaryItem> items, int support) {
295      m_items = items;
296      m_support = support;
297      Collections.sort(m_items);
298    }
299   
300    /**
301     * Add an item to this item set.
302     *
303     * @param i the item to add.
304     */
305    public void addItem(BinaryItem i) {
306      m_items.add(i);
307      Collections.sort(m_items);
308    }
309   
310    /**
311     * Set the support for this item set.
312     *
313     * @param support the support for this item set.
314     */
315    public void setSupport(int support) {
316      m_support = support;
317    }
318   
319    /**
320     * Get the support of this item set.
321     *
322     * @return the support of this item set.
323     */
324    public int getSupport() {
325      return m_support;
326    }
327   
328    /**
329     * Get the items in this item set.
330     *
331     * @return the items in this item set.
332     */
333    public Collection<BinaryItem> getItems() {
334      return m_items;
335    }
336   
337    /**
338     * Get a particular item from this item set.
339     *
340     * @param index the index of the item to get.
341     * @return the item.
342     */
343    public BinaryItem getItem(int index) {
344      return m_items.get(index);
345    }
346   
347    /**
348     * Get the number of items in this item set.
349     *
350     * @return the number of items in this item set.
351     */
352    public int numberOfItems() {
353      return m_items.size();
354    }
355   
356    /**
357     * Get a textual description of this item set.
358     *
359     * @return a textual description of this item set.
360     */
361    public String toString() {
362      StringBuffer buff = new StringBuffer();
363      Iterator<BinaryItem> i = m_items.iterator();
364     
365      while (i.hasNext()) {
366        buff.append(i.next().toString() + " ");       
367      }
368      buff.append(": " + m_support);
369      return buff.toString();
370    }
371   
372    /**
373     * Make a copy of this item set.
374     *
375     * @return a copy of this item set.
376     */
377    public Object clone() {
378      ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(m_items);
379      return new FrequentBinaryItemSet(items, m_support);
380    }
381  }
382 
383  /**
384   * Maintains a list of frequent item sets.
385   */
386  protected static class FrequentItemSets implements Serializable {
387   
388    /** For serialization */
389    private static final long serialVersionUID = 4173606872363973588L;
390
391    /** The list of frequent item sets */
392    protected ArrayList<FrequentBinaryItemSet> m_sets = 
393      new ArrayList<FrequentBinaryItemSet>();
394   
395    /** The total number of transactions in the data */
396    protected int m_numberOfTransactions;
397   
398    /**
399     * Constructor.
400     *
401     * @param numTransactions the total number of transactions in the data.
402     */
403    public FrequentItemSets(int numTransactions) {
404      m_numberOfTransactions = numTransactions;
405    }
406   
407    /**
408     * Get an item set.
409     *
410     * @param index the index of the item set to get.
411     * @return an item set.
412     */
413    public FrequentBinaryItemSet getItemSet(int index) {
414      return m_sets.get(index);
415    }
416   
417    /**
418     * Get an iterator that can be used to access all the item sets.
419     *
420     * @return an iterator.
421     */
422    public Iterator<FrequentBinaryItemSet> iterator() {
423      return m_sets.iterator();
424    }
425   
426    /**
427     * Get the total number of transactions in the data that the item
428     * sets were derived from.
429     *
430     * @return the total number of transactions in the data.
431     */
432    public int getNumberOfTransactions() {
433      return m_numberOfTransactions;
434    }
435   
436    /**
437     * Add an item set.
438     *
439     * @param setToAdd the item set to add.
440     */
441    public void addItemSet(FrequentBinaryItemSet setToAdd) {
442      m_sets.add(setToAdd);
443    }
444   
445    /**
446     * Sort the item sets according to the supplied comparator.
447     *
448     * @param comp the comparator to use.
449     */
450    public void sort(Comparator<FrequentBinaryItemSet> comp) {
451      Collections.sort(m_sets, comp);
452    }
453   
454    /**
455     * Get the number of item sets.
456     *
457     * @return the number of item sets.
458     */
459    public int size() {
460      return m_sets.size();
461    }
462   
463    /**
464     * Sort the item sets. Sorts by item set length. Ties are broken by comparing
465     * the items in the two item sets.
466     */
467    public void sort() {
468      Comparator<FrequentBinaryItemSet> compF = new Comparator<FrequentBinaryItemSet>() {
469        public int compare(FrequentBinaryItemSet one, FrequentBinaryItemSet two) {
470          Collection<BinaryItem> compOne = one.getItems();
471          Collection<BinaryItem> compTwo = two.getItems();
472         
473//          if (one.getSupport() == two.getSupport()) {
474            // if supports are equal then list shorter item sets before longer ones
475            if (compOne.size() < compTwo.size()) {
476              return -1;
477            } else if (compOne.size() > compTwo.size()) {
478              return 1;
479            } else {
480              // compare items
481              Iterator<BinaryItem> twoIterator = compTwo.iterator();
482              for (BinaryItem oneI : compOne) {
483                BinaryItem twoI = twoIterator.next();
484                int result = oneI.compareTo(twoI);
485                if (result != 0) {
486                  return result;
487                }
488              }
489              return 0; // equal
490            }
491           
492//            return 0;
493    /*      } else if (one.getSupport() > two.getSupport()) {
494            // reverse ordering (i.e. descending by support)
495            return -1;
496          } */
497         
498    //      return 1;
499        }
500      };
501     
502      sort(compF);
503    }
504   
505    /**
506     * Get a textual description of this list of item sets.
507     *
508     * @param numSets the number of item sets to display.
509     * @return a textual description of the item sets.
510     */
511    public String toString(int numSets) {
512      if (m_sets.size() == 0) {
513        return "No frequent items sets found!";
514      }
515     
516      StringBuffer result = new StringBuffer();
517      result.append("" + m_sets.size() + " frequent item sets found");
518      if (numSets > 0) {
519        result.append(" , displaying " + numSets);
520      }
521      result.append(":\n\n");
522     
523      int count = 0;
524      for (FrequentBinaryItemSet i : m_sets) {
525        if (numSets > 0 && count > numSets) {
526          break;
527        }
528        result.append(i.toString() + "\n");
529        count++;
530      }
531     
532      return result.toString();
533    }
534  }
535 
536  /**
537   * This class holds the counts for projected tree nodes
538   * and header lists.
539   */
540  protected static class ShadowCounts implements Serializable {
541   
542    /** For serialization */
543    private static final long serialVersionUID = 4435433714185969155L;
544   
545    /** Holds the counts at different recursion levels */
546    private ArrayList<Integer> m_counts = new ArrayList<Integer>();
547   
548    /**
549     * Get the count at the specified recursion depth.
550     *
551     * @param recursionLevel the depth of the recursion.
552     * @return the count.
553     */
554    public int getCount(int recursionLevel) {
555      if (recursionLevel >= m_counts.size()) {
556        return 0;
557      } else {
558        return m_counts.get(recursionLevel);
559      }
560    }
561   
562    /**
563     * Increase the count at a given recursion level.
564     *
565     * @param recursionLevel the level at which to increase the count.
566     * @param incr the amount by which to increase the count.
567     */
568    public void increaseCount(int recursionLevel, int incr) {
569      // basically treat the list like a stack where we
570      // can add a new element, or increment the element
571      // at the top
572     
573      if (recursionLevel == m_counts.size()) {
574        // new element
575        m_counts.add(incr);
576      } else if (recursionLevel == m_counts.size() - 1) {
577        // otherwise increment the top
578        int n = m_counts.get(recursionLevel).intValue();
579        m_counts.set(recursionLevel, (n + incr));
580      }
581    }
582   
583    /**
584     * Remove the count at the given recursion level.
585     *
586     * @param recursionLevel the level at which to remove the count.
587     */
588    public void removeCount(int recursionLevel) {
589      if (recursionLevel < m_counts.size()) {
590        m_counts.remove(recursionLevel);
591      }
592    }
593  }
594 
595  /**
596   * A node in the FP-tree.
597   */
598  protected static class FPTreeNode implements Serializable {
599               
600    /** For serialization */
601    private static final long serialVersionUID = 4396315323673737660L;
602
603    /** link to another sibling at this level in the tree */
604    protected FPTreeNode m_levelSibling;
605   
606    /** link to the parent node */
607    protected FPTreeNode m_parent;
608   
609    /** item at this node */
610    protected BinaryItem m_item;
611   
612    /** ID (for graphing the tree) */
613    protected int m_ID;
614   
615    /** the children of this node */
616    protected Map<BinaryItem, FPTreeNode> m_children = 
617      new HashMap<BinaryItem, FPTreeNode>();
618   
619    /** counts associated with projected versions of this node */
620    protected ShadowCounts m_projectedCounts = new ShadowCounts();
621   
622    /**
623     * Construct a new node with the given parent link and item.
624     *
625     * @param parent a pointer to the parent of this node.
626     * @param item the item at this node.
627     */
628    public FPTreeNode(FPTreeNode parent, BinaryItem item) {
629      m_parent = parent;
630      m_item = item;
631    }
632   
633    /**
634     * Insert an item set into the tree at this node. Removes the first item
635     * from the supplied item set and makes a recursive call to insert the
636     * remaining items.
637     *
638     * @param itemSet the item set to insert.
639     * @param headerTable the header table for the tree.
640     * @param incr the amount by which to increase counts.
641     */
642    public void addItemSet(Collection<BinaryItem> itemSet, 
643        Map<BinaryItem, FPTreeRoot.Header> headerTable, int incr) {
644     
645      Iterator<BinaryItem> i = itemSet.iterator();
646     
647      if (i.hasNext()) {
648        BinaryItem first = i.next();
649       
650        FPTreeNode aChild;
651        if (!m_children.containsKey(first)) {
652          // not in the tree, so add it.
653          aChild = new FPTreeNode(this, first);
654          m_children.put(first, aChild);
655         
656          // update the header
657          if (!headerTable.containsKey(first)) {
658            headerTable.put(first, new FPTreeRoot.Header());
659          }
660         
661          // append new node to header list
662          headerTable.get(first).addToList(aChild);
663        } else {
664          // get the appropriate child node
665          aChild = m_children.get(first);
666        }
667       
668        // update counts in header table
669        headerTable.get(first).getProjectedCounts().increaseCount(0, incr);
670       
671        // increase the child's count
672        aChild.increaseProjectedCount(0, incr);
673       
674        // proceed recursively
675        itemSet.remove(first);       
676        aChild.addItemSet(itemSet, headerTable, incr);
677      }
678    }
679   
680    /**
681     * Increase the projected count at the given recursion level at this
682     * node
683     *
684     * @param recursionLevel the recursion level to increase the node count
685     * at.
686     * @param incr the amount by which to increase the count.
687     */
688    public void increaseProjectedCount(int recursionLevel, int incr) {
689      m_projectedCounts.increaseCount(recursionLevel, incr);
690    }
691   
692    /**
693     * Remove the projected count at the given recursion level for this
694     * node.
695     *
696     * @param recursionLevel the recursion level at which to remove the count.
697     */
698    public void removeProjectedCount(int recursionLevel) {
699      m_projectedCounts.removeCount(recursionLevel);
700    }
701   
702    /**
703     * Get the projected count at the given recursion level for this node.
704     *
705     * @param recursionLevel the recursion level at which to get the count.
706     * @return the count.
707     */
708    public int getProjectedCount(int recursionLevel) {
709      return m_projectedCounts.getCount(recursionLevel);
710    }
711   
712    /**
713     * Get the parent node.
714     *
715     * @return the parent node.
716     */
717    public FPTreeNode getParent() {
718      return m_parent;
719    }
720   
721    /**
722     * Get the item at this node.
723     *
724     * @return the item at this node.
725     */
726    public BinaryItem getItem() {
727      return m_item;
728    }   
729   
730    /**
731     * Return a textual description of this node for a given recursion
732     * level.
733     *
734     * @param recursionLevel the recursion depth to use.
735     * @return a textual description of this node.
736     */
737    public String toString(int recursionLevel) {
738      return toString("", recursionLevel);
739    }
740
741    /**
742     * Return a textual description of this node for a given recursion
743     * level.
744     *
745     * @param prefix a prefix string to prepend.
746     * @param recursionLevel the recursion level to use.
747     * @return a textual description of this node.
748     */
749    public String toString(String prefix, int recursionLevel) {
750      StringBuffer buffer = new StringBuffer();
751      buffer.append(prefix);
752      buffer.append("|  ");
753      buffer.append(m_item.toString());
754      buffer.append(" (");
755      buffer.append(m_projectedCounts.getCount(recursionLevel));
756      buffer.append(")\n");
757     
758      for (FPTreeNode node : m_children.values()) {
759        buffer.append(node.toString(prefix + "|  ", recursionLevel));
760      }
761      return buffer.toString();
762    }
763   
764    protected int assignIDs(int lastID) {
765      int currentLastID = lastID + 1;
766      m_ID = currentLastID;
767      if (m_children != null) {
768        Collection<FPTreeNode> kids = m_children.values();
769        for (FPTreeNode n : kids) {
770          currentLastID = n.assignIDs(currentLastID);
771        }
772      }
773      return currentLastID;
774    }
775   
776    /**
777     * Generate a dot graph description string for the tree.
778     *
779     * @param text a StringBuffer to store the graph description
780     * in.
781     */
782    public void graphFPTree(StringBuffer text) {
783      if (m_children != null) {
784        Collection<FPTreeNode> kids = m_children.values();
785        for (FPTreeNode n : kids) {
786          text.append("N" + n.m_ID);
787          text.append(" [label=\"");
788          text.append(n.getItem().toString() + " (" + n.getProjectedCount(0) + ")\\n");
789          text.append("\"]\n");
790          n.graphFPTree(text);
791          text.append("N" + m_ID + "->" + "N" + n.m_ID + "\n");
792        }
793      }
794    }
795  }
796 
797  /**
798   * Root of the FPTree
799   */
800  private static class FPTreeRoot extends FPTreeNode {
801   
802    /** For serialization */
803    private static final long serialVersionUID = 632150939785333297L;
804
805    /**
806     * Stores a header entry for an FPTree
807     */
808    protected static class Header implements Serializable {
809     
810      /** For serialization */
811      private static final long serialVersionUID = -6583156284891368909L;
812     
813      /** The list of pointers into the tree structure */
814      protected List<FPTreeNode> m_headerList = new LinkedList<FPTreeNode>();
815     
816      /** Projected header counts for this entry */
817      protected ShadowCounts m_projectedHeaderCounts = new ShadowCounts();
818     
819      /**
820       * Add a tree node into the list for this header entry.
821       *
822       * @param toAdd the node to add.
823       */
824      public void addToList(FPTreeNode toAdd) {
825        m_headerList.add(toAdd);
826      }
827     
828      /**
829       * Get the list of nodes for this header entry.
830       *
831       * @return the list of nodes for this header entry.
832       */
833      public List<FPTreeNode> getHeaderList() {
834        return m_headerList;
835      }
836     
837      /**
838       * Get the projected counts for this header entry.
839       *
840       * @return the projected counts for this header entry.
841       */
842      public ShadowCounts getProjectedCounts() {
843        return m_projectedHeaderCounts;
844      }
845    }
846   
847    /** Stores the header table as mapped Header entries */
848    protected Map<BinaryItem, Header> m_headerTable = 
849      new HashMap<BinaryItem, Header>();
850   
851    /**
852     * Create a new FPTreeRoot.
853     */
854    public FPTreeRoot() {
855      super(null, null);
856    }
857   
858    /**
859     * Insert an item set into the tree.
860     *
861     * @param itemSet the item set to insert into the tree.
862     * @param incr the increment by which to increase counters.
863     */
864    public void addItemSet(Collection<BinaryItem> itemSet, int incr) {
865      super.addItemSet(itemSet, m_headerTable, incr);
866    }
867   
868    /**
869     * Get the header table for this tree.
870     *
871     * @return the header table for this tree.
872     */
873    public Map<BinaryItem, Header> getHeaderTable() {
874      return m_headerTable;
875    }
876   
877    public boolean isEmpty(int recursionLevel) {
878      for (FPTreeNode c : m_children.values()) {
879        if (c.getProjectedCount(recursionLevel) > 0) {
880          return false;
881        }
882      }
883      return true;
884    }
885   
886    /**
887     * Get a textual description of the tree at a given recursion
888     * (projection) level.
889     *
890     * @param pad the string to use as a prefix for indenting nodes.
891     * @param recursionLevel the recursion level (projection) to use.
892     * @return the textual description of the tree.
893     */
894    public String toString(String pad, int recursionLevel) {
895      StringBuffer result = new StringBuffer();
896      result.append(pad);
897      result.append("+ ROOT\n");
898
899      for (FPTreeNode node : m_children.values()) {
900        result.append(node.toString(pad + "|  ", recursionLevel));
901      }
902      return result.toString();
903    }
904
905    /**
906     * Get a textual description of the header table for this tree.
907     *
908     * @param recursionLevel the recursion level to use.
909     * @return a textual description of the header table for this
910     * tree at a given recursion level.
911     */
912    public String printHeaderTable(int recursionLevel) {
913      StringBuffer buffer = new StringBuffer();
914      for (BinaryItem item : m_headerTable.keySet()) {
915        buffer.append(item.toString());
916        buffer.append(" : ");
917        buffer.append(m_headerTable.get(item).getProjectedCounts().getCount(recursionLevel));
918        buffer.append("\n");
919      }
920      return buffer.toString();
921    }
922   
923    public void graphHeaderTable(StringBuffer text, int maxID) {
924
925      for (BinaryItem item : m_headerTable.keySet()) {
926        Header h = m_headerTable.get(item);
927        List<FPTreeNode> headerList = h.getHeaderList();
928        if (headerList.size() > 1) {
929          text.append("N" + maxID + " [label=\"" + headerList.get(0).getItem().toString() 
930              + " (" + h.getProjectedCounts().getCount(0) + ")"
931              + "\" shape=plaintext]\n");
932
933          text.append("N" + maxID + "->" + "N" + headerList.get(1).m_ID + "\n");
934          for (int i = 1; i < headerList.size() - 1; i++) {
935            text.append("N" + headerList.get(i).m_ID + "->" + "N" + headerList.get(i+1).m_ID + "\n");
936          }
937          maxID++;
938        }
939      }
940    }
941  }
942 
943  /**
944   * Class for storing and manipulating an association rule. Also has a utility
945   * routine for generating (by brute force) all the association rules that meet
946   * a given metric threshold from a list of large item sets.
947   *
948   * @author Mark Hall (mhall{[at]}pentaho{[dot]}com).
949   */
950  /**
951   * @author mhall
952   *
953   */
954  public static class AssociationRule implements Serializable, Comparable<AssociationRule> {
955   
956    /** For serialization */
957    private static final long serialVersionUID = -661269018702294489L;
958
959    /** Enum for holding different metric types */
960    public static enum METRIC_TYPE {
961      CONFIDENCE("conf") {
962        double compute(int premiseSupport, int consequenceSupport, 
963            int totalSupport, int totalTransactions) {
964         
965          return (double)totalSupport / (double)premiseSupport;
966        }
967      },
968      LIFT("lift") {
969        double compute(int premiseSupport, int consequenceSupport, 
970            int totalSupport, int totalTransactions) {
971         
972          double confidence = 
973            METRIC_TYPE.CONFIDENCE.compute(premiseSupport, consequenceSupport, 
974                totalSupport, totalTransactions);
975          return confidence / ((double)consequenceSupport /
976              (double)totalTransactions);
977        }
978      },
979      LEVERAGE("lev") {
980        double compute(int premiseSupport, int consequenceSupport, 
981            int totalSupport, int totalTransactions) {
982         
983          double coverageForItemSet = (double)totalSupport /
984            (double)totalTransactions;
985          double expectedCoverageIfIndependent = 
986            ((double)premiseSupport / (double)totalTransactions) *
987            ((double)consequenceSupport / (double)totalTransactions);
988          return coverageForItemSet - expectedCoverageIfIndependent;
989        }
990      },
991      CONVICTION("conv") {
992        double compute(int premiseSupport, int consequenceSupport, 
993            int totalSupport, int totalTransactions) {
994         
995          double num = 
996            (double)premiseSupport * (double)(totalTransactions - consequenceSupport) /
997            (double)totalTransactions;
998          double denom = premiseSupport - totalSupport + 1;
999          return num / denom;
1000        }
1001      };
1002     
1003      private final String m_stringVal;
1004      METRIC_TYPE(String name) {
1005        m_stringVal = name;
1006      }
1007     
1008      abstract double compute(int premiseSupport, int consequenceSupport, 
1009          int totalSupport, int totalTransactions);
1010     
1011      public String toString() {
1012        return m_stringVal;
1013      }
1014     
1015      public String toStringMetric(int premiseSupport, int consequenceSupport,
1016          int totalSupport, int totalTransactions) {
1017        return m_stringVal + ":(" + Utils.doubleToString(compute(premiseSupport, consequenceSupport,
1018            totalSupport, totalTransactions), 2) + ")";
1019      }
1020    }
1021   
1022    /** Tags for display in the GUI */
1023    public static final Tag[] TAGS_SELECTION = {
1024      new Tag(METRIC_TYPE.CONFIDENCE.ordinal(), "Confidence"),
1025      new Tag(METRIC_TYPE.LIFT.ordinal(), "Lift"),
1026      new Tag(METRIC_TYPE.LEVERAGE.ordinal(), "Leverage"),
1027      new Tag(METRIC_TYPE.CONVICTION.ordinal(), "Conviction")
1028    };
1029   
1030    /** The metric type for this rule */
1031    protected METRIC_TYPE m_metricType = METRIC_TYPE.CONFIDENCE;
1032   
1033    /** The premise of the rule */
1034    protected Collection<BinaryItem> m_premise;
1035   
1036    /** The consequence of the rule */
1037    protected Collection<BinaryItem> m_consequence;
1038   
1039    /** The support for the premise */
1040    protected int m_premiseSupport;
1041   
1042    /** The support for the consequence */
1043    protected int m_consequenceSupport;
1044   
1045    /** The total support for the item set (premise + consequence) */
1046    protected int m_totalSupport;
1047   
1048    /** The total number of transactions in the data */
1049    protected int m_totalTransactions;
1050   
1051    /**
1052     * Construct a new association rule.
1053     *
1054     * @param premise the premise of the rule
1055     * @param consequence the consequence of the rule
1056     * @param metric the metric for the rule
1057     * @param premiseSupport the support of the premise
1058     * @param consequenceSupport the support of the consequence
1059     * @param totalSupport the total support of the rule
1060     * @param totalTransactions the number of transactions in the data
1061     */
1062    public AssociationRule(Collection<BinaryItem> premise, 
1063        Collection<BinaryItem> consequence, METRIC_TYPE metric,
1064        int premiseSupport, int consequenceSupport,
1065        int totalSupport, int totalTransactions) {
1066      m_premise = premise;
1067      m_consequence = consequence;
1068      m_metricType = metric;
1069      m_premiseSupport = premiseSupport;
1070      m_consequenceSupport = consequenceSupport;
1071      m_totalSupport = totalSupport;
1072      m_totalTransactions = totalTransactions;
1073    }
1074   
1075    /**
1076     * Get the premise of this rule.
1077     *
1078     * @return the premise of this rule.
1079     */
1080    public Collection<BinaryItem> getPremise() {
1081      return m_premise;
1082    }
1083   
1084    /**
1085     * Get the consequence of this rule.
1086     *
1087     * @return the consequence of this rule.
1088     */
1089    public Collection<BinaryItem> getConsequence() {
1090      return m_consequence;
1091    }
1092   
1093    /**
1094     * Get the metric type of this rule (e.g. confidence).
1095     *
1096     * @return the metric type of this rule.
1097     */
1098    public METRIC_TYPE getMetricType() {
1099      return m_metricType;
1100    }
1101   
1102    /**
1103     * Get the value of the metric for this rule.
1104     *
1105     * @return the value of the metric for this rule.
1106     */
1107    public double getMetricValue() {
1108      return m_metricType.compute(m_premiseSupport, m_consequenceSupport, 
1109          m_totalSupport, m_totalTransactions);
1110    }
1111   
1112    /**
1113     * Get the support for the premise.
1114     *
1115     * @return the support for the premise.
1116     */
1117    public int getPremiseSupport() {
1118      return m_premiseSupport;
1119    }
1120   
1121    /**
1122     * Get the support for the consequence.
1123     *
1124     * @return the support for the consequence.
1125     */
1126    public int getConsequenceSupport() {
1127      return m_consequenceSupport;
1128    }
1129   
1130    /**
1131     * Get the total support for this rule (premise + consequence).
1132     *
1133     * @return the total support for this rule.
1134     */
1135    public int getTotalSupport() {
1136      return m_totalSupport;
1137    }
1138   
1139    /**
1140     * Get the total number of transactions in the data.
1141     *
1142     * @return the total number of transactions in the data.
1143     */
1144    public int getTotalTransactions() {
1145      return m_totalTransactions;
1146    }
1147   
1148    /**
1149     * Compare this rule to the supplied rule.
1150     *
1151     * @param other the rule to compare to.
1152     * @return the result of the comparison.
1153     */
1154    public int compareTo(AssociationRule other) {
1155      return -Double.compare(getMetricValue(), other.getMetricValue());
1156    }
1157   
1158    /**
1159     * Return true if this rule is equal to the supplied one.
1160     *
1161     * @return true if this rule is the same as the supplied rule.
1162     */
1163    public boolean equals(Object other) {
1164      if (!(other instanceof AssociationRule)) {
1165        return false;
1166      }
1167     
1168      AssociationRule otherRule = (AssociationRule)other;
1169      boolean result = m_premise.equals(otherRule.getPremise()) &&
1170        m_consequence.equals(otherRule.getConsequence()) && 
1171        (getMetricValue() == otherRule.getMetricValue());
1172     
1173      return result;
1174    }
1175   
1176    /**
1177     * Get a textual description of this rule.
1178     *
1179     * @return a textual description of this rule.
1180     */
1181    public String toString() {
1182      StringBuffer result = new StringBuffer();
1183     
1184      result.append(m_premise.toString() + ": " + m_premiseSupport
1185          + " ==> " + m_consequence.toString() + ": " + m_totalSupport
1186          + "   ");
1187      for (METRIC_TYPE m : METRIC_TYPE.values()) {
1188        if (m.equals(m_metricType)) {
1189          result.append("<" + 
1190              m.toStringMetric(m_premiseSupport, m_consequenceSupport, 
1191                  m_totalSupport, m_totalTransactions) + "> ");
1192        } else {
1193          result.append("" + 
1194              m.toStringMetric(m_premiseSupport, m_consequenceSupport, 
1195                  m_totalSupport, m_totalTransactions) + " ");
1196        }
1197      }
1198      return result.toString();
1199    }
1200   
1201    private static void nextSubset(boolean[] subset) {
1202      for (int i = 0; i < subset.length; i++) {
1203        if (!subset[i]) {
1204          subset[i] = true;
1205          break;
1206        } else {
1207          subset[i] = false;
1208        }
1209      }
1210    }
1211   
1212    private static Collection<BinaryItem> getPremise(FrequentBinaryItemSet fis, 
1213        boolean[] subset) {
1214      boolean ok = false;
1215      for (int i = 0; i < subset.length; i++){
1216        if (!subset[i]) {
1217          ok = true;
1218          break;
1219        }
1220      }     
1221     
1222      if (!ok) {
1223        return null;
1224      }
1225     
1226      List<BinaryItem> premise = new ArrayList<BinaryItem>();
1227      ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(fis.getItems());
1228
1229     
1230      for (int i = 0; i < subset.length; i++) {
1231        if (subset[i]) {
1232          premise.add(items.get(i));
1233        }
1234      }
1235      return premise;
1236    }
1237   
1238    private static Collection<BinaryItem> getConsequence(FrequentBinaryItemSet fis,
1239        boolean[] subset) {
1240      List<BinaryItem> consequence = new ArrayList<BinaryItem>();
1241      ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(fis.getItems());
1242     
1243      for (int i = 0; i < subset.length; i++) {
1244        if (!subset[i]) {
1245          consequence.add(items.get(i));
1246        }
1247      }
1248      return consequence;
1249    }
1250   
1251   
1252    /**
1253     * Generate all association rules, from the supplied frequet item sets,
1254     * that meet a given minimum metric threshold. Uses a brute force approach.
1255     *
1256     * @param largeItemSets the set of frequent item sets
1257     * @param metricToUse the metric to use
1258     * @param metricThreshold the threshold value that a rule must meet
1259     * @param totalTransactions the total number of transactions in the data
1260     * @return a list of association rules
1261     */
1262    public static List<AssociationRule> 
1263      generateRulesBruteForce(FrequentItemSets largeItemSets, METRIC_TYPE metricToUse, 
1264          double metricThreshold, int totalTransactions) {
1265     
1266      List<AssociationRule> rules = new ArrayList<AssociationRule>();
1267      largeItemSets.sort();
1268      Map<Collection<BinaryItem>, Integer> frequencyLookup =
1269        new HashMap<Collection<BinaryItem>, Integer>();
1270     
1271      Iterator<FrequentBinaryItemSet> setI = largeItemSets.iterator();
1272      // process each large item set
1273      while (setI.hasNext()) {
1274        FrequentBinaryItemSet fis = setI.next();
1275        frequencyLookup.put(fis.getItems(), fis.getSupport());
1276        if (fis.getItems().size() > 1) {
1277          // generate all the possible subsets for the premise
1278          boolean[] subset = new boolean[fis.getItems().size()];
1279          Collection<BinaryItem> premise = null;
1280          Collection<BinaryItem> consequence = null;
1281          while ((premise = getPremise(fis, subset)) != null) {
1282            if (premise.size() > 0 && premise.size() < fis.getItems().size()) {
1283              consequence = getConsequence(fis, subset);
1284              int totalSupport = fis.getSupport();
1285              int supportPremise = frequencyLookup.get(premise).intValue();
1286              int supportConsequence = frequencyLookup.get(consequence).intValue();
1287             
1288              // a candidate rule
1289              AssociationRule candidate = 
1290                new AssociationRule(premise, consequence, metricToUse, supportPremise,
1291                    supportConsequence, totalSupport, totalTransactions);
1292              if (candidate.getMetricValue() > metricThreshold) {
1293                // accept this rule
1294                rules.add(candidate);
1295              }             
1296            }
1297            nextSubset(subset);
1298          }
1299        }
1300      }
1301     
1302      return rules;
1303    }
1304  }
1305 
1306  /** The number of rules to find */
1307  protected int m_numRulesToFind = 10;
1308  //protected double m_upperBoundMinSupport = 0.36;
1309 
1310  /** The upper bound on the minimum support */
1311  protected double m_upperBoundMinSupport = 1.0;
1312 
1313  /** The lower bound on minimum support */
1314  protected double m_lowerBoundMinSupport = 0.1;
1315 
1316  /** The amount by which to decrease the support in each iteration */
1317  protected double m_delta = 0.05;
1318 
1319  /**
1320   * If true, just all rules meeting the lower bound on the minimum
1321   * support will be found. The number of rules to find will be
1322   * ignored and the iterative reduction of support will not
1323   * be done.
1324   */
1325  protected boolean m_findAllRulesForSupportLevel = false;
1326 
1327  //protected double m_lowerBoundMinSupport = 0.0;
1328 
1329  /** The index (1 based) of binary attributes to treat as the positive value */
1330  protected int m_positiveIndex = 2;
1331 
1332  protected AssociationRule.METRIC_TYPE m_metric = 
1333    AssociationRule.METRIC_TYPE.CONFIDENCE;
1334 
1335  protected double m_metricThreshold = 0.9;
1336 
1337  /** Holds the large item sets found */
1338  protected FrequentItemSets m_largeItemSets;
1339 
1340  /** Holds the rules */
1341  protected List<AssociationRule> m_rules;
1342 
1343  // maximum number of items in a large item set (zero means no limit)
1344  protected int m_maxItems = -1;
1345 
1346  /**
1347   * Returns default capabilities of the classifier.
1348   *
1349   * @return      the capabilities of this classifier
1350   */
1351  public Capabilities getCapabilities() {
1352    Capabilities result = super.getCapabilities();
1353    result.disableAll();
1354
1355    // enable what we can handle
1356   
1357    // attributes
1358    result.enable(Capability.UNARY_ATTRIBUTES);
1359    result.enable(Capability.BINARY_ATTRIBUTES);
1360    result.enable(Capability.MISSING_VALUES);
1361
1362    result.enable(Capability.NO_CLASS);
1363   
1364    return result;
1365  }
1366 
1367  /**
1368   * Returns a string describing this associator
1369   *
1370   * @return a description of the evaluator suitable for
1371   * displaying in the explorer/experimenter gui
1372   */
1373  public String globalInfo() {
1374    return "Class implementing the FP-growth algorithm for finding" +
1375                " large item sets without candidate generation. Iteratively" +
1376                " reduces the minimum support until it finds the required" +
1377                " number of rules with the given minimum metric." +
1378                " For more information see:\n\n" +
1379                getTechnicalInformation().toString();
1380  }
1381 
1382  /**
1383   * Returns an instance of a TechnicalInformation object, containing
1384   * detailed information about the technical background of this class,
1385   * e.g., paper reference or book this class is based on.
1386   *
1387   * @return the technical information about this class
1388   */
1389  public TechnicalInformation getTechnicalInformation() {
1390    TechnicalInformation        result;
1391   
1392    result = new TechnicalInformation(Type.INPROCEEDINGS);
1393    result.setValue(Field.AUTHOR, "J. Han and J.Pei and Y. Yin");
1394    result.setValue(Field.TITLE, "Mining frequent patterns without candidate generation");
1395    result.setValue(Field.BOOKTITLE, "Proceedings of the 2000 ACM-SIGMID International" +
1396                " Conference on Management of Data");
1397    result.setValue(Field.YEAR, "2000");
1398    result.setValue(Field.PAGES, "1-12");
1399   
1400    return result;
1401  }
1402 
1403 
1404  /**
1405   * Get the singleton items in the data
1406   *
1407   * @param data the Instances to process
1408   * @return a list of singleton item sets
1409   * @throws Exception if the singletons can't be found for some reason
1410   */
1411  protected ArrayList<BinaryItem> getSingletons(Instances data) throws Exception {
1412    ArrayList<BinaryItem> singletons = new ArrayList<BinaryItem>();
1413   
1414    for (int i = 0; i < data.numAttributes(); i++) {
1415      singletons.add(new BinaryItem(data.attribute(i), m_positiveIndex - 1));
1416    }
1417   
1418    for (int i = 0; i < data.numInstances(); i++) {
1419      Instance current = data.instance(i);
1420      if (current instanceof SparseInstance) {
1421        for (int j = 0; j < current.numValues(); j++) {
1422          int attIndex = current.index(j);
1423          singletons.get(attIndex).increaseFrequency();
1424        }
1425      } else {
1426        for (int j = 0; j < data.numAttributes(); j++) {
1427          if (!current.isMissing(j)) {
1428            if (current.attribute(j).numValues() == 1 
1429                || current.value(j) == m_positiveIndex - 1) {
1430              singletons.get(j).increaseFrequency();
1431            }
1432          }
1433        }
1434      }
1435    }
1436   
1437    return singletons;
1438  }
1439 
1440  /*protected ArrayList<BinaryItem> getFrequent(ArrayList<BinaryItem> items, int minSupport) {
1441    ArrayList<BinaryItem> frequent = new ArrayList<BinaryItem>();
1442    for (BinaryItem b : items) {
1443      if (b.getFrequency() > minSupport) {
1444        frequent.add(b);
1445      }
1446    }
1447   
1448    // sort in descending order of support
1449    Collections.sort(frequent);
1450    return frequent;
1451  } */
1452 
1453  /**
1454   * Construct the frequent pattern tree by inserting each transaction
1455   * in the data into the tree. Only those items from each transaction that
1456   * meet the minimum support threshold are inserted.
1457   *
1458   * @param singletons the singleton item sets
1459   * @param data the Instances containing the transactions
1460   * @param minSupport the minimum support
1461   * @return the root of the tree
1462   */
1463  protected FPTreeRoot buildFPTree(ArrayList<BinaryItem> singletons, 
1464      Instances data, int minSupport) {
1465   
1466    FPTreeRoot tree = new FPTreeRoot();
1467   
1468    for (int i = 0; i < data.numInstances(); i++) {
1469      Instance current = data.instance(i);
1470      ArrayList<BinaryItem> transaction = new ArrayList<BinaryItem>();
1471      if (current instanceof SparseInstance) {
1472        for (int j = 0; j < current.numValues(); j++) {
1473          int attIndex = current.index(j);
1474          if (singletons.get(attIndex).getFrequency() >= minSupport) {
1475            transaction.add(singletons.get(attIndex));
1476          }
1477        }
1478        Collections.sort(transaction);
1479        tree.addItemSet(transaction, 1);
1480      } else {
1481        for (int j = 0; j < data.numAttributes(); j++) {
1482          if (!current.isMissing(j)) {
1483            if (current.attribute(j).numValues() == 1 
1484                || current.value(j) == m_positiveIndex - 1) {
1485              if (singletons.get(j).getFrequency() >= minSupport) {
1486                transaction.add(singletons.get(j));
1487              }
1488            }
1489          }
1490        }
1491        Collections.sort(transaction);
1492        tree.addItemSet(transaction, 1);
1493      }
1494    }
1495   
1496    return tree;
1497  }
1498 
1499  /**
1500   * Find large item sets in the FP-tree.
1501   *
1502   * @param tree the root of the tree to mine
1503   * @param largeItemSets holds the large item sets found
1504   * @param recursionLevel the recursion level for the current projected
1505   * counts
1506   * @param conditionalItems the current set of items that the current
1507   * (projected) tree is conditional on
1508   * @param minSupport the minimum acceptable support
1509   */
1510  protected void mineTree(FPTreeRoot tree, FrequentItemSets largeItemSets, 
1511      int recursionLevel, FrequentBinaryItemSet conditionalItems, int minSupport) {
1512   
1513    if (!tree.isEmpty(recursionLevel)) {
1514      if (m_maxItems > 0 && recursionLevel >= m_maxItems) {
1515        // don't mine any further
1516        return;
1517      }
1518     
1519      Map<BinaryItem, FPTreeRoot.Header> headerTable = tree.getHeaderTable();
1520      Set<BinaryItem> keys = headerTable.keySet();
1521//      System.err.println("Number of freq item sets collected " + largeItemSets.size());
1522      Iterator<BinaryItem> i = keys.iterator();
1523      while (i.hasNext()) {
1524        BinaryItem item = i.next();
1525        FPTreeRoot.Header itemHeader = headerTable.get(item);
1526       
1527        // check for minimum support at this level
1528        int support = itemHeader.getProjectedCounts().getCount(recursionLevel);
1529        if (support >= minSupport) {         
1530          // process header list at this recursion level
1531          for (FPTreeNode n : itemHeader.getHeaderList()) {
1532            // push count up path to root
1533            int currentCount = n.getProjectedCount(recursionLevel);
1534            if (currentCount > 0) {                           
1535              FPTreeNode temp = n.getParent();
1536              while (temp != tree) {
1537                // set/increase for the node
1538                temp.increaseProjectedCount(recursionLevel + 1, currentCount);
1539
1540                // set/increase for the header table
1541                headerTable.get(temp.getItem()).
1542                getProjectedCounts().increaseCount(recursionLevel + 1, currentCount);
1543               
1544                temp = temp.getParent();
1545              }
1546            }
1547          }
1548         
1549          FrequentBinaryItemSet newConditional = 
1550            (FrequentBinaryItemSet) conditionalItems.clone();
1551         
1552          // this item gets added to the conditional items
1553          newConditional.addItem(item);
1554          newConditional.setSupport(support);
1555         
1556          // now add this conditional item set to the list of large item sets
1557          largeItemSets.addItemSet(newConditional);
1558         
1559          // now recursively process the new tree
1560          mineTree(tree, largeItemSets, recursionLevel + 1, newConditional,
1561              minSupport);
1562         
1563          // reverse the propagated counts
1564          for (FPTreeNode n : itemHeader.getHeaderList()) {
1565            FPTreeNode temp = n.getParent();
1566            while (temp != tree) {
1567              temp.removeProjectedCount(recursionLevel + 1);
1568              temp = temp.getParent();
1569            }
1570          }
1571         
1572          // reverse the propagated counts in the header list
1573          // at this recursion level
1574          for (FPTreeRoot.Header h : headerTable.values()) {
1575            h.getProjectedCounts().removeCount(recursionLevel + 1);
1576          }         
1577        }
1578      }
1579    }
1580  }
1581 
1582  /**
1583   * Construct a new FPGrowth object.
1584   */
1585  public FPGrowth() {
1586    resetOptions();
1587  }
1588 
1589  /**
1590   * Reset all options to their default values.
1591   */
1592  public void resetOptions() {
1593    m_delta = 0.05;
1594    m_metricThreshold = 0.9;
1595    m_numRulesToFind = 10;
1596    m_lowerBoundMinSupport = 0.1;
1597    m_upperBoundMinSupport = 1.0;
1598//    m_minSupport = -1;
1599    m_positiveIndex = 2;
1600  }
1601 
1602  /**
1603   * Tip text for this property suitable for displaying
1604   * in the GUI.
1605   *
1606   * @return the tip text for this property.
1607   */
1608  public String positiveIndexTipText() {
1609    return "Set the index of binary valued attributes that is to be considered" +
1610                " the positive index. Has no effect for sparse data (in this case" +
1611                " the first index (i.e. non-zero values) is always treated as " +
1612                " positive. Also has no effect for unary valued attributes (i.e." +
1613                " when using the Weka Apriori-style format for market basket data," +
1614                " which uses missing value \"?\" to indicate" +
1615                " absence of an item.";
1616  }
1617 
1618  /**
1619   * Set the index of the attribute value to consider as positive
1620   * for binary attributes in normal dense instances. Index 1 is always
1621   * used for sparse instances.
1622   *
1623   * @param index the index to use for positive values in binary attributes.
1624   */
1625  public void setPositiveIndex(int index) {
1626    m_positiveIndex = index;
1627  }
1628 
1629  /**
1630   * Get the index of the attribute value to consider as positive
1631   * for binary attributes in normal dense instances. Index 1 is always
1632   * used for sparse instances.
1633   *
1634   * @return the index to use for positive values in binary attributes.
1635   */
1636  public int getPositiveIndex() {
1637    return m_positiveIndex;
1638  }
1639 
1640  /**
1641   * Set the desired number of rules to find.
1642   *
1643   * @param numR the number of rules to find.
1644   */
1645  public void setNumRulesToFind(int numR) {
1646    m_numRulesToFind = numR;
1647  }
1648 
1649  /**
1650   * Get the number of rules to find.
1651   *
1652   * @return the number of rules to find.
1653   */
1654  public int getNumRulesToFind() {
1655    return m_numRulesToFind;
1656  }
1657 
1658  /**
1659   * Tip text for this property suitable for displaying
1660   * in the GUI.
1661   *
1662   * @return the tip text for this property.
1663   */
1664  public String numRulesToFindTipText() {
1665    return "The number of rules to output";
1666  }
1667 
1668  /**
1669   * Set the metric type to use.
1670   *
1671   * @param d the metric type
1672   */
1673  public void setMetricType(SelectedTag d) {
1674    int ordinal =  d.getSelectedTag().getID();
1675    for (AssociationRule.METRIC_TYPE m : AssociationRule.METRIC_TYPE.values()) {
1676      if (m.ordinal() == ordinal) {
1677        m_metric = m;
1678        break;
1679      }
1680    }
1681  }
1682 
1683  /**
1684   * Set the maximum number of items to include in large items sets.
1685   *
1686   * @param max the maxim number of items to include in large item sets.
1687   */
1688  public void setMaxNumberOfItems(int max) {
1689    m_maxItems = max;
1690  }
1691 
1692  /**
1693   * Gets the maximum number of items to be included in large item sets.
1694   *
1695   * @return the maximum number of items to be included in large items sets.
1696   */
1697  public int getMaxNumberOfItems() {
1698    return m_maxItems;
1699  }
1700 
1701  /**
1702   * Tip text for this property suitable for displaying
1703   * in the GUI.
1704   *
1705   * @return the tip text for this property.
1706   */
1707  public String maxNumberOfItemsTipText() {
1708    return "The maximum number of items to include in frequent item sets. -1 " +
1709                "means no limit.";
1710  }
1711 
1712  /**
1713   * Get the metric type to use.
1714   *
1715   * @return the metric type to use.
1716   */
1717  public SelectedTag getMetricType() {
1718    return new SelectedTag(m_metric.ordinal(), AssociationRule.TAGS_SELECTION);
1719  }
1720 
1721  /**
1722   * Tip text for this property suitable for displaying
1723   * in the GUI.
1724   *
1725   * @return the tip text for this property.
1726   */
1727  public String metricTypeTipText() {
1728    return "Set the type of metric by which to rank rules. Confidence is "
1729    +"the proportion of the examples covered by the premise that are also "
1730    +"covered by the consequence(Class association rules can only be mined using confidence). Lift is confidence divided by the "
1731    +"proportion of all examples that are covered by the consequence. This "
1732    +"is a measure of the importance of the association that is independent "
1733    +"of support. Leverage is the proportion of additional examples covered "
1734    +"by both the premise and consequence above those expected if the "
1735    +"premise and consequence were independent of each other. The total "
1736    +"number of examples that this represents is presented in brackets "
1737    +"following the leverage. Conviction is "
1738    +"another measure of departure from independence.";
1739  }
1740 
1741  /**
1742   * Returns the tip text for this property
1743   *
1744   * @return tip text for this property suitable for
1745   * displaying in the explorer/experimenter gui
1746   */
1747  public String minMetricTipText() {
1748    return "Minimum metric score. Consider only rules with scores higher than "
1749      +"this value.";
1750  }
1751
1752  /**
1753   * Get the value of minConfidence.
1754   *
1755   * @return Value of minConfidence.
1756   */
1757  public double getMinMetric() {
1758   
1759    return m_metricThreshold;
1760  }
1761 
1762  /**
1763   * Set the value of minConfidence.
1764   *
1765   * @param v  Value to assign to minConfidence.
1766   */
1767  public void setMinMetric(double v) {
1768   
1769    m_metricThreshold = v;
1770  }
1771 
1772  /**
1773   * Returns the tip text for this property
1774   * @return tip text for this property suitable for
1775   * displaying in the explorer/experimenter gui
1776   */
1777  public String deltaTipText() {
1778    return "Iteratively decrease support by this factor. Reduces support "
1779      +"until min support is reached or required number of rules has been "
1780      +"generated.";
1781  }
1782   
1783  /**
1784   * Get the value of delta.
1785   *
1786   * @return Value of delta.
1787   */
1788  public double getDelta() {
1789   
1790    return m_delta;
1791  }
1792 
1793  /**
1794   * Set the value of delta.
1795   *
1796   * @param v  Value to assign to delta.
1797   */
1798  public void setDelta(double v) {
1799   
1800    m_delta = v;
1801  }
1802 
1803  /**
1804   * Returns the tip text for this property
1805   * @return tip text for this property suitable for
1806   * displaying in the explorer/experimenter gui
1807   */
1808  public String lowerBoundMinSupportTipText() {
1809    return "Lower bound for minimum support.";
1810  }
1811
1812  /**
1813   * Get the value of lowerBoundMinSupport.
1814   *
1815   * @return Value of lowerBoundMinSupport.
1816   */
1817  public double getLowerBoundMinSupport() {
1818   
1819    return m_lowerBoundMinSupport;
1820  }
1821 
1822  /**
1823   * Set the value of lowerBoundMinSupport.
1824   *
1825   * @param v  Value to assign to lowerBoundMinSupport.
1826   */
1827  public void setLowerBoundMinSupport(double v) {
1828   
1829    m_lowerBoundMinSupport = v;
1830  }
1831 
1832  /**
1833   * Returns the tip text for this property
1834   * @return tip text for this property suitable for
1835   * displaying in the explorer/experimenter gui
1836   */
1837  public String upperBoundMinSupportTipText() {
1838    return "Upper bound for minimum support. Start iteratively decreasing "
1839      +"minimum support from this value.";
1840  }
1841
1842  /**
1843   * Get the value of upperBoundMinSupport.
1844   *
1845   * @return Value of upperBoundMinSupport.
1846   */
1847  public double getUpperBoundMinSupport() {
1848   
1849    return m_upperBoundMinSupport;
1850  }
1851 
1852  /**
1853   * Set the value of upperBoundMinSupport.
1854   *
1855   * @param v  Value to assign to upperBoundMinSupport.
1856   */
1857  public void setUpperBoundMinSupport(double v) {
1858   
1859    m_upperBoundMinSupport = v;
1860  }
1861 
1862  /**
1863   * Tip text for this property suitable for displaying
1864   * in the GUI.
1865   *
1866   * @return the tip text for this property.
1867   */
1868  public String findAllRulesForSupportLevelTipText() {
1869    return "Find all rules that meet " +
1870    "the lower bound on minimum support and the minimum metric constraint. " +
1871    "Turning this mode on will disable the iterative support reduction " +
1872    "procedure to find the specified number of rules.";
1873  }
1874 
1875  /**
1876   * If true then turn off the iterative support reduction method
1877   * of finding x rules that meet the minimum support and metric
1878   * thresholds and just return all the rules that meet the
1879   * lower bound on minimum support and the minimum metric.
1880   *
1881   * @param s true if all rules meeting the lower bound on the support
1882   * and minimum metric thresholds are to be found.
1883   */
1884  public void setFindAllRulesForSupportLevel(boolean s) {
1885    m_findAllRulesForSupportLevel = s;
1886  }
1887 
1888  /**
1889   * Get whether all rules meeting the lower bound on min support
1890   * and the minimum metric threshold are to be found.
1891   *
1892   * @return true if all rules meeting the lower bound on min
1893   * support and the min metric threshold are to be found.
1894   */
1895  public boolean getFindAllRulesForSupportLevel() {
1896    return m_findAllRulesForSupportLevel;
1897  }
1898 
1899  /* public void setMinimumSupport(double minSupp) {
1900    m_minSupport = minSupp;
1901  }
1902 
1903  public double getMinimumSupport() {
1904    return m_minSupport;
1905  } */   
1906 
1907  /**
1908   * Gets the list of mined association rules.
1909   *
1910   * @return the list of association rules discovered during mining.
1911   * Returns null if mining hasn't been performed yet.
1912   */
1913  public List<AssociationRule> getAssociationRules() {
1914    return m_rules;
1915  }
1916 
1917  /**
1918   * Returns an enumeration describing the available options.
1919   *
1920   * @return an enumeration of all the available options.
1921   */
1922  public Enumeration<Option> listOptions() {
1923    Vector<Option> newVector = new Vector<Option>();
1924   
1925    String string00 = "\tSet the index of the attribute value to consider as 'positive'\n\t"
1926   + "for binary attributes in normal dense instances. Index 2 is always\n\t"
1927   + "used for sparse instances. (default = 2)";   
1928    String string0 = "\tThe maximum number of items to include " +
1929                "in large items sets (and rules). (default " +
1930                "= -1, i.e. no limit.)"; 
1931     
1932    String string1 = "\tThe required number of rules. (default = " 
1933      + m_numRulesToFind + ")";
1934    String string2 = "\tThe minimum metric score of a rule. (default" +
1935                " = " + m_metricThreshold + ")";
1936    String string3 = "\tThe metric by which to rank rules. (default"
1937      + " = confidence)";
1938    String string4 = "\tThe lower bound for the minimum support. (default = "
1939      + m_lowerBoundMinSupport + ")";
1940    String string5 = "\tUpper bound for minimum support. "
1941      + "(default = 1.0)";
1942    String string6 = "\tThe delta by which the minimum support is decreased in\n"
1943     + "\teach iteration. (default = " + m_delta + ")";
1944    String string7 = "\tFind all rules that meet the lower bound on\n\t" +
1945                "minimum support and the minimum metric constraint.\n\t" +
1946                "Turning this mode on will disable the iterative support reduction\n\t" +
1947                "procedure to find the specified number of rules.";
1948   
1949    newVector.add(new Option(string00, "P", 1, "-P <attribute index of positive value>"));
1950    newVector.add(new Option(string0, "I", 1, "-I <max items>"));
1951    newVector.add(new Option(string1, "N", 1, "-N <require number of rules>"));
1952    newVector.add(new Option(string3, "T", 1, "-T <0=confidence | 1=lift | "
1953                                    + "2=leverage | 3=Conviction>"));
1954    newVector.add(new Option(string2, "C", 1, "-C <minimum metric score of a rule>"));
1955    newVector.add(new Option(string5, "U", 1, "-U <upper bound for minimum support>"));
1956    newVector.add(new Option(string4, "M", 1, "-M <lower bound for minimum support>"));
1957    newVector.add(new Option(string6, "D", 1, "-D <delta for minimum support>"));
1958    newVector.add(new Option(string7, "S", 0, "-S"));
1959   
1960    return newVector.elements();
1961  }
1962 
1963  /**
1964   *
1965   * Parses a given list of options. <p/>
1966   *
1967   <!-- options-start -->
1968   * Valid options are: <p/>
1969   *
1970   * <pre> -P &lt;attribute index of positive value&gt;
1971   *  Set the index of the attribute value to consider as 'positive'
1972   *  for binary attributes in normal dense instances. Index 2 is always
1973   *  used for sparse instances. (default = 2)</pre>
1974   *
1975   * <pre> -I &lt;max items&gt;
1976   *  The maximum number of items to include in large items sets (and rules). (default = -1, i.e. no limit.)</pre>
1977   *
1978   * <pre> -N &lt;require number of rules&gt;
1979   *  The required number of rules. (default = 10)</pre>
1980   *
1981   * <pre> -T &lt;0=confidence | 1=lift | 2=leverage | 3=Conviction&gt;
1982   *  The metric by which to rank rules. (default = confidence)</pre>
1983   *
1984   * <pre> -C &lt;minimum metric score of a rule&gt;
1985   *  The minimum metric score of a rule. (default = 0.9)</pre>
1986   *
1987   * <pre> -U &lt;upper bound for minimum support&gt;
1988   *  Upper bound for minimum support. (default = 1.0)</pre>
1989   *
1990   * <pre> -M &lt;lower bound for minimum support&gt;
1991   *  The lower bound for the minimum support. (default = 0.1)</pre>
1992   *
1993   * <pre> -D &lt;delta for minimum support&gt;
1994   *  The delta by which the minimum support is decreased in
1995   *  each iteration. (default = 0.05)</pre>
1996   *
1997   * <pre> -S
1998   *  Find all rules that meet the lower bound on
1999   *  minimum support and the minimum metric constraint.
2000   *  Turning this mode on will disable the iterative support reduction
2001   *  procedure to find the specified number of rules.</pre>
2002   *
2003   <!-- options-end -->
2004   *
2005   * @param options the list of options as an array of strings
2006   * @throws Exception if an option is not supported
2007   */
2008  public void setOptions(String[] options) throws Exception {
2009    resetOptions();
2010    String positiveIndexString = Utils.getOption('P', options);
2011    String maxItemsString = Utils.getOption('I', options);
2012    String numRulesString = Utils.getOption('N', options);
2013    String minMetricString = Utils.getOption('C', options);
2014    String metricTypeString = Utils.getOption("T", options);
2015    String lowerBoundSupportString = Utils.getOption("M", options);
2016    String upperBoundSupportString = Utils.getOption("U", options);
2017    String deltaString = Utils.getOption("D", options);
2018
2019    if (positiveIndexString.length() != 0) {
2020      setPositiveIndex(Integer.parseInt(positiveIndexString));
2021    }
2022   
2023    if (maxItemsString.length() != 0) {
2024      setMaxNumberOfItems(Integer.parseInt(maxItemsString));
2025    }
2026   
2027    if (metricTypeString.length() != 0) {
2028      setMetricType(new SelectedTag(Integer.parseInt(metricTypeString),
2029          AssociationRule.TAGS_SELECTION));
2030    }
2031   
2032    if (numRulesString.length() != 0) {
2033      setNumRulesToFind(Integer.parseInt(numRulesString));
2034    }
2035   
2036    if (minMetricString.length() != 0) {
2037      setMinMetric(Double.parseDouble(minMetricString));
2038    }
2039   
2040    if (deltaString.length() != 0) {
2041      setDelta(Double.parseDouble(deltaString));
2042    }
2043   
2044    if (lowerBoundSupportString.length() != 0) {
2045      setLowerBoundMinSupport(Double.parseDouble(lowerBoundSupportString));
2046    }
2047   
2048    if (upperBoundSupportString.length() != 0) {
2049      setUpperBoundMinSupport(Double.parseDouble(upperBoundSupportString));
2050    }
2051   
2052    setFindAllRulesForSupportLevel(Utils.getFlag('S', options));
2053  }
2054 
2055  /**
2056   * Gets the current settings of the classifier.
2057   *
2058   * @return an array of strings suitable for passing to setOptions
2059   */
2060  public String[] getOptions() {
2061    ArrayList<String> options = new ArrayList<String>();
2062   
2063    options.add("-P"); options.add("" + getPositiveIndex());
2064    options.add("-I"); options.add("" + getMaxNumberOfItems());
2065    options.add("-N"); options.add("" + getNumRulesToFind());
2066    options.add("-T"); options.add("" + getMetricType().getSelectedTag().getID());
2067    options.add("-C"); options.add("" + getMinMetric());
2068    options.add("-D"); options.add("" + getDelta());
2069    options.add("-U"); options.add("" + getUpperBoundMinSupport());
2070    options.add("-M"); options.add("" + getLowerBoundMinSupport());
2071    if (getFindAllRulesForSupportLevel()) {
2072      options.add("-S");
2073    }
2074   
2075    return options.toArray(new String[1]);
2076  }
2077
2078  /**
2079   * Method that generates all large item sets with a minimum support, and from
2080   * these all association rules with a minimum metric (i.e. confidence,
2081   * lift etc.).
2082   *
2083   * @param data the instances to be used for generating the associations
2084   * @throws Exception if rules can't be built successfully
2085   */
2086  public void buildAssociations(Instances data) throws Exception {
2087   
2088    // can we handle the data?
2089    getCapabilities().testWithFail(data);
2090   
2091    double currentSupport = m_upperBoundMinSupport;
2092   
2093    if (m_findAllRulesForSupportLevel) {
2094      currentSupport = m_lowerBoundMinSupport;
2095    }
2096    // first compute singletons
2097    ArrayList<BinaryItem> singletons = getSingletons(data);
2098    //ArrayList<BinaryItem> singletonsCopy = new ArrayList<BinaryItem>(singletons);
2099/*    Collections.sort(singletonsCopy);
2100    for (int i = 0; i < singletonsCopy.size(); i++) {
2101      System.out.println(singletonsCopy.get(i).toString(true));
2102    }
2103    System.out.println("---------"); */
2104//    System.out.println("Finished finding singletons...");
2105   
2106    // while not enough rules
2107    do {
2108      int currentSupportAsInstances = (currentSupport > 1)
2109      ? (int)currentSupport
2110      : (int)Math.ceil(currentSupport * data.numInstances());
2111     
2112      //System.err.println("Current support " + currentSupportAsInstances);
2113      //ArrayList<BinaryItem> prunedSingletons = removeNonFrequent(singletons);
2114
2115      // build the FPTree
2116      FPTreeRoot tree = buildFPTree(singletons, data, currentSupportAsInstances);
2117//      System.out.println("Finished building tree...");
2118//      System.out.println(tree.toString(0));
2119    /*System.out.println(tree.printHeaderTable(0)); */
2120
2121      FrequentItemSets largeItemSets = new FrequentItemSets(data.numInstances());
2122
2123      // mine the tree
2124      FrequentBinaryItemSet conditionalItems = 
2125        new FrequentBinaryItemSet(new ArrayList<BinaryItem>(), 0);
2126      mineTree(tree, largeItemSets, 0, conditionalItems, currentSupportAsInstances);
2127
2128      m_largeItemSets = largeItemSets;
2129//         System.err.println("Number of large item sets: " + m_largeItemSets.size());
2130  //    System.err.println(m_largeItemSets.toString(100));
2131
2132      //    m_largeItemSets.sort(compF);
2133//      System.err.println("Finished mining tree...");
2134     
2135      // save memory
2136      tree = null;
2137     
2138      m_rules = 
2139        AssociationRule.generateRulesBruteForce(m_largeItemSets, m_metric, 
2140            m_metricThreshold, data.numInstances());
2141     
2142      if (!m_findAllRulesForSupportLevel) {
2143        currentSupport -= m_delta;
2144        if (currentSupport < m_lowerBoundMinSupport) {
2145          if (currentSupport + m_delta > m_lowerBoundMinSupport) {
2146            // ensure that the lower bound does get evaluated
2147            currentSupport = m_lowerBoundMinSupport;
2148          } else {
2149            break;
2150          }
2151        }
2152      } else {
2153        // just break out of the loop as we are just finding all rules
2154        // with a minimum support + metric
2155        break;
2156      }
2157    } while (m_rules.size() < m_numRulesToFind);
2158   
2159    Collections.sort(m_rules);
2160//    for (AssociationRule)
2161   
2162//    System.out.println(graph(tree));
2163  }
2164   
2165  /**
2166   * Output the association rules.
2167   *
2168   * @return a string representation of the model.
2169   */
2170  public String toString() {
2171//    return m_largeItemSets.toString(m_numItemSetsToFind);
2172    if (m_rules == null) {
2173      return "FPGrowth hasn't been trained yet!";
2174    }
2175
2176    StringBuffer result = new StringBuffer();
2177    int numRules = (m_rules.size() < m_numRulesToFind)
2178      ? m_rules.size()
2179      : m_numRulesToFind;
2180     
2181    if (m_rules.size() == 0) {
2182      return "No rules found!";
2183    } else {     
2184      result.append("FPGrowth found " + m_rules.size() + " rules");
2185      if (!m_findAllRulesForSupportLevel) {
2186        result.append(" (displaying top " + numRules + ")");
2187      }
2188      result.append("\n\n");
2189    }
2190
2191    int count = 0;
2192    for (AssociationRule r : m_rules) {
2193      result.append(Utils.doubleToString((double)count+1,
2194          (int)(Math.log(numRules)/Math.log(10)+1), 0) + ". ");
2195      result.append(r + "\n");
2196      count++;
2197      if (!m_findAllRulesForSupportLevel && count == m_numRulesToFind) {
2198        break;
2199      }
2200    }
2201    return result.toString();
2202  }
2203 
2204  /**
2205   * Assemble a dot graph representation of the FP-tree.
2206   *
2207   * @param tree the root of the FP-tree
2208   * @return a graph representation as a String in dot format.
2209   */
2210  public String graph(FPTreeRoot tree) {
2211    //int maxID = tree.assignIDs(-1);
2212   
2213   
2214    StringBuffer text = new StringBuffer();
2215    text.append("digraph FPTree {\n");
2216    text.append("N0 [label=\"ROOT\"]\n");
2217    tree.graphFPTree(text);
2218   
2219//    tree.graphHeaderTable(text, maxID+1);
2220    text.append("}\n");
2221   
2222    return text.toString();
2223  }
2224 
2225  /**
2226   * Returns the revision string.
2227   *
2228   * @return            the revision
2229   */
2230  public String getRevision() {
2231    return RevisionUtils.extract("$Revision: 6158 $");
2232  }
2233 
2234  /**
2235   * Main method.
2236   *
2237   * @param args the commandline options
2238   */
2239  public static void main(String[] args) {
2240    runAssociator(new FPGrowth(), args);
2241  }
2242}
2243
Note: See TracBrowser for help on using the repository browser.