[29] | 1 | /* |
---|
| 2 | * This program is free software; you can redistribute it and/or modify |
---|
| 3 | * it under the terms of the GNU General Public License as published by |
---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
---|
| 5 | * (at your option) any later version. |
---|
| 6 | * |
---|
| 7 | * This program is distributed in the hope that it will be useful, |
---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 10 | * GNU General Public License for more details. |
---|
| 11 | * |
---|
| 12 | * You should have received a copy of the GNU General Public License |
---|
| 13 | * along with this program; if not, write to the Free Software |
---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
| 15 | */ |
---|
| 16 | |
---|
| 17 | /* |
---|
| 18 | * FPGrowth.java |
---|
| 19 | * Copyright (C) 2009 University of Waikato, Hamilton, New Zealand |
---|
| 20 | * |
---|
| 21 | */ |
---|
| 22 | |
---|
| 23 | package weka.associations; |
---|
| 24 | |
---|
| 25 | import java.io.Serializable; |
---|
| 26 | import java.util.ArrayList; |
---|
| 27 | import java.util.Collection; |
---|
| 28 | import java.util.Collections; |
---|
| 29 | import java.util.Comparator; |
---|
| 30 | import java.util.Enumeration; |
---|
| 31 | import java.util.HashMap; |
---|
| 32 | import java.util.Iterator; |
---|
| 33 | import java.util.LinkedList; |
---|
| 34 | import java.util.List; |
---|
| 35 | import java.util.Map; |
---|
| 36 | import java.util.Set; |
---|
| 37 | import java.util.Vector; |
---|
| 38 | |
---|
| 39 | import weka.core.Attribute; |
---|
| 40 | import weka.core.Capabilities; |
---|
| 41 | import weka.core.Instance; |
---|
| 42 | import weka.core.Instances; |
---|
| 43 | import weka.core.Option; |
---|
| 44 | import weka.core.OptionHandler; |
---|
| 45 | import weka.core.RevisionUtils; |
---|
| 46 | import weka.core.SelectedTag; |
---|
| 47 | import weka.core.SparseInstance; |
---|
| 48 | import weka.core.Tag; |
---|
| 49 | import weka.core.TechnicalInformation; |
---|
| 50 | import weka.core.TechnicalInformationHandler; |
---|
| 51 | import weka.core.Utils; |
---|
| 52 | import weka.core.Capabilities.Capability; |
---|
| 53 | import weka.core.TechnicalInformation.Field; |
---|
| 54 | import weka.core.TechnicalInformation.Type; |
---|
| 55 | |
---|
| 56 | /** |
---|
| 57 | <!-- globalinfo-start --> |
---|
| 58 | * Class implementing the FP-growth algorithm for finding large item sets without candidate generation. Iteratively reduces the minimum support until it finds the required number of rules with the given minimum metric. For more information see:<br/> |
---|
| 59 | * <br/> |
---|
| 60 | * J. Han, J.Pei, Y. Yin: Mining frequent patterns without candidate generation. In: Proceedings of the 2000 ACM-SIGMID International Conference on Management of Data, 1-12, 2000. |
---|
| 61 | * <p/> |
---|
| 62 | <!-- globalinfo-end --> |
---|
| 63 | * |
---|
| 64 | <!-- technical-bibtex-start --> |
---|
| 65 | * BibTeX: |
---|
| 66 | * <pre> |
---|
| 67 | * @inproceedings{Han2000, |
---|
| 68 | * author = {J. Han and J.Pei and Y. Yin}, |
---|
| 69 | * booktitle = {Proceedings of the 2000 ACM-SIGMID International Conference on Management of Data}, |
---|
| 70 | * pages = {1-12}, |
---|
| 71 | * title = {Mining frequent patterns without candidate generation}, |
---|
| 72 | * year = {2000} |
---|
| 73 | * } |
---|
| 74 | * </pre> |
---|
| 75 | * <p/> |
---|
| 76 | <!-- technical-bibtex-end --> |
---|
| 77 | * |
---|
| 78 | <!-- options-start --> |
---|
| 79 | * Valid options are: <p/> |
---|
| 80 | * |
---|
| 81 | * <pre> -P <attribute index of positive value> |
---|
| 82 | * Set the index of the attribute value to consider as 'positive' |
---|
| 83 | * for binary attributes in normal dense instances. Index 2 is always |
---|
| 84 | * used for sparse instances. (default = 2)</pre> |
---|
| 85 | * |
---|
| 86 | * <pre> -I <max items> |
---|
| 87 | * The maximum number of items to include in large items sets (and rules). (default = -1, i.e. no limit.)</pre> |
---|
| 88 | * |
---|
| 89 | * <pre> -N <require number of rules> |
---|
| 90 | * The required number of rules. (default = 10)</pre> |
---|
| 91 | * |
---|
| 92 | * <pre> -T <0=confidence | 1=lift | 2=leverage | 3=Conviction> |
---|
| 93 | * The metric by which to rank rules. (default = confidence)</pre> |
---|
| 94 | * |
---|
| 95 | * <pre> -C <minimum metric score of a rule> |
---|
| 96 | * The minimum metric score of a rule. (default = 0.9)</pre> |
---|
| 97 | * |
---|
| 98 | * <pre> -U <upper bound for minimum support> |
---|
| 99 | * Upper bound for minimum support. (default = 1.0)</pre> |
---|
| 100 | * |
---|
| 101 | * <pre> -M <lower bound for minimum support> |
---|
| 102 | * The lower bound for the minimum support. (default = 0.1)</pre> |
---|
| 103 | * |
---|
| 104 | * <pre> -D <delta for minimum support> |
---|
| 105 | * The delta by which the minimum support is decreased in |
---|
| 106 | * each iteration. (default = 0.05)</pre> |
---|
| 107 | * |
---|
| 108 | * <pre> -S |
---|
| 109 | * Find all rules that meet the lower bound on |
---|
| 110 | * minimum support and the minimum metric constraint. |
---|
| 111 | * Turning this mode on will disable the iterative support reduction |
---|
| 112 | * procedure to find the specified number of rules.</pre> |
---|
| 113 | * |
---|
| 114 | <!-- options-end --> |
---|
| 115 | * |
---|
| 116 | * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) |
---|
| 117 | * @version $Revision: 6158 $ |
---|
| 118 | */ |
---|
| 119 | public class FPGrowth extends AbstractAssociator |
---|
| 120 | implements OptionHandler, TechnicalInformationHandler { |
---|
| 121 | |
---|
| 122 | /** For serialization */ |
---|
| 123 | private static final long serialVersionUID = 3620717108603442911L; |
---|
| 124 | |
---|
| 125 | /** |
---|
| 126 | * Inner class that handles a single binary item |
---|
| 127 | */ |
---|
| 128 | public static class BinaryItem implements Serializable, Comparable<BinaryItem> { |
---|
| 129 | |
---|
| 130 | /** For serialization */ |
---|
| 131 | private static final long serialVersionUID = -3372941834914147669L; |
---|
| 132 | |
---|
| 133 | /** The frequency of the item */ |
---|
| 134 | protected int m_frequency; |
---|
| 135 | |
---|
| 136 | /** The attribute that the item corresponds to */ |
---|
| 137 | protected Attribute m_attribute; |
---|
| 138 | |
---|
| 139 | /** The index of the value considered to be positive */ |
---|
| 140 | protected int m_valueIndex; |
---|
| 141 | |
---|
| 142 | public BinaryItem(Attribute att, int valueIndex) throws Exception { |
---|
| 143 | if (att.isNumeric() || (att.isNominal() && att.numValues() > 2)) { |
---|
| 144 | throw new Exception("BinaryItem must be constructed using a nominal attribute" + |
---|
| 145 | " with at most 2 values!"); |
---|
| 146 | } |
---|
| 147 | m_attribute = att; |
---|
| 148 | if (m_attribute.numValues() == 1) { |
---|
| 149 | m_valueIndex = 0; // unary attribute (? used to indicate absence from a basket) |
---|
| 150 | } else { |
---|
| 151 | m_valueIndex = valueIndex; |
---|
| 152 | } |
---|
| 153 | } |
---|
| 154 | |
---|
| 155 | /** |
---|
| 156 | * Increase the frequency of this item. |
---|
| 157 | * |
---|
| 158 | * @param f the amount to increase the frequency by. |
---|
| 159 | */ |
---|
| 160 | public void increaseFrequency(int f) { |
---|
| 161 | m_frequency += f; |
---|
| 162 | } |
---|
| 163 | |
---|
| 164 | /** |
---|
| 165 | * Decrease the frequency of this item. |
---|
| 166 | * |
---|
| 167 | * @param f the amount by which to decrease the frequency. |
---|
| 168 | */ |
---|
| 169 | public void decreaseFrequency(int f) { |
---|
| 170 | m_frequency -= f; |
---|
| 171 | } |
---|
| 172 | |
---|
| 173 | /** |
---|
| 174 | * Increment the frequency of this item. |
---|
| 175 | */ |
---|
| 176 | public void increaseFrequency() { |
---|
| 177 | m_frequency++; |
---|
| 178 | } |
---|
| 179 | |
---|
| 180 | /** |
---|
| 181 | * Decrement the frequency of this item. |
---|
| 182 | */ |
---|
| 183 | public void decreaseFrequency() { |
---|
| 184 | m_frequency--; |
---|
| 185 | } |
---|
| 186 | |
---|
| 187 | /** |
---|
| 188 | * Get the frequency of this item. |
---|
| 189 | * |
---|
| 190 | * @return the frequency. |
---|
| 191 | */ |
---|
| 192 | public int getFrequency() { |
---|
| 193 | return m_frequency; |
---|
| 194 | } |
---|
| 195 | |
---|
| 196 | /** |
---|
| 197 | * Get the attribute that this item corresponds to. |
---|
| 198 | * |
---|
| 199 | * @return the corresponding attribute. |
---|
| 200 | */ |
---|
| 201 | public Attribute getAttribute() { |
---|
| 202 | return m_attribute; |
---|
| 203 | } |
---|
| 204 | |
---|
| 205 | /** |
---|
| 206 | * Get the value index for this item. |
---|
| 207 | * |
---|
| 208 | * @return the value index. |
---|
| 209 | */ |
---|
| 210 | public int getValueIndex() { |
---|
| 211 | return m_valueIndex; |
---|
| 212 | } |
---|
| 213 | |
---|
| 214 | /** |
---|
| 215 | * A string representation of this item. |
---|
| 216 | * |
---|
| 217 | * @return a string representation of this item. |
---|
| 218 | */ |
---|
| 219 | public String toString() { |
---|
| 220 | return toString(false); |
---|
| 221 | } |
---|
| 222 | |
---|
| 223 | /** |
---|
| 224 | * A string representation of this item. |
---|
| 225 | * |
---|
| 226 | * @param freq true if the frequency should be included. |
---|
| 227 | * @return a string representation of this item. |
---|
| 228 | */ |
---|
| 229 | public String toString(boolean freq) { |
---|
| 230 | String result = m_attribute.name() + "=" + m_attribute.value(m_valueIndex); |
---|
| 231 | if (freq) { |
---|
| 232 | result += ":" + m_frequency; |
---|
| 233 | } |
---|
| 234 | return result; |
---|
| 235 | } |
---|
| 236 | |
---|
| 237 | /** |
---|
| 238 | * Ensures that items will be sorted in descending order of frequency. |
---|
| 239 | * Ties are ordered by attribute name. |
---|
| 240 | * |
---|
| 241 | * @param comp the BinaryItem to compare against. |
---|
| 242 | */ |
---|
| 243 | public int compareTo(BinaryItem comp) { |
---|
| 244 | if (m_frequency == comp.getFrequency()) { |
---|
| 245 | // sort by name |
---|
| 246 | return -1 * m_attribute.name().compareTo(comp.getAttribute().name()); |
---|
| 247 | } |
---|
| 248 | if (comp.getFrequency() < m_frequency) { |
---|
| 249 | return -1; |
---|
| 250 | } |
---|
| 251 | return 1; |
---|
| 252 | } |
---|
| 253 | |
---|
| 254 | public boolean equals(Object compareTo) { |
---|
| 255 | if (!(compareTo instanceof BinaryItem)) { |
---|
| 256 | return false; |
---|
| 257 | } |
---|
| 258 | |
---|
| 259 | BinaryItem b = (BinaryItem)compareTo; |
---|
| 260 | if (m_attribute.equals(b.getAttribute()) && m_frequency == b.getFrequency()) { |
---|
| 261 | return true; |
---|
| 262 | } |
---|
| 263 | |
---|
| 264 | return false; |
---|
| 265 | } |
---|
| 266 | |
---|
| 267 | public int hashCode() { |
---|
| 268 | return (m_attribute.name().hashCode() ^ |
---|
| 269 | m_attribute.numValues()) * m_frequency; |
---|
| 270 | } |
---|
| 271 | } |
---|
| 272 | |
---|
| 273 | /** |
---|
| 274 | * Class for maintaining a frequent item set. |
---|
| 275 | */ |
---|
| 276 | protected static class FrequentBinaryItemSet |
---|
| 277 | implements Serializable, Cloneable { |
---|
| 278 | |
---|
| 279 | /** For serialization */ |
---|
| 280 | private static final long serialVersionUID = -6543815873565829448L; |
---|
| 281 | |
---|
| 282 | /** The list of items in the item set */ |
---|
| 283 | protected ArrayList<BinaryItem> m_items = new ArrayList<BinaryItem>(); |
---|
| 284 | |
---|
| 285 | /** the support of this item set **/ |
---|
| 286 | protected int m_support; |
---|
| 287 | |
---|
| 288 | /** |
---|
| 289 | * Constructor |
---|
| 290 | * |
---|
| 291 | * @param items the items that make up the frequent item set. |
---|
| 292 | * @param support the support of this item set. |
---|
| 293 | */ |
---|
| 294 | public FrequentBinaryItemSet(ArrayList<BinaryItem> items, int support) { |
---|
| 295 | m_items = items; |
---|
| 296 | m_support = support; |
---|
| 297 | Collections.sort(m_items); |
---|
| 298 | } |
---|
| 299 | |
---|
| 300 | /** |
---|
| 301 | * Add an item to this item set. |
---|
| 302 | * |
---|
| 303 | * @param i the item to add. |
---|
| 304 | */ |
---|
| 305 | public void addItem(BinaryItem i) { |
---|
| 306 | m_items.add(i); |
---|
| 307 | Collections.sort(m_items); |
---|
| 308 | } |
---|
| 309 | |
---|
| 310 | /** |
---|
| 311 | * Set the support for this item set. |
---|
| 312 | * |
---|
| 313 | * @param support the support for this item set. |
---|
| 314 | */ |
---|
| 315 | public void setSupport(int support) { |
---|
| 316 | m_support = support; |
---|
| 317 | } |
---|
| 318 | |
---|
| 319 | /** |
---|
| 320 | * Get the support of this item set. |
---|
| 321 | * |
---|
| 322 | * @return the support of this item set. |
---|
| 323 | */ |
---|
| 324 | public int getSupport() { |
---|
| 325 | return m_support; |
---|
| 326 | } |
---|
| 327 | |
---|
| 328 | /** |
---|
| 329 | * Get the items in this item set. |
---|
| 330 | * |
---|
| 331 | * @return the items in this item set. |
---|
| 332 | */ |
---|
| 333 | public Collection<BinaryItem> getItems() { |
---|
| 334 | return m_items; |
---|
| 335 | } |
---|
| 336 | |
---|
| 337 | /** |
---|
| 338 | * Get a particular item from this item set. |
---|
| 339 | * |
---|
| 340 | * @param index the index of the item to get. |
---|
| 341 | * @return the item. |
---|
| 342 | */ |
---|
| 343 | public BinaryItem getItem(int index) { |
---|
| 344 | return m_items.get(index); |
---|
| 345 | } |
---|
| 346 | |
---|
| 347 | /** |
---|
| 348 | * Get the number of items in this item set. |
---|
| 349 | * |
---|
| 350 | * @return the number of items in this item set. |
---|
| 351 | */ |
---|
| 352 | public int numberOfItems() { |
---|
| 353 | return m_items.size(); |
---|
| 354 | } |
---|
| 355 | |
---|
| 356 | /** |
---|
| 357 | * Get a textual description of this item set. |
---|
| 358 | * |
---|
| 359 | * @return a textual description of this item set. |
---|
| 360 | */ |
---|
| 361 | public String toString() { |
---|
| 362 | StringBuffer buff = new StringBuffer(); |
---|
| 363 | Iterator<BinaryItem> i = m_items.iterator(); |
---|
| 364 | |
---|
| 365 | while (i.hasNext()) { |
---|
| 366 | buff.append(i.next().toString() + " "); |
---|
| 367 | } |
---|
| 368 | buff.append(": " + m_support); |
---|
| 369 | return buff.toString(); |
---|
| 370 | } |
---|
| 371 | |
---|
| 372 | /** |
---|
| 373 | * Make a copy of this item set. |
---|
| 374 | * |
---|
| 375 | * @return a copy of this item set. |
---|
| 376 | */ |
---|
| 377 | public Object clone() { |
---|
| 378 | ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(m_items); |
---|
| 379 | return new FrequentBinaryItemSet(items, m_support); |
---|
| 380 | } |
---|
| 381 | } |
---|
| 382 | |
---|
| 383 | /** |
---|
| 384 | * Maintains a list of frequent item sets. |
---|
| 385 | */ |
---|
| 386 | protected static class FrequentItemSets implements Serializable { |
---|
| 387 | |
---|
| 388 | /** For serialization */ |
---|
| 389 | private static final long serialVersionUID = 4173606872363973588L; |
---|
| 390 | |
---|
| 391 | /** The list of frequent item sets */ |
---|
| 392 | protected ArrayList<FrequentBinaryItemSet> m_sets = |
---|
| 393 | new ArrayList<FrequentBinaryItemSet>(); |
---|
| 394 | |
---|
| 395 | /** The total number of transactions in the data */ |
---|
| 396 | protected int m_numberOfTransactions; |
---|
| 397 | |
---|
| 398 | /** |
---|
| 399 | * Constructor. |
---|
| 400 | * |
---|
| 401 | * @param numTransactions the total number of transactions in the data. |
---|
| 402 | */ |
---|
| 403 | public FrequentItemSets(int numTransactions) { |
---|
| 404 | m_numberOfTransactions = numTransactions; |
---|
| 405 | } |
---|
| 406 | |
---|
| 407 | /** |
---|
| 408 | * Get an item set. |
---|
| 409 | * |
---|
| 410 | * @param index the index of the item set to get. |
---|
| 411 | * @return an item set. |
---|
| 412 | */ |
---|
| 413 | public FrequentBinaryItemSet getItemSet(int index) { |
---|
| 414 | return m_sets.get(index); |
---|
| 415 | } |
---|
| 416 | |
---|
| 417 | /** |
---|
| 418 | * Get an iterator that can be used to access all the item sets. |
---|
| 419 | * |
---|
| 420 | * @return an iterator. |
---|
| 421 | */ |
---|
| 422 | public Iterator<FrequentBinaryItemSet> iterator() { |
---|
| 423 | return m_sets.iterator(); |
---|
| 424 | } |
---|
| 425 | |
---|
| 426 | /** |
---|
| 427 | * Get the total number of transactions in the data that the item |
---|
| 428 | * sets were derived from. |
---|
| 429 | * |
---|
| 430 | * @return the total number of transactions in the data. |
---|
| 431 | */ |
---|
| 432 | public int getNumberOfTransactions() { |
---|
| 433 | return m_numberOfTransactions; |
---|
| 434 | } |
---|
| 435 | |
---|
| 436 | /** |
---|
| 437 | * Add an item set. |
---|
| 438 | * |
---|
| 439 | * @param setToAdd the item set to add. |
---|
| 440 | */ |
---|
| 441 | public void addItemSet(FrequentBinaryItemSet setToAdd) { |
---|
| 442 | m_sets.add(setToAdd); |
---|
| 443 | } |
---|
| 444 | |
---|
| 445 | /** |
---|
| 446 | * Sort the item sets according to the supplied comparator. |
---|
| 447 | * |
---|
| 448 | * @param comp the comparator to use. |
---|
| 449 | */ |
---|
| 450 | public void sort(Comparator<FrequentBinaryItemSet> comp) { |
---|
| 451 | Collections.sort(m_sets, comp); |
---|
| 452 | } |
---|
| 453 | |
---|
| 454 | /** |
---|
| 455 | * Get the number of item sets. |
---|
| 456 | * |
---|
| 457 | * @return the number of item sets. |
---|
| 458 | */ |
---|
| 459 | public int size() { |
---|
| 460 | return m_sets.size(); |
---|
| 461 | } |
---|
| 462 | |
---|
| 463 | /** |
---|
| 464 | * Sort the item sets. Sorts by item set length. Ties are broken by comparing |
---|
| 465 | * the items in the two item sets. |
---|
| 466 | */ |
---|
| 467 | public void sort() { |
---|
| 468 | Comparator<FrequentBinaryItemSet> compF = new Comparator<FrequentBinaryItemSet>() { |
---|
| 469 | public int compare(FrequentBinaryItemSet one, FrequentBinaryItemSet two) { |
---|
| 470 | Collection<BinaryItem> compOne = one.getItems(); |
---|
| 471 | Collection<BinaryItem> compTwo = two.getItems(); |
---|
| 472 | |
---|
| 473 | // if (one.getSupport() == two.getSupport()) { |
---|
| 474 | // if supports are equal then list shorter item sets before longer ones |
---|
| 475 | if (compOne.size() < compTwo.size()) { |
---|
| 476 | return -1; |
---|
| 477 | } else if (compOne.size() > compTwo.size()) { |
---|
| 478 | return 1; |
---|
| 479 | } else { |
---|
| 480 | // compare items |
---|
| 481 | Iterator<BinaryItem> twoIterator = compTwo.iterator(); |
---|
| 482 | for (BinaryItem oneI : compOne) { |
---|
| 483 | BinaryItem twoI = twoIterator.next(); |
---|
| 484 | int result = oneI.compareTo(twoI); |
---|
| 485 | if (result != 0) { |
---|
| 486 | return result; |
---|
| 487 | } |
---|
| 488 | } |
---|
| 489 | return 0; // equal |
---|
| 490 | } |
---|
| 491 | |
---|
| 492 | // return 0; |
---|
| 493 | /* } else if (one.getSupport() > two.getSupport()) { |
---|
| 494 | // reverse ordering (i.e. descending by support) |
---|
| 495 | return -1; |
---|
| 496 | } */ |
---|
| 497 | |
---|
| 498 | // return 1; |
---|
| 499 | } |
---|
| 500 | }; |
---|
| 501 | |
---|
| 502 | sort(compF); |
---|
| 503 | } |
---|
| 504 | |
---|
| 505 | /** |
---|
| 506 | * Get a textual description of this list of item sets. |
---|
| 507 | * |
---|
| 508 | * @param numSets the number of item sets to display. |
---|
| 509 | * @return a textual description of the item sets. |
---|
| 510 | */ |
---|
| 511 | public String toString(int numSets) { |
---|
| 512 | if (m_sets.size() == 0) { |
---|
| 513 | return "No frequent items sets found!"; |
---|
| 514 | } |
---|
| 515 | |
---|
| 516 | StringBuffer result = new StringBuffer(); |
---|
| 517 | result.append("" + m_sets.size() + " frequent item sets found"); |
---|
| 518 | if (numSets > 0) { |
---|
| 519 | result.append(" , displaying " + numSets); |
---|
| 520 | } |
---|
| 521 | result.append(":\n\n"); |
---|
| 522 | |
---|
| 523 | int count = 0; |
---|
| 524 | for (FrequentBinaryItemSet i : m_sets) { |
---|
| 525 | if (numSets > 0 && count > numSets) { |
---|
| 526 | break; |
---|
| 527 | } |
---|
| 528 | result.append(i.toString() + "\n"); |
---|
| 529 | count++; |
---|
| 530 | } |
---|
| 531 | |
---|
| 532 | return result.toString(); |
---|
| 533 | } |
---|
| 534 | } |
---|
| 535 | |
---|
| 536 | /** |
---|
| 537 | * This class holds the counts for projected tree nodes |
---|
| 538 | * and header lists. |
---|
| 539 | */ |
---|
| 540 | protected static class ShadowCounts implements Serializable { |
---|
| 541 | |
---|
| 542 | /** For serialization */ |
---|
| 543 | private static final long serialVersionUID = 4435433714185969155L; |
---|
| 544 | |
---|
| 545 | /** Holds the counts at different recursion levels */ |
---|
| 546 | private ArrayList<Integer> m_counts = new ArrayList<Integer>(); |
---|
| 547 | |
---|
| 548 | /** |
---|
| 549 | * Get the count at the specified recursion depth. |
---|
| 550 | * |
---|
| 551 | * @param recursionLevel the depth of the recursion. |
---|
| 552 | * @return the count. |
---|
| 553 | */ |
---|
| 554 | public int getCount(int recursionLevel) { |
---|
| 555 | if (recursionLevel >= m_counts.size()) { |
---|
| 556 | return 0; |
---|
| 557 | } else { |
---|
| 558 | return m_counts.get(recursionLevel); |
---|
| 559 | } |
---|
| 560 | } |
---|
| 561 | |
---|
| 562 | /** |
---|
| 563 | * Increase the count at a given recursion level. |
---|
| 564 | * |
---|
| 565 | * @param recursionLevel the level at which to increase the count. |
---|
| 566 | * @param incr the amount by which to increase the count. |
---|
| 567 | */ |
---|
| 568 | public void increaseCount(int recursionLevel, int incr) { |
---|
| 569 | // basically treat the list like a stack where we |
---|
| 570 | // can add a new element, or increment the element |
---|
| 571 | // at the top |
---|
| 572 | |
---|
| 573 | if (recursionLevel == m_counts.size()) { |
---|
| 574 | // new element |
---|
| 575 | m_counts.add(incr); |
---|
| 576 | } else if (recursionLevel == m_counts.size() - 1) { |
---|
| 577 | // otherwise increment the top |
---|
| 578 | int n = m_counts.get(recursionLevel).intValue(); |
---|
| 579 | m_counts.set(recursionLevel, (n + incr)); |
---|
| 580 | } |
---|
| 581 | } |
---|
| 582 | |
---|
| 583 | /** |
---|
| 584 | * Remove the count at the given recursion level. |
---|
| 585 | * |
---|
| 586 | * @param recursionLevel the level at which to remove the count. |
---|
| 587 | */ |
---|
| 588 | public void removeCount(int recursionLevel) { |
---|
| 589 | if (recursionLevel < m_counts.size()) { |
---|
| 590 | m_counts.remove(recursionLevel); |
---|
| 591 | } |
---|
| 592 | } |
---|
| 593 | } |
---|
| 594 | |
---|
| 595 | /** |
---|
| 596 | * A node in the FP-tree. |
---|
| 597 | */ |
---|
| 598 | protected static class FPTreeNode implements Serializable { |
---|
| 599 | |
---|
| 600 | /** For serialization */ |
---|
| 601 | private static final long serialVersionUID = 4396315323673737660L; |
---|
| 602 | |
---|
| 603 | /** link to another sibling at this level in the tree */ |
---|
| 604 | protected FPTreeNode m_levelSibling; |
---|
| 605 | |
---|
| 606 | /** link to the parent node */ |
---|
| 607 | protected FPTreeNode m_parent; |
---|
| 608 | |
---|
| 609 | /** item at this node */ |
---|
| 610 | protected BinaryItem m_item; |
---|
| 611 | |
---|
| 612 | /** ID (for graphing the tree) */ |
---|
| 613 | protected int m_ID; |
---|
| 614 | |
---|
| 615 | /** the children of this node */ |
---|
| 616 | protected Map<BinaryItem, FPTreeNode> m_children = |
---|
| 617 | new HashMap<BinaryItem, FPTreeNode>(); |
---|
| 618 | |
---|
| 619 | /** counts associated with projected versions of this node */ |
---|
| 620 | protected ShadowCounts m_projectedCounts = new ShadowCounts(); |
---|
| 621 | |
---|
| 622 | /** |
---|
| 623 | * Construct a new node with the given parent link and item. |
---|
| 624 | * |
---|
| 625 | * @param parent a pointer to the parent of this node. |
---|
| 626 | * @param item the item at this node. |
---|
| 627 | */ |
---|
| 628 | public FPTreeNode(FPTreeNode parent, BinaryItem item) { |
---|
| 629 | m_parent = parent; |
---|
| 630 | m_item = item; |
---|
| 631 | } |
---|
| 632 | |
---|
| 633 | /** |
---|
| 634 | * Insert an item set into the tree at this node. Removes the first item |
---|
| 635 | * from the supplied item set and makes a recursive call to insert the |
---|
| 636 | * remaining items. |
---|
| 637 | * |
---|
| 638 | * @param itemSet the item set to insert. |
---|
| 639 | * @param headerTable the header table for the tree. |
---|
| 640 | * @param incr the amount by which to increase counts. |
---|
| 641 | */ |
---|
| 642 | public void addItemSet(Collection<BinaryItem> itemSet, |
---|
| 643 | Map<BinaryItem, FPTreeRoot.Header> headerTable, int incr) { |
---|
| 644 | |
---|
| 645 | Iterator<BinaryItem> i = itemSet.iterator(); |
---|
| 646 | |
---|
| 647 | if (i.hasNext()) { |
---|
| 648 | BinaryItem first = i.next(); |
---|
| 649 | |
---|
| 650 | FPTreeNode aChild; |
---|
| 651 | if (!m_children.containsKey(first)) { |
---|
| 652 | // not in the tree, so add it. |
---|
| 653 | aChild = new FPTreeNode(this, first); |
---|
| 654 | m_children.put(first, aChild); |
---|
| 655 | |
---|
| 656 | // update the header |
---|
| 657 | if (!headerTable.containsKey(first)) { |
---|
| 658 | headerTable.put(first, new FPTreeRoot.Header()); |
---|
| 659 | } |
---|
| 660 | |
---|
| 661 | // append new node to header list |
---|
| 662 | headerTable.get(first).addToList(aChild); |
---|
| 663 | } else { |
---|
| 664 | // get the appropriate child node |
---|
| 665 | aChild = m_children.get(first); |
---|
| 666 | } |
---|
| 667 | |
---|
| 668 | // update counts in header table |
---|
| 669 | headerTable.get(first).getProjectedCounts().increaseCount(0, incr); |
---|
| 670 | |
---|
| 671 | // increase the child's count |
---|
| 672 | aChild.increaseProjectedCount(0, incr); |
---|
| 673 | |
---|
| 674 | // proceed recursively |
---|
| 675 | itemSet.remove(first); |
---|
| 676 | aChild.addItemSet(itemSet, headerTable, incr); |
---|
| 677 | } |
---|
| 678 | } |
---|
| 679 | |
---|
| 680 | /** |
---|
| 681 | * Increase the projected count at the given recursion level at this |
---|
| 682 | * node |
---|
| 683 | * |
---|
| 684 | * @param recursionLevel the recursion level to increase the node count |
---|
| 685 | * at. |
---|
| 686 | * @param incr the amount by which to increase the count. |
---|
| 687 | */ |
---|
| 688 | public void increaseProjectedCount(int recursionLevel, int incr) { |
---|
| 689 | m_projectedCounts.increaseCount(recursionLevel, incr); |
---|
| 690 | } |
---|
| 691 | |
---|
| 692 | /** |
---|
| 693 | * Remove the projected count at the given recursion level for this |
---|
| 694 | * node. |
---|
| 695 | * |
---|
| 696 | * @param recursionLevel the recursion level at which to remove the count. |
---|
| 697 | */ |
---|
| 698 | public void removeProjectedCount(int recursionLevel) { |
---|
| 699 | m_projectedCounts.removeCount(recursionLevel); |
---|
| 700 | } |
---|
| 701 | |
---|
| 702 | /** |
---|
| 703 | * Get the projected count at the given recursion level for this node. |
---|
| 704 | * |
---|
| 705 | * @param recursionLevel the recursion level at which to get the count. |
---|
| 706 | * @return the count. |
---|
| 707 | */ |
---|
| 708 | public int getProjectedCount(int recursionLevel) { |
---|
| 709 | return m_projectedCounts.getCount(recursionLevel); |
---|
| 710 | } |
---|
| 711 | |
---|
| 712 | /** |
---|
| 713 | * Get the parent node. |
---|
| 714 | * |
---|
| 715 | * @return the parent node. |
---|
| 716 | */ |
---|
| 717 | public FPTreeNode getParent() { |
---|
| 718 | return m_parent; |
---|
| 719 | } |
---|
| 720 | |
---|
| 721 | /** |
---|
| 722 | * Get the item at this node. |
---|
| 723 | * |
---|
| 724 | * @return the item at this node. |
---|
| 725 | */ |
---|
| 726 | public BinaryItem getItem() { |
---|
| 727 | return m_item; |
---|
| 728 | } |
---|
| 729 | |
---|
| 730 | /** |
---|
| 731 | * Return a textual description of this node for a given recursion |
---|
| 732 | * level. |
---|
| 733 | * |
---|
| 734 | * @param recursionLevel the recursion depth to use. |
---|
| 735 | * @return a textual description of this node. |
---|
| 736 | */ |
---|
| 737 | public String toString(int recursionLevel) { |
---|
| 738 | return toString("", recursionLevel); |
---|
| 739 | } |
---|
| 740 | |
---|
| 741 | /** |
---|
| 742 | * Return a textual description of this node for a given recursion |
---|
| 743 | * level. |
---|
| 744 | * |
---|
| 745 | * @param prefix a prefix string to prepend. |
---|
| 746 | * @param recursionLevel the recursion level to use. |
---|
| 747 | * @return a textual description of this node. |
---|
| 748 | */ |
---|
| 749 | public String toString(String prefix, int recursionLevel) { |
---|
| 750 | StringBuffer buffer = new StringBuffer(); |
---|
| 751 | buffer.append(prefix); |
---|
| 752 | buffer.append("| "); |
---|
| 753 | buffer.append(m_item.toString()); |
---|
| 754 | buffer.append(" ("); |
---|
| 755 | buffer.append(m_projectedCounts.getCount(recursionLevel)); |
---|
| 756 | buffer.append(")\n"); |
---|
| 757 | |
---|
| 758 | for (FPTreeNode node : m_children.values()) { |
---|
| 759 | buffer.append(node.toString(prefix + "| ", recursionLevel)); |
---|
| 760 | } |
---|
| 761 | return buffer.toString(); |
---|
| 762 | } |
---|
| 763 | |
---|
| 764 | protected int assignIDs(int lastID) { |
---|
| 765 | int currentLastID = lastID + 1; |
---|
| 766 | m_ID = currentLastID; |
---|
| 767 | if (m_children != null) { |
---|
| 768 | Collection<FPTreeNode> kids = m_children.values(); |
---|
| 769 | for (FPTreeNode n : kids) { |
---|
| 770 | currentLastID = n.assignIDs(currentLastID); |
---|
| 771 | } |
---|
| 772 | } |
---|
| 773 | return currentLastID; |
---|
| 774 | } |
---|
| 775 | |
---|
| 776 | /** |
---|
| 777 | * Generate a dot graph description string for the tree. |
---|
| 778 | * |
---|
| 779 | * @param text a StringBuffer to store the graph description |
---|
| 780 | * in. |
---|
| 781 | */ |
---|
| 782 | public void graphFPTree(StringBuffer text) { |
---|
| 783 | if (m_children != null) { |
---|
| 784 | Collection<FPTreeNode> kids = m_children.values(); |
---|
| 785 | for (FPTreeNode n : kids) { |
---|
| 786 | text.append("N" + n.m_ID); |
---|
| 787 | text.append(" [label=\""); |
---|
| 788 | text.append(n.getItem().toString() + " (" + n.getProjectedCount(0) + ")\\n"); |
---|
| 789 | text.append("\"]\n"); |
---|
| 790 | n.graphFPTree(text); |
---|
| 791 | text.append("N" + m_ID + "->" + "N" + n.m_ID + "\n"); |
---|
| 792 | } |
---|
| 793 | } |
---|
| 794 | } |
---|
| 795 | } |
---|
| 796 | |
---|
| 797 | /** |
---|
| 798 | * Root of the FPTree |
---|
| 799 | */ |
---|
| 800 | private static class FPTreeRoot extends FPTreeNode { |
---|
| 801 | |
---|
| 802 | /** For serialization */ |
---|
| 803 | private static final long serialVersionUID = 632150939785333297L; |
---|
| 804 | |
---|
| 805 | /** |
---|
| 806 | * Stores a header entry for an FPTree |
---|
| 807 | */ |
---|
| 808 | protected static class Header implements Serializable { |
---|
| 809 | |
---|
| 810 | /** For serialization */ |
---|
| 811 | private static final long serialVersionUID = -6583156284891368909L; |
---|
| 812 | |
---|
| 813 | /** The list of pointers into the tree structure */ |
---|
| 814 | protected List<FPTreeNode> m_headerList = new LinkedList<FPTreeNode>(); |
---|
| 815 | |
---|
| 816 | /** Projected header counts for this entry */ |
---|
| 817 | protected ShadowCounts m_projectedHeaderCounts = new ShadowCounts(); |
---|
| 818 | |
---|
| 819 | /** |
---|
| 820 | * Add a tree node into the list for this header entry. |
---|
| 821 | * |
---|
| 822 | * @param toAdd the node to add. |
---|
| 823 | */ |
---|
| 824 | public void addToList(FPTreeNode toAdd) { |
---|
| 825 | m_headerList.add(toAdd); |
---|
| 826 | } |
---|
| 827 | |
---|
| 828 | /** |
---|
| 829 | * Get the list of nodes for this header entry. |
---|
| 830 | * |
---|
| 831 | * @return the list of nodes for this header entry. |
---|
| 832 | */ |
---|
| 833 | public List<FPTreeNode> getHeaderList() { |
---|
| 834 | return m_headerList; |
---|
| 835 | } |
---|
| 836 | |
---|
| 837 | /** |
---|
| 838 | * Get the projected counts for this header entry. |
---|
| 839 | * |
---|
| 840 | * @return the projected counts for this header entry. |
---|
| 841 | */ |
---|
| 842 | public ShadowCounts getProjectedCounts() { |
---|
| 843 | return m_projectedHeaderCounts; |
---|
| 844 | } |
---|
| 845 | } |
---|
| 846 | |
---|
| 847 | /** Stores the header table as mapped Header entries */ |
---|
| 848 | protected Map<BinaryItem, Header> m_headerTable = |
---|
| 849 | new HashMap<BinaryItem, Header>(); |
---|
| 850 | |
---|
| 851 | /** |
---|
| 852 | * Create a new FPTreeRoot. |
---|
| 853 | */ |
---|
| 854 | public FPTreeRoot() { |
---|
| 855 | super(null, null); |
---|
| 856 | } |
---|
| 857 | |
---|
| 858 | /** |
---|
| 859 | * Insert an item set into the tree. |
---|
| 860 | * |
---|
| 861 | * @param itemSet the item set to insert into the tree. |
---|
| 862 | * @param incr the increment by which to increase counters. |
---|
| 863 | */ |
---|
| 864 | public void addItemSet(Collection<BinaryItem> itemSet, int incr) { |
---|
| 865 | super.addItemSet(itemSet, m_headerTable, incr); |
---|
| 866 | } |
---|
| 867 | |
---|
| 868 | /** |
---|
| 869 | * Get the header table for this tree. |
---|
| 870 | * |
---|
| 871 | * @return the header table for this tree. |
---|
| 872 | */ |
---|
| 873 | public Map<BinaryItem, Header> getHeaderTable() { |
---|
| 874 | return m_headerTable; |
---|
| 875 | } |
---|
| 876 | |
---|
| 877 | public boolean isEmpty(int recursionLevel) { |
---|
| 878 | for (FPTreeNode c : m_children.values()) { |
---|
| 879 | if (c.getProjectedCount(recursionLevel) > 0) { |
---|
| 880 | return false; |
---|
| 881 | } |
---|
| 882 | } |
---|
| 883 | return true; |
---|
| 884 | } |
---|
| 885 | |
---|
| 886 | /** |
---|
| 887 | * Get a textual description of the tree at a given recursion |
---|
| 888 | * (projection) level. |
---|
| 889 | * |
---|
| 890 | * @param pad the string to use as a prefix for indenting nodes. |
---|
| 891 | * @param recursionLevel the recursion level (projection) to use. |
---|
| 892 | * @return the textual description of the tree. |
---|
| 893 | */ |
---|
| 894 | public String toString(String pad, int recursionLevel) { |
---|
| 895 | StringBuffer result = new StringBuffer(); |
---|
| 896 | result.append(pad); |
---|
| 897 | result.append("+ ROOT\n"); |
---|
| 898 | |
---|
| 899 | for (FPTreeNode node : m_children.values()) { |
---|
| 900 | result.append(node.toString(pad + "| ", recursionLevel)); |
---|
| 901 | } |
---|
| 902 | return result.toString(); |
---|
| 903 | } |
---|
| 904 | |
---|
| 905 | /** |
---|
| 906 | * Get a textual description of the header table for this tree. |
---|
| 907 | * |
---|
| 908 | * @param recursionLevel the recursion level to use. |
---|
| 909 | * @return a textual description of the header table for this |
---|
| 910 | * tree at a given recursion level. |
---|
| 911 | */ |
---|
| 912 | public String printHeaderTable(int recursionLevel) { |
---|
| 913 | StringBuffer buffer = new StringBuffer(); |
---|
| 914 | for (BinaryItem item : m_headerTable.keySet()) { |
---|
| 915 | buffer.append(item.toString()); |
---|
| 916 | buffer.append(" : "); |
---|
| 917 | buffer.append(m_headerTable.get(item).getProjectedCounts().getCount(recursionLevel)); |
---|
| 918 | buffer.append("\n"); |
---|
| 919 | } |
---|
| 920 | return buffer.toString(); |
---|
| 921 | } |
---|
| 922 | |
---|
| 923 | public void graphHeaderTable(StringBuffer text, int maxID) { |
---|
| 924 | |
---|
| 925 | for (BinaryItem item : m_headerTable.keySet()) { |
---|
| 926 | Header h = m_headerTable.get(item); |
---|
| 927 | List<FPTreeNode> headerList = h.getHeaderList(); |
---|
| 928 | if (headerList.size() > 1) { |
---|
| 929 | text.append("N" + maxID + " [label=\"" + headerList.get(0).getItem().toString() |
---|
| 930 | + " (" + h.getProjectedCounts().getCount(0) + ")" |
---|
| 931 | + "\" shape=plaintext]\n"); |
---|
| 932 | |
---|
| 933 | text.append("N" + maxID + "->" + "N" + headerList.get(1).m_ID + "\n"); |
---|
| 934 | for (int i = 1; i < headerList.size() - 1; i++) { |
---|
| 935 | text.append("N" + headerList.get(i).m_ID + "->" + "N" + headerList.get(i+1).m_ID + "\n"); |
---|
| 936 | } |
---|
| 937 | maxID++; |
---|
| 938 | } |
---|
| 939 | } |
---|
| 940 | } |
---|
| 941 | } |
---|
| 942 | |
---|
| 943 | /** |
---|
| 944 | * Class for storing and manipulating an association rule. Also has a utility |
---|
| 945 | * routine for generating (by brute force) all the association rules that meet |
---|
| 946 | * a given metric threshold from a list of large item sets. |
---|
| 947 | * |
---|
| 948 | * @author Mark Hall (mhall{[at]}pentaho{[dot]}com). |
---|
| 949 | */ |
---|
| 950 | /** |
---|
| 951 | * @author mhall |
---|
| 952 | * |
---|
| 953 | */ |
---|
| 954 | public static class AssociationRule implements Serializable, Comparable<AssociationRule> { |
---|
| 955 | |
---|
| 956 | /** For serialization */ |
---|
| 957 | private static final long serialVersionUID = -661269018702294489L; |
---|
| 958 | |
---|
| 959 | /** Enum for holding different metric types */ |
---|
| 960 | public static enum METRIC_TYPE { |
---|
| 961 | CONFIDENCE("conf") { |
---|
| 962 | double compute(int premiseSupport, int consequenceSupport, |
---|
| 963 | int totalSupport, int totalTransactions) { |
---|
| 964 | |
---|
| 965 | return (double)totalSupport / (double)premiseSupport; |
---|
| 966 | } |
---|
| 967 | }, |
---|
| 968 | LIFT("lift") { |
---|
| 969 | double compute(int premiseSupport, int consequenceSupport, |
---|
| 970 | int totalSupport, int totalTransactions) { |
---|
| 971 | |
---|
| 972 | double confidence = |
---|
| 973 | METRIC_TYPE.CONFIDENCE.compute(premiseSupport, consequenceSupport, |
---|
| 974 | totalSupport, totalTransactions); |
---|
| 975 | return confidence / ((double)consequenceSupport / |
---|
| 976 | (double)totalTransactions); |
---|
| 977 | } |
---|
| 978 | }, |
---|
| 979 | LEVERAGE("lev") { |
---|
| 980 | double compute(int premiseSupport, int consequenceSupport, |
---|
| 981 | int totalSupport, int totalTransactions) { |
---|
| 982 | |
---|
| 983 | double coverageForItemSet = (double)totalSupport / |
---|
| 984 | (double)totalTransactions; |
---|
| 985 | double expectedCoverageIfIndependent = |
---|
| 986 | ((double)premiseSupport / (double)totalTransactions) * |
---|
| 987 | ((double)consequenceSupport / (double)totalTransactions); |
---|
| 988 | return coverageForItemSet - expectedCoverageIfIndependent; |
---|
| 989 | } |
---|
| 990 | }, |
---|
| 991 | CONVICTION("conv") { |
---|
| 992 | double compute(int premiseSupport, int consequenceSupport, |
---|
| 993 | int totalSupport, int totalTransactions) { |
---|
| 994 | |
---|
| 995 | double num = |
---|
| 996 | (double)premiseSupport * (double)(totalTransactions - consequenceSupport) / |
---|
| 997 | (double)totalTransactions; |
---|
| 998 | double denom = premiseSupport - totalSupport + 1; |
---|
| 999 | return num / denom; |
---|
| 1000 | } |
---|
| 1001 | }; |
---|
| 1002 | |
---|
| 1003 | private final String m_stringVal; |
---|
| 1004 | METRIC_TYPE(String name) { |
---|
| 1005 | m_stringVal = name; |
---|
| 1006 | } |
---|
| 1007 | |
---|
| 1008 | abstract double compute(int premiseSupport, int consequenceSupport, |
---|
| 1009 | int totalSupport, int totalTransactions); |
---|
| 1010 | |
---|
| 1011 | public String toString() { |
---|
| 1012 | return m_stringVal; |
---|
| 1013 | } |
---|
| 1014 | |
---|
| 1015 | public String toStringMetric(int premiseSupport, int consequenceSupport, |
---|
| 1016 | int totalSupport, int totalTransactions) { |
---|
| 1017 | return m_stringVal + ":(" + Utils.doubleToString(compute(premiseSupport, consequenceSupport, |
---|
| 1018 | totalSupport, totalTransactions), 2) + ")"; |
---|
| 1019 | } |
---|
| 1020 | } |
---|
| 1021 | |
---|
| 1022 | /** Tags for display in the GUI */ |
---|
| 1023 | public static final Tag[] TAGS_SELECTION = { |
---|
| 1024 | new Tag(METRIC_TYPE.CONFIDENCE.ordinal(), "Confidence"), |
---|
| 1025 | new Tag(METRIC_TYPE.LIFT.ordinal(), "Lift"), |
---|
| 1026 | new Tag(METRIC_TYPE.LEVERAGE.ordinal(), "Leverage"), |
---|
| 1027 | new Tag(METRIC_TYPE.CONVICTION.ordinal(), "Conviction") |
---|
| 1028 | }; |
---|
| 1029 | |
---|
| 1030 | /** The metric type for this rule */ |
---|
| 1031 | protected METRIC_TYPE m_metricType = METRIC_TYPE.CONFIDENCE; |
---|
| 1032 | |
---|
| 1033 | /** The premise of the rule */ |
---|
| 1034 | protected Collection<BinaryItem> m_premise; |
---|
| 1035 | |
---|
| 1036 | /** The consequence of the rule */ |
---|
| 1037 | protected Collection<BinaryItem> m_consequence; |
---|
| 1038 | |
---|
| 1039 | /** The support for the premise */ |
---|
| 1040 | protected int m_premiseSupport; |
---|
| 1041 | |
---|
| 1042 | /** The support for the consequence */ |
---|
| 1043 | protected int m_consequenceSupport; |
---|
| 1044 | |
---|
| 1045 | /** The total support for the item set (premise + consequence) */ |
---|
| 1046 | protected int m_totalSupport; |
---|
| 1047 | |
---|
| 1048 | /** The total number of transactions in the data */ |
---|
| 1049 | protected int m_totalTransactions; |
---|
| 1050 | |
---|
| 1051 | /** |
---|
| 1052 | * Construct a new association rule. |
---|
| 1053 | * |
---|
| 1054 | * @param premise the premise of the rule |
---|
| 1055 | * @param consequence the consequence of the rule |
---|
| 1056 | * @param metric the metric for the rule |
---|
| 1057 | * @param premiseSupport the support of the premise |
---|
| 1058 | * @param consequenceSupport the support of the consequence |
---|
| 1059 | * @param totalSupport the total support of the rule |
---|
| 1060 | * @param totalTransactions the number of transactions in the data |
---|
| 1061 | */ |
---|
| 1062 | public AssociationRule(Collection<BinaryItem> premise, |
---|
| 1063 | Collection<BinaryItem> consequence, METRIC_TYPE metric, |
---|
| 1064 | int premiseSupport, int consequenceSupport, |
---|
| 1065 | int totalSupport, int totalTransactions) { |
---|
| 1066 | m_premise = premise; |
---|
| 1067 | m_consequence = consequence; |
---|
| 1068 | m_metricType = metric; |
---|
| 1069 | m_premiseSupport = premiseSupport; |
---|
| 1070 | m_consequenceSupport = consequenceSupport; |
---|
| 1071 | m_totalSupport = totalSupport; |
---|
| 1072 | m_totalTransactions = totalTransactions; |
---|
| 1073 | } |
---|
| 1074 | |
---|
| 1075 | /** |
---|
| 1076 | * Get the premise of this rule. |
---|
| 1077 | * |
---|
| 1078 | * @return the premise of this rule. |
---|
| 1079 | */ |
---|
| 1080 | public Collection<BinaryItem> getPremise() { |
---|
| 1081 | return m_premise; |
---|
| 1082 | } |
---|
| 1083 | |
---|
| 1084 | /** |
---|
| 1085 | * Get the consequence of this rule. |
---|
| 1086 | * |
---|
| 1087 | * @return the consequence of this rule. |
---|
| 1088 | */ |
---|
| 1089 | public Collection<BinaryItem> getConsequence() { |
---|
| 1090 | return m_consequence; |
---|
| 1091 | } |
---|
| 1092 | |
---|
| 1093 | /** |
---|
| 1094 | * Get the metric type of this rule (e.g. confidence). |
---|
| 1095 | * |
---|
| 1096 | * @return the metric type of this rule. |
---|
| 1097 | */ |
---|
| 1098 | public METRIC_TYPE getMetricType() { |
---|
| 1099 | return m_metricType; |
---|
| 1100 | } |
---|
| 1101 | |
---|
| 1102 | /** |
---|
| 1103 | * Get the value of the metric for this rule. |
---|
| 1104 | * |
---|
| 1105 | * @return the value of the metric for this rule. |
---|
| 1106 | */ |
---|
| 1107 | public double getMetricValue() { |
---|
| 1108 | return m_metricType.compute(m_premiseSupport, m_consequenceSupport, |
---|
| 1109 | m_totalSupport, m_totalTransactions); |
---|
| 1110 | } |
---|
| 1111 | |
---|
| 1112 | /** |
---|
| 1113 | * Get the support for the premise. |
---|
| 1114 | * |
---|
| 1115 | * @return the support for the premise. |
---|
| 1116 | */ |
---|
| 1117 | public int getPremiseSupport() { |
---|
| 1118 | return m_premiseSupport; |
---|
| 1119 | } |
---|
| 1120 | |
---|
| 1121 | /** |
---|
| 1122 | * Get the support for the consequence. |
---|
| 1123 | * |
---|
| 1124 | * @return the support for the consequence. |
---|
| 1125 | */ |
---|
| 1126 | public int getConsequenceSupport() { |
---|
| 1127 | return m_consequenceSupport; |
---|
| 1128 | } |
---|
| 1129 | |
---|
| 1130 | /** |
---|
| 1131 | * Get the total support for this rule (premise + consequence). |
---|
| 1132 | * |
---|
| 1133 | * @return the total support for this rule. |
---|
| 1134 | */ |
---|
| 1135 | public int getTotalSupport() { |
---|
| 1136 | return m_totalSupport; |
---|
| 1137 | } |
---|
| 1138 | |
---|
| 1139 | /** |
---|
| 1140 | * Get the total number of transactions in the data. |
---|
| 1141 | * |
---|
| 1142 | * @return the total number of transactions in the data. |
---|
| 1143 | */ |
---|
| 1144 | public int getTotalTransactions() { |
---|
| 1145 | return m_totalTransactions; |
---|
| 1146 | } |
---|
| 1147 | |
---|
| 1148 | /** |
---|
| 1149 | * Compare this rule to the supplied rule. |
---|
| 1150 | * |
---|
| 1151 | * @param other the rule to compare to. |
---|
| 1152 | * @return the result of the comparison. |
---|
| 1153 | */ |
---|
| 1154 | public int compareTo(AssociationRule other) { |
---|
| 1155 | return -Double.compare(getMetricValue(), other.getMetricValue()); |
---|
| 1156 | } |
---|
| 1157 | |
---|
| 1158 | /** |
---|
| 1159 | * Return true if this rule is equal to the supplied one. |
---|
| 1160 | * |
---|
| 1161 | * @return true if this rule is the same as the supplied rule. |
---|
| 1162 | */ |
---|
| 1163 | public boolean equals(Object other) { |
---|
| 1164 | if (!(other instanceof AssociationRule)) { |
---|
| 1165 | return false; |
---|
| 1166 | } |
---|
| 1167 | |
---|
| 1168 | AssociationRule otherRule = (AssociationRule)other; |
---|
| 1169 | boolean result = m_premise.equals(otherRule.getPremise()) && |
---|
| 1170 | m_consequence.equals(otherRule.getConsequence()) && |
---|
| 1171 | (getMetricValue() == otherRule.getMetricValue()); |
---|
| 1172 | |
---|
| 1173 | return result; |
---|
| 1174 | } |
---|
| 1175 | |
---|
| 1176 | /** |
---|
| 1177 | * Get a textual description of this rule. |
---|
| 1178 | * |
---|
| 1179 | * @return a textual description of this rule. |
---|
| 1180 | */ |
---|
| 1181 | public String toString() { |
---|
| 1182 | StringBuffer result = new StringBuffer(); |
---|
| 1183 | |
---|
| 1184 | result.append(m_premise.toString() + ": " + m_premiseSupport |
---|
| 1185 | + " ==> " + m_consequence.toString() + ": " + m_totalSupport |
---|
| 1186 | + " "); |
---|
| 1187 | for (METRIC_TYPE m : METRIC_TYPE.values()) { |
---|
| 1188 | if (m.equals(m_metricType)) { |
---|
| 1189 | result.append("<" + |
---|
| 1190 | m.toStringMetric(m_premiseSupport, m_consequenceSupport, |
---|
| 1191 | m_totalSupport, m_totalTransactions) + "> "); |
---|
| 1192 | } else { |
---|
| 1193 | result.append("" + |
---|
| 1194 | m.toStringMetric(m_premiseSupport, m_consequenceSupport, |
---|
| 1195 | m_totalSupport, m_totalTransactions) + " "); |
---|
| 1196 | } |
---|
| 1197 | } |
---|
| 1198 | return result.toString(); |
---|
| 1199 | } |
---|
| 1200 | |
---|
| 1201 | private static void nextSubset(boolean[] subset) { |
---|
| 1202 | for (int i = 0; i < subset.length; i++) { |
---|
| 1203 | if (!subset[i]) { |
---|
| 1204 | subset[i] = true; |
---|
| 1205 | break; |
---|
| 1206 | } else { |
---|
| 1207 | subset[i] = false; |
---|
| 1208 | } |
---|
| 1209 | } |
---|
| 1210 | } |
---|
| 1211 | |
---|
| 1212 | private static Collection<BinaryItem> getPremise(FrequentBinaryItemSet fis, |
---|
| 1213 | boolean[] subset) { |
---|
| 1214 | boolean ok = false; |
---|
| 1215 | for (int i = 0; i < subset.length; i++){ |
---|
| 1216 | if (!subset[i]) { |
---|
| 1217 | ok = true; |
---|
| 1218 | break; |
---|
| 1219 | } |
---|
| 1220 | } |
---|
| 1221 | |
---|
| 1222 | if (!ok) { |
---|
| 1223 | return null; |
---|
| 1224 | } |
---|
| 1225 | |
---|
| 1226 | List<BinaryItem> premise = new ArrayList<BinaryItem>(); |
---|
| 1227 | ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(fis.getItems()); |
---|
| 1228 | |
---|
| 1229 | |
---|
| 1230 | for (int i = 0; i < subset.length; i++) { |
---|
| 1231 | if (subset[i]) { |
---|
| 1232 | premise.add(items.get(i)); |
---|
| 1233 | } |
---|
| 1234 | } |
---|
| 1235 | return premise; |
---|
| 1236 | } |
---|
| 1237 | |
---|
| 1238 | private static Collection<BinaryItem> getConsequence(FrequentBinaryItemSet fis, |
---|
| 1239 | boolean[] subset) { |
---|
| 1240 | List<BinaryItem> consequence = new ArrayList<BinaryItem>(); |
---|
| 1241 | ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(fis.getItems()); |
---|
| 1242 | |
---|
| 1243 | for (int i = 0; i < subset.length; i++) { |
---|
| 1244 | if (!subset[i]) { |
---|
| 1245 | consequence.add(items.get(i)); |
---|
| 1246 | } |
---|
| 1247 | } |
---|
| 1248 | return consequence; |
---|
| 1249 | } |
---|
| 1250 | |
---|
| 1251 | |
---|
| 1252 | /** |
---|
| 1253 | * Generate all association rules, from the supplied frequet item sets, |
---|
| 1254 | * that meet a given minimum metric threshold. Uses a brute force approach. |
---|
| 1255 | * |
---|
| 1256 | * @param largeItemSets the set of frequent item sets |
---|
| 1257 | * @param metricToUse the metric to use |
---|
| 1258 | * @param metricThreshold the threshold value that a rule must meet |
---|
| 1259 | * @param totalTransactions the total number of transactions in the data |
---|
| 1260 | * @return a list of association rules |
---|
| 1261 | */ |
---|
| 1262 | public static List<AssociationRule> |
---|
| 1263 | generateRulesBruteForce(FrequentItemSets largeItemSets, METRIC_TYPE metricToUse, |
---|
| 1264 | double metricThreshold, int totalTransactions) { |
---|
| 1265 | |
---|
| 1266 | List<AssociationRule> rules = new ArrayList<AssociationRule>(); |
---|
| 1267 | largeItemSets.sort(); |
---|
| 1268 | Map<Collection<BinaryItem>, Integer> frequencyLookup = |
---|
| 1269 | new HashMap<Collection<BinaryItem>, Integer>(); |
---|
| 1270 | |
---|
| 1271 | Iterator<FrequentBinaryItemSet> setI = largeItemSets.iterator(); |
---|
| 1272 | // process each large item set |
---|
| 1273 | while (setI.hasNext()) { |
---|
| 1274 | FrequentBinaryItemSet fis = setI.next(); |
---|
| 1275 | frequencyLookup.put(fis.getItems(), fis.getSupport()); |
---|
| 1276 | if (fis.getItems().size() > 1) { |
---|
| 1277 | // generate all the possible subsets for the premise |
---|
| 1278 | boolean[] subset = new boolean[fis.getItems().size()]; |
---|
| 1279 | Collection<BinaryItem> premise = null; |
---|
| 1280 | Collection<BinaryItem> consequence = null; |
---|
| 1281 | while ((premise = getPremise(fis, subset)) != null) { |
---|
| 1282 | if (premise.size() > 0 && premise.size() < fis.getItems().size()) { |
---|
| 1283 | consequence = getConsequence(fis, subset); |
---|
| 1284 | int totalSupport = fis.getSupport(); |
---|
| 1285 | int supportPremise = frequencyLookup.get(premise).intValue(); |
---|
| 1286 | int supportConsequence = frequencyLookup.get(consequence).intValue(); |
---|
| 1287 | |
---|
| 1288 | // a candidate rule |
---|
| 1289 | AssociationRule candidate = |
---|
| 1290 | new AssociationRule(premise, consequence, metricToUse, supportPremise, |
---|
| 1291 | supportConsequence, totalSupport, totalTransactions); |
---|
| 1292 | if (candidate.getMetricValue() > metricThreshold) { |
---|
| 1293 | // accept this rule |
---|
| 1294 | rules.add(candidate); |
---|
| 1295 | } |
---|
| 1296 | } |
---|
| 1297 | nextSubset(subset); |
---|
| 1298 | } |
---|
| 1299 | } |
---|
| 1300 | } |
---|
| 1301 | |
---|
| 1302 | return rules; |
---|
| 1303 | } |
---|
| 1304 | } |
---|
| 1305 | |
---|
| 1306 | /** The number of rules to find */ |
---|
| 1307 | protected int m_numRulesToFind = 10; |
---|
| 1308 | //protected double m_upperBoundMinSupport = 0.36; |
---|
| 1309 | |
---|
| 1310 | /** The upper bound on the minimum support */ |
---|
| 1311 | protected double m_upperBoundMinSupport = 1.0; |
---|
| 1312 | |
---|
| 1313 | /** The lower bound on minimum support */ |
---|
| 1314 | protected double m_lowerBoundMinSupport = 0.1; |
---|
| 1315 | |
---|
| 1316 | /** The amount by which to decrease the support in each iteration */ |
---|
| 1317 | protected double m_delta = 0.05; |
---|
| 1318 | |
---|
| 1319 | /** |
---|
| 1320 | * If true, just all rules meeting the lower bound on the minimum |
---|
| 1321 | * support will be found. The number of rules to find will be |
---|
| 1322 | * ignored and the iterative reduction of support will not |
---|
| 1323 | * be done. |
---|
| 1324 | */ |
---|
| 1325 | protected boolean m_findAllRulesForSupportLevel = false; |
---|
| 1326 | |
---|
| 1327 | //protected double m_lowerBoundMinSupport = 0.0; |
---|
| 1328 | |
---|
| 1329 | /** The index (1 based) of binary attributes to treat as the positive value */ |
---|
| 1330 | protected int m_positiveIndex = 2; |
---|
| 1331 | |
---|
| 1332 | protected AssociationRule.METRIC_TYPE m_metric = |
---|
| 1333 | AssociationRule.METRIC_TYPE.CONFIDENCE; |
---|
| 1334 | |
---|
| 1335 | protected double m_metricThreshold = 0.9; |
---|
| 1336 | |
---|
| 1337 | /** Holds the large item sets found */ |
---|
| 1338 | protected FrequentItemSets m_largeItemSets; |
---|
| 1339 | |
---|
| 1340 | /** Holds the rules */ |
---|
| 1341 | protected List<AssociationRule> m_rules; |
---|
| 1342 | |
---|
| 1343 | // maximum number of items in a large item set (zero means no limit) |
---|
| 1344 | protected int m_maxItems = -1; |
---|
| 1345 | |
---|
| 1346 | /** |
---|
| 1347 | * Returns default capabilities of the classifier. |
---|
| 1348 | * |
---|
| 1349 | * @return the capabilities of this classifier |
---|
| 1350 | */ |
---|
| 1351 | public Capabilities getCapabilities() { |
---|
| 1352 | Capabilities result = super.getCapabilities(); |
---|
| 1353 | result.disableAll(); |
---|
| 1354 | |
---|
| 1355 | // enable what we can handle |
---|
| 1356 | |
---|
| 1357 | // attributes |
---|
| 1358 | result.enable(Capability.UNARY_ATTRIBUTES); |
---|
| 1359 | result.enable(Capability.BINARY_ATTRIBUTES); |
---|
| 1360 | result.enable(Capability.MISSING_VALUES); |
---|
| 1361 | |
---|
| 1362 | result.enable(Capability.NO_CLASS); |
---|
| 1363 | |
---|
| 1364 | return result; |
---|
| 1365 | } |
---|
| 1366 | |
---|
| 1367 | /** |
---|
| 1368 | * Returns a string describing this associator |
---|
| 1369 | * |
---|
| 1370 | * @return a description of the evaluator suitable for |
---|
| 1371 | * displaying in the explorer/experimenter gui |
---|
| 1372 | */ |
---|
| 1373 | public String globalInfo() { |
---|
| 1374 | return "Class implementing the FP-growth algorithm for finding" + |
---|
| 1375 | " large item sets without candidate generation. Iteratively" + |
---|
| 1376 | " reduces the minimum support until it finds the required" + |
---|
| 1377 | " number of rules with the given minimum metric." + |
---|
| 1378 | " For more information see:\n\n" + |
---|
| 1379 | getTechnicalInformation().toString(); |
---|
| 1380 | } |
---|
| 1381 | |
---|
| 1382 | /** |
---|
| 1383 | * Returns an instance of a TechnicalInformation object, containing |
---|
| 1384 | * detailed information about the technical background of this class, |
---|
| 1385 | * e.g., paper reference or book this class is based on. |
---|
| 1386 | * |
---|
| 1387 | * @return the technical information about this class |
---|
| 1388 | */ |
---|
| 1389 | public TechnicalInformation getTechnicalInformation() { |
---|
| 1390 | TechnicalInformation result; |
---|
| 1391 | |
---|
| 1392 | result = new TechnicalInformation(Type.INPROCEEDINGS); |
---|
| 1393 | result.setValue(Field.AUTHOR, "J. Han and J.Pei and Y. Yin"); |
---|
| 1394 | result.setValue(Field.TITLE, "Mining frequent patterns without candidate generation"); |
---|
| 1395 | result.setValue(Field.BOOKTITLE, "Proceedings of the 2000 ACM-SIGMID International" + |
---|
| 1396 | " Conference on Management of Data"); |
---|
| 1397 | result.setValue(Field.YEAR, "2000"); |
---|
| 1398 | result.setValue(Field.PAGES, "1-12"); |
---|
| 1399 | |
---|
| 1400 | return result; |
---|
| 1401 | } |
---|
| 1402 | |
---|
| 1403 | |
---|
| 1404 | /** |
---|
| 1405 | * Get the singleton items in the data |
---|
| 1406 | * |
---|
| 1407 | * @param data the Instances to process |
---|
| 1408 | * @return a list of singleton item sets |
---|
| 1409 | * @throws Exception if the singletons can't be found for some reason |
---|
| 1410 | */ |
---|
| 1411 | protected ArrayList<BinaryItem> getSingletons(Instances data) throws Exception { |
---|
| 1412 | ArrayList<BinaryItem> singletons = new ArrayList<BinaryItem>(); |
---|
| 1413 | |
---|
| 1414 | for (int i = 0; i < data.numAttributes(); i++) { |
---|
| 1415 | singletons.add(new BinaryItem(data.attribute(i), m_positiveIndex - 1)); |
---|
| 1416 | } |
---|
| 1417 | |
---|
| 1418 | for (int i = 0; i < data.numInstances(); i++) { |
---|
| 1419 | Instance current = data.instance(i); |
---|
| 1420 | if (current instanceof SparseInstance) { |
---|
| 1421 | for (int j = 0; j < current.numValues(); j++) { |
---|
| 1422 | int attIndex = current.index(j); |
---|
| 1423 | singletons.get(attIndex).increaseFrequency(); |
---|
| 1424 | } |
---|
| 1425 | } else { |
---|
| 1426 | for (int j = 0; j < data.numAttributes(); j++) { |
---|
| 1427 | if (!current.isMissing(j)) { |
---|
| 1428 | if (current.attribute(j).numValues() == 1 |
---|
| 1429 | || current.value(j) == m_positiveIndex - 1) { |
---|
| 1430 | singletons.get(j).increaseFrequency(); |
---|
| 1431 | } |
---|
| 1432 | } |
---|
| 1433 | } |
---|
| 1434 | } |
---|
| 1435 | } |
---|
| 1436 | |
---|
| 1437 | return singletons; |
---|
| 1438 | } |
---|
| 1439 | |
---|
| 1440 | /*protected ArrayList<BinaryItem> getFrequent(ArrayList<BinaryItem> items, int minSupport) { |
---|
| 1441 | ArrayList<BinaryItem> frequent = new ArrayList<BinaryItem>(); |
---|
| 1442 | for (BinaryItem b : items) { |
---|
| 1443 | if (b.getFrequency() > minSupport) { |
---|
| 1444 | frequent.add(b); |
---|
| 1445 | } |
---|
| 1446 | } |
---|
| 1447 | |
---|
| 1448 | // sort in descending order of support |
---|
| 1449 | Collections.sort(frequent); |
---|
| 1450 | return frequent; |
---|
| 1451 | } */ |
---|
| 1452 | |
---|
| 1453 | /** |
---|
| 1454 | * Construct the frequent pattern tree by inserting each transaction |
---|
| 1455 | * in the data into the tree. Only those items from each transaction that |
---|
| 1456 | * meet the minimum support threshold are inserted. |
---|
| 1457 | * |
---|
| 1458 | * @param singletons the singleton item sets |
---|
| 1459 | * @param data the Instances containing the transactions |
---|
| 1460 | * @param minSupport the minimum support |
---|
| 1461 | * @return the root of the tree |
---|
| 1462 | */ |
---|
| 1463 | protected FPTreeRoot buildFPTree(ArrayList<BinaryItem> singletons, |
---|
| 1464 | Instances data, int minSupport) { |
---|
| 1465 | |
---|
| 1466 | FPTreeRoot tree = new FPTreeRoot(); |
---|
| 1467 | |
---|
| 1468 | for (int i = 0; i < data.numInstances(); i++) { |
---|
| 1469 | Instance current = data.instance(i); |
---|
| 1470 | ArrayList<BinaryItem> transaction = new ArrayList<BinaryItem>(); |
---|
| 1471 | if (current instanceof SparseInstance) { |
---|
| 1472 | for (int j = 0; j < current.numValues(); j++) { |
---|
| 1473 | int attIndex = current.index(j); |
---|
| 1474 | if (singletons.get(attIndex).getFrequency() >= minSupport) { |
---|
| 1475 | transaction.add(singletons.get(attIndex)); |
---|
| 1476 | } |
---|
| 1477 | } |
---|
| 1478 | Collections.sort(transaction); |
---|
| 1479 | tree.addItemSet(transaction, 1); |
---|
| 1480 | } else { |
---|
| 1481 | for (int j = 0; j < data.numAttributes(); j++) { |
---|
| 1482 | if (!current.isMissing(j)) { |
---|
| 1483 | if (current.attribute(j).numValues() == 1 |
---|
| 1484 | || current.value(j) == m_positiveIndex - 1) { |
---|
| 1485 | if (singletons.get(j).getFrequency() >= minSupport) { |
---|
| 1486 | transaction.add(singletons.get(j)); |
---|
| 1487 | } |
---|
| 1488 | } |
---|
| 1489 | } |
---|
| 1490 | } |
---|
| 1491 | Collections.sort(transaction); |
---|
| 1492 | tree.addItemSet(transaction, 1); |
---|
| 1493 | } |
---|
| 1494 | } |
---|
| 1495 | |
---|
| 1496 | return tree; |
---|
| 1497 | } |
---|
| 1498 | |
---|
| 1499 | /** |
---|
| 1500 | * Find large item sets in the FP-tree. |
---|
| 1501 | * |
---|
| 1502 | * @param tree the root of the tree to mine |
---|
| 1503 | * @param largeItemSets holds the large item sets found |
---|
| 1504 | * @param recursionLevel the recursion level for the current projected |
---|
| 1505 | * counts |
---|
| 1506 | * @param conditionalItems the current set of items that the current |
---|
| 1507 | * (projected) tree is conditional on |
---|
| 1508 | * @param minSupport the minimum acceptable support |
---|
| 1509 | */ |
---|
| 1510 | protected void mineTree(FPTreeRoot tree, FrequentItemSets largeItemSets, |
---|
| 1511 | int recursionLevel, FrequentBinaryItemSet conditionalItems, int minSupport) { |
---|
| 1512 | |
---|
| 1513 | if (!tree.isEmpty(recursionLevel)) { |
---|
| 1514 | if (m_maxItems > 0 && recursionLevel >= m_maxItems) { |
---|
| 1515 | // don't mine any further |
---|
| 1516 | return; |
---|
| 1517 | } |
---|
| 1518 | |
---|
| 1519 | Map<BinaryItem, FPTreeRoot.Header> headerTable = tree.getHeaderTable(); |
---|
| 1520 | Set<BinaryItem> keys = headerTable.keySet(); |
---|
| 1521 | // System.err.println("Number of freq item sets collected " + largeItemSets.size()); |
---|
| 1522 | Iterator<BinaryItem> i = keys.iterator(); |
---|
| 1523 | while (i.hasNext()) { |
---|
| 1524 | BinaryItem item = i.next(); |
---|
| 1525 | FPTreeRoot.Header itemHeader = headerTable.get(item); |
---|
| 1526 | |
---|
| 1527 | // check for minimum support at this level |
---|
| 1528 | int support = itemHeader.getProjectedCounts().getCount(recursionLevel); |
---|
| 1529 | if (support >= minSupport) { |
---|
| 1530 | // process header list at this recursion level |
---|
| 1531 | for (FPTreeNode n : itemHeader.getHeaderList()) { |
---|
| 1532 | // push count up path to root |
---|
| 1533 | int currentCount = n.getProjectedCount(recursionLevel); |
---|
| 1534 | if (currentCount > 0) { |
---|
| 1535 | FPTreeNode temp = n.getParent(); |
---|
| 1536 | while (temp != tree) { |
---|
| 1537 | // set/increase for the node |
---|
| 1538 | temp.increaseProjectedCount(recursionLevel + 1, currentCount); |
---|
| 1539 | |
---|
| 1540 | // set/increase for the header table |
---|
| 1541 | headerTable.get(temp.getItem()). |
---|
| 1542 | getProjectedCounts().increaseCount(recursionLevel + 1, currentCount); |
---|
| 1543 | |
---|
| 1544 | temp = temp.getParent(); |
---|
| 1545 | } |
---|
| 1546 | } |
---|
| 1547 | } |
---|
| 1548 | |
---|
| 1549 | FrequentBinaryItemSet newConditional = |
---|
| 1550 | (FrequentBinaryItemSet) conditionalItems.clone(); |
---|
| 1551 | |
---|
| 1552 | // this item gets added to the conditional items |
---|
| 1553 | newConditional.addItem(item); |
---|
| 1554 | newConditional.setSupport(support); |
---|
| 1555 | |
---|
| 1556 | // now add this conditional item set to the list of large item sets |
---|
| 1557 | largeItemSets.addItemSet(newConditional); |
---|
| 1558 | |
---|
| 1559 | // now recursively process the new tree |
---|
| 1560 | mineTree(tree, largeItemSets, recursionLevel + 1, newConditional, |
---|
| 1561 | minSupport); |
---|
| 1562 | |
---|
| 1563 | // reverse the propagated counts |
---|
| 1564 | for (FPTreeNode n : itemHeader.getHeaderList()) { |
---|
| 1565 | FPTreeNode temp = n.getParent(); |
---|
| 1566 | while (temp != tree) { |
---|
| 1567 | temp.removeProjectedCount(recursionLevel + 1); |
---|
| 1568 | temp = temp.getParent(); |
---|
| 1569 | } |
---|
| 1570 | } |
---|
| 1571 | |
---|
| 1572 | // reverse the propagated counts in the header list |
---|
| 1573 | // at this recursion level |
---|
| 1574 | for (FPTreeRoot.Header h : headerTable.values()) { |
---|
| 1575 | h.getProjectedCounts().removeCount(recursionLevel + 1); |
---|
| 1576 | } |
---|
| 1577 | } |
---|
| 1578 | } |
---|
| 1579 | } |
---|
| 1580 | } |
---|
| 1581 | |
---|
| 1582 | /** |
---|
| 1583 | * Construct a new FPGrowth object. |
---|
| 1584 | */ |
---|
| 1585 | public FPGrowth() { |
---|
| 1586 | resetOptions(); |
---|
| 1587 | } |
---|
| 1588 | |
---|
| 1589 | /** |
---|
| 1590 | * Reset all options to their default values. |
---|
| 1591 | */ |
---|
| 1592 | public void resetOptions() { |
---|
| 1593 | m_delta = 0.05; |
---|
| 1594 | m_metricThreshold = 0.9; |
---|
| 1595 | m_numRulesToFind = 10; |
---|
| 1596 | m_lowerBoundMinSupport = 0.1; |
---|
| 1597 | m_upperBoundMinSupport = 1.0; |
---|
| 1598 | // m_minSupport = -1; |
---|
| 1599 | m_positiveIndex = 2; |
---|
| 1600 | } |
---|
| 1601 | |
---|
| 1602 | /** |
---|
| 1603 | * Tip text for this property suitable for displaying |
---|
| 1604 | * in the GUI. |
---|
| 1605 | * |
---|
| 1606 | * @return the tip text for this property. |
---|
| 1607 | */ |
---|
| 1608 | public String positiveIndexTipText() { |
---|
| 1609 | return "Set the index of binary valued attributes that is to be considered" + |
---|
| 1610 | " the positive index. Has no effect for sparse data (in this case" + |
---|
| 1611 | " the first index (i.e. non-zero values) is always treated as " + |
---|
| 1612 | " positive. Also has no effect for unary valued attributes (i.e." + |
---|
| 1613 | " when using the Weka Apriori-style format for market basket data," + |
---|
| 1614 | " which uses missing value \"?\" to indicate" + |
---|
| 1615 | " absence of an item."; |
---|
| 1616 | } |
---|
| 1617 | |
---|
| 1618 | /** |
---|
| 1619 | * Set the index of the attribute value to consider as positive |
---|
| 1620 | * for binary attributes in normal dense instances. Index 1 is always |
---|
| 1621 | * used for sparse instances. |
---|
| 1622 | * |
---|
| 1623 | * @param index the index to use for positive values in binary attributes. |
---|
| 1624 | */ |
---|
| 1625 | public void setPositiveIndex(int index) { |
---|
| 1626 | m_positiveIndex = index; |
---|
| 1627 | } |
---|
| 1628 | |
---|
| 1629 | /** |
---|
| 1630 | * Get the index of the attribute value to consider as positive |
---|
| 1631 | * for binary attributes in normal dense instances. Index 1 is always |
---|
| 1632 | * used for sparse instances. |
---|
| 1633 | * |
---|
| 1634 | * @return the index to use for positive values in binary attributes. |
---|
| 1635 | */ |
---|
| 1636 | public int getPositiveIndex() { |
---|
| 1637 | return m_positiveIndex; |
---|
| 1638 | } |
---|
| 1639 | |
---|
| 1640 | /** |
---|
| 1641 | * Set the desired number of rules to find. |
---|
| 1642 | * |
---|
| 1643 | * @param numR the number of rules to find. |
---|
| 1644 | */ |
---|
| 1645 | public void setNumRulesToFind(int numR) { |
---|
| 1646 | m_numRulesToFind = numR; |
---|
| 1647 | } |
---|
| 1648 | |
---|
| 1649 | /** |
---|
| 1650 | * Get the number of rules to find. |
---|
| 1651 | * |
---|
| 1652 | * @return the number of rules to find. |
---|
| 1653 | */ |
---|
| 1654 | public int getNumRulesToFind() { |
---|
| 1655 | return m_numRulesToFind; |
---|
| 1656 | } |
---|
| 1657 | |
---|
| 1658 | /** |
---|
| 1659 | * Tip text for this property suitable for displaying |
---|
| 1660 | * in the GUI. |
---|
| 1661 | * |
---|
| 1662 | * @return the tip text for this property. |
---|
| 1663 | */ |
---|
| 1664 | public String numRulesToFindTipText() { |
---|
| 1665 | return "The number of rules to output"; |
---|
| 1666 | } |
---|
| 1667 | |
---|
| 1668 | /** |
---|
| 1669 | * Set the metric type to use. |
---|
| 1670 | * |
---|
| 1671 | * @param d the metric type |
---|
| 1672 | */ |
---|
| 1673 | public void setMetricType(SelectedTag d) { |
---|
| 1674 | int ordinal = d.getSelectedTag().getID(); |
---|
| 1675 | for (AssociationRule.METRIC_TYPE m : AssociationRule.METRIC_TYPE.values()) { |
---|
| 1676 | if (m.ordinal() == ordinal) { |
---|
| 1677 | m_metric = m; |
---|
| 1678 | break; |
---|
| 1679 | } |
---|
| 1680 | } |
---|
| 1681 | } |
---|
| 1682 | |
---|
| 1683 | /** |
---|
| 1684 | * Set the maximum number of items to include in large items sets. |
---|
| 1685 | * |
---|
| 1686 | * @param max the maxim number of items to include in large item sets. |
---|
| 1687 | */ |
---|
| 1688 | public void setMaxNumberOfItems(int max) { |
---|
| 1689 | m_maxItems = max; |
---|
| 1690 | } |
---|
| 1691 | |
---|
| 1692 | /** |
---|
| 1693 | * Gets the maximum number of items to be included in large item sets. |
---|
| 1694 | * |
---|
| 1695 | * @return the maximum number of items to be included in large items sets. |
---|
| 1696 | */ |
---|
| 1697 | public int getMaxNumberOfItems() { |
---|
| 1698 | return m_maxItems; |
---|
| 1699 | } |
---|
| 1700 | |
---|
| 1701 | /** |
---|
| 1702 | * Tip text for this property suitable for displaying |
---|
| 1703 | * in the GUI. |
---|
| 1704 | * |
---|
| 1705 | * @return the tip text for this property. |
---|
| 1706 | */ |
---|
| 1707 | public String maxNumberOfItemsTipText() { |
---|
| 1708 | return "The maximum number of items to include in frequent item sets. -1 " + |
---|
| 1709 | "means no limit."; |
---|
| 1710 | } |
---|
| 1711 | |
---|
| 1712 | /** |
---|
| 1713 | * Get the metric type to use. |
---|
| 1714 | * |
---|
| 1715 | * @return the metric type to use. |
---|
| 1716 | */ |
---|
| 1717 | public SelectedTag getMetricType() { |
---|
| 1718 | return new SelectedTag(m_metric.ordinal(), AssociationRule.TAGS_SELECTION); |
---|
| 1719 | } |
---|
| 1720 | |
---|
| 1721 | /** |
---|
| 1722 | * Tip text for this property suitable for displaying |
---|
| 1723 | * in the GUI. |
---|
| 1724 | * |
---|
| 1725 | * @return the tip text for this property. |
---|
| 1726 | */ |
---|
| 1727 | public String metricTypeTipText() { |
---|
| 1728 | return "Set the type of metric by which to rank rules. Confidence is " |
---|
| 1729 | +"the proportion of the examples covered by the premise that are also " |
---|
| 1730 | +"covered by the consequence(Class association rules can only be mined using confidence). Lift is confidence divided by the " |
---|
| 1731 | +"proportion of all examples that are covered by the consequence. This " |
---|
| 1732 | +"is a measure of the importance of the association that is independent " |
---|
| 1733 | +"of support. Leverage is the proportion of additional examples covered " |
---|
| 1734 | +"by both the premise and consequence above those expected if the " |
---|
| 1735 | +"premise and consequence were independent of each other. The total " |
---|
| 1736 | +"number of examples that this represents is presented in brackets " |
---|
| 1737 | +"following the leverage. Conviction is " |
---|
| 1738 | +"another measure of departure from independence."; |
---|
| 1739 | } |
---|
| 1740 | |
---|
| 1741 | /** |
---|
| 1742 | * Returns the tip text for this property |
---|
| 1743 | * |
---|
| 1744 | * @return tip text for this property suitable for |
---|
| 1745 | * displaying in the explorer/experimenter gui |
---|
| 1746 | */ |
---|
| 1747 | public String minMetricTipText() { |
---|
| 1748 | return "Minimum metric score. Consider only rules with scores higher than " |
---|
| 1749 | +"this value."; |
---|
| 1750 | } |
---|
| 1751 | |
---|
| 1752 | /** |
---|
| 1753 | * Get the value of minConfidence. |
---|
| 1754 | * |
---|
| 1755 | * @return Value of minConfidence. |
---|
| 1756 | */ |
---|
| 1757 | public double getMinMetric() { |
---|
| 1758 | |
---|
| 1759 | return m_metricThreshold; |
---|
| 1760 | } |
---|
| 1761 | |
---|
| 1762 | /** |
---|
| 1763 | * Set the value of minConfidence. |
---|
| 1764 | * |
---|
| 1765 | * @param v Value to assign to minConfidence. |
---|
| 1766 | */ |
---|
| 1767 | public void setMinMetric(double v) { |
---|
| 1768 | |
---|
| 1769 | m_metricThreshold = v; |
---|
| 1770 | } |
---|
| 1771 | |
---|
| 1772 | /** |
---|
| 1773 | * Returns the tip text for this property |
---|
| 1774 | * @return tip text for this property suitable for |
---|
| 1775 | * displaying in the explorer/experimenter gui |
---|
| 1776 | */ |
---|
| 1777 | public String deltaTipText() { |
---|
| 1778 | return "Iteratively decrease support by this factor. Reduces support " |
---|
| 1779 | +"until min support is reached or required number of rules has been " |
---|
| 1780 | +"generated."; |
---|
| 1781 | } |
---|
| 1782 | |
---|
| 1783 | /** |
---|
| 1784 | * Get the value of delta. |
---|
| 1785 | * |
---|
| 1786 | * @return Value of delta. |
---|
| 1787 | */ |
---|
| 1788 | public double getDelta() { |
---|
| 1789 | |
---|
| 1790 | return m_delta; |
---|
| 1791 | } |
---|
| 1792 | |
---|
| 1793 | /** |
---|
| 1794 | * Set the value of delta. |
---|
| 1795 | * |
---|
| 1796 | * @param v Value to assign to delta. |
---|
| 1797 | */ |
---|
| 1798 | public void setDelta(double v) { |
---|
| 1799 | |
---|
| 1800 | m_delta = v; |
---|
| 1801 | } |
---|
| 1802 | |
---|
| 1803 | /** |
---|
| 1804 | * Returns the tip text for this property |
---|
| 1805 | * @return tip text for this property suitable for |
---|
| 1806 | * displaying in the explorer/experimenter gui |
---|
| 1807 | */ |
---|
| 1808 | public String lowerBoundMinSupportTipText() { |
---|
| 1809 | return "Lower bound for minimum support."; |
---|
| 1810 | } |
---|
| 1811 | |
---|
| 1812 | /** |
---|
| 1813 | * Get the value of lowerBoundMinSupport. |
---|
| 1814 | * |
---|
| 1815 | * @return Value of lowerBoundMinSupport. |
---|
| 1816 | */ |
---|
| 1817 | public double getLowerBoundMinSupport() { |
---|
| 1818 | |
---|
| 1819 | return m_lowerBoundMinSupport; |
---|
| 1820 | } |
---|
| 1821 | |
---|
| 1822 | /** |
---|
| 1823 | * Set the value of lowerBoundMinSupport. |
---|
| 1824 | * |
---|
| 1825 | * @param v Value to assign to lowerBoundMinSupport. |
---|
| 1826 | */ |
---|
| 1827 | public void setLowerBoundMinSupport(double v) { |
---|
| 1828 | |
---|
| 1829 | m_lowerBoundMinSupport = v; |
---|
| 1830 | } |
---|
| 1831 | |
---|
| 1832 | /** |
---|
| 1833 | * Returns the tip text for this property |
---|
| 1834 | * @return tip text for this property suitable for |
---|
| 1835 | * displaying in the explorer/experimenter gui |
---|
| 1836 | */ |
---|
| 1837 | public String upperBoundMinSupportTipText() { |
---|
| 1838 | return "Upper bound for minimum support. Start iteratively decreasing " |
---|
| 1839 | +"minimum support from this value."; |
---|
| 1840 | } |
---|
| 1841 | |
---|
| 1842 | /** |
---|
| 1843 | * Get the value of upperBoundMinSupport. |
---|
| 1844 | * |
---|
| 1845 | * @return Value of upperBoundMinSupport. |
---|
| 1846 | */ |
---|
| 1847 | public double getUpperBoundMinSupport() { |
---|
| 1848 | |
---|
| 1849 | return m_upperBoundMinSupport; |
---|
| 1850 | } |
---|
| 1851 | |
---|
| 1852 | /** |
---|
| 1853 | * Set the value of upperBoundMinSupport. |
---|
| 1854 | * |
---|
| 1855 | * @param v Value to assign to upperBoundMinSupport. |
---|
| 1856 | */ |
---|
| 1857 | public void setUpperBoundMinSupport(double v) { |
---|
| 1858 | |
---|
| 1859 | m_upperBoundMinSupport = v; |
---|
| 1860 | } |
---|
| 1861 | |
---|
| 1862 | /** |
---|
| 1863 | * Tip text for this property suitable for displaying |
---|
| 1864 | * in the GUI. |
---|
| 1865 | * |
---|
| 1866 | * @return the tip text for this property. |
---|
| 1867 | */ |
---|
| 1868 | public String findAllRulesForSupportLevelTipText() { |
---|
| 1869 | return "Find all rules that meet " + |
---|
| 1870 | "the lower bound on minimum support and the minimum metric constraint. " + |
---|
| 1871 | "Turning this mode on will disable the iterative support reduction " + |
---|
| 1872 | "procedure to find the specified number of rules."; |
---|
| 1873 | } |
---|
| 1874 | |
---|
| 1875 | /** |
---|
| 1876 | * If true then turn off the iterative support reduction method |
---|
| 1877 | * of finding x rules that meet the minimum support and metric |
---|
| 1878 | * thresholds and just return all the rules that meet the |
---|
| 1879 | * lower bound on minimum support and the minimum metric. |
---|
| 1880 | * |
---|
| 1881 | * @param s true if all rules meeting the lower bound on the support |
---|
| 1882 | * and minimum metric thresholds are to be found. |
---|
| 1883 | */ |
---|
| 1884 | public void setFindAllRulesForSupportLevel(boolean s) { |
---|
| 1885 | m_findAllRulesForSupportLevel = s; |
---|
| 1886 | } |
---|
| 1887 | |
---|
| 1888 | /** |
---|
| 1889 | * Get whether all rules meeting the lower bound on min support |
---|
| 1890 | * and the minimum metric threshold are to be found. |
---|
| 1891 | * |
---|
| 1892 | * @return true if all rules meeting the lower bound on min |
---|
| 1893 | * support and the min metric threshold are to be found. |
---|
| 1894 | */ |
---|
| 1895 | public boolean getFindAllRulesForSupportLevel() { |
---|
| 1896 | return m_findAllRulesForSupportLevel; |
---|
| 1897 | } |
---|
| 1898 | |
---|
| 1899 | /* public void setMinimumSupport(double minSupp) { |
---|
| 1900 | m_minSupport = minSupp; |
---|
| 1901 | } |
---|
| 1902 | |
---|
| 1903 | public double getMinimumSupport() { |
---|
| 1904 | return m_minSupport; |
---|
| 1905 | } */ |
---|
| 1906 | |
---|
| 1907 | /** |
---|
| 1908 | * Gets the list of mined association rules. |
---|
| 1909 | * |
---|
| 1910 | * @return the list of association rules discovered during mining. |
---|
| 1911 | * Returns null if mining hasn't been performed yet. |
---|
| 1912 | */ |
---|
| 1913 | public List<AssociationRule> getAssociationRules() { |
---|
| 1914 | return m_rules; |
---|
| 1915 | } |
---|
| 1916 | |
---|
| 1917 | /** |
---|
| 1918 | * Returns an enumeration describing the available options. |
---|
| 1919 | * |
---|
| 1920 | * @return an enumeration of all the available options. |
---|
| 1921 | */ |
---|
| 1922 | public Enumeration<Option> listOptions() { |
---|
| 1923 | Vector<Option> newVector = new Vector<Option>(); |
---|
| 1924 | |
---|
| 1925 | String string00 = "\tSet the index of the attribute value to consider as 'positive'\n\t" |
---|
| 1926 | + "for binary attributes in normal dense instances. Index 2 is always\n\t" |
---|
| 1927 | + "used for sparse instances. (default = 2)"; |
---|
| 1928 | String string0 = "\tThe maximum number of items to include " + |
---|
| 1929 | "in large items sets (and rules). (default " + |
---|
| 1930 | "= -1, i.e. no limit.)"; |
---|
| 1931 | |
---|
| 1932 | String string1 = "\tThe required number of rules. (default = " |
---|
| 1933 | + m_numRulesToFind + ")"; |
---|
| 1934 | String string2 = "\tThe minimum metric score of a rule. (default" + |
---|
| 1935 | " = " + m_metricThreshold + ")"; |
---|
| 1936 | String string3 = "\tThe metric by which to rank rules. (default" |
---|
| 1937 | + " = confidence)"; |
---|
| 1938 | String string4 = "\tThe lower bound for the minimum support. (default = " |
---|
| 1939 | + m_lowerBoundMinSupport + ")"; |
---|
| 1940 | String string5 = "\tUpper bound for minimum support. " |
---|
| 1941 | + "(default = 1.0)"; |
---|
| 1942 | String string6 = "\tThe delta by which the minimum support is decreased in\n" |
---|
| 1943 | + "\teach iteration. (default = " + m_delta + ")"; |
---|
| 1944 | String string7 = "\tFind all rules that meet the lower bound on\n\t" + |
---|
| 1945 | "minimum support and the minimum metric constraint.\n\t" + |
---|
| 1946 | "Turning this mode on will disable the iterative support reduction\n\t" + |
---|
| 1947 | "procedure to find the specified number of rules."; |
---|
| 1948 | |
---|
| 1949 | newVector.add(new Option(string00, "P", 1, "-P <attribute index of positive value>")); |
---|
| 1950 | newVector.add(new Option(string0, "I", 1, "-I <max items>")); |
---|
| 1951 | newVector.add(new Option(string1, "N", 1, "-N <require number of rules>")); |
---|
| 1952 | newVector.add(new Option(string3, "T", 1, "-T <0=confidence | 1=lift | " |
---|
| 1953 | + "2=leverage | 3=Conviction>")); |
---|
| 1954 | newVector.add(new Option(string2, "C", 1, "-C <minimum metric score of a rule>")); |
---|
| 1955 | newVector.add(new Option(string5, "U", 1, "-U <upper bound for minimum support>")); |
---|
| 1956 | newVector.add(new Option(string4, "M", 1, "-M <lower bound for minimum support>")); |
---|
| 1957 | newVector.add(new Option(string6, "D", 1, "-D <delta for minimum support>")); |
---|
| 1958 | newVector.add(new Option(string7, "S", 0, "-S")); |
---|
| 1959 | |
---|
| 1960 | return newVector.elements(); |
---|
| 1961 | } |
---|
| 1962 | |
---|
| 1963 | /** |
---|
| 1964 | * |
---|
| 1965 | * Parses a given list of options. <p/> |
---|
| 1966 | * |
---|
| 1967 | <!-- options-start --> |
---|
| 1968 | * Valid options are: <p/> |
---|
| 1969 | * |
---|
| 1970 | * <pre> -P <attribute index of positive value> |
---|
| 1971 | * Set the index of the attribute value to consider as 'positive' |
---|
| 1972 | * for binary attributes in normal dense instances. Index 2 is always |
---|
| 1973 | * used for sparse instances. (default = 2)</pre> |
---|
| 1974 | * |
---|
| 1975 | * <pre> -I <max items> |
---|
| 1976 | * The maximum number of items to include in large items sets (and rules). (default = -1, i.e. no limit.)</pre> |
---|
| 1977 | * |
---|
| 1978 | * <pre> -N <require number of rules> |
---|
| 1979 | * The required number of rules. (default = 10)</pre> |
---|
| 1980 | * |
---|
| 1981 | * <pre> -T <0=confidence | 1=lift | 2=leverage | 3=Conviction> |
---|
| 1982 | * The metric by which to rank rules. (default = confidence)</pre> |
---|
| 1983 | * |
---|
| 1984 | * <pre> -C <minimum metric score of a rule> |
---|
| 1985 | * The minimum metric score of a rule. (default = 0.9)</pre> |
---|
| 1986 | * |
---|
| 1987 | * <pre> -U <upper bound for minimum support> |
---|
| 1988 | * Upper bound for minimum support. (default = 1.0)</pre> |
---|
| 1989 | * |
---|
| 1990 | * <pre> -M <lower bound for minimum support> |
---|
| 1991 | * The lower bound for the minimum support. (default = 0.1)</pre> |
---|
| 1992 | * |
---|
| 1993 | * <pre> -D <delta for minimum support> |
---|
| 1994 | * The delta by which the minimum support is decreased in |
---|
| 1995 | * each iteration. (default = 0.05)</pre> |
---|
| 1996 | * |
---|
| 1997 | * <pre> -S |
---|
| 1998 | * Find all rules that meet the lower bound on |
---|
| 1999 | * minimum support and the minimum metric constraint. |
---|
| 2000 | * Turning this mode on will disable the iterative support reduction |
---|
| 2001 | * procedure to find the specified number of rules.</pre> |
---|
| 2002 | * |
---|
| 2003 | <!-- options-end --> |
---|
| 2004 | * |
---|
| 2005 | * @param options the list of options as an array of strings |
---|
| 2006 | * @throws Exception if an option is not supported |
---|
| 2007 | */ |
---|
| 2008 | public void setOptions(String[] options) throws Exception { |
---|
| 2009 | resetOptions(); |
---|
| 2010 | String positiveIndexString = Utils.getOption('P', options); |
---|
| 2011 | String maxItemsString = Utils.getOption('I', options); |
---|
| 2012 | String numRulesString = Utils.getOption('N', options); |
---|
| 2013 | String minMetricString = Utils.getOption('C', options); |
---|
| 2014 | String metricTypeString = Utils.getOption("T", options); |
---|
| 2015 | String lowerBoundSupportString = Utils.getOption("M", options); |
---|
| 2016 | String upperBoundSupportString = Utils.getOption("U", options); |
---|
| 2017 | String deltaString = Utils.getOption("D", options); |
---|
| 2018 | |
---|
| 2019 | if (positiveIndexString.length() != 0) { |
---|
| 2020 | setPositiveIndex(Integer.parseInt(positiveIndexString)); |
---|
| 2021 | } |
---|
| 2022 | |
---|
| 2023 | if (maxItemsString.length() != 0) { |
---|
| 2024 | setMaxNumberOfItems(Integer.parseInt(maxItemsString)); |
---|
| 2025 | } |
---|
| 2026 | |
---|
| 2027 | if (metricTypeString.length() != 0) { |
---|
| 2028 | setMetricType(new SelectedTag(Integer.parseInt(metricTypeString), |
---|
| 2029 | AssociationRule.TAGS_SELECTION)); |
---|
| 2030 | } |
---|
| 2031 | |
---|
| 2032 | if (numRulesString.length() != 0) { |
---|
| 2033 | setNumRulesToFind(Integer.parseInt(numRulesString)); |
---|
| 2034 | } |
---|
| 2035 | |
---|
| 2036 | if (minMetricString.length() != 0) { |
---|
| 2037 | setMinMetric(Double.parseDouble(minMetricString)); |
---|
| 2038 | } |
---|
| 2039 | |
---|
| 2040 | if (deltaString.length() != 0) { |
---|
| 2041 | setDelta(Double.parseDouble(deltaString)); |
---|
| 2042 | } |
---|
| 2043 | |
---|
| 2044 | if (lowerBoundSupportString.length() != 0) { |
---|
| 2045 | setLowerBoundMinSupport(Double.parseDouble(lowerBoundSupportString)); |
---|
| 2046 | } |
---|
| 2047 | |
---|
| 2048 | if (upperBoundSupportString.length() != 0) { |
---|
| 2049 | setUpperBoundMinSupport(Double.parseDouble(upperBoundSupportString)); |
---|
| 2050 | } |
---|
| 2051 | |
---|
| 2052 | setFindAllRulesForSupportLevel(Utils.getFlag('S', options)); |
---|
| 2053 | } |
---|
| 2054 | |
---|
| 2055 | /** |
---|
| 2056 | * Gets the current settings of the classifier. |
---|
| 2057 | * |
---|
| 2058 | * @return an array of strings suitable for passing to setOptions |
---|
| 2059 | */ |
---|
| 2060 | public String[] getOptions() { |
---|
| 2061 | ArrayList<String> options = new ArrayList<String>(); |
---|
| 2062 | |
---|
| 2063 | options.add("-P"); options.add("" + getPositiveIndex()); |
---|
| 2064 | options.add("-I"); options.add("" + getMaxNumberOfItems()); |
---|
| 2065 | options.add("-N"); options.add("" + getNumRulesToFind()); |
---|
| 2066 | options.add("-T"); options.add("" + getMetricType().getSelectedTag().getID()); |
---|
| 2067 | options.add("-C"); options.add("" + getMinMetric()); |
---|
| 2068 | options.add("-D"); options.add("" + getDelta()); |
---|
| 2069 | options.add("-U"); options.add("" + getUpperBoundMinSupport()); |
---|
| 2070 | options.add("-M"); options.add("" + getLowerBoundMinSupport()); |
---|
| 2071 | if (getFindAllRulesForSupportLevel()) { |
---|
| 2072 | options.add("-S"); |
---|
| 2073 | } |
---|
| 2074 | |
---|
| 2075 | return options.toArray(new String[1]); |
---|
| 2076 | } |
---|
| 2077 | |
---|
| 2078 | /** |
---|
| 2079 | * Method that generates all large item sets with a minimum support, and from |
---|
| 2080 | * these all association rules with a minimum metric (i.e. confidence, |
---|
| 2081 | * lift etc.). |
---|
| 2082 | * |
---|
| 2083 | * @param data the instances to be used for generating the associations |
---|
| 2084 | * @throws Exception if rules can't be built successfully |
---|
| 2085 | */ |
---|
| 2086 | public void buildAssociations(Instances data) throws Exception { |
---|
| 2087 | |
---|
| 2088 | // can we handle the data? |
---|
| 2089 | getCapabilities().testWithFail(data); |
---|
| 2090 | |
---|
| 2091 | double currentSupport = m_upperBoundMinSupport; |
---|
| 2092 | |
---|
| 2093 | if (m_findAllRulesForSupportLevel) { |
---|
| 2094 | currentSupport = m_lowerBoundMinSupport; |
---|
| 2095 | } |
---|
| 2096 | // first compute singletons |
---|
| 2097 | ArrayList<BinaryItem> singletons = getSingletons(data); |
---|
| 2098 | //ArrayList<BinaryItem> singletonsCopy = new ArrayList<BinaryItem>(singletons); |
---|
| 2099 | /* Collections.sort(singletonsCopy); |
---|
| 2100 | for (int i = 0; i < singletonsCopy.size(); i++) { |
---|
| 2101 | System.out.println(singletonsCopy.get(i).toString(true)); |
---|
| 2102 | } |
---|
| 2103 | System.out.println("---------"); */ |
---|
| 2104 | // System.out.println("Finished finding singletons..."); |
---|
| 2105 | |
---|
| 2106 | // while not enough rules |
---|
| 2107 | do { |
---|
| 2108 | int currentSupportAsInstances = (currentSupport > 1) |
---|
| 2109 | ? (int)currentSupport |
---|
| 2110 | : (int)Math.ceil(currentSupport * data.numInstances()); |
---|
| 2111 | |
---|
| 2112 | //System.err.println("Current support " + currentSupportAsInstances); |
---|
| 2113 | //ArrayList<BinaryItem> prunedSingletons = removeNonFrequent(singletons); |
---|
| 2114 | |
---|
| 2115 | // build the FPTree |
---|
| 2116 | FPTreeRoot tree = buildFPTree(singletons, data, currentSupportAsInstances); |
---|
| 2117 | // System.out.println("Finished building tree..."); |
---|
| 2118 | // System.out.println(tree.toString(0)); |
---|
| 2119 | /*System.out.println(tree.printHeaderTable(0)); */ |
---|
| 2120 | |
---|
| 2121 | FrequentItemSets largeItemSets = new FrequentItemSets(data.numInstances()); |
---|
| 2122 | |
---|
| 2123 | // mine the tree |
---|
| 2124 | FrequentBinaryItemSet conditionalItems = |
---|
| 2125 | new FrequentBinaryItemSet(new ArrayList<BinaryItem>(), 0); |
---|
| 2126 | mineTree(tree, largeItemSets, 0, conditionalItems, currentSupportAsInstances); |
---|
| 2127 | |
---|
| 2128 | m_largeItemSets = largeItemSets; |
---|
| 2129 | // System.err.println("Number of large item sets: " + m_largeItemSets.size()); |
---|
| 2130 | // System.err.println(m_largeItemSets.toString(100)); |
---|
| 2131 | |
---|
| 2132 | // m_largeItemSets.sort(compF); |
---|
| 2133 | // System.err.println("Finished mining tree..."); |
---|
| 2134 | |
---|
| 2135 | // save memory |
---|
| 2136 | tree = null; |
---|
| 2137 | |
---|
| 2138 | m_rules = |
---|
| 2139 | AssociationRule.generateRulesBruteForce(m_largeItemSets, m_metric, |
---|
| 2140 | m_metricThreshold, data.numInstances()); |
---|
| 2141 | |
---|
| 2142 | if (!m_findAllRulesForSupportLevel) { |
---|
| 2143 | currentSupport -= m_delta; |
---|
| 2144 | if (currentSupport < m_lowerBoundMinSupport) { |
---|
| 2145 | if (currentSupport + m_delta > m_lowerBoundMinSupport) { |
---|
| 2146 | // ensure that the lower bound does get evaluated |
---|
| 2147 | currentSupport = m_lowerBoundMinSupport; |
---|
| 2148 | } else { |
---|
| 2149 | break; |
---|
| 2150 | } |
---|
| 2151 | } |
---|
| 2152 | } else { |
---|
| 2153 | // just break out of the loop as we are just finding all rules |
---|
| 2154 | // with a minimum support + metric |
---|
| 2155 | break; |
---|
| 2156 | } |
---|
| 2157 | } while (m_rules.size() < m_numRulesToFind); |
---|
| 2158 | |
---|
| 2159 | Collections.sort(m_rules); |
---|
| 2160 | // for (AssociationRule) |
---|
| 2161 | |
---|
| 2162 | // System.out.println(graph(tree)); |
---|
| 2163 | } |
---|
| 2164 | |
---|
| 2165 | /** |
---|
| 2166 | * Output the association rules. |
---|
| 2167 | * |
---|
| 2168 | * @return a string representation of the model. |
---|
| 2169 | */ |
---|
| 2170 | public String toString() { |
---|
| 2171 | // return m_largeItemSets.toString(m_numItemSetsToFind); |
---|
| 2172 | if (m_rules == null) { |
---|
| 2173 | return "FPGrowth hasn't been trained yet!"; |
---|
| 2174 | } |
---|
| 2175 | |
---|
| 2176 | StringBuffer result = new StringBuffer(); |
---|
| 2177 | int numRules = (m_rules.size() < m_numRulesToFind) |
---|
| 2178 | ? m_rules.size() |
---|
| 2179 | : m_numRulesToFind; |
---|
| 2180 | |
---|
| 2181 | if (m_rules.size() == 0) { |
---|
| 2182 | return "No rules found!"; |
---|
| 2183 | } else { |
---|
| 2184 | result.append("FPGrowth found " + m_rules.size() + " rules"); |
---|
| 2185 | if (!m_findAllRulesForSupportLevel) { |
---|
| 2186 | result.append(" (displaying top " + numRules + ")"); |
---|
| 2187 | } |
---|
| 2188 | result.append("\n\n"); |
---|
| 2189 | } |
---|
| 2190 | |
---|
| 2191 | int count = 0; |
---|
| 2192 | for (AssociationRule r : m_rules) { |
---|
| 2193 | result.append(Utils.doubleToString((double)count+1, |
---|
| 2194 | (int)(Math.log(numRules)/Math.log(10)+1), 0) + ". "); |
---|
| 2195 | result.append(r + "\n"); |
---|
| 2196 | count++; |
---|
| 2197 | if (!m_findAllRulesForSupportLevel && count == m_numRulesToFind) { |
---|
| 2198 | break; |
---|
| 2199 | } |
---|
| 2200 | } |
---|
| 2201 | return result.toString(); |
---|
| 2202 | } |
---|
| 2203 | |
---|
| 2204 | /** |
---|
| 2205 | * Assemble a dot graph representation of the FP-tree. |
---|
| 2206 | * |
---|
| 2207 | * @param tree the root of the FP-tree |
---|
| 2208 | * @return a graph representation as a String in dot format. |
---|
| 2209 | */ |
---|
| 2210 | public String graph(FPTreeRoot tree) { |
---|
| 2211 | //int maxID = tree.assignIDs(-1); |
---|
| 2212 | |
---|
| 2213 | |
---|
| 2214 | StringBuffer text = new StringBuffer(); |
---|
| 2215 | text.append("digraph FPTree {\n"); |
---|
| 2216 | text.append("N0 [label=\"ROOT\"]\n"); |
---|
| 2217 | tree.graphFPTree(text); |
---|
| 2218 | |
---|
| 2219 | // tree.graphHeaderTable(text, maxID+1); |
---|
| 2220 | text.append("}\n"); |
---|
| 2221 | |
---|
| 2222 | return text.toString(); |
---|
| 2223 | } |
---|
| 2224 | |
---|
| 2225 | /** |
---|
| 2226 | * Returns the revision string. |
---|
| 2227 | * |
---|
| 2228 | * @return the revision |
---|
| 2229 | */ |
---|
| 2230 | public String getRevision() { |
---|
| 2231 | return RevisionUtils.extract("$Revision: 6158 $"); |
---|
| 2232 | } |
---|
| 2233 | |
---|
| 2234 | /** |
---|
| 2235 | * Main method. |
---|
| 2236 | * |
---|
| 2237 | * @param args the commandline options |
---|
| 2238 | */ |
---|
| 2239 | public static void main(String[] args) { |
---|
| 2240 | runAssociator(new FPGrowth(), args); |
---|
| 2241 | } |
---|
| 2242 | } |
---|
| 2243 | |
---|