1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * Apriori.java |
---|
19 | * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.associations; |
---|
24 | |
---|
25 | import weka.core.AttributeStats; |
---|
26 | import weka.core.Capabilities; |
---|
27 | import weka.core.FastVector; |
---|
28 | import weka.core.Instances; |
---|
29 | import weka.core.Option; |
---|
30 | import weka.core.OptionHandler; |
---|
31 | import weka.core.RevisionUtils; |
---|
32 | import weka.core.SelectedTag; |
---|
33 | import weka.core.Tag; |
---|
34 | import weka.core.TechnicalInformation; |
---|
35 | import weka.core.TechnicalInformationHandler; |
---|
36 | import weka.core.Utils; |
---|
37 | import weka.core.Capabilities.Capability; |
---|
38 | import weka.core.TechnicalInformation.Field; |
---|
39 | import weka.core.TechnicalInformation.Type; |
---|
40 | import weka.filters.Filter; |
---|
41 | import weka.filters.unsupervised.attribute.Remove; |
---|
42 | |
---|
43 | import java.util.Enumeration; |
---|
44 | import java.util.Hashtable; |
---|
45 | |
---|
46 | /** |
---|
47 | <!-- globalinfo-start --> |
---|
48 | * Class implementing an Apriori-type algorithm. Iteratively reduces the minimum support until it finds the required number of rules with the given minimum confidence.<br/> |
---|
49 | * The algorithm has an option to mine class association rules. It is adapted as explained in the second reference.<br/> |
---|
50 | * <br/> |
---|
51 | * For more information see:<br/> |
---|
52 | * <br/> |
---|
53 | * R. Agrawal, R. Srikant: Fast Algorithms for Mining Association Rules in Large Databases. In: 20th International Conference on Very Large Data Bases, 478-499, 1994.<br/> |
---|
54 | * <br/> |
---|
55 | * Bing Liu, Wynne Hsu, Yiming Ma: Integrating Classification and Association Rule Mining. In: Fourth International Conference on Knowledge Discovery and Data Mining, 80-86, 1998. |
---|
56 | * <p/> |
---|
57 | <!-- globalinfo-end --> |
---|
58 | * |
---|
59 | <!-- technical-bibtex-start --> |
---|
60 | * BibTeX: |
---|
61 | * <pre> |
---|
62 | * @inproceedings{Agrawal1994, |
---|
63 | * author = {R. Agrawal and R. Srikant}, |
---|
64 | * booktitle = {20th International Conference on Very Large Data Bases}, |
---|
65 | * pages = {478-499}, |
---|
66 | * publisher = {Morgan Kaufmann, Los Altos, CA}, |
---|
67 | * title = {Fast Algorithms for Mining Association Rules in Large Databases}, |
---|
68 | * year = {1994} |
---|
69 | * } |
---|
70 | * |
---|
71 | * @inproceedings{Liu1998, |
---|
72 | * author = {Bing Liu and Wynne Hsu and Yiming Ma}, |
---|
73 | * booktitle = {Fourth International Conference on Knowledge Discovery and Data Mining}, |
---|
74 | * pages = {80-86}, |
---|
75 | * publisher = {AAAI Press}, |
---|
76 | * title = {Integrating Classification and Association Rule Mining}, |
---|
77 | * year = {1998} |
---|
78 | * } |
---|
79 | * </pre> |
---|
80 | * <p/> |
---|
81 | <!-- technical-bibtex-end --> |
---|
82 | * |
---|
83 | <!-- options-start --> |
---|
84 | * Valid options are: <p/> |
---|
85 | * |
---|
86 | * <pre> -N <required number of rules output> |
---|
87 | * The required number of rules. (default = 10)</pre> |
---|
88 | * |
---|
89 | * <pre> -T <0=confidence | 1=lift | 2=leverage | 3=Conviction> |
---|
90 | * The metric type by which to rank rules. (default = confidence)</pre> |
---|
91 | * |
---|
92 | * <pre> -C <minimum metric score of a rule> |
---|
93 | * The minimum confidence of a rule. (default = 0.9)</pre> |
---|
94 | * |
---|
95 | * <pre> -D <delta for minimum support> |
---|
96 | * The delta by which the minimum support is decreased in |
---|
97 | * each iteration. (default = 0.05)</pre> |
---|
98 | * |
---|
99 | * <pre> -U <upper bound for minimum support> |
---|
100 | * Upper bound for minimum support. (default = 1.0)</pre> |
---|
101 | * |
---|
102 | * <pre> -M <lower bound for minimum support> |
---|
103 | * The lower bound for the minimum support. (default = 0.1)</pre> |
---|
104 | * |
---|
105 | * <pre> -S <significance level> |
---|
106 | * If used, rules are tested for significance at |
---|
107 | * the given level. Slower. (default = no significance testing)</pre> |
---|
108 | * |
---|
109 | * <pre> -I |
---|
110 | * If set the itemsets found are also output. (default = no)</pre> |
---|
111 | * |
---|
112 | * <pre> -R |
---|
113 | * Remove columns that contain all missing values (default = no)</pre> |
---|
114 | * |
---|
115 | * <pre> -V |
---|
116 | * Report progress iteratively. (default = no)</pre> |
---|
117 | * |
---|
118 | * <pre> -A |
---|
119 | * If set class association rules are mined. (default = no)</pre> |
---|
120 | * |
---|
121 | * <pre> -c <the class index> |
---|
122 | * The class index. (default = last)</pre> |
---|
123 | * |
---|
124 | <!-- options-end --> |
---|
125 | * |
---|
126 | * @author Eibe Frank (eibe@cs.waikato.ac.nz) |
---|
127 | * @author Mark Hall (mhall@cs.waikato.ac.nz) |
---|
128 | * @author Stefan Mutter (mutter@cs.waikato.ac.nz) |
---|
129 | * @version $Revision: 5698 $ |
---|
130 | */ |
---|
131 | public class Apriori |
---|
132 | extends AbstractAssociator |
---|
133 | implements OptionHandler, CARuleMiner, TechnicalInformationHandler { |
---|
134 | |
---|
135 | /** for serialization */ |
---|
136 | static final long serialVersionUID = 3277498842319212687L; |
---|
137 | |
---|
138 | /** The minimum support. */ |
---|
139 | protected double m_minSupport; |
---|
140 | |
---|
141 | /** The upper bound on the support */ |
---|
142 | protected double m_upperBoundMinSupport; |
---|
143 | |
---|
144 | /** The lower bound for the minimum support. */ |
---|
145 | protected double m_lowerBoundMinSupport; |
---|
146 | |
---|
147 | /** Metric type: Confidence */ |
---|
148 | protected static final int CONFIDENCE = 0; |
---|
149 | /** Metric type: Lift */ |
---|
150 | protected static final int LIFT = 1; |
---|
151 | /** Metric type: Leverage */ |
---|
152 | protected static final int LEVERAGE = 2; |
---|
153 | /** Metric type: Conviction */ |
---|
154 | protected static final int CONVICTION = 3; |
---|
155 | /** Metric types. */ |
---|
156 | public static final Tag [] TAGS_SELECTION = { |
---|
157 | new Tag(CONFIDENCE, "Confidence"), |
---|
158 | new Tag(LIFT, "Lift"), |
---|
159 | new Tag(LEVERAGE, "Leverage"), |
---|
160 | new Tag(CONVICTION, "Conviction") |
---|
161 | }; |
---|
162 | |
---|
163 | /** The selected metric type. */ |
---|
164 | protected int m_metricType = CONFIDENCE; |
---|
165 | |
---|
166 | /** The minimum metric score. */ |
---|
167 | protected double m_minMetric; |
---|
168 | |
---|
169 | /** The maximum number of rules that are output. */ |
---|
170 | protected int m_numRules; |
---|
171 | |
---|
172 | /** Delta by which m_minSupport is decreased in each iteration. */ |
---|
173 | protected double m_delta; |
---|
174 | |
---|
175 | /** Significance level for optional significance test. */ |
---|
176 | protected double m_significanceLevel; |
---|
177 | |
---|
178 | /** Number of cycles used before required number of rules was one. */ |
---|
179 | protected int m_cycles; |
---|
180 | |
---|
181 | /** The set of all sets of itemsets L. */ |
---|
182 | protected FastVector m_Ls; |
---|
183 | |
---|
184 | /** The same information stored in hash tables. */ |
---|
185 | protected FastVector m_hashtables; |
---|
186 | |
---|
187 | /** The list of all generated rules. */ |
---|
188 | protected FastVector[] m_allTheRules; |
---|
189 | |
---|
190 | /** The instances (transactions) to be used for generating |
---|
191 | the association rules. */ |
---|
192 | protected Instances m_instances; |
---|
193 | |
---|
194 | /** Output itemsets found? */ |
---|
195 | protected boolean m_outputItemSets; |
---|
196 | |
---|
197 | /** Remove columns with all missing values */ |
---|
198 | protected boolean m_removeMissingCols; |
---|
199 | |
---|
200 | /** Report progress iteratively */ |
---|
201 | protected boolean m_verbose; |
---|
202 | |
---|
203 | /** Only the class attribute of all Instances.*/ |
---|
204 | protected Instances m_onlyClass; |
---|
205 | |
---|
206 | /** The class index. */ |
---|
207 | protected int m_classIndex; |
---|
208 | |
---|
209 | /** Flag indicating whether class association rules are mined. */ |
---|
210 | protected boolean m_car; |
---|
211 | |
---|
212 | /** |
---|
213 | * Treat zeros as missing (rather than a value in their |
---|
214 | * own right) |
---|
215 | */ |
---|
216 | protected boolean m_treatZeroAsMissing = false; |
---|
217 | |
---|
218 | /** |
---|
219 | * Returns a string describing this associator |
---|
220 | * @return a description of the evaluator suitable for |
---|
221 | * displaying in the explorer/experimenter gui |
---|
222 | */ |
---|
223 | public String globalInfo() { |
---|
224 | return |
---|
225 | "Class implementing an Apriori-type algorithm. Iteratively reduces " |
---|
226 | + "the minimum support until it finds the required number of rules with " |
---|
227 | + "the given minimum confidence.\n" |
---|
228 | + "The algorithm has an option to mine class association rules. It is " |
---|
229 | + "adapted as explained in the second reference.\n\n" |
---|
230 | + "For more information see:\n\n" |
---|
231 | + getTechnicalInformation().toString(); |
---|
232 | } |
---|
233 | |
---|
234 | /** |
---|
235 | * Returns an instance of a TechnicalInformation object, containing |
---|
236 | * detailed information about the technical background of this class, |
---|
237 | * e.g., paper reference or book this class is based on. |
---|
238 | * |
---|
239 | * @return the technical information about this class |
---|
240 | */ |
---|
241 | public TechnicalInformation getTechnicalInformation() { |
---|
242 | TechnicalInformation result; |
---|
243 | TechnicalInformation additional; |
---|
244 | |
---|
245 | result = new TechnicalInformation(Type.INPROCEEDINGS); |
---|
246 | result.setValue(Field.AUTHOR, "R. Agrawal and R. Srikant"); |
---|
247 | result.setValue(Field.TITLE, "Fast Algorithms for Mining Association Rules in Large Databases"); |
---|
248 | result.setValue(Field.BOOKTITLE, "20th International Conference on Very Large Data Bases"); |
---|
249 | result.setValue(Field.YEAR, "1994"); |
---|
250 | result.setValue(Field.PAGES, "478-499"); |
---|
251 | result.setValue(Field.PUBLISHER, "Morgan Kaufmann, Los Altos, CA"); |
---|
252 | |
---|
253 | additional = result.add(Type.INPROCEEDINGS); |
---|
254 | additional.setValue(Field.AUTHOR, "Bing Liu and Wynne Hsu and Yiming Ma"); |
---|
255 | additional.setValue(Field.TITLE, "Integrating Classification and Association Rule Mining"); |
---|
256 | additional.setValue(Field.BOOKTITLE, "Fourth International Conference on Knowledge Discovery and Data Mining"); |
---|
257 | additional.setValue(Field.YEAR, "1998"); |
---|
258 | additional.setValue(Field.PAGES, "80-86"); |
---|
259 | additional.setValue(Field.PUBLISHER, "AAAI Press"); |
---|
260 | |
---|
261 | return result; |
---|
262 | } |
---|
263 | |
---|
264 | /** |
---|
265 | * Constructor that allows to sets default values for the |
---|
266 | * minimum confidence and the maximum number of rules |
---|
267 | * the minimum confidence. |
---|
268 | */ |
---|
269 | public Apriori() { |
---|
270 | |
---|
271 | resetOptions(); |
---|
272 | } |
---|
273 | |
---|
274 | /** |
---|
275 | * Resets the options to the default values. |
---|
276 | */ |
---|
277 | public void resetOptions() { |
---|
278 | |
---|
279 | m_removeMissingCols = false; |
---|
280 | m_verbose = false; |
---|
281 | m_delta = 0.05; |
---|
282 | m_minMetric = 0.90; |
---|
283 | m_numRules = 10; |
---|
284 | m_lowerBoundMinSupport = 0.1; |
---|
285 | m_upperBoundMinSupport = 1.0; |
---|
286 | m_significanceLevel = -1; |
---|
287 | m_outputItemSets = false; |
---|
288 | m_car = false; |
---|
289 | m_classIndex = -1; |
---|
290 | } |
---|
291 | |
---|
292 | /** |
---|
293 | * Removes columns that are all missing from the data |
---|
294 | * @param instances the instances |
---|
295 | * @return a new set of instances with all missing columns removed |
---|
296 | * @throws Exception if something goes wrong |
---|
297 | */ |
---|
298 | protected Instances removeMissingColumns(Instances instances) |
---|
299 | throws Exception { |
---|
300 | |
---|
301 | int numInstances = instances.numInstances(); |
---|
302 | StringBuffer deleteString = new StringBuffer(); |
---|
303 | int removeCount = 0; |
---|
304 | boolean first = true; |
---|
305 | int maxCount = 0; |
---|
306 | |
---|
307 | for (int i=0;i<instances.numAttributes();i++) { |
---|
308 | AttributeStats as = instances.attributeStats(i); |
---|
309 | if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { |
---|
310 | // see if we can decrease this by looking for the most frequent value |
---|
311 | int [] counts = as.nominalCounts; |
---|
312 | if (counts[Utils.maxIndex(counts)] > maxCount) { |
---|
313 | maxCount = counts[Utils.maxIndex(counts)]; |
---|
314 | } |
---|
315 | } |
---|
316 | if (as.missingCount == numInstances) { |
---|
317 | if (first) { |
---|
318 | deleteString.append((i+1)); |
---|
319 | first = false; |
---|
320 | } else { |
---|
321 | deleteString.append(","+(i+1)); |
---|
322 | } |
---|
323 | removeCount++; |
---|
324 | } |
---|
325 | } |
---|
326 | if (m_verbose) { |
---|
327 | System.err.println("Removed : "+removeCount+" columns with all missing " |
---|
328 | +"values."); |
---|
329 | } |
---|
330 | if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { |
---|
331 | m_upperBoundMinSupport = (double)maxCount / (double)numInstances; |
---|
332 | if (m_verbose) { |
---|
333 | System.err.println("Setting upper bound min support to : " |
---|
334 | +m_upperBoundMinSupport); |
---|
335 | } |
---|
336 | } |
---|
337 | |
---|
338 | if (deleteString.toString().length() > 0) { |
---|
339 | Remove af = new Remove(); |
---|
340 | af.setAttributeIndices(deleteString.toString()); |
---|
341 | af.setInvertSelection(false); |
---|
342 | af.setInputFormat(instances); |
---|
343 | Instances newInst = Filter.useFilter(instances, af); |
---|
344 | |
---|
345 | return newInst; |
---|
346 | } |
---|
347 | return instances; |
---|
348 | } |
---|
349 | |
---|
350 | /** |
---|
351 | * Returns default capabilities of the classifier. |
---|
352 | * |
---|
353 | * @return the capabilities of this classifier |
---|
354 | */ |
---|
355 | public Capabilities getCapabilities() { |
---|
356 | Capabilities result = super.getCapabilities(); |
---|
357 | result.disableAll(); |
---|
358 | |
---|
359 | // enable what we can handle |
---|
360 | |
---|
361 | // attributes |
---|
362 | result.enable(Capability.NOMINAL_ATTRIBUTES); |
---|
363 | result.enable(Capability.MISSING_VALUES); |
---|
364 | |
---|
365 | // class (can handle a nominal class if CAR rules are selected). This |
---|
366 | result.enable(Capability.NO_CLASS); |
---|
367 | result.enable(Capability.NOMINAL_CLASS); |
---|
368 | result.enable(Capability.MISSING_CLASS_VALUES); |
---|
369 | |
---|
370 | return result; |
---|
371 | } |
---|
372 | |
---|
373 | /** |
---|
374 | * Method that generates all large itemsets with a minimum support, and from |
---|
375 | * these all association rules with a minimum confidence. |
---|
376 | * |
---|
377 | * @param instances the instances to be used for generating the associations |
---|
378 | * @throws Exception if rules can't be built successfully |
---|
379 | */ |
---|
380 | public void buildAssociations(Instances instances) throws Exception { |
---|
381 | |
---|
382 | double[] confidences, supports; |
---|
383 | int[] indices; |
---|
384 | FastVector[] sortedRuleSet; |
---|
385 | double necSupport=0; |
---|
386 | |
---|
387 | instances = new Instances(instances); |
---|
388 | |
---|
389 | if (m_removeMissingCols) { |
---|
390 | instances = removeMissingColumns(instances); |
---|
391 | } |
---|
392 | if(m_car && m_metricType != CONFIDENCE) |
---|
393 | throw new Exception("For CAR-Mining metric type has to be confidence!"); |
---|
394 | |
---|
395 | // only set class index if CAR is requested |
---|
396 | if (m_car) { |
---|
397 | if (m_classIndex == -1 ) { |
---|
398 | instances.setClassIndex(instances.numAttributes()-1); |
---|
399 | } else if (m_classIndex <= instances.numAttributes() && m_classIndex > 0) { |
---|
400 | instances.setClassIndex(m_classIndex - 1); |
---|
401 | } else { |
---|
402 | throw new Exception("Invalid class index."); |
---|
403 | } |
---|
404 | } |
---|
405 | |
---|
406 | // can associator handle the data? |
---|
407 | getCapabilities().testWithFail(instances); |
---|
408 | |
---|
409 | m_cycles = 0; |
---|
410 | |
---|
411 | // make sure that the lower bound is equal to at least one instance |
---|
412 | double lowerBoundMinSupportToUse = |
---|
413 | (m_lowerBoundMinSupport * (double)instances.numInstances() < 1.0) |
---|
414 | ? 1.0 / (double)instances.numInstances() |
---|
415 | : m_lowerBoundMinSupport; |
---|
416 | |
---|
417 | if(m_car){ |
---|
418 | //m_instances does not contain the class attribute |
---|
419 | m_instances = LabeledItemSet.divide(instances,false); |
---|
420 | |
---|
421 | //m_onlyClass contains only the class attribute |
---|
422 | m_onlyClass = LabeledItemSet.divide(instances,true); |
---|
423 | } |
---|
424 | else |
---|
425 | m_instances = instances; |
---|
426 | |
---|
427 | if(m_car && m_numRules == Integer.MAX_VALUE){ |
---|
428 | // Set desired minimum support |
---|
429 | m_minSupport = lowerBoundMinSupportToUse; |
---|
430 | } |
---|
431 | else{ |
---|
432 | // Decrease minimum support until desired number of rules found. |
---|
433 | m_minSupport = m_upperBoundMinSupport - m_delta; |
---|
434 | m_minSupport = (m_minSupport < lowerBoundMinSupportToUse) |
---|
435 | ? lowerBoundMinSupportToUse |
---|
436 | : m_minSupport; |
---|
437 | } |
---|
438 | |
---|
439 | do { |
---|
440 | |
---|
441 | // Reserve space for variables |
---|
442 | m_Ls = new FastVector(); |
---|
443 | m_hashtables = new FastVector(); |
---|
444 | m_allTheRules = new FastVector[6]; |
---|
445 | m_allTheRules[0] = new FastVector(); |
---|
446 | m_allTheRules[1] = new FastVector(); |
---|
447 | m_allTheRules[2] = new FastVector(); |
---|
448 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
449 | m_allTheRules[3] = new FastVector(); |
---|
450 | m_allTheRules[4] = new FastVector(); |
---|
451 | m_allTheRules[5] = new FastVector(); |
---|
452 | } |
---|
453 | sortedRuleSet = new FastVector[6]; |
---|
454 | sortedRuleSet[0] = new FastVector(); |
---|
455 | sortedRuleSet[1] = new FastVector(); |
---|
456 | sortedRuleSet[2] = new FastVector(); |
---|
457 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
458 | sortedRuleSet[3] = new FastVector(); |
---|
459 | sortedRuleSet[4] = new FastVector(); |
---|
460 | sortedRuleSet[5] = new FastVector(); |
---|
461 | } |
---|
462 | if(!m_car){ |
---|
463 | // Find large itemsets and rules |
---|
464 | findLargeItemSets(); |
---|
465 | if (m_significanceLevel != -1 || m_metricType != CONFIDENCE) |
---|
466 | findRulesBruteForce(); |
---|
467 | else |
---|
468 | findRulesQuickly(); |
---|
469 | } |
---|
470 | else{ |
---|
471 | findLargeCarItemSets(); |
---|
472 | findCarRulesQuickly(); |
---|
473 | } |
---|
474 | |
---|
475 | // Sort rules according to their support |
---|
476 | /*supports = new double[m_allTheRules[2].size()]; |
---|
477 | for (int i = 0; i < m_allTheRules[2].size(); i++) |
---|
478 | supports[i] = (double)((AprioriItemSet)m_allTheRules[1].elementAt(i)).support(); |
---|
479 | indices = Utils.stableSort(supports); |
---|
480 | for (int i = 0; i < m_allTheRules[2].size(); i++) { |
---|
481 | sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[i])); |
---|
482 | sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[i])); |
---|
483 | sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[i])); |
---|
484 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
485 | sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[i])); |
---|
486 | sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[i])); |
---|
487 | sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[i])); |
---|
488 | } |
---|
489 | }*/ |
---|
490 | int j = m_allTheRules[2].size()-1; |
---|
491 | supports = new double[m_allTheRules[2].size()]; |
---|
492 | for (int i = 0; i < (j+1); i++) |
---|
493 | supports[j-i] = ((double)((ItemSet)m_allTheRules[1].elementAt(j-i)).support())*(-1); |
---|
494 | indices = Utils.stableSort(supports); |
---|
495 | for (int i = 0; i < (j+1); i++) { |
---|
496 | sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[j-i])); |
---|
497 | sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[j-i])); |
---|
498 | sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[j-i])); |
---|
499 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
500 | sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[j-i])); |
---|
501 | sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[j-i])); |
---|
502 | sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[j-i])); |
---|
503 | } |
---|
504 | } |
---|
505 | |
---|
506 | // Sort rules according to their confidence |
---|
507 | m_allTheRules[0].removeAllElements(); |
---|
508 | m_allTheRules[1].removeAllElements(); |
---|
509 | m_allTheRules[2].removeAllElements(); |
---|
510 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
511 | m_allTheRules[3].removeAllElements(); |
---|
512 | m_allTheRules[4].removeAllElements(); |
---|
513 | m_allTheRules[5].removeAllElements(); |
---|
514 | } |
---|
515 | confidences = new double[sortedRuleSet[2].size()]; |
---|
516 | int sortType = 2 + m_metricType; |
---|
517 | |
---|
518 | for (int i = 0; i < sortedRuleSet[2].size(); i++) |
---|
519 | confidences[i] = |
---|
520 | ((Double)sortedRuleSet[sortType].elementAt(i)).doubleValue(); |
---|
521 | indices = Utils.stableSort(confidences); |
---|
522 | for (int i = sortedRuleSet[0].size() - 1; |
---|
523 | (i >= (sortedRuleSet[0].size() - m_numRules)) && (i >= 0); i--) { |
---|
524 | m_allTheRules[0].addElement(sortedRuleSet[0].elementAt(indices[i])); |
---|
525 | m_allTheRules[1].addElement(sortedRuleSet[1].elementAt(indices[i])); |
---|
526 | m_allTheRules[2].addElement(sortedRuleSet[2].elementAt(indices[i])); |
---|
527 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
528 | m_allTheRules[3].addElement(sortedRuleSet[3].elementAt(indices[i])); |
---|
529 | m_allTheRules[4].addElement(sortedRuleSet[4].elementAt(indices[i])); |
---|
530 | m_allTheRules[5].addElement(sortedRuleSet[5].elementAt(indices[i])); |
---|
531 | } |
---|
532 | } |
---|
533 | |
---|
534 | if (m_verbose) { |
---|
535 | if (m_Ls.size() > 1) { |
---|
536 | System.out.println(toString()); |
---|
537 | } |
---|
538 | } |
---|
539 | if(m_minSupport == lowerBoundMinSupportToUse || m_minSupport - m_delta > lowerBoundMinSupportToUse) |
---|
540 | m_minSupport -= m_delta; |
---|
541 | else |
---|
542 | m_minSupport = lowerBoundMinSupportToUse; |
---|
543 | |
---|
544 | |
---|
545 | necSupport = Math.rint(m_minSupport * (double)m_instances.numInstances()); |
---|
546 | |
---|
547 | m_cycles++; |
---|
548 | } while ((m_allTheRules[0].size() < m_numRules) && |
---|
549 | (Utils.grOrEq(m_minSupport, lowerBoundMinSupportToUse)) |
---|
550 | /* (necSupport >= lowerBoundNumInstancesSupport)*/ |
---|
551 | /* (Utils.grOrEq(m_minSupport, m_lowerBoundMinSupport)) */ && |
---|
552 | (necSupport >= 1)); |
---|
553 | m_minSupport += m_delta; |
---|
554 | } |
---|
555 | |
---|
556 | |
---|
557 | /** |
---|
558 | * Method that mines all class association rules with minimum support and |
---|
559 | * with a minimum confidence. |
---|
560 | * @return an sorted array of FastVector (confidence depended) containing the rules and metric information |
---|
561 | * @param data the instances for which class association rules should be mined |
---|
562 | * @throws Exception if rules can't be built successfully |
---|
563 | */ |
---|
564 | public FastVector[] mineCARs(Instances data) throws Exception{ |
---|
565 | |
---|
566 | m_car = true; |
---|
567 | buildAssociations(data); |
---|
568 | return m_allTheRules; |
---|
569 | } |
---|
570 | |
---|
571 | /** |
---|
572 | * Gets the instances without the class atrribute. |
---|
573 | * |
---|
574 | * @return the instances without the class attribute. |
---|
575 | */ |
---|
576 | public Instances getInstancesNoClass() { |
---|
577 | |
---|
578 | return m_instances; |
---|
579 | } |
---|
580 | |
---|
581 | |
---|
582 | /** |
---|
583 | * Gets only the class attribute of the instances. |
---|
584 | * |
---|
585 | * @return the class attribute of all instances. |
---|
586 | */ |
---|
587 | public Instances getInstancesOnlyClass() { |
---|
588 | |
---|
589 | return m_onlyClass; |
---|
590 | } |
---|
591 | |
---|
592 | |
---|
593 | /** |
---|
594 | * Returns an enumeration describing the available options. |
---|
595 | * |
---|
596 | * @return an enumeration of all the available options. |
---|
597 | */ |
---|
598 | public Enumeration listOptions() { |
---|
599 | |
---|
600 | String string1 = "\tThe required number of rules. (default = " + m_numRules + ")", |
---|
601 | string2 = |
---|
602 | "\tThe minimum confidence of a rule. (default = " + m_minMetric + ")", |
---|
603 | string3 = "\tThe delta by which the minimum support is decreased in\n", |
---|
604 | string4 = "\teach iteration. (default = " + m_delta + ")", |
---|
605 | string5 = |
---|
606 | "\tThe lower bound for the minimum support. (default = " + |
---|
607 | m_lowerBoundMinSupport + ")", |
---|
608 | string6 = "\tIf used, rules are tested for significance at\n", |
---|
609 | string7 = "\tthe given level. Slower. (default = no significance testing)", |
---|
610 | string8 = "\tIf set the itemsets found are also output. (default = no)", |
---|
611 | string9 = "\tIf set class association rules are mined. (default = no)", |
---|
612 | string10 = "\tThe class index. (default = last)", |
---|
613 | stringType = "\tThe metric type by which to rank rules. (default = " |
---|
614 | +"confidence)", |
---|
615 | stringZeroAsMissing = "\tTreat zero (i.e. first value of nominal attributes) as " + |
---|
616 | "missing"; |
---|
617 | |
---|
618 | |
---|
619 | FastVector newVector = new FastVector(11); |
---|
620 | |
---|
621 | newVector.addElement(new Option(string1, "N", 1, |
---|
622 | "-N <required number of rules output>")); |
---|
623 | newVector.addElement(new Option(stringType, "T", 1, |
---|
624 | "-T <0=confidence | 1=lift | " |
---|
625 | +"2=leverage | 3=Conviction>")); |
---|
626 | newVector.addElement(new Option(string2, "C", 1, |
---|
627 | "-C <minimum metric score of a rule>")); |
---|
628 | newVector.addElement(new Option(string3 + string4, "D", 1, |
---|
629 | "-D <delta for minimum support>")); |
---|
630 | newVector.addElement(new Option("\tUpper bound for minimum support. " |
---|
631 | +"(default = 1.0)", "U", 1, |
---|
632 | "-U <upper bound for minimum support>")); |
---|
633 | newVector.addElement(new Option(string5, "M", 1, |
---|
634 | "-M <lower bound for minimum support>")); |
---|
635 | newVector.addElement(new Option(string6 + string7, "S", 1, |
---|
636 | "-S <significance level>")); |
---|
637 | newVector.addElement(new Option(string8, "I", 0, |
---|
638 | "-I")); |
---|
639 | newVector.addElement(new Option("\tRemove columns that contain " |
---|
640 | +"all missing values (default = no)" |
---|
641 | , "R", 0, |
---|
642 | "-R")); |
---|
643 | newVector.addElement(new Option("\tReport progress iteratively. (default " |
---|
644 | +"= no)", "V", 0, |
---|
645 | "-V")); |
---|
646 | newVector.addElement(new Option(string9, "A", 0, |
---|
647 | "-A")); |
---|
648 | newVector.addElement(new Option(stringZeroAsMissing, "Z", 0, |
---|
649 | "-Z")); |
---|
650 | newVector.addElement(new Option(string10, "c", 1, |
---|
651 | "-c <the class index>")); |
---|
652 | |
---|
653 | return newVector.elements(); |
---|
654 | } |
---|
655 | |
---|
656 | /** |
---|
657 | * Parses a given list of options. <p/> |
---|
658 | * |
---|
659 | <!-- options-start --> |
---|
660 | * Valid options are: <p/> |
---|
661 | * |
---|
662 | * <pre> -N <required number of rules output> |
---|
663 | * The required number of rules. (default = 10)</pre> |
---|
664 | * |
---|
665 | * <pre> -T <0=confidence | 1=lift | 2=leverage | 3=Conviction> |
---|
666 | * The metric type by which to rank rules. (default = confidence)</pre> |
---|
667 | * |
---|
668 | * <pre> -C <minimum metric score of a rule> |
---|
669 | * The minimum confidence of a rule. (default = 0.9)</pre> |
---|
670 | * |
---|
671 | * <pre> -D <delta for minimum support> |
---|
672 | * The delta by which the minimum support is decreased in |
---|
673 | * each iteration. (default = 0.05)</pre> |
---|
674 | * |
---|
675 | * <pre> -U <upper bound for minimum support> |
---|
676 | * Upper bound for minimum support. (default = 1.0)</pre> |
---|
677 | * |
---|
678 | * <pre> -M <lower bound for minimum support> |
---|
679 | * The lower bound for the minimum support. (default = 0.1)</pre> |
---|
680 | * |
---|
681 | * <pre> -S <significance level> |
---|
682 | * If used, rules are tested for significance at |
---|
683 | * the given level. Slower. (default = no significance testing)</pre> |
---|
684 | * |
---|
685 | * <pre> -I |
---|
686 | * If set the itemsets found are also output. (default = no)</pre> |
---|
687 | * |
---|
688 | * <pre> -R |
---|
689 | * Remove columns that contain all missing values (default = no)</pre> |
---|
690 | * |
---|
691 | * <pre> -V |
---|
692 | * Report progress iteratively. (default = no)</pre> |
---|
693 | * |
---|
694 | * <pre> -A |
---|
695 | * If set class association rules are mined. (default = no)</pre> |
---|
696 | * |
---|
697 | * <pre> -c <the class index> |
---|
698 | * The class index. (default = last)</pre> |
---|
699 | * |
---|
700 | <!-- options-end --> |
---|
701 | * |
---|
702 | * @param options the list of options as an array of strings |
---|
703 | * @throws Exception if an option is not supported |
---|
704 | */ |
---|
705 | public void setOptions(String[] options) throws Exception { |
---|
706 | |
---|
707 | resetOptions(); |
---|
708 | String numRulesString = Utils.getOption('N', options), |
---|
709 | minConfidenceString = Utils.getOption('C', options), |
---|
710 | deltaString = Utils.getOption('D', options), |
---|
711 | maxSupportString = Utils.getOption('U', options), |
---|
712 | minSupportString = Utils.getOption('M', options), |
---|
713 | significanceLevelString = Utils.getOption('S', options), |
---|
714 | classIndexString = Utils.getOption('c',options); |
---|
715 | |
---|
716 | String metricTypeString = Utils.getOption('T', options); |
---|
717 | if (metricTypeString.length() != 0) { |
---|
718 | setMetricType(new SelectedTag(Integer.parseInt(metricTypeString), |
---|
719 | TAGS_SELECTION)); |
---|
720 | } |
---|
721 | |
---|
722 | if (numRulesString.length() != 0) { |
---|
723 | m_numRules = Integer.parseInt(numRulesString); |
---|
724 | } |
---|
725 | if (classIndexString.length() != 0) { |
---|
726 | if (classIndexString.equalsIgnoreCase("last")) { |
---|
727 | m_classIndex = -1; |
---|
728 | } else if (classIndexString.equalsIgnoreCase("first")) { |
---|
729 | m_classIndex = 0; |
---|
730 | } else { |
---|
731 | m_classIndex = Integer.parseInt(classIndexString); |
---|
732 | } |
---|
733 | } |
---|
734 | if (minConfidenceString.length() != 0) { |
---|
735 | m_minMetric = (new Double(minConfidenceString)).doubleValue(); |
---|
736 | } |
---|
737 | if (deltaString.length() != 0) { |
---|
738 | m_delta = (new Double(deltaString)).doubleValue(); |
---|
739 | } |
---|
740 | if (maxSupportString.length() != 0) { |
---|
741 | setUpperBoundMinSupport((new Double(maxSupportString)).doubleValue()); |
---|
742 | } |
---|
743 | if (minSupportString.length() != 0) { |
---|
744 | m_lowerBoundMinSupport = (new Double(minSupportString)).doubleValue(); |
---|
745 | } |
---|
746 | if (significanceLevelString.length() != 0) { |
---|
747 | m_significanceLevel = (new Double(significanceLevelString)).doubleValue(); |
---|
748 | } |
---|
749 | m_outputItemSets = Utils.getFlag('I', options); |
---|
750 | m_car = Utils.getFlag('A', options); |
---|
751 | m_verbose = Utils.getFlag('V', options); |
---|
752 | m_treatZeroAsMissing = Utils.getFlag('Z', options); |
---|
753 | |
---|
754 | setRemoveAllMissingCols(Utils.getFlag('R', options)); |
---|
755 | } |
---|
756 | |
---|
757 | /** |
---|
758 | * Gets the current settings of the Apriori object. |
---|
759 | * |
---|
760 | * @return an array of strings suitable for passing to setOptions |
---|
761 | */ |
---|
762 | public String [] getOptions() { |
---|
763 | |
---|
764 | String [] options = new String [21]; |
---|
765 | int current = 0; |
---|
766 | |
---|
767 | if (m_outputItemSets) { |
---|
768 | options[current++] = "-I"; |
---|
769 | } |
---|
770 | |
---|
771 | if (getRemoveAllMissingCols()) { |
---|
772 | options[current++] = "-R"; |
---|
773 | } |
---|
774 | |
---|
775 | options[current++] = "-N"; options[current++] = "" + m_numRules; |
---|
776 | options[current++] = "-T"; options[current++] = "" + m_metricType; |
---|
777 | options[current++] = "-C"; options[current++] = "" + m_minMetric; |
---|
778 | options[current++] = "-D"; options[current++] = "" + m_delta; |
---|
779 | options[current++] = "-U"; options[current++] = "" + m_upperBoundMinSupport; |
---|
780 | options[current++] = "-M"; options[current++] = "" + m_lowerBoundMinSupport; |
---|
781 | options[current++] = "-S"; options[current++] = "" + m_significanceLevel; |
---|
782 | if (m_car) |
---|
783 | options[current++] = "-A"; |
---|
784 | if (m_verbose) |
---|
785 | options[current++] = "-V"; |
---|
786 | |
---|
787 | if (m_treatZeroAsMissing) { |
---|
788 | options[current++] = "-Z"; |
---|
789 | } |
---|
790 | options[current++] = "-c"; options[current++] = "" + m_classIndex; |
---|
791 | |
---|
792 | while (current < options.length) { |
---|
793 | options[current++] = ""; |
---|
794 | } |
---|
795 | return options; |
---|
796 | } |
---|
797 | |
---|
798 | /** |
---|
799 | * Outputs the size of all the generated sets of itemsets and the rules. |
---|
800 | * |
---|
801 | * @return a string representation of the model |
---|
802 | */ |
---|
803 | public String toString() { |
---|
804 | |
---|
805 | StringBuffer text = new StringBuffer(); |
---|
806 | |
---|
807 | if (m_Ls.size() <= 1) |
---|
808 | return "\nNo large itemsets and rules found!\n"; |
---|
809 | text.append("\nApriori\n=======\n\n"); |
---|
810 | text.append("Minimum support: " |
---|
811 | + Utils.doubleToString(m_minSupport,2) |
---|
812 | + " (" + ((int)(m_minSupport * (double)m_instances.numInstances()+0.5)) |
---|
813 | + " instances)" |
---|
814 | + '\n'); |
---|
815 | text.append("Minimum metric <"); |
---|
816 | switch(m_metricType) { |
---|
817 | case CONFIDENCE: |
---|
818 | text.append("confidence>: "); |
---|
819 | break; |
---|
820 | case LIFT: |
---|
821 | text.append("lift>: "); |
---|
822 | break; |
---|
823 | case LEVERAGE: |
---|
824 | text.append("leverage>: "); |
---|
825 | break; |
---|
826 | case CONVICTION: |
---|
827 | text.append("conviction>: "); |
---|
828 | break; |
---|
829 | } |
---|
830 | text.append(Utils.doubleToString(m_minMetric,2)+'\n'); |
---|
831 | |
---|
832 | if (m_significanceLevel != -1) |
---|
833 | text.append("Significance level: "+ |
---|
834 | Utils.doubleToString(m_significanceLevel,2)+'\n'); |
---|
835 | text.append("Number of cycles performed: " + m_cycles+'\n'); |
---|
836 | text.append("\nGenerated sets of large itemsets:\n"); |
---|
837 | if(!m_car){ |
---|
838 | for (int i = 0; i < m_Ls.size(); i++) { |
---|
839 | text.append("\nSize of set of large itemsets L("+(i+1)+"): "+ |
---|
840 | ((FastVector)m_Ls.elementAt(i)).size()+'\n'); |
---|
841 | if (m_outputItemSets) { |
---|
842 | text.append("\nLarge Itemsets L("+(i+1)+"):\n"); |
---|
843 | for (int j = 0; j < ((FastVector)m_Ls.elementAt(i)).size(); j++) |
---|
844 | text.append(((AprioriItemSet)((FastVector)m_Ls.elementAt(i)).elementAt(j)). |
---|
845 | toString(m_instances)+"\n"); |
---|
846 | } |
---|
847 | } |
---|
848 | text.append("\nBest rules found:\n\n"); |
---|
849 | for (int i = 0; i < m_allTheRules[0].size(); i++) { |
---|
850 | text.append(Utils.doubleToString((double)i+1, |
---|
851 | (int)(Math.log(m_numRules)/Math.log(10)+1),0)+ |
---|
852 | ". " + ((AprioriItemSet)m_allTheRules[0].elementAt(i)). |
---|
853 | toString(m_instances) |
---|
854 | + " ==> " + ((AprioriItemSet)m_allTheRules[1].elementAt(i)). |
---|
855 | toString(m_instances) +" conf:("+ |
---|
856 | Utils.doubleToString(((Double)m_allTheRules[2]. |
---|
857 | elementAt(i)).doubleValue(),2)+")"); |
---|
858 | if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { |
---|
859 | text.append((m_metricType == LIFT ? " <" : "")+" lift:("+ |
---|
860 | Utils.doubleToString(((Double)m_allTheRules[3]. |
---|
861 | elementAt(i)).doubleValue(),2) |
---|
862 | +")"+(m_metricType == LIFT ? ">" : "")); |
---|
863 | text.append((m_metricType == LEVERAGE ? " <" : "")+" lev:("+ |
---|
864 | Utils.doubleToString(((Double)m_allTheRules[4]. |
---|
865 | elementAt(i)).doubleValue(),2) |
---|
866 | +")"); |
---|
867 | text.append(" ["+ |
---|
868 | (int)(((Double)m_allTheRules[4].elementAt(i)) |
---|
869 | .doubleValue() * (double)m_instances.numInstances()) |
---|
870 | +"]"+(m_metricType == LEVERAGE ? ">" : "")); |
---|
871 | text.append((m_metricType == CONVICTION ? " <" : "")+" conv:("+ |
---|
872 | Utils.doubleToString(((Double)m_allTheRules[5]. |
---|
873 | elementAt(i)).doubleValue(),2) |
---|
874 | +")"+(m_metricType == CONVICTION ? ">" : "")); |
---|
875 | } |
---|
876 | text.append('\n'); |
---|
877 | } |
---|
878 | } |
---|
879 | else{ |
---|
880 | for (int i = 0; i < m_Ls.size(); i++) { |
---|
881 | text.append("\nSize of set of large itemsets L("+(i+1)+"): "+ |
---|
882 | ((FastVector)m_Ls.elementAt(i)).size()+'\n'); |
---|
883 | if (m_outputItemSets) { |
---|
884 | text.append("\nLarge Itemsets L("+(i+1)+"):\n"); |
---|
885 | for (int j = 0; j < ((FastVector)m_Ls.elementAt(i)).size(); j++){ |
---|
886 | text.append(((ItemSet)((FastVector)m_Ls.elementAt(i)).elementAt(j)). |
---|
887 | toString(m_instances)+"\n"); |
---|
888 | text.append(((LabeledItemSet)((FastVector)m_Ls.elementAt(i)).elementAt(j)).m_classLabel+" "); |
---|
889 | text.append(((LabeledItemSet)((FastVector)m_Ls.elementAt(i)).elementAt(j)).support()+"\n"); |
---|
890 | } |
---|
891 | } |
---|
892 | } |
---|
893 | text.append("\nBest rules found:\n\n"); |
---|
894 | for (int i = 0; i < m_allTheRules[0].size(); i++) { |
---|
895 | text.append(Utils.doubleToString((double)i+1, |
---|
896 | (int)(Math.log(m_numRules)/Math.log(10)+1),0)+ |
---|
897 | ". " + ((ItemSet)m_allTheRules[0].elementAt(i)). |
---|
898 | toString(m_instances) |
---|
899 | + " ==> " + ((ItemSet)m_allTheRules[1].elementAt(i)). |
---|
900 | toString(m_onlyClass) +" conf:("+ |
---|
901 | Utils.doubleToString(((Double)m_allTheRules[2]. |
---|
902 | elementAt(i)).doubleValue(),2)+")"); |
---|
903 | |
---|
904 | text.append('\n'); |
---|
905 | } |
---|
906 | } |
---|
907 | return text.toString(); |
---|
908 | } |
---|
909 | |
---|
910 | /** |
---|
911 | * Returns the metric string for the chosen metric type |
---|
912 | * @return a string describing the used metric for the interestingness of a class association rule |
---|
913 | */ |
---|
914 | public String metricString() { |
---|
915 | |
---|
916 | switch(m_metricType) { |
---|
917 | case LIFT: |
---|
918 | return "lif"; |
---|
919 | case LEVERAGE: |
---|
920 | return "leverage"; |
---|
921 | case CONVICTION: |
---|
922 | return "conviction"; |
---|
923 | default: |
---|
924 | return "conf"; |
---|
925 | } |
---|
926 | } |
---|
927 | |
---|
928 | /** |
---|
929 | * Returns the tip text for this property |
---|
930 | * @return tip text for this property suitable for |
---|
931 | * displaying in the explorer/experimenter gui |
---|
932 | */ |
---|
933 | public String removeAllMissingColsTipText() { |
---|
934 | return "Remove columns with all missing values."; |
---|
935 | } |
---|
936 | |
---|
937 | /** |
---|
938 | * Remove columns containing all missing values. |
---|
939 | * @param r true if cols are to be removed. |
---|
940 | */ |
---|
941 | public void setRemoveAllMissingCols(boolean r) { |
---|
942 | m_removeMissingCols = r; |
---|
943 | } |
---|
944 | |
---|
945 | /** |
---|
946 | * Returns whether columns containing all missing values are to be removed |
---|
947 | * @return true if columns are to be removed. |
---|
948 | */ |
---|
949 | public boolean getRemoveAllMissingCols() { |
---|
950 | return m_removeMissingCols; |
---|
951 | } |
---|
952 | |
---|
953 | /** |
---|
954 | * Returns the tip text for this property |
---|
955 | * @return tip text for this property suitable for |
---|
956 | * displaying in the explorer/experimenter gui |
---|
957 | */ |
---|
958 | public String upperBoundMinSupportTipText() { |
---|
959 | return "Upper bound for minimum support. Start iteratively decreasing " |
---|
960 | +"minimum support from this value."; |
---|
961 | } |
---|
962 | |
---|
963 | /** |
---|
964 | * Get the value of upperBoundMinSupport. |
---|
965 | * |
---|
966 | * @return Value of upperBoundMinSupport. |
---|
967 | */ |
---|
968 | public double getUpperBoundMinSupport() { |
---|
969 | |
---|
970 | return m_upperBoundMinSupport; |
---|
971 | } |
---|
972 | |
---|
973 | /** |
---|
974 | * Set the value of upperBoundMinSupport. |
---|
975 | * |
---|
976 | * @param v Value to assign to upperBoundMinSupport. |
---|
977 | */ |
---|
978 | public void setUpperBoundMinSupport(double v) { |
---|
979 | |
---|
980 | m_upperBoundMinSupport = v; |
---|
981 | } |
---|
982 | |
---|
983 | /** |
---|
984 | * Sets the class index |
---|
985 | * @param index the class index |
---|
986 | */ |
---|
987 | public void setClassIndex(int index){ |
---|
988 | |
---|
989 | m_classIndex = index; |
---|
990 | } |
---|
991 | |
---|
992 | /** |
---|
993 | * Gets the class index |
---|
994 | * @return the index of the class attribute |
---|
995 | */ |
---|
996 | public int getClassIndex(){ |
---|
997 | |
---|
998 | return m_classIndex; |
---|
999 | } |
---|
1000 | |
---|
1001 | /** |
---|
1002 | * Returns the tip text for this property |
---|
1003 | * @return tip text for this property suitable for |
---|
1004 | * displaying in the explorer/experimenter gui |
---|
1005 | */ |
---|
1006 | public String classIndexTipText() { |
---|
1007 | return "Index of the class attribute. If set to -1, the last attribute is taken as class attribute."; |
---|
1008 | |
---|
1009 | } |
---|
1010 | |
---|
1011 | /** |
---|
1012 | * Sets class association rule mining |
---|
1013 | * @param flag if class association rules are mined, false otherwise |
---|
1014 | */ |
---|
1015 | public void setCar(boolean flag){ |
---|
1016 | m_car = flag; |
---|
1017 | } |
---|
1018 | |
---|
1019 | /** |
---|
1020 | * Gets whether class association ruels are mined |
---|
1021 | * @return true if class association rules are mined, false otherwise |
---|
1022 | */ |
---|
1023 | public boolean getCar(){ |
---|
1024 | return m_car; |
---|
1025 | } |
---|
1026 | |
---|
1027 | /** |
---|
1028 | * Returns the tip text for this property |
---|
1029 | * @return tip text for this property suitable for |
---|
1030 | * displaying in the explorer/experimenter gui |
---|
1031 | */ |
---|
1032 | public String carTipText() { |
---|
1033 | return "If enabled class association rules are mined instead of (general) association rules."; |
---|
1034 | } |
---|
1035 | |
---|
1036 | /** |
---|
1037 | * Returns the tip text for this property |
---|
1038 | * @return tip text for this property suitable for |
---|
1039 | * displaying in the explorer/experimenter gui |
---|
1040 | */ |
---|
1041 | public String lowerBoundMinSupportTipText() { |
---|
1042 | return "Lower bound for minimum support."; |
---|
1043 | } |
---|
1044 | |
---|
1045 | /** |
---|
1046 | * Get the value of lowerBoundMinSupport. |
---|
1047 | * |
---|
1048 | * @return Value of lowerBoundMinSupport. |
---|
1049 | */ |
---|
1050 | public double getLowerBoundMinSupport() { |
---|
1051 | |
---|
1052 | return m_lowerBoundMinSupport; |
---|
1053 | } |
---|
1054 | |
---|
1055 | /** |
---|
1056 | * Set the value of lowerBoundMinSupport. |
---|
1057 | * |
---|
1058 | * @param v Value to assign to lowerBoundMinSupport. |
---|
1059 | */ |
---|
1060 | public void setLowerBoundMinSupport(double v) { |
---|
1061 | |
---|
1062 | m_lowerBoundMinSupport = v; |
---|
1063 | } |
---|
1064 | |
---|
1065 | /** |
---|
1066 | * Get the metric type |
---|
1067 | * |
---|
1068 | * @return the type of metric to use for ranking rules |
---|
1069 | */ |
---|
1070 | public SelectedTag getMetricType() { |
---|
1071 | return new SelectedTag(m_metricType, TAGS_SELECTION); |
---|
1072 | } |
---|
1073 | |
---|
1074 | /** |
---|
1075 | * Returns the tip text for this property |
---|
1076 | * @return tip text for this property suitable for |
---|
1077 | * displaying in the explorer/experimenter gui |
---|
1078 | */ |
---|
1079 | public String metricTypeTipText() { |
---|
1080 | return "Set the type of metric by which to rank rules. Confidence is " |
---|
1081 | +"the proportion of the examples covered by the premise that are also " |
---|
1082 | +"covered by the consequence(Class association rules can only be mined using confidence). Lift is confidence divided by the " |
---|
1083 | +"proportion of all examples that are covered by the consequence. This " |
---|
1084 | +"is a measure of the importance of the association that is independent " |
---|
1085 | +"of support. Leverage is the proportion of additional examples covered " |
---|
1086 | +"by both the premise and consequence above those expected if the " |
---|
1087 | +"premise and consequence were independent of each other. The total " |
---|
1088 | +"number of examples that this represents is presented in brackets " |
---|
1089 | +"following the leverage. Conviction is " |
---|
1090 | +"another measure of departure from independence. Conviction is given " |
---|
1091 | +"by "; |
---|
1092 | } |
---|
1093 | |
---|
1094 | /** |
---|
1095 | * Set the metric type for ranking rules |
---|
1096 | * |
---|
1097 | * @param d the type of metric |
---|
1098 | */ |
---|
1099 | public void setMetricType (SelectedTag d) { |
---|
1100 | |
---|
1101 | if (d.getTags() == TAGS_SELECTION) { |
---|
1102 | m_metricType = d.getSelectedTag().getID(); |
---|
1103 | } |
---|
1104 | |
---|
1105 | if (m_significanceLevel != -1 && m_metricType != CONFIDENCE) { |
---|
1106 | m_metricType = CONFIDENCE; |
---|
1107 | } |
---|
1108 | |
---|
1109 | if (m_metricType == CONFIDENCE) { |
---|
1110 | setMinMetric(0.9); |
---|
1111 | } |
---|
1112 | |
---|
1113 | if (m_metricType == LIFT || m_metricType == CONVICTION) { |
---|
1114 | setMinMetric(1.1); |
---|
1115 | } |
---|
1116 | |
---|
1117 | if (m_metricType == LEVERAGE) { |
---|
1118 | setMinMetric(0.1); |
---|
1119 | } |
---|
1120 | } |
---|
1121 | |
---|
1122 | /** |
---|
1123 | * Returns the tip text for this property |
---|
1124 | * @return tip text for this property suitable for |
---|
1125 | * displaying in the explorer/experimenter gui |
---|
1126 | */ |
---|
1127 | public String minMetricTipText() { |
---|
1128 | return "Minimum metric score. Consider only rules with scores higher than " |
---|
1129 | +"this value."; |
---|
1130 | } |
---|
1131 | |
---|
1132 | /** |
---|
1133 | * Get the value of minConfidence. |
---|
1134 | * |
---|
1135 | * @return Value of minConfidence. |
---|
1136 | */ |
---|
1137 | public double getMinMetric() { |
---|
1138 | |
---|
1139 | return m_minMetric; |
---|
1140 | } |
---|
1141 | |
---|
1142 | /** |
---|
1143 | * Set the value of minConfidence. |
---|
1144 | * |
---|
1145 | * @param v Value to assign to minConfidence. |
---|
1146 | */ |
---|
1147 | public void setMinMetric(double v) { |
---|
1148 | |
---|
1149 | m_minMetric = v; |
---|
1150 | } |
---|
1151 | |
---|
1152 | /** |
---|
1153 | * Returns the tip text for this property |
---|
1154 | * @return tip text for this property suitable for |
---|
1155 | * displaying in the explorer/experimenter gui |
---|
1156 | */ |
---|
1157 | public String numRulesTipText() { |
---|
1158 | return "Number of rules to find."; |
---|
1159 | } |
---|
1160 | |
---|
1161 | /** |
---|
1162 | * Get the value of numRules. |
---|
1163 | * |
---|
1164 | * @return Value of numRules. |
---|
1165 | */ |
---|
1166 | public int getNumRules() { |
---|
1167 | |
---|
1168 | return m_numRules; |
---|
1169 | } |
---|
1170 | |
---|
1171 | /** |
---|
1172 | * Set the value of numRules. |
---|
1173 | * |
---|
1174 | * @param v Value to assign to numRules. |
---|
1175 | */ |
---|
1176 | public void setNumRules(int v) { |
---|
1177 | |
---|
1178 | m_numRules = v; |
---|
1179 | } |
---|
1180 | |
---|
1181 | /** |
---|
1182 | * Returns the tip text for this property |
---|
1183 | * @return tip text for this property suitable for |
---|
1184 | * displaying in the explorer/experimenter gui |
---|
1185 | */ |
---|
1186 | public String deltaTipText() { |
---|
1187 | return "Iteratively decrease support by this factor. Reduces support " |
---|
1188 | +"until min support is reached or required number of rules has been " |
---|
1189 | +"generated."; |
---|
1190 | } |
---|
1191 | |
---|
1192 | /** |
---|
1193 | * Get the value of delta. |
---|
1194 | * |
---|
1195 | * @return Value of delta. |
---|
1196 | */ |
---|
1197 | public double getDelta() { |
---|
1198 | |
---|
1199 | return m_delta; |
---|
1200 | } |
---|
1201 | |
---|
1202 | /** |
---|
1203 | * Set the value of delta. |
---|
1204 | * |
---|
1205 | * @param v Value to assign to delta. |
---|
1206 | */ |
---|
1207 | public void setDelta(double v) { |
---|
1208 | |
---|
1209 | m_delta = v; |
---|
1210 | } |
---|
1211 | |
---|
1212 | /** |
---|
1213 | * Returns the tip text for this property |
---|
1214 | * @return tip text for this property suitable for |
---|
1215 | * displaying in the explorer/experimenter gui |
---|
1216 | */ |
---|
1217 | public String significanceLevelTipText() { |
---|
1218 | return "Significance level. Significance test (confidence metric only)."; |
---|
1219 | } |
---|
1220 | |
---|
1221 | /** |
---|
1222 | * Get the value of significanceLevel. |
---|
1223 | * |
---|
1224 | * @return Value of significanceLevel. |
---|
1225 | */ |
---|
1226 | public double getSignificanceLevel() { |
---|
1227 | |
---|
1228 | return m_significanceLevel; |
---|
1229 | } |
---|
1230 | |
---|
1231 | /** |
---|
1232 | * Set the value of significanceLevel. |
---|
1233 | * |
---|
1234 | * @param v Value to assign to significanceLevel. |
---|
1235 | */ |
---|
1236 | public void setSignificanceLevel(double v) { |
---|
1237 | |
---|
1238 | m_significanceLevel = v; |
---|
1239 | } |
---|
1240 | |
---|
1241 | /** |
---|
1242 | * Sets whether itemsets are output as well |
---|
1243 | * @param flag true if itemsets are to be output as well |
---|
1244 | */ |
---|
1245 | public void setOutputItemSets(boolean flag){ |
---|
1246 | m_outputItemSets = flag; |
---|
1247 | } |
---|
1248 | |
---|
1249 | /** |
---|
1250 | * Gets whether itemsets are output as well |
---|
1251 | * @return true if itemsets are output as well |
---|
1252 | */ |
---|
1253 | public boolean getOutputItemSets(){ |
---|
1254 | return m_outputItemSets; |
---|
1255 | } |
---|
1256 | |
---|
1257 | /** |
---|
1258 | * Returns the tip text for this property |
---|
1259 | * @return tip text for this property suitable for |
---|
1260 | * displaying in the explorer/experimenter gui |
---|
1261 | */ |
---|
1262 | public String outputItemSetsTipText() { |
---|
1263 | return "If enabled the itemsets are output as well."; |
---|
1264 | } |
---|
1265 | |
---|
1266 | /** |
---|
1267 | * Sets verbose mode |
---|
1268 | * @param flag true if algorithm should be run in verbose mode |
---|
1269 | */ |
---|
1270 | public void setVerbose(boolean flag){ |
---|
1271 | m_verbose = flag; |
---|
1272 | } |
---|
1273 | |
---|
1274 | /** |
---|
1275 | * Gets whether algorithm is run in verbose mode |
---|
1276 | * @return true if algorithm is run in verbose mode |
---|
1277 | */ |
---|
1278 | public boolean getVerbose(){ |
---|
1279 | return m_verbose; |
---|
1280 | } |
---|
1281 | |
---|
1282 | /** |
---|
1283 | * Returns the tip text for this property |
---|
1284 | * @return tip text for this property suitable for |
---|
1285 | * displaying in the explorer/experimenter gui |
---|
1286 | */ |
---|
1287 | public String verboseTipText() { |
---|
1288 | return "If enabled the algorithm will be run in verbose mode."; |
---|
1289 | } |
---|
1290 | |
---|
1291 | /** |
---|
1292 | * Returns the tip text for this property |
---|
1293 | * @return tip text for this property suitable for |
---|
1294 | * displaying in the explorer/experimenter gui |
---|
1295 | */ |
---|
1296 | public String treatZeroAsMissingTipText() { |
---|
1297 | return "If enabled, zero (that is, the first value of a nominal) is " |
---|
1298 | + "treated in the same way as a missing value."; |
---|
1299 | } |
---|
1300 | |
---|
1301 | /** |
---|
1302 | * Sets whether zeros (i.e. the first value of a nominal attribute) |
---|
1303 | * should be treated as missing values. |
---|
1304 | * |
---|
1305 | * @param z true if zeros should be treated as missing values. |
---|
1306 | */ |
---|
1307 | public void setTreatZeroAsMissing(boolean z) { |
---|
1308 | m_treatZeroAsMissing = z; |
---|
1309 | } |
---|
1310 | |
---|
1311 | /** |
---|
1312 | * Gets whether zeros (i.e. the first value of a nominal attribute) |
---|
1313 | * is to be treated int he same way as missing values. |
---|
1314 | * |
---|
1315 | * @return true if zeros are to be treated like missing values. |
---|
1316 | */ |
---|
1317 | public boolean getTreatZeroAsMissing() { |
---|
1318 | return m_treatZeroAsMissing; |
---|
1319 | } |
---|
1320 | |
---|
1321 | /** |
---|
1322 | * Method that finds all large itemsets for the given set of instances. |
---|
1323 | * |
---|
1324 | * @throws Exception if an attribute is numeric |
---|
1325 | */ |
---|
1326 | private void findLargeItemSets() throws Exception { |
---|
1327 | |
---|
1328 | FastVector kMinusOneSets, kSets; |
---|
1329 | Hashtable hashtable; |
---|
1330 | int necSupport, necMaxSupport,i = 0; |
---|
1331 | |
---|
1332 | |
---|
1333 | |
---|
1334 | // Find large itemsets |
---|
1335 | |
---|
1336 | // minimum support |
---|
1337 | necSupport = (int)(m_minSupport * (double)m_instances.numInstances()+0.5); |
---|
1338 | necMaxSupport = (int)(m_upperBoundMinSupport * (double)m_instances.numInstances()+0.5); |
---|
1339 | |
---|
1340 | kSets = AprioriItemSet.singletons(m_instances, m_treatZeroAsMissing); |
---|
1341 | AprioriItemSet.upDateCounters(kSets,m_instances); |
---|
1342 | kSets = AprioriItemSet.deleteItemSets(kSets, necSupport, necMaxSupport); |
---|
1343 | if (kSets.size() == 0) |
---|
1344 | return; |
---|
1345 | do { |
---|
1346 | m_Ls.addElement(kSets); |
---|
1347 | kMinusOneSets = kSets; |
---|
1348 | kSets = AprioriItemSet.mergeAllItemSets(kMinusOneSets, i, m_instances.numInstances()); |
---|
1349 | hashtable = AprioriItemSet.getHashtable(kMinusOneSets, kMinusOneSets.size()); |
---|
1350 | m_hashtables.addElement(hashtable); |
---|
1351 | kSets = AprioriItemSet.pruneItemSets(kSets, hashtable); |
---|
1352 | AprioriItemSet.upDateCounters(kSets, m_instances); |
---|
1353 | kSets = AprioriItemSet.deleteItemSets(kSets, necSupport, necMaxSupport); |
---|
1354 | i++; |
---|
1355 | } while (kSets.size() > 0); |
---|
1356 | } |
---|
1357 | |
---|
1358 | /** |
---|
1359 | * Method that finds all association rules and performs significance test. |
---|
1360 | * |
---|
1361 | * @throws Exception if an attribute is numeric |
---|
1362 | */ |
---|
1363 | private void findRulesBruteForce() throws Exception { |
---|
1364 | |
---|
1365 | FastVector[] rules; |
---|
1366 | |
---|
1367 | // Build rules |
---|
1368 | for (int j = 1; j < m_Ls.size(); j++) { |
---|
1369 | FastVector currentItemSets = (FastVector)m_Ls.elementAt(j); |
---|
1370 | Enumeration enumItemSets = currentItemSets.elements(); |
---|
1371 | while (enumItemSets.hasMoreElements()) { |
---|
1372 | AprioriItemSet currentItemSet = (AprioriItemSet)enumItemSets.nextElement(); |
---|
1373 | //AprioriItemSet currentItemSet = new AprioriItemSet((ItemSet)enumItemSets.nextElement()); |
---|
1374 | rules=currentItemSet.generateRulesBruteForce(m_minMetric,m_metricType, |
---|
1375 | m_hashtables,j+1, |
---|
1376 | m_instances.numInstances(), |
---|
1377 | m_significanceLevel); |
---|
1378 | for (int k = 0; k < rules[0].size(); k++) { |
---|
1379 | m_allTheRules[0].addElement(rules[0].elementAt(k)); |
---|
1380 | m_allTheRules[1].addElement(rules[1].elementAt(k)); |
---|
1381 | m_allTheRules[2].addElement(rules[2].elementAt(k)); |
---|
1382 | |
---|
1383 | m_allTheRules[3].addElement(rules[3].elementAt(k)); |
---|
1384 | m_allTheRules[4].addElement(rules[4].elementAt(k)); |
---|
1385 | m_allTheRules[5].addElement(rules[5].elementAt(k)); |
---|
1386 | } |
---|
1387 | } |
---|
1388 | } |
---|
1389 | } |
---|
1390 | |
---|
1391 | /** |
---|
1392 | * Method that finds all association rules. |
---|
1393 | * |
---|
1394 | * @throws Exception if an attribute is numeric |
---|
1395 | */ |
---|
1396 | private void findRulesQuickly() throws Exception { |
---|
1397 | |
---|
1398 | FastVector[] rules; |
---|
1399 | |
---|
1400 | // Build rules |
---|
1401 | for (int j = 1; j < m_Ls.size(); j++) { |
---|
1402 | FastVector currentItemSets = (FastVector)m_Ls.elementAt(j); |
---|
1403 | Enumeration enumItemSets = currentItemSets.elements(); |
---|
1404 | while (enumItemSets.hasMoreElements()) { |
---|
1405 | AprioriItemSet currentItemSet = (AprioriItemSet)enumItemSets.nextElement(); |
---|
1406 | //AprioriItemSet currentItemSet = new AprioriItemSet((ItemSet)enumItemSets.nextElement()); |
---|
1407 | rules = currentItemSet.generateRules(m_minMetric, m_hashtables, j + 1); |
---|
1408 | for (int k = 0; k < rules[0].size(); k++) { |
---|
1409 | m_allTheRules[0].addElement(rules[0].elementAt(k)); |
---|
1410 | m_allTheRules[1].addElement(rules[1].elementAt(k)); |
---|
1411 | m_allTheRules[2].addElement(rules[2].elementAt(k)); |
---|
1412 | } |
---|
1413 | } |
---|
1414 | } |
---|
1415 | } |
---|
1416 | |
---|
1417 | /** |
---|
1418 | * |
---|
1419 | * Method that finds all large itemsets for class association rules for the given set of instances. |
---|
1420 | * @throws Exception if an attribute is numeric |
---|
1421 | */ |
---|
1422 | private void findLargeCarItemSets() throws Exception { |
---|
1423 | |
---|
1424 | FastVector kMinusOneSets, kSets; |
---|
1425 | Hashtable hashtable; |
---|
1426 | int necSupport, necMaxSupport,i = 0; |
---|
1427 | |
---|
1428 | // Find large itemsets |
---|
1429 | |
---|
1430 | // minimum support |
---|
1431 | double nextMinSupport = m_minSupport*(double)m_instances.numInstances(); |
---|
1432 | double nextMaxSupport = m_upperBoundMinSupport*(double)m_instances.numInstances(); |
---|
1433 | if((double)Math.rint(nextMinSupport) == nextMinSupport){ |
---|
1434 | necSupport = (int) nextMinSupport; |
---|
1435 | } |
---|
1436 | else{ |
---|
1437 | necSupport = Math.round((float)(nextMinSupport+0.5)); |
---|
1438 | } |
---|
1439 | if((double)Math.rint(nextMaxSupport) == nextMaxSupport){ |
---|
1440 | necMaxSupport = (int) nextMaxSupport; |
---|
1441 | } |
---|
1442 | else{ |
---|
1443 | necMaxSupport = Math.round((float)(nextMaxSupport+0.5)); |
---|
1444 | } |
---|
1445 | |
---|
1446 | //find item sets of length one |
---|
1447 | kSets = LabeledItemSet.singletons(m_instances,m_onlyClass); |
---|
1448 | LabeledItemSet.upDateCounters(kSets, m_instances,m_onlyClass); |
---|
1449 | |
---|
1450 | //check if a item set of lentgh one is frequent, if not delete it |
---|
1451 | kSets = LabeledItemSet.deleteItemSets(kSets, necSupport, necMaxSupport); |
---|
1452 | if (kSets.size() == 0) |
---|
1453 | return; |
---|
1454 | do { |
---|
1455 | m_Ls.addElement(kSets); |
---|
1456 | kMinusOneSets = kSets; |
---|
1457 | kSets = LabeledItemSet.mergeAllItemSets(kMinusOneSets, i, m_instances.numInstances()); |
---|
1458 | hashtable = LabeledItemSet.getHashtable(kMinusOneSets, kMinusOneSets.size()); |
---|
1459 | kSets = LabeledItemSet.pruneItemSets(kSets, hashtable); |
---|
1460 | LabeledItemSet.upDateCounters(kSets, m_instances,m_onlyClass); |
---|
1461 | kSets = LabeledItemSet.deleteItemSets(kSets, necSupport, necMaxSupport); |
---|
1462 | i++; |
---|
1463 | } while (kSets.size() > 0); |
---|
1464 | } |
---|
1465 | |
---|
1466 | |
---|
1467 | |
---|
1468 | /** |
---|
1469 | * Method that finds all class association rules. |
---|
1470 | * |
---|
1471 | * @throws Exception if an attribute is numeric |
---|
1472 | */ |
---|
1473 | private void findCarRulesQuickly() throws Exception { |
---|
1474 | |
---|
1475 | FastVector[] rules; |
---|
1476 | |
---|
1477 | // Build rules |
---|
1478 | for (int j = 0; j < m_Ls.size(); j++) { |
---|
1479 | FastVector currentLabeledItemSets = (FastVector)m_Ls.elementAt(j); |
---|
1480 | Enumeration enumLabeledItemSets = currentLabeledItemSets.elements(); |
---|
1481 | while (enumLabeledItemSets.hasMoreElements()) { |
---|
1482 | LabeledItemSet currentLabeledItemSet = (LabeledItemSet)enumLabeledItemSets.nextElement(); |
---|
1483 | rules = currentLabeledItemSet.generateRules(m_minMetric,false); |
---|
1484 | for (int k = 0; k < rules[0].size(); k++) { |
---|
1485 | m_allTheRules[0].addElement(rules[0].elementAt(k)); |
---|
1486 | m_allTheRules[1].addElement(rules[1].elementAt(k)); |
---|
1487 | m_allTheRules[2].addElement(rules[2].elementAt(k)); |
---|
1488 | } |
---|
1489 | } |
---|
1490 | } |
---|
1491 | } |
---|
1492 | |
---|
1493 | /** |
---|
1494 | * returns all the rules |
---|
1495 | * |
---|
1496 | * @return all the rules |
---|
1497 | * @see #m_allTheRules |
---|
1498 | */ |
---|
1499 | public FastVector[] getAllTheRules() { |
---|
1500 | return m_allTheRules; |
---|
1501 | } |
---|
1502 | |
---|
1503 | /** |
---|
1504 | * Returns the revision string. |
---|
1505 | * |
---|
1506 | * @return the revision |
---|
1507 | */ |
---|
1508 | public String getRevision() { |
---|
1509 | return RevisionUtils.extract("$Revision: 5698 $"); |
---|
1510 | } |
---|
1511 | |
---|
1512 | /** |
---|
1513 | * Main method. |
---|
1514 | * |
---|
1515 | * @param args the commandline options |
---|
1516 | */ |
---|
1517 | public static void main(String[] args) { |
---|
1518 | runAssociator(new Apriori(), args); |
---|
1519 | } |
---|
1520 | } |
---|
1521 | |
---|