1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * SubsetSizeForwardSelection.java |
---|
19 | * Copyright (C) 2007 Martin Guetlein |
---|
20 | * |
---|
21 | */ |
---|
22 | package weka.attributeSelection; |
---|
23 | |
---|
24 | import weka.core.Instances; |
---|
25 | import weka.core.Option; |
---|
26 | import weka.core.OptionHandler; |
---|
27 | import weka.core.RevisionUtils; |
---|
28 | import weka.core.SelectedTag; |
---|
29 | import weka.core.Tag; |
---|
30 | import weka.core.TechnicalInformation; |
---|
31 | import weka.core.Utils; |
---|
32 | import weka.core.TechnicalInformation.Field; |
---|
33 | import weka.core.TechnicalInformation.Type; |
---|
34 | |
---|
35 | import java.util.BitSet; |
---|
36 | import java.util.Enumeration; |
---|
37 | import java.util.Random; |
---|
38 | import java.util.Vector; |
---|
39 | |
---|
40 | |
---|
41 | /** |
---|
42 | <!-- globalinfo-start --> |
---|
43 | * SubsetSizeForwardSelection:<br/> |
---|
44 | * <br/> |
---|
45 | * Extension of LinearForwardSelection. The search performs an interior cross-validation (seed and number of folds can be specified). A LinearForwardSelection is performed on each foldto determine the optimal subset-size (using the given SubsetSizeEvaluator). Finally, a LinearForwardSelection up to the optimal subset-size is performed on the whole data.<br/> |
---|
46 | * <br/> |
---|
47 | * For more information see:<br/> |
---|
48 | * <br/> |
---|
49 | * Martin Guetlein (2006). Large Scale Attribute Selection Using Wrappers. Freiburg, Germany. |
---|
50 | * <p/> |
---|
51 | <!-- globalinfo-end --> |
---|
52 | * |
---|
53 | <!-- options-start --> |
---|
54 | * Valid options are: <p/> |
---|
55 | * |
---|
56 | * <pre> -I |
---|
57 | * Perform initial ranking to select the |
---|
58 | * top-ranked attributes.</pre> |
---|
59 | * |
---|
60 | * <pre> -K <num> |
---|
61 | * Number of top-ranked attributes that are |
---|
62 | * taken into account by the search.</pre> |
---|
63 | * |
---|
64 | * <pre> -T <0 = fixed-set | 1 = fixed-width> |
---|
65 | * Type of Linear Forward Selection (default = 0).</pre> |
---|
66 | * |
---|
67 | * <pre> -S <num> |
---|
68 | * Size of lookup cache for evaluated subsets. |
---|
69 | * Expressed as a multiple of the number of |
---|
70 | * attributes in the data set. (default = 1)</pre> |
---|
71 | * |
---|
72 | * <pre> -E <subset evaluator> |
---|
73 | * Subset-evaluator used for subset-size determination.-- -M</pre> |
---|
74 | * |
---|
75 | * <pre> -F <num> |
---|
76 | * Number of cross validation folds |
---|
77 | * for subset size determination (default = 5).</pre> |
---|
78 | * |
---|
79 | * <pre> -R <num> |
---|
80 | * Seed for cross validation |
---|
81 | * subset size determination. (default = 1)</pre> |
---|
82 | * |
---|
83 | * <pre> -Z |
---|
84 | * verbose on/off</pre> |
---|
85 | * |
---|
86 | * <pre> |
---|
87 | * Options specific to evaluator weka.attributeSelection.ClassifierSubsetEval: |
---|
88 | * </pre> |
---|
89 | * |
---|
90 | * <pre> -B <classifier> |
---|
91 | * class name of the classifier to use for accuracy estimation. |
---|
92 | * Place any classifier options LAST on the command line |
---|
93 | * following a "--". eg.: |
---|
94 | * -B weka.classifiers.bayes.NaiveBayes ... -- -K |
---|
95 | * (default: weka.classifiers.rules.ZeroR)</pre> |
---|
96 | * |
---|
97 | * <pre> -T |
---|
98 | * Use the training data to estimate accuracy.</pre> |
---|
99 | * |
---|
100 | * <pre> -H <filename> |
---|
101 | * Name of the hold out/test set to |
---|
102 | * estimate accuracy on.</pre> |
---|
103 | * |
---|
104 | * <pre> |
---|
105 | * Options specific to scheme weka.classifiers.rules.ZeroR: |
---|
106 | * </pre> |
---|
107 | * |
---|
108 | * <pre> -D |
---|
109 | * If set, classifier is run in debug mode and |
---|
110 | * may output additional info to the console</pre> |
---|
111 | * |
---|
112 | <!-- options-end --> |
---|
113 | * |
---|
114 | * @author Martin Guetlein (martin.guetlein@gmail.com) |
---|
115 | * @version $Revision: 5604 $ |
---|
116 | */ |
---|
117 | public class SubsetSizeForwardSelection extends ASSearch |
---|
118 | implements OptionHandler { |
---|
119 | /** search directions */ |
---|
120 | protected static final int TYPE_FIXED_SET = 0; |
---|
121 | protected static final int TYPE_FIXED_WIDTH = 1; |
---|
122 | public static final Tag[] TAGS_TYPE = { |
---|
123 | new Tag(TYPE_FIXED_SET, "Fixed-set"), |
---|
124 | new Tag(TYPE_FIXED_WIDTH, "Fixed-width"), |
---|
125 | }; |
---|
126 | |
---|
127 | // member variables |
---|
128 | /** perform initial ranking to select top-ranked attributes */ |
---|
129 | protected boolean m_performRanking; |
---|
130 | |
---|
131 | /** |
---|
132 | * number of top-ranked attributes that are taken into account for the |
---|
133 | * search |
---|
134 | */ |
---|
135 | protected int m_numUsedAttributes; |
---|
136 | |
---|
137 | /** 0 == fixed-set, 1 == fixed-width */ |
---|
138 | protected int m_linearSelectionType; |
---|
139 | |
---|
140 | /** the subset evaluator to use for subset size determination */ |
---|
141 | private ASEvaluation m_setSizeEval; |
---|
142 | |
---|
143 | /** |
---|
144 | * Number of cross validation folds for subset size determination (default = |
---|
145 | * 5). |
---|
146 | */ |
---|
147 | protected int m_numFolds; |
---|
148 | |
---|
149 | /** Seed for cross validation subset size determination. (default = 1) */ |
---|
150 | protected int m_seed; |
---|
151 | |
---|
152 | /** number of attributes in the data */ |
---|
153 | protected int m_numAttribs; |
---|
154 | |
---|
155 | /** total number of subsets evaluated during a search */ |
---|
156 | protected int m_totalEvals; |
---|
157 | |
---|
158 | /** for debugging */ |
---|
159 | protected boolean m_verbose; |
---|
160 | |
---|
161 | /** holds the merit of the best subset found */ |
---|
162 | protected double m_bestMerit; |
---|
163 | |
---|
164 | /** holds the maximum size of the lookup cache for evaluated subsets */ |
---|
165 | protected int m_cacheSize; |
---|
166 | |
---|
167 | /** |
---|
168 | * Constructor |
---|
169 | */ |
---|
170 | public SubsetSizeForwardSelection() { |
---|
171 | resetOptions(); |
---|
172 | } |
---|
173 | |
---|
174 | /** |
---|
175 | * Returns a string describing this search method |
---|
176 | * |
---|
177 | * @return a description of the search method suitable for displaying in the |
---|
178 | * explorer/experimenter gui |
---|
179 | */ |
---|
180 | public String globalInfo() { |
---|
181 | return "SubsetSizeForwardSelection:\n\n" + |
---|
182 | "Extension of LinearForwardSelection. The search performs an interior " + |
---|
183 | "cross-validation (seed and number of folds can be specified). A " + |
---|
184 | "LinearForwardSelection is performed on each foldto determine the optimal " + |
---|
185 | "subset-size (using the given SubsetSizeEvaluator). Finally, a " + |
---|
186 | "LinearForwardSelection up to the optimal subset-size is performed on " + |
---|
187 | "the whole data.\n\n" |
---|
188 | + "For more information see:\n\n" |
---|
189 | + getTechnicalInformation().toString(); |
---|
190 | } |
---|
191 | |
---|
192 | /** |
---|
193 | * Returns an instance of a TechnicalInformation object, containing |
---|
194 | * detailed information about the technical background of this class, |
---|
195 | * e.g., paper reference or book this class is based on. |
---|
196 | * |
---|
197 | * @return the technical information about this class |
---|
198 | */ |
---|
199 | public TechnicalInformation getTechnicalInformation() { |
---|
200 | TechnicalInformation result; |
---|
201 | TechnicalInformation additional; |
---|
202 | |
---|
203 | result = new TechnicalInformation(Type.INPROCEEDINGS); |
---|
204 | result.setValue(Field.AUTHOR, "Martin Guetlein and Eibe Frank and Mark Hall"); |
---|
205 | result.setValue(Field.YEAR, "2009"); |
---|
206 | result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers"); |
---|
207 | result.setValue(Field.BOOKTITLE, "Proc IEEE Symposium on Computational Intelligence and Data Mining"); |
---|
208 | result.setValue(Field.PAGES, "332-339"); |
---|
209 | result.setValue(Field.PUBLISHER, "IEEE"); |
---|
210 | |
---|
211 | additional = result.add(Type.MASTERSTHESIS); |
---|
212 | additional.setValue(Field.AUTHOR, "Martin Guetlein"); |
---|
213 | additional.setValue(Field.YEAR, "2006"); |
---|
214 | additional.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers"); |
---|
215 | additional.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitaet"); |
---|
216 | additional.setValue(Field.ADDRESS, "Freiburg, Germany"); |
---|
217 | |
---|
218 | return result; |
---|
219 | } |
---|
220 | |
---|
221 | /** |
---|
222 | * Returns an enumeration describing the available options. |
---|
223 | * |
---|
224 | * @return an enumeration of all the available options. |
---|
225 | * |
---|
226 | */ |
---|
227 | public Enumeration listOptions() { |
---|
228 | Vector newVector = new Vector(9); |
---|
229 | |
---|
230 | newVector.addElement(new Option("\tPerform initial ranking to select the" + |
---|
231 | "\n\ttop-ranked attributes.", "I", 0, "-I")); |
---|
232 | newVector.addElement(new Option( |
---|
233 | "\tNumber of top-ranked attributes that are " + |
---|
234 | "\n\ttaken into account by the search.", "K", 1, "-K <num>")); |
---|
235 | newVector.addElement(new Option( |
---|
236 | "\tType of Linear Forward Selection (default = 0).", "T", 1, |
---|
237 | "-T <0 = fixed-set | 1 = fixed-width>")); |
---|
238 | newVector.addElement(new Option( |
---|
239 | "\tSize of lookup cache for evaluated subsets." + |
---|
240 | "\n\tExpressed as a multiple of the number of" + |
---|
241 | "\n\tattributes in the data set. (default = 1)", "S", 1, "-S <num>")); |
---|
242 | newVector.addElement(new Option( |
---|
243 | "\tSubset-evaluator used for subset-size determination." + "-- -M", |
---|
244 | "E", 1, "-E <subset evaluator>")); |
---|
245 | newVector.addElement(new Option("\tNumber of cross validation folds" + |
---|
246 | "\n\tfor subset size determination (default = 5).", "F", 1, "-F <num>")); |
---|
247 | newVector.addElement(new Option("\tSeed for cross validation" + |
---|
248 | "\n\tsubset size determination. (default = 1)", "R", 1, "-R <num>")); |
---|
249 | newVector.addElement(new Option("\tverbose on/off", "Z", 0, "-Z")); |
---|
250 | |
---|
251 | if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) { |
---|
252 | newVector.addElement(new Option("", "", 0, |
---|
253 | "\nOptions specific to " + "evaluator " + |
---|
254 | m_setSizeEval.getClass().getName() + ":")); |
---|
255 | |
---|
256 | Enumeration enu = ((OptionHandler) m_setSizeEval).listOptions(); |
---|
257 | |
---|
258 | while (enu.hasMoreElements()) { |
---|
259 | newVector.addElement(enu.nextElement()); |
---|
260 | } |
---|
261 | } |
---|
262 | |
---|
263 | return newVector.elements(); |
---|
264 | } |
---|
265 | |
---|
266 | /** |
---|
267 | * Parses a given list of options. |
---|
268 | * |
---|
269 | * Valid options are: |
---|
270 | * <p> |
---|
271 | * |
---|
272 | * -I <br> |
---|
273 | * Perform initial ranking to select top-ranked attributes. |
---|
274 | * <p> |
---|
275 | * |
---|
276 | * -K <num> <br> |
---|
277 | * Number of top-ranked attributes that are taken into account. |
---|
278 | * <p> |
---|
279 | * |
---|
280 | * -T <0 = fixed-set | 1 = fixed-width> <br> |
---|
281 | * Typ of Linear Forward Selection (default = 0). |
---|
282 | * <p> |
---|
283 | * |
---|
284 | * -S <num> <br> |
---|
285 | * Size of lookup cache for evaluated subsets. Expressed as a multiple of |
---|
286 | * the number of attributes in the data set. (default = 1). |
---|
287 | * <p> |
---|
288 | * |
---|
289 | * -E <string> <br> |
---|
290 | * class name of subset evaluator to use for subset size determination |
---|
291 | * (default = null, same subset evaluator as for ranking and final forward |
---|
292 | * selection is used). Place any evaluator options LAST on the command line |
---|
293 | * following a "--". eg. -A weka.attributeSelection.ClassifierSubsetEval ... -- |
---|
294 | * -M |
---|
295 | * |
---|
296 | * </pre> |
---|
297 | * |
---|
298 | * -F <num> <br> |
---|
299 | * Number of cross validation folds for subset size determination (default = |
---|
300 | * 5). |
---|
301 | * <p> |
---|
302 | * |
---|
303 | * -R <num> <br> |
---|
304 | * Seed for cross validation subset size determination. (default = 1) |
---|
305 | * <p> |
---|
306 | * |
---|
307 | * -Z <br> |
---|
308 | * verbose on/off. |
---|
309 | * <p> |
---|
310 | * |
---|
311 | * @param options |
---|
312 | * the list of options as an array of strings |
---|
313 | * @exception Exception |
---|
314 | * if an option is not supported |
---|
315 | * |
---|
316 | */ |
---|
317 | public void setOptions(String[] options) throws Exception { |
---|
318 | String optionString; |
---|
319 | resetOptions(); |
---|
320 | |
---|
321 | setPerformRanking(Utils.getFlag('I', options)); |
---|
322 | |
---|
323 | optionString = Utils.getOption('K', options); |
---|
324 | |
---|
325 | if (optionString.length() != 0) { |
---|
326 | setNumUsedAttributes(Integer.parseInt(optionString)); |
---|
327 | } |
---|
328 | |
---|
329 | optionString = Utils.getOption('T', options); |
---|
330 | |
---|
331 | if (optionString.length() != 0) { |
---|
332 | setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE)); |
---|
333 | } else { |
---|
334 | setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE)); |
---|
335 | } |
---|
336 | |
---|
337 | optionString = Utils.getOption('S', options); |
---|
338 | |
---|
339 | if (optionString.length() != 0) { |
---|
340 | setLookupCacheSize(Integer.parseInt(optionString)); |
---|
341 | } |
---|
342 | |
---|
343 | optionString = Utils.getOption('E', options); |
---|
344 | |
---|
345 | if (optionString.length() == 0) { |
---|
346 | System.out.println( |
---|
347 | "No subset size evaluator given, using evaluator that is used for final search."); |
---|
348 | m_setSizeEval = null; |
---|
349 | } else { |
---|
350 | setSubsetSizeEvaluator(ASEvaluation.forName(optionString, |
---|
351 | Utils.partitionOptions(options))); |
---|
352 | } |
---|
353 | |
---|
354 | optionString = Utils.getOption('F', options); |
---|
355 | |
---|
356 | if (optionString.length() != 0) { |
---|
357 | setNumSubsetSizeCVFolds(Integer.parseInt(optionString)); |
---|
358 | } |
---|
359 | |
---|
360 | optionString = Utils.getOption('R', options); |
---|
361 | |
---|
362 | if (optionString.length() != 0) { |
---|
363 | setSeed(Integer.parseInt(optionString)); |
---|
364 | } |
---|
365 | |
---|
366 | m_verbose = Utils.getFlag('Z', options); |
---|
367 | } |
---|
368 | |
---|
369 | /** |
---|
370 | * Set the maximum size of the evaluated subset cache (hashtable). This is |
---|
371 | * expressed as a multiplier for the number of attributes in the data set. |
---|
372 | * (default = 1). |
---|
373 | * |
---|
374 | * @param size |
---|
375 | * the maximum size of the hashtable |
---|
376 | */ |
---|
377 | public void setLookupCacheSize(int size) { |
---|
378 | if (size >= 0) { |
---|
379 | m_cacheSize = size; |
---|
380 | } |
---|
381 | } |
---|
382 | |
---|
383 | /** |
---|
384 | * Return the maximum size of the evaluated subset cache (expressed as a |
---|
385 | * multiplier for the number of attributes in a data set. |
---|
386 | * |
---|
387 | * @return the maximum size of the hashtable. |
---|
388 | */ |
---|
389 | public int getLookupCacheSize() { |
---|
390 | return m_cacheSize; |
---|
391 | } |
---|
392 | |
---|
393 | /** |
---|
394 | * Returns the tip text for this property |
---|
395 | * |
---|
396 | * @return tip text for this property suitable for displaying in the |
---|
397 | * explorer/experimenter gui |
---|
398 | */ |
---|
399 | public String lookupCacheSizeTipText() { |
---|
400 | return "Set the maximum size of the lookup cache of evaluated subsets. This is " + |
---|
401 | "expressed as a multiplier of the number of attributes in the data set. " + |
---|
402 | "(default = 1)."; |
---|
403 | } |
---|
404 | |
---|
405 | /** |
---|
406 | * Returns the tip text for this property |
---|
407 | * |
---|
408 | * @return tip text for this property suitable for displaying in the |
---|
409 | * explorer/experimenter gui |
---|
410 | */ |
---|
411 | public String performRankingTipText() { |
---|
412 | return "Perform initial ranking to select top-ranked attributes."; |
---|
413 | } |
---|
414 | |
---|
415 | /** |
---|
416 | * Perform initial ranking to select top-ranked attributes. |
---|
417 | * |
---|
418 | * @param b |
---|
419 | * true if initial ranking should be performed |
---|
420 | */ |
---|
421 | public void setPerformRanking(boolean b) { |
---|
422 | m_performRanking = b; |
---|
423 | } |
---|
424 | |
---|
425 | /** |
---|
426 | * Get boolean if initial ranking should be performed to select the |
---|
427 | * top-ranked attributes |
---|
428 | * |
---|
429 | * @return true if initial ranking should be performed |
---|
430 | */ |
---|
431 | public boolean getPerformRanking() { |
---|
432 | return m_performRanking; |
---|
433 | } |
---|
434 | |
---|
435 | /** |
---|
436 | * Returns the tip text for this property |
---|
437 | * |
---|
438 | * @return tip text for this property suitable for displaying in the |
---|
439 | * explorer/experimenter gui |
---|
440 | */ |
---|
441 | public String numUsedAttributesTipText() { |
---|
442 | return "Set the amount of top-ranked attributes that are taken into account by the search process."; |
---|
443 | } |
---|
444 | |
---|
445 | /** |
---|
446 | * Set the number of top-ranked attributes that taken into account by the |
---|
447 | * search process. |
---|
448 | * |
---|
449 | * @param k |
---|
450 | * the number of attributes |
---|
451 | * @exception Exception |
---|
452 | * if k is less than 2 |
---|
453 | */ |
---|
454 | public void setNumUsedAttributes(int k) throws Exception { |
---|
455 | if (k < 2) { |
---|
456 | throw new Exception("Value of -K must be >= 2."); |
---|
457 | } |
---|
458 | |
---|
459 | m_numUsedAttributes = k; |
---|
460 | } |
---|
461 | |
---|
462 | /** |
---|
463 | * Get the number of top-ranked attributes that taken into account by the |
---|
464 | * search process. |
---|
465 | * |
---|
466 | * @return the number of top-ranked attributes that taken into account |
---|
467 | */ |
---|
468 | public int getNumUsedAttributes() { |
---|
469 | return m_numUsedAttributes; |
---|
470 | } |
---|
471 | |
---|
472 | /** |
---|
473 | * Returns the tip text for this property |
---|
474 | * |
---|
475 | * @return tip text for this property suitable for displaying in the |
---|
476 | * explorer/experimenter gui |
---|
477 | */ |
---|
478 | public String typeTipText() { |
---|
479 | return "Set the type of the search."; |
---|
480 | } |
---|
481 | |
---|
482 | /** |
---|
483 | * Set the type |
---|
484 | * |
---|
485 | * @param t |
---|
486 | * the Linear Forward Selection type |
---|
487 | */ |
---|
488 | public void setType(SelectedTag t) { |
---|
489 | if (t.getTags() == TAGS_TYPE) { |
---|
490 | m_linearSelectionType = t.getSelectedTag().getID(); |
---|
491 | } |
---|
492 | } |
---|
493 | |
---|
494 | /** |
---|
495 | * Get the type |
---|
496 | * |
---|
497 | * @return the Linear Forward Selection type |
---|
498 | */ |
---|
499 | public SelectedTag getType() { |
---|
500 | return new SelectedTag(m_linearSelectionType, TAGS_TYPE); |
---|
501 | } |
---|
502 | |
---|
503 | /** |
---|
504 | * Returns the tip text for this property |
---|
505 | * |
---|
506 | * @return tip text for this property suitable for displaying in the |
---|
507 | * explorer/experimenter gui |
---|
508 | */ |
---|
509 | public String subsetSizeEvaluatorTipText() { |
---|
510 | return "Subset evaluator to use for subset size determination."; |
---|
511 | } |
---|
512 | |
---|
513 | /** |
---|
514 | * Set the subset evaluator to use for subset size determination. |
---|
515 | * |
---|
516 | * @param eval |
---|
517 | * the subset evaluator to use for subset size determination. |
---|
518 | */ |
---|
519 | public void setSubsetSizeEvaluator(ASEvaluation eval) |
---|
520 | throws Exception { |
---|
521 | if (!(eval instanceof SubsetEvaluator)) { |
---|
522 | throw new Exception(eval.getClass().getName() + |
---|
523 | " is no subset evaluator."); |
---|
524 | } |
---|
525 | |
---|
526 | m_setSizeEval = eval; |
---|
527 | } |
---|
528 | |
---|
529 | /** |
---|
530 | * Get the subset evaluator used for subset size determination. |
---|
531 | * |
---|
532 | * @return the evaluator used for subset size determination. |
---|
533 | */ |
---|
534 | public ASEvaluation getSubsetSizeEvaluator() { |
---|
535 | return m_setSizeEval; |
---|
536 | } |
---|
537 | |
---|
538 | /** |
---|
539 | * Returns the tip text for this property |
---|
540 | * |
---|
541 | * @return tip text for this property suitable for displaying in the |
---|
542 | * explorer/experimenter gui |
---|
543 | */ |
---|
544 | public String numSubsetSizeCVFoldsTipText() { |
---|
545 | return "Number of cross validation folds for subset size determination"; |
---|
546 | } |
---|
547 | |
---|
548 | /** |
---|
549 | * Set the number of cross validation folds for subset size determination |
---|
550 | * (default = 5). |
---|
551 | * |
---|
552 | * @param f |
---|
553 | * number of folds |
---|
554 | */ |
---|
555 | public void setNumSubsetSizeCVFolds(int f) { |
---|
556 | m_numFolds = f; |
---|
557 | } |
---|
558 | |
---|
559 | /** |
---|
560 | * Get the number of cross validation folds for subset size determination |
---|
561 | * (default = 5). |
---|
562 | * |
---|
563 | * @return number of folds |
---|
564 | */ |
---|
565 | public int getNumSubsetSizeCVFolds() { |
---|
566 | return m_numFolds; |
---|
567 | } |
---|
568 | |
---|
569 | /** |
---|
570 | * Returns the tip text for this property |
---|
571 | * |
---|
572 | * @return tip text for this property suitable for displaying in the |
---|
573 | * explorer/experimenter gui |
---|
574 | */ |
---|
575 | public String seedTipText() { |
---|
576 | return "Seed for cross validation subset size determination. (default = 1)"; |
---|
577 | } |
---|
578 | |
---|
579 | /** |
---|
580 | * Seed for cross validation subset size determination. (default = 1) |
---|
581 | * |
---|
582 | * @param s |
---|
583 | * seed |
---|
584 | */ |
---|
585 | public void setSeed(int s) { |
---|
586 | m_seed = s; |
---|
587 | } |
---|
588 | |
---|
589 | /** |
---|
590 | * Seed for cross validation subset size determination. (default = 1) |
---|
591 | * |
---|
592 | * @return seed |
---|
593 | */ |
---|
594 | public int getSeed() { |
---|
595 | return m_seed; |
---|
596 | } |
---|
597 | |
---|
598 | /** |
---|
599 | * Returns the tip text for this property |
---|
600 | * |
---|
601 | * @return tip text for this property suitable for displaying in the |
---|
602 | * explorer/experimenter gui |
---|
603 | */ |
---|
604 | public String verboseTipText() { |
---|
605 | return "Turn on verbose output for monitoring the search's progress."; |
---|
606 | } |
---|
607 | |
---|
608 | /** |
---|
609 | * Set whether verbose output should be generated. |
---|
610 | * |
---|
611 | * @param b |
---|
612 | * true if output is to be verbose. |
---|
613 | */ |
---|
614 | public void setVerbose(boolean b) { |
---|
615 | m_verbose = b; |
---|
616 | } |
---|
617 | |
---|
618 | /** |
---|
619 | * Get whether output is to be verbose |
---|
620 | * |
---|
621 | * @return true if output will be verbose |
---|
622 | */ |
---|
623 | public boolean getVerbose() { |
---|
624 | return m_verbose; |
---|
625 | } |
---|
626 | |
---|
627 | /** |
---|
628 | * Gets the current settings of LinearForwardSelection. |
---|
629 | * |
---|
630 | * @return an array of strings suitable for passing to setOptions() |
---|
631 | */ |
---|
632 | public String[] getOptions() { |
---|
633 | String[] evaluatorOptions = new String[0]; |
---|
634 | |
---|
635 | if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) { |
---|
636 | evaluatorOptions = ((OptionHandler) m_setSizeEval).getOptions(); |
---|
637 | } |
---|
638 | |
---|
639 | String[] options = new String[15 + evaluatorOptions.length]; |
---|
640 | int current = 0; |
---|
641 | |
---|
642 | if (m_performRanking) { |
---|
643 | options[current++] = "-I"; |
---|
644 | } |
---|
645 | |
---|
646 | options[current++] = "-K"; |
---|
647 | options[current++] = "" + m_numUsedAttributes; |
---|
648 | options[current++] = "-T"; |
---|
649 | options[current++] = "" + m_linearSelectionType; |
---|
650 | |
---|
651 | options[current++] = "-F"; |
---|
652 | options[current++] = "" + m_numFolds; |
---|
653 | options[current++] = "-S"; |
---|
654 | options[current++] = "" + m_seed; |
---|
655 | |
---|
656 | options[current++] = "-Z"; |
---|
657 | options[current++] = "" + m_verbose; |
---|
658 | |
---|
659 | if (m_setSizeEval != null) { |
---|
660 | options[current++] = "-E"; |
---|
661 | options[current++] = m_setSizeEval.getClass().getName(); |
---|
662 | } |
---|
663 | |
---|
664 | options[current++] = "--"; |
---|
665 | System.arraycopy(evaluatorOptions, 0, options, current, |
---|
666 | evaluatorOptions.length); |
---|
667 | current += evaluatorOptions.length; |
---|
668 | |
---|
669 | while (current < options.length) { |
---|
670 | options[current++] = ""; |
---|
671 | } |
---|
672 | |
---|
673 | return options; |
---|
674 | } |
---|
675 | |
---|
676 | /** |
---|
677 | * returns a description of the search as a String |
---|
678 | * |
---|
679 | * @return a description of the search |
---|
680 | */ |
---|
681 | public String toString() { |
---|
682 | StringBuffer LFSString = new StringBuffer(); |
---|
683 | |
---|
684 | LFSString.append("\tSubset Size Forward Selection.\n"); |
---|
685 | |
---|
686 | LFSString.append("\tLinear Forward Selection Type: "); |
---|
687 | |
---|
688 | if (m_linearSelectionType == TYPE_FIXED_SET) { |
---|
689 | LFSString.append("fixed-set\n"); |
---|
690 | } else { |
---|
691 | LFSString.append("fixed-width\n"); |
---|
692 | } |
---|
693 | |
---|
694 | LFSString.append("\tNumber of top-ranked attributes that are used: " + |
---|
695 | m_numUsedAttributes + "\n"); |
---|
696 | |
---|
697 | LFSString.append( |
---|
698 | "\tNumber of cross validation folds for subset size determination: " + |
---|
699 | m_numFolds + "\n"); |
---|
700 | LFSString.append("\tSeed for cross validation subset size determination: " + |
---|
701 | m_seed + "\n"); |
---|
702 | |
---|
703 | LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals + |
---|
704 | "\n"); |
---|
705 | LFSString.append("\tMerit of best subset found: " + |
---|
706 | Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n"); |
---|
707 | |
---|
708 | return LFSString.toString(); |
---|
709 | } |
---|
710 | |
---|
711 | /** |
---|
712 | * Searches the attribute subset space by subset size forward selection |
---|
713 | * |
---|
714 | * @param ASEval |
---|
715 | * the attribute evaluator to guide the search |
---|
716 | * @param data |
---|
717 | * the training instances. |
---|
718 | * @return an array (not necessarily ordered) of selected attribute indexes |
---|
719 | * @exception Exception |
---|
720 | * if the search can't be completed |
---|
721 | */ |
---|
722 | public int[] search(ASEvaluation ASEval, Instances data) |
---|
723 | throws Exception { |
---|
724 | m_totalEvals = 0; |
---|
725 | |
---|
726 | if (!(ASEval instanceof SubsetEvaluator)) { |
---|
727 | throw new Exception(ASEval.getClass().getName() + " is not a " + |
---|
728 | "Subset evaluator!"); |
---|
729 | } |
---|
730 | |
---|
731 | if (m_setSizeEval == null) { |
---|
732 | m_setSizeEval = ASEval; |
---|
733 | } |
---|
734 | |
---|
735 | m_numAttribs = data.numAttributes(); |
---|
736 | |
---|
737 | if (m_numUsedAttributes > m_numAttribs) { |
---|
738 | System.out.println( |
---|
739 | "Decreasing number of top-ranked attributes to total number of attributes: " + |
---|
740 | data.numAttributes()); |
---|
741 | m_numUsedAttributes = m_numAttribs; |
---|
742 | } |
---|
743 | |
---|
744 | Instances[] trainData = new Instances[m_numFolds]; |
---|
745 | Instances[] testData = new Instances[m_numFolds]; |
---|
746 | LFSMethods[] searchResults = new LFSMethods[m_numFolds]; |
---|
747 | |
---|
748 | Random random = new Random(m_seed); |
---|
749 | Instances dataCopy = new Instances(data); |
---|
750 | dataCopy.randomize(random); |
---|
751 | |
---|
752 | if (dataCopy.classAttribute().isNominal()) { |
---|
753 | dataCopy.stratify(m_numFolds); |
---|
754 | } |
---|
755 | |
---|
756 | for (int f = 0; f < m_numFolds; f++) { |
---|
757 | trainData[f] = dataCopy.trainCV(m_numFolds, f, random); |
---|
758 | testData[f] = dataCopy.testCV(m_numFolds, f); |
---|
759 | } |
---|
760 | |
---|
761 | LFSMethods LSF = new LFSMethods(); |
---|
762 | |
---|
763 | int[] ranking; |
---|
764 | |
---|
765 | if (m_performRanking) { |
---|
766 | ASEval.buildEvaluator(data); |
---|
767 | ranking = LSF.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose); |
---|
768 | } else { |
---|
769 | ranking = new int[m_numAttribs]; |
---|
770 | |
---|
771 | for (int i = 0; i < ranking.length; i++) { |
---|
772 | ranking[i] = i; |
---|
773 | } |
---|
774 | } |
---|
775 | |
---|
776 | int maxSubsetSize = 0; |
---|
777 | |
---|
778 | for (int f = 0; f < m_numFolds; f++) { |
---|
779 | if (m_verbose) { |
---|
780 | System.out.println("perform search on internal fold: " + (f + 1) + "/" + |
---|
781 | m_numFolds); |
---|
782 | } |
---|
783 | |
---|
784 | m_setSizeEval.buildEvaluator(trainData[f]); |
---|
785 | searchResults[f] = new LFSMethods(); |
---|
786 | searchResults[f].forwardSearch(m_cacheSize, new BitSet(m_numAttribs), |
---|
787 | ranking, m_numUsedAttributes, |
---|
788 | m_linearSelectionType == TYPE_FIXED_WIDTH, 1, -1, trainData[f], |
---|
789 | (SubsetEvaluator)m_setSizeEval, m_verbose); |
---|
790 | |
---|
791 | maxSubsetSize = Math.max(maxSubsetSize, |
---|
792 | searchResults[f].getBestGroup().cardinality()); |
---|
793 | } |
---|
794 | |
---|
795 | if (m_verbose) { |
---|
796 | System.out.println( |
---|
797 | "continue searches on internal folds to maxSubsetSize (" + |
---|
798 | maxSubsetSize + ")"); |
---|
799 | } |
---|
800 | |
---|
801 | for (int f = 0; f < m_numFolds; f++) { |
---|
802 | if (m_verbose) { |
---|
803 | System.out.print("perform search on internal fold: " + (f + 1) + "/" + |
---|
804 | m_numFolds + " with starting set "); |
---|
805 | LFSMethods.printGroup(searchResults[f].getBestGroup(), |
---|
806 | trainData[f].numAttributes()); |
---|
807 | } |
---|
808 | |
---|
809 | if (searchResults[f].getBestGroup().cardinality() < maxSubsetSize) { |
---|
810 | m_setSizeEval.buildEvaluator(trainData[f]); |
---|
811 | searchResults[f].forwardSearch(m_cacheSize, |
---|
812 | searchResults[f].getBestGroup(), ranking, m_numUsedAttributes, |
---|
813 | m_linearSelectionType == TYPE_FIXED_WIDTH, 1, maxSubsetSize, |
---|
814 | trainData[f], (SubsetEvaluator)m_setSizeEval, m_verbose); |
---|
815 | } |
---|
816 | } |
---|
817 | |
---|
818 | double[][] testMerit = new double[m_numFolds][maxSubsetSize + 1]; |
---|
819 | |
---|
820 | for (int f = 0; f < m_numFolds; f++) { |
---|
821 | for (int s = 1; s <= maxSubsetSize; s++) { |
---|
822 | if (HoldOutSubsetEvaluator.class.isInstance(m_setSizeEval)) { |
---|
823 | m_setSizeEval.buildEvaluator(trainData[f]); |
---|
824 | testMerit[f][s] = ((HoldOutSubsetEvaluator) m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize( |
---|
825 | s), testData[f]); |
---|
826 | } else { |
---|
827 | m_setSizeEval.buildEvaluator(testData[f]); |
---|
828 | testMerit[f][s] = ((SubsetEvaluator)m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize( |
---|
829 | s)); |
---|
830 | } |
---|
831 | } |
---|
832 | } |
---|
833 | |
---|
834 | double[] avgTestMerit = new double[maxSubsetSize + 1]; |
---|
835 | int finalSubsetSize = -1; |
---|
836 | |
---|
837 | for (int s = 1; s <= maxSubsetSize; s++) { |
---|
838 | for (int f = 0; f < m_numFolds; f++) { |
---|
839 | avgTestMerit[s] = ((avgTestMerit[s] * f) + testMerit[f][s]) / (double) (f + |
---|
840 | 1); |
---|
841 | } |
---|
842 | |
---|
843 | if ((finalSubsetSize == -1) || |
---|
844 | (avgTestMerit[s] > avgTestMerit[finalSubsetSize])) { |
---|
845 | finalSubsetSize = s; |
---|
846 | } |
---|
847 | |
---|
848 | if (m_verbose) { |
---|
849 | System.out.println("average merit for subset-size " + s + ": " + |
---|
850 | avgTestMerit[s]); |
---|
851 | } |
---|
852 | } |
---|
853 | |
---|
854 | if (m_verbose) { |
---|
855 | System.out.println("performing final forward selection to subset-size: " + |
---|
856 | finalSubsetSize); |
---|
857 | } |
---|
858 | |
---|
859 | ASEval.buildEvaluator(data); |
---|
860 | LSF.forwardSearch(m_cacheSize, new BitSet(m_numAttribs), ranking, |
---|
861 | m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, 1, |
---|
862 | finalSubsetSize, data, (SubsetEvaluator) ASEval, m_verbose); |
---|
863 | |
---|
864 | m_totalEvals = LSF.getNumEvalsTotal(); |
---|
865 | m_bestMerit = LSF.getBestMerit(); |
---|
866 | |
---|
867 | return attributeList(LSF.getBestGroup()); |
---|
868 | } |
---|
869 | |
---|
870 | /** |
---|
871 | * Reset options to default values |
---|
872 | */ |
---|
873 | protected void resetOptions() { |
---|
874 | m_performRanking = true; |
---|
875 | m_numUsedAttributes = 50; |
---|
876 | m_linearSelectionType = TYPE_FIXED_SET; |
---|
877 | m_setSizeEval = new ClassifierSubsetEval(); |
---|
878 | m_numFolds = 5; |
---|
879 | m_seed = 1; |
---|
880 | m_totalEvals = 0; |
---|
881 | m_cacheSize = 1; |
---|
882 | m_verbose = false; |
---|
883 | } |
---|
884 | |
---|
885 | /** |
---|
886 | * converts a BitSet into a list of attribute indexes |
---|
887 | * |
---|
888 | * @param group |
---|
889 | * the BitSet to convert |
---|
890 | * @return an array of attribute indexes |
---|
891 | */ |
---|
892 | protected int[] attributeList(BitSet group) { |
---|
893 | int count = 0; |
---|
894 | |
---|
895 | // count how many were selected |
---|
896 | for (int i = 0; i < m_numAttribs; i++) { |
---|
897 | if (group.get(i)) { |
---|
898 | count++; |
---|
899 | } |
---|
900 | } |
---|
901 | |
---|
902 | int[] list = new int[count]; |
---|
903 | count = 0; |
---|
904 | |
---|
905 | for (int i = 0; i < m_numAttribs; i++) { |
---|
906 | if (group.get(i)) { |
---|
907 | list[count++] = i; |
---|
908 | } |
---|
909 | } |
---|
910 | |
---|
911 | return list; |
---|
912 | } |
---|
913 | |
---|
914 | /** |
---|
915 | * Returns the revision string. |
---|
916 | * |
---|
917 | * @return the revision |
---|
918 | */ |
---|
919 | public String getRevision() { |
---|
920 | return RevisionUtils.extract("$Revision: 5604 $"); |
---|
921 | } |
---|
922 | } |
---|
923 | |
---|