| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * SubsetSizeForwardSelection.java |
|---|
| 19 | * Copyright (C) 2007 Martin Guetlein |
|---|
| 20 | * |
|---|
| 21 | */ |
|---|
| 22 | package weka.attributeSelection; |
|---|
| 23 | |
|---|
| 24 | import weka.core.Instances; |
|---|
| 25 | import weka.core.Option; |
|---|
| 26 | import weka.core.OptionHandler; |
|---|
| 27 | import weka.core.RevisionUtils; |
|---|
| 28 | import weka.core.SelectedTag; |
|---|
| 29 | import weka.core.Tag; |
|---|
| 30 | import weka.core.TechnicalInformation; |
|---|
| 31 | import weka.core.Utils; |
|---|
| 32 | import weka.core.TechnicalInformation.Field; |
|---|
| 33 | import weka.core.TechnicalInformation.Type; |
|---|
| 34 | |
|---|
| 35 | import java.util.BitSet; |
|---|
| 36 | import java.util.Enumeration; |
|---|
| 37 | import java.util.Random; |
|---|
| 38 | import java.util.Vector; |
|---|
| 39 | |
|---|
| 40 | |
|---|
| 41 | /** |
|---|
| 42 | <!-- globalinfo-start --> |
|---|
| 43 | * SubsetSizeForwardSelection:<br/> |
|---|
| 44 | * <br/> |
|---|
| 45 | * Extension of LinearForwardSelection. The search performs an interior cross-validation (seed and number of folds can be specified). A LinearForwardSelection is performed on each foldto determine the optimal subset-size (using the given SubsetSizeEvaluator). Finally, a LinearForwardSelection up to the optimal subset-size is performed on the whole data.<br/> |
|---|
| 46 | * <br/> |
|---|
| 47 | * For more information see:<br/> |
|---|
| 48 | * <br/> |
|---|
| 49 | * Martin Guetlein (2006). Large Scale Attribute Selection Using Wrappers. Freiburg, Germany. |
|---|
| 50 | * <p/> |
|---|
| 51 | <!-- globalinfo-end --> |
|---|
| 52 | * |
|---|
| 53 | <!-- options-start --> |
|---|
| 54 | * Valid options are: <p/> |
|---|
| 55 | * |
|---|
| 56 | * <pre> -I |
|---|
| 57 | * Perform initial ranking to select the |
|---|
| 58 | * top-ranked attributes.</pre> |
|---|
| 59 | * |
|---|
| 60 | * <pre> -K <num> |
|---|
| 61 | * Number of top-ranked attributes that are |
|---|
| 62 | * taken into account by the search.</pre> |
|---|
| 63 | * |
|---|
| 64 | * <pre> -T <0 = fixed-set | 1 = fixed-width> |
|---|
| 65 | * Type of Linear Forward Selection (default = 0).</pre> |
|---|
| 66 | * |
|---|
| 67 | * <pre> -S <num> |
|---|
| 68 | * Size of lookup cache for evaluated subsets. |
|---|
| 69 | * Expressed as a multiple of the number of |
|---|
| 70 | * attributes in the data set. (default = 1)</pre> |
|---|
| 71 | * |
|---|
| 72 | * <pre> -E <subset evaluator> |
|---|
| 73 | * Subset-evaluator used for subset-size determination.-- -M</pre> |
|---|
| 74 | * |
|---|
| 75 | * <pre> -F <num> |
|---|
| 76 | * Number of cross validation folds |
|---|
| 77 | * for subset size determination (default = 5).</pre> |
|---|
| 78 | * |
|---|
| 79 | * <pre> -R <num> |
|---|
| 80 | * Seed for cross validation |
|---|
| 81 | * subset size determination. (default = 1)</pre> |
|---|
| 82 | * |
|---|
| 83 | * <pre> -Z |
|---|
| 84 | * verbose on/off</pre> |
|---|
| 85 | * |
|---|
| 86 | * <pre> |
|---|
| 87 | * Options specific to evaluator weka.attributeSelection.ClassifierSubsetEval: |
|---|
| 88 | * </pre> |
|---|
| 89 | * |
|---|
| 90 | * <pre> -B <classifier> |
|---|
| 91 | * class name of the classifier to use for accuracy estimation. |
|---|
| 92 | * Place any classifier options LAST on the command line |
|---|
| 93 | * following a "--". eg.: |
|---|
| 94 | * -B weka.classifiers.bayes.NaiveBayes ... -- -K |
|---|
| 95 | * (default: weka.classifiers.rules.ZeroR)</pre> |
|---|
| 96 | * |
|---|
| 97 | * <pre> -T |
|---|
| 98 | * Use the training data to estimate accuracy.</pre> |
|---|
| 99 | * |
|---|
| 100 | * <pre> -H <filename> |
|---|
| 101 | * Name of the hold out/test set to |
|---|
| 102 | * estimate accuracy on.</pre> |
|---|
| 103 | * |
|---|
| 104 | * <pre> |
|---|
| 105 | * Options specific to scheme weka.classifiers.rules.ZeroR: |
|---|
| 106 | * </pre> |
|---|
| 107 | * |
|---|
| 108 | * <pre> -D |
|---|
| 109 | * If set, classifier is run in debug mode and |
|---|
| 110 | * may output additional info to the console</pre> |
|---|
| 111 | * |
|---|
| 112 | <!-- options-end --> |
|---|
| 113 | * |
|---|
| 114 | * @author Martin Guetlein (martin.guetlein@gmail.com) |
|---|
| 115 | * @version $Revision: 5604 $ |
|---|
| 116 | */ |
|---|
| 117 | public class SubsetSizeForwardSelection extends ASSearch |
|---|
| 118 | implements OptionHandler { |
|---|
| 119 | /** search directions */ |
|---|
| 120 | protected static final int TYPE_FIXED_SET = 0; |
|---|
| 121 | protected static final int TYPE_FIXED_WIDTH = 1; |
|---|
| 122 | public static final Tag[] TAGS_TYPE = { |
|---|
| 123 | new Tag(TYPE_FIXED_SET, "Fixed-set"), |
|---|
| 124 | new Tag(TYPE_FIXED_WIDTH, "Fixed-width"), |
|---|
| 125 | }; |
|---|
| 126 | |
|---|
| 127 | // member variables |
|---|
| 128 | /** perform initial ranking to select top-ranked attributes */ |
|---|
| 129 | protected boolean m_performRanking; |
|---|
| 130 | |
|---|
| 131 | /** |
|---|
| 132 | * number of top-ranked attributes that are taken into account for the |
|---|
| 133 | * search |
|---|
| 134 | */ |
|---|
| 135 | protected int m_numUsedAttributes; |
|---|
| 136 | |
|---|
| 137 | /** 0 == fixed-set, 1 == fixed-width */ |
|---|
| 138 | protected int m_linearSelectionType; |
|---|
| 139 | |
|---|
| 140 | /** the subset evaluator to use for subset size determination */ |
|---|
| 141 | private ASEvaluation m_setSizeEval; |
|---|
| 142 | |
|---|
| 143 | /** |
|---|
| 144 | * Number of cross validation folds for subset size determination (default = |
|---|
| 145 | * 5). |
|---|
| 146 | */ |
|---|
| 147 | protected int m_numFolds; |
|---|
| 148 | |
|---|
| 149 | /** Seed for cross validation subset size determination. (default = 1) */ |
|---|
| 150 | protected int m_seed; |
|---|
| 151 | |
|---|
| 152 | /** number of attributes in the data */ |
|---|
| 153 | protected int m_numAttribs; |
|---|
| 154 | |
|---|
| 155 | /** total number of subsets evaluated during a search */ |
|---|
| 156 | protected int m_totalEvals; |
|---|
| 157 | |
|---|
| 158 | /** for debugging */ |
|---|
| 159 | protected boolean m_verbose; |
|---|
| 160 | |
|---|
| 161 | /** holds the merit of the best subset found */ |
|---|
| 162 | protected double m_bestMerit; |
|---|
| 163 | |
|---|
| 164 | /** holds the maximum size of the lookup cache for evaluated subsets */ |
|---|
| 165 | protected int m_cacheSize; |
|---|
| 166 | |
|---|
| 167 | /** |
|---|
| 168 | * Constructor |
|---|
| 169 | */ |
|---|
| 170 | public SubsetSizeForwardSelection() { |
|---|
| 171 | resetOptions(); |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | /** |
|---|
| 175 | * Returns a string describing this search method |
|---|
| 176 | * |
|---|
| 177 | * @return a description of the search method suitable for displaying in the |
|---|
| 178 | * explorer/experimenter gui |
|---|
| 179 | */ |
|---|
| 180 | public String globalInfo() { |
|---|
| 181 | return "SubsetSizeForwardSelection:\n\n" + |
|---|
| 182 | "Extension of LinearForwardSelection. The search performs an interior " + |
|---|
| 183 | "cross-validation (seed and number of folds can be specified). A " + |
|---|
| 184 | "LinearForwardSelection is performed on each foldto determine the optimal " + |
|---|
| 185 | "subset-size (using the given SubsetSizeEvaluator). Finally, a " + |
|---|
| 186 | "LinearForwardSelection up to the optimal subset-size is performed on " + |
|---|
| 187 | "the whole data.\n\n" |
|---|
| 188 | + "For more information see:\n\n" |
|---|
| 189 | + getTechnicalInformation().toString(); |
|---|
| 190 | } |
|---|
| 191 | |
|---|
| 192 | /** |
|---|
| 193 | * Returns an instance of a TechnicalInformation object, containing |
|---|
| 194 | * detailed information about the technical background of this class, |
|---|
| 195 | * e.g., paper reference or book this class is based on. |
|---|
| 196 | * |
|---|
| 197 | * @return the technical information about this class |
|---|
| 198 | */ |
|---|
| 199 | public TechnicalInformation getTechnicalInformation() { |
|---|
| 200 | TechnicalInformation result; |
|---|
| 201 | TechnicalInformation additional; |
|---|
| 202 | |
|---|
| 203 | result = new TechnicalInformation(Type.INPROCEEDINGS); |
|---|
| 204 | result.setValue(Field.AUTHOR, "Martin Guetlein and Eibe Frank and Mark Hall"); |
|---|
| 205 | result.setValue(Field.YEAR, "2009"); |
|---|
| 206 | result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers"); |
|---|
| 207 | result.setValue(Field.BOOKTITLE, "Proc IEEE Symposium on Computational Intelligence and Data Mining"); |
|---|
| 208 | result.setValue(Field.PAGES, "332-339"); |
|---|
| 209 | result.setValue(Field.PUBLISHER, "IEEE"); |
|---|
| 210 | |
|---|
| 211 | additional = result.add(Type.MASTERSTHESIS); |
|---|
| 212 | additional.setValue(Field.AUTHOR, "Martin Guetlein"); |
|---|
| 213 | additional.setValue(Field.YEAR, "2006"); |
|---|
| 214 | additional.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers"); |
|---|
| 215 | additional.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitaet"); |
|---|
| 216 | additional.setValue(Field.ADDRESS, "Freiburg, Germany"); |
|---|
| 217 | |
|---|
| 218 | return result; |
|---|
| 219 | } |
|---|
| 220 | |
|---|
| 221 | /** |
|---|
| 222 | * Returns an enumeration describing the available options. |
|---|
| 223 | * |
|---|
| 224 | * @return an enumeration of all the available options. |
|---|
| 225 | * |
|---|
| 226 | */ |
|---|
| 227 | public Enumeration listOptions() { |
|---|
| 228 | Vector newVector = new Vector(9); |
|---|
| 229 | |
|---|
| 230 | newVector.addElement(new Option("\tPerform initial ranking to select the" + |
|---|
| 231 | "\n\ttop-ranked attributes.", "I", 0, "-I")); |
|---|
| 232 | newVector.addElement(new Option( |
|---|
| 233 | "\tNumber of top-ranked attributes that are " + |
|---|
| 234 | "\n\ttaken into account by the search.", "K", 1, "-K <num>")); |
|---|
| 235 | newVector.addElement(new Option( |
|---|
| 236 | "\tType of Linear Forward Selection (default = 0).", "T", 1, |
|---|
| 237 | "-T <0 = fixed-set | 1 = fixed-width>")); |
|---|
| 238 | newVector.addElement(new Option( |
|---|
| 239 | "\tSize of lookup cache for evaluated subsets." + |
|---|
| 240 | "\n\tExpressed as a multiple of the number of" + |
|---|
| 241 | "\n\tattributes in the data set. (default = 1)", "S", 1, "-S <num>")); |
|---|
| 242 | newVector.addElement(new Option( |
|---|
| 243 | "\tSubset-evaluator used for subset-size determination." + "-- -M", |
|---|
| 244 | "E", 1, "-E <subset evaluator>")); |
|---|
| 245 | newVector.addElement(new Option("\tNumber of cross validation folds" + |
|---|
| 246 | "\n\tfor subset size determination (default = 5).", "F", 1, "-F <num>")); |
|---|
| 247 | newVector.addElement(new Option("\tSeed for cross validation" + |
|---|
| 248 | "\n\tsubset size determination. (default = 1)", "R", 1, "-R <num>")); |
|---|
| 249 | newVector.addElement(new Option("\tverbose on/off", "Z", 0, "-Z")); |
|---|
| 250 | |
|---|
| 251 | if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) { |
|---|
| 252 | newVector.addElement(new Option("", "", 0, |
|---|
| 253 | "\nOptions specific to " + "evaluator " + |
|---|
| 254 | m_setSizeEval.getClass().getName() + ":")); |
|---|
| 255 | |
|---|
| 256 | Enumeration enu = ((OptionHandler) m_setSizeEval).listOptions(); |
|---|
| 257 | |
|---|
| 258 | while (enu.hasMoreElements()) { |
|---|
| 259 | newVector.addElement(enu.nextElement()); |
|---|
| 260 | } |
|---|
| 261 | } |
|---|
| 262 | |
|---|
| 263 | return newVector.elements(); |
|---|
| 264 | } |
|---|
| 265 | |
|---|
| 266 | /** |
|---|
| 267 | * Parses a given list of options. |
|---|
| 268 | * |
|---|
| 269 | * Valid options are: |
|---|
| 270 | * <p> |
|---|
| 271 | * |
|---|
| 272 | * -I <br> |
|---|
| 273 | * Perform initial ranking to select top-ranked attributes. |
|---|
| 274 | * <p> |
|---|
| 275 | * |
|---|
| 276 | * -K <num> <br> |
|---|
| 277 | * Number of top-ranked attributes that are taken into account. |
|---|
| 278 | * <p> |
|---|
| 279 | * |
|---|
| 280 | * -T <0 = fixed-set | 1 = fixed-width> <br> |
|---|
| 281 | * Typ of Linear Forward Selection (default = 0). |
|---|
| 282 | * <p> |
|---|
| 283 | * |
|---|
| 284 | * -S <num> <br> |
|---|
| 285 | * Size of lookup cache for evaluated subsets. Expressed as a multiple of |
|---|
| 286 | * the number of attributes in the data set. (default = 1). |
|---|
| 287 | * <p> |
|---|
| 288 | * |
|---|
| 289 | * -E <string> <br> |
|---|
| 290 | * class name of subset evaluator to use for subset size determination |
|---|
| 291 | * (default = null, same subset evaluator as for ranking and final forward |
|---|
| 292 | * selection is used). Place any evaluator options LAST on the command line |
|---|
| 293 | * following a "--". eg. -A weka.attributeSelection.ClassifierSubsetEval ... -- |
|---|
| 294 | * -M |
|---|
| 295 | * |
|---|
| 296 | * </pre> |
|---|
| 297 | * |
|---|
| 298 | * -F <num> <br> |
|---|
| 299 | * Number of cross validation folds for subset size determination (default = |
|---|
| 300 | * 5). |
|---|
| 301 | * <p> |
|---|
| 302 | * |
|---|
| 303 | * -R <num> <br> |
|---|
| 304 | * Seed for cross validation subset size determination. (default = 1) |
|---|
| 305 | * <p> |
|---|
| 306 | * |
|---|
| 307 | * -Z <br> |
|---|
| 308 | * verbose on/off. |
|---|
| 309 | * <p> |
|---|
| 310 | * |
|---|
| 311 | * @param options |
|---|
| 312 | * the list of options as an array of strings |
|---|
| 313 | * @exception Exception |
|---|
| 314 | * if an option is not supported |
|---|
| 315 | * |
|---|
| 316 | */ |
|---|
| 317 | public void setOptions(String[] options) throws Exception { |
|---|
| 318 | String optionString; |
|---|
| 319 | resetOptions(); |
|---|
| 320 | |
|---|
| 321 | setPerformRanking(Utils.getFlag('I', options)); |
|---|
| 322 | |
|---|
| 323 | optionString = Utils.getOption('K', options); |
|---|
| 324 | |
|---|
| 325 | if (optionString.length() != 0) { |
|---|
| 326 | setNumUsedAttributes(Integer.parseInt(optionString)); |
|---|
| 327 | } |
|---|
| 328 | |
|---|
| 329 | optionString = Utils.getOption('T', options); |
|---|
| 330 | |
|---|
| 331 | if (optionString.length() != 0) { |
|---|
| 332 | setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE)); |
|---|
| 333 | } else { |
|---|
| 334 | setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE)); |
|---|
| 335 | } |
|---|
| 336 | |
|---|
| 337 | optionString = Utils.getOption('S', options); |
|---|
| 338 | |
|---|
| 339 | if (optionString.length() != 0) { |
|---|
| 340 | setLookupCacheSize(Integer.parseInt(optionString)); |
|---|
| 341 | } |
|---|
| 342 | |
|---|
| 343 | optionString = Utils.getOption('E', options); |
|---|
| 344 | |
|---|
| 345 | if (optionString.length() == 0) { |
|---|
| 346 | System.out.println( |
|---|
| 347 | "No subset size evaluator given, using evaluator that is used for final search."); |
|---|
| 348 | m_setSizeEval = null; |
|---|
| 349 | } else { |
|---|
| 350 | setSubsetSizeEvaluator(ASEvaluation.forName(optionString, |
|---|
| 351 | Utils.partitionOptions(options))); |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | optionString = Utils.getOption('F', options); |
|---|
| 355 | |
|---|
| 356 | if (optionString.length() != 0) { |
|---|
| 357 | setNumSubsetSizeCVFolds(Integer.parseInt(optionString)); |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | optionString = Utils.getOption('R', options); |
|---|
| 361 | |
|---|
| 362 | if (optionString.length() != 0) { |
|---|
| 363 | setSeed(Integer.parseInt(optionString)); |
|---|
| 364 | } |
|---|
| 365 | |
|---|
| 366 | m_verbose = Utils.getFlag('Z', options); |
|---|
| 367 | } |
|---|
| 368 | |
|---|
| 369 | /** |
|---|
| 370 | * Set the maximum size of the evaluated subset cache (hashtable). This is |
|---|
| 371 | * expressed as a multiplier for the number of attributes in the data set. |
|---|
| 372 | * (default = 1). |
|---|
| 373 | * |
|---|
| 374 | * @param size |
|---|
| 375 | * the maximum size of the hashtable |
|---|
| 376 | */ |
|---|
| 377 | public void setLookupCacheSize(int size) { |
|---|
| 378 | if (size >= 0) { |
|---|
| 379 | m_cacheSize = size; |
|---|
| 380 | } |
|---|
| 381 | } |
|---|
| 382 | |
|---|
| 383 | /** |
|---|
| 384 | * Return the maximum size of the evaluated subset cache (expressed as a |
|---|
| 385 | * multiplier for the number of attributes in a data set. |
|---|
| 386 | * |
|---|
| 387 | * @return the maximum size of the hashtable. |
|---|
| 388 | */ |
|---|
| 389 | public int getLookupCacheSize() { |
|---|
| 390 | return m_cacheSize; |
|---|
| 391 | } |
|---|
| 392 | |
|---|
| 393 | /** |
|---|
| 394 | * Returns the tip text for this property |
|---|
| 395 | * |
|---|
| 396 | * @return tip text for this property suitable for displaying in the |
|---|
| 397 | * explorer/experimenter gui |
|---|
| 398 | */ |
|---|
| 399 | public String lookupCacheSizeTipText() { |
|---|
| 400 | return "Set the maximum size of the lookup cache of evaluated subsets. This is " + |
|---|
| 401 | "expressed as a multiplier of the number of attributes in the data set. " + |
|---|
| 402 | "(default = 1)."; |
|---|
| 403 | } |
|---|
| 404 | |
|---|
| 405 | /** |
|---|
| 406 | * Returns the tip text for this property |
|---|
| 407 | * |
|---|
| 408 | * @return tip text for this property suitable for displaying in the |
|---|
| 409 | * explorer/experimenter gui |
|---|
| 410 | */ |
|---|
| 411 | public String performRankingTipText() { |
|---|
| 412 | return "Perform initial ranking to select top-ranked attributes."; |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | /** |
|---|
| 416 | * Perform initial ranking to select top-ranked attributes. |
|---|
| 417 | * |
|---|
| 418 | * @param b |
|---|
| 419 | * true if initial ranking should be performed |
|---|
| 420 | */ |
|---|
| 421 | public void setPerformRanking(boolean b) { |
|---|
| 422 | m_performRanking = b; |
|---|
| 423 | } |
|---|
| 424 | |
|---|
| 425 | /** |
|---|
| 426 | * Get boolean if initial ranking should be performed to select the |
|---|
| 427 | * top-ranked attributes |
|---|
| 428 | * |
|---|
| 429 | * @return true if initial ranking should be performed |
|---|
| 430 | */ |
|---|
| 431 | public boolean getPerformRanking() { |
|---|
| 432 | return m_performRanking; |
|---|
| 433 | } |
|---|
| 434 | |
|---|
| 435 | /** |
|---|
| 436 | * Returns the tip text for this property |
|---|
| 437 | * |
|---|
| 438 | * @return tip text for this property suitable for displaying in the |
|---|
| 439 | * explorer/experimenter gui |
|---|
| 440 | */ |
|---|
| 441 | public String numUsedAttributesTipText() { |
|---|
| 442 | return "Set the amount of top-ranked attributes that are taken into account by the search process."; |
|---|
| 443 | } |
|---|
| 444 | |
|---|
| 445 | /** |
|---|
| 446 | * Set the number of top-ranked attributes that taken into account by the |
|---|
| 447 | * search process. |
|---|
| 448 | * |
|---|
| 449 | * @param k |
|---|
| 450 | * the number of attributes |
|---|
| 451 | * @exception Exception |
|---|
| 452 | * if k is less than 2 |
|---|
| 453 | */ |
|---|
| 454 | public void setNumUsedAttributes(int k) throws Exception { |
|---|
| 455 | if (k < 2) { |
|---|
| 456 | throw new Exception("Value of -K must be >= 2."); |
|---|
| 457 | } |
|---|
| 458 | |
|---|
| 459 | m_numUsedAttributes = k; |
|---|
| 460 | } |
|---|
| 461 | |
|---|
| 462 | /** |
|---|
| 463 | * Get the number of top-ranked attributes that taken into account by the |
|---|
| 464 | * search process. |
|---|
| 465 | * |
|---|
| 466 | * @return the number of top-ranked attributes that taken into account |
|---|
| 467 | */ |
|---|
| 468 | public int getNumUsedAttributes() { |
|---|
| 469 | return m_numUsedAttributes; |
|---|
| 470 | } |
|---|
| 471 | |
|---|
| 472 | /** |
|---|
| 473 | * Returns the tip text for this property |
|---|
| 474 | * |
|---|
| 475 | * @return tip text for this property suitable for displaying in the |
|---|
| 476 | * explorer/experimenter gui |
|---|
| 477 | */ |
|---|
| 478 | public String typeTipText() { |
|---|
| 479 | return "Set the type of the search."; |
|---|
| 480 | } |
|---|
| 481 | |
|---|
| 482 | /** |
|---|
| 483 | * Set the type |
|---|
| 484 | * |
|---|
| 485 | * @param t |
|---|
| 486 | * the Linear Forward Selection type |
|---|
| 487 | */ |
|---|
| 488 | public void setType(SelectedTag t) { |
|---|
| 489 | if (t.getTags() == TAGS_TYPE) { |
|---|
| 490 | m_linearSelectionType = t.getSelectedTag().getID(); |
|---|
| 491 | } |
|---|
| 492 | } |
|---|
| 493 | |
|---|
| 494 | /** |
|---|
| 495 | * Get the type |
|---|
| 496 | * |
|---|
| 497 | * @return the Linear Forward Selection type |
|---|
| 498 | */ |
|---|
| 499 | public SelectedTag getType() { |
|---|
| 500 | return new SelectedTag(m_linearSelectionType, TAGS_TYPE); |
|---|
| 501 | } |
|---|
| 502 | |
|---|
| 503 | /** |
|---|
| 504 | * Returns the tip text for this property |
|---|
| 505 | * |
|---|
| 506 | * @return tip text for this property suitable for displaying in the |
|---|
| 507 | * explorer/experimenter gui |
|---|
| 508 | */ |
|---|
| 509 | public String subsetSizeEvaluatorTipText() { |
|---|
| 510 | return "Subset evaluator to use for subset size determination."; |
|---|
| 511 | } |
|---|
| 512 | |
|---|
| 513 | /** |
|---|
| 514 | * Set the subset evaluator to use for subset size determination. |
|---|
| 515 | * |
|---|
| 516 | * @param eval |
|---|
| 517 | * the subset evaluator to use for subset size determination. |
|---|
| 518 | */ |
|---|
| 519 | public void setSubsetSizeEvaluator(ASEvaluation eval) |
|---|
| 520 | throws Exception { |
|---|
| 521 | if (!(eval instanceof SubsetEvaluator)) { |
|---|
| 522 | throw new Exception(eval.getClass().getName() + |
|---|
| 523 | " is no subset evaluator."); |
|---|
| 524 | } |
|---|
| 525 | |
|---|
| 526 | m_setSizeEval = eval; |
|---|
| 527 | } |
|---|
| 528 | |
|---|
| 529 | /** |
|---|
| 530 | * Get the subset evaluator used for subset size determination. |
|---|
| 531 | * |
|---|
| 532 | * @return the evaluator used for subset size determination. |
|---|
| 533 | */ |
|---|
| 534 | public ASEvaluation getSubsetSizeEvaluator() { |
|---|
| 535 | return m_setSizeEval; |
|---|
| 536 | } |
|---|
| 537 | |
|---|
| 538 | /** |
|---|
| 539 | * Returns the tip text for this property |
|---|
| 540 | * |
|---|
| 541 | * @return tip text for this property suitable for displaying in the |
|---|
| 542 | * explorer/experimenter gui |
|---|
| 543 | */ |
|---|
| 544 | public String numSubsetSizeCVFoldsTipText() { |
|---|
| 545 | return "Number of cross validation folds for subset size determination"; |
|---|
| 546 | } |
|---|
| 547 | |
|---|
| 548 | /** |
|---|
| 549 | * Set the number of cross validation folds for subset size determination |
|---|
| 550 | * (default = 5). |
|---|
| 551 | * |
|---|
| 552 | * @param f |
|---|
| 553 | * number of folds |
|---|
| 554 | */ |
|---|
| 555 | public void setNumSubsetSizeCVFolds(int f) { |
|---|
| 556 | m_numFolds = f; |
|---|
| 557 | } |
|---|
| 558 | |
|---|
| 559 | /** |
|---|
| 560 | * Get the number of cross validation folds for subset size determination |
|---|
| 561 | * (default = 5). |
|---|
| 562 | * |
|---|
| 563 | * @return number of folds |
|---|
| 564 | */ |
|---|
| 565 | public int getNumSubsetSizeCVFolds() { |
|---|
| 566 | return m_numFolds; |
|---|
| 567 | } |
|---|
| 568 | |
|---|
| 569 | /** |
|---|
| 570 | * Returns the tip text for this property |
|---|
| 571 | * |
|---|
| 572 | * @return tip text for this property suitable for displaying in the |
|---|
| 573 | * explorer/experimenter gui |
|---|
| 574 | */ |
|---|
| 575 | public String seedTipText() { |
|---|
| 576 | return "Seed for cross validation subset size determination. (default = 1)"; |
|---|
| 577 | } |
|---|
| 578 | |
|---|
| 579 | /** |
|---|
| 580 | * Seed for cross validation subset size determination. (default = 1) |
|---|
| 581 | * |
|---|
| 582 | * @param s |
|---|
| 583 | * seed |
|---|
| 584 | */ |
|---|
| 585 | public void setSeed(int s) { |
|---|
| 586 | m_seed = s; |
|---|
| 587 | } |
|---|
| 588 | |
|---|
| 589 | /** |
|---|
| 590 | * Seed for cross validation subset size determination. (default = 1) |
|---|
| 591 | * |
|---|
| 592 | * @return seed |
|---|
| 593 | */ |
|---|
| 594 | public int getSeed() { |
|---|
| 595 | return m_seed; |
|---|
| 596 | } |
|---|
| 597 | |
|---|
| 598 | /** |
|---|
| 599 | * Returns the tip text for this property |
|---|
| 600 | * |
|---|
| 601 | * @return tip text for this property suitable for displaying in the |
|---|
| 602 | * explorer/experimenter gui |
|---|
| 603 | */ |
|---|
| 604 | public String verboseTipText() { |
|---|
| 605 | return "Turn on verbose output for monitoring the search's progress."; |
|---|
| 606 | } |
|---|
| 607 | |
|---|
| 608 | /** |
|---|
| 609 | * Set whether verbose output should be generated. |
|---|
| 610 | * |
|---|
| 611 | * @param b |
|---|
| 612 | * true if output is to be verbose. |
|---|
| 613 | */ |
|---|
| 614 | public void setVerbose(boolean b) { |
|---|
| 615 | m_verbose = b; |
|---|
| 616 | } |
|---|
| 617 | |
|---|
| 618 | /** |
|---|
| 619 | * Get whether output is to be verbose |
|---|
| 620 | * |
|---|
| 621 | * @return true if output will be verbose |
|---|
| 622 | */ |
|---|
| 623 | public boolean getVerbose() { |
|---|
| 624 | return m_verbose; |
|---|
| 625 | } |
|---|
| 626 | |
|---|
| 627 | /** |
|---|
| 628 | * Gets the current settings of LinearForwardSelection. |
|---|
| 629 | * |
|---|
| 630 | * @return an array of strings suitable for passing to setOptions() |
|---|
| 631 | */ |
|---|
| 632 | public String[] getOptions() { |
|---|
| 633 | String[] evaluatorOptions = new String[0]; |
|---|
| 634 | |
|---|
| 635 | if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) { |
|---|
| 636 | evaluatorOptions = ((OptionHandler) m_setSizeEval).getOptions(); |
|---|
| 637 | } |
|---|
| 638 | |
|---|
| 639 | String[] options = new String[15 + evaluatorOptions.length]; |
|---|
| 640 | int current = 0; |
|---|
| 641 | |
|---|
| 642 | if (m_performRanking) { |
|---|
| 643 | options[current++] = "-I"; |
|---|
| 644 | } |
|---|
| 645 | |
|---|
| 646 | options[current++] = "-K"; |
|---|
| 647 | options[current++] = "" + m_numUsedAttributes; |
|---|
| 648 | options[current++] = "-T"; |
|---|
| 649 | options[current++] = "" + m_linearSelectionType; |
|---|
| 650 | |
|---|
| 651 | options[current++] = "-F"; |
|---|
| 652 | options[current++] = "" + m_numFolds; |
|---|
| 653 | options[current++] = "-S"; |
|---|
| 654 | options[current++] = "" + m_seed; |
|---|
| 655 | |
|---|
| 656 | options[current++] = "-Z"; |
|---|
| 657 | options[current++] = "" + m_verbose; |
|---|
| 658 | |
|---|
| 659 | if (m_setSizeEval != null) { |
|---|
| 660 | options[current++] = "-E"; |
|---|
| 661 | options[current++] = m_setSizeEval.getClass().getName(); |
|---|
| 662 | } |
|---|
| 663 | |
|---|
| 664 | options[current++] = "--"; |
|---|
| 665 | System.arraycopy(evaluatorOptions, 0, options, current, |
|---|
| 666 | evaluatorOptions.length); |
|---|
| 667 | current += evaluatorOptions.length; |
|---|
| 668 | |
|---|
| 669 | while (current < options.length) { |
|---|
| 670 | options[current++] = ""; |
|---|
| 671 | } |
|---|
| 672 | |
|---|
| 673 | return options; |
|---|
| 674 | } |
|---|
| 675 | |
|---|
| 676 | /** |
|---|
| 677 | * returns a description of the search as a String |
|---|
| 678 | * |
|---|
| 679 | * @return a description of the search |
|---|
| 680 | */ |
|---|
| 681 | public String toString() { |
|---|
| 682 | StringBuffer LFSString = new StringBuffer(); |
|---|
| 683 | |
|---|
| 684 | LFSString.append("\tSubset Size Forward Selection.\n"); |
|---|
| 685 | |
|---|
| 686 | LFSString.append("\tLinear Forward Selection Type: "); |
|---|
| 687 | |
|---|
| 688 | if (m_linearSelectionType == TYPE_FIXED_SET) { |
|---|
| 689 | LFSString.append("fixed-set\n"); |
|---|
| 690 | } else { |
|---|
| 691 | LFSString.append("fixed-width\n"); |
|---|
| 692 | } |
|---|
| 693 | |
|---|
| 694 | LFSString.append("\tNumber of top-ranked attributes that are used: " + |
|---|
| 695 | m_numUsedAttributes + "\n"); |
|---|
| 696 | |
|---|
| 697 | LFSString.append( |
|---|
| 698 | "\tNumber of cross validation folds for subset size determination: " + |
|---|
| 699 | m_numFolds + "\n"); |
|---|
| 700 | LFSString.append("\tSeed for cross validation subset size determination: " + |
|---|
| 701 | m_seed + "\n"); |
|---|
| 702 | |
|---|
| 703 | LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals + |
|---|
| 704 | "\n"); |
|---|
| 705 | LFSString.append("\tMerit of best subset found: " + |
|---|
| 706 | Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n"); |
|---|
| 707 | |
|---|
| 708 | return LFSString.toString(); |
|---|
| 709 | } |
|---|
| 710 | |
|---|
| 711 | /** |
|---|
| 712 | * Searches the attribute subset space by subset size forward selection |
|---|
| 713 | * |
|---|
| 714 | * @param ASEval |
|---|
| 715 | * the attribute evaluator to guide the search |
|---|
| 716 | * @param data |
|---|
| 717 | * the training instances. |
|---|
| 718 | * @return an array (not necessarily ordered) of selected attribute indexes |
|---|
| 719 | * @exception Exception |
|---|
| 720 | * if the search can't be completed |
|---|
| 721 | */ |
|---|
| 722 | public int[] search(ASEvaluation ASEval, Instances data) |
|---|
| 723 | throws Exception { |
|---|
| 724 | m_totalEvals = 0; |
|---|
| 725 | |
|---|
| 726 | if (!(ASEval instanceof SubsetEvaluator)) { |
|---|
| 727 | throw new Exception(ASEval.getClass().getName() + " is not a " + |
|---|
| 728 | "Subset evaluator!"); |
|---|
| 729 | } |
|---|
| 730 | |
|---|
| 731 | if (m_setSizeEval == null) { |
|---|
| 732 | m_setSizeEval = ASEval; |
|---|
| 733 | } |
|---|
| 734 | |
|---|
| 735 | m_numAttribs = data.numAttributes(); |
|---|
| 736 | |
|---|
| 737 | if (m_numUsedAttributes > m_numAttribs) { |
|---|
| 738 | System.out.println( |
|---|
| 739 | "Decreasing number of top-ranked attributes to total number of attributes: " + |
|---|
| 740 | data.numAttributes()); |
|---|
| 741 | m_numUsedAttributes = m_numAttribs; |
|---|
| 742 | } |
|---|
| 743 | |
|---|
| 744 | Instances[] trainData = new Instances[m_numFolds]; |
|---|
| 745 | Instances[] testData = new Instances[m_numFolds]; |
|---|
| 746 | LFSMethods[] searchResults = new LFSMethods[m_numFolds]; |
|---|
| 747 | |
|---|
| 748 | Random random = new Random(m_seed); |
|---|
| 749 | Instances dataCopy = new Instances(data); |
|---|
| 750 | dataCopy.randomize(random); |
|---|
| 751 | |
|---|
| 752 | if (dataCopy.classAttribute().isNominal()) { |
|---|
| 753 | dataCopy.stratify(m_numFolds); |
|---|
| 754 | } |
|---|
| 755 | |
|---|
| 756 | for (int f = 0; f < m_numFolds; f++) { |
|---|
| 757 | trainData[f] = dataCopy.trainCV(m_numFolds, f, random); |
|---|
| 758 | testData[f] = dataCopy.testCV(m_numFolds, f); |
|---|
| 759 | } |
|---|
| 760 | |
|---|
| 761 | LFSMethods LSF = new LFSMethods(); |
|---|
| 762 | |
|---|
| 763 | int[] ranking; |
|---|
| 764 | |
|---|
| 765 | if (m_performRanking) { |
|---|
| 766 | ASEval.buildEvaluator(data); |
|---|
| 767 | ranking = LSF.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose); |
|---|
| 768 | } else { |
|---|
| 769 | ranking = new int[m_numAttribs]; |
|---|
| 770 | |
|---|
| 771 | for (int i = 0; i < ranking.length; i++) { |
|---|
| 772 | ranking[i] = i; |
|---|
| 773 | } |
|---|
| 774 | } |
|---|
| 775 | |
|---|
| 776 | int maxSubsetSize = 0; |
|---|
| 777 | |
|---|
| 778 | for (int f = 0; f < m_numFolds; f++) { |
|---|
| 779 | if (m_verbose) { |
|---|
| 780 | System.out.println("perform search on internal fold: " + (f + 1) + "/" + |
|---|
| 781 | m_numFolds); |
|---|
| 782 | } |
|---|
| 783 | |
|---|
| 784 | m_setSizeEval.buildEvaluator(trainData[f]); |
|---|
| 785 | searchResults[f] = new LFSMethods(); |
|---|
| 786 | searchResults[f].forwardSearch(m_cacheSize, new BitSet(m_numAttribs), |
|---|
| 787 | ranking, m_numUsedAttributes, |
|---|
| 788 | m_linearSelectionType == TYPE_FIXED_WIDTH, 1, -1, trainData[f], |
|---|
| 789 | (SubsetEvaluator)m_setSizeEval, m_verbose); |
|---|
| 790 | |
|---|
| 791 | maxSubsetSize = Math.max(maxSubsetSize, |
|---|
| 792 | searchResults[f].getBestGroup().cardinality()); |
|---|
| 793 | } |
|---|
| 794 | |
|---|
| 795 | if (m_verbose) { |
|---|
| 796 | System.out.println( |
|---|
| 797 | "continue searches on internal folds to maxSubsetSize (" + |
|---|
| 798 | maxSubsetSize + ")"); |
|---|
| 799 | } |
|---|
| 800 | |
|---|
| 801 | for (int f = 0; f < m_numFolds; f++) { |
|---|
| 802 | if (m_verbose) { |
|---|
| 803 | System.out.print("perform search on internal fold: " + (f + 1) + "/" + |
|---|
| 804 | m_numFolds + " with starting set "); |
|---|
| 805 | LFSMethods.printGroup(searchResults[f].getBestGroup(), |
|---|
| 806 | trainData[f].numAttributes()); |
|---|
| 807 | } |
|---|
| 808 | |
|---|
| 809 | if (searchResults[f].getBestGroup().cardinality() < maxSubsetSize) { |
|---|
| 810 | m_setSizeEval.buildEvaluator(trainData[f]); |
|---|
| 811 | searchResults[f].forwardSearch(m_cacheSize, |
|---|
| 812 | searchResults[f].getBestGroup(), ranking, m_numUsedAttributes, |
|---|
| 813 | m_linearSelectionType == TYPE_FIXED_WIDTH, 1, maxSubsetSize, |
|---|
| 814 | trainData[f], (SubsetEvaluator)m_setSizeEval, m_verbose); |
|---|
| 815 | } |
|---|
| 816 | } |
|---|
| 817 | |
|---|
| 818 | double[][] testMerit = new double[m_numFolds][maxSubsetSize + 1]; |
|---|
| 819 | |
|---|
| 820 | for (int f = 0; f < m_numFolds; f++) { |
|---|
| 821 | for (int s = 1; s <= maxSubsetSize; s++) { |
|---|
| 822 | if (HoldOutSubsetEvaluator.class.isInstance(m_setSizeEval)) { |
|---|
| 823 | m_setSizeEval.buildEvaluator(trainData[f]); |
|---|
| 824 | testMerit[f][s] = ((HoldOutSubsetEvaluator) m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize( |
|---|
| 825 | s), testData[f]); |
|---|
| 826 | } else { |
|---|
| 827 | m_setSizeEval.buildEvaluator(testData[f]); |
|---|
| 828 | testMerit[f][s] = ((SubsetEvaluator)m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize( |
|---|
| 829 | s)); |
|---|
| 830 | } |
|---|
| 831 | } |
|---|
| 832 | } |
|---|
| 833 | |
|---|
| 834 | double[] avgTestMerit = new double[maxSubsetSize + 1]; |
|---|
| 835 | int finalSubsetSize = -1; |
|---|
| 836 | |
|---|
| 837 | for (int s = 1; s <= maxSubsetSize; s++) { |
|---|
| 838 | for (int f = 0; f < m_numFolds; f++) { |
|---|
| 839 | avgTestMerit[s] = ((avgTestMerit[s] * f) + testMerit[f][s]) / (double) (f + |
|---|
| 840 | 1); |
|---|
| 841 | } |
|---|
| 842 | |
|---|
| 843 | if ((finalSubsetSize == -1) || |
|---|
| 844 | (avgTestMerit[s] > avgTestMerit[finalSubsetSize])) { |
|---|
| 845 | finalSubsetSize = s; |
|---|
| 846 | } |
|---|
| 847 | |
|---|
| 848 | if (m_verbose) { |
|---|
| 849 | System.out.println("average merit for subset-size " + s + ": " + |
|---|
| 850 | avgTestMerit[s]); |
|---|
| 851 | } |
|---|
| 852 | } |
|---|
| 853 | |
|---|
| 854 | if (m_verbose) { |
|---|
| 855 | System.out.println("performing final forward selection to subset-size: " + |
|---|
| 856 | finalSubsetSize); |
|---|
| 857 | } |
|---|
| 858 | |
|---|
| 859 | ASEval.buildEvaluator(data); |
|---|
| 860 | LSF.forwardSearch(m_cacheSize, new BitSet(m_numAttribs), ranking, |
|---|
| 861 | m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, 1, |
|---|
| 862 | finalSubsetSize, data, (SubsetEvaluator) ASEval, m_verbose); |
|---|
| 863 | |
|---|
| 864 | m_totalEvals = LSF.getNumEvalsTotal(); |
|---|
| 865 | m_bestMerit = LSF.getBestMerit(); |
|---|
| 866 | |
|---|
| 867 | return attributeList(LSF.getBestGroup()); |
|---|
| 868 | } |
|---|
| 869 | |
|---|
| 870 | /** |
|---|
| 871 | * Reset options to default values |
|---|
| 872 | */ |
|---|
| 873 | protected void resetOptions() { |
|---|
| 874 | m_performRanking = true; |
|---|
| 875 | m_numUsedAttributes = 50; |
|---|
| 876 | m_linearSelectionType = TYPE_FIXED_SET; |
|---|
| 877 | m_setSizeEval = new ClassifierSubsetEval(); |
|---|
| 878 | m_numFolds = 5; |
|---|
| 879 | m_seed = 1; |
|---|
| 880 | m_totalEvals = 0; |
|---|
| 881 | m_cacheSize = 1; |
|---|
| 882 | m_verbose = false; |
|---|
| 883 | } |
|---|
| 884 | |
|---|
| 885 | /** |
|---|
| 886 | * converts a BitSet into a list of attribute indexes |
|---|
| 887 | * |
|---|
| 888 | * @param group |
|---|
| 889 | * the BitSet to convert |
|---|
| 890 | * @return an array of attribute indexes |
|---|
| 891 | */ |
|---|
| 892 | protected int[] attributeList(BitSet group) { |
|---|
| 893 | int count = 0; |
|---|
| 894 | |
|---|
| 895 | // count how many were selected |
|---|
| 896 | for (int i = 0; i < m_numAttribs; i++) { |
|---|
| 897 | if (group.get(i)) { |
|---|
| 898 | count++; |
|---|
| 899 | } |
|---|
| 900 | } |
|---|
| 901 | |
|---|
| 902 | int[] list = new int[count]; |
|---|
| 903 | count = 0; |
|---|
| 904 | |
|---|
| 905 | for (int i = 0; i < m_numAttribs; i++) { |
|---|
| 906 | if (group.get(i)) { |
|---|
| 907 | list[count++] = i; |
|---|
| 908 | } |
|---|
| 909 | } |
|---|
| 910 | |
|---|
| 911 | return list; |
|---|
| 912 | } |
|---|
| 913 | |
|---|
| 914 | /** |
|---|
| 915 | * Returns the revision string. |
|---|
| 916 | * |
|---|
| 917 | * @return the revision |
|---|
| 918 | */ |
|---|
| 919 | public String getRevision() { |
|---|
| 920 | return RevisionUtils.extract("$Revision: 5604 $"); |
|---|
| 921 | } |
|---|
| 922 | } |
|---|
| 923 | |
|---|