source: src/main/java/weka/attributeSelection/SubsetSizeForwardSelection.java @ 14

Last change on this file since 14 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 27.6 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    SubsetSizeForwardSelection.java
19 *    Copyright (C) 2007 Martin Guetlein
20 *
21 */
22package weka.attributeSelection;
23
24import weka.core.Instances;
25import weka.core.Option;
26import weka.core.OptionHandler;
27import weka.core.RevisionUtils;
28import weka.core.SelectedTag;
29import weka.core.Tag;
30import weka.core.TechnicalInformation;
31import weka.core.Utils;
32import weka.core.TechnicalInformation.Field;
33import weka.core.TechnicalInformation.Type;
34
35import java.util.BitSet;
36import java.util.Enumeration;
37import java.util.Random;
38import java.util.Vector;
39
40
41/**
42 <!-- globalinfo-start -->
43 * SubsetSizeForwardSelection:<br/>
44 * <br/>
45 * Extension of LinearForwardSelection. The search performs an interior cross-validation (seed and number of folds can be specified). A LinearForwardSelection is performed on each foldto determine the optimal subset-size (using the given SubsetSizeEvaluator). Finally, a LinearForwardSelection up to the optimal subset-size is performed on the whole data.<br/>
46 * <br/>
47 * For more information see:<br/>
48 * <br/>
49 * Martin Guetlein (2006). Large Scale Attribute Selection Using Wrappers. Freiburg, Germany.
50 * <p/>
51 <!-- globalinfo-end -->
52 *
53 <!-- options-start -->
54 * Valid options are: <p/>
55 *
56 * <pre> -I
57 *  Perform initial ranking to select the
58 *  top-ranked attributes.</pre>
59 *
60 * <pre> -K &lt;num&gt;
61 *  Number of top-ranked attributes that are
62 *  taken into account by the search.</pre>
63 *
64 * <pre> -T &lt;0 = fixed-set | 1 = fixed-width&gt;
65 *  Type of Linear Forward Selection (default = 0).</pre>
66 *
67 * <pre> -S &lt;num&gt;
68 *  Size of lookup cache for evaluated subsets.
69 *  Expressed as a multiple of the number of
70 *  attributes in the data set. (default = 1)</pre>
71 *
72 * <pre> -E &lt;subset evaluator&gt;
73 *  Subset-evaluator used for subset-size determination.-- -M</pre>
74 *
75 * <pre> -F &lt;num&gt;
76 *  Number of cross validation folds
77 *  for subset size determination (default = 5).</pre>
78 *
79 * <pre> -R &lt;num&gt;
80 *  Seed for cross validation
81 *  subset size determination. (default = 1)</pre>
82 *
83 * <pre> -Z
84 *  verbose on/off</pre>
85 *
86 * <pre>
87 * Options specific to evaluator weka.attributeSelection.ClassifierSubsetEval:
88 * </pre>
89 *
90 * <pre> -B &lt;classifier&gt;
91 *  class name of the classifier to use for accuracy estimation.
92 *  Place any classifier options LAST on the command line
93 *  following a "--". eg.:
94 *   -B weka.classifiers.bayes.NaiveBayes ... -- -K
95 *  (default: weka.classifiers.rules.ZeroR)</pre>
96 *
97 * <pre> -T
98 *  Use the training data to estimate accuracy.</pre>
99 *
100 * <pre> -H &lt;filename&gt;
101 *  Name of the hold out/test set to
102 *  estimate accuracy on.</pre>
103 *
104 * <pre>
105 * Options specific to scheme weka.classifiers.rules.ZeroR:
106 * </pre>
107 *
108 * <pre> -D
109 *  If set, classifier is run in debug mode and
110 *  may output additional info to the console</pre>
111 *
112 <!-- options-end -->
113 *
114 * @author Martin Guetlein (martin.guetlein@gmail.com)
115 * @version $Revision: 5604 $
116 */
117public class SubsetSizeForwardSelection extends ASSearch
118  implements OptionHandler {
119  /** search directions */
120  protected static final int TYPE_FIXED_SET = 0;
121  protected static final int TYPE_FIXED_WIDTH = 1;
122  public static final Tag[] TAGS_TYPE = {
123    new Tag(TYPE_FIXED_SET, "Fixed-set"),
124    new Tag(TYPE_FIXED_WIDTH, "Fixed-width"),
125  };
126
127  // member variables
128  /** perform initial ranking to select top-ranked attributes */
129  protected boolean m_performRanking;
130
131  /**
132   * number of top-ranked attributes that are taken into account for the
133   * search
134   */
135  protected int m_numUsedAttributes;
136
137  /** 0 == fixed-set, 1 == fixed-width */
138  protected int m_linearSelectionType;
139
140  /** the subset evaluator to use for subset size determination */
141  private ASEvaluation m_setSizeEval;
142
143  /**
144   * Number of cross validation folds for subset size determination (default =
145   * 5).
146   */
147  protected int m_numFolds;
148
149  /** Seed for cross validation subset size determination. (default = 1) */
150  protected int m_seed;
151
152  /** number of attributes in the data */
153  protected int m_numAttribs;
154
155  /** total number of subsets evaluated during a search */
156  protected int m_totalEvals;
157
158  /** for debugging */
159  protected boolean m_verbose;
160
161  /** holds the merit of the best subset found */
162  protected double m_bestMerit;
163
164  /** holds the maximum size of the lookup cache for evaluated subsets */
165  protected int m_cacheSize;
166
167  /**
168   * Constructor
169   */
170  public SubsetSizeForwardSelection() {
171    resetOptions();
172  }
173
174  /**
175   * Returns a string describing this search method
176   *
177   * @return a description of the search method suitable for displaying in the
178   *         explorer/experimenter gui
179   */
180  public String globalInfo() {
181    return "SubsetSizeForwardSelection:\n\n" +
182      "Extension of LinearForwardSelection. The search performs an interior " +
183      "cross-validation (seed and number of folds can be specified). A " +
184      "LinearForwardSelection is performed on each foldto determine the optimal " +
185      "subset-size (using the given SubsetSizeEvaluator). Finally, a " +
186      "LinearForwardSelection up to the optimal subset-size is performed on " +
187      "the whole data.\n\n"
188      + "For more information see:\n\n"
189      + getTechnicalInformation().toString();
190  }
191
192  /**
193   * Returns an instance of a TechnicalInformation object, containing
194   * detailed information about the technical background of this class,
195   * e.g., paper reference or book this class is based on.
196   *
197   * @return the technical information about this class
198   */
199  public TechnicalInformation getTechnicalInformation() {
200    TechnicalInformation        result;
201    TechnicalInformation        additional;
202   
203    result = new TechnicalInformation(Type.INPROCEEDINGS);
204    result.setValue(Field.AUTHOR, "Martin Guetlein and Eibe Frank and Mark Hall");
205    result.setValue(Field.YEAR, "2009");
206    result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
207    result.setValue(Field.BOOKTITLE, "Proc IEEE Symposium on Computational Intelligence and Data Mining");
208    result.setValue(Field.PAGES, "332-339");
209    result.setValue(Field.PUBLISHER, "IEEE");
210   
211    additional = result.add(Type.MASTERSTHESIS);
212    additional.setValue(Field.AUTHOR, "Martin Guetlein");
213    additional.setValue(Field.YEAR, "2006");
214    additional.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
215    additional.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitaet");
216    additional.setValue(Field.ADDRESS, "Freiburg, Germany");
217   
218    return result;
219  }
220
221  /**
222   * Returns an enumeration describing the available options.
223   *
224   * @return an enumeration of all the available options.
225   *
226   */
227  public Enumeration listOptions() {
228    Vector newVector = new Vector(9);
229
230    newVector.addElement(new Option("\tPerform initial ranking to select the" +
231                                    "\n\ttop-ranked attributes.", "I", 0, "-I"));
232    newVector.addElement(new Option(
233                                    "\tNumber of top-ranked attributes that are " +
234                                    "\n\ttaken into account by the search.", "K", 1, "-K <num>"));
235    newVector.addElement(new Option(
236                                    "\tType of Linear Forward Selection (default = 0).", "T", 1,
237                                    "-T <0 = fixed-set | 1 = fixed-width>"));
238    newVector.addElement(new Option(
239                                    "\tSize of lookup cache for evaluated subsets." +
240                                    "\n\tExpressed as a multiple of the number of" +
241                                    "\n\tattributes in the data set. (default = 1)", "S", 1, "-S <num>"));
242    newVector.addElement(new Option(
243                                    "\tSubset-evaluator used for subset-size determination." + "-- -M",
244                                    "E", 1, "-E <subset evaluator>"));
245    newVector.addElement(new Option("\tNumber of cross validation folds" +
246                                    "\n\tfor subset size determination (default = 5).", "F", 1, "-F <num>"));
247    newVector.addElement(new Option("\tSeed for cross validation" +
248                                    "\n\tsubset size determination. (default = 1)", "R", 1, "-R <num>"));
249    newVector.addElement(new Option("\tverbose on/off", "Z", 0, "-Z"));
250
251    if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) {
252      newVector.addElement(new Option("", "", 0,
253                                      "\nOptions specific to " + "evaluator " +
254                                      m_setSizeEval.getClass().getName() + ":"));
255
256      Enumeration enu = ((OptionHandler) m_setSizeEval).listOptions();
257
258      while (enu.hasMoreElements()) {
259        newVector.addElement(enu.nextElement());
260      }
261    }
262
263    return newVector.elements();
264  }
265
266  /**
267   * Parses a given list of options.
268   *
269   * Valid options are:
270   * <p>
271   *
272   * -I <br>
273   * Perform initial ranking to select top-ranked attributes.
274   * <p>
275   *
276   * -K <num> <br>
277   * Number of top-ranked attributes that are taken into account.
278   * <p>
279   *
280   * -T <0 = fixed-set | 1 = fixed-width> <br>
281   * Typ of Linear Forward Selection (default = 0).
282   * <p>
283   *
284   * -S <num> <br>
285   * Size of lookup cache for evaluated subsets. Expressed as a multiple of
286   * the number of attributes in the data set. (default = 1).
287   * <p>
288   *
289   * -E <string> <br>
290   * class name of subset evaluator to use for subset size determination
291   * (default = null, same subset evaluator as for ranking and final forward
292   * selection is used). Place any evaluator options LAST on the command line
293   * following a "--". eg. -A weka.attributeSelection.ClassifierSubsetEval ... --
294   * -M
295   *
296   * </pre>
297   *
298   * -F <num> <br>
299   * Number of cross validation folds for subset size determination (default =
300   * 5).
301   * <p>
302   *
303   * -R <num> <br>
304   * Seed for cross validation subset size determination. (default = 1)
305   * <p>
306   *
307   * -Z <br>
308   * verbose on/off.
309   * <p>
310   *
311   * @param options
312   *            the list of options as an array of strings
313   * @exception Exception
314   *                if an option is not supported
315   *
316   */
317  public void setOptions(String[] options) throws Exception {
318    String optionString;
319    resetOptions();
320
321    setPerformRanking(Utils.getFlag('I', options));
322
323    optionString = Utils.getOption('K', options);
324
325    if (optionString.length() != 0) {
326      setNumUsedAttributes(Integer.parseInt(optionString));
327    }
328
329    optionString = Utils.getOption('T', options);
330
331    if (optionString.length() != 0) {
332      setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE));
333    } else {
334      setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE));
335    }
336
337    optionString = Utils.getOption('S', options);
338
339    if (optionString.length() != 0) {
340      setLookupCacheSize(Integer.parseInt(optionString));
341    }
342
343    optionString = Utils.getOption('E', options);
344
345    if (optionString.length() == 0) {
346      System.out.println(
347                         "No subset size evaluator given, using evaluator that is used for final search.");
348      m_setSizeEval = null;
349    } else {
350      setSubsetSizeEvaluator(ASEvaluation.forName(optionString,
351                                                  Utils.partitionOptions(options)));
352    }
353
354    optionString = Utils.getOption('F', options);
355
356    if (optionString.length() != 0) {
357      setNumSubsetSizeCVFolds(Integer.parseInt(optionString));
358    }
359
360    optionString = Utils.getOption('R', options);
361
362    if (optionString.length() != 0) {
363      setSeed(Integer.parseInt(optionString));
364    }
365
366    m_verbose = Utils.getFlag('Z', options);
367  }
368
369  /**
370   * Set the maximum size of the evaluated subset cache (hashtable). This is
371   * expressed as a multiplier for the number of attributes in the data set.
372   * (default = 1).
373   *
374   * @param size
375   *            the maximum size of the hashtable
376   */
377  public void setLookupCacheSize(int size) {
378    if (size >= 0) {
379      m_cacheSize = size;
380    }
381  }
382
383  /**
384   * Return the maximum size of the evaluated subset cache (expressed as a
385   * multiplier for the number of attributes in a data set.
386   *
387   * @return the maximum size of the hashtable.
388   */
389  public int getLookupCacheSize() {
390    return m_cacheSize;
391  }
392
393  /**
394   * Returns the tip text for this property
395   *
396   * @return tip text for this property suitable for displaying in the
397   *         explorer/experimenter gui
398   */
399  public String lookupCacheSizeTipText() {
400    return "Set the maximum size of the lookup cache of evaluated subsets. This is " +
401      "expressed as a multiplier of the number of attributes in the data set. " +
402      "(default = 1).";
403  }
404
405  /**
406   * Returns the tip text for this property
407   *
408   * @return tip text for this property suitable for displaying in the
409   *         explorer/experimenter gui
410   */
411  public String performRankingTipText() {
412    return "Perform initial ranking to select top-ranked attributes.";
413  }
414
415  /**
416   * Perform initial ranking to select top-ranked attributes.
417   *
418   * @param b
419   *            true if initial ranking should be performed
420   */
421  public void setPerformRanking(boolean b) {
422    m_performRanking = b;
423  }
424
425  /**
426   * Get boolean if initial ranking should be performed to select the
427   * top-ranked attributes
428   *
429   * @return true if initial ranking should be performed
430   */
431  public boolean getPerformRanking() {
432    return m_performRanking;
433  }
434
435  /**
436   * Returns the tip text for this property
437   *
438   * @return tip text for this property suitable for displaying in the
439   *         explorer/experimenter gui
440   */
441  public String numUsedAttributesTipText() {
442    return "Set the amount of top-ranked attributes that are taken into account by the search process.";
443  }
444
445  /**
446   * Set the number of top-ranked attributes that taken into account by the
447   * search process.
448   *
449   * @param k
450   *            the number of attributes
451   * @exception Exception
452   *                if k is less than 2
453   */
454  public void setNumUsedAttributes(int k) throws Exception {
455    if (k < 2) {
456      throw new Exception("Value of -K must be >= 2.");
457    }
458
459    m_numUsedAttributes = k;
460  }
461
462  /**
463   * Get the number of top-ranked attributes that taken into account by the
464   * search process.
465   *
466   * @return the number of top-ranked attributes that taken into account
467   */
468  public int getNumUsedAttributes() {
469    return m_numUsedAttributes;
470  }
471
472  /**
473   * Returns the tip text for this property
474   *
475   * @return tip text for this property suitable for displaying in the
476   *         explorer/experimenter gui
477   */
478  public String typeTipText() {
479    return "Set the type of the search.";
480  }
481
482  /**
483   * Set the type
484   *
485   * @param t
486   *            the Linear Forward Selection type
487   */
488  public void setType(SelectedTag t) {
489    if (t.getTags() == TAGS_TYPE) {
490      m_linearSelectionType = t.getSelectedTag().getID();
491    }
492  }
493
494  /**
495   * Get the type
496   *
497   * @return the Linear Forward Selection type
498   */
499  public SelectedTag getType() {
500    return new SelectedTag(m_linearSelectionType, TAGS_TYPE);
501  }
502
503  /**
504   * Returns the tip text for this property
505   *
506   * @return tip text for this property suitable for displaying in the
507   *         explorer/experimenter gui
508   */
509  public String subsetSizeEvaluatorTipText() {
510    return "Subset evaluator to use for subset size determination.";
511  }
512
513  /**
514   * Set the subset evaluator to use for subset size determination.
515   *
516   * @param eval
517   *            the subset evaluator to use for subset size determination.
518   */
519  public void setSubsetSizeEvaluator(ASEvaluation eval)
520    throws Exception {
521    if (!(eval instanceof SubsetEvaluator)) {
522      throw new Exception(eval.getClass().getName() +
523                          " is no subset evaluator.");
524    }
525
526    m_setSizeEval = eval;
527  }
528
529  /**
530   * Get the subset evaluator used for subset size determination.
531   *
532   * @return the evaluator used for subset size determination.
533   */
534  public ASEvaluation getSubsetSizeEvaluator() {
535    return m_setSizeEval;
536  }
537
538  /**
539   * Returns the tip text for this property
540   *
541   * @return tip text for this property suitable for displaying in the
542   *         explorer/experimenter gui
543   */
544  public String numSubsetSizeCVFoldsTipText() {
545    return "Number of cross validation folds for subset size determination";
546  }
547
548  /**
549   * Set the number of cross validation folds for subset size determination
550   * (default = 5).
551   *
552   * @param f
553   *            number of folds
554   */
555  public void setNumSubsetSizeCVFolds(int f) {
556    m_numFolds = f;
557  }
558
559  /**
560   * Get the number of cross validation folds for subset size determination
561   * (default = 5).
562   *
563   * @return number of folds
564   */
565  public int getNumSubsetSizeCVFolds() {
566    return m_numFolds;
567  }
568
569  /**
570   * Returns the tip text for this property
571   *
572   * @return tip text for this property suitable for displaying in the
573   *         explorer/experimenter gui
574   */
575  public String seedTipText() {
576    return "Seed for cross validation subset size determination. (default = 1)";
577  }
578
579  /**
580   * Seed for cross validation subset size determination. (default = 1)
581   *
582   * @param s
583   *            seed
584   */
585  public void setSeed(int s) {
586    m_seed = s;
587  }
588
589  /**
590   * Seed for cross validation subset size determination. (default = 1)
591   *
592   * @return seed
593   */
594  public int getSeed() {
595    return m_seed;
596  }
597
598  /**
599   * Returns the tip text for this property
600   *
601   * @return tip text for this property suitable for displaying in the
602   *         explorer/experimenter gui
603   */
604  public String verboseTipText() {
605    return "Turn on verbose output for monitoring the search's progress.";
606  }
607
608  /**
609   * Set whether verbose output should be generated.
610   *
611   * @param b
612   *            true if output is to be verbose.
613   */
614  public void setVerbose(boolean b) {
615    m_verbose = b;
616  }
617
618  /**
619   * Get whether output is to be verbose
620   *
621   * @return true if output will be verbose
622   */
623  public boolean getVerbose() {
624    return m_verbose;
625  }
626
627  /**
628   * Gets the current settings of LinearForwardSelection.
629   *
630   * @return an array of strings suitable for passing to setOptions()
631   */
632  public String[] getOptions() {
633    String[] evaluatorOptions = new String[0];
634
635    if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) {
636      evaluatorOptions = ((OptionHandler) m_setSizeEval).getOptions();
637    }
638
639    String[] options = new String[15 + evaluatorOptions.length];
640    int current = 0;
641
642    if (m_performRanking) {
643      options[current++] = "-I";
644    }
645
646    options[current++] = "-K";
647    options[current++] = "" + m_numUsedAttributes;
648    options[current++] = "-T";
649    options[current++] = "" + m_linearSelectionType;
650
651    options[current++] = "-F";
652    options[current++] = "" + m_numFolds;
653    options[current++] = "-S";
654    options[current++] = "" + m_seed;
655
656    options[current++] = "-Z";
657    options[current++] = "" + m_verbose;
658
659    if (m_setSizeEval != null) {
660      options[current++] = "-E";
661      options[current++] = m_setSizeEval.getClass().getName();
662    }
663
664    options[current++] = "--";
665    System.arraycopy(evaluatorOptions, 0, options, current,
666                     evaluatorOptions.length);
667    current += evaluatorOptions.length;
668
669    while (current < options.length) {
670      options[current++] = "";
671    }
672
673    return options;
674  }
675
676  /**
677   * returns a description of the search as a String
678   *
679   * @return a description of the search
680   */
681  public String toString() {
682    StringBuffer LFSString = new StringBuffer();
683
684    LFSString.append("\tSubset Size Forward Selection.\n");
685
686    LFSString.append("\tLinear Forward Selection Type: ");
687
688    if (m_linearSelectionType == TYPE_FIXED_SET) {
689      LFSString.append("fixed-set\n");
690    } else {
691      LFSString.append("fixed-width\n");
692    }
693
694    LFSString.append("\tNumber of top-ranked attributes that are used: " +
695                     m_numUsedAttributes + "\n");
696
697    LFSString.append(
698                     "\tNumber of cross validation folds for subset size determination: " +
699                     m_numFolds + "\n");
700    LFSString.append("\tSeed for cross validation subset size determination: " +
701                     m_seed + "\n");
702
703    LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals +
704                     "\n");
705    LFSString.append("\tMerit of best subset found: " +
706                     Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n");
707
708    return LFSString.toString();
709  }
710
711  /**
712   * Searches the attribute subset space by subset size forward selection
713   *
714   * @param ASEval
715   *            the attribute evaluator to guide the search
716   * @param data
717   *            the training instances.
718   * @return an array (not necessarily ordered) of selected attribute indexes
719   * @exception Exception
720   *                if the search can't be completed
721   */
722  public int[] search(ASEvaluation ASEval, Instances data)
723    throws Exception {
724    m_totalEvals = 0;
725
726    if (!(ASEval instanceof SubsetEvaluator)) {
727      throw new Exception(ASEval.getClass().getName() + " is not a " +
728                          "Subset evaluator!");
729    }
730
731    if (m_setSizeEval == null) {
732      m_setSizeEval = ASEval;
733    }
734
735    m_numAttribs = data.numAttributes();
736
737    if (m_numUsedAttributes > m_numAttribs) {
738      System.out.println(
739                         "Decreasing number of top-ranked attributes to total number of attributes: " +
740                         data.numAttributes());
741      m_numUsedAttributes = m_numAttribs;
742    }
743
744    Instances[] trainData = new Instances[m_numFolds];
745    Instances[] testData = new Instances[m_numFolds];
746    LFSMethods[] searchResults = new LFSMethods[m_numFolds];
747
748    Random random = new Random(m_seed);
749    Instances dataCopy = new Instances(data);
750    dataCopy.randomize(random);
751
752    if (dataCopy.classAttribute().isNominal()) {
753      dataCopy.stratify(m_numFolds);
754    }
755
756    for (int f = 0; f < m_numFolds; f++) {
757      trainData[f] = dataCopy.trainCV(m_numFolds, f, random);
758      testData[f] = dataCopy.testCV(m_numFolds, f);
759    }
760
761    LFSMethods LSF = new LFSMethods();
762
763    int[] ranking;
764
765    if (m_performRanking) {
766      ASEval.buildEvaluator(data);
767      ranking = LSF.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose);
768    } else {
769      ranking = new int[m_numAttribs];
770
771      for (int i = 0; i < ranking.length; i++) {
772        ranking[i] = i;
773      }
774    }
775
776    int maxSubsetSize = 0;
777
778    for (int f = 0; f < m_numFolds; f++) {
779      if (m_verbose) {
780        System.out.println("perform search on internal fold: " + (f + 1) + "/" +
781                           m_numFolds);
782      }
783
784      m_setSizeEval.buildEvaluator(trainData[f]);
785      searchResults[f] = new LFSMethods();
786      searchResults[f].forwardSearch(m_cacheSize, new BitSet(m_numAttribs),
787                                     ranking, m_numUsedAttributes,
788                                     m_linearSelectionType == TYPE_FIXED_WIDTH, 1, -1, trainData[f],
789                                     (SubsetEvaluator)m_setSizeEval, m_verbose);
790
791      maxSubsetSize = Math.max(maxSubsetSize,
792                               searchResults[f].getBestGroup().cardinality());
793    }
794
795    if (m_verbose) {
796      System.out.println(
797                         "continue searches on internal folds to maxSubsetSize (" +
798                         maxSubsetSize + ")");
799    }
800
801    for (int f = 0; f < m_numFolds; f++) {
802      if (m_verbose) {
803        System.out.print("perform search on internal fold: " + (f + 1) + "/" +
804                         m_numFolds + " with starting set ");
805        LFSMethods.printGroup(searchResults[f].getBestGroup(),
806                              trainData[f].numAttributes());
807      }
808
809      if (searchResults[f].getBestGroup().cardinality() < maxSubsetSize) {
810        m_setSizeEval.buildEvaluator(trainData[f]);
811        searchResults[f].forwardSearch(m_cacheSize,
812                                       searchResults[f].getBestGroup(), ranking, m_numUsedAttributes,
813                                       m_linearSelectionType == TYPE_FIXED_WIDTH, 1, maxSubsetSize,
814                                       trainData[f], (SubsetEvaluator)m_setSizeEval, m_verbose);
815      }
816    }
817
818    double[][] testMerit = new double[m_numFolds][maxSubsetSize + 1];
819
820    for (int f = 0; f < m_numFolds; f++) {
821      for (int s = 1; s <= maxSubsetSize; s++) {
822        if (HoldOutSubsetEvaluator.class.isInstance(m_setSizeEval)) {
823          m_setSizeEval.buildEvaluator(trainData[f]);
824          testMerit[f][s] = ((HoldOutSubsetEvaluator) m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize(
825                                                                                                                        s), testData[f]);
826        } else {
827          m_setSizeEval.buildEvaluator(testData[f]);
828          testMerit[f][s] = ((SubsetEvaluator)m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize(
829                                                                                             s));
830        }
831      }
832    }
833
834    double[] avgTestMerit = new double[maxSubsetSize + 1];
835    int finalSubsetSize = -1;
836
837    for (int s = 1; s <= maxSubsetSize; s++) {
838      for (int f = 0; f < m_numFolds; f++) {
839        avgTestMerit[s] = ((avgTestMerit[s] * f) + testMerit[f][s]) / (double) (f +
840                                                                                1);
841      }
842
843      if ((finalSubsetSize == -1) ||
844          (avgTestMerit[s] > avgTestMerit[finalSubsetSize])) {
845        finalSubsetSize = s;
846      }
847
848      if (m_verbose) {
849        System.out.println("average merit for subset-size " + s + ": " +
850                           avgTestMerit[s]);
851      }
852    }
853
854    if (m_verbose) {
855      System.out.println("performing final forward selection to subset-size: " +
856                         finalSubsetSize);
857    }
858
859    ASEval.buildEvaluator(data);
860    LSF.forwardSearch(m_cacheSize, new BitSet(m_numAttribs), ranking,
861                      m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, 1,
862                      finalSubsetSize, data, (SubsetEvaluator) ASEval, m_verbose);
863
864    m_totalEvals = LSF.getNumEvalsTotal();
865    m_bestMerit = LSF.getBestMerit();
866
867    return attributeList(LSF.getBestGroup());
868  }
869
870  /**
871   * Reset options to default values
872   */
873  protected void resetOptions() {
874    m_performRanking = true;
875    m_numUsedAttributes = 50;
876    m_linearSelectionType = TYPE_FIXED_SET;
877    m_setSizeEval = new ClassifierSubsetEval();
878    m_numFolds = 5;
879    m_seed = 1;
880    m_totalEvals = 0;
881    m_cacheSize = 1;
882    m_verbose = false;
883  }
884
885  /**
886   * converts a BitSet into a list of attribute indexes
887   *
888   * @param group
889   *            the BitSet to convert
890   * @return an array of attribute indexes
891   */
892  protected int[] attributeList(BitSet group) {
893    int count = 0;
894
895    // count how many were selected
896    for (int i = 0; i < m_numAttribs; i++) {
897      if (group.get(i)) {
898        count++;
899      }
900    }
901
902    int[] list = new int[count];
903    count = 0;
904
905    for (int i = 0; i < m_numAttribs; i++) {
906      if (group.get(i)) {
907        list[count++] = i;
908      }
909    }
910
911    return list;
912  }
913 
914  /**
915   * Returns the revision string.
916   *
917   * @return            the revision
918   */
919  public String getRevision() {
920    return RevisionUtils.extract("$Revision: 5604 $");
921  }
922}
923
Note: See TracBrowser for help on using the repository browser.