source: src/main/java/weka/attributeSelection/GreedyStepwise.java @ 6

Last change on this file since 6 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 22.1 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    GreedyStepwise.java
19 *    Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.core.Instances;
26import weka.core.Option;
27import weka.core.OptionHandler;
28import weka.core.Range;
29import weka.core.RevisionUtils;
30import weka.core.Utils;
31
32import java.util.BitSet;
33import java.util.Enumeration;
34import java.util.Vector;
35
36/**
37 <!-- globalinfo-start -->
38 * GreedyStepwise :<br/>
39 * <br/>
40 * Performs a greedy forward or backward search through the space of attribute subsets. May start with no/all attributes or from an arbitrary point in the space. Stops when the addition/deletion of any remaining attributes results in a decrease in evaluation. Can also produce a ranked list of attributes by traversing the space from one side to the other and recording the order that attributes are selected.<br/>
41 * <p/>
42 <!-- globalinfo-end -->
43 *
44 <!-- options-start -->
45 * Valid options are: <p/>
46 *
47 * <pre> -C
48 *  Use conservative forward search</pre>
49 *
50 * <pre> -B
51 *  Use a backward search instead of a
52 *  forward one.</pre>
53 *
54 * <pre> -P &lt;start set&gt;
55 *  Specify a starting set of attributes.
56 *  Eg. 1,3,5-7.</pre>
57 *
58 * <pre> -R
59 *  Produce a ranked list of attributes.</pre>
60 *
61 * <pre> -T &lt;threshold&gt;
62 *  Specify a theshold by which attributes
63 *  may be discarded from the ranking.
64 *  Use in conjuction with -R</pre>
65 *
66 * <pre> -N &lt;num to select&gt;
67 *  Specify number of attributes to select</pre>
68 *
69 <!-- options-end -->
70 *
71 * @author Mark Hall
72 * @version $Revision: 1.10 $
73 */
74public class GreedyStepwise 
75  extends ASSearch
76  implements RankedOutputSearch, StartSetHandler, OptionHandler {
77 
78  /** for serialization */
79  static final long serialVersionUID = -6312951970168325471L;
80
81  /** does the data have a class */
82  protected boolean m_hasClass;
83 
84  /** holds the class index */
85  protected int m_classIndex;
86 
87  /** number of attributes in the data */
88  protected int m_numAttribs;
89
90  /** true if the user has requested a ranked list of attributes */
91  protected boolean m_rankingRequested;
92
93  /**
94   * go from one side of the search space to the other in order to generate
95   * a ranking
96   */
97  protected boolean m_doRank;
98
99  /** used to indicate whether or not ranking has been performed */
100  protected boolean m_doneRanking;
101
102  /**
103   * A threshold by which to discard attributes---used by the
104   * AttributeSelection module
105   */
106  protected double m_threshold;
107
108  /** The number of attributes to select. -1 indicates that all attributes
109      are to be retained. Has precedence over m_threshold */
110  protected int m_numToSelect = -1;
111
112  protected int m_calculatedNumToSelect;
113
114  /** the merit of the best subset found */
115  protected double m_bestMerit;
116
117  /** a ranked list of attribute indexes */
118  protected double [][] m_rankedAtts;
119  protected int m_rankedSoFar;
120
121  /** the best subset found */
122  protected BitSet m_best_group;
123  protected ASEvaluation m_ASEval;
124
125  protected Instances m_Instances;
126
127  /** holds the start set for the search as a Range */
128  protected Range m_startRange;
129
130  /** holds an array of starting attributes */
131  protected int [] m_starting;
132
133  /** Use a backwards search instead of a forwards one */
134  protected boolean m_backward = false;
135
136  /** If set then attributes will continue to be added during a forward
137      search as long as the merit does not degrade */
138  protected boolean m_conservativeSelection = false;
139
140  /**
141   * Constructor
142   */
143  public GreedyStepwise () {
144    m_threshold = -Double.MAX_VALUE;
145    m_doneRanking = false;
146    m_startRange = new Range();
147    m_starting = null;
148    resetOptions();
149  }
150
151  /**
152   * Returns a string describing this search method
153   * @return a description of the search suitable for
154   * displaying in the explorer/experimenter gui
155   */
156  public String globalInfo() {
157    return "GreedyStepwise :\n\nPerforms a greedy forward or backward search "
158      +"through "
159      +"the space of attribute subsets. May start with no/all attributes or from "
160      +"an arbitrary point in the space. Stops when the addition/deletion of any "
161      +"remaining attributes results in a decrease in evaluation. "
162      +"Can also produce a ranked list of "
163      +"attributes by traversing the space from one side to the other and "
164      +"recording the order that attributes are selected.\n";
165  }
166
167  /**
168   * Returns the tip text for this property
169   * @return tip text for this property suitable for
170   * displaying in the explorer/experimenter gui
171   */
172  public String searchBackwardsTipText() {
173    return "Search backwards rather than forwards.";
174  }
175
176  /**
177   * Set whether to search backwards instead of forwards
178   *
179   * @param back true to search backwards
180   */
181  public void setSearchBackwards(boolean back) {
182    m_backward = back;
183    if (m_backward) {
184      setGenerateRanking(false);
185    }
186  }
187
188  /**
189   * Get whether to search backwards
190   *
191   * @return true if the search will proceed backwards
192   */
193  public boolean getSearchBackwards() {
194    return m_backward;
195  }
196
197  /**
198   * Returns the tip text for this property
199   * @return tip text for this property suitable for
200   * displaying in the explorer/experimenter gui
201   */
202  public String thresholdTipText() {
203    return "Set threshold by which attributes can be discarded. Default value "
204      + "results in no attributes being discarded. Use in conjunction with "
205      + "generateRanking";
206  }
207
208  /**
209   * Set the threshold by which the AttributeSelection module can discard
210   * attributes.
211   * @param threshold the threshold.
212   */
213  public void setThreshold(double threshold) {
214    m_threshold = threshold;
215  }
216
217  /**
218   * Returns the threshold so that the AttributeSelection module can
219   * discard attributes from the ranking.
220   */
221  public double getThreshold() {
222    return m_threshold;
223  }
224
225  /**
226   * Returns the tip text for this property
227   * @return tip text for this property suitable for
228   * displaying in the explorer/experimenter gui
229   */
230  public String numToSelectTipText() {
231    return "Specify the number of attributes to retain. The default value "
232      +"(-1) indicates that all attributes are to be retained. Use either "
233      +"this option or a threshold to reduce the attribute set.";
234  }
235
236  /**
237   * Specify the number of attributes to select from the ranked list
238   * (if generating a ranking). -1
239   * indicates that all attributes are to be retained.
240   * @param n the number of attributes to retain
241   */
242  public void setNumToSelect(int n) {
243    m_numToSelect = n;
244  }
245
246  /**
247   * Gets the number of attributes to be retained.
248   * @return the number of attributes to retain
249   */
250  public int getNumToSelect() {
251    return m_numToSelect;
252  }
253
254  /**
255   * Gets the calculated number of attributes to retain. This is the
256   * actual number of attributes to retain. This is the same as
257   * getNumToSelect if the user specifies a number which is not less
258   * than zero. Otherwise it should be the number of attributes in the
259   * (potentially transformed) data.
260   */
261  public int getCalculatedNumToSelect() {
262    if (m_numToSelect >= 0) {
263      m_calculatedNumToSelect = m_numToSelect;
264    }
265    return m_calculatedNumToSelect;
266  }
267
268  /**
269   * Returns the tip text for this property
270   * @return tip text for this property suitable for
271   * displaying in the explorer/experimenter gui
272   */
273  public String generateRankingTipText() {
274    return "Set to true if a ranked list is required.";
275  }
276 
277  /**
278   * Records whether the user has requested a ranked list of attributes.
279   * @param doRank true if ranking is requested
280   */
281  public void setGenerateRanking(boolean doRank) {
282    m_rankingRequested = doRank;
283  }
284
285  /**
286   * Gets whether ranking has been requested. This is used by the
287   * AttributeSelection module to determine if rankedAttributes()
288   * should be called.
289   * @return true if ranking has been requested.
290   */
291  public boolean getGenerateRanking() {
292    return m_rankingRequested;
293  }
294
295  /**
296   * Returns the tip text for this property
297   * @return tip text for this property suitable for
298   * displaying in the explorer/experimenter gui
299   */
300  public String startSetTipText() {
301    return "Set the start point for the search. This is specified as a comma "
302      +"seperated list off attribute indexes starting at 1. It can include "
303      +"ranges. Eg. 1,2,5-9,17.";
304  }
305
306  /**
307   * Sets a starting set of attributes for the search. It is the
308   * search method's responsibility to report this start set (if any)
309   * in its toString() method.
310   * @param startSet a string containing a list of attributes (and or ranges),
311   * eg. 1,2,6,10-15.
312   * @throws Exception if start set can't be set.
313   */
314  public void setStartSet (String startSet) throws Exception {
315    m_startRange.setRanges(startSet);
316  }
317
318  /**
319   * Returns a list of attributes (and or attribute ranges) as a String
320   * @return a list of attributes (and or attribute ranges)
321   */
322  public String getStartSet () {
323    return m_startRange.getRanges();
324  }
325
326  /**
327   * Returns the tip text for this property
328   * @return tip text for this property suitable for
329   * displaying in the explorer/experimenter gui
330   */
331  public String conservativeForwardSelectionTipText() {
332    return "If true (and forward search is selected) then attributes "
333      +"will continue to be added to the best subset as long as merit does "
334      +"not degrade.";
335  }
336
337  /**
338   * Set whether attributes should continue to be added during
339   * a forward search as long as merit does not decrease
340   * @param c true if atts should continue to be atted
341   */
342  public void setConservativeForwardSelection(boolean c) {
343    m_conservativeSelection = c;
344  }
345
346  /**
347   * Gets whether conservative selection has been enabled
348   * @return true if conservative forward selection is enabled
349   */
350  public boolean getConservativeForwardSelection() {
351    return m_conservativeSelection;
352  }
353
354  /**
355   * Returns an enumeration describing the available options.
356   * @return an enumeration of all the available options.
357   **/
358  public Enumeration listOptions () {
359    Vector newVector = new Vector(5);
360
361    newVector.addElement(new Option("\tUse conservative forward search"
362                                    ,"-C", 0, "-C"));
363
364    newVector.addElement(new Option("\tUse a backward search instead of a"
365                                    +"\n\tforward one."
366                                    ,"-B", 0, "-B"));
367    newVector
368      .addElement(new Option("\tSpecify a starting set of attributes." 
369                             + "\n\tEg. 1,3,5-7."
370                             ,"P",1
371                             , "-P <start set>"));
372
373    newVector.addElement(new Option("\tProduce a ranked list of attributes."
374                                    ,"R",0,"-R"));
375    newVector
376      .addElement(new Option("\tSpecify a theshold by which attributes" 
377                             + "\n\tmay be discarded from the ranking."
378                             +"\n\tUse in conjuction with -R","T",1
379                             , "-T <threshold>"));
380
381    newVector
382      .addElement(new Option("\tSpecify number of attributes to select" 
383                             ,"N",1
384                             , "-N <num to select>"));
385
386    return newVector.elements();
387
388  }
389 
390  /**
391   * Parses a given list of options. <p/>
392   *
393   <!-- options-start -->
394   * Valid options are: <p/>
395   *
396   * <pre> -C
397   *  Use conservative forward search</pre>
398   *
399   * <pre> -B
400   *  Use a backward search instead of a
401   *  forward one.</pre>
402   *
403   * <pre> -P &lt;start set&gt;
404   *  Specify a starting set of attributes.
405   *  Eg. 1,3,5-7.</pre>
406   *
407   * <pre> -R
408   *  Produce a ranked list of attributes.</pre>
409   *
410   * <pre> -T &lt;threshold&gt;
411   *  Specify a theshold by which attributes
412   *  may be discarded from the ranking.
413   *  Use in conjuction with -R</pre>
414   *
415   * <pre> -N &lt;num to select&gt;
416   *  Specify number of attributes to select</pre>
417   *
418   <!-- options-end -->
419   *
420   * @param options the list of options as an array of strings
421   * @throws Exception if an option is not supported
422   */
423  public void setOptions (String[] options)
424    throws Exception {
425    String optionString;
426    resetOptions();
427
428    setSearchBackwards(Utils.getFlag('B', options));
429
430    setConservativeForwardSelection(Utils.getFlag('C', options));
431
432    optionString = Utils.getOption('P', options);
433    if (optionString.length() != 0) {
434      setStartSet(optionString);
435    }
436
437    setGenerateRanking(Utils.getFlag('R', options));
438
439    optionString = Utils.getOption('T', options);
440    if (optionString.length() != 0) {
441      Double temp;
442      temp = Double.valueOf(optionString);
443      setThreshold(temp.doubleValue());
444    }
445
446    optionString = Utils.getOption('N', options);
447    if (optionString.length() != 0) {
448      setNumToSelect(Integer.parseInt(optionString));
449    }
450  }
451
452  /**
453   * Gets the current settings of ReliefFAttributeEval.
454   *
455   * @return an array of strings suitable for passing to setOptions()
456   */
457  public String[] getOptions () {
458    String[] options = new String[9];
459    int current = 0;
460   
461    if (getSearchBackwards()) {
462      options[current++] = "-B";
463    }
464
465    if (getConservativeForwardSelection()) {
466      options[current++] = "-C";
467    }
468
469    if (!(getStartSet().equals(""))) {
470      options[current++] = "-P";
471      options[current++] = ""+startSetToString();
472    }
473
474    if (getGenerateRanking()) {
475      options[current++] = "-R";
476    }
477    options[current++] = "-T";
478    options[current++] = "" + getThreshold();
479
480    options[current++] = "-N";
481    options[current++] = ""+getNumToSelect();
482
483    while (current < options.length) {
484      options[current++] = "";
485    }
486    return  options;
487  }
488
489  /**
490   * converts the array of starting attributes to a string. This is
491   * used by getOptions to return the actual attributes specified
492   * as the starting set. This is better than using m_startRanges.getRanges()
493   * as the same start set can be specified in different ways from the
494   * command line---eg 1,2,3 == 1-3. This is to ensure that stuff that
495   * is stored in a database is comparable.
496   * @return a comma seperated list of individual attribute numbers as a String
497   */
498  protected String startSetToString() {
499    StringBuffer FString = new StringBuffer();
500    boolean didPrint;
501   
502    if (m_starting == null) {
503      return getStartSet();
504    }
505    for (int i = 0; i < m_starting.length; i++) {
506      didPrint = false;
507     
508      if ((m_hasClass == false) || 
509          (m_hasClass == true && i != m_classIndex)) {
510        FString.append((m_starting[i] + 1));
511        didPrint = true;
512      }
513     
514      if (i == (m_starting.length - 1)) {
515        FString.append("");
516      }
517      else {
518        if (didPrint) {
519          FString.append(",");
520          }
521      }
522    }
523
524    return FString.toString();
525  }
526
527  /**
528   * returns a description of the search.
529   * @return a description of the search as a String.
530   */
531  public String toString() {
532    StringBuffer FString = new StringBuffer();
533    FString.append("\tGreedy Stepwise ("
534                   + ((m_backward)
535                      ? "backwards)"
536                      : "forwards)")+".\n\tStart set: ");
537
538    if (m_starting == null) {
539      if (m_backward) {
540        FString.append("all attributes\n");
541      } else {
542        FString.append("no attributes\n");
543      }
544    }
545    else {
546      FString.append(startSetToString()+"\n");
547    }
548    if (!m_doneRanking) {
549      FString.append("\tMerit of best subset found: "
550                     +Utils.doubleToString(Math.abs(m_bestMerit),8,3)+"\n");
551    }
552   
553    if ((m_threshold != -Double.MAX_VALUE) && (m_doneRanking)) {
554      FString.append("\tThreshold for discarding attributes: "
555                     + Utils.doubleToString(m_threshold,8,4)+"\n");
556    }
557
558    return FString.toString();
559  }
560
561
562  /**
563   * Searches the attribute subset space by forward selection.
564   *
565   * @param ASEval the attribute evaluator to guide the search
566   * @param data the training instances.
567   * @return an array (not necessarily ordered) of selected attribute indexes
568   * @throws Exception if the search can't be completed
569   */
570  public int[] search (ASEvaluation ASEval, Instances data)
571    throws Exception {
572
573    int i;
574    double best_merit = -Double.MAX_VALUE;
575    double temp_best,temp_merit;
576    int temp_index=0;
577    BitSet temp_group;
578
579    if (data != null) { // this is a fresh run so reset
580      resetOptions();
581      m_Instances = data;
582    }
583    m_ASEval = ASEval;
584
585    m_numAttribs = m_Instances.numAttributes();
586
587    if (m_best_group == null) {
588      m_best_group = new BitSet(m_numAttribs);
589    }
590
591    if (!(m_ASEval instanceof SubsetEvaluator)) {
592      throw  new Exception(m_ASEval.getClass().getName() 
593                           + " is not a " 
594                           + "Subset evaluator!");
595    }
596
597    m_startRange.setUpper(m_numAttribs-1);
598    if (!(getStartSet().equals(""))) {
599      m_starting = m_startRange.getSelection();
600    }
601
602    if (m_ASEval instanceof UnsupervisedSubsetEvaluator) {
603      m_hasClass = false;
604      m_classIndex = -1;
605    }
606    else {
607      m_hasClass = true;
608      m_classIndex = m_Instances.classIndex();
609    }
610
611    SubsetEvaluator ASEvaluator = (SubsetEvaluator)m_ASEval;
612
613    if (m_rankedAtts == null) {
614      m_rankedAtts = new double[m_numAttribs][2];
615      m_rankedSoFar = 0;
616    }
617
618    // If a starting subset has been supplied, then initialise the bitset
619    if (m_starting != null && m_rankedSoFar <= 0) {
620      for (i = 0; i < m_starting.length; i++) {
621        if ((m_starting[i]) != m_classIndex) {
622          m_best_group.set(m_starting[i]);
623        }
624      }
625    } else {
626      if (m_backward && m_rankedSoFar <= 0) {
627        for (i = 0; i < m_numAttribs; i++) {
628          if (i != m_classIndex) {
629            m_best_group.set(i);
630          }
631        }
632      }
633    }
634
635    // Evaluate the initial subset
636    best_merit = ASEvaluator.evaluateSubset(m_best_group);
637
638    // main search loop
639    boolean done = false;
640    boolean addone = false;
641    boolean z;
642    while (!done) {
643      temp_group = (BitSet)m_best_group.clone();
644      temp_best = best_merit;
645      if (m_doRank) {
646        temp_best = -Double.MAX_VALUE;
647      }
648      done = true;
649      addone = false;
650      for (i=0;i<m_numAttribs;i++) {
651        if (m_backward) {
652          z = ((i != m_classIndex) && (temp_group.get(i)));
653        } else {
654          z = ((i != m_classIndex) && (!temp_group.get(i)));
655        }
656        if (z) {
657          // set/unset the bit
658          if (m_backward) {
659            temp_group.clear(i);
660          } else {
661            temp_group.set(i);
662          }
663          temp_merit = ASEvaluator.evaluateSubset(temp_group);
664          if (m_backward) {
665            z = (temp_merit >= temp_best);
666          } else {
667            if (m_conservativeSelection) {
668              z = (temp_merit >= temp_best);
669            } else {
670              z = (temp_merit > temp_best);
671            }
672          }
673
674          if (z) {
675            temp_best = temp_merit;
676            temp_index = i;
677            addone = true;
678            done = false;
679          }
680
681          // unset this addition/deletion
682          if (m_backward) {
683            temp_group.set(i);
684          } else {
685            temp_group.clear(i);
686          }
687          if (m_doRank) {
688            done = false;
689          }
690        }
691      }
692      if (addone) {
693        if (m_backward) {
694          m_best_group.clear(temp_index);
695        } else {
696          m_best_group.set(temp_index);
697        }
698        best_merit = temp_best;
699        m_rankedAtts[m_rankedSoFar][0] = temp_index;
700        m_rankedAtts[m_rankedSoFar][1] = best_merit;
701        m_rankedSoFar++;
702      }
703    }
704    m_bestMerit = best_merit;
705    return attributeList(m_best_group);
706  }
707
708  /**
709   * Produces a ranked list of attributes. Search must have been performed
710   * prior to calling this function. Search is called by this function to
711   * complete the traversal of the the search space. A list of
712   * attributes and merits are returned. The attributes a ranked by the
713   * order they are added to the subset during a forward selection search.
714   * Individual merit values reflect the merit associated with adding the
715   * corresponding attribute to the subset; because of this, merit values
716   * may initially increase but then decrease as the best subset is
717   * "passed by" on the way to the far side of the search space.
718   *
719   * @return an array of attribute indexes and associated merit values
720   * @throws Exception if something goes wrong.
721   */
722  public double [][] rankedAttributes() throws Exception {
723   
724    if (m_rankedAtts == null || m_rankedSoFar == -1) {
725      throw new Exception("Search must be performed before attributes "
726                          +"can be ranked.");
727    }
728   
729    m_doRank = true;
730    search (m_ASEval, null);
731
732    double [][] final_rank = new double [m_rankedSoFar][2];
733    for (int i=0;i<m_rankedSoFar;i++) {
734      final_rank[i][0] = m_rankedAtts[i][0];
735      final_rank[i][1] = m_rankedAtts[i][1];
736    }
737   
738    resetOptions();
739    m_doneRanking = true;
740
741    if (m_numToSelect > final_rank.length) {
742      throw new Exception("More attributes requested than exist in the data");
743    }
744
745    if (m_numToSelect <= 0) {
746      if (m_threshold == -Double.MAX_VALUE) {
747        m_calculatedNumToSelect = final_rank.length;
748      } else {
749        determineNumToSelectFromThreshold(final_rank);
750      }
751    }
752
753    return final_rank;
754  }
755
756  private void determineNumToSelectFromThreshold(double [][] ranking) {
757    int count = 0;
758    for (int i = 0; i < ranking.length; i++) {
759      if (ranking[i][1] > m_threshold) {
760        count++;
761      }
762    }
763    m_calculatedNumToSelect = count;
764  }
765
766  /**
767   * converts a BitSet into a list of attribute indexes
768   * @param group the BitSet to convert
769   * @return an array of attribute indexes
770   **/
771  protected int[] attributeList (BitSet group) {
772    int count = 0;
773
774    // count how many were selected
775    for (int i = 0; i < m_numAttribs; i++) {
776      if (group.get(i)) {
777        count++;
778      }
779    }
780
781    int[] list = new int[count];
782    count = 0;
783
784    for (int i = 0; i < m_numAttribs; i++) {
785      if (group.get(i)) {
786        list[count++] = i;
787      }
788    }
789
790    return  list;
791  }
792
793  /**
794   * Resets options
795   */
796  protected void resetOptions() {
797    m_doRank = false;
798    m_best_group = null;
799    m_ASEval = null;
800    m_Instances = null;
801    m_rankedSoFar = -1;
802    m_rankedAtts = null;
803  }
804 
805  /**
806   * Returns the revision string.
807   *
808   * @return            the revision
809   */
810  public String getRevision() {
811    return RevisionUtils.extract("$Revision: 1.10 $");
812  }
813}
Note: See TracBrowser for help on using the repository browser.