source: src/main/java/weka/attributeSelection/RandomSearch.java @ 20

Last change on this file since 20 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 18.7 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RandomSearch.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.core.Instances;
26import weka.core.Option;
27import weka.core.OptionHandler;
28import weka.core.Range;
29import weka.core.RevisionUtils;
30import weka.core.TechnicalInformation;
31import weka.core.TechnicalInformation.Type;
32import weka.core.TechnicalInformation.Field;
33import weka.core.TechnicalInformationHandler;
34import weka.core.Utils;
35
36import java.util.BitSet;
37import java.util.Enumeration;
38import java.util.Random;
39import java.util.Vector;
40
41/**
42 <!-- globalinfo-start -->
43 * RandomSearch : <br/>
44 * <br/>
45 * Performs a Random search in the space of attribute subsets. If no start set is supplied, Random search starts from a random point and reports the best subset found. If a start set is supplied, Random searches randomly for subsets that are as good or better than the start point with the same or or fewer attributes. Using RandomSearch in conjunction with a start set containing all attributes equates to the LVF algorithm of Liu and Setiono (ICML-96).<br/>
46 * <br/>
47 * For more information see:<br/>
48 * <br/>
49 * H. Liu, R. Setiono: A probabilistic approach to feature selection - A filter solution. In: 13th International Conference on Machine Learning, 319-327, 1996.
50 * <p/>
51 <!-- globalinfo-end -->
52 *
53 <!-- technical-bibtex-start -->
54 * BibTeX:
55 * <pre>
56 * &#64;inproceedings{Liu1996,
57 *    author = {H. Liu and R. Setiono},
58 *    booktitle = {13th International Conference on Machine Learning},
59 *    pages = {319-327},
60 *    title = {A probabilistic approach to feature selection - A filter solution},
61 *    year = {1996}
62 * }
63 * </pre>
64 * <p/>
65 <!-- technical-bibtex-end -->
66 *
67 <!-- options-start -->
68 * Valid options are: <p/>
69 *
70 * <pre> -P &lt;start set&gt;
71 *  Specify a starting set of attributes.
72 *  Eg. 1,3,5-7.
73 *  If a start point is supplied,
74 *  random search evaluates the start
75 *  point and then randomly looks for
76 *  subsets that are as good as or better
77 *  than the start point with the same
78 *  or lower cardinality.</pre>
79 *
80 * <pre> -F &lt;percent&gt;
81 *  Percent of search space to consider.
82 *  (default = 25%).</pre>
83 *
84 * <pre> -V
85 *  Output subsets as the search progresses.
86 *  (default = false).</pre>
87 *
88 <!-- options-end -->
89 *
90 * @author Mark Hall (mhall@cs.waikato.ac.nz)
91 * @version $Revision: 1.18 $
92 */
93public class RandomSearch 
94  extends ASSearch
95  implements StartSetHandler, OptionHandler, TechnicalInformationHandler {
96
97  /** for serialization */
98  static final long serialVersionUID = 7479392617377425484L;
99 
100  /**
101   * holds a starting set as an array of attributes.
102   */
103  private int[] m_starting;
104 
105  /** holds the start set as a range */
106  private Range m_startRange;
107
108  /** the best feature set found during the search */
109  private BitSet m_bestGroup;
110
111  /** the merit of the best subset found */
112  private double m_bestMerit;
113
114  /**
115   * only accept a feature set as being "better" than the best if its
116   * merit is better or equal to the best, and it contains fewer
117   * features than the best (this allows LVF to be implimented).
118   */
119  private boolean m_onlyConsiderBetterAndSmaller;
120
121 /** does the data have a class */
122  private boolean m_hasClass;
123 
124  /** holds the class index */
125  private int m_classIndex;
126 
127  /** number of attributes in the data */
128  private int m_numAttribs;
129
130  /** seed for random number generation */
131  private int m_seed;
132
133  /** percentage of the search space to consider */
134  private double m_searchSize;
135
136  /** the number of iterations performed */
137  private int m_iterations;
138
139  /** random number object */
140  private Random m_random;
141
142  /** output new best subsets as the search progresses */
143  private boolean m_verbose;
144
145  /**
146   * Returns a string describing this search method
147   * @return a description of the search suitable for
148   * displaying in the explorer/experimenter gui
149   */
150  public String globalInfo() {
151    return "RandomSearch : \n\nPerforms a Random search in "
152      +"the space of attribute subsets. If no start set is supplied, Random "
153      +"search starts from a random point and reports the best subset found. "
154      +"If a start set is supplied, Random searches randomly for subsets "
155      +"that are as good or better than the start point with the same or "
156      +"or fewer attributes. Using RandomSearch in conjunction with a start "
157      +"set containing all attributes equates to the LVF algorithm of Liu "
158      +"and Setiono (ICML-96).\n\n"
159      + "For more information see:\n\n"
160      + getTechnicalInformation().toString();
161  }
162
163  /**
164   * Returns an instance of a TechnicalInformation object, containing
165   * detailed information about the technical background of this class,
166   * e.g., paper reference or book this class is based on.
167   *
168   * @return the technical information about this class
169   */
170  public TechnicalInformation getTechnicalInformation() {
171    TechnicalInformation        result;
172   
173    result = new TechnicalInformation(Type.INPROCEEDINGS);
174    result.setValue(Field.AUTHOR, "H. Liu and R. Setiono");
175    result.setValue(Field.TITLE, "A probabilistic approach to feature selection - A filter solution");
176    result.setValue(Field.BOOKTITLE, "13th International Conference on Machine Learning");
177    result.setValue(Field.YEAR, "1996");
178    result.setValue(Field.PAGES, "319-327");
179   
180    return result;
181  }
182
183  /**
184   * Constructor
185   */
186  public RandomSearch () {
187    resetOptions();
188  }
189
190  /**
191   * Returns an enumeration describing the available options.
192   * @return an enumeration of all the available options.
193   **/
194  public Enumeration listOptions () {
195    Vector newVector = new Vector(3);
196   
197    newVector.addElement(new Option("\tSpecify a starting set of attributes." 
198                                    + "\n\tEg. 1,3,5-7."
199                                    +"\n\tIf a start point is supplied,"
200                                    +"\n\trandom search evaluates the start"
201                                    +"\n\tpoint and then randomly looks for"
202                                    +"\n\tsubsets that are as good as or better"
203                                    +"\n\tthan the start point with the same"
204                                    +"\n\tor lower cardinality."
205                                    ,"P",1
206                                    , "-P <start set>"));
207
208    newVector.addElement(new Option("\tPercent of search space to consider."
209                                    +"\n\t(default = 25%)."
210                                    , "F", 1
211                                    , "-F <percent> "));
212    newVector.addElement(new Option("\tOutput subsets as the search progresses."
213                                    +"\n\t(default = false)."
214                                    , "V", 0
215                                    , "-V"));
216    return  newVector.elements();
217  }
218
219  /**
220   * Parses a given list of options. <p/>
221   *
222   <!-- options-start -->
223   * Valid options are: <p/>
224   *
225   * <pre> -P &lt;start set&gt;
226   *  Specify a starting set of attributes.
227   *  Eg. 1,3,5-7.
228   *  If a start point is supplied,
229   *  random search evaluates the start
230   *  point and then randomly looks for
231   *  subsets that are as good as or better
232   *  than the start point with the same
233   *  or lower cardinality.</pre>
234   *
235   * <pre> -F &lt;percent&gt;
236   *  Percent of search space to consider.
237   *  (default = 25%).</pre>
238   *
239   * <pre> -V
240   *  Output subsets as the search progresses.
241   *  (default = false).</pre>
242   *
243   <!-- options-end -->
244   *
245   * @param options the list of options as an array of strings
246   * @throws Exception if an option is not supported
247   *
248   **/
249  public void setOptions (String[] options)
250    throws Exception {
251    String optionString;
252    resetOptions();
253   
254    optionString = Utils.getOption('P', options);
255    if (optionString.length() != 0) {
256      setStartSet(optionString);
257    }
258
259    optionString = Utils.getOption('F',options);
260    if (optionString.length() != 0) {
261      setSearchPercent((new Double(optionString)).doubleValue());
262    }
263
264    setVerbose(Utils.getFlag('V',options));
265  }
266
267  /**
268   * Gets the current settings of RandomSearch.
269   * @return an array of strings suitable for passing to setOptions()
270   */
271  public String[] getOptions () {
272    String[] options = new String[5];
273    int current = 0;
274
275    if (m_verbose) {
276      options[current++] = "-V";
277    }
278
279    if (!(getStartSet().equals(""))) {
280      options[current++] = "-P";
281      options[current++] = "" + startSetToString();
282    }
283
284    options[current++] = "-F";
285    options[current++] = "" + getSearchPercent();
286
287    while (current < options.length) {
288      options[current++] = "";
289    }
290
291    return  options;
292  }
293
294  /**
295   * Returns the tip text for this property
296   * @return tip text for this property suitable for
297   * displaying in the explorer/experimenter gui
298   */
299  public String startSetTipText() {
300    return "Set the start point for the search. This is specified as a comma "
301      +"seperated list off attribute indexes starting at 1. It can include "
302      +"ranges. Eg. 1,2,5-9,17. If specified, Random searches for subsets "
303      +"of attributes that are as good as or better than the start set with "
304      +"the same or lower cardinality.";
305  }
306
307  /**
308   * Sets a starting set of attributes for the search. It is the
309   * search method's responsibility to report this start set (if any)
310   * in its toString() method.
311   * @param startSet a string containing a list of attributes (and or ranges),
312   * eg. 1,2,6,10-15. "" indicates no start point.
313   * If a start point is supplied, random search evaluates the
314   * start point and then looks for subsets that are as good as or better
315   * than the start point with the same or lower cardinality.
316   * @throws Exception if start set can't be set.
317   */
318  public void setStartSet (String startSet) throws Exception {
319    m_startRange.setRanges(startSet);
320  }
321
322  /**
323   * Returns a list of attributes (and or attribute ranges) as a String
324   * @return a list of attributes (and or attribute ranges)
325   */
326  public String getStartSet () {
327    return m_startRange.getRanges();
328  }
329
330  /**
331   * Returns the tip text for this property
332   * @return tip text for this property suitable for
333   * displaying in the explorer/experimenter gui
334   */
335  public String verboseTipText() {
336    return "Print progress information. Sends progress info to the terminal "
337      +"as the search progresses.";
338  }
339
340  /**
341   * set whether or not to output new best subsets as the search proceeds
342   * @param v true if output is to be verbose
343   */
344  public void setVerbose(boolean v) {
345    m_verbose = v;
346  }
347
348  /**
349   * get whether or not output is verbose
350   * @return true if output is set to verbose
351   */
352  public boolean getVerbose() {
353    return m_verbose;
354  }
355
356  /**
357   * Returns the tip text for this property
358   * @return tip text for this property suitable for
359   * displaying in the explorer/experimenter gui
360   */
361  public String searchPercentTipText() {
362    return "Percentage of the search space to explore.";
363  }
364
365  /**
366   * set the percentage of the search space to consider
367   * @param p percent of the search space ( 0 < p <= 100)
368   */
369  public void setSearchPercent(double p) {
370    p = Math.abs(p);
371    if (p == 0) {
372      p = 25;
373    }
374
375    if (p > 100.0) {
376      p = 100;
377    }
378
379    m_searchSize = (p/100.0);
380  }
381
382  /**
383   * get the percentage of the search space to consider
384   * @return the percent of the search space explored
385   */
386  public double getSearchPercent() {
387    return m_searchSize * 100;
388  }
389
390  /**
391   * converts the array of starting attributes to a string. This is
392   * used by getOptions to return the actual attributes specified
393   * as the starting set. This is better than using m_startRanges.getRanges()
394   * as the same start set can be specified in different ways from the
395   * command line---eg 1,2,3 == 1-3. This is to ensure that stuff that
396   * is stored in a database is comparable.
397   * @return a comma seperated list of individual attribute numbers as a String
398   */
399  private String startSetToString() {
400    StringBuffer FString = new StringBuffer();
401    boolean didPrint;
402   
403    if (m_starting == null) {
404      return getStartSet();
405    }
406
407    for (int i = 0; i < m_starting.length; i++) {
408      didPrint = false;
409     
410      if ((m_hasClass == false) || 
411          (m_hasClass == true && i != m_classIndex)) {
412        FString.append((m_starting[i] + 1));
413        didPrint = true;
414      }
415     
416      if (i == (m_starting.length - 1)) {
417        FString.append("");
418      }
419      else {
420        if (didPrint) {
421          FString.append(",");
422          }
423      }
424    }
425
426    return FString.toString();
427  }
428
429  /**
430   * prints a description of the search
431   * @return a description of the search as a string
432   */
433  public String toString() {
434    StringBuffer text = new StringBuffer();
435   
436    text.append("\tRandom search.\n\tStart set: ");
437    if (m_starting == null) {
438      text.append("no attributes\n");
439    }
440    else {
441      text.append(startSetToString()+"\n");
442    }
443    text.append("\tNumber of iterations: "+m_iterations+" ("
444                +(m_searchSize * 100.0)+"% of the search space)\n");
445    text.append("\tMerit of best subset found: "
446                +Utils.doubleToString(Math.abs(m_bestMerit),8,3)+"\n");
447
448    return text.toString();
449  }
450
451  /**
452   * Searches the attribute subset space randomly.
453   *
454   * @param ASEval the attribute evaluator to guide the search
455   * @param data the training instances.
456   * @return an array (not necessarily ordered) of selected attribute indexes
457   * @throws Exception if the search can't be completed
458   */
459   public int[] search (ASEvaluation ASEval, Instances data)
460     throws Exception {
461     double best_merit;
462     int sizeOfBest = m_numAttribs;
463     BitSet temp;
464     m_bestGroup = new BitSet(m_numAttribs);
465     
466     m_onlyConsiderBetterAndSmaller = false;
467     if (!(ASEval instanceof SubsetEvaluator)) {
468       throw  new Exception(ASEval.getClass().getName() 
469                            + " is not a " 
470                            + "Subset evaluator!");
471     }
472
473     m_random = new Random(m_seed);
474     
475     if (ASEval instanceof UnsupervisedSubsetEvaluator) {
476       m_hasClass = false;
477     }
478     else {
479       m_hasClass = true;
480       m_classIndex = data.classIndex();
481     }
482     
483     SubsetEvaluator ASEvaluator = (SubsetEvaluator)ASEval;
484     m_numAttribs = data.numAttributes();
485
486     m_startRange.setUpper(m_numAttribs-1);
487     if (!(getStartSet().equals(""))) {
488       m_starting = m_startRange.getSelection();
489     }
490
491     // If a starting subset has been supplied, then initialise the bitset
492     if (m_starting != null) {
493       for (int i = 0; i < m_starting.length; i++) {
494         if ((m_starting[i]) != m_classIndex) {
495           m_bestGroup.set(m_starting[i]);
496         }
497       }
498       m_onlyConsiderBetterAndSmaller = true;
499       best_merit = ASEvaluator.evaluateSubset(m_bestGroup);
500       sizeOfBest = countFeatures(m_bestGroup);
501     } else {
502       // do initial random subset
503       m_bestGroup = generateRandomSubset();
504       best_merit = ASEvaluator.evaluateSubset(m_bestGroup);
505     }
506     
507     if (m_verbose) {
508       System.out.println("Initial subset ("
509                          +Utils.doubleToString(Math.
510                                                abs(best_merit),8,5)
511                          +"): "+printSubset(m_bestGroup));
512     }
513
514     int i;
515     if (m_hasClass) {
516       i = m_numAttribs -1;
517     } else {
518       i = m_numAttribs;
519     }
520     m_iterations = (int)((m_searchSize * Math.pow(2, i)));
521     
522     int tempSize;
523     double tempMerit;
524     // main loop
525     for (i=0;i<m_iterations;i++) {
526       temp = generateRandomSubset();
527       if (m_onlyConsiderBetterAndSmaller) {
528         tempSize = countFeatures(temp);
529         if (tempSize <= sizeOfBest) {
530           tempMerit = ASEvaluator.evaluateSubset(temp);
531           if (tempMerit >= best_merit) {
532             sizeOfBest = tempSize;
533             m_bestGroup = temp;
534             best_merit = tempMerit;
535             if (m_verbose) {
536               System.out.print("New best subset ("
537                                  +Utils.doubleToString(Math.
538                                                        abs(best_merit),8,5)
539                                  +"): "+printSubset(m_bestGroup) + " :");
540               System.out.println(Utils.
541                                  doubleToString((((double)i)/
542                                                  ((double)m_iterations)*
543                                                  100.0),5,1)
544                                  +"% done");
545             }
546           }
547         }
548       } else {
549         tempMerit = ASEvaluator.evaluateSubset(temp);
550         if (tempMerit > best_merit) {
551           m_bestGroup = temp;
552           best_merit = tempMerit;
553           if (m_verbose) {
554             System.out.print("New best subset ("
555                                +Utils.doubleToString(Math.abs(best_merit),8,5)
556                                +"): "+printSubset(m_bestGroup) + " :");
557             System.out.println(Utils.
558                                doubleToString((((double)i)/
559                                                ((double)m_iterations)
560                                                *100.0),5,1)
561                                +"% done");
562           }
563         }
564       }
565     }
566     m_bestMerit = best_merit;
567     return attributeList(m_bestGroup);
568   }
569
570  /**
571   * prints a subset as a series of attribute numbers
572   * @param temp the subset to print
573   * @return a subset as a String of attribute numbers
574   */
575  private String printSubset(BitSet temp) {
576    StringBuffer text = new StringBuffer();
577
578    for (int j=0;j<m_numAttribs;j++) {
579      if (temp.get(j)) {
580        text.append((j+1)+" ");
581      }
582    }
583    return text.toString();
584  }
585
586  /**
587   * converts a BitSet into a list of attribute indexes
588   * @param group the BitSet to convert
589   * @return an array of attribute indexes
590   **/
591  private int[] attributeList (BitSet group) {
592    int count = 0;
593   
594    // count how many were selected
595    for (int i = 0; i < m_numAttribs; i++) {
596      if (group.get(i)) {
597        count++;
598      }
599    }
600   
601    int[] list = new int[count];
602    count = 0;
603   
604    for (int i = 0; i < m_numAttribs; i++) {
605      if (group.get(i)) {
606        list[count++] = i;
607      }
608    }
609   
610    return  list;
611  }
612
613  /**
614   * generates a random subset
615   * @return a random subset as a BitSet
616   */
617  private BitSet generateRandomSubset() {
618    BitSet temp = new BitSet(m_numAttribs);
619    double r;
620
621    for (int i=0;i<m_numAttribs;i++) {
622      r = m_random.nextDouble();
623      if (r <= 0.5) {
624        if (m_hasClass && i == m_classIndex) {
625        } else {
626          temp.set(i);
627        }
628      }
629    }
630    return temp;
631  }
632
633  /**
634   * counts the number of features in a subset
635   * @param featureSet the feature set for which to count the features
636   * @return the number of features in the subset
637   */
638  private int countFeatures(BitSet featureSet) {
639    int count = 0;
640    for (int i=0;i<m_numAttribs;i++) {
641      if (featureSet.get(i)) {
642        count++;
643      }
644    }
645    return count;
646  }
647
648  /**
649   * resets to defaults
650   */
651  private void resetOptions() {
652    m_starting = null;
653    m_startRange = new Range();
654    m_searchSize = 0.25;
655    m_seed = 1;
656    m_onlyConsiderBetterAndSmaller = false;
657    m_verbose = false;
658  }
659 
660  /**
661   * Returns the revision string.
662   *
663   * @return            the revision
664   */
665  public String getRevision() {
666    return RevisionUtils.extract("$Revision: 1.18 $");
667  }
668}
Note: See TracBrowser for help on using the repository browser.