source: src/main/java/weka/classifiers/meta/RandomSubSpace.java @ 24

Last change on this file since 24 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 16.5 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RandomSubSpace.java
19 *    Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.classifiers.meta;
24
25import weka.filters.unsupervised.attribute.Remove;
26import weka.classifiers.Classifier;
27import weka.classifiers.AbstractClassifier;
28import weka.classifiers.RandomizableIteratedSingleClassifierEnhancer;
29import weka.classifiers.RandomizableParallelIteratedSingleClassifierEnhancer;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.core.Option;
33import weka.core.Randomizable;
34import weka.core.RevisionUtils;
35import weka.core.TechnicalInformation;
36import weka.core.TechnicalInformationHandler;
37import weka.core.Utils;
38import weka.core.WeightedInstancesHandler;
39import weka.core.TechnicalInformation.Field;
40import weka.core.TechnicalInformation.Type;
41
42import java.util.Enumeration;
43import java.util.Random;
44import java.util.Vector;
45import java.util.Arrays;
46import java.util.Collections;
47
48/**
49 <!-- globalinfo-start -->
50 * This method constructs a decision tree based classifier that maintains highest accuracy on training data and improves on generalization accuracy as it grows in complexity. The classifier consists of multiple trees constructed systematically by pseudorandomly selecting subsets of components of the feature vector, that is, trees constructed in randomly chosen subspaces.<br/>
51 * <br/>
52 * For more information, see<br/>
53 * <br/>
54 * Tin Kam Ho (1998). The Random Subspace Method for Constructing Decision Forests. IEEE Transactions on Pattern Analysis and Machine Intelligence. 20(8):832-844. URL http://citeseer.ist.psu.edu/ho98random.html.
55 * <p/>
56 <!-- globalinfo-end -->
57 *
58 <!-- technical-bibtex-start -->
59 * BibTeX:
60 * <pre>
61 * &#64;article{Ho1998,
62 *    author = {Tin Kam Ho},
63 *    journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
64 *    number = {8},
65 *    pages = {832-844},
66 *    title = {The Random Subspace Method for Constructing Decision Forests},
67 *    volume = {20},
68 *    year = {1998},
69 *    ISSN = {0162-8828},
70 *    URL = {http://citeseer.ist.psu.edu/ho98random.html}
71 * }
72 * </pre>
73 * <p/>
74 <!-- technical-bibtex-end -->
75 *
76 <!-- options-start -->
77 * Valid options are: <p/>
78 *
79 * <pre> -P
80 *  Size of each subspace:
81 *   &lt; 1: percentage of the number of attributes
82 *   &gt;=1: absolute number of attributes
83 * </pre>
84 *
85 * <pre> -S &lt;num&gt;
86 *  Random number seed.
87 *  (default 1)</pre>
88 *
89 * <pre> -I &lt;num&gt;
90 *  Number of iterations.
91 *  (default 10)</pre>
92 *
93 * <pre> -D
94 *  If set, classifier is run in debug mode and
95 *  may output additional info to the console</pre>
96 *
97 * <pre> -W
98 *  Full name of base classifier.
99 *  (default: weka.classifiers.trees.REPTree)</pre>
100 *
101 * <pre>
102 * Options specific to classifier weka.classifiers.trees.REPTree:
103 * </pre>
104 *
105 * <pre> -M &lt;minimum number of instances&gt;
106 *  Set minimum number of instances per leaf (default 2).</pre>
107 *
108 * <pre> -V &lt;minimum variance for split&gt;
109 *  Set minimum numeric class variance proportion
110 *  of train variance for split (default 1e-3).</pre>
111 *
112 * <pre> -N &lt;number of folds&gt;
113 *  Number of folds for reduced error pruning (default 3).</pre>
114 *
115 * <pre> -S &lt;seed&gt;
116 *  Seed for random data shuffling (default 1).</pre>
117 *
118 * <pre> -P
119 *  No pruning.</pre>
120 *
121 * <pre> -L
122 *  Maximum tree depth (default -1, no maximum)</pre>
123 *
124 <!-- options-end -->
125 *
126 * Options after -- are passed to the designated classifier.<p>
127 *
128 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
129 * @author Peter Reutemann (fracpete@cs.waikato.ac.nz)
130 * @version $Revision: 5928 $
131 */
132public class RandomSubSpace
133  extends RandomizableParallelIteratedSingleClassifierEnhancer
134  implements WeightedInstancesHandler, TechnicalInformationHandler {
135
136  /** for serialization */
137  private static final long serialVersionUID = 1278172513912424947L;
138 
139  /** The size of each bag sample, as a percentage of the training size */
140  protected double m_SubSpaceSize = 0.5;
141
142  /** a ZeroR model in case no model can be built from the data */
143  protected Classifier m_ZeroR;
144 
145  /** Training data */
146  protected Instances m_data;
147   
148  /**
149   * Constructor.
150   */
151  public RandomSubSpace() {
152    super();
153   
154    m_Classifier = new weka.classifiers.trees.REPTree();
155  }
156 
157  /**
158   * Returns a string describing classifier
159   *
160   * @return            a description suitable for
161   *                    displaying in the explorer/experimenter gui
162   */
163  public String globalInfo() {
164    return 
165        "This method constructs a decision tree based classifier that "
166      + "maintains highest accuracy on training data and improves on "
167      + "generalization accuracy as it grows in complexity. The classifier "
168      + "consists of multiple trees constructed systematically by "
169      + "pseudorandomly selecting subsets of components of the feature vector, "
170      + "that is, trees constructed in randomly chosen subspaces.\n\n"
171      + "For more information, see\n\n"
172      + getTechnicalInformation().toString();
173  }
174
175  /**
176   * Returns an instance of a TechnicalInformation object, containing
177   * detailed information about the technical background of this class,
178   * e.g., paper reference or book this class is based on.
179   *
180   * @return            the technical information about this class
181   */
182  public TechnicalInformation getTechnicalInformation() {
183    TechnicalInformation        result;
184   
185    result = new TechnicalInformation(Type.ARTICLE);
186    result.setValue(Field.AUTHOR, "Tin Kam Ho");
187    result.setValue(Field.YEAR, "1998");
188    result.setValue(Field.TITLE, "The Random Subspace Method for Constructing Decision Forests");
189    result.setValue(Field.JOURNAL, "IEEE Transactions on Pattern Analysis and Machine Intelligence");
190    result.setValue(Field.VOLUME, "20");
191    result.setValue(Field.NUMBER, "8");
192    result.setValue(Field.PAGES, "832-844");
193    result.setValue(Field.URL, "http://citeseer.ist.psu.edu/ho98random.html");
194    result.setValue(Field.ISSN, "0162-8828");
195   
196    return result;
197  }
198
199  /**
200   * String describing default classifier.
201   *
202   * @return            the default classifier classname
203   */
204  protected String defaultClassifierString() {
205    return "weka.classifiers.trees.REPTree";
206  }
207
208  /**
209   * Returns an enumeration describing the available options.
210   *
211   * @return            an enumeration of all the available options.
212   */
213  public Enumeration listOptions() {
214    Vector result = new Vector();
215
216    result.addElement(new Option(
217        "\tSize of each subspace:\n"
218        + "\t\t< 1: percentage of the number of attributes\n"
219        + "\t\t>=1: absolute number of attributes\n",
220        "P", 1, "-P"));
221
222    Enumeration enu = super.listOptions();
223    while (enu.hasMoreElements()) {
224      result.addElement(enu.nextElement());
225    }
226   
227    return result.elements();
228  }
229
230  /**
231   * Parses a given list of options. <p/>
232   *
233   <!-- options-start -->
234   * Valid options are: <p/>
235   *
236   * <pre> -P
237   *  Size of each subspace:
238   *   &lt; 1: percentage of the number of attributes
239   *   &gt;=1: absolute number of attributes
240   * </pre>
241   *
242   * <pre> -S &lt;num&gt;
243   *  Random number seed.
244   *  (default 1)</pre>
245   *
246   * <pre> -I &lt;num&gt;
247   *  Number of iterations.
248   *  (default 10)</pre>
249   *
250   * <pre> -D
251   *  If set, classifier is run in debug mode and
252   *  may output additional info to the console</pre>
253   *
254   * <pre> -W
255   *  Full name of base classifier.
256   *  (default: weka.classifiers.trees.REPTree)</pre>
257   *
258   * <pre>
259   * Options specific to classifier weka.classifiers.trees.REPTree:
260   * </pre>
261   *
262   * <pre> -M &lt;minimum number of instances&gt;
263   *  Set minimum number of instances per leaf (default 2).</pre>
264   *
265   * <pre> -V &lt;minimum variance for split&gt;
266   *  Set minimum numeric class variance proportion
267   *  of train variance for split (default 1e-3).</pre>
268   *
269   * <pre> -N &lt;number of folds&gt;
270   *  Number of folds for reduced error pruning (default 3).</pre>
271   *
272   * <pre> -S &lt;seed&gt;
273   *  Seed for random data shuffling (default 1).</pre>
274   *
275   * <pre> -P
276   *  No pruning.</pre>
277   *
278   * <pre> -L
279   *  Maximum tree depth (default -1, no maximum)</pre>
280   *
281   <!-- options-end -->
282   *
283   * Options after -- are passed to the designated classifier.<p>
284   *
285   * @param options     the list of options as an array of strings
286   * @throws Exception  if an option is not supported
287   */
288  public void setOptions(String[] options) throws Exception {
289    String tmpStr;
290   
291    tmpStr = Utils.getOption('P', options);
292    if (tmpStr.length() != 0)
293      setSubSpaceSize(Double.parseDouble(tmpStr));
294    else
295      setSubSpaceSize(0.5);
296
297    super.setOptions(options);
298  }
299
300  /**
301   * Gets the current settings of the Classifier.
302   *
303   * @return            an array of strings suitable for passing to setOptions
304   */
305  public String [] getOptions() {
306    Vector        result;
307    String[]      options;
308    int           i;
309   
310    result  = new Vector();
311
312    result.add("-P");
313    result.add("" + getSubSpaceSize());
314   
315    options = super.getOptions();
316    for (i = 0; i < options.length; i++)
317      result.add(options[i]);
318
319    return (String[]) result.toArray(new String[result.size()]);
320  }
321
322  /**
323   * Returns the tip text for this property
324   *
325   * @return            tip text for this property suitable for
326   *                    displaying in the explorer/experimenter gui
327   */
328  public String subSpaceSizeTipText() {
329    return 
330        "Size of each subSpace: if less than 1 as a percentage of the "
331      + "number of attributes, otherwise the absolute number of attributes.";
332  }
333
334  /**
335   * Gets the size of each subSpace, as a percentage of the training set size.
336   *
337   * @return            the subSpace size, as a percentage.
338   */
339  public double getSubSpaceSize() {
340    return m_SubSpaceSize;
341  }
342 
343  /**
344   * Sets the size of each subSpace, as a percentage of the training set size.
345   *
346   * @param value       the subSpace size, as a percentage.
347   */
348  public void setSubSpaceSize(double value) {
349    m_SubSpaceSize = value;
350  }
351
352  /**
353   * calculates the number of attributes
354   *
355   * @param total       the available number of attributes
356   * @param fraction    the fraction - if less than 1 it represents the
357   *                    percentage, otherwise the absolute number of attributes
358   * @return            the number of attributes to use
359   */
360  protected int numberOfAttributes(int total, double fraction) {
361    int k = (int) Math.round((fraction < 1.0) ? total*fraction : fraction);
362   
363    if (k > total)
364      k = total;
365    if (k < 1)
366      k = 1;
367
368    return k;
369  }
370
371  /**
372   * generates an index string describing a random subspace, suitable for
373   * the Remove filter.
374   *
375   * @param indices             the attribute indices
376   * @param subSpaceSize        the size of the subspace
377   * @param classIndex          the class index
378   * @param random              the random number generator
379   * @return                    the generated string describing the subspace
380   */
381  protected String randomSubSpace(Integer[] indices, int subSpaceSize, int classIndex, Random random) {
382    Collections.shuffle(Arrays.asList(indices), random);
383    StringBuffer sb = new StringBuffer("");
384    for(int i = 0; i < subSpaceSize; i++) {
385      sb.append(indices[i]+",");
386    }
387    sb.append(classIndex);
388   
389    if (getDebug())
390      System.out.println("subSPACE = " + sb);
391
392    return sb.toString();
393  }
394
395  /**
396   * builds the classifier.
397   *
398   * @param data        the training data to be used for generating the
399   *                    classifier.
400   * @throws Exception  if the classifier could not be built successfully
401   */
402  public void buildClassifier(Instances data) throws Exception {
403
404    // can classifier handle the data?
405    getCapabilities().testWithFail(data);
406
407    // remove instances with missing class
408    m_data = new Instances(data);
409    m_data.deleteWithMissingClass();
410   
411    // only class? -> build ZeroR model
412    if (m_data.numAttributes() == 1) {
413      System.err.println(
414          "Cannot build model (only class attribute present in data!), "
415          + "using ZeroR model instead!");
416      m_ZeroR = new weka.classifiers.rules.ZeroR();
417      m_ZeroR.buildClassifier(m_data);
418      return;
419    }
420    else {
421      m_ZeroR = null;
422    }
423   
424    super.buildClassifier(data);
425
426    Integer[] indices = new Integer[data.numAttributes()-1];
427    int classIndex = data.classIndex();
428    int offset = 0;
429    for(int i = 0; i < indices.length+1; i++) {
430      if (i != classIndex) {
431        indices[offset++] = i+1;
432      }
433    }
434    int subSpaceSize = numberOfAttributes(indices.length, getSubSpaceSize());
435    Random random = data.getRandomNumberGenerator(m_Seed);
436   
437    for (int j = 0; j < m_Classifiers.length; j++) {
438      if (m_Classifier instanceof Randomizable) {
439        ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
440      }
441      FilteredClassifier fc = new FilteredClassifier();
442      fc.setClassifier(m_Classifiers[j]);
443      m_Classifiers[j] = fc;
444      Remove rm = new Remove();
445      rm.setOptions(new String[]{"-V", "-R", randomSubSpace(indices,subSpaceSize,classIndex+1,random)});
446      fc.setFilter(rm);
447
448      // build the classifier
449      //m_Classifiers[j].buildClassifier(m_data);
450    }
451   
452    buildClassifiers();
453   
454    // save memory
455    m_data = null;
456  }
457 
458  /**
459   * Returns a training set for a particular iteration.
460   *
461   * @param iteration the number of the iteration for the requested training set.
462   * @return the training set for the supplied iteration number
463   * @throws Exception if something goes wrong when generating a training set.
464   */
465  protected synchronized Instances getTrainingSet(int iteration) throws Exception {
466   
467    // We don't manipulate the training data in any way. The FilteredClassifiers
468    // take care of generating the sub-spaces.
469    return m_data;
470  }
471
472  /**
473   * Calculates the class membership probabilities for the given test
474   * instance.
475   *
476   * @param instance    the instance to be classified
477   * @return            preedicted class probability distribution
478   * @throws Exception  if distribution can't be computed successfully
479   */
480  public double[] distributionForInstance(Instance instance) throws Exception {
481
482    // default model?
483    if (m_ZeroR != null) {
484      return m_ZeroR.distributionForInstance(instance);
485    }
486   
487    double[] sums = new double [instance.numClasses()], newProbs; 
488   
489    for (int i = 0; i < m_NumIterations; i++) {
490      if (instance.classAttribute().isNumeric() == true) {
491        sums[0] += m_Classifiers[i].classifyInstance(instance);
492      } else {
493        newProbs = m_Classifiers[i].distributionForInstance(instance);
494        for (int j = 0; j < newProbs.length; j++)
495          sums[j] += newProbs[j];
496      }
497    }
498    if (instance.classAttribute().isNumeric() == true) {
499      sums[0] /= (double)m_NumIterations;
500      return sums;
501    } else if (Utils.eq(Utils.sum(sums), 0)) {
502      return sums;
503    } else {
504      Utils.normalize(sums);
505      return sums;
506    }
507  }
508
509  /**
510   * Returns description of the bagged classifier.
511   *
512   * @return            description of the bagged classifier as a string
513   */
514  public String toString() {
515   
516    // only ZeroR model?
517    if (m_ZeroR != null) {
518      StringBuffer buf = new StringBuffer();
519      buf.append(this.getClass().getName().replaceAll(".*\\.", "") + "\n");
520      buf.append(this.getClass().getName().replaceAll(".*\\.", "").replaceAll(".", "=") + "\n\n");
521      buf.append("Warning: No model could be built, hence ZeroR model is used:\n\n");
522      buf.append(m_ZeroR.toString());
523      return buf.toString();
524    }
525   
526    if (m_Classifiers == null) {
527      return "RandomSubSpace: No model built yet.";
528    }
529    StringBuffer text = new StringBuffer();
530    text.append("All the base classifiers: \n\n");
531    for (int i = 0; i < m_Classifiers.length; i++)
532      text.append(m_Classifiers[i].toString() + "\n\n");
533
534    return text.toString();
535  }
536 
537  /**
538   * Returns the revision string.
539   *
540   * @return            the revision
541   */
542  public String getRevision() {
543    return RevisionUtils.extract("$Revision: 5928 $");
544  }
545
546  /**
547   * Main method for testing this class.
548   *
549   * @param args        the options
550   */
551  public static void main(String[] args) {
552    runClassifier(new RandomSubSpace(), args);
553  }
554}
Note: See TracBrowser for help on using the repository browser.