Context Navigation

source: src/main/java/weka/classifiers/meta/RandomSubSpace.java @ 16

Last change on this file since 16 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 16.5 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* RandomSubSpace.java
19	* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers.meta;
24
25	import weka.filters.unsupervised.attribute.Remove;
26	import weka.classifiers.Classifier;
27	import weka.classifiers.AbstractClassifier;
28	import weka.classifiers.RandomizableIteratedSingleClassifierEnhancer;
29	import weka.classifiers.RandomizableParallelIteratedSingleClassifierEnhancer;
30	import weka.core.Instance;
31	import weka.core.Instances;
32	import weka.core.Option;
33	import weka.core.Randomizable;
34	import weka.core.RevisionUtils;
35	import weka.core.TechnicalInformation;
36	import weka.core.TechnicalInformationHandler;
37	import weka.core.Utils;
38	import weka.core.WeightedInstancesHandler;
39	import weka.core.TechnicalInformation.Field;
40	import weka.core.TechnicalInformation.Type;
41
42	import java.util.Enumeration;
43	import java.util.Random;
44	import java.util.Vector;
45	import java.util.Arrays;
46	import java.util.Collections;
47
48	/**
49	<!-- globalinfo-start -->
50	* This method constructs a decision tree based classifier that maintains highest accuracy on training data and improves on generalization accuracy as it grows in complexity. The classifier consists of multiple trees constructed systematically by pseudorandomly selecting subsets of components of the feature vector, that is, trees constructed in randomly chosen subspaces.<br/>
51	* <br/>
52	* For more information, see<br/>
53	* <br/>
54	* Tin Kam Ho (1998). The Random Subspace Method for Constructing Decision Forests. IEEE Transactions on Pattern Analysis and Machine Intelligence. 20(8):832-844. URL http://citeseer.ist.psu.edu/ho98random.html.
55	* <p/>
56	<!-- globalinfo-end -->
57	*
58	<!-- technical-bibtex-start -->
59	* BibTeX:
60	* <pre>
61	* @article{Ho1998,
62	* author = {Tin Kam Ho},
63	* journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
64	* number = {8},
65	* pages = {832-844},
66	* title = {The Random Subspace Method for Constructing Decision Forests},
67	* volume = {20},
68	* year = {1998},
69	* ISSN = {0162-8828},
70	* URL = {http://citeseer.ist.psu.edu/ho98random.html}
71	* }
72	* </pre>
73	* <p/>
74	<!-- technical-bibtex-end -->
75	*
76	<!-- options-start -->
77	* Valid options are: <p/>
78	*
79	* <pre> -P
80	* Size of each subspace:
81	* < 1: percentage of the number of attributes
82	* >=1: absolute number of attributes
83	* </pre>
84	*
85	* <pre> -S <num>
86	* Random number seed.
87	* (default 1)</pre>
88	*
89	* <pre> -I <num>
90	* Number of iterations.
91	* (default 10)</pre>
92	*
93	* <pre> -D
94	* If set, classifier is run in debug mode and
95	* may output additional info to the console</pre>
96	*
97	* <pre> -W
98	* Full name of base classifier.
99	* (default: weka.classifiers.trees.REPTree)</pre>
100	*
101	* <pre>
102	* Options specific to classifier weka.classifiers.trees.REPTree:
103	* </pre>
104	*
105	* <pre> -M <minimum number of instances>
106	* Set minimum number of instances per leaf (default 2).</pre>
107	*
108	* <pre> -V <minimum variance for split>
109	* Set minimum numeric class variance proportion
110	* of train variance for split (default 1e-3).</pre>
111	*
112	* <pre> -N <number of folds>
113	* Number of folds for reduced error pruning (default 3).</pre>
114	*
115	* <pre> -S <seed>
116	* Seed for random data shuffling (default 1).</pre>
117	*
118	* <pre> -P
119	* No pruning.</pre>
120	*
121	* <pre> -L
122	* Maximum tree depth (default -1, no maximum)</pre>
123	*
124	<!-- options-end -->
125	*
126	* Options after -- are passed to the designated classifier.<p>
127	*
128	* @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
129	* @author Peter Reutemann (fracpete@cs.waikato.ac.nz)
130	* @version $Revision: 5928 $
131	*/
132	public class RandomSubSpace
133	extends RandomizableParallelIteratedSingleClassifierEnhancer
134	implements WeightedInstancesHandler, TechnicalInformationHandler {
135
136	/** for serialization */
137	private static final long serialVersionUID = 1278172513912424947L;
138
139	/** The size of each bag sample, as a percentage of the training size */
140	protected double m_SubSpaceSize = 0.5;
141
142	/** a ZeroR model in case no model can be built from the data */
143	protected Classifier m_ZeroR;
144
145	/** Training data */
146	protected Instances m_data;
147
148	/**
149	* Constructor.
150	*/
151	public RandomSubSpace() {
152	super();
153
154	m_Classifier = new weka.classifiers.trees.REPTree();
155	}
156
157	/**
158	* Returns a string describing classifier
159	*
160	* @return a description suitable for
161	* displaying in the explorer/experimenter gui
162	*/
163	public String globalInfo() {
164	return
165	"This method constructs a decision tree based classifier that "
166	+ "maintains highest accuracy on training data and improves on "
167	+ "generalization accuracy as it grows in complexity. The classifier "
168	+ "consists of multiple trees constructed systematically by "
169	+ "pseudorandomly selecting subsets of components of the feature vector, "
170	+ "that is, trees constructed in randomly chosen subspaces.\n\n"
171	+ "For more information, see\n\n"
172	+ getTechnicalInformation().toString();
173	}
174
175	/**
176	* Returns an instance of a TechnicalInformation object, containing
177	* detailed information about the technical background of this class,
178	* e.g., paper reference or book this class is based on.
179	*
180	* @return the technical information about this class
181	*/
182	public TechnicalInformation getTechnicalInformation() {
183	TechnicalInformation result;
184
185	result = new TechnicalInformation(Type.ARTICLE);
186	result.setValue(Field.AUTHOR, "Tin Kam Ho");
187	result.setValue(Field.YEAR, "1998");
188	result.setValue(Field.TITLE, "The Random Subspace Method for Constructing Decision Forests");
189	result.setValue(Field.JOURNAL, "IEEE Transactions on Pattern Analysis and Machine Intelligence");
190	result.setValue(Field.VOLUME, "20");
191	result.setValue(Field.NUMBER, "8");
192	result.setValue(Field.PAGES, "832-844");
193	result.setValue(Field.URL, "http://citeseer.ist.psu.edu/ho98random.html");
194	result.setValue(Field.ISSN, "0162-8828");
195
196	return result;
197	}
198
199	/**
200	* String describing default classifier.
201	*
202	* @return the default classifier classname
203	*/
204	protected String defaultClassifierString() {
205	return "weka.classifiers.trees.REPTree";
206	}
207
208	/**
209	* Returns an enumeration describing the available options.
210	*
211	* @return an enumeration of all the available options.
212	*/
213	public Enumeration listOptions() {
214	Vector result = new Vector();
215
216	result.addElement(new Option(
217	"\tSize of each subspace:\n"
218	+ "\t\t< 1: percentage of the number of attributes\n"
219	+ "\t\t>=1: absolute number of attributes\n",
220	"P", 1, "-P"));
221
222	Enumeration enu = super.listOptions();
223	while (enu.hasMoreElements()) {
224	result.addElement(enu.nextElement());
225	}
226
227	return result.elements();
228	}
229
230	/**
231	* Parses a given list of options. <p/>
232	*
233	<!-- options-start -->
234	* Valid options are: <p/>
235	*
236	* <pre> -P
237	* Size of each subspace:
238	* < 1: percentage of the number of attributes
239	* >=1: absolute number of attributes
240	* </pre>
241	*
242	* <pre> -S <num>
243	* Random number seed.
244	* (default 1)</pre>
245	*
246	* <pre> -I <num>
247	* Number of iterations.
248	* (default 10)</pre>
249	*
250	* <pre> -D
251	* If set, classifier is run in debug mode and
252	* may output additional info to the console</pre>
253	*
254	* <pre> -W
255	* Full name of base classifier.
256	* (default: weka.classifiers.trees.REPTree)</pre>
257	*
258	* <pre>
259	* Options specific to classifier weka.classifiers.trees.REPTree:
260	* </pre>
261	*
262	* <pre> -M <minimum number of instances>
263	* Set minimum number of instances per leaf (default 2).</pre>
264	*
265	* <pre> -V <minimum variance for split>
266	* Set minimum numeric class variance proportion
267	* of train variance for split (default 1e-3).</pre>
268	*
269	* <pre> -N <number of folds>
270	* Number of folds for reduced error pruning (default 3).</pre>
271	*
272	* <pre> -S <seed>
273	* Seed for random data shuffling (default 1).</pre>
274	*
275	* <pre> -P
276	* No pruning.</pre>
277	*
278	* <pre> -L
279	* Maximum tree depth (default -1, no maximum)</pre>
280	*
281	<!-- options-end -->
282	*
283	* Options after -- are passed to the designated classifier.<p>
284	*
285	* @param options the list of options as an array of strings
286	* @throws Exception if an option is not supported
287	*/
288	public void setOptions(String[] options) throws Exception {
289	String tmpStr;
290
291	tmpStr = Utils.getOption('P', options);
292	if (tmpStr.length() != 0)
293	setSubSpaceSize(Double.parseDouble(tmpStr));
294	else
295	setSubSpaceSize(0.5);
296
297	super.setOptions(options);
298	}
299
300	/**
301	* Gets the current settings of the Classifier.
302	*
303	* @return an array of strings suitable for passing to setOptions
304	*/
305	public String [] getOptions() {
306	Vector result;
307	String[] options;
308	int i;
309
310	result = new Vector();
311
312	result.add("-P");
313	result.add("" + getSubSpaceSize());
314
315	options = super.getOptions();
316	for (i = 0; i < options.length; i++)
317	result.add(options[i]);
318
319	return (String[]) result.toArray(new String[result.size()]);
320	}
321
322	/**
323	* Returns the tip text for this property
324	*
325	* @return tip text for this property suitable for
326	* displaying in the explorer/experimenter gui
327	*/
328	public String subSpaceSizeTipText() {
329	return
330	"Size of each subSpace: if less than 1 as a percentage of the "
331	+ "number of attributes, otherwise the absolute number of attributes.";
332	}
333
334	/**
335	* Gets the size of each subSpace, as a percentage of the training set size.
336	*
337	* @return the subSpace size, as a percentage.
338	*/
339	public double getSubSpaceSize() {
340	return m_SubSpaceSize;
341	}
342
343	/**
344	* Sets the size of each subSpace, as a percentage of the training set size.
345	*
346	* @param value the subSpace size, as a percentage.
347	*/
348	public void setSubSpaceSize(double value) {
349	m_SubSpaceSize = value;
350	}
351
352	/**
353	* calculates the number of attributes
354	*
355	* @param total the available number of attributes
356	* @param fraction the fraction - if less than 1 it represents the
357	* percentage, otherwise the absolute number of attributes
358	* @return the number of attributes to use
359	*/
360	protected int numberOfAttributes(int total, double fraction) {
361	int k = (int) Math.round((fraction < 1.0) ? total*fraction : fraction);
362
363	if (k > total)
364	k = total;
365	if (k < 1)
366	k = 1;
367
368	return k;
369	}
370
371	/**
372	* generates an index string describing a random subspace, suitable for
373	* the Remove filter.
374	*
375	* @param indices the attribute indices
376	* @param subSpaceSize the size of the subspace
377	* @param classIndex the class index
378	* @param random the random number generator
379	* @return the generated string describing the subspace
380	*/
381	protected String randomSubSpace(Integer[] indices, int subSpaceSize, int classIndex, Random random) {
382	Collections.shuffle(Arrays.asList(indices), random);
383	StringBuffer sb = new StringBuffer("");
384	for(int i = 0; i < subSpaceSize; i++) {
385	sb.append(indices[i]+",");
386	}
387	sb.append(classIndex);
388
389	if (getDebug())
390	System.out.println("subSPACE = " + sb);
391
392	return sb.toString();
393	}
394
395	/**
396	* builds the classifier.
397	*
398	* @param data the training data to be used for generating the
399	* classifier.
400	* @throws Exception if the classifier could not be built successfully
401	*/
402	public void buildClassifier(Instances data) throws Exception {
403
404	// can classifier handle the data?
405	getCapabilities().testWithFail(data);
406
407	// remove instances with missing class
408	m_data = new Instances(data);
409	m_data.deleteWithMissingClass();
410
411	// only class? -> build ZeroR model
412	if (m_data.numAttributes() == 1) {
413	System.err.println(
414	"Cannot build model (only class attribute present in data!), "
415	+ "using ZeroR model instead!");
416	m_ZeroR = new weka.classifiers.rules.ZeroR();
417	m_ZeroR.buildClassifier(m_data);
418	return;
419	}
420	else {
421	m_ZeroR = null;
422	}
423
424	super.buildClassifier(data);
425
426	Integer[] indices = new Integer[data.numAttributes()-1];
427	int classIndex = data.classIndex();
428	int offset = 0;
429	for(int i = 0; i < indices.length+1; i++) {
430	if (i != classIndex) {
431	indices[offset++] = i+1;
432	}
433	}
434	int subSpaceSize = numberOfAttributes(indices.length, getSubSpaceSize());
435	Random random = data.getRandomNumberGenerator(m_Seed);
436
437	for (int j = 0; j < m_Classifiers.length; j++) {
438	if (m_Classifier instanceof Randomizable) {
439	((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
440	}
441	FilteredClassifier fc = new FilteredClassifier();
442	fc.setClassifier(m_Classifiers[j]);
443	m_Classifiers[j] = fc;
444	Remove rm = new Remove();
445	rm.setOptions(new String[]{"-V", "-R", randomSubSpace(indices,subSpaceSize,classIndex+1,random)});
446	fc.setFilter(rm);
447
448	// build the classifier
449	//m_Classifiers[j].buildClassifier(m_data);
450	}
451
452	buildClassifiers();
453
454	// save memory
455	m_data = null;
456	}
457
458	/**
459	* Returns a training set for a particular iteration.
460	*
461	* @param iteration the number of the iteration for the requested training set.
462	* @return the training set for the supplied iteration number
463	* @throws Exception if something goes wrong when generating a training set.
464	*/
465	protected synchronized Instances getTrainingSet(int iteration) throws Exception {
466
467	// We don't manipulate the training data in any way. The FilteredClassifiers
468	// take care of generating the sub-spaces.
469	return m_data;
470	}
471
472	/**
473	* Calculates the class membership probabilities for the given test
474	* instance.
475	*
476	* @param instance the instance to be classified
477	* @return preedicted class probability distribution
478	* @throws Exception if distribution can't be computed successfully
479	*/
480	public double[] distributionForInstance(Instance instance) throws Exception {
481
482	// default model?
483	if (m_ZeroR != null) {
484	return m_ZeroR.distributionForInstance(instance);
485	}
486
487	double[] sums = new double [instance.numClasses()], newProbs;
488
489	for (int i = 0; i < m_NumIterations; i++) {
490	if (instance.classAttribute().isNumeric() == true) {
491	sums[0] += m_Classifiers[i].classifyInstance(instance);
492	} else {
493	newProbs = m_Classifiers[i].distributionForInstance(instance);
494	for (int j = 0; j < newProbs.length; j++)
495	sums[j] += newProbs[j];
496	}
497	}
498	if (instance.classAttribute().isNumeric() == true) {
499	sums[0] /= (double)m_NumIterations;
500	return sums;
501	} else if (Utils.eq(Utils.sum(sums), 0)) {
502	return sums;
503	} else {
504	Utils.normalize(sums);
505	return sums;
506	}
507	}
508
509	/**
510	* Returns description of the bagged classifier.
511	*
512	* @return description of the bagged classifier as a string
513	*/
514	public String toString() {
515
516	// only ZeroR model?
517	if (m_ZeroR != null) {
518	StringBuffer buf = new StringBuffer();
519	buf.append(this.getClass().getName().replaceAll(".*\\.", "") + "\n");
520	buf.append(this.getClass().getName().replaceAll(".*\\.", "").replaceAll(".", "=") + "\n\n");
521	buf.append("Warning: No model could be built, hence ZeroR model is used:\n\n");
522	buf.append(m_ZeroR.toString());
523	return buf.toString();
524	}
525
526	if (m_Classifiers == null) {
527	return "RandomSubSpace: No model built yet.";
528	}
529	StringBuffer text = new StringBuffer();
530	text.append("All the base classifiers: \n\n");
531	for (int i = 0; i < m_Classifiers.length; i++)
532	text.append(m_Classifiers[i].toString() + "\n\n");
533
534	return text.toString();
535	}
536
537	/**
538	* Returns the revision string.
539	*
540	* @return the revision
541	*/
542	public String getRevision() {
543	return RevisionUtils.extract("$Revision: 5928 $");
544	}
545
546	/**
547	* Main method for testing this class.
548	*
549	* @param args the options
550	*/
551	public static void main(String[] args) {
552	runClassifier(new RandomSubSpace(), args);
553	}
554	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: