Context Navigation

source: src/main/java/weka/classifiers/trees/REPTree.java @ 11

Last change on this file since 11 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 55.2 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* REPTree.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers.trees;
24
25	import weka.classifiers.Classifier;
26	import weka.classifiers.AbstractClassifier;
27	import weka.classifiers.Sourcable;
28	import weka.classifiers.rules.ZeroR;
29	import weka.core.AdditionalMeasureProducer;
30	import weka.core.Attribute;
31	import weka.core.Capabilities;
32	import weka.core.ContingencyTables;
33	import weka.core.Drawable;
34	import weka.core.Instance;
35	import weka.core.Instances;
36	import weka.core.Option;
37	import weka.core.OptionHandler;
38	import weka.core.RevisionHandler;
39	import weka.core.RevisionUtils;
40	import weka.core.Utils;
41	import weka.core.WeightedInstancesHandler;
42	import weka.core.Capabilities.Capability;
43
44	import java.io.Serializable;
45	import java.util.Enumeration;
46	import java.util.Random;
47	import java.util.Vector;
48
49	/**
50	<!-- globalinfo-start -->
51	* Fast decision tree learner. Builds a decision/regression tree using information gain/variance and prunes it using reduced-error pruning (with backfitting). Only sorts values for numeric attributes once. Missing values are dealt with by splitting the corresponding instances into pieces (i.e. as in C4.5).
52	* <p/>
53	<!-- globalinfo-end -->
54	*
55	<!-- options-start -->
56	* Valid options are: <p/>
57	*
58	* <pre> -M <minimum number of instances>
59	* Set minimum number of instances per leaf (default 2).</pre>
60	*
61	* <pre> -V <minimum variance for split>
62	* Set minimum numeric class variance proportion
63	* of train variance for split (default 1e-3).</pre>
64	*
65	* <pre> -N <number of folds>
66	* Number of folds for reduced error pruning (default 3).</pre>
67	*
68	* <pre> -S <seed>
69	* Seed for random data shuffling (default 1).</pre>
70	*
71	* <pre> -P
72	* No pruning.</pre>
73	*
74	* <pre> -L
75	* Maximum tree depth (default -1, no maximum)</pre>
76	*
77	<!-- options-end -->
78	*
79	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
80	* @version $Revision: 5928 $
81	*/
82	public class REPTree
83	extends AbstractClassifier
84	implements OptionHandler, WeightedInstancesHandler, Drawable,
85	AdditionalMeasureProducer, Sourcable {
86
87	/** for serialization */
88	static final long serialVersionUID = -8562443428621539458L;
89
90	/** ZeroR model that is used if no attributes are present. */
91	protected ZeroR m_zeroR;
92
93	/**
94	* Returns a string describing classifier
95	* @return a description suitable for
96	* displaying in the explorer/experimenter gui
97	*/
98	public String globalInfo() {
99
100	return "Fast decision tree learner. Builds a decision/regression tree using "
101	+ "information gain/variance and prunes it using reduced-error pruning "
102	+ "(with backfitting). Only sorts values for numeric attributes "
103	+ "once. Missing values are dealt with by splitting the corresponding "
104	+ "instances into pieces (i.e. as in C4.5).";
105	}
106
107	/** An inner class for building and storing the tree structure */
108	protected class Tree
109	implements Serializable, RevisionHandler {
110
111	/** for serialization */
112	static final long serialVersionUID = -1635481717888437935L;
113
114	/** The header information (for printing the tree). */
115	protected Instances m_Info = null;
116
117	/** The subtrees of this tree. */
118	protected Tree[] m_Successors;
119
120	/** The attribute to split on. */
121	protected int m_Attribute = -1;
122
123	/** The split point. */
124	protected double m_SplitPoint = Double.NaN;
125
126	/** The proportions of training instances going down each branch. */
127	protected double[] m_Prop = null;
128
129	/** Class probabilities from the training data in the nominal case.
130	Holds the mean in the numeric case. */
131	protected double[] m_ClassProbs = null;
132
133	/** The (unnormalized) class distribution in the nominal
134	case. Holds the sum of squared errors and the weight
135	in the numeric case. */
136	protected double[] m_Distribution = null;
137
138	/** Class distribution of hold-out set at node in the nominal case.
139	Straight sum of weights in the numeric case (i.e. array has
140	only one element. */
141	protected double[] m_HoldOutDist = null;
142
143	/** The hold-out error of the node. The number of miss-classified
144	instances in the nominal case, the sum of squared errors in the
145	numeric case. */
146	protected double m_HoldOutError = 0;
147
148	/**
149	* Computes class distribution of an instance using the tree.
150	*
151	* @param instance the instance to compute the distribution for
152	* @return the distribution
153	* @throws Exception if computation fails
154	*/
155	protected double[] distributionForInstance(Instance instance)
156	throws Exception {
157
158	double[] returnedDist = null;
159
160	if (m_Attribute > -1) {
161
162	// Node is not a leaf
163	if (instance.isMissing(m_Attribute)) {
164
165	// Value is missing
166	returnedDist = new double[m_Info.numClasses()];
167
168	// Split instance up
169	for (int i = 0; i < m_Successors.length; i++) {
170	double[] help =
171	m_Successors[i].distributionForInstance(instance);
172	if (help != null) {
173	for (int j = 0; j < help.length; j++) {
174	returnedDist[j] += m_Prop[i] * help[j];
175	}
176	}
177	}
178	} else if (m_Info.attribute(m_Attribute).isNominal()) {
179
180	// For nominal attributes
181	returnedDist = m_Successors[(int)instance.value(m_Attribute)].
182	distributionForInstance(instance);
183	} else {
184
185	// For numeric attributes
186	if (instance.value(m_Attribute) < m_SplitPoint) {
187	returnedDist =
188	m_Successors[0].distributionForInstance(instance);
189	} else {
190	returnedDist =
191	m_Successors[1].distributionForInstance(instance);
192	}
193	}
194	}
195	if ((m_Attribute == -1) \|\| (returnedDist == null)) {
196
197	// Node is a leaf or successor is empty
198	return m_ClassProbs;
199	} else {
200	return returnedDist;
201	}
202	}
203
204	/**
205	* Returns a string containing java source code equivalent to the test
206	* made at this node. The instance being tested is called "i". This
207	* routine assumes to be called in the order of branching, enabling us to
208	* set the >= condition test (the last one) of a numeric splitpoint
209	* to just "true" (because being there in the flow implies that the
210	* previous less-than test failed).
211	*
212	* @param index index of the value tested
213	* @return a value of type 'String'
214	*/
215	public final String sourceExpression(int index) {
216
217	StringBuffer expr = null;
218	if (index < 0) {
219	return "i[" + m_Attribute + "] == null";
220	}
221	if (m_Info.attribute(m_Attribute).isNominal()) {
222	expr = new StringBuffer("i[");
223	expr.append(m_Attribute).append("]");
224	expr.append(".equals(\"").append(m_Info.attribute(m_Attribute)
225	.value(index)).append("\")");
226	} else {
227	expr = new StringBuffer("");
228	if (index == 0) {
229	expr.append("((Double)i[")
230	.append(m_Attribute).append("]).doubleValue() < ")
231	.append(m_SplitPoint);
232	} else {
233	expr.append("true");
234	}
235	}
236	return expr.toString();
237	}
238
239	/**
240	* Returns source code for the tree as if-then statements. The
241	* class is assigned to variable "p", and assumes the tested
242	* instance is named "i". The results are returned as two stringbuffers:
243	* a section of code for assignment of the class, and a section of
244	* code containing support code (eg: other support methods).
245	* <p/>
246	* TODO: If the outputted source code encounters a missing value
247	* for the evaluated attribute, it stops branching and uses the
248	* class distribution of the current node to decide the return value.
249	* This is unlike the behaviour of distributionForInstance().
250	*
251	* @param className the classname that this static classifier has
252	* @param parent parent node of the current node
253	* @return an array containing two stringbuffers, the first string containing
254	* assignment code, and the second containing source for support code.
255	* @throws Exception if something goes wrong
256	*/
257	public StringBuffer [] toSource(String className, Tree parent)
258	throws Exception {
259
260	StringBuffer [] result = new StringBuffer[2];
261	double[] currentProbs;
262
263	if(m_ClassProbs == null)
264	currentProbs = parent.m_ClassProbs;
265	else
266	currentProbs = m_ClassProbs;
267
268	long printID = nextID();
269
270	// Is this a leaf?
271	if (m_Attribute == -1) {
272	result[0] = new StringBuffer(" p = ");
273	if(m_Info.classAttribute().isNumeric())
274	result[0].append(currentProbs[0]);
275	else {
276	result[0].append(Utils.maxIndex(currentProbs));
277	}
278	result[0].append(";\n");
279	result[1] = new StringBuffer("");
280	} else {
281	StringBuffer text = new StringBuffer("");
282	StringBuffer atEnd = new StringBuffer("");
283
284	text.append(" static double N")
285	.append(Integer.toHexString(this.hashCode()) + printID)
286	.append("(Object []i) {\n")
287	.append(" double p = Double.NaN;\n");
288
289	text.append(" /* " + m_Info.attribute(m_Attribute).name() + " */\n");
290	// Missing attribute?
291	text.append(" if (" + this.sourceExpression(-1) + ") {\n")
292	.append(" p = ");
293	if(m_Info.classAttribute().isNumeric())
294	text.append(currentProbs[0] + ";\n");
295	else
296	text.append(Utils.maxIndex(currentProbs) + ";\n");
297	text.append(" } ");
298
299	// Branching of the tree
300	for (int i=0;i<m_Successors.length; i++) {
301	text.append("else if (" + this.sourceExpression(i) + ") {\n");
302	// Is the successor a leaf?
303	if(m_Successors[i].m_Attribute == -1) {
304	double[] successorProbs = m_Successors[i].m_ClassProbs;
305	if(successorProbs == null)
306	successorProbs = m_ClassProbs;
307	text.append(" p = ");
308	if(m_Info.classAttribute().isNumeric()) {
309	text.append(successorProbs[0] + ";\n");
310	} else {
311	text.append(Utils.maxIndex(successorProbs) + ";\n");
312	}
313	} else {
314	StringBuffer [] sub = m_Successors[i].toSource(className, this);
315	text.append("" + sub[0]);
316	atEnd.append("" + sub[1]);
317	}
318	text.append(" } ");
319	if (i == m_Successors.length - 1) {
320	text.append("\n");
321	}
322	}
323
324	text.append(" return p;\n }\n");
325
326	result[0] = new StringBuffer(" p = " + className + ".N");
327	result[0].append(Integer.toHexString(this.hashCode()) + printID)
328	.append("(i);\n");
329	result[1] = text.append("" + atEnd);
330	}
331	return result;
332	}
333
334
335	/**
336	* Outputs one node for graph.
337	*
338	* @param text the buffer to append the output to
339	* @param num the current node id
340	* @param parent the parent of the nodes
341	* @return the next node id
342	* @throws Exception if something goes wrong
343	*/
344	protected int toGraph(StringBuffer text, int num,
345	Tree parent) throws Exception {
346
347	num++;
348	if (m_Attribute == -1) {
349	text.append("N" + Integer.toHexString(Tree.this.hashCode()) +
350	" [label=\"" + num + leafString(parent) +"\"" +
351	"shape=box]\n");
352	} else {
353	text.append("N" + Integer.toHexString(Tree.this.hashCode()) +
354	" [label=\"" + num + ": " +
355	m_Info.attribute(m_Attribute).name() +
356	"\"]\n");
357	for (int i = 0; i < m_Successors.length; i++) {
358	text.append("N" + Integer.toHexString(Tree.this.hashCode())
359	+ "->" +
360	"N" +
361	Integer.toHexString(m_Successors[i].hashCode()) +
362	" [label=\"");
363	if (m_Info.attribute(m_Attribute).isNumeric()) {
364	if (i == 0) {
365	text.append(" < " +
366	Utils.doubleToString(m_SplitPoint, 2));
367	} else {
368	text.append(" >= " +
369	Utils.doubleToString(m_SplitPoint, 2));
370	}
371	} else {
372	text.append(" = " + m_Info.attribute(m_Attribute).value(i));
373	}
374	text.append("\"]\n");
375	num = m_Successors[i].toGraph(text, num, this);
376	}
377	}
378
379	return num;
380	}
381
382	/**
383	* Outputs description of a leaf node.
384	*
385	* @param parent the parent of the node
386	* @return the description of the node
387	* @throws Exception if generation fails
388	*/
389	protected String leafString(Tree parent) throws Exception {
390
391	if (m_Info.classAttribute().isNumeric()) {
392	double classMean;
393	if (m_ClassProbs == null) {
394	classMean = parent.m_ClassProbs[0];
395	} else {
396	classMean = m_ClassProbs[0];
397	}
398	StringBuffer buffer = new StringBuffer();
399	buffer.append(" : " + Utils.doubleToString(classMean, 2));
400	double avgError = 0;
401	if (m_Distribution[1] > 0) {
402	avgError = m_Distribution[0] / m_Distribution[1];
403	}
404	buffer.append(" (" +
405	Utils.doubleToString(m_Distribution[1], 2) + "/" +
406	Utils.doubleToString(avgError, 2)
407	+ ")");
408	avgError = 0;
409	if (m_HoldOutDist[0] > 0) {
410	avgError = m_HoldOutError / m_HoldOutDist[0];
411	}
412	buffer.append(" [" +
413	Utils.doubleToString(m_HoldOutDist[0], 2) + "/" +
414	Utils.doubleToString(avgError, 2)
415	+ "]");
416	return buffer.toString();
417	} else {
418	int maxIndex;
419	if (m_ClassProbs == null) {
420	maxIndex = Utils.maxIndex(parent.m_ClassProbs);
421	} else {
422	maxIndex = Utils.maxIndex(m_ClassProbs);
423	}
424	return " : " + m_Info.classAttribute().value(maxIndex) +
425	" (" + Utils.doubleToString(Utils.sum(m_Distribution), 2) +
426	"/" +
427	Utils.doubleToString((Utils.sum(m_Distribution) -
428	m_Distribution[maxIndex]), 2) + ")" +
429	" [" + Utils.doubleToString(Utils.sum(m_HoldOutDist), 2) + "/" +
430	Utils.doubleToString((Utils.sum(m_HoldOutDist) -
431	m_HoldOutDist[maxIndex]), 2) + "]";
432	}
433	}
434
435	/**
436	* Recursively outputs the tree.
437	*
438	* @param level the current level
439	* @param parent the current parent
440	* @return the generated substree
441	*/
442	protected String toString(int level, Tree parent) {
443
444	try {
445	StringBuffer text = new StringBuffer();
446
447	if (m_Attribute == -1) {
448
449	// Output leaf info
450	return leafString(parent);
451	} else if (m_Info.attribute(m_Attribute).isNominal()) {
452
453	// For nominal attributes
454	for (int i = 0; i < m_Successors.length; i++) {
455	text.append("\n");
456	for (int j = 0; j < level; j++) {
457	text.append("\| ");
458	}
459	text.append(m_Info.attribute(m_Attribute).name() + " = " +
460	m_Info.attribute(m_Attribute).value(i));
461	text.append(m_Successors[i].toString(level + 1, this));
462	}
463	} else {
464
465	// For numeric attributes
466	text.append("\n");
467	for (int j = 0; j < level; j++) {
468	text.append("\| ");
469	}
470	text.append(m_Info.attribute(m_Attribute).name() + " < " +
471	Utils.doubleToString(m_SplitPoint, 2));
472	text.append(m_Successors[0].toString(level + 1, this));
473	text.append("\n");
474	for (int j = 0; j < level; j++) {
475	text.append("\| ");
476	}
477	text.append(m_Info.attribute(m_Attribute).name() + " >= " +
478	Utils.doubleToString(m_SplitPoint, 2));
479	text.append(m_Successors[1].toString(level + 1, this));
480	}
481
482	return text.toString();
483	} catch (Exception e) {
484	e.printStackTrace();
485	return "Decision tree: tree can't be printed";
486	}
487	}
488
489	/**
490	* Recursively generates a tree.
491	*
492	* @param sortedIndices the sorted indices of the instances
493	* @param weights the weights of the instances
494	* @param data the data to work with
495	* @param totalWeight
496	* @param classProbs the class probabilities
497	* @param header the header of the data
498	* @param minNum the minimum number of instances in a leaf
499	* @param minVariance
500	* @param depth the current depth of the tree
501	* @param maxDepth the maximum allowed depth of the tree
502	* @throws Exception if generation fails
503	*/
504	protected void buildTree(int[][] sortedIndices, double[][] weights,
505	Instances data, double totalWeight,
506	double[] classProbs, Instances header,
507	double minNum, double minVariance,
508	int depth, int maxDepth)
509	throws Exception {
510
511	// Store structure of dataset, set minimum number of instances
512	// and make space for potential info from pruning data
513	m_Info = header;
514	m_HoldOutDist = new double[data.numClasses()];
515
516	// Make leaf if there are no training instances
517	int helpIndex = 0;
518	if (data.classIndex() == 0) {
519	helpIndex = 1;
520	}
521	if (sortedIndices[helpIndex].length == 0) {
522	if (data.classAttribute().isNumeric()) {
523	m_Distribution = new double[2];
524	} else {
525	m_Distribution = new double[data.numClasses()];
526	}
527	m_ClassProbs = null;
528	return;
529	}
530
531	double priorVar = 0;
532	if (data.classAttribute().isNumeric()) {
533
534	// Compute prior variance
535	double totalSum = 0, totalSumSquared = 0, totalSumOfWeights = 0;
536	for (int i = 0; i < sortedIndices[helpIndex].length; i++) {
537	Instance inst = data.instance(sortedIndices[helpIndex][i]);
538	totalSum += inst.classValue() * weights[helpIndex][i];
539	totalSumSquared +=
540	inst.classValue() * inst.classValue() * weights[helpIndex][i];
541	totalSumOfWeights += weights[helpIndex][i];
542	}
543	priorVar = singleVariance(totalSum, totalSumSquared,
544	totalSumOfWeights);
545	}
546
547	// Check if node doesn't contain enough instances, is pure
548	// or the maximum tree depth is reached
549	m_ClassProbs = new double[classProbs.length];
550	System.arraycopy(classProbs, 0, m_ClassProbs, 0, classProbs.length);
551	if ((totalWeight < (2 * minNum)) \|\|
552
553	// Nominal case
554	(data.classAttribute().isNominal() &&
555	Utils.eq(m_ClassProbs[Utils.maxIndex(m_ClassProbs)],
556	Utils.sum(m_ClassProbs))) \|\|
557
558	// Numeric case
559	(data.classAttribute().isNumeric() &&
560	((priorVar / totalWeight) < minVariance)) \|\|
561
562	// Check tree depth
563	((m_MaxDepth >= 0) && (depth >= maxDepth))) {
564
565	// Make leaf
566	m_Attribute = -1;
567	if (data.classAttribute().isNominal()) {
568
569	// Nominal case
570	m_Distribution = new double[m_ClassProbs.length];
571	for (int i = 0; i < m_ClassProbs.length; i++) {
572	m_Distribution[i] = m_ClassProbs[i];
573	}
574	Utils.normalize(m_ClassProbs);
575	} else {
576
577	// Numeric case
578	m_Distribution = new double[2];
579	m_Distribution[0] = priorVar;
580	m_Distribution[1] = totalWeight;
581	}
582	return;
583	}
584
585	// Compute class distributions and value of splitting
586	// criterion for each attribute
587	double[] vals = new double[data.numAttributes()];
588	double[][][] dists = new double[data.numAttributes()][0][0];
589	double[][] props = new double[data.numAttributes()][0];
590	double[][] totalSubsetWeights = new double[data.numAttributes()][0];
591	double[] splits = new double[data.numAttributes()];
592	if (data.classAttribute().isNominal()) {
593
594	// Nominal case
595	for (int i = 0; i < data.numAttributes(); i++) {
596	if (i != data.classIndex()) {
597	splits[i] = distribution(props, dists, i, sortedIndices[i],
598	weights[i], totalSubsetWeights, data);
599	vals[i] = gain(dists[i], priorVal(dists[i]));
600	}
601	}
602	} else {
603
604	// Numeric case
605	for (int i = 0; i < data.numAttributes(); i++) {
606	if (i != data.classIndex()) {
607	splits[i] =
608	numericDistribution(props, dists, i, sortedIndices[i],
609	weights[i], totalSubsetWeights, data,
610	vals);
611	}
612	}
613	}
614
615	// Find best attribute
616	m_Attribute = Utils.maxIndex(vals);
617	int numAttVals = dists[m_Attribute].length;
618
619	// Check if there are at least two subsets with
620	// required minimum number of instances
621	int count = 0;
622	for (int i = 0; i < numAttVals; i++) {
623	if (totalSubsetWeights[m_Attribute][i] >= minNum) {
624	count++;
625	}
626	if (count > 1) {
627	break;
628	}
629	}
630
631	// Any useful split found?
632	if ((vals[m_Attribute] > 0) && (count > 1)) {
633
634	// Build subtrees
635	m_SplitPoint = splits[m_Attribute];
636	m_Prop = props[m_Attribute];
637	int[][][] subsetIndices =
638	new int[numAttVals][data.numAttributes()][0];
639	double[][][] subsetWeights =
640	new double[numAttVals][data.numAttributes()][0];
641	splitData(subsetIndices, subsetWeights, m_Attribute, m_SplitPoint,
642	sortedIndices, weights, data);
643	m_Successors = new Tree[numAttVals];
644	for (int i = 0; i < numAttVals; i++) {
645	m_Successors[i] = new Tree();
646	m_Successors[i].
647	buildTree(subsetIndices[i], subsetWeights[i],
648	data, totalSubsetWeights[m_Attribute][i],
649	dists[m_Attribute][i], header, minNum,
650	minVariance, depth + 1, maxDepth);
651	}
652	} else {
653
654	// Make leaf
655	m_Attribute = -1;
656	}
657
658	// Normalize class counts
659	if (data.classAttribute().isNominal()) {
660	m_Distribution = new double[m_ClassProbs.length];
661	for (int i = 0; i < m_ClassProbs.length; i++) {
662	m_Distribution[i] = m_ClassProbs[i];
663	}
664	Utils.normalize(m_ClassProbs);
665	} else {
666	m_Distribution = new double[2];
667	m_Distribution[0] = priorVar;
668	m_Distribution[1] = totalWeight;
669	}
670	}
671
672	/**
673	* Computes size of the tree.
674	*
675	* @return the number of nodes
676	*/
677	protected int numNodes() {
678
679	if (m_Attribute == -1) {
680	return 1;
681	} else {
682	int size = 1;
683	for (int i = 0; i < m_Successors.length; i++) {
684	size += m_Successors[i].numNodes();
685	}
686	return size;
687	}
688	}
689
690	/**
691	* Splits instances into subsets.
692	*
693	* @param subsetIndices the sorted indices in the subset
694	* @param subsetWeights the weights of the subset
695	* @param att the attribute index
696	* @param splitPoint the split point for numeric attributes
697	* @param sortedIndices the sorted indices of the whole set
698	* @param weights the weights of the whole set
699	* @param data the data to work with
700	* @throws Exception if something goes wrong
701	*/
702	protected void splitData(int[][][] subsetIndices,
703	double[][][] subsetWeights,
704	int att, double splitPoint,
705	int[][] sortedIndices, double[][] weights,
706	Instances data) throws Exception {
707
708	int j;
709	int[] num;
710
711	// For each attribute
712	for (int i = 0; i < data.numAttributes(); i++) {
713	if (i != data.classIndex()) {
714	if (data.attribute(att).isNominal()) {
715
716	// For nominal attributes
717	num = new int[data.attribute(att).numValues()];
718	for (int k = 0; k < num.length; k++) {
719	subsetIndices[k][i] = new int[sortedIndices[i].length];
720	subsetWeights[k][i] = new double[sortedIndices[i].length];
721	}
722	for (j = 0; j < sortedIndices[i].length; j++) {
723	Instance inst = data.instance(sortedIndices[i][j]);
724	if (inst.isMissing(att)) {
725
726	// Split instance up
727	for (int k = 0; k < num.length; k++) {
728	if (m_Prop[k] > 0) {
729	subsetIndices[k][i][num[k]] = sortedIndices[i][j];
730	subsetWeights[k][i][num[k]] =
731	m_Prop[k] * weights[i][j];
732	num[k]++;
733	}
734	}
735	} else {
736	int subset = (int)inst.value(att);
737	subsetIndices[subset][i][num[subset]] =
738	sortedIndices[i][j];
739	subsetWeights[subset][i][num[subset]] = weights[i][j];
740	num[subset]++;
741	}
742	}
743	} else {
744
745	// For numeric attributes
746	num = new int[2];
747	for (int k = 0; k < 2; k++) {
748	subsetIndices[k][i] = new int[sortedIndices[i].length];
749	subsetWeights[k][i] = new double[weights[i].length];
750	}
751	for (j = 0; j < sortedIndices[i].length; j++) {
752	Instance inst = data.instance(sortedIndices[i][j]);
753	if (inst.isMissing(att)) {
754
755	// Split instance up
756	for (int k = 0; k < num.length; k++) {
757	if (m_Prop[k] > 0) {
758	subsetIndices[k][i][num[k]] = sortedIndices[i][j];
759	subsetWeights[k][i][num[k]] =
760	m_Prop[k] * weights[i][j];
761	num[k]++;
762	}
763	}
764	} else {
765	int subset = (inst.value(att) < splitPoint) ? 0 : 1;
766	subsetIndices[subset][i][num[subset]] =
767	sortedIndices[i][j];
768	subsetWeights[subset][i][num[subset]] = weights[i][j];
769	num[subset]++;
770	}
771	}
772	}
773
774	// Trim arrays
775	for (int k = 0; k < num.length; k++) {
776	int[] copy = new int[num[k]];
777	System.arraycopy(subsetIndices[k][i], 0, copy, 0, num[k]);
778	subsetIndices[k][i] = copy;
779	double[] copyWeights = new double[num[k]];
780	System.arraycopy(subsetWeights[k][i], 0,
781	copyWeights, 0, num[k]);
782	subsetWeights[k][i] = copyWeights;
783	}
784	}
785	}
786	}
787
788	/**
789	* Computes class distribution for an attribute.
790	*
791	* @param props
792	* @param dists
793	* @param att the attribute index
794	* @param sortedIndices the sorted indices of the instances
795	* @param weights the weights of the instances
796	* @param subsetWeights the weights of the subset
797	* @param data the data to work with
798	* @return the split point
799	* @throws Exception if computation fails
800	*/
801	protected double distribution(double[][] props,
802	double[][][] dists, int att,
803	int[] sortedIndices,
804	double[] weights,
805	double[][] subsetWeights,
806	Instances data)
807	throws Exception {
808
809	double splitPoint = Double.NaN;
810	Attribute attribute = data.attribute(att);
811	double[][] dist = null;
812	int i;
813
814	if (attribute.isNominal()) {
815
816	// For nominal attributes
817	dist = new double[attribute.numValues()][data.numClasses()];
818	for (i = 0; i < sortedIndices.length; i++) {
819	Instance inst = data.instance(sortedIndices[i]);
820	if (inst.isMissing(att)) {
821	break;
822	}
823	dist[(int)inst.value(att)][(int)inst.classValue()] += weights[i];
824	}
825	} else {
826
827	// For numeric attributes
828	double[][] currDist = new double[2][data.numClasses()];
829	dist = new double[2][data.numClasses()];
830
831	// Move all instances into second subset
832	for (int j = 0; j < sortedIndices.length; j++) {
833	Instance inst = data.instance(sortedIndices[j]);
834	if (inst.isMissing(att)) {
835	break;
836	}
837	currDist[1][(int)inst.classValue()] += weights[j];
838	}
839	double priorVal = priorVal(currDist);
840	System.arraycopy(currDist[1], 0, dist[1], 0, dist[1].length);
841
842	// Try all possible split points
843	double currSplit = data.instance(sortedIndices[0]).value(att);
844	double currVal, bestVal = -Double.MAX_VALUE;
845	for (i = 0; i < sortedIndices.length; i++) {
846	Instance inst = data.instance(sortedIndices[i]);
847	if (inst.isMissing(att)) {
848	break;
849	}
850	if (inst.value(att) > currSplit) {
851	currVal = gain(currDist, priorVal);
852	if (currVal > bestVal) {
853	bestVal = currVal;
854	splitPoint = (inst.value(att) + currSplit) / 2.0;
855	for (int j = 0; j < currDist.length; j++) {
856	System.arraycopy(currDist[j], 0, dist[j], 0,
857	dist[j].length);
858	}
859	}
860	}
861	currSplit = inst.value(att);
862	currDist[0][(int)inst.classValue()] += weights[i];
863	currDist[1][(int)inst.classValue()] -= weights[i];
864	}
865	}
866
867	// Compute weights
868	props[att] = new double[dist.length];
869	for (int k = 0; k < props[att].length; k++) {
870	props[att][k] = Utils.sum(dist[k]);
871	}
872	if (!(Utils.sum(props[att]) > 0)) {
873	for (int k = 0; k < props[att].length; k++) {
874	props[att][k] = 1.0 / (double)props[att].length;
875	}
876	} else {
877	Utils.normalize(props[att]);
878	}
879
880	// Distribute counts
881	while (i < sortedIndices.length) {
882	Instance inst = data.instance(sortedIndices[i]);
883	for (int j = 0; j < dist.length; j++) {
884	dist[j][(int)inst.classValue()] += props[att][j] * weights[i];
885	}
886	i++;
887	}
888
889	// Compute subset weights
890	subsetWeights[att] = new double[dist.length];
891	for (int j = 0; j < dist.length; j++) {
892	subsetWeights[att][j] += Utils.sum(dist[j]);
893	}
894
895	// Return distribution and split point
896	dists[att] = dist;
897	return splitPoint;
898	}
899
900	/**
901	* Computes class distribution for an attribute.
902	*
903	* @param props
904	* @param dists
905	* @param att the attribute index
906	* @param sortedIndices the sorted indices of the instances
907	* @param weights the weights of the instances
908	* @param subsetWeights the weights of the subset
909	* @param data the data to work with
910	* @param vals
911	* @return the split point
912	* @throws Exception if computation fails
913	*/
914	protected double numericDistribution(double[][] props,
915	double[][][] dists, int att,
916	int[] sortedIndices,
917	double[] weights,
918	double[][] subsetWeights,
919	Instances data,
920	double[] vals)
921	throws Exception {
922
923	double splitPoint = Double.NaN;
924	Attribute attribute = data.attribute(att);
925	double[][] dist = null;
926	double[] sums = null;
927	double[] sumSquared = null;
928	double[] sumOfWeights = null;
929	double totalSum = 0, totalSumSquared = 0, totalSumOfWeights = 0;
930
931	int i;
932
933	if (attribute.isNominal()) {
934
935	// For nominal attributes
936	sums = new double[attribute.numValues()];
937	sumSquared = new double[attribute.numValues()];
938	sumOfWeights = new double[attribute.numValues()];
939	int attVal;
940	for (i = 0; i < sortedIndices.length; i++) {
941	Instance inst = data.instance(sortedIndices[i]);
942	if (inst.isMissing(att)) {
943	break;
944	}
945	attVal = (int)inst.value(att);
946	sums[attVal] += inst.classValue() * weights[i];
947	sumSquared[attVal] +=
948	inst.classValue() * inst.classValue() * weights[i];
949	sumOfWeights[attVal] += weights[i];
950	}
951	totalSum = Utils.sum(sums);
952	totalSumSquared = Utils.sum(sumSquared);
953	totalSumOfWeights = Utils.sum(sumOfWeights);
954	} else {
955
956	// For numeric attributes
957	sums = new double[2];
958	sumSquared = new double[2];
959	sumOfWeights = new double[2];
960	double[] currSums = new double[2];
961	double[] currSumSquared = new double[2];
962	double[] currSumOfWeights = new double[2];
963
964	// Move all instances into second subset
965	for (int j = 0; j < sortedIndices.length; j++) {
966	Instance inst = data.instance(sortedIndices[j]);
967	if (inst.isMissing(att)) {
968	break;
969	}
970	currSums[1] += inst.classValue() * weights[j];
971	currSumSquared[1] +=
972	inst.classValue() * inst.classValue() * weights[j];
973	currSumOfWeights[1] += weights[j];
974
975	}
976	totalSum = currSums[1];
977	totalSumSquared = currSumSquared[1];
978	totalSumOfWeights = currSumOfWeights[1];
979
980	sums[1] = currSums[1];
981	sumSquared[1] = currSumSquared[1];
982	sumOfWeights[1] = currSumOfWeights[1];
983
984	// Try all possible split points
985	double currSplit = data.instance(sortedIndices[0]).value(att);
986	double currVal, bestVal = Double.MAX_VALUE;
987	for (i = 0; i < sortedIndices.length; i++) {
988	Instance inst = data.instance(sortedIndices[i]);
989	if (inst.isMissing(att)) {
990	break;
991	}
992	if (inst.value(att) > currSplit) {
993	currVal = variance(currSums, currSumSquared, currSumOfWeights);
994	if (currVal < bestVal) {
995	bestVal = currVal;
996	splitPoint = (inst.value(att) + currSplit) / 2.0;
997	for (int j = 0; j < 2; j++) {
998	sums[j] = currSums[j];
999	sumSquared[j] = currSumSquared[j];
1000	sumOfWeights[j] = currSumOfWeights[j];
1001	}
1002	}
1003	}
1004
1005	currSplit = inst.value(att);
1006
1007	double classVal = inst.classValue() * weights[i];
1008	double classValSquared = inst.classValue() * classVal;
1009
1010	currSums[0] += classVal;
1011	currSumSquared[0] += classValSquared;
1012	currSumOfWeights[0] += weights[i];
1013
1014	currSums[1] -= classVal;
1015	currSumSquared[1] -= classValSquared;
1016	currSumOfWeights[1] -= weights[i];
1017	}
1018	}
1019
1020	// Compute weights
1021	props[att] = new double[sums.length];
1022	for (int k = 0; k < props[att].length; k++) {
1023	props[att][k] = sumOfWeights[k];
1024	}
1025	if (!(Utils.sum(props[att]) > 0)) {
1026	for (int k = 0; k < props[att].length; k++) {
1027	props[att][k] = 1.0 / (double)props[att].length;
1028	}
1029	} else {
1030	Utils.normalize(props[att]);
1031	}
1032
1033
1034	// Distribute counts for missing values
1035	while (i < sortedIndices.length) {
1036	Instance inst = data.instance(sortedIndices[i]);
1037	for (int j = 0; j < sums.length; j++) {
1038	sums[j] += props[att][j] * inst.classValue() * weights[i];
1039	sumSquared[j] += props[att][j] * inst.classValue() *
1040	inst.classValue() * weights[i];
1041	sumOfWeights[j] += props[att][j] * weights[i];
1042	}
1043	totalSum += inst.classValue() * weights[i];
1044	totalSumSquared +=
1045	inst.classValue() * inst.classValue() * weights[i];
1046	totalSumOfWeights += weights[i];
1047	i++;
1048	}
1049
1050	// Compute final distribution
1051	dist = new double[sums.length][data.numClasses()];
1052	for (int j = 0; j < sums.length; j++) {
1053	if (sumOfWeights[j] > 0) {
1054	dist[j][0] = sums[j] / sumOfWeights[j];
1055	} else {
1056	dist[j][0] = totalSum / totalSumOfWeights;
1057	}
1058	}
1059
1060	// Compute variance gain
1061	double priorVar =
1062	singleVariance(totalSum, totalSumSquared, totalSumOfWeights);
1063	double var = variance(sums, sumSquared, sumOfWeights);
1064	double gain = priorVar - var;
1065
1066	// Return distribution and split point
1067	subsetWeights[att] = sumOfWeights;
1068	dists[att] = dist;
1069	vals[att] = gain;
1070	return splitPoint;
1071	}
1072
1073	/**
1074	* Computes variance for subsets.
1075	*
1076	* @param s
1077	* @param sS
1078	* @param sumOfWeights
1079	* @return the variance
1080	*/
1081	protected double variance(double[] s, double[] sS,
1082	double[] sumOfWeights) {
1083
1084	double var = 0;
1085
1086	for (int i = 0; i < s.length; i++) {
1087	if (sumOfWeights[i] > 0) {
1088	var += singleVariance(s[i], sS[i], sumOfWeights[i]);
1089	}
1090	}
1091
1092	return var;
1093	}
1094
1095	/**
1096	* Computes the variance for a single set
1097	*
1098	* @param s
1099	* @param sS
1100	* @param weight the weight
1101	* @return the variance
1102	*/
1103	protected double singleVariance(double s, double sS, double weight) {
1104
1105	return sS - ((s * s) / weight);
1106	}
1107
1108	/**
1109	* Computes value of splitting criterion before split.
1110	*
1111	* @param dist
1112	* @return the splitting criterion
1113	*/
1114	protected double priorVal(double[][] dist) {
1115
1116	return ContingencyTables.entropyOverColumns(dist);
1117	}
1118
1119	/**
1120	* Computes value of splitting criterion after split.
1121	*
1122	* @param dist
1123	* @param priorVal the splitting criterion
1124	* @return the gain after splitting
1125	*/
1126	protected double gain(double[][] dist, double priorVal) {
1127
1128	return priorVal - ContingencyTables.entropyConditionedOnRows(dist);
1129	}
1130
1131	/**
1132	* Prunes the tree using the hold-out data (bottom-up).
1133	*
1134	* @return the error
1135	* @throws Exception if pruning fails for some reason
1136	*/
1137	protected double reducedErrorPrune() throws Exception {
1138
1139	// Is node leaf ?
1140	if (m_Attribute == -1) {
1141	return m_HoldOutError;
1142	}
1143
1144	// Prune all sub trees
1145	double errorTree = 0;
1146	for (int i = 0; i < m_Successors.length; i++) {
1147	errorTree += m_Successors[i].reducedErrorPrune();
1148	}
1149
1150	// Replace sub tree with leaf if error doesn't get worse
1151	if (errorTree >= m_HoldOutError) {
1152	m_Attribute = -1;
1153	m_Successors = null;
1154	return m_HoldOutError;
1155	} else {
1156	return errorTree;
1157	}
1158	}
1159
1160	/**
1161	* Inserts hold-out set into tree.
1162	*
1163	* @param data the data to insert
1164	* @throws Exception if something goes wrong
1165	*/
1166	protected void insertHoldOutSet(Instances data) throws Exception {
1167
1168	for (int i = 0; i < data.numInstances(); i++) {
1169	insertHoldOutInstance(data.instance(i), data.instance(i).weight(),
1170	this);
1171	}
1172	}
1173
1174	/**
1175	* Inserts an instance from the hold-out set into the tree.
1176	*
1177	* @param inst the instance to insert
1178	* @param weight the weight of the instance
1179	* @param parent the parent of the node
1180	* @throws Exception if insertion fails
1181	*/
1182	protected void insertHoldOutInstance(Instance inst, double weight,
1183	Tree parent) throws Exception {
1184
1185	// Insert instance into hold-out class distribution
1186	if (inst.classAttribute().isNominal()) {
1187
1188	// Nominal case
1189	m_HoldOutDist[(int)inst.classValue()] += weight;
1190	int predictedClass = 0;
1191	if (m_ClassProbs == null) {
1192	predictedClass = Utils.maxIndex(parent.m_ClassProbs);
1193	} else {
1194	predictedClass = Utils.maxIndex(m_ClassProbs);
1195	}
1196	if (predictedClass != (int)inst.classValue()) {
1197	m_HoldOutError += weight;
1198	}
1199	} else {
1200
1201	// Numeric case
1202	m_HoldOutDist[0] += weight;
1203	double diff = 0;
1204	if (m_ClassProbs == null) {
1205	diff = parent.m_ClassProbs[0] - inst.classValue();
1206	} else {
1207	diff = m_ClassProbs[0] - inst.classValue();
1208	}
1209	m_HoldOutError += diff * diff * weight;
1210	}
1211
1212	// The process is recursive
1213	if (m_Attribute != -1) {
1214
1215	// If node is not a leaf
1216	if (inst.isMissing(m_Attribute)) {
1217
1218	// Distribute instance
1219	for (int i = 0; i < m_Successors.length; i++) {
1220	if (m_Prop[i] > 0) {
1221	m_Successors[i].insertHoldOutInstance(inst, weight *
1222	m_Prop[i], this);
1223	}
1224	}
1225	} else {
1226
1227	if (m_Info.attribute(m_Attribute).isNominal()) {
1228
1229	// Treat nominal attributes
1230	m_Successors[(int)inst.value(m_Attribute)].
1231	insertHoldOutInstance(inst, weight, this);
1232	} else {
1233
1234	// Treat numeric attributes
1235	if (inst.value(m_Attribute) < m_SplitPoint) {
1236	m_Successors[0].insertHoldOutInstance(inst, weight, this);
1237	} else {
1238	m_Successors[1].insertHoldOutInstance(inst, weight, this);
1239	}
1240	}
1241	}
1242	}
1243	}
1244
1245	/**
1246	* Inserts hold-out set into tree.
1247	*
1248	* @param data the data to insert
1249	* @throws Exception if insertion fails
1250	*/
1251	protected void backfitHoldOutSet(Instances data) throws Exception {
1252
1253	for (int i = 0; i < data.numInstances(); i++) {
1254	backfitHoldOutInstance(data.instance(i), data.instance(i).weight(),
1255	this);
1256	}
1257	}
1258
1259	/**
1260	* Inserts an instance from the hold-out set into the tree.
1261	*
1262	* @param inst the instance to insert
1263	* @param weight the weight of the instance
1264	* @param parent the parent node
1265	* @throws Exception if insertion fails
1266	*/
1267	protected void backfitHoldOutInstance(Instance inst, double weight,
1268	Tree parent) throws Exception {
1269
1270	// Insert instance into hold-out class distribution
1271	if (inst.classAttribute().isNominal()) {
1272
1273	// Nominal case
1274	if (m_ClassProbs == null) {
1275	m_ClassProbs = new double[inst.numClasses()];
1276	}
1277	System.arraycopy(m_Distribution, 0, m_ClassProbs, 0, inst.numClasses());
1278	m_ClassProbs[(int)inst.classValue()] += weight;
1279	Utils.normalize(m_ClassProbs);
1280	} else {
1281
1282	// Numeric case
1283	if (m_ClassProbs == null) {
1284	m_ClassProbs = new double[1];
1285	}
1286	m_ClassProbs[0] *= m_Distribution[1];
1287	m_ClassProbs[0] += weight * inst.classValue();
1288	m_ClassProbs[0] /= (m_Distribution[1] + weight);
1289	}
1290
1291	// The process is recursive
1292	if (m_Attribute != -1) {
1293
1294	// If node is not a leaf
1295	if (inst.isMissing(m_Attribute)) {
1296
1297	// Distribute instance
1298	for (int i = 0; i < m_Successors.length; i++) {
1299	if (m_Prop[i] > 0) {
1300	m_Successors[i].backfitHoldOutInstance(inst, weight *
1301	m_Prop[i], this);
1302	}
1303	}
1304	} else {
1305
1306	if (m_Info.attribute(m_Attribute).isNominal()) {
1307
1308	// Treat nominal attributes
1309	m_Successors[(int)inst.value(m_Attribute)].
1310	backfitHoldOutInstance(inst, weight, this);
1311	} else {
1312
1313	// Treat numeric attributes
1314	if (inst.value(m_Attribute) < m_SplitPoint) {
1315	m_Successors[0].backfitHoldOutInstance(inst, weight, this);
1316	} else {
1317	m_Successors[1].backfitHoldOutInstance(inst, weight, this);
1318	}
1319	}
1320	}
1321	}
1322	}
1323
1324	/**
1325	* Returns the revision string.
1326	*
1327	* @return the revision
1328	*/
1329	public String getRevision() {
1330	return RevisionUtils.extract("$Revision: 5928 $");
1331	}
1332	}
1333
1334	/** The Tree object */
1335	protected Tree m_Tree = null;
1336
1337	/** Number of folds for reduced error pruning. */
1338	protected int m_NumFolds = 3;
1339
1340	/** Seed for random data shuffling. */
1341	protected int m_Seed = 1;
1342
1343	/** Don't prune */
1344	protected boolean m_NoPruning = false;
1345
1346	/** The minimum number of instances per leaf. */
1347	protected double m_MinNum = 2;
1348
1349	/** The minimum proportion of the total variance (over all the data)
1350	required for split. */
1351	protected double m_MinVarianceProp = 1e-3;
1352
1353	/** Upper bound on the tree depth */
1354	protected int m_MaxDepth = -1;
1355
1356	/**
1357	* Returns the tip text for this property
1358	* @return tip text for this property suitable for
1359	* displaying in the explorer/experimenter gui
1360	*/
1361	public String noPruningTipText() {
1362	return "Whether pruning is performed.";
1363	}
1364
1365	/**
1366	* Get the value of NoPruning.
1367	*
1368	* @return Value of NoPruning.
1369	*/
1370	public boolean getNoPruning() {
1371
1372	return m_NoPruning;
1373	}
1374
1375	/**
1376	* Set the value of NoPruning.
1377	*
1378	* @param newNoPruning Value to assign to NoPruning.
1379	*/
1380	public void setNoPruning(boolean newNoPruning) {
1381
1382	m_NoPruning = newNoPruning;
1383	}
1384
1385	/**
1386	* Returns the tip text for this property
1387	* @return tip text for this property suitable for
1388	* displaying in the explorer/experimenter gui
1389	*/
1390	public String minNumTipText() {
1391	return "The minimum total weight of the instances in a leaf.";
1392	}
1393
1394	/**
1395	* Get the value of MinNum.
1396	*
1397	* @return Value of MinNum.
1398	*/
1399	public double getMinNum() {
1400
1401	return m_MinNum;
1402	}
1403
1404	/**
1405	* Set the value of MinNum.
1406	*
1407	* @param newMinNum Value to assign to MinNum.
1408	*/
1409	public void setMinNum(double newMinNum) {
1410
1411	m_MinNum = newMinNum;
1412	}
1413
1414	/**
1415	* Returns the tip text for this property
1416	* @return tip text for this property suitable for
1417	* displaying in the explorer/experimenter gui
1418	*/
1419	public String minVariancePropTipText() {
1420	return "The minimum proportion of the variance on all the data " +
1421	"that needs to be present at a node in order for splitting to " +
1422	"be performed in regression trees.";
1423	}
1424
1425	/**
1426	* Get the value of MinVarianceProp.
1427	*
1428	* @return Value of MinVarianceProp.
1429	*/
1430	public double getMinVarianceProp() {
1431
1432	return m_MinVarianceProp;
1433	}
1434
1435	/**
1436	* Set the value of MinVarianceProp.
1437	*
1438	* @param newMinVarianceProp Value to assign to MinVarianceProp.
1439	*/
1440	public void setMinVarianceProp(double newMinVarianceProp) {
1441
1442	m_MinVarianceProp = newMinVarianceProp;
1443	}
1444
1445	/**
1446	* Returns the tip text for this property
1447	* @return tip text for this property suitable for
1448	* displaying in the explorer/experimenter gui
1449	*/
1450	public String seedTipText() {
1451	return "The seed used for randomizing the data.";
1452	}
1453
1454	/**
1455	* Get the value of Seed.
1456	*
1457	* @return Value of Seed.
1458	*/
1459	public int getSeed() {
1460
1461	return m_Seed;
1462	}
1463
1464	/**
1465	* Set the value of Seed.
1466	*
1467	* @param newSeed Value to assign to Seed.
1468	*/
1469	public void setSeed(int newSeed) {
1470
1471	m_Seed = newSeed;
1472	}
1473
1474	/**
1475	* Returns the tip text for this property
1476	* @return tip text for this property suitable for
1477	* displaying in the explorer/experimenter gui
1478	*/
1479	public String numFoldsTipText() {
1480	return "Determines the amount of data used for pruning. One fold is used for "
1481	+ "pruning, the rest for growing the rules.";
1482	}
1483
1484	/**
1485	* Get the value of NumFolds.
1486	*
1487	* @return Value of NumFolds.
1488	*/
1489	public int getNumFolds() {
1490
1491	return m_NumFolds;
1492	}
1493
1494	/**
1495	* Set the value of NumFolds.
1496	*
1497	* @param newNumFolds Value to assign to NumFolds.
1498	*/
1499	public void setNumFolds(int newNumFolds) {
1500
1501	m_NumFolds = newNumFolds;
1502	}
1503
1504	/**
1505	* Returns the tip text for this property
1506	* @return tip text for this property suitable for
1507	* displaying in the explorer/experimenter gui
1508	*/
1509	public String maxDepthTipText() {
1510	return "The maximum tree depth (-1 for no restriction).";
1511	}
1512
1513	/**
1514	* Get the value of MaxDepth.
1515	*
1516	* @return Value of MaxDepth.
1517	*/
1518	public int getMaxDepth() {
1519
1520	return m_MaxDepth;
1521	}
1522
1523	/**
1524	* Set the value of MaxDepth.
1525	*
1526	* @param newMaxDepth Value to assign to MaxDepth.
1527	*/
1528	public void setMaxDepth(int newMaxDepth) {
1529
1530	m_MaxDepth = newMaxDepth;
1531	}
1532
1533	/**
1534	* Lists the command-line options for this classifier.
1535	*
1536	* @return an enumeration over all commandline options
1537	*/
1538	public Enumeration listOptions() {
1539
1540	Vector newVector = new Vector(5);
1541
1542	newVector.
1543	addElement(new Option("\tSet minimum number of instances per leaf " +
1544	"(default 2).",
1545	"M", 1, "-M <minimum number of instances>"));
1546	newVector.
1547	addElement(new Option("\tSet minimum numeric class variance proportion\n" +
1548	"\tof train variance for split (default 1e-3).",
1549	"V", 1, "-V <minimum variance for split>"));
1550	newVector.
1551	addElement(new Option("\tNumber of folds for reduced error pruning " +
1552	"(default 3).",
1553	"N", 1, "-N <number of folds>"));
1554	newVector.
1555	addElement(new Option("\tSeed for random data shuffling (default 1).",
1556	"S", 1, "-S <seed>"));
1557	newVector.
1558	addElement(new Option("\tNo pruning.",
1559	"P", 0, "-P"));
1560	newVector.
1561	addElement(new Option("\tMaximum tree depth (default -1, no maximum)",
1562	"L", 1, "-L"));
1563
1564	return newVector.elements();
1565	}
1566
1567	/**
1568	* Gets options from this classifier.
1569	*
1570	* @return the options for the current setup
1571	*/
1572	public String[] getOptions() {
1573
1574	String [] options = new String [12];
1575	int current = 0;
1576	options[current++] = "-M";
1577	options[current++] = "" + (int)getMinNum();
1578	options[current++] = "-V";
1579	options[current++] = "" + getMinVarianceProp();
1580	options[current++] = "-N";
1581	options[current++] = "" + getNumFolds();
1582	options[current++] = "-S";
1583	options[current++] = "" + getSeed();
1584	options[current++] = "-L";
1585	options[current++] = "" + getMaxDepth();
1586	if (getNoPruning()) {
1587	options[current++] = "-P";
1588	}
1589	while (current < options.length) {
1590	options[current++] = "";
1591	}
1592	return options;
1593	}
1594
1595	/**
1596	* Parses a given list of options. <p/>
1597	*
1598	<!-- options-start -->
1599	* Valid options are: <p/>
1600	*
1601	* <pre> -M <minimum number of instances>
1602	* Set minimum number of instances per leaf (default 2).</pre>
1603	*
1604	* <pre> -V <minimum variance for split>
1605	* Set minimum numeric class variance proportion
1606	* of train variance for split (default 1e-3).</pre>
1607	*
1608	* <pre> -N <number of folds>
1609	* Number of folds for reduced error pruning (default 3).</pre>
1610	*
1611	* <pre> -S <seed>
1612	* Seed for random data shuffling (default 1).</pre>
1613	*
1614	* <pre> -P
1615	* No pruning.</pre>
1616	*
1617	* <pre> -L
1618	* Maximum tree depth (default -1, no maximum)</pre>
1619	*
1620	<!-- options-end -->
1621	*
1622	* @param options the list of options as an array of strings
1623	* @throws Exception if an option is not supported
1624	*/
1625	public void setOptions(String[] options) throws Exception {
1626
1627	String minNumString = Utils.getOption('M', options);
1628	if (minNumString.length() != 0) {
1629	m_MinNum = (double)Integer.parseInt(minNumString);
1630	} else {
1631	m_MinNum = 2;
1632	}
1633	String minVarString = Utils.getOption('V', options);
1634	if (minVarString.length() != 0) {
1635	m_MinVarianceProp = Double.parseDouble(minVarString);
1636	} else {
1637	m_MinVarianceProp = 1e-3;
1638	}
1639	String numFoldsString = Utils.getOption('N', options);
1640	if (numFoldsString.length() != 0) {
1641	m_NumFolds = Integer.parseInt(numFoldsString);
1642	} else {
1643	m_NumFolds = 3;
1644	}
1645	String seedString = Utils.getOption('S', options);
1646	if (seedString.length() != 0) {
1647	m_Seed = Integer.parseInt(seedString);
1648	} else {
1649	m_Seed = 1;
1650	}
1651	m_NoPruning = Utils.getFlag('P', options);
1652	String depthString = Utils.getOption('L', options);
1653	if (depthString.length() != 0) {
1654	m_MaxDepth = Integer.parseInt(depthString);
1655	} else {
1656	m_MaxDepth = -1;
1657	}
1658	Utils.checkForRemainingOptions(options);
1659	}
1660
1661	/**
1662	* Computes size of the tree.
1663	*
1664	* @return the number of nodes
1665	*/
1666	public int numNodes() {
1667
1668	return m_Tree.numNodes();
1669	}
1670
1671	/**
1672	* Returns an enumeration of the additional measure names.
1673	*
1674	* @return an enumeration of the measure names
1675	*/
1676	public Enumeration enumerateMeasures() {
1677
1678	Vector newVector = new Vector(1);
1679	newVector.addElement("measureTreeSize");
1680	return newVector.elements();
1681	}
1682
1683	/**
1684	* Returns the value of the named measure.
1685	*
1686	* @param additionalMeasureName the name of the measure to query for its value
1687	* @return the value of the named measure
1688	* @throws IllegalArgumentException if the named measure is not supported
1689	*/
1690	public double getMeasure(String additionalMeasureName) {
1691
1692	if (additionalMeasureName.equalsIgnoreCase("measureTreeSize")) {
1693	return (double) numNodes();
1694	}
1695	else {throw new IllegalArgumentException(additionalMeasureName
1696	+ " not supported (REPTree)");
1697	}
1698	}
1699
1700	/**
1701	* Returns default capabilities of the classifier.
1702	*
1703	* @return the capabilities of this classifier
1704	*/
1705	public Capabilities getCapabilities() {
1706	Capabilities result = super.getCapabilities();
1707	result.disableAll();
1708
1709	// attributes
1710	result.enable(Capability.NOMINAL_ATTRIBUTES);
1711	result.enable(Capability.NUMERIC_ATTRIBUTES);
1712	result.enable(Capability.DATE_ATTRIBUTES);
1713	result.enable(Capability.MISSING_VALUES);
1714
1715	// class
1716	result.enable(Capability.NOMINAL_CLASS);
1717	result.enable(Capability.NUMERIC_CLASS);
1718	result.enable(Capability.DATE_CLASS);
1719	result.enable(Capability.MISSING_CLASS_VALUES);
1720
1721	return result;
1722	}
1723
1724	/**
1725	* Builds classifier.
1726	*
1727	* @param data the data to train with
1728	* @throws Exception if building fails
1729	*/
1730	public void buildClassifier(Instances data) throws Exception {
1731
1732	// can classifier handle the data?
1733	getCapabilities().testWithFail(data);
1734
1735	// remove instances with missing class
1736	data = new Instances(data);
1737	data.deleteWithMissingClass();
1738
1739	Random random = new Random(m_Seed);
1740
1741	m_zeroR = null;
1742	if (data.numAttributes() == 1) {
1743	m_zeroR = new ZeroR();
1744	m_zeroR.buildClassifier(data);
1745	return;
1746	}
1747
1748	// Randomize and stratify
1749	data.randomize(random);
1750	if (data.classAttribute().isNominal()) {
1751	data.stratify(m_NumFolds);
1752	}
1753
1754	// Split data into training and pruning set
1755	Instances train = null;
1756	Instances prune = null;
1757	if (!m_NoPruning) {
1758	train = data.trainCV(m_NumFolds, 0, random);
1759	prune = data.testCV(m_NumFolds, 0);
1760	} else {
1761	train = data;
1762	}
1763
1764	// Create array of sorted indices and weights
1765	int[][] sortedIndices = new int[train.numAttributes()][0];
1766	double[][] weights = new double[train.numAttributes()][0];
1767	double[] vals = new double[train.numInstances()];
1768	for (int j = 0; j < train.numAttributes(); j++) {
1769	if (j != train.classIndex()) {
1770	weights[j] = new double[train.numInstances()];
1771	if (train.attribute(j).isNominal()) {
1772
1773	// Handling nominal attributes. Putting indices of
1774	// instances with missing values at the end.
1775	sortedIndices[j] = new int[train.numInstances()];
1776	int count = 0;
1777	for (int i = 0; i < train.numInstances(); i++) {
1778	Instance inst = train.instance(i);
1779	if (!inst.isMissing(j)) {
1780	sortedIndices[j][count] = i;
1781	weights[j][count] = inst.weight();
1782	count++;
1783	}
1784	}
1785	for (int i = 0; i < train.numInstances(); i++) {
1786	Instance inst = train.instance(i);
1787	if (inst.isMissing(j)) {
1788	sortedIndices[j][count] = i;
1789	weights[j][count] = inst.weight();
1790	count++;
1791	}
1792	}
1793	} else {
1794
1795	// Sorted indices are computed for numeric attributes
1796	for (int i = 0; i < train.numInstances(); i++) {
1797	Instance inst = train.instance(i);
1798	vals[i] = inst.value(j);
1799	}
1800	sortedIndices[j] = Utils.sort(vals);
1801	for (int i = 0; i < train.numInstances(); i++) {
1802	weights[j][i] = train.instance(sortedIndices[j][i]).weight();
1803	}
1804	}
1805	}
1806	}
1807
1808	// Compute initial class counts
1809	double[] classProbs = new double[train.numClasses()];
1810	double totalWeight = 0, totalSumSquared = 0;
1811	for (int i = 0; i < train.numInstances(); i++) {
1812	Instance inst = train.instance(i);
1813	if (data.classAttribute().isNominal()) {
1814	classProbs[(int)inst.classValue()] += inst.weight();
1815	totalWeight += inst.weight();
1816	} else {
1817	classProbs[0] += inst.classValue() * inst.weight();
1818	totalSumSquared += inst.classValue() * inst.classValue() * inst.weight();
1819	totalWeight += inst.weight();
1820	}
1821	}
1822	m_Tree = new Tree();
1823	double trainVariance = 0;
1824	if (data.classAttribute().isNumeric()) {
1825	trainVariance = m_Tree.
1826	singleVariance(classProbs[0], totalSumSquared, totalWeight) / totalWeight;
1827	classProbs[0] /= totalWeight;
1828	}
1829
1830	// Build tree
1831	m_Tree.buildTree(sortedIndices, weights, train, totalWeight, classProbs,
1832	new Instances(train, 0), m_MinNum, m_MinVarianceProp *
1833	trainVariance, 0, m_MaxDepth);
1834
1835	// Insert pruning data and perform reduced error pruning
1836	if (!m_NoPruning) {
1837	m_Tree.insertHoldOutSet(prune);
1838	m_Tree.reducedErrorPrune();
1839	m_Tree.backfitHoldOutSet(prune);
1840	}
1841	}
1842
1843	/**
1844	* Computes class distribution of an instance using the tree.
1845	*
1846	* @param instance the instance to compute the distribution for
1847	* @return the computed class probabilities
1848	* @throws Exception if computation fails
1849	*/
1850	public double[] distributionForInstance(Instance instance)
1851	throws Exception {
1852
1853	if (m_zeroR != null) {
1854	return m_zeroR.distributionForInstance(instance);
1855	} else {
1856	return m_Tree.distributionForInstance(instance);
1857	}
1858	}
1859
1860
1861	/**
1862	* For getting a unique ID when outputting the tree source
1863	* (hashcode isn't guaranteed unique)
1864	*/
1865	private static long PRINTED_NODES = 0;
1866
1867	/**
1868	* Gets the next unique node ID.
1869	*
1870	* @return the next unique node ID.
1871	*/
1872	protected static long nextID() {
1873
1874	return PRINTED_NODES ++;
1875	}
1876
1877	/**
1878	* resets the counter for the nodes
1879	*/
1880	protected static void resetID() {
1881	PRINTED_NODES = 0;
1882	}
1883
1884	/**
1885	* Returns the tree as if-then statements.
1886	*
1887	* @param className the name for the generated class
1888	* @return the tree as a Java if-then type statement
1889	* @throws Exception if something goes wrong
1890	*/
1891	public String toSource(String className)
1892	throws Exception {
1893
1894	if (m_Tree == null) {
1895	throw new Exception("REPTree: No model built yet.");
1896	}
1897	StringBuffer [] source = m_Tree.toSource(className, m_Tree);
1898	return
1899	"class " + className + " {\n\n"
1900	+" public static double classify(Object [] i)\n"
1901	+" throws Exception {\n\n"
1902	+" double p = Double.NaN;\n"
1903	+ source[0] // Assignment code
1904	+" return p;\n"
1905	+" }\n"
1906	+ source[1] // Support code
1907	+"}\n";
1908	}
1909
1910	/**
1911	* Returns the type of graph this classifier
1912	* represents.
1913	* @return Drawable.TREE
1914	*/
1915	public int graphType() {
1916	return Drawable.TREE;
1917	}
1918
1919	/**
1920	* Outputs the decision tree as a graph
1921	*
1922	* @return the tree as a graph
1923	* @throws Exception if generation fails
1924	*/
1925	public String graph() throws Exception {
1926
1927	if (m_Tree == null) {
1928	throw new Exception("REPTree: No model built yet.");
1929	}
1930	StringBuffer resultBuff = new StringBuffer();
1931	m_Tree.toGraph(resultBuff, 0, null);
1932	String result = "digraph Tree {\n" + "edge [style=bold]\n" + resultBuff.toString()
1933	+ "\n}\n";
1934	return result;
1935	}
1936
1937	/**
1938	* Outputs the decision tree.
1939	*
1940	* @return a string representation of the classifier
1941	*/
1942	public String toString() {
1943
1944	if (m_zeroR != null) {
1945	return "No attributes other than class. Using ZeroR.\n\n" + m_zeroR.toString();
1946	}
1947	if ((m_Tree == null)) {
1948	return "REPTree: No model built yet.";
1949	}
1950	return
1951	"\nREPTree\n============\n" + m_Tree.toString(0, null) + "\n" +
1952	"\nSize of the tree : " + numNodes();
1953	}
1954
1955	/**
1956	* Returns the revision string.
1957	*
1958	* @return the revision
1959	*/
1960	public String getRevision() {
1961	return RevisionUtils.extract("$Revision: 5928 $");
1962	}
1963
1964	/**
1965	* Main method for this class.
1966	*
1967	* @param argv the commandline options
1968	*/
1969	public static void main(String[] argv) {
1970	runClassifier(new REPTree(), argv);
1971	}
1972	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: