Context Navigation

source: src/main/java/weka/classifiers/trees/SimpleCart.java @ 9

Last change on this file since 9 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 58.7 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* SimpleCart.java
19	* Copyright (C) 2007 Haijian Shi
20	*
21	*/
22
23	package weka.classifiers.trees;
24
25	import weka.classifiers.Evaluation;
26	import weka.classifiers.RandomizableClassifier;
27	import weka.core.AdditionalMeasureProducer;
28	import weka.core.Attribute;
29	import weka.core.Capabilities;
30	import weka.core.Instance;
31	import weka.core.Instances;
32	import weka.core.Option;
33	import weka.core.RevisionUtils;
34	import weka.core.TechnicalInformation;
35	import weka.core.TechnicalInformationHandler;
36	import weka.core.Utils;
37	import weka.core.Capabilities.Capability;
38	import weka.core.TechnicalInformation.Field;
39	import weka.core.TechnicalInformation.Type;
40	import weka.core.matrix.Matrix;
41
42	import java.util.Arrays;
43	import java.util.Enumeration;
44	import java.util.Random;
45	import java.util.Vector;
46
47	/**
48	<!-- globalinfo-start -->
49	* Class implementing minimal cost-complexity pruning.<br/>
50	* Note when dealing with missing values, use "fractional instances" method instead of surrogate split method.<br/>
51	* <br/>
52	* For more information, see:<br/>
53	* <br/>
54	* Leo Breiman, Jerome H. Friedman, Richard A. Olshen, Charles J. Stone (1984). Classification and Regression Trees. Wadsworth International Group, Belmont, California.
55	* <p/>
56	<!-- globalinfo-end -->
57	*
58	<!-- technical-bibtex-start -->
59	* BibTeX:
60	* <pre>
61	* @book{Breiman1984,
62	* address = {Belmont, California},
63	* author = {Leo Breiman and Jerome H. Friedman and Richard A. Olshen and Charles J. Stone},
64	* publisher = {Wadsworth International Group},
65	* title = {Classification and Regression Trees},
66	* year = {1984}
67	* }
68	* </pre>
69	* <p/>
70	<!-- technical-bibtex-end -->
71	*
72	<!-- options-start -->
73	* Valid options are: <p/>
74	*
75	* <pre> -S <num>
76	* Random number seed.
77	* (default 1)</pre>
78	*
79	* <pre> -D
80	* If set, classifier is run in debug mode and
81	* may output additional info to the console</pre>
82	*
83	* <pre> -M <min no>
84	* The minimal number of instances at the terminal nodes.
85	* (default 2)</pre>
86	*
87	* <pre> -N <num folds>
88	* The number of folds used in the minimal cost-complexity pruning.
89	* (default 5)</pre>
90	*
91	* <pre> -U
92	* Don't use the minimal cost-complexity pruning.
93	* (default yes).</pre>
94	*
95	* <pre> -H
96	* Don't use the heuristic method for binary split.
97	* (default true).</pre>
98	*
99	* <pre> -A
100	* Use 1 SE rule to make pruning decision.
101	* (default no).</pre>
102	*
103	* <pre> -C
104	* Percentage of training data size (0-1].
105	* (default 1).</pre>
106	*
107	<!-- options-end -->
108	*
109	* @author Haijian Shi (hs69@cs.waikato.ac.nz)
110	* @version $Revision: 5987 $
111	*/
112	public class SimpleCart
113	extends RandomizableClassifier
114	implements AdditionalMeasureProducer, TechnicalInformationHandler {
115
116	/** For serialization. */
117	private static final long serialVersionUID = 4154189200352566053L;
118
119	/** Training data. */
120	protected Instances m_train;
121
122	/** Successor nodes. */
123	protected SimpleCart[] m_Successors;
124
125	/** Attribute used to split data. */
126	protected Attribute m_Attribute;
127
128	/** Split point for a numeric attribute. */
129	protected double m_SplitValue;
130
131	/** Split subset used to split data for nominal attributes. */
132	protected String m_SplitString;
133
134	/** Class value if the node is leaf. */
135	protected double m_ClassValue;
136
137	/** Class attribute of data. */
138	protected Attribute m_ClassAttribute;
139
140	/** Minimum number of instances at the terminal nodes. */
141	protected double m_minNumObj = 2;
142
143	/** Number of folds for minimal cost-complexity pruning. */
144	protected int m_numFoldsPruning = 5;
145
146	/** Alpha-value (for pruning) at the node. */
147	protected double m_Alpha;
148
149	/** Number of training examples misclassified by the model (subtree rooted). */
150	protected double m_numIncorrectModel;
151
152	/** Number of training examples misclassified by the model (subtree not rooted). */
153	protected double m_numIncorrectTree;
154
155	/** Indicate if the node is a leaf node. */
156	protected boolean m_isLeaf;
157
158	/** If use minimal cost-complexity pruning. */
159	protected boolean m_Prune = true;
160
161	/** Total number of instances used to build the classifier. */
162	protected int m_totalTrainInstances;
163
164	/** Proportion for each branch. */
165	protected double[] m_Props;
166
167	/** Class probabilities. */
168	protected double[] m_ClassProbs = null;
169
170	/** Distributions of leaf node (or temporary leaf node in minimal cost-complexity pruning) */
171	protected double[] m_Distribution;
172
173	/** If use heuristic search for nominal attributes in multi-class problems (default true). */
174	protected boolean m_Heuristic = true;
175
176	/** If use the 1SE rule to make final decision tree. */
177	protected boolean m_UseOneSE = false;
178
179	/** Training data size. */
180	protected double m_SizePer = 1;
181
182	/**
183	* Return a description suitable for displaying in the explorer/experimenter.
184	*
185	* @return a description suitable for displaying in the
186	* explorer/experimenter
187	*/
188	public String globalInfo() {
189	return
190	"Class implementing minimal cost-complexity pruning.\n"
191	+ "Note when dealing with missing values, use \"fractional "
192	+ "instances\" method instead of surrogate split method.\n\n"
193	+ "For more information, see:\n\n"
194	+ getTechnicalInformation().toString();
195	}
196
197	/**
198	* Returns an instance of a TechnicalInformation object, containing
199	* detailed information about the technical background of this class,
200	* e.g., paper reference or book this class is based on.
201	*
202	* @return the technical information about this class
203	*/
204	public TechnicalInformation getTechnicalInformation() {
205	TechnicalInformation result;
206
207	result = new TechnicalInformation(Type.BOOK);
208	result.setValue(Field.AUTHOR, "Leo Breiman and Jerome H. Friedman and Richard A. Olshen and Charles J. Stone");
209	result.setValue(Field.YEAR, "1984");
210	result.setValue(Field.TITLE, "Classification and Regression Trees");
211	result.setValue(Field.PUBLISHER, "Wadsworth International Group");
212	result.setValue(Field.ADDRESS, "Belmont, California");
213
214	return result;
215	}
216
217	/**
218	* Returns default capabilities of the classifier.
219	*
220	* @return the capabilities of this classifier
221	*/
222	public Capabilities getCapabilities() {
223	Capabilities result = super.getCapabilities();
224	result.disableAll();
225
226	// attributes
227	result.enable(Capability.NOMINAL_ATTRIBUTES);
228	result.enable(Capability.NUMERIC_ATTRIBUTES);
229	result.enable(Capability.MISSING_VALUES);
230
231	// class
232	result.enable(Capability.NOMINAL_CLASS);
233
234	return result;
235	}
236
237	/**
238	* Build the classifier.
239	*
240	* @param data the training instances
241	* @throws Exception if something goes wrong
242	*/
243	public void buildClassifier(Instances data) throws Exception {
244
245	getCapabilities().testWithFail(data);
246	data = new Instances(data);
247	data.deleteWithMissingClass();
248
249	// unpruned CART decision tree
250	if (!m_Prune) {
251
252	// calculate sorted indices and weights, and compute initial class counts.
253	int[][] sortedIndices = new int[data.numAttributes()][0];
254	double[][] weights = new double[data.numAttributes()][0];
255	double[] classProbs = new double[data.numClasses()];
256	double totalWeight = computeSortedInfo(data,sortedIndices, weights,classProbs);
257
258	makeTree(data, data.numInstances(),sortedIndices,weights,classProbs,
259	totalWeight,m_minNumObj, m_Heuristic);
260	return;
261	}
262
263	Random random = new Random(m_Seed);
264	Instances cvData = new Instances(data);
265	cvData.randomize(random);
266	cvData = new Instances(cvData,0,(int)(cvData.numInstances()*m_SizePer)-1);
267	cvData.stratify(m_numFoldsPruning);
268
269	double[][] alphas = new double[m_numFoldsPruning][];
270	double[][] errors = new double[m_numFoldsPruning][];
271
272	// calculate errors and alphas for each fold
273	for (int i = 0; i < m_numFoldsPruning; i++) {
274
275	//for every fold, grow tree on training set and fix error on test set.
276	Instances train = cvData.trainCV(m_numFoldsPruning, i);
277	Instances test = cvData.testCV(m_numFoldsPruning, i);
278
279	// calculate sorted indices and weights, and compute initial class counts for each fold
280	int[][] sortedIndices = new int[train.numAttributes()][0];
281	double[][] weights = new double[train.numAttributes()][0];
282	double[] classProbs = new double[train.numClasses()];
283	double totalWeight = computeSortedInfo(train,sortedIndices, weights,classProbs);
284
285	makeTree(train, train.numInstances(),sortedIndices,weights,classProbs,
286	totalWeight,m_minNumObj, m_Heuristic);
287
288	int numNodes = numInnerNodes();
289	alphas[i] = new double[numNodes + 2];
290	errors[i] = new double[numNodes + 2];
291
292	// prune back and log alpha-values and errors on test set
293	prune(alphas[i], errors[i], test);
294	}
295
296	// calculate sorted indices and weights, and compute initial class counts on all training instances
297	int[][] sortedIndices = new int[data.numAttributes()][0];
298	double[][] weights = new double[data.numAttributes()][0];
299	double[] classProbs = new double[data.numClasses()];
300	double totalWeight = computeSortedInfo(data,sortedIndices, weights,classProbs);
301
302	//build tree using all the data
303	makeTree(data, data.numInstances(),sortedIndices,weights,classProbs,
304	totalWeight,m_minNumObj, m_Heuristic);
305
306	int numNodes = numInnerNodes();
307
308	double[] treeAlphas = new double[numNodes + 2];
309
310	// prune back and log alpha-values
311	int iterations = prune(treeAlphas, null, null);
312
313	double[] treeErrors = new double[numNodes + 2];
314
315	// for each pruned subtree, find the cross-validated error
316	for (int i = 0; i <= iterations; i++){
317	//compute midpoint alphas
318	double alpha = Math.sqrt(treeAlphas[i] * treeAlphas[i+1]);
319	double error = 0;
320	for (int k = 0; k < m_numFoldsPruning; k++) {
321	int l = 0;
322	while (alphas[k][l] <= alpha) l++;
323	error += errors[k][l - 1];
324	}
325	treeErrors[i] = error/m_numFoldsPruning;
326	}
327
328	// find best alpha
329	int best = -1;
330	double bestError = Double.MAX_VALUE;
331	for (int i = iterations; i >= 0; i--) {
332	if (treeErrors[i] < bestError) {
333	bestError = treeErrors[i];
334	best = i;
335	}
336	}
337
338	// 1 SE rule to choose expansion
339	if (m_UseOneSE) {
340	double oneSE = Math.sqrt(bestError*(1-bestError)/(data.numInstances()));
341	for (int i = iterations; i >= 0; i--) {
342	if (treeErrors[i] <= bestError+oneSE) {
343	best = i;
344	break;
345	}
346	}
347	}
348
349	double bestAlpha = Math.sqrt(treeAlphas[best] * treeAlphas[best + 1]);
350
351	//"unprune" final tree (faster than regrowing it)
352	unprune();
353	prune(bestAlpha);
354	}
355
356	/**
357	* Make binary decision tree recursively.
358	*
359	* @param data the training instances
360	* @param totalInstances total number of instances
361	* @param sortedIndices sorted indices of the instances
362	* @param weights weights of the instances
363	* @param classProbs class probabilities
364	* @param totalWeight total weight of instances
365	* @param minNumObj minimal number of instances at leaf nodes
366	* @param useHeuristic if use heuristic search for nominal attributes in multi-class problem
367	* @throws Exception if something goes wrong
368	*/
369	protected void makeTree(Instances data, int totalInstances, int[][] sortedIndices,
370	double[][] weights, double[] classProbs, double totalWeight, double minNumObj,
371	boolean useHeuristic) throws Exception{
372
373	// if no instances have reached this node (normally won't happen)
374	if (totalWeight == 0){
375	m_Attribute = null;
376	m_ClassValue = Utils.missingValue();
377	m_Distribution = new double[data.numClasses()];
378	return;
379	}
380
381	m_totalTrainInstances = totalInstances;
382	m_isLeaf = true;
383
384	m_ClassProbs = new double[classProbs.length];
385	m_Distribution = new double[classProbs.length];
386	System.arraycopy(classProbs, 0, m_ClassProbs, 0, classProbs.length);
387	System.arraycopy(classProbs, 0, m_Distribution, 0, classProbs.length);
388	if (Utils.sum(m_ClassProbs)!=0) Utils.normalize(m_ClassProbs);
389
390	// Compute class distributions and value of splitting
391	// criterion for each attribute
392	double[][][] dists = new double[data.numAttributes()][0][0];
393	double[][] props = new double[data.numAttributes()][0];
394	double[][] totalSubsetWeights = new double[data.numAttributes()][2];
395	double[] splits = new double[data.numAttributes()];
396	String[] splitString = new String[data.numAttributes()];
397	double[] giniGains = new double[data.numAttributes()];
398
399	// for each attribute find split information
400	for (int i = 0; i < data.numAttributes(); i++) {
401	Attribute att = data.attribute(i);
402	if (i==data.classIndex()) continue;
403	if (att.isNumeric()) {
404	// numeric attribute
405	splits[i] = numericDistribution(props, dists, att, sortedIndices[i],
406	weights[i], totalSubsetWeights, giniGains, data);
407	} else {
408	// nominal attribute
409	splitString[i] = nominalDistribution(props, dists, att, sortedIndices[i],
410	weights[i], totalSubsetWeights, giniGains, data, useHeuristic);
411	}
412	}
413
414	// Find best attribute (split with maximum Gini gain)
415	int attIndex = Utils.maxIndex(giniGains);
416	m_Attribute = data.attribute(attIndex);
417
418	m_train = new Instances(data, sortedIndices[attIndex].length);
419	for (int i=0; i<sortedIndices[attIndex].length; i++) {
420	Instance inst = data.instance(sortedIndices[attIndex][i]);
421	Instance instCopy = (Instance)inst.copy();
422	instCopy.setWeight(weights[attIndex][i]);
423	m_train.add(instCopy);
424	}
425
426	// Check if node does not contain enough instances, or if it can not be split,
427	// or if it is pure. If does, make leaf.
428	if (totalWeight < 2 * minNumObj \|\| giniGains[attIndex]==0 \|\|
429	props[attIndex][0]==0 \|\| props[attIndex][1]==0) {
430	makeLeaf(data);
431	}
432
433	else {
434	m_Props = props[attIndex];
435	int[][][] subsetIndices = new int[2][data.numAttributes()][0];
436	double[][][] subsetWeights = new double[2][data.numAttributes()][0];
437
438	// numeric split
439	if (m_Attribute.isNumeric()) m_SplitValue = splits[attIndex];
440
441	// nominal split
442	else m_SplitString = splitString[attIndex];
443
444	splitData(subsetIndices, subsetWeights, m_Attribute, m_SplitValue,
445	m_SplitString, sortedIndices, weights, data);
446
447	// If split of the node results in a node with less than minimal number of isntances,
448	// make the node leaf node.
449	if (subsetIndices[0][attIndex].length<minNumObj \|\|
450	subsetIndices[1][attIndex].length<minNumObj) {
451	makeLeaf(data);
452	return;
453	}
454
455	// Otherwise, split the node.
456	m_isLeaf = false;
457	m_Successors = new SimpleCart[2];
458	for (int i = 0; i < 2; i++) {
459	m_Successors[i] = new SimpleCart();
460	m_Successors[i].makeTree(data, m_totalTrainInstances, subsetIndices[i],
461	subsetWeights[i],dists[attIndex][i], totalSubsetWeights[attIndex][i],
462	minNumObj, useHeuristic);
463	}
464	}
465	}
466
467	/**
468	* Prunes the original tree using the CART pruning scheme, given a
469	* cost-complexity parameter alpha.
470	*
471	* @param alpha the cost-complexity parameter
472	* @throws Exception if something goes wrong
473	*/
474	public void prune(double alpha) throws Exception {
475
476	Vector nodeList;
477
478	// determine training error of pruned subtrees (both with and without replacing a subtree),
479	// and calculate alpha-values from them
480	modelErrors();
481	treeErrors();
482	calculateAlphas();
483
484	// get list of all inner nodes in the tree
485	nodeList = getInnerNodes();
486
487	boolean prune = (nodeList.size() > 0);
488	double preAlpha = Double.MAX_VALUE;
489	while (prune) {
490
491	// select node with minimum alpha
492	SimpleCart nodeToPrune = nodeToPrune(nodeList);
493
494	// want to prune if its alpha is smaller than alpha
495	if (nodeToPrune.m_Alpha > alpha) {
496	break;
497	}
498
499	nodeToPrune.makeLeaf(nodeToPrune.m_train);
500
501	// normally would not happen
502	if (nodeToPrune.m_Alpha==preAlpha) {
503	nodeToPrune.makeLeaf(nodeToPrune.m_train);
504	treeErrors();
505	calculateAlphas();
506	nodeList = getInnerNodes();
507	prune = (nodeList.size() > 0);
508	continue;
509	}
510	preAlpha = nodeToPrune.m_Alpha;
511
512	//update tree errors and alphas
513	treeErrors();
514	calculateAlphas();
515
516	nodeList = getInnerNodes();
517	prune = (nodeList.size() > 0);
518	}
519	}
520
521	/**
522	* Method for performing one fold in the cross-validation of minimal
523	* cost-complexity pruning. Generates a sequence of alpha-values with error
524	* estimates for the corresponding (partially pruned) trees, given the test
525	* set of that fold.
526	*
527	* @param alphas array to hold the generated alpha-values
528	* @param errors array to hold the corresponding error estimates
529	* @param test test set of that fold (to obtain error estimates)
530	* @return the iteration of the pruning
531	* @throws Exception if something goes wrong
532	*/
533	public int prune(double[] alphas, double[] errors, Instances test)
534	throws Exception {
535
536	Vector nodeList;
537
538	// determine training error of subtrees (both with and without replacing a subtree),
539	// and calculate alpha-values from them
540	modelErrors();
541	treeErrors();
542	calculateAlphas();
543
544	// get list of all inner nodes in the tree
545	nodeList = getInnerNodes();
546
547	boolean prune = (nodeList.size() > 0);
548
549	//alpha_0 is always zero (unpruned tree)
550	alphas[0] = 0;
551
552	Evaluation eval;
553
554	// error of unpruned tree
555	if (errors != null) {
556	eval = new Evaluation(test);
557	eval.evaluateModel(this, test);
558	errors[0] = eval.errorRate();
559	}
560
561	int iteration = 0;
562	double preAlpha = Double.MAX_VALUE;
563	while (prune) {
564
565	iteration++;
566
567	// get node with minimum alpha
568	SimpleCart nodeToPrune = nodeToPrune(nodeList);
569
570	// do not set m_sons null, want to unprune
571	nodeToPrune.m_isLeaf = true;
572
573	// normally would not happen
574	if (nodeToPrune.m_Alpha==preAlpha) {
575	iteration--;
576	treeErrors();
577	calculateAlphas();
578	nodeList = getInnerNodes();
579	prune = (nodeList.size() > 0);
580	continue;
581	}
582
583	// get alpha-value of node
584	alphas[iteration] = nodeToPrune.m_Alpha;
585
586	// log error
587	if (errors != null) {
588	eval = new Evaluation(test);
589	eval.evaluateModel(this, test);
590	errors[iteration] = eval.errorRate();
591	}
592	preAlpha = nodeToPrune.m_Alpha;
593
594	//update errors/alphas
595	treeErrors();
596	calculateAlphas();
597
598	nodeList = getInnerNodes();
599	prune = (nodeList.size() > 0);
600	}
601
602	//set last alpha 1 to indicate end
603	alphas[iteration + 1] = 1.0;
604	return iteration;
605	}
606
607	/**
608	* Method to "unprune" the CART tree. Sets all leaf-fields to false.
609	* Faster than re-growing the tree because CART do not have to be fit again.
610	*/
611	protected void unprune() {
612	if (m_Successors != null) {
613	m_isLeaf = false;
614	for (int i = 0; i < m_Successors.length; i++) m_Successors[i].unprune();
615	}
616	}
617
618	/**
619	* Compute distributions, proportions and total weights of two successor
620	* nodes for a given numeric attribute.
621	*
622	* @param props proportions of each two branches for each attribute
623	* @param dists class distributions of two branches for each attribute
624	* @param att numeric att split on
625	* @param sortedIndices sorted indices of instances for the attribute
626	* @param weights weights of instances for the attribute
627	* @param subsetWeights total weight of two branches split based on the attribute
628	* @param giniGains Gini gains for each attribute
629	* @param data training instances
630	* @return the split point chosen for the given numeric attribute (NaN if no split was found)
631	* @throws Exception if something goes wrong
632	*/
633	protected double numericDistribution(double[][] props, double[][][] dists,
634	Attribute att, int[] sortedIndices, double[] weights, double[][] subsetWeights,
635	double[] giniGains, Instances data)
636	throws Exception {
637
638	double splitPoint = Double.NaN;
639	double[][] dist = null;
640	int numClasses = data.numClasses();
641	int i; // differ instances with or without missing values
642
643	double[][] currDist = new double[2][numClasses];
644	dist = new double[2][numClasses];
645
646	// Move all instances without missing values into second subset
647	double[] parentDist = new double[numClasses];
648	int missingStart = 0;
649	for (int j = 0; j < sortedIndices.length; j++) {
650	Instance inst = data.instance(sortedIndices[j]);
651	if (!inst.isMissing(att)) {
652	missingStart ++;
653	currDist[1][(int)inst.classValue()] += weights[j];
654	}
655	parentDist[(int)inst.classValue()] += weights[j];
656	}
657	System.arraycopy(currDist[1], 0, dist[1], 0, dist[1].length);
658
659	// Try all possible split points
660	double currSplit = data.instance(sortedIndices[0]).value(att);
661	double currGiniGain;
662	double bestGiniGain = -Double.MAX_VALUE;
663
664	for (i = 0; i < sortedIndices.length; i++) {
665	Instance inst = data.instance(sortedIndices[i]);
666	if (inst.isMissing(att)) {
667	break;
668	}
669	if (inst.value(att) > currSplit) {
670
671	double[][] tempDist = new double[2][numClasses];
672	for (int k=0; k<2; k++) {
673	//tempDist[k] = currDist[k];
674	System.arraycopy(currDist[k], 0, tempDist[k], 0, tempDist[k].length);
675	}
676
677	double[] tempProps = new double[2];
678	for (int k=0; k<2; k++) {
679	tempProps[k] = Utils.sum(tempDist[k]);
680	}
681
682	if (Utils.sum(tempProps) !=0) Utils.normalize(tempProps);
683
684	// split missing values
685	int index = missingStart;
686	while (index < sortedIndices.length) {
687	Instance insta = data.instance(sortedIndices[index]);
688	for (int j = 0; j < 2; j++) {
689	tempDist[j][(int)insta.classValue()] += tempProps[j] * weights[index];
690	}
691	index++;
692	}
693
694	currGiniGain = computeGiniGain(parentDist,tempDist);
695
696	if (currGiniGain > bestGiniGain) {
697	bestGiniGain = currGiniGain;
698
699	// clean split point
700	// splitPoint = Math.rint((inst.value(att) + currSplit)/2.0*100000)/100000.0;
701	splitPoint = (inst.value(att) + currSplit) / 2.0;
702
703	for (int j = 0; j < currDist.length; j++) {
704	System.arraycopy(tempDist[j], 0, dist[j], 0,
705	dist[j].length);
706	}
707	}
708	}
709	currSplit = inst.value(att);
710	currDist[0][(int)inst.classValue()] += weights[i];
711	currDist[1][(int)inst.classValue()] -= weights[i];
712	}
713
714	// Compute weights
715	int attIndex = att.index();
716	props[attIndex] = new double[2];
717	for (int k = 0; k < 2; k++) {
718	props[attIndex][k] = Utils.sum(dist[k]);
719	}
720	if (Utils.sum(props[attIndex]) != 0) Utils.normalize(props[attIndex]);
721
722	// Compute subset weights
723	subsetWeights[attIndex] = new double[2];
724	for (int j = 0; j < 2; j++) {
725	subsetWeights[attIndex][j] += Utils.sum(dist[j]);
726	}
727
728	// clean Gini gain
729	//giniGains[attIndex] = Math.rint(bestGiniGain*10000000)/10000000.0;
730	giniGains[attIndex] = bestGiniGain;
731	dists[attIndex] = dist;
732
733	return splitPoint;
734	}
735
736	/**
737	* Compute distributions, proportions and total weights of two successor
738	* nodes for a given nominal attribute.
739	*
740	* @param props proportions of each two branches for each attribute
741	* @param dists class distributions of two branches for each attribute
742	* @param att nominal att split on
743	* @param sortedIndices sorted indices of instances for the attribute
744	* @param weights weights of instances for the attribute
745	* @param subsetWeights total weight of two branches split based on the attribute
746	* @param giniGains Gini gains for each attribute
747	* @param data training instances
748	* @param useHeuristic if use heuristic search
749	* @return the split subset string chosen for the given nominal attribute (empty if no split is possible)
750	* @throws Exception if something goes wrong
751	*/
752	protected String nominalDistribution(double[][] props, double[][][] dists,
753	Attribute att, int[] sortedIndices, double[] weights, double[][] subsetWeights,
754	double[] giniGains, Instances data, boolean useHeuristic)
755	throws Exception {
756
757	String[] values = new String[att.numValues()];
758	int numCat = values.length; // number of values of the attribute
759	int numClasses = data.numClasses();
760
761	String bestSplitString = "";
762	double bestGiniGain = -Double.MAX_VALUE;
763
764	// class frequency for each value
765	int[] classFreq = new int[numCat];
766	for (int j=0; j<numCat; j++) classFreq[j] = 0;
767
768	double[] parentDist = new double[numClasses];
769	double[][] currDist = new double[2][numClasses];
770	double[][] dist = new double[2][numClasses];
771	int missingStart = 0;
772
773	for (int i = 0; i < sortedIndices.length; i++) {
774	Instance inst = data.instance(sortedIndices[i]);
775	if (!inst.isMissing(att)) {
776	missingStart++;
777	classFreq[(int)inst.value(att)] ++;
778	}
779	parentDist[(int)inst.classValue()] += weights[i];
780	}
781
782	// count the number of values that class frequency is not 0
783	int nonEmpty = 0;
784	for (int j=0; j<numCat; j++) {
785	if (classFreq[j]!=0) nonEmpty ++;
786	}
787
788	// attribute values that class frequency is not 0
789	String[] nonEmptyValues = new String[nonEmpty];
790	int nonEmptyIndex = 0;
791	for (int j=0; j<numCat; j++) {
792	if (classFreq[j]!=0) {
793	nonEmptyValues[nonEmptyIndex] = att.value(j);
794	nonEmptyIndex ++;
795	}
796	}
797
798	// attribute values that class frequency is 0
799	int empty = numCat - nonEmpty;
800	String[] emptyValues = new String[empty];
801	int emptyIndex = 0;
802	for (int j=0; j<numCat; j++) {
803	if (classFreq[j]==0) {
804	emptyValues[emptyIndex] = att.value(j);
805	emptyIndex ++;
806	}
807	}
808
809	if (nonEmpty<=1) {
810	giniGains[att.index()] = 0;
811	return "";
812	}
813
814	// for tow-class probloms
815	if (data.numClasses()==2) {
816
817	//// Firstly, for attribute values which class frequency is not zero
818
819	// probability of class 0 for each attribute value
820	double[] pClass0 = new double[nonEmpty];
821	// class distribution for each attribute value
822	double[][] valDist = new double[nonEmpty][2];
823
824	for (int j=0; j<nonEmpty; j++) {
825	for (int k=0; k<2; k++) {
826	valDist[j][k] = 0;
827	}
828	}
829
830	for (int i = 0; i < sortedIndices.length; i++) {
831	Instance inst = data.instance(sortedIndices[i]);
832	if (inst.isMissing(att)) {
833	break;
834	}
835
836	for (int j=0; j<nonEmpty; j++) {
837	if (att.value((int)inst.value(att)).compareTo(nonEmptyValues[j])==0) {
838	valDist[j][(int)inst.classValue()] += inst.weight();
839	break;
840	}
841	}
842	}
843
844	for (int j=0; j<nonEmpty; j++) {
845	double distSum = Utils.sum(valDist[j]);
846	if (distSum==0) pClass0[j]=0;
847	else pClass0[j] = valDist[j][0]/distSum;
848	}
849
850	// sort category according to the probability of the first class
851	String[] sortedValues = new String[nonEmpty];
852	for (int j=0; j<nonEmpty; j++) {
853	sortedValues[j] = nonEmptyValues[Utils.minIndex(pClass0)];
854	pClass0[Utils.minIndex(pClass0)] = Double.MAX_VALUE;
855	}
856
857	// Find a subset of attribute values that maximize Gini decrease
858
859	// for the attribute values that class frequency is not 0
860	String tempStr = "";
861
862	for (int j=0; j<nonEmpty-1; j++) {
863	currDist = new double[2][numClasses];
864	if (tempStr=="") tempStr="(" + sortedValues[j] + ")";
865	else tempStr += "\|"+ "(" + sortedValues[j] + ")";
866	for (int i=0; i<sortedIndices.length;i++) {
867	Instance inst = data.instance(sortedIndices[i]);
868	if (inst.isMissing(att)) {
869	break;
870	}
871
872	if (tempStr.indexOf
873	("(" + att.value((int)inst.value(att)) + ")")!=-1) {
874	currDist[0][(int)inst.classValue()] += weights[i];
875	} else currDist[1][(int)inst.classValue()] += weights[i];
876	}
877
878	double[][] tempDist = new double[2][numClasses];
879	for (int kk=0; kk<2; kk++) {
880	tempDist[kk] = currDist[kk];
881	}
882
883	double[] tempProps = new double[2];
884	for (int kk=0; kk<2; kk++) {
885	tempProps[kk] = Utils.sum(tempDist[kk]);
886	}
887
888	if (Utils.sum(tempProps)!=0) Utils.normalize(tempProps);
889
890	// split missing values
891	int mstart = missingStart;
892	while (mstart < sortedIndices.length) {
893	Instance insta = data.instance(sortedIndices[mstart]);
894	for (int jj = 0; jj < 2; jj++) {
895	tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
896	}
897	mstart++;
898	}
899
900	double currGiniGain = computeGiniGain(parentDist,tempDist);
901
902	if (currGiniGain>bestGiniGain) {
903	bestGiniGain = currGiniGain;
904	bestSplitString = tempStr;
905	for (int jj = 0; jj < 2; jj++) {
906	//dist[jj] = new double[currDist[jj].length];
907	System.arraycopy(tempDist[jj], 0, dist[jj], 0,
908	dist[jj].length);
909	}
910	}
911	}
912	}
913
914	// multi-class problems - exhaustive search
915	else if (!useHeuristic \|\| nonEmpty<=4) {
916
917	// Firstly, for attribute values which class frequency is not zero
918	for (int i=0; i<(int)Math.pow(2,nonEmpty-1); i++) {
919	String tempStr="";
920	currDist = new double[2][numClasses];
921	int mod;
922	int bit10 = i;
923	for (int j=nonEmpty-1; j>=0; j--) {
924	mod = bit10%2; // convert from 10bit to 2bit
925	if (mod==1) {
926	if (tempStr=="") tempStr = "("+nonEmptyValues[j]+")";
927	else tempStr += "\|" + "("+nonEmptyValues[j]+")";
928	}
929	bit10 = bit10/2;
930	}
931	for (int j=0; j<sortedIndices.length;j++) {
932	Instance inst = data.instance(sortedIndices[j]);
933	if (inst.isMissing(att)) {
934	break;
935	}
936
937	if (tempStr.indexOf("("+att.value((int)inst.value(att))+")")!=-1) {
938	currDist[0][(int)inst.classValue()] += weights[j];
939	} else currDist[1][(int)inst.classValue()] += weights[j];
940	}
941
942	double[][] tempDist = new double[2][numClasses];
943	for (int k=0; k<2; k++) {
944	tempDist[k] = currDist[k];
945	}
946
947	double[] tempProps = new double[2];
948	for (int k=0; k<2; k++) {
949	tempProps[k] = Utils.sum(tempDist[k]);
950	}
951
952	if (Utils.sum(tempProps)!=0) Utils.normalize(tempProps);
953
954	// split missing values
955	int index = missingStart;
956	while (index < sortedIndices.length) {
957	Instance insta = data.instance(sortedIndices[index]);
958	for (int j = 0; j < 2; j++) {
959	tempDist[j][(int)insta.classValue()] += tempProps[j] * weights[index];
960	}
961	index++;
962	}
963
964	double currGiniGain = computeGiniGain(parentDist,tempDist);
965
966	if (currGiniGain>bestGiniGain) {
967	bestGiniGain = currGiniGain;
968	bestSplitString = tempStr;
969	for (int j = 0; j < 2; j++) {
970	//dist[jj] = new double[currDist[jj].length];
971	System.arraycopy(tempDist[j], 0, dist[j], 0,
972	dist[j].length);
973	}
974	}
975	}
976	}
977
978	// huristic search to solve multi-classes problems
979	else {
980	// Firstly, for attribute values which class frequency is not zero
981	int n = nonEmpty;
982	int k = data.numClasses(); // number of classes of the data
983	double[][] P = new double[n][k]; // class probability matrix
984	int[] numInstancesValue = new int[n]; // number of instances for an attribute value
985	double[] meanClass = new double[k]; // vector of mean class probability
986	int numInstances = data.numInstances(); // total number of instances
987
988	// initialize the vector of mean class probability
989	for (int j=0; j<meanClass.length; j++) meanClass[j]=0;
990
991	for (int j=0; j<numInstances; j++) {
992	Instance inst = (Instance)data.instance(j);
993	int valueIndex = 0; // attribute value index in nonEmptyValues
994	for (int i=0; i<nonEmpty; i++) {
995	if (att.value((int)inst.value(att)).compareToIgnoreCase(nonEmptyValues[i])==0){
996	valueIndex = i;
997	break;
998	}
999	}
1000	P[valueIndex][(int)inst.classValue()]++;
1001	numInstancesValue[valueIndex]++;
1002	meanClass[(int)inst.classValue()]++;
1003	}
1004
1005	// calculate the class probability matrix
1006	for (int i=0; i<P.length; i++) {
1007	for (int j=0; j<P[0].length; j++) {
1008	if (numInstancesValue[i]==0) P[i][j]=0;
1009	else P[i][j]/=numInstancesValue[i];
1010	}
1011	}
1012
1013	//calculate the vector of mean class probability
1014	for (int i=0; i<meanClass.length; i++) {
1015	meanClass[i]/=numInstances;
1016	}
1017
1018	// calculate the covariance matrix
1019	double[][] covariance = new double[k][k];
1020	for (int i1=0; i1<k; i1++) {
1021	for (int i2=0; i2<k; i2++) {
1022	double element = 0;
1023	for (int j=0; j<n; j++) {
1024	element += (P[j][i2]-meanClass[i2])*(P[j][i1]-meanClass[i1])
1025	*numInstancesValue[j];
1026	}
1027	covariance[i1][i2] = element;
1028	}
1029	}
1030
1031	Matrix matrix = new Matrix(covariance);
1032	weka.core.matrix.EigenvalueDecomposition eigen =
1033	new weka.core.matrix.EigenvalueDecomposition(matrix);
1034	double[] eigenValues = eigen.getRealEigenvalues();
1035
1036	// find index of the largest eigenvalue
1037	int index=0;
1038	double largest = eigenValues[0];
1039	for (int i=1; i<eigenValues.length; i++) {
1040	if (eigenValues[i]>largest) {
1041	index=i;
1042	largest = eigenValues[i];
1043	}
1044	}
1045
1046	// calculate the first principle component
1047	double[] FPC = new double[k];
1048	Matrix eigenVector = eigen.getV();
1049	double[][] vectorArray = eigenVector.getArray();
1050	for (int i=0; i<FPC.length; i++) {
1051	FPC[i] = vectorArray[i][index];
1052	}
1053
1054	// calculate the first principle component scores
1055	//System.out.println("the first principle component scores: ");
1056	double[] Sa = new double[n];
1057	for (int i=0; i<Sa.length; i++) {
1058	Sa[i]=0;
1059	for (int j=0; j<k; j++) {
1060	Sa[i] += FPC[j]*P[i][j];
1061	}
1062	}
1063
1064	// sort category according to Sa(s)
1065	double[] pCopy = new double[n];
1066	System.arraycopy(Sa,0,pCopy,0,n);
1067	String[] sortedValues = new String[n];
1068	Arrays.sort(Sa);
1069
1070	for (int j=0; j<n; j++) {
1071	sortedValues[j] = nonEmptyValues[Utils.minIndex(pCopy)];
1072	pCopy[Utils.minIndex(pCopy)] = Double.MAX_VALUE;
1073	}
1074
1075	// for the attribute values that class frequency is not 0
1076	String tempStr = "";
1077
1078	for (int j=0; j<nonEmpty-1; j++) {
1079	currDist = new double[2][numClasses];
1080	if (tempStr=="") tempStr="(" + sortedValues[j] + ")";
1081	else tempStr += "\|"+ "(" + sortedValues[j] + ")";
1082	for (int i=0; i<sortedIndices.length;i++) {
1083	Instance inst = data.instance(sortedIndices[i]);
1084	if (inst.isMissing(att)) {
1085	break;
1086	}
1087
1088	if (tempStr.indexOf
1089	("(" + att.value((int)inst.value(att)) + ")")!=-1) {
1090	currDist[0][(int)inst.classValue()] += weights[i];
1091	} else currDist[1][(int)inst.classValue()] += weights[i];
1092	}
1093
1094	double[][] tempDist = new double[2][numClasses];
1095	for (int kk=0; kk<2; kk++) {
1096	tempDist[kk] = currDist[kk];
1097	}
1098
1099	double[] tempProps = new double[2];
1100	for (int kk=0; kk<2; kk++) {
1101	tempProps[kk] = Utils.sum(tempDist[kk]);
1102	}
1103
1104	if (Utils.sum(tempProps)!=0) Utils.normalize(tempProps);
1105
1106	// split missing values
1107	int mstart = missingStart;
1108	while (mstart < sortedIndices.length) {
1109	Instance insta = data.instance(sortedIndices[mstart]);
1110	for (int jj = 0; jj < 2; jj++) {
1111	tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
1112	}
1113	mstart++;
1114	}
1115
1116	double currGiniGain = computeGiniGain(parentDist,tempDist);
1117
1118	if (currGiniGain>bestGiniGain) {
1119	bestGiniGain = currGiniGain;
1120	bestSplitString = tempStr;
1121	for (int jj = 0; jj < 2; jj++) {
1122	//dist[jj] = new double[currDist[jj].length];
1123	System.arraycopy(tempDist[jj], 0, dist[jj], 0,
1124	dist[jj].length);
1125	}
1126	}
1127	}
1128	}
1129
1130	// Compute weights
1131	int attIndex = att.index();
1132	props[attIndex] = new double[2];
1133	for (int k = 0; k < 2; k++) {
1134	props[attIndex][k] = Utils.sum(dist[k]);
1135	}
1136
1137	if (!(Utils.sum(props[attIndex]) > 0)) {
1138	for (int k = 0; k < props[attIndex].length; k++) {
1139	props[attIndex][k] = 1.0 / (double)props[attIndex].length;
1140	}
1141	} else {
1142	Utils.normalize(props[attIndex]);
1143	}
1144
1145
1146	// Compute subset weights
1147	subsetWeights[attIndex] = new double[2];
1148	for (int j = 0; j < 2; j++) {
1149	subsetWeights[attIndex][j] += Utils.sum(dist[j]);
1150	}
1151
1152	// Then, for the attribute values that class frequency is 0, split it into the
1153	// most frequent branch
1154	for (int j=0; j<empty; j++) {
1155	if (props[attIndex][0]>=props[attIndex][1]) {
1156	if (bestSplitString=="") bestSplitString = "(" + emptyValues[j] + ")";
1157	else bestSplitString += "\|" + "(" + emptyValues[j] + ")";
1158	}
1159	}
1160
1161	// clean Gini gain for the attribute
1162	//giniGains[attIndex] = Math.rint(bestGiniGain*10000000)/10000000.0;
1163	giniGains[attIndex] = bestGiniGain;
1164
1165	dists[attIndex] = dist;
1166	return bestSplitString;
1167	}
1168
1169
1170	/**
1171	* Split data into two subsets and store sorted indices and weights for two
1172	* successor nodes.
1173	*
1174	* @param subsetIndices sorted indices of instances for each attribute
1175	* for two successor node
1176	* @param subsetWeights weights of instances for each attribute for
1177	* two successor node
1178	* @param att attribute the split based on
1179	* @param splitPoint split point the split based on if att is numeric
1180	* @param splitStr split subset the split based on if att is nominal
1181	* @param sortedIndices sorted indices of the instances to be split
1182	* @param weights weights of the instances to be split
1183	* @param data training data
1184	* @throws Exception if something goes wrong
1185	*/
1186	protected void splitData(int[][][] subsetIndices, double[][][] subsetWeights,
1187	Attribute att, double splitPoint, String splitStr, int[][] sortedIndices,
1188	double[][] weights, Instances data) throws Exception {
1189
1190	int j;
1191	// For each attribute
1192	for (int i = 0; i < data.numAttributes(); i++) {
1193	if (i==data.classIndex()) continue;
1194	int[] num = new int[2];
1195	for (int k = 0; k < 2; k++) {
1196	subsetIndices[k][i] = new int[sortedIndices[i].length];
1197	subsetWeights[k][i] = new double[weights[i].length];
1198	}
1199
1200	for (j = 0; j < sortedIndices[i].length; j++) {
1201	Instance inst = data.instance(sortedIndices[i][j]);
1202	if (inst.isMissing(att)) {
1203	// Split instance up
1204	for (int k = 0; k < 2; k++) {
1205	if (m_Props[k] > 0) {
1206	subsetIndices[k][i][num[k]] = sortedIndices[i][j];
1207	subsetWeights[k][i][num[k]] = m_Props[k] * weights[i][j];
1208	num[k]++;
1209	}
1210	}
1211	} else {
1212	int subset;
1213	if (att.isNumeric()) {
1214	subset = (inst.value(att) < splitPoint) ? 0 : 1;
1215	} else { // nominal attribute
1216	if (splitStr.indexOf
1217	("(" + att.value((int)inst.value(att.index()))+")")!=-1) {
1218	subset = 0;
1219	} else subset = 1;
1220	}
1221	subsetIndices[subset][i][num[subset]] = sortedIndices[i][j];
1222	subsetWeights[subset][i][num[subset]] = weights[i][j];
1223	num[subset]++;
1224	}
1225	}
1226
1227	// Trim arrays
1228	for (int k = 0; k < 2; k++) {
1229	int[] copy = new int[num[k]];
1230	System.arraycopy(subsetIndices[k][i], 0, copy, 0, num[k]);
1231	subsetIndices[k][i] = copy;
1232	double[] copyWeights = new double[num[k]];
1233	System.arraycopy(subsetWeights[k][i], 0 ,copyWeights, 0, num[k]);
1234	subsetWeights[k][i] = copyWeights;
1235	}
1236	}
1237	}
1238
1239	/**
1240	* Updates the numIncorrectModel field for all nodes when subtree (to be
1241	* pruned) is rooted. This is needed for calculating the alpha-values.
1242	*
1243	* @throws Exception if something goes wrong
1244	*/
1245	public void modelErrors() throws Exception{
1246	Evaluation eval = new Evaluation(m_train);
1247
1248	if (!m_isLeaf) {
1249	m_isLeaf = true; //temporarily make leaf
1250
1251	// calculate distribution for evaluation
1252	eval.evaluateModel(this, m_train);
1253	m_numIncorrectModel = eval.incorrect();
1254
1255	m_isLeaf = false;
1256
1257	for (int i = 0; i < m_Successors.length; i++)
1258	m_Successors[i].modelErrors();
1259
1260	} else {
1261	eval.evaluateModel(this, m_train);
1262	m_numIncorrectModel = eval.incorrect();
1263	}
1264	}
1265
1266	/**
1267	* Updates the numIncorrectTree field for all nodes. This is needed for
1268	* calculating the alpha-values.
1269	*
1270	* @throws Exception if something goes wrong
1271	*/
1272	public void treeErrors() throws Exception {
1273	if (m_isLeaf) {
1274	m_numIncorrectTree = m_numIncorrectModel;
1275	} else {
1276	m_numIncorrectTree = 0;
1277	for (int i = 0; i < m_Successors.length; i++) {
1278	m_Successors[i].treeErrors();
1279	m_numIncorrectTree += m_Successors[i].m_numIncorrectTree;
1280	}
1281	}
1282	}
1283
1284	/**
1285	* Updates the alpha field for all nodes.
1286	*
1287	* @throws Exception if something goes wrong
1288	*/
1289	public void calculateAlphas() throws Exception {
1290
1291	if (!m_isLeaf) {
1292	double errorDiff = m_numIncorrectModel - m_numIncorrectTree;
1293	if (errorDiff <=0) {
1294	//split increases training error (should not normally happen).
1295	//prune it instantly.
1296	makeLeaf(m_train);
1297	m_Alpha = Double.MAX_VALUE;
1298	} else {
1299	//compute alpha
1300	errorDiff /= m_totalTrainInstances;
1301	m_Alpha = errorDiff / (double)(numLeaves() - 1);
1302	long alphaLong = Math.round(m_Alpha*Math.pow(10,10));
1303	m_Alpha = (double)alphaLong/Math.pow(10,10);
1304	for (int i = 0; i < m_Successors.length; i++) {
1305	m_Successors[i].calculateAlphas();
1306	}
1307	}
1308	} else {
1309	//alpha = infinite for leaves (do not want to prune)
1310	m_Alpha = Double.MAX_VALUE;
1311	}
1312	}
1313
1314	/**
1315	* Find the node with minimal alpha value. If two nodes have the same alpha,
1316	* choose the one with more leaf nodes.
1317	*
1318	* @param nodeList list of inner nodes
1319	* @return the node to be pruned
1320	*/
1321	protected SimpleCart nodeToPrune(Vector nodeList) {
1322	if (nodeList.size()==0) return null;
1323	if (nodeList.size()==1) return (SimpleCart)nodeList.elementAt(0);
1324	SimpleCart returnNode = (SimpleCart)nodeList.elementAt(0);
1325	double baseAlpha = returnNode.m_Alpha;
1326	for (int i=1; i<nodeList.size(); i++) {
1327	SimpleCart node = (SimpleCart)nodeList.elementAt(i);
1328	if (node.m_Alpha < baseAlpha) {
1329	baseAlpha = node.m_Alpha;
1330	returnNode = node;
1331	} else if (node.m_Alpha == baseAlpha) { // break tie
1332	if (node.numLeaves()>returnNode.numLeaves()) {
1333	returnNode = node;
1334	}
1335	}
1336	}
1337	return returnNode;
1338	}
1339
1340	/**
1341	* Compute sorted indices, weights and class probabilities for a given
1342	* dataset. Return total weights of the data at the node.
1343	*
1344	* @param data training data
1345	* @param sortedIndices sorted indices of instances at the node
1346	* @param weights weights of instances at the node
1347	* @param classProbs class probabilities at the node
1348	* @return total weights of instances at the node
1349	* @throws Exception if something goes wrong
1350	*/
1351	protected double computeSortedInfo(Instances data, int[][] sortedIndices, double[][] weights,
1352	double[] classProbs) throws Exception {
1353
1354	// Create array of sorted indices and weights
1355	double[] vals = new double[data.numInstances()];
1356	for (int j = 0; j < data.numAttributes(); j++) {
1357	if (j==data.classIndex()) continue;
1358	weights[j] = new double[data.numInstances()];
1359
1360	if (data.attribute(j).isNominal()) {
1361
1362	// Handling nominal attributes. Putting indices of
1363	// instances with missing values at the end.
1364	sortedIndices[j] = new int[data.numInstances()];
1365	int count = 0;
1366	for (int i = 0; i < data.numInstances(); i++) {
1367	Instance inst = data.instance(i);
1368	if (!inst.isMissing(j)) {
1369	sortedIndices[j][count] = i;
1370	weights[j][count] = inst.weight();
1371	count++;
1372	}
1373	}
1374	for (int i = 0; i < data.numInstances(); i++) {
1375	Instance inst = data.instance(i);
1376	if (inst.isMissing(j)) {
1377	sortedIndices[j][count] = i;
1378	weights[j][count] = inst.weight();
1379	count++;
1380	}
1381	}
1382	} else {
1383
1384	// Sorted indices are computed for numeric attributes
1385	// missing values instances are put to end
1386	for (int i = 0; i < data.numInstances(); i++) {
1387	Instance inst = data.instance(i);
1388	vals[i] = inst.value(j);
1389	}
1390	sortedIndices[j] = Utils.sort(vals);
1391	for (int i = 0; i < data.numInstances(); i++) {
1392	weights[j][i] = data.instance(sortedIndices[j][i]).weight();
1393	}
1394	}
1395	}
1396
1397	// Compute initial class counts
1398	double totalWeight = 0;
1399	for (int i = 0; i < data.numInstances(); i++) {
1400	Instance inst = data.instance(i);
1401	classProbs[(int)inst.classValue()] += inst.weight();
1402	totalWeight += inst.weight();
1403	}
1404
1405	return totalWeight;
1406	}
1407
1408	/**
1409	* Compute and return gini gain for given distributions of a node and its
1410	* successor nodes.
1411	*
1412	* @param parentDist class distributions of parent node
1413	* @param childDist class distributions of successor nodes
1414	* @return Gini gain computed
1415	*/
1416	protected double computeGiniGain(double[] parentDist, double[][] childDist) {
1417	double totalWeight = Utils.sum(parentDist);
1418	if (totalWeight==0) return 0;
1419
1420	double leftWeight = Utils.sum(childDist[0]);
1421	double rightWeight = Utils.sum(childDist[1]);
1422
1423	double parentGini = computeGini(parentDist, totalWeight);
1424	double leftGini = computeGini(childDist[0],leftWeight);
1425	double rightGini = computeGini(childDist[1], rightWeight);
1426
1427	return parentGini - leftWeight/totalWeight*leftGini -
1428	rightWeight/totalWeight*rightGini;
1429	}
1430
1431	/**
1432	* Compute and return gini index for a given distribution of a node.
1433	*
1434	* @param dist class distributions
1435	* @param total total weight of the class distribution
1436	* @return Gini index of the class distributions
1437	*/
1438	protected double computeGini(double[] dist, double total) {
1439	if (total==0) return 0;
1440	double val = 0;
1441	for (int i=0; i<dist.length; i++) {
1442	val += (dist[i]/total)*(dist[i]/total);
1443	}
1444	return 1- val;
1445	}
1446
1447	/**
1448	* Computes class probabilities for instance using the decision tree.
1449	*
1450	* @param instance the instance for which class probabilities is to be computed
1451	* @return the class probabilities for the given instance
1452	* @throws Exception if something goes wrong
1453	*/
1454	public double[] distributionForInstance(Instance instance)
1455	throws Exception {
1456	if (!m_isLeaf) {
1457	// value of split attribute is missing
1458	if (instance.isMissing(m_Attribute)) {
1459	double[] returnedDist = new double[m_ClassProbs.length];
1460
1461	for (int i = 0; i < m_Successors.length; i++) {
1462	double[] help =
1463	m_Successors[i].distributionForInstance(instance);
1464	if (help != null) {
1465	for (int j = 0; j < help.length; j++) {
1466	returnedDist[j] += m_Props[i] * help[j];
1467	}
1468	}
1469	}
1470	return returnedDist;
1471	}
1472
1473	// split attribute is nonimal
1474	else if (m_Attribute.isNominal()) {
1475	if (m_SplitString.indexOf("(" +
1476	m_Attribute.value((int)instance.value(m_Attribute)) + ")")!=-1)
1477	return m_Successors[0].distributionForInstance(instance);
1478	else return m_Successors[1].distributionForInstance(instance);
1479	}
1480
1481	// split attribute is numeric
1482	else {
1483	if (instance.value(m_Attribute) < m_SplitValue)
1484	return m_Successors[0].distributionForInstance(instance);
1485	else
1486	return m_Successors[1].distributionForInstance(instance);
1487	}
1488	}
1489
1490	// leaf node
1491	else return m_ClassProbs;
1492	}
1493
1494	/**
1495	* Make the node leaf node.
1496	*
1497	* @param data training data
1498	*/
1499	protected void makeLeaf(Instances data) {
1500	m_Attribute = null;
1501	m_isLeaf = true;
1502	m_ClassValue=Utils.maxIndex(m_ClassProbs);
1503	m_ClassAttribute = data.classAttribute();
1504	}
1505
1506	/**
1507	* Prints the decision tree using the protected toString method from below.
1508	*
1509	* @return a textual description of the classifier
1510	*/
1511	public String toString() {
1512	if ((m_ClassProbs == null) && (m_Successors == null)) {
1513	return "CART Tree: No model built yet.";
1514	}
1515
1516	return "CART Decision Tree\n" + toString(0)+"\n\n"
1517	+"Number of Leaf Nodes: "+numLeaves()+"\n\n" +
1518	"Size of the Tree: "+numNodes();
1519	}
1520
1521	/**
1522	* Outputs a tree at a certain level.
1523	*
1524	* @param level the level at which the tree is to be printed
1525	* @return a tree at a certain level
1526	*/
1527	protected String toString(int level) {
1528
1529	StringBuffer text = new StringBuffer();
1530	// if leaf nodes
1531	if (m_Attribute == null) {
1532	if (Utils.isMissingValue(m_ClassValue)) {
1533	text.append(": null");
1534	} else {
1535	double correctNum = (int)(m_Distribution[Utils.maxIndex(m_Distribution)]*100)/
1536	100.0;
1537	double wrongNum = (int)((Utils.sum(m_Distribution) -
1538	m_Distribution[Utils.maxIndex(m_Distribution)])*100)/100.0;
1539	String str = "(" + correctNum + "/" + wrongNum + ")";
1540	text.append(": " + m_ClassAttribute.value((int) m_ClassValue)+ str);
1541	}
1542	} else {
1543	for (int j = 0; j < 2; j++) {
1544	text.append("\n");
1545	for (int i = 0; i < level; i++) {
1546	text.append("\| ");
1547	}
1548	if (j==0) {
1549	if (m_Attribute.isNumeric())
1550	text.append(m_Attribute.name() + " < " + m_SplitValue);
1551	else
1552	text.append(m_Attribute.name() + "=" + m_SplitString);
1553	} else {
1554	if (m_Attribute.isNumeric())
1555	text.append(m_Attribute.name() + " >= " + m_SplitValue);
1556	else
1557	text.append(m_Attribute.name() + "!=" + m_SplitString);
1558	}
1559	text.append(m_Successors[j].toString(level + 1));
1560	}
1561	}
1562	return text.toString();
1563	}
1564
1565	/**
1566	* Compute size of the tree.
1567	*
1568	* @return size of the tree
1569	*/
1570	public int numNodes() {
1571	if (m_isLeaf) {
1572	return 1;
1573	} else {
1574	int size =1;
1575	for (int i=0;i<m_Successors.length;i++) {
1576	size+=m_Successors[i].numNodes();
1577	}
1578	return size;
1579	}
1580	}
1581
1582	/**
1583	* Method to count the number of inner nodes in the tree.
1584	*
1585	* @return the number of inner nodes
1586	*/
1587	public int numInnerNodes(){
1588	if (m_Attribute==null) return 0;
1589	int numNodes = 1;
1590	for (int i = 0; i < m_Successors.length; i++)
1591	numNodes += m_Successors[i].numInnerNodes();
1592	return numNodes;
1593	}
1594
1595	/**
1596	* Return a list of all inner nodes in the tree.
1597	*
1598	* @return the list of all inner nodes
1599	*/
1600	protected Vector getInnerNodes(){
1601	Vector nodeList = new Vector();
1602	fillInnerNodes(nodeList);
1603	return nodeList;
1604	}
1605
1606	/**
1607	* Fills a list with all inner nodes in the tree.
1608	*
1609	* @param nodeList the list to be filled
1610	*/
1611	protected void fillInnerNodes(Vector nodeList) {
1612	if (!m_isLeaf) {
1613	nodeList.add(this);
1614	for (int i = 0; i < m_Successors.length; i++)
1615	m_Successors[i].fillInnerNodes(nodeList);
1616	}
1617	}
1618
1619	/**
1620	* Compute number of leaf nodes.
1621	*
1622	* @return number of leaf nodes
1623	*/
1624	public int numLeaves() {
1625	if (m_isLeaf) return 1;
1626	else {
1627	int size=0;
1628	for (int i=0;i<m_Successors.length;i++) {
1629	size+=m_Successors[i].numLeaves();
1630	}
1631	return size;
1632	}
1633	}
1634
1635	/**
1636	* Returns an enumeration describing the available options.
1637	*
1638	* @return an enumeration of all the available options.
1639	*/
1640	public Enumeration listOptions() {
1641	Vector result;
1642	Enumeration en;
1643
1644	result = new Vector();
1645
1646	en = super.listOptions();
1647	while (en.hasMoreElements())
1648	result.addElement(en.nextElement());
1649
1650	result.addElement(new Option(
1651	"\tThe minimal number of instances at the terminal nodes.\n"
1652	+ "\t(default 2)",
1653	"M", 1, "-M <min no>"));
1654
1655	result.addElement(new Option(
1656	"\tThe number of folds used in the minimal cost-complexity pruning.\n"
1657	+ "\t(default 5)",
1658	"N", 1, "-N <num folds>"));
1659
1660	result.addElement(new Option(
1661	"\tDon't use the minimal cost-complexity pruning.\n"
1662	+ "\t(default yes).",
1663	"U", 0, "-U"));
1664
1665	result.addElement(new Option(
1666	"\tDon't use the heuristic method for binary split.\n"
1667	+ "\t(default true).",
1668	"H", 0, "-H"));
1669
1670	result.addElement(new Option(
1671	"\tUse 1 SE rule to make pruning decision.\n"
1672	+ "\t(default no).",
1673	"A", 0, "-A"));
1674
1675	result.addElement(new Option(
1676	"\tPercentage of training data size (0-1].\n"
1677	+ "\t(default 1).",
1678	"C", 1, "-C"));
1679
1680	return result.elements();
1681	}
1682
1683	/**
1684	* Parses a given list of options. <p/>
1685	*
1686	<!-- options-start -->
1687	* Valid options are: <p/>
1688	*
1689	* <pre> -S <num>
1690	* Random number seed.
1691	* (default 1)</pre>
1692	*
1693	* <pre> -D
1694	* If set, classifier is run in debug mode and
1695	* may output additional info to the console</pre>
1696	*
1697	* <pre> -M <min no>
1698	* The minimal number of instances at the terminal nodes.
1699	* (default 2)</pre>
1700	*
1701	* <pre> -N <num folds>
1702	* The number of folds used in the minimal cost-complexity pruning.
1703	* (default 5)</pre>
1704	*
1705	* <pre> -U
1706	* Don't use the minimal cost-complexity pruning.
1707	* (default yes).</pre>
1708	*
1709	* <pre> -H
1710	* Don't use the heuristic method for binary split.
1711	* (default true).</pre>
1712	*
1713	* <pre> -A
1714	* Use 1 SE rule to make pruning decision.
1715	* (default no).</pre>
1716	*
1717	* <pre> -C
1718	* Percentage of training data size (0-1].
1719	* (default 1).</pre>
1720	*
1721	<!-- options-end -->
1722	*
1723	* @param options the list of options as an array of strings
1724	* @throws Exception if an option is not supported
1725	*/
1726	public void setOptions(String[] options) throws Exception {
1727	String tmpStr;
1728
1729	super.setOptions(options);
1730
1731	tmpStr = Utils.getOption('M', options);
1732	if (tmpStr.length() != 0)
1733	setMinNumObj(Double.parseDouble(tmpStr));
1734	else
1735	setMinNumObj(2);
1736
1737	tmpStr = Utils.getOption('N', options);
1738	if (tmpStr.length()!=0)
1739	setNumFoldsPruning(Integer.parseInt(tmpStr));
1740	else
1741	setNumFoldsPruning(5);
1742
1743	setUsePrune(!Utils.getFlag('U',options));
1744	setHeuristic(!Utils.getFlag('H',options));
1745	setUseOneSE(Utils.getFlag('A',options));
1746
1747	tmpStr = Utils.getOption('C', options);
1748	if (tmpStr.length()!=0)
1749	setSizePer(Double.parseDouble(tmpStr));
1750	else
1751	setSizePer(1);
1752
1753	Utils.checkForRemainingOptions(options);
1754	}
1755
1756	/**
1757	* Gets the current settings of the classifier.
1758	*
1759	* @return the current setting of the classifier
1760	*/
1761	public String[] getOptions() {
1762	int i;
1763	Vector result;
1764	String[] options;
1765
1766	result = new Vector();
1767
1768	options = super.getOptions();
1769	for (i = 0; i < options.length; i++)
1770	result.add(options[i]);
1771
1772	result.add("-M");
1773	result.add("" + getMinNumObj());
1774
1775	result.add("-N");
1776	result.add("" + getNumFoldsPruning());
1777
1778	if (!getUsePrune())
1779	result.add("-U");
1780
1781	if (!getHeuristic())
1782	result.add("-H");
1783
1784	if (getUseOneSE())
1785	result.add("-A");
1786
1787	result.add("-C");
1788	result.add("" + getSizePer());
1789
1790	return (String[]) result.toArray(new String[result.size()]);
1791	}
1792
1793	/**
1794	* Return an enumeration of the measure names.
1795	*
1796	* @return an enumeration of the measure names
1797	*/
1798	public Enumeration enumerateMeasures() {
1799	Vector result = new Vector();
1800
1801	result.addElement("measureTreeSize");
1802
1803	return result.elements();
1804	}
1805
1806	/**
1807	* Return the size of the tree (its number of nodes).
1808	*
1809	* @return number of tree size
1810	*/
1811	public double measureTreeSize() {
1812	return numNodes();
1813	}
1814
1815	/**
1816	* Returns the value of the named measure.
1817	*
1818	* @param additionalMeasureName the name of the measure to query for its value
1819	* @return the value of the named measure
1820	* @throws IllegalArgumentException if the named measure is not supported
1821	*/
1822	public double getMeasure(String additionalMeasureName) {
1823	if (additionalMeasureName.compareToIgnoreCase("measureTreeSize") == 0) {
1824	return measureTreeSize();
1825	} else {
1826	throw new IllegalArgumentException(additionalMeasureName
1827	+ " not supported (Cart pruning)");
1828	}
1829	}
1830
1831	/**
1832	* Returns the tip text for this property
1833	*
1834	* @return tip text for this property suitable for
1835	* displaying in the explorer/experimenter gui
1836	*/
1837	public String minNumObjTipText() {
1838	return "The minimal number of observations at the terminal nodes (default 2).";
1839	}
1840
1841	/**
1842	* Set minimal number of instances at the terminal nodes.
1843	*
1844	* @param value minimal number of instances at the terminal nodes
1845	*/
1846	public void setMinNumObj(double value) {
1847	m_minNumObj = value;
1848	}
1849
1850	/**
1851	* Get minimal number of instances at the terminal nodes.
1852	*
1853	* @return minimal number of instances at the terminal nodes
1854	*/
1855	public double getMinNumObj() {
1856	return m_minNumObj;
1857	}
1858
1859	/**
1860	* Returns the tip text for this property
1861	*
1862	* @return tip text for this property suitable for
1863	* displaying in the explorer/experimenter gui
1864	*/
1865	public String numFoldsPruningTipText() {
1866	return "The number of folds in the internal cross-validation (default 5).";
1867	}
1868
1869	/**
1870	* Set number of folds in internal cross-validation.
1871	*
1872	* @param value number of folds in internal cross-validation.
1873	*/
1874	public void setNumFoldsPruning(int value) {
1875	m_numFoldsPruning = value;
1876	}
1877
1878	/**
1879	* Get number of folds in internal cross-validation.
1880	*
1881	* @return number of folds in internal cross-validation.
1882	*/
1883	public int getNumFoldsPruning() {
1884	return m_numFoldsPruning;
1885	}
1886
1887	/**
1888	* Return the tip text for this property
1889	*
1890	* @return tip text for this property suitable for displaying in
1891	* the explorer/experimenter gui.
1892	*/
1893	public String usePruneTipText() {
1894	return "Use minimal cost-complexity pruning (default yes).";
1895	}
1896
1897	/**
1898	* Set if use minimal cost-complexity pruning.
1899	*
1900	* @param value if use minimal cost-complexity pruning
1901	*/
1902	public void setUsePrune(boolean value) {
1903	m_Prune = value;
1904	}
1905
1906	/**
1907	* Get if use minimal cost-complexity pruning.
1908	*
1909	* @return if use minimal cost-complexity pruning
1910	*/
1911	public boolean getUsePrune() {
1912	return m_Prune;
1913	}
1914
1915	/**
1916	* Returns the tip text for this property
1917	*
1918	* @return tip text for this property suitable for
1919	* displaying in the explorer/experimenter gui.
1920	*/
1921	public String heuristicTipText() {
1922	return
1923	"If heuristic search is used for binary split for nominal attributes "
1924	+ "in multi-class problems (default yes).";
1925	}
1926
1927	/**
1928	* Set if use heuristic search for nominal attributes in multi-class problems.
1929	*
1930	* @param value if use heuristic search for nominal attributes in
1931	* multi-class problems
1932	*/
1933	public void setHeuristic(boolean value) {
1934	m_Heuristic = value;
1935	}
1936
1937	/**
1938	* Get if use heuristic search for nominal attributes in multi-class problems.
1939	*
1940	* @return if use heuristic search for nominal attributes in
1941	* multi-class problems
1942	*/
1943	public boolean getHeuristic() {return m_Heuristic;}
1944
1945	/**
1946	* Returns the tip text for this property
1947	*
1948	* @return tip text for this property suitable for
1949	* displaying in the explorer/experimenter gui.
1950	*/
1951	public String useOneSETipText() {
1952	return "Use the 1SE rule to make pruning decisoin.";
1953	}
1954
1955	/**
1956	* Set if use the 1SE rule to choose final model.
1957	*
1958	* @param value if use the 1SE rule to choose final model
1959	*/
1960	public void setUseOneSE(boolean value) {
1961	m_UseOneSE = value;
1962	}
1963
1964	/**
1965	* Get if use the 1SE rule to choose final model.
1966	*
1967	* @return if use the 1SE rule to choose final model
1968	*/
1969	public boolean getUseOneSE() {
1970	return m_UseOneSE;
1971	}
1972
1973	/**
1974	* Returns the tip text for this property
1975	*
1976	* @return tip text for this property suitable for
1977	* displaying in the explorer/experimenter gui.
1978	*/
1979	public String sizePerTipText() {
1980	return "The percentage of the training set size (0-1, 0 not included).";
1981	}
1982
1983	/**
1984	* Set training set size.
1985	*
1986	* @param value training set size
1987	*/
1988	public void setSizePer(double value) {
1989	if ((value <= 0) \|\| (value > 1))
1990	System.err.println(
1991	"The percentage of the training set size must be in range 0 to 1 "
1992	+ "(0 not included) - ignored!");
1993	else
1994	m_SizePer = value;
1995	}
1996
1997	/**
1998	* Get training set size.
1999	*
2000	* @return training set size
2001	*/
2002	public double getSizePer() {
2003	return m_SizePer;
2004	}
2005
2006	/**
2007	* Returns the revision string.
2008	*
2009	* @return the revision
2010	*/
2011	public String getRevision() {
2012	return RevisionUtils.extract("$Revision: 5987 $");
2013	}
2014
2015	/**
2016	* Main method.
2017	* @param args the options for the classifier
2018	*/
2019	public static void main(String[] args) {
2020	runClassifier(new SimpleCart(), args);
2021	}
2022	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: