Context Navigation

BFTree.java

Last change on this file was 29, checked in by gnappo, 14 years ago
Taggata versione per la demo e aggiunto branch.
File size: 83.2 KB

Rev	Line
[29]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* BFTree.java
	19	* Copyright (C) 2007 Haijian Shi
	20	*
	21	*/
	22
	23	package weka.classifiers.trees;
	24
	25	import weka.classifiers.Evaluation;
	26	import weka.classifiers.RandomizableClassifier;
	27	import weka.core.AdditionalMeasureProducer;
	28	import weka.core.Attribute;
	29	import weka.core.Capabilities;
	30	import weka.core.FastVector;
	31	import weka.core.Instance;
	32	import weka.core.Instances;
	33	import weka.core.Option;
	34	import weka.core.RevisionUtils;
	35	import weka.core.SelectedTag;
	36	import weka.core.Tag;
	37	import weka.core.TechnicalInformation;
	38	import weka.core.TechnicalInformationHandler;
	39	import weka.core.Utils;
	40	import weka.core.Capabilities.Capability;
	41	import weka.core.TechnicalInformation.Field;
	42	import weka.core.TechnicalInformation.Type;
	43	import weka.core.matrix.Matrix;
	44
	45	import java.util.Arrays;
	46	import java.util.Enumeration;
	47	import java.util.Random;
	48	import java.util.Vector;
	49
	50	/**
	51	<!-- globalinfo-start -->
	52	* Class for building a best-first decision tree classifier. This class uses binary split for both nominal and numeric attributes. For missing values, the method of 'fractional' instances is used.<br/>
	53	* <br/>
	54	* For more information, see:<br/>
	55	* <br/>
	56	* Haijian Shi (2007). Best-first decision tree learning. Hamilton, NZ.<br/>
	57	* <br/>
	58	* Jerome Friedman, Trevor Hastie, Robert Tibshirani (2000). Additive logistic regression : A statistical view of boosting. Annals of statistics. 28(2):337-407.
	59	* <p/>
	60	<!-- globalinfo-end -->
	61	*
	62	<!-- technical-bibtex-start -->
	63	* BibTeX:
	64	* <pre>
	65	* @mastersthesis{Shi2007,
	66	* address = {Hamilton, NZ},
	67	* author = {Haijian Shi},
	68	* note = {COMP594},
	69	* school = {University of Waikato},
	70	* title = {Best-first decision tree learning},
	71	* year = {2007}
	72	* }
	73	*
	74	* @article{Friedman2000,
	75	* author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
	76	* journal = {Annals of statistics},
	77	* number = {2},
	78	* pages = {337-407},
	79	* title = {Additive logistic regression : A statistical view of boosting},
	80	* volume = {28},
	81	* year = {2000},
	82	* ISSN = {0090-5364}
	83	* }
	84	* </pre>
	85	* <p/>
	86	<!-- technical-bibtex-end -->
	87	*
	88	<!-- options-start -->
	89	* Valid options are: <p/>
	90	*
	91	* <pre> -S <num>
	92	* Random number seed.
	93	* (default 1)</pre>
	94	*
	95	* <pre> -D
	96	* If set, classifier is run in debug mode and
	97	* may output additional info to the console</pre>
	98	*
	99	* <pre> -P <UNPRUNED\|POSTPRUNED\|PREPRUNED>
	100	* The pruning strategy.
	101	* (default: POSTPRUNED)</pre>
	102	*
	103	* <pre> -M <min no>
	104	* The minimal number of instances at the terminal nodes.
	105	* (default 2)</pre>
	106	*
	107	* <pre> -N <num folds>
	108	* The number of folds used in the pruning.
	109	* (default 5)</pre>
	110	*
	111	* <pre> -H
	112	* Don't use heuristic search for nominal attributes in multi-class
	113	* problem (default yes).
	114	* </pre>
	115	*
	116	* <pre> -G
	117	* Don't use Gini index for splitting (default yes),
	118	* if not information is used.</pre>
	119	*
	120	* <pre> -R
	121	* Don't use error rate in internal cross-validation (default yes),
	122	* but root mean squared error.</pre>
	123	*
	124	* <pre> -A
	125	* Use the 1 SE rule to make pruning decision.
	126	* (default no).</pre>
	127	*
	128	* <pre> -C
	129	* Percentage of training data size (0-1]
	130	* (default 1).</pre>
	131	*
	132	<!-- options-end -->
	133	*
	134	* @author Haijian Shi (hs69@cs.waikato.ac.nz)
	135	* @version $Revision: 5987 $
	136	*/
	137	public class BFTree
	138	extends RandomizableClassifier
	139	implements AdditionalMeasureProducer, TechnicalInformationHandler {
	140
	141	/** For serialization. */
	142	private static final long serialVersionUID = -7035607375962528217L;
	143
	144	/** pruning strategy: un-pruned */
	145	public static final int PRUNING_UNPRUNED = 0;
	146	/** pruning strategy: post-pruning */
	147	public static final int PRUNING_POSTPRUNING = 1;
	148	/** pruning strategy: pre-pruning */
	149	public static final int PRUNING_PREPRUNING = 2;
	150	/** pruning strategy */
	151	public static final Tag[] TAGS_PRUNING = {
	152	new Tag(PRUNING_UNPRUNED, "unpruned", "Un-pruned"),
	153	new Tag(PRUNING_POSTPRUNING, "postpruned", "Post-pruning"),
	154	new Tag(PRUNING_PREPRUNING, "prepruned", "Pre-pruning")
	155	};
	156
	157	/** the pruning strategy */
	158	protected int m_PruningStrategy = PRUNING_POSTPRUNING;
	159
	160	/** Successor nodes. */
	161	protected BFTree[] m_Successors;
	162
	163	/** Attribute used for splitting. */
	164	protected Attribute m_Attribute;
	165
	166	/** Split point (for numeric attributes). */
	167	protected double m_SplitValue;
	168
	169	/** Split subset (for nominal attributes). */
	170	protected String m_SplitString;
	171
	172	/** Class value for a node. */
	173	protected double m_ClassValue;
	174
	175	/** Class attribute of a dataset. */
	176	protected Attribute m_ClassAttribute;
	177
	178	/** Minimum number of instances at leaf nodes. */
	179	protected int m_minNumObj = 2;
	180
	181	/** Number of folds for the pruning. */
	182	protected int m_numFoldsPruning = 5;
	183
	184	/** If the ndoe is leaf node. */
	185	protected boolean m_isLeaf;
	186
	187	/** Number of expansions. */
	188	protected static int m_Expansion;
	189
	190	/** Fixed number of expansions (if no pruning method is used, its value is -1. Otherwise,
	191	* its value is gotten from internal cross-validation). */
	192	protected int m_FixedExpansion = -1;
	193
	194	/** If use huristic search for binary split (default true). Note even if its value is true, it is only
	195	* used when the number of values of a nominal attribute is larger than 4. */
	196	protected boolean m_Heuristic = true;
	197
	198	/** If use Gini index as the splitting criterion - default (if not, information is used). */
	199	protected boolean m_UseGini = true;
	200
	201	/** If use error rate in internal cross-validation to fix the number of expansions - default
	202	* (if not, root mean squared error is used). */
	203	protected boolean m_UseErrorRate = true;
	204
	205	/** If use the 1SE rule to make the decision. */
	206	protected boolean m_UseOneSE = false;
	207
	208	/** Class distributions. */
	209	protected double[] m_Distribution;
	210
	211	/** Branch proportions. */
	212	protected double[] m_Props;
	213
	214	/** Sorted indices. */
	215	protected int[][] m_SortedIndices;
	216
	217	/** Sorted weights. */
	218	protected double[][] m_Weights;
	219
	220	/** Distributions of each attribute for two successor nodes. */
	221	protected double[][][] m_Dists;
	222
	223	/** Class probabilities. */
	224	protected double[] m_ClassProbs;
	225
	226	/** Total weights. */
	227	protected double m_TotalWeight;
	228
	229	/** The training data size (0-1). Default 1. */
	230	protected double m_SizePer = 1;
	231
	232	/**
	233	* Returns a string describing classifier
	234	*
	235	* @return a description suitable for displaying in the
	236	* explorer/experimenter gui
	237	*/
	238	public String globalInfo() {
	239	return
	240	"Class for building a best-first decision tree classifier. "
	241	+ "This class uses binary split for both nominal and numeric attributes. "
	242	+ "For missing values, the method of 'fractional' instances is used.\n\n"
	243	+ "For more information, see:\n\n"
	244	+ getTechnicalInformation().toString();
	245	}
	246
	247	/**
	248	* Returns an instance of a TechnicalInformation object, containing
	249	* detailed information about the technical background of this class,
	250	* e.g., paper reference or book this class is based on.
	251	*
	252	* @return the technical information about this class
	253	*/
	254	public TechnicalInformation getTechnicalInformation() {
	255	TechnicalInformation result;
	256	TechnicalInformation additional;
	257
	258	result = new TechnicalInformation(Type.MASTERSTHESIS);
	259	result.setValue(Field.AUTHOR, "Haijian Shi");
	260	result.setValue(Field.YEAR, "2007");
	261	result.setValue(Field.TITLE, "Best-first decision tree learning");
	262	result.setValue(Field.SCHOOL, "University of Waikato");
	263	result.setValue(Field.ADDRESS, "Hamilton, NZ");
	264	result.setValue(Field.NOTE, "COMP594");
	265
	266	additional = result.add(Type.ARTICLE);
	267	additional.setValue(Field.AUTHOR, "Jerome Friedman and Trevor Hastie and Robert Tibshirani");
	268	additional.setValue(Field.YEAR, "2000");
	269	additional.setValue(Field.TITLE, "Additive logistic regression : A statistical view of boosting");
	270	additional.setValue(Field.JOURNAL, "Annals of statistics");
	271	additional.setValue(Field.VOLUME, "28");
	272	additional.setValue(Field.NUMBER, "2");
	273	additional.setValue(Field.PAGES, "337-407");
	274	additional.setValue(Field.ISSN, "0090-5364");
	275
	276	return result;
	277	}
	278
	279	/**
	280	* Returns default capabilities of the classifier.
	281	*
	282	* @return the capabilities of this classifier
	283	*/
	284	public Capabilities getCapabilities() {
	285	Capabilities result = super.getCapabilities();
	286	result.disableAll();
	287
	288	// attributes
	289	result.enable(Capability.NOMINAL_ATTRIBUTES);
	290	result.enable(Capability.NUMERIC_ATTRIBUTES);
	291	result.enable(Capability.MISSING_VALUES);
	292
	293	// class
	294	result.enable(Capability.NOMINAL_CLASS);
	295
	296	return result;
	297	}
	298
	299	/**
	300	* Method for building a BestFirst decision tree classifier.
	301	*
	302	* @param data set of instances serving as training data
	303	* @throws Exception if decision tree cannot be built successfully
	304	*/
	305	public void buildClassifier(Instances data) throws Exception {
	306
	307	getCapabilities().testWithFail(data);
	308	data = new Instances(data);
	309	data.deleteWithMissingClass();
	310
	311	// build an unpruned tree
	312	if (m_PruningStrategy == PRUNING_UNPRUNED) {
	313
	314	// calculate sorted indices, weights and initial class probabilities
	315	int[][] sortedIndices = new int[data.numAttributes()][0];
	316	double[][] weights = new double[data.numAttributes()][0];
	317	double[] classProbs = new double[data.numClasses()];
	318	double totalWeight = computeSortedInfo(data,sortedIndices, weights,classProbs);
	319
	320	// Compute information of the best split for this node (include split attribute,
	321	// split value and gini gain (or information gain)). At the same time, compute
	322	// variables dists, props and totalSubsetWeights.
	323	double[][][] dists = new double[data.numAttributes()][2][data.numClasses()];
	324	double[][] props = new double[data.numAttributes()][2];
	325	double[][] totalSubsetWeights = new double[data.numAttributes()][2];
	326	FastVector nodeInfo = computeSplitInfo(this, data, sortedIndices, weights, dists,
	327	props, totalSubsetWeights, m_Heuristic, m_UseGini);
	328
	329	// add the node (with all split info) into BestFirstElements
	330	FastVector BestFirstElements = new FastVector();
	331	BestFirstElements.addElement(nodeInfo);
	332
	333	// Make the best-first decision tree.
	334	int attIndex = ((Attribute)nodeInfo.elementAt(1)).index();
	335	m_Expansion = 0;
	336	makeTree(BestFirstElements, data, sortedIndices, weights, dists, classProbs,
	337	totalWeight, props[attIndex] ,m_minNumObj, m_Heuristic, m_UseGini, m_FixedExpansion);
	338
	339	return;
	340	}
	341
	342	// the following code is for pre-pruning and post-pruning methods
	343
	344	// Compute train data, test data, sorted indices, sorted weights, total weights,
	345	// class probabilities, class distributions, branch proportions and total subset
	346	// weights for root nodes of each fold for prepruning and postpruning.
	347	int expansion = 0;
	348
	349	Random random = new Random(m_Seed);
	350	Instances cvData = new Instances(data);
	351	cvData.randomize(random);
	352	cvData = new Instances(cvData,0,(int)(cvData.numInstances()*m_SizePer)-1);
	353	cvData.stratify(m_numFoldsPruning);
	354
	355	Instances[] train = new Instances[m_numFoldsPruning];
	356	Instances[] test = new Instances[m_numFoldsPruning];
	357	FastVector[] parallelBFElements = new FastVector [m_numFoldsPruning];
	358	BFTree[] m_roots = new BFTree[m_numFoldsPruning];
	359
	360	int[][][] sortedIndices = new int[m_numFoldsPruning][data.numAttributes()][0];
	361	double[][][] weights = new double[m_numFoldsPruning][data.numAttributes()][0];
	362	double[][] classProbs = new double[m_numFoldsPruning][data.numClasses()];
	363	double[] totalWeight = new double[m_numFoldsPruning];
	364
	365	double[][][][] dists =
	366	new double[m_numFoldsPruning][data.numAttributes()][2][data.numClasses()];
	367	double[][][] props =
	368	new double[m_numFoldsPruning][data.numAttributes()][2];
	369	double[][][] totalSubsetWeights =
	370	new double[m_numFoldsPruning][data.numAttributes()][2];
	371	FastVector[] nodeInfo = new FastVector[m_numFoldsPruning];
	372
	373	for (int i = 0; i < m_numFoldsPruning; i++) {
	374	train[i] = cvData.trainCV(m_numFoldsPruning, i);
	375	test[i] = cvData.testCV(m_numFoldsPruning, i);
	376	parallelBFElements[i] = new FastVector();
	377	m_roots[i] = new BFTree();
	378
	379	// calculate sorted indices, weights, initial class counts and total weights for each training data
	380	totalWeight[i] = computeSortedInfo(train[i],sortedIndices[i], weights[i],
	381	classProbs[i]);
	382
	383	// compute information of the best split for this node (include split attribute,
	384	// split value and gini gain (or information gain)) in this fold
	385	nodeInfo[i] = computeSplitInfo(m_roots[i], train[i], sortedIndices[i],
	386	weights[i], dists[i], props[i], totalSubsetWeights[i], m_Heuristic, m_UseGini);
	387
	388	// compute information for root nodes
	389
	390	int attIndex = ((Attribute)nodeInfo[i].elementAt(1)).index();
	391
	392	m_roots[i].m_SortedIndices = new int[sortedIndices[i].length][0];
	393	m_roots[i].m_Weights = new double[weights[i].length][0];
	394	m_roots[i].m_Dists = new double[dists[i].length][0][0];
	395	m_roots[i].m_ClassProbs = new double[classProbs[i].length];
	396	m_roots[i].m_Distribution = new double[classProbs[i].length];
	397	m_roots[i].m_Props = new double[2];
	398
	399	for (int j=0; j<m_roots[i].m_SortedIndices.length; j++) {
	400	m_roots[i].m_SortedIndices[j] = sortedIndices[i][j];
	401	m_roots[i].m_Weights[j] = weights[i][j];
	402	m_roots[i].m_Dists[j] = dists[i][j];
	403	}
	404
	405	System.arraycopy(classProbs[i], 0, m_roots[i].m_ClassProbs, 0,
	406	classProbs[i].length);
	407	if (Utils.sum(m_roots[i].m_ClassProbs)!=0)
	408	Utils.normalize(m_roots[i].m_ClassProbs);
	409
	410	System.arraycopy(classProbs[i], 0, m_roots[i].m_Distribution, 0,
	411	classProbs[i].length);
	412	System.arraycopy(props[i][attIndex], 0, m_roots[i].m_Props, 0,
	413	props[i][attIndex].length);
	414
	415	m_roots[i].m_TotalWeight = totalWeight[i];
	416
	417	parallelBFElements[i].addElement(nodeInfo[i]);
	418	}
	419
	420	// build a pre-pruned tree
	421	if (m_PruningStrategy == PRUNING_PREPRUNING) {
	422
	423	double previousError = Double.MAX_VALUE;
	424	double currentError = previousError;
	425	double minError = Double.MAX_VALUE;
	426	int minExpansion = 0;
	427	FastVector errorList = new FastVector();
	428	while(true) {
	429	// compute average error
	430	double expansionError = 0;
	431	int count = 0;
	432
	433	for (int i=0; i<m_numFoldsPruning; i++) {
	434	Evaluation eval;
	435
	436	// calculate error rate if only root node
	437	if (expansion==0) {
	438	m_roots[i].m_isLeaf = true;
	439	eval = new Evaluation(test[i]);
	440	eval.evaluateModel(m_roots[i], test[i]);
	441	if (m_UseErrorRate) expansionError += eval.errorRate();
	442	else expansionError += eval.rootMeanSquaredError();
	443	count ++;
	444	}
	445
	446	// make tree - expand one node at a time
	447	else {
	448	if (m_roots[i] == null) continue; // if the tree cannot be expanded, go to next fold
	449	m_roots[i].m_isLeaf = false;
	450	BFTree nodeToSplit = (BFTree)
	451	(((FastVector)(parallelBFElements[i].elementAt(0))).elementAt(0));
	452	if (!m_roots[i].makeTree(parallelBFElements[i], m_roots[i], train[i],
	453	nodeToSplit.m_SortedIndices, nodeToSplit.m_Weights,
	454	nodeToSplit.m_Dists, nodeToSplit.m_ClassProbs,
	455	nodeToSplit.m_TotalWeight, nodeToSplit.m_Props, m_minNumObj,
	456	m_Heuristic, m_UseGini)) {
	457	m_roots[i] = null; // cannot be expanded
	458	continue;
	459	}
	460	eval = new Evaluation(test[i]);
	461	eval.evaluateModel(m_roots[i], test[i]);
	462	if (m_UseErrorRate) expansionError += eval.errorRate();
	463	else expansionError += eval.rootMeanSquaredError();
	464	count ++;
	465	}
	466	}
	467
	468	// no tree can be expanded any more
	469	if (count==0) break;
	470
	471	expansionError /=count;
	472	errorList.addElement(new Double(expansionError));
	473	currentError = expansionError;
	474
	475	if (!m_UseOneSE) {
	476	if (currentError>previousError)
	477	break;
	478	}
	479
	480	else {
	481	if (expansionError < minError) {
	482	minError = expansionError;
	483	minExpansion = expansion;
	484	}
	485
	486	if (currentError>previousError) {
	487	double oneSE = Math.sqrt(minError*(1-minError)/
	488	data.numInstances());
	489	if (currentError > minError + oneSE) {
	490	break;
	491	}
	492	}
	493	}
	494
	495	expansion ++;
	496	previousError = currentError;
	497	}
	498
	499	if (!m_UseOneSE) expansion = expansion - 1;
	500	else {
	501	double oneSE = Math.sqrt(minError*(1-minError)/data.numInstances());
	502	for (int i=0; i<errorList.size(); i++) {
	503	double error = ((Double)(errorList.elementAt(i))).doubleValue();
	504	if (error<=minError + oneSE) { // && counts[i]>=m_numFoldsPruning/2) {
	505	expansion = i;
	506	break;
	507	}
	508	}
	509	}
	510	}
	511
	512	// build a postpruned tree
	513	else {
	514	FastVector[] modelError = new FastVector[m_numFoldsPruning];
	515
	516	// calculate error of each expansion for each fold
	517	for (int i = 0; i < m_numFoldsPruning; i++) {
	518	modelError[i] = new FastVector();
	519
	520	m_roots[i].m_isLeaf = true;
	521	Evaluation eval = new Evaluation(test[i]);
	522	eval.evaluateModel(m_roots[i], test[i]);
	523	double error;
	524	if (m_UseErrorRate) error = eval.errorRate();
	525	else error = eval.rootMeanSquaredError();
	526	modelError[i].addElement(new Double(error));
	527
	528	m_roots[i].m_isLeaf = false;
	529	BFTree nodeToSplit = (BFTree)
	530	(((FastVector)(parallelBFElements[i].elementAt(0))).elementAt(0));
	531
	532	m_roots[i].makeTree(parallelBFElements[i], m_roots[i], train[i], test[i],
	533	modelError[i],nodeToSplit.m_SortedIndices, nodeToSplit.m_Weights,
	534	nodeToSplit.m_Dists, nodeToSplit.m_ClassProbs,
	535	nodeToSplit.m_TotalWeight, nodeToSplit.m_Props, m_minNumObj,
	536	m_Heuristic, m_UseGini, m_UseErrorRate);
	537	m_roots[i] = null;
	538	}
	539
	540	// find the expansion with minimal error rate
	541	double minError = Double.MAX_VALUE;
	542
	543	int maxExpansion = modelError[0].size();
	544	for (int i=1; i<modelError.length; i++) {
	545	if (modelError[i].size()>maxExpansion)
	546	maxExpansion = modelError[i].size();
	547	}
	548
	549	double[] error = new double[maxExpansion];
	550	int[] counts = new int[maxExpansion];
	551	for (int i=0; i<maxExpansion; i++) {
	552	counts[i] = 0;
	553	error[i] = 0;
	554	for (int j=0; j<m_numFoldsPruning; j++) {
	555	if (i<modelError[j].size()) {
	556	error[i] += ((Double)modelError[j].elementAt(i)).doubleValue();
	557	counts[i]++;
	558	}
	559	}
	560	error[i] = error[i]/counts[i]; //average error for each expansion
	561
	562	if (error[i]<minError) {// && counts[i]>=m_numFoldsPruning/2) {
	563	minError = error[i];
	564	expansion = i;
	565	}
	566	}
	567
	568	// the 1 SE rule choosen
	569	if (m_UseOneSE) {
	570	double oneSE = Math.sqrt(minError*(1-minError)/
	571	data.numInstances());
	572	for (int i=0; i<maxExpansion; i++) {
	573	if (error[i]<=minError + oneSE) { // && counts[i]>=m_numFoldsPruning/2) {
	574	expansion = i;
	575	break;
	576	}
	577	}
	578	}
	579	}
	580
	581	// make tree on all data based on the expansion caculated
	582	// from cross-validation
	583
	584	// calculate sorted indices, weights and initial class counts
	585	int[][] prune_sortedIndices = new int[data.numAttributes()][0];
	586	double[][] prune_weights = new double[data.numAttributes()][0];
	587	double[] prune_classProbs = new double[data.numClasses()];
	588	double prune_totalWeight = computeSortedInfo(data, prune_sortedIndices,
	589	prune_weights, prune_classProbs);
	590
	591	// compute information of the best split for this node (include split attribute,
	592	// split value and gini gain)
	593	double[][][] prune_dists = new double[data.numAttributes()][2][data.numClasses()];
	594	double[][] prune_props = new double[data.numAttributes()][2];
	595	double[][] prune_totalSubsetWeights = new double[data.numAttributes()][2];
	596	FastVector prune_nodeInfo = computeSplitInfo(this, data, prune_sortedIndices,
	597	prune_weights, prune_dists, prune_props, prune_totalSubsetWeights, m_Heuristic,m_UseGini);
	598
	599	// add the root node (with its split info) to BestFirstElements
	600	FastVector BestFirstElements = new FastVector();
	601	BestFirstElements.addElement(prune_nodeInfo);
	602
	603	int attIndex = ((Attribute)prune_nodeInfo.elementAt(1)).index();
	604	m_Expansion = 0;
	605	makeTree(BestFirstElements, data, prune_sortedIndices, prune_weights, prune_dists,
	606	prune_classProbs, prune_totalWeight, prune_props[attIndex] ,m_minNumObj,
	607	m_Heuristic, m_UseGini, expansion);
	608	}
	609
	610	/**
	611	* Recursively build a best-first decision tree.
	612	* Method for building a Best-First tree for a given number of expansions.
	613	* preExpasion is -1 means that no expansion is specified (just for a
	614	* tree without any pruning method). Pre-pruning and post-pruning methods also
	615	* use this method to build the final tree on all training data based on the
	616	* expansion calculated from internal cross-validation.
	617	*
	618	* @param BestFirstElements list to store BFTree nodes
	619	* @param data training data
	620	* @param sortedIndices sorted indices of the instances
	621	* @param weights weights of the instances
	622	* @param dists class distributions for each attribute
	623	* @param classProbs class probabilities of this node
	624	* @param totalWeight total weight of this node (note if the node
	625	* can not split, this value is not calculated.)
	626	* @param branchProps proportions of two subbranches
	627	* @param minNumObj minimal number of instances at leaf nodes
	628	* @param useHeuristic if use heuristic search for nominal attributes
	629	* in multi-class problem
	630	* @param useGini if use Gini index as splitting criterion
	631	* @param preExpansion the number of expansions the tree to be expanded
	632	* @throws Exception if something goes wrong
	633	*/
	634	protected void makeTree(FastVector BestFirstElements,Instances data,
	635	int[][] sortedIndices, double[][] weights, double[][][] dists,
	636	double[] classProbs, double totalWeight, double[] branchProps,
	637	int minNumObj, boolean useHeuristic, boolean useGini, int preExpansion)
	638	throws Exception {
	639
	640	if (BestFirstElements.size()==0) return;
	641
	642	///////////////////////////////////////////////////////////////////////
	643	// All information about the node to split (the first BestFirst object in
	644	// BestFirstElements)
	645	FastVector firstElement = (FastVector)BestFirstElements.elementAt(0);
	646
	647	// split attribute
	648	Attribute att = (Attribute)firstElement.elementAt(1);
	649
	650	// info of split value or split string
	651	double splitValue = Double.NaN;
	652	String splitStr = null;
	653	if (att.isNumeric())
	654	splitValue = ((Double)firstElement.elementAt(2)).doubleValue();
	655	else {
	656	splitStr=((String)firstElement.elementAt(2)).toString();
	657	}
	658
	659	// the best gini gain or information gain of this node
	660	double gain = ((Double)firstElement.elementAt(3)).doubleValue();
	661	///////////////////////////////////////////////////////////////////////
	662
	663	if (m_ClassProbs==null) {
	664	m_SortedIndices = new int[sortedIndices.length][0];
	665	m_Weights = new double[weights.length][0];
	666	m_Dists = new double[dists.length][0][0];
	667	m_ClassProbs = new double[classProbs.length];
	668	m_Distribution = new double[classProbs.length];
	669	m_Props = new double[2];
	670
	671	for (int i=0; i<m_SortedIndices.length; i++) {
	672	m_SortedIndices[i] = sortedIndices[i];
	673	m_Weights[i] = weights[i];
	674	m_Dists[i] = dists[i];
	675	}
	676
	677	System.arraycopy(classProbs, 0, m_ClassProbs, 0, classProbs.length);
	678	System.arraycopy(classProbs, 0, m_Distribution, 0, classProbs.length);
	679	System.arraycopy(branchProps, 0, m_Props, 0, m_Props.length);
	680	m_TotalWeight = totalWeight;
	681	if (Utils.sum(m_ClassProbs)!=0) Utils.normalize(m_ClassProbs);
	682	}
	683
	684	// If no enough data or this node can not be split, find next node to split.
	685	if (totalWeight < 2*minNumObj \|\| branchProps[0]==0
	686	\|\| branchProps[1]==0) {
	687	// remove the first element
	688	BestFirstElements.removeElementAt(0);
	689
	690	makeLeaf(data);
	691	if (BestFirstElements.size()!=0) {
	692	FastVector nextSplitElement = (FastVector)BestFirstElements.elementAt(0);
	693	BFTree nextSplitNode = (BFTree)nextSplitElement.elementAt(0);
	694	nextSplitNode.makeTree(BestFirstElements,data,
	695	nextSplitNode.m_SortedIndices, nextSplitNode.m_Weights,
	696	nextSplitNode.m_Dists,
	697	nextSplitNode.m_ClassProbs, nextSplitNode.m_TotalWeight,
	698	nextSplitNode.m_Props, minNumObj, useHeuristic, useGini, preExpansion);
	699	}
	700	return;
	701	}
	702
	703	// If gini gain or information gain is 0, make all nodes in the BestFirstElements leaf nodes
	704	// because these nodes are sorted descendingly according to gini gain or information gain.
	705	// (namely, gini gain or information gain of all nodes in BestFirstEelements is 0).
	706	if (gain==0 \|\| preExpansion==m_Expansion) {
	707	for (int i=0; i<BestFirstElements.size(); i++) {
	708	FastVector element = (FastVector)BestFirstElements.elementAt(i);
	709	BFTree node = (BFTree)element.elementAt(0);
	710	node.makeLeaf(data);
	711	}
	712	BestFirstElements.removeAllElements();
	713	}
	714
	715	// gain is not 0
	716	else {
	717	// remove the first element
	718	BestFirstElements.removeElementAt(0);
	719
	720	m_Attribute = att;
	721	if (m_Attribute.isNumeric()) m_SplitValue = splitValue;
	722	else m_SplitString = splitStr;
	723
	724	int[][][] subsetIndices = new int[2][data.numAttributes()][0];
	725	double[][][] subsetWeights = new double[2][data.numAttributes()][0];
	726
	727	splitData(subsetIndices, subsetWeights, m_Attribute, m_SplitValue,
	728	m_SplitString, sortedIndices, weights, data);
	729
	730	// If split will generate node(s) which has total weights less than m_minNumObj,
	731	// do not split.
	732	int attIndex = att.index();
	733	if (subsetIndices[0][attIndex].length<minNumObj \|\|
	734	subsetIndices[1][attIndex].length<minNumObj) {
	735	makeLeaf(data);
	736	}
	737
	738	// split the node
	739	else {
	740	m_isLeaf = false;
	741	m_Attribute = att;
	742
	743	// if expansion is specified (if pruning method used)
	744	if ( (m_PruningStrategy == PRUNING_PREPRUNING)
	745	\|\| (m_PruningStrategy == PRUNING_POSTPRUNING)
	746	\|\| (preExpansion != -1))
	747	m_Expansion++;
	748
	749	makeSuccessors(BestFirstElements,data,subsetIndices,subsetWeights,dists,
	750	att,useHeuristic, useGini);
	751	}
	752
	753	// choose next node to split
	754	if (BestFirstElements.size()!=0) {
	755	FastVector nextSplitElement = (FastVector)BestFirstElements.elementAt(0);
	756	BFTree nextSplitNode = (BFTree)nextSplitElement.elementAt(0);
	757	nextSplitNode.makeTree(BestFirstElements,data,
	758	nextSplitNode.m_SortedIndices, nextSplitNode.m_Weights,
	759	nextSplitNode.m_Dists,
	760	nextSplitNode.m_ClassProbs, nextSplitNode.m_TotalWeight,
	761	nextSplitNode.m_Props, minNumObj, useHeuristic, useGini, preExpansion);
	762	}
	763
	764	}
	765	}
	766
	767	/**
	768	* This method is to find the number of expansions based on internal
	769	* cross-validation for just pre-pruning. It expands the first BestFirst
	770	* node in the BestFirstElements if it is expansible, otherwise it looks
	771	* for next exapansible node. If it finds a node is expansibel, expand the
	772	* node, then return true. (note it just expands one node at a time).
	773	*
	774	* @param BestFirstElements list to store BFTree nodes
	775	* @param root root node of tree in each fold
	776	* @param train training data
	777	* @param sortedIndices sorted indices of the instances
	778	* @param weights weights of the instances
	779	* @param dists class distributions for each attribute
	780	* @param classProbs class probabilities of this node
	781	* @param totalWeight total weight of this node (note if the node
	782	* can not split, this value is not calculated.)
	783	* @param branchProps proportions of two subbranches
	784	* @param minNumObj minimal number of instances at leaf nodes
	785	* @param useHeuristic if use heuristic search for nominal attributes
	786	* in multi-class problem
	787	* @param useGini if use Gini index as splitting criterion
	788	* @return true if expand successfully, otherwise return false
	789	* (all nodes in BestFirstElements cannot be
	790	* expanded).
	791	* @throws Exception if something goes wrong
	792	*/
	793	protected boolean makeTree(FastVector BestFirstElements, BFTree root,
	794	Instances train, int[][] sortedIndices, double[][] weights,
	795	double[][][] dists, double[] classProbs, double totalWeight,
	796	double[] branchProps, int minNumObj, boolean useHeuristic, boolean useGini)
	797	throws Exception {
	798
	799	if (BestFirstElements.size()==0) return false;
	800
	801	///////////////////////////////////////////////////////////////////////
	802	// All information about the node to split (first BestFirst object in
	803	// BestFirstElements)
	804	FastVector firstElement = (FastVector)BestFirstElements.elementAt(0);
	805
	806	// node to split
	807	BFTree nodeToSplit = (BFTree)firstElement.elementAt(0);
	808
	809	// split attribute
	810	Attribute att = (Attribute)firstElement.elementAt(1);
	811
	812	// info of split value or split string
	813	double splitValue = Double.NaN;
	814	String splitStr = null;
	815	if (att.isNumeric())
	816	splitValue = ((Double)firstElement.elementAt(2)).doubleValue();
	817	else {
	818	splitStr=((String)firstElement.elementAt(2)).toString();
	819	}
	820
	821	// the best gini gain or information gain of this node
	822	double gain = ((Double)firstElement.elementAt(3)).doubleValue();
	823	///////////////////////////////////////////////////////////////////////
	824
	825	// If no enough data to split for this node or this node can not be split find next node to split.
	826	if (totalWeight < 2*minNumObj \|\| branchProps[0]==0
	827	\|\| branchProps[1]==0) {
	828	// remove the first element
	829	BestFirstElements.removeElementAt(0);
	830	nodeToSplit.makeLeaf(train);
	831	BFTree nextNode = (BFTree)
	832	((FastVector)BestFirstElements.elementAt(0)).elementAt(0);
	833	return root.makeTree(BestFirstElements, root, train,
	834	nextNode.m_SortedIndices, nextNode.m_Weights, nextNode.m_Dists,
	835	nextNode.m_ClassProbs, nextNode.m_TotalWeight,
	836	nextNode.m_Props, minNumObj, useHeuristic, useGini);
	837	}
	838
	839	// If gini gain or information is 0, make all nodes in the BestFirstElements leaf nodes
	840	// because these node sorted descendingly according to gini gain or information gain.
	841	// (namely, gini gain or information gain of all nodes in BestFirstEelements is 0).
	842	if (gain==0) {
	843	for (int i=0; i<BestFirstElements.size(); i++) {
	844	FastVector element = (FastVector)BestFirstElements.elementAt(i);
	845	BFTree node = (BFTree)element.elementAt(0);
	846	node.makeLeaf(train);
	847	}
	848	BestFirstElements.removeAllElements();
	849	return false;
	850	}
	851
	852	else {
	853	// remove the first element
	854	BestFirstElements.removeElementAt(0);
	855	nodeToSplit.m_Attribute = att;
	856	if (att.isNumeric()) nodeToSplit.m_SplitValue = splitValue;
	857	else nodeToSplit.m_SplitString = splitStr;
	858
	859	int[][][] subsetIndices = new int[2][train.numAttributes()][0];
	860	double[][][] subsetWeights = new double[2][train.numAttributes()][0];
	861
	862	splitData(subsetIndices, subsetWeights, nodeToSplit.m_Attribute,
	863	nodeToSplit.m_SplitValue, nodeToSplit.m_SplitString,
	864	nodeToSplit.m_SortedIndices, nodeToSplit.m_Weights, train);
	865
	866	// if split will generate node(s) which has total weights less than m_minNumObj,
	867	// do not split
	868	int attIndex = att.index();
	869	if (subsetIndices[0][attIndex].length<minNumObj \|\|
	870	subsetIndices[1][attIndex].length<minNumObj) {
	871
	872	nodeToSplit.makeLeaf(train);
	873	BFTree nextNode = (BFTree)
	874	((FastVector)BestFirstElements.elementAt(0)).elementAt(0);
	875	return root.makeTree(BestFirstElements, root, train,
	876	nextNode.m_SortedIndices, nextNode.m_Weights, nextNode.m_Dists,
	877	nextNode.m_ClassProbs, nextNode.m_TotalWeight,
	878	nextNode.m_Props, minNumObj, useHeuristic, useGini);
	879	}
	880
	881	// split the node
	882	else {
	883	nodeToSplit.m_isLeaf = false;
	884	nodeToSplit.m_Attribute = att;
	885
	886	nodeToSplit.makeSuccessors(BestFirstElements,train,subsetIndices,
	887	subsetWeights,dists, nodeToSplit.m_Attribute,useHeuristic,useGini);
	888
	889	for (int i=0; i<2; i++){
	890	nodeToSplit.m_Successors[i].makeLeaf(train);
	891	}
	892
	893	return true;
	894	}
	895	}
	896	}
	897
	898	/**
	899	* This method is to find the number of expansions based on internal
	900	* cross-validation for just post-pruning. It expands the first BestFirst
	901	* node in the BestFirstElements until no node can be split. When building
	902	* the tree, stroe error for each temporary tree, namely for each expansion.
	903	*
	904	* @param BestFirstElements list to store BFTree nodes
	905	* @param root root node of tree in each fold
	906	* @param train training data in each fold
	907	* @param test test data in each fold
	908	* @param modelError list to store error for each expansion in
	909	* each fold
	910	* @param sortedIndices sorted indices of the instances
	911	* @param weights weights of the instances
	912	* @param dists class distributions for each attribute
	913	* @param classProbs class probabilities of this node
	914	* @param totalWeight total weight of this node (note if the node
	915	* can not split, this value is not calculated.)
	916	* @param branchProps proportions of two subbranches
	917	* @param minNumObj minimal number of instances at leaf nodes
	918	* @param useHeuristic if use heuristic search for nominal attributes
	919	* in multi-class problem
	920	* @param useGini if use Gini index as splitting criterion
	921	* @param useErrorRate if use error rate in internal cross-validation
	922	* @throws Exception if something goes wrong
	923	*/
	924	protected void makeTree(FastVector BestFirstElements, BFTree root,
	925	Instances train, Instances test, FastVector modelError, int[][] sortedIndices,
	926	double[][] weights, double[][][] dists, double[] classProbs, double totalWeight,
	927	double[] branchProps, int minNumObj, boolean useHeuristic, boolean useGini, boolean useErrorRate)
	928	throws Exception {
	929
	930	if (BestFirstElements.size()==0) return;
	931
	932	///////////////////////////////////////////////////////////////////////
	933	// All information about the node to split (first BestFirst object in
	934	// BestFirstElements)
	935	FastVector firstElement = (FastVector)BestFirstElements.elementAt(0);
	936
	937	// node to split
	938	//BFTree nodeToSplit = (BFTree)firstElement.elementAt(0);
	939
	940	// split attribute
	941	Attribute att = (Attribute)firstElement.elementAt(1);
	942
	943	// info of split value or split string
	944	double splitValue = Double.NaN;
	945	String splitStr = null;
	946	if (att.isNumeric())
	947	splitValue = ((Double)firstElement.elementAt(2)).doubleValue();
	948	else {
	949	splitStr=((String)firstElement.elementAt(2)).toString();
	950	}
	951
	952	// the best gini gain or information of this node
	953	double gain = ((Double)firstElement.elementAt(3)).doubleValue();
	954	///////////////////////////////////////////////////////////////////////
	955
	956	if (totalWeight < 2*minNumObj \|\| branchProps[0]==0
	957	\|\| branchProps[1]==0) {
	958	// remove the first element
	959	BestFirstElements.removeElementAt(0);
	960	makeLeaf(train);
	961	if (BestFirstElements.size() == 0) {
	962	return;
	963	}
	964
	965	BFTree nextSplitNode = (BFTree)
	966	((FastVector)BestFirstElements.elementAt(0)).elementAt(0);
	967	nextSplitNode.makeTree(BestFirstElements, root, train, test, modelError,
	968	nextSplitNode.m_SortedIndices, nextSplitNode.m_Weights,
	969	nextSplitNode.m_Dists, nextSplitNode.m_ClassProbs,
	970	nextSplitNode.m_TotalWeight, nextSplitNode.m_Props, minNumObj,
	971	useHeuristic, useGini, useErrorRate);
	972	return;
	973
	974	}
	975
	976	// If gini gain or information gain is 0, make all nodes in the BestFirstElements leaf nodes
	977	// because these node sorted descendingly according to gini gain or information gain.
	978	// (namely, gini gain or information gain of all nodes in BestFirstEelements is 0).
	979	if (gain==0) {
	980	for (int i=0; i<BestFirstElements.size(); i++) {
	981	FastVector element = (FastVector)BestFirstElements.elementAt(i);
	982	BFTree node = (BFTree)element.elementAt(0);
	983	node.makeLeaf(train);
	984	}
	985	BestFirstElements.removeAllElements();
	986	}
	987
	988	// gini gain or information gain is not 0
	989	else {
	990	// remove the first element
	991	BestFirstElements.removeElementAt(0);
	992	m_Attribute = att;
	993	if (att.isNumeric()) m_SplitValue = splitValue;
	994	else m_SplitString = splitStr;
	995
	996	int[][][] subsetIndices = new int[2][train.numAttributes()][0];
	997	double[][][] subsetWeights = new double[2][train.numAttributes()][0];
	998
	999	splitData(subsetIndices, subsetWeights, m_Attribute,
	1000	m_SplitValue, m_SplitString,
	1001	sortedIndices, weights, train);
	1002
	1003	// if split will generate node(s) which has total weights less than m_minNumObj,
	1004	// do not split
	1005	int attIndex = att.index();
	1006	if (subsetIndices[0][attIndex].length<minNumObj \|\|
	1007	subsetIndices[1][attIndex].length<minNumObj) {
	1008	makeLeaf(train);
	1009	}
	1010
	1011	// split the node and cauculate error rate of this temporary tree
	1012	else {
	1013	m_isLeaf = false;
	1014	m_Attribute = att;
	1015
	1016	makeSuccessors(BestFirstElements,train,subsetIndices,
	1017	subsetWeights,dists, m_Attribute, useHeuristic, useGini);
	1018	for (int i=0; i<2; i++){
	1019	m_Successors[i].makeLeaf(train);
	1020	}
	1021
	1022	Evaluation eval = new Evaluation(test);
	1023	eval.evaluateModel(root, test);
	1024	double error;
	1025	if (useErrorRate) error = eval.errorRate();
	1026	else error = eval.rootMeanSquaredError();
	1027	modelError.addElement(new Double(error));
	1028	}
	1029
	1030	if (BestFirstElements.size()!=0) {
	1031	FastVector nextSplitElement = (FastVector)BestFirstElements.elementAt(0);
	1032	BFTree nextSplitNode = (BFTree)nextSplitElement.elementAt(0);
	1033	nextSplitNode.makeTree(BestFirstElements, root, train, test, modelError,
	1034	nextSplitNode.m_SortedIndices, nextSplitNode.m_Weights,
	1035	nextSplitNode.m_Dists, nextSplitNode.m_ClassProbs,
	1036	nextSplitNode.m_TotalWeight, nextSplitNode.m_Props, minNumObj,
	1037	useHeuristic, useGini,useErrorRate);
	1038	}
	1039	}
	1040	}
	1041
	1042
	1043	/**
	1044	* Generate successor nodes for a node and put them into BestFirstElements
	1045	* according to gini gain or information gain in a descending order.
	1046	*
	1047	* @param BestFirstElements list to store BestFirst nodes
	1048	* @param data training instance
	1049	* @param subsetSortedIndices sorted indices of instances of successor nodes
	1050	* @param subsetWeights weights of instances of successor nodes
	1051	* @param dists class distributions of successor nodes
	1052	* @param att attribute used to split the node
	1053	* @param useHeuristic if use heuristic search for nominal attributes in multi-class problem
	1054	* @param useGini if use Gini index as splitting criterion
	1055	* @throws Exception if something goes wrong
	1056	*/
	1057	protected void makeSuccessors(FastVector BestFirstElements,Instances data,
	1058	int[][][] subsetSortedIndices, double[][][] subsetWeights,
	1059	double[][][] dists,
	1060	Attribute att, boolean useHeuristic, boolean useGini) throws Exception {
	1061
	1062	m_Successors = new BFTree[2];
	1063
	1064	for (int i=0; i<2; i++) {
	1065	m_Successors[i] = new BFTree();
	1066	m_Successors[i].m_isLeaf = true;
	1067
	1068	// class probability and distribution for this successor node
	1069	m_Successors[i].m_ClassProbs = new double[data.numClasses()];
	1070	m_Successors[i].m_Distribution = new double[data.numClasses()];
	1071	System.arraycopy(dists[att.index()][i], 0, m_Successors[i].m_ClassProbs,
	1072	0,m_Successors[i].m_ClassProbs.length);
	1073	System.arraycopy(dists[att.index()][i], 0, m_Successors[i].m_Distribution,
	1074	0,m_Successors[i].m_Distribution.length);
	1075	if (Utils.sum(m_Successors[i].m_ClassProbs)!=0)
	1076	Utils.normalize(m_Successors[i].m_ClassProbs);
	1077
	1078	// split information for this successor node
	1079	double[][] props = new double[data.numAttributes()][2];
	1080	double[][][] subDists = new double[data.numAttributes()][2][data.numClasses()];
	1081	double[][] totalSubsetWeights = new double[data.numAttributes()][2];
	1082	FastVector splitInfo = m_Successors[i].computeSplitInfo(m_Successors[i], data,
	1083	subsetSortedIndices[i], subsetWeights[i], subDists, props,
	1084	totalSubsetWeights, useHeuristic, useGini);
	1085
	1086	// branch proportion for this successor node
	1087	int splitIndex = ((Attribute)splitInfo.elementAt(1)).index();
	1088	m_Successors[i].m_Props = new double[2];
	1089	System.arraycopy(props[splitIndex], 0, m_Successors[i].m_Props, 0,
	1090	m_Successors[i].m_Props.length);
	1091
	1092	// sorted indices and weights of each attribute for this successor node
	1093	m_Successors[i].m_SortedIndices = new int[data.numAttributes()][0];
	1094	m_Successors[i].m_Weights = new double[data.numAttributes()][0];
	1095	for (int j=0; j<m_Successors[i].m_SortedIndices.length; j++) {
	1096	m_Successors[i].m_SortedIndices[j] = subsetSortedIndices[i][j];
	1097	m_Successors[i].m_Weights[j] = subsetWeights[i][j];
	1098	}
	1099
	1100	// distribution of each attribute for this successor node
	1101	m_Successors[i].m_Dists = new double[data.numAttributes()][2][data.numClasses()];
	1102	for (int j=0; j<subDists.length; j++) {
	1103	m_Successors[i].m_Dists[j] = subDists[j];
	1104	}
	1105
	1106	// total weights for this successor node.
	1107	m_Successors[i].m_TotalWeight = Utils.sum(totalSubsetWeights[splitIndex]);
	1108
	1109	// insert this successor node into BestFirstElements according to gini gain or information gain
	1110	// descendingly
	1111	if (BestFirstElements.size()==0) {
	1112	BestFirstElements.addElement(splitInfo);
	1113	} else {
	1114	double gGain = ((Double)(splitInfo.elementAt(3))).doubleValue();
	1115	int vectorSize = BestFirstElements.size();
	1116	FastVector lastNode = (FastVector)BestFirstElements.elementAt(vectorSize-1);
	1117
	1118	// If gini gain is less than that of last node in FastVector
	1119	if (gGain<((Double)(lastNode.elementAt(3))).doubleValue()) {
	1120	BestFirstElements.insertElementAt(splitInfo, vectorSize);
	1121	} else {
	1122	for (int j=0; j<vectorSize; j++) {
	1123	FastVector node = (FastVector)BestFirstElements.elementAt(j);
	1124	double nodeGain = ((Double)(node.elementAt(3))).doubleValue();
	1125	if (gGain>=nodeGain) {
	1126	BestFirstElements.insertElementAt(splitInfo, j);
	1127	break;
	1128	}
	1129	}
	1130	}
	1131	}
	1132	}
	1133	}
	1134
	1135	/**
	1136	* Compute sorted indices, weights and class probabilities for a given
	1137	* dataset. Return total weights of the data at the node.
	1138	*
	1139	* @param data training data
	1140	* @param sortedIndices sorted indices of instances at the node
	1141	* @param weights weights of instances at the node
	1142	* @param classProbs class probabilities at the node
	1143	* @return total weights of instances at the node
	1144	* @throws Exception if something goes wrong
	1145	*/
	1146	protected double computeSortedInfo(Instances data, int[][] sortedIndices, double[][] weights,
	1147	double[] classProbs) throws Exception {
	1148
	1149	// Create array of sorted indices and weights
	1150	double[] vals = new double[data.numInstances()];
	1151	for (int j = 0; j < data.numAttributes(); j++) {
	1152	if (j==data.classIndex()) continue;
	1153	weights[j] = new double[data.numInstances()];
	1154
	1155	if (data.attribute(j).isNominal()) {
	1156
	1157	// Handling nominal attributes. Putting indices of
	1158	// instances with missing values at the end.
	1159	sortedIndices[j] = new int[data.numInstances()];
	1160	int count = 0;
	1161	for (int i = 0; i < data.numInstances(); i++) {
	1162	Instance inst = data.instance(i);
	1163	if (!inst.isMissing(j)) {
	1164	sortedIndices[j][count] = i;
	1165	weights[j][count] = inst.weight();
	1166	count++;
	1167	}
	1168	}
	1169	for (int i = 0; i < data.numInstances(); i++) {
	1170	Instance inst = data.instance(i);
	1171	if (inst.isMissing(j)) {
	1172	sortedIndices[j][count] = i;
	1173	weights[j][count] = inst.weight();
	1174	count++;
	1175	}
	1176	}
	1177	} else {
	1178
	1179	// Sorted indices are computed for numeric attributes
	1180	// missing values instances are put to end (through Utils.sort() method)
	1181	for (int i = 0; i < data.numInstances(); i++) {
	1182	Instance inst = data.instance(i);
	1183	vals[i] = inst.value(j);
	1184	}
	1185	sortedIndices[j] = Utils.sort(vals);
	1186	for (int i = 0; i < data.numInstances(); i++) {
	1187	weights[j][i] = data.instance(sortedIndices[j][i]).weight();
	1188	}
	1189	}
	1190	}
	1191
	1192	// Compute initial class counts and total weight
	1193	double totalWeight = 0;
	1194	for (int i = 0; i < data.numInstances(); i++) {
	1195	Instance inst = data.instance(i);
	1196	classProbs[(int)inst.classValue()] += inst.weight();
	1197	totalWeight += inst.weight();
	1198	}
	1199
	1200	return totalWeight;
	1201	}
	1202
	1203	/**
	1204	* Compute the best splitting attribute, split point or subset and the best
	1205	* gini gain or iformation gain for a given dataset.
	1206	*
	1207	* @param node node to be split
	1208	* @param data training data
	1209	* @param sortedIndices sorted indices of the instances
	1210	* @param weights weights of the instances
	1211	* @param dists class distributions for each attribute
	1212	* @param props proportions of two branches
	1213	* @param totalSubsetWeights total weight of two subsets
	1214	* @param useHeuristic if use heuristic search for nominal attributes
	1215	* in multi-class problem
	1216	* @param useGini if use Gini index as splitting criterion
	1217	* @return split information about the node
	1218	* @throws Exception if something is wrong
	1219	*/
	1220	protected FastVector computeSplitInfo(BFTree node, Instances data, int[][] sortedIndices,
	1221	double[][] weights, double[][][] dists, double[][] props,
	1222	double[][] totalSubsetWeights, boolean useHeuristic, boolean useGini) throws Exception {
	1223
	1224	double[] splits = new double[data.numAttributes()];
	1225	String[] splitString = new String[data.numAttributes()];
	1226	double[] gains = new double[data.numAttributes()];
	1227
	1228	for (int i = 0; i < data.numAttributes(); i++) {
	1229	if (i==data.classIndex()) continue;
	1230	Attribute att = data.attribute(i);
	1231	if (att.isNumeric()) {
	1232	// numeric attribute
	1233	splits[i] = numericDistribution(props, dists, att, sortedIndices[i],
	1234	weights[i], totalSubsetWeights, gains, data, useGini);
	1235	} else {
	1236	// nominal attribute
	1237	splitString[i] = nominalDistribution(props, dists, att, sortedIndices[i],
	1238	weights[i], totalSubsetWeights, gains, data, useHeuristic, useGini);
	1239	}
	1240	}
	1241
	1242	int index = Utils.maxIndex(gains);
	1243	double mBestGain = gains[index];
	1244
	1245	Attribute att = data.attribute(index);
	1246	double mValue =Double.NaN;
	1247	String mString = null;
	1248	if (att.isNumeric()) mValue= splits[index];
	1249	else {
	1250	mString = splitString[index];
	1251	if (mString==null) mString = "";
	1252	}
	1253
	1254	// split information
	1255	FastVector splitInfo = new FastVector();
	1256	splitInfo.addElement(node);
	1257	splitInfo.addElement(att);
	1258	if (att.isNumeric()) splitInfo.addElement(new Double(mValue));
	1259	else splitInfo.addElement(mString);
	1260	splitInfo.addElement(new Double(mBestGain));
	1261
	1262	return splitInfo;
	1263	}
	1264
	1265	/**
	1266	* Compute distributions, proportions and total weights of two successor nodes for
	1267	* a given numeric attribute.
	1268	*
	1269	* @param props proportions of each two branches for each attribute
	1270	* @param dists class distributions of two branches for each attribute
	1271	* @param att numeric att split on
	1272	* @param sortedIndices sorted indices of instances for the attirubte
	1273	* @param weights weights of instances for the attirbute
	1274	* @param subsetWeights total weight of two branches split based on the attribute
	1275	* @param gains Gini gains or information gains for each attribute
	1276	* @param data training instances
	1277	* @param useGini if use Gini index as splitting criterion
	1278	* @return Gini gain or information gain for the given attribute
	1279	* @throws Exception if something goes wrong
	1280	*/
	1281	protected double numericDistribution(double[][] props, double[][][] dists,
	1282	Attribute att, int[] sortedIndices, double[] weights, double[][] subsetWeights,
	1283	double[] gains, Instances data, boolean useGini)
	1284	throws Exception {
	1285
	1286	double splitPoint = Double.NaN;
	1287	double[][] dist = null;
	1288	int numClasses = data.numClasses();
	1289	int i; // differ instances with or without missing values
	1290
	1291	double[][] currDist = new double[2][numClasses];
	1292	dist = new double[2][numClasses];
	1293
	1294	// Move all instances without missing values into second subset
	1295	double[] parentDist = new double[numClasses];
	1296	int missingStart = 0;
	1297	for (int j = 0; j < sortedIndices.length; j++) {
	1298	Instance inst = data.instance(sortedIndices[j]);
	1299	if (!inst.isMissing(att)) {
	1300	missingStart ++;
	1301	currDist[1][(int)inst.classValue()] += weights[j];
	1302	}
	1303	parentDist[(int)inst.classValue()] += weights[j];
	1304	}
	1305	System.arraycopy(currDist[1], 0, dist[1], 0, dist[1].length);
	1306
	1307	// Try all possible split points
	1308	double currSplit = data.instance(sortedIndices[0]).value(att);
	1309	double currGain;
	1310	double bestGain = -Double.MAX_VALUE;
	1311
	1312	for (i = 0; i < sortedIndices.length; i++) {
	1313	Instance inst = data.instance(sortedIndices[i]);
	1314	if (inst.isMissing(att)) {
	1315	break;
	1316	}
	1317	if (inst.value(att) > currSplit) {
	1318
	1319	double[][] tempDist = new double[2][numClasses];
	1320	for (int k=0; k<2; k++) {
	1321	//tempDist[k] = currDist[k];
	1322	System.arraycopy(currDist[k], 0, tempDist[k], 0, tempDist[k].length);
	1323	}
	1324
	1325	double[] tempProps = new double[2];
	1326	for (int k=0; k<2; k++) {
	1327	tempProps[k] = Utils.sum(tempDist[k]);
	1328	}
	1329
	1330	if (Utils.sum(tempProps) !=0) Utils.normalize(tempProps);
	1331
	1332	// split missing values
	1333	int index = missingStart;
	1334	while (index < sortedIndices.length) {
	1335	Instance insta = data.instance(sortedIndices[index]);
	1336	for (int j = 0; j < 2; j++) {
	1337	tempDist[j][(int)insta.classValue()] += tempProps[j] * weights[index];
	1338	}
	1339	index++;
	1340	}
	1341
	1342	if (useGini) currGain = computeGiniGain(parentDist,tempDist);
	1343	else currGain = computeInfoGain(parentDist,tempDist);
	1344
	1345	if (currGain > bestGain) {
	1346	bestGain = currGain;
	1347	// clean split point
	1348	splitPoint = Math.rint((inst.value(att) + currSplit)/2.0*100000)/100000.0;
	1349	for (int j = 0; j < currDist.length; j++) {
	1350	System.arraycopy(tempDist[j], 0, dist[j], 0,
	1351	dist[j].length);
	1352	}
	1353	}
	1354	}
	1355	currSplit = inst.value(att);
	1356	currDist[0][(int)inst.classValue()] += weights[i];
	1357	currDist[1][(int)inst.classValue()] -= weights[i];
	1358	}
	1359
	1360	// Compute weights
	1361	int attIndex = att.index();
	1362	props[attIndex] = new double[2];
	1363	for (int k = 0; k < 2; k++) {
	1364	props[attIndex][k] = Utils.sum(dist[k]);
	1365	}
	1366	if (Utils.sum(props[attIndex]) != 0) Utils.normalize(props[attIndex]);
	1367
	1368	// Compute subset weights
	1369	subsetWeights[attIndex] = new double[2];
	1370	for (int j = 0; j < 2; j++) {
	1371	subsetWeights[attIndex][j] += Utils.sum(dist[j]);
	1372	}
	1373
	1374	// clean gain
	1375	gains[attIndex] = Math.rint(bestGain*10000000)/10000000.0;
	1376	dists[attIndex] = dist;
	1377	return splitPoint;
	1378	}
	1379
	1380	/**
	1381	* Compute distributions, proportions and total weights of two successor
	1382	* nodes for a given nominal attribute.
	1383	*
	1384	* @param props proportions of each two branches for each attribute
	1385	* @param dists class distributions of two branches for each attribute
	1386	* @param att numeric att split on
	1387	* @param sortedIndices sorted indices of instances for the attirubte
	1388	* @param weights weights of instances for the attirbute
	1389	* @param subsetWeights total weight of two branches split based on the attribute
	1390	* @param gains Gini gains for each attribute
	1391	* @param data training instances
	1392	* @param useHeuristic if use heuristic search
	1393	* @param useGini if use Gini index as splitting criterion
	1394	* @return Gini gain for the given attribute
	1395	* @throws Exception if something goes wrong
	1396	*/
	1397	protected String nominalDistribution(double[][] props, double[][][] dists,
	1398	Attribute att, int[] sortedIndices, double[] weights, double[][] subsetWeights,
	1399	double[] gains, Instances data, boolean useHeuristic, boolean useGini)
	1400	throws Exception {
	1401
	1402	String[] values = new String[att.numValues()];
	1403	int numCat = values.length; // number of values of the attribute
	1404	int numClasses = data.numClasses();
	1405
	1406	String bestSplitString = "";
	1407	double bestGain = -Double.MAX_VALUE;
	1408
	1409	// class frequency for each value
	1410	int[] classFreq = new int[numCat];
	1411	for (int j=0; j<numCat; j++) classFreq[j] = 0;
	1412
	1413	double[] parentDist = new double[numClasses];
	1414	double[][] currDist = new double[2][numClasses];
	1415	double[][] dist = new double[2][numClasses];
	1416	int missingStart = 0;
	1417
	1418	for (int i = 0; i < sortedIndices.length; i++) {
	1419	Instance inst = data.instance(sortedIndices[i]);
	1420	if (!inst.isMissing(att)) {
	1421	missingStart++;
	1422	classFreq[(int)inst.value(att)] ++;
	1423	}
	1424	parentDist[(int)inst.classValue()] += weights[i];
	1425	}
	1426
	1427	// count the number of values that class frequency is not 0
	1428	int nonEmpty = 0;
	1429	for (int j=0; j<numCat; j++) {
	1430	if (classFreq[j]!=0) nonEmpty ++;
	1431	}
	1432
	1433	// attribute values which class frequency is not 0
	1434	String[] nonEmptyValues = new String[nonEmpty];
	1435	int nonEmptyIndex = 0;
	1436	for (int j=0; j<numCat; j++) {
	1437	if (classFreq[j]!=0) {
	1438	nonEmptyValues[nonEmptyIndex] = att.value(j);
	1439	nonEmptyIndex ++;
	1440	}
	1441	}
	1442
	1443	// attribute values which class frequency is 0
	1444	int empty = numCat - nonEmpty;
	1445	String[] emptyValues = new String[empty];
	1446	int emptyIndex = 0;
	1447	for (int j=0; j<numCat; j++) {
	1448	if (classFreq[j]==0) {
	1449	emptyValues[emptyIndex] = att.value(j);
	1450	emptyIndex ++;
	1451	}
	1452	}
	1453
	1454	if (nonEmpty<=1) {
	1455	gains[att.index()] = 0;
	1456	return "";
	1457	}
	1458
	1459	// for tow-class probloms
	1460	if (data.numClasses()==2) {
	1461
	1462	//// Firstly, for attribute values which class frequency is not zero
	1463
	1464	// probability of class 0 for each attribute value
	1465	double[] pClass0 = new double[nonEmpty];
	1466	// class distribution for each attribute value
	1467	double[][] valDist = new double[nonEmpty][2];
	1468
	1469	for (int j=0; j<nonEmpty; j++) {
	1470	for (int k=0; k<2; k++) {
	1471	valDist[j][k] = 0;
	1472	}
	1473	}
	1474
	1475	for (int i = 0; i < sortedIndices.length; i++) {
	1476	Instance inst = data.instance(sortedIndices[i]);
	1477	if (inst.isMissing(att)) {
	1478	break;
	1479	}
	1480
	1481	for (int j=0; j<nonEmpty; j++) {
	1482	if (att.value((int)inst.value(att)).compareTo(nonEmptyValues[j])==0) {
	1483	valDist[j][(int)inst.classValue()] += inst.weight();
	1484	break;
	1485	}
	1486	}
	1487	}
	1488
	1489	for (int j=0; j<nonEmpty; j++) {
	1490	double distSum = Utils.sum(valDist[j]);
	1491	if (distSum==0) pClass0[j]=0;
	1492	else pClass0[j] = valDist[j][0]/distSum;
	1493	}
	1494
	1495	// sort category according to the probability of class 0.0
	1496	String[] sortedValues = new String[nonEmpty];
	1497	for (int j=0; j<nonEmpty; j++) {
	1498	sortedValues[j] = nonEmptyValues[Utils.minIndex(pClass0)];
	1499	pClass0[Utils.minIndex(pClass0)] = Double.MAX_VALUE;
	1500	}
	1501
	1502	// Find a subset of attribute values that maximize impurity decrease
	1503
	1504	// for the attribute values that class frequency is not 0
	1505	String tempStr = "";
	1506
	1507	for (int j=0; j<nonEmpty-1; j++) {
	1508	currDist = new double[2][numClasses];
	1509	if (tempStr=="") tempStr="(" + sortedValues[j] + ")";
	1510	else tempStr += "\|"+ "(" + sortedValues[j] + ")";
	1511	//System.out.println(sortedValues[j]);
	1512	for (int i=0; i<sortedIndices.length;i++) {
	1513	Instance inst = data.instance(sortedIndices[i]);
	1514	if (inst.isMissing(att)) {
	1515	break;
	1516	}
	1517
	1518	if (tempStr.indexOf
	1519	("(" + att.value((int)inst.value(att)) + ")")!=-1) {
	1520	currDist[0][(int)inst.classValue()] += weights[i];
	1521	} else currDist[1][(int)inst.classValue()] += weights[i];
	1522	}
	1523
	1524	double[][] tempDist = new double[2][numClasses];
	1525	for (int kk=0; kk<2; kk++) {
	1526	tempDist[kk] = currDist[kk];
	1527	}
	1528
	1529	double[] tempProps = new double[2];
	1530	for (int kk=0; kk<2; kk++) {
	1531	tempProps[kk] = Utils.sum(tempDist[kk]);
	1532	}
	1533
	1534	if (Utils.sum(tempProps)!=0) Utils.normalize(tempProps);
	1535
	1536	// split missing values
	1537	int mstart = missingStart;
	1538	while (mstart < sortedIndices.length) {
	1539	Instance insta = data.instance(sortedIndices[mstart]);
	1540	for (int jj = 0; jj < 2; jj++) {
	1541	tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
	1542	}
	1543	mstart++;
	1544	}
	1545
	1546	double currGain;
	1547	if (useGini) currGain = computeGiniGain(parentDist,tempDist);
	1548	else currGain = computeInfoGain(parentDist,tempDist);
	1549
	1550	if (currGain>bestGain) {
	1551	bestGain = currGain;
	1552	bestSplitString = tempStr;
	1553	for (int jj = 0; jj < 2; jj++) {
	1554	System.arraycopy(tempDist[jj], 0, dist[jj], 0,
	1555	dist[jj].length);
	1556	}
	1557	}
	1558	}
	1559	}
	1560
	1561	// multi-class problems (exhaustive search)
	1562	else if (!useHeuristic \|\| nonEmpty<=4) {
	1563	//else if (!useHeuristic \|\| nonEmpty==2) {
	1564
	1565	// Firstly, for attribute values which class frequency is not zero
	1566	for (int i=0; i<(int)Math.pow(2,nonEmpty-1); i++) {
	1567	String tempStr="";
	1568	currDist = new double[2][numClasses];
	1569	int mod;
	1570	int bit10 = i;
	1571	for (int j=nonEmpty-1; j>=0; j--) {
	1572	mod = bit10%2; // convert from 10bit to 2bit
	1573	if (mod==1) {
	1574	if (tempStr=="") tempStr = "("+nonEmptyValues[j]+")";
	1575	else tempStr += "\|" + "("+nonEmptyValues[j]+")";
	1576	}
	1577	bit10 = bit10/2;
	1578	}
	1579	for (int j=0; j<sortedIndices.length;j++) {
	1580	Instance inst = data.instance(sortedIndices[j]);
	1581	if (inst.isMissing(att)) {
	1582	break;
	1583	}
	1584
	1585	if (tempStr.indexOf("("+att.value((int)inst.value(att))+")")!=-1) {
	1586	currDist[0][(int)inst.classValue()] += weights[j];
	1587	} else currDist[1][(int)inst.classValue()] += weights[j];
	1588	}
	1589
	1590	double[][] tempDist = new double[2][numClasses];
	1591	for (int k=0; k<2; k++) {
	1592	tempDist[k] = currDist[k];
	1593	}
	1594
	1595	double[] tempProps = new double[2];
	1596	for (int k=0; k<2; k++) {
	1597	tempProps[k] = Utils.sum(tempDist[k]);
	1598	}
	1599
	1600	if (Utils.sum(tempProps)!=0) Utils.normalize(tempProps);
	1601
	1602	// split missing values
	1603	int index = missingStart;
	1604	while (index < sortedIndices.length) {
	1605	Instance insta = data.instance(sortedIndices[index]);
	1606	for (int j = 0; j < 2; j++) {
	1607	tempDist[j][(int)insta.classValue()] += tempProps[j] * weights[index];
	1608	}
	1609	index++;
	1610	}
	1611
	1612	double currGain;
	1613	if (useGini) currGain = computeGiniGain(parentDist,tempDist);
	1614	else currGain = computeInfoGain(parentDist,tempDist);
	1615
	1616	if (currGain>bestGain) {
	1617	bestGain = currGain;
	1618	bestSplitString = tempStr;
	1619	for (int j = 0; j < 2; j++) {
	1620	//dist[jj] = new double[currDist[jj].length];
	1621	System.arraycopy(tempDist[j], 0, dist[j], 0,
	1622	dist[j].length);
	1623	}
	1624	}
	1625	}
	1626	}
	1627
	1628	// huristic method to solve multi-classes problems
	1629	else {
	1630	// Firstly, for attribute values which class frequency is not zero
	1631	int n = nonEmpty;
	1632	int k = data.numClasses(); // number of classes of the data
	1633	double[][] P = new double[n][k]; // class probability matrix
	1634	int[] numInstancesValue = new int[n]; // number of instances for an attribute value
	1635	double[] meanClass = new double[k]; // vector of mean class probability
	1636	int numInstances = data.numInstances(); // total number of instances
	1637
	1638	// initialize the vector of mean class probability
	1639	for (int j=0; j<meanClass.length; j++) meanClass[j]=0;
	1640
	1641	for (int j=0; j<numInstances; j++) {
	1642	Instance inst = (Instance)data.instance(j);
	1643	int valueIndex = 0; // attribute value index in nonEmptyValues
	1644	for (int i=0; i<nonEmpty; i++) {
	1645	if (att.value((int)inst.value(att)).compareToIgnoreCase(nonEmptyValues[i])==0){
	1646	valueIndex = i;
	1647	break;
	1648	}
	1649	}
	1650	P[valueIndex][(int)inst.classValue()]++;
	1651	numInstancesValue[valueIndex]++;
	1652	meanClass[(int)inst.classValue()]++;
	1653	}
	1654
	1655	// calculate the class probability matrix
	1656	for (int i=0; i<P.length; i++) {
	1657	for (int j=0; j<P[0].length; j++) {
	1658	if (numInstancesValue[i]==0) P[i][j]=0;
	1659	else P[i][j]/=numInstancesValue[i];
	1660	}
	1661	}
	1662
	1663	//calculate the vector of mean class probability
	1664	for (int i=0; i<meanClass.length; i++) {
	1665	meanClass[i]/=numInstances;
	1666	}
	1667
	1668	// calculate the covariance matrix
	1669	double[][] covariance = new double[k][k];
	1670	for (int i1=0; i1<k; i1++) {
	1671	for (int i2=0; i2<k; i2++) {
	1672	double element = 0;
	1673	for (int j=0; j<n; j++) {
	1674	element += (P[j][i2]-meanClass[i2])*(P[j][i1]-meanClass[i1])
	1675	*numInstancesValue[j];
	1676	}
	1677	covariance[i1][i2] = element;
	1678	}
	1679	}
	1680
	1681	Matrix matrix = new Matrix(covariance);
	1682	weka.core.matrix.EigenvalueDecomposition eigen =
	1683	new weka.core.matrix.EigenvalueDecomposition(matrix);
	1684	double[] eigenValues = eigen.getRealEigenvalues();
	1685
	1686	// find index of the largest eigenvalue
	1687	int index=0;
	1688	double largest = eigenValues[0];
	1689	for (int i=1; i<eigenValues.length; i++) {
	1690	if (eigenValues[i]>largest) {
	1691	index=i;
	1692	largest = eigenValues[i];
	1693	}
	1694	}
	1695
	1696	// calculate the first principle component
	1697	double[] FPC = new double[k];
	1698	Matrix eigenVector = eigen.getV();
	1699	double[][] vectorArray = eigenVector.getArray();
	1700	for (int i=0; i<FPC.length; i++) {
	1701	FPC[i] = vectorArray[i][index];
	1702	}
	1703
	1704	// calculate the first principle component scores
	1705	double[] Sa = new double[n];
	1706	for (int i=0; i<Sa.length; i++) {
	1707	Sa[i]=0;
	1708	for (int j=0; j<k; j++) {
	1709	Sa[i] += FPC[j]*P[i][j];
	1710	}
	1711	}
	1712
	1713	// sort category according to Sa(s)
	1714	double[] pCopy = new double[n];
	1715	System.arraycopy(Sa,0,pCopy,0,n);
	1716	String[] sortedValues = new String[n];
	1717	Arrays.sort(Sa);
	1718
	1719	for (int j=0; j<n; j++) {
	1720	sortedValues[j] = nonEmptyValues[Utils.minIndex(pCopy)];
	1721	pCopy[Utils.minIndex(pCopy)] = Double.MAX_VALUE;
	1722	}
	1723
	1724	// for the attribute values that class frequency is not 0
	1725	String tempStr = "";
	1726
	1727	for (int j=0; j<nonEmpty-1; j++) {
	1728	currDist = new double[2][numClasses];
	1729	if (tempStr=="") tempStr="(" + sortedValues[j] + ")";
	1730	else tempStr += "\|"+ "(" + sortedValues[j] + ")";
	1731	for (int i=0; i<sortedIndices.length;i++) {
	1732	Instance inst = data.instance(sortedIndices[i]);
	1733	if (inst.isMissing(att)) {
	1734	break;
	1735	}
	1736
	1737	if (tempStr.indexOf
	1738	("(" + att.value((int)inst.value(att)) + ")")!=-1) {
	1739	currDist[0][(int)inst.classValue()] += weights[i];
	1740	} else currDist[1][(int)inst.classValue()] += weights[i];
	1741	}
	1742
	1743	double[][] tempDist = new double[2][numClasses];
	1744	for (int kk=0; kk<2; kk++) {
	1745	tempDist[kk] = currDist[kk];
	1746	}
	1747
	1748	double[] tempProps = new double[2];
	1749	for (int kk=0; kk<2; kk++) {
	1750	tempProps[kk] = Utils.sum(tempDist[kk]);
	1751	}
	1752
	1753	if (Utils.sum(tempProps)!=0) Utils.normalize(tempProps);
	1754
	1755	// split missing values
	1756	int mstart = missingStart;
	1757	while (mstart < sortedIndices.length) {
	1758	Instance insta = data.instance(sortedIndices[mstart]);
	1759	for (int jj = 0; jj < 2; jj++) {
	1760	tempDist[jj][(int)insta.classValue()] += tempProps[jj] * weights[mstart];
	1761	}
	1762	mstart++;
	1763	}
	1764
	1765	double currGain;
	1766	if (useGini) currGain = computeGiniGain(parentDist,tempDist);
	1767	else currGain = computeInfoGain(parentDist,tempDist);
	1768
	1769	if (currGain>bestGain) {
	1770	bestGain = currGain;
	1771	bestSplitString = tempStr;
	1772	for (int jj = 0; jj < 2; jj++) {
	1773	//dist[jj] = new double[currDist[jj].length];
	1774	System.arraycopy(tempDist[jj], 0, dist[jj], 0,
	1775	dist[jj].length);
	1776	}
	1777	}
	1778	}
	1779	}
	1780
	1781	// Compute weights
	1782	int attIndex = att.index();
	1783	props[attIndex] = new double[2];
	1784	for (int k = 0; k < 2; k++) {
	1785	props[attIndex][k] = Utils.sum(dist[k]);
	1786	}
	1787	if (!(Utils.sum(props[attIndex]) > 0)) {
	1788	for (int k = 0; k < props[attIndex].length; k++) {
	1789	props[attIndex][k] = 1.0 / (double)props[attIndex].length;
	1790	}
	1791	} else {
	1792	Utils.normalize(props[attIndex]);
	1793	}
	1794
	1795	// Compute subset weights
	1796	subsetWeights[attIndex] = new double[2];
	1797	for (int j = 0; j < 2; j++) {
	1798	subsetWeights[attIndex][j] += Utils.sum(dist[j]);
	1799	}
	1800
	1801	// Then, for the attribute values that class frequency is 0, split it into the
	1802	// most frequent branch
	1803	for (int j=0; j<empty; j++) {
	1804	if (props[attIndex][0]>=props[attIndex][1]) {
	1805	if (bestSplitString=="") bestSplitString = "(" + emptyValues[j] + ")";
	1806	else bestSplitString += "\|" + "(" + emptyValues[j] + ")";
	1807	}
	1808	}
	1809
	1810	// clean gain
	1811	gains[attIndex] = Math.rint(bestGain*10000000)/10000000.0;
	1812
	1813	dists[attIndex] = dist;
	1814	return bestSplitString;
	1815	}
	1816
	1817
	1818	/**
	1819	* Split data into two subsets and store sorted indices and weights for two
	1820	* successor nodes.
	1821	*
	1822	* @param subsetIndices sorted indecis of instances for each attribute for two successor node
	1823	* @param subsetWeights weights of instances for each attribute for two successor node
	1824	* @param att attribute the split based on
	1825	* @param splitPoint split point the split based on if att is numeric
	1826	* @param splitStr split subset the split based on if att is nominal
	1827	* @param sortedIndices sorted indices of the instances to be split
	1828	* @param weights weights of the instances to bes split
	1829	* @param data training data
	1830	* @throws Exception if something goes wrong
	1831	*/
	1832	protected void splitData(int[][][] subsetIndices, double[][][] subsetWeights,
	1833	Attribute att, double splitPoint, String splitStr, int[][] sortedIndices,
	1834	double[][] weights, Instances data) throws Exception {
	1835
	1836	int j;
	1837	// For each attribute
	1838	for (int i = 0; i < data.numAttributes(); i++) {
	1839	if (i==data.classIndex()) continue;
	1840	int[] num = new int[2];
	1841	for (int k = 0; k < 2; k++) {
	1842	subsetIndices[k][i] = new int[sortedIndices[i].length];
	1843	subsetWeights[k][i] = new double[weights[i].length];
	1844	}
	1845
	1846	for (j = 0; j < sortedIndices[i].length; j++) {
	1847	Instance inst = data.instance(sortedIndices[i][j]);
	1848	if (inst.isMissing(att)) {
	1849	// Split instance up
	1850	for (int k = 0; k < 2; k++) {
	1851	if (m_Props[k] > 0) {
	1852	subsetIndices[k][i][num[k]] = sortedIndices[i][j];
	1853	subsetWeights[k][i][num[k]] = m_Props[k] * weights[i][j];
	1854	num[k]++;
	1855	}
	1856	}
	1857	} else {
	1858	int subset;
	1859	if (att.isNumeric()) {
	1860	subset = (inst.value(att) < splitPoint) ? 0 : 1;
	1861	} else { // nominal attribute
	1862	if (splitStr.indexOf
	1863	("(" + att.value((int)inst.value(att.index()))+")")!=-1) {
	1864	subset = 0;
	1865	} else subset = 1;
	1866	}
	1867	subsetIndices[subset][i][num[subset]] = sortedIndices[i][j];
	1868	subsetWeights[subset][i][num[subset]] = weights[i][j];
	1869	num[subset]++;
	1870	}
	1871	}
	1872
	1873	// Trim arrays
	1874	for (int k = 0; k < 2; k++) {
	1875	int[] copy = new int[num[k]];
	1876	System.arraycopy(subsetIndices[k][i], 0, copy, 0, num[k]);
	1877	subsetIndices[k][i] = copy;
	1878	double[] copyWeights = new double[num[k]];
	1879	System.arraycopy(subsetWeights[k][i], 0 ,copyWeights, 0, num[k]);
	1880	subsetWeights[k][i] = copyWeights;
	1881	}
	1882	}
	1883	}
	1884
	1885
	1886	/**
	1887	* Compute and return gini gain for given distributions of a node and its
	1888	* successor nodes.
	1889	*
	1890	* @param parentDist class distributions of parent node
	1891	* @param childDist class distributions of successor nodes
	1892	* @return Gini gain computed
	1893	*/
	1894	protected double computeGiniGain(double[] parentDist, double[][] childDist) {
	1895	double totalWeight = Utils.sum(parentDist);
	1896	if (totalWeight==0) return 0;
	1897
	1898	double leftWeight = Utils.sum(childDist[0]);
	1899	double rightWeight = Utils.sum(childDist[1]);
	1900
	1901	double parentGini = computeGini(parentDist, totalWeight);
	1902	double leftGini = computeGini(childDist[0],leftWeight);
	1903	double rightGini = computeGini(childDist[1], rightWeight);
	1904
	1905	return parentGini - leftWeight/totalWeight*leftGini -
	1906	rightWeight/totalWeight*rightGini;
	1907	}
	1908
	1909	/**
	1910	* Compute and return gini index for a given distribution of a node.
	1911	*
	1912	* @param dist class distributions
	1913	* @param total class distributions
	1914	* @return Gini index of the class distributions
	1915	*/
	1916	protected double computeGini(double[] dist, double total) {
	1917	if (total==0) return 0;
	1918	double val = 0;
	1919	for (int i=0; i<dist.length; i++) {
	1920	val += (dist[i]/total)*(dist[i]/total);
	1921	}
	1922	return 1- val;
	1923	}
	1924
	1925	/**
	1926	* Compute and return information gain for given distributions of a node
	1927	* and its successor nodes.
	1928	*
	1929	* @param parentDist class distributions of parent node
	1930	* @param childDist class distributions of successor nodes
	1931	* @return information gain computed
	1932	*/
	1933	protected double computeInfoGain(double[] parentDist, double[][] childDist) {
	1934	double totalWeight = Utils.sum(parentDist);
	1935	if (totalWeight==0) return 0;
	1936
	1937	double leftWeight = Utils.sum(childDist[0]);
	1938	double rightWeight = Utils.sum(childDist[1]);
	1939
	1940	double parentInfo = computeEntropy(parentDist, totalWeight);
	1941	double leftInfo = computeEntropy(childDist[0],leftWeight);
	1942	double rightInfo = computeEntropy(childDist[1], rightWeight);
	1943
	1944	return parentInfo - leftWeight/totalWeight*leftInfo -
	1945	rightWeight/totalWeight*rightInfo;
	1946	}
	1947
	1948	/**
	1949	* Compute and return entropy for a given distribution of a node.
	1950	*
	1951	* @param dist class distributions
	1952	* @param total class distributions
	1953	* @return entropy of the class distributions
	1954	*/
	1955	protected double computeEntropy(double[] dist, double total) {
	1956	if (total==0) return 0;
	1957	double entropy = 0;
	1958	for (int i=0; i<dist.length; i++) {
	1959	if (dist[i]!=0) entropy -= dist[i]/total * Utils.log2(dist[i]/total);
	1960	}
	1961	return entropy;
	1962	}
	1963
	1964	/**
	1965	* Make the node leaf node.
	1966	*
	1967	* @param data training data
	1968	*/
	1969	protected void makeLeaf(Instances data) {
	1970	m_Attribute = null;
	1971	m_isLeaf = true;
	1972	m_ClassValue=Utils.maxIndex(m_ClassProbs);
	1973	m_ClassAttribute = data.classAttribute();
	1974	}
	1975
	1976	/**
	1977	* Computes class probabilities for instance using the decision tree.
	1978	*
	1979	* @param instance the instance for which class probabilities is to be computed
	1980	* @return the class probabilities for the given instance
	1981	* @throws Exception if something goes wrong
	1982	*/
	1983	public double[] distributionForInstance(Instance instance)
	1984	throws Exception {
	1985	if (!m_isLeaf) {
	1986	// value of split attribute is missing
	1987	if (instance.isMissing(m_Attribute)) {
	1988	double[] returnedDist = new double[m_ClassProbs.length];
	1989
	1990	for (int i = 0; i < m_Successors.length; i++) {
	1991	double[] help =
	1992	m_Successors[i].distributionForInstance(instance);
	1993	if (help != null) {
	1994	for (int j = 0; j < help.length; j++) {
	1995	returnedDist[j] += m_Props[i] * help[j];
	1996	}
	1997	}
	1998	}
	1999	return returnedDist;
	2000	}
	2001
	2002	// split attribute is nonimal
	2003	else if (m_Attribute.isNominal()) {
	2004	if (m_SplitString.indexOf("(" +
	2005	m_Attribute.value((int)instance.value(m_Attribute)) + ")")!=-1)
	2006	return m_Successors[0].distributionForInstance(instance);
	2007	else return m_Successors[1].distributionForInstance(instance);
	2008	}
	2009
	2010	// split attribute is numeric
	2011	else {
	2012	if (instance.value(m_Attribute) < m_SplitValue)
	2013	return m_Successors[0].distributionForInstance(instance);
	2014	else
	2015	return m_Successors[1].distributionForInstance(instance);
	2016	}
	2017	}
	2018
	2019	// leaf node
	2020	else return m_ClassProbs;
	2021	}
	2022
	2023	/**
	2024	* Prints the decision tree using the protected toString method from below.
	2025	*
	2026	* @return a textual description of the classifier
	2027	*/
	2028	public String toString() {
	2029	if ((m_Distribution == null) && (m_Successors == null)) {
	2030	return "Best-First: No model built yet.";
	2031	}
	2032	return "Best-First Decision Tree\n" + toString(0)+"\n\n"
	2033	+"Size of the Tree: "+numNodes()+"\n\n"
	2034	+"Number of Leaf Nodes: "+numLeaves();
	2035	}
	2036
	2037	/**
	2038	* Outputs a tree at a certain level.
	2039	*
	2040	* @param level the level at which the tree is to be printed
	2041	* @return a tree at a certain level.
	2042	*/
	2043	protected String toString(int level) {
	2044	StringBuffer text = new StringBuffer();
	2045	// if leaf nodes
	2046	if (m_Attribute == null) {
	2047	if (Utils.isMissingValue(m_ClassValue)) {
	2048	text.append(": null");
	2049	} else {
	2050	double correctNum = Math.rint(m_Distribution[Utils.maxIndex(m_Distribution)]*100)/
	2051	100.0;
	2052	double wrongNum = Math.rint((Utils.sum(m_Distribution) -
	2053	m_Distribution[Utils.maxIndex(m_Distribution)])*100)/100.0;
	2054	String str = "(" + correctNum + "/" + wrongNum + ")";
	2055	text.append(": " + m_ClassAttribute.value((int) m_ClassValue)+ str);
	2056	}
	2057	} else {
	2058	for (int j = 0; j < 2; j++) {
	2059	text.append("\n");
	2060	for (int i = 0; i < level; i++) {
	2061	text.append("\| ");
	2062	}
	2063	if (j==0) {
	2064	if (m_Attribute.isNumeric())
	2065	text.append(m_Attribute.name() + " < " + m_SplitValue);
	2066	else
	2067	text.append(m_Attribute.name() + "=" + m_SplitString);
	2068	} else {
	2069	if (m_Attribute.isNumeric())
	2070	text.append(m_Attribute.name() + " >= " + m_SplitValue);
	2071	else
	2072	text.append(m_Attribute.name() + "!=" + m_SplitString);
	2073	}
	2074	text.append(m_Successors[j].toString(level + 1));
	2075	}
	2076	}
	2077	return text.toString();
	2078	}
	2079
	2080	/**
	2081	* Compute size of the tree.
	2082	*
	2083	* @return size of the tree
	2084	*/
	2085	public int numNodes() {
	2086	if (m_isLeaf) {
	2087	return 1;
	2088	} else {
	2089	int size =1;
	2090	for (int i=0;i<m_Successors.length;i++) {
	2091	size+=m_Successors[i].numNodes();
	2092	}
	2093	return size;
	2094	}
	2095	}
	2096
	2097	/**
	2098	* Compute number of leaf nodes.
	2099	*
	2100	* @return number of leaf nodes
	2101	*/
	2102	public int numLeaves() {
	2103	if (m_isLeaf) return 1;
	2104	else {
	2105	int size=0;
	2106	for (int i=0;i<m_Successors.length;i++) {
	2107	size+=m_Successors[i].numLeaves();
	2108	}
	2109	return size;
	2110	}
	2111	}
	2112
	2113	/**
	2114	* Returns an enumeration describing the available options.
	2115	*
	2116	* @return an enumeration describing the available options.
	2117	*/
	2118	public Enumeration listOptions() {
	2119	Vector result;
	2120	Enumeration en;
	2121
	2122	result = new Vector();
	2123
	2124	en = super.listOptions();
	2125	while (en.hasMoreElements())
	2126	result.addElement(en.nextElement());
	2127
	2128	result.addElement(new Option(
	2129	"\tThe pruning strategy.\n"
	2130	+ "\t(default: " + new SelectedTag(PRUNING_POSTPRUNING, TAGS_PRUNING) + ")",
	2131	"P", 1, "-P " + Tag.toOptionList(TAGS_PRUNING)));
	2132
	2133	result.addElement(new Option(
	2134	"\tThe minimal number of instances at the terminal nodes.\n"
	2135	+ "\t(default 2)",
	2136	"M", 1, "-M <min no>"));
	2137
	2138	result.addElement(new Option(
	2139	"\tThe number of folds used in the pruning.\n"
	2140	+ "\t(default 5)",
	2141	"N", 5, "-N <num folds>"));
	2142
	2143	result.addElement(new Option(
	2144	"\tDon't use heuristic search for nominal attributes in multi-class\n"
	2145	+ "\tproblem (default yes).\n",
	2146	"H", 0, "-H"));
	2147
	2148	result.addElement(new Option(
	2149	"\tDon't use Gini index for splitting (default yes),\n"
	2150	+ "\tif not information is used.",
	2151	"G", 0, "-G"));
	2152
	2153	result.addElement(new Option(
	2154	"\tDon't use error rate in internal cross-validation (default yes), \n"
	2155	+ "\tbut root mean squared error.",
	2156	"R", 0, "-R"));
	2157
	2158	result.addElement(new Option(
	2159	"\tUse the 1 SE rule to make pruning decision.\n"
	2160	+ "\t(default no).",
	2161	"A", 0, "-A"));
	2162
	2163	result.addElement(new Option(
	2164	"\tPercentage of training data size (0-1]\n"
	2165	+ "\t(default 1).",
	2166	"C", 0, "-C"));
	2167
	2168	return result.elements();
	2169	}
	2170
	2171	/**
	2172	* Parses the options for this object. <p/>
	2173	*
	2174	<!-- options-start -->
	2175	* Valid options are: <p/>
	2176	*
	2177	* <pre> -S <num>
	2178	* Random number seed.
	2179	* (default 1)</pre>
	2180	*
	2181	* <pre> -D
	2182	* If set, classifier is run in debug mode and
	2183	* may output additional info to the console</pre>
	2184	*
	2185	* <pre> -P <UNPRUNED\|POSTPRUNED\|PREPRUNED>
	2186	* The pruning strategy.
	2187	* (default: POSTPRUNED)</pre>
	2188	*
	2189	* <pre> -M <min no>
	2190	* The minimal number of instances at the terminal nodes.
	2191	* (default 2)</pre>
	2192	*
	2193	* <pre> -N <num folds>
	2194	* The number of folds used in the pruning.
	2195	* (default 5)</pre>
	2196	*
	2197	* <pre> -H
	2198	* Don't use heuristic search for nominal attributes in multi-class
	2199	* problem (default yes).
	2200	* </pre>
	2201	*
	2202	* <pre> -G
	2203	* Don't use Gini index for splitting (default yes),
	2204	* if not information is used.</pre>
	2205	*
	2206	* <pre> -R
	2207	* Don't use error rate in internal cross-validation (default yes),
	2208	* but root mean squared error.</pre>
	2209	*
	2210	* <pre> -A
	2211	* Use the 1 SE rule to make pruning decision.
	2212	* (default no).</pre>
	2213	*
	2214	* <pre> -C
	2215	* Percentage of training data size (0-1]
	2216	* (default 1).</pre>
	2217	*
	2218	<!-- options-end -->
	2219	*
	2220	* @param options the options to use
	2221	* @throws Exception if setting of options fails
	2222	*/
	2223	public void setOptions(String[] options) throws Exception {
	2224	String tmpStr;
	2225
	2226	super.setOptions(options);
	2227
	2228	tmpStr = Utils.getOption('M', options);
	2229	if (tmpStr.length() != 0)
	2230	setMinNumObj(Integer.parseInt(tmpStr));
	2231	else
	2232	setMinNumObj(2);
	2233
	2234	tmpStr = Utils.getOption('N', options);
	2235	if (tmpStr.length() != 0)
	2236	setNumFoldsPruning(Integer.parseInt(tmpStr));
	2237	else
	2238	setNumFoldsPruning(5);
	2239
	2240	tmpStr = Utils.getOption('C', options);
	2241	if (tmpStr.length()!=0)
	2242	setSizePer(Double.parseDouble(tmpStr));
	2243	else
	2244	setSizePer(1);
	2245
	2246	tmpStr = Utils.getOption('P', options);
	2247	if (tmpStr.length() != 0)
	2248	setPruningStrategy(new SelectedTag(tmpStr, TAGS_PRUNING));
	2249	else
	2250	setPruningStrategy(new SelectedTag(PRUNING_POSTPRUNING, TAGS_PRUNING));
	2251
	2252	setHeuristic(!Utils.getFlag('H',options));
	2253
	2254	setUseGini(!Utils.getFlag('G',options));
	2255
	2256	setUseErrorRate(!Utils.getFlag('R',options));
	2257
	2258	setUseOneSE(Utils.getFlag('A',options));
	2259	}
	2260
	2261	/**
	2262	* Gets the current settings of the Classifier.
	2263	*
	2264	* @return the current settings of the Classifier
	2265	*/
	2266	public String[] getOptions() {
	2267	int i;
	2268	Vector result;
	2269	String[] options;
	2270
	2271	result = new Vector();
	2272
	2273	options = super.getOptions();
	2274	for (i = 0; i < options.length; i++)
	2275	result.add(options[i]);
	2276
	2277	result.add("-M");
	2278	result.add("" + getMinNumObj());
	2279
	2280	result.add("-N");
	2281	result.add("" + getNumFoldsPruning());
	2282
	2283	if (!getHeuristic())
	2284	result.add("-H");
	2285
	2286	if (!getUseGini())
	2287	result.add("-G");
	2288
	2289	if (!getUseErrorRate())
	2290	result.add("-R");
	2291
	2292	if (getUseOneSE())
	2293	result.add("-A");
	2294
	2295	result.add("-C");
	2296	result.add("" + getSizePer());
	2297
	2298	result.add("-P");
	2299	result.add("" + getPruningStrategy());
	2300
	2301	return (String[]) result.toArray(new String[result.size()]);
	2302	}
	2303
	2304	/**
	2305	* Return an enumeration of the measure names.
	2306	*
	2307	* @return an enumeration of the measure names
	2308	*/
	2309	public Enumeration enumerateMeasures() {
	2310	Vector result = new Vector();
	2311
	2312	result.addElement("measureTreeSize");
	2313
	2314	return result.elements();
	2315	}
	2316
	2317	/**
	2318	* Return number of tree size.
	2319	*
	2320	* @return number of tree size
	2321	*/
	2322	public double measureTreeSize() {
	2323	return numNodes();
	2324	}
	2325
	2326	/**
	2327	* Returns the value of the named measure
	2328	*
	2329	* @param additionalMeasureName the name of the measure to query for its value
	2330	* @return the value of the named measure
	2331	* @throws IllegalArgumentException if the named measure is not supported
	2332	*/
	2333	public double getMeasure(String additionalMeasureName) {
	2334	if (additionalMeasureName.compareToIgnoreCase("measureTreeSize") == 0) {
	2335	return measureTreeSize();
	2336	} else {
	2337	throw new IllegalArgumentException(additionalMeasureName
	2338	+ " not supported (Best-First)");
	2339	}
	2340	}
	2341
	2342	/**
	2343	* Returns the tip text for this property
	2344	*
	2345	* @return tip text for this property suitable for
	2346	* displaying in the explorer/experimenter gui
	2347	*/
	2348	public String pruningStrategyTipText() {
	2349	return "Sets the pruning strategy.";
	2350	}
	2351
	2352	/**
	2353	* Sets the pruning strategy.
	2354	*
	2355	* @param value the strategy
	2356	*/
	2357	public void setPruningStrategy(SelectedTag value) {
	2358	if (value.getTags() == TAGS_PRUNING) {
	2359	m_PruningStrategy = value.getSelectedTag().getID();
	2360	}
	2361	}
	2362
	2363	/**
	2364	* Gets the pruning strategy.
	2365	*
	2366	* @return the current strategy.
	2367	*/
	2368	public SelectedTag getPruningStrategy() {
	2369	return new SelectedTag(m_PruningStrategy, TAGS_PRUNING);
	2370	}
	2371
	2372	/**
	2373	* Returns the tip text for this property
	2374	*
	2375	* @return tip text for this property suitable for
	2376	* displaying in the explorer/experimenter gui
	2377	*/
	2378	public String minNumObjTipText() {
	2379	return "Set minimal number of instances at the terminal nodes.";
	2380	}
	2381
	2382	/**
	2383	* Set minimal number of instances at the terminal nodes.
	2384	*
	2385	* @param value minimal number of instances at the terminal nodes
	2386	*/
	2387	public void setMinNumObj(int value) {
	2388	m_minNumObj = value;
	2389	}
	2390
	2391	/**
	2392	* Get minimal number of instances at the terminal nodes.
	2393	*
	2394	* @return minimal number of instances at the terminal nodes
	2395	*/
	2396	public int getMinNumObj() {
	2397	return m_minNumObj;
	2398	}
	2399
	2400	/**
	2401	* Returns the tip text for this property
	2402	*
	2403	* @return tip text for this property suitable for
	2404	* displaying in the explorer/experimenter gui
	2405	*/
	2406	public String numFoldsPruningTipText() {
	2407	return "Number of folds in internal cross-validation.";
	2408	}
	2409
	2410	/**
	2411	* Set number of folds in internal cross-validation.
	2412	*
	2413	* @param value the number of folds
	2414	*/
	2415	public void setNumFoldsPruning(int value) {
	2416	m_numFoldsPruning = value;
	2417	}
	2418
	2419	/**
	2420	* Set number of folds in internal cross-validation.
	2421	*
	2422	* @return number of folds in internal cross-validation
	2423	*/
	2424	public int getNumFoldsPruning() {
	2425	return m_numFoldsPruning;
	2426	}
	2427
	2428	/**
	2429	* Returns the tip text for this property
	2430	*
	2431	* @return tip text for this property suitable for
	2432	* displaying in the explorer/experimenter gui.
	2433	*/
	2434	public String heuristicTipText() {
	2435	return "If heuristic search is used for binary split for nominal attributes.";
	2436	}
	2437
	2438	/**
	2439	* Set if use heuristic search for nominal attributes in multi-class problems.
	2440	*
	2441	* @param value if use heuristic search for nominal attributes in
	2442	* multi-class problems
	2443	*/
	2444	public void setHeuristic(boolean value) {
	2445	m_Heuristic = value;
	2446	}
	2447
	2448	/**
	2449	* Get if use heuristic search for nominal attributes in multi-class problems.
	2450	*
	2451	* @return if use heuristic search for nominal attributes in
	2452	* multi-class problems
	2453	*/
	2454	public boolean getHeuristic() {
	2455	return m_Heuristic;
	2456	}
	2457
	2458	/**
	2459	* Returns the tip text for this property
	2460	*
	2461	* @return tip text for this property suitable for
	2462	* displaying in the explorer/experimenter gui.
	2463	*/
	2464	public String useGiniTipText() {
	2465	return "If true the Gini index is used for splitting criterion, otherwise the information is used.";
	2466	}
	2467
	2468	/**
	2469	* Set if use Gini index as splitting criterion.
	2470	*
	2471	* @param value if use Gini index splitting criterion
	2472	*/
	2473	public void setUseGini(boolean value) {
	2474	m_UseGini = value;
	2475	}
	2476
	2477	/**
	2478	* Get if use Gini index as splitting criterion.
	2479	*
	2480	* @return if use Gini index as splitting criterion
	2481	*/
	2482	public boolean getUseGini() {
	2483	return m_UseGini;
	2484	}
	2485
	2486	/**
	2487	* Returns the tip text for this property
	2488	*
	2489	* @return tip text for this property suitable for
	2490	* displaying in the explorer/experimenter gui.
	2491	*/
	2492	public String useErrorRateTipText() {
	2493	return "If error rate is used as error estimate. if not, root mean squared error is used.";
	2494	}
	2495
	2496	/**
	2497	* Set if use error rate in internal cross-validation.
	2498	*
	2499	* @param value if use error rate in internal cross-validation
	2500	*/
	2501	public void setUseErrorRate(boolean value) {
	2502	m_UseErrorRate = value;
	2503	}
	2504
	2505	/**
	2506	* Get if use error rate in internal cross-validation.
	2507	*
	2508	* @return if use error rate in internal cross-validation.
	2509	*/
	2510	public boolean getUseErrorRate() {
	2511	return m_UseErrorRate;
	2512	}
	2513
	2514	/**
	2515	* Returns the tip text for this property
	2516	*
	2517	* @return tip text for this property suitable for
	2518	* displaying in the explorer/experimenter gui.
	2519	*/
	2520	public String useOneSETipText() {
	2521	return "Use the 1SE rule to make pruning decision.";
	2522	}
	2523
	2524	/**
	2525	* Set if use the 1SE rule to choose final model.
	2526	*
	2527	* @param value if use the 1SE rule to choose final model
	2528	*/
	2529	public void setUseOneSE(boolean value) {
	2530	m_UseOneSE = value;
	2531	}
	2532
	2533	/**
	2534	* Get if use the 1SE rule to choose final model.
	2535	*
	2536	* @return if use the 1SE rule to choose final model
	2537	*/
	2538	public boolean getUseOneSE() {
	2539	return m_UseOneSE;
	2540	}
	2541
	2542	/**
	2543	* Returns the tip text for this property
	2544	*
	2545	* @return tip text for this property suitable for
	2546	* displaying in the explorer/experimenter gui.
	2547	*/
	2548	public String sizePerTipText() {
	2549	return "The percentage of the training set size (0-1, 0 not included).";
	2550	}
	2551
	2552	/**
	2553	* Set training set size.
	2554	*
	2555	* @param value training set size
	2556	*/
	2557	public void setSizePer(double value) {
	2558	if ((value <= 0) \|\| (value > 1))
	2559	System.err.println(
	2560	"The percentage of the training set size must be in range 0 to 1 "
	2561	+ "(0 not included) - ignored!");
	2562	else
	2563	m_SizePer = value;
	2564	}
	2565
	2566	/**
	2567	* Get training set size.
	2568	*
	2569	* @return training set size
	2570	*/
	2571	public double getSizePer() {
	2572	return m_SizePer;
	2573	}
	2574
	2575	/**
	2576	* Returns the revision string.
	2577	*
	2578	* @return the revision
	2579	*/
	2580	public String getRevision() {
	2581	return RevisionUtils.extract("$Revision: 5987 $");
	2582	}
	2583
	2584	/**
	2585	* Main method.
	2586	*
	2587	* @param args the options for the classifier
	2588	*/
	2589	public static void main(String[] args) {
	2590	runClassifier(new BFTree(), args);
	2591	}
	2592	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/classifiers/trees/BFTree.java

Download in other formats: