Context Navigation

source: src/main/java/weka/classifiers/trees/j48/C45ModelSelection.java @ 7

Last change on this file since 7 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 6.1 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* C45ModelSelection.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers.trees.j48;
24
25	import weka.core.Attribute;
26	import weka.core.Instances;
27	import weka.core.RevisionUtils;
28	import weka.core.Utils;
29
30	import java.util.Enumeration;
31
32	/**
33	* Class for selecting a C4.5-type split for a given dataset.
34	*
35	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
36	* @version $Revision: 6073 $
37	*/
38	public class C45ModelSelection
39	extends ModelSelection {
40
41	/** for serialization */
42	private static final long serialVersionUID = 3372204862440821989L;
43
44	/** Minimum number of objects in interval. */
45	private int m_minNoObj;
46
47	/** Use MDL correction? */
48	private boolean m_useMDLcorrection;
49
50	/** All the training data */
51	private Instances m_allData; //
52
53	/**
54	* Initializes the split selection method with the given parameters.
55	*
56	* @param minNoObj minimum number of instances that have to occur in at least two
57	* subsets induced by split
58	* @param allData FULL training dataset (necessary for
59	* selection of split points).
60	* @param useMDLcorrection whether to use MDL adjustement when
61	* finding splits on numeric attributes
62	*/
63	public C45ModelSelection(int minNoObj, Instances allData,
64	boolean useMDLcorrection) {
65	m_minNoObj = minNoObj;
66	m_allData = allData;
67	m_useMDLcorrection = useMDLcorrection;
68	}
69
70	/**
71	* Sets reference to training data to null.
72	*/
73	public void cleanup() {
74
75	m_allData = null;
76	}
77
78	/**
79	* Selects C4.5-type split for the given dataset.
80	*/
81	public final ClassifierSplitModel selectModel(Instances data){
82
83	double minResult;
84	double currentResult;
85	C45Split [] currentModel;
86	C45Split bestModel = null;
87	NoSplit noSplitModel = null;
88	double averageInfoGain = 0;
89	int validModels = 0;
90	boolean multiVal = true;
91	Distribution checkDistribution;
92	Attribute attribute;
93	double sumOfWeights;
94	int i;
95
96	try{
97
98	// Check if all Instances belong to one class or if not
99	// enough Instances to split.
100	checkDistribution = new Distribution(data);
101	noSplitModel = new NoSplit(checkDistribution);
102	if (Utils.sm(checkDistribution.total(),2*m_minNoObj) \|\|
103	Utils.eq(checkDistribution.total(),
104	checkDistribution.perClass(checkDistribution.maxClass())))
105	return noSplitModel;
106
107	// Check if all attributes are nominal and have a
108	// lot of values.
109	if (m_allData != null) {
110	Enumeration enu = data.enumerateAttributes();
111	while (enu.hasMoreElements()) {
112	attribute = (Attribute) enu.nextElement();
113	if ((attribute.isNumeric()) \|\|
114	(Utils.sm((double)attribute.numValues(),
115	(0.3*(double)m_allData.numInstances())))){
116	multiVal = false;
117	break;
118	}
119	}
120	}
121
122	currentModel = new C45Split[data.numAttributes()];
123	sumOfWeights = data.sumOfWeights();
124
125	// For each attribute.
126	for (i = 0; i < data.numAttributes(); i++){
127
128	// Apart from class attribute.
129	if (i != (data).classIndex()){
130
131	// Get models for current attribute.
132	currentModel[i] = new C45Split(i,m_minNoObj,sumOfWeights,m_useMDLcorrection);
133	currentModel[i].buildClassifier(data);
134
135	// Check if useful split for current attribute
136	// exists and check for enumerated attributes with
137	// a lot of values.
138	if (currentModel[i].checkModel())
139	if (m_allData != null) {
140	if ((data.attribute(i).isNumeric()) \|\|
141	(multiVal \|\| Utils.sm((double)data.attribute(i).numValues(),
142	(0.3*(double)m_allData.numInstances())))){
143	averageInfoGain = averageInfoGain+currentModel[i].infoGain();
144	validModels++;
145	}
146	} else {
147	averageInfoGain = averageInfoGain+currentModel[i].infoGain();
148	validModels++;
149	}
150	}else
151	currentModel[i] = null;
152	}
153
154	// Check if any useful split was found.
155	if (validModels == 0)
156	return noSplitModel;
157	averageInfoGain = averageInfoGain/(double)validModels;
158
159	// Find "best" attribute to split on.
160	minResult = 0;
161	for (i=0;i<data.numAttributes();i++){
162	if ((i != (data).classIndex()) &&
163	(currentModel[i].checkModel()))
164
165	// Use 1E-3 here to get a closer approximation to the original
166	// implementation.
167	if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) &&
168	Utils.gr(currentModel[i].gainRatio(),minResult)){
169	bestModel = currentModel[i];
170	minResult = currentModel[i].gainRatio();
171	}
172	}
173
174	// Check if useful split was found.
175	if (Utils.eq(minResult,0))
176	return noSplitModel;
177
178	// Add all Instances with unknown values for the corresponding
179	// attribute to the distribution for the model, so that
180	// the complete distribution is stored with the model.
181	bestModel.distribution().
182	addInstWithUnknown(data,bestModel.attIndex());
183
184	// Set the split point analogue to C45 if attribute numeric.
185	if (m_allData != null)
186	bestModel.setSplitPoint(m_allData);
187	return bestModel;
188	}catch(Exception e){
189	e.printStackTrace();
190	}
191	return null;
192	}
193
194	/**
195	* Selects C4.5-type split for the given dataset.
196	*/
197	public final ClassifierSplitModel selectModel(Instances train, Instances test) {
198
199	return selectModel(train);
200	}
201
202	/**
203	* Returns the revision string.
204	*
205	* @return the revision
206	*/
207	public String getRevision() {
208	return RevisionUtils.extract("$Revision: 6073 $");
209	}
210	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: