source: src/main/java/weka/classifiers/trees/Id3.java @ 24

Last change on this file since 24 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 15.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Id3.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.classifiers.trees;
24
25import weka.classifiers.Classifier;
26import weka.classifiers.AbstractClassifier;
27import weka.classifiers.Sourcable;
28import weka.core.Attribute;
29import weka.core.Capabilities;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.core.NoSupportForMissingValuesException;
33import weka.core.RevisionUtils;
34import weka.core.TechnicalInformation;
35import weka.core.TechnicalInformationHandler;
36import weka.core.Utils;
37import weka.core.Capabilities.Capability;
38import weka.core.TechnicalInformation.Field;
39import weka.core.TechnicalInformation.Type;
40
41import java.util.Enumeration;
42
43/**
44 <!-- globalinfo-start -->
45 * Class for constructing an unpruned decision tree based on the ID3 algorithm. Can only deal with nominal attributes. No missing values allowed. Empty leaves may result in unclassified instances. For more information see: <br/>
46 * <br/>
47 * R. Quinlan (1986). Induction of decision trees. Machine Learning. 1(1):81-106.
48 * <p/>
49 <!-- globalinfo-end -->
50 *
51 <!-- technical-bibtex-start -->
52 * BibTeX:
53 * <pre>
54 * &#64;article{Quinlan1986,
55 *    author = {R. Quinlan},
56 *    journal = {Machine Learning},
57 *    number = {1},
58 *    pages = {81-106},
59 *    title = {Induction of decision trees},
60 *    volume = {1},
61 *    year = {1986}
62 * }
63 * </pre>
64 * <p/>
65 <!-- technical-bibtex-end -->
66 *
67 <!-- options-start -->
68 * Valid options are: <p/>
69 *
70 * <pre> -D
71 *  If set, classifier is run in debug mode and
72 *  may output additional info to the console</pre>
73 *
74 <!-- options-end -->
75 *
76 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
77 * @version $Revision: 5987 $
78 */
public class Id3 
  extends AbstractClassifier
  implements TechnicalInformationHandler, Sourcable {

  /** for serialization */
  static final long serialVersionUID = -2693678647096322561L;
 
  /** The node's successors, one subtree per value of m_Attribute (null at a leaf). */ 
  private Id3[] m_Successors;

  /** Attribute used for splitting at this node; null if this node is a leaf. */
  private Attribute m_Attribute;

  /** Class value (index into the class attribute) if node is leaf;
      a missing value for a leaf built from zero instances. */
  private double m_ClassValue;

  /** Normalized class distribution if node is leaf. */
  private double[] m_Distribution;

  /** Class attribute of dataset (set only at leaves built from data;
      null at an empty leaf). */
  private Attribute m_ClassAttribute;
101  /**
102   * Returns a string describing the classifier.
103   * @return a description suitable for the GUI.
104   */
105  public String globalInfo() {
106
107    return  "Class for constructing an unpruned decision tree based on the ID3 "
108      + "algorithm. Can only deal with nominal attributes. No missing values "
109      + "allowed. Empty leaves may result in unclassified instances. For more "
110      + "information see: \n\n"
111      + getTechnicalInformation().toString();
112  }
113
114  /**
115   * Returns an instance of a TechnicalInformation object, containing
116   * detailed information about the technical background of this class,
117   * e.g., paper reference or book this class is based on.
118   *
119   * @return the technical information about this class
120   */
121  public TechnicalInformation getTechnicalInformation() {
122    TechnicalInformation        result;
123   
124    result = new TechnicalInformation(Type.ARTICLE);
125    result.setValue(Field.AUTHOR, "R. Quinlan");
126    result.setValue(Field.YEAR, "1986");
127    result.setValue(Field.TITLE, "Induction of decision trees");
128    result.setValue(Field.JOURNAL, "Machine Learning");
129    result.setValue(Field.VOLUME, "1");
130    result.setValue(Field.NUMBER, "1");
131    result.setValue(Field.PAGES, "81-106");
132   
133    return result;
134  }
135
136  /**
137   * Returns default capabilities of the classifier.
138   *
139   * @return      the capabilities of this classifier
140   */
141  public Capabilities getCapabilities() {
142    Capabilities result = super.getCapabilities();
143    result.disableAll();
144
145    // attributes
146    result.enable(Capability.NOMINAL_ATTRIBUTES);
147
148    // class
149    result.enable(Capability.NOMINAL_CLASS);
150    result.enable(Capability.MISSING_CLASS_VALUES);
151
152    // instances
153    result.setMinimumNumberInstances(0);
154   
155    return result;
156  }
157
158  /**
159   * Builds Id3 decision tree classifier.
160   *
161   * @param data the training data
162   * @exception Exception if classifier can't be built successfully
163   */
164  public void buildClassifier(Instances data) throws Exception {
165
166    // can classifier handle the data?
167    getCapabilities().testWithFail(data);
168
169    // remove instances with missing class
170    data = new Instances(data);
171    data.deleteWithMissingClass();
172   
173    makeTree(data);
174  }
175
176  /**
177   * Method for building an Id3 tree.
178   *
179   * @param data the training data
180   * @exception Exception if decision tree can't be built successfully
181   */
182  private void makeTree(Instances data) throws Exception {
183
184    // Check if no instances have reached this node.
185    if (data.numInstances() == 0) {
186      m_Attribute = null;
187      m_ClassValue = Utils.missingValue();
188      m_Distribution = new double[data.numClasses()];
189      return;
190    }
191
192    // Compute attribute with maximum information gain.
193    double[] infoGains = new double[data.numAttributes()];
194    Enumeration attEnum = data.enumerateAttributes();
195    while (attEnum.hasMoreElements()) {
196      Attribute att = (Attribute) attEnum.nextElement();
197      infoGains[att.index()] = computeInfoGain(data, att);
198    }
199    m_Attribute = data.attribute(Utils.maxIndex(infoGains));
200   
201    // Make leaf if information gain is zero.
202    // Otherwise create successors.
203    if (Utils.eq(infoGains[m_Attribute.index()], 0)) {
204      m_Attribute = null;
205      m_Distribution = new double[data.numClasses()];
206      Enumeration instEnum = data.enumerateInstances();
207      while (instEnum.hasMoreElements()) {
208        Instance inst = (Instance) instEnum.nextElement();
209        m_Distribution[(int) inst.classValue()]++;
210      }
211      Utils.normalize(m_Distribution);
212      m_ClassValue = Utils.maxIndex(m_Distribution);
213      m_ClassAttribute = data.classAttribute();
214    } else {
215      Instances[] splitData = splitData(data, m_Attribute);
216      m_Successors = new Id3[m_Attribute.numValues()];
217      for (int j = 0; j < m_Attribute.numValues(); j++) {
218        m_Successors[j] = new Id3();
219        m_Successors[j].makeTree(splitData[j]);
220      }
221    }
222  }
223
224  /**
225   * Classifies a given test instance using the decision tree.
226   *
227   * @param instance the instance to be classified
228   * @return the classification
229   * @throws NoSupportForMissingValuesException if instance has missing values
230   */
231  public double classifyInstance(Instance instance) 
232    throws NoSupportForMissingValuesException {
233
234    if (instance.hasMissingValue()) {
235      throw new NoSupportForMissingValuesException("Id3: no missing values, "
236                                                   + "please.");
237    }
238    if (m_Attribute == null) {
239      return m_ClassValue;
240    } else {
241      return m_Successors[(int) instance.value(m_Attribute)].
242        classifyInstance(instance);
243    }
244  }
245
246  /**
247   * Computes class distribution for instance using decision tree.
248   *
249   * @param instance the instance for which distribution is to be computed
250   * @return the class distribution for the given instance
251   * @throws NoSupportForMissingValuesException if instance has missing values
252   */
253  public double[] distributionForInstance(Instance instance) 
254    throws NoSupportForMissingValuesException {
255
256    if (instance.hasMissingValue()) {
257      throw new NoSupportForMissingValuesException("Id3: no missing values, "
258                                                   + "please.");
259    }
260    if (m_Attribute == null) {
261      return m_Distribution;
262    } else { 
263      return m_Successors[(int) instance.value(m_Attribute)].
264        distributionForInstance(instance);
265    }
266  }
267
268  /**
269   * Prints the decision tree using the private toString method from below.
270   *
271   * @return a textual description of the classifier
272   */
273  public String toString() {
274
275    if ((m_Distribution == null) && (m_Successors == null)) {
276      return "Id3: No model built yet.";
277    }
278    return "Id3\n\n" + toString(0);
279  }
280
281  /**
282   * Computes information gain for an attribute.
283   *
284   * @param data the data for which info gain is to be computed
285   * @param att the attribute
286   * @return the information gain for the given attribute and data
287   * @throws Exception if computation fails
288   */
289  private double computeInfoGain(Instances data, Attribute att) 
290    throws Exception {
291
292    double infoGain = computeEntropy(data);
293    Instances[] splitData = splitData(data, att);
294    for (int j = 0; j < att.numValues(); j++) {
295      if (splitData[j].numInstances() > 0) {
296        infoGain -= ((double) splitData[j].numInstances() /
297                     (double) data.numInstances()) *
298          computeEntropy(splitData[j]);
299      }
300    }
301    return infoGain;
302  }
303
304  /**
305   * Computes the entropy of a dataset.
306   *
307   * @param data the data for which entropy is to be computed
308   * @return the entropy of the data's class distribution
309   * @throws Exception if computation fails
310   */
311  private double computeEntropy(Instances data) throws Exception {
312
313    double [] classCounts = new double[data.numClasses()];
314    Enumeration instEnum = data.enumerateInstances();
315    while (instEnum.hasMoreElements()) {
316      Instance inst = (Instance) instEnum.nextElement();
317      classCounts[(int) inst.classValue()]++;
318    }
319    double entropy = 0;
320    for (int j = 0; j < data.numClasses(); j++) {
321      if (classCounts[j] > 0) {
322        entropy -= classCounts[j] * Utils.log2(classCounts[j]);
323      }
324    }
325    entropy /= (double) data.numInstances();
326    return entropy + Utils.log2(data.numInstances());
327  }
328
329  /**
330   * Splits a dataset according to the values of a nominal attribute.
331   *
332   * @param data the data which is to be split
333   * @param att the attribute to be used for splitting
334   * @return the sets of instances produced by the split
335   */
336  private Instances[] splitData(Instances data, Attribute att) {
337
338    Instances[] splitData = new Instances[att.numValues()];
339    for (int j = 0; j < att.numValues(); j++) {
340      splitData[j] = new Instances(data, data.numInstances());
341    }
342    Enumeration instEnum = data.enumerateInstances();
343    while (instEnum.hasMoreElements()) {
344      Instance inst = (Instance) instEnum.nextElement();
345      splitData[(int) inst.value(att)].add(inst);
346    }
347    for (int i = 0; i < splitData.length; i++) {
348      splitData[i].compactify();
349    }
350    return splitData;
351  }
352
353  /**
354   * Outputs a tree at a certain level.
355   *
356   * @param level the level at which the tree is to be printed
357   * @return the tree as string at the given level
358   */
359  private String toString(int level) {
360
361    StringBuffer text = new StringBuffer();
362   
363    if (m_Attribute == null) {
364      if (Utils.isMissingValue(m_ClassValue)) {
365        text.append(": null");
366      } else {
367        text.append(": " + m_ClassAttribute.value((int) m_ClassValue));
368      } 
369    } else {
370      for (int j = 0; j < m_Attribute.numValues(); j++) {
371        text.append("\n");
372        for (int i = 0; i < level; i++) {
373          text.append("|  ");
374        }
375        text.append(m_Attribute.name() + " = " + m_Attribute.value(j));
376        text.append(m_Successors[j].toString(level + 1));
377      }
378    }
379    return text.toString();
380  }
381
382  /**
383   * Adds this tree recursively to the buffer.
384   *
385   * @param id          the unqiue id for the method
386   * @param buffer      the buffer to add the source code to
387   * @return            the last ID being used
388   * @throws Exception  if something goes wrong
389   */
390  protected int toSource(int id, StringBuffer buffer) throws Exception {
391    int                 result;
392    int                 i;
393    int                 newID;
394    StringBuffer[]      subBuffers;
395   
396    buffer.append("\n");
397    buffer.append("  protected static double node" + id + "(Object[] i) {\n");
398   
399    // leaf?
400    if (m_Attribute == null) {
401      result = id;
402      if (Double.isNaN(m_ClassValue))
403        buffer.append("    return Double.NaN;");
404      else
405        buffer.append("    return " + m_ClassValue + ";");
406      if (m_ClassAttribute != null)
407        buffer.append(" // " + m_ClassAttribute.value((int) m_ClassValue));
408      buffer.append("\n");
409      buffer.append("  }\n");
410    }
411    else {
412      buffer.append("    // " + m_Attribute.name() + "\n");
413     
414      // subtree calls
415      subBuffers = new StringBuffer[m_Attribute.numValues()];
416      newID      = id;
417      for (i = 0; i < m_Attribute.numValues(); i++) {
418        newID++;
419
420        buffer.append("    ");
421        if (i > 0)
422          buffer.append("else ");
423        buffer.append("if (((String) i[" + m_Attribute.index() + "]).equals(\"" + m_Attribute.value(i) + "\"))\n");
424        buffer.append("      return node" + newID + "(i);\n");
425
426        subBuffers[i] = new StringBuffer();
427        newID         = m_Successors[i].toSource(newID, subBuffers[i]);
428      }
429      buffer.append("    else\n");
430      buffer.append("      throw new IllegalArgumentException(\"Value '\" + i[" + m_Attribute.index() + "] + \"' is not allowed!\");\n");
431      buffer.append("  }\n");
432
433      // output subtree code
434      for (i = 0; i < m_Attribute.numValues(); i++) {
435        buffer.append(subBuffers[i].toString());
436      }
437      subBuffers = null;
438     
439      result = newID;
440    }
441   
442    return result;
443  }
444 
445  /**
446   * Returns a string that describes the classifier as source. The
447   * classifier will be contained in a class with the given name (there may
448   * be auxiliary classes),
449   * and will contain a method with the signature:
450   * <pre><code>
451   * public static double classify(Object[] i);
452   * </code></pre>
453   * where the array <code>i</code> contains elements that are either
454   * Double, String, with missing values represented as null. The generated
455   * code is public domain and comes with no warranty. <br/>
456   * Note: works only if class attribute is the last attribute in the dataset.
457   *
458   * @param className the name that should be given to the source class.
459   * @return the object source described by a string
460   * @throws Exception if the souce can't be computed
461   */
462  public String toSource(String className) throws Exception {
463    StringBuffer        result;
464    int                 id;
465   
466    result = new StringBuffer();
467
468    result.append("class " + className + " {\n");
469    result.append("  public static double classify(Object[] i) {\n");
470    id = 0;
471    result.append("    return node" + id + "(i);\n");
472    result.append("  }\n");
473    toSource(id, result);
474    result.append("}\n");
475
476    return result.toString();
477  }
478 
479  /**
480   * Returns the revision string.
481   *
482   * @return            the revision
483   */
484  public String getRevision() {
485    return RevisionUtils.extract("$Revision: 5987 $");
486  }
487
488  /**
489   * Main method.
490   *
491   * @param args the options for the classifier
492   */
493  public static void main(String[] args) {
494    runClassifier(new Id3(), args);
495  }
496}
Note: See TracBrowser for help on using the repository browser.