source: src/main/java/weka/classifiers/trees/Id3.java @ 24

Last change on this file since 24 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 15.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Id3.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.classifiers.trees;
24
25import weka.classifiers.Classifier;
26import weka.classifiers.AbstractClassifier;
27import weka.classifiers.Sourcable;
28import weka.core.Attribute;
29import weka.core.Capabilities;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.core.NoSupportForMissingValuesException;
33import weka.core.RevisionUtils;
34import weka.core.TechnicalInformation;
35import weka.core.TechnicalInformationHandler;
36import weka.core.Utils;
37import weka.core.Capabilities.Capability;
38import weka.core.TechnicalInformation.Field;
39import weka.core.TechnicalInformation.Type;
40
41import java.util.Enumeration;
42
43/**
44 <!-- globalinfo-start -->
45 * Class for constructing an unpruned decision tree based on the ID3 algorithm. Can only deal with nominal attributes. No missing values allowed. Empty leaves may result in unclassified instances. For more information see: <br/>
46 * <br/>
47 * R. Quinlan (1986). Induction of decision trees. Machine Learning. 1(1):81-106.
48 * <p/>
49 <!-- globalinfo-end -->
50 *
51 <!-- technical-bibtex-start -->
52 * BibTeX:
53 * <pre>
54 * &#64;article{Quinlan1986,
55 *    author = {R. Quinlan},
56 *    journal = {Machine Learning},
57 *    number = {1},
58 *    pages = {81-106},
59 *    title = {Induction of decision trees},
60 *    volume = {1},
61 *    year = {1986}
62 * }
63 * </pre>
64 * <p/>
65 <!-- technical-bibtex-end -->
66 *
67 <!-- options-start -->
68 * Valid options are: <p/>
69 *
70 * <pre> -D
71 *  If set, classifier is run in debug mode and
72 *  may output additional info to the console</pre>
73 *
74 <!-- options-end -->
75 *
76 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
77 * @version $Revision: 5987 $
78 */
public class Id3 
  extends AbstractClassifier
  implements TechnicalInformationHandler, Sourcable {

  /** for serialization */
  static final long serialVersionUID = -2693678647096322561L;
 
  /** The node's successors, one subtree per value of m_Attribute (null at a leaf). */ 
  private Id3[] m_Successors;

  /** Attribute used for splitting at this node; null if this node is a leaf. */
  private Attribute m_Attribute;

  /** Class value (index into the class attribute) if node is leaf;
      a missing value for a leaf built from zero instances. */
  private double m_ClassValue;

  /** Normalized class distribution if node is leaf. */
  private double[] m_Distribution;

  /** Class attribute of dataset (set only at leaves built from data;
      null at an empty leaf). */
  private Attribute m_ClassAttribute;
101  /**
102   * Returns a string describing the classifier.
103   * @return a description suitable for the GUI.
104   */
105  public String globalInfo() {
106
107    return  "Class for constructing an unpruned decision tree based on the ID3 "
108      + "algorithm. Can only deal with nominal attributes. No missing values "
109      + "allowed. Empty leaves may result in unclassified instances. For more "
110      + "information see: \n\n"
111      + getTechnicalInformation().toString();
112  }
113
114  /**
115   * Returns an instance of a TechnicalInformation object, containing
116   * detailed information about the technical background of this class,
117   * e.g., paper reference or book this class is based on.
118   *
119   * @return the technical information about this class
120   */
121  public TechnicalInformation getTechnicalInformation() {
122    TechnicalInformation        result;
123   
124    result = new TechnicalInformation(Type.ARTICLE);
125    result.setValue(Field.AUTHOR, "R. Quinlan");
126    result.setValue(Field.YEAR, "1986");
127    result.setValue(Field.TITLE, "Induction of decision trees");
128    result.setValue(Field.JOURNAL, "Machine Learning");
129    result.setValue(Field.VOLUME, "1");
130    result.setValue(Field.NUMBER, "1");
131    result.setValue(Field.PAGES, "81-106");
132   
133    return result;
134  }
135
136  /**
137   * Returns default capabilities of the classifier.
138   *
139   * @return      the capabilities of this classifier
140   */
141  public Capabilities getCapabilities() {
142    Capabilities result = super.getCapabilities();
143    result.disableAll();
144
145    // attributes
146    result.enable(Capability.NOMINAL_ATTRIBUTES);
147
148    // class
149    result.enable(Capability.NOMINAL_CLASS);
150    result.enable(Capability.MISSING_CLASS_VALUES);
151
152    // instances
153    result.setMinimumNumberInstances(0);
154   
155    return result;
156  }
157
158  /**
159   * Builds Id3 decision tree classifier.
160   *
161   * @param data the training data
162   * @exception Exception if classifier can't be built successfully
163   */
164  public void buildClassifier(Instances data) throws Exception {
165
166    // can classifier handle the data?
167    getCapabilities().testWithFail(data);
168
169    // remove instances with missing class
170    data = new Instances(data);
171    data.deleteWithMissingClass();
172   
173    makeTree(data);
174  }
175
176  /**
177   * Method for building an Id3 tree.
178   *
179   * @param data the training data
180   * @exception Exception if decision tree can't be built successfully
181   */
182  private void makeTree(Instances data) throws Exception {
183
184    // Check if no instances have reached this node.
185    if (data.numInstances() == 0) {
186      m_Attribute = null;
187      m_ClassValue = Utils.missingValue();
188      m_Distribution = new double[data.numClasses()];
189      return;
190    }
191
192    // Compute attribute with maximum information gain.
193    double[] infoGains = new double[data.numAttributes()];
194    Enumeration attEnum = data.enumerateAttributes();
195    while (attEnum.hasMoreElements()) {
196      Attribute att = (Attribute) attEnum.nextElement();
197      infoGains[att.index()] = computeInfoGain(data, att);
198    }
199    m_Attribute = data.attribute(Utils.maxIndex(infoGains));
200   
201    // Make leaf if information gain is zero.
202    // Otherwise create successors.
203    if (Utils.eq(infoGains[m_Attribute.index()], 0)) {
204      m_Attribute = null;
205      m_Distribution = new double[data.numClasses()];
206      Enumeration instEnum = data.enumerateInstances();
207      while (instEnum.hasMoreElements()) {
208        Instance inst = (Instance) instEnum.nextElement();
209        m_Distribution[(int) inst.classValue()]++;
210      }
211      Utils.normalize(m_Distribution);
212      m_ClassValue = Utils.maxIndex(m_Distribution);
213      m_ClassAttribute = data.classAttribute();
214    } else {
215      Instances[] splitData = splitData(data, m_Attribute);
216      m_Successors = new Id3[m_Attribute.numValues()];
217      for (int j = 0; j < m_Attribute.numValues(); j++) {
218        m_Successors[j] = new Id3();
219        m_Successors[j].makeTree(splitData[j]);
220      }
221    }
222  }
223
224  /**
225   * Classifies a given test instance using the decision tree.
226   *
227   * @param instance the instance to be classified
228   * @return the classification
229   * @throws NoSupportForMissingValuesException if instance has missing values
230   */
231  public double classifyInstance(Instance instance) 
232    throws NoSupportForMissingValuesException {
233
234    if (instance.hasMissingValue()) {
235      throw new NoSupportForMissingValuesException("Id3: no missing values, "
236                                                   + "please.");
237    }
238    if (m_Attribute == null) {
239      return m_ClassValue;
240    } else {
241      return m_Successors[(int) instance.value(m_Attribute)].
242        classifyInstance(instance);
243    }
244  }
245
246  /**
247   * Computes class distribution for instance using decision tree.
248   *
249   * @param instance the instance for which distribution is to be computed
250   * @return the class distribution for the given instance
251   * @throws NoSupportForMissingValuesException if instance has missing values
252   */
253  public double[] distributionForInstance(Instance instance) 
254    throws NoSupportForMissingValuesException {
255
256    if (instance.hasMissingValue()) {
257      throw new NoSupportForMissingValuesException("Id3: no missing values, "
258                                                   + "please.");
259    }
260    if (m_Attribute == null) {
261      return m_Distribution;
262    } else { 
263      return m_Successors[(int) instance.value(m_Attribute)].
264        distributionForInstance(instance);
265    }
266  }
267
268  /**
269   * Prints the decision tree using the private toString method from below.
270   *
271   * @return a textual description of the classifier
272   */
273  public String toString() {
274
275    if ((m_Distribution == null) && (m_Successors == null)) {
276      return "Id3: No model built yet.";
277    }
278    return "Id3\n\n" + toString(0);
279  }
280
281  /**
282   * Computes information gain for an attribute.
283   *
284   * @param data the data for which info gain is to be computed
285   * @param att the attribute
286   * @return the information gain for the given attribute and data
287   * @throws Exception if computation fails
288   */
289  private double computeInfoGain(Instances data, Attribute att) 
290    throws Exception {
291
292    double infoGain = computeEntropy(data);
293    Instances[] splitData = splitData(data, att);
294    for (int j = 0; j < att.numValues(); j++) {
295      if (splitData[j].numInstances() > 0) {
296        infoGain -= ((double) splitData[j].numInstances() /
297                     (double) data.numInstances()) *
298          computeEntropy(splitData[j]);
299      }
300    }
301    return infoGain;
302  }
303
304  /**
305   * Computes the entropy of a dataset.
306   *
307   * @param data the data for which entropy is to be computed
308   * @return the entropy of the data's class distribution
309   * @throws Exception if computation fails
310   */
311  private double computeEntropy(Instances data) throws Exception {
312
313    double [] classCounts = new double[data.numClasses()];
314    Enumeration instEnum = data.enumerateInstances();
315    while (instEnum.hasMoreElements()) {
316      Instance inst = (Instance) instEnum.nextElement();
317      classCounts[(int) inst.classValue()]++;
318    }
319    double entropy = 0;
320    for (int j = 0; j < data.numClasses(); j++) {
321      if (classCounts[j] > 0) {
322        entropy -= classCounts[j] * Utils.log2(classCounts[j]);
323      }
324    }
325    entropy /= (double) data.numInstances();
326    return entropy + Utils.log2(data.numInstances());
327  }
328
329  /**
330   * Splits a dataset according to the values of a nominal attribute.
331   *
332   * @param data the data which is to be split
333   * @param att the attribute to be used for splitting
334   * @return the sets of instances produced by the split
335   */
336  private Instances[] splitData(Instances data, Attribute att) {
337
338    Instances[] splitData = new Instances[att.numValues()];
339    for (int j = 0; j < att.numValues(); j++) {
340      splitData[j] = new Instances(data, data.numInstances());
341    }
342    Enumeration instEnum = data.enumerateInstances();
343    while (instEnum.hasMoreElements()) {
344      Instance inst = (Instance) instEnum.nextElement();
345      splitData[(int) inst.value(att)].add(inst);
346    }
347    for (int i = 0; i < splitData.length; i++) {
348      splitData[i].compactify();
349    }
350    return splitData;
351  }
352
353  /**
354   * Outputs a tree at a certain level.
355   *
356   * @param level the level at which the tree is to be printed
357   * @return the tree as string at the given level
358   */
359  private String toString(int level) {
360
361    StringBuffer text = new StringBuffer();
362   
363    if (m_Attribute == null) {
364      if (Utils.isMissingValue(m_ClassValue)) {
365        text.append(": null");
366      } else {
367        text.append(": " + m_ClassAttribute.value((int) m_ClassValue));
368      } 
369    } else {
370      for (int j = 0; j < m_Attribute.numValues(); j++) {
371        text.append("\n");
372        for (int i = 0; i < level; i++) {
373          text.append("|  ");
374        }
375        text.append(m_Attribute.name() + " = " + m_Attribute.value(j));
376        text.append(m_Successors[j].toString(level + 1));
377      }
378    }
379    return text.toString();
380  }
381
382  /**
383   * Adds this tree recursively to the buffer.
384   *
385   * @param id          the unqiue id for the method
386   * @param buffer      the buffer to add the source code to
387   * @return            the last ID being used
388   * @throws Exception  if something goes wrong
389   */
390  protected int toSource(int id, StringBuffer buffer) throws Exception {
391    int                 result;
392    int                 i;
393    int                 newID;
394    StringBuffer[]      subBuffers;
395   
396    buffer.append("\n");
397    buffer.append("  protected static double node" + id + "(Object[] i) {\n");
398   
399    // leaf?
400    if (m_Attribute == null) {
401      result = id;
402      if (Double.isNaN(m_ClassValue))
403        buffer.append("    return Double.NaN;");
404      else
405        buffer.append("    return " + m_ClassValue + ";");
406      if (m_ClassAttribute != null)
407        buffer.append(" // " + m_ClassAttribute.value((int) m_ClassValue));
408      buffer.append("\n");
409      buffer.append("  }\n");
410    }
411    else {
412      buffer.append("    // " + m_Attribute.name() + "\n");
413     
414      // subtree calls
415      subBuffers = new StringBuffer[m_Attribute.numValues()];
416      newID      = id;
417      for (i = 0; i < m_Attribute.numValues(); i++) {
418        newID++;
419
420        buffer.append("    ");
421        if (i > 0)
422          buffer.append("else ");
423        buffer.append("if (((String) i[" + m_Attribute.index() + "]).equals(\"" + m_Attribute.value(i) + "\"))\n");
424        buffer.append("      return node" + newID + "(i);\n");
425
426        subBuffers[i] = new StringBuffer();
427        newID         = m_Successors[i].toSource(newID, subBuffers[i]);
428      }
429      buffer.append("    else\n");
430      buffer.append("      throw new IllegalArgumentException(\"Value '\" + i[" + m_Attribute.index() + "] + \"' is not allowed!\");\n");
431      buffer.append("  }\n");
432
433      // output subtree code
434      for (i = 0; i < m_Attribute.numValues(); i++) {
435        buffer.append(subBuffers[i].toString());
436      }
437      subBuffers = null;
438     
439      result = newID;
440    }
441   
442    return result;
443  }
444 
445  /**
446   * Returns a string that describes the classifier as source. The
447   * classifier will be contained in a class with the given name (there may
448   * be auxiliary classes),
449   * and will contain a method with the signature:
450   * <pre><code>
451   * public static double classify(Object[] i);
452   * </code></pre>
453   * where the array <code>i</code> contains elements that are either
454   * Double, String, with missing values represented as null. The generated
455   * code is public domain and comes with no warranty. <br/>
456   * Note: works only if class attribute is the last attribute in the dataset.
457   *
458   * @param className the name that should be given to the source class.
459   * @return the object source described by a string
460   * @throws Exception if the souce can't be computed
461   */
462  public String toSource(String className) throws Exception {
463    StringBuffer        result;
464    int                 id;
465   
466    result = new StringBuffer();
467
468    result.append("class " + className + " {\n");
469    result.append("  public static double classify(Object[] i) {\n");
470    id = 0;
471    result.append("    return node" + id + "(i);\n");
472    result.append("  }\n");
473    toSource(id, result);
474    result.append("}\n");
475
476    return result.toString();
477  }
478 
479  /**
480   * Returns the revision string.
481   *
482   * @return            the revision
483   */
484  public String getRevision() {
485    return RevisionUtils.extract("$Revision: 5987 $");
486  }
487
488  /**
489   * Main method.
490   *
491   * @param args the options for the classifier
492   */
493  public static void main(String[] args) {
494    runClassifier(new Id3(), args);
495  }
496}
Note: See TracBrowser for help on using the repository browser.