source: src/main/java/weka/classifiers/trees/j48/C45ModelSelection.java @ 4

Last change on this file since 4 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 6.1 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    C45ModelSelection.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.classifiers.trees.j48;
24
25import weka.core.Attribute;
26import weka.core.Instances;
27import weka.core.RevisionUtils;
28import weka.core.Utils;
29
30import java.util.Enumeration;
31
32/**
33 * Class for selecting a C4.5-type split for a given dataset.
34 *
35 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
36 * @version $Revision: 6073 $
37 */
38public class C45ModelSelection
39  extends ModelSelection {
40
41  /** for serialization */
42  private static final long serialVersionUID = 3372204862440821989L;
43
44  /** Minimum number of objects in interval. */
45  private int m_minNoObj;               
46
47  /** Use MDL correction? */
48  private boolean m_useMDLcorrection;         
49
50  /** All the training data */
51  private Instances m_allData; //
52
53  /**
54   * Initializes the split selection method with the given parameters.
55   *
56   * @param minNoObj minimum number of instances that have to occur in at least two
57   * subsets induced by split
58   * @param allData FULL training dataset (necessary for
59   * selection of split points).
60   * @param useMDLcorrection whether to use MDL adjustement when
61   * finding splits on numeric attributes
62   */
63  public C45ModelSelection(int minNoObj, Instances allData,
64                             boolean useMDLcorrection) {
65    m_minNoObj = minNoObj;
66    m_allData = allData;
67    m_useMDLcorrection = useMDLcorrection;
68  }
69
70  /**
71   * Sets reference to training data to null.
72   */
73  public void cleanup() {
74
75    m_allData = null;
76  }
77
78  /**
79   * Selects C4.5-type split for the given dataset.
80   */
81  public final ClassifierSplitModel selectModel(Instances data){
82
83    double minResult;
84    double currentResult;
85    C45Split [] currentModel;
86    C45Split bestModel = null;
87    NoSplit noSplitModel = null;
88    double averageInfoGain = 0;
89    int validModels = 0;
90    boolean multiVal = true;
91    Distribution checkDistribution;
92    Attribute attribute;
93    double sumOfWeights;
94    int i;
95   
96    try{
97
98      // Check if all Instances belong to one class or if not
99      // enough Instances to split.
100      checkDistribution = new Distribution(data);
101      noSplitModel = new NoSplit(checkDistribution);
102      if (Utils.sm(checkDistribution.total(),2*m_minNoObj) ||
103          Utils.eq(checkDistribution.total(),
104                   checkDistribution.perClass(checkDistribution.maxClass())))
105        return noSplitModel;
106
107      // Check if all attributes are nominal and have a
108      // lot of values.
109      if (m_allData != null) {
110        Enumeration enu = data.enumerateAttributes();
111        while (enu.hasMoreElements()) {
112          attribute = (Attribute) enu.nextElement();
113          if ((attribute.isNumeric()) ||
114              (Utils.sm((double)attribute.numValues(),
115                        (0.3*(double)m_allData.numInstances())))){
116            multiVal = false;
117            break;
118          }
119        }
120      } 
121
122      currentModel = new C45Split[data.numAttributes()];
123      sumOfWeights = data.sumOfWeights();
124
125      // For each attribute.
126      for (i = 0; i < data.numAttributes(); i++){
127       
128        // Apart from class attribute.
129        if (i != (data).classIndex()){
130         
131          // Get models for current attribute.
132          currentModel[i] = new C45Split(i,m_minNoObj,sumOfWeights,m_useMDLcorrection);
133          currentModel[i].buildClassifier(data);
134         
135          // Check if useful split for current attribute
136          // exists and check for enumerated attributes with
137          // a lot of values.
138          if (currentModel[i].checkModel())
139            if (m_allData != null) {
140              if ((data.attribute(i).isNumeric()) ||
141                  (multiVal || Utils.sm((double)data.attribute(i).numValues(),
142                                        (0.3*(double)m_allData.numInstances())))){
143                averageInfoGain = averageInfoGain+currentModel[i].infoGain();
144                validModels++;
145              } 
146            } else {
147              averageInfoGain = averageInfoGain+currentModel[i].infoGain();
148              validModels++;
149            }
150        }else
151          currentModel[i] = null;
152      }
153     
154      // Check if any useful split was found.
155      if (validModels == 0)
156        return noSplitModel;
157      averageInfoGain = averageInfoGain/(double)validModels;
158
159      // Find "best" attribute to split on.
160      minResult = 0;
161      for (i=0;i<data.numAttributes();i++){
162        if ((i != (data).classIndex()) &&
163            (currentModel[i].checkModel()))
164         
165          // Use 1E-3 here to get a closer approximation to the original
166          // implementation.
167          if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) &&
168              Utils.gr(currentModel[i].gainRatio(),minResult)){ 
169            bestModel = currentModel[i];
170            minResult = currentModel[i].gainRatio();
171          } 
172      }
173
174      // Check if useful split was found.
175      if (Utils.eq(minResult,0))
176        return noSplitModel;
177     
178      // Add all Instances with unknown values for the corresponding
179      // attribute to the distribution for the model, so that
180      // the complete distribution is stored with the model.
181      bestModel.distribution().
182          addInstWithUnknown(data,bestModel.attIndex());
183     
184      // Set the split point analogue to C45 if attribute numeric.
185      if (m_allData != null)
186        bestModel.setSplitPoint(m_allData);
187      return bestModel;
188    }catch(Exception e){
189      e.printStackTrace();
190    }
191    return null;
192  }
193
194  /**
195   * Selects C4.5-type split for the given dataset.
196   */
197  public final ClassifierSplitModel selectModel(Instances train, Instances test) {
198
199    return selectModel(train);
200  }
201 
202  /**
203   * Returns the revision string.
204   *
205   * @return            the revision
206   */
207  public String getRevision() {
208    return RevisionUtils.extract("$Revision: 6073 $");
209  }
210}
Note: See TracBrowser for help on using the repository browser.