source: src/main/java/weka/classifiers/trees/j48/BinC45ModelSelection.java @ 23

Last change on this file since 23 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 5.9 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    BinC45ModelSelection.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.classifiers.trees.j48;
24
25import weka.core.Attribute;
26import weka.core.Instances;
27import weka.core.RevisionUtils;
28import weka.core.Utils;
29
30import java.util.Enumeration;
31
32/**
33 * Class for selecting a C4.5-like binary (!) split for a given dataset.
34 *
35 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
36 * @version $Revision: 6073 $
37 */
38public class BinC45ModelSelection
39  extends ModelSelection {
40
41  /** for serialization */
42  private static final long serialVersionUID = 179170923545122001L;
43
44  /** Minimum number of instances in interval. */
45  private int m_minNoObj;               
46
47  /** Use MDL correction? */
48  private boolean m_useMDLcorrection;         
49
50  /** The FULL training dataset. */
51  private Instances m_allData; 
52
53  /**
54   * Initializes the split selection method with the given parameters.
55   *
56   * @param minNoObj minimum number of instances that have to occur in
57   * at least two subsets induced by split
58   * @param allData FULL training dataset (necessary for selection of
59   * split points). 
60   * @param useMDLcorrection whether to use MDL adjustement when
61   * finding splits on numeric attributes
62   */
63  public BinC45ModelSelection(int minNoObj,Instances allData,
64                             boolean useMDLcorrection){
65    m_minNoObj = minNoObj;
66    m_allData = allData;
67    m_useMDLcorrection = useMDLcorrection;
68  }
69
70  /**
71   * Sets reference to training data to null.
72   */
73  public void cleanup() {
74
75    m_allData = null;
76  }
77
78  /**
79   * Selects C4.5-type split for the given dataset.
80   */
81  public final ClassifierSplitModel selectModel(Instances data){
82
83    double minResult;
84    double currentResult;
85    BinC45Split [] currentModel;
86    BinC45Split bestModel = null;
87    NoSplit noSplitModel = null;
88    double averageInfoGain = 0;
89    int validModels = 0;
90    boolean multiVal = true;
91    Distribution checkDistribution;
92    double sumOfWeights;
93    int i;
94   
95    try{
96
97      // Check if all Instances belong to one class or if not
98      // enough Instances to split.
99      checkDistribution = new Distribution(data);
100      noSplitModel = new NoSplit(checkDistribution);
101      if (Utils.sm(checkDistribution.total(),2*m_minNoObj) ||
102          Utils.eq(checkDistribution.total(),
103                   checkDistribution.perClass(checkDistribution.maxClass())))
104        return noSplitModel;
105
106      // Check if all attributes are nominal and have a
107      // lot of values.
108      Enumeration enu = data.enumerateAttributes();
109      while (enu.hasMoreElements()) {
110        Attribute attribute = (Attribute) enu.nextElement();
111        if ((attribute.isNumeric()) ||
112            (Utils.sm((double)attribute.numValues(),
113                      (0.3*(double)m_allData.numInstances())))){
114          multiVal = false;
115          break;
116        }
117      }
118      currentModel = new BinC45Split[data.numAttributes()];
119      sumOfWeights = data.sumOfWeights();
120
121      // For each attribute.
122      for (i = 0; i < data.numAttributes(); i++){
123       
124        // Apart from class attribute.
125        if (i != (data).classIndex()){
126         
127          // Get models for current attribute.
128          currentModel[i] = new BinC45Split(i,m_minNoObj,sumOfWeights,m_useMDLcorrection);
129          currentModel[i].buildClassifier(data);
130         
131          // Check if useful split for current attribute
132          // exists and check for enumerated attributes with
133          // a lot of values.
134          if (currentModel[i].checkModel())
135            if ((data.attribute(i).isNumeric()) ||
136                (multiVal || Utils.sm((double)data.attribute(i).numValues(),
137                                      (0.3*(double)m_allData.numInstances())))){
138              averageInfoGain = averageInfoGain+currentModel[i].infoGain();
139              validModels++;
140            }
141        }else
142          currentModel[i] = null;
143      }
144     
145      // Check if any useful split was found.
146      if (validModels == 0)
147        return noSplitModel;
148      averageInfoGain = averageInfoGain/(double)validModels;
149
150      // Find "best" attribute to split on.
151      minResult = 0;
152      for (i=0;i<data.numAttributes();i++){
153        if ((i != (data).classIndex()) &&
154            (currentModel[i].checkModel()))
155         
156          // Use 1E-3 here to get a closer approximation to the original
157          // implementation.
158          if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) &&
159              Utils.gr(currentModel[i].gainRatio(),minResult)){ 
160            bestModel = currentModel[i];
161            minResult = currentModel[i].gainRatio();
162          }
163      }
164     
165      // Check if useful split was found.
166      if (Utils.eq(minResult,0))
167        return noSplitModel;
168
169      // Add all Instances with unknown values for the corresponding
170      // attribute to the distribution for the model, so that
171      // the complete distribution is stored with the model.
172      bestModel.distribution().
173        addInstWithUnknown(data,bestModel.attIndex());
174     
175      // Set the split point analogue to C45 if attribute numeric.
176      bestModel.setSplitPoint(m_allData);
177      return bestModel;
178    }catch(Exception e){
179      e.printStackTrace();
180    }
181    return null;
182  }
183
184  /**
185   * Selects C4.5-type split for the given dataset.
186   */
187  public final ClassifierSplitModel selectModel(Instances train, Instances test) {
188
189    return selectModel(train);
190  }
191 
192  /**
193   * Returns the revision string.
194   *
195   * @return            the revision
196   */
197  public String getRevision() {
198    return RevisionUtils.extract("$Revision: 6073 $");
199  }
200}
Note: See TracBrowser for help on using the repository browser.