Context Navigation

BinC45ModelSelection.java

Last change on this file was 29, checked in by gnappo, 14 years ago
Taggata versione per la demo e aggiunto branch.
File size: 5.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* BinC45ModelSelection.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers.trees.j48;
24
25	import weka.core.Attribute;
26	import weka.core.Instances;
27	import weka.core.RevisionUtils;
28	import weka.core.Utils;
29
30	import java.util.Enumeration;
31
32	/**
33	* Class for selecting a C4.5-like binary (!) split for a given dataset.
34	*
35	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
36	* @version $Revision: 6073 $
37	*/
38	public class BinC45ModelSelection
39	extends ModelSelection {
40
41	/** for serialization */
42	private static final long serialVersionUID = 179170923545122001L;
43
44	/** Minimum number of instances in interval. */
45	private int m_minNoObj;
46
47	/** Use MDL correction? */
48	private boolean m_useMDLcorrection;
49
50	/** The FULL training dataset. */
51	private Instances m_allData;
52
53	/**
54	* Initializes the split selection method with the given parameters.
55	*
56	* @param minNoObj minimum number of instances that have to occur in
57	* at least two subsets induced by split
58	* @param allData FULL training dataset (necessary for selection of
59	* split points).
60	* @param useMDLcorrection whether to use MDL adjustement when
61	* finding splits on numeric attributes
62	*/
63	public BinC45ModelSelection(int minNoObj,Instances allData,
64	boolean useMDLcorrection){
65	m_minNoObj = minNoObj;
66	m_allData = allData;
67	m_useMDLcorrection = useMDLcorrection;
68	}
69
70	/**
71	* Sets reference to training data to null.
72	*/
73	public void cleanup() {
74
75	m_allData = null;
76	}
77
78	/**
79	* Selects C4.5-type split for the given dataset.
80	*/
81	public final ClassifierSplitModel selectModel(Instances data){
82
83	double minResult;
84	double currentResult;
85	BinC45Split [] currentModel;
86	BinC45Split bestModel = null;
87	NoSplit noSplitModel = null;
88	double averageInfoGain = 0;
89	int validModels = 0;
90	boolean multiVal = true;
91	Distribution checkDistribution;
92	double sumOfWeights;
93	int i;
94
95	try{
96
97	// Check if all Instances belong to one class or if not
98	// enough Instances to split.
99	checkDistribution = new Distribution(data);
100	noSplitModel = new NoSplit(checkDistribution);
101	if (Utils.sm(checkDistribution.total(),2*m_minNoObj) \|\|
102	Utils.eq(checkDistribution.total(),
103	checkDistribution.perClass(checkDistribution.maxClass())))
104	return noSplitModel;
105
106	// Check if all attributes are nominal and have a
107	// lot of values.
108	Enumeration enu = data.enumerateAttributes();
109	while (enu.hasMoreElements()) {
110	Attribute attribute = (Attribute) enu.nextElement();
111	if ((attribute.isNumeric()) \|\|
112	(Utils.sm((double)attribute.numValues(),
113	(0.3*(double)m_allData.numInstances())))){
114	multiVal = false;
115	break;
116	}
117	}
118	currentModel = new BinC45Split[data.numAttributes()];
119	sumOfWeights = data.sumOfWeights();
120
121	// For each attribute.
122	for (i = 0; i < data.numAttributes(); i++){
123
124	// Apart from class attribute.
125	if (i != (data).classIndex()){
126
127	// Get models for current attribute.
128	currentModel[i] = new BinC45Split(i,m_minNoObj,sumOfWeights,m_useMDLcorrection);
129	currentModel[i].buildClassifier(data);
130
131	// Check if useful split for current attribute
132	// exists and check for enumerated attributes with
133	// a lot of values.
134	if (currentModel[i].checkModel())
135	if ((data.attribute(i).isNumeric()) \|\|
136	(multiVal \|\| Utils.sm((double)data.attribute(i).numValues(),
137	(0.3*(double)m_allData.numInstances())))){
138	averageInfoGain = averageInfoGain+currentModel[i].infoGain();
139	validModels++;
140	}
141	}else
142	currentModel[i] = null;
143	}
144
145	// Check if any useful split was found.
146	if (validModels == 0)
147	return noSplitModel;
148	averageInfoGain = averageInfoGain/(double)validModels;
149
150	// Find "best" attribute to split on.
151	minResult = 0;
152	for (i=0;i<data.numAttributes();i++){
153	if ((i != (data).classIndex()) &&
154	(currentModel[i].checkModel()))
155
156	// Use 1E-3 here to get a closer approximation to the original
157	// implementation.
158	if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) &&
159	Utils.gr(currentModel[i].gainRatio(),minResult)){
160	bestModel = currentModel[i];
161	minResult = currentModel[i].gainRatio();
162	}
163	}
164
165	// Check if useful split was found.
166	if (Utils.eq(minResult,0))
167	return noSplitModel;
168
169	// Add all Instances with unknown values for the corresponding
170	// attribute to the distribution for the model, so that
171	// the complete distribution is stored with the model.
172	bestModel.distribution().
173	addInstWithUnknown(data,bestModel.attIndex());
174
175	// Set the split point analogue to C45 if attribute numeric.
176	bestModel.setSplitPoint(m_allData);
177	return bestModel;
178	}catch(Exception e){
179	e.printStackTrace();
180	}
181	return null;
182	}
183
184	/**
185	* Selects C4.5-type split for the given dataset.
186	*/
187	public final ClassifierSplitModel selectModel(Instances train, Instances test) {
188
189	return selectModel(train);
190	}
191
192	/**
193	* Returns the revision string.
194	*
195	* @return the revision
196	*/
197	public String getRevision() {
198	return RevisionUtils.extract("$Revision: 6073 $");
199	}
200	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/classifiers/trees/j48/BinC45ModelSelection.java

Download in other formats: