Context Navigation

source: src/main/java/weka/classifiers/rules/ConjunctiveRule.java @ 12

Last change on this file since 12 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 51.5 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* ConjunctiveRule.java
19	* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers.rules;
24
25	import weka.classifiers.Classifier;
26	import weka.classifiers.AbstractClassifier;
27	import weka.core.Attribute;
28	import weka.core.Capabilities;
29	import weka.core.ContingencyTables;
30	import weka.core.FastVector;
31	import weka.core.Instance;
32	import weka.core.Instances;
33	import weka.core.Option;
34	import weka.core.OptionHandler;
35	import weka.core.RevisionHandler;
36	import weka.core.RevisionUtils;
37	import weka.core.Utils;
38	import weka.core.WeightedInstancesHandler;
39	import weka.core.Capabilities.Capability;
40
41	import java.io.Serializable;
42	import java.util.Enumeration;
43	import java.util.Random;
44	import java.util.Vector;
45
46	/**
47	<!-- globalinfo-start -->
48	* This class implements a single conjunctive rule learner that can predict for numeric and nominal class labels.<br/>
49	* <br/>
50	* A rule consists of antecedents "AND"ed together and the consequent (class value) for the classification/regression. In this case, the consequent is the distribution of the available classes (or mean for a numeric value) in the dataset. If the test instance is not covered by this rule, then it's predicted using the default class distributions/value of the data not covered by the rule in the training data.This learner selects an antecedent by computing the Information Gain of each antecendent and prunes the generated rule using Reduced Error Prunning (REP) or simple pre-pruning based on the number of antecedents.<br/>
51	* <br/>
52	* For classification, the Information of one antecedent is the weighted average of the entropies of both the data covered and not covered by the rule.<br/>
53	* For regression, the Information is the weighted average of the mean-squared errors of both the data covered and not covered by the rule.<br/>
54	* <br/>
55	* In pruning, weighted average of the accuracy rates on the pruning data is used for classification while the weighted average of the mean-squared errors on the pruning data is used for regression.<br/>
56	* <br/>
57	* <p/>
58	<!-- globalinfo-end -->
59	*
60	<!-- options-start -->
61	* Valid options are: <p/>
62	*
63	* <pre> -N <number of folds>
64	* Set number of folds for REP
65	* One fold is used as pruning set.
66	* (default 3)</pre>
67	*
68	* <pre> -R
69	* Set if NOT uses randomization
70	* (default:use randomization)</pre>
71	*
72	* <pre> -E
73	* Set whether consider the exclusive
74	* expressions for nominal attributes
75	* (default false)</pre>
76	*
77	* <pre> -M <min. weights>
78	* Set the minimal weights of instances
79	* within a split.
80	* (default 2.0)</pre>
81	*
82	* <pre> -P <number of antecedents>
83	* Set number of antecedents for pre-pruning
84	* if -1, then REP is used
85	* (default -1)</pre>
86	*
87	* <pre> -S <seed>
88	* Set the seed of randomization
89	* (default 1)</pre>
90	*
91	<!-- options-end -->
92	*
93	* @author Xin XU (xx5@cs.waikato.ac.nz)
94	* @version $Revision: 5928 $
95	*/
96	public class ConjunctiveRule
97	extends AbstractClassifier
98	implements OptionHandler, WeightedInstancesHandler{
99
100	/** for serialization */
101	static final long serialVersionUID = -5938309903225087198L;
102
103	/** The number of folds to split data into Grow and Prune for REP*/
104	private int m_Folds = 3;
105
106	/** The class attribute of the data*/
107	private Attribute m_ClassAttribute;
108
109	/** The vector of antecedents of this rule*/
110	protected FastVector m_Antds = null;
111
112	/** The default rule distribution of the data not covered*/
113	protected double[] m_DefDstr = null;
114
115	/** The consequent of this rule */
116	protected double[] m_Cnsqt = null;
117
118	/** Number of classes in the training data */
119	private int m_NumClasses = 0;
120
121	/** The seed to perform randomization */
122	private long m_Seed = 1;
123
124	/** The Random object used for randomization */
125	private Random m_Random = null;
126
127	/** The predicted classes recorded for each antecedent in the growing data */
128	private FastVector m_Targets;
129
130	/** Whether to use exlusive expressions for nominal attributes */
131	private boolean m_IsExclude = false;
132
133	/** The minimal number of instance weights within a split*/
134	private double m_MinNo = 2.0;
135
136	/** The number of antecedents in pre-pruning */
137	private int m_NumAntds = -1;
138
139	/**
140	* Returns a string describing classifier
141	* @return a description suitable for
142	* displaying in the explorer/experimenter gui
143	*/
144	public String globalInfo() {
145
146	return "This class implements a single conjunctive rule learner that can predict "
147	+ "for numeric and nominal class labels.\n\n"
148	+ "A rule consists of antecedents \"AND\"ed together and the consequent (class value) "
149	+ "for the classification/regression. In this case, the consequent is the "
150	+ "distribution of the available classes (or mean for a numeric value) in the dataset. "
151	+ "If the test instance is not covered by this rule, then it's predicted "
152	+ "using the default class distributions/value of the data not covered by the "
153	+ "rule in the training data."
154	+ "This learner selects an antecedent by computing the Information Gain of each "
155	+ "antecendent and prunes the generated rule using Reduced Error Prunning (REP) "
156	+ "or simple pre-pruning based on the number of antecedents.\n\n"
157	+ "For classification, the Information of one antecedent is the weighted average of "
158	+ "the entropies of both the data covered and not covered by the rule.\n"
159	+ "For regression, the Information is the weighted average of the mean-squared errors "
160	+ "of both the data covered and not covered by the rule.\n\n"
161	+ "In pruning, weighted average of the accuracy rates on the pruning data is used "
162	+ "for classification while the weighted average of the mean-squared errors "
163	+ "on the pruning data is used for regression.\n\n";
164	}
165
166	/**
167	* The single antecedent in the rule, which is composed of an attribute and
168	* the corresponding value. There are two inherited classes, namely NumericAntd
169	* and NominalAntd in which the attributes are numeric and nominal respectively.
170	*/
171	private abstract class Antd
172	implements Serializable, RevisionHandler {
173
174	/** for serialization */
175	private static final long serialVersionUID = -8729076306737827571L;
176
177	/** The attribute of the antecedent */
178	protected Attribute att;
179
180	/** The attribute value of the antecedent.
181	For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */
182	protected double value;
183
184	/** The maximum infoGain achieved by this antecedent test */
185	protected double maxInfoGain;
186
187	/** The information of this antecedent test on the growing data */
188	protected double inform;
189
190	/** The parameter related to the meanSquaredError of the data not covered
191	by the previous antecedents when the class is numeric */
192	protected double uncoverWtSq, uncoverWtVl, uncoverSum;
193
194	/** The parameters related to the data not covered by the previous
195	antecedents when the class is nominal */
196	protected double[] uncover;
197
198	/** Constructor for nominal class */
199	public Antd(Attribute a, double[] unc){
200	att=a;
201	value=Double.NaN;
202	maxInfoGain = 0;
203	inform = Double.NaN;
204	uncover = unc;
205	}
206
207	/**
208	* Constructor for numeric class
209	*/
210	public Antd(Attribute a, double uncoveredWtSq,
211	double uncoveredWtVl, double uncoveredWts){
212	att=a;
213	value=Double.NaN;
214	maxInfoGain = 0;
215	inform = Double.NaN;
216	uncoverWtSq = uncoveredWtSq;
217	uncoverWtVl = uncoveredWtVl;
218	uncoverSum = uncoveredWts;
219	}
220
221	/* The abstract members for inheritance */
222	public abstract Instances[] splitData(Instances data, double defInfo);
223	public abstract boolean isCover(Instance inst);
224	public abstract String toString();
225
226	/* Get functions of this antecedent */
227	public Attribute getAttr(){ return att; }
228	public double getAttrValue(){ return value; }
229	public double getMaxInfoGain(){ return maxInfoGain; }
230	public double getInfo(){ return inform;}
231
232	/**
233	* Function used to calculate the weighted mean squared error,
234	* i.e., sum[x-avg(x)]^2 based on the given elements of the formula:
235	* meanSquaredError = sum(Wi*Xi^2) - (sum(WiXi))^2/sum(Wi)
236	*
237	* @param weightedSq sum(Wi*Xi^2)
238	* @param weightedValue sum(WiXi)
239	* @param sum sum of weights
240	* @return the weighted mean-squared error
241	*/
242	protected double wtMeanSqErr(double weightedSq, double weightedValue, double sum){
243	if(Utils.smOrEq(sum, 1.0E-6))
244	return 0;
245	return (weightedSq - (weightedValue * weightedValue) / sum);
246	}
247
248	/**
249	* Function used to calculate the entropy of given vector of values
250	* entropy = (1/sum){-sigma[i=1..P](Xilog2(Xi)) + sum*log2(sum)}
251	* where P is the length of the vector
252	*
253	* @param value the given vector of values
254	* @param sum the sum of the given values. It's provided just for efficiency.
255	* @return the entropy
256	*/
257	protected double entropy(double[] value, double sum){
258	if(Utils.smOrEq(sum, 1.0E-6))
259	return 0;
260
261	double entropy = 0;
262	for(int i=0; i < value.length; i++){
263	if(!Utils.eq(value[i],0))
264	entropy -= value[i] * Utils.log2(value[i]);
265	}
266	entropy += sum * Utils.log2(sum);
267	entropy /= sum;
268	return entropy;
269	}
270
271	/**
272	* Returns the revision string.
273	*
274	* @return the revision
275	*/
276	public String getRevision() {
277	return RevisionUtils.extract("$Revision: 5928 $");
278	}
279	}
280
281	/**
282	* The antecedent with numeric attribute
283	*/
284	private class NumericAntd
285	extends Antd {
286
287	/** for serialization */
288	static final long serialVersionUID = -7957266498918210436L;
289
290	/** The split point for this numeric antecedent */
291	private double splitPoint;
292
293	/**
294	* Constructor for nominal class
295	*/
296	public NumericAntd(Attribute a, double[] unc){
297	super(a, unc);
298	splitPoint = Double.NaN;
299	}
300
301	/**
302	* Constructor for numeric class
303	*/
304	public NumericAntd(Attribute a, double sq, double vl, double wts){
305	super(a, sq, vl, wts);
306	splitPoint = Double.NaN;
307	}
308
309	/**
310	* Get split point of this numeric antecedent
311	*
312	* @return the split point
313	*/
314	public double getSplitPoint(){
315	return splitPoint;
316	}
317
318	/**
319	* Implements the splitData function.
320	* This procedure is to split the data into two bags according
321	* to the information gain of the numeric attribute value
322	* the data with missing values are stored in the last split.
323	* The maximum infoGain is also calculated.
324	*
325	* @param insts the data to be split
326	* @param defInfo the default information for data
327	* @return the array of data after split
328	*/
329	public Instances[] splitData(Instances insts, double defInfo){
330	Instances data = new Instances(insts);
331	data.sort(att);
332	int total=data.numInstances();// Total number of instances without
333	// missing value for att
334	maxInfoGain = 0;
335	value = 0;
336
337	// Compute minimum number of Instances required in each split
338	double minSplit;
339	if(m_ClassAttribute.isNominal()){
340	minSplit = 0.1 * (data.sumOfWeights()) /
341	((double)m_ClassAttribute.numValues());
342	if (Utils.smOrEq(minSplit,m_MinNo))
343	minSplit = m_MinNo;
344	else if (Utils.gr(minSplit,25))
345	minSplit = 25;
346	}
347	else
348	minSplit = m_MinNo;
349
350	double[] fst=null, snd=null, missing=null;
351	if(m_ClassAttribute.isNominal()){
352	fst = new double[m_NumClasses];
353	snd = new double[m_NumClasses];
354	missing = new double[m_NumClasses];
355
356	for(int v=0; v < m_NumClasses; v++)
357	fst[v]=snd[v]=missing[v]=0.0;
358	}
359	double fstCover=0, sndCover=0, fstWtSq=0, sndWtSq=0, fstWtVl=0, sndWtVl=0;
360
361	int split=1; // Current split position
362	int prev=0; // Previous split position
363	int finalSplit=split; // Final split position
364
365	for(int x=0; x<data.numInstances(); x++){
366	Instance inst = data.instance(x);
367	if(inst.isMissing(att)){
368	total = x;
369	break;
370	}
371
372	sndCover += inst.weight();
373	if(m_ClassAttribute.isNominal()) // Nominal class
374	snd[(int)inst.classValue()] += inst.weight();
375	else{ // Numeric class
376	sndWtSq += inst.weight() * inst.classValue() * inst.classValue();
377	sndWtVl += inst.weight() * inst.classValue();
378	}
379	}
380
381
382	// Enough Instances with known values?
383	if (Utils.sm(sndCover,(2*minSplit)))
384	return null;
385
386	double msingWtSq=0, msingWtVl=0;
387	Instances missingData = new Instances(data, 0);
388	for(int y=total; y < data.numInstances(); y++){
389	Instance inst = data.instance(y);
390	missingData.add(inst);
391	if(m_ClassAttribute.isNominal())
392	missing[(int)inst.classValue()] += inst.weight();
393	else{
394	msingWtSq += inst.weight() * inst.classValue() * inst.classValue();
395	msingWtVl += inst.weight() * inst.classValue();
396	}
397	}
398
399	if(total == 0) return null; // Data all missing for the attribute
400
401	splitPoint = data.instance(total-1).value(att);
402
403	for(; split < total; split++){
404	if(!Utils.eq(data.instance(split).value(att), // Can't split
405	data.instance(prev).value(att))){// within same value
406
407	// Move the split point
408	for(int y=prev; y<split; y++){
409	Instance inst = data.instance(y);
410	fstCover += inst.weight(); sndCover -= inst.weight();
411	if(m_ClassAttribute.isNominal()){ // Nominal class
412	fst[(int)inst.classValue()] += inst.weight();
413	snd[(int)inst.classValue()] -= inst.weight();
414	}
415	else{ // Numeric class
416	fstWtSq += inst.weight() * inst.classValue() * inst.classValue();
417	fstWtVl += inst.weight() * inst.classValue();
418	sndWtSq -= inst.weight() * inst.classValue() * inst.classValue();
419	sndWtVl -= inst.weight() * inst.classValue();
420	}
421	}
422
423	if(Utils.sm(fstCover, minSplit) \|\| Utils.sm(sndCover, minSplit)){
424	prev=split; // Cannot split because either
425	continue; // split has not enough data
426	}
427
428	double fstEntp = 0, sndEntp = 0;
429
430	if(m_ClassAttribute.isNominal()){
431	fstEntp = entropy(fst, fstCover);
432	sndEntp = entropy(snd, sndCover);
433	}
434	else{
435	fstEntp = wtMeanSqErr(fstWtSq, fstWtVl, fstCover)/fstCover;
436	sndEntp = wtMeanSqErr(sndWtSq, sndWtVl, sndCover)/sndCover;
437	}
438
439	/* Which bag has higher information gain? */
440	boolean isFirst;
441	double fstInfoGain, sndInfoGain;
442	double info, infoGain, fstInfo, sndInfo;
443	if(m_ClassAttribute.isNominal()){
444	double sum = data.sumOfWeights();
445	double otherCover, whole = sum + Utils.sum(uncover), otherEntropy;
446	double[] other = null;
447
448	// InfoGain of first bag
449	other = new double[m_NumClasses];
450	for(int z=0; z < m_NumClasses; z++)
451	other[z] = uncover[z] + snd[z] + missing[z];
452	otherCover = whole - fstCover;
453	otherEntropy = entropy(other, otherCover);
454	// Weighted average
455	fstInfo = (fstEntpfstCover + otherEntropyotherCover)/whole;
456	fstInfoGain = defInfo - fstInfo;
457
458	// InfoGain of second bag
459	other = new double[m_NumClasses];
460	for(int z=0; z < m_NumClasses; z++)
461	other[z] = uncover[z] + fst[z] + missing[z];
462	otherCover = whole - sndCover;
463	otherEntropy = entropy(other, otherCover);
464	// Weighted average
465	sndInfo = (sndEntpsndCover + otherEntropyotherCover)/whole;
466	sndInfoGain = defInfo - sndInfo;
467	}
468	else{
469	double sum = data.sumOfWeights();
470	double otherWtSq = (sndWtSq + msingWtSq + uncoverWtSq),
471	otherWtVl = (sndWtVl + msingWtVl + uncoverWtVl),
472	otherCover = (sum - fstCover + uncoverSum);
473
474	fstInfo = Utils.eq(fstCover, 0) ? 0 : (fstEntp * fstCover);
475	fstInfo += wtMeanSqErr(otherWtSq, otherWtVl, otherCover);
476	fstInfoGain = defInfo - fstInfo;
477
478	otherWtSq = (fstWtSq + msingWtSq + uncoverWtSq);
479	otherWtVl = (fstWtVl + msingWtVl + uncoverWtVl);
480	otherCover = sum - sndCover + uncoverSum;
481	sndInfo = Utils.eq(sndCover, 0) ? 0 : (sndEntp * sndCover);
482	sndInfo += wtMeanSqErr(otherWtSq, otherWtVl, otherCover);
483	sndInfoGain = defInfo - sndInfo;
484	}
485
486	if(Utils.gr(fstInfoGain,sndInfoGain) \|\|
487	(Utils.eq(fstInfoGain,sndInfoGain)&&(Utils.sm(fstEntp,sndEntp)))){
488	isFirst = true;
489	infoGain = fstInfoGain;
490	info = fstInfo;
491	}
492	else{
493	isFirst = false;
494	infoGain = sndInfoGain;
495	info = sndInfo;
496	}
497
498	boolean isUpdate = Utils.gr(infoGain, maxInfoGain);
499
500	/* Check whether so far the max infoGain */
501	if(isUpdate){
502	splitPoint = ((data.instance(split).value(att)) + (data.instance(prev).value(att)))/2.0;
503	value = ((isFirst) ? 0 : 1);
504	inform = info;
505	maxInfoGain = infoGain;
506	finalSplit = split;
507	}
508	prev=split;
509	}
510	}
511
512	/* Split the data */
513	Instances[] splitData = new Instances[3];
514	splitData[0] = new Instances(data, 0, finalSplit);
515	splitData[1] = new Instances(data, finalSplit, total-finalSplit);
516	splitData[2] = new Instances(missingData);
517
518	return splitData;
519	}
520
521	/**
522	* Whether the instance is covered by this antecedent
523	*
524	* @param inst the instance in question
525	* @return the boolean value indicating whether the instance is covered
526	* by this antecedent
527	*/
528	public boolean isCover(Instance inst){
529	boolean isCover=false;
530	if(!inst.isMissing(att)){
531	if(Utils.eq(value, 0)){
532	if(Utils.smOrEq(inst.value(att), splitPoint))
533	isCover=true;
534	}
535	else if(Utils.gr(inst.value(att), splitPoint))
536	isCover=true;
537	}
538	return isCover;
539	}
540
541	/**
542	* Prints this antecedent
543	*
544	* @return a textual description of this antecedent
545	*/
546	public String toString() {
547	String symbol = Utils.eq(value, 0.0) ? " <= " : " > ";
548	return (att.name() + symbol + Utils.doubleToString(splitPoint, 6));
549	}
550
551	/**
552	* Returns the revision string.
553	*
554	* @return the revision
555	*/
556	public String getRevision() {
557	return RevisionUtils.extract("$Revision: 5928 $");
558	}
559	}
560
561
562	/**
563	* The antecedent with nominal attribute
564	*/
565	class NominalAntd
566	extends Antd {
567
568	/** for serialization */
569	static final long serialVersionUID = -5949864163376447424L;
570
571	/* The parameters of infoGain calculated for each attribute value */
572	private double[][] stats;
573	private double[] coverage;
574	private boolean isIn;
575
576	/**
577	* Constructor for nominal class
578	*/
579	public NominalAntd(Attribute a, double[] unc){
580	super(a, unc);
581	int bag = att.numValues();
582	stats = new double[bag][m_NumClasses];
583	coverage = new double[bag];
584	isIn = true;
585	}
586
587	/**
588	* Constructor for numeric class
589	*/
590	public NominalAntd(Attribute a, double sq, double vl, double wts){
591	super(a, sq, vl, wts);
592	int bag = att.numValues();
593	stats = null;
594	coverage = new double[bag];
595	isIn = true;
596	}
597
598	/**
599	* Implements the splitData function.
600	* This procedure is to split the data into bags according
601	* to the nominal attribute value
602	* the data with missing values are stored in the last bag.
603	* The infoGain for each bag is also calculated.
604	*
605	* @param data the data to be split
606	* @param defInfo the default information for data
607	* @return the array of data after split
608	*/
609	public Instances[] splitData(Instances data, double defInfo){
610	int bag = att.numValues();
611	Instances[] splitData = new Instances[bag+1];
612	double[] wSq = new double[bag];
613	double[] wVl = new double[bag];
614	double totalWS=0, totalWV=0, msingWS=0, msingWV=0, sum=data.sumOfWeights();
615	double[] all = new double[m_NumClasses];
616	double[] missing = new double[m_NumClasses];
617
618	for(int w=0; w < m_NumClasses; w++)
619	all[w] = missing[w] = 0;
620
621	for(int x=0; x<bag; x++){
622	coverage[x] = wSq[x] = wVl[x] = 0;
623	if(stats != null)
624	for(int y=0; y < m_NumClasses; y++)
625	stats[x][y] = 0;
626	splitData[x] = new Instances(data, data.numInstances());
627	}
628	splitData[bag] = new Instances(data, data.numInstances());
629
630	// Record the statistics of data
631	for(int x=0; x<data.numInstances(); x++){
632	Instance inst=data.instance(x);
633	if(!inst.isMissing(att)){
634	int v = (int)inst.value(att);
635	splitData[v].add(inst);
636	coverage[v] += inst.weight();
637	if(m_ClassAttribute.isNominal()){ // Nominal class
638	stats[v][(int)inst.classValue()] += inst.weight();
639	all[(int)inst.classValue()] += inst.weight();
640	}
641	else{ // Numeric class
642	wSq[v] += inst.weight() * inst.classValue() * inst.classValue();
643	wVl[v] += inst.weight() * inst.classValue();
644	totalWS += inst.weight() * inst.classValue() * inst.classValue();
645	totalWV += inst.weight() * inst.classValue();
646	}
647	}
648	else{
649	splitData[bag].add(inst);
650	if(m_ClassAttribute.isNominal()){ // Nominal class
651	all[(int)inst.classValue()] += inst.weight();
652	missing[(int)inst.classValue()] += inst.weight();
653	}
654	else{ // Numeric class
655	totalWS += inst.weight() * inst.classValue() * inst.classValue();
656	totalWV += inst.weight() * inst.classValue();
657	msingWS += inst.weight() * inst.classValue() * inst.classValue();
658	msingWV += inst.weight() * inst.classValue();
659	}
660	}
661	}
662
663	// The total weights of the whole grow data
664	double whole;
665	if(m_ClassAttribute.isNominal())
666	whole = sum + Utils.sum(uncover);
667	else
668	whole = sum + uncoverSum;
669
670	// Find the split
671	double minEntrp=Double.MAX_VALUE;
672	maxInfoGain = 0;
673
674	// Check if >=2 splits have more than the minimal data
675	int count=0;
676	for(int x=0; x<bag; x++)
677	if(Utils.grOrEq(coverage[x], m_MinNo))
678	++count;
679
680	if(count < 2){ // Don't split
681	maxInfoGain = 0;
682	inform = defInfo;
683	value = Double.NaN;
684	return null;
685	}
686
687	for(int x=0; x<bag; x++){
688	double t = coverage[x], entrp, infoGain;
689
690	if(Utils.sm(t, m_MinNo))
691	continue;
692
693	if(m_ClassAttribute.isNominal()){ // Nominal class
694	double[] other = new double[m_NumClasses];
695	for(int y=0; y < m_NumClasses; y++)
696	other[y] = all[y] - stats[x][y] + uncover[y];
697	double otherCover = whole - t;
698
699	// Entropies of data covered and uncovered
700	entrp = entropy(stats[x], t);
701	double uncEntp = entropy(other, otherCover);
702
703	// Weighted average
704	infoGain = defInfo - (entrpt + uncEntpotherCover)/whole;
705	}
706	else{ // Numeric class
707	double weight = (whole - t);
708	entrp = wtMeanSqErr(wSq[x], wVl[x], t)/t;
709	infoGain = defInfo - (entrp * t) -
710	wtMeanSqErr((totalWS-wSq[x]+uncoverWtSq),
711	(totalWV-wVl[x]+uncoverWtVl),
712	weight);
713	}
714
715	// Test the exclusive expression
716	boolean isWithin =true;
717	if(m_IsExclude){
718	double infoGain2, entrp2;
719	if(m_ClassAttribute.isNominal()){ // Nominal class
720	double[] other2 = new double[m_NumClasses];
721	double[] notIn = new double[m_NumClasses];
722	for(int y=0; y < m_NumClasses; y++){
723	other2[y] = stats[x][y] + missing[y] + uncover[y];
724	notIn[y] = all[y] - stats[x][y] - missing[y];
725	}
726
727	double msSum = Utils.sum(missing);
728	double otherCover2 = t + msSum + Utils.sum(uncover);
729
730	entrp2 = entropy(notIn, (sum-t-msSum));
731	double uncEntp2 = entropy(other2, otherCover2);
732	infoGain2 = defInfo -
733	(entrp2(sum-t-msSum) + uncEntp2otherCover2)/whole;
734	}
735	else{ // Numeric class
736	double msWts = splitData[bag].sumOfWeights();
737	double weight2 = t + uncoverSum + msWts;
738
739	entrp2 = wtMeanSqErr((totalWS-wSq[x]-msingWS),
740	(totalWV-wVl[x]-msingWV),(sum-t-msWts))
741	/(sum-t-msWts);
742	infoGain2 = defInfo - entrp2 * (sum-t-msWts) -
743	wtMeanSqErr((wSq[x]+uncoverWtSq+msingWS),
744	(wVl[x]+uncoverWtVl+msingWV),
745	weight2);
746	}
747
748	// Use the exclusive expression?
749	if (Utils.gr(infoGain2, infoGain) \|\|
750	(Utils.eq(infoGain2, infoGain) && Utils.sm(entrp2, entrp))){
751	infoGain = infoGain2;
752	entrp = entrp2;
753	isWithin =false;
754	}
755	}
756
757	// Test this split
758	if (Utils.gr(infoGain, maxInfoGain) \|\|
759	(Utils.eq(infoGain, maxInfoGain) && Utils.sm(entrp, minEntrp))){
760	value = (double)x;
761	maxInfoGain = infoGain;
762	inform = maxInfoGain - defInfo;
763	minEntrp = entrp;
764	isIn = isWithin;
765	}
766	}
767
768	return splitData;
769	}
770
771	/**
772	* Whether the instance is covered by this antecedent
773	*
774	* @param inst the instance in question
775	* @return the boolean value indicating whether the instance is covered
776	* by this antecedent
777	*/
778	public boolean isCover(Instance inst){
779	boolean isCover=false;
780	if(!inst.isMissing(att)){
781	if(isIn){
782	if(Utils.eq(inst.value(att), value))
783	isCover=true;
784	}
785	else if(!Utils.eq(inst.value(att), value))
786	isCover=true;
787	}
788	return isCover;
789	}
790
791	/**
792	* Whether the expression is "att = value" or att != value"
793	* for this nominal attribute. True if in the former expression,
794	* otherwise the latter
795	*
796	* @return the boolean value
797	*/
798	public boolean isIn(){
799	return isIn;
800	}
801
802	/**
803	* Prints this antecedent
804	*
805	* @return a textual description of this antecedent
806	*/
807	public String toString() {
808	String symbol = isIn ? " = " : " != ";
809	return (att.name() + symbol + att.value((int)value));
810	}
811
812	/**
813	* Returns the revision string.
814	*
815	* @return the revision
816	*/
817	public String getRevision() {
818	return RevisionUtils.extract("$Revision: 5928 $");
819	}
820	}
821
822	/**
823	* Returns an enumeration describing the available options
824	* Valid options are: <p>
825	*
826	* -N number <br>
827	* Set number of folds for REP. One fold is
828	* used as the pruning set. (Default: 3) <p>
829	*
830	* -R <br>
831	* Set if NOT randomize the data before split to growing and
832	* pruning data. If NOT set, the seed of randomization is
833	* specified by the -S option. (Default: randomize) <p>
834	*
835	* -S <br>
836	* Seed of randomization. (Default: 1)<p>
837	*
838	* -E <br>
839	* Set whether consider the exclusive expressions for nominal
840	* attribute split. (Default: false) <p>
841	*
842	* -M number <br>
843	* Set the minimal weights of instances within a split.
844	* (Default: 2) <p>
845	*
846	* -P number <br>
847	* Set the number of antecedents allowed in the rule if pre-pruning
848	* is used. If this value is other than -1, then pre-pruning will be
849	* used, otherwise the rule uses REP. (Default: -1) <p>
850	*
851	* @return an enumeration of all the available options
852	*/
853	public Enumeration listOptions() {
854	Vector newVector = new Vector(6);
855
856	newVector.addElement(new Option("\tSet number of folds for REP\n" +
857	"\tOne fold is used as pruning set.\n" +
858	"\t(default 3)","N", 1, "-N <number of folds>"));
859
860	newVector.addElement(new Option("\tSet if NOT uses randomization\n" +
861	"\t(default:use randomization)","R", 0, "-R"));
862
863	newVector.addElement(new Option("\tSet whether consider the exclusive\n" +
864	"\texpressions for nominal attributes\n"+
865	"\t(default false)","E", 0, "-E"));
866
867	newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
868	"\twithin a split.\n" +
869	"\t(default 2.0)","M", 1, "-M <min. weights>"));
870
871	newVector.addElement(new Option("\tSet number of antecedents for pre-pruning\n" +
872	"\tif -1, then REP is used\n" +
873	"\t(default -1)","P", 1, "-P <number of antecedents>"));
874
875	newVector.addElement(new Option("\tSet the seed of randomization\n" +
876	"\t(default 1)","S", 1, "-S <seed>"));
877
878	return newVector.elements();
879	}
880
881	/**
882	* Parses a given list of options. <p/>
883	*
884	<!-- options-start -->
885	* Valid options are: <p/>
886	*
887	* <pre> -N <number of folds>
888	* Set number of folds for REP
889	* One fold is used as pruning set.
890	* (default 3)</pre>
891	*
892	* <pre> -R
893	* Set if NOT uses randomization
894	* (default:use randomization)</pre>
895	*
896	* <pre> -E
897	* Set whether consider the exclusive
898	* expressions for nominal attributes
899	* (default false)</pre>
900	*
901	* <pre> -M <min. weights>
902	* Set the minimal weights of instances
903	* within a split.
904	* (default 2.0)</pre>
905	*
906	* <pre> -P <number of antecedents>
907	* Set number of antecedents for pre-pruning
908	* if -1, then REP is used
909	* (default -1)</pre>
910	*
911	* <pre> -S <seed>
912	* Set the seed of randomization
913	* (default 1)</pre>
914	*
915	<!-- options-end -->
916	*
917	* @param options the list of options as an array of strings
918	* @throws Exception if an option is not supported
919	*/
920	public void setOptions(String[] options) throws Exception {
921
922	String numFoldsString = Utils.getOption('N', options);
923	if (numFoldsString.length() != 0)
924	m_Folds = Integer.parseInt(numFoldsString);
925	else
926	m_Folds = 3;
927
928	String minNoString = Utils.getOption('M', options);
929	if (minNoString.length() != 0)
930	m_MinNo = Double.parseDouble(minNoString);
931	else
932	m_MinNo = 2.0;
933
934	String seedString = Utils.getOption('S', options);
935	if (seedString.length() != 0)
936	m_Seed = Integer.parseInt(seedString);
937	else
938	m_Seed = 1;
939
940	String numAntdsString = Utils.getOption('P', options);
941	if (numAntdsString.length() != 0)
942	m_NumAntds = Integer.parseInt(numAntdsString);
943	else
944	m_NumAntds = -1;
945
946	m_IsExclude = Utils.getFlag('E', options);
947	}
948
949	/**
950	* Gets the current settings of the Classifier.
951	*
952	* @return an array of strings suitable for passing to setOptions
953	*/
954	public String [] getOptions() {
955
956	String [] options = new String [9];
957	int current = 0;
958	options[current++] = "-N"; options[current++] = "" + m_Folds;
959	options[current++] = "-M"; options[current++] = "" + m_MinNo;
960	options[current++] = "-P"; options[current++] = "" + m_NumAntds;
961	options[current++] = "-S"; options[current++] = "" + m_Seed;
962
963	if(m_IsExclude)
964	options[current++] = "-E";
965
966	while (current < options.length)
967	options[current++] = "";
968	return options;
969	}
970
971	/** The access functions for parameters */
972
973	/**
974	* Returns the tip text for this property
975	* @return tip text for this property suitable for
976	* displaying in the explorer/experimenter gui
977	*/
978	public String foldsTipText() {
979	return "Determines the amount of data used for pruning. One fold is used for "
980	+ "pruning, the rest for growing the rules.";
981	}
982
983	/**
984	* the number of folds to use
985	*
986	* @param folds the number of folds to use
987	*/
988	public void setFolds(int folds) {
989	m_Folds = folds;
990	}
991
992	/**
993	* returns the current number of folds
994	*
995	* @return the number of folds
996	*/
997	public int getFolds() {
998	return m_Folds;
999	}
1000
1001	/**
1002	* Returns the tip text for this property
1003	* @return tip text for this property suitable for
1004	* displaying in the explorer/experimenter gui
1005	*/
1006	public String seedTipText() {
1007	return "The seed used for randomizing the data.";
1008	}
1009
1010	/**
1011	* sets the seed for randomizing the data
1012	*
1013	* @param s the seed value
1014	*/
1015	public void setSeed(long s) {
1016	m_Seed = s;
1017	}
1018
1019	/**
1020	* returns the current seed value for randomizing the data
1021	*
1022	* @return the seed value
1023	*/
1024	public long getSeed() {
1025	return m_Seed;
1026	}
1027
1028	/**
1029	* Returns the tip text for this property
1030	* @return tip text for this property suitable for
1031	* displaying in the explorer/experimenter gui
1032	*/
1033	public String exclusiveTipText() {
1034	return "Set whether to consider exclusive expressions for nominal "
1035	+ "attribute splits.";
1036	}
1037
1038	/**
1039	* Returns whether exclusive expressions for nominal attributes splits are
1040	* considered
1041	*
1042	* @return true if exclusive expressions for nominal attributes splits are
1043	* considered
1044	*/
1045	public boolean getExclusive() {
1046	return m_IsExclude;
1047	}
1048
1049	/**
1050	* Sets whether exclusive expressions for nominal attributes splits are
1051	* considered
1052	*
1053	* @param e whether to consider exclusive expressions for nominal attribute
1054	* splits
1055	*/
1056	public void setExclusive(boolean e) {
1057	m_IsExclude = e;
1058	}
1059
1060	/**
1061	* Returns the tip text for this property
1062	* @return tip text for this property suitable for
1063	* displaying in the explorer/experimenter gui
1064	*/
1065	public String minNoTipText() {
1066	return "The minimum total weight of the instances in a rule.";
1067	}
1068
1069	/**
1070	* Sets the minimum total weight of the instances in a rule
1071	*
1072	* @param m the minimum total weight of the instances in a rule
1073	*/
1074	public void setMinNo(double m) {
1075	m_MinNo = m;
1076	}
1077
1078	/**
1079	* Gets the minimum total weight of the instances in a rule
1080	*
1081	* @return the minimum total weight of the instances in a rule
1082	*/
1083	public double getMinNo(){
1084	return m_MinNo;
1085	}
1086
1087	/**
1088	* Returns the tip text for this property
1089	* @return tip text for this property suitable for
1090	* displaying in the explorer/experimenter gui
1091	*/
1092	public String numAntdsTipText() {
1093	return "Set the number of antecedents allowed in the rule if "
1094	+ "pre-pruning is used. If this value is other than -1, then "
1095	+ "pre-pruning will be used, otherwise the rule uses reduced-error "
1096	+ "pruning.";
1097	}
1098
1099	/**
1100	* Sets the number of antecedants
1101	*
1102	* @param n the number of antecedants
1103	*/
1104	public void setNumAntds(int n) {
1105	m_NumAntds = n;
1106	}
1107
1108	/**
1109	* Gets the number of antecedants
1110	*
1111	* @return the number of antecedants
1112	*/
1113	public int getNumAntds(){
1114	return m_NumAntds;
1115	}
1116
1117	/**
1118	* Returns default capabilities of the classifier.
1119	*
1120	* @return the capabilities of this classifier
1121	*/
1122	public Capabilities getCapabilities() {
1123	Capabilities result = super.getCapabilities();
1124	result.disableAll();
1125
1126	// attributes
1127	result.enable(Capability.NOMINAL_ATTRIBUTES);
1128	result.enable(Capability.NUMERIC_ATTRIBUTES);
1129	result.enable(Capability.DATE_ATTRIBUTES);
1130	result.enable(Capability.MISSING_VALUES);
1131
1132	// class
1133	result.enable(Capability.NOMINAL_CLASS);
1134	result.enable(Capability.NUMERIC_CLASS);
1135	result.enable(Capability.DATE_CLASS);
1136	result.enable(Capability.MISSING_CLASS_VALUES);
1137
1138	return result;
1139	}
1140
1141	/**
1142	* Builds a single rule learner with REP dealing with nominal classes or
1143	* numeric classes.
1144	* For nominal classes, this rule learner predicts a distribution on
1145	* the classes.
1146	* For numeric classes, this learner predicts a single value.
1147	*
1148	* @param instances the training data
1149	* @throws Exception if classifier can't be built successfully
1150	*/
1151	public void buildClassifier(Instances instances) throws Exception {
1152	// can classifier handle the data?
1153	getCapabilities().testWithFail(instances);
1154
1155	// remove instances with missing class
1156	Instances data = new Instances(instances);
1157	data.deleteWithMissingClass();
1158
1159	if(data.numInstances() < m_Folds)
1160	throw new Exception("Not enough data for REP.");
1161
1162	m_ClassAttribute = data.classAttribute();
1163	if(m_ClassAttribute.isNominal())
1164	m_NumClasses = m_ClassAttribute.numValues();
1165	else
1166	m_NumClasses = 1;
1167
1168	m_Antds = new FastVector();
1169	m_DefDstr = new double[m_NumClasses];
1170	m_Cnsqt = new double[m_NumClasses];
1171	m_Targets = new FastVector();
1172	m_Random = new Random(m_Seed);
1173
1174	if(m_NumAntds != -1){
1175	grow(data);
1176	}
1177	else{
1178
1179	data.randomize(m_Random);
1180
1181	// Split data into Grow and Prune
1182	data.stratify(m_Folds);
1183
1184	Instances growData=data.trainCV(m_Folds, m_Folds-1, m_Random);
1185	Instances pruneData=data.testCV(m_Folds, m_Folds-1);
1186
1187	grow(growData); // Build this rule
1188	prune(pruneData); // Prune this rule
1189	}
1190
1191	if(m_ClassAttribute.isNominal()){
1192	Utils.normalize(m_Cnsqt);
1193	if(Utils.gr(Utils.sum(m_DefDstr), 0))
1194	Utils.normalize(m_DefDstr);
1195	}
1196	}
1197
1198	/**
1199	* Computes class distribution for the given instance.
1200	*
1201	* @param instance the instance for which distribution is to be computed
1202	* @return the class distribution for the given instance
1203	* @throws Exception if given instance is null
1204	*/
1205	public double[] distributionForInstance(Instance instance) throws Exception {
1206	if(instance == null)
1207	throw new Exception("Testing instance is NULL!");
1208
1209	if (isCover(instance))
1210	return m_Cnsqt;
1211	else
1212	return m_DefDstr;
1213	}
1214
1215	/**
1216	* Whether the instance covered by this rule
1217	*
1218	* @param datum the instance in question
1219	* @return the boolean value indicating whether the instance is covered by this rule
1220	*/
1221	public boolean isCover(Instance datum){
1222	boolean isCover=true;
1223
1224	for(int i=0; i<m_Antds.size(); i++){
1225	Antd antd = (Antd)m_Antds.elementAt(i);
1226	if(!antd.isCover(datum)){
1227	isCover = false;
1228	break;
1229	}
1230	}
1231
1232	return isCover;
1233	}
1234
1235	/**
1236	* Whether this rule has antecedents, i.e. whether it is a default rule
1237	*
1238	* @return the boolean value indicating whether the rule has antecedents
1239	*/
1240	public boolean hasAntds(){
1241	if (m_Antds == null)
1242	return false;
1243	else
1244	return (m_Antds.size() > 0);
1245	}
1246
1247	/**
1248	* Build one rule using the growing data
1249	*
1250	* @param data the growing data used to build the rule
1251	*/
1252	private void grow(Instances data){
1253	Instances growData = new Instances(data);
1254	double defInfo;
1255	double whole = data.sumOfWeights();
1256
1257	if(m_NumAntds != 0){
1258
1259	/* Class distribution for data both covered and not covered by one antecedent */
1260	double[][] classDstr = new double[2][m_NumClasses];
1261
1262	/* Compute the default information of the growing data */
1263	for(int j=0; j < m_NumClasses; j++){
1264	classDstr[0][j] = 0;
1265	classDstr[1][j] = 0;
1266	}
1267	if(m_ClassAttribute.isNominal()){
1268	for(int i=0; i < growData.numInstances(); i++){
1269	Instance datum = growData.instance(i);
1270	classDstr[0][(int)datum.classValue()] += datum.weight();
1271	}
1272	defInfo = ContingencyTables.entropy(classDstr[0]);
1273	}
1274	else{
1275	for(int i=0; i < growData.numInstances(); i++){
1276	Instance datum = growData.instance(i);
1277	classDstr[0][0] += datum.weight() * datum.classValue();
1278	}
1279
1280	// No need to be divided by the denomitor because
1281	// it's always the same
1282	double defMean = (classDstr[0][0] / whole);
1283	defInfo = meanSquaredError(growData, defMean) * growData.sumOfWeights();
1284	}
1285
1286	// Store the default class distribution
1287	double[][] tmp = new double[2][m_NumClasses];
1288	for(int y=0; y < m_NumClasses; y++){
1289	if(m_ClassAttribute.isNominal()){
1290	tmp[0][y] = classDstr[0][y];
1291	tmp[1][y] = classDstr[1][y];
1292	}
1293	else{
1294	tmp[0][y] = classDstr[0][y]/whole;
1295	tmp[1][y] = classDstr[1][y];
1296	}
1297	}
1298	m_Targets.addElement(tmp);
1299
1300	/* Keep the record of which attributes have already been used*/
1301	boolean[] used=new boolean[growData.numAttributes()];
1302	for (int k=0; k<used.length; k++)
1303	used[k]=false;
1304	int numUnused=used.length;
1305	double maxInfoGain, uncoveredWtSq=0, uncoveredWtVl=0, uncoveredWts=0;
1306	boolean isContinue = true; // The stopping criterion of this rule
1307
1308	while (isContinue){
1309	maxInfoGain = 0; // We require that infoGain be positive
1310
1311	/* Build a list of antecedents */
1312	Antd oneAntd=null;
1313	Instances coverData = null, uncoverData = null;
1314	Enumeration enumAttr=growData.enumerateAttributes();
1315	int index=-1;
1316
1317	/* Build one condition based on all attributes not used yet*/
1318	while (enumAttr.hasMoreElements()){
1319	Attribute att= (Attribute)(enumAttr.nextElement());
1320	index++;
1321
1322	Antd antd =null;
1323	if(m_ClassAttribute.isNominal()){
1324	if(att.isNumeric())
1325	antd = new NumericAntd(att, classDstr[1]);
1326	else
1327	antd = new NominalAntd(att, classDstr[1]);
1328	}
1329	else
1330	if(att.isNumeric())
1331	antd = new NumericAntd(att, uncoveredWtSq, uncoveredWtVl, uncoveredWts);
1332	else
1333	antd = new NominalAntd(att, uncoveredWtSq, uncoveredWtVl, uncoveredWts);
1334
1335	if(!used[index]){
1336	/* Compute the best information gain for each attribute,
1337	it's stored in the antecedent formed by this attribute.
1338	This procedure returns the data covered by the antecedent*/
1339	Instances[] coveredData = computeInfoGain(growData, defInfo, antd);
1340
1341	if(coveredData != null){
1342	double infoGain = antd.getMaxInfoGain();
1343	boolean isUpdate = Utils.gr(infoGain, maxInfoGain);
1344
1345	if(isUpdate){
1346	oneAntd=antd;
1347	coverData = coveredData[0];
1348	uncoverData = coveredData[1];
1349	maxInfoGain = infoGain;
1350	}
1351	}
1352	}
1353	}
1354
1355	if(oneAntd == null)
1356	break;
1357
1358	//Numeric attributes can be used more than once
1359	if(!oneAntd.getAttr().isNumeric()){
1360	used[oneAntd.getAttr().index()]=true;
1361	numUnused--;
1362	}
1363
1364	m_Antds.addElement(oneAntd);
1365	growData = coverData;// Grow data size is shrinking
1366
1367	for(int x=0; x < uncoverData.numInstances(); x++){
1368	Instance datum = uncoverData.instance(x);
1369	if(m_ClassAttribute.isNumeric()){
1370	uncoveredWtSq += datum.weight() * datum.classValue() * datum.classValue();
1371	uncoveredWtVl += datum.weight() * datum.classValue();
1372	uncoveredWts += datum.weight();
1373	classDstr[0][0] -= datum.weight() * datum.classValue();
1374	classDstr[1][0] += datum.weight() * datum.classValue();
1375	}
1376	else{
1377	classDstr[0][(int)datum.classValue()] -= datum.weight();
1378	classDstr[1][(int)datum.classValue()] += datum.weight();
1379	}
1380	}
1381
1382	// Store class distribution of growing data
1383	tmp = new double[2][m_NumClasses];
1384	for(int y=0; y < m_NumClasses; y++){
1385	if(m_ClassAttribute.isNominal()){
1386	tmp[0][y] = classDstr[0][y];
1387	tmp[1][y] = classDstr[1][y];
1388	}
1389	else{
1390	tmp[0][y] = classDstr[0][y]/(whole-uncoveredWts);
1391	tmp[1][y] = classDstr[1][y]/uncoveredWts;
1392	}
1393	}
1394	m_Targets.addElement(tmp);
1395
1396	defInfo = oneAntd.getInfo();
1397	int numAntdsThreshold = (m_NumAntds == -1) ? Integer.MAX_VALUE : m_NumAntds;
1398
1399	if(Utils.eq(growData.sumOfWeights(), 0.0) \|\|
1400	(numUnused == 0) \|\|
1401	(m_Antds.size() >= numAntdsThreshold))
1402	isContinue = false;
1403	}
1404	}
1405
1406	m_Cnsqt = ((double[][])(m_Targets.lastElement()))[0];
1407	m_DefDstr = ((double[][])(m_Targets.lastElement()))[1];
1408	}
1409
1410	/**
1411	* Compute the best information gain for the specified antecedent
1412	*
1413	* @param instances the data based on which the infoGain is computed
1414	* @param defInfo the default information of data
1415	* @param antd the specific antecedent
1416	* @return the data covered and not covered by the antecedent
1417	*/
1418	private Instances[] computeInfoGain(Instances instances, double defInfo, Antd antd){
1419	Instances data = new Instances(instances);
1420
1421	/* Split the data into bags.
1422	The information gain of each bag is also calculated in this procedure */
1423	Instances[] splitData = antd.splitData(data, defInfo);
1424	Instances[] coveredData = new Instances[2];
1425
1426	/* Get the bag of data to be used for next antecedents */
1427	Instances tmp1 = new Instances(data, 0);
1428	Instances tmp2 = new Instances(data, 0);
1429
1430	if(splitData == null)
1431	return null;
1432
1433	for(int x=0; x < (splitData.length-1); x++){
1434	if(x == ((int)antd.getAttrValue()))
1435	tmp1 = splitData[x];
1436	else{
1437	for(int y=0; y < splitData[x].numInstances(); y++)
1438	tmp2.add(splitData[x].instance(y));
1439	}
1440	}
1441
1442	if(antd.getAttr().isNominal()){ // Nominal attributes
1443	if(((NominalAntd)antd).isIn()){ // Inclusive expression
1444	coveredData[0] = new Instances(tmp1);
1445	coveredData[1] = new Instances(tmp2);
1446	}
1447	else{ // Exclusive expression
1448	coveredData[0] = new Instances(tmp2);
1449	coveredData[1] = new Instances(tmp1);
1450	}
1451	}
1452	else{ // Numeric attributes
1453	coveredData[0] = new Instances(tmp1);
1454	coveredData[1] = new Instances(tmp2);
1455	}
1456
1457	/* Add data with missing value */
1458	for(int z=0; z<splitData[splitData.length-1].numInstances(); z++)
1459	coveredData[1].add(splitData[splitData.length-1].instance(z));
1460
1461	return coveredData;
1462	}
1463
1464	/**
1465	* Prune the rule using the pruning data.
1466	* The weighted average of accuracy rate/mean-squared error is
1467	* used to prune the rule.
1468	*
1469	* @param pruneData the pruning data used to prune the rule
1470	*/
1471	private void prune(Instances pruneData){
1472	Instances data=new Instances(pruneData);
1473	Instances otherData = new Instances(data, 0);
1474	double total = data.sumOfWeights();
1475
1476	/* The default accurate# and the the accuracy rate on pruning data */
1477	double defAccu;
1478	if(m_ClassAttribute.isNumeric())
1479	defAccu = meanSquaredError(pruneData,
1480	((double[][])m_Targets.firstElement())[0][0]);
1481	else{
1482	int predict = Utils.maxIndex(((double[][])m_Targets.firstElement())[0]);
1483	defAccu = computeAccu(pruneData, predict)/total;
1484	}
1485
1486	int size=m_Antds.size();
1487	if(size == 0){
1488	m_Cnsqt = ((double[][])m_Targets.lastElement())[0];
1489	m_DefDstr = ((double[][])m_Targets.lastElement())[1];
1490	return; // Default rule before pruning
1491	}
1492
1493	double[] worthValue = new double[size];
1494
1495	/* Calculate accuracy parameters for all the antecedents in this rule */
1496	for(int x=0; x<size; x++){
1497	Antd antd=(Antd)m_Antds.elementAt(x);
1498	Instances newData = new Instances(data);
1499	if(Utils.eq(newData.sumOfWeights(),0.0))
1500	break;
1501
1502	data = new Instances(newData, newData.numInstances()); // Make data empty
1503
1504	for(int y=0; y<newData.numInstances(); y++){
1505	Instance ins=newData.instance(y);
1506	if(antd.isCover(ins)) // Covered by this antecedent
1507	data.add(ins); // Add to data for further
1508	else
1509	otherData.add(ins); // Not covered by this antecedent
1510	}
1511
1512	double covered, other;
1513	double[][] classes =
1514	(double[][])m_Targets.elementAt(x+1); // m_Targets has one more element
1515	if(m_ClassAttribute.isNominal()){
1516	int coverClass = Utils.maxIndex(classes[0]),
1517	otherClass = Utils.maxIndex(classes[1]);
1518
1519	covered = computeAccu(data, coverClass);
1520	other = computeAccu(otherData, otherClass);
1521	}
1522	else{
1523	double coverClass = classes[0][0],
1524	otherClass = classes[1][0];
1525	covered = (data.sumOfWeights())*meanSquaredError(data, coverClass);
1526	other = (otherData.sumOfWeights())*meanSquaredError(otherData, otherClass);
1527	}
1528
1529	worthValue[x] = (covered + other)/total;
1530	}
1531
1532	/* Prune the antecedents according to the accuracy parameters */
1533	for(int z=(size-1); z > 0; z--){
1534	// Treatment to avoid precision problems
1535	double valueDelta;
1536	if(m_ClassAttribute.isNominal()){
1537	if(Utils.sm(worthValue[z], 1.0))
1538	valueDelta = (worthValue[z] - worthValue[z-1]) / worthValue[z];
1539	else
1540	valueDelta = worthValue[z] - worthValue[z-1];
1541	}
1542	else{
1543	if(Utils.sm(worthValue[z], 1.0))
1544	valueDelta = (worthValue[z-1] - worthValue[z]) / worthValue[z];
1545	else
1546	valueDelta = (worthValue[z-1] - worthValue[z]);
1547	}
1548
1549	if(Utils.smOrEq(valueDelta, 0.0)){
1550	m_Antds.removeElementAt(z);
1551	m_Targets.removeElementAt(z+1);
1552	}
1553	else break;
1554	}
1555
1556	// Check whether this rule is a default rule
1557	if(m_Antds.size() == 1){
1558	double valueDelta;
1559	if(m_ClassAttribute.isNominal()){
1560	if(Utils.sm(worthValue[0], 1.0))
1561	valueDelta = (worthValue[0] - defAccu) / worthValue[0];
1562	else
1563	valueDelta = (worthValue[0] - defAccu);
1564	}
1565	else{
1566	if(Utils.sm(worthValue[0], 1.0))
1567	valueDelta = (defAccu - worthValue[0]) / worthValue[0];
1568	else
1569	valueDelta = (defAccu - worthValue[0]);
1570	}
1571
1572	if(Utils.smOrEq(valueDelta, 0.0)){
1573	m_Antds.removeAllElements();
1574	m_Targets.removeElementAt(1);
1575	}
1576	}
1577
1578	m_Cnsqt = ((double[][])(m_Targets.lastElement()))[0];
1579	m_DefDstr = ((double[][])(m_Targets.lastElement()))[1];
1580	}
1581
1582	/**
1583	* Private function to compute number of accurate instances
1584	* based on the specified predicted class
1585	*
1586	* @param data the data in question
1587	* @param clas the predicted class
1588	* @return the default accuracy number
1589	*/
1590	private double computeAccu(Instances data, int clas){
1591	double accu = 0;
1592	for(int i=0; i<data.numInstances(); i++){
1593	Instance inst = data.instance(i);
1594	if((int)inst.classValue() == clas)
1595	accu += inst.weight();
1596	}
1597	return accu;
1598	}
1599
1600
1601	/**
1602	* Private function to compute the squared error of
1603	* the specified data and the specified mean
1604	*
1605	* @param data the data in question
1606	* @param mean the specified mean
1607	* @return the default mean-squared error
1608	*/
1609	private double meanSquaredError(Instances data, double mean){
1610	if(Utils.eq(data.sumOfWeights(),0.0))
1611	return 0;
1612
1613	double mSqErr=0, sum = data.sumOfWeights();
1614	for(int i=0; i < data.numInstances(); i++){
1615	Instance datum = data.instance(i);
1616	mSqErr += datum.weight()*
1617	(datum.classValue() - mean)*
1618	(datum.classValue() - mean);
1619	}
1620
1621	return (mSqErr / sum);
1622	}
1623
1624	/**
1625	* Prints this rule with the specified class label
1626	*
1627	* @param att the string standing for attribute in the consequent of this rule
1628	* @param cl the string standing for value in the consequent of this rule
1629	* @return a textual description of this rule with the specified class label
1630	*/
1631	public String toString(String att, String cl) {
1632	StringBuffer text = new StringBuffer();
1633	if(m_Antds.size() > 0){
1634	for(int j=0; j< (m_Antds.size()-1); j++)
1635	text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and ");
1636	text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")");
1637	}
1638	text.append(" => " + att + " = " + cl);
1639
1640	return text.toString();
1641	}
1642
1643	/**
1644	* Prints this rule
1645	*
1646	* @return a textual description of this rule
1647	*/
1648	public String toString() {
1649	String title =
1650	"\n\nSingle conjunctive rule learner:\n"+
1651	"--------------------------------\n", body = null;
1652	StringBuffer text = new StringBuffer();
1653	if(m_ClassAttribute != null){
1654	if(m_ClassAttribute.isNominal()){
1655	body = toString(m_ClassAttribute.name(), m_ClassAttribute.value(Utils.maxIndex(m_Cnsqt)));
1656
1657	text.append("\n\nClass distributions:\nCovered by the rule:\n");
1658	for(int k=0; k < m_Cnsqt.length; k++)
1659	text.append(m_ClassAttribute.value(k)+ "\t");
1660	text.append('\n');
1661	for(int l=0; l < m_Cnsqt.length; l++)
1662	text.append(Utils.doubleToString(m_Cnsqt[l], 6)+"\t");
1663
1664	text.append("\n\nNot covered by the rule:\n");
1665	for(int k=0; k < m_DefDstr.length; k++)
1666	text.append(m_ClassAttribute.value(k)+ "\t");
1667	text.append('\n');
1668	for(int l=0; l < m_DefDstr.length; l++)
1669	text.append(Utils.doubleToString(m_DefDstr[l], 6)+"\t");
1670	}
1671	else
1672	body = toString(m_ClassAttribute.name(), Utils.doubleToString(m_Cnsqt[0], 6));
1673	}
1674	return (title + body + text.toString());
1675	}
1676
1677	/**
1678	* Returns the revision string.
1679	*
1680	* @return the revision
1681	*/
1682	public String getRevision() {
1683	return RevisionUtils.extract("$Revision: 5928 $");
1684	}
1685
1686	/**
1687	* Main method.
1688	*
1689	* @param args the options for the classifier
1690	*/
1691	public static void main(String[] args) {
1692	runClassifier(new ConjunctiveRule(), args);
1693	}
1694	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: