Context Navigation

source: src/main/java/weka/classifiers/CheckClassifier.java @ 8

Last change on this file since 8 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 70.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* CheckClassifier.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers;
24
25	import weka.core.Attribute;
26	import weka.core.CheckScheme;
27	import weka.core.FastVector;
28	import weka.core.Instance;
29	import weka.core.Instances;
30	import weka.core.MultiInstanceCapabilitiesHandler;
31	import weka.core.Option;
32	import weka.core.OptionHandler;
33	import weka.core.RevisionUtils;
34	import weka.core.SerializationHelper;
35	import weka.core.TestInstances;
36	import weka.core.Utils;
37	import weka.core.WeightedInstancesHandler;
38
39	import java.util.Enumeration;
40	import java.util.Random;
41	import java.util.Vector;
42
43	/**
44	* Class for examining the capabilities and finding problems with
45	* classifiers. If you implement a classifier using the WEKA libraries,
46	* you should run the checks on it to ensure robustness and correct
47	* operation. Passing all the tests of this object does not mean
48	* bugs in the classifier don't exist, but this will help find some
49	* common ones. <p/>
50	*
51	* Typical usage: <p/>
52	* <code>java weka.classifiers.CheckClassifier -W classifier_name
53	* classifier_options </code><p/>
54	*
55	* CheckClassifier reports on the following:
56	* <ul>
57	* <li> Classifier abilities
58	* <ul>
59	* <li> Possible command line options to the classifier </li>
60	* <li> Whether the classifier can predict nominal, numeric, string,
61	* date or relational class attributes. Warnings will be displayed if
62	* performance is worse than ZeroR </li>
63	* <li> Whether the classifier can be trained incrementally </li>
64	* <li> Whether the classifier can handle numeric predictor attributes </li>
65	* <li> Whether the classifier can handle nominal predictor attributes </li>
66	* <li> Whether the classifier can handle string predictor attributes </li>
67	* <li> Whether the classifier can handle date predictor attributes </li>
68	* <li> Whether the classifier can handle relational predictor attributes </li>
69	* <li> Whether the classifier can handle multi-instance data </li>
70	* <li> Whether the classifier can handle missing predictor values </li>
71	* <li> Whether the classifier can handle missing class values </li>
72	* <li> Whether a nominal classifier only handles 2 class problems </li>
73	* <li> Whether the classifier can handle instance weights </li>
74	* </ul>
75	* </li>
76	* <li> Correct functioning
77	* <ul>
78	* <li> Correct initialisation during buildClassifier (i.e. no result
79	* changes when buildClassifier called repeatedly) </li>
80	* <li> Whether incremental training produces the same results
81	* as during non-incremental training (which may or may not
82	* be OK) </li>
83	* <li> Whether the classifier alters the data passed to it
84	* (number of instances, instance order, instance weights, etc) </li>
85	* <li> Whether the toString() method works correctly before the
86	* classifier has been built. </li>
87	* </ul>
88	* </li>
89	* <li> Degenerate cases
90	* <ul>
91	* <li> building classifier with zero training instances </li>
92	* <li> all but one predictor attribute values missing </li>
93	* <li> all predictor attribute values missing </li>
94	* <li> all but one class values missing </li>
95	* <li> all class values missing </li>
96	* </ul>
97	* </li>
98	* </ul>
99	* Running CheckClassifier with the debug option set will output the
100	* training and test datasets for any failed tests.<p/>
101	*
102	* The <code>weka.classifiers.AbstractClassifierTest</code> uses this
103	* class to test all the classifiers. Any changes here, have to be
104	* checked in that abstract test class, too. <p/>
105	*
106	<!-- options-start -->
107	* Valid options are: <p/>
108	*
109	* <pre> -D
110	* Turn on debugging output.</pre>
111	*
112	* <pre> -S
113	* Silent mode - prints nothing to stdout.</pre>
114	*
115	* <pre> -N <num>
116	* The number of instances in the datasets (default 20).</pre>
117	*
118	* <pre> -nominal <num>
119	* The number of nominal attributes (default 2).</pre>
120	*
121	* <pre> -nominal-values <num>
122	* The number of values for nominal attributes (default 1).</pre>
123	*
124	* <pre> -numeric <num>
125	* The number of numeric attributes (default 1).</pre>
126	*
127	* <pre> -string <num>
128	* The number of string attributes (default 1).</pre>
129	*
130	* <pre> -date <num>
131	* The number of date attributes (default 1).</pre>
132	*
133	* <pre> -relational <num>
134	* The number of relational attributes (default 1).</pre>
135	*
136	* <pre> -num-instances-relational <num>
137	* The number of instances in relational/bag attributes (default 10).</pre>
138	*
139	* <pre> -words <comma-separated-list>
140	* The words to use in string attributes.</pre>
141	*
142	* <pre> -word-separators <chars>
143	* The word separators to use in string attributes.</pre>
144	*
145	* <pre> -W
146	* Full name of the classifier analysed.
147	* eg: weka.classifiers.bayes.NaiveBayes
148	* (default weka.classifiers.rules.ZeroR)</pre>
149	*
150	* <pre>
151	* Options specific to classifier weka.classifiers.rules.ZeroR:
152	* </pre>
153	*
154	* <pre> -D
155	* If set, classifier is run in debug mode and
156	* may output additional info to the console</pre>
157	*
158	<!-- options-end -->
159	*
160	* Options after -- are passed to the designated classifier.<p/>
161	*
162	* @author Len Trigg (trigg@cs.waikato.ac.nz)
163	* @author FracPete (fracpete at waikato dot ac dot nz)
164	* @version $Revision: 6041 $
165	* @see TestInstances
166	*/
167	public class CheckClassifier
168	extends CheckScheme {
169
170	/*
171	* Note about test methods:
172	* - methods return array of booleans
173	* - first index: success or not
174	* - second index: acceptable or not (e.g., Exception is OK)
175	* - in case the performance is worse than that of ZeroR both indices are true
176	*
177	* FracPete (fracpete at waikato dot ac dot nz)
178	*/
179
180	/** The classifier to be examined */
181	protected Classifier m_Classifier = new weka.classifiers.rules.ZeroR();
182
183	/**
184	* Returns an enumeration describing the available options.
185	*
186	* @return an enumeration of all the available options.
187	*/
188	public Enumeration listOptions() {
189	Vector result = new Vector();
190
191	Enumeration en = super.listOptions();
192	while (en.hasMoreElements())
193	result.addElement(en.nextElement());
194
195	result.addElement(new Option(
196	"\tFull name of the classifier analysed.\n"
197	+"\teg: weka.classifiers.bayes.NaiveBayes\n"
198	+ "\t(default weka.classifiers.rules.ZeroR)",
199	"W", 1, "-W"));
200
201	if ((m_Classifier != null)
202	&& (m_Classifier instanceof OptionHandler)) {
203	result.addElement(new Option("", "", 0,
204	"\nOptions specific to classifier "
205	+ m_Classifier.getClass().getName()
206	+ ":"));
207	Enumeration enu = ((OptionHandler)m_Classifier).listOptions();
208	while (enu.hasMoreElements())
209	result.addElement(enu.nextElement());
210	}
211
212	return result.elements();
213	}
214
215	/**
216	* Parses a given list of options.
217	*
218	<!-- options-start -->
219	* Valid options are: <p/>
220	*
221	* <pre> -D
222	* Turn on debugging output.</pre>
223	*
224	* <pre> -S
225	* Silent mode - prints nothing to stdout.</pre>
226	*
227	* <pre> -N <num>
228	* The number of instances in the datasets (default 20).</pre>
229	*
230	* <pre> -nominal <num>
231	* The number of nominal attributes (default 2).</pre>
232	*
233	* <pre> -nominal-values <num>
234	* The number of values for nominal attributes (default 1).</pre>
235	*
236	* <pre> -numeric <num>
237	* The number of numeric attributes (default 1).</pre>
238	*
239	* <pre> -string <num>
240	* The number of string attributes (default 1).</pre>
241	*
242	* <pre> -date <num>
243	* The number of date attributes (default 1).</pre>
244	*
245	* <pre> -relational <num>
246	* The number of relational attributes (default 1).</pre>
247	*
248	* <pre> -num-instances-relational <num>
249	* The number of instances in relational/bag attributes (default 10).</pre>
250	*
251	* <pre> -words <comma-separated-list>
252	* The words to use in string attributes.</pre>
253	*
254	* <pre> -word-separators <chars>
255	* The word separators to use in string attributes.</pre>
256	*
257	* <pre> -W
258	* Full name of the classifier analysed.
259	* eg: weka.classifiers.bayes.NaiveBayes
260	* (default weka.classifiers.rules.ZeroR)</pre>
261	*
262	* <pre>
263	* Options specific to classifier weka.classifiers.rules.ZeroR:
264	* </pre>
265	*
266	* <pre> -D
267	* If set, classifier is run in debug mode and
268	* may output additional info to the console</pre>
269	*
270	<!-- options-end -->
271	*
272	* @param options the list of options as an array of strings
273	* @throws Exception if an option is not supported
274	*/
275	public void setOptions(String[] options) throws Exception {
276	String tmpStr;
277
278	super.setOptions(options);
279
280	tmpStr = Utils.getOption('W', options);
281	if (tmpStr.length() == 0)
282	tmpStr = weka.classifiers.rules.ZeroR.class.getName();
283	setClassifier(
284	(Classifier) forName(
285	"weka.classifiers",
286	Classifier.class,
287	tmpStr,
288	Utils.partitionOptions(options)));
289	}
290
291	/**
292	* Gets the current settings of the CheckClassifier.
293	*
294	* @return an array of strings suitable for passing to setOptions
295	*/
296	public String[] getOptions() {
297	Vector result;
298	String[] options;
299	int i;
300
301	result = new Vector();
302
303	options = super.getOptions();
304	for (i = 0; i < options.length; i++)
305	result.add(options[i]);
306
307	if (getClassifier() != null) {
308	result.add("-W");
309	result.add(getClassifier().getClass().getName());
310	}
311
312	if ((m_Classifier != null) && (m_Classifier instanceof OptionHandler))
313	options = ((OptionHandler) m_Classifier).getOptions();
314	else
315	options = new String[0];
316
317	if (options.length > 0) {
318	result.add("--");
319	for (i = 0; i < options.length; i++)
320	result.add(options[i]);
321	}
322
323	return (String[]) result.toArray(new String[result.size()]);
324	}
325
326	/**
327	* Begin the tests, reporting results to System.out
328	*/
329	public void doTests() {
330
331	if (getClassifier() == null) {
332	println("\n=== No classifier set ===");
333	return;
334	}
335	println("\n=== Check on Classifier: "
336	+ getClassifier().getClass().getName()
337	+ " ===\n");
338
339	// Start tests
340	m_ClasspathProblems = false;
341	println("--> Checking for interfaces");
342	canTakeOptions();
343	boolean updateableClassifier = updateableClassifier()[0];
344	boolean weightedInstancesHandler = weightedInstancesHandler()[0];
345	boolean multiInstanceHandler = multiInstanceHandler()[0];
346	println("--> Classifier tests");
347	declaresSerialVersionUID();
348	testToString();
349	testsPerClassType(Attribute.NOMINAL, updateableClassifier, weightedInstancesHandler, multiInstanceHandler);
350	testsPerClassType(Attribute.NUMERIC, updateableClassifier, weightedInstancesHandler, multiInstanceHandler);
351	testsPerClassType(Attribute.DATE, updateableClassifier, weightedInstancesHandler, multiInstanceHandler);
352	testsPerClassType(Attribute.STRING, updateableClassifier, weightedInstancesHandler, multiInstanceHandler);
353	testsPerClassType(Attribute.RELATIONAL, updateableClassifier, weightedInstancesHandler, multiInstanceHandler);
354	}
355
356	/**
357	* Set the classifier for boosting.
358	*
359	* @param newClassifier the Classifier to use.
360	*/
361	public void setClassifier(Classifier newClassifier) {
362	m_Classifier = newClassifier;
363	}
364
365	/**
366	* Get the classifier used as the classifier
367	*
368	* @return the classifier used as the classifier
369	*/
370	public Classifier getClassifier() {
371	return m_Classifier;
372	}
373
374	/**
375	* Run a battery of tests for a given class attribute type
376	*
377	* @param classType true if the class attribute should be numeric
378	* @param updateable true if the classifier is updateable
379	* @param weighted true if the classifier says it handles weights
380	* @param multiInstance true if the classifier is a multi-instance classifier
381	*/
382	protected void testsPerClassType(int classType,
383	boolean updateable,
384	boolean weighted,
385	boolean multiInstance) {
386
387	boolean PNom = canPredict(true, false, false, false, false, multiInstance, classType)[0];
388	boolean PNum = canPredict(false, true, false, false, false, multiInstance, classType)[0];
389	boolean PStr = canPredict(false, false, true, false, false, multiInstance, classType)[0];
390	boolean PDat = canPredict(false, false, false, true, false, multiInstance, classType)[0];
391	boolean PRel;
392	if (!multiInstance)
393	PRel = canPredict(false, false, false, false, true, multiInstance, classType)[0];
394	else
395	PRel = false;
396
397	if (PNom \|\| PNum \|\| PStr \|\| PDat \|\| PRel) {
398	if (weighted)
399	instanceWeights(PNom, PNum, PStr, PDat, PRel, multiInstance, classType);
400
401	canHandleOnlyClass(PNom, PNum, PStr, PDat, PRel, classType);
402
403	if (classType == Attribute.NOMINAL)
404	canHandleNClasses(PNom, PNum, PStr, PDat, PRel, multiInstance, 4);
405
406	if (!multiInstance) {
407	canHandleClassAsNthAttribute(PNom, PNum, PStr, PDat, PRel, multiInstance, classType, 0);
408	canHandleClassAsNthAttribute(PNom, PNum, PStr, PDat, PRel, multiInstance, classType, 1);
409	}
410
411	canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel, multiInstance, classType);
412	boolean handleMissingPredictors = canHandleMissing(PNom, PNum, PStr, PDat, PRel,
413	multiInstance, classType,
414	true, false, 20)[0];
415	if (handleMissingPredictors)
416	canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, classType, true, false, 100);
417
418	boolean handleMissingClass = canHandleMissing(PNom, PNum, PStr, PDat, PRel,
419	multiInstance, classType,
420	false, true, 20)[0];
421	if (handleMissingClass)
422	canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, classType, false, true, 100);
423
424	correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel, multiInstance, classType);
425	datasetIntegrity(PNom, PNum, PStr, PDat, PRel, multiInstance, classType,
426	handleMissingPredictors, handleMissingClass);
427	doesntUseTestClassVal(PNom, PNum, PStr, PDat, PRel, multiInstance, classType);
428	if (updateable)
429	updatingEquality(PNom, PNum, PStr, PDat, PRel, multiInstance, classType);
430	}
431	}
432
433	/**
434	* Checks whether the scheme's toString() method works even though the
435	* classifies hasn't been built yet.
436	*
437	* @return index 0 is true if the toString() method works fine
438	*/
439	protected boolean[] testToString() {
440	boolean[] result = new boolean[2];
441
442	print("toString...");
443
444	try {
445	Classifier copy = (Classifier) m_Classifier.getClass().newInstance();
446	copy.toString();
447	result[0] = true;
448	println("yes");
449	}
450	catch (Exception e) {
451	result[0] = false;
452	println("no");
453	if (m_Debug) {
454	println("\n=== Full report ===");
455	e.printStackTrace();
456	println("\n");
457	}
458	}
459
460	return result;
461	}
462
463	/**
464	* tests for a serialVersionUID. Fails in case the scheme doesn't declare
465	* a UID.
466	*
467	* @return index 0 is true if the scheme declares a UID
468	*/
469	protected boolean[] declaresSerialVersionUID() {
470	boolean[] result = new boolean[2];
471
472	print("serialVersionUID...");
473
474	result[0] = !SerializationHelper.needsUID(m_Classifier.getClass());
475
476	if (result[0])
477	println("yes");
478	else
479	println("no");
480
481	return result;
482	}
483
484	/**
485	* Checks whether the scheme can take command line options.
486	*
487	* @return index 0 is true if the classifier can take options
488	*/
489	protected boolean[] canTakeOptions() {
490
491	boolean[] result = new boolean[2];
492
493	print("options...");
494	if (m_Classifier instanceof OptionHandler) {
495	println("yes");
496	if (m_Debug) {
497	println("\n=== Full report ===");
498	Enumeration enu = ((OptionHandler)m_Classifier).listOptions();
499	while (enu.hasMoreElements()) {
500	Option option = (Option) enu.nextElement();
501	print(option.synopsis() + "\n"
502	+ option.description() + "\n");
503	}
504	println("\n");
505	}
506	result[0] = true;
507	}
508	else {
509	println("no");
510	result[0] = false;
511	}
512
513	return result;
514	}
515
516	/**
517	* Checks whether the scheme can build models incrementally.
518	*
519	* @return index 0 is true if the classifier can train incrementally
520	*/
521	protected boolean[] updateableClassifier() {
522
523	boolean[] result = new boolean[2];
524
525	print("updateable classifier...");
526	if (m_Classifier instanceof UpdateableClassifier) {
527	println("yes");
528	result[0] = true;
529	}
530	else {
531	println("no");
532	result[0] = false;
533	}
534
535	return result;
536	}
537
538	/**
539	* Checks whether the scheme says it can handle instance weights.
540	*
541	* @return true if the classifier handles instance weights
542	*/
543	protected boolean[] weightedInstancesHandler() {
544
545	boolean[] result = new boolean[2];
546
547	print("weighted instances classifier...");
548	if (m_Classifier instanceof WeightedInstancesHandler) {
549	println("yes");
550	result[0] = true;
551	}
552	else {
553	println("no");
554	result[0] = false;
555	}
556
557	return result;
558	}
559
560	/**
561	* Checks whether the scheme handles multi-instance data.
562	*
563	* @return true if the classifier handles multi-instance data
564	*/
565	protected boolean[] multiInstanceHandler() {
566	boolean[] result = new boolean[2];
567
568	print("multi-instance classifier...");
569	if (m_Classifier instanceof MultiInstanceCapabilitiesHandler) {
570	println("yes");
571	result[0] = true;
572	}
573	else {
574	println("no");
575	result[0] = false;
576	}
577
578	return result;
579	}
580
581	/**
582	* Checks basic prediction of the scheme, for simple non-troublesome
583	* datasets.
584	*
585	* @param nominalPredictor if true use nominal predictor attributes
586	* @param numericPredictor if true use numeric predictor attributes
587	* @param stringPredictor if true use string predictor attributes
588	* @param datePredictor if true use date predictor attributes
589	* @param relationalPredictor if true use relational predictor attributes
590	* @param multiInstance whether multi-instance is needed
591	* @param classType the class type (NOMINAL, NUMERIC, etc.)
592	* @return index 0 is true if the test was passed, index 1 is true if test
593	* was acceptable
594	*/
595	protected boolean[] canPredict(
596	boolean nominalPredictor,
597	boolean numericPredictor,
598	boolean stringPredictor,
599	boolean datePredictor,
600	boolean relationalPredictor,
601	boolean multiInstance,
602	int classType) {
603
604	print("basic predict");
605	printAttributeSummary(
606	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
607	print("...");
608	FastVector accepts = new FastVector();
609	accepts.addElement("unary");
610	accepts.addElement("binary");
611	accepts.addElement("nominal");
612	accepts.addElement("numeric");
613	accepts.addElement("string");
614	accepts.addElement("date");
615	accepts.addElement("relational");
616	accepts.addElement("multi-instance");
617	accepts.addElement("not in classpath");
618	int numTrain = getNumInstances(), numTest = getNumInstances(),
619	numClasses = 2, missingLevel = 0;
620	boolean predictorMissing = false, classMissing = false;
621
622	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
623	datePredictor, relationalPredictor,
624	multiInstance,
625	classType,
626	missingLevel, predictorMissing, classMissing,
627	numTrain, numTest, numClasses,
628	accepts);
629	}
630
631	/**
632	* Checks whether the scheme can handle data that contains only the class
633	* attribute. If a scheme cannot build a proper model with that data, it
634	* should default back to a ZeroR model.
635	*
636	* @param nominalPredictor if true use nominal predictor attributes
637	* @param numericPredictor if true use numeric predictor attributes
638	* @param stringPredictor if true use string predictor attributes
639	* @param datePredictor if true use date predictor attributes
640	* @param relationalPredictor if true use relational predictor attributes
641	* @param classType the class type (NOMINAL, NUMERIC, etc.)
642	* @return index 0 is true if the test was passed, index 1 is true if test
643	* was acceptable
644	*/
645	protected boolean[] canHandleOnlyClass(
646	boolean nominalPredictor,
647	boolean numericPredictor,
648	boolean stringPredictor,
649	boolean datePredictor,
650	boolean relationalPredictor,
651	int classType) {
652
653	print("only class in data");
654	printAttributeSummary(
655	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, false, classType);
656	print("...");
657	FastVector accepts = new FastVector();
658	accepts.addElement("class");
659	accepts.addElement("zeror");
660	int numTrain = getNumInstances(), numTest = getNumInstances(),
661	missingLevel = 0;
662	boolean predictorMissing = false, classMissing = false;
663
664	return runBasicTest(false, false, false, false, false,
665	false,
666	classType,
667	missingLevel, predictorMissing, classMissing,
668	numTrain, numTest, 2,
669	accepts);
670	}
671
672	/**
673	* Checks whether nominal schemes can handle more than two classes.
674	* If a scheme is only designed for two-class problems it should
675	* throw an appropriate exception for multi-class problems.
676	*
677	* @param nominalPredictor if true use nominal predictor attributes
678	* @param numericPredictor if true use numeric predictor attributes
679	* @param stringPredictor if true use string predictor attributes
680	* @param datePredictor if true use date predictor attributes
681	* @param relationalPredictor if true use relational predictor attributes
682	* @param multiInstance whether multi-instance is needed
683	* @param numClasses the number of classes to test
684	* @return index 0 is true if the test was passed, index 1 is true if test
685	* was acceptable
686	*/
687	protected boolean[] canHandleNClasses(
688	boolean nominalPredictor,
689	boolean numericPredictor,
690	boolean stringPredictor,
691	boolean datePredictor,
692	boolean relationalPredictor,
693	boolean multiInstance,
694	int numClasses) {
695
696	print("more than two class problems");
697	printAttributeSummary(
698	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, Attribute.NOMINAL);
699	print("...");
700	FastVector accepts = new FastVector();
701	accepts.addElement("number");
702	accepts.addElement("class");
703	int numTrain = getNumInstances(), numTest = getNumInstances(),
704	missingLevel = 0;
705	boolean predictorMissing = false, classMissing = false;
706
707	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
708	datePredictor, relationalPredictor,
709	multiInstance,
710	Attribute.NOMINAL,
711	missingLevel, predictorMissing, classMissing,
712	numTrain, numTest, numClasses,
713	accepts);
714	}
715
716	/**
717	* Checks whether the scheme can handle class attributes as Nth attribute.
718	*
719	* @param nominalPredictor if true use nominal predictor attributes
720	* @param numericPredictor if true use numeric predictor attributes
721	* @param stringPredictor if true use string predictor attributes
722	* @param datePredictor if true use date predictor attributes
723	* @param relationalPredictor if true use relational predictor attributes
724	* @param multiInstance whether multi-instance is needed
725	* @param classType the class type (NUMERIC, NOMINAL, etc.)
726	* @param classIndex the index of the class attribute (0-based, -1 means last attribute)
727	* @return index 0 is true if the test was passed, index 1 is true if test
728	* was acceptable
729	* @see TestInstances#CLASS_IS_LAST
730	*/
731	protected boolean[] canHandleClassAsNthAttribute(
732	boolean nominalPredictor,
733	boolean numericPredictor,
734	boolean stringPredictor,
735	boolean datePredictor,
736	boolean relationalPredictor,
737	boolean multiInstance,
738	int classType,
739	int classIndex) {
740
741	if (classIndex == TestInstances.CLASS_IS_LAST)
742	print("class attribute as last attribute");
743	else
744	print("class attribute as " + (classIndex + 1) + ". attribute");
745	printAttributeSummary(
746	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
747	print("...");
748	FastVector accepts = new FastVector();
749	int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2,
750	missingLevel = 0;
751	boolean predictorMissing = false, classMissing = false;
752
753	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
754	datePredictor, relationalPredictor,
755	multiInstance,
756	classType,
757	classIndex,
758	missingLevel, predictorMissing, classMissing,
759	numTrain, numTest, numClasses,
760	accepts);
761	}
762
763	/**
764	* Checks whether the scheme can handle zero training instances.
765	*
766	* @param nominalPredictor if true use nominal predictor attributes
767	* @param numericPredictor if true use numeric predictor attributes
768	* @param stringPredictor if true use string predictor attributes
769	* @param datePredictor if true use date predictor attributes
770	* @param relationalPredictor if true use relational predictor attributes
771	* @param multiInstance whether multi-instance is needed
772	* @param classType the class type (NUMERIC, NOMINAL, etc.)
773	* @return index 0 is true if the test was passed, index 1 is true if test
774	* was acceptable
775	*/
776	protected boolean[] canHandleZeroTraining(
777	boolean nominalPredictor,
778	boolean numericPredictor,
779	boolean stringPredictor,
780	boolean datePredictor,
781	boolean relationalPredictor,
782	boolean multiInstance,
783	int classType) {
784
785	print("handle zero training instances");
786	printAttributeSummary(
787	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
788	print("...");
789	FastVector accepts = new FastVector();
790	accepts.addElement("train");
791	accepts.addElement("value");
792	int numTrain = 0, numTest = getNumInstances(), numClasses = 2,
793	missingLevel = 0;
794	boolean predictorMissing = false, classMissing = false;
795
796	return runBasicTest(
797	nominalPredictor, numericPredictor, stringPredictor,
798	datePredictor, relationalPredictor,
799	multiInstance,
800	classType,
801	missingLevel, predictorMissing, classMissing,
802	numTrain, numTest, numClasses,
803	accepts);
804	}
805
806	/**
807	* Checks whether the scheme correctly initialises models when
808	* buildClassifier is called. This test calls buildClassifier with
809	* one training dataset and records performance on a test set.
810	* buildClassifier is then called on a training set with different
811	* structure, and then again with the original training set. The
812	* performance on the test set is compared with the original results
813	* and any performance difference noted as incorrect build initialisation.
814	*
815	* @param nominalPredictor if true use nominal predictor attributes
816	* @param numericPredictor if true use numeric predictor attributes
817	* @param stringPredictor if true use string predictor attributes
818	* @param datePredictor if true use date predictor attributes
819	* @param relationalPredictor if true use relational predictor attributes
820	* @param multiInstance whether multi-instance is needed
821	* @param classType the class type (NUMERIC, NOMINAL, etc.)
822	* @return index 0 is true if the test was passed, index 1 is true if the
823	* scheme performs worse than ZeroR, but without error (index 0 is
824	* false)
825	*/
826	protected boolean[] correctBuildInitialisation(
827	boolean nominalPredictor,
828	boolean numericPredictor,
829	boolean stringPredictor,
830	boolean datePredictor,
831	boolean relationalPredictor,
832	boolean multiInstance,
833	int classType) {
834
835	boolean[] result = new boolean[2];
836
837	print("correct initialisation during buildClassifier");
838	printAttributeSummary(
839	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
840	print("...");
841	int numTrain = getNumInstances(), numTest = getNumInstances(),
842	numClasses = 2, missingLevel = 0;
843	boolean predictorMissing = false, classMissing = false;
844
845	Instances train1 = null;
846	Instances test1 = null;
847	Instances train2 = null;
848	Instances test2 = null;
849	Classifier classifier = null;
850	Evaluation evaluation1A = null;
851	Evaluation evaluation1B = null;
852	Evaluation evaluation2 = null;
853	boolean built = false;
854	int stage = 0;
855	try {
856
857	// Make two sets of train/test splits with different
858	// numbers of attributes
859	train1 = makeTestDataset(42, numTrain,
860	nominalPredictor ? getNumNominal() : 0,
861	numericPredictor ? getNumNumeric() : 0,
862	stringPredictor ? getNumString() : 0,
863	datePredictor ? getNumDate() : 0,
864	relationalPredictor ? getNumRelational() : 0,
865	numClasses,
866	classType,
867	multiInstance);
868	train2 = makeTestDataset(84, numTrain,
869	nominalPredictor ? getNumNominal() + 1 : 0,
870	numericPredictor ? getNumNumeric() + 1 : 0,
871	stringPredictor ? getNumString() : 0,
872	datePredictor ? getNumDate() : 0,
873	relationalPredictor ? getNumRelational() : 0,
874	numClasses,
875	classType,
876	multiInstance);
877	test1 = makeTestDataset(24, numTest,
878	nominalPredictor ? getNumNominal() : 0,
879	numericPredictor ? getNumNumeric() : 0,
880	stringPredictor ? getNumString() : 0,
881	datePredictor ? getNumDate() : 0,
882	relationalPredictor ? getNumRelational() : 0,
883	numClasses,
884	classType,
885	multiInstance);
886	test2 = makeTestDataset(48, numTest,
887	nominalPredictor ? getNumNominal() + 1 : 0,
888	numericPredictor ? getNumNumeric() + 1 : 0,
889	stringPredictor ? getNumString() : 0,
890	datePredictor ? getNumDate() : 0,
891	relationalPredictor ? getNumRelational() : 0,
892	numClasses,
893	classType,
894	multiInstance);
895	if (missingLevel > 0) {
896	addMissing(train1, missingLevel, predictorMissing, classMissing);
897	addMissing(test1, Math.min(missingLevel,50), predictorMissing,
898	classMissing);
899	addMissing(train2, missingLevel, predictorMissing, classMissing);
900	addMissing(test2, Math.min(missingLevel,50), predictorMissing,
901	classMissing);
902	}
903
904	classifier = AbstractClassifier.makeCopies(getClassifier(), 1)[0];
905	evaluation1A = new Evaluation(train1);
906	evaluation1B = new Evaluation(train1);
907	evaluation2 = new Evaluation(train2);
908	} catch (Exception ex) {
909	throw new Error("Error setting up for tests: " + ex.getMessage());
910	}
911	try {
912	stage = 0;
913	classifier.buildClassifier(train1);
914	built = true;
915	if (!testWRTZeroR(classifier, evaluation1A, train1, test1)[0]) {
916	throw new Exception("Scheme performs worse than ZeroR");
917	}
918
919	stage = 1;
920	built = false;
921	classifier.buildClassifier(train2);
922	built = true;
923	if (!testWRTZeroR(classifier, evaluation2, train2, test2)[0]) {
924	throw new Exception("Scheme performs worse than ZeroR");
925	}
926
927	stage = 2;
928	built = false;
929	classifier.buildClassifier(train1);
930	built = true;
931	if (!testWRTZeroR(classifier, evaluation1B, train1, test1)[0]) {
932	throw new Exception("Scheme performs worse than ZeroR");
933	}
934
935	stage = 3;
936	if (!evaluation1A.equals(evaluation1B)) {
937	if (m_Debug) {
938	println("\n=== Full report ===\n"
939	+ evaluation1A.toSummaryString("\nFirst buildClassifier()",
940	true)
941	+ "\n\n");
942	println(
943	evaluation1B.toSummaryString("\nSecond buildClassifier()",
944	true)
945	+ "\n\n");
946	}
947	throw new Exception("Results differ between buildClassifier calls");
948	}
949	println("yes");
950	result[0] = true;
951
952	if (false && m_Debug) {
953	println("\n=== Full report ===\n"
954	+ evaluation1A.toSummaryString("\nFirst buildClassifier()",
955	true)
956	+ "\n\n");
957	println(
958	evaluation1B.toSummaryString("\nSecond buildClassifier()",
959	true)
960	+ "\n\n");
961	}
962	}
963	catch (Exception ex) {
964	String msg = ex.getMessage().toLowerCase();
965	if (msg.indexOf("worse than zeror") >= 0) {
966	println("warning: performs worse than ZeroR");
967	result[0] = (stage < 1);
968	result[1] = (stage < 1);
969	} else {
970	println("no");
971	result[0] = false;
972	}
973	if (m_Debug) {
974	println("\n=== Full Report ===");
975	print("Problem during");
976	if (built) {
977	print(" testing");
978	} else {
979	print(" training");
980	}
981	switch (stage) {
982	case 0:
983	print(" of dataset 1");
984	break;
985	case 1:
986	print(" of dataset 2");
987	break;
988	case 2:
989	print(" of dataset 1 (2nd build)");
990	break;
991	case 3:
992	print(", comparing results from builds of dataset 1");
993	break;
994	}
995	println(": " + ex.getMessage() + "\n");
996	println("here are the datasets:\n");
997	println("=== Train1 Dataset ===\n"
998	+ train1.toString() + "\n");
999	println("=== Test1 Dataset ===\n"
1000	+ test1.toString() + "\n\n");
1001	println("=== Train2 Dataset ===\n"
1002	+ train2.toString() + "\n");
1003	println("=== Test2 Dataset ===\n"
1004	+ test2.toString() + "\n\n");
1005	}
1006	}
1007
1008	return result;
1009	}
1010
1011	/**
1012	* Checks basic missing value handling of the scheme. If the missing
1013	* values cause an exception to be thrown by the scheme, this will be
1014	* recorded.
1015	*
1016	* @param nominalPredictor if true use nominal predictor attributes
1017	* @param numericPredictor if true use numeric predictor attributes
1018	* @param stringPredictor if true use string predictor attributes
1019	* @param datePredictor if true use date predictor attributes
1020	* @param relationalPredictor if true use relational predictor attributes
1021	* @param multiInstance whether multi-instance is needed
1022	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1023	* @param predictorMissing true if the missing values may be in
1024	* the predictors
1025	* @param classMissing true if the missing values may be in the class
1026	* @param missingLevel the percentage of missing values
1027	* @return index 0 is true if the test was passed, index 1 is true if test
1028	* was acceptable
1029	*/
1030	protected boolean[] canHandleMissing(
1031	boolean nominalPredictor,
1032	boolean numericPredictor,
1033	boolean stringPredictor,
1034	boolean datePredictor,
1035	boolean relationalPredictor,
1036	boolean multiInstance,
1037	int classType,
1038	boolean predictorMissing,
1039	boolean classMissing,
1040	int missingLevel) {
1041
1042	if (missingLevel == 100)
1043	print("100% ");
1044	print("missing");
1045	if (predictorMissing) {
1046	print(" predictor");
1047	if (classMissing)
1048	print(" and");
1049	}
1050	if (classMissing)
1051	print(" class");
1052	print(" values");
1053	printAttributeSummary(
1054	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
1055	print("...");
1056	FastVector accepts = new FastVector();
1057	accepts.addElement("missing");
1058	accepts.addElement("value");
1059	accepts.addElement("train");
1060	int numTrain = getNumInstances(), numTest = getNumInstances(),
1061	numClasses = 2;
1062
1063	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
1064	datePredictor, relationalPredictor,
1065	multiInstance,
1066	classType,
1067	missingLevel, predictorMissing, classMissing,
1068	numTrain, numTest, numClasses,
1069	accepts);
1070	}
1071
1072	/**
1073	* Checks whether an updateable scheme produces the same model when
1074	* trained incrementally as when batch trained. The model itself
1075	* cannot be compared, so we compare the evaluation on test data
1076	* for both models. It is possible to get a false positive on this
1077	* test (likelihood depends on the classifier).
1078	*
1079	* @param nominalPredictor if true use nominal predictor attributes
1080	* @param numericPredictor if true use numeric predictor attributes
1081	* @param stringPredictor if true use string predictor attributes
1082	* @param datePredictor if true use date predictor attributes
1083	* @param relationalPredictor if true use relational predictor attributes
1084	* @param multiInstance whether multi-instance is needed
1085	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1086	* @return index 0 is true if the test was passed
1087	*/
1088	protected boolean[] updatingEquality(
1089	boolean nominalPredictor,
1090	boolean numericPredictor,
1091	boolean stringPredictor,
1092	boolean datePredictor,
1093	boolean relationalPredictor,
1094	boolean multiInstance,
1095	int classType) {
1096
1097	print("incremental training produces the same results"
1098	+ " as batch training");
1099	printAttributeSummary(
1100	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
1101	print("...");
1102	int numTrain = getNumInstances(), numTest = getNumInstances(),
1103	numClasses = 2, missingLevel = 0;
1104	boolean predictorMissing = false, classMissing = false;
1105
1106	boolean[] result = new boolean[2];
1107	Instances train = null;
1108	Instances test = null;
1109	Classifier [] classifiers = null;
1110	Evaluation evaluationB = null;
1111	Evaluation evaluationI = null;
1112	boolean built = false;
1113	try {
1114	train = makeTestDataset(42, numTrain,
1115	nominalPredictor ? getNumNominal() : 0,
1116	numericPredictor ? getNumNumeric() : 0,
1117	stringPredictor ? getNumString() : 0,
1118	datePredictor ? getNumDate() : 0,
1119	relationalPredictor ? getNumRelational() : 0,
1120	numClasses,
1121	classType,
1122	multiInstance);
1123	test = makeTestDataset(24, numTest,
1124	nominalPredictor ? getNumNominal() : 0,
1125	numericPredictor ? getNumNumeric() : 0,
1126	stringPredictor ? getNumString() : 0,
1127	datePredictor ? getNumDate() : 0,
1128	relationalPredictor ? getNumRelational() : 0,
1129	numClasses,
1130	classType,
1131	multiInstance);
1132	if (missingLevel > 0) {
1133	addMissing(train, missingLevel, predictorMissing, classMissing);
1134	addMissing(test, Math.min(missingLevel, 50), predictorMissing,
1135	classMissing);
1136	}
1137	classifiers = AbstractClassifier.makeCopies(getClassifier(), 2);
1138	evaluationB = new Evaluation(train);
1139	evaluationI = new Evaluation(train);
1140	classifiers[0].buildClassifier(train);
1141	testWRTZeroR(classifiers[0], evaluationB, train, test);
1142	} catch (Exception ex) {
1143	throw new Error("Error setting up for tests: " + ex.getMessage());
1144	}
1145	try {
1146	classifiers[1].buildClassifier(new Instances(train, 0));
1147	for (int i = 0; i < train.numInstances(); i++) {
1148	((UpdateableClassifier)classifiers[1]).updateClassifier(
1149	train.instance(i));
1150	}
1151	built = true;
1152	testWRTZeroR(classifiers[1], evaluationI, train, test);
1153	if (!evaluationB.equals(evaluationI)) {
1154	println("no");
1155	result[0] = false;
1156
1157	if (m_Debug) {
1158	println("\n=== Full Report ===");
1159	println("Results differ between batch and "
1160	+ "incrementally built models.\n"
1161	+ "Depending on the classifier, this may be OK");
1162	println("Here are the results:\n");
1163	println(evaluationB.toSummaryString(
1164	"\nbatch built results\n", true));
1165	println(evaluationI.toSummaryString(
1166	"\nincrementally built results\n", true));
1167	println("Here are the datasets:\n");
1168	println("=== Train Dataset ===\n"
1169	+ train.toString() + "\n");
1170	println("=== Test Dataset ===\n"
1171	+ test.toString() + "\n\n");
1172	}
1173	}
1174	else {
1175	println("yes");
1176	result[0] = true;
1177	}
1178	} catch (Exception ex) {
1179	result[0] = false;
1180
1181	print("Problem during");
1182	if (built)
1183	print(" testing");
1184	else
1185	print(" training");
1186	println(": " + ex.getMessage() + "\n");
1187	}
1188
1189	return result;
1190	}
1191
1192	/**
1193	* Checks whether the classifier erroneously uses the class
1194	* value of test instances (if provided). Runs the classifier with
1195	* test instance class values set to missing and compares with results
1196	* when test instance class values are left intact.
1197	*
1198	* @param nominalPredictor if true use nominal predictor attributes
1199	* @param numericPredictor if true use numeric predictor attributes
1200	* @param stringPredictor if true use string predictor attributes
1201	* @param datePredictor if true use date predictor attributes
1202	* @param relationalPredictor if true use relational predictor attributes
1203	* @param multiInstance whether multi-instance is needed
1204	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1205	* @return index 0 is true if the test was passed
1206	*/
1207	protected boolean[] doesntUseTestClassVal(
1208	boolean nominalPredictor,
1209	boolean numericPredictor,
1210	boolean stringPredictor,
1211	boolean datePredictor,
1212	boolean relationalPredictor,
1213	boolean multiInstance,
1214	int classType) {
1215
1216	print("classifier ignores test instance class vals");
1217	printAttributeSummary(
1218	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
1219	print("...");
1220	int numTrain = 2*getNumInstances(), numTest = getNumInstances(),
1221	numClasses = 2, missingLevel = 0;
1222	boolean predictorMissing = false, classMissing = false;
1223
1224	boolean[] result = new boolean[2];
1225	Instances train = null;
1226	Instances test = null;
1227	Classifier [] classifiers = null;
1228	boolean evalFail = false;
1229	try {
1230	train = makeTestDataset(42, numTrain,
1231	nominalPredictor ? getNumNominal() + 1 : 0,
1232	numericPredictor ? getNumNumeric() + 1 : 0,
1233	stringPredictor ? getNumString() : 0,
1234	datePredictor ? getNumDate() : 0,
1235	relationalPredictor ? getNumRelational() : 0,
1236	numClasses,
1237	classType,
1238	multiInstance);
1239	test = makeTestDataset(24, numTest,
1240	nominalPredictor ? getNumNominal() + 1 : 0,
1241	numericPredictor ? getNumNumeric() + 1 : 0,
1242	stringPredictor ? getNumString() : 0,
1243	datePredictor ? getNumDate() : 0,
1244	relationalPredictor ? getNumRelational() : 0,
1245	numClasses,
1246	classType,
1247	multiInstance);
1248	if (missingLevel > 0) {
1249	addMissing(train, missingLevel, predictorMissing, classMissing);
1250	addMissing(test, Math.min(missingLevel, 50), predictorMissing,
1251	classMissing);
1252	}
1253	classifiers = AbstractClassifier.makeCopies(getClassifier(), 2);
1254	classifiers[0].buildClassifier(train);
1255	classifiers[1].buildClassifier(train);
1256	} catch (Exception ex) {
1257	throw new Error("Error setting up for tests: " + ex.getMessage());
1258	}
1259	try {
1260
1261	// Now set test values to missing when predicting
1262	for (int i = 0; i < test.numInstances(); i++) {
1263	Instance testInst = test.instance(i);
1264	Instance classMissingInst = (Instance)testInst.copy();
1265	classMissingInst.setDataset(test);
1266	classMissingInst.setClassMissing();
1267	double [] dist0 = classifiers[0].distributionForInstance(testInst);
1268	double [] dist1 = classifiers[1].distributionForInstance(classMissingInst);
1269	for (int j = 0; j < dist0.length; j++) {
1270	// ignore, if both are NaNs
1271	if (Double.isNaN(dist0[j]) && Double.isNaN(dist1[j])) {
1272	if (getDebug())
1273	System.out.println("Both predictions are NaN!");
1274	continue;
1275	}
1276	// distribution different?
1277	if (dist0[j] != dist1[j]) {
1278	throw new Exception("Prediction different for instance " + (i + 1));
1279	}
1280	}
1281	}
1282
1283	println("yes");
1284	result[0] = true;
1285	} catch (Exception ex) {
1286	println("no");
1287	result[0] = false;
1288
1289	if (m_Debug) {
1290	println("\n=== Full Report ===");
1291
1292	if (evalFail) {
1293	println("Results differ between non-missing and "
1294	+ "missing test class values.");
1295	} else {
1296	print("Problem during testing");
1297	println(": " + ex.getMessage() + "\n");
1298	}
1299	println("Here are the datasets:\n");
1300	println("=== Train Dataset ===\n"
1301	+ train.toString() + "\n");
1302	println("=== Train Weights ===\n");
1303	for (int i = 0; i < train.numInstances(); i++) {
1304	println(" " + (i + 1)
1305	+ " " + train.instance(i).weight());
1306	}
1307	println("=== Test Dataset ===\n"
1308	+ test.toString() + "\n\n");
1309	println("(test weights all 1.0\n");
1310	}
1311	}
1312
1313	return result;
1314	}
1315
1316	/**
1317	* Checks whether the classifier can handle instance weights.
1318	* This test compares the classifier performance on two datasets
1319	* that are identical except for the training weights. If the
1320	* results change, then the classifier must be using the weights. It
1321	* may be possible to get a false positive from this test if the
1322	* weight changes aren't significant enough to induce a change
1323	* in classifier performance (but the weights are chosen to minimize
1324	* the likelihood of this).
1325	*
1326	* @param nominalPredictor if true use nominal predictor attributes
1327	* @param numericPredictor if true use numeric predictor attributes
1328	* @param stringPredictor if true use string predictor attributes
1329	* @param datePredictor if true use date predictor attributes
1330	* @param relationalPredictor if true use relational predictor attributes
1331	* @param multiInstance whether multi-instance is needed
1332	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1333	* @return index 0 true if the test was passed
1334	*/
1335	protected boolean[] instanceWeights(
1336	boolean nominalPredictor,
1337	boolean numericPredictor,
1338	boolean stringPredictor,
1339	boolean datePredictor,
1340	boolean relationalPredictor,
1341	boolean multiInstance,
1342	int classType) {
1343
1344	print("classifier uses instance weights");
1345	printAttributeSummary(
1346	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
1347	print("...");
1348	int numTrain = 2*getNumInstances(), numTest = getNumInstances(),
1349	numClasses = 2, missingLevel = 0;
1350	boolean predictorMissing = false, classMissing = false;
1351
1352	boolean[] result = new boolean[2];
1353	Instances train = null;
1354	Instances test = null;
1355	Classifier [] classifiers = null;
1356	Evaluation evaluationB = null;
1357	Evaluation evaluationI = null;
1358	boolean built = false;
1359	boolean evalFail = false;
1360	try {
1361	train = makeTestDataset(42, numTrain,
1362	nominalPredictor ? getNumNominal() + 1 : 0,
1363	numericPredictor ? getNumNumeric() + 1 : 0,
1364	stringPredictor ? getNumString() : 0,
1365	datePredictor ? getNumDate() : 0,
1366	relationalPredictor ? getNumRelational() : 0,
1367	numClasses,
1368	classType,
1369	multiInstance);
1370	test = makeTestDataset(24, numTest,
1371	nominalPredictor ? getNumNominal() + 1 : 0,
1372	numericPredictor ? getNumNumeric() + 1 : 0,
1373	stringPredictor ? getNumString() : 0,
1374	datePredictor ? getNumDate() : 0,
1375	relationalPredictor ? getNumRelational() : 0,
1376	numClasses,
1377	classType,
1378	multiInstance);
1379	if (missingLevel > 0) {
1380	addMissing(train, missingLevel, predictorMissing, classMissing);
1381	addMissing(test, Math.min(missingLevel, 50), predictorMissing,
1382	classMissing);
1383	}
1384	classifiers = AbstractClassifier.makeCopies(getClassifier(), 2);
1385	evaluationB = new Evaluation(train);
1386	evaluationI = new Evaluation(train);
1387	classifiers[0].buildClassifier(train);
1388	testWRTZeroR(classifiers[0], evaluationB, train, test);
1389	} catch (Exception ex) {
1390	throw new Error("Error setting up for tests: " + ex.getMessage());
1391	}
1392	try {
1393
1394	// Now modify instance weights and re-built/test
1395	for (int i = 0; i < train.numInstances(); i++) {
1396	train.instance(i).setWeight(0);
1397	}
1398	Random random = new Random(1);
1399	for (int i = 0; i < train.numInstances() / 2; i++) {
1400	int inst = Math.abs(random.nextInt()) % train.numInstances();
1401	int weight = Math.abs(random.nextInt()) % 10 + 1;
1402	train.instance(inst).setWeight(weight);
1403	}
1404	classifiers[1].buildClassifier(train);
1405	built = true;
1406	testWRTZeroR(classifiers[1], evaluationI, train, test);
1407	if (evaluationB.equals(evaluationI)) {
1408	// println("no");
1409	evalFail = true;
1410	throw new Exception("evalFail");
1411	}
1412
1413	println("yes");
1414	result[0] = true;
1415	} catch (Exception ex) {
1416	println("no");
1417	result[0] = false;
1418
1419	if (m_Debug) {
1420	println("\n=== Full Report ===");
1421
1422	if (evalFail) {
1423	println("Results don't differ between non-weighted and "
1424	+ "weighted instance models.");
1425	println("Here are the results:\n");
1426	println(evaluationB.toSummaryString("\nboth methods\n",
1427	true));
1428	} else {
1429	print("Problem during");
1430	if (built) {
1431	print(" testing");
1432	} else {
1433	print(" training");
1434	}
1435	println(": " + ex.getMessage() + "\n");
1436	}
1437	println("Here are the datasets:\n");
1438	println("=== Train Dataset ===\n"
1439	+ train.toString() + "\n");
1440	println("=== Train Weights ===\n");
1441	for (int i = 0; i < train.numInstances(); i++) {
1442	println(" " + (i + 1)
1443	+ " " + train.instance(i).weight());
1444	}
1445	println("=== Test Dataset ===\n"
1446	+ test.toString() + "\n\n");
1447	println("(test weights all 1.0\n");
1448	}
1449	}
1450
1451	return result;
1452	}
1453
1454	/**
1455	* Checks whether the scheme alters the training dataset during
1456	* training. If the scheme needs to modify the training
1457	* data it should take a copy of the training data. Currently checks
1458	* for changes to header structure, number of instances, order of
1459	* instances, instance weights.
1460	*
1461	* @param nominalPredictor if true use nominal predictor attributes
1462	* @param numericPredictor if true use numeric predictor attributes
1463	* @param stringPredictor if true use string predictor attributes
1464	* @param datePredictor if true use date predictor attributes
1465	* @param relationalPredictor if true use relational predictor attributes
1466	* @param multiInstance whether multi-instance is needed
1467	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1468	* @param predictorMissing true if we know the classifier can handle
1469	* (at least) moderate missing predictor values
1470	* @param classMissing true if we know the classifier can handle
1471	* (at least) moderate missing class values
1472	* @return index 0 is true if the test was passed
1473	*/
1474	protected boolean[] datasetIntegrity(
1475	boolean nominalPredictor,
1476	boolean numericPredictor,
1477	boolean stringPredictor,
1478	boolean datePredictor,
1479	boolean relationalPredictor,
1480	boolean multiInstance,
1481	int classType,
1482	boolean predictorMissing,
1483	boolean classMissing) {
1484
1485	print("classifier doesn't alter original datasets");
1486	printAttributeSummary(
1487	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, classType);
1488	print("...");
1489	int numTrain = getNumInstances(), numTest = getNumInstances(),
1490	numClasses = 2, missingLevel = 20;
1491
1492	boolean[] result = new boolean[2];
1493	Instances train = null;
1494	Instances test = null;
1495	Classifier classifier = null;
1496	Evaluation evaluation = null;
1497	boolean built = false;
1498	try {
1499	train = makeTestDataset(42, numTrain,
1500	nominalPredictor ? getNumNominal() : 0,
1501	numericPredictor ? getNumNumeric() : 0,
1502	stringPredictor ? getNumString() : 0,
1503	datePredictor ? getNumDate() : 0,
1504	relationalPredictor ? getNumRelational() : 0,
1505	numClasses,
1506	classType,
1507	multiInstance);
1508	test = makeTestDataset(24, numTest,
1509	nominalPredictor ? getNumNominal() : 0,
1510	numericPredictor ? getNumNumeric() : 0,
1511	stringPredictor ? getNumString() : 0,
1512	datePredictor ? getNumDate() : 0,
1513	relationalPredictor ? getNumRelational() : 0,
1514	numClasses,
1515	classType,
1516	multiInstance);
1517	if (missingLevel > 0) {
1518	addMissing(train, missingLevel, predictorMissing, classMissing);
1519	addMissing(test, Math.min(missingLevel, 50), predictorMissing,
1520	classMissing);
1521	}
1522	classifier = AbstractClassifier.makeCopies(getClassifier(), 1)[0];
1523	evaluation = new Evaluation(train);
1524	} catch (Exception ex) {
1525	throw new Error("Error setting up for tests: " + ex.getMessage());
1526	}
1527	try {
1528	Instances trainCopy = new Instances(train);
1529	Instances testCopy = new Instances(test);
1530	classifier.buildClassifier(trainCopy);
1531	compareDatasets(train, trainCopy);
1532	built = true;
1533	testWRTZeroR(classifier, evaluation, trainCopy, testCopy);
1534	compareDatasets(test, testCopy);
1535
1536	println("yes");
1537	result[0] = true;
1538	} catch (Exception ex) {
1539	println("no");
1540	result[0] = false;
1541
1542	if (m_Debug) {
1543	println("\n=== Full Report ===");
1544	print("Problem during");
1545	if (built) {
1546	print(" testing");
1547	} else {
1548	print(" training");
1549	}
1550	println(": " + ex.getMessage() + "\n");
1551	println("Here are the datasets:\n");
1552	println("=== Train Dataset ===\n"
1553	+ train.toString() + "\n");
1554	println("=== Test Dataset ===\n"
1555	+ test.toString() + "\n\n");
1556	}
1557	}
1558
1559	return result;
1560	}
1561
1562	/**
1563	* Runs a text on the datasets with the given characteristics.
1564	*
1565	* @param nominalPredictor if true use nominal predictor attributes
1566	* @param numericPredictor if true use numeric predictor attributes
1567	* @param stringPredictor if true use string predictor attributes
1568	* @param datePredictor if true use date predictor attributes
1569	* @param relationalPredictor if true use relational predictor attributes
1570	* @param multiInstance whether multi-instance is needed
1571	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1572	* @param missingLevel the percentage of missing values
1573	* @param predictorMissing true if the missing values may be in
1574	* the predictors
1575	* @param classMissing true if the missing values may be in the class
1576	* @param numTrain the number of instances in the training set
1577	* @param numTest the number of instaces in the test set
1578	* @param numClasses the number of classes
1579	* @param accepts the acceptable string in an exception
1580	* @return index 0 is true if the test was passed, index 1 is true if test
1581	* was acceptable
1582	*/
1583	protected boolean[] runBasicTest(boolean nominalPredictor,
1584	boolean numericPredictor,
1585	boolean stringPredictor,
1586	boolean datePredictor,
1587	boolean relationalPredictor,
1588	boolean multiInstance,
1589	int classType,
1590	int missingLevel,
1591	boolean predictorMissing,
1592	boolean classMissing,
1593	int numTrain,
1594	int numTest,
1595	int numClasses,
1596	FastVector accepts) {
1597
1598	return runBasicTest(
1599	nominalPredictor,
1600	numericPredictor,
1601	stringPredictor,
1602	datePredictor,
1603	relationalPredictor,
1604	multiInstance,
1605	classType,
1606	TestInstances.CLASS_IS_LAST,
1607	missingLevel,
1608	predictorMissing,
1609	classMissing,
1610	numTrain,
1611	numTest,
1612	numClasses,
1613	accepts);
1614	}
1615
1616	/**
1617	* Runs a text on the datasets with the given characteristics.
1618	*
1619	* @param nominalPredictor if true use nominal predictor attributes
1620	* @param numericPredictor if true use numeric predictor attributes
1621	* @param stringPredictor if true use string predictor attributes
1622	* @param datePredictor if true use date predictor attributes
1623	* @param relationalPredictor if true use relational predictor attributes
1624	* @param multiInstance whether multi-instance is needed
1625	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1626	* @param classIndex the attribute index of the class
1627	* @param missingLevel the percentage of missing values
1628	* @param predictorMissing true if the missing values may be in
1629	* the predictors
1630	* @param classMissing true if the missing values may be in the class
1631	* @param numTrain the number of instances in the training set
1632	* @param numTest the number of instaces in the test set
1633	* @param numClasses the number of classes
1634	* @param accepts the acceptable string in an exception
1635	* @return index 0 is true if the test was passed, index 1 is true if test
1636	* was acceptable
1637	*/
1638	protected boolean[] runBasicTest(boolean nominalPredictor,
1639	boolean numericPredictor,
1640	boolean stringPredictor,
1641	boolean datePredictor,
1642	boolean relationalPredictor,
1643	boolean multiInstance,
1644	int classType,
1645	int classIndex,
1646	int missingLevel,
1647	boolean predictorMissing,
1648	boolean classMissing,
1649	int numTrain,
1650	int numTest,
1651	int numClasses,
1652	FastVector accepts) {
1653
1654	boolean[] result = new boolean[2];
1655	Instances train = null;
1656	Instances test = null;
1657	Classifier classifier = null;
1658	Evaluation evaluation = null;
1659	boolean built = false;
1660	try {
1661	train = makeTestDataset(42, numTrain,
1662	nominalPredictor ? getNumNominal() : 0,
1663	numericPredictor ? getNumNumeric() : 0,
1664	stringPredictor ? getNumString() : 0,
1665	datePredictor ? getNumDate() : 0,
1666	relationalPredictor ? getNumRelational() : 0,
1667	numClasses,
1668	classType,
1669	classIndex,
1670	multiInstance);
1671	test = makeTestDataset(24, numTest,
1672	nominalPredictor ? getNumNominal() : 0,
1673	numericPredictor ? getNumNumeric() : 0,
1674	stringPredictor ? getNumString() : 0,
1675	datePredictor ? getNumDate() : 0,
1676	relationalPredictor ? getNumRelational() : 0,
1677	numClasses,
1678	classType,
1679	classIndex,
1680	multiInstance);
1681	if (missingLevel > 0) {
1682	addMissing(train, missingLevel, predictorMissing, classMissing);
1683	addMissing(test, Math.min(missingLevel, 50), predictorMissing,
1684	classMissing);
1685	}
1686	classifier = AbstractClassifier.makeCopies(getClassifier(), 1)[0];
1687	evaluation = new Evaluation(train);
1688	} catch (Exception ex) {
1689	ex.printStackTrace();
1690	throw new Error("Error setting up for tests: " + ex.getMessage());
1691	}
1692	try {
1693	classifier.buildClassifier(train);
1694	built = true;
1695	if (!testWRTZeroR(classifier, evaluation, train, test)[0]) {
1696	result[0] = true;
1697	result[1] = true;
1698	throw new Exception("Scheme performs worse than ZeroR");
1699	}
1700
1701	println("yes");
1702	result[0] = true;
1703	}
1704	catch (Exception ex) {
1705	boolean acceptable = false;
1706	String msg;
1707	if (ex.getMessage() == null)
1708	msg = "";
1709	else
1710	msg = ex.getMessage().toLowerCase();
1711	if (msg.indexOf("not in classpath") > -1)
1712	m_ClasspathProblems = true;
1713	if (msg.indexOf("worse than zeror") >= 0) {
1714	println("warning: performs worse than ZeroR");
1715	result[0] = true;
1716	result[1] = true;
1717	} else {
1718	for (int i = 0; i < accepts.size(); i++) {
1719	if (msg.indexOf((String)accepts.elementAt(i)) >= 0) {
1720	acceptable = true;
1721	}
1722	}
1723
1724	println("no" + (acceptable ? " (OK error message)" : ""));
1725	result[1] = acceptable;
1726	}
1727
1728	if (m_Debug) {
1729	println("\n=== Full Report ===");
1730	print("Problem during");
1731	if (built) {
1732	print(" testing");
1733	} else {
1734	print(" training");
1735	}
1736	println(": " + ex.getMessage() + "\n");
1737	if (!acceptable) {
1738	if (accepts.size() > 0) {
1739	print("Error message doesn't mention ");
1740	for (int i = 0; i < accepts.size(); i++) {
1741	if (i != 0) {
1742	print(" or ");
1743	}
1744	print('"' + (String)accepts.elementAt(i) + '"');
1745	}
1746	}
1747	println("here are the datasets:\n");
1748	println("=== Train Dataset ===\n"
1749	+ train.toString() + "\n");
1750	println("=== Test Dataset ===\n"
1751	+ test.toString() + "\n\n");
1752	}
1753	}
1754	}
1755
1756	return result;
1757	}
1758
1759	/**
1760	* Determine whether the scheme performs worse than ZeroR during testing
1761	*
1762	* @param classifier the pre-trained classifier
1763	* @param evaluation the classifier evaluation object
1764	* @param train the training data
1765	* @param test the test data
1766	* @return index 0 is true if the scheme performs better than ZeroR
1767	* @throws Exception if there was a problem during the scheme's testing
1768	*/
1769	protected boolean[] testWRTZeroR(Classifier classifier,
1770	Evaluation evaluation,
1771	Instances train, Instances test)
1772	throws Exception {
1773
1774	boolean[] result = new boolean[2];
1775
1776	evaluation.evaluateModel(classifier, test);
1777	try {
1778
1779	// Tested OK, compare with ZeroR
1780	Classifier zeroR = new weka.classifiers.rules.ZeroR();
1781	zeroR.buildClassifier(train);
1782	Evaluation zeroREval = new Evaluation(train);
1783	zeroREval.evaluateModel(zeroR, test);
1784	result[0] = Utils.grOrEq(zeroREval.errorRate(), evaluation.errorRate());
1785	}
1786	catch (Exception ex) {
1787	throw new Error("Problem determining ZeroR performance: "
1788	+ ex.getMessage());
1789	}
1790
1791	return result;
1792	}
1793
1794	/**
1795	* Make a simple set of instances, which can later be modified
1796	* for use in specific tests.
1797	*
1798	* @param seed the random number seed
1799	* @param numInstances the number of instances to generate
1800	* @param numNominal the number of nominal attributes
1801	* @param numNumeric the number of numeric attributes
1802	* @param numString the number of string attributes
1803	* @param numDate the number of date attributes
1804	* @param numRelational the number of relational attributes
1805	* @param numClasses the number of classes (if nominal class)
1806	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1807	* @param multiInstance whether the dataset should a multi-instance dataset
1808	* @return the test dataset
1809	* @throws Exception if the dataset couldn't be generated
1810	* @see #process(Instances)
1811	*/
1812	protected Instances makeTestDataset(int seed, int numInstances,
1813	int numNominal, int numNumeric,
1814	int numString, int numDate,
1815	int numRelational,
1816	int numClasses, int classType,
1817	boolean multiInstance)
1818	throws Exception {
1819
1820	return makeTestDataset(
1821	seed,
1822	numInstances,
1823	numNominal,
1824	numNumeric,
1825	numString,
1826	numDate,
1827	numRelational,
1828	numClasses,
1829	classType,
1830	TestInstances.CLASS_IS_LAST,
1831	multiInstance);
1832	}
1833
1834	/**
1835	* Make a simple set of instances with variable position of the class
1836	* attribute, which can later be modified for use in specific tests.
1837	*
1838	* @param seed the random number seed
1839	* @param numInstances the number of instances to generate
1840	* @param numNominal the number of nominal attributes
1841	* @param numNumeric the number of numeric attributes
1842	* @param numString the number of string attributes
1843	* @param numDate the number of date attributes
1844	* @param numRelational the number of relational attributes
1845	* @param numClasses the number of classes (if nominal class)
1846	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1847	* @param classIndex the index of the class (0-based, -1 as last)
1848	* @param multiInstance whether the dataset should a multi-instance dataset
1849	* @return the test dataset
1850	* @throws Exception if the dataset couldn't be generated
1851	* @see TestInstances#CLASS_IS_LAST
1852	* @see #process(Instances)
1853	*/
1854	protected Instances makeTestDataset(int seed, int numInstances,
1855	int numNominal, int numNumeric,
1856	int numString, int numDate,
1857	int numRelational,
1858	int numClasses, int classType,
1859	int classIndex,
1860	boolean multiInstance)
1861	throws Exception {
1862
1863	TestInstances dataset = new TestInstances();
1864
1865	dataset.setSeed(seed);
1866	dataset.setNumInstances(numInstances);
1867	dataset.setNumNominal(numNominal);
1868	dataset.setNumNumeric(numNumeric);
1869	dataset.setNumString(numString);
1870	dataset.setNumDate(numDate);
1871	dataset.setNumRelational(numRelational);
1872	dataset.setNumClasses(numClasses);
1873	dataset.setClassType(classType);
1874	dataset.setClassIndex(classIndex);
1875	dataset.setNumClasses(numClasses);
1876	dataset.setMultiInstance(multiInstance);
1877	dataset.setWords(getWords());
1878	dataset.setWordSeparators(getWordSeparators());
1879
1880	return process(dataset.generate());
1881	}
1882
1883	/**
1884	* Print out a short summary string for the dataset characteristics
1885	*
1886	* @param nominalPredictor true if nominal predictor attributes are present
1887	* @param numericPredictor true if numeric predictor attributes are present
1888	* @param stringPredictor true if string predictor attributes are present
1889	* @param datePredictor true if date predictor attributes are present
1890	* @param relationalPredictor true if relational predictor attributes are present
1891	* @param multiInstance whether multi-instance is needed
1892	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1893	*/
1894	protected void printAttributeSummary(boolean nominalPredictor,
1895	boolean numericPredictor,
1896	boolean stringPredictor,
1897	boolean datePredictor,
1898	boolean relationalPredictor,
1899	boolean multiInstance,
1900	int classType) {
1901
1902	String str = "";
1903
1904	if (numericPredictor)
1905	str += " numeric";
1906
1907	if (nominalPredictor) {
1908	if (str.length() > 0)
1909	str += " &";
1910	str += " nominal";
1911	}
1912
1913	if (stringPredictor) {
1914	if (str.length() > 0)
1915	str += " &";
1916	str += " string";
1917	}
1918
1919	if (datePredictor) {
1920	if (str.length() > 0)
1921	str += " &";
1922	str += " date";
1923	}
1924
1925	if (relationalPredictor) {
1926	if (str.length() > 0)
1927	str += " &";
1928	str += " relational";
1929	}
1930
1931	str += " predictors)";
1932
1933	switch (classType) {
1934	case Attribute.NUMERIC:
1935	str = " (numeric class," + str;
1936	break;
1937	case Attribute.NOMINAL:
1938	str = " (nominal class," + str;
1939	break;
1940	case Attribute.STRING:
1941	str = " (string class," + str;
1942	break;
1943	case Attribute.DATE:
1944	str = " (date class," + str;
1945	break;
1946	case Attribute.RELATIONAL:
1947	str = " (relational class," + str;
1948	break;
1949	}
1950
1951	print(str);
1952	}
1953
1954	/**
1955	* Returns the revision string.
1956	*
1957	* @return the revision
1958	*/
1959	public String getRevision() {
1960	return RevisionUtils.extract("$Revision: 6041 $");
1961	}
1962
1963	/**
1964	* Test method for this class
1965	*
1966	* @param args the commandline parameters
1967	*/
1968	public static void main(String [] args) {
1969	runCheck(new CheckClassifier(), args);
1970	}
1971	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: