Context Navigation

source: src/main/java/weka/estimators/CheckEstimator.java @ 13

Last change on this file since 13 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 64.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* CheckEstimator.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.estimators;
24
25	import weka.core.Attribute;
26	import weka.core.FastVector;
27	import weka.core.Instance;
28	import weka.core.Instances;
29	import weka.core.Option;
30	import weka.core.OptionHandler;
31	import weka.core.RevisionHandler;
32	import weka.core.RevisionUtils;
33	import weka.core.TestInstances;
34	import weka.core.Utils;
35	import weka.core.WeightedInstancesHandler;
36
37	import java.util.Enumeration;
38	import java.util.Random;
39	import java.util.Vector;
40
41	/**
42	* Class for examining the capabilities and finding problems with
43	* estimators. If you implement an estimator using the WEKA libraries,
44	* you should run the checks on it to ensure robustness and correct
45	* operation. Passing all the tests of this object does not mean
46	* bugs in the estimator don't exist, but this will help find some
47	* common ones. <p/>
48	*
49	* Typical usage: <p/>
50	* <code>java weka.estimators.CheckEstimator -W estimator_name
51	* estimator_options </code><p/>
52	*
53	* This class uses code from the CheckEstimatorClass
54	* ATTENTION! Current estimators can only
55	* 1. split on a nominal class attribute
56	* 2. build estimators for nominal and numeric attributes
57	* 3. build estimators independently of the class type
58	* The functionality to test on other class and attribute types
59	* is left in big parts in the code.
60	*
61	* CheckEstimator reports on the following:
62	* <ul>
63	* <li> Estimator abilities
64	* <ul>
65	* <li> Possible command line options to the estimator </li>
66	* <li> Whether the estimator can predict nominal, numeric, string,
67	* date or relational class attributes. Warnings will be displayed if
68	* performance is worse than ZeroR </li>
69	* <li> Whether the estimator can be trained incrementally </li>
70	* <li> Whether the estimator can build estimates for numeric attributes </li>
71	* <li> Whether the estimator can handle nominal attributes </li>
72	* <li> Whether the estimator can handle string attributes </li>
73	* <li> Whether the estimator can handle date attributes </li>
74	* <li> Whether the estimator can handle relational attributes </li>
75	* <li> Whether the estimator can build estimates for multi-instance data </li>
76	* <li> Whether the estimator can handle missing attribute values </li>
77	* <li> Whether the estimator can handle missing class values </li>
78	* <li> Whether a nominal estimator only handles 2 class problems </li>
79	* <li> Whether the estimator can handle instance weights </li>
80	* </ul>
81	* </li>
82	* <li> Correct functioning
83	* <ul>
84	* <li> Correct initialisation during addvalues (i.e. no result
85	* changes when addValues called repeatedly) </li>
86	* <li> Whether incremental training produces the same results
87	* as during non-incremental training (which may or may not
88	* be OK) </li>
89	* <li> Whether the estimator alters the data passed to it
90	* (number of instances, instance order, instance weights, etc) </li>
91	* </ul>
92	* </li>
93	* <li> Degenerate cases
94	* <ul>
95	* <li> building estimator with zero training instances </li>
96	* <li> all but one attribute value missing </li>
97	* <li> all attribute values missing </li>
98	* <li> all but one class values missing </li>
99	* <li> all class values missing </li>
100	* </ul>
101	* </li>
102	* </ul>
103	* Running CheckEstimator with the debug option set will output the
104	* training and test datasets for any failed tests.<p/>
105	*
106	* The <code>weka.estimators.AbstractEstimatorTest</code> uses this
107	* class to test all the estimators. Any changes here, have to be
108	* checked in that abstract test class, too. <p/>
109	*
110	<!-- options-start -->
111	* Valid options are: <p/>
112	*
113	* <pre> -D
114	* Turn on debugging output.</pre>
115	*
116	* <pre> -S
117	* Silent mode - prints nothing to stdout.</pre>
118	*
119	* <pre> -N <num>
120	* The number of instances in the datasets (default 100).</pre>
121	*
122	* <pre> -W
123	* Full name of the estimator analysed.
124	* eg: weka.estimators.NormalEstimator</pre>
125	*
126	* <pre>
127	* Options specific to estimator weka.estimators.NormalEstimator:
128	* </pre>
129	*
130	* <pre> -D
131	* If set, estimator is run in debug mode and
132	* may output additional info to the console</pre>
133	*
134	<!-- options-end -->
135	*
136	* Options after -- are passed to the designated estimator.<p/>
137	*
138	* @author Len Trigg (trigg@cs.waikato.ac.nz)
139	* @author FracPete (fracpete at waikato dot ac dot nz)
140	* @version $Revision: 4997 $
141	* @see TestInstances
142	*/
143	public class CheckEstimator implements OptionHandler, RevisionHandler {
144
145	/*
146	* Note about test methods:
147	* - methods return array of booleans
148	* - first index: success or not
149	* - second index: acceptable or not (e.g., Exception is OK)
150	* - in case the performance is worse than that of ZeroR both indices are true
151	*
152	* FracPete (fracpete at waikato dot ac dot nz)
153	*/
154
155	/** a class for postprocessing the test-data
156	*/
157	public class PostProcessor
158	implements RevisionHandler {
159	/**
160	* Provides a hook for derived classes to further modify the data. Currently,
161	* the data is just passed through.
162	*
163	* @param data the data to process
164	* @return the processed data
165	*/
166	protected Instances process(Instances data) {
167	return data;
168	}
169
170	/**
171	* Returns the revision string.
172	*
173	* @return the revision
174	*/
175	public String getRevision() {
176	return RevisionUtils.extract("$Revision: 4997 $");
177	}
178	}
179
180	/*** The estimator to be examined */
181	protected Estimator m_Estimator = (Estimator) new weka.estimators.NormalEstimator(0.000001);
182
183	/** The options to be passed to the base estimator. */
184	protected String[] m_EstimatorOptions;
185
186	/** The results of the analysis as a string */
187	protected String m_AnalysisResults;
188
189	/** Debugging mode, gives extra output if true */
190	protected boolean m_Debug = false;
191
192	/** Silent mode, for no output at all to stdout */
193	protected boolean m_Silent = false;
194
195	/** The number of instances in the datasets */
196	protected int m_NumInstances = 100;
197
198	/** for post-processing the data even further */
199	protected PostProcessor m_PostProcessor = null;
200
201	/** whether classpath problems occurred */
202	protected boolean m_ClasspathProblems = false;
203
204	/**
205	* class that contains info about the attribute types the estimator can estimate
206	* estimator work on one attribute only
207	*/
208	public static class AttrTypes
209	implements RevisionHandler {
210
211	boolean nominal = false;
212	boolean numeric = false;
213	boolean string = false;
214	boolean date = false;
215	boolean relational = false;
216
217	AttrTypes() {
218	}
219
220	AttrTypes (AttrTypes newTypes) {
221	nominal = newTypes.nominal;
222	numeric = newTypes.numeric;
223	string = newTypes.string;
224	date = newTypes.date;
225	relational = newTypes.relational;
226	}
227
228	AttrTypes (int type) {
229	if (type == Attribute.NOMINAL) nominal = true;
230	if (type == Attribute.NUMERIC) numeric = true;
231	if (type == Attribute.STRING) string = true;
232	if (type == Attribute.DATE) date = true;
233	if (type == Attribute.RELATIONAL) relational = true;
234	}
235
236	int getSetType() throws Exception {
237	int sum = 0;
238	int type = -1;
239	if (nominal) { sum ++; type = Attribute.NOMINAL; }
240	if (numeric) { sum ++; type = Attribute.NUMERIC; }
241	if (string) { sum ++; type = Attribute.STRING; }
242	if (date) { sum ++; type = Attribute.DATE; }
243	if (relational) { sum ++; type = Attribute.RELATIONAL; }
244	if (sum > 1)
245	throw new Exception("Expected to have only one type set used wrongly.");
246	if (type < 0)
247	throw new Exception("No type set.");
248	return type;
249	}
250
251	boolean oneIsSet() {
252	return (nominal \|\| numeric \|\| string \|\| date \|\| relational);
253	}
254
255	public Vector getVectorOfAttrTypes() {
256	Vector attrs = new Vector();
257	if (nominal) attrs.add(new Integer(Attribute.NOMINAL));
258	if (numeric) attrs.add(new Integer(Attribute.NUMERIC));
259	if (string) attrs.add(new Integer(Attribute.STRING));
260	if (date) attrs.add(new Integer(Attribute.DATE));
261	if (relational) attrs.add(new Integer(Attribute.RELATIONAL));
262	return attrs;
263	}
264
265	/**
266	* Returns the revision string.
267	*
268	* @return the revision
269	*/
270	public String getRevision() {
271	return RevisionUtils.extract("$Revision: 4997 $");
272	}
273	}
274
275	/**
276	* public class that contains info about the chosen attribute type
277	* estimator work on one attribute only
278	*/
279	public static class EstTypes
280	implements RevisionHandler {
281
282	boolean incremental = false;
283	boolean weighted = false;
284	boolean supervised = false;
285
286	/**
287	* Constructor
288	*/
289	public EstTypes () {
290	}
291
292	/**
293	* Constructor
294	*/
295	public EstTypes (boolean i, boolean w, boolean s) {
296	incremental = i;
297	weighted = w;
298	supervised = s;
299	}
300
301	/**
302	* Returns the revision string.
303	*
304	* @return the revision
305	*/
306	public String getRevision() {
307	return RevisionUtils.extract("$Revision: 4997 $");
308	}
309	}
310
311	/**
312	* Returns an enumeration describing the available options.
313	*
314	* @return an enumeration of all the available options.
315	*/
316	public Enumeration listOptions() {
317
318	Vector newVector = new Vector(2);
319
320	newVector.addElement(new Option(
321	"\tTurn on debugging output.",
322	"D", 0, "-D"));
323
324	newVector.addElement(new Option(
325	"\tSilent mode - prints nothing to stdout.",
326	"S", 0, "-S"));
327
328	newVector.addElement(new Option(
329	"\tThe number of instances in the datasets (default 100).",
330	"N", 1, "-N <num>"));
331
332	newVector.addElement(new Option(
333	"\tFull name of the estimator analysed.\n"
334	+"\teg: weka.estimators.NormalEstimator",
335	"W", 1, "-W"));
336
337	if ((m_Estimator != null)
338	&& (m_Estimator instanceof OptionHandler)) {
339	newVector.addElement(new Option("", "", 0,
340	"\nOptions specific to estimator "
341	+ m_Estimator.getClass().getName()
342	+ ":"));
343	Enumeration enu = ((OptionHandler)m_Estimator).listOptions();
344	while (enu.hasMoreElements())
345	newVector.addElement(enu.nextElement());
346	}
347
348	return newVector.elements();
349	}
350
351	/**
352	* Parses a given list of options.
353	*
354	<!-- options-start -->
355	* Valid options are: <p/>
356	*
357	* <pre> -D
358	* Turn on debugging output.</pre>
359	*
360	* <pre> -S
361	* Silent mode - prints nothing to stdout.</pre>
362	*
363	* <pre> -N <num>
364	* The number of instances in the datasets (default 100).</pre>
365	*
366	* <pre> -W
367	* Full name of the estimator analysed.
368	* eg: weka.estimators.NormalEstimator</pre>
369	*
370	* <pre>
371	* Options specific to estimator weka.estimators.NormalEstimator:
372	* </pre>
373	*
374	* <pre> -D
375	* If set, estimator is run in debug mode and
376	* may output additional info to the console</pre>
377	*
378	<!-- options-end -->
379	*
380	* @param options the list of options as an array of strings
381	* @throws Exception if an option is not supported
382	*/
383	public void setOptions(String[] options) throws Exception {
384	String tmpStr;
385
386	setDebug(Utils.getFlag('D', options));
387
388	setSilent(Utils.getFlag('S', options));
389
390	tmpStr = Utils.getOption('N', options);
391	if (tmpStr.length() != 0)
392	setNumInstances(Integer.parseInt(tmpStr));
393	else
394	setNumInstances(100);
395
396	tmpStr = Utils.getOption('W', options);
397	if (tmpStr.length() == 0)
398	throw new Exception("A estimator must be specified with the -W option.");
399	setEstimator(Estimator.forName(tmpStr, Utils.partitionOptions(options)));
400	}
401
402	/**
403	* Gets the current settings of the CheckEstimator.
404	*
405	* @return an array of strings suitable for passing to setOptions
406	*/
407	public String[] getOptions() {
408	Vector result;
409	String[] options;
410	int i;
411
412	result = new Vector();
413
414	if (getDebug())
415	result.add("-D");
416
417	if (getSilent())
418	result.add("-S");
419
420	result.add("-N");
421	result.add("" + getNumInstances());
422
423	if (getEstimator() != null) {
424	result.add("-W");
425	result.add(getEstimator().getClass().getName());
426	}
427
428	if ((m_Estimator != null) && (m_Estimator instanceof OptionHandler))
429	options = ((OptionHandler) m_Estimator).getOptions();
430	else
431	options = new String[0];
432
433	if (options.length > 0) {
434	result.add("--");
435	for (i = 0; i < options.length; i++)
436	result.add(options[i]);
437	}
438
439	return (String[]) result.toArray(new String[result.size()]);
440	}
441
442	/**
443	* sets the PostProcessor to use
444	*
445	* @param value the new PostProcessor
446	* @see #m_PostProcessor
447	*/
448	public void setPostProcessor(PostProcessor value) {
449	m_PostProcessor = value;
450	}
451
452	/**
453	* returns the current PostProcessor, can be null
454	*
455	* @return the current PostProcessor
456	*/
457	public PostProcessor getPostProcessor() {
458	return m_PostProcessor;
459	}
460
461	/**
462	* returns TRUE if the estimator returned a "not in classpath" Exception
463	*
464	* @return true if CLASSPATH problems occurred
465	*/
466	public boolean hasClasspathProblems() {
467	return m_ClasspathProblems;
468	}
469
470	/**
471	* Begin the tests, reporting results to System.out
472	*/
473	public void doTests() {
474
475	if (getEstimator() == null) {
476	println("\n=== No estimator set ===");
477	return;
478	}
479	println("\n=== Check on Estimator: "
480	+ getEstimator().getClass().getName()
481	+ " ===\n");
482
483	m_ClasspathProblems = false;
484
485	// Start tests with test for options
486	canTakeOptions();
487
488	// test what type of estimator it is
489	EstTypes estTypes = new EstTypes();
490	estTypes.incremental = incrementalEstimator()[0];
491	estTypes.weighted = weightedInstancesHandler()[0];
492	estTypes.supervised = supervisedEstimator()[0];
493
494	// in none of the estimators yet the functionality is depending on the class type
495	// since this could change the basic structure taken from checkclassifiers is kept here
496	int classType = Attribute.NOMINAL;
497	AttrTypes attrTypes = testsPerClassType(classType, estTypes);
498
499
500	// only nominal class can be split up so far
501	canSplitUpClass(attrTypes, classType);
502	}
503
504
505	/**
506	* Set debugging mode
507	*
508	* @param debug true if debug output should be printed
509	*/
510	public void setDebug(boolean debug) {
511	m_Debug = debug;
512
513	// disable silent mode, if necessary
514	if (getDebug())
515	setSilent(false);
516	}
517
518	/**
519	* Get whether debugging is turned on
520	*
521	* @return true if debugging output is on
522	*/
523	public boolean getDebug() {
524	return m_Debug;
525	}
526
527	/**
528	* Set slient mode, i.e., no output at all to stdout
529	*
530	* @param value whether silent mode is active or not
531	*/
532	public void setSilent(boolean value) {
533	m_Silent = value;
534	}
535
536	/**
537	* Get whether silent mode is turned on
538	*
539	* @return true if silent mode is on
540	*/
541	public boolean getSilent() {
542	return m_Silent;
543	}
544
545	/**
546	* Sets the number of instances to use in the datasets (some estimators
547	* might require more instances).
548	*
549	* @param value the number of instances to use
550	*/
551	public void setNumInstances(int value) {
552	m_NumInstances = value;
553	}
554
555	/**
556	* Gets the current number of instances to use for the datasets.
557	*
558	* @return the number of instances
559	*/
560	public int getNumInstances() {
561	return m_NumInstances;
562	}
563
564	/**
565	* Set the estimator for boosting.
566	*
567	* @param newEstimator the Estimator to use.
568	*/
569	public void setEstimator(Estimator newEstimator) {
570	m_Estimator = newEstimator;
571	}
572
573	/**
574	* Get the estimator used as the estimator
575	*
576	* @return the estimator used as the estimator
577	*/
578	public Estimator getEstimator() {
579	return m_Estimator;
580	}
581
582	/**
583	* prints the given message to stdout, if not silent mode
584	*
585	* @param msg the text to print to stdout
586	*/
587	protected void print(Object msg) {
588	if (!getSilent())
589	System.out.print(msg);
590	}
591
592	/**
593	* prints the given message (+ LF) to stdout, if not silent mode
594	*
595	* @param msg the message to println to stdout
596	*/
597	protected void println(Object msg) {
598	print(msg + "\n");
599	}
600
601	/**
602	* prints a LF to stdout, if not silent mode
603	*/
604	protected void println() {
605	print("\n");
606	}
607
608	/**
609	* Run a battery of tests for a given class attribute type
610	*
611	* @param classType true if the class attribute should be numeric
612	* @param estTypes types the estimator is, like incremental, weighted, supervised etc
613	* @return attribute types estimator can work with
614	*/
615	protected AttrTypes testsPerClassType(int classType, EstTypes estTypes) {
616
617	// in none of the estimators yet is the estimation depending on the class type
618	// since this could change the basic structure taken from checkclassifiers is kept here
619
620	// test A: simple test - if can estimate
621	AttrTypes attrTypes = new AttrTypes();
622	AttrTypes at = new AttrTypes(Attribute.NOMINAL);
623	attrTypes.nominal = canEstimate(at, estTypes.supervised, classType)[0];
624	at = new AttrTypes(Attribute.NUMERIC);
625	attrTypes.numeric = canEstimate(at, estTypes.supervised, classType)[0];
626	attrTypes.string = false;
627	attrTypes.date = false;
628	attrTypes.relational = false;
629
630	// if (!multiInstance)
631	// PRel = canEstimate(false, false, false, false, true, classType)[0];
632	// else
633	// PRel = false;
634
635	// one of the attribute types succeeded
636
637	if (attrTypes.oneIsSet()) {
638	Vector attributesSet = attrTypes.getVectorOfAttrTypes();
639
640	// make tests for each attribute
641	for (int i = 0; i < attributesSet.size(); i++) {
642	AttrTypes workAttrTypes = new AttrTypes(((Integer) attributesSet.elementAt(i)).intValue());
643
644	// test B: weights change estimate or not
645	if (estTypes.weighted)
646	instanceWeights(workAttrTypes, classType);
647
648	if (classType == Attribute.NOMINAL) {
649	int numClasses = 4;
650	canHandleNClasses(workAttrTypes, numClasses);
651	}
652
653	// tests with class not the last attribute and the attribute not the first
654
655	// if (!multiInstance) {
656	int numAtt = 4;
657
658	canHandleClassAsNthAttribute(workAttrTypes, numAtt, 0, classType, 1);
659
660	//TODOTODOcanHandleAttrAsNthAttribute(workAttrTypes, numAtt, 2, classType);
661	//}
662
663	canHandleZeroTraining(workAttrTypes, classType);
664	boolean handleMissingAttributes = canHandleMissing(workAttrTypes,
665	classType, true, false, 20)[0];
666	if (handleMissingAttributes)
667	canHandleMissing(workAttrTypes, classType, true, false, 100);
668
669	boolean handleMissingClass = canHandleMissing(workAttrTypes,
670	classType,
671	false, true, 20)[0];
672	if (handleMissingClass)
673	canHandleMissing(workAttrTypes, classType, false, true, 100);
674
675	correctBuildInitialisation(workAttrTypes, classType);
676	datasetIntegrity(workAttrTypes, classType,
677	handleMissingAttributes, handleMissingClass);
678
679	if (estTypes.incremental)
680	incrementingEquality(workAttrTypes, classType);
681	}
682	}
683	return attrTypes;
684	}
685
686	/**
687	* Checks whether the scheme can take command line options.
688	*
689	* @return index 0 is true if the estimator can take options
690	*/
691	protected boolean[] canTakeOptions() {
692
693	boolean[] result = new boolean[2];
694
695	print("options...");
696	if (m_Estimator instanceof OptionHandler) {
697	println("yes");
698	if (m_Debug) {
699	println("\n=== Full report ===");
700	Enumeration enu = ((OptionHandler)m_Estimator).listOptions();
701	while (enu.hasMoreElements()) {
702	Option option = (Option) enu.nextElement();
703	print(option.synopsis() + "\n"
704	+ option.description() + "\n");
705	}
706	println("\n");
707	}
708	result[0] = true;
709	}
710	else {
711	println("no");
712	result[0] = false;
713	}
714
715	return result;
716	}
717
718	/**
719	* Checks whether the scheme can build models incrementally.
720	*
721	* @return index 0 is true if the estimator can train incrementally
722	*/
723	protected boolean[] incrementalEstimator() {
724
725	boolean[] result = new boolean[2];
726
727	print("incremental estimator...");
728	if (m_Estimator instanceof IncrementalEstimator) {
729	println("yes");
730	result[0] = true;
731	}
732	else {
733	println("no");
734	result[0] = false;
735	}
736
737	return result;
738	}
739
740	/**
741	* Checks whether the scheme says it can handle instance weights.
742	*
743	* @return true if the estimator handles instance weights
744	*/
745	protected boolean[] weightedInstancesHandler() {
746
747	boolean[] result = new boolean[2];
748
749	print("weighted instances estimator...");
750	if (m_Estimator instanceof WeightedInstancesHandler) {
751	println("yes");
752	result[0] = true;
753	}
754	else {
755	println("no");
756	result[0] = false;
757	}
758
759	return result;
760	}
761
762	/**
763	* Checks whether the estimator is supervised.
764	*
765	* @return true if the estimator handles instance weights
766	*/
767	protected boolean[] supervisedEstimator() {
768	boolean[] result = new boolean[2];
769	result[0] = false;
770	return result;
771	}
772
773	/**
774	* Checks basic estimation of one attribute of the scheme, for simple non-troublesome
775	* datasets.
776	*
777	* @param attrTypes the types the estimator can work with
778	* @param classType the class type (NOMINAL, NUMERIC, etc.)
779	* @return index 0 is true if the test was passed, index 1 is true if test
780	* was acceptable
781	*/
782	protected boolean[] canEstimate(AttrTypes attrTypes, boolean supervised, int classType) {
783
784	// supervised is ignored, no supervised estimators used yet
785
786	print("basic estimation");
787	printAttributeSummary(attrTypes, classType);
788	print("...");
789	FastVector accepts = new FastVector();
790	accepts.addElement("nominal");
791	accepts.addElement("numeric");
792	accepts.addElement("string");
793	accepts.addElement("date");
794	accepts.addElement("relational");
795	accepts.addElement("not in classpath");
796	int numTrain = getNumInstances(), numTest = getNumInstances(),
797	numClasses = 2, missingLevel = 0;
798	boolean attributeMissing = false, classMissing = false;
799	int numAtts = 1, attrIndex = 0;
800
801	return runBasicTest(attrTypes, numAtts, attrIndex,
802	classType,
803	missingLevel, attributeMissing, classMissing,
804	numTrain, numTest, numClasses,
805	accepts);
806	}
807
808	/**
809	* Checks basic estimation of one attribute of the scheme, for simple non-troublesome
810	* datasets.
811	*
812	* @param attrTypes the types the estimator can work with
813	* @param classType the class type (NOMINAL, NUMERIC, etc.)
814	*/
815	protected void canSplitUpClass(AttrTypes attrTypes, int classType) {
816
817	if (attrTypes.nominal)
818	canSplitUpClass(Attribute.NOMINAL, classType);
819	if (attrTypes.numeric)
820	canSplitUpClass(Attribute.NUMERIC, classType);
821	}
822
823	/**
824	* Checks basic estimation of one attribute of the scheme, for simple non-troublesome
825	* datasets.
826	*
827	* @param attrType the type of the estimator
828	* @param classType the class type (NOMINAL, NUMERIC, etc.)
829	* @return index 0 is true if the test was passed, index 1 is true if test
830	* was acceptable
831	*/
832	protected boolean[] canSplitUpClass(int attrType, int classType) {
833
834	boolean[] result = new boolean[2];
835
836	FastVector accepts = new FastVector();
837	accepts.addElement("not in classpath");
838
839	// supervised is ignored, no supervised estimators used yet
840	print("split per class type ");
841	printAttributeSummary(attrType, Attribute.NOMINAL);
842	print("...");
843
844	int numTrain = getNumInstances(), numTest = getNumInstances(),
845	numClasses = 2;
846	boolean attributeMissing = false, classMissing = false;
847	int numAtts = 3, attrIndex = 0, classIndex = 1;
848	Instances train = null;
849	Vector test;
850	Estimator estimator = null;
851	boolean built = false;
852
853	try {
854	AttrTypes at = new AttrTypes(attrType);
855	train = makeTestDataset(42, numTrain, numAtts, at,
856	numClasses, classType, classIndex);
857
858	// prepare training data set and test value list
859	test = makeTestValueList(24, numTest, train, attrIndex,
860	attrType);
861
862	estimator = Estimator.makeCopies(getEstimator(), 1)[0];
863	} catch (Exception ex) {
864	ex.printStackTrace();
865	throw new Error("Error setting up for tests: " + ex.getMessage());
866	}
867	try {
868	estimator.addValues(train, attrIndex, classType, classIndex);
869	built = true;
870
871	testWithTestValues(estimator, test);
872
873	println("yes");
874	result[0] = true;
875	}
876	catch (Exception ex) {
877	boolean acceptable = false;
878	String msg;
879	if (ex.getMessage() == null)
880	msg = "";
881	else
882	msg = ex.getMessage().toLowerCase();
883	if (msg.indexOf("not in classpath") > -1)
884	m_ClasspathProblems = true;
885
886	for (int i = 0; i < accepts.size(); i++) {
887	if (msg.indexOf((String)accepts.elementAt(i)) >= 0) {
888	acceptable = true;
889	}
890	}
891
892	println("no" + (acceptable ? " (OK error message)" : ""));
893	result[1] = acceptable;
894
895
896	if (m_Debug) {
897	println("\n=== Full Report ===");
898	print("Problem during");
899	if (built) {
900	print(" testing");
901	} else {
902	print(" training");
903	}
904	println(": " + ex.getMessage() + "\n");
905	if (!acceptable) {
906	if (accepts.size() > 0) {
907	print("Error message doesn't mention ");
908	for (int i = 0; i < accepts.size(); i++) {
909	if (i != 0) {
910	print(" or ");
911	}
912	print('"' + (String)accepts.elementAt(i) + '"');
913	}
914	}
915	println("here are the datasets:\n");
916	println("=== Train Dataset ===\n"
917	+ train.toString() + "\n");
918	println("=== Test Dataset ===\n"
919	+ test.toString() + "\n\n");
920	}
921
922	}
923	}
924	return result;
925	}
926
927	/**
928	* Checks whether nominal schemes can handle more than two classes.
929	* If a scheme is only designed for two-class problems it should
930	* throw an appropriate exception for multi-class problems.
931	*
932	* @param attrTypes attribute types the estimator excepts
933	* @param numClasses the number of classes to test
934	* @return index 0 is true if the test was passed, index 1 is true if test
935	* was acceptable
936	*/
937	protected boolean[] canHandleNClasses(AttrTypes attrTypes, int numClasses) {
938
939	print("more than two class problems");
940	printAttributeSummary(attrTypes, Attribute.NOMINAL);
941	print("...");
942
943	FastVector accepts = new FastVector();
944	accepts.addElement("number");
945	accepts.addElement("class");
946
947	int numTrain = getNumInstances(), numTest = getNumInstances(),
948	missingLevel = 0;
949	boolean attributeMissing = false, classMissing = false;
950	int numAttr = 1, attrIndex = 0;
951
952	return runBasicTest(attrTypes,
953	numAttr, attrIndex,
954	Attribute.NOMINAL,
955	missingLevel, attributeMissing, classMissing,
956	numTrain, numTest, numClasses,
957	accepts);
958	}
959
960	/**
961	* Checks whether the scheme can handle class attributes as Nth attribute.
962	*
963	* @param attrTypes the attribute types the estimator accepts
964	* @param numAtts of attributes
965	* @param attrIndex the index of the attribute
966	* @param classType the class type (NUMERIC, NOMINAL, etc.)
967	* @param classIndex the index of the class attribute (0-based, -1 means last attribute)
968	* @return index 0 is true if the test was passed, index 1 is true if test
969	* was acceptable
970	* @see TestInstances#CLASS_IS_LAST
971	*/
972	protected boolean[] canHandleClassAsNthAttribute(AttrTypes attrTypes,
973	int numAtts,
974	int attrIndex,
975	int classType,
976	int classIndex) {
977
978	if (classIndex == TestInstances.CLASS_IS_LAST)
979	print("class attribute as last attribute");
980	else
981	print("class attribute as " + (classIndex + 1) + ". attribute");
982	printAttributeSummary(attrTypes, classType);
983	print("...");
984	FastVector accepts = new FastVector();
985	int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2,
986	missingLevel = 0;
987	boolean attributeMissing = false, classMissing = false;
988
989	return runBasicTest(attrTypes,
990	numAtts, attrIndex,
991	classType, classIndex,
992	missingLevel, attributeMissing, classMissing,
993	numTrain, numTest, numClasses,
994	accepts);
995	}
996
997	/**
998	* Checks whether the scheme can handle zero training instances.
999	*
1000	* @param attrTypes attribute types that can be estimated
1001	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1002	* @return index 0 is true if the test was passed, index 1 is true if test
1003	* was acceptable
1004	*/
1005	protected boolean[] canHandleZeroTraining(AttrTypes attrTypes, int classType) {
1006
1007	print("handle zero training instances");
1008	printAttributeSummary(attrTypes, classType);
1009
1010	print("...");
1011	FastVector accepts = new FastVector();
1012	accepts.addElement("train");
1013	accepts.addElement("value");
1014	int numTrain = 0, numTest = getNumInstances(), numClasses = 2,
1015	missingLevel = 0;
1016	boolean attributeMissing = false, classMissing = false;
1017	int numAtts = 1;
1018	int attrIndex = 0;
1019	return runBasicTest(
1020	attrTypes, numAtts, attrIndex,
1021	classType,
1022	missingLevel, attributeMissing, classMissing,
1023	numTrain, numTest, numClasses,
1024	accepts);
1025	}
1026
1027	/**
1028	* Checks whether the scheme correctly initialises models when
1029	* buildEstimator is called. This test calls buildEstimator with
1030	* one training dataset and records performance on a test set.
1031	* buildEstimator is then called on a training set with different
1032	* structure, and then again with the original training set. The
1033	* performance on the test set is compared with the original results
1034	* and any performance difference noted as incorrect build initialisation.
1035	*
1036	* @param attrTypes attribute types that can be estimated
1037	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1038	* @return index 0 is true if the test was passed, index 1 is true if the
1039	* scheme performs worse than ZeroR, but without error (index 0 is
1040	* false)
1041	*/
1042	protected boolean[] correctBuildInitialisation(AttrTypes attrTypes,
1043	int classType) {
1044
1045	boolean[] result = new boolean[2];
1046
1047	print("correct initialisation during buildEstimator");
1048	printAttributeSummary(attrTypes, classType);
1049
1050	print("...");
1051	int numTrain = getNumInstances(), numTest = getNumInstances(),
1052	numClasses = 2, missingLevel = 0;
1053	boolean attributeMissing = false, classMissing = false;
1054
1055	Instances train1 = null;
1056	Instances test1 = null;
1057	Instances train2 = null;
1058	Instances test2 = null;
1059	Estimator estimator = null;
1060	Estimator estimator1 = null;
1061
1062	boolean built = false;
1063	int stage = 0;
1064	int attrIndex1 = 1;
1065	int attrIndex2 = 2;
1066
1067	try {
1068
1069	// Make two sets of train/test splits with different
1070	// numbers of attributes
1071	train1 = makeTestDataset(42, numTrain, 2, attrTypes,
1072	numClasses,
1073	classType);
1074	train2 = makeTestDataset(84, numTrain, 3, attrTypes,
1075	numClasses,
1076	classType);
1077	if (missingLevel > 0) {
1078	addMissing(train1, missingLevel, attributeMissing, classMissing, attrIndex1);
1079	addMissing(train2, missingLevel, attributeMissing, classMissing, attrIndex2);
1080	}
1081
1082	estimator = Estimator.makeCopies(getEstimator(), 1)[0];
1083	} catch (Exception ex) {
1084	throw new Error("Error setting up for tests: " + ex.getMessage());
1085	}
1086	try {
1087	//TESTING??
1088	stage = 0;
1089	estimator.addValues(train1, attrIndex1);
1090	built = true;
1091
1092	estimator1 = estimator.makeCopies(getEstimator(), 1)[0];
1093
1094	stage = 1;
1095	built = false;
1096	estimator.addValues(train2, attrIndex2);
1097	built = true;
1098
1099	stage = 2;
1100	built = false;
1101	estimator.addValues(train1, attrIndex1);
1102	built = true;
1103
1104	stage = 3;
1105	if (!estimator.equals(estimator1)) {
1106	if (m_Debug) {
1107	println("\n=== Full report ===\n"
1108	+ "\nFirst build estimator\n"+
1109	estimator.toString() + "\n\n");
1110	println("\nSecond build estimator\n"+
1111	estimator.toString() + "\n\n");
1112	}
1113	throw new Exception("Results differ between buildEstimator calls");
1114	}
1115	println("yes");
1116	result[0] = true;
1117
1118	if (false && m_Debug) {
1119	println("\n=== Full report ===\n"
1120	+ "\nFirst buildEstimator()"
1121	+ "\n\n");
1122	println("\nSecond buildEstimator()"
1123	+ "\n\n");
1124	}
1125	}
1126	catch (Exception ex) {
1127	String msg = ex.getMessage().toLowerCase();
1128	if (msg.indexOf("worse than zeror") >= 0) {
1129	println("warning: performs worse than ZeroR");
1130	result[0] = true;
1131	result[1] = true;
1132	} else {
1133	println("no");
1134	result[0] = false;
1135	}
1136	if (m_Debug) {
1137	println("\n=== Full Report ===");
1138	print("Problem during");
1139	if (built) {
1140	print(" testing");
1141	} else {
1142	print(" training");
1143	}
1144	switch (stage) {
1145	case 0:
1146	print(" of dataset 1");
1147	break;
1148	case 1:
1149	print(" of dataset 2");
1150	break;
1151	case 2:
1152	print(" of dataset 1 (2nd build)");
1153	break;
1154	case 3:
1155	print(", comparing results from builds of dataset 1");
1156	break;
1157	}
1158	println(": " + ex.getMessage() + "\n");
1159	println("here are the datasets:\n");
1160	println("=== Train1 Dataset ===\n"
1161	+ train1.toString() + "\n");
1162	println("=== Test1 Dataset ===\n"
1163	+ test1.toString() + "\n\n");
1164	println("=== Train2 Dataset ===\n"
1165	+ train2.toString() + "\n");
1166	println("=== Test2 Dataset ===\n"
1167	+ test2.toString() + "\n\n");
1168	}
1169	}
1170
1171	return result;
1172	}
1173
1174	/**
1175	* Checks basic missing value handling of the scheme. If the missing
1176	* values cause an exception to be thrown by the scheme, this will be
1177	* recorded.
1178	*
1179	* @param attrTypes attribute types that can be estimated
1180	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1181	* @param attributeMissing true if the missing values may be in
1182	* the attributes
1183	* @param classMissing true if the missing values may be in the class
1184	* @param missingLevel the percentage of missing values
1185	* @return index 0 is true if the test was passed, index 1 is true if test
1186	* was acceptable
1187	*/
1188	protected boolean[] canHandleMissing(AttrTypes attrTypes,
1189	int classType,
1190	boolean attributeMissing,
1191	boolean classMissing,
1192	int missingLevel) {
1193
1194	if (missingLevel == 100)
1195	print("100% ");
1196	print("missing");
1197	if (attributeMissing) {
1198	print(" attribute");
1199	if (classMissing)
1200	print(" and");
1201	}
1202	if (classMissing)
1203	print(" class");
1204	print(" values");
1205	printAttributeSummary(attrTypes, classType);
1206
1207	print("...");
1208	FastVector accepts = new FastVector();
1209	accepts.addElement("missing");
1210	accepts.addElement("value");
1211	accepts.addElement("train");
1212	int numTrain = getNumInstances(), numTest = getNumInstances(),
1213	numClasses = 2;
1214
1215	int numAtts = 1, attrIndex = 0;
1216	return runBasicTest(attrTypes,
1217	numAtts, attrIndex,
1218	classType,
1219	missingLevel, attributeMissing, classMissing,
1220	numTrain, numTest, numClasses,
1221	accepts);
1222	}
1223
1224	/**
1225	* Checks whether an incremental scheme produces the same model when
1226	* trained incrementally as when batch trained. The model itself
1227	* cannot be compared, so we compare the evaluation on test data
1228	* for both models. It is possible to get a false positive on this
1229	* test (likelihood depends on the estimator).
1230	*
1231	* @param attrTypes attribute types that can be estimated
1232	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1233	* @return index 0 is true if the test was passed
1234	*/
1235	protected boolean[] incrementingEquality(AttrTypes attrTypes,
1236	int classType) {
1237
1238	print("incremental training produces the same results"
1239	+ " as batch training");
1240	printAttributeSummary(attrTypes, classType);
1241
1242	print("...");
1243	int numTrain = getNumInstances(), numTest = getNumInstances(),
1244	numClasses = 2, missingLevel = 0;
1245	boolean attributeMissing = false, classMissing = false;
1246
1247	boolean[] result = new boolean[2];
1248	Instances train = null;
1249	Estimator [] estimators = null;
1250	boolean built = false;
1251	int attrIndex = 0;
1252	Vector test;
1253	try {
1254	train = makeTestDataset(42, numTrain, 1, attrTypes,
1255	numClasses,
1256	classType
1257	);
1258
1259	// prepare training data set and test value list
1260	test = makeTestValueList(24, numTest, train, attrIndex,
1261	attrTypes.getSetType());
1262
1263	if (missingLevel > 0) {
1264	addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex);
1265	}
1266	estimators = Estimator.makeCopies(getEstimator(), 2);
1267	estimators[0].addValues(train, attrIndex);
1268	} catch (Exception ex) {
1269	throw new Error("Error setting up for tests: " + ex.getMessage());
1270	}
1271	try {
1272	for (int i = 0; i < train.numInstances(); i++) {
1273	((IncrementalEstimator)estimators[1]).addValue(train.instance(i).value(attrIndex), 1.0);
1274	}
1275	built = true;
1276	if (!estimators[0].equals(estimators[1])) {
1277	println("no");
1278	result[0] = false;
1279
1280	if (m_Debug) {
1281	println("\n=== Full Report ===");
1282	println("Results differ between batch and "
1283	+ "incrementally built models.\n"
1284	+ "Depending on the estimator, this may be OK");
1285	println("Here are the results:\n");
1286	println("batch built results\n" + estimators[0].toString());
1287	println("incrementally built results\n" + estimators[1].toString());
1288	println("Here are the datasets:\n");
1289	println("=== Train Dataset ===\n"
1290	+ train.toString() + "\n");
1291	println("=== Test Dataset ===\n"
1292	+ test.toString() + "\n\n");
1293	}
1294	}
1295	else {
1296	println("yes");
1297	result[0] = true;
1298	}
1299	} catch (Exception ex) {
1300	result[0] = false;
1301
1302	print("Problem during");
1303	if (built)
1304	print(" testing");
1305	else
1306	print(" training");
1307	println(": " + ex.getMessage() + "\n");
1308	}
1309
1310	return result;
1311	}
1312
1313
1314	/**
1315	* Checks whether the estimator can handle instance weights.
1316	* This test compares the estimator performance on two datasets
1317	* that are identical except for the training weights. If the
1318	* results change, then the estimator must be using the weights. It
1319	* may be possible to get a false positive from this test if the
1320	* weight changes aren't significant enough to induce a change
1321	* in estimator performance (but the weights are chosen to minimize
1322	* the likelihood of this).
1323	*
1324	* @param attrTypes attribute types that can be estimated
1325	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1326	* @return index 0 true if the test was passed
1327	*/
1328	protected boolean[] instanceWeights(AttrTypes attrTypes,
1329	int classType) {
1330
1331	print("estimator uses instance weights");
1332	printAttributeSummary(attrTypes, classType);
1333
1334	print("...");
1335
1336	int numTrain = 2 * getNumInstances(), numTest = getNumInstances(),
1337	numClasses = 2, missingLevel = 0;
1338	boolean attributeMissing = false, classMissing = false;
1339
1340	boolean[] result = new boolean[2];
1341	Instances train = null;
1342	Vector test = null;
1343	Estimator [] estimators = null;
1344
1345	Vector resultProbsO = null;
1346	Vector resultProbsW = null;
1347	boolean built = false;
1348	boolean evalFail = false;
1349	int attrIndex = 0;
1350	try {
1351	train = makeTestDataset(42, numTrain, 1,
1352	attrTypes, numClasses,
1353	classType);
1354
1355	// prepare training data set and test value list
1356	test = makeTestValueList(24, numTest, train, attrIndex,
1357	attrTypes.getSetType());
1358
1359	if (missingLevel > 0) {
1360	addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex);
1361	}
1362
1363	estimators = Estimator.makeCopies(getEstimator(), 2);
1364
1365	estimators[0].addValues(train, attrIndex);
1366	resultProbsO = testWithTestValues(estimators[0], test);
1367
1368	} catch (Exception ex) {
1369	throw new Error("Error setting up for tests: " + ex.getMessage());
1370	}
1371	try {
1372
1373	// Now modify instance weights and re-built
1374	for (int i = 0; i < train.numInstances(); i++) {
1375	train.instance(i).setWeight(0);
1376	}
1377	Random random = new Random(1);
1378	for (int i = 0; i < train.numInstances() / 2; i++) {
1379	int inst = Math.abs(random.nextInt()) % train.numInstances();
1380	int weight = Math.abs(random.nextInt()) % 10 + 1;
1381	train.instance(inst).setWeight(weight);
1382	}
1383	estimators[1].addValues(train, attrIndex);
1384	resultProbsW = testWithTestValues(estimators[1], test);
1385
1386	built = true;
1387	if (resultProbsO.equals(resultProbsW)) {
1388	// println("no");
1389	evalFail = true;
1390	throw new Exception("evalFail");
1391	}
1392
1393	println("yes");
1394	result[0] = true;
1395	} catch (Exception ex) {
1396	println("no");
1397	result[0] = false;
1398
1399	if (m_Debug) {
1400	println("\n=== Full Report ===");
1401
1402	if (evalFail) {
1403	println("Results don't differ between non-weighted and "
1404	+ "weighted instance models.");
1405	println("Here are the results:\n");
1406	println(probsToString(resultProbsO));
1407	} else {
1408	print("Problem during");
1409	if (built) {
1410	print(" testing");
1411	} else {
1412	print(" training");
1413	}
1414	println(": " + ex.getMessage() + "\n");
1415	}
1416	println("Here are the datasets:\n");
1417	println("=== Train Dataset ===\n"
1418	+ train.toString() + "\n");
1419	println("=== Train Weights ===\n");
1420	for (int i = 0; i < train.numInstances(); i++) {
1421	println(" " + (i + 1)
1422	+ " " + train.instance(i).weight());
1423	}
1424	println("=== Test Dataset ===\n"
1425	+ test.toString() + "\n\n");
1426	println("(test weights all 1.0\n");
1427	}
1428	}
1429
1430	return result;
1431	}
1432
1433	/**
1434	* Checks whether the scheme alters the training dataset during
1435	* training. If the scheme needs to modify the training
1436	* data it should take a copy of the training data. Currently checks
1437	* for changes to header structure, number of instances, order of
1438	* instances, instance weights.
1439	*
1440	* @param attrTypes attribute types that can be estimated
1441	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1442	* @param attributeMissing true if we know the estimator can handle
1443	* (at least) moderate missing attribute values
1444	* @param classMissing true if we know the estimator can handle
1445	* (at least) moderate missing class values
1446	* @return index 0 is true if the test was passed
1447	*/
1448	protected boolean[] datasetIntegrity(AttrTypes attrTypes,
1449	int classType,
1450	boolean attributeMissing,
1451	boolean classMissing) {
1452
1453	Estimator estimator = null;
1454	print("estimator doesn't alter original datasets");
1455	printAttributeSummary(attrTypes, classType);
1456	print("...");
1457	int numTrain = getNumInstances(), numTest = getNumInstances(),
1458	numClasses = 2, missingLevel = 100;
1459
1460	boolean[] result = new boolean[2];
1461	Instances train = null;
1462	boolean built = false;
1463	try {
1464	train = makeTestDataset(42, numTrain, 1, attrTypes,
1465	numClasses,
1466	classType);
1467	int attrIndex = 0;
1468
1469	if (missingLevel > 0) {
1470	addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex);
1471	}
1472	estimator = Estimator.makeCopies(getEstimator(), 1)[0];
1473	} catch (Exception ex) {
1474	throw new Error("Error setting up for tests: " + ex.getMessage());
1475	}
1476	try {
1477	Instances trainCopy = new Instances(train);
1478	int attrIndex = 0;
1479	estimator.addValues(trainCopy, attrIndex);
1480	compareDatasets(train, trainCopy);
1481	built = true;
1482
1483	println("yes");
1484	result[0] = true;
1485	} catch (Exception ex) {
1486	println("no");
1487	result[0] = false;
1488
1489	if (m_Debug) {
1490	println("\n=== Full Report ===");
1491	print("Problem during");
1492	if (built) {
1493	print(" testing");
1494	} else {
1495	print(" training");
1496	}
1497	println(": " + ex.getMessage() + "\n");
1498	println("Here are the datasets:\n");
1499	println("=== Train Dataset ===\n"
1500	+ train.toString() + "\n");
1501	}
1502	}
1503
1504	return result;
1505	}
1506
1507	/**
1508	* Runs a text on the datasets with the given characteristics.
1509	*
1510	* @param attrTypes attribute types that can be estimated
1511	* @param numAtts number of attributes
1512	* @param attrIndex attribute index
1513	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1514	* @param missingLevel the percentage of missing values
1515	* @param attributeMissing true if the missing values may be in
1516	* the attributes
1517	* @param classMissing true if the missing values may be in the class
1518	* @param numTrain the number of instances in the training set
1519	* @param numTest the number of instaces in the test set
1520	* @param numClasses the number of classes
1521	* @param accepts the acceptable string in an exception
1522	* @return index 0 is true if the test was passed, index 1 is true if test
1523	* was acceptable
1524	*/
1525	protected boolean[] runBasicTest(AttrTypes attrTypes,
1526	int numAtts,
1527	int attrIndex,
1528	int classType,
1529	int missingLevel,
1530	boolean attributeMissing,
1531	boolean classMissing,
1532	int numTrain,
1533	int numTest,
1534	int numClasses,
1535	FastVector accepts) {
1536
1537	return runBasicTest(attrTypes,
1538	numAtts,
1539	attrIndex,
1540	classType,
1541	TestInstances.CLASS_IS_LAST,
1542	missingLevel,
1543	attributeMissing,
1544	classMissing,
1545	numTrain,
1546	numTest,
1547	numClasses,
1548	accepts);
1549	}
1550
1551	/**
1552	* Runs a text on the datasets with the given characteristics.
1553	*
1554	* @param attrTypes attribute types that can be estimated
1555	* @param numAtts number of attributes
1556	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1557	* @param classIndex the attribute index of the class
1558	* @param missingLevel the percentage of missing values
1559	* @param attributeMissing true if the missing values may be in
1560	* the attributes
1561	* @param classMissing true if the missing values may be in the class
1562	* @param numTrain the number of instances in the training set
1563	* @param numTest the number of instaces in the test set
1564	* @param numClasses the number of classes
1565	* @param accepts the acceptable string in an exception
1566	* @return index 0 is true if the test was passed, index 1 is true if test
1567	* was acceptable
1568	*/
1569	protected boolean[] runBasicTest(AttrTypes attrTypes,
1570	int numAtts,
1571	int attrIndex,
1572	int classType,
1573	int classIndex,
1574	int missingLevel,
1575	boolean attributeMissing,
1576	boolean classMissing,
1577	int numTrain,
1578	int numTest,
1579	int numClasses,
1580	FastVector accepts) {
1581
1582	boolean[] result = new boolean[2];
1583	Instances train = null;
1584	Vector test = null;
1585	Estimator estimator = null;
1586	boolean built = false;
1587
1588	try {
1589	train = makeTestDataset(42, numTrain, numAtts, attrTypes,
1590	numClasses,
1591	classType,
1592	classIndex);
1593
1594	// prepare training data set and test value list
1595	if (numTrain > 0) {
1596	test = makeTestValueList(24, numTest, train, attrIndex,
1597	attrTypes.getSetType());
1598
1599	} else {
1600	double min = -10.0;
1601	double max = 8.0;
1602	test = makeTestValueList(24, numTest, min, max,
1603	attrTypes.getSetType());
1604	}
1605
1606	if (missingLevel > 0) {
1607	addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex);
1608	}
1609	estimator = Estimator.makeCopies(getEstimator(), 1)[0];
1610	} catch (Exception ex) {
1611	ex.printStackTrace();
1612	throw new Error("Error setting up for tests: " + ex.getMessage());
1613	}
1614	try {
1615	estimator.addValues(train, attrIndex);
1616	built = true;
1617
1618	testWithTestValues(estimator, test);
1619
1620	println("yes");
1621	result[0] = true;
1622	}
1623	catch (Exception ex) {
1624	boolean acceptable = false;
1625	String msg;
1626	if (ex.getMessage() == null)
1627	msg = "";
1628	else
1629	msg = ex.getMessage().toLowerCase();
1630	if (msg.indexOf("not in classpath") > -1)
1631	m_ClasspathProblems = true;
1632
1633	for (int i = 0; i < accepts.size(); i++) {
1634	if (msg.indexOf((String)accepts.elementAt(i)) >= 0) {
1635	acceptable = true;
1636	}
1637	}
1638
1639	println("no" + (acceptable ? " (OK error message)" : ""));
1640	result[1] = acceptable;
1641
1642
1643	if (m_Debug) {
1644	println("\n=== Full Report ===");
1645	print("Problem during");
1646	if (built) {
1647	print(" testing");
1648	} else {
1649	print(" training");
1650	}
1651	println(": " + ex.getMessage() + "\n");
1652	if (!acceptable) {
1653	if (accepts.size() > 0) {
1654	print("Error message doesn't mention ");
1655	for (int i = 0; i < accepts.size(); i++) {
1656	if (i != 0) {
1657	print(" or ");
1658	}
1659	print('"' + (String)accepts.elementAt(i) + '"');
1660	}
1661	}
1662	println("here are the datasets:\n");
1663	println("=== Train Dataset ===\n"
1664	+ train.toString() + "\n");
1665	println("=== Test Dataset ===\n"
1666	+ test.toString() + "\n\n");
1667	}
1668
1669	}
1670	}
1671	return result;
1672	}
1673
1674	/**
1675	* Compare two datasets to see if they differ.
1676	*
1677	* @param data1 one set of instances
1678	* @param data2 the other set of instances
1679	* @throws Exception if the datasets differ
1680	*/
1681	protected void compareDatasets(Instances data1, Instances data2)
1682	throws Exception {
1683	if (!data2.equalHeaders(data1)) {
1684	throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1));
1685	}
1686	if (!(data2.numInstances() == data1.numInstances())) {
1687	throw new Exception("number of instances has changed");
1688	}
1689	for (int i = 0; i < data2.numInstances(); i++) {
1690	Instance orig = data1.instance(i);
1691	Instance copy = data2.instance(i);
1692	for (int j = 0; j < orig.numAttributes(); j++) {
1693	if (orig.isMissing(j)) {
1694	if (!copy.isMissing(j)) {
1695	throw new Exception("instances have changed");
1696	}
1697	} else if (orig.value(j) != copy.value(j)) {
1698	throw new Exception("instances have changed");
1699	}
1700	if (orig.weight() != copy.weight()) {
1701	throw new Exception("instance weights have changed");
1702	}
1703	}
1704	}
1705	}
1706
1707	/**
1708	* Add missing values to a dataset.
1709	*
1710	* @param data the instances to add missing values to
1711	* @param level the level of missing values to add (if positive, this
1712	* is the probability that a value will be set to missing, if negative
1713	* all but one value will be set to missing (not yet implemented))
1714	* @param attributeMissing if true, attributes will be modified
1715	* @param classMissing if true, the class attribute will be modified
1716	* @param attrIndex index of the attribute
1717	*/
1718	protected void addMissing(Instances data, int level,
1719	boolean attributeMissing, boolean classMissing,
1720	int attrIndex) {
1721
1722	int classIndex = data.classIndex();
1723	Random random = new Random(1);
1724	for (int i = 0; i < data.numInstances(); i++) {
1725	Instance current = data.instance(i);
1726
1727	for (int j = 0; j < data.numAttributes(); j++) {
1728	if (((j == classIndex) && classMissing) \|\|
1729	((j == attrIndex) && attributeMissing)) {
1730	if (Math.abs(random.nextInt()) % 100 < level)
1731	current.setMissing(j);
1732	}
1733	}
1734	}
1735	}
1736
1737	/**
1738	* Make a simple set of instances, which can later be modified
1739	* for use in specific tests.
1740	*
1741	* @param seed the random number seed
1742	* @param numInstances the number of instances to generate
1743	* @param numAttr the number of attributes
1744	* @param attrTypes the attribute types
1745	* @param numClasses the number of classes (if nominal class)
1746	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1747	* @return the test dataset
1748	* @throws Exception if the dataset couldn't be generated
1749	* @see #process(Instances)
1750	*/
1751	protected Instances makeTestDataset(int seed,
1752	int numInstances,
1753	int numAttr,
1754	AttrTypes attrTypes,
1755	int numClasses,
1756	int classType)
1757	throws Exception {
1758
1759	return makeTestDataset(
1760	seed,
1761	numInstances,
1762	numAttr,
1763	attrTypes,
1764	numClasses,
1765	classType,
1766	TestInstances.CLASS_IS_LAST);
1767	}
1768
1769
1770	/**
1771	* Make a simple set of instances with variable position of the class
1772	* attribute, which can later be modified for use in specific tests.
1773	*
1774	* @param seed the random number seed
1775	* @param numInstances the number of instances to generate
1776	* @param numAttr the number of attributes to generate
1777	* @param attrTypes the type of attrbute that is excepted
1778	* @param numClasses the number of classes (if nominal class)
1779	* @param classType the class type (NUMERIC, NOMINAL, etc.)
1780	* @param classIndex the index of the class (0-based, -1 as last)
1781	* @return the test dataset
1782	* @throws Exception if the dataset couldn't be generated
1783	* @see TestInstances#CLASS_IS_LAST
1784	* @see #process(Instances)
1785	*/
1786	protected Instances makeTestDataset(int seed, int numInstances,
1787	int numAttr, AttrTypes attrTypes,
1788	int numClasses, int classType,
1789	int classIndex)
1790	throws Exception {
1791
1792	TestInstances dataset = new TestInstances();
1793
1794	dataset.setSeed(seed);
1795	dataset.setNumInstances(numInstances);
1796	dataset.setNumNominal (attrTypes.nominal ? numAttr : 0);
1797	dataset.setNumNumeric (attrTypes.numeric ? numAttr : 0);
1798	dataset.setNumString (attrTypes.string ? numAttr : 0);
1799	dataset.setNumDate (attrTypes.date ? numAttr : 0);
1800	dataset.setNumRelational(attrTypes.relational ? numAttr : 0);
1801	dataset.setNumClasses(numClasses);
1802	dataset.setClassType(classType);
1803	dataset.setClassIndex(classIndex);
1804
1805	return process(dataset.generate());
1806	}
1807
1808	/**
1809	* Make a simple set of values. Only one of the num'type' parameters should be larger 0.
1810	* (just to make parameter similar to the makeTestDataset parameters)
1811	*
1812	* @param seed the random number seed
1813	* @param numValues the number of values to generate
1814	* @param data the dataset to make test examples for
1815	* @param attrIndex index of the attribute
1816	* @param attrType the class type (NUMERIC, NOMINAL, etc.)
1817	* @throws Exception if the dataset couldn't be generated
1818	* @see #process(Instances)
1819	*/
1820	protected Vector makeTestValueList(int seed, int numValues,
1821	Instances data, int attrIndex, int attrType)
1822	throws Exception {
1823
1824	// get min max
1825	double []minMax = getMinimumMaximum(data, attrIndex);
1826	double minValue = minMax[0];
1827	double maxValue = minMax[1];
1828
1829	// make value list and put into a VECTOR
1830	double range = maxValue - minValue;
1831	Vector values = new Vector(numValues);
1832	Random random = new Random(seed);
1833
1834	if (attrType == Attribute.NOMINAL) {
1835	for (int i = 0; i < numValues; i++) {
1836	Double v = new Double((Math.abs(random.nextInt()) % (int)range)+ (int)minValue);
1837	values.add(v);
1838	}
1839	}
1840	if (attrType == Attribute.NUMERIC) {
1841	for (int i = 0; i < numValues; i++) {
1842	Double v = new Double(random.nextDouble() * range + minValue);
1843	values.add(v);
1844	}
1845	}
1846	return values;
1847	}
1848
1849	/**
1850	* Make a simple set of values. Only one of the num'type' parameters should be larger 0.
1851	* (just to make parameter similar to the makeTestDataset parameters)
1852	*
1853	* @param seed the random number seed
1854	* @param numValues the number of values to generate
1855	* @param minValue the minimal data value
1856	* @param maxValue the maximal data value
1857	* @param attrType the class type (NUMERIC, NOMINAL, etc.)
1858	* @throws Exception if the dataset couldn't be generated
1859	* @see #process(Instances)
1860	*/
1861	protected Vector makeTestValueList(int seed, int numValues,
1862	double minValue, double maxValue, int attrType)
1863	throws Exception {
1864
1865
1866	// make value list and put into a VECTOR
1867	double range = maxValue - minValue;
1868	Vector values = new Vector(numValues);
1869	Random random = new Random(seed);
1870
1871	if (attrType == Attribute.NOMINAL) {
1872	for (int i = 0; i < numValues; i++) {
1873	Double v = new Double((Math.abs(random.nextInt()) % (int)range)+ (int)minValue);
1874	values.add(v);
1875	}
1876	}
1877	if (attrType == Attribute.NUMERIC) {
1878	for (int i = 0; i < numValues; i++) {
1879	Double v = new Double(random.nextDouble() * range + minValue);
1880	values.add(v);
1881	}
1882	}
1883	return values;
1884	}
1885
1886	/**
1887	* Test with test values.
1888	*
1889	* @param est estimator to be tested
1890	* @param test vector with test values
1891	*
1892	**/
1893	protected Vector testWithTestValues(Estimator est, Vector test) {
1894
1895	Vector results = new Vector();
1896	for (int i = 0; i < test.size(); i++) {
1897	double testValue = ((Double)(test.elementAt(i))).doubleValue();
1898	double prob = est.getProbability(testValue);
1899	Double p = new Double(prob);
1900	results.add(p);
1901	}
1902	return results;
1903	}
1904
1905	/**
1906	* Gets the minimum and maximum of the values a the first attribute
1907	* of the given data set
1908	*
1909	* @param inst the instance
1910	* @param attrIndex the index of the attribut to find min and max
1911	* @return the array with the minimum value on index 0 and the max on index 1
1912	*/
1913
1914	protected double[] getMinimumMaximum(Instances inst, int attrIndex) {
1915	double []minMax = new double[2];
1916
1917	try {
1918	int num = getMinMax(inst, attrIndex, minMax);
1919	} catch (Exception ex) {
1920	ex.printStackTrace();
1921	System.out.println(ex.getMessage());
1922	}
1923	return minMax;
1924	// double minValue = minMax[0];
1925	// double maxValue = minMax[1];
1926	}
1927
1928	/**
1929	* Find the minimum and the maximum of the attribute and return it in
1930	* the last parameter..
1931	* @param inst instances used to build the estimator
1932	* @param attrIndex index of the attribute
1933	* @param minMax the array to return minimum and maximum in
1934	* @return number of not missing values
1935	* @exception Exception if parameter minMax wasn't initialized properly
1936	*/
1937	public static int getMinMax(Instances inst, int attrIndex, double [] minMax)
1938	throws Exception {
1939	double min = Double.NaN;
1940	double max = Double.NaN;
1941	Instance instance = null;
1942	int numNotMissing = 0;
1943	if ((minMax == null) \|\| (minMax.length < 2)) {
1944	throw new Exception("Error in Program, privat method getMinMax");
1945	}
1946
1947	Enumeration enumInst = inst.enumerateInstances();
1948	if (enumInst.hasMoreElements()) {
1949	do {
1950	instance = (Instance) enumInst.nextElement();
1951	} while (instance.isMissing(attrIndex) && (enumInst.hasMoreElements()));
1952
1953	// add values if not missing
1954	if (!instance.isMissing(attrIndex)) {
1955	numNotMissing++;
1956	min = instance.value(attrIndex);
1957	max = instance.value(attrIndex);
1958	}
1959	while (enumInst.hasMoreElements()) {
1960	instance = (Instance) enumInst.nextElement();
1961	if (!instance.isMissing(attrIndex)) {
1962	numNotMissing++;
1963	if (instance.value(attrIndex) < min) {
1964	min = (instance.value(attrIndex));
1965	} else {
1966	if (instance.value(attrIndex) > max) {
1967	max = (instance.value(attrIndex));
1968	}
1969	}
1970	}
1971	}
1972	}
1973	minMax[0] = min;
1974	minMax[1] = max;
1975	return numNotMissing;
1976	}
1977
1978	/**
1979	* Print the probabilities after testing
1980	* @param probs vector with probability values
1981	* @return string with probability values printed
1982	*/
1983	private String probsToString(Vector probs) {
1984	StringBuffer txt = new StringBuffer (" ");
1985	for (int i = 0; i < probs.size(); i++) {
1986	txt.append("" + ((Double)(probs.elementAt(i))).doubleValue() + " ");
1987	}
1988	return txt.toString();
1989	}
1990
1991	/**
1992	* Provides a hook for derived classes to further modify the data.
1993	*
1994	* @param data the data to process
1995	* @return the processed data
1996	* @see #m_PostProcessor
1997	*/
1998	protected Instances process(Instances data) {
1999	if (getPostProcessor() == null)
2000	return data;
2001	else
2002	return getPostProcessor().process(data);
2003	}
2004
2005	/**
2006	* Print out a short summary string for the dataset characteristics
2007	*
2008	* @param attrTypes the attribute types used (NUMERIC, NOMINAL, etc.)
2009	* @param classType the class type (NUMERIC, NOMINAL, etc.)
2010	*/
2011	protected void printAttributeSummary(AttrTypes attrTypes, int classType) {
2012
2013	String str = "";
2014
2015	if (attrTypes.numeric)
2016	str += " numeric";
2017
2018	if (attrTypes.nominal) {
2019	if (str.length() > 0)
2020	str += " &";
2021	str += " nominal";
2022	}
2023
2024	if (attrTypes.string) {
2025	if (str.length() > 0)
2026	str += " &";
2027	str += " string";
2028	}
2029
2030	if (attrTypes.date) {
2031	if (str.length() > 0)
2032	str += " &";
2033	str += " date";
2034	}
2035
2036	if (attrTypes.relational) {
2037	if (str.length() > 0)
2038	str += " &";
2039	str += " relational";
2040	}
2041
2042	str += " attributes)";
2043
2044	switch (classType) {
2045	case Attribute.NUMERIC:
2046	str = " (numeric class," + str;
2047	break;
2048	case Attribute.NOMINAL:
2049	str = " (nominal class," + str;
2050	break;
2051	case Attribute.STRING:
2052	str = " (string class," + str;
2053	break;
2054	case Attribute.DATE:
2055	str = " (date class," + str;
2056	break;
2057	case Attribute.RELATIONAL:
2058	str = " (relational class," + str;
2059	break;
2060	}
2061
2062	print(str);
2063	}
2064
2065	/**
2066	* Print out a short summary string for the dataset characteristics
2067	*
2068	* @param attrType the attribute type (NUMERIC, NOMINAL, etc.)
2069	* @param classType the class type (NUMERIC, NOMINAL, etc.)
2070	*/
2071	protected void printAttributeSummary(int attrType, int classType) {
2072
2073	String str = "";
2074
2075	switch (attrType) {
2076	case Attribute.NUMERIC:
2077	str = " numeric" + str;
2078	break;
2079	case Attribute.NOMINAL:
2080	str = " nominal" + str;
2081	break;
2082	case Attribute.STRING:
2083	str = " string" + str;
2084	break;
2085	case Attribute.DATE:
2086	str = " date" + str;
2087	break;
2088	case Attribute.RELATIONAL:
2089	str = " relational" + str;
2090	break;
2091	}
2092	str += " attribute(s))";
2093
2094	switch (classType) {
2095	case Attribute.NUMERIC:
2096	str = " (numeric class," + str;
2097	break;
2098	case Attribute.NOMINAL:
2099	str = " (nominal class," + str;
2100	break;
2101	case Attribute.STRING:
2102	str = " (string class," + str;
2103	break;
2104	case Attribute.DATE:
2105	str = " (date class," + str;
2106	break;
2107	case Attribute.RELATIONAL:
2108	str = " (relational class," + str;
2109	break;
2110	}
2111
2112	print(str);
2113	}
2114
2115	/**
2116	* Returns the revision string.
2117	*
2118	* @return the revision
2119	*/
2120	public String getRevision() {
2121	return RevisionUtils.extract("$Revision: 4997 $");
2122	}
2123
2124	/**
2125	* Test method for this class
2126	*
2127	* @param args the commandline parameters
2128	*/
2129	public static void main(String [] args) {
2130	try {
2131	CheckEstimator check = new CheckEstimator();
2132
2133	try {
2134	check.setOptions(args);
2135	Utils.checkForRemainingOptions(args);
2136	} catch (Exception ex) {
2137	String result = ex.getMessage() + "\n\n" + check.getClass().getName().replaceAll(".*\\.", "") + " Options:\n\n";
2138	Enumeration enu = check.listOptions();
2139	while (enu.hasMoreElements()) {
2140	Option option = (Option) enu.nextElement();
2141	result += option.synopsis() + "\n" + option.description() + "\n";
2142	}
2143	throw new Exception(result);
2144	}
2145
2146	check.doTests();
2147	} catch (Exception ex) {
2148	System.err.println(ex.getMessage());
2149	}
2150	}
2151	}
2152

Note: See TracBrowser for help on using the repository browser.

Download in other formats: