Context Navigation

source: src/main/java/weka/clusterers/CheckClusterer.java @ 22

Last change on this file since 22 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 45.0 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* CheckClusterer.java
19	* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.clusterers;
24
25	import weka.core.CheckScheme;
26	import weka.core.FastVector;
27	import weka.core.Instance;
28	import weka.core.Instances;
29	import weka.core.MultiInstanceCapabilitiesHandler;
30	import weka.core.Option;
31	import weka.core.OptionHandler;
32	import weka.core.RevisionUtils;
33	import weka.core.SerializationHelper;
34	import weka.core.TestInstances;
35	import weka.core.Utils;
36	import weka.core.WeightedInstancesHandler;
37
38	import java.util.Enumeration;
39	import java.util.Random;
40	import java.util.Vector;
41
42	/**
43	* Class for examining the capabilities and finding problems with
44	* clusterers. If you implement a clusterer using the WEKA libraries,
45	* you should run the checks on it to ensure robustness and correct
46	* operation. Passing all the tests of this object does not mean
47	* bugs in the clusterer don't exist, but this will help find some
48	* common ones. <p/>
49	*
50	* Typical usage: <p/>
51	* <code>java weka.clusterers.CheckClusterer -W clusterer_name
52	* -- clusterer_options </code><p/>
53	*
54	* CheckClusterer reports on the following:
55	* <ul>
56	* <li> Clusterer abilities
57	* <ul>
58	* <li> Possible command line options to the clusterer </li>
59	* <li> Whether the clusterer can predict nominal, numeric, string,
60	* date or relational class attributes.</li>
61	* <li> Whether the clusterer can handle numeric predictor attributes </li>
62	* <li> Whether the clusterer can handle nominal predictor attributes </li>
63	* <li> Whether the clusterer can handle string predictor attributes </li>
64	* <li> Whether the clusterer can handle date predictor attributes </li>
65	* <li> Whether the clusterer can handle relational predictor attributes </li>
66	* <li> Whether the clusterer can handle multi-instance data </li>
67	* <li> Whether the clusterer can handle missing predictor values </li>
68	* <li> Whether the clusterer can handle instance weights </li>
69	* </ul>
70	* </li>
71	* <li> Correct functioning
72	* <ul>
73	* <li> Correct initialisation during buildClusterer (i.e. no result
74	* changes when buildClusterer called repeatedly) </li>
75	* <li> Whether the clusterer alters the data passed to it
76	* (number of instances, instance order, instance weights, etc) </li>
77	* </ul>
78	* </li>
79	* <li> Degenerate cases
80	* <ul>
81	* <li> building clusterer with zero training instances </li>
82	* <li> all but one predictor attribute values missing </li>
83	* <li> all predictor attribute values missing </li>
84	* <li> all but one class values missing </li>
85	* <li> all class values missing </li>
86	* </ul>
87	* </li>
88	* </ul>
89	* Running CheckClusterer with the debug option set will output the
90	* training dataset for any failed tests.<p/>
91	*
92	* The <code>weka.clusterers.AbstractClustererTest</code> uses this
93	* class to test all the clusterers. Any changes here, have to be
94	* checked in that abstract test class, too. <p/>
95	*
96	<!-- options-start -->
97	* Valid options are: <p/>
98	*
99	* <pre> -D
100	* Turn on debugging output.</pre>
101	*
102	* <pre> -S
103	* Silent mode - prints nothing to stdout.</pre>
104	*
105	* <pre> -N <num>
106	* The number of instances in the datasets (default 20).</pre>
107	*
108	* <pre> -nominal <num>
109	* The number of nominal attributes (default 2).</pre>
110	*
111	* <pre> -nominal-values <num>
112	* The number of values for nominal attributes (default 1).</pre>
113	*
114	* <pre> -numeric <num>
115	* The number of numeric attributes (default 1).</pre>
116	*
117	* <pre> -string <num>
118	* The number of string attributes (default 1).</pre>
119	*
120	* <pre> -date <num>
121	* The number of date attributes (default 1).</pre>
122	*
123	* <pre> -relational <num>
124	* The number of relational attributes (default 1).</pre>
125	*
126	* <pre> -num-instances-relational <num>
127	* The number of instances in relational/bag attributes (default 10).</pre>
128	*
129	* <pre> -words <comma-separated-list>
130	* The words to use in string attributes.</pre>
131	*
132	* <pre> -word-separators <chars>
133	* The word separators to use in string attributes.</pre>
134	*
135	* <pre> -W
136	* Full name of the clusterer analyzed.
137	* eg: weka.clusterers.SimpleKMeans
138	* (default weka.clusterers.SimpleKMeans)</pre>
139	*
140	* <pre>
141	* Options specific to clusterer weka.clusterers.SimpleKMeans:
142	* </pre>
143	*
144	* <pre> -N <num>
145	* number of clusters.
146	* (default 2).</pre>
147	*
148	* <pre> -V
149	* Display std. deviations for centroids.
150	* </pre>
151	*
152	* <pre> -M
153	* Replace missing values with mean/mode.
154	* </pre>
155	*
156	* <pre> -S <num>
157	* Random number seed.
158	* (default 10)</pre>
159	*
160	<!-- options-end -->
161	*
162	* Options after -- are passed to the designated clusterer.<p/>
163	*
164	* @author Len Trigg (trigg@cs.waikato.ac.nz)
165	* @author FracPete (fracpete at waikato dot ac dot nz)
166	* @version $Revision: 1.11 $
167	* @see TestInstances
168	*/
169	public class CheckClusterer
170	extends CheckScheme {
171
172	/*
173	* Note about test methods:
174	* - methods return array of booleans
175	* - first index: success or not
176	* - second index: acceptable or not (e.g., Exception is OK)
177	*
178	* FracPete (fracpete at waikato dot ac dot nz)
179	*/
180
181	/** The clusterer to be examined; initialised to a SimpleKMeans instance. */
182	protected Clusterer m_Clusterer = new SimpleKMeans();
183
184	/**
185	* default constructor
186	*/
public CheckClusterer() {
  // The superclass constructor runs implicitly; afterwards the number of
  // instances is set to 40, the same fallback setOptions uses when no -N
  // option is supplied.
  setNumInstances(40);
}
192
193	/**
194	* Returns an enumeration describing the available options.
195	*
196	* @return an enumeration of all the available options.
197	*/
198	public Enumeration listOptions() {
199	Vector result = new Vector();
200
201	Enumeration en = super.listOptions();
202	while (en.hasMoreElements())
203	result.addElement(en.nextElement());
204
205	result.addElement(new Option(
206	"\tFull name of the clusterer analyzed.\n"
207	+"\teg: weka.clusterers.SimpleKMeans\n"
208	+ "\t(default weka.clusterers.SimpleKMeans)",
209	"W", 1, "-W"));
210
211	if ((m_Clusterer != null)
212	&& (m_Clusterer instanceof OptionHandler)) {
213	result.addElement(new Option("", "", 0,
214	"\nOptions specific to clusterer "
215	+ m_Clusterer.getClass().getName()
216	+ ":"));
217	Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();
218	while (enu.hasMoreElements())
219	result.addElement(enu.nextElement());
220	}
221
222	return result.elements();
223	}
224
225	/**
226	* Parses a given list of options. <p/>
227	*
228	<!-- options-start -->
229	* Valid options are: <p/>
230	*
231	* <pre> -D
232	* Turn on debugging output.</pre>
233	*
234	* <pre> -S
235	* Silent mode - prints nothing to stdout.</pre>
236	*
237	* <pre> -N <num>
238	* The number of instances in the datasets (default 20).</pre>
239	*
240	* <pre> -nominal <num>
241	* The number of nominal attributes (default 2).</pre>
242	*
243	* <pre> -nominal-values <num>
244	* The number of values for nominal attributes (default 1).</pre>
245	*
246	* <pre> -numeric <num>
247	* The number of numeric attributes (default 1).</pre>
248	*
249	* <pre> -string <num>
250	* The number of string attributes (default 1).</pre>
251	*
252	* <pre> -date <num>
253	* The number of date attributes (default 1).</pre>
254	*
255	* <pre> -relational <num>
256	* The number of relational attributes (default 1).</pre>
257	*
258	* <pre> -num-instances-relational <num>
259	* The number of instances in relational/bag attributes (default 10).</pre>
260	*
261	* <pre> -words <comma-separated-list>
262	* The words to use in string attributes.</pre>
263	*
264	* <pre> -word-separators <chars>
265	* The word separators to use in string attributes.</pre>
266	*
267	* <pre> -W
268	* Full name of the clusterer analyzed.
269	* eg: weka.clusterers.SimpleKMeans
270	* (default weka.clusterers.SimpleKMeans)</pre>
271	*
272	* <pre>
273	* Options specific to clusterer weka.clusterers.SimpleKMeans:
274	* </pre>
275	*
276	* <pre> -N <num>
277	* number of clusters.
278	* (default 2).</pre>
279	*
280	* <pre> -V
281	* Display std. deviations for centroids.
282	* </pre>
283	*
284	* <pre> -M
285	* Replace missing values with mean/mode.
286	* </pre>
287	*
288	* <pre> -S <num>
289	* Random number seed.
290	* (default 10)</pre>
291	*
292	<!-- options-end -->
293	*
294	* @param options the list of options as an array of strings
295	* @throws Exception if an option is not supported
296	*/
297	public void setOptions(String[] options) throws Exception {
298	String tmpStr;
299
300	tmpStr = Utils.getOption('N', options);
301
302	super.setOptions(options);
303
304	if (tmpStr.length() != 0)
305	setNumInstances(Integer.parseInt(tmpStr));
306	else
307	setNumInstances(40);
308
309	tmpStr = Utils.getOption('W', options);
310	if (tmpStr.length() == 0)
311	tmpStr = weka.clusterers.SimpleKMeans.class.getName();
312	setClusterer(
313	(Clusterer) forName(
314	"weka.clusterers",
315	Clusterer.class,
316	tmpStr,
317	Utils.partitionOptions(options)));
318	}
319
320	/**
321	* Gets the current settings of the CheckClusterer.
322	*
323	* @return an array of strings suitable for passing to setOptions
324	*/
325	public String[] getOptions() {
326	Vector result;
327	String[] options;
328	int i;
329
330	result = new Vector();
331
332	options = super.getOptions();
333	for (i = 0; i < options.length; i++)
334	result.add(options[i]);
335
336	if (getClusterer() != null) {
337	result.add("-W");
338	result.add(getClusterer().getClass().getName());
339	}
340
341	if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler))
342	options = ((OptionHandler) m_Clusterer).getOptions();
343	else
344	options = new String[0];
345
346	if (options.length > 0) {
347	result.add("--");
348	for (i = 0; i < options.length; i++)
349	result.add(options[i]);
350	}
351
352	return (String[]) result.toArray(new String[result.size()]);
353	}
354
355	/**
356	* Begin the tests, reporting results to System.out
357	*/
358	public void doTests() {
359
360	if (getClusterer() == null) {
361	println("\n=== No clusterer set ===");
362	return;
363	}
364	println("\n=== Check on Clusterer: "
365	+ getClusterer().getClass().getName()
366	+ " ===\n");
367
368	// Start tests
369	println("--> Checking for interfaces");
370	canTakeOptions();
371	boolean updateable = updateableClusterer()[0];
372	boolean weightedInstancesHandler = weightedInstancesHandler()[0];
373	boolean multiInstanceHandler = multiInstanceHandler()[0];
374	println("--> Clusterer tests");
375	declaresSerialVersionUID();
376	runTests(weightedInstancesHandler, multiInstanceHandler, updateable);
377	}
378
379	/**
380	* Set the clusterer for testing.
381	*
382	* @param newClusterer the Clusterer to use.
383	*/
384	public void setClusterer(Clusterer newClusterer) {
385	m_Clusterer = newClusterer;
386	}
387
388	/**
389	* Get the clusterer that is being examined.
390	*
391	* @return the clusterer that is being examined
392	*/
393	public Clusterer getClusterer() {
394	return m_Clusterer;
395	}
396
397	/**
398	* Run a battery of tests
399	*
400	* @param weighted true if the clusterer says it handles weights
401	* @param multiInstance true if the clusterer is a multi-instance clusterer
402	* @param updateable true if the clusterer is updateable
403	*/
404	protected void runTests(boolean weighted, boolean multiInstance, boolean updateable) {
405
406	boolean PNom = canPredict(true, false, false, false, false, multiInstance)[0];
407	boolean PNum = canPredict(false, true, false, false, false, multiInstance)[0];
408	boolean PStr = canPredict(false, false, true, false, false, multiInstance)[0];
409	boolean PDat = canPredict(false, false, false, true, false, multiInstance)[0];
410	boolean PRel;
411	if (!multiInstance)
412	PRel = canPredict(false, false, false, false, true, multiInstance)[0];
413	else
414	PRel = false;
415
416	if (PNom \|\| PNum \|\| PStr \|\| PDat \|\| PRel) {
417	if (weighted)
418	instanceWeights(PNom, PNum, PStr, PDat, PRel, multiInstance);
419
420	canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel, multiInstance);
421	boolean handleMissingPredictors = canHandleMissing(PNom, PNum, PStr, PDat, PRel,
422	multiInstance, true, 20)[0];
423	if (handleMissingPredictors)
424	canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 100);
425
426	correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel, multiInstance);
427	datasetIntegrity(PNom, PNum, PStr, PDat, PRel, multiInstance, handleMissingPredictors);
428	if (updateable)
429	updatingEquality(PNom, PNum, PStr, PDat, PRel, multiInstance);
430	}
431	}
432
433	/**
434	* Checks whether the scheme can take command line options.
435	*
436	* @return index 0 is true if the clusterer can take options
437	*/
438	protected boolean[] canTakeOptions() {
439
440	boolean[] result = new boolean[2];
441
442	print("options...");
443	if (m_Clusterer instanceof OptionHandler) {
444	println("yes");
445	if (m_Debug) {
446	println("\n=== Full report ===");
447	Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();
448	while (enu.hasMoreElements()) {
449	Option option = (Option) enu.nextElement();
450	print(option.synopsis() + "\n"
451	+ option.description() + "\n");
452	}
453	println("\n");
454	}
455	result[0] = true;
456	}
457	else {
458	println("no");
459	result[0] = false;
460	}
461
462	return result;
463	}
464
465	/**
466	* Checks whether the scheme can build models incrementally.
467	*
468	* @return index 0 is true if the clusterer can train incrementally
469	*/
470	protected boolean[] updateableClusterer() {
471
472	boolean[] result = new boolean[2];
473
474	print("updateable clusterer...");
475	if (m_Clusterer instanceof UpdateableClusterer) {
476	println("yes");
477	result[0] = true;
478	}
479	else {
480	println("no");
481	result[0] = false;
482	}
483
484	return result;
485	}
486
487	/**
488	* Checks whether the scheme says it can handle instance weights.
489	*
490	* @return index 0 is true if the clusterer handles instance weights
491	*/
492	protected boolean[] weightedInstancesHandler() {
493
494	boolean[] result = new boolean[2];
495
496	print("weighted instances clusterer...");
497	if (m_Clusterer instanceof WeightedInstancesHandler) {
498	println("yes");
499	result[0] = true;
500	}
501	else {
502	println("no");
503	result[0] = false;
504	}
505
506	return result;
507	}
508
509	/**
510	* Checks whether the scheme handles multi-instance data.
511	*
512	* @return index 0 is true if the clusterer handles multi-instance data
513	*/
514	protected boolean[] multiInstanceHandler() {
515	boolean[] result = new boolean[2];
516
517	print("multi-instance clusterer...");
518	if (m_Clusterer instanceof MultiInstanceCapabilitiesHandler) {
519	println("yes");
520	result[0] = true;
521	}
522	else {
523	println("no");
524	result[0] = false;
525	}
526
527	return result;
528	}
529
530	/**
531	* tests for a serialVersionUID. Fails in case the scheme doesn't declare
532	* a UID.
533	*
534	* @return index 0 is true if the scheme declares a UID
535	*/
536	protected boolean[] declaresSerialVersionUID() {
537	boolean[] result = new boolean[2];
538
539	print("serialVersionUID...");
540
541	result[0] = !SerializationHelper.needsUID(m_Clusterer.getClass());
542
543	if (result[0])
544	println("yes");
545	else
546	println("no");
547
548	return result;
549	}
550
551	/**
552	* Checks basic prediction of the scheme, for simple non-troublesome
553	* datasets.
554	*
555	* @param nominalPredictor if true use nominal predictor attributes
556	* @param numericPredictor if true use numeric predictor attributes
557	* @param stringPredictor if true use string predictor attributes
558	* @param datePredictor if true use date predictor attributes
559	* @param relationalPredictor if true use relational predictor attributes
560	* @param multiInstance whether multi-instance is needed
561	* @return index 0 is true if the test was passed, index 1 is true if test
562	* was acceptable
563	*/
564	protected boolean[] canPredict(
565	boolean nominalPredictor,
566	boolean numericPredictor,
567	boolean stringPredictor,
568	boolean datePredictor,
569	boolean relationalPredictor,
570	boolean multiInstance) {
571
572	print("basic predict");
573	printAttributeSummary(
574	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
575	print("...");
576	FastVector accepts = new FastVector();
577	accepts.addElement("unary");
578	accepts.addElement("binary");
579	accepts.addElement("nominal");
580	accepts.addElement("numeric");
581	accepts.addElement("string");
582	accepts.addElement("date");
583	accepts.addElement("relational");
584	accepts.addElement("multi-instance");
585	accepts.addElement("not in classpath");
586	int numTrain = getNumInstances(), missingLevel = 0;
587	boolean predictorMissing = false;
588
589	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
590	datePredictor, relationalPredictor,
591	multiInstance,
592	missingLevel, predictorMissing,
593	numTrain,
594	accepts);
595	}
596
597	/**
598	* Checks whether the scheme can handle zero training instances.
599	*
600	* @param nominalPredictor if true use nominal predictor attributes
601	* @param numericPredictor if true use numeric predictor attributes
602	* @param stringPredictor if true use string predictor attributes
603	* @param datePredictor if true use date predictor attributes
604	* @param relationalPredictor if true use relational predictor attributes
605	* @param multiInstance whether multi-instance is needed
606	* @return index 0 is true if the test was passed, index 1 is true if test
607	* was acceptable
608	*/
609	protected boolean[] canHandleZeroTraining(
610	boolean nominalPredictor,
611	boolean numericPredictor,
612	boolean stringPredictor,
613	boolean datePredictor,
614	boolean relationalPredictor,
615	boolean multiInstance) {
616
617	print("handle zero training instances");
618	printAttributeSummary(
619	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
620	print("...");
621	FastVector accepts = new FastVector();
622	accepts.addElement("train");
623	accepts.addElement("value");
624	int numTrain = 0, missingLevel = 0;
625	boolean predictorMissing = false;
626
627	return runBasicTest(
628	nominalPredictor, numericPredictor, stringPredictor,
629	datePredictor, relationalPredictor,
630	multiInstance,
631	missingLevel, predictorMissing,
632	numTrain,
633	accepts);
634	}
635
636	/**
637	* Checks whether the scheme correctly initialises models when
638	* buildClusterer is called. This test calls buildClusterer with
639	* one training dataset. buildClusterer is then called on a training set
640	* with different structure, and then again with the original training set.
641	* If the equals method of the ClusterEvaluation class returns
642	* false, this is noted as incorrect build initialisation.
643	*
644	* @param nominalPredictor if true use nominal predictor attributes
645	* @param numericPredictor if true use numeric predictor attributes
646	* @param stringPredictor if true use string predictor attributes
647	* @param datePredictor if true use date predictor attributes
648	* @param relationalPredictor if true use relational predictor attributes
649	* @param multiInstance whether multi-instance is needed
650	* @return index 0 is true if the test was passed
651	*/
652	protected boolean[] correctBuildInitialisation(
653	boolean nominalPredictor,
654	boolean numericPredictor,
655	boolean stringPredictor,
656	boolean datePredictor,
657	boolean relationalPredictor,
658	boolean multiInstance) {
659
660	boolean[] result = new boolean[2];
661
662	print("correct initialisation during buildClusterer");
663	printAttributeSummary(
664	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
665	print("...");
666	int numTrain = getNumInstances(), missingLevel = 0;
667	boolean predictorMissing = false;
668
669	Instances train1 = null;
670	Instances train2 = null;
671	Clusterer clusterer = null;
672	ClusterEvaluation evaluation1A = null;
673	ClusterEvaluation evaluation1B = null;
674	ClusterEvaluation evaluation2 = null;
675	boolean built = false;
676	int stage = 0;
677	try {
678
679	// Make two train sets with different numbers of attributes
680	train1 = makeTestDataset(42, numTrain,
681	nominalPredictor ? getNumNominal() : 0,
682	numericPredictor ? getNumNumeric() : 0,
683	stringPredictor ? getNumString() : 0,
684	datePredictor ? getNumDate() : 0,
685	relationalPredictor ? getNumRelational() : 0,
686	multiInstance);
687	train2 = makeTestDataset(84, numTrain,
688	nominalPredictor ? getNumNominal() + 1 : 0,
689	numericPredictor ? getNumNumeric() + 1 : 0,
690	stringPredictor ? getNumString() : 0,
691	datePredictor ? getNumDate() : 0,
692	relationalPredictor ? getNumRelational() : 0,
693	multiInstance);
694	if (nominalPredictor && !multiInstance) {
695	train1.deleteAttributeAt(0);
696	train2.deleteAttributeAt(0);
697	}
698	if (missingLevel > 0) {
699	addMissing(train1, missingLevel, predictorMissing);
700	addMissing(train2, missingLevel, predictorMissing);
701	}
702
703	clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
704	evaluation1A = new ClusterEvaluation();
705	evaluation1B = new ClusterEvaluation();
706	evaluation2 = new ClusterEvaluation();
707	} catch (Exception ex) {
708	throw new Error("Error setting up for tests: " + ex.getMessage());
709	}
710	try {
711	stage = 0;
712	clusterer.buildClusterer(train1);
713	built = true;
714	evaluation1A.setClusterer(clusterer);
715	evaluation1A.evaluateClusterer(train1);
716
717	stage = 1;
718	built = false;
719	clusterer.buildClusterer(train2);
720	built = true;
721	evaluation2.setClusterer(clusterer);
722	evaluation2.evaluateClusterer(train2);
723
724	stage = 2;
725	built = false;
726	clusterer.buildClusterer(train1);
727	built = true;
728	evaluation1B.setClusterer(clusterer);
729	evaluation1B.evaluateClusterer(train1);
730
731	stage = 3;
732	if (!evaluation1A.equals(evaluation1B)) {
733	if (m_Debug) {
734	println("\n=== Full report ===\n");
735	println("First buildClusterer()");
736	println(evaluation1A.clusterResultsToString() + "\n\n");
737	println("Second buildClusterer()");
738	println(evaluation1B.clusterResultsToString() + "\n\n");
739	}
740	throw new Exception("Results differ between buildClusterer calls");
741	}
742	println("yes");
743	result[0] = true;
744
745	if (false && m_Debug) {
746	println("\n=== Full report ===\n");
747	println("First buildClusterer()");
748	println(evaluation1A.clusterResultsToString() + "\n\n");
749	println("Second buildClusterer()");
750	println(evaluation1B.clusterResultsToString() + "\n\n");
751	}
752	}
753	catch (Exception ex) {
754	println("no");
755	result[0] = false;
756	if (m_Debug) {
757	println("\n=== Full Report ===");
758	print("Problem during");
759	if (built) {
760	print(" testing");
761	} else {
762	print(" training");
763	}
764	switch (stage) {
765	case 0:
766	print(" of dataset 1");
767	break;
768	case 1:
769	print(" of dataset 2");
770	break;
771	case 2:
772	print(" of dataset 1 (2nd build)");
773	break;
774	case 3:
775	print(", comparing results from builds of dataset 1");
776	break;
777	}
778	println(": " + ex.getMessage() + "\n");
779	println("here are the datasets:\n");
780	println("=== Train1 Dataset ===\n"
781	+ train1.toString() + "\n");
782	println("=== Train2 Dataset ===\n"
783	+ train2.toString() + "\n");
784	}
785	}
786
787	return result;
788	}
789
790	/**
791	* Checks basic missing value handling of the scheme. If the missing
792	* values cause an exception to be thrown by the scheme, this will be
793	* recorded.
794	*
795	* @param nominalPredictor if true use nominal predictor attributes
796	* @param numericPredictor if true use numeric predictor attributes
797	* @param stringPredictor if true use string predictor attributes
798	* @param datePredictor if true use date predictor attributes
799	* @param relationalPredictor if true use relational predictor attributes
800	* @param multiInstance whether multi-instance is needed
801	* @param predictorMissing true if the missing values may be in
802	* the predictors
803	* @param missingLevel the percentage of missing values
804	* @return index 0 is true if the test was passed, index 1 is true if test
805	* was acceptable
806	*/
807	protected boolean[] canHandleMissing(
808	boolean nominalPredictor,
809	boolean numericPredictor,
810	boolean stringPredictor,
811	boolean datePredictor,
812	boolean relationalPredictor,
813	boolean multiInstance,
814	boolean predictorMissing,
815	int missingLevel) {
816
817	if (missingLevel == 100)
818	print("100% ");
819	print("missing");
820	if (predictorMissing) {
821	print(" predictor");
822	}
823	print(" values");
824	printAttributeSummary(
825	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
826	print("...");
827	FastVector accepts = new FastVector();
828	accepts.addElement("missing");
829	accepts.addElement("value");
830	accepts.addElement("train");
831	int numTrain = getNumInstances();
832
833	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
834	datePredictor, relationalPredictor,
835	multiInstance,
836	missingLevel, predictorMissing,
837	numTrain,
838	accepts);
839	}
840
841	/**
842	* Checks whether the clusterer can handle instance weights.
843	* This test compares the clusterer performance on two datasets
844	* that are identical except for the training weights. If the
845	* results change, then the clusterer must be using the weights. It
846	* may be possible to get a false positive from this test if the
847	* weight changes aren't significant enough to induce a change
848	* in clusterer performance (but the weights are chosen to minimize
849	* the likelihood of this).
850	*
851	* @param nominalPredictor if true use nominal predictor attributes
852	* @param numericPredictor if true use numeric predictor attributes
853	* @param stringPredictor if true use string predictor attributes
854	* @param datePredictor if true use date predictor attributes
855	* @param relationalPredictor if true use relational predictor attributes
856	* @param multiInstance whether multi-instance is needed
857	* @return index 0 true if the test was passed
858	*/
859	protected boolean[] instanceWeights(
860	boolean nominalPredictor,
861	boolean numericPredictor,
862	boolean stringPredictor,
863	boolean datePredictor,
864	boolean relationalPredictor,
865	boolean multiInstance) {
866
867	print("clusterer uses instance weights");
868	printAttributeSummary(
869	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
870	print("...");
871	int numTrain = 2*getNumInstances(), missingLevel = 0;
872	boolean predictorMissing = false;
873
874	boolean[] result = new boolean[2];
875	Instances train = null;
876	Clusterer [] clusterers = null;
877	ClusterEvaluation evaluationB = null;
878	ClusterEvaluation evaluationI = null;
879	boolean built = false;
880	boolean evalFail = false;
881	try {
882	train = makeTestDataset(42, numTrain,
883	nominalPredictor ? getNumNominal() + 1 : 0,
884	numericPredictor ? getNumNumeric() + 1 : 0,
885	stringPredictor ? getNumString() : 0,
886	datePredictor ? getNumDate() : 0,
887	relationalPredictor ? getNumRelational() : 0,
888	multiInstance);
889	if (nominalPredictor && !multiInstance)
890	train.deleteAttributeAt(0);
891	if (missingLevel > 0)
892	addMissing(train, missingLevel, predictorMissing);
893	clusterers = AbstractClusterer.makeCopies(getClusterer(), 2);
894	evaluationB = new ClusterEvaluation();
895	evaluationI = new ClusterEvaluation();
896	clusterers[0].buildClusterer(train);
897	evaluationB.setClusterer(clusterers[0]);
898	} catch (Exception ex) {
899	throw new Error("Error setting up for tests: " + ex.getMessage());
900	}
901	try {
902
903	// Now modify instance weights and re-built/test
904	for (int i = 0; i < train.numInstances(); i++) {
905	train.instance(i).setWeight(0);
906	}
907	Random random = new Random(1);
908	for (int i = 0; i < train.numInstances() / 2; i++) {
909	int inst = Math.abs(random.nextInt()) % train.numInstances();
910	int weight = Math.abs(random.nextInt()) % 10 + 1;
911	train.instance(inst).setWeight(weight);
912	}
913	clusterers[1].buildClusterer(train);
914	built = true;
915	evaluationI.setClusterer(clusterers[1]);
916	if (evaluationB.equals(evaluationI)) {
917	// println("no");
918	evalFail = true;
919	throw new Exception("evalFail");
920	}
921
922	println("yes");
923	result[0] = true;
924	} catch (Exception ex) {
925	println("no");
926	result[0] = false;
927
928	if (m_Debug) {
929	println("\n=== Full Report ===");
930
931	if (evalFail) {
932	println("Results don't differ between non-weighted and "
933	+ "weighted instance models.");
934	println("Here are the results:\n");
935	println("\nboth methods\n");
936	println(evaluationB.clusterResultsToString());
937	} else {
938	print("Problem during");
939	if (built) {
940	print(" testing");
941	} else {
942	print(" training");
943	}
944	println(": " + ex.getMessage() + "\n");
945	}
946	println("Here is the dataset:\n");
947	println("=== Train Dataset ===\n"
948	+ train.toString() + "\n");
949	println("=== Train Weights ===\n");
950	for (int i = 0; i < train.numInstances(); i++) {
951	println(" " + (i + 1)
952	+ " " + train.instance(i).weight());
953	}
954	}
955	}
956
957	return result;
958	}
959
960	/**
961	* Checks whether the scheme alters the training dataset during
962	* training. If the scheme needs to modify the training
963	* data it should take a copy of the training data. Currently checks
964	* for changes to header structure, number of instances, order of
965	* instances, instance weights.
966	*
967	* @param nominalPredictor if true use nominal predictor attributes
968	* @param numericPredictor if true use numeric predictor attributes
969	* @param stringPredictor if true use string predictor attributes
970	* @param datePredictor if true use date predictor attributes
971	* @param relationalPredictor if true use relational predictor attributes
972	* @param multiInstance whether multi-instance is needed
973	* @param predictorMissing true if we know the clusterer can handle
974	* (at least) moderate missing predictor values
975	* @return index 0 is true if the test was passed
976	*/
977	protected boolean[] datasetIntegrity(
978	boolean nominalPredictor,
979	boolean numericPredictor,
980	boolean stringPredictor,
981	boolean datePredictor,
982	boolean relationalPredictor,
983	boolean multiInstance,
984	boolean predictorMissing) {
985
986	print("clusterer doesn't alter original datasets");
987	printAttributeSummary(
988	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
989	print("...");
990	int numTrain = getNumInstances(), missingLevel = 20;
991
992	boolean[] result = new boolean[2];
993	Instances train = null;
994	Clusterer clusterer = null;
995	try {
996	train = makeTestDataset(42, numTrain,
997	nominalPredictor ? getNumNominal() : 0,
998	numericPredictor ? getNumNumeric() : 0,
999	stringPredictor ? getNumString() : 0,
1000	datePredictor ? getNumDate() : 0,
1001	relationalPredictor ? getNumRelational() : 0,
1002	multiInstance);
1003	if (nominalPredictor && !multiInstance)
1004	train.deleteAttributeAt(0);
1005	if (missingLevel > 0)
1006	addMissing(train, missingLevel, predictorMissing);
1007	clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
1008	} catch (Exception ex) {
1009	throw new Error("Error setting up for tests: " + ex.getMessage());
1010	}
1011	try {
1012	Instances trainCopy = new Instances(train);
1013	clusterer.buildClusterer(trainCopy);
1014	compareDatasets(train, trainCopy);
1015
1016	println("yes");
1017	result[0] = true;
1018	} catch (Exception ex) {
1019	println("no");
1020	result[0] = false;
1021
1022	if (m_Debug) {
1023	println("\n=== Full Report ===");
1024	print("Problem during training");
1025	println(": " + ex.getMessage() + "\n");
1026	println("Here is the dataset:\n");
1027	println("=== Train Dataset ===\n"
1028	+ train.toString() + "\n");
1029	}
1030	}
1031
1032	return result;
1033	}
1034
1035	/**
1036	* Checks whether an updateable scheme produces the same model when
1037	* trained incrementally as when batch trained. The model itself
1038	* cannot be compared, so we compare the evaluation on test data
1039	* for both models. It is possible to get a false positive on this
1040	* test (likelihood depends on the classifier).
1041	*
1042	* @param nominalPredictor if true use nominal predictor attributes
1043	* @param numericPredictor if true use numeric predictor attributes
1044	* @param stringPredictor if true use string predictor attributes
1045	* @param datePredictor if true use date predictor attributes
1046	* @param relationalPredictor if true use relational predictor attributes
1047	* @param multiInstance whether multi-instance is needed
1048	* @return index 0 is true if the test was passed
1049	*/
1050	protected boolean[] updatingEquality(
1051	boolean nominalPredictor,
1052	boolean numericPredictor,
1053	boolean stringPredictor,
1054	boolean datePredictor,
1055	boolean relationalPredictor,
1056	boolean multiInstance) {
1057
1058	print("incremental training produces the same results"
1059	+ " as batch training");
1060	printAttributeSummary(
1061	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
1062	print("...");
1063	int numTrain = getNumInstances(), missingLevel = 0;
1064	boolean predictorMissing = false, classMissing = false;
1065
1066	boolean[] result = new boolean[2];
1067	Instances train = null;
1068	Clusterer[] clusterers = null;
1069	ClusterEvaluation evaluationB = null;
1070	ClusterEvaluation evaluationI = null;
1071	boolean built = false;
1072	try {
1073	train = makeTestDataset(42, numTrain,
1074	nominalPredictor ? getNumNominal() : 0,
1075	numericPredictor ? getNumNumeric() : 0,
1076	stringPredictor ? getNumString() : 0,
1077	datePredictor ? getNumDate() : 0,
1078	relationalPredictor ? getNumRelational() : 0,
1079	multiInstance);
1080	if (missingLevel > 0)
1081	addMissing(train, missingLevel, predictorMissing, classMissing);
1082	clusterers = AbstractClusterer.makeCopies(getClusterer(), 2);
1083	evaluationB = new ClusterEvaluation();
1084	evaluationI = new ClusterEvaluation();
1085	clusterers[0].buildClusterer(train);
1086	evaluationB.setClusterer(clusterers[0]);
1087	} catch (Exception ex) {
1088	throw new Error("Error setting up for tests: " + ex.getMessage());
1089	}
1090	try {
1091	clusterers[1].buildClusterer(new Instances(train, 0));
1092	for (int i = 0; i < train.numInstances(); i++) {
1093	((UpdateableClusterer)clusterers[1]).updateClusterer(
1094	train.instance(i));
1095	}
1096	built = true;
1097	evaluationI.setClusterer(clusterers[1]);
1098	if (!evaluationB.equals(evaluationI)) {
1099	println("no");
1100	result[0] = false;
1101
1102	if (m_Debug) {
1103	println("\n=== Full Report ===");
1104	println("Results differ between batch and "
1105	+ "incrementally built models.\n"
1106	+ "Depending on the classifier, this may be OK");
1107	println("Here are the results:\n");
1108	println("\nbatch built results\n" + evaluationB.clusterResultsToString());
1109	println("\nincrementally built results\n" + evaluationI.clusterResultsToString());
1110	println("Here are the datasets:\n");
1111	println("=== Train Dataset ===\n"
1112	+ train.toString() + "\n");
1113	}
1114	}
1115	else {
1116	println("yes");
1117	result[0] = true;
1118	}
1119	} catch (Exception ex) {
1120	result[0] = false;
1121
1122	print("Problem during");
1123	if (built)
1124	print(" testing");
1125	else
1126	print(" training");
1127	println(": " + ex.getMessage() + "\n");
1128	}
1129
1130	return result;
1131	}
1132
1133	/**
1134	* Runs a text on the datasets with the given characteristics.
1135	*
1136	* @param nominalPredictor if true use nominal predictor attributes
1137	* @param numericPredictor if true use numeric predictor attributes
1138	* @param stringPredictor if true use string predictor attributes
1139	* @param datePredictor if true use date predictor attributes
1140	* @param relationalPredictor if true use relational predictor attributes
1141	* @param multiInstance whether multi-instance is needed
1142	* @param missingLevel the percentage of missing values
1143	* @param predictorMissing true if the missing values may be in
1144	* the predictors
1145	* @param numTrain the number of instances in the training set
1146	* @param accepts the acceptable string in an exception
1147	* @return index 0 is true if the test was passed, index 1 is true if test
1148	* was acceptable
1149	*/
1150	protected boolean[] runBasicTest(boolean nominalPredictor,
1151	boolean numericPredictor,
1152	boolean stringPredictor,
1153	boolean datePredictor,
1154	boolean relationalPredictor,
1155	boolean multiInstance,
1156	int missingLevel,
1157	boolean predictorMissing,
1158	int numTrain,
1159	FastVector accepts) {
1160
1161	boolean[] result = new boolean[2];
1162	Instances train = null;
1163	Clusterer clusterer = null;
1164	try {
1165	train = makeTestDataset(42, numTrain,
1166	nominalPredictor ? getNumNominal() : 0,
1167	numericPredictor ? getNumNumeric() : 0,
1168	stringPredictor ? getNumString() : 0,
1169	datePredictor ? getNumDate() : 0,
1170	relationalPredictor ? getNumRelational() : 0,
1171	multiInstance);
1172	if (nominalPredictor && !multiInstance)
1173	train.deleteAttributeAt(0);
1174	if (missingLevel > 0)
1175	addMissing(train, missingLevel, predictorMissing);
1176	clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
1177	} catch (Exception ex) {
1178	ex.printStackTrace();
1179	throw new Error("Error setting up for tests: " + ex.getMessage());
1180	}
1181	try {
1182	clusterer.buildClusterer(train);
1183	println("yes");
1184	result[0] = true;
1185	}
1186	catch (Exception ex) {
1187	boolean acceptable = false;
1188	String msg = ex.getMessage().toLowerCase();
1189	for (int i = 0; i < accepts.size(); i++) {
1190	if (msg.indexOf((String)accepts.elementAt(i)) >= 0) {
1191	acceptable = true;
1192	}
1193	}
1194
1195	println("no" + (acceptable ? " (OK error message)" : ""));
1196	result[1] = acceptable;
1197
1198	if (m_Debug) {
1199	println("\n=== Full Report ===");
1200	print("Problem during training");
1201	println(": " + ex.getMessage() + "\n");
1202	if (!acceptable) {
1203	if (accepts.size() > 0) {
1204	print("Error message doesn't mention ");
1205	for (int i = 0; i < accepts.size(); i++) {
1206	if (i != 0) {
1207	print(" or ");
1208	}
1209	print('"' + (String)accepts.elementAt(i) + '"');
1210	}
1211	}
1212	println("here is the dataset:\n");
1213	println("=== Train Dataset ===\n"
1214	+ train.toString() + "\n");
1215	}
1216	}
1217	}
1218
1219	return result;
1220	}
1221
1222	/**
1223	* Add missing values to a dataset.
1224	*
1225	* @param data the instances to add missing values to
1226	* @param level the level of missing values to add (if positive, this
1227	* is the probability that a value will be set to missing, if negative
1228	* all but one value will be set to missing (not yet implemented))
1229	* @param predictorMissing if true, predictor attributes will be modified
1230	*/
1231	protected void addMissing(Instances data, int level, boolean predictorMissing) {
1232
1233	Random random = new Random(1);
1234	for (int i = 0; i < data.numInstances(); i++) {
1235	Instance current = data.instance(i);
1236	for (int j = 0; j < data.numAttributes(); j++) {
1237	if (predictorMissing) {
1238	if (Math.abs(random.nextInt()) % 100 < level)
1239	current.setMissing(j);
1240	}
1241	}
1242	}
1243	}
1244
1245	/**
1246	* Make a simple set of instances with variable position of the class
1247	* attribute, which can later be modified for use in specific tests.
1248	*
1249	* @param seed the random number seed
1250	* @param numInstances the number of instances to generate
1251	* @param numNominal the number of nominal attributes
1252	* @param numNumeric the number of numeric attributes
1253	* @param numString the number of string attributes
1254	* @param numDate the number of date attributes
1255	* @param numRelational the number of relational attributes
1256	* @param multiInstance whether the dataset should a multi-instance dataset
1257	* @return the test dataset
1258	* @throws Exception if the dataset couldn't be generated
1259	* @see TestInstances#CLASS_IS_LAST
1260	*/
1261	protected Instances makeTestDataset(int seed, int numInstances,
1262	int numNominal, int numNumeric,
1263	int numString, int numDate,
1264	int numRelational,
1265	boolean multiInstance)
1266	throws Exception {
1267
1268	TestInstances dataset = new TestInstances();
1269
1270	dataset.setSeed(seed);
1271	dataset.setNumInstances(numInstances);
1272	dataset.setNumNominal(numNominal);
1273	dataset.setNumNumeric(numNumeric);
1274	dataset.setNumString(numString);
1275	dataset.setNumDate(numDate);
1276	dataset.setNumRelational(numRelational);
1277	dataset.setClassIndex(TestInstances.NO_CLASS);
1278	dataset.setMultiInstance(multiInstance);
1279
1280	return dataset.generate();
1281	}
1282
1283	/**
1284	* Print out a short summary string for the dataset characteristics
1285	*
1286	* @param nominalPredictor true if nominal predictor attributes are present
1287	* @param numericPredictor true if numeric predictor attributes are present
1288	* @param stringPredictor true if string predictor attributes are present
1289	* @param datePredictor true if date predictor attributes are present
1290	* @param relationalPredictor true if relational predictor attributes are present
1291	* @param multiInstance whether multi-instance is needed
1292	*/
1293	protected void printAttributeSummary(boolean nominalPredictor,
1294	boolean numericPredictor,
1295	boolean stringPredictor,
1296	boolean datePredictor,
1297	boolean relationalPredictor,
1298	boolean multiInstance) {
1299
1300	String str = "";
1301
1302	if (numericPredictor)
1303	str += "numeric";
1304
1305	if (nominalPredictor) {
1306	if (str.length() > 0)
1307	str += " & ";
1308	str += "nominal";
1309	}
1310
1311	if (stringPredictor) {
1312	if (str.length() > 0)
1313	str += " & ";
1314	str += "string";
1315	}
1316
1317	if (datePredictor) {
1318	if (str.length() > 0)
1319	str += " & ";
1320	str += "date";
1321	}
1322
1323	if (relationalPredictor) {
1324	if (str.length() > 0)
1325	str += " & ";
1326	str += "relational";
1327	}
1328
1329	str = " (" + str + " predictors)";
1330
1331	print(str);
1332	}
1333
1334	/**
1335	* Returns the revision string.
1336	*
1337	* @return the revision
1338	*/
1339	public String getRevision() {
1340	return RevisionUtils.extract("$Revision: 1.11 $");
1341	}
1342
1343	/**
1344	* Test method for this class
1345	*
1346	* @param args the commandline options
1347	*/
1348	public static void main(String [] args) {
1349	runCheck(new CheckClusterer(), args);
1350	}
1351	}
1352

Note: See TracBrowser for help on using the repository browser.

Download in other formats: