Context Navigation

source: src/main/java/weka/clusterers/CheckClusterer.java @ 11

Last change on this file since 11 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 45.0 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* CheckClusterer.java
	19	* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22
	23	package weka.clusterers;
	24
	25	import weka.core.CheckScheme;
	26	import weka.core.FastVector;
	27	import weka.core.Instance;
	28	import weka.core.Instances;
	29	import weka.core.MultiInstanceCapabilitiesHandler;
	30	import weka.core.Option;
	31	import weka.core.OptionHandler;
	32	import weka.core.RevisionUtils;
	33	import weka.core.SerializationHelper;
	34	import weka.core.TestInstances;
	35	import weka.core.Utils;
	36	import weka.core.WeightedInstancesHandler;
	37
	38	import java.util.Enumeration;
	39	import java.util.Random;
	40	import java.util.Vector;
	41
	42	/**
	43	* Class for examining the capabilities and finding problems with
	44	* clusterers. If you implement a clusterer using the WEKA libraries,
	45	* you should run the checks on it to ensure robustness and correct
	46	* operation. Passing all the tests of this object does not mean
	47	* bugs in the clusterer don't exist, but this will help find some
	48	* common ones. <p/>
	49	*
	50	* Typical usage: <p/>
	51	* <code>java weka.clusterers.CheckClusterer -W clusterer_name
	52	* -- clusterer_options </code><p/>
	53	*
	54	* CheckClusterer reports on the following:
	55	* <ul>
	56	* <li> Clusterer abilities
	57	* <ul>
	58	* <li> Possible command line options to the clusterer </li>
	59	* <li> Whether the clusterer can predict nominal, numeric, string,
	60	* date or relational class attributes.</li>
	61	* <li> Whether the clusterer can handle numeric predictor attributes </li>
	62	* <li> Whether the clusterer can handle nominal predictor attributes </li>
	63	* <li> Whether the clusterer can handle string predictor attributes </li>
	64	* <li> Whether the clusterer can handle date predictor attributes </li>
	65	* <li> Whether the clusterer can handle relational predictor attributes </li>
	66	* <li> Whether the clusterer can handle multi-instance data </li>
	67	* <li> Whether the clusterer can handle missing predictor values </li>
	68	* <li> Whether the clusterer can handle instance weights </li>
	69	* </ul>
	70	* </li>
	71	* <li> Correct functioning
	72	* <ul>
	73	* <li> Correct initialisation during buildClusterer (i.e. no result
	74	* changes when buildClusterer called repeatedly) </li>
	75	* <li> Whether the clusterer alters the data pased to it
	76	* (number of instances, instance order, instance weights, etc) </li>
	77	* </ul>
	78	* </li>
	79	* <li> Degenerate cases
	80	* <ul>
	81	* <li> building clusterer with zero training instances </li>
	82	* <li> all but one predictor attribute values missing </li>
	83	* <li> all predictor attribute values missing </li>
	84	* <li> all but one class values missing </li>
	85	* <li> all class values missing </li>
	86	* </ul>
	87	* </li>
	88	* </ul>
	89	* Running CheckClusterer with the debug option set will output the
	90	* training dataset for any failed tests.<p/>
	91	*
	92	* The <code>weka.clusterers.AbstractClustererTest</code> uses this
	93	* class to test all the clusterers. Any changes here, have to be
	94	* checked in that abstract test class, too. <p/>
	95	*
	96	<!-- options-start -->
	97	* Valid options are: <p/>
	98	*
	99	* <pre> -D
	100	* Turn on debugging output.</pre>
	101	*
	102	* <pre> -S
	103	* Silent mode - prints nothing to stdout.</pre>
	104	*
	105	* <pre> -N <num>
	106	* The number of instances in the datasets (default 20).</pre>
	107	*
	108	* <pre> -nominal <num>
	109	* The number of nominal attributes (default 2).</pre>
	110	*
	111	* <pre> -nominal-values <num>
	112	* The number of values for nominal attributes (default 1).</pre>
	113	*
	114	* <pre> -numeric <num>
	115	* The number of numeric attributes (default 1).</pre>
	116	*
	117	* <pre> -string <num>
	118	* The number of string attributes (default 1).</pre>
	119	*
	120	* <pre> -date <num>
	121	* The number of date attributes (default 1).</pre>
	122	*
	123	* <pre> -relational <num>
	124	* The number of relational attributes (default 1).</pre>
	125	*
	126	* <pre> -num-instances-relational <num>
	127	* The number of instances in relational/bag attributes (default 10).</pre>
	128	*
	129	* <pre> -words <comma-separated-list>
	130	* The words to use in string attributes.</pre>
	131	*
	132	* <pre> -word-separators <chars>
	133	* The word separators to use in string attributes.</pre>
	134	*
	135	* <pre> -W
	136	* Full name of the clusterer analyzed.
	137	* eg: weka.clusterers.SimpleKMeans
	138	* (default weka.clusterers.SimpleKMeans)</pre>
	139	*
	140	* <pre>
	141	* Options specific to clusterer weka.clusterers.SimpleKMeans:
	142	* </pre>
	143	*
	144	* <pre> -N <num>
	145	* number of clusters.
	146	* (default 2).</pre>
	147	*
	148	* <pre> -V
	149	* Display std. deviations for centroids.
	150	* </pre>
	151	*
	152	* <pre> -M
	153	* Replace missing values with mean/mode.
	154	* </pre>
	155	*
	156	* <pre> -S <num>
	157	* Random number seed.
	158	* (default 10)</pre>
	159	*
	160	<!-- options-end -->
	161	*
	162	* Options after -- are passed to the designated clusterer.<p/>
	163	*
	164	* @author Len Trigg (trigg@cs.waikato.ac.nz)
	165	* @author FracPete (fracpete at waikato dot ac dot nz)
	166	* @version $Revision: 1.11 $
	167	* @see TestInstances
	168	*/
	169	public class CheckClusterer
	170	extends CheckScheme {
	171
	172	/*
	173	* Note about test methods:
	174	* - methods return array of booleans
	175	* - first index: success or not
	176	* - second index: acceptable or not (e.g., Exception is OK)
	177	*
	178	* FracPete (fracpete at waikato dot ac dot nz)
	179	*/
	180
	181	/** The clusterer to be examined (initially a SimpleKMeans instance). */
	182	protected Clusterer m_Clusterer = new SimpleKMeans();
	183
	184	/**
	185	* default constructor
	186	*/
	public CheckClusterer() {
		// The superclass constructor is invoked implicitly; afterwards the
		// dataset size is set to 40 instances, the value this checker also
		// falls back to in setOptions when no -N option is supplied.
		setNumInstances(40);
	}
	192
	193	/**
	194	* Returns an enumeration describing the available options.
	195	*
	196	* @return an enumeration of all the available options.
	197	*/
	198	public Enumeration listOptions() {
	199	Vector result = new Vector();
	200
	201	Enumeration en = super.listOptions();
	202	while (en.hasMoreElements())
	203	result.addElement(en.nextElement());
	204
	205	result.addElement(new Option(
	206	"\tFull name of the clusterer analyzed.\n"
	207	+"\teg: weka.clusterers.SimpleKMeans\n"
	208	+ "\t(default weka.clusterers.SimpleKMeans)",
	209	"W", 1, "-W"));
	210
	211	if ((m_Clusterer != null)
	212	&& (m_Clusterer instanceof OptionHandler)) {
	213	result.addElement(new Option("", "", 0,
	214	"\nOptions specific to clusterer "
	215	+ m_Clusterer.getClass().getName()
	216	+ ":"));
	217	Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();
	218	while (enu.hasMoreElements())
	219	result.addElement(enu.nextElement());
	220	}
	221
	222	return result.elements();
	223	}
	224
	225	/**
	226	* Parses a given list of options. <p/>
	227	*
	228	<!-- options-start -->
	229	* Valid options are: <p/>
	230	*
	231	* <pre> -D
	232	* Turn on debugging output.</pre>
	233	*
	234	* <pre> -S
	235	* Silent mode - prints nothing to stdout.</pre>
	236	*
	237	* <pre> -N <num>
	238	* The number of instances in the datasets (default 20).</pre>
	239	*
	240	* <pre> -nominal <num>
	241	* The number of nominal attributes (default 2).</pre>
	242	*
	243	* <pre> -nominal-values <num>
	244	* The number of values for nominal attributes (default 1).</pre>
	245	*
	246	* <pre> -numeric <num>
	247	* The number of numeric attributes (default 1).</pre>
	248	*
	249	* <pre> -string <num>
	250	* The number of string attributes (default 1).</pre>
	251	*
	252	* <pre> -date <num>
	253	* The number of date attributes (default 1).</pre>
	254	*
	255	* <pre> -relational <num>
	256	* The number of relational attributes (default 1).</pre>
	257	*
	258	* <pre> -num-instances-relational <num>
	259	* The number of instances in relational/bag attributes (default 10).</pre>
	260	*
	261	* <pre> -words <comma-separated-list>
	262	* The words to use in string attributes.</pre>
	263	*
	264	* <pre> -word-separators <chars>
	265	* The word separators to use in string attributes.</pre>
	266	*
	267	* <pre> -W
	268	* Full name of the clusterer analyzed.
	269	* eg: weka.clusterers.SimpleKMeans
	270	* (default weka.clusterers.SimpleKMeans)</pre>
	271	*
	272	* <pre>
	273	* Options specific to clusterer weka.clusterers.SimpleKMeans:
	274	* </pre>
	275	*
	276	* <pre> -N <num>
	277	* number of clusters.
	278	* (default 2).</pre>
	279	*
	280	* <pre> -V
	281	* Display std. deviations for centroids.
	282	* </pre>
	283	*
	284	* <pre> -M
	285	* Replace missing values with mean/mode.
	286	* </pre>
	287	*
	288	* <pre> -S <num>
	289	* Random number seed.
	290	* (default 10)</pre>
	291	*
	292	<!-- options-end -->
	293	*
	294	* @param options the list of options as an array of strings
	295	* @throws Exception if an option is not supported
	296	*/
	297	public void setOptions(String[] options) throws Exception {
	298	String tmpStr;
	299
	300	tmpStr = Utils.getOption('N', options);
	301
	302	super.setOptions(options);
	303
	304	if (tmpStr.length() != 0)
	305	setNumInstances(Integer.parseInt(tmpStr));
	306	else
	307	setNumInstances(40);
	308
	309	tmpStr = Utils.getOption('W', options);
	310	if (tmpStr.length() == 0)
	311	tmpStr = weka.clusterers.SimpleKMeans.class.getName();
	312	setClusterer(
	313	(Clusterer) forName(
	314	"weka.clusterers",
	315	Clusterer.class,
	316	tmpStr,
	317	Utils.partitionOptions(options)));
	318	}
	319
	320	/**
	321	* Gets the current settings of the CheckClusterer.
	322	*
	323	* @return an array of strings suitable for passing to setOptions
	324	*/
	325	public String[] getOptions() {
	326	Vector result;
	327	String[] options;
	328	int i;
	329
	330	result = new Vector();
	331
	332	options = super.getOptions();
	333	for (i = 0; i < options.length; i++)
	334	result.add(options[i]);
	335
	336	if (getClusterer() != null) {
	337	result.add("-W");
	338	result.add(getClusterer().getClass().getName());
	339	}
	340
	341	if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler))
	342	options = ((OptionHandler) m_Clusterer).getOptions();
	343	else
	344	options = new String[0];
	345
	346	if (options.length > 0) {
	347	result.add("--");
	348	for (i = 0; i < options.length; i++)
	349	result.add(options[i]);
	350	}
	351
	352	return (String[]) result.toArray(new String[result.size()]);
	353	}
	354
	355	/**
	356	* Begin the tests, reporting results to System.out
	357	*/
	358	public void doTests() {
	359
	360	if (getClusterer() == null) {
	361	println("\n=== No clusterer set ===");
	362	return;
	363	}
	364	println("\n=== Check on Clusterer: "
	365	+ getClusterer().getClass().getName()
	366	+ " ===\n");
	367
	368	// Start tests
	369	println("--> Checking for interfaces");
	370	canTakeOptions();
	371	boolean updateable = updateableClusterer()[0];
	372	boolean weightedInstancesHandler = weightedInstancesHandler()[0];
	373	boolean multiInstanceHandler = multiInstanceHandler()[0];
	374	println("--> Clusterer tests");
	375	declaresSerialVersionUID();
	376	runTests(weightedInstancesHandler, multiInstanceHandler, updateable);
	377	}
	378
	379	/**
	380	* Set the clusterer for testing.
	381	*
	382	* @param newClusterer the Clusterer to use.
	383	*/
	384	public void setClusterer(Clusterer newClusterer) {
	385	m_Clusterer = newClusterer;
	386	}
	387
	388	/**
	389	* Get the clusterer used as the clusterer
	390	*
	391	* @return the clusterer used as the clusterer
	392	*/
	393	public Clusterer getClusterer() {
	394	return m_Clusterer;
	395	}
	396
	397	/**
	398	* Run a battery of tests
	399	*
	400	* @param weighted true if the clusterer says it handles weights
	401	* @param multiInstance true if the clusterer is a multi-instance clusterer
	402	* @param updateable true if the clusterer is updateable
	403	*/
	404	protected void runTests(boolean weighted, boolean multiInstance, boolean updateable) {
	405
	406	boolean PNom = canPredict(true, false, false, false, false, multiInstance)[0];
	407	boolean PNum = canPredict(false, true, false, false, false, multiInstance)[0];
	408	boolean PStr = canPredict(false, false, true, false, false, multiInstance)[0];
	409	boolean PDat = canPredict(false, false, false, true, false, multiInstance)[0];
	410	boolean PRel;
	411	if (!multiInstance)
	412	PRel = canPredict(false, false, false, false, true, multiInstance)[0];
	413	else
	414	PRel = false;
	415
	416	if (PNom \|\| PNum \|\| PStr \|\| PDat \|\| PRel) {
	417	if (weighted)
	418	instanceWeights(PNom, PNum, PStr, PDat, PRel, multiInstance);
	419
	420	canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel, multiInstance);
	421	boolean handleMissingPredictors = canHandleMissing(PNom, PNum, PStr, PDat, PRel,
	422	multiInstance, true, 20)[0];
	423	if (handleMissingPredictors)
	424	canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 100);
	425
	426	correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel, multiInstance);
	427	datasetIntegrity(PNom, PNum, PStr, PDat, PRel, multiInstance, handleMissingPredictors);
	428	if (updateable)
	429	updatingEquality(PNom, PNum, PStr, PDat, PRel, multiInstance);
	430	}
	431	}
	432
	433	/**
	434	* Checks whether the scheme can take command line options.
	435	*
	436	* @return index 0 is true if the clusterer can take options
	437	*/
	438	protected boolean[] canTakeOptions() {
	439
	440	boolean[] result = new boolean[2];
	441
	442	print("options...");
	443	if (m_Clusterer instanceof OptionHandler) {
	444	println("yes");
	445	if (m_Debug) {
	446	println("\n=== Full report ===");
	447	Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();
	448	while (enu.hasMoreElements()) {
	449	Option option = (Option) enu.nextElement();
	450	print(option.synopsis() + "\n"
	451	+ option.description() + "\n");
	452	}
	453	println("\n");
	454	}
	455	result[0] = true;
	456	}
	457	else {
	458	println("no");
	459	result[0] = false;
	460	}
	461
	462	return result;
	463	}
	464
	465	/**
	466	* Checks whether the scheme can build models incrementally.
	467	*
	468	* @return index 0 is true if the clusterer can train incrementally
	469	*/
	470	protected boolean[] updateableClusterer() {
	471
	472	boolean[] result = new boolean[2];
	473
	474	print("updateable clusterer...");
	475	if (m_Clusterer instanceof UpdateableClusterer) {
	476	println("yes");
	477	result[0] = true;
	478	}
	479	else {
	480	println("no");
	481	result[0] = false;
	482	}
	483
	484	return result;
	485	}
	486
	487	/**
	488	* Checks whether the scheme says it can handle instance weights.
	489	*
	490	* @return index 0 is true if the clusterer handles instance weights
	491	*/
	492	protected boolean[] weightedInstancesHandler() {
	493
	494	boolean[] result = new boolean[2];
	495
	496	print("weighted instances clusterer...");
	497	if (m_Clusterer instanceof WeightedInstancesHandler) {
	498	println("yes");
	499	result[0] = true;
	500	}
	501	else {
	502	println("no");
	503	result[0] = false;
	504	}
	505
	506	return result;
	507	}
	508
	509	/**
	510	* Checks whether the scheme handles multi-instance data.
	511	*
	512	* @return index 0 is true if the clusterer handles multi-instance data
	513	*/
	514	protected boolean[] multiInstanceHandler() {
	515	boolean[] result = new boolean[2];
	516
	517	print("multi-instance clusterer...");
	518	if (m_Clusterer instanceof MultiInstanceCapabilitiesHandler) {
	519	println("yes");
	520	result[0] = true;
	521	}
	522	else {
	523	println("no");
	524	result[0] = false;
	525	}
	526
	527	return result;
	528	}
	529
	530	/**
	531	* tests for a serialVersionUID. Fails in case the scheme doesn't declare
	532	* a UID.
	533	*
	534	* @return index 0 is true if the scheme declares a UID
	535	*/
	536	protected boolean[] declaresSerialVersionUID() {
	537	boolean[] result = new boolean[2];
	538
	539	print("serialVersionUID...");
	540
	541	result[0] = !SerializationHelper.needsUID(m_Clusterer.getClass());
	542
	543	if (result[0])
	544	println("yes");
	545	else
	546	println("no");
	547
	548	return result;
	549	}
	550
	551	/**
	552	* Checks basic prediction of the scheme, for simple non-troublesome
	553	* datasets.
	554	*
	555	* @param nominalPredictor if true use nominal predictor attributes
	556	* @param numericPredictor if true use numeric predictor attributes
	557	* @param stringPredictor if true use string predictor attributes
	558	* @param datePredictor if true use date predictor attributes
	559	* @param relationalPredictor if true use relational predictor attributes
	560	* @param multiInstance whether multi-instance is needed
	561	* @return index 0 is true if the test was passed, index 1 is true if test
	562	* was acceptable
	563	*/
	564	protected boolean[] canPredict(
	565	boolean nominalPredictor,
	566	boolean numericPredictor,
	567	boolean stringPredictor,
	568	boolean datePredictor,
	569	boolean relationalPredictor,
	570	boolean multiInstance) {
	571
	572	print("basic predict");
	573	printAttributeSummary(
	574	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	575	print("...");
	576	FastVector accepts = new FastVector();
	577	accepts.addElement("unary");
	578	accepts.addElement("binary");
	579	accepts.addElement("nominal");
	580	accepts.addElement("numeric");
	581	accepts.addElement("string");
	582	accepts.addElement("date");
	583	accepts.addElement("relational");
	584	accepts.addElement("multi-instance");
	585	accepts.addElement("not in classpath");
	586	int numTrain = getNumInstances(), missingLevel = 0;
	587	boolean predictorMissing = false;
	588
	589	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
	590	datePredictor, relationalPredictor,
	591	multiInstance,
	592	missingLevel, predictorMissing,
	593	numTrain,
	594	accepts);
	595	}
	596
	597	/**
	598	* Checks whether the scheme can handle zero training instances.
	599	*
	600	* @param nominalPredictor if true use nominal predictor attributes
	601	* @param numericPredictor if true use numeric predictor attributes
	602	* @param stringPredictor if true use string predictor attributes
	603	* @param datePredictor if true use date predictor attributes
	604	* @param relationalPredictor if true use relational predictor attributes
	605	* @param multiInstance whether multi-instance is needed
	606	* @return index 0 is true if the test was passed, index 1 is true if test
	607	* was acceptable
	608	*/
	609	protected boolean[] canHandleZeroTraining(
	610	boolean nominalPredictor,
	611	boolean numericPredictor,
	612	boolean stringPredictor,
	613	boolean datePredictor,
	614	boolean relationalPredictor,
	615	boolean multiInstance) {
	616
	617	print("handle zero training instances");
	618	printAttributeSummary(
	619	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	620	print("...");
	621	FastVector accepts = new FastVector();
	622	accepts.addElement("train");
	623	accepts.addElement("value");
	624	int numTrain = 0, missingLevel = 0;
	625	boolean predictorMissing = false;
	626
	627	return runBasicTest(
	628	nominalPredictor, numericPredictor, stringPredictor,
	629	datePredictor, relationalPredictor,
	630	multiInstance,
	631	missingLevel, predictorMissing,
	632	numTrain,
	633	accepts);
	634	}
	635
	636	/**
	637	* Checks whether the scheme correctly initialises models when
	638	* buildClusterer is called. This test calls buildClusterer with
	639	* one training dataset. buildClusterer is then called on a training set
	640	* with different structure, and then again with the original training set.
	641	* If the equals method of the ClusterEvaluation class returns
	642	* false, this is noted as incorrect build initialisation.
	643	*
	644	* @param nominalPredictor if true use nominal predictor attributes
	645	* @param numericPredictor if true use numeric predictor attributes
	646	* @param stringPredictor if true use string predictor attributes
	647	* @param datePredictor if true use date predictor attributes
	648	* @param relationalPredictor if true use relational predictor attributes
	649	* @param multiInstance whether multi-instance is needed
	650	* @return index 0 is true if the test was passed
	651	*/
	652	protected boolean[] correctBuildInitialisation(
	653	boolean nominalPredictor,
	654	boolean numericPredictor,
	655	boolean stringPredictor,
	656	boolean datePredictor,
	657	boolean relationalPredictor,
	658	boolean multiInstance) {
	659
	660	boolean[] result = new boolean[2];
	661
	662	print("correct initialisation during buildClusterer");
	663	printAttributeSummary(
	664	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	665	print("...");
	666	int numTrain = getNumInstances(), missingLevel = 0;
	667	boolean predictorMissing = false;
	668
	669	Instances train1 = null;
	670	Instances train2 = null;
	671	Clusterer clusterer = null;
	672	ClusterEvaluation evaluation1A = null;
	673	ClusterEvaluation evaluation1B = null;
	674	ClusterEvaluation evaluation2 = null;
	675	boolean built = false;
	676	int stage = 0;
	677	try {
	678
	679	// Make two train sets with different numbers of attributes
	680	train1 = makeTestDataset(42, numTrain,
	681	nominalPredictor ? getNumNominal() : 0,
	682	numericPredictor ? getNumNumeric() : 0,
	683	stringPredictor ? getNumString() : 0,
	684	datePredictor ? getNumDate() : 0,
	685	relationalPredictor ? getNumRelational() : 0,
	686	multiInstance);
	687	train2 = makeTestDataset(84, numTrain,
	688	nominalPredictor ? getNumNominal() + 1 : 0,
	689	numericPredictor ? getNumNumeric() + 1 : 0,
	690	stringPredictor ? getNumString() : 0,
	691	datePredictor ? getNumDate() : 0,
	692	relationalPredictor ? getNumRelational() : 0,
	693	multiInstance);
	694	if (nominalPredictor && !multiInstance) {
	695	train1.deleteAttributeAt(0);
	696	train2.deleteAttributeAt(0);
	697	}
	698	if (missingLevel > 0) {
	699	addMissing(train1, missingLevel, predictorMissing);
	700	addMissing(train2, missingLevel, predictorMissing);
	701	}
	702
	703	clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
	704	evaluation1A = new ClusterEvaluation();
	705	evaluation1B = new ClusterEvaluation();
	706	evaluation2 = new ClusterEvaluation();
	707	} catch (Exception ex) {
	708	throw new Error("Error setting up for tests: " + ex.getMessage());
	709	}
	710	try {
	711	stage = 0;
	712	clusterer.buildClusterer(train1);
	713	built = true;
	714	evaluation1A.setClusterer(clusterer);
	715	evaluation1A.evaluateClusterer(train1);
	716
	717	stage = 1;
	718	built = false;
	719	clusterer.buildClusterer(train2);
	720	built = true;
	721	evaluation2.setClusterer(clusterer);
	722	evaluation2.evaluateClusterer(train2);
	723
	724	stage = 2;
	725	built = false;
	726	clusterer.buildClusterer(train1);
	727	built = true;
	728	evaluation1B.setClusterer(clusterer);
	729	evaluation1B.evaluateClusterer(train1);
	730
	731	stage = 3;
	732	if (!evaluation1A.equals(evaluation1B)) {
	733	if (m_Debug) {
	734	println("\n=== Full report ===\n");
	735	println("First buildClusterer()");
	736	println(evaluation1A.clusterResultsToString() + "\n\n");
	737	println("Second buildClusterer()");
	738	println(evaluation1B.clusterResultsToString() + "\n\n");
	739	}
	740	throw new Exception("Results differ between buildClusterer calls");
	741	}
	742	println("yes");
	743	result[0] = true;
	744
	745	if (false && m_Debug) {
	746	println("\n=== Full report ===\n");
	747	println("First buildClusterer()");
	748	println(evaluation1A.clusterResultsToString() + "\n\n");
	749	println("Second buildClusterer()");
	750	println(evaluation1B.clusterResultsToString() + "\n\n");
	751	}
	752	}
	753	catch (Exception ex) {
	754	println("no");
	755	result[0] = false;
	756	if (m_Debug) {
	757	println("\n=== Full Report ===");
	758	print("Problem during");
	759	if (built) {
	760	print(" testing");
	761	} else {
	762	print(" training");
	763	}
	764	switch (stage) {
	765	case 0:
	766	print(" of dataset 1");
	767	break;
	768	case 1:
	769	print(" of dataset 2");
	770	break;
	771	case 2:
	772	print(" of dataset 1 (2nd build)");
	773	break;
	774	case 3:
	775	print(", comparing results from builds of dataset 1");
	776	break;
	777	}
	778	println(": " + ex.getMessage() + "\n");
	779	println("here are the datasets:\n");
	780	println("=== Train1 Dataset ===\n"
	781	+ train1.toString() + "\n");
	782	println("=== Train2 Dataset ===\n"
	783	+ train2.toString() + "\n");
	784	}
	785	}
	786
	787	return result;
	788	}
	789
	790	/**
	791	* Checks basic missing value handling of the scheme. If the missing
	792	* values cause an exception to be thrown by the scheme, this will be
	793	* recorded.
	794	*
	795	* @param nominalPredictor if true use nominal predictor attributes
	796	* @param numericPredictor if true use numeric predictor attributes
	797	* @param stringPredictor if true use string predictor attributes
	798	* @param datePredictor if true use date predictor attributes
	799	* @param relationalPredictor if true use relational predictor attributes
	800	* @param multiInstance whether multi-instance is needed
	801	* @param predictorMissing true if the missing values may be in
	802	* the predictors
	803	* @param missingLevel the percentage of missing values
	804	* @return index 0 is true if the test was passed, index 1 is true if test
	805	* was acceptable
	806	*/
	807	protected boolean[] canHandleMissing(
	808	boolean nominalPredictor,
	809	boolean numericPredictor,
	810	boolean stringPredictor,
	811	boolean datePredictor,
	812	boolean relationalPredictor,
	813	boolean multiInstance,
	814	boolean predictorMissing,
	815	int missingLevel) {
	816
	817	if (missingLevel == 100)
	818	print("100% ");
	819	print("missing");
	820	if (predictorMissing) {
	821	print(" predictor");
	822	}
	823	print(" values");
	824	printAttributeSummary(
	825	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	826	print("...");
	827	FastVector accepts = new FastVector();
	828	accepts.addElement("missing");
	829	accepts.addElement("value");
	830	accepts.addElement("train");
	831	int numTrain = getNumInstances();
	832
	833	return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
	834	datePredictor, relationalPredictor,
	835	multiInstance,
	836	missingLevel, predictorMissing,
	837	numTrain,
	838	accepts);
	839	}
	840
	841	/**
	842	* Checks whether the clusterer can handle instance weights.
	843	* This test compares the clusterer performance on two datasets
	844	* that are identical except for the training weights. If the
	845	* results change, then the clusterer must be using the weights. It
	846	* may be possible to get a false positive from this test if the
	847	* weight changes aren't significant enough to induce a change
	848	* in clusterer performance (but the weights are chosen to minimize
	849	* the likelihood of this).
	850	*
	851	* @param nominalPredictor if true use nominal predictor attributes
	852	* @param numericPredictor if true use numeric predictor attributes
	853	* @param stringPredictor if true use string predictor attributes
	854	* @param datePredictor if true use date predictor attributes
	855	* @param relationalPredictor if true use relational predictor attributes
	856	* @param multiInstance whether multi-instance is needed
	857	* @return index 0 true if the test was passed
	858	*/
	859	protected boolean[] instanceWeights(
	860	boolean nominalPredictor,
	861	boolean numericPredictor,
	862	boolean stringPredictor,
	863	boolean datePredictor,
	864	boolean relationalPredictor,
	865	boolean multiInstance) {
	866
	867	print("clusterer uses instance weights");
	868	printAttributeSummary(
	869	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	870	print("...");
	871	int numTrain = 2*getNumInstances(), missingLevel = 0;
	872	boolean predictorMissing = false;
	873
	874	boolean[] result = new boolean[2];
	875	Instances train = null;
	876	Clusterer [] clusterers = null;
	877	ClusterEvaluation evaluationB = null;
	878	ClusterEvaluation evaluationI = null;
	879	boolean built = false;
	880	boolean evalFail = false;
	881	try {
	882	train = makeTestDataset(42, numTrain,
	883	nominalPredictor ? getNumNominal() + 1 : 0,
	884	numericPredictor ? getNumNumeric() + 1 : 0,
	885	stringPredictor ? getNumString() : 0,
	886	datePredictor ? getNumDate() : 0,
	887	relationalPredictor ? getNumRelational() : 0,
	888	multiInstance);
	889	if (nominalPredictor && !multiInstance)
	890	train.deleteAttributeAt(0);
	891	if (missingLevel > 0)
	892	addMissing(train, missingLevel, predictorMissing);
	893	clusterers = AbstractClusterer.makeCopies(getClusterer(), 2);
	894	evaluationB = new ClusterEvaluation();
	895	evaluationI = new ClusterEvaluation();
	896	clusterers[0].buildClusterer(train);
	897	evaluationB.setClusterer(clusterers[0]);
	898	} catch (Exception ex) {
	899	throw new Error("Error setting up for tests: " + ex.getMessage());
	900	}
	901	try {
	902
	903	// Now modify instance weights and re-built/test
	904	for (int i = 0; i < train.numInstances(); i++) {
	905	train.instance(i).setWeight(0);
	906	}
	907	Random random = new Random(1);
	908	for (int i = 0; i < train.numInstances() / 2; i++) {
	909	int inst = Math.abs(random.nextInt()) % train.numInstances();
	910	int weight = Math.abs(random.nextInt()) % 10 + 1;
	911	train.instance(inst).setWeight(weight);
	912	}
	913	clusterers[1].buildClusterer(train);
	914	built = true;
	915	evaluationI.setClusterer(clusterers[1]);
	916	if (evaluationB.equals(evaluationI)) {
	917	// println("no");
	918	evalFail = true;
	919	throw new Exception("evalFail");
	920	}
	921
	922	println("yes");
	923	result[0] = true;
	924	} catch (Exception ex) {
	925	println("no");
	926	result[0] = false;
	927
	928	if (m_Debug) {
	929	println("\n=== Full Report ===");
	930
	931	if (evalFail) {
	932	println("Results don't differ between non-weighted and "
	933	+ "weighted instance models.");
	934	println("Here are the results:\n");
	935	println("\nboth methods\n");
	936	println(evaluationB.clusterResultsToString());
	937	} else {
	938	print("Problem during");
	939	if (built) {
	940	print(" testing");
	941	} else {
	942	print(" training");
	943	}
	944	println(": " + ex.getMessage() + "\n");
	945	}
	946	println("Here is the dataset:\n");
	947	println("=== Train Dataset ===\n"
	948	+ train.toString() + "\n");
	949	println("=== Train Weights ===\n");
	950	for (int i = 0; i < train.numInstances(); i++) {
	951	println(" " + (i + 1)
	952	+ " " + train.instance(i).weight());
	953	}
	954	}
	955	}
	956
	957	return result;
	958	}
	959
	960	/**
	961	* Checks whether the scheme alters the training dataset during
	962	* training. If the scheme needs to modify the training
	963	* data it should take a copy of the training data. Currently checks
	964	* for changes to header structure, number of instances, order of
	965	* instances, instance weights.
	966	*
	967	* @param nominalPredictor if true use nominal predictor attributes
	968	* @param numericPredictor if true use numeric predictor attributes
	969	* @param stringPredictor if true use string predictor attributes
	970	* @param datePredictor if true use date predictor attributes
	971	* @param relationalPredictor if true use relational predictor attributes
	972	* @param multiInstance whether multi-instance is needed
	973	* @param predictorMissing true if we know the clusterer can handle
	974	* (at least) moderate missing predictor values
	975	* @return index 0 is true if the test was passed
	976	*/
	977	protected boolean[] datasetIntegrity(
	978	boolean nominalPredictor,
	979	boolean numericPredictor,
	980	boolean stringPredictor,
	981	boolean datePredictor,
	982	boolean relationalPredictor,
	983	boolean multiInstance,
	984	boolean predictorMissing) {
	985
	986	print("clusterer doesn't alter original datasets");
	987	printAttributeSummary(
	988	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	989	print("...");
	990	int numTrain = getNumInstances(), missingLevel = 20;
	991
	992	boolean[] result = new boolean[2];
	993	Instances train = null;
	994	Clusterer clusterer = null;
	995	try {
	996	train = makeTestDataset(42, numTrain,
	997	nominalPredictor ? getNumNominal() : 0,
	998	numericPredictor ? getNumNumeric() : 0,
	999	stringPredictor ? getNumString() : 0,
	1000	datePredictor ? getNumDate() : 0,
	1001	relationalPredictor ? getNumRelational() : 0,
	1002	multiInstance);
	1003	if (nominalPredictor && !multiInstance)
	1004	train.deleteAttributeAt(0);
	1005	if (missingLevel > 0)
	1006	addMissing(train, missingLevel, predictorMissing);
	1007	clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
	1008	} catch (Exception ex) {
	1009	throw new Error("Error setting up for tests: " + ex.getMessage());
	1010	}
	1011	try {
	1012	Instances trainCopy = new Instances(train);
	1013	clusterer.buildClusterer(trainCopy);
	1014	compareDatasets(train, trainCopy);
	1015
	1016	println("yes");
	1017	result[0] = true;
	1018	} catch (Exception ex) {
	1019	println("no");
	1020	result[0] = false;
	1021
	1022	if (m_Debug) {
	1023	println("\n=== Full Report ===");
	1024	print("Problem during training");
	1025	println(": " + ex.getMessage() + "\n");
	1026	println("Here is the dataset:\n");
	1027	println("=== Train Dataset ===\n"
	1028	+ train.toString() + "\n");
	1029	}
	1030	}
	1031
	1032	return result;
	1033	}
	1034
	1035	/**
	1036	* Checks whether an updateable scheme produces the same model when
	1037	* trained incrementally as when batch trained. The model itself
	1038	* cannot be compared, so we compare the evaluation on test data
	1039	* for both models. It is possible to get a false positive on this
	1040	* test (likelihood depends on the classifier).
	1041	*
	1042	* @param nominalPredictor if true use nominal predictor attributes
	1043	* @param numericPredictor if true use numeric predictor attributes
	1044	* @param stringPredictor if true use string predictor attributes
	1045	* @param datePredictor if true use date predictor attributes
	1046	* @param relationalPredictor if true use relational predictor attributes
	1047	* @param multiInstance whether multi-instance is needed
	1048	* @return index 0 is true if the test was passed
	1049	*/
	1050	protected boolean[] updatingEquality(
	1051	boolean nominalPredictor,
	1052	boolean numericPredictor,
	1053	boolean stringPredictor,
	1054	boolean datePredictor,
	1055	boolean relationalPredictor,
	1056	boolean multiInstance) {
	1057
	1058	print("incremental training produces the same results"
	1059	+ " as batch training");
	1060	printAttributeSummary(
	1061	nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
	1062	print("...");
	1063	int numTrain = getNumInstances(), missingLevel = 0;
	1064	boolean predictorMissing = false, classMissing = false;
	1065
	1066	boolean[] result = new boolean[2];
	1067	Instances train = null;
	1068	Clusterer[] clusterers = null;
	1069	ClusterEvaluation evaluationB = null;
	1070	ClusterEvaluation evaluationI = null;
	1071	boolean built = false;
	1072	try {
	1073	train = makeTestDataset(42, numTrain,
	1074	nominalPredictor ? getNumNominal() : 0,
	1075	numericPredictor ? getNumNumeric() : 0,
	1076	stringPredictor ? getNumString() : 0,
	1077	datePredictor ? getNumDate() : 0,
	1078	relationalPredictor ? getNumRelational() : 0,
	1079	multiInstance);
	1080	if (missingLevel > 0)
	1081	addMissing(train, missingLevel, predictorMissing, classMissing);
	1082	clusterers = AbstractClusterer.makeCopies(getClusterer(), 2);
	1083	evaluationB = new ClusterEvaluation();
	1084	evaluationI = new ClusterEvaluation();
	1085	clusterers[0].buildClusterer(train);
	1086	evaluationB.setClusterer(clusterers[0]);
	1087	} catch (Exception ex) {
	1088	throw new Error("Error setting up for tests: " + ex.getMessage());
	1089	}
	1090	try {
	1091	clusterers[1].buildClusterer(new Instances(train, 0));
	1092	for (int i = 0; i < train.numInstances(); i++) {
	1093	((UpdateableClusterer)clusterers[1]).updateClusterer(
	1094	train.instance(i));
	1095	}
	1096	built = true;
	1097	evaluationI.setClusterer(clusterers[1]);
	1098	if (!evaluationB.equals(evaluationI)) {
	1099	println("no");
	1100	result[0] = false;
	1101
	1102	if (m_Debug) {
	1103	println("\n=== Full Report ===");
	1104	println("Results differ between batch and "
	1105	+ "incrementally built models.\n"
	1106	+ "Depending on the classifier, this may be OK");
	1107	println("Here are the results:\n");
	1108	println("\nbatch built results\n" + evaluationB.clusterResultsToString());
	1109	println("\nincrementally built results\n" + evaluationI.clusterResultsToString());
	1110	println("Here are the datasets:\n");
	1111	println("=== Train Dataset ===\n"
	1112	+ train.toString() + "\n");
	1113	}
	1114	}
	1115	else {
	1116	println("yes");
	1117	result[0] = true;
	1118	}
	1119	} catch (Exception ex) {
	1120	result[0] = false;
	1121
	1122	print("Problem during");
	1123	if (built)
	1124	print(" testing");
	1125	else
	1126	print(" training");
	1127	println(": " + ex.getMessage() + "\n");
	1128	}
	1129
	1130	return result;
	1131	}
	1132
	1133	/**
	1134	* Runs a text on the datasets with the given characteristics.
	1135	*
	1136	* @param nominalPredictor if true use nominal predictor attributes
	1137	* @param numericPredictor if true use numeric predictor attributes
	1138	* @param stringPredictor if true use string predictor attributes
	1139	* @param datePredictor if true use date predictor attributes
	1140	* @param relationalPredictor if true use relational predictor attributes
	1141	* @param multiInstance whether multi-instance is needed
	1142	* @param missingLevel the percentage of missing values
	1143	* @param predictorMissing true if the missing values may be in
	1144	* the predictors
	1145	* @param numTrain the number of instances in the training set
	1146	* @param accepts the acceptable string in an exception
	1147	* @return index 0 is true if the test was passed, index 1 is true if test
	1148	* was acceptable
	1149	*/
	1150	protected boolean[] runBasicTest(boolean nominalPredictor,
	1151	boolean numericPredictor,
	1152	boolean stringPredictor,
	1153	boolean datePredictor,
	1154	boolean relationalPredictor,
	1155	boolean multiInstance,
	1156	int missingLevel,
	1157	boolean predictorMissing,
	1158	int numTrain,
	1159	FastVector accepts) {
	1160
	1161	boolean[] result = new boolean[2];
	1162	Instances train = null;
	1163	Clusterer clusterer = null;
	1164	try {
	1165	train = makeTestDataset(42, numTrain,
	1166	nominalPredictor ? getNumNominal() : 0,
	1167	numericPredictor ? getNumNumeric() : 0,
	1168	stringPredictor ? getNumString() : 0,
	1169	datePredictor ? getNumDate() : 0,
	1170	relationalPredictor ? getNumRelational() : 0,
	1171	multiInstance);
	1172	if (nominalPredictor && !multiInstance)
	1173	train.deleteAttributeAt(0);
	1174	if (missingLevel > 0)
	1175	addMissing(train, missingLevel, predictorMissing);
	1176	clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
	1177	} catch (Exception ex) {
	1178	ex.printStackTrace();
	1179	throw new Error("Error setting up for tests: " + ex.getMessage());
	1180	}
	1181	try {
	1182	clusterer.buildClusterer(train);
	1183	println("yes");
	1184	result[0] = true;
	1185	}
	1186	catch (Exception ex) {
	1187	boolean acceptable = false;
	1188	String msg = ex.getMessage().toLowerCase();
	1189	for (int i = 0; i < accepts.size(); i++) {
	1190	if (msg.indexOf((String)accepts.elementAt(i)) >= 0) {
	1191	acceptable = true;
	1192	}
	1193	}
	1194
	1195	println("no" + (acceptable ? " (OK error message)" : ""));
	1196	result[1] = acceptable;
	1197
	1198	if (m_Debug) {
	1199	println("\n=== Full Report ===");
	1200	print("Problem during training");
	1201	println(": " + ex.getMessage() + "\n");
	1202	if (!acceptable) {
	1203	if (accepts.size() > 0) {
	1204	print("Error message doesn't mention ");
	1205	for (int i = 0; i < accepts.size(); i++) {
	1206	if (i != 0) {
	1207	print(" or ");
	1208	}
	1209	print('"' + (String)accepts.elementAt(i) + '"');
	1210	}
	1211	}
	1212	println("here is the dataset:\n");
	1213	println("=== Train Dataset ===\n"
	1214	+ train.toString() + "\n");
	1215	}
	1216	}
	1217	}
	1218
	1219	return result;
	1220	}
	1221
	1222	/**
	1223	* Add missing values to a dataset.
	1224	*
	1225	* @param data the instances to add missing values to
	1226	* @param level the level of missing values to add (if positive, this
	1227	* is the probability that a value will be set to missing, if negative
	1228	* all but one value will be set to missing (not yet implemented))
	1229	* @param predictorMissing if true, predictor attributes will be modified
	1230	*/
	1231	protected void addMissing(Instances data, int level, boolean predictorMissing) {
	1232
	1233	Random random = new Random(1);
	1234	for (int i = 0; i < data.numInstances(); i++) {
	1235	Instance current = data.instance(i);
	1236	for (int j = 0; j < data.numAttributes(); j++) {
	1237	if (predictorMissing) {
	1238	if (Math.abs(random.nextInt()) % 100 < level)
	1239	current.setMissing(j);
	1240	}
	1241	}
	1242	}
	1243	}
	1244
	1245	/**
	1246	* Make a simple set of instances with variable position of the class
	1247	* attribute, which can later be modified for use in specific tests.
	1248	*
	1249	* @param seed the random number seed
	1250	* @param numInstances the number of instances to generate
	1251	* @param numNominal the number of nominal attributes
	1252	* @param numNumeric the number of numeric attributes
	1253	* @param numString the number of string attributes
	1254	* @param numDate the number of date attributes
	1255	* @param numRelational the number of relational attributes
	1256	* @param multiInstance whether the dataset should a multi-instance dataset
	1257	* @return the test dataset
	1258	* @throws Exception if the dataset couldn't be generated
	1259	* @see TestInstances#CLASS_IS_LAST
	1260	*/
	1261	protected Instances makeTestDataset(int seed, int numInstances,
	1262	int numNominal, int numNumeric,
	1263	int numString, int numDate,
	1264	int numRelational,
	1265	boolean multiInstance)
	1266	throws Exception {
	1267
	1268	TestInstances dataset = new TestInstances();
	1269
	1270	dataset.setSeed(seed);
	1271	dataset.setNumInstances(numInstances);
	1272	dataset.setNumNominal(numNominal);
	1273	dataset.setNumNumeric(numNumeric);
	1274	dataset.setNumString(numString);
	1275	dataset.setNumDate(numDate);
	1276	dataset.setNumRelational(numRelational);
	1277	dataset.setClassIndex(TestInstances.NO_CLASS);
	1278	dataset.setMultiInstance(multiInstance);
	1279
	1280	return dataset.generate();
	1281	}
	1282
	1283	/**
	1284	* Print out a short summary string for the dataset characteristics
	1285	*
	1286	* @param nominalPredictor true if nominal predictor attributes are present
	1287	* @param numericPredictor true if numeric predictor attributes are present
	1288	* @param stringPredictor true if string predictor attributes are present
	1289	* @param datePredictor true if date predictor attributes are present
	1290	* @param relationalPredictor true if relational predictor attributes are present
	1291	* @param multiInstance whether multi-instance is needed
	1292	*/
	1293	protected void printAttributeSummary(boolean nominalPredictor,
	1294	boolean numericPredictor,
	1295	boolean stringPredictor,
	1296	boolean datePredictor,
	1297	boolean relationalPredictor,
	1298	boolean multiInstance) {
	1299
	1300	String str = "";
	1301
	1302	if (numericPredictor)
	1303	str += "numeric";
	1304
	1305	if (nominalPredictor) {
	1306	if (str.length() > 0)
	1307	str += " & ";
	1308	str += "nominal";
	1309	}
	1310
	1311	if (stringPredictor) {
	1312	if (str.length() > 0)
	1313	str += " & ";
	1314	str += "string";
	1315	}
	1316
	1317	if (datePredictor) {
	1318	if (str.length() > 0)
	1319	str += " & ";
	1320	str += "date";
	1321	}
	1322
	1323	if (relationalPredictor) {
	1324	if (str.length() > 0)
	1325	str += " & ";
	1326	str += "relational";
	1327	}
	1328
	1329	str = " (" + str + " predictors)";
	1330
	1331	print(str);
	1332	}
	1333
	1334	/**
	1335	* Returns the revision string.
	1336	*
	1337	* @return the revision
	1338	*/
	1339	public String getRevision() {
	1340	return RevisionUtils.extract("$Revision: 1.11 $");
	1341	}
	1342
	1343	/**
	1344	* Test method for this class
	1345	*
	1346	* @param args the commandline options
	1347	*/
	1348	public static void main(String [] args) {
	1349	runCheck(new CheckClusterer(), args);
	1350	}
	1351	}
	1352

Note: See TracBrowser for help on using the repository browser.

Download in other formats: