Context Navigation

source: src/main/java/weka/clusterers/ClusterEvaluation.java @ 10

Last change on this file since 10 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 39.7 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* ClusterEvaluation.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.clusterers;
24
25	import weka.core.Drawable;
26	import weka.core.Instance;
27	import weka.core.Instances;
28	import weka.core.Option;
29	import weka.core.OptionHandler;
30	import weka.core.Range;
31	import weka.core.RevisionHandler;
32	import weka.core.RevisionUtils;
33	import weka.core.Utils;
34	import weka.core.converters.ConverterUtils.DataSource;
35	import weka.filters.Filter;
36	import weka.filters.unsupervised.attribute.Remove;
37
38	import java.beans.BeanInfo;
39	import java.beans.Introspector;
40	import java.beans.MethodDescriptor;
41	import java.io.BufferedWriter;
42	import java.io.FileWriter;
43	import java.io.Serializable;
44	import java.lang.reflect.Method;
45	import java.util.Enumeration;
46	import java.util.Random;
47	import java.util.Vector;
48
49	/**
50	* Class for evaluating clustering models.<p/>
51	*
52	* Valid options are: <p/>
53	*
54	* -t name of the training file <br/>
55	* Specify the training file. <p/>
56	*
57	* -T name of the test file <br/>
58	* Specify the test file to apply clusterer to. <p/>
59	*
60	* -d name of file to save clustering model to <br/>
61	* Specify output file. <p/>
62	*
63	* -l name of file to load clustering model from <br/>
64	* Specify input file. <p/>
65	*
66	* -p attribute range <br/>
67	* Output predictions. Predictions are for the training file if only the
68	* training file is specified, otherwise they are for the test file. The range
69	* specifies attribute values to be output with the predictions.
70	* Use '-p 0' for none. <p/>
71	*
72	* -x num folds <br/>
73	* Set the number of folds for a cross validation of the training data.
74	* Cross validation can only be done for distribution clusterers and is
75	* performed only if this option is given and no test file is specified. <p/>
76	*
77	* -s num <br/>
78	* Sets the seed for randomizing the data for cross-validation. <p/>
79	*
80	* -c class <br/>
81	* Set the class attribute. If set, then class based evaluation of clustering
82	* is performed. <p/>
83	*
84	* -g name of graph file <br/>
85	* Outputs the graph representation of the clusterer to the file. Only for
86	* clusterer that implemented the <code>weka.core.Drawable</code> interface.
87	* <p/>
88	*
89	* @author Mark Hall (mhall@cs.waikato.ac.nz)
90	* @version $Revision: 6021 $
91	* @see weka.core.Drawable
92	*/
93	public class ClusterEvaluation
94	implements Serializable, RevisionHandler {
95
96	/** for serialization */
97	static final long serialVersionUID = -830188327319128005L;
98
99	/** the clusterer */
100	private Clusterer m_Clusterer;
101
102	/** holds a string describing the results of clustering the training data */
103	private StringBuffer m_clusteringResults;
104
105	/** holds the number of clusters found by the clusterer */
106	private int m_numClusters;
107
108	/** holds the assignments of instances to clusters for a particular testing
109	dataset */
110	private double[] m_clusterAssignments;
111
112	/** holds the average log likelihood for a particular testing dataset
113	if the clusterer is a DensityBasedClusterer */
114	private double m_logL;
115
116	/** will hold the mapping of classes to clusters (for class based
117	evaluation) */
118	private int[] m_classToCluster = null;
119
120	/**
121	* set the clusterer
122	* @param clusterer the clusterer to use
123	*/
124	public void setClusterer(Clusterer clusterer) {
125	m_Clusterer = clusterer;
126	}
127
128	/**
129	* return the results of clustering.
130	* @return a string detailing the results of clustering a data set
131	*/
132	public String clusterResultsToString() {
133	return m_clusteringResults.toString();
134	}
135
136	/**
137	* Return the number of clusters found for the most recent call to
138	* evaluateClusterer
139	* @return the number of clusters found
140	*/
141	public int getNumClusters() {
142	return m_numClusters;
143	}
144
145	/**
146	* Return an array of cluster assignments corresponding to the most
147	* recent set of instances clustered.
148	* @return an array of cluster assignments
149	*/
150	public double[] getClusterAssignments() {
151	return m_clusterAssignments;
152	}
153
154	/**
155	* Return the array (ordered by cluster number) of minimum error class to
156	* cluster mappings
157	* @return an array of class to cluster mappings
158	*/
159	public int[] getClassesToClusters() {
160	return m_classToCluster;
161	}
162
163	/**
164	* Return the log likelihood corresponding to the most recent
165	* set of instances clustered.
166	*
167	* @return a <code>double</code> value
168	*/
169	public double getLogLikelihood() {
170	return m_logL;
171	}
172
173	/**
174	* Constructor. Sets defaults for each member variable. Default Clusterer
175	* is SimpleKMeans.
176	*/
public ClusterEvaluation () {
  // SimpleKMeans is the clusterer in effect until setClusterer is called.
  setClusterer(new SimpleKMeans());
  // No assignments exist before the first evaluation.
  this.m_clusterAssignments = null;
  this.m_clusteringResults = new StringBuffer();
}
182
183	/**
184	* Evaluate the clusterer on a set of instances. Calculates clustering
185	* statistics and stores cluster assignments for the instances in
186	* m_clusterAssignments
187	*
188	* @param test the set of instances to cluster
189	* @throws Exception if something goes wrong
190	*/
191	public void evaluateClusterer(Instances test) throws Exception {
192	evaluateClusterer(test, "");
193	}
194
195	/**
196	* Evaluate the clusterer on a set of instances. Calculates clustering
197	* statistics and stores cluster assignments for the instances in
198	* m_clusterAssignments
199	*
200	* @param test the set of instances to cluster
201	* @param testFileName the name of the test file for incremental testing,
202	* if "" or null then not used
203	* @throws Exception if something goes wrong
204	*/
205	public void evaluateClusterer(Instances test, String testFileName) throws Exception {
206	int i = 0;
207	int cnum;
208	double loglk = 0.0;
209	int cc = m_Clusterer.numberOfClusters();
210	m_numClusters = cc;
211	double[] instanceStats = new double[cc];
212	Instances testRaw = null;
213	boolean hasClass = (test.classIndex() >= 0);
214	int unclusteredInstances = 0;
215	Vector<Double> clusterAssignments = new Vector<Double>();
216	Filter filter = null;
217	DataSource source = null;
218	Instance inst;
219
220	if (testFileName == null)
221	testFileName = "";
222
223	// load data
224	if (testFileName.length() != 0)
225	source = new DataSource(testFileName);
226	else
227	source = new DataSource(test);
228	testRaw = source.getStructure(test.classIndex());
229
230	// If class is set then do class based evaluation as well
231	if (hasClass) {
232	if (testRaw.classAttribute().isNumeric())
233	throw new Exception("ClusterEvaluation: Class must be nominal!");
234
235	filter = new Remove();
236	((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1));
237	((Remove) filter).setInvertSelection(false);
238	filter.setInputFormat(testRaw);
239	}
240
241	i = 0;
242	while (source.hasMoreElements(testRaw)) {
243	// next instance
244	inst = source.nextElement(testRaw);
245	if (filter != null) {
246	filter.input(inst);
247	filter.batchFinished();
248	inst = filter.output();
249	}
250
251	cnum = -1;
252	try {
253	if (m_Clusterer instanceof DensityBasedClusterer) {
254	loglk += ((DensityBasedClusterer)m_Clusterer).
255	logDensityForInstance(inst);
256	cnum = m_Clusterer.clusterInstance(inst);
257	clusterAssignments.add((double) cnum);
258	}
259	else {
260	cnum = m_Clusterer.clusterInstance(inst);
261	clusterAssignments.add((double) cnum);
262	}
263	}
264	catch (Exception e) {
265	clusterAssignments.add(-1.0);
266	unclusteredInstances++;
267	}
268
269	if (cnum != -1) {
270	instanceStats[cnum]++;
271	}
272	}
273
274	double sum = Utils.sum(instanceStats);
275	loglk /= sum;
276	m_logL = loglk;
277	m_clusterAssignments = new double [clusterAssignments.size()];
278	for (i = 0; i < clusterAssignments.size(); i++) {
279	m_clusterAssignments[i] = clusterAssignments.get(i);
280	}
281	int numInstFieldWidth = (int)((Math.log(clusterAssignments.size())/Math.log(10))+1);
282
283	m_clusteringResults.append(m_Clusterer.toString());
284	m_clusteringResults.append("Clustered Instances\n\n");
285	int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
286	for (i = 0; i < cc; i++) {
287	if (instanceStats[i] > 0)
288	m_clusteringResults.append(Utils.doubleToString((double)i,
289	clustFieldWidth, 0)
290	+ " "
291	+ Utils.doubleToString(instanceStats[i],
292	numInstFieldWidth, 0)
293	+ " ("
294	+ Utils.doubleToString((instanceStats[i] /
295	sum * 100.0)
296	, 3, 0) + "%)\n");
297	}
298
299	if (unclusteredInstances > 0)
300	m_clusteringResults.append("\nUnclustered instances : "
301	+unclusteredInstances);
302
303	if (m_Clusterer instanceof DensityBasedClusterer)
304	m_clusteringResults.append("\n\nLog likelihood: "
305	+ Utils.doubleToString(loglk, 1, 5)
306	+ "\n");
307
308	if (hasClass) {
309	evaluateClustersWithRespectToClass(test, testFileName);
310	}
311	}
312
313	/**
314	* Evaluates cluster assignments with respect to actual class labels.
315	* Assumes that m_Clusterer has been trained and tested on
316	* inst (minus the class).
317	*
318	* @param inst the instances (including class) to evaluate with respect to
319	* @param fileName the name of the test file for incremental testing,
320	* if "" or null then not used
321	* @throws Exception if something goes wrong
322	*/
323	private void evaluateClustersWithRespectToClass(Instances inst, String fileName)
324	throws Exception {
325
326
327
328	int numClasses = inst.classAttribute().numValues();
329	int[][] counts = new int [m_numClusters][numClasses];
330	int[] clusterTotals = new int[m_numClusters];
331	double[] best = new double[m_numClusters+1];
332	double[] current = new double[m_numClusters+1];
333	DataSource source = null;
334	Instances instances = null;
335	Instance instance = null;
336	int i;
337	int numInstances;
338
339
340	if (fileName == null)
341	fileName = "";
342
343	if (fileName.length() != 0) {
344	source = new DataSource(fileName);
345	}
346	else
347	source = new DataSource(inst);
348	instances = source.getStructure(inst.classIndex());
349
350	i = 0;
351	while (source.hasMoreElements(instances)) {
352	instance = source.nextElement(instances);
353	if (m_clusterAssignments[i] >= 0) {
354	counts[(int)m_clusterAssignments[i]][(int)instance.classValue()]++;
355	clusterTotals[(int)m_clusterAssignments[i]]++;
356	}
357	i++;
358	}
359	numInstances = i;
360
361	best[m_numClusters] = Double.MAX_VALUE;
362	mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0);
363
364	m_clusteringResults.append("\n\nClass attribute: "
365	+inst.classAttribute().name()
366	+"\n");
367	m_clusteringResults.append("Classes to Clusters:\n");
368	String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0));
369	m_clusteringResults.append(matrixString).append("\n");
370
371	int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10));
372	// add the minimum error assignment
373	for (i = 0; i < m_numClusters; i++) {
374	if (clusterTotals[i] > 0) {
375	m_clusteringResults.append("Cluster "
376	+Utils.doubleToString((double)i,Cwidth,0));
377	m_clusteringResults.append(" <-- ");
378
379	if (best[i] < 0) {
380	m_clusteringResults.append("No class\n");
381	} else {
382	m_clusteringResults.
383	append(inst.classAttribute().value((int)best[i])).append("\n");
384	}
385	}
386	}
387	m_clusteringResults.append("\nIncorrectly clustered instances :\t"
388	+best[m_numClusters]+"\t"
389	+(Utils.doubleToString((best[m_numClusters] /
390	numInstances *
391	100.0), 8, 4))
392	+" %\n");
393
394	// copy the class assignments
395	m_classToCluster = new int [m_numClusters];
396	for (i = 0; i < m_numClusters; i++) {
397	m_classToCluster[i] = (int)best[i];
398	}
399	}
400
401	/**
402	* Returns a "confusion" style matrix of classes to clusters assignments
403	* @param counts the counts of classes for each cluster
404	* @param clusterTotals total number of examples in each cluster
405	* @param inst the training instances (with class)
406	* @return the "confusion" style matrix as string
407	* @throws Exception if matrix can't be generated
408	*/
409	private String toMatrixString(int[][] counts, int[] clusterTotals,
410	Instances inst)
411	throws Exception {
412	StringBuffer ms = new StringBuffer();
413
414	int maxval = 0;
415	for (int i = 0; i < m_numClusters; i++) {
416	for (int j = 0; j < counts[i].length; j++) {
417	if (counts[i][j] > maxval) {
418	maxval = counts[i][j];
419	}
420	}
421	}
422
423	int Cwidth = 1 + Math.max((int)(Math.log(maxval) / Math.log(10)),
424	(int)(Math.log(m_numClusters) / Math.log(10)));
425
426	ms.append("\n");
427
428	for (int i = 0; i < m_numClusters; i++) {
429	if (clusterTotals[i] > 0) {
430	ms.append(" ").append(Utils.doubleToString((double)i, Cwidth, 0));
431	}
432	}
433	ms.append(" <-- assigned to cluster\n");
434
435	for (int i = 0; i< counts[0].length; i++) {
436
437	for (int j = 0; j < m_numClusters; j++) {
438	if (clusterTotals[j] > 0) {
439	ms.append(" ").append(Utils.doubleToString((double)counts[j][i],
440	Cwidth, 0));
441	}
442	}
443	ms.append(" \| ").append(inst.classAttribute().value(i)).append("\n");
444	}
445
446	return ms.toString();
447	}
448
449	/**
450	* Finds the minimum error mapping of classes to clusters. Recursively
451	* considers all possible class to cluster assignments.
452	*
453	* @param numClusters the number of clusters
454	* @param lev the cluster being processed
455	* @param counts the counts of classes in clusters
456	* @param clusterTotals the total number of examples in each cluster
457	* @param current the current path through the class to cluster assignment
458	* tree
459	* @param best the best assignment path seen
460	* @param error accumulates the error for a particular path
461	*/
462	public static void mapClasses(int numClusters, int lev, int[][] counts, int[] clusterTotals,
463	double[] current, double[] best, int error) {
464	// leaf
465	if (lev == numClusters) {
466	if (error < best[numClusters]) {
467	best[numClusters] = error;
468	for (int i = 0; i < numClusters; i++) {
469	best[i] = current[i];
470	}
471	}
472	} else {
473	// empty cluster -- ignore
474	if (clusterTotals[lev] == 0) {
475	current[lev] = -1; // cluster ignored
476	mapClasses(numClusters, lev+1, counts, clusterTotals, current, best,
477	error);
478	} else {
479	// first try no class assignment to this cluster
480	current[lev] = -1; // cluster assigned no class (ie all errors)
481	mapClasses(numClusters, lev+1, counts, clusterTotals, current, best,
482	error+clusterTotals[lev]);
483	// now loop through the classes in this cluster
484	for (int i = 0; i < counts[0].length; i++) {
485	if (counts[lev][i] > 0) {
486	boolean ok = true;
487	// check to see if this class has already been assigned
488	for (int j = 0; j < lev; j++) {
489	if ((int)current[j] == i) {
490	ok = false;
491	break;
492	}
493	}
494	if (ok) {
495	current[lev] = i;
496	mapClasses(numClusters, lev+1, counts, clusterTotals, current, best,
497	(error + (clusterTotals[lev] - counts[lev][i])));
498	}
499	}
500	}
501	}
502	}
503	}
504
505	/**
506	* Evaluates a clusterer with the options given in an array of
507	* strings. It takes the string indicated by "-t" as training file, the
508	* string indicated by "-T" as test file.
509	* If the test file is missing and "-x" is given, a cross-validation of
510	* the log likelihood is performed (distribution clusterers only).
511	* Using "-x" you set the number of
512	* folds to be used, and using "-s" the random seed.
513	* If the "-p" option is present it outputs the classification for
514	* each test instance. If you provide the name of an object file using
515	* "-l", a clusterer will be loaded from the given file. If you provide the
516	* name of an object file using "-d", the clusterer built from the
517	* training data will be saved to the given file.
518	*
519	* @param clusterer machine learning clusterer
520	* @param options the array of string containing the options
521	* @throws Exception if model could not be evaluated successfully
522	* @return a string describing the results
523	*/
524	public static String evaluateClusterer(Clusterer clusterer, String[] options)
525	throws Exception {
526
527	int seed = 1, folds = 10;
528	boolean doXval = false;
529	Instances train = null;
530	Random random;
531	String trainFileName, testFileName, seedString, foldsString;
532	String objectInputFileName, objectOutputFileName, attributeRangeString;
533	String graphFileName;
534	String[] savedOptions = null;
535	boolean printClusterAssignments = false;
536	Range attributesToOutput = null;
537	StringBuffer text = new StringBuffer();
538	int theClass = -1; // class based evaluation of clustering
539	boolean updateable = (clusterer instanceof UpdateableClusterer);
540	DataSource source = null;
541	Instance inst;
542
543	if (Utils.getFlag('h', options) \|\| Utils.getFlag("help", options)) {
544
545	// global info requested as well?
546	boolean globalInfo = Utils.getFlag("synopsis", options) \|\|
547	Utils.getFlag("info", options);
548
549	throw new Exception("Help requested."
550	+ makeOptionString(clusterer, globalInfo));
551	}
552
553	try {
554	// Get basic options (options the same for all clusterers
555	//printClusterAssignments = Utils.getFlag('p', options);
556	objectInputFileName = Utils.getOption('l', options);
557	objectOutputFileName = Utils.getOption('d', options);
558	trainFileName = Utils.getOption('t', options);
559	testFileName = Utils.getOption('T', options);
560	graphFileName = Utils.getOption('g', options);
561
562	// Check -p option
563	try {
564	attributeRangeString = Utils.getOption('p', options);
565	}
566	catch (Exception e) {
567	throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. " +
568	"It now expects a parameter specifying a range of attributes " +
569	"to list with the predictions. Use '-p 0' for none.");
570	}
571	if (attributeRangeString.length() != 0) {
572	printClusterAssignments = true;
573	if (!attributeRangeString.equals("0"))
574	attributesToOutput = new Range(attributeRangeString);
575	}
576
577	if (trainFileName.length() == 0) {
578	if (objectInputFileName.length() == 0) {
579	throw new Exception("No training file and no object "
580	+ "input file given.");
581	}
582
583	if (testFileName.length() == 0) {
584	throw new Exception("No training file and no test file given.");
585	}
586	}
587	else {
588	if ((objectInputFileName.length() != 0)
589	&& (printClusterAssignments == false)) {
590	throw new Exception("Can't use both train and model file "
591	+ "unless -p specified.");
592	}
593	}
594
595	seedString = Utils.getOption('s', options);
596
597	if (seedString.length() != 0) {
598	seed = Integer.parseInt(seedString);
599	}
600
601	foldsString = Utils.getOption('x', options);
602
603	if (foldsString.length() != 0) {
604	folds = Integer.parseInt(foldsString);
605	doXval = true;
606	}
607	}
608	catch (Exception e) {
609	throw new Exception('\n' + e.getMessage()
610	+ makeOptionString(clusterer, false));
611	}
612
613	try {
614	if (trainFileName.length() != 0) {
615	source = new DataSource(trainFileName);
616	train = source.getStructure();
617
618	String classString = Utils.getOption('c',options);
619	if (classString.length() != 0) {
620	if (classString.compareTo("last") == 0)
621	theClass = train.numAttributes();
622	else if (classString.compareTo("first") == 0)
623	theClass = 1;
624	else
625	theClass = Integer.parseInt(classString);
626
627	if (theClass != -1) {
628	if (doXval \|\| testFileName.length() != 0)
629	throw new Exception("Can only do class based evaluation on the "
630	+"training data");
631
632	if (objectInputFileName.length() != 0)
633	throw new Exception("Can't load a clusterer and do class based "
634	+"evaluation");
635
636	if (objectOutputFileName.length() != 0)
637	throw new Exception(
638	"Can't do class based evaluation and save clusterer");
639	}
640	}
641	else {
642	// if the dataset defines a class attribute, use it
643	if (train.classIndex() != -1) {
644	theClass = train.classIndex() + 1;
645	System.err.println(
646	"Note: using class attribute from dataset, i.e., attribute #"
647	+ theClass);
648	}
649	}
650
651	if (theClass != -1) {
652	if (theClass < 1 \|\| theClass > train.numAttributes())
653	throw new Exception("Class is out of range!");
654
655	if (!train.attribute(theClass - 1).isNominal())
656	throw new Exception("Class must be nominal!");
657
658	train.setClassIndex(theClass - 1);
659	}
660	}
661	}
662	catch (Exception e) {
663	throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
664	}
665
666	// Save options
667	if (options != null) {
668	savedOptions = new String[options.length];
669	System.arraycopy(options, 0, savedOptions, 0, options.length);
670	}
671
672	if (objectInputFileName.length() != 0)
673	Utils.checkForRemainingOptions(options);
674
675	// Set options for clusterer
676	if (clusterer instanceof OptionHandler)
677	((OptionHandler)clusterer).setOptions(options);
678
679	Utils.checkForRemainingOptions(options);
680
681	Instances trainHeader = train;
682	if (objectInputFileName.length() != 0) {
683	// Load the clusterer from file
684	// clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
685	java.io.ObjectInputStream ois =
686	new java.io.ObjectInputStream(
687	new java.io.BufferedInputStream(
688	new java.io.FileInputStream(objectInputFileName)));
689	clusterer = (Clusterer) ois.readObject();
690	// try and get the training header
691	try {
692	trainHeader = (Instances) ois.readObject();
693	} catch (Exception ex) {
694	// don't moan if we cant
695	}
696	}
697	else {
698	// Build the clusterer if no object file provided
699	if (theClass == -1) {
700	if (updateable) {
701	clusterer.buildClusterer(source.getStructure());
702	while (source.hasMoreElements(train)) {
703	inst = source.nextElement(train);
704	((UpdateableClusterer) clusterer).updateClusterer(inst);
705	}
706	((UpdateableClusterer) clusterer).updateFinished();
707	}
708	else {
709	clusterer.buildClusterer(source.getDataSet());
710	}
711	}
712	else {
713	Remove removeClass = new Remove();
714	removeClass.setAttributeIndices("" + theClass);
715	removeClass.setInvertSelection(false);
716	removeClass.setInputFormat(train);
717	if (updateable) {
718	Instances clusterTrain = Filter.useFilter(train, removeClass);
719	clusterer.buildClusterer(clusterTrain);
720	trainHeader = clusterTrain;
721	while (source.hasMoreElements(train)) {
722	inst = source.nextElement(train);
723	removeClass.input(inst);
724	removeClass.batchFinished();
725	Instance clusterTrainInst = removeClass.output();
726	((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
727	}
728	((UpdateableClusterer) clusterer).updateFinished();
729	}
730	else {
731	Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
732	clusterer.buildClusterer(clusterTrain);
733	trainHeader = clusterTrain;
734	}
735	ClusterEvaluation ce = new ClusterEvaluation();
736	ce.setClusterer(clusterer);
737	ce.evaluateClusterer(train, trainFileName);
738
739	return "\n\n=== Clustering stats for training data ===\n\n" +
740	ce.clusterResultsToString();
741	}
742	}
743
744	/* Output cluster predictions only (for the test data if specified,
745	otherwise for the training data */
746	if (printClusterAssignments) {
747	return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
748	}
749
750	text.append(clusterer.toString());
751	text.append("\n\n=== Clustering stats for training data ===\n\n"
752	+ printClusterStats(clusterer, trainFileName));
753
754	if (testFileName.length() != 0) {
755	// check header compatibility
756	DataSource test = new DataSource(testFileName);
757	Instances testStructure = test.getStructure();
758	if (!trainHeader.equalHeaders(testStructure)) {
759	throw new Exception("Training and testing data are not compatible\n" + trainHeader.equalHeadersMsg(testStructure));
760	}
761
762	text.append("\n\n=== Clustering stats for testing data ===\n\n"
763	+ printClusterStats(clusterer, testFileName));
764	}
765
766	if ((clusterer instanceof DensityBasedClusterer) &&
767	(doXval == true) &&
768	(testFileName.length() == 0) &&
769	(objectInputFileName.length() == 0)) {
770	// cross validate the log likelihood on the training data
771	random = new Random(seed);
772	random.setSeed(seed);
773	train = source.getDataSet();
774	train.randomize(random);
775	text.append(
776	crossValidateModel(
777	clusterer.getClass().getName(), train, folds, savedOptions, random));
778	}
779
780	// Save the clusterer if an object output file is provided
781	if (objectOutputFileName.length() != 0) {
782	//SerializationHelper.write(objectOutputFileName, clusterer);
783	saveClusterer(objectOutputFileName, clusterer, trainHeader);
784	}
785
786	// If classifier is drawable output string describing graph
787	if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
788	BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
789	writer.write(((Drawable) clusterer).graph());
790	writer.newLine();
791	writer.flush();
792	writer.close();
793	}
794
795	return text.toString();
796	}
797
798	private static void saveClusterer(String fileName,
799	Clusterer clusterer,
800	Instances header) throws Exception {
801	java.io.ObjectOutputStream oos =
802	new java.io.ObjectOutputStream(
803	new java.io.BufferedOutputStream(
804	new java.io.FileOutputStream(fileName)));
805
806	oos.writeObject(clusterer);
807	if (header != null) {
808	oos.writeObject(header);
809	}
810	oos.flush();
811	oos.close();
812	}
813
814	/**
815	* Perform a cross-validation for DensityBasedClusterer on a set of instances.
816	*
817	* @param clusterer the clusterer to use
818	* @param data the training data
819	* @param numFolds number of folds of cross validation to perform
820	* @param random random number seed for cross-validation
821	* @return the cross-validated log-likelihood
822	* @throws Exception if an error occurs
823	*/
824	public static double crossValidateModel(DensityBasedClusterer clusterer,
825	Instances data,
826	int numFolds,
827	Random random) throws Exception {
828	Instances train, test;
829	double foldAv = 0;;
830	data = new Instances(data);
831	data.randomize(random);
832	// double sumOW = 0;
833	for (int i = 0; i < numFolds; i++) {
834	// Build and test clusterer
835	train = data.trainCV(numFolds, i, random);
836
837	clusterer.buildClusterer(train);
838
839	test = data.testCV(numFolds, i);
840
841	for (int j = 0; j < test.numInstances(); j++) {
842	try {
843	foldAv += ((DensityBasedClusterer)clusterer).
844	logDensityForInstance(test.instance(j));
845	// sumOW += test.instance(j).weight();
846	// double temp = Utils.sum(tempDist);
847	} catch (Exception ex) {
848	// unclustered instances
849	}
850	}
851	}
852
853	// return foldAv / sumOW;
854	return foldAv / data.numInstances();
855	}
856
857	/**
858	* Performs a cross-validation
859	* for a DensityBasedClusterer clusterer on a set of instances.
860	*
861	* @param clustererString a string naming the class of the clusterer
862	* @param data the data on which the cross-validation is to be
863	* performed
864	* @param numFolds the number of folds for the cross-validation
865	* @param options the options to the clusterer
866	* @param random a random number generator
867	* @return a string containing the cross validated log likelihood
868	* @throws Exception if a clusterer could not be generated
869	*/
870	public static String crossValidateModel (String clustererString,
871	Instances data,
872	int numFolds,
873	String[] options,
874	Random random)
875	throws Exception {
876	Clusterer clusterer = null;
877	String[] savedOptions = null;
878	double CvAv = 0.0;
879	StringBuffer CvString = new StringBuffer();
880
881	if (options != null) {
882	savedOptions = new String[options.length];
883	}
884
885	data = new Instances(data);
886
887	// create clusterer
888	try {
889	clusterer = (Clusterer)Class.forName(clustererString).newInstance();
890	}
891	catch (Exception e) {
892	throw new Exception("Can't find class with name "
893	+ clustererString + '.');
894	}
895
896	if (!(clusterer instanceof DensityBasedClusterer)) {
897	throw new Exception(clustererString
898	+ " must be a distrinbution "
899	+ "clusterer.");
900	}
901
902	// Save options
903	if (options != null) {
904	System.arraycopy(options, 0, savedOptions, 0, options.length);
905	}
906
907	// Parse options
908	if (clusterer instanceof OptionHandler) {
909	try {
910	((OptionHandler)clusterer).setOptions(savedOptions);
911	Utils.checkForRemainingOptions(savedOptions);
912	}
913	catch (Exception e) {
914	throw new Exception("Can't parse given options in "
915	+ "cross-validation!");
916	}
917	}
918	CvAv = crossValidateModel((DensityBasedClusterer)clusterer, data, numFolds, random);
919
920	CvString.append("\n" + numFolds
921	+ " fold CV Log Likelihood: "
922	+ Utils.doubleToString(CvAv, 6, 4)
923	+ "\n");
924	return CvString.toString();
925	}
926
927
928	// ===============
929	// Private methods
930	// ===============
931	/**
932	* Print the cluster statistics for either the training
933	* or the testing data.
934	*
935	* @param clusterer the clusterer to use for generating statistics.
936	* @param fileName the file to load
937	* @return a string containing cluster statistics.
938	* @throws Exception if statistics can't be generated.
939	*/
940	private static String printClusterStats (Clusterer clusterer,
941	String fileName)
942	throws Exception {
943	StringBuffer text = new StringBuffer();
944	int i = 0;
945	int cnum;
946	double loglk = 0.0;
947	int cc = clusterer.numberOfClusters();
948	double[] instanceStats = new double[cc];
949	int unclusteredInstances = 0;
950
951	if (fileName.length() != 0) {
952	DataSource source = new DataSource(fileName);
953	Instances structure = source.getStructure();
954	Instance inst;
955	while (source.hasMoreElements(structure)) {
956	inst = source.nextElement(structure);
957	try {
958	cnum = clusterer.clusterInstance(inst);
959
960	if (clusterer instanceof DensityBasedClusterer) {
961	loglk += ((DensityBasedClusterer)clusterer).
962	logDensityForInstance(inst);
963	// temp = Utils.sum(dist);
964	}
965	instanceStats[cnum]++;
966	}
967	catch (Exception e) {
968	unclusteredInstances++;
969	}
970	i++;
971	}
972
973	/*
974	// count the actual number of used clusters
975	int count = 0;
976	for (i = 0; i < cc; i++) {
977	if (instanceStats[i] > 0) {
978	count++;
979	}
980	}
981	if (count > 0) {
982	double[] tempStats = new double [count];
983	count=0;
984	for (i=0;i<cc;i++) {
985	if (instanceStats[i] > 0) {
986	tempStats[count++] = instanceStats[i];
987	}
988	}
989	instanceStats = tempStats;
990	cc = instanceStats.length;
991	} */
992
993	int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
994	int numInstFieldWidth = (int)((Math.log(i)/Math.log(10))+1);
995	double sum = Utils.sum(instanceStats);
996	loglk /= sum;
997	text.append("Clustered Instances\n");
998
999	for (i = 0; i < cc; i++) {
1000	if (instanceStats[i] > 0) {
1001	text.append(Utils.doubleToString((double)i,
1002	clustFieldWidth, 0)
1003	+ " "
1004	+ Utils.doubleToString(instanceStats[i],
1005	numInstFieldWidth, 0)
1006	+ " ("
1007	+ Utils.doubleToString((instanceStats[i]/sum*100.0)
1008	, 3, 0) + "%)\n");
1009	}
1010	}
1011	if (unclusteredInstances > 0) {
1012	text.append("\nUnclustered Instances : "+unclusteredInstances);
1013	}
1014
1015	if (clusterer instanceof DensityBasedClusterer) {
1016	text.append("\n\nLog likelihood: "
1017	+ Utils.doubleToString(loglk, 1, 5)
1018	+ "\n");
1019	}
1020	}
1021
1022	return text.toString();
1023	}
1024
1025
1026	/**
1027	* Print the cluster assignments for either the training
1028	* or the testing data.
1029	*
1030	* @param clusterer the clusterer to use for cluster assignments
1031	* @param trainFileName the train file
1032	* @param testFileName an optional test file
1033	* @param attributesToOutput the attributes to print
1034	* @return a string containing the instance indexes and cluster assigns.
1035	* @throws Exception if cluster assignments can't be printed
1036	*/
1037	private static String printClusterings (Clusterer clusterer, String trainFileName,
1038	String testFileName, Range attributesToOutput)
1039	throws Exception {
1040
1041	StringBuffer text = new StringBuffer();
1042	int i = 0;
1043	int cnum;
1044	DataSource source = null;
1045	Instance inst;
1046	Instances structure;
1047
1048	if (testFileName.length() != 0)
1049	source = new DataSource(testFileName);
1050	else
1051	source = new DataSource(trainFileName);
1052
1053	structure = source.getStructure();
1054	while (source.hasMoreElements(structure)) {
1055	inst = source.nextElement(structure);
1056	try {
1057	cnum = clusterer.clusterInstance(inst);
1058
1059	text.append(i + " " + cnum + " "
1060	+ attributeValuesString(inst, attributesToOutput) + "\n");
1061	}
1062	catch (Exception e) {
1063	/* throw new Exception('\n' + "Unable to cluster instance\n"
1064	+ e.getMessage()); */
1065	text.append(i + " Unclustered "
1066	+ attributeValuesString(inst, attributesToOutput) + "\n");
1067	}
1068	i++;
1069	}
1070
1071	return text.toString();
1072	}
1073
1074	/**
1075	* Builds a string listing the attribute values in a specified range of indices,
1076	* separated by commas and enclosed in brackets.
1077	*
1078	* @param instance the instance to print the values from
1079	* @param attRange the range of the attributes to list
1080	* @return a string listing values of the attributes in the range
1081	*/
1082	private static String attributeValuesString(Instance instance, Range attRange) {
1083	StringBuffer text = new StringBuffer();
1084	if (attRange != null) {
1085	boolean firstOutput = true;
1086	attRange.setUpper(instance.numAttributes() - 1);
1087	for (int i=0; i<instance.numAttributes(); i++)
1088	if (attRange.isInRange(i)) {
1089	if (firstOutput) text.append("(");
1090	else text.append(",");
1091	text.append(instance.toString(i));
1092	firstOutput = false;
1093	}
1094	if (!firstOutput) text.append(")");
1095	}
1096	return text.toString();
1097	}
1098
1099	/**
1100	* Make up the help string giving all the command line options
1101	*
1102	* @param clusterer the clusterer to include options for
1103	* @return a string detailing the valid command line options
1104	*/
1105	private static String makeOptionString (Clusterer clusterer,
1106	boolean globalInfo) {
1107	StringBuffer optionsText = new StringBuffer("");
1108	// General options
1109	optionsText.append("\n\nGeneral options:\n\n");
1110	optionsText.append("-h or -help\n");
1111	optionsText.append("\tOutput help information.\n");
1112	optionsText.append("-synopsis or -info\n");
1113	optionsText.append("\tOutput synopsis for clusterer (use in conjunction "
1114	+ " with -h)\n");
1115	optionsText.append("-t <name of training file>\n");
1116	optionsText.append("\tSets training file.\n");
1117	optionsText.append("-T <name of test file>\n");
1118	optionsText.append("\tSets test file.\n");
1119	optionsText.append("-l <name of input file>\n");
1120	optionsText.append("\tSets model input file.\n");
1121	optionsText.append("-d <name of output file>\n");
1122	optionsText.append("\tSets model output file.\n");
1123	optionsText.append("-p <attribute range>\n");
1124	optionsText.append("\tOutput predictions. Predictions are for "
1125	+ "training file"
1126	+ "\n\tif only training file is specified,"
1127	+ "\n\totherwise predictions are for the test file."
1128	+ "\n\tThe range specifies attribute values to be output"
1129	+ "\n\twith the predictions. Use '-p 0' for none.\n");
1130	optionsText.append("-x <number of folds>\n");
1131	optionsText.append("\tOnly Distribution Clusterers can be cross validated.\n");
1132	optionsText.append("-s <random number seed>\n");
1133	optionsText.append("\tSets the seed for randomizing the data in cross-validation\n");
1134	optionsText.append("-c <class index>\n");
1135	optionsText.append("\tSet class attribute. If supplied, class is ignored");
1136	optionsText.append("\n\tduring clustering but is used in a classes to");
1137	optionsText.append("\n\tclusters evaluation.\n");
1138	if (clusterer instanceof Drawable) {
1139	optionsText.append("-g <name of graph file>\n");
1140	optionsText.append("\tOutputs the graph representation of the clusterer to the file.\n");
1141	}
1142
1143	// Get scheme-specific options
1144	if (clusterer instanceof OptionHandler) {
1145	optionsText.append("\nOptions specific to "
1146	+ clusterer.getClass().getName() + ":\n\n");
1147	Enumeration enu = ((OptionHandler)clusterer).listOptions();
1148
1149	while (enu.hasMoreElements()) {
1150	Option option = (Option)enu.nextElement();
1151	optionsText.append(option.synopsis() + '\n');
1152	optionsText.append(option.description() + "\n");
1153	}
1154	}
1155
1156	// Get global information (if available)
1157	if (globalInfo) {
1158	try {
1159	String gi = getGlobalInfo(clusterer);
1160	optionsText.append(gi);
1161	} catch (Exception ex) {
1162	// quietly ignore
1163	}
1164	}
1165
1166	return optionsText.toString();
1167	}
1168
1169	/**
1170	* Return the global info (if it exists) for the supplied clusterer
1171	*
1172	* @param clusterer the clusterer to get the global info for
1173	* @return the global info (synopsis) for the clusterer
1174	* @throws Exception if there is a problem reflecting on the clusterer
1175	*/
1176	protected static String getGlobalInfo(Clusterer clusterer) throws Exception {
1177	BeanInfo bi = Introspector.getBeanInfo(clusterer.getClass());
1178	MethodDescriptor[] methods;
1179	methods = bi.getMethodDescriptors();
1180	Object[] args = {};
1181	String result = "\nSynopsis for " + clusterer.getClass().getName()
1182	+ ":\n\n";
1183
1184	for (int i = 0; i < methods.length; i++) {
1185	String name = methods[i].getDisplayName();
1186	Method meth = methods[i].getMethod();
1187	if (name.equals("globalInfo")) {
1188	String globalInfo = (String)(meth.invoke(clusterer, args));
1189	result += globalInfo;
1190	break;
1191	}
1192	}
1193
1194	return result;
1195	}
1196
1197	/**
1198	* Tests whether the current evaluation object is equal to another
1199	* evaluation object
1200	*
1201	* @param obj the object to compare against
1202	* @return true if the two objects are equal
1203	*/
1204	public boolean equals(Object obj) {
1205	if ((obj == null) \|\| !(obj.getClass().equals(this.getClass())))
1206	return false;
1207
1208	ClusterEvaluation cmp = (ClusterEvaluation) obj;
1209
1210	if ((m_classToCluster != null) != (cmp.m_classToCluster != null)) return false;
1211	if (m_classToCluster != null) {
1212	for (int i = 0; i < m_classToCluster.length; i++) {
1213	if (m_classToCluster[i] != cmp.m_classToCluster[i])
1214	return false;
1215	}
1216	}
1217
1218	if ((m_clusterAssignments != null) != (cmp.m_clusterAssignments != null)) return false;
1219	if (m_clusterAssignments != null) {
1220	for (int i = 0; i < m_clusterAssignments.length; i++) {
1221	if (m_clusterAssignments[i] != cmp.m_clusterAssignments[i])
1222	return false;
1223	}
1224	}
1225
1226	if (Double.isNaN(m_logL) != Double.isNaN(cmp.m_logL)) return false;
1227	if (!Double.isNaN(m_logL)) {
1228	if (m_logL != cmp.m_logL) return false;
1229	}
1230
1231	if (m_numClusters != cmp.m_numClusters) return false;
1232
1233	// TODO: better comparison? via members?
1234	String clusteringResults1 = m_clusteringResults.toString().replaceAll("Elapsed time.*", "");
1235	String clusteringResults2 = cmp.m_clusteringResults.toString().replaceAll("Elapsed time.*", "");
1236	if (!clusteringResults1.equals(clusteringResults2)) return false;
1237
1238	return true;
1239	}
1240
1241	/**
1242	* Returns the revision string.
1243	*
1244	* @return the revision
1245	*/
1246	public String getRevision() {
1247	return RevisionUtils.extract("$Revision: 6021 $");
1248	}
1249
1250	/**
1251	* Main method for testing this class.
1252	*
1253	* @param args the options
1254	*/
1255	public static void main (String[] args) {
1256	try {
1257	if (args.length == 0) {
1258	throw new Exception("The first argument must be the name of a "
1259	+ "clusterer");
1260	}
1261
1262	String ClustererString = args[0];
1263	args[0] = "";
1264	Clusterer newClusterer = AbstractClusterer.forName(ClustererString, null);
1265	System.out.println(evaluateClusterer(newClusterer, args));
1266	}
1267	catch (Exception e) {
1268	System.out.println(e.getMessage());
1269	}
1270	}
1271	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: