Context Navigation

source: src/main/java/weka/experiment/DensityBasedClustererSplitEvaluator.java @ 4

Last change on this file since 4 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 18.8 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* DensityBasedClustererSplitEvaluator.java
19	* Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23
24	package weka.experiment;
25
26	import weka.clusterers.ClusterEvaluation;
27	import weka.clusterers.Clusterer;
28	import weka.clusterers.AbstractClusterer;
29	import weka.clusterers.AbstractDensityBasedClusterer;
30	import weka.clusterers.DensityBasedClusterer;
31	import weka.clusterers.EM;
32	import weka.core.AdditionalMeasureProducer;
33	import weka.core.Instances;
34	import weka.core.Option;
35	import weka.core.OptionHandler;
36	import weka.core.RevisionHandler;
37	import weka.core.RevisionUtils;
38	import weka.core.Utils;
39	import weka.filters.Filter;
40	import weka.filters.unsupervised.attribute.Remove;
41
42	import java.io.ObjectStreamClass;
43	import java.io.Serializable;
44	import java.util.Enumeration;
45	import java.util.Vector;
46
47	/**
48	* A SplitEvaluator that produces results for a density based clusterer.
49	*
50	* -W classname <br>
51	* Specify the full class name of the clusterer to evaluate. <p>
52	*
53	* @author Mark Hall (mhall{[at]}pentaho{[dot]}org
54	* @version $Revision: 5563 $
55	*/
56
57	public class DensityBasedClustererSplitEvaluator
58	implements SplitEvaluator,
59	OptionHandler,
60	AdditionalMeasureProducer,
61	RevisionHandler {
62
63	/** Remove the class column (if set) from the data */
64	protected boolean m_removeClassColumn = true;
65
66	/** The clusterer used for evaluation */
67	protected DensityBasedClusterer m_clusterer = new EM();
68
69	/** The names of any additional measures to look for in SplitEvaluators */
70	protected String [] m_additionalMeasures = null;
71
72	/** Array of booleans corresponding to the measures in m_AdditionalMeasures
73	indicating which of the AdditionalMeasures the current clusterer
74	can produce */
75	protected boolean [] m_doesProduce = null;
76
77	/** The number of additional measures that need to be filled in
78	after taking into account column constraints imposed by the final
79	destination for results */
80	protected int m_numberAdditionalMeasures = 0;
81
82	/** Holds the statistics for the most recent application of the clusterer */
83	protected String m_result = null;
84
85	/** The clusterer options (if any) */
86	protected String m_clustererOptions = "";
87
88	/** The clusterer version */
89	protected String m_clustererVersion = "";
90
91	/** The length of a key */
92	private static final int KEY_SIZE = 3;
93
94	/** The length of a result */
95	private static final int RESULT_SIZE = 6;
96
97
98	public DensityBasedClustererSplitEvaluator() {
99	updateOptions();
100	}
101
102	/**
103	* Returns a string describing this split evaluator
104	* @return a description of the split evaluator suitable for
105	* displaying in the explorer/experimenter gui
106	*/
107	public String globalInfo() {
108	return " A SplitEvaluator that produces results for a density based clusterer. ";
109	}
110
111	/**
112	* Returns an enumeration describing the available options.
113	*
114	* @return an enumeration of all the available options.
115	*/
116	public Enumeration listOptions() {
117
118	Vector newVector = new Vector(1);
119
120	newVector.addElement(new Option(
121	"\tThe full class name of the density based clusterer.\n"
122	+"\teg: weka.clusterers.EM",
123	"W", 1,
124	"-W <class name>"));
125
126	if ((m_clusterer != null) &&
127	(m_clusterer instanceof OptionHandler)) {
128	newVector.addElement(new Option(
129	"",
130	"", 0, "\nOptions specific to clusterer "
131	+ m_clusterer.getClass().getName() + ":"));
132	Enumeration enu = ((OptionHandler)m_clusterer).listOptions();
133	while (enu.hasMoreElements()) {
134	newVector.addElement(enu.nextElement());
135	}
136	}
137	return newVector.elements();
138	}
139
140	/**
141	* Parses a given list of options. Valid options are:<p>
142	*
143	* -W classname <br>
144	* Specify the full class name of the clusterer to evaluate. <p>
145	*
146	* All option after -- will be passed to the classifier.
147	*
148	* @param options the list of options as an array of strings
149	* @exception Exception if an option is not supported
150	*/
151	public void setOptions(String[] options) throws Exception {
152
153	String cName = Utils.getOption('W', options);
154	if (cName.length() == 0) {
155	throw new Exception("A clusterer must be specified with"
156	+ " the -W option.");
157	}
158	// Do it first without options, so if an exception is thrown during
159	// the option setting, listOptions will contain options for the actual
160	// Classifier.
161	setClusterer((DensityBasedClusterer)AbstractClusterer.forName(cName, null));
162	if (getClusterer() instanceof OptionHandler) {
163	((OptionHandler) getClusterer())
164	.setOptions(Utils.partitionOptions(options));
165	updateOptions();
166	}
167	}
168
169	/**
170	* Gets the current settings of the Classifier.
171	*
172	* @return an array of strings suitable for passing to setOptions
173	*/
174	public String [] getOptions() {
175
176	String [] clustererOptions = new String [0];
177	if ((m_clusterer != null) &&
178	(m_clusterer instanceof OptionHandler)) {
179	clustererOptions = ((OptionHandler)m_clusterer).getOptions();
180	}
181
182	String [] options = new String [clustererOptions.length + 3];
183	int current = 0;
184
185	if (getClusterer() != null) {
186	options[current++] = "-W";
187	options[current++] = getClusterer().getClass().getName();
188	}
189
190	options[current++] = "--";
191
192	System.arraycopy(clustererOptions, 0, options, current,
193	clustererOptions.length);
194	current += clustererOptions.length;
195	while (current < options.length) {
196	options[current++] = "";
197	}
198	return options;
199	}
200
201	/**
202	* Set a list of method names for additional measures to look for
203	* in Classifiers. This could contain many measures (of which only a
204	* subset may be produceable by the current Classifier) if an experiment
205	* is the type that iterates over a set of properties.
206	* @param additionalMeasures a list of method names
207	*/
208	public void setAdditionalMeasures(String [] additionalMeasures) {
209	// System.err.println("ClassifierSplitEvaluator: setting additional measures");
210	m_additionalMeasures = additionalMeasures;
211
212	// determine which (if any) of the additional measures this clusterer
213	// can produce
214	if (m_additionalMeasures != null && m_additionalMeasures.length > 0) {
215	m_doesProduce = new boolean [m_additionalMeasures.length];
216
217	if (m_clusterer instanceof AdditionalMeasureProducer) {
218	Enumeration en = ((AdditionalMeasureProducer)m_clusterer).
219	enumerateMeasures();
220	while (en.hasMoreElements()) {
221	String mname = (String)en.nextElement();
222	for (int j=0;j<m_additionalMeasures.length;j++) {
223	if (mname.compareToIgnoreCase(m_additionalMeasures[j]) == 0) {
224	m_doesProduce[j] = true;
225	}
226	}
227	}
228	}
229	} else {
230	m_doesProduce = null;
231	}
232	}
233
234	/**
235	* Returns an enumeration of any additional measure names that might be
236	* in the classifier
237	* @return an enumeration of the measure names
238	*/
239	public Enumeration enumerateMeasures() {
240	Vector newVector = new Vector();
241	if (m_clusterer instanceof AdditionalMeasureProducer) {
242	Enumeration en = ((AdditionalMeasureProducer)m_clusterer).
243	enumerateMeasures();
244	while (en.hasMoreElements()) {
245	String mname = (String)en.nextElement();
246	newVector.addElement(mname);
247	}
248	}
249	return newVector.elements();
250	}
251
252	/**
253	* Returns the value of the named measure
254	* @param additionalMeasureName the name of the measure to query for its value
255	* @return the value of the named measure
256	* @exception IllegalArgumentException if the named measure is not supported
257	*/
258	public double getMeasure(String additionalMeasureName) {
259	if (m_clusterer instanceof AdditionalMeasureProducer) {
260	return ((AdditionalMeasureProducer)m_clusterer).
261	getMeasure(additionalMeasureName);
262	} else {
263	throw new IllegalArgumentException("DensityBasedClustererSplitEvaluator: "
264	+"Can't return value for : "+additionalMeasureName
265	+". "+m_clusterer.getClass().getName()+" "
266	+"is not an AdditionalMeasureProducer");
267	}
268	}
269
270	/**
271	* Gets the data types of each of the key columns produced for a single run.
272	* The number of key fields must be constant
273	* for a given SplitEvaluator.
274	*
275	* @return an array containing objects of the type of each key column. The
276	* objects should be Strings, or Doubles.
277	*/
278	public Object [] getKeyTypes() {
279
280	Object [] keyTypes = new Object[KEY_SIZE];
281	keyTypes[0] = "";
282	keyTypes[1] = "";
283	keyTypes[2] = "";
284	return keyTypes;
285	}
286
287	/**
288	* Gets the names of each of the key columns produced for a single run.
289	* The number of key fields must be constant
290	* for a given SplitEvaluator.
291	*
292	* @return an array containing the name of each key column
293	*/
294	public String [] getKeyNames() {
295
296	String [] keyNames = new String[KEY_SIZE];
297	keyNames[0] = "Scheme";
298	keyNames[1] = "Scheme_options";
299	keyNames[2] = "Scheme_version_ID";
300	return keyNames;
301	}
302
303	/**
304	* Gets the key describing the current SplitEvaluator. For example
305	* This may contain the name of the classifier used for classifier
306	* predictive evaluation. The number of key fields must be constant
307	* for a given SplitEvaluator.
308	*
309	* @return an array of objects containing the key.
310	*/
311	public Object [] getKey(){
312
313	Object [] key = new Object[KEY_SIZE];
314	key[0] = m_clusterer.getClass().getName();
315	key[1] = m_clustererOptions;
316	key[2] = m_clustererVersion;
317	return key;
318	}
319
320	/**
321	* Gets the data types of each of the result columns produced for a
322	* single run. The number of result fields must be constant
323	* for a given SplitEvaluator.
324	*
325	* @return an array containing objects of the type of each result column.
326	* The objects should be Strings, or Doubles.
327	*/
328	public Object [] getResultTypes() {
329	int addm = (m_additionalMeasures != null)
330	? m_additionalMeasures.length
331	: 0;
332	int overall_length = RESULT_SIZE+addm;
333
334	Object [] resultTypes = new Object[overall_length];
335	Double doub = new Double(0);
336	int current = 0;
337
338	// number of training and testing instances
339	resultTypes[current++] = doub;
340	resultTypes[current++] = doub;
341
342	// log liklihood
343	resultTypes[current++] = doub;
344	// number of clusters
345	resultTypes[current++] = doub;
346
347	// timing stats
348	resultTypes[current++] = doub;
349	resultTypes[current++] = doub;
350
351
352	// resultTypes[current++] = "";
353
354	// add any additional measures
355	for (int i=0;i<addm;i++) {
356	resultTypes[current++] = doub;
357	}
358	if (current != overall_length) {
359	throw new Error("ResultTypes didn't fit RESULT_SIZE");
360	}
361	return resultTypes;
362	}
363
364	/**
365	* Gets the names of each of the result columns produced for a single run.
366	* The number of result fields must be constant
367	* for a given SplitEvaluator.
368	*
369	* @return an array containing the name of each result column
370	*/
371	public String [] getResultNames() {
372	int addm = (m_additionalMeasures != null)
373	? m_additionalMeasures.length
374	: 0;
375	int overall_length = RESULT_SIZE+addm;
376
377	String [] resultNames = new String[overall_length];
378	int current = 0;
379	resultNames[current++] = "Number_of_training_instances";
380	resultNames[current++] = "Number_of_testing_instances";
381
382	// Basic performance stats
383	resultNames[current++] = "Log_likelihood";
384	resultNames[current++] = "Number_of_clusters";
385
386	// Timing stats
387	resultNames[current++] = "Time_training";
388	resultNames[current++] = "Time_testing";
389
390	// Classifier defined extras
391	// resultNames[current++] = "Summary";
392	// add any additional measures
393	for (int i=0;i<addm;i++) {
394	resultNames[current++] = m_additionalMeasures[i];
395	}
396	if (current != overall_length) {
397	throw new Error("ResultNames didn't fit RESULT_SIZE");
398	}
399	return resultNames;
400	}
401
402	/**
403	* Gets the results for the supplied train and test datasets.
404	*
405	* @param train the training Instances.
406	* @param test the testing Instances.
407	* @return the results stored in an array. The objects stored in
408	* the array may be Strings, Doubles, or null (for the missing value).
409	* @exception Exception if a problem occurs while getting the results
410	*/
411	public Object [] getResult(Instances train, Instances test)
412	throws Exception {
413
414	if (m_clusterer == null) {
415	throw new Exception("No clusterer has been specified");
416	}
417	int addm = (m_additionalMeasures != null)
418	? m_additionalMeasures.length
419	: 0;
420	int overall_length = RESULT_SIZE+addm;
421
422	if (m_removeClassColumn && train.classIndex() != -1) {
423	// remove the class column from the training and testing data
424	Remove r = new Remove();
425	r.setAttributeIndicesArray(new int [] {train.classIndex()});
426	r.setInvertSelection(false);
427	r.setInputFormat(train);
428	train = Filter.useFilter(train, r);
429
430	test = Filter.useFilter(test, r);
431	}
432	train.setClassIndex(-1);
433	test.setClassIndex(-1);
434
435
436	ClusterEvaluation eval = new ClusterEvaluation();
437
438	Object [] result = new Object[overall_length];
439	long trainTimeStart = System.currentTimeMillis();
440	m_clusterer.buildClusterer(train);
441	double numClusters = m_clusterer.numberOfClusters();
442	eval.setClusterer(m_clusterer);
443	long trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
444	long testTimeStart = System.currentTimeMillis();
445	eval.evaluateClusterer(test);
446	long testTimeElapsed = System.currentTimeMillis() - testTimeStart;
447	// m_result = eval.toSummaryString();
448
449	// The results stored are all per instance -- can be multiplied by the
450	// number of instances to get absolute numbers
451	int current = 0;
452	result[current++] = new Double(train.numInstances());
453	result[current++] = new Double(test.numInstances());
454
455	result[current++] = new Double(eval.getLogLikelihood());
456	result[current++] = new Double(numClusters);
457
458	// Timing stats
459	result[current++] = new Double(trainTimeElapsed / 1000.0);
460	result[current++] = new Double(testTimeElapsed / 1000.0);
461
462	for (int i=0;i<addm;i++) {
463	if (m_doesProduce[i]) {
464	try {
465	double dv = ((AdditionalMeasureProducer)m_clusterer).
466	getMeasure(m_additionalMeasures[i]);
467	Double value = new Double(dv);
468
469	result[current++] = value;
470	} catch (Exception ex) {
471	System.err.println(ex);
472	}
473	} else {
474	result[current++] = null;
475	}
476	}
477
478	if (current != overall_length) {
479	throw new Error("Results didn't fit RESULT_SIZE");
480	}
481	return result;
482	}
483
484	/**
485	* Returns the tip text for this property
486	* @return tip text for this property suitable for
487	* displaying in the explorer/experimenter gui
488	*/
489	public String removeClassColumnTipText() {
490	return "Remove the class column (if set) from the data.";
491	}
492
493	/**
494	* Set whether the class column should be removed from the data.
495	*
496	* @param r true if the class column is to be removed.
497	*/
498	public void setRemoveClassColumn(boolean r) {
499	m_removeClassColumn = r;
500	}
501
502	/**
503	* Get whether the class column is to be removed.
504	*
505	* @return true if the class column is to be removed.
506	*/
507	public boolean getRemoveClassColumn() {
508	return m_removeClassColumn;
509	}
510
511	/**
512	* Returns the tip text for this property
513	* @return tip text for this property suitable for
514	* displaying in the explorer/experimenter gui
515	*/
516	public String clustererTipText() {
517	return "The density based clusterer to use.";
518	}
519
520	/**
521	* Get the value of clusterer
522	*
523	* @return Value of clusterer.
524	*/
525	public DensityBasedClusterer getClusterer() {
526
527	return m_clusterer;
528	}
529
530	/**
531	* Sets the clusterer.
532	*
533	* @param newClusterer the new clusterer to use.
534	*/
535	public void setClusterer(DensityBasedClusterer newClusterer) {
536
537	m_clusterer = newClusterer;
538	updateOptions();
539	}
540
541
542	protected void updateOptions() {
543
544	if (m_clusterer instanceof OptionHandler) {
545	m_clustererOptions = Utils.joinOptions(((OptionHandler)m_clusterer)
546	.getOptions());
547	} else {
548	m_clustererOptions = "";
549	}
550	if (m_clusterer instanceof Serializable) {
551	ObjectStreamClass obs = ObjectStreamClass.lookup(m_clusterer
552	.getClass());
553	m_clustererVersion = "" + obs.getSerialVersionUID();
554	} else {
555	m_clustererVersion = "";
556	}
557	}
558
559	/**
560	* Set the Clusterer to use, given it's class name. A new clusterer will be
561	* instantiated.
562	*
563	* @param newClustererName the clusterer class name.
564	* @exception Exception if the class name is invalid.
565	*/
566	public void setClustererName(String newClustererName) throws Exception {
567
568	try {
569	setClusterer((DensityBasedClusterer)Class.forName(newClustererName)
570	.newInstance());
571	} catch (Exception ex) {
572	throw new Exception("Can't find Clusterer with class name: "
573	+ newClustererName);
574	}
575	}
576
577	/**
578	* Gets the raw output from the classifier
579	* @return the raw output from the classifier
580	*/
581	public String getRawResultOutput() {
582	StringBuffer result = new StringBuffer();
583
584	if (m_clusterer == null) {
585	return "<null> clusterer";
586	}
587	result.append(toString());
588	result.append("Clustering model: \n"+m_clusterer.toString()+'\n');
589
590	// append the performance statistics
591	if (m_result != null) {
592	// result.append(m_result);
593
594	if (m_doesProduce != null) {
595	for (int i=0;i<m_doesProduce.length;i++) {
596	if (m_doesProduce[i]) {
597	try {
598	double dv = ((AdditionalMeasureProducer)m_clusterer).
599	getMeasure(m_additionalMeasures[i]);
600	Double value = new Double(dv);
601
602	result.append(m_additionalMeasures[i]+" : "+value+'\n');
603	} catch (Exception ex) {
604	System.err.println(ex);
605	}
606	}
607	}
608	}
609	}
610	return result.toString();
611	}
612
613	/**
614	* Returns a text description of the split evaluator.
615	*
616	* @return a text description of the split evaluator.
617	*/
618	public String toString() {
619
620	String result = "DensityBasedClustererSplitEvaluator: ";
621	if (m_clusterer == null) {
622	return result + "<null> clusterer";
623	}
624	return result + m_clusterer.getClass().getName() + " "
625	+ m_clustererOptions + "(version " + m_clustererVersion + ")";
626	}
627
628	/**
629	* Returns the revision string.
630	*
631	* @return the revision
632	*/
633	public String getRevision() {
634	return RevisionUtils.extract("$Revision: 5563 $");
635	}
636	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: