Context Navigation

source: src/main/java/weka/experiment/DensityBasedClustererSplitEvaluator.java @ 17

Last change on this file since 17 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 18.8 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* DensityBasedClustererSplitEvaluator.java
	19	* Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22
	23
	24	package weka.experiment;
	25
	26	import weka.clusterers.ClusterEvaluation;
	27	import weka.clusterers.Clusterer;
	28	import weka.clusterers.AbstractClusterer;
	29	import weka.clusterers.AbstractDensityBasedClusterer;
	30	import weka.clusterers.DensityBasedClusterer;
	31	import weka.clusterers.EM;
	32	import weka.core.AdditionalMeasureProducer;
	33	import weka.core.Instances;
	34	import weka.core.Option;
	35	import weka.core.OptionHandler;
	36	import weka.core.RevisionHandler;
	37	import weka.core.RevisionUtils;
	38	import weka.core.Utils;
	39	import weka.filters.Filter;
	40	import weka.filters.unsupervised.attribute.Remove;
	41
	42	import java.io.ObjectStreamClass;
	43	import java.io.Serializable;
	44	import java.util.Enumeration;
	45	import java.util.Vector;
	46
	47	/**
	48	* A SplitEvaluator that produces results for a density based clusterer.
	49	*
	50	* -W classname <br>
	51	* Specify the full class name of the clusterer to evaluate. <p>
	52	*
	53	* @author Mark Hall (mhall{[at]}pentaho{[dot]}org
	54	* @version $Revision: 5563 $
	55	*/
	56
	57	public class DensityBasedClustererSplitEvaluator
	58	implements SplitEvaluator,
	59	OptionHandler,
	60	AdditionalMeasureProducer,
	61	RevisionHandler {
	62
	63	/** Remove the class column (if set) from the data */
	64	protected boolean m_removeClassColumn = true;
	65
	66	/** The clusterer used for evaluation */
	67	protected DensityBasedClusterer m_clusterer = new EM();
	68
	69	/** The names of any additional measures to look for in SplitEvaluators */
	70	protected String [] m_additionalMeasures = null;
	71
	72	/** Array of booleans corresponding to the measures in m_AdditionalMeasures
	73	indicating which of the AdditionalMeasures the current clusterer
	74	can produce */
	75	protected boolean [] m_doesProduce = null;
	76
	77	/** The number of additional measures that need to be filled in
	78	after taking into account column constraints imposed by the final
	79	destination for results */
	80	protected int m_numberAdditionalMeasures = 0;
	81
	82	/** Holds the statistics for the most recent application of the clusterer */
	83	protected String m_result = null;
	84
	85	/** The clusterer options (if any) */
	86	protected String m_clustererOptions = "";
	87
	88	/** The clusterer version */
	89	protected String m_clustererVersion = "";
	90
	91	/** The length of a key */
	92	private static final int KEY_SIZE = 3;
	93
	94	/** The length of a result */
	95	private static final int RESULT_SIZE = 6;
	96
	97
	98	public DensityBasedClustererSplitEvaluator() {
	99	updateOptions();
	100	}
	101
	102	/**
	103	* Returns a string describing this split evaluator
	104	* @return a description of the split evaluator suitable for
	105	* displaying in the explorer/experimenter gui
	106	*/
	107	public String globalInfo() {
	108	return " A SplitEvaluator that produces results for a density based clusterer. ";
	109	}
	110
	111	/**
	112	* Returns an enumeration describing the available options.
	113	*
	114	* @return an enumeration of all the available options.
	115	*/
	116	public Enumeration listOptions() {
	117
	118	Vector newVector = new Vector(1);
	119
	120	newVector.addElement(new Option(
	121	"\tThe full class name of the density based clusterer.\n"
	122	+"\teg: weka.clusterers.EM",
	123	"W", 1,
	124	"-W <class name>"));
	125
	126	if ((m_clusterer != null) &&
	127	(m_clusterer instanceof OptionHandler)) {
	128	newVector.addElement(new Option(
	129	"",
	130	"", 0, "\nOptions specific to clusterer "
	131	+ m_clusterer.getClass().getName() + ":"));
	132	Enumeration enu = ((OptionHandler)m_clusterer).listOptions();
	133	while (enu.hasMoreElements()) {
	134	newVector.addElement(enu.nextElement());
	135	}
	136	}
	137	return newVector.elements();
	138	}
	139
	140	/**
	141	* Parses a given list of options. Valid options are:<p>
	142	*
	143	* -W classname <br>
	144	* Specify the full class name of the clusterer to evaluate. <p>
	145	*
	146	* All option after -- will be passed to the classifier.
	147	*
	148	* @param options the list of options as an array of strings
	149	* @exception Exception if an option is not supported
	150	*/
	151	public void setOptions(String[] options) throws Exception {
	152
	153	String cName = Utils.getOption('W', options);
	154	if (cName.length() == 0) {
	155	throw new Exception("A clusterer must be specified with"
	156	+ " the -W option.");
	157	}
	158	// Do it first without options, so if an exception is thrown during
	159	// the option setting, listOptions will contain options for the actual
	160	// Classifier.
	161	setClusterer((DensityBasedClusterer)AbstractClusterer.forName(cName, null));
	162	if (getClusterer() instanceof OptionHandler) {
	163	((OptionHandler) getClusterer())
	164	.setOptions(Utils.partitionOptions(options));
	165	updateOptions();
	166	}
	167	}
	168
	169	/**
	170	* Gets the current settings of the Classifier.
	171	*
	172	* @return an array of strings suitable for passing to setOptions
	173	*/
	174	public String [] getOptions() {
	175
	176	String [] clustererOptions = new String [0];
	177	if ((m_clusterer != null) &&
	178	(m_clusterer instanceof OptionHandler)) {
	179	clustererOptions = ((OptionHandler)m_clusterer).getOptions();
	180	}
	181
	182	String [] options = new String [clustererOptions.length + 3];
	183	int current = 0;
	184
	185	if (getClusterer() != null) {
	186	options[current++] = "-W";
	187	options[current++] = getClusterer().getClass().getName();
	188	}
	189
	190	options[current++] = "--";
	191
	192	System.arraycopy(clustererOptions, 0, options, current,
	193	clustererOptions.length);
	194	current += clustererOptions.length;
	195	while (current < options.length) {
	196	options[current++] = "";
	197	}
	198	return options;
	199	}
	200
	201	/**
	202	* Set a list of method names for additional measures to look for
	203	* in Classifiers. This could contain many measures (of which only a
	204	* subset may be produceable by the current Classifier) if an experiment
	205	* is the type that iterates over a set of properties.
	206	* @param additionalMeasures a list of method names
	207	*/
	208	public void setAdditionalMeasures(String [] additionalMeasures) {
	209	// System.err.println("ClassifierSplitEvaluator: setting additional measures");
	210	m_additionalMeasures = additionalMeasures;
	211
	212	// determine which (if any) of the additional measures this clusterer
	213	// can produce
	214	if (m_additionalMeasures != null && m_additionalMeasures.length > 0) {
	215	m_doesProduce = new boolean [m_additionalMeasures.length];
	216
	217	if (m_clusterer instanceof AdditionalMeasureProducer) {
	218	Enumeration en = ((AdditionalMeasureProducer)m_clusterer).
	219	enumerateMeasures();
	220	while (en.hasMoreElements()) {
	221	String mname = (String)en.nextElement();
	222	for (int j=0;j<m_additionalMeasures.length;j++) {
	223	if (mname.compareToIgnoreCase(m_additionalMeasures[j]) == 0) {
	224	m_doesProduce[j] = true;
	225	}
	226	}
	227	}
	228	}
	229	} else {
	230	m_doesProduce = null;
	231	}
	232	}
	233
	234	/**
	235	* Returns an enumeration of any additional measure names that might be
	236	* in the classifier
	237	* @return an enumeration of the measure names
	238	*/
	239	public Enumeration enumerateMeasures() {
	240	Vector newVector = new Vector();
	241	if (m_clusterer instanceof AdditionalMeasureProducer) {
	242	Enumeration en = ((AdditionalMeasureProducer)m_clusterer).
	243	enumerateMeasures();
	244	while (en.hasMoreElements()) {
	245	String mname = (String)en.nextElement();
	246	newVector.addElement(mname);
	247	}
	248	}
	249	return newVector.elements();
	250	}
	251
	252	/**
	253	* Returns the value of the named measure
	254	* @param additionalMeasureName the name of the measure to query for its value
	255	* @return the value of the named measure
	256	* @exception IllegalArgumentException if the named measure is not supported
	257	*/
	258	public double getMeasure(String additionalMeasureName) {
	259	if (m_clusterer instanceof AdditionalMeasureProducer) {
	260	return ((AdditionalMeasureProducer)m_clusterer).
	261	getMeasure(additionalMeasureName);
	262	} else {
	263	throw new IllegalArgumentException("DensityBasedClustererSplitEvaluator: "
	264	+"Can't return value for : "+additionalMeasureName
	265	+". "+m_clusterer.getClass().getName()+" "
	266	+"is not an AdditionalMeasureProducer");
	267	}
	268	}
	269
	270	/**
	271	* Gets the data types of each of the key columns produced for a single run.
	272	* The number of key fields must be constant
	273	* for a given SplitEvaluator.
	274	*
	275	* @return an array containing objects of the type of each key column. The
	276	* objects should be Strings, or Doubles.
	277	*/
	278	public Object [] getKeyTypes() {
	279
	280	Object [] keyTypes = new Object[KEY_SIZE];
	281	keyTypes[0] = "";
	282	keyTypes[1] = "";
	283	keyTypes[2] = "";
	284	return keyTypes;
	285	}
	286
	287	/**
	288	* Gets the names of each of the key columns produced for a single run.
	289	* The number of key fields must be constant
	290	* for a given SplitEvaluator.
	291	*
	292	* @return an array containing the name of each key column
	293	*/
	294	public String [] getKeyNames() {
	295
	296	String [] keyNames = new String[KEY_SIZE];
	297	keyNames[0] = "Scheme";
	298	keyNames[1] = "Scheme_options";
	299	keyNames[2] = "Scheme_version_ID";
	300	return keyNames;
	301	}
	302
	303	/**
	304	* Gets the key describing the current SplitEvaluator. For example
	305	* This may contain the name of the classifier used for classifier
	306	* predictive evaluation. The number of key fields must be constant
	307	* for a given SplitEvaluator.
	308	*
	309	* @return an array of objects containing the key.
	310	*/
	311	public Object [] getKey(){
	312
	313	Object [] key = new Object[KEY_SIZE];
	314	key[0] = m_clusterer.getClass().getName();
	315	key[1] = m_clustererOptions;
	316	key[2] = m_clustererVersion;
	317	return key;
	318	}
	319
	320	/**
	321	* Gets the data types of each of the result columns produced for a
	322	* single run. The number of result fields must be constant
	323	* for a given SplitEvaluator.
	324	*
	325	* @return an array containing objects of the type of each result column.
	326	* The objects should be Strings, or Doubles.
	327	*/
	328	public Object [] getResultTypes() {
	329	int addm = (m_additionalMeasures != null)
	330	? m_additionalMeasures.length
	331	: 0;
	332	int overall_length = RESULT_SIZE+addm;
	333
	334	Object [] resultTypes = new Object[overall_length];
	335	Double doub = new Double(0);
	336	int current = 0;
	337
	338	// number of training and testing instances
	339	resultTypes[current++] = doub;
	340	resultTypes[current++] = doub;
	341
	342	// log liklihood
	343	resultTypes[current++] = doub;
	344	// number of clusters
	345	resultTypes[current++] = doub;
	346
	347	// timing stats
	348	resultTypes[current++] = doub;
	349	resultTypes[current++] = doub;
	350
	351
	352	// resultTypes[current++] = "";
	353
	354	// add any additional measures
	355	for (int i=0;i<addm;i++) {
	356	resultTypes[current++] = doub;
	357	}
	358	if (current != overall_length) {
	359	throw new Error("ResultTypes didn't fit RESULT_SIZE");
	360	}
	361	return resultTypes;
	362	}
	363
	364	/**
	365	* Gets the names of each of the result columns produced for a single run.
	366	* The number of result fields must be constant
	367	* for a given SplitEvaluator.
	368	*
	369	* @return an array containing the name of each result column
	370	*/
	371	public String [] getResultNames() {
	372	int addm = (m_additionalMeasures != null)
	373	? m_additionalMeasures.length
	374	: 0;
	375	int overall_length = RESULT_SIZE+addm;
	376
	377	String [] resultNames = new String[overall_length];
	378	int current = 0;
	379	resultNames[current++] = "Number_of_training_instances";
	380	resultNames[current++] = "Number_of_testing_instances";
	381
	382	// Basic performance stats
	383	resultNames[current++] = "Log_likelihood";
	384	resultNames[current++] = "Number_of_clusters";
	385
	386	// Timing stats
	387	resultNames[current++] = "Time_training";
	388	resultNames[current++] = "Time_testing";
	389
	390	// Classifier defined extras
	391	// resultNames[current++] = "Summary";
	392	// add any additional measures
	393	for (int i=0;i<addm;i++) {
	394	resultNames[current++] = m_additionalMeasures[i];
	395	}
	396	if (current != overall_length) {
	397	throw new Error("ResultNames didn't fit RESULT_SIZE");
	398	}
	399	return resultNames;
	400	}
	401
	402	/**
	403	* Gets the results for the supplied train and test datasets.
	404	*
	405	* @param train the training Instances.
	406	* @param test the testing Instances.
	407	* @return the results stored in an array. The objects stored in
	408	* the array may be Strings, Doubles, or null (for the missing value).
	409	* @exception Exception if a problem occurs while getting the results
	410	*/
	411	public Object [] getResult(Instances train, Instances test)
	412	throws Exception {
	413
	414	if (m_clusterer == null) {
	415	throw new Exception("No clusterer has been specified");
	416	}
	417	int addm = (m_additionalMeasures != null)
	418	? m_additionalMeasures.length
	419	: 0;
	420	int overall_length = RESULT_SIZE+addm;
	421
	422	if (m_removeClassColumn && train.classIndex() != -1) {
	423	// remove the class column from the training and testing data
	424	Remove r = new Remove();
	425	r.setAttributeIndicesArray(new int [] {train.classIndex()});
	426	r.setInvertSelection(false);
	427	r.setInputFormat(train);
	428	train = Filter.useFilter(train, r);
	429
	430	test = Filter.useFilter(test, r);
	431	}
	432	train.setClassIndex(-1);
	433	test.setClassIndex(-1);
	434
	435
	436	ClusterEvaluation eval = new ClusterEvaluation();
	437
	438	Object [] result = new Object[overall_length];
	439	long trainTimeStart = System.currentTimeMillis();
	440	m_clusterer.buildClusterer(train);
	441	double numClusters = m_clusterer.numberOfClusters();
	442	eval.setClusterer(m_clusterer);
	443	long trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
	444	long testTimeStart = System.currentTimeMillis();
	445	eval.evaluateClusterer(test);
	446	long testTimeElapsed = System.currentTimeMillis() - testTimeStart;
	447	// m_result = eval.toSummaryString();
	448
	449	// The results stored are all per instance -- can be multiplied by the
	450	// number of instances to get absolute numbers
	451	int current = 0;
	452	result[current++] = new Double(train.numInstances());
	453	result[current++] = new Double(test.numInstances());
	454
	455	result[current++] = new Double(eval.getLogLikelihood());
	456	result[current++] = new Double(numClusters);
	457
	458	// Timing stats
	459	result[current++] = new Double(trainTimeElapsed / 1000.0);
	460	result[current++] = new Double(testTimeElapsed / 1000.0);
	461
	462	for (int i=0;i<addm;i++) {
	463	if (m_doesProduce[i]) {
	464	try {
	465	double dv = ((AdditionalMeasureProducer)m_clusterer).
	466	getMeasure(m_additionalMeasures[i]);
	467	Double value = new Double(dv);
	468
	469	result[current++] = value;
	470	} catch (Exception ex) {
	471	System.err.println(ex);
	472	}
	473	} else {
	474	result[current++] = null;
	475	}
	476	}
	477
	478	if (current != overall_length) {
	479	throw new Error("Results didn't fit RESULT_SIZE");
	480	}
	481	return result;
	482	}
	483
	484	/**
	485	* Returns the tip text for this property
	486	* @return tip text for this property suitable for
	487	* displaying in the explorer/experimenter gui
	488	*/
	489	public String removeClassColumnTipText() {
	490	return "Remove the class column (if set) from the data.";
	491	}
	492
	493	/**
	494	* Set whether the class column should be removed from the data.
	495	*
	496	* @param r true if the class column is to be removed.
	497	*/
	498	public void setRemoveClassColumn(boolean r) {
	499	m_removeClassColumn = r;
	500	}
	501
	502	/**
	503	* Get whether the class column is to be removed.
	504	*
	505	* @return true if the class column is to be removed.
	506	*/
	507	public boolean getRemoveClassColumn() {
	508	return m_removeClassColumn;
	509	}
	510
	511	/**
	512	* Returns the tip text for this property
	513	* @return tip text for this property suitable for
	514	* displaying in the explorer/experimenter gui
	515	*/
	516	public String clustererTipText() {
	517	return "The density based clusterer to use.";
	518	}
	519
	520	/**
	521	* Get the value of clusterer
	522	*
	523	* @return Value of clusterer.
	524	*/
	525	public DensityBasedClusterer getClusterer() {
	526
	527	return m_clusterer;
	528	}
	529
	530	/**
	531	* Sets the clusterer.
	532	*
	533	* @param newClusterer the new clusterer to use.
	534	*/
	535	public void setClusterer(DensityBasedClusterer newClusterer) {
	536
	537	m_clusterer = newClusterer;
	538	updateOptions();
	539	}
	540
	541
	542	protected void updateOptions() {
	543
	544	if (m_clusterer instanceof OptionHandler) {
	545	m_clustererOptions = Utils.joinOptions(((OptionHandler)m_clusterer)
	546	.getOptions());
	547	} else {
	548	m_clustererOptions = "";
	549	}
	550	if (m_clusterer instanceof Serializable) {
	551	ObjectStreamClass obs = ObjectStreamClass.lookup(m_clusterer
	552	.getClass());
	553	m_clustererVersion = "" + obs.getSerialVersionUID();
	554	} else {
	555	m_clustererVersion = "";
	556	}
	557	}
	558
	559	/**
	560	* Set the Clusterer to use, given it's class name. A new clusterer will be
	561	* instantiated.
	562	*
	563	* @param newClustererName the clusterer class name.
	564	* @exception Exception if the class name is invalid.
	565	*/
	566	public void setClustererName(String newClustererName) throws Exception {
	567
	568	try {
	569	setClusterer((DensityBasedClusterer)Class.forName(newClustererName)
	570	.newInstance());
	571	} catch (Exception ex) {
	572	throw new Exception("Can't find Clusterer with class name: "
	573	+ newClustererName);
	574	}
	575	}
	576
	577	/**
	578	* Gets the raw output from the classifier
	579	* @return the raw output from the classifier
	580	*/
	581	public String getRawResultOutput() {
	582	StringBuffer result = new StringBuffer();
	583
	584	if (m_clusterer == null) {
	585	return "<null> clusterer";
	586	}
	587	result.append(toString());
	588	result.append("Clustering model: \n"+m_clusterer.toString()+'\n');
	589
	590	// append the performance statistics
	591	if (m_result != null) {
	592	// result.append(m_result);
	593
	594	if (m_doesProduce != null) {
	595	for (int i=0;i<m_doesProduce.length;i++) {
	596	if (m_doesProduce[i]) {
	597	try {
	598	double dv = ((AdditionalMeasureProducer)m_clusterer).
	599	getMeasure(m_additionalMeasures[i]);
	600	Double value = new Double(dv);
	601
	602	result.append(m_additionalMeasures[i]+" : "+value+'\n');
	603	} catch (Exception ex) {
	604	System.err.println(ex);
	605	}
	606	}
	607	}
	608	}
	609	}
	610	return result.toString();
	611	}
	612
	613	/**
	614	* Returns a text description of the split evaluator.
	615	*
	616	* @return a text description of the split evaluator.
	617	*/
	618	public String toString() {
	619
	620	String result = "DensityBasedClustererSplitEvaluator: ";
	621	if (m_clusterer == null) {
	622	return result + "<null> clusterer";
	623	}
	624	return result + m_clusterer.getClass().getName() + " "
	625	+ m_clustererOptions + "(version " + m_clustererVersion + ")";
	626	}
	627
	628	/**
	629	* Returns the revision string.
	630	*
	631	* @return the revision
	632	*/
	633	public String getRevision() {
	634	return RevisionUtils.extract("$Revision: 5563 $");
	635	}
	636	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: