Context Navigation

source: src/main/java/weka/clusterers/MakeDensityBasedClusterer.java @ 18

Last change on this file since 18 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 17.3 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* MakeDensityBasedClusterer.java
19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.clusterers;
24
25	import weka.core.Capabilities;
26	import weka.core.Instance;
27	import weka.core.Instances;
28	import weka.core.Option;
29	import weka.core.OptionHandler;
30	import weka.core.RevisionUtils;
31	import weka.core.Utils;
32	import weka.core.WeightedInstancesHandler;
33	import weka.core.Capabilities.Capability;
34	import weka.estimators.DiscreteEstimator;
35	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
36
37	import java.util.Enumeration;
38	import java.util.Vector;
39
40	/**
41	<!-- globalinfo-start -->
42	* Class for wrapping a Clusterer to make it return a distribution and density. Fits normal distributions and discrete distributions within each cluster produced by the wrapped clusterer. Supports the NumberOfClustersRequestable interface only if the wrapped Clusterer does.
43	* <p/>
44	<!-- globalinfo-end -->
45	*
46	<!-- options-start -->
47	* Valid options are: <p/>
48	*
49	* <pre> -M <num>
50	* minimum allowable standard deviation for normal density computation
51	* (default 1e-6)</pre>
52	*
53	* <pre> -W <clusterer name>
54	* Clusterer to wrap.
55	* (default weka.clusterers.SimpleKMeans)</pre>
56	*
57	* <pre>
58	* Options specific to clusterer weka.clusterers.SimpleKMeans:
59	* </pre>
60	*
61	* <pre> -N <num>
62	* number of clusters.
63	* (default 2).</pre>
64	*
65	* <pre> -V
66	* Display std. deviations for centroids.
67	* </pre>
68	*
69	* <pre> -M
70	* Replace missing values with mean/mode.
71	* </pre>
72	*
73	* <pre> -S <num>
74	* Random number seed.
75	* (default 10)</pre>
76	*
77	<!-- options-end -->
78	*
79	* Options after "--" are passed on to the base clusterer.
80	*
81	* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
82	* @author Mark Hall (mhall@cs.waikato.ac.nz)
83	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
84	* @version $Revision: 5488 $
85	*/
86	public class MakeDensityBasedClusterer
87	extends AbstractDensityBasedClusterer
88	implements NumberOfClustersRequestable,
89	OptionHandler,
90	WeightedInstancesHandler {
91
92	/** for serialization */
93	static final long serialVersionUID = -5643302427972186631L;
94
95	/** holds training instances header information */
96	private Instances m_theInstances;
97	/** prior probabilities for the fitted clusters */
98	private double [] m_priors;
99	/** normal distributions fitted to each numeric attribute in each cluster */
100	private double [][][] m_modelNormal;
101	/** discrete distributions fitted to each discrete attribute in each cluster */
102	private DiscreteEstimator [][] m_model;
103	/** default minimum standard deviation */
104	private double m_minStdDev = 1e-6;
105	/** The clusterer being wrapped */
106	private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();
107	/** globally replace missing values */
108	private ReplaceMissingValues m_replaceMissing;
109
110	/**
111	* Default constructor.
112	*
113	*/
114	public MakeDensityBasedClusterer() {
115	super();
116	}
117
118	/**
119	* Contructs a MakeDensityBasedClusterer wrapping a given Clusterer.
120	*
121	* @param toWrap the clusterer to wrap around
122	*/
123	public MakeDensityBasedClusterer(Clusterer toWrap) {
124
125	setClusterer(toWrap);
126	}
127
128	/**
129	* Returns a string describing classifier
130	* @return a description suitable for
131	* displaying in the explorer/experimenter gui
132	*/
133	public String globalInfo() {
134	return
135	"Class for wrapping a Clusterer to make it return a distribution "
136	+ "and density. Fits normal distributions and discrete distributions "
137	+ "within each cluster produced by the wrapped clusterer. Supports the "
138	+ "NumberOfClustersRequestable interface only if the wrapped Clusterer "
139	+ "does.";
140	}
141
142	/**
143	* String describing default clusterer.
144	*
145	* @return the default clusterer classname
146	*/
147	protected String defaultClustererString() {
148	return SimpleKMeans.class.getName();
149	}
150
151	/**
152	* Set the number of clusters to generate.
153	*
154	* @param n the number of clusters to generate
155	* @throws Exception if the wrapped clusterer has not been set, or if
156	* the wrapped clusterer does not implement this facility.
157	*/
158	public void setNumClusters(int n) throws Exception {
159	if (m_wrappedClusterer == null) {
160	throw new Exception("Can't set the number of clusters to generate - "
161	+"no clusterer has been set yet.");
162	}
163	if (!(m_wrappedClusterer instanceof NumberOfClustersRequestable)) {
164	throw new Exception("Can't set the number of clusters to generate - "
165	+"wrapped clusterer does not support this facility.");
166	}
167
168	((NumberOfClustersRequestable)m_wrappedClusterer).setNumClusters(n);
169	}
170
171	/**
172	* Returns default capabilities of the clusterer (i.e., of the wrapper
173	* clusterer).
174	*
175	* @return the capabilities of this clusterer
176	*/
177	public Capabilities getCapabilities() {
178	if (m_wrappedClusterer != null) {
179	return m_wrappedClusterer.getCapabilities();
180	}
181	Capabilities result = super.getCapabilities();
182	result.disableAll();
183	result.enable(Capability.NO_CLASS);
184
185	return result;
186	}
187
188	/**
189	* Builds a clusterer for a set of instances.
190	*
191	* @param data the instances to train the clusterer with
192	* @throws Exception if the clusterer hasn't been set or something goes wrong
193	*/
194	public void buildClusterer(Instances data) throws Exception {
195	// can clusterer handle the data?
196	getCapabilities().testWithFail(data);
197
198	m_replaceMissing = new ReplaceMissingValues();
199	m_replaceMissing.setInputFormat(data);
200	data = weka.filters.Filter.useFilter(data, m_replaceMissing);
201
202	m_theInstances = new Instances(data, 0);
203	if (m_wrappedClusterer == null) {
204	throw new Exception("No clusterer has been set");
205	}
206	m_wrappedClusterer.buildClusterer(data);
207	m_model =
208	new DiscreteEstimator[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
209	m_modelNormal =
210	new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()][2];
211	double[][] weights = new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
212	m_priors = new double[m_wrappedClusterer.numberOfClusters()];
213	for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
214	m_priors[i] = 1.0; // laplace correction
215	for (int j = 0; j < data.numAttributes(); j++) {
216	if (data.attribute(j).isNominal()) {
217	m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(),
218	true);
219	}
220	}
221	}
222
223	Instance inst = null;
224
225	// Compute mean, etc.
226	int[] clusterIndex = new int[data.numInstances()];
227	for (int i = 0; i < data.numInstances(); i++) {
228	inst = data.instance(i);
229	int cluster = m_wrappedClusterer.clusterInstance(inst);
230	m_priors[cluster] += inst.weight();
231	for (int j = 0; j < data.numAttributes(); j++) {
232	if (!inst.isMissing(j)) {
233	if (data.attribute(j).isNominal()) {
234	m_model[cluster][j].addValue(inst.value(j),inst.weight());
235	} else {
236	m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j);
237	weights[cluster][j] += inst.weight();
238	}
239	}
240	}
241	clusterIndex[i] = cluster;
242	}
243
244	for (int j = 0; j < data.numAttributes(); j++) {
245	if (data.attribute(j).isNumeric()) {
246	for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
247	if (weights[i][j] > 0) {
248	m_modelNormal[i][j][0] /= weights[i][j];
249	}
250	}
251	}
252	}
253
254	// Compute standard deviations
255	for (int i = 0; i < data.numInstances(); i++) {
256	inst = data.instance(i);
257	for (int j = 0; j < data.numAttributes(); j++) {
258	if (!inst.isMissing(j)) {
259	if (data.attribute(j).isNumeric()) {
260	double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j);
261	m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff;
262	}
263	}
264	}
265	}
266
267	for (int j = 0; j < data.numAttributes(); j++) {
268	if (data.attribute(j).isNumeric()) {
269	for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
270	if (weights[i][j] > 0) {
271	m_modelNormal[i][j][1] =
272	Math.sqrt(m_modelNormal[i][j][1] / weights[i][j]);
273	} else if (weights[i][j] <= 0) {
274	m_modelNormal[i][j][1] = Double.MAX_VALUE;
275	}
276	if (m_modelNormal[i][j][1] <= m_minStdDev) {
277	m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;
278	if (m_modelNormal[i][j][1] <= m_minStdDev) {
279	m_modelNormal[i][j][1] = m_minStdDev;
280	}
281	}
282	}
283	}
284	}
285
286	Utils.normalize(m_priors);
287	}
288
289	/**
290	* Returns the cluster priors.
291	*
292	* @return the cluster priors
293	*/
294	public double[] clusterPriors() {
295
296	double[] n = new double[m_priors.length];
297
298	System.arraycopy(m_priors, 0, n, 0, n.length);
299	return n;
300	}
301
302	/**
303	* Computes the log of the conditional density (per cluster) for a given instance.
304	*
305	* @param inst the instance to compute the density for
306	* @return an array containing the estimated densities
307	* @throws Exception if the density could not be computed
308	* successfully
309	*/
310	public double[] logDensityPerClusterForInstance(Instance inst) throws Exception {
311
312	int i, j;
313	double logprob;
314	double[] wghts = new double[m_wrappedClusterer.numberOfClusters()];
315
316	m_replaceMissing.input(inst);
317	inst = m_replaceMissing.output();
318
319	for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
320	logprob = 0;
321	for (j = 0; j < inst.numAttributes(); j++) {
322	if (!inst.isMissing(j)) {
323	if (inst.attribute(j).isNominal()) {
324	logprob += Math.log(m_model[i][j].getProbability(inst.value(j)));
325	} else { // numeric attribute
326	logprob += logNormalDens(inst.value(j),
327	m_modelNormal[i][j][0],
328	m_modelNormal[i][j][1]);
329	}
330	}
331	}
332	wghts[i] = logprob;
333	}
334	return wghts;
335	}
336
337	/** Constant for normal distribution. */
338	private static double m_normConst = 0.5 * Math.log(2 * Math.PI);
339
340	/**
341	* Density function of normal distribution.
342	* @param x input value
343	* @param mean mean of distribution
344	* @param stdDev standard deviation of distribution
345	* @return the density
346	*/
347	private double logNormalDens (double x, double mean, double stdDev) {
348
349	double diff = x - mean;
350
351	return - (diff * diff / (2 * stdDev * stdDev)) - m_normConst - Math.log(stdDev);
352	}
353
354	/**
355	* Returns the number of clusters.
356	*
357	* @return the number of clusters generated for a training dataset.
358	* @throws Exception if number of clusters could not be returned successfully
359	*/
360	public int numberOfClusters() throws Exception {
361
362	return m_wrappedClusterer.numberOfClusters();
363	}
364
365	/**
366	* Returns a description of the clusterer.
367	*
368	* @return a string containing a description of the clusterer
369	*/
370	public String toString() {
371	if (m_priors == null) {
372	return "No clusterer built yet!";
373	}
374
375	StringBuffer text = new StringBuffer();
376	text.append("MakeDensityBasedClusterer: \n\nWrapped clusterer: "
377	+ m_wrappedClusterer.toString());
378
379	text.append("\nFitted estimators (with ML estimates of variance):\n");
380
381	for (int j = 0; j < m_priors.length; j++) {
382	text.append("\nCluster: " + j + " Prior probability: "
383	+ Utils.doubleToString(m_priors[j], 4) + "\n\n");
384
385	for (int i = 0; i < m_model[0].length; i++) {
386	text.append("Attribute: " + m_theInstances.attribute(i).name() + "\n");
387
388	if (m_theInstances.attribute(i).isNominal()) {
389	if (m_model[j][i] != null) {
390	text.append(m_model[j][i].toString());
391	}
392	}
393	else {
394	text.append("Normal Distribution. Mean = "
395	+ Utils.doubleToString(m_modelNormal[j][i][0], 4)
396	+ " StdDev = "
397	+ Utils.doubleToString(m_modelNormal[j][i][1], 4)
398	+ "\n");
399	}
400	}
401	}
402
403	return text.toString();
404	}
405
406	/**
407	* Returns the tip text for this property
408	* @return tip text for this property suitable for
409	* displaying in the explorer/experimenter gui
410	*/
411	public String clustererTipText() {
412	return "the clusterer to wrap";
413	}
414
415	/**
416	* Sets the clusterer to wrap.
417	*
418	* @param toWrap the clusterer
419	*/
420	public void setClusterer(Clusterer toWrap) {
421
422	m_wrappedClusterer = toWrap;
423	}
424
425	/**
426	* Gets the clusterer being wrapped.
427	*
428	* @return the clusterer
429	*/
430	public Clusterer getClusterer() {
431
432	return m_wrappedClusterer;
433	}
434
435	/**
436	* Returns the tip text for this property
437	* @return tip text for this property suitable for
438	* displaying in the explorer/experimenter gui
439	*/
440	public String minStdDevTipText() {
441	return "set minimum allowable standard deviation";
442	}
443
444	/**
445	* Set the minimum value for standard deviation when calculating
446	* normal density. Reducing this value can help prevent arithmetic
447	* overflow resulting from multiplying large densities (arising from small
448	* standard deviations) when there are many singleton or near singleton
449	* values.
450	* @param m minimum value for standard deviation
451	*/
452	public void setMinStdDev(double m) {
453	m_minStdDev = m;
454	}
455
456	/**
457	* Get the minimum allowable standard deviation.
458	* @return the minumum allowable standard deviation
459	*/
460	public double getMinStdDev() {
461	return m_minStdDev;
462	}
463
464	/**
465	* Returns an enumeration describing the available options..
466	*
467	* @return an enumeration of all the available options.
468	*/
469	public Enumeration listOptions() {
470	Vector result = new Vector();
471
472	result.addElement(new Option(
473	"\tminimum allowable standard deviation for normal density computation "
474	+"\n\t(default 1e-6)"
475	,"M",1,"-M <num>"));
476
477	result.addElement(new Option(
478	"\tClusterer to wrap.\n"
479	+ "\t(default " + defaultClustererString() + ")",
480	"W", 1,"-W <clusterer name>"));
481
482	if ((m_wrappedClusterer != null) &&
483	(m_wrappedClusterer instanceof OptionHandler)) {
484	result.addElement(new Option(
485	"",
486	"", 0, "\nOptions specific to clusterer "
487	+ m_wrappedClusterer.getClass().getName() + ":"));
488	Enumeration enu = ((OptionHandler)m_wrappedClusterer).listOptions();
489	while (enu.hasMoreElements()) {
490	result.addElement(enu.nextElement());
491	}
492	}
493
494	return result.elements();
495	}
496
497	/**
498	* Parses a given list of options. <p/>
499	*
500	<!-- options-start -->
501	* Valid options are: <p/>
502	*
503	* <pre> -M <num>
504	* minimum allowable standard deviation for normal density computation
505	* (default 1e-6)</pre>
506	*
507	* <pre> -W <clusterer name>
508	* Clusterer to wrap.
509	* (default weka.clusterers.SimpleKMeans)</pre>
510	*
511	* <pre>
512	* Options specific to clusterer weka.clusterers.SimpleKMeans:
513	* </pre>
514	*
515	* <pre> -N <num>
516	* number of clusters.
517	* (default 2).</pre>
518	*
519	* <pre> -V
520	* Display std. deviations for centroids.
521	* </pre>
522	*
523	* <pre> -M
524	* Replace missing values with mean/mode.
525	* </pre>
526	*
527	* <pre> -S <num>
528	* Random number seed.
529	* (default 10)</pre>
530	*
531	<!-- options-end -->
532	*
533	* @param options the list of options as an array of strings
534	* @throws Exception if an option is not supported
535	*/
536	public void setOptions(String[] options) throws Exception {
537
538	String optionString = Utils.getOption('M', options);
539	if (optionString.length() != 0)
540	setMinStdDev((new Double(optionString)).doubleValue());
541	else
542	setMinStdDev(1e-6);
543
544	String wString = Utils.getOption('W', options);
545	if (wString.length() == 0)
546	wString = defaultClustererString();
547	setClusterer(AbstractClusterer.forName(wString, Utils.partitionOptions(options)));
548	}
549
550	/**
551	* Gets the current settings of the clusterer.
552	*
553	* @return an array of strings suitable for passing to setOptions()
554	*/
555	public String[] getOptions() {
556
557	String [] clustererOptions = new String [0];
558	if ((m_wrappedClusterer != null) &&
559	(m_wrappedClusterer instanceof OptionHandler)) {
560	clustererOptions = ((OptionHandler)m_wrappedClusterer).getOptions();
561	}
562	String [] options = new String [clustererOptions.length + 5];
563	int current = 0;
564
565	options[current++] = "-M";
566	options[current++] = ""+getMinStdDev();
567
568	if (getClusterer() != null) {
569	options[current++] = "-W";
570	options[current++] = getClusterer().getClass().getName();
571	}
572	options[current++] = "--";
573
574	System.arraycopy(clustererOptions, 0, options, current,
575	clustererOptions.length);
576	current += clustererOptions.length;
577	while (current < options.length) {
578	options[current++] = "";
579	}
580	return options;
581	}
582
583	/**
584	* Returns the revision string.
585	*
586	* @return the revision
587	*/
588	public String getRevision() {
589	return RevisionUtils.extract("$Revision: 5488 $");
590	}
591
592	/**
593	* Main method for testing this class.
594	*
595	* @param argv the options
596	*/
597	public static void main(String [] argv) {
598	runClusterer(new MakeDensityBasedClusterer(), argv);
599	}
600	}
601

Note: See TracBrowser for help on using the repository browser.

Download in other formats: