Context Navigation

source: src/main/java/weka/clusterers/FarthestFirst.java @ 4

Last change on this file since 4 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 16.8 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* FarthestFirst.java
19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22	package weka.clusterers;
23
24	import weka.core.Attribute;
25	import weka.core.Capabilities;
26	import weka.core.Instance;
27	import weka.core.Instances;
28	import weka.core.Option;
29	import weka.core.RevisionUtils;
30	import weka.core.TechnicalInformation;
31	import weka.core.TechnicalInformationHandler;
32	import weka.core.Utils;
33	import weka.core.Capabilities.Capability;
34	import weka.core.TechnicalInformation.Field;
35	import weka.core.TechnicalInformation.Type;
36	import weka.filters.Filter;
37	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
38
39	import java.util.Enumeration;
40	import java.util.Random;
41	import java.util.Vector;
42
43	/**
44	<!-- globalinfo-start -->
45	* Cluster data using the FarthestFirst algorithm.<br/>
46	* <br/>
47	* For more information see:<br/>
48	* <br/>
49	* Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/>
50	* <br/>
51	* Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/>
52	* <br/>
53	* Notes:<br/>
54	* - works as a fast simple approximate clusterer<br/>
55	* - modelled after SimpleKMeans, might be a useful initializer for it
56	* <p/>
57	<!-- globalinfo-end -->
58	*
59	<!-- technical-bibtex-start -->
60	* BibTeX:
61	* <pre>
62	* @article{Hochbaum1985,
63	* author = {Hochbaum and Shmoys},
64	* journal = {Mathematics of Operations Research},
65	* number = {2},
66	* pages = {180-184},
67	* title = {A best possible heuristic for the k-center problem},
68	* volume = {10},
69	* year = {1985}
70	* }
71	*
72	* @inproceedings{Dasgupta2002,
73	* author = {Sanjoy Dasgupta},
74	* booktitle = {15th Annual Conference on Computational Learning Theory},
75	* pages = {351-363},
76	* publisher = {Springer},
77	* title = {Performance Guarantees for Hierarchical Clustering},
78	* year = {2002}
79	* }
80	* </pre>
81	* <p/>
82	<!-- technical-bibtex-end -->
83	*
84	<!-- options-start -->
85	* Valid options are: <p/>
86	*
87	* <pre> -N &lt;num&gt;
88	* number of clusters. (default = 2).</pre>
89	*
90	* <pre> -S &lt;num&gt;
91	* Random number seed.
92	* (default 1)</pre>
93	*
94	<!-- options-end -->
95	*
96	* @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
97	* @version $Revision: 5987 $
98	* @see RandomizableClusterer
99	*/
100	public class FarthestFirst
101	extends RandomizableClusterer
102	implements TechnicalInformationHandler {
103
// TODO: rewrite to be fully incremental
// TODO: cleanup, like deleting m_instances

/** Version identifier used for serialization. */
static final long serialVersionUID = 7499838100631329509L;

/**
 * Training instances after missing-value replacement. Once the centroids
 * have been chosen, buildClusterer() replaces this with an empty copy that
 * keeps only the header; distance() and difference() still read the
 * attribute information from it. Not strictly necessary to keep: it could
 * be replaced by m_ClusterCentroids where header info is needed.
 */
protected Instances m_instances;

/**
 * Filter that replaces missing values; it is created in buildClusterer()
 * and applied again to every instance passed to clusterInstance().
 */
protected ReplaceMissingValues m_ReplaceMissingFilter;

/**
 * Number of clusters to generate (default 2). buildClusterer() lowers it
 * to the number of training instances when it is larger than that.
 */
protected int m_NumClusters = 2;

/**
 * Holds the cluster centroids, which are instances picked from the
 * (missing-value-replaced) training data.
 */
protected Instances m_ClusterCentroids;

/**
 * Per-attribute minimum values seen in the training data; NaN until a
 * value has been recorded. Used by norm() to scale numeric values.
 */
private double [] m_Min;

/**
 * Per-attribute maximum values seen in the training data; NaN until a
 * value has been recorded. Used by norm() to scale numeric values.
 */
private double [] m_Max;
140
141	/**
142	* Returns a string describing this clusterer
143	* @return a description of the evaluator suitable for
144	* displaying in the explorer/experimenter gui
145	*/
146	public String globalInfo() {
147	return "Cluster data using the FarthestFirst algorithm.\n\n"
148	+ "For more information see:\n\n"
149	+ getTechnicalInformation().toString() + "\n\n"
150	+ "Notes:\n"
151	+ "- works as a fast simple approximate clusterer\n"
152	+ "- modelled after SimpleKMeans, might be a useful initializer for it";
153	}
154
155	/**
156	* Returns an instance of a TechnicalInformation object, containing
157	* detailed information about the technical background of this class,
158	* e.g., paper reference or book this class is based on.
159	*
160	* @return the technical information about this class
161	*/
162	public TechnicalInformation getTechnicalInformation() {
163	TechnicalInformation result;
164	TechnicalInformation additional;
165
166	result = new TechnicalInformation(Type.ARTICLE);
167	result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
168	result.setValue(Field.YEAR, "1985");
169	result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem");
170	result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
171	result.setValue(Field.VOLUME, "10");
172	result.setValue(Field.NUMBER, "2");
173	result.setValue(Field.PAGES, "180-184");
174
175	additional = result.add(Type.INPROCEEDINGS);
176	additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
177	additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering");
178	additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory");
179	additional.setValue(Field.YEAR, "2002");
180	additional.setValue(Field.PAGES, "351-363");
181	additional.setValue(Field.PUBLISHER, "Springer");
182
183	return result;
184	}
185
186	/**
187	* Returns default capabilities of the clusterer.
188	*
189	* @return the capabilities of this clusterer
190	*/
191	public Capabilities getCapabilities() {
192	Capabilities result = super.getCapabilities();
193	result.disableAll();
194	result.enable(Capability.NO_CLASS);
195
196	// attributes
197	result.enable(Capability.NOMINAL_ATTRIBUTES);
198	result.enable(Capability.NUMERIC_ATTRIBUTES);
199	result.enable(Capability.DATE_ATTRIBUTES);
200	result.enable(Capability.MISSING_VALUES);
201
202	return result;
203	}
204
205	/**
206	* Generates a clusterer. Has to initialize all fields of the clusterer
207	* that are not being set via options.
208	*
209	* @param data set of instances serving as training data
210	* @throws Exception if the clusterer has not been
211	* generated successfully
212	*/
213	public void buildClusterer(Instances data) throws Exception {
214
215	// can clusterer handle the data?
216	getCapabilities().testWithFail(data);
217
218	//long start = System.currentTimeMillis();
219
220	m_ReplaceMissingFilter = new ReplaceMissingValues();
221	m_ReplaceMissingFilter.setInputFormat(data);
222	m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);
223
224	initMinMax(m_instances);
225
226	m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
227
228	int n = m_instances.numInstances();
229	Random r = new Random(getSeed());
230	boolean[] selected = new boolean[n];
231	double[] minDistance = new double[n];
232
233	for(int i = 0; i<n; i++) minDistance[i] = Double.MAX_VALUE;
234
235	int firstI = r.nextInt(n);
236	m_ClusterCentroids.add(m_instances.instance(firstI));
237	selected[firstI] = true;
238
239	updateMinDistance(minDistance,selected,m_instances,m_instances.instance(firstI));
240
241	if (m_NumClusters > n) m_NumClusters = n;
242
243	for(int i = 1; i < m_NumClusters; i++) {
244	int nextI = farthestAway(minDistance, selected);
245	m_ClusterCentroids.add(m_instances.instance(nextI));
246	selected[nextI] = true;
247	updateMinDistance(minDistance,selected,m_instances,m_instances.instance(nextI));
248	}
249
250	m_instances = new Instances(m_instances,0);
251	//long end = System.currentTimeMillis();
252	//System.out.println("Clustering Time = " + (end-start));
253	}
254
255
256	protected void updateMinDistance(double[] minDistance, boolean[] selected,
257	Instances data, Instance center) {
258	for(int i = 0; i<selected.length; i++)
259	if (!selected[i]) {
260	double d = distance(center,data.instance(i));
261	if (d<minDistance[i])
262	minDistance[i] = d;
263	}
264	}
265
266	protected int farthestAway(double[] minDistance, boolean[] selected) {
267	double maxDistance = -1.0;
268	int maxI = -1;
269	for(int i = 0; i<selected.length; i++)
270	if (!selected[i])
271	if (maxDistance < minDistance[i]) {
272	maxDistance = minDistance[i];
273	maxI = i;
274	}
275	return maxI;
276	}
277
278	protected void initMinMax(Instances data) {
279	m_Min = new double [data.numAttributes()];
280	m_Max = new double [data.numAttributes()];
281	for (int i = 0; i < data.numAttributes(); i++) {
282	m_Min[i] = m_Max[i] = Double.NaN;
283	}
284
285	for (int i = 0; i < data.numInstances(); i++) {
286	updateMinMax(data.instance(i));
287	}
288	}
289
290
291	/**
292	* Updates the minimum and maximum values for all the attributes
293	* based on a new instance.
294	*
295	* @param instance the new instance
296	*/
297	private void updateMinMax(Instance instance) {
298
299	for (int j = 0;j < instance.numAttributes(); j++) {
300	if (Double.isNaN(m_Min[j])) {
301	m_Min[j] = instance.value(j);
302	m_Max[j] = instance.value(j);
303	} else {
304	if (instance.value(j) < m_Min[j]) {
305	m_Min[j] = instance.value(j);
306	} else {
307	if (instance.value(j) > m_Max[j]) {
308	m_Max[j] = instance.value(j);
309	}
310	}
311	}
312	}
313	}
314
315
316	/**
317	* clusters an instance that has been through the filters
318	*
319	* @param instance the instance to assign a cluster to
320	* @return a cluster number
321	*/
322	protected int clusterProcessedInstance(Instance instance) {
323	double minDist = Double.MAX_VALUE;
324	int bestCluster = 0;
325	for (int i = 0; i < m_NumClusters; i++) {
326	double dist = distance(instance, m_ClusterCentroids.instance(i));
327	if (dist < minDist) {
328	minDist = dist;
329	bestCluster = i;
330	}
331	}
332	return bestCluster;
333	}
334
335	/**
336	* Classifies a given instance.
337	*
338	* @param instance the instance to be assigned to a cluster
339	* @return the number of the assigned cluster as an integer
340	* if the class is enumerated, otherwise the predicted value
341	* @throws Exception if instance could not be classified
342	* successfully
343	*/
344	public int clusterInstance(Instance instance) throws Exception {
345	m_ReplaceMissingFilter.input(instance);
346	m_ReplaceMissingFilter.batchFinished();
347	Instance inst = m_ReplaceMissingFilter.output();
348
349	return clusterProcessedInstance(inst);
350	}
351
352	/**
353	* Calculates the distance between two instances
354	*
355	* @param first the first instance
356	* @param second the second instance
357	* @return the distance between the two given instances, between 0 and 1
358	*/
359	protected double distance(Instance first, Instance second) {
360
361	double distance = 0;
362	int firstI, secondI;
363
364	for (int p1 = 0, p2 = 0;
365	p1 < first.numValues() \|\| p2 < second.numValues();) {
366	if (p1 >= first.numValues()) {
367	firstI = m_instances.numAttributes();
368	} else {
369	firstI = first.index(p1);
370	}
371	if (p2 >= second.numValues()) {
372	secondI = m_instances.numAttributes();
373	} else {
374	secondI = second.index(p2);
375	}
376	if (firstI == m_instances.classIndex()) {
377	p1++; continue;
378	}
379	if (secondI == m_instances.classIndex()) {
380	p2++; continue;
381	}
382	double diff;
383	if (firstI == secondI) {
384	diff = difference(firstI,
385	first.valueSparse(p1),
386	second.valueSparse(p2));
387	p1++; p2++;
388	} else if (firstI > secondI) {
389	diff = difference(secondI,
390	0, second.valueSparse(p2));
391	p2++;
392	} else {
393	diff = difference(firstI,
394	first.valueSparse(p1), 0);
395	p1++;
396	}
397	distance += diff * diff;
398	}
399
400	return Math.sqrt(distance / m_instances.numAttributes());
401	}
402
403	/**
404	* Computes the difference between two given attribute
405	* values.
406	*/
407	protected double difference(int index, double val1, double val2) {
408
409	switch (m_instances.attribute(index).type()) {
410	case Attribute.NOMINAL:
411
412	// If attribute is nominal
413	if (Utils.isMissingValue(val1) \|\|
414	Utils.isMissingValue(val2) \|\|
415	((int)val1 != (int)val2)) {
416	return 1;
417	} else {
418	return 0;
419	}
420	case Attribute.NUMERIC:
421
422	// If attribute is numeric
423	if (Utils.isMissingValue(val1) \|\|
424	Utils.isMissingValue(val2)) {
425	if (Utils.isMissingValue(val1) &&
426	Utils.isMissingValue(val2)) {
427	return 1;
428	} else {
429	double diff;
430	if (Utils.isMissingValue(val2)) {
431	diff = norm(val1, index);
432	} else {
433	diff = norm(val2, index);
434	}
435	if (diff < 0.5) {
436	diff = 1.0 - diff;
437	}
438	return diff;
439	}
440	} else {
441	return norm(val1, index) - norm(val2, index);
442	}
443	default:
444	return 0;
445	}
446	}
447
448	/**
449	* Normalizes a given value of a numeric attribute.
450	*
451	* @param x the value to be normalized
452	* @param i the attribute's index
453	* @return the normalized value
454	*/
455	protected double norm(double x, int i) {
456
457	if (Double.isNaN(m_Min[i]) \|\| Utils.eq(m_Max[i],m_Min[i])) {
458	return 0;
459	} else {
460	return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
461	}
462	}
463
464	/**
465	* Returns the number of clusters.
466	*
467	* @return the number of clusters generated for a training dataset.
468	* @throws Exception if number of clusters could not be returned
469	* successfully
470	*/
471	public int numberOfClusters() throws Exception {
472	return m_NumClusters;
473	}
474
475	/**
476	* Returns an enumeration describing the available options.
477	*
478	* @return an enumeration of all the available options.
479	*/
480	public Enumeration listOptions () {
481	Vector result = new Vector();
482
483	result.addElement(new Option(
484	"\tnumber of clusters. (default = 2).",
485	"N", 1, "-N <num>"));
486
487	Enumeration en = super.listOptions();
488	while (en.hasMoreElements())
489	result.addElement(en.nextElement());
490
491	return result.elements();
492	}
493
494	/**
495	* Returns the tip text for this property
496	* @return tip text for this property suitable for
497	* displaying in the explorer/experimenter gui
498	*/
499	public String numClustersTipText() {
500	return "set number of clusters";
501	}
502
503	/**
504	* set the number of clusters to generate
505	*
506	* @param n the number of clusters to generate
507	* @throws Exception if number of clusters is negative
508	*/
509	public void setNumClusters(int n) throws Exception {
510	if (n < 0) {
511	throw new Exception("Number of clusters must be > 0");
512	}
513	m_NumClusters = n;
514	}
515
516	/**
517	* gets the number of clusters to generate
518	*
519	* @return the number of clusters to generate
520	*/
521	public int getNumClusters() {
522	return m_NumClusters;
523	}
524
525	/**
526	* Parses a given list of options. <p/>
527	*
528	<!-- options-start -->
529	* Valid options are: <p/>
530	*
531	* <pre> -N <num>
532	* number of clusters. (default = 2).</pre>
533	*
534	* <pre> -S <num>
535	* Random number seed.
536	* (default 1)</pre>
537	*
538	<!-- options-end -->
539	*
540	* @param options the list of options as an array of strings
541	* @throws Exception if an option is not supported
542	*/
543	public void setOptions (String[] options)
544	throws Exception {
545
546	String optionString = Utils.getOption('N', options);
547
548	if (optionString.length() != 0) {
549	setNumClusters(Integer.parseInt(optionString));
550	}
551
552	super.setOptions(options);
553	}
554
555	/**
556	* Gets the current settings of FarthestFirst
557	*
558	* @return an array of strings suitable for passing to setOptions()
559	*/
560	public String[] getOptions () {
561	int i;
562	Vector result;
563	String[] options;
564
565	result = new Vector();
566
567	result.add("-N");
568	result.add("" + getNumClusters());
569
570	options = super.getOptions();
571	for (i = 0; i < options.length; i++)
572	result.add(options[i]);
573
574	return (String[]) result.toArray(new String[result.size()]);
575	}
576
577	/**
578	* return a string describing this clusterer
579	*
580	* @return a description of the clusterer as a string
581	*/
582	public String toString() {
583	StringBuffer temp = new StringBuffer();
584
585	temp.append("\n FarthestFirst\n==============\n");
586
587	temp.append("\nCluster centroids:\n");
588	for (int i = 0; i < m_NumClusters; i++) {
589	temp.append("\nCluster "+i+"\n\t");
590	for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
591	if (m_ClusterCentroids.attribute(j).isNominal()) {
592	temp.append(" "+m_ClusterCentroids.attribute(j).
593	value((int)m_ClusterCentroids.instance(i).value(j)));
594	} else {
595	temp.append(" "+m_ClusterCentroids.instance(i).value(j));
596	}
597	}
598	}
599	temp.append("\n\n");
600	return temp.toString();
601	}
602
603	/**
604	* Returns the revision string.
605	*
606	* @return the revision
607	*/
608	public String getRevision() {
609	return RevisionUtils.extract("$Revision: 5987 $");
610	}
611
612	/**
613	* Main method for testing this class.
614	*
615	* @param argv should contain the following arguments: <p>
616	* -t training file [-N number of clusters]
617	*/
618	public static void main (String[] argv) {
619	runClusterer(new FarthestFirst(), argv);
620	}
621	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: