Context Navigation

source: src/main/java/weka/clusterers/FarthestFirst.java @ 4

Last change on this file since 4 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 16.8 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* FarthestFirst.java
	19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22	package weka.clusterers;
	23
	24	import weka.core.Attribute;
	25	import weka.core.Capabilities;
	26	import weka.core.Instance;
	27	import weka.core.Instances;
	28	import weka.core.Option;
	29	import weka.core.RevisionUtils;
	30	import weka.core.TechnicalInformation;
	31	import weka.core.TechnicalInformationHandler;
	32	import weka.core.Utils;
	33	import weka.core.Capabilities.Capability;
	34	import weka.core.TechnicalInformation.Field;
	35	import weka.core.TechnicalInformation.Type;
	36	import weka.filters.Filter;
	37	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
	38
	39	import java.util.Enumeration;
	40	import java.util.Random;
	41	import java.util.Vector;
	42
	43	/**
	44	<!-- globalinfo-start -->
	45	* Cluster data using the FarthestFirst algorithm.<br/>
	46	* <br/>
	47	* For more information see:<br/>
	48	* <br/>
	49	* Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/>
	50	* <br/>
	51	* Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/>
	52	* <br/>
	53	* Notes:<br/>
	54	* - works as a fast simple approximate clusterer<br/>
	55	* - modelled after SimpleKMeans, might be a useful initializer for it
	56	* <p/>
	57	<!-- globalinfo-end -->
	58	*
	59	<!-- technical-bibtex-start -->
	60	* BibTeX:
	61	* <pre>
	62	* @article{Hochbaum1985,
	63	* author = {Hochbaum and Shmoys},
	64	* journal = {Mathematics of Operations Research},
	65	* number = {2},
	66	* pages = {180-184},
	67	* title = {A best possible heuristic for the k-center problem},
	68	* volume = {10},
	69	* year = {1985}
	70	* }
	71	*
	72	* @inproceedings{Dasgupta2002,
	73	* author = {Sanjoy Dasgupta},
	74	* booktitle = {15th Annual Conference on Computational Learning Theory},
	75	* pages = {351-363},
	76	* publisher = {Springer},
	77	* title = {Performance Guarantees for Hierarchical Clustering},
	78	* year = {2002}
	79	* }
	80	* </pre>
	81	* <p/>
	82	<!-- technical-bibtex-end -->
	83	*
	84	<!-- options-start -->
	85	* Valid options are: <p/>
	86	*
	87	* <pre> -N <num>
	88	* number of clusters. (default = 2).</pre>
	89	*
	90	* <pre> -S <num>
	91	* Random number seed.
	92	* (default 1)</pre>
	93	*
	94	<!-- options-end -->
	95	*
	96	* @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
	97	* @version $Revision: 5987 $
	98	* @see RandomizableClusterer
	99	*/
	100	public class FarthestFirst
	101	extends RandomizableClusterer
	102	implements TechnicalInformationHandler {
	103
	104	//Todo: rewrite to be fully incremental
	105	// cleanup, like deleting m_instances
	106
	107	/** for serialization */
	108	static final long serialVersionUID = 7499838100631329509L;
	109
	110	/**
	111	* training instances, not necessary to keep,
	112	* could be replaced by m_ClusterCentroids where needed for header info
	113	*/
	114	protected Instances m_instances;
	115
	116	/**
	117	* replace missing values in training instances
	118	*/
	119	protected ReplaceMissingValues m_ReplaceMissingFilter;
	120
	121	/**
	122	* number of clusters to generate
	123	*/
	124	protected int m_NumClusters = 2;
	125
	126	/**
	127	* holds the cluster centroids
	128	*/
	129	protected Instances m_ClusterCentroids;
	130
	131	/**
	132	* attribute min values
	133	*/
	134	private double [] m_Min;
	135
	136	/**
	137	* attribute max values
	138	*/
	139	private double [] m_Max;
	140
	141	/**
	142	* Returns a string describing this clusterer
	143	* @return a description of the evaluator suitable for
	144	* displaying in the explorer/experimenter gui
	145	*/
	146	public String globalInfo() {
	147	return "Cluster data using the FarthestFirst algorithm.\n\n"
	148	+ "For more information see:\n\n"
	149	+ getTechnicalInformation().toString() + "\n\n"
	150	+ "Notes:\n"
	151	+ "- works as a fast simple approximate clusterer\n"
	152	+ "- modelled after SimpleKMeans, might be a useful initializer for it";
	153	}
	154
	155	/**
	156	* Returns an instance of a TechnicalInformation object, containing
	157	* detailed information about the technical background of this class,
	158	* e.g., paper reference or book this class is based on.
	159	*
	160	* @return the technical information about this class
	161	*/
	162	public TechnicalInformation getTechnicalInformation() {
	163	TechnicalInformation result;
	164	TechnicalInformation additional;
	165
	166	result = new TechnicalInformation(Type.ARTICLE);
	167	result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
	168	result.setValue(Field.YEAR, "1985");
	169	result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem");
	170	result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
	171	result.setValue(Field.VOLUME, "10");
	172	result.setValue(Field.NUMBER, "2");
	173	result.setValue(Field.PAGES, "180-184");
	174
	175	additional = result.add(Type.INPROCEEDINGS);
	176	additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
	177	additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering");
	178	additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory");
	179	additional.setValue(Field.YEAR, "2002");
	180	additional.setValue(Field.PAGES, "351-363");
	181	additional.setValue(Field.PUBLISHER, "Springer");
	182
	183	return result;
	184	}
	185
	186	/**
	187	* Returns default capabilities of the clusterer.
	188	*
	189	* @return the capabilities of this clusterer
	190	*/
	191	public Capabilities getCapabilities() {
	192	Capabilities result = super.getCapabilities();
	193	result.disableAll();
	194	result.enable(Capability.NO_CLASS);
	195
	196	// attributes
	197	result.enable(Capability.NOMINAL_ATTRIBUTES);
	198	result.enable(Capability.NUMERIC_ATTRIBUTES);
	199	result.enable(Capability.DATE_ATTRIBUTES);
	200	result.enable(Capability.MISSING_VALUES);
	201
	202	return result;
	203	}
	204
	205	/**
	206	* Generates a clusterer. Has to initialize all fields of the clusterer
	207	* that are not being set via options.
	208	*
	209	* @param data set of instances serving as training data
	210	* @throws Exception if the clusterer has not been
	211	* generated successfully
	212	*/
	213	public void buildClusterer(Instances data) throws Exception {
	214
	215	// can clusterer handle the data?
	216	getCapabilities().testWithFail(data);
	217
	218	//long start = System.currentTimeMillis();
	219
	220	m_ReplaceMissingFilter = new ReplaceMissingValues();
	221	m_ReplaceMissingFilter.setInputFormat(data);
	222	m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);
	223
	224	initMinMax(m_instances);
	225
	226	m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
	227
	228	int n = m_instances.numInstances();
	229	Random r = new Random(getSeed());
	230	boolean[] selected = new boolean[n];
	231	double[] minDistance = new double[n];
	232
	233	for(int i = 0; i<n; i++) minDistance[i] = Double.MAX_VALUE;
	234
	235	int firstI = r.nextInt(n);
	236	m_ClusterCentroids.add(m_instances.instance(firstI));
	237	selected[firstI] = true;
	238
	239	updateMinDistance(minDistance,selected,m_instances,m_instances.instance(firstI));
	240
	241	if (m_NumClusters > n) m_NumClusters = n;
	242
	243	for(int i = 1; i < m_NumClusters; i++) {
	244	int nextI = farthestAway(minDistance, selected);
	245	m_ClusterCentroids.add(m_instances.instance(nextI));
	246	selected[nextI] = true;
	247	updateMinDistance(minDistance,selected,m_instances,m_instances.instance(nextI));
	248	}
	249
	250	m_instances = new Instances(m_instances,0);
	251	//long end = System.currentTimeMillis();
	252	//System.out.println("Clustering Time = " + (end-start));
	253	}
	254
	255
	256	protected void updateMinDistance(double[] minDistance, boolean[] selected,
	257	Instances data, Instance center) {
	258	for(int i = 0; i<selected.length; i++)
	259	if (!selected[i]) {
	260	double d = distance(center,data.instance(i));
	261	if (d<minDistance[i])
	262	minDistance[i] = d;
	263	}
	264	}
	265
	266	protected int farthestAway(double[] minDistance, boolean[] selected) {
	267	double maxDistance = -1.0;
	268	int maxI = -1;
	269	for(int i = 0; i<selected.length; i++)
	270	if (!selected[i])
	271	if (maxDistance < minDistance[i]) {
	272	maxDistance = minDistance[i];
	273	maxI = i;
	274	}
	275	return maxI;
	276	}
	277
	278	protected void initMinMax(Instances data) {
	279	m_Min = new double [data.numAttributes()];
	280	m_Max = new double [data.numAttributes()];
	281	for (int i = 0; i < data.numAttributes(); i++) {
	282	m_Min[i] = m_Max[i] = Double.NaN;
	283	}
	284
	285	for (int i = 0; i < data.numInstances(); i++) {
	286	updateMinMax(data.instance(i));
	287	}
	288	}
	289
	290
	291	/**
	292	* Updates the minimum and maximum values for all the attributes
	293	* based on a new instance.
	294	*
	295	* @param instance the new instance
	296	*/
	297	private void updateMinMax(Instance instance) {
	298
	299	for (int j = 0;j < instance.numAttributes(); j++) {
	300	if (Double.isNaN(m_Min[j])) {
	301	m_Min[j] = instance.value(j);
	302	m_Max[j] = instance.value(j);
	303	} else {
	304	if (instance.value(j) < m_Min[j]) {
	305	m_Min[j] = instance.value(j);
	306	} else {
	307	if (instance.value(j) > m_Max[j]) {
	308	m_Max[j] = instance.value(j);
	309	}
	310	}
	311	}
	312	}
	313	}
	314
	315
	316	/**
	317	* clusters an instance that has been through the filters
	318	*
	319	* @param instance the instance to assign a cluster to
	320	* @return a cluster number
	321	*/
	322	protected int clusterProcessedInstance(Instance instance) {
	323	double minDist = Double.MAX_VALUE;
	324	int bestCluster = 0;
	325	for (int i = 0; i < m_NumClusters; i++) {
	326	double dist = distance(instance, m_ClusterCentroids.instance(i));
	327	if (dist < minDist) {
	328	minDist = dist;
	329	bestCluster = i;
	330	}
	331	}
	332	return bestCluster;
	333	}
	334
	335	/**
	336	* Classifies a given instance.
	337	*
	338	* @param instance the instance to be assigned to a cluster
	339	* @return the number of the assigned cluster as an integer
	340	* if the class is enumerated, otherwise the predicted value
	341	* @throws Exception if instance could not be classified
	342	* successfully
	343	*/
	344	public int clusterInstance(Instance instance) throws Exception {
	345	m_ReplaceMissingFilter.input(instance);
	346	m_ReplaceMissingFilter.batchFinished();
	347	Instance inst = m_ReplaceMissingFilter.output();
	348
	349	return clusterProcessedInstance(inst);
	350	}
	351
	352	/**
	353	* Calculates the distance between two instances
	354	*
	355	* @param first the first instance
	356	* @param second the second instance
	357	* @return the distance between the two given instances, between 0 and 1
	358	*/
	359	protected double distance(Instance first, Instance second) {
	360
	361	double distance = 0;
	362	int firstI, secondI;
	363
	364	for (int p1 = 0, p2 = 0;
	365	p1 < first.numValues() \|\| p2 < second.numValues();) {
	366	if (p1 >= first.numValues()) {
	367	firstI = m_instances.numAttributes();
	368	} else {
	369	firstI = first.index(p1);
	370	}
	371	if (p2 >= second.numValues()) {
	372	secondI = m_instances.numAttributes();
	373	} else {
	374	secondI = second.index(p2);
	375	}
	376	if (firstI == m_instances.classIndex()) {
	377	p1++; continue;
	378	}
	379	if (secondI == m_instances.classIndex()) {
	380	p2++; continue;
	381	}
	382	double diff;
	383	if (firstI == secondI) {
	384	diff = difference(firstI,
	385	first.valueSparse(p1),
	386	second.valueSparse(p2));
	387	p1++; p2++;
	388	} else if (firstI > secondI) {
	389	diff = difference(secondI,
	390	0, second.valueSparse(p2));
	391	p2++;
	392	} else {
	393	diff = difference(firstI,
	394	first.valueSparse(p1), 0);
	395	p1++;
	396	}
	397	distance += diff * diff;
	398	}
	399
	400	return Math.sqrt(distance / m_instances.numAttributes());
	401	}
	402
	403	/**
	404	* Computes the difference between two given attribute
	405	* values.
	406	*/
	407	protected double difference(int index, double val1, double val2) {
	408
	409	switch (m_instances.attribute(index).type()) {
	410	case Attribute.NOMINAL:
	411
	412	// If attribute is nominal
	413	if (Utils.isMissingValue(val1) \|\|
	414	Utils.isMissingValue(val2) \|\|
	415	((int)val1 != (int)val2)) {
	416	return 1;
	417	} else {
	418	return 0;
	419	}
	420	case Attribute.NUMERIC:
	421
	422	// If attribute is numeric
	423	if (Utils.isMissingValue(val1) \|\|
	424	Utils.isMissingValue(val2)) {
	425	if (Utils.isMissingValue(val1) &&
	426	Utils.isMissingValue(val2)) {
	427	return 1;
	428	} else {
	429	double diff;
	430	if (Utils.isMissingValue(val2)) {
	431	diff = norm(val1, index);
	432	} else {
	433	diff = norm(val2, index);
	434	}
	435	if (diff < 0.5) {
	436	diff = 1.0 - diff;
	437	}
	438	return diff;
	439	}
	440	} else {
	441	return norm(val1, index) - norm(val2, index);
	442	}
	443	default:
	444	return 0;
	445	}
	446	}
	447
	448	/**
	449	* Normalizes a given value of a numeric attribute.
	450	*
	451	* @param x the value to be normalized
	452	* @param i the attribute's index
	453	* @return the normalized value
	454	*/
	455	protected double norm(double x, int i) {
	456
	457	if (Double.isNaN(m_Min[i]) \|\| Utils.eq(m_Max[i],m_Min[i])) {
	458	return 0;
	459	} else {
	460	return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
	461	}
	462	}
	463
	464	/**
	465	* Returns the number of clusters.
	466	*
	467	* @return the number of clusters generated for a training dataset.
	468	* @throws Exception if number of clusters could not be returned
	469	* successfully
	470	*/
	471	public int numberOfClusters() throws Exception {
	472	return m_NumClusters;
	473	}
	474
	475	/**
	476	* Returns an enumeration describing the available options.
	477	*
	478	* @return an enumeration of all the available options.
	479	*/
	480	public Enumeration listOptions () {
	481	Vector result = new Vector();
	482
	483	result.addElement(new Option(
	484	"\tnumber of clusters. (default = 2).",
	485	"N", 1, "-N <num>"));
	486
	487	Enumeration en = super.listOptions();
	488	while (en.hasMoreElements())
	489	result.addElement(en.nextElement());
	490
	491	return result.elements();
	492	}
	493
	494	/**
	495	* Returns the tip text for this property
	496	* @return tip text for this property suitable for
	497	* displaying in the explorer/experimenter gui
	498	*/
	499	public String numClustersTipText() {
	500	return "set number of clusters";
	501	}
	502
	503	/**
	504	* set the number of clusters to generate
	505	*
	506	* @param n the number of clusters to generate
	507	* @throws Exception if number of clusters is negative
	508	*/
	509	public void setNumClusters(int n) throws Exception {
	510	if (n < 0) {
	511	throw new Exception("Number of clusters must be > 0");
	512	}
	513	m_NumClusters = n;
	514	}
	515
	516	/**
	517	* gets the number of clusters to generate
	518	*
	519	* @return the number of clusters to generate
	520	*/
	521	public int getNumClusters() {
	522	return m_NumClusters;
	523	}
	524
	525	/**
	526	* Parses a given list of options. <p/>
	527	*
	528	<!-- options-start -->
	529	* Valid options are: <p/>
	530	*
	531	* <pre> -N <num>
	532	* number of clusters. (default = 2).</pre>
	533	*
	534	* <pre> -S <num>
	535	* Random number seed.
	536	* (default 1)</pre>
	537	*
	538	<!-- options-end -->
	539	*
	540	* @param options the list of options as an array of strings
	541	* @throws Exception if an option is not supported
	542	*/
	543	public void setOptions (String[] options)
	544	throws Exception {
	545
	546	String optionString = Utils.getOption('N', options);
	547
	548	if (optionString.length() != 0) {
	549	setNumClusters(Integer.parseInt(optionString));
	550	}
	551
	552	super.setOptions(options);
	553	}
	554
	555	/**
	556	* Gets the current settings of FarthestFirst
	557	*
	558	* @return an array of strings suitable for passing to setOptions()
	559	*/
	560	public String[] getOptions () {
	561	int i;
	562	Vector result;
	563	String[] options;
	564
	565	result = new Vector();
	566
	567	result.add("-N");
	568	result.add("" + getNumClusters());
	569
	570	options = super.getOptions();
	571	for (i = 0; i < options.length; i++)
	572	result.add(options[i]);
	573
	574	return (String[]) result.toArray(new String[result.size()]);
	575	}
	576
	577	/**
	578	* return a string describing this clusterer
	579	*
	580	* @return a description of the clusterer as a string
	581	*/
	582	public String toString() {
	583	StringBuffer temp = new StringBuffer();
	584
	585	temp.append("\n FarthestFirst\n==============\n");
	586
	587	temp.append("\nCluster centroids:\n");
	588	for (int i = 0; i < m_NumClusters; i++) {
	589	temp.append("\nCluster "+i+"\n\t");
	590	for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
	591	if (m_ClusterCentroids.attribute(j).isNominal()) {
	592	temp.append(" "+m_ClusterCentroids.attribute(j).
	593	value((int)m_ClusterCentroids.instance(i).value(j)));
	594	} else {
	595	temp.append(" "+m_ClusterCentroids.instance(i).value(j));
	596	}
	597	}
	598	}
	599	temp.append("\n\n");
	600	return temp.toString();
	601	}
	602
	603	/**
	604	* Returns the revision string.
	605	*
	606	* @return the revision
	607	*/
	608	public String getRevision() {
	609	return RevisionUtils.extract("$Revision: 5987 $");
	610	}
	611
	612	/**
	613	* Main method for testing this class.
	614	*
	615	* @param argv should contain the following arguments: <p>
	616	* -t training file [-N number of clusters]
	617	*/
	618	public static void main (String[] argv) {
	619	runClusterer(new FarthestFirst(), argv);
	620	}
	621	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: