Context Navigation

source: src/main/java/weka/clusterers/SimpleKMeans.java @ 21

Last change on this file since 21 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 38.2 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* SimpleKMeans.java
	19	* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22	package weka.clusterers;
	23
	24	import weka.classifiers.rules.DecisionTableHashKey;
	25	import weka.core.Attribute;
	26	import weka.core.Capabilities;
	27	import weka.core.DistanceFunction;
	28	import weka.core.EuclideanDistance;
	29	import weka.core.Instance;
	30	import weka.core.DenseInstance;
	31	import weka.core.Instances;
	32	import weka.core.ManhattanDistance;
	33	import weka.core.Option;
	34	import weka.core.RevisionUtils;
	35	import weka.core.Utils;
	36	import weka.core.WeightedInstancesHandler;
	37	import weka.core.Capabilities.Capability;
	38	import weka.filters.Filter;
	39	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
	40
	41	import java.util.Enumeration;
	42	import java.util.HashMap;
	43	import java.util.Random;
	44	import java.util.Vector;
	45
	46	/**
	47	<!-- globalinfo-start -->
	48	* Cluster data using the k means algorithm
	49	* <p/>
	50	<!-- globalinfo-end -->
	51	*
	52	<!-- options-start -->
	53	* Valid options are: <p/>
	54	*
	55	* <pre> -N <num>
	56	* number of clusters.
	57	* (default 2).</pre>
	58	*
	59	* <pre> -V
	60	* Display std. deviations for centroids.
	61	* </pre>
	62	*
	63	* <pre> -M
	64	* Don't replace missing values with mean/mode.
	65	* </pre>
	66	*
	67	* <pre> -S <num>
	68	* Random number seed.
	69	* (default 10)</pre>
	70	*
	71	* <pre> -A <classname and options>
	72	* Distance function to be used for instance comparison
	73	* (default weka.core.EuclideanDistance)</pre>
	74	*
	75	* <pre> -I <num>
	76	* Maximum number of iterations. </pre>
	77	*
	78	* <pre> -O
	79	* Preserve order of instances. </pre>
	80	*
	81	*
	82	<!-- options-end -->
	83	*
	84	* @author Mark Hall (mhall@cs.waikato.ac.nz)
	85	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
	86	* @version $Revision: 5987 $
	87	* @see RandomizableClusterer
	88	*/
	89	public class SimpleKMeans
	90	extends RandomizableClusterer
	91	implements NumberOfClustersRequestable, WeightedInstancesHandler {
	92
	93	/** Serialization version identifier. */
	94	static final long serialVersionUID = -3235809600124455376L;
	95
	96	/**
	97	* Filter that replaces missing values; applied to the training data and to instances being clustered unless replacement is switched off
	98	*/
	99	private ReplaceMissingValues m_ReplaceMissingFilter;
	100
	101	/**
	102	* Number of clusters to generate; lowered by buildClusterer when fewer distinct or non-empty clusters are found
	103	*/
	104	private int m_NumClusters = 2;
	105
	106	/**
	107	* Holds the cluster centroids, one instance per cluster
	108	*/
	109	private Instances m_ClusterCentroids;
	110
	111	/**
	112	* Holds the standard deviations of the numeric attributes in each cluster (only filled in when std. devs are displayed)
	113	*/
	114	private Instances m_ClusterStdDevs;
	115
	116
	117	/**
	118	* For each cluster, holds the frequency counts for the values of each
	119	* nominal attribute (indexed [cluster][attribute][value])
	120	*/
	121	private int [][][] m_ClusterNominalCounts;
	122	private int[][] m_ClusterMissingCounts; // missing-value counts, indexed [cluster][attribute]
	123
	124	/**
	125	* Statistics on the full data set, kept for comparison purposes.
	126	* For a numeric attribute the value is the mean when the Euclidean
	127	* distance is used, or the median when the Manhattan distance is used;
	128	* for a nominal attribute the mode is stored.
	129	*/
	130	private double[] m_FullMeansOrMediansOrModes;
	131	private double[] m_FullStdDevs; // per-attribute std. dev. on the full data (numeric attributes only)
	132	private int[][] m_FullNominalCounts; // per-attribute value counts on the full data
	133	private int[] m_FullMissingCounts; // per-attribute missing-value counts on the full data
	134
	135	/**
	136	* Display standard deviations for numeric attributes (and counts for nominal ones)
	137	*/
	138	private boolean m_displayStdDevs;
	139
	140	/**
	141	* If true, missing values are NOT replaced globally with mean/mode
	142	*/
	143	private boolean m_dontReplaceMissing = false;
	144
	145	/**
	146	* The number of instances in each cluster
	147	*/
	148	private int [] m_ClusterSizes;
	149
	150	/**
	151	* Maximum number of iterations to be executed
	152	*/
	153	private int m_MaxIterations = 500;
	154
	155	/**
	156	* Number of iterations performed by the last call to buildClusterer
	157	*/
	158	private int m_Iterations = 0;
	159
	160	/**
	161	* Per-cluster error sums: squared distances for the Euclidean distance, plain distances otherwise
	162	*/
	163	private double [] m_squaredErrors;
	164
	165	/** The distance function used. */
	166	protected DistanceFunction m_DistanceFunction = new EuclideanDistance();
	167
	168	/**
	169	* Preserve order of instances
	170	*/
	171	private boolean m_PreserveOrder = false;
	172
	173	/**
	174	* Cluster assignment of each training instance; only kept when the order of instances is preserved
	175	*/
	176	protected int[] m_Assignments = null;
	177
	/**
	 * Creates a clusterer with default settings. The default random seed
	 * is set to 10 and immediately installed as the current seed.
	 */
	public SimpleKMeans() {
		super();

		this.m_SeedDefault = 10;
		this.setSeed(this.m_SeedDefault);
	}
	187
	188	/**
	189	* Returns a string describing this clusterer
	190	* @return a description of the evaluator suitable for
	191	* displaying in the explorer/experimenter gui
	192	*/
	193	public String globalInfo() {
	194	return "Cluster data using the k means algorithm. Can use either "
	195	+ "the Euclidean distance (default) or the Manhattan distance."
	196	+ " If the Manhattan distance is used, then centroids are computed "
	197	+ "as the component-wise median rather than mean.";
	198	}
	199
	200	/**
	201	* Returns default capabilities of the clusterer.
	202	*
	203	* @return the capabilities of this clusterer
	204	*/
	205	public Capabilities getCapabilities() {
	206	Capabilities result = super.getCapabilities();
	207	result.disableAll();
	208	result.enable(Capability.NO_CLASS);
	209
	210	// attributes
	211	result.enable(Capability.NOMINAL_ATTRIBUTES);
	212	result.enable(Capability.NUMERIC_ATTRIBUTES);
	213	result.enable(Capability.MISSING_VALUES);
	214
	215	return result;
	216	}
	217
	218	/**
	219	* Generates a clusterer. Has to initialize all fields of the clusterer
	220	* that are not being set via options.
	221	*
	222	* @param data set of instances serving as training data
	223	* @throws Exception if the clusterer has not been
	224	* generated successfully
	225	*/
	226	public void buildClusterer(Instances data) throws Exception {
	227
	228	// can clusterer handle the data?
	229	getCapabilities().testWithFail(data);
	230
	231	m_Iterations = 0;
	232
	233	m_ReplaceMissingFilter = new ReplaceMissingValues();
	234	Instances instances = new Instances(data);
	235
	236	instances.setClassIndex(-1);
	237	if (!m_dontReplaceMissing) {
	238	m_ReplaceMissingFilter.setInputFormat(instances);
	239	instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
	240	}
	241
	242	m_FullMissingCounts = new int[instances.numAttributes()];
	243	if (m_displayStdDevs) {
	244	m_FullStdDevs = new double[instances.numAttributes()];
	245	}
	246	m_FullNominalCounts = new int[instances.numAttributes()][0];
	247
	248	m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
	249	for (int i = 0; i < instances.numAttributes(); i++) {
	250	m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
	251	if (instances.attribute(i).isNumeric()) {
	252	if (m_displayStdDevs) {
	253	m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
	254	}
	255	if (m_FullMissingCounts[i] == instances.numInstances()) {
	256	m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
	257	}
	258	} else {
	259	m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
	260	if (m_FullMissingCounts[i]
	261	> m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
	262	m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
	263	}
	264	}
	265	}
	266
	267	m_ClusterCentroids = new Instances(instances, m_NumClusters);
	268	int[] clusterAssignments = new int [instances.numInstances()];
	269
	270	if(m_PreserveOrder)
	271	m_Assignments = clusterAssignments;
	272
	273	m_DistanceFunction.setInstances(instances);
	274
	275	Random RandomO = new Random(getSeed());
	276	int instIndex;
	277	HashMap initC = new HashMap();
	278	DecisionTableHashKey hk = null;
	279
	280	Instances initInstances = null;
	281	if(m_PreserveOrder)
	282	initInstances = new Instances(instances);
	283	else
	284	initInstances = instances;
	285
	286	for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
	287	instIndex = RandomO.nextInt(j+1);
	288	hk = new DecisionTableHashKey(initInstances.instance(instIndex),
	289	initInstances.numAttributes(), true);
	290	if (!initC.containsKey(hk)) {
	291	m_ClusterCentroids.add(initInstances.instance(instIndex));
	292	initC.put(hk, null);
	293	}
	294	initInstances.swap(j, instIndex);
	295
	296	if (m_ClusterCentroids.numInstances() == m_NumClusters) {
	297	break;
	298	}
	299	}
	300
	301	m_NumClusters = m_ClusterCentroids.numInstances();
	302
	303	//removing reference
	304	initInstances = null;
	305
	306	int i;
	307	boolean converged = false;
	308	int emptyClusterCount;
	309	Instances [] tempI = new Instances[m_NumClusters];
	310	m_squaredErrors = new double [m_NumClusters];
	311	m_ClusterNominalCounts = new int [m_NumClusters][instances.numAttributes()][0];
	312	m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
	313	while (!converged) {
	314	emptyClusterCount = 0;
	315	m_Iterations++;
	316	converged = true;
	317	for (i = 0; i < instances.numInstances(); i++) {
	318	Instance toCluster = instances.instance(i);
	319	int newC = clusterProcessedInstance(toCluster, true);
	320	if (newC != clusterAssignments[i]) {
	321	converged = false;
	322	}
	323	clusterAssignments[i] = newC;
	324	}
	325
	326	// update centroids
	327	m_ClusterCentroids = new Instances(instances, m_NumClusters);
	328	for (i = 0; i < m_NumClusters; i++) {
	329	tempI[i] = new Instances(instances, 0);
	330	}
	331	for (i = 0; i < instances.numInstances(); i++) {
	332	tempI[clusterAssignments[i]].add(instances.instance(i));
	333	}
	334	for (i = 0; i < m_NumClusters; i++) {
	335	if (tempI[i].numInstances() == 0) {
	336	// empty cluster
	337	emptyClusterCount++;
	338	} else {
	339	moveCentroid( i, tempI[i], true );
	340	}
	341	}
	342
	343	if (emptyClusterCount > 0) {
	344	m_NumClusters -= emptyClusterCount;
	345	if (converged) {
	346	Instances[] t = new Instances[m_NumClusters];
	347	int index = 0;
	348	for (int k = 0; k < tempI.length; k++) {
	349	if (tempI[k].numInstances() > 0) {
	350	t[index++] = tempI[k];
	351	}
	352	}
	353	tempI = t;
	354	} else {
	355	tempI = new Instances[m_NumClusters];
	356	}
	357	}
	358
	359	if(m_Iterations == m_MaxIterations)
	360	converged = true;
	361
	362	if (!converged) {
	363	m_squaredErrors = new double [m_NumClusters];
	364	m_ClusterNominalCounts = new int [m_NumClusters][instances.numAttributes()][0];
	365	}
	366	}
	367
	368	if (m_displayStdDevs) {
	369	m_ClusterStdDevs = new Instances(instances, m_NumClusters);
	370	}
	371	m_ClusterSizes = new int [m_NumClusters];
	372	for (i = 0; i < m_NumClusters; i++) {
	373	if (m_displayStdDevs) {
	374	double [] vals2 = new double[instances.numAttributes()];
	375	for (int j = 0; j < instances.numAttributes(); j++) {
	376	if (instances.attribute(j).isNumeric()) {
	377	vals2[j] = Math.sqrt(tempI[i].variance(j));
	378	} else {
	379	vals2[j] = Utils.missingValue();
	380	}
	381	}
	382	m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
	383	}
	384	m_ClusterSizes[i] = tempI[i].numInstances();
	385	}
	386	}
	387
	388	/**
	389	* Move the centroid to it's new coordinates. Generate the centroid coordinates based
	390	* on it's members (objects assigned to the cluster of the centroid) and the distance
	391	* function being used.
	392	* @param centroidIndex index of the centroid which the coordinates will be computed
	393	* @param members the objects that are assigned to the cluster of this centroid
	394	* @param updateClusterInfo if the method is supposed to update the m_Cluster arrays
	395	* @return the centroid coordinates
	396	*/
	397	protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo){
	398	double [] vals = new double[members.numAttributes()];
	399
	400	//used only for Manhattan Distance
	401	Instances sortedMembers = null;
	402	int middle = 0;
	403	boolean dataIsEven = false;
	404
	405	if(m_DistanceFunction instanceof ManhattanDistance){
	406	middle = (members.numInstances()-1)/2;
	407	dataIsEven = ((members.numInstances()%2)==0);
	408	if(m_PreserveOrder){
	409	sortedMembers = members;
	410	}else{
	411	sortedMembers = new Instances(members);
	412	}
	413	}
	414
	415	for (int j = 0; j < members.numAttributes(); j++) {
	416
	417	//in case of Euclidian distance the centroid is the mean point
	418	//in case of Manhattan distance the centroid is the median point
	419	//in both cases, if the attribute is nominal, the centroid is the mode
	420	if(m_DistanceFunction instanceof EuclideanDistance \|\|
	421	members.attribute(j).isNominal())
	422	{
	423	vals[j] = members.meanOrMode(j);
	424	}else if(m_DistanceFunction instanceof ManhattanDistance){
	425	//singleton special case
	426	if(members.numInstances() == 1){
	427	vals[j] = members.instance(0).value(j);
	428	}else{
	429	sortedMembers.kthSmallestValue(j, middle+1);
	430	vals[j] = sortedMembers.instance(middle).value(j);
	431	if( dataIsEven ){
	432	sortedMembers.kthSmallestValue(j, middle+2);
	433	vals[j] = (vals[j]+sortedMembers.instance(middle+1).value(j))/2;
	434	}
	435	}
	436	}
	437
	438	if(updateClusterInfo){
	439	m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
	440	m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
	441	if (members.attribute(j).isNominal()) {
	442	if (m_ClusterMissingCounts[centroidIndex][j] >
	443	m_ClusterNominalCounts[centroidIndex][j][Utils.maxIndex(m_ClusterNominalCounts[centroidIndex][j])])
	444	{
	445	vals[j] = Utils.missingValue(); // mark mode as missing
	446	}
	447	} else {
	448	if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
	449	vals[j] = Utils.missingValue(); // mark mean as missing
	450	}
	451	}
	452	}
	453	}
	454	if(updateClusterInfo)
	455	m_ClusterCentroids.add(new DenseInstance(1.0, vals));
	456	return vals;
	457	}
	458
	459	/**
	460	* clusters an instance that has been through the filters
	461	*
	462	* @param instance the instance to assign a cluster to
	463	* @param updateErrors if true, update the within clusters sum of errors
	464	* @return a cluster number
	465	*/
	466	private int clusterProcessedInstance(Instance instance, boolean updateErrors) {
	467	double minDist = Integer.MAX_VALUE;
	468	int bestCluster = 0;
	469	for (int i = 0; i < m_NumClusters; i++) {
	470	double dist = m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i));
	471	if (dist < minDist) {
	472	minDist = dist;
	473	bestCluster = i;
	474	}
	475	}
	476	if (updateErrors) {
	477	if(m_DistanceFunction instanceof EuclideanDistance){
	478	//Euclidean distance to Squared Euclidean distance
	479	minDist *= minDist;
	480	}
	481	m_squaredErrors[bestCluster] += minDist;
	482	}
	483	return bestCluster;
	484	}
	485
	486	/**
	487	* Classifies a given instance.
	488	*
	489	* @param instance the instance to be assigned to a cluster
	490	* @return the number of the assigned cluster as an interger
	491	* if the class is enumerated, otherwise the predicted value
	492	* @throws Exception if instance could not be classified
	493	* successfully
	494	*/
	495	public int clusterInstance(Instance instance) throws Exception {
	496	Instance inst = null;
	497	if (!m_dontReplaceMissing) {
	498	m_ReplaceMissingFilter.input(instance);
	499	m_ReplaceMissingFilter.batchFinished();
	500	inst = m_ReplaceMissingFilter.output();
	501	} else {
	502	inst = instance;
	503	}
	504
	505	return clusterProcessedInstance(inst, false);
	506	}
	507
	508	/**
	509	* Returns the number of clusters.
	510	*
	511	* @return the number of clusters generated for a training dataset.
	512	* @throws Exception if number of clusters could not be returned
	513	* successfully
	514	*/
	515	public int numberOfClusters() throws Exception {
	516	return m_NumClusters;
	517	}
	518
	519	/**
	520	* Returns an enumeration describing the available options.
	521	*
	522	* @return an enumeration of all the available options.
	523	*/
	524	public Enumeration listOptions () {
	525	Vector result = new Vector();
	526
	527	result.addElement(new Option(
	528	"\tnumber of clusters.\n"
	529	+ "\t(default 2).",
	530	"N", 1, "-N <num>"));
	531	result.addElement(new Option(
	532	"\tDisplay std. deviations for centroids.\n",
	533	"V", 0, "-V"));
	534	result.addElement(new Option(
	535	"\tReplace missing values with mean/mode.\n",
	536	"M", 0, "-M"));
	537
	538	result.add(new Option(
	539	"\tDistance function to use.\n"
	540	+ "\t(default: weka.core.EuclideanDistance)",
	541	"A", 1,"-A <classname and options>"));
	542
	543	result.add(new Option(
	544	"\tMaximum number of iterations.\n",
	545	"I",1,"-I <num>"));
	546
	547	result.addElement(new Option(
	548	"\tPreserve order of instances.\n",
	549	"O", 0, "-O"));
	550
	551	Enumeration en = super.listOptions();
	552	while (en.hasMoreElements())
	553	result.addElement(en.nextElement());
	554
	555	return result.elements();
	556	}
	557
	558	/**
	559	* Returns the tip text for this property
	560	* @return tip text for this property suitable for
	561	* displaying in the explorer/experimenter gui
	562	*/
	563	public String numClustersTipText() {
	564	return "set number of clusters";
	565	}
	566
	567	/**
	568	* set the number of clusters to generate
	569	*
	570	* @param n the number of clusters to generate
	571	* @throws Exception if number of clusters is negative
	572	*/
	573	public void setNumClusters(int n) throws Exception {
	574	if (n <= 0) {
	575	throw new Exception("Number of clusters must be > 0");
	576	}
	577	m_NumClusters = n;
	578	}
	579
	580	/**
	581	* gets the number of clusters to generate
	582	*
	583	* @return the number of clusters to generate
	584	*/
	585	public int getNumClusters() {
	586	return m_NumClusters;
	587	}
	588
	589	/**
	590	* Returns the tip text for this property
	591	* @return tip text for this property suitable for
	592	* displaying in the explorer/experimenter gui
	593	*/
	594	public String maxIterationsTipText() {
	595	return "set maximum number of iterations";
	596	}
	597
	598	/**
	599	* set the maximum number of iterations to be executed
	600	*
	601	* @param n the maximum number of iterations
	602	* @throws Exception if maximum number of iteration is smaller than 1
	603	*/
	604	public void setMaxIterations(int n) throws Exception {
	605	if (n <= 0) {
	606	throw new Exception("Maximum number of iterations must be > 0");
	607	}
	608	m_MaxIterations = n;
	609	}
	610
	611	/**
	612	* gets the number of maximum iterations to be executed
	613	*
	614	* @return the number of clusters to generate
	615	*/
	616	public int getMaxIterations() {
	617	return m_MaxIterations;
	618	}
	619
	620
	621	/**
	622	* Returns the tip text for this property
	623	* @return tip text for this property suitable for
	624	* displaying in the explorer/experimenter gui
	625	*/
	626	public String displayStdDevsTipText() {
	627	return "Display std deviations of numeric attributes "
	628	+ "and counts of nominal attributes.";
	629	}
	630
	631	/**
	632	* Sets whether standard deviations and nominal count
	633	* Should be displayed in the clustering output
	634	*
	635	* @param stdD true if std. devs and counts should be
	636	* displayed
	637	*/
	638	public void setDisplayStdDevs(boolean stdD) {
	639	m_displayStdDevs = stdD;
	640	}
	641
	642	/**
	643	* Gets whether standard deviations and nominal count
	644	* Should be displayed in the clustering output
	645	*
	646	* @return true if std. devs and counts should be
	647	* displayed
	648	*/
	649	public boolean getDisplayStdDevs() {
	650	return m_displayStdDevs;
	651	}
	652
	653	/**
	654	* Returns the tip text for this property
	655	* @return tip text for this property suitable for
	656	* displaying in the explorer/experimenter gui
	657	*/
	658	public String dontReplaceMissingValuesTipText() {
	659	return "Replace missing values globally with mean/mode.";
	660	}
	661
	662	/**
	663	* Sets whether missing values are to be replaced
	664	*
	665	* @param r true if missing values are to be
	666	* replaced
	667	*/
	668	public void setDontReplaceMissingValues(boolean r) {
	669	m_dontReplaceMissing = r;
	670	}
	671
	672	/**
	673	* Gets whether missing values are to be replaced
	674	*
	675	* @return true if missing values are to be
	676	* replaced
	677	*/
	678	public boolean getDontReplaceMissingValues() {
	679	return m_dontReplaceMissing;
	680	}
	681
	682	/**
	683	* Returns the tip text for this property.
	684	*
	685	* @return tip text for this property suitable for
	686	* displaying in the explorer/experimenter gui
	687	*/
	688	public String distanceFunctionTipText() {
	689	return "The distance function to use for instances comparison " +
	690	"(default: weka.core.EuclideanDistance). ";
	691	}
	692
	693	/**
	694	* returns the distance function currently in use.
	695	*
	696	* @return the distance function
	697	*/
	698	public DistanceFunction getDistanceFunction() {
	699	return m_DistanceFunction;
	700	}
	701
	702	/**
	703	* sets the distance function to use for instance comparison.
	704	*
	705	* @param df the new distance function to use
	706	* @throws Exception if instances cannot be processed
	707	*/
	708	public void setDistanceFunction(DistanceFunction df) throws Exception {
	709	if(!(df instanceof EuclideanDistance) &&
	710	!(df instanceof ManhattanDistance))
	711	{
	712	throw new Exception("SimpleKMeans currently only supports the Euclidean and Manhattan distances.");
	713	}
	714	m_DistanceFunction = df;
	715	}
	716
	717	/**
	718	* Returns the tip text for this property
	719	* @return tip text for this property suitable for
	720	* displaying in the explorer/experimenter gui
	721	*/
	722	public String preserveInstancesOrderTipText() {
	723	return "Preserve order of instances.";
	724	}
	725
	726	/**
	727	* Sets whether order of instances must be preserved
	728	*
	729	* @param r true if missing values are to be
	730	* replaced
	731	*/
	732	public void setPreserveInstancesOrder(boolean r) {
	733	m_PreserveOrder = r;
	734	}
	735
	736	/**
	737	* Gets whether order of instances must be preserved
	738	*
	739	* @return true if missing values are to be
	740	* replaced
	741	*/
	742	public boolean getPreserveInstancesOrder() {
	743	return m_PreserveOrder;
	744	}
	745
	746
	747	/**
	748	* Parses a given list of options. <p/>
	749	*
	750	<!-- options-start -->
	751	* Valid options are: <p/>
	752	*
	753	* <pre> -N <num>
	754	* number of clusters.
	755	* (default 2).</pre>
	756	*
	757	* <pre> -V
	758	* Display std. deviations for centroids.
	759	* </pre>
	760	*
	761	* <pre> -M
	762	* Replace missing values with mean/mode.
	763	* </pre>
	764	*
	765	* <pre> -S <num>
	766	* Random number seed.
	767	* (default 10)</pre>
	768	*
	769	* <pre> -A <classname and options>
	770	* Distance function to be used for instance comparison
	771	* (default weka.core.EuclidianDistance)</pre>
	772	*
	773	* <pre> -I <num>
	774	* Maximum number of iterations. </pre>
	775	*
	776	* <pre> -O
	777	* Preserve order of instances.
	778	* </pre>
	779	*
	780	<!-- options-end -->
	781	*
	782	* @param options the list of options as an array of strings
	783	* @throws Exception if an option is not supported
	784	*/
	785	public void setOptions (String[] options)
	786	throws Exception {
	787
	788	m_displayStdDevs = Utils.getFlag("V", options);
	789	m_dontReplaceMissing = Utils.getFlag("M", options);
	790
	791	String optionString = Utils.getOption('N', options);
	792
	793	if (optionString.length() != 0) {
	794	setNumClusters(Integer.parseInt(optionString));
	795	}
	796
	797	optionString = Utils.getOption("I", options);
	798	if (optionString.length() != 0) {
	799	setMaxIterations(Integer.parseInt(optionString));
	800	}
	801
	802	String distFunctionClass = Utils.getOption('A', options);
	803	if(distFunctionClass.length() != 0) {
	804	String distFunctionClassSpec[] = Utils.splitOptions(distFunctionClass);
	805	if(distFunctionClassSpec.length == 0) {
	806	throw new Exception("Invalid DistanceFunction specification string.");
	807	}
	808	String className = distFunctionClassSpec[0];
	809	distFunctionClassSpec[0] = "";
	810
	811	setDistanceFunction( (DistanceFunction)
	812	Utils.forName( DistanceFunction.class,
	813	className, distFunctionClassSpec) );
	814	}
	815	else {
	816	setDistanceFunction(new EuclideanDistance());
	817	}
	818
	819	m_PreserveOrder = Utils.getFlag("O", options);
	820
	821	super.setOptions(options);
	822	}
	823
	824	/**
	825	* Gets the current settings of SimpleKMeans
	826	*
	827	* @return an array of strings suitable for passing to setOptions()
	828	*/
	829	public String[] getOptions () {
	830	int i;
	831	Vector result;
	832	String[] options;
	833
	834	result = new Vector();
	835
	836	if (m_displayStdDevs) {
	837	result.add("-V");
	838	}
	839
	840	if (m_dontReplaceMissing) {
	841	result.add("-M");
	842	}
	843
	844	result.add("-N");
	845	result.add("" + getNumClusters());
	846
	847	result.add("-A");
	848	result.add((m_DistanceFunction.getClass().getName() + " " +
	849	Utils.joinOptions(m_DistanceFunction.getOptions())).trim());
	850
	851	result.add("-I");
	852	result.add(""+ getMaxIterations());
	853
	854	if(m_PreserveOrder){
	855	result.add("-O");
	856	}
	857
	858	options = super.getOptions();
	859	for (i = 0; i < options.length; i++)
	860	result.add(options[i]);
	861
	862	return (String[]) result.toArray(new String[result.size()]);
	863	}
	864
	865	/**
	866	* return a string describing this clusterer
	867	*
	868	* @return a description of the clusterer as a string
	869	*/
	870	public String toString() {
	871	if (m_ClusterCentroids == null) {
	872	return "No clusterer built yet!";
	873	}
	874
	875	int maxWidth = 0;
	876	int maxAttWidth = 0;
	877	boolean containsNumeric = false;
	878	for (int i = 0; i < m_NumClusters; i++) {
	879	for (int j = 0 ;j < m_ClusterCentroids.numAttributes(); j++) {
	880	if (m_ClusterCentroids.attribute(j).name().length() > maxAttWidth) {
	881	maxAttWidth = m_ClusterCentroids.attribute(j).name().length();
	882	}
	883	if (m_ClusterCentroids.attribute(j).isNumeric()) {
	884	containsNumeric = true;
	885	double width = Math.log(Math.abs(m_ClusterCentroids.instance(i).value(j))) /
	886	Math.log(10.0);
	887	// System.err.println(m_ClusterCentroids.instance(i).value(j)+" "+width);
	888	if (width < 0) {
	889	width = 1;
	890	}
	891	// decimal + # decimal places + 1
	892	width += 6.0;
	893	if ((int)width > maxWidth) {
	894	maxWidth = (int)width;
	895	}
	896	}
	897	}
	898	}
	899
	900	for (int i = 0; i < m_ClusterCentroids.numAttributes(); i++) {
	901	if (m_ClusterCentroids.attribute(i).isNominal()) {
	902	Attribute a = m_ClusterCentroids.attribute(i);
	903	for (int j = 0; j < m_ClusterCentroids.numInstances(); j++) {
	904	String val = a.value((int)m_ClusterCentroids.instance(j).value(i));
	905	if (val.length() > maxWidth) {
	906	maxWidth = val.length();
	907	}
	908	}
	909	for (int j = 0; j < a.numValues(); j++) {
	910	String val = a.value(j) + " ";
	911	if (val.length() > maxAttWidth) {
	912	maxAttWidth = val.length();
	913	}
	914	}
	915	}
	916	}
	917
	918	if (m_displayStdDevs) {
	919	// check for maximum width of maximum frequency count
	920	for (int i = 0; i < m_ClusterCentroids.numAttributes(); i++) {
	921	if (m_ClusterCentroids.attribute(i).isNominal()) {
	922	int maxV = Utils.maxIndex(m_FullNominalCounts[i]);
	923	/* int percent = (int)((double)m_FullNominalCounts[i][maxV] /
	924	Utils.sum(m_ClusterSizes) * 100.0); */
	925	int percent = 6; // max percent width (100%)
	926	String nomV = "" + m_FullNominalCounts[i][maxV];
	927	// + " (" + percent + "%)";
	928	if (nomV.length() + percent > maxWidth) {
	929	maxWidth = nomV.length() + 1;
	930	}
	931	}
	932	}
	933	}
	934
	935	// check for size of cluster sizes
	936	for (int i = 0; i < m_ClusterSizes.length; i++) {
	937	String size = "(" + m_ClusterSizes[i] + ")";
	938	if (size.length() > maxWidth) {
	939	maxWidth = size.length();
	940	}
	941	}
	942
	943	if (m_displayStdDevs && maxAttWidth < "missing".length()) {
	944	maxAttWidth = "missing".length();
	945	}
	946
	947	String plusMinus = "+/-";
	948	maxAttWidth += 2;
	949	if (m_displayStdDevs && containsNumeric) {
	950	maxWidth += plusMinus.length();
	951	}
	952	if (maxAttWidth < "Attribute".length() + 2) {
	953	maxAttWidth = "Attribute".length() + 2;
	954	}
	955
	956	if (maxWidth < "Full Data".length()) {
	957	maxWidth = "Full Data".length() + 1;
	958	}
	959
	960	if (maxWidth < "missing".length()) {
	961	maxWidth = "missing".length() + 1;
	962	}
	963
	964
	965
	966	StringBuffer temp = new StringBuffer();
	967	// String naString = "N/A";
	968
	969
	970	/* for (int i = 0; i < maxWidth+2; i++) {
	971	naString += " ";
	972	} */
	973	temp.append("\nkMeans\n======\n");
	974	temp.append("\nNumber of iterations: " + m_Iterations+"\n");
	975
	976	if(m_DistanceFunction instanceof EuclideanDistance){
	977	temp.append("Within cluster sum of squared errors: " + Utils.sum(m_squaredErrors));
	978	}else{
	979	temp.append("Sum of within cluster distances: " + Utils.sum(m_squaredErrors));
	980	}
	981
	982
	983	if (!m_dontReplaceMissing) {
	984	temp.append("\nMissing values globally replaced with mean/mode");
	985	}
	986
	987	temp.append("\n\nCluster centroids:\n");
	988	temp.append(pad("Cluster#", " ", (maxAttWidth + (maxWidth * 2 + 2)) - "Cluster#".length(), true));
	989
	990	temp.append("\n");
	991	temp.append(pad("Attribute", " ", maxAttWidth - "Attribute".length(), false));
	992
	993
	994	temp.append(pad("Full Data", " ", maxWidth + 1 - "Full Data".length(), true));
	995
	996	// cluster numbers
	997	for (int i = 0; i < m_NumClusters; i++) {
	998	String clustNum = "" + i;
	999	temp.append(pad(clustNum, " ", maxWidth + 1 - clustNum.length(), true));
	1000	}
	1001	temp.append("\n");
	1002
	1003	// cluster sizes
	1004	String cSize = "(" + Utils.sum(m_ClusterSizes) + ")";
	1005	temp.append(pad(cSize, " ", maxAttWidth + maxWidth + 1 - cSize.length(), true));
	1006	for (int i = 0; i < m_NumClusters; i++) {
	1007	cSize = "(" + m_ClusterSizes[i] + ")";
	1008	temp.append(pad(cSize, " ",maxWidth + 1 - cSize.length(), true));
	1009	}
	1010	temp.append("\n");
	1011
	1012	temp.append(pad("", "=", maxAttWidth +
	1013	(maxWidth * (m_ClusterCentroids.numInstances()+1)
	1014	+ m_ClusterCentroids.numInstances() + 1), true));
	1015	temp.append("\n");
	1016
	1017	for (int i = 0; i < m_ClusterCentroids.numAttributes(); i++) {
	1018	String attName = m_ClusterCentroids.attribute(i).name();
	1019	temp.append(attName);
	1020	for (int j = 0; j < maxAttWidth - attName.length(); j++) {
	1021	temp.append(" ");
	1022	}
	1023
	1024	String strVal;
	1025	String valMeanMode;
	1026	// full data
	1027	if (m_ClusterCentroids.attribute(i).isNominal()) {
	1028	if (m_FullMeansOrMediansOrModes[i] == -1) { // missing
	1029	valMeanMode = pad("missing", " ", maxWidth + 1 - "missing".length(), true);
	1030	} else {
	1031	valMeanMode =
	1032	pad((strVal = m_ClusterCentroids.attribute(i).value((int)m_FullMeansOrMediansOrModes[i])),
	1033	" ", maxWidth + 1 - strVal.length(), true);
	1034	}
	1035	} else {
	1036	if (Double.isNaN(m_FullMeansOrMediansOrModes[i])) {
	1037	valMeanMode = pad("missing", " ", maxWidth + 1 - "missing".length(), true);
	1038	} else {
	1039	valMeanMode = pad((strVal = Utils.doubleToString(m_FullMeansOrMediansOrModes[i],
	1040	maxWidth,4).trim()),
	1041	" ", maxWidth + 1 - strVal.length(), true);
	1042	}
	1043	}
	1044	temp.append(valMeanMode);
	1045
	1046	for (int j = 0; j < m_NumClusters; j++) {
	1047	if (m_ClusterCentroids.attribute(i).isNominal()) {
	1048	if (m_ClusterCentroids.instance(j).isMissing(i)) {
	1049	valMeanMode = pad("missing", " ", maxWidth + 1 - "missing".length(), true);
	1050	} else {
	1051	valMeanMode =
	1052	pad((strVal = m_ClusterCentroids.attribute(i).value((int)m_ClusterCentroids.instance(j).value(i))),
	1053	" ", maxWidth + 1 - strVal.length(), true);
	1054	}
	1055	} else {
	1056	if (m_ClusterCentroids.instance(j).isMissing(i)) {
	1057	valMeanMode = pad("missing", " ", maxWidth + 1 - "missing".length(), true);
	1058	} else {
	1059	valMeanMode = pad((strVal = Utils.doubleToString(m_ClusterCentroids.instance(j).value(i),
	1060	maxWidth,4).trim()),
	1061	" ", maxWidth + 1 - strVal.length(), true);
	1062	}
	1063	}
	1064	temp.append(valMeanMode);
	1065	}
	1066	temp.append("\n");
	1067
	1068	if (m_displayStdDevs) {
	1069	// Std devs/max nominal
	1070	String stdDevVal = "";
	1071
	1072	if (m_ClusterCentroids.attribute(i).isNominal()) {
	1073	// Do the values of the nominal attribute
	1074	Attribute a = m_ClusterCentroids.attribute(i);
	1075	for (int j = 0; j < a.numValues(); j++) {
	1076	// full data
	1077	String val = " " + a.value(j);
	1078	temp.append(pad(val, " ", maxAttWidth + 1 - val.length(), false));
	1079	int count = m_FullNominalCounts[i][j];
	1080	int percent = (int)((double)m_FullNominalCounts[i][j] /
	1081	Utils.sum(m_ClusterSizes) * 100.0);
	1082	String percentS = "" + percent + "%)";
	1083	percentS = pad(percentS, " ", 5 - percentS.length(), true);
	1084	stdDevVal = "" + count + " (" + percentS;
	1085	stdDevVal =
	1086	pad(stdDevVal, " ", maxWidth + 1 - stdDevVal.length(), true);
	1087	temp.append(stdDevVal);
	1088
	1089	// Clusters
	1090	for (int k = 0; k < m_NumClusters; k++) {
	1091	count = m_ClusterNominalCounts[k][i][j];
	1092	percent = (int)((double)m_ClusterNominalCounts[k][i][j] /
	1093	m_ClusterSizes[k] * 100.0);
	1094	percentS = "" + percent + "%)";
	1095	percentS = pad(percentS, " ", 5 - percentS.length(), true);
	1096	stdDevVal = "" + count + " (" + percentS;
	1097	stdDevVal =
	1098	pad(stdDevVal, " ", maxWidth + 1 - stdDevVal.length(), true);
	1099	temp.append(stdDevVal);
	1100	}
	1101	temp.append("\n");
	1102	}
	1103	// missing (if any)
	1104	if (m_FullMissingCounts[i] > 0) {
	1105	// Full data
	1106	temp.append(pad(" missing", " ", maxAttWidth + 1 - " missing".length(), false));
	1107	int count = m_FullMissingCounts[i];
	1108	int percent = (int)((double)m_FullMissingCounts[i] /
	1109	Utils.sum(m_ClusterSizes) * 100.0);
	1110	String percentS = "" + percent + "%)";
	1111	percentS = pad(percentS, " ", 5 - percentS.length(), true);
	1112	stdDevVal = "" + count + " (" + percentS;
	1113	stdDevVal =
	1114	pad(stdDevVal, " ", maxWidth + 1 - stdDevVal.length(), true);
	1115	temp.append(stdDevVal);
	1116
	1117	// Clusters
	1118	for (int k = 0; k < m_NumClusters; k++) {
	1119	count = m_ClusterMissingCounts[k][i];
	1120	percent = (int)((double)m_ClusterMissingCounts[k][i] /
	1121	m_ClusterSizes[k] * 100.0);
	1122	percentS = "" + percent + "%)";
	1123	percentS = pad(percentS, " ", 5 - percentS.length(), true);
	1124	stdDevVal = "" + count + " (" + percentS;
	1125	stdDevVal =
	1126	pad(stdDevVal, " ", maxWidth + 1 - stdDevVal.length(), true);
	1127	temp.append(stdDevVal);
	1128	}
	1129
	1130	temp.append("\n");
	1131	}
	1132
	1133	temp.append("\n");
	1134	} else {
	1135	// Full data
	1136	if (Double.isNaN(m_FullMeansOrMediansOrModes[i])) {
	1137	stdDevVal = pad("--", " ", maxAttWidth + maxWidth + 1 - 2, true);
	1138	} else {
	1139	stdDevVal = pad((strVal = plusMinus
	1140	+ Utils.doubleToString(m_FullStdDevs[i],
	1141	maxWidth,4).trim()),
	1142	" ", maxWidth + maxAttWidth + 1 - strVal.length(), true);
	1143	}
	1144	temp.append(stdDevVal);
	1145
	1146	// Clusters
	1147	for (int j = 0; j < m_NumClusters; j++) {
	1148	if (m_ClusterCentroids.instance(j).isMissing(i)) {
	1149	stdDevVal = pad("--", " ", maxWidth + 1 - 2, true);
	1150	} else {
	1151	stdDevVal =
	1152	pad((strVal = plusMinus
	1153	+ Utils.doubleToString(m_ClusterStdDevs.instance(j).value(i),
	1154	maxWidth,4).trim()),
	1155	" ", maxWidth + 1 - strVal.length(), true);
	1156	}
	1157	temp.append(stdDevVal);
	1158	}
	1159	temp.append("\n\n");
	1160	}
	1161	}
	1162	}
	1163
	1164	temp.append("\n\n");
	1165	return temp.toString();
	1166	}
	1167
	1168	private String pad(String source, String padChar,
	1169	int length, boolean leftPad) {
	1170	StringBuffer temp = new StringBuffer();
	1171
	1172	if (leftPad) {
	1173	for (int i = 0; i< length; i++) {
	1174	temp.append(padChar);
	1175	}
	1176	temp.append(source);
	1177	} else {
	1178	temp.append(source);
	1179	for (int i = 0; i< length; i++) {
	1180	temp.append(padChar);
	1181	}
	1182	}
	1183	return temp.toString();
	1184	}
	1185
	1186	/**
	1187	* Gets the the cluster centroids
	1188	*
	1189	* @return the cluster centroids
	1190	*/
	1191	public Instances getClusterCentroids() {
	1192	return m_ClusterCentroids;
	1193	}
	1194
	1195	/**
	1196	* Gets the standard deviations of the numeric attributes in each cluster
	1197	*
	1198	* @return the standard deviations of the numeric attributes
	1199	* in each cluster
	1200	*/
	1201	public Instances getClusterStandardDevs() {
	1202	return m_ClusterStdDevs;
	1203	}
	1204
	1205	/**
	1206	* Returns for each cluster the frequency counts for the values of each
	1207	* nominal attribute
	1208	*
	1209	* @return the counts
	1210	*/
	1211	public int [][][] getClusterNominalCounts() {
	1212	return m_ClusterNominalCounts;
	1213	}
	1214
	1215	/**
	1216	* Gets the squared error for all clusters
	1217	*
	1218	* @return the squared error
	1219	*/
	1220	public double getSquaredError() {
	1221	return Utils.sum(m_squaredErrors);
	1222	}
	1223
	1224	/**
	1225	* Gets the number of instances in each cluster
	1226	*
	1227	* @return The number of instances in each cluster
	1228	*/
	1229	public int [] getClusterSizes() {
	1230	return m_ClusterSizes;
	1231	}
	1232
	1233	/**
	1234	* Gets the assignments for each instance
	1235	* @return Array of indexes of the centroid assigned to each instance
	1236	* @throws Exception if order of instances wasn't preserved or no assignments were made
	1237	*/
	1238	public int [] getAssignments() throws Exception{
	1239	if(!m_PreserveOrder){
	1240	throw new Exception("The assignments are only available when order of instances is preserved (-O)");
	1241	}
	1242	if(m_Assignments == null){
	1243	throw new Exception("No assignments made.");
	1244	}
	1245	return m_Assignments;
	1246	}
	1247
	1248	/**
	1249	* Returns the revision string.
	1250	*
	1251	* @return the revision
	1252	*/
	1253	public String getRevision() {
	1254	return RevisionUtils.extract("$Revision: 5987 $");
	1255	}
	1256
	1257	/**
	1258	* Main method for testing this class.
	1259	*
	1260	* @param argv should contain the following arguments: <p>
	1261	* -t training file [-N number of clusters]
	1262	*/
	1263	public static void main (String[] argv) {
	1264	runClusterer(new SimpleKMeans(), argv);
	1265	}
	1266	}
	1267

Note: See TracBrowser for help on using the repository browser.

Download in other formats: