Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

XMeans.java @ 38

Last change on this file since 38 was 29, checked in by gnappo, 14 years ago
Taggata versione per la demo e aggiunto branch.
File size: 69.5 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* XMeans.java
19	* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.clusterers;
24
25	import weka.core.AlgVector;
26	import weka.core.Capabilities;
27	import weka.core.DistanceFunction;
28	import weka.core.EuclideanDistance;
29	import weka.core.Instance;
30	import weka.core.Instances;
31	import weka.core.RevisionUtils;
32	import weka.core.neighboursearch.KDTree;
33	import weka.core.Option;
34	import weka.core.OptionHandler;
35	import weka.core.TechnicalInformation;
36	import weka.core.TechnicalInformationHandler;
37	import weka.core.Utils;
38	import weka.core.Capabilities.Capability;
39	import weka.core.TechnicalInformation.Field;
40	import weka.core.TechnicalInformation.Type;
41	import weka.filters.Filter;
42	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
43
44	import java.io.BufferedReader;
45	import java.io.File;
46	import java.io.FileOutputStream;
47	import java.io.FileReader;
48	import java.io.PrintWriter;
49	import java.io.Reader;
50	import java.util.Enumeration;
51	import java.util.Random;
52	import java.util.Vector;
53
54	/**
55	<!-- globalinfo-start -->
56	* Cluster data using the X-means algorithm.<br/>
57	* <br/>
58	* X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. The decision between the children of each center and itself is done comparing the BIC-values of the two structures.<br/>
59	* <br/>
60	* For more information see:<br/>
61	* <br/>
62	* Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth International Conference on Machine Learning, 727-734, 2000.
63	* <p/>
64	<!-- globalinfo-end -->
65	*
66	<!-- technical-bibtex-start -->
67	* BibTeX:
68	* <pre>
69	* @inproceedings{Pelleg2000,
70	* author = {Dan Pelleg and Andrew W. Moore},
71	* booktitle = {Seventeenth International Conference on Machine Learning},
72	* pages = {727-734},
73	* publisher = {Morgan Kaufmann},
74	* title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters},
75	* year = {2000}
76	* }
77	* </pre>
78	* <p/>
79	<!-- technical-bibtex-end -->
80	*
81	<!-- options-start -->
82	* Valid options are: <p/>
83	*
84	* <pre> -I <num>
85	* maximum number of overall iterations
86	* (default 1).</pre>
87	*
88	* <pre> -M <num>
89	* maximum number of iterations in the kMeans loop in
90	* the Improve-Parameter part
91	* (default 1000).</pre>
92	*
93	* <pre> -J <num>
94	* maximum number of iterations in the kMeans loop
95	* for the splitted centroids in the Improve-Structure part
96	* (default 1000).</pre>
97	*
98	* <pre> -L <num>
99	* minimum number of clusters
100	* (default 2).</pre>
101	*
102	* <pre> -H <num>
103	* maximum number of clusters
104	* (default 4).</pre>
105	*
106	* <pre> -B <value>
107	* distance value for binary attributes
108	* (default 1.0).</pre>
109	*
110	* <pre> -use-kdtree
111	* Uses the KDTree internally
112	* (default no).</pre>
113	*
114	* <pre> -K <KDTree class specification>
115	* Full class name of KDTree class to use, followed
116	* by scheme options.
117	* eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
118	* (default no KDTree class used).</pre>
119	*
120	* <pre> -C <value>
121	* cutoff factor, takes the given percentage of the splitted
122	* centroids if none of the children win
123	* (default 0.0).</pre>
124	*
125	* <pre> -D <distance function class specification>
126	* Full class name of Distance function class to use, followed
127	* by scheme options.
128	* (default weka.core.EuclideanDistance).</pre>
129	*
130	* <pre> -N <file name>
131	* file to read starting centers from (ARFF format).</pre>
132	*
133	* <pre> -O <file name>
134	* file to write centers to (ARFF format).</pre>
135	*
136	* <pre> -U <int>
137	* The debug level.
138	* (default 0)</pre>
139	*
140	* <pre> -Y <file name>
141	* The debug vectors file.</pre>
142	*
143	* <pre> -S <num>
144	* Random number seed.
145	* (default 10)</pre>
146	*
147	<!-- options-end -->
148	*
149	* @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
150	* @author Mark Hall (mhall@cs.waikato.ac.nz)
151	* @author Malcolm Ware (mfw4@cs.waikato.ac.nz)
152	* @version $Revision: 5488 $
153	* @see RandomizableClusterer
154	*/
155	public class XMeans
156	extends RandomizableClusterer
157	implements TechnicalInformationHandler {
158
159	/*
160	* major TODOS:
161	*
162	* make BIC-Score replaceable by other scores
163	*/
164
165	/** for serialization. */
166	private static final long serialVersionUID = -7941793078404132616L;
167
168	/** training instances. */
169	protected Instances m_Instances = null;
170
171	/** model information, should increase readability. */
172	protected Instances m_Model = null;
173
174	/** replace missing values in training instances. */
175	protected ReplaceMissingValues m_ReplaceMissingFilter;
176
177	/**
178	* Distance value between true and false of binary attributes and
179	* "same" and "different" of nominal attributes (default = 1.0).
180	*/
181	protected double m_BinValue = 1.0;
182
183	/** BIC-Score of the current model. */
184	protected double m_Bic = Double.MIN_VALUE;
185
186	/** Distortion. */
187	protected double[] m_Mle = null;
188
189	/** maximum overall iterations. */
190	protected int m_MaxIterations = 1;
191
192	/**
193	* maximum iterations to perform Kmeans part
194	* if negative, iterations are not checked.
195	*/
196	protected int m_MaxKMeans = 1000;
197
198	/** see above, but for kMeans of splitted clusters.
199	*/
200	protected int m_MaxKMeansForChildren = 1000;
201
202	/** The actual number of clusters. */
203	protected int m_NumClusters = 2;
204
205	/** min number of clusters to generate. */
206	protected int m_MinNumClusters = 2;
207
208	/** max number of clusters to generate. */
209	protected int m_MaxNumClusters = 4;
210
211	/** the distance function used. */
212	protected DistanceFunction m_DistanceF = new EuclideanDistance();
213
214	/** cluster centers. */
215	protected Instances m_ClusterCenters;
216
217	/** file name of the output file for the cluster centers. */
218	protected File m_InputCenterFile = new File(System.getProperty("user.dir"));
219
220	/* --> DebugVectors - USED FOR DEBUGGING */
221	/** input file for the random vectors --> USED FOR DEBUGGING. */
222	protected Reader m_DebugVectorsInput = null;
223	/** the index for the current debug vector. */
224	protected int m_DebugVectorsIndex = 0;
225	/** all the debug vectors. */
226	protected Instances m_DebugVectors = null;
227
228	/** file name of the input file for the random vectors. */
229	protected File m_DebugVectorsFile = new File(System.getProperty("user.dir"));
230
231	/** input file for the cluster centers. */
232	protected Reader m_CenterInput = null;
233
234	/** file name of the output file for the cluster centers. */
235	protected File m_OutputCenterFile = new File(System.getProperty("user.dir"));
236
237	/** output file for the cluster centers. */
238	protected PrintWriter m_CenterOutput = null;
239
240	/**
241	* temporary variable holding cluster assignments while iterating.
242	*/
243	protected int[] m_ClusterAssignments;
244
245	/** cutoff factor - percentage of splits done in Improve-Structure part
246	only relevant, if all children lost. */
247	protected double m_CutOffFactor = 0.5;
248
249	/** Index in ranges for LOW. */
250	public static int R_LOW = 0;
251	/** Index in ranges for HIGH. */
252	public static int R_HIGH = 1;
253	/** Index in ranges for WIDTH. */
254	public static int R_WIDTH = 2;
255
256	/**
257	* KDTrees class if KDTrees are used.
258	*/
259	protected KDTree m_KDTree = new KDTree();
260
261	/** whether to use the KDTree (the KDTree is only initialized to be
262	* configurable from the GUI). */
263	protected boolean m_UseKDTree = false;
264
265	/** counts iterations done in main loop. */
266	protected int m_IterationCount = 0;
267
268	/** counter to say how often kMeans was stopped by loop counter. */
269	protected int m_KMeansStopped = 0;
270
271	/** Number of splits prepared. */
272	protected int m_NumSplits = 0;
273
274	/** Number of splits accepted (including cutoff factor decisions). */
275	protected int m_NumSplitsDone = 0;
276
277	/** Number of splits accepted just because of cutoff factor. */
278	protected int m_NumSplitsStillDone = 0;
279
280	/**
281	* level of debug output, 0 is no output.
282	*/
283	protected int m_DebugLevel = 0;
284
285	/** print the centers. */
286	public static int D_PRINTCENTERS = 1;
287	/** follows the splitting of the centers. */
288	public static int D_FOLLOWSPLIT = 2;
289	/** have a closer look at converge children. */
290	public static int D_CONVCHCLOSER = 3;
291	/** check on random vectors. */
292	public static int D_RANDOMVECTOR = 4;
293	/** check on kdtree. */
294	public static int D_KDTREE = 5;
295	/** follow iterations. */
296	public static int D_ITERCOUNT = 6;
297	/** functions were maybe misused. */
298	public static int D_METH_MISUSE = 80;
299	/** for current debug. */
300	public static int D_CURR = 88;
301	/** general debugging. */
302	public static int D_GENERAL = 99;
303
304	/** Flag: I'm debugging. */
305	public boolean m_CurrDebugFlag = true;
306
307	/**
308	* the default constructor.
309	*/
310	public XMeans() {
311	super();
312
313	m_SeedDefault = 10;
314	setSeed(m_SeedDefault);
315	}
316
317	/**
318	* Returns a string describing this clusterer.
319	* @return a description of the evaluator suitable for
320	* displaying in the explorer/experimenter gui
321	*/
322	public String globalInfo() {
323	return
324	"Cluster data using the X-means algorithm.\n\n"
325	+ "X-Means is K-Means extended by an Improve-Structure part In this "
326	+ "part of the algorithm the centers are attempted to be split in "
327	+ "its region. The decision between the children of each center and "
328	+ "itself is done comparing the BIC-values of the two structures.\n\n"
329	+ "For more information see:\n\n"
330	+ getTechnicalInformation().toString();
331	}
332
333	/**
334	* Returns an instance of a TechnicalInformation object, containing
335	* detailed information about the technical background of this class,
336	* e.g., paper reference or book this class is based on.
337	*
338	* @return the technical information about this class
339	*/
340	public TechnicalInformation getTechnicalInformation() {
341	TechnicalInformation result;
342
343	result = new TechnicalInformation(Type.INPROCEEDINGS);
344	result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore");
345	result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters");
346	result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning");
347	result.setValue(Field.YEAR, "2000");
348	result.setValue(Field.PAGES, "727-734");
349	result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
350
351	return result;
352	}
353
354	/**
355	* Returns default capabilities of the clusterer.
356	*
357	* @return the capabilities of this clusterer
358	*/
359	public Capabilities getCapabilities() {
360	Capabilities result = super.getCapabilities();
361	result.disableAll();
362	result.enable(Capability.NO_CLASS);
363
364	// attributes
365	result.enable(Capability.NUMERIC_ATTRIBUTES);
366	result.enable(Capability.DATE_ATTRIBUTES);
367	result.enable(Capability.MISSING_VALUES);
368
369	return result;
370	}
371
372	/**
373	* Generates the X-Means clusterer.
374	*
375	* @param data set of instances serving as training data
376	* @throws Exception if the clusterer has not been
377	* generated successfully
378	*/
379	public void buildClusterer(Instances data) throws Exception {
380
381	// can clusterer handle the data?
382	getCapabilities().testWithFail(data);
383
384	if (m_MinNumClusters > m_MaxNumClusters) {
385	throw new Exception("XMeans: min number of clusters "
386	+ "can't be greater than max number of clusters!");
387	}
388
389	m_NumSplits = 0;
390	m_NumSplitsDone = 0;
391	m_NumSplitsStillDone = 0;
392
393	// replace missing values
394	m_ReplaceMissingFilter = new ReplaceMissingValues();
395	m_ReplaceMissingFilter.setInputFormat(data);
396	m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter);
397
398	// initialize random function
399	Random random0 = new Random(m_Seed);
400
401	// num of clusters to start with
402	m_NumClusters = m_MinNumClusters;
403
404	// set distance function to default
405	if (m_DistanceF == null) {
406	m_DistanceF = new EuclideanDistance();
407	}
408
409	m_DistanceF.setInstances(m_Instances);
410	checkInstances();
411
412	if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile())
413	initDebugVectorsInput();
414
415	// make list of indexes for m_Instances
416	int[] allInstList = new int[m_Instances.numInstances()];
417	for (int i = 0; i < m_Instances.numInstances(); i++) {
418	allInstList[i] = i;
419	}
420
421	// set model used (just for convenience)
422	m_Model = new Instances(m_Instances, 0);
423
424	// produce the starting centers
425	if (m_CenterInput != null) {
426	// read centers from file
427	m_ClusterCenters = new Instances(m_CenterInput);
428	m_NumClusters = m_ClusterCenters.numInstances();
429	}
430	else
431	// makes the first centers randomly
432	m_ClusterCenters = makeCentersRandomly(random0,
433	m_Instances, m_NumClusters);
434	PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");
435	for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {
436	PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));
437	}
438
439	PrCentersFD(D_PRINTCENTERS);
440
441	boolean finished = false;
442	Instances children;
443
444	// builds up a KDTree
445	if (m_UseKDTree)
446	m_KDTree.setInstances(m_Instances);
447
448	// loop counter of main loop
449	m_IterationCount = 0;
450
451	/**
452	* "finished" does get true as soon as:
453	* 1. number of clusters gets >= m_MaxClusters,
454	* 2. in the last round, none of the centers have been split
455	*
456	* if number of clusters is already >= m_MaxClusters
457	* part 1 (= Improve-Params) is done at least once.
458	*/
459	while (!finished &&
460	!stopIteration(m_IterationCount, m_MaxIterations)) {
461
462	/* ====================================================================
463	* 1. Improve-Params
464	* conventional K-means
465	*/
466
467
468	PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:");
469	PrCentersFD(D_FOLLOWSPLIT);
470
471	PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount +
472	". time");
473	m_IterationCount++;
474
475	// prepare to converge
476	boolean converged = false;
477
478	// initialize assignments to -1
479	m_ClusterAssignments = initAssignments(m_Instances.numInstances());
480	// stores a list of indexes of instances belonging to each center
481	int[][] instOfCent = new int[m_ClusterCenters.numInstances()][];
482
483	// KMeans loop counter
484	int kMeansIteration = 0;
485
486	// converge in conventional K-means ----------------------------------
487	PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:");
488	while (!converged &&
489	!stopKMeansIteration(kMeansIteration, m_MaxKMeans)) {
490
491	kMeansIteration++;
492	converged = true;
493
494	// assign instances to centers -------------------------------------
495	converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
496	m_ClusterCenters,
497	instOfCent,
498	allInstList,
499	m_ClusterAssignments,
500	kMeansIteration);
501
502	PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:");
503	PrCentersFD(D_FOLLOWSPLIT);
504	// compute new centers = centers of mass of points
505	converged = recomputeCenters(m_ClusterCenters, // clusters
506	instOfCent, // their instances
507	m_Model); // model information
508	PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:");
509	PrCentersFD(D_FOLLOWSPLIT);
510	}
511	PFD(D_FOLLOWSPLIT, "");
512	PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means");
513
514	/** =====================================================================
515	* 2. Improve-Structur
516	*/
517
518	// BIC before split distortioning the centres
519	m_Mle = distortion(instOfCent, m_ClusterCenters);
520	m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle);
521	PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic);
522
523	int currNumCent = m_ClusterCenters.numInstances();
524	Instances splitCenters = new Instances(m_ClusterCenters,
525	currNumCent * 2);
526
527	// store BIC values of parent and children
528	double[] pbic = new double [currNumCent];
529	double[] cbic = new double [currNumCent];
530
531	// split each center
532	for (int i = 0; i < currNumCent
533	// this could help to optimize the algorithm
534	// && currNumCent + numSplits <= m_MaxNumClusters
535	;
536	i++) {
537
538	PFD(D_FOLLOWSPLIT, "\nsplit center " + i +
539	" " + m_ClusterCenters.instance(i));
540	Instance currCenter = m_ClusterCenters.instance(i);
541	int[] currInstList = instOfCent[i];
542	int currNumInst = instOfCent[i].length;
543
544	// not enough instances; than continue with next
545	if (currNumInst <= 2) {
546	pbic[i] = Double.MAX_VALUE;
547	cbic[i] = 0.0;
548	// add center itself as dummy
549	splitCenters.add(currCenter);
550	splitCenters.add(currCenter);
551	continue;
552	}
553
554	// split centers ----------------------------------------------
555	double variance = m_Mle[i] / (double)currNumInst;
556	children = splitCenter(random0, currCenter, variance, m_Model);
557
558	// initialize assignments to -1
559	int[] oneCentAssignments = initAssignments(currNumInst);
560	int[][] instOfChCent = new int [2][]; // todo maybe split didn't work
561
562	// converge the children --------------------------------------
563	converged = false;
564	int kMeansForChildrenIteration = 0;
565	PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i);
566	while (!converged &&
567	!stopKMeansIteration(kMeansForChildrenIteration,
568	m_MaxKMeansForChildren)) {
569	kMeansForChildrenIteration++;
570
571	converged =
572	assignToCenters(children, instOfChCent,
573	currInstList, oneCentAssignments);
574
575	if (!converged) {
576	recomputeCentersFast(children, instOfChCent, m_Model);
577	}
578	}
579
580	// store new centers for later decision if they are taken
581	splitCenters.add(children.instance(0));
582	splitCenters.add(children.instance(1));
583
584	PFD(D_FOLLOWSPLIT, "\nconverged cildren ");
585	PFD(D_FOLLOWSPLIT, " " + children.instance(0));
586	PFD(D_FOLLOWSPLIT, " " + children.instance(1));
587
588	// compare parent and children model by their BIC-value
589	pbic[i] = calculateBIC(currInstList, currCenter, m_Mle[i], m_Model);
590	double[] chMLE = distortion(instOfChCent, children);
591	cbic[i] = calculateBIC(instOfChCent, children, chMLE);
592
593	} // end of loop over clusters
594
595	// decide which one to split and make new list of cluster centers
596	Instances newClusterCenters = null;
597	newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor,
598	splitCenters);
599	/**
600	* Compare with before Improve-Structure
601	*/
602	int newNumClusters = newClusterCenters.numInstances();
603	if (newNumClusters != m_NumClusters) {
604
605	PFD(D_FOLLOWSPLIT, "Compare with non-split");
606
607	// initialize assignments to -1
608	int[] newClusterAssignments =
609	initAssignments(m_Instances.numInstances());
610
611	// stores a list of indexes of instances belonging to each center
612	int[][] newInstOfCent = new int[newClusterCenters.numInstances()][];
613
614	// assign instances to centers -------------------------------------
615	converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
616	newClusterCenters,
617	newInstOfCent,
618	allInstList,
619	newClusterAssignments,
620	m_IterationCount);
621
622	double[] newMle = distortion(newInstOfCent, newClusterCenters);
623	double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle);
624	PFD(D_FOLLOWSPLIT, "newBic " + newBic);
625	if (newBic > m_Bic) {
626	PFD(D_FOLLOWSPLIT, "*** decide for new clusters");
627	m_Bic = newBic;
628	m_ClusterCenters = newClusterCenters;
629	m_ClusterAssignments = newClusterAssignments;
630	} else {
631	PFD(D_FOLLOWSPLIT, "*** keep old clusters");
632	}
633	}
634
635	newNumClusters = m_ClusterCenters.numInstances();
636	// decide if finished: max num cluster reached
637	// or last centers where not split at all
638	if ((newNumClusters >= m_MaxNumClusters)
639	\|\| (newNumClusters == m_NumClusters)) {
640	finished = true;
641	}
642	m_NumClusters = newNumClusters;
643	}
644	}
645
646	/**
647	* Checks for nominal attributes in the dataset.
648	* Class attribute is ignored.
649	* @param data the data to check
650	* @return false if no nominal attributes are present
651	*/
652	public boolean checkForNominalAttributes(Instances data) {
653
654	int i = 0;
655	while (i < data.numAttributes()) {
656	if ((i != data.classIndex()) && data.attribute(i++).isNominal()) {
657	return true;
658	}
659	}
660	return false;
661	}
662
663	/**
664	* Set array of int, used to store assignments, to -1.
665	* @param ass integer array used for storing assignments
666	* @return integer array used for storing assignments
667	*/
668	protected int[] initAssignments(int[] ass) {
669	for (int i = 0; i < ass.length; i++)
670	ass[i] = -1;
671	return ass;
672	}
673
674	/**
675	* Creates and initializes integer array, used to store assignments.
676	* @param numInstances length of array used for assignments
677	* @return integer array used for storing assignments
678	*/
679	protected int[] initAssignments(int numInstances) {
680	int[] ass = new int[numInstances];
681	for (int i = 0; i < numInstances; i++)
682	ass[i] = -1;
683	return ass;
684	}
685
686	/**
687	* Creates and initializes boolean array.
688	* @param len length of new array
689	* @return the new array
690	*/
691	boolean[] initBoolArray(int len) {
692	boolean[] boolArray = new boolean [len];
693	for (int i = 0; i < len; i++) {
694	boolArray[i] = false;
695	}
696	return boolArray;
697	}
698
699	/**
700	* Returns new center list.
701	*
702	* The following steps 1. and 2. both take care that the number of centers
703	* does not exceed maxCenters.
704	*
705	* 1. Compare BIC values of parent and children and takes the one as
706	* new centers which do win (= BIC-value is smaller).
707	*
708	* 2. If in 1. none of the children are chosen
709	* && and cutoff factor is > 0
710	* cutoff factor is taken as the percentage of "best" centers that are
711	* still taken.
712	* @param pbic array of parents BIC-values
713	* @param cbic array of childrens BIC-values
714	* @param cutoffFactor cutoff factor
715	* @param splitCenters all children
716	* @return the new centers
717	*/
718	protected Instances newCentersAfterSplit(double[] pbic,
719	double[] cbic,
720	double cutoffFactor,
721	Instances splitCenters) {
722
723	// store if split won
724	boolean splitPerCutoff = false;
725	boolean takeSomeAway = false;
726	boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances());
727	int numToSplit = 0;
728	Instances newCenters = null;
729
730	// how many would be split, because the children have a better bic value
731	for (int i = 0; i < cbic.length; i++) {
732	if (cbic[i] > pbic[i]) {
733	// decide for splitting ----------------------------------------
734	splitWon[i] = true; numToSplit++;
735	PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children");
736	}
737	else {
738	// decide for parents and finished stays true -----------------
739	PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent");
740	}
741	}
742
743	// no splits yet so split per cutoff factor
744	if ((numToSplit == 0) && (cutoffFactor > 0)) {
745	splitPerCutoff = true;
746
747	// how many to split per cutoff factor
748	numToSplit = (int)
749	((double) m_ClusterCenters.numInstances() * m_CutOffFactor);
750	}
751
752	// prepare indexes of values in ascending order
753	double[] diff = new double [m_NumClusters];
754	for (int j = 0; j < diff.length; j++) {
755	diff[j] = pbic[j] - cbic[j];
756	}
757	int[] sortOrder = Utils.sort(diff);
758
759	// check if maxNumClusters would be exceeded
760	int possibleToSplit = m_MaxNumClusters - m_NumClusters;
761
762	if (possibleToSplit > numToSplit) {
763	// still enough possible, do the whole amount
764	possibleToSplit = numToSplit;
765	}
766	else
767	takeSomeAway = true;
768
769	// prepare for splitting the one that are supposed to be split
770	if (splitPerCutoff) {
771	for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0);
772	j++) {
773	splitWon[sortOrder[j]] = true;
774	}
775	m_NumSplitsStillDone += possibleToSplit;
776	}
777	else {
778	// take some splits away if max number of clusters would be exceeded
779	if (takeSomeAway) {
780	int count = 0;
781	int j = 0;
782	for (;j < splitWon.length && count < possibleToSplit; j++){
783	if (splitWon[sortOrder[j]] == true) count++;
784	}
785
786	while (j < splitWon.length) {
787	splitWon[sortOrder[j]] = false;
788	j++;
789	}
790	}
791	}
792
793	// finally split
794	if (possibleToSplit > 0)
795	newCenters = newCentersAfterSplit(splitWon, splitCenters);
796	else
797	newCenters = m_ClusterCenters;
798	return newCenters;
799	}
800
801	/**
802	* Returns new centers. Depending on splitWon: if true takes children, if
803	* false takes parent = current center.
804	*
805	* @param splitWon
806	* array of boolean to indicate to take split or not
807	* @param splitCenters
808	* list of splitted centers
809	* @return the new centers
810	*/
811	protected Instances newCentersAfterSplit(boolean[] splitWon,
812	Instances splitCenters) {
813	Instances newCenters = new Instances(splitCenters, 0);
814
815	int sIndex = 0;
816	for (int i = 0; i < splitWon.length; i++) {
817	if (splitWon[i]) {
818	m_NumSplitsDone++;
819	newCenters.add(splitCenters.instance(sIndex++));
820	newCenters.add(splitCenters.instance(sIndex++));
821	} else {
822	sIndex++;
823	sIndex++;
824	newCenters.add(m_ClusterCenters.instance(i));
825	}
826	}
827	return newCenters;
828	}
829
830	/**
831	* Controls that counter does not exceed max iteration value. Special function
832	* for kmeans iterations.
833	*
834	* @param iterationCount
835	* current value of counter
836	* @param max
837	* maximum value for counter
838	* @return true if iteration should be stopped
839	*/
840	protected boolean stopKMeansIteration(int iterationCount, int max) {
841	boolean stopIterate = false;
842	if (max >= 0)
843	stopIterate = (iterationCount >= max);
844	if (stopIterate)
845	m_KMeansStopped++;
846	return stopIterate;
847	}
848
849	/**
850	* Checks if iterationCount has to be checked and if yes
851	* (this means max is > 0) compares it with max.
852	*
853	* @param iterationCount the current iteration count
854	* @param max the maximum number of iterations
855	* @return true if maximum has been reached
856	*/
857	protected boolean stopIteration(int iterationCount, int max) {
858	boolean stopIterate = false;
859	if (max >= 0)
860	stopIterate = (iterationCount >= max);
861	return stopIterate;
862	}
863
864	/**
865	* Recompute the new centers. New cluster center is center of mass of its
866	* instances. Returns true if cluster stays the same.
867	* @param centers the input and output centers
868	* @param instOfCent the instances to the centers
869	* @param model data model information
870	* @return true if converged.
871	*/
872	protected boolean recomputeCenters(Instances centers,
873	int[][] instOfCent,
874	Instances model) {
875	boolean converged = true;
876
877	for (int i = 0; i < centers.numInstances(); i++) {
878	double val;
879	for (int j = 0; j < model.numAttributes(); j++) {
880	val = meanOrMode(m_Instances, instOfCent[i], j);
881
882	for (int k = 0; k < instOfCent[i].length; k++)
883
884	if (converged && m_ClusterCenters.instance(i).value(j) != val)
885	converged = false;
886	if (!converged)
887	m_ClusterCenters.instance(i).setValue(j, val);
888	}
889	}
890	return converged;
891	}
892
893	/**
894	* Recompute the new centers - 2nd version
895	* Same as recomputeCenters, but does not check if center stays the same.
896	*
897	* @param centers the input center and output centers
898	* @param instOfCentIndexes the indexes of the instances to the centers
899	* @param model data model information
900	*/
901	protected void recomputeCentersFast(Instances centers,
902	int[][] instOfCentIndexes,
903	Instances model
904	) {
905	for (int i = 0; i < centers.numInstances(); i++) {
906	double val;
907	for (int j = 0; j < model.numAttributes(); j++) {
908	val = meanOrMode(m_Instances, instOfCentIndexes[i], j);
909	centers.instance(i).setValue(j, val);
910	}
911	}
912	}
913
914	/**
915	* Computes Mean Or Mode of one attribute on a subset of m_Instances.
916	* The subset is defined by an index list.
917	* @param instances all instances
918	* @param instList the indexes of the instances the mean is computed from
919	* @param attIndex the index of the attribute
920	* @return mean value
921	*/
922	protected double meanOrMode(Instances instances,
923	int[] instList,
924	int attIndex) {
925	double result, found;
926	int[] counts;
927	int numInst = instList.length;
928
929	if (instances.attribute(attIndex).isNumeric()) {
930	result = found = 0;
931	for (int j = 0; j < numInst; j++) {
932	Instance currInst = instances.instance(instList[j]);
933	if (!currInst.isMissing(attIndex)) {
934	found += currInst.weight();
935	result += currInst.weight() *
936	currInst.value(attIndex);
937	}
938	}
939	if (Utils.eq(found, 0)) {
940	return 0;
941	} else {
942	return result / found;
943	}
944	} else if (instances.attribute(attIndex).isNominal()) {
945	counts = new int[instances.attribute(attIndex).numValues()];
946	for (int j = 0; j < numInst; j++) {
947	Instance currInst = instances.instance(instList[j]);
948	if (!currInst.isMissing(attIndex)) {
949	counts[(int) currInst.value(attIndex)] += currInst.weight();
950	}
951	}
952	return (double)Utils.maxIndex(counts);
953	} else {
954	return 0;
955	}
956	}
957
958
959	/**
960	* Assigns instances to centers.
961	*
962	* @param tree KDTree on all instances
963	* @param centers all the input centers
964	* @param instOfCent the instances to each center
965	* @param allInstList list of all instances
966	* @param assignments assignments of instances to centers
967	* @param iterationCount the number of iteration
968	* @return true if converged
969	* @throws Exception is something goes wrong
970	*/
971	protected boolean assignToCenters(KDTree tree,
972	Instances centers,
973	int[][] instOfCent,
974	int[] allInstList,
975	int[] assignments,
976	int iterationCount) throws Exception {
977
978	boolean converged = true;
979	if (tree != null) {
980	// using KDTree structure for assigning
981	converged = assignToCenters(tree,
982	centers,
983	instOfCent,
984	assignments,
985	iterationCount);
986	} else {
987	converged = assignToCenters(centers,
988	instOfCent,
989	allInstList,
990	assignments);
991	}
992	return converged;
993	}
994
995	/**
996	* Assign instances to centers using KDtree.
997	* First part of conventionell K-Means, returns true if new assignment
998	* is the same as the last one.
999	*
1000	* @param kdtree KDTree on all instances
1001	* @param centers all the input centers
1002	* @param instOfCent the instances to each center
1003	* @param assignments assignments of instances to centers
1004	* @param iterationCount the number of iteration
1005	* @return true if converged
1006	* @throws Exception in case instances are not assigned to cluster
1007	*/
1008	protected boolean assignToCenters(KDTree kdtree,
1009	Instances centers,
1010	int[][] instOfCent,
1011	int[] assignments,
1012	int iterationCount) throws Exception {
1013
1014	int numCent = centers.numInstances();
1015	int numInst = m_Instances.numInstances();
1016	int[] oldAssignments = new int[numInst];
1017
1018	// WARNING: assignments is "input/output-parameter"
1019	// should not be null
1020	if (assignments == null) {
1021	assignments = new int[numInst];
1022	for (int i = 0; i < numInst; i++) {
1023	assignments[0] = -1;
1024	}
1025	}
1026
1027	// WARNING: instOfCent is "input/output-parameter"
1028	// should not be null
1029	if (instOfCent == null) {
1030	instOfCent = new int [numCent][];
1031	}
1032
1033	// save old assignments
1034	for (int i = 0; i < assignments.length; i++) {
1035	oldAssignments[i] = assignments[i];
1036	}
1037
1038	// use tree to get new assignments
1039	kdtree.centerInstances(centers, assignments,
1040	Math.pow(.8, iterationCount));
1041	boolean converged = true;
1042
1043	// compare with previous assignment
1044	for (int i = 0; converged && (i < assignments.length); i++) {
1045	converged = (oldAssignments[i] == assignments[i]);
1046	if (assignments[i] == -1)
1047	throw new Exception("Instance " + i +
1048	" has not been assigned to cluster.");
1049	}
1050
1051	if (!converged) {
1052	int[] numInstOfCent = new int[numCent];
1053	for (int i = 0; i < numCent; i++)
1054	numInstOfCent[i] = 0;
1055
1056	// count num of assignments per center
1057	for (int i = 0; i < numInst; i++)
1058	numInstOfCent[assignments[i]]++;
1059
1060	// prepare instancelists per center
1061	for (int i = 0; i < numCent; i++){
1062	instOfCent[i] = new int[numInstOfCent[i]];
1063	}
1064	// write instance lists per center
1065	for (int i = 0; i < numCent; i++) {
1066	int index = -1;
1067	for (int j = 0; j < numInstOfCent[i]; j++) {
1068	index = nextAssignedOne(i, index, assignments);
1069	instOfCent[i][j] = index;
1070	}
1071	}
1072	}
1073
1074	return converged;
1075	}
1076
1077	/**
1078	* Assign instances to centers.
1079	* Part of conventionell K-Means, returns true if new assignment
1080	* is the same as the last one.
1081	*
1082	* @param centers all the input centers
1083	* @param instOfCent the instances to each center
1084	* @param allInstList list of all indexes
1085	* @param assignments assignments of instances to centers
1086	* @return true if converged
1087	* @throws Exception if something goes wrong
1088	*/
1089	protected boolean assignToCenters(Instances centers,
1090	int[][] instOfCent,
1091	int[] allInstList,
1092	int[] assignments)
1093	throws Exception {
1094
1095	// todo: undecided situations
1096	boolean converged = true; // true if new assignment is the same
1097	// as the old one
1098
1099	int numInst = allInstList.length;
1100	int numCent = centers.numInstances();
1101	int[] numInstOfCent = new int [numCent];
1102	for (int i = 0; i < numCent; i++) numInstOfCent[i] = 0;
1103
1104	// WARNING: assignments is "input/output-parameter"
1105	// should not be null
1106	if (assignments == null) {
1107	assignments = new int[numInst];
1108	for (int i = 0; i < numInst; i++) {
1109	assignments[i] = -1;
1110	}
1111	}
1112
1113	// WARNING: instOfCent is "input/output-parameter"
1114	// should not be null
1115	if (instOfCent == null) {
1116	instOfCent = new int [numCent][];
1117	}
1118
1119	// set assignments
1120	for (int i = 0; i < numInst; i++) {
1121	Instance inst = m_Instances.instance(allInstList[i]);
1122	int newC = clusterProcessedInstance(inst, centers);
1123
1124	if (converged && newC != assignments[i]) {
1125	converged = false;
1126	}
1127
1128	numInstOfCent[newC]++;
1129	if (!converged)
1130	assignments[i] = newC;
1131	}
1132
1133	// the following is only done
1134	// if assignments are not the same, because too much effort
1135	if (!converged) {
1136	PFD(D_FOLLOWSPLIT, "assignToCenters -> it has NOT converged");
1137	for (int i = 0; i < numCent; i++) {
1138	instOfCent[i] = new int [numInstOfCent[i]];
1139	}
1140
1141	for (int i = 0; i < numCent; i++) {
1142	int index = -1;
1143	for (int j = 0; j < numInstOfCent[i]; j++) {
1144	index = nextAssignedOne(i, index, assignments);
1145	instOfCent[i][j] = allInstList[index];
1146	}
1147	}
1148	}
1149	else
1150	PFD(D_FOLLOWSPLIT, "assignToCenters -> it has converged");
1151
1152	return converged;
1153	}
1154
1155	/**
1156	* Searches along the assignment array for the next entry of the center
1157	* in question.
1158	* @param cent index of the center
1159	* @param lastIndex index to start searching
1160	* @param assignments assignments
1161	* @return index of the instance the center cent is assigned to
1162	*/
1163	protected int nextAssignedOne(int cent, int lastIndex,
1164	int[] assignments) {
1165	int len = assignments.length;
1166	int index = lastIndex + 1;
1167	while (index < len) {
1168	if (assignments[index] == cent) {
1169	return (index);
1170	}
1171	index++;
1172	}
1173	return (-1);
1174	}
1175
1176	/**
1177	* Split centers in their region. Generates random vector of
1178	* length = variance and
1179	* adds and substractsx to cluster vector to get two new clusters.
1180	*
1181	* @param random random function
1182	* @param center the center that is split here
1183	* @param variance variance of the cluster
1184	* @param model data model valid
1185	* @return a pair of new centers
1186	* @throws Exception something in AlgVector goes wrong
1187	*/
1188	protected Instances splitCenter(Random random,
1189	Instance center,
1190	double variance,
1191	Instances model) throws Exception {
1192	m_NumSplits++;
1193	AlgVector r = null;
1194	Instances children = new Instances(model, 2);
1195
1196	if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) {
1197	Instance nextVector = getNextDebugVectorsInstance(model);
1198	PFD(D_RANDOMVECTOR, "Random Vector from File " + nextVector);
1199	r = new AlgVector(nextVector);
1200	}
1201	else {
1202	// random vector of length = variance
1203	r = new AlgVector(model, random);
1204	}
1205	r.changeLength(Math.pow(variance, 0.5));
1206	PFD(D_RANDOMVECTOR, "random vector *variance "+ r);
1207
1208	// add random vector to center
1209	AlgVector c = new AlgVector(center);
1210	AlgVector c2 = (AlgVector) c.clone();
1211	c = c.add(r);
1212	Instance newCenter = c.getAsInstance(model, random);
1213	children.add(newCenter);
1214	PFD(D_FOLLOWSPLIT, "first child "+ newCenter);
1215
1216	// substract random vector to center
1217	c2 = c2.substract(r);
1218	newCenter = c2.getAsInstance(model, random);
1219	children.add(newCenter);
1220	PFD(D_FOLLOWSPLIT, "second child "+ newCenter);
1221
1222	return children;
1223	}
1224
1225	/**
1226	* Split centers in their region.
1227	* (Alternative version of splitCenter())
1228	*
1229	* @param random the random number generator
1230	* @param instances of the region
1231	* @param model the model for the centers
1232	* (should be the same as that of instances)
1233	* @return a pair of new centers
1234	*/
1235	protected Instances splitCenters(Random random,
1236	Instances instances,
1237	Instances model) {
1238	Instances children = new Instances(model, 2);
1239	int instIndex = Math.abs(random.nextInt()) % instances.numInstances();
1240	children.add(instances.instance(instIndex));
1241	int instIndex2 = instIndex;
1242	int count = 0;
1243	while ((instIndex2 == instIndex) && count < 10) {
1244	count++;
1245	instIndex2 = Math.abs(random.nextInt()) % instances.numInstances();
1246	}
1247	children.add(instances.instance(instIndex2));
1248
1249	return children;
1250	}
1251
1252	/**
1253	* Generates new centers randomly. Used for starting centers.
1254	*
1255	* @param random0 random number generator
1256	* @param model data model of the instances
1257	* @param numClusters number of clusters
1258	* @return new centers
1259	*/
1260	protected Instances makeCentersRandomly(Random random0,
1261	Instances model,
1262	int numClusters) {
1263	Instances clusterCenters = new Instances(model, numClusters);
1264	m_NumClusters = numClusters;
1265
1266	// makes the new centers randomly
1267	for (int i = 0; i < numClusters; i++) {
1268	int instIndex = Math.abs(random0.nextInt()) % m_Instances.numInstances();
1269	clusterCenters.add(m_Instances.instance(instIndex));
1270	}
1271	return clusterCenters;
1272	}
1273
1274	/**
1275	* Returns the BIC-value for the given center and instances.
1276	* @param instList The indices of the instances that belong to the center
1277	* @param center the center.
1278	* @param mle maximum likelihood
1279	* @param model the data model
1280	* @return the BIC value
1281	*/
1282	protected double calculateBIC(int[] instList, Instance center,
1283	double mle, Instances model) {
1284	int[][] w1 = new int[1][instList.length];
1285	for (int i = 0; i < instList.length; i++) {
1286	w1[0][i] = instList[i];
1287	}
1288	double[] m = {mle};
1289	Instances w2 = new Instances(model, 1);
1290	w2.add(center);
1291	return calculateBIC(w1, w2, m);
1292	}
1293
1294	/**
1295	* Calculates the BIC for the given set of centers and instances.
1296	* @param instOfCent The instances that belong to their respective centers
1297	* @param centers the centers
1298	* @param mle maximum likelihood
1299	* @return The BIC for the input.
1300	*/
1301	protected double calculateBIC(int[][] instOfCent, Instances centers,
1302	double[] mle) {
1303	double loglike = 0.0;
1304	int numInstTotal = 0;
1305	int numCenters = centers.numInstances();
1306	int numDimensions = centers.numAttributes();
1307	int numParameters = (numCenters - 1) + //probabilities
1308	numCenters * numDimensions + //means
1309	numCenters; // variance params
1310	for (int i = 0; i < centers.numInstances(); i++) {
1311	loglike += logLikelihoodEstimate(instOfCent[i].length, centers.instance(i),
1312	mle[i], centers.numInstances() * 2);
1313	numInstTotal += instOfCent[i].length;
1314	}
1315	/* diff
1316	thats how we did it
1317	loglike -= ((centers.numAttributes() + 1.0) * centers.numInstances() * 1)
1318	* Math.log(count);
1319	*/
1320	loglike -= numInstTotal * Math.log(numInstTotal);
1321	//System.out.println ("numInstTotal " + numInstTotal +
1322	// "calculateBIC res " + loglike);
1323	loglike -= (numParameters / 2.0) * Math.log(numInstTotal);
1324	//System.out.println ("numParam " +
1325	// + numParameters +
1326	// " calculateBIC res " + loglike);
1327	return loglike;
1328	}
1329
1330	/**
1331	* Calculates the log-likelihood of the data for the given model, taken
1332	* at the maximum likelihood point.
1333	*
1334	* @param numInst number of instances that belong to the center
1335	* @param center the center
1336	* @param distortion distortion
1337	* @param numCent number of centers
1338	* @return the likelihood estimate
1339	*/
1340	protected double logLikelihoodEstimate(int numInst,
1341	Instance center,
1342	double distortion,
1343	int numCent) {
1344	// R(n) num of instances of the center -> numInst
1345	// K num of centers -> not used
1346	//
1347	//todo take the diff comments away
1348	double loglike = 0;
1349	/* if is new */
1350	if (numInst > 1) {
1351	/* diff variance is new */
1352	//
1353	// distortion = Sum over instances x of the center(x-center)
1354	// different to paper; sum should be squared
1355	//
1356	// (Sum of distances to center) / R(n) - 1.0
1357	// different to paper; should be R(n)-K
1358	double variance = distortion / (numInst - 1.0);
1359
1360	//
1361	// -R(n)/2 * log(pi*2)
1362	//
1363	double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0);
1364	/* diff
1365	thats how we had it
1366	double p2 = -((ni * center.numAttributes()) / 2) * distortion;
1367	*/
1368	//
1369	// -(R(n)M)/2 log(variance)
1370	//
1371	double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance);
1372
1373	/* diff
1374	thats how we had it, the difference is a bug in x-means
1375	double p3 = - (numInst - numCent) / 2;
1376	*/
1377	//
1378	// -(R(n)-1)/2
1379	//
1380	double p3 = - (numInst - 1.0) / 2.0;
1381
1382	//
1383	// R(n)*log(R(n))
1384	//
1385	double p4 = numInst * Math.log(numInst);
1386
1387	/* diff x-means doesn't have this part
1388	double p5 = - numInst * Math.log(numInstTotal);
1389	*/
1390
1391	/*
1392	loglike = -(ni / 2) * Math.log(Math.PI * 2)
1393	- (ni * center.numAttributes()) / 2.0) * logdistortion
1394	- (ni - k) / 2.0
1395	+ ni * Math.log(ni)
1396	- ni * Math.log(r);
1397	*/
1398	loglike = p1 + p2 + p3 + p4; // diff + p5;
1399	//the log(r) is something that can be reused.
1400	//as is the log(2 PI), these could provide extra speed up later on.
1401	//since distortion is so expensive to compute, I only do that once.
1402	}
1403	return loglike;
1404	}
1405
1406	/**
1407	* Calculates the maximum likelihood estimate for the variance.
1408	* @param instOfCent indices of instances to each center
1409	* @param centers the centers
1410	* @return the list of distortions distortion.
1411	*/
1412	protected double[] distortion(int[][] instOfCent, Instances centers) {
1413	double[] distortion = new double[centers.numInstances()];
1414	for (int i = 0; i < centers.numInstances(); i++) {
1415	distortion[i] = 0.0;
1416	for (int j = 0; j < instOfCent[i].length; j++) {
1417	distortion[i] += m_DistanceF.distance(m_Instances
1418	.instance(instOfCent[i][j]), centers.instance(i));
1419	}
1420	}
1421	/*
1422	* diff not done in x-means res *= 1.0 / (count - centers.numInstances());
1423	*/
1424	return distortion;
1425	}
1426
1427	/**
1428	* Clusters an instance.
1429	*
1430	* @param instance
1431	* the instance to assign a cluster to.
1432	* @param centers
1433	* the centers to cluster the instance to.
1434	* @return a cluster index.
1435	*/
1436	protected int clusterProcessedInstance(Instance instance, Instances centers) {
1437
1438	double minDist = Integer.MAX_VALUE;
1439	int bestCluster = 0;
1440	for (int i = 0; i < centers.numInstances(); i++) {
1441	double dist = m_DistanceF.distance(instance, centers.instance(i));
1442
1443	if (dist < minDist) {
1444	minDist = dist;
1445	bestCluster = i;
1446	}
1447	}
1448	;
1449	return bestCluster;
1450	}
1451
1452	/**
1453	* Clusters an instance that has been through the filters.
1454	*
1455	* @param instance
1456	* the instance to assign a cluster to
1457	* @return a cluster number
1458	*/
1459	protected int clusterProcessedInstance(Instance instance) {
1460	double minDist = Integer.MAX_VALUE;
1461	int bestCluster = 0;
1462	for (int i = 0; i < m_NumClusters; i++) {
1463	double dist = m_DistanceF
1464	.distance(instance, m_ClusterCenters.instance(i));
1465	if (dist < minDist) {
1466	minDist = dist;
1467	bestCluster = i;
1468	}
1469	}
1470	return bestCluster;
1471	}
1472
1473	/**
1474	* Classifies a given instance.
1475	*
1476	* @param instance the instance to be assigned to a cluster
1477	* @return the number of the assigned cluster as an integer
1478	* if the class is enumerated, otherwise the predicted value
1479	* @throws Exception if instance could not be classified
1480	* successfully
1481	*/
1482	public int clusterInstance(Instance instance) throws Exception {
1483	m_ReplaceMissingFilter.input(instance);
1484	Instance inst = m_ReplaceMissingFilter.output();
1485
1486	return clusterProcessedInstance(inst);
1487	}
1488
1489
1490	/**
1491	* Returns the number of clusters.
1492	*
1493	* @return the number of clusters generated for a training dataset.
1494	*/
1495	public int numberOfClusters() {
1496	return m_NumClusters;
1497	}
1498
1499
1500	/**
1501	* Returns an enumeration describing the available options.
1502	* @return an enumeration of all the available options
1503	**/
1504	public Enumeration listOptions() {
1505	Vector result = new Vector();
1506
1507	result.addElement(new Option(
1508	"\tmaximum number of overall iterations\n"
1509	+ "\t(default 1).",
1510	"I", 1, "-I <num>"));
1511
1512	result.addElement(new Option(
1513	"\tmaximum number of iterations in the kMeans loop in\n"
1514	+ "\tthe Improve-Parameter part \n"
1515	+ "\t(default 1000).",
1516	"M", 1, "-M <num>"));
1517
1518	result.addElement(new Option(
1519	"\tmaximum number of iterations in the kMeans loop\n"
1520	+ "\tfor the splitted centroids in the Improve-Structure part \n"
1521	+ "\t(default 1000).",
1522	"J", 1, "-J <num>"));
1523
1524	result.addElement(new Option(
1525	"\tminimum number of clusters\n"
1526	+ "\t(default 2).",
1527	"L", 1, "-L <num>"));
1528
1529	result.addElement(new Option(
1530	"\tmaximum number of clusters\n"
1531	+ "\t(default 4).",
1532	"H", 1, "-H <num>"));
1533
1534	result.addElement(new Option(
1535	"\tdistance value for binary attributes\n"
1536	+ "\t(default 1.0).",
1537	"B", 1, "-B <value>"));
1538
1539	result.addElement(new Option(
1540	"\tUses the KDTree internally\n"
1541	+ "\t(default no).",
1542	"use-kdtree", 0, "-use-kdtree"));
1543
1544	result.addElement(new Option(
1545	"\tFull class name of KDTree class to use, followed\n"
1546	+ "\tby scheme options.\n"
1547	+ "\teg: \"weka.core.neighboursearch.kdtrees.KDTree -P\"\n"
1548	+ "\t(default no KDTree class used).",
1549	"K", 1, "-K <KDTree class specification>"));
1550
1551	result.addElement(new Option(
1552	"\tcutoff factor, takes the given percentage of the splitted \n"
1553	+ "\tcentroids if none of the children win\n"
1554	+ "\t(default 0.0).",
1555	"C", 1, "-C <value>"));
1556
1557	result.addElement(new Option(
1558	"\tFull class name of Distance function class to use, followed\n"
1559	+ "\tby scheme options.\n" +
1560	"\t(default weka.core.EuclideanDistance).",
1561	"D", 1, "-D <distance function class specification>"));
1562
1563	result.addElement(new Option(
1564	"\tfile to read starting centers from (ARFF format).",
1565	"N", 1, "-N <file name>"));
1566
1567	result.addElement(new Option(
1568	"\tfile to write centers to (ARFF format).",
1569	"O", 1, "-O <file name>"));
1570
1571	result.addElement(new Option(
1572	"\tThe debug level.\n"
1573	+ "\t(default 0)",
1574	"U", 1, "-U <int>"));
1575
1576	result.addElement(new Option(
1577	"\tThe debug vectors file.",
1578	"Y", 1, "-Y <file name>"));
1579
1580	Enumeration en = super.listOptions();
1581	while (en.hasMoreElements())
1582	result.addElement(en.nextElement());
1583
1584	return result.elements();
1585	}
1586
1587	/**
1588	* Returns the tip text for this property.
1589	* @return tip text for this property
1590	*/
1591	public String minNumClustersTipText() {
1592	return "set minimum number of clusters";
1593	}
1594
1595	/**
1596	* Sets the minimum number of clusters to generate.
1597	*
1598	* @param n the minimum number of clusters to generate
1599	*/
1600	public void setMinNumClusters(int n) {
1601	m_MinNumClusters = n;
1602	}
1603
1604	/**
1605	* Gets the minimum number of clusters to generate.
1606	* @return the minimum number of clusters to generate
1607	*/
1608	public int getMinNumClusters() {
1609	return m_MinNumClusters;
1610	}
1611
1612	/**
1613	* Returns the tip text for this property.
1614	* @return tip text for this property
1615	*/
1616	public String maxNumClustersTipText() {
1617	return "set maximum number of clusters";
1618	}
1619
1620	/**
1621	* Sets the maximum number of clusters to generate.
1622	* @param n the maximum number of clusters to generate
1623	*/
1624	public void setMaxNumClusters(int n) {
1625	if (n >= m_MinNumClusters) {
1626	m_MaxNumClusters = n;
1627	}
1628	}
1629
1630	/**
1631	* Gets the maximum number of clusters to generate.
1632	* @return the maximum number of clusters to generate
1633	*/
1634	public int getMaxNumClusters() {
1635	return m_MaxNumClusters;
1636	}
1637
1638	/**
1639	* Returns the tip text for this property.
1640	* @return tip text for this property
1641	*/
1642	public String maxIterationsTipText() {
1643	return "the maximum number of iterations to perform";
1644	}
1645
1646	/**
1647	* Sets the maximum number of iterations to perform.
1648	* @param i the number of iterations
1649	* @throws Exception if i is less than 1
1650	*/
1651	public void setMaxIterations(int i) throws Exception {
1652	if (i < 0)
1653	throw new Exception("Only positive values for iteration number" +
1654	" allowed (Option I).");
1655	m_MaxIterations = i;
1656	}
1657
1658	/**
1659	* Gets the maximum number of iterations.
1660	* @return the number of iterations
1661	*/
1662	public int getMaxIterations() {
1663	return m_MaxIterations;
1664	}
1665
1666	/**
1667	* Returns the tip text for this property.
1668	* @return tip text for this property
1669	*/
1670	public String maxKMeansTipText() {
1671	return "the maximum number of iterations to perform in KMeans";
1672	}
1673
1674	/**
1675	* Set the maximum number of iterations to perform in KMeans.
1676	* @param i the number of iterations
1677	*/
1678	public void setMaxKMeans(int i) {
1679	m_MaxKMeans = i;
1680	m_MaxKMeansForChildren = i;
1681	}
1682
1683	/**
1684	* Gets the maximum number of iterations in KMeans.
1685	* @return the number of iterations
1686	*/
1687	public int getMaxKMeans() {
1688	return m_MaxKMeans;
1689	}
1690
1691	/**
1692	* Returns the tip text for this property.
1693	* @return tip text for this property
1694	*/
1695	public String maxKMeansForChildrenTipText() {
1696	return "the maximum number of iterations KMeans that is performed on the child centers";
1697	}
1698
1699	/**
1700	* Sets the maximum number of iterations KMeans that is performed
1701	* on the child centers.
1702	* @param i the number of iterations
1703	*/
1704	public void setMaxKMeansForChildren(int i) {
1705	m_MaxKMeansForChildren = i;
1706	}
1707
1708	/**
1709	* Gets the maximum number of iterations in KMeans.
1710	* @return the number of iterations
1711	*/
1712	public int getMaxKMeansForChildren() {
1713	return m_MaxKMeansForChildren;
1714	}
1715
1716	/**
1717	* Returns the tip text for this property.
1718	* @return tip text for this property
1719	*/
1720	public String cutOffFactorTipText() {
1721	return "the cut-off factor to use";
1722	}
1723
1724	/**
1725	* Sets a new cutoff factor.
1726	* @param i the new cutoff factor
1727	*/
1728	public void setCutOffFactor(double i) {
1729	m_CutOffFactor = i;
1730	}
1731
1732	/**
1733	* Gets the cutoff factor.
1734	* @return the cutoff factor
1735	*/
1736	public double getCutOffFactor() {
1737	return m_CutOffFactor;
1738	}
1739
1740	/**
1741	* Returns the tip text for this property.
1742	* @return tip text for this property suitable for
1743	* displaying in the explorer/experimenter gui
1744	*/
1745	public String binValueTipText() {
1746	return "Set the value that represents true in the new attributes.";
1747	}
1748
1749	/**
1750	* Gets value that represents true in a new numeric attribute.
1751	* (False is always represented by 0.0.)
1752	* @return the value that represents true in a new numeric attribute
1753	*/
1754	public double getBinValue() {
1755	return m_BinValue;
1756	}
1757
1758	/**
1759	* Sets the distance value between true and false of binary attributes.
1760	* and "same" and "different" of nominal attributes
1761	* @param value the distance
1762	*/
1763	public void setBinValue(double value) {
1764	m_BinValue = value;
1765	}
1766
1767	/**
1768	* Returns the tip text for this property.
1769	*
1770	* @return tip text for this property suitable for
1771	* displaying in the explorer/experimenter gui
1772	*/
1773	public String distanceFTipText() {
1774	return "The distance function to use.";
1775	}
1776
1777	/**
1778	* gets the "binary" distance value.
1779	* @param distanceF the distance function with all options set
1780	*/
1781	public void setDistanceF(DistanceFunction distanceF) {
1782	m_DistanceF = distanceF;
1783	}
1784
1785	/**
1786	* Gets the distance function.
1787	* @return the distance function
1788	*/
1789	public DistanceFunction getDistanceF() {
1790	return m_DistanceF;
1791	}
1792
1793	/**
1794	* Gets the distance function specification string, which contains the
1795	* class name of the distance function class and any options to it.
1796	*
1797	* @return the distance function specification string
1798	*/
1799	protected String getDistanceFSpec() {
1800
1801	DistanceFunction d = getDistanceF();
1802	if (d instanceof OptionHandler) {
1803	return d.getClass().getName() + " "
1804	+ Utils.joinOptions(((OptionHandler) d).getOptions());
1805	}
1806	return d.getClass().getName();
1807	}
1808
1809	/**
1810	* Returns the tip text for this property.
1811	*
1812	* @return tip text for this property suitable for
1813	* displaying in the explorer/experimenter gui
1814	*/
1815	public String debugVectorsFileTipText() {
1816	return "The file containing the debug vectors (only for debugging!).";
1817	}
1818
1819	/**
1820	* Sets the file that has the random vectors stored.
1821	* Only used for debugging reasons.
1822	* @param value the file to read the random vectors from
1823	*/
1824	public void setDebugVectorsFile(File value) {
1825	m_DebugVectorsFile = value;
1826	}
1827
1828	/**
1829	* Gets the file name for a file that has the random vectors stored.
1830	* Only used for debugging purposes.
1831	* @return the file to read the vectors from
1832	*/
1833	public File getDebugVectorsFile() {
1834	return m_DebugVectorsFile;
1835	}
1836
1837	/**
1838	* Initialises the debug vector input.
1839	* @throws Exception if there is error
1840	* opening the debug input file.
1841	*/
1842	public void initDebugVectorsInput() throws Exception {
1843	m_DebugVectorsInput =
1844	new BufferedReader(new FileReader(m_DebugVectorsFile));
1845	m_DebugVectors = new Instances(m_DebugVectorsInput);
1846	m_DebugVectorsIndex = 0;
1847	}
1848
1849	/**
1850	* Read an instance from debug vectors file.
1851	* @param model the data model for the instance.
1852	* @throws Exception if there are no debug vector
1853	* in m_DebugVectors.
1854	* @return the next debug vector.
1855	*/
1856	public Instance getNextDebugVectorsInstance(Instances model)
1857	throws Exception {
1858	if (m_DebugVectorsIndex >= m_DebugVectors.numInstances())
1859	throw new Exception("no more prefabricated Vectors");
1860	Instance nex = m_DebugVectors.instance(m_DebugVectorsIndex);
1861	nex.setDataset(model);
1862	m_DebugVectorsIndex++;
1863	return nex;
1864	}
1865
1866	/**
1867	* Returns the tip text for this property.
1868	*
1869	* @return tip text for this property suitable for
1870	* displaying in the explorer/experimenter gui
1871	*/
1872	public String inputCenterFileTipText() {
1873	return "The file to read the list of centers from.";
1874	}
1875
1876	/**
1877	* Sets the file to read the list of centers from.
1878	*
1879	* @param value the file to read centers from
1880	*/
1881	public void setInputCenterFile(File value) {
1882	m_InputCenterFile = value;
1883	}
1884
1885	/**
1886	* Gets the file to read the list of centers from.
1887	*
1888	* @return the file to read the centers from
1889	*/
1890	public File getInputCenterFile() {
1891	return m_InputCenterFile;
1892	}
1893
1894	/**
1895	* Returns the tip text for this property.
1896	*
1897	* @return tip text for this property suitable for
1898	* displaying in the explorer/experimenter gui
1899	*/
1900	public String outputCenterFileTipText() {
1901	return "The file to write the list of centers to.";
1902	}
1903
1904	/**
1905	* Sets file to write the list of centers to.
1906	*
1907	* @param value file to write centers to
1908	*/
1909	public void setOutputCenterFile(File value) {
1910	m_OutputCenterFile = value;
1911	}
1912
1913	/**
1914	* Gets the file to write the list of centers to.
1915	*
1916	* @return filename of the file to write centers to
1917	*/
1918	public File getOutputCenterFile() {
1919	return m_OutputCenterFile;
1920	}
1921
1922	/**
1923	* Returns the tip text for this property.
1924	*
1925	* @return tip text for this property suitable for
1926	* displaying in the explorer/experimenter gui
1927	*/
1928	public String KDTreeTipText() {
1929	return "The KDTree to use.";
1930	}
1931
1932	/**
1933	* Sets the KDTree class.
1934	* @param k a KDTree object with all options set
1935	*/
1936	public void setKDTree(KDTree k) {
1937	m_KDTree = k;
1938	}
1939
1940	/**
1941	* Gets the KDTree class.
1942	*
1943	* @return the configured KDTree
1944	*/
1945	public KDTree getKDTree() {
1946	return m_KDTree;
1947	}
1948
1949	/**
1950	* Returns the tip text for this property.
1951	*
1952	* @return tip text for this property suitable for
1953	* displaying in the explorer/experimenter gui
1954	*/
1955	public String useKDTreeTipText() {
1956	return "Whether to use the KDTree.";
1957	}
1958
1959	/**
1960	* Sets whether to use the KDTree or not.
1961	*
1962	* @param value if true the KDTree is used
1963	*/
1964	public void setUseKDTree(boolean value) {
1965	m_UseKDTree = value;
1966	}
1967
1968	/**
1969	* Gets whether the KDTree is used or not.
1970	*
1971	* @return true if KDTrees are used
1972	*/
1973	public boolean getUseKDTree() {
1974	return m_UseKDTree;
1975	}
1976
1977	/**
1978	* Gets the KDTree specification string, which contains the class name of
1979	* the KDTree class and any options to the KDTree.
1980	*
1981	* @return the KDTree string.
1982	*/
1983	protected String getKDTreeSpec() {
1984
1985	KDTree c = getKDTree();
1986	if (c instanceof OptionHandler) {
1987	return c.getClass().getName() + " "
1988	+ Utils.joinOptions(((OptionHandler)c).getOptions());
1989	}
1990	return c.getClass().getName();
1991	}
1992
1993	/**
1994	* Returns the tip text for this property.
1995	*
1996	* @return tip text for this property suitable for
1997	* displaying in the explorer/experimenter gui
1998	*/
1999	public String debugLevelTipText() {
2000	return "The debug level to use.";
2001	}
2002
2003	/**
2004	* Sets the debug level.
2005	* debug level = 0, means no output
2006	* @param d debuglevel
2007	*/
2008	public void setDebugLevel(int d) {
2009	m_DebugLevel = d;
2010	}
2011
2012	/**
2013	* Gets the debug level.
2014	* @return debug level
2015	*/
2016	public int getDebugLevel() {
2017	return m_DebugLevel;
2018	}
2019
2020	/**
2021	* Checks the instances.
2022	* No checks in this KDTree but it calls the check of the distance function.
2023	*/
2024	protected void checkInstances () {
2025
2026	// m_DistanceF.checkInstances();
2027	}
2028
2029	/**
2030	* Parses a given list of options. <p/>
2031	*
2032	<!-- options-start -->
2033	* Valid options are: <p/>
2034	*
2035	* <pre> -I <num>
2036	* maximum number of overall iterations
2037	* (default 1).</pre>
2038	*
2039	* <pre> -M <num>
2040	* maximum number of iterations in the kMeans loop in
2041	* the Improve-Parameter part
2042	* (default 1000).</pre>
2043	*
2044	* <pre> -J <num>
2045	* maximum number of iterations in the kMeans loop
2046	* for the splitted centroids in the Improve-Structure part
2047	* (default 1000).</pre>
2048	*
2049	* <pre> -L <num>
2050	* minimum number of clusters
2051	* (default 2).</pre>
2052	*
2053	* <pre> -H <num>
2054	* maximum number of clusters
2055	* (default 4).</pre>
2056	*
2057	* <pre> -B <value>
2058	* distance value for binary attributes
2059	* (default 1.0).</pre>
2060	*
2061	* <pre> -use-kdtree
2062	* Uses the KDTree internally
2063	* (default no).</pre>
2064	*
2065	* <pre> -K <KDTree class specification>
2066	* Full class name of KDTree class to use, followed
2067	* by scheme options.
2068	* eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
2069	* (default no KDTree class used).</pre>
2070	*
2071	* <pre> -C <value>
2072	* cutoff factor, takes the given percentage of the splitted
2073	* centroids if none of the children win
2074	* (default 0.0).</pre>
2075	*
2076	* <pre> -D <distance function class specification>
2077	* Full class name of Distance function class to use, followed
2078	* by scheme options.
2079	* (default weka.core.EuclideanDistance).</pre>
2080	*
2081	* <pre> -N <file name>
2082	* file to read starting centers from (ARFF format).</pre>
2083	*
2084	* <pre> -O <file name>
2085	* file to write centers to (ARFF format).</pre>
2086	*
2087	* <pre> -U <int>
2088	* The debug level.
2089	* (default 0)</pre>
2090	*
2091	* <pre> -Y <file name>
2092	* The debug vectors file.</pre>
2093	*
2094	* <pre> -S <num>
2095	* Random number seed.
2096	* (default 10)</pre>
2097	*
2098	<!-- options-end -->
2099	*
2100	* @param options the list of options as an array of strings
2101	* @throws Exception if an option is not supported
2102	*/
2103	public void setOptions(String[] options)
2104	throws Exception {
2105
2106	String optionString;
2107	String funcString;
2108
2109	optionString = Utils.getOption('I', options);
2110	if (optionString.length() != 0)
2111	setMaxIterations(Integer.parseInt(optionString));
2112	else
2113	setMaxIterations(1);
2114
2115	optionString = Utils.getOption('M', options);
2116	if (optionString.length() != 0)
2117	setMaxKMeans(Integer.parseInt(optionString));
2118	else
2119	setMaxKMeans(1000);
2120
2121	optionString = Utils.getOption('J', options);
2122	if (optionString.length() != 0)
2123	setMaxKMeansForChildren(Integer.parseInt(optionString));
2124	else
2125	setMaxKMeansForChildren(1000);
2126
2127	optionString = Utils.getOption('L', options);
2128	if (optionString.length() != 0)
2129	setMinNumClusters(Integer.parseInt(optionString));
2130	else
2131	setMinNumClusters(2);
2132
2133	optionString = Utils.getOption('H', options);
2134	if (optionString.length() != 0)
2135	setMaxNumClusters(Integer.parseInt(optionString));
2136	else
2137	setMaxNumClusters(4);
2138
2139	optionString = Utils.getOption('B', options);
2140	if (optionString.length() != 0)
2141	setBinValue(Double.parseDouble(optionString));
2142	else
2143	setBinValue(1.0);
2144
2145	setUseKDTree(Utils.getFlag("use-kdtree", options));
2146
2147	if (getUseKDTree()) {
2148	funcString = Utils.getOption('K', options);
2149	if (funcString.length() != 0) {
2150	String[] funcSpec = Utils.splitOptions(funcString);
2151	if (funcSpec.length == 0) {
2152	throw new Exception("Invalid function specification string");
2153	}
2154	String funcName = funcSpec[0];
2155	funcSpec[0] = "";
2156	setKDTree((KDTree) Utils.forName(KDTree.class, funcName, funcSpec));
2157	}
2158	else {
2159	setKDTree(new KDTree());
2160	}
2161	}
2162	else {
2163	setKDTree(new KDTree());
2164	}
2165
2166	optionString = Utils.getOption('C', options);
2167	if (optionString.length() != 0)
2168	setCutOffFactor(Double.parseDouble(optionString));
2169	else
2170	setCutOffFactor(0.0);
2171
2172	funcString = Utils.getOption('D', options);
2173	if (funcString.length() != 0) {
2174	String[] funcSpec = Utils.splitOptions(funcString);
2175	if (funcSpec.length == 0) {
2176	throw new Exception("Invalid function specification string");
2177	}
2178	String funcName = funcSpec[0];
2179	funcSpec[0] = "";
2180	setDistanceF((DistanceFunction) Utils.forName(DistanceFunction.class,
2181	funcName, funcSpec));
2182	}
2183	else {
2184	setDistanceF(new EuclideanDistance());
2185	}
2186
2187	optionString = Utils.getOption('N', options);
2188	if (optionString.length() != 0) {
2189	setInputCenterFile(new File(optionString));
2190	m_CenterInput =
2191	new BufferedReader(new FileReader(optionString));
2192	}
2193	else {
2194	setInputCenterFile(new File(System.getProperty("user.dir")));
2195	m_CenterInput = null;
2196	}
2197
2198	optionString = Utils.getOption('O', options);
2199	if (optionString.length() != 0) {
2200	setOutputCenterFile(new File(optionString));
2201	m_CenterOutput = new PrintWriter(new FileOutputStream(optionString));
2202	}
2203	else {
2204	setOutputCenterFile(new File(System.getProperty("user.dir")));
2205	m_CenterOutput = null;
2206	}
2207
2208	optionString = Utils.getOption('U', options);
2209	int debugLevel = 0;
2210	if (optionString.length() != 0) {
2211	try {
2212	debugLevel = Integer.parseInt(optionString);
2213	} catch (NumberFormatException e) {
2214	throw new Exception(optionString +
2215	"is an illegal value for option -U");
2216	}
2217	}
2218	setDebugLevel(debugLevel);
2219
2220	optionString = Utils.getOption('Y', options);
2221	if (optionString.length() != 0) {
2222	setDebugVectorsFile(new File(optionString));
2223	}
2224	else {
2225	setDebugVectorsFile(new File(System.getProperty("user.dir")));
2226	m_DebugVectorsInput = null;
2227	m_DebugVectors = null;
2228	}
2229
2230	super.setOptions(options);
2231	}
2232
2233	/**
2234	* Gets the current settings of SimpleKMeans.
2235	* @return an array of strings suitable for passing to setOptions
2236	*/
2237	public String[] getOptions() {
2238	int i;
2239	Vector result;
2240	String[] options;
2241
2242	result = new Vector();
2243
2244	result.add("-I");
2245	result.add("" + getMaxIterations());
2246
2247	result.add("-M");
2248	result.add("" + getMaxKMeans());
2249
2250	result.add("-J");
2251	result.add("" + getMaxKMeansForChildren());
2252
2253	result.add("-L");
2254	result.add("" + getMinNumClusters());
2255
2256	result.add("-H");
2257	result.add("" + getMaxNumClusters());
2258
2259	result.add("-B");
2260	result.add("" + getBinValue());
2261
2262	if (getUseKDTree()) {
2263	result.add("-use-kdtree");
2264	result.add("-K");
2265	result.add("" + getKDTreeSpec());
2266	}
2267
2268	result.add("-C");
2269	result.add("" + getCutOffFactor());
2270
2271	if (getDistanceF() != null) {
2272	result.add("-D");
2273	result.add("" + getDistanceFSpec());
2274	}
2275
2276	if (getInputCenterFile().exists() && getInputCenterFile().isFile()) {
2277	result.add("-N");
2278	result.add("" + getInputCenterFile());
2279	}
2280
2281	if (getOutputCenterFile().exists() && getOutputCenterFile().isFile()) {
2282	result.add("-O");
2283	result.add("" + getOutputCenterFile());
2284	}
2285
2286	int dL = getDebugLevel();
2287	if (dL > 0) {
2288	result.add("-U");
2289	result.add("" + getDebugLevel());
2290	}
2291
2292	if (getDebugVectorsFile().exists() && getDebugVectorsFile().isFile()) {
2293	result.add("-Y");
2294	result.add("" + getDebugVectorsFile());
2295	}
2296
2297	options = super.getOptions();
2298	for (i = 0; i < options.length; i++)
2299	result.add(options[i]);
2300
2301	return (String[]) result.toArray(new String[result.size()]);
2302	}
2303
2304	/**
2305	* Return a string describing this clusterer.
2306	* @return a description of the clusterer as a string
2307	*/
2308	public String toString() {
2309	StringBuffer temp = new StringBuffer();
2310
2311	temp.append("\nXMeans\n======\n");
2312
2313	temp.append("Requested iterations : " + m_MaxIterations + "\n");
2314	temp.append("Iterations performed : " + m_IterationCount+ "\n");
2315	if (m_KMeansStopped > 0) {
2316	temp.append("kMeans did not converge\n");
2317	temp.append(" but was stopped by max-loops "
2318	+ m_KMeansStopped + " times (max kMeans-iter)\n");
2319	}
2320	temp.append("Splits prepared : " + m_NumSplits + "\n");
2321	temp.append("Splits performed : " + m_NumSplitsDone + "\n");
2322	temp.append("Cutoff factor : " + m_CutOffFactor + "\n");
2323	double perc;
2324	if (m_NumSplitsDone > 0)
2325	perc = (((double)m_NumSplitsStillDone)/((double) m_NumSplitsDone))
2326	* 100.0;
2327	else
2328	perc = 0.0;
2329	temp.append("Percentage of splits accepted \n" +
2330	"by cutoff factor : "
2331	+ Utils.doubleToString(perc,2) + " %\n");
2332	temp.append("------\n");
2333
2334	temp.append("Cutoff factor : " + m_CutOffFactor + "\n");
2335	temp.append("------\n");
2336	temp.append("\nCluster centers : " + m_NumClusters + " centers\n");
2337	for (int i = 0; i < m_NumClusters; i++) {
2338	temp.append("\nCluster "+i+"\n ");
2339	for (int j = 0; j < m_ClusterCenters.numAttributes(); j++) {
2340	if (m_ClusterCenters.attribute(j).isNominal()) {
2341	temp.append(" "+m_ClusterCenters.attribute(j).
2342	value((int)m_ClusterCenters.instance(i).value(j)));
2343	} else {
2344	temp.append(" "+m_ClusterCenters.instance(i).value(j));
2345	}
2346	}
2347	}
2348	if (m_Mle != null)
2349	temp.append("\n\nDistortion: " +
2350	Utils.doubleToString(Utils.sum(m_Mle),6) + "\n");
2351	temp.append("BIC-Value : " + Utils.doubleToString(m_Bic,6) + "\n");
2352	return temp.toString();
2353	}
2354
2355	/**
2356	* Print centers for debug.
2357	* @param debugLevel level that gives according messages
2358	*/
2359	protected void PrCentersFD(int debugLevel) {
2360	if (debugLevel == m_DebugLevel) {
2361	for (int i = 0; i < m_ClusterCenters.numInstances(); i++) {
2362	System.out.println(m_ClusterCenters.instance(i));
2363	}
2364	}
2365	}
2366
2367	/**
2368	* Tests on debug status.
2369	* @param debugLevel level that gives according messages
2370	* @return true if debug level is set
2371	*/
2372	protected boolean TFD(int debugLevel) {
2373	return (debugLevel == m_DebugLevel);
2374	}
2375
2376	/**
2377	* Does debug printouts.
2378	* @param debugLevel level that gives according messages
2379	* @param output string that is printed
2380	*/
2381	protected void PFD(int debugLevel, String output) {
2382	if (debugLevel == m_DebugLevel)
2383	System.out.println(output);
2384	}
2385	/**
2386	* Does debug printouts.
2387	* @param output string that is printed
2388	*/
2389	protected void PFD_CURR(String output) {
2390	if (m_CurrDebugFlag)
2391	System.out.println(output);
2392	}
2393
2394	/**
2395	* Returns the revision string.
2396	*
2397	* @return the revision
2398	*/
2399	public String getRevision() {
2400	return RevisionUtils.extract("$Revision: 5488 $");
2401	}
2402
2403	/**
2404	* Main method for testing this class.
2405	* @param argv should contain options
2406	*/
2407	public static void main(String[] argv) {
2408	runClusterer(new XMeans(), argv);
2409	}
2410	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/clusterers/XMeans.java @ 38

Download in other formats: