source: src/main/java/weka/clusterers/XMeans.java @ 11

1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    XMeans.java
19 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.clusterers;
24
25import weka.core.AlgVector;
26import weka.core.Capabilities;
27import weka.core.DistanceFunction;
28import weka.core.EuclideanDistance;
29import weka.core.Instance;
30import weka.core.Instances;
31import weka.core.RevisionUtils;
32import weka.core.neighboursearch.KDTree;
33import weka.core.Option;
34import weka.core.OptionHandler;
35import weka.core.TechnicalInformation;
36import weka.core.TechnicalInformationHandler;
37import weka.core.Utils;
38import weka.core.Capabilities.Capability;
39import weka.core.TechnicalInformation.Field;
40import weka.core.TechnicalInformation.Type;
41import weka.filters.Filter;
42import weka.filters.unsupervised.attribute.ReplaceMissingValues;
43
44import java.io.BufferedReader;
45import java.io.File;
46import java.io.FileOutputStream;
47import java.io.FileReader;
48import java.io.PrintWriter;
49import java.io.Reader;
50import java.util.Enumeration;
51import java.util.Random;
52import java.util.Vector;
53
54/**
55 <!-- globalinfo-start -->
56 * Cluster data using the X-means algorithm.<br/>
57 * <br/>
58 * X-Means is K-Means extended by an Improve-Structure part. In this part of the algorithm the centers are attempted to be split in their region. The decision between the children of each center and itself is made by comparing the BIC values of the two structures.<br/>
59 * <br/>
60 * For more information see:<br/>
61 * <br/>
62 * Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth International Conference on Machine Learning, 727-734, 2000.
63 * <p/>
64 <!-- globalinfo-end -->
65 *
66 <!-- technical-bibtex-start -->
67 * BibTeX:
68 * <pre>
69 * &#64;inproceedings{Pelleg2000,
70 *    author = {Dan Pelleg and Andrew W. Moore},
71 *    booktitle = {Seventeenth International Conference on Machine Learning},
72 *    pages = {727-734},
73 *    publisher = {Morgan Kaufmann},
74 *    title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters},
75 *    year = {2000}
76 * }
77 * </pre>
78 * <p/>
79 <!-- technical-bibtex-end -->
80 *
81 <!-- options-start -->
82 * Valid options are: <p/>
83 *
84 * <pre> -I &lt;num&gt;
85 *  maximum number of overall iterations
86 *  (default 1).</pre>
87 *
88 * <pre> -M &lt;num&gt;
89 *  maximum number of iterations in the kMeans loop in
90 *  the Improve-Parameter part
91 *  (default 1000).</pre>
92 *
93 * <pre> -J &lt;num&gt;
94 *  maximum number of iterations in the kMeans loop
95 *  for the split centroids in the Improve-Structure part
96 *  (default 1000).</pre>
97 *
98 * <pre> -L &lt;num&gt;
99 *  minimum number of clusters
100 *  (default 2).</pre>
101 *
102 * <pre> -H &lt;num&gt;
103 *  maximum number of clusters
104 *  (default 4).</pre>
105 *
106 * <pre> -B &lt;value&gt;
107 *  distance value for binary attributes
108 *  (default 1.0).</pre>
109 *
110 * <pre> -use-kdtree
111 *  Uses the KDTree internally
112 *  (default no).</pre>
113 *
114 * <pre> -K &lt;KDTree class specification&gt;
115 *  Full class name of KDTree class to use, followed
116 *  by scheme options.
117 *  eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
118 *  (default no KDTree class used).</pre>
119 *
120 * <pre> -C &lt;value&gt;
121 *  cutoff factor, takes the given percentage of the split
122 *  centroids if none of the children win
123 *  (default 0.0).</pre>
124 *
125 * <pre> -D &lt;distance function class specification&gt;
126 *  Full class name of Distance function class to use, followed
127 *  by scheme options.
128 *  (default weka.core.EuclideanDistance).</pre>
129 *
130 * <pre> -N &lt;file name&gt;
131 *  file to read starting centers from (ARFF format).</pre>
132 *
133 * <pre> -O &lt;file name&gt;
134 *  file to write centers to (ARFF format).</pre>
135 *
136 * <pre> -U &lt;int&gt;
137 *  The debug level.
138 *  (default 0)</pre>
139 *
140 * <pre> -Y &lt;file name&gt;
141 *  The debug vectors file.</pre>
142 *
143 * <pre> -S &lt;num&gt;
144 *  Random number seed.
145 *  (default 10)</pre>
146 *
147 <!-- options-end -->
148 *
149 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
150 * @author Mark Hall (mhall@cs.waikato.ac.nz)
151 * @author Malcolm Ware (mfw4@cs.waikato.ac.nz)
152 * @version $Revision: 5488 $
153 * @see RandomizableClusterer
154 */
155public class XMeans 
156  extends RandomizableClusterer
157  implements TechnicalInformationHandler {
158
159  /*
160   * major TODOS:
161   *
162   * make BIC-Score replaceable by other scores
163   */
164
165  /** for serialization. */
166  private static final long serialVersionUID = -7941793078404132616L;
167 
168  /** training instances. */
169  protected Instances m_Instances = null;
170
171  /** model information, should increase readability. */
172  protected Instances m_Model = null;
173 
174  /** replace missing values in training instances. */
175  protected ReplaceMissingValues m_ReplaceMissingFilter;
176
177  /**
178   * Distance value between true and false of binary attributes and
179   * "same" and "different" of nominal attributes (default = 1.0).
180   */
181  protected double m_BinValue = 1.0;
182
183  /** BIC-Score of the current model. */
184  protected double m_Bic = Double.MIN_VALUE;
185
186  /** Distortion.  */
187  protected double[] m_Mle = null;
188
189  /** maximum overall iterations. */
190  protected int m_MaxIterations = 1;
191
192  /**
193   * maximum iterations to perform Kmeans part
194   * if negative, iterations are not checked.
195   */
196  protected int m_MaxKMeans = 1000;
197
198  /** see above, but for kMeans of split clusters.
199   */
200  protected int m_MaxKMeansForChildren = 1000;
201
202  /** The actual number of clusters. */
203  protected int m_NumClusters = 2;
204
205  /** min number of clusters to generate. */
206  protected int m_MinNumClusters = 2;
207
208  /** max number of clusters to generate. */
209  protected int m_MaxNumClusters = 4;
210
211  /** the distance function used. */
212  protected DistanceFunction m_DistanceF = new EuclideanDistance();
213
214  /** cluster centers. */
215  protected Instances m_ClusterCenters;
216
217  /** file name of the input file for the cluster centers. */
218  protected File m_InputCenterFile = new File(System.getProperty("user.dir"));
219
220  /* --> DebugVectors - USED FOR DEBUGGING */
221  /** input file for the random vectors --> USED FOR DEBUGGING. */
222  protected Reader m_DebugVectorsInput = null;
223  /** the index for the current debug vector. */
224  protected int m_DebugVectorsIndex = 0;
225  /** all the debug vectors. */
226  protected Instances m_DebugVectors = null;
227
228  /** file name of the input file for the random vectors. */
229  protected File m_DebugVectorsFile = new File(System.getProperty("user.dir"));
230
231  /** input file for the cluster centers. */
232  protected Reader m_CenterInput = null;
233   
234  /** file name of the output file for the cluster centers. */
235  protected File m_OutputCenterFile = new File(System.getProperty("user.dir"));
236 
237  /** output file for the cluster centers. */
238  protected PrintWriter m_CenterOutput = null;
239   
240  /**
241   * temporary variable holding cluster assignments while iterating.
242   */
243  protected int[] m_ClusterAssignments;
244
245  /** cutoff factor - percentage of splits done in the Improve-Structure part;
246     only relevant if none of the children win. */ 
247  protected double m_CutOffFactor = 0.5;
248
249  /** Index in ranges for LOW. */
250  public static int R_LOW = 0;
251  /** Index in ranges for HIGH. */
252  public static int R_HIGH = 1;
253  /** Index in ranges for WIDTH. */
254  public static int R_WIDTH = 2;
255
256  /**
257   * KDTrees class if KDTrees are used.
258   */
259  protected KDTree m_KDTree = new KDTree();
260 
261  /** whether to use the KDTree (the KDTree is only initialized to be
262   * configurable from the GUI). */
263  protected boolean m_UseKDTree = false;
264
265  /** counts iterations done in main loop. */
266  protected int m_IterationCount = 0;
267
268  /** counter to say how often kMeans was stopped by loop counter. */
269  protected int m_KMeansStopped = 0;
270
271  /** Number of splits prepared. */
272  protected int m_NumSplits = 0;
273
274  /** Number of splits accepted (including cutoff factor decisions). */
275  protected int m_NumSplitsDone = 0;
276
277  /** Number of splits accepted just because of cutoff factor. */
278  protected int m_NumSplitsStillDone = 0;
279
280  /**
281   * level of debug output, 0 is no output.
282   */
283  protected int m_DebugLevel = 0;
284 
285  /** print the centers. */
286  public static int D_PRINTCENTERS = 1;
287  /** follows the splitting of the centers. */
288  public static int D_FOLLOWSPLIT = 2;
290  /** have a closer look at converging children. */
290  public static int D_CONVCHCLOSER = 3;
291  /** check on random vectors. */
292  public static int D_RANDOMVECTOR = 4;
293  /** check on kdtree. */
294  public static int D_KDTREE = 5;
295  /** follow iterations. */
296  public static int D_ITERCOUNT = 6;
297  /** functions were maybe misused.  */
298  public static int D_METH_MISUSE = 80; 
299  /** for current debug.  */
300  public static int D_CURR = 88;
301  /** general debugging. */
302  public static int D_GENERAL = 99;
303
304  /** Flag: I'm debugging. */
305  public boolean m_CurrDebugFlag = true;
306
307  /**
308   * the default constructor.
309   */
310  public XMeans() {
311    super();
312   
313    m_SeedDefault = 10;
314    setSeed(m_SeedDefault);
315  }
316 
317  /**
318   * Returns a string describing this clusterer.
319   * @return a description of the evaluator suitable for
320   * displaying in the explorer/experimenter gui
321   */
322  public String globalInfo() {
323    return 
324        "Cluster data using the X-means algorithm.\n\n" 
325      + "X-Means is K-Means extended by an Improve-Structure part In this "
326      + "part of the algorithm the centers are attempted to be split in "
327      + "its region. The decision between the children of each center and "
328      + "itself is done comparing the BIC-values of the two structures.\n\n"
329      + "For more information see:\n\n"
330      + getTechnicalInformation().toString();
331  }
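  /*
   * Illustrative usage (added for readability, not part of the original
   * source): a minimal sketch of running this clusterer from code, using only
   * methods defined in this class or inherited from RandomizableClusterer.
   * "data" is assumed to be a weka.core.Instances object loaded elsewhere;
   * exception handling is omitted.
   *
   *   XMeans xmeans = new XMeans();
   *   xmeans.setMinNumClusters(2);   // option -L
   *   xmeans.setMaxNumClusters(6);   // option -H
   *   xmeans.setMaxIterations(1);    // option -I
   *   xmeans.setSeed(10);            // option -S
   *   xmeans.buildClusterer(data);
   *   int k = xmeans.numberOfClusters();
   *   int cluster = xmeans.clusterInstance(data.instance(0));
   */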
332
333  /**
334   * Returns an instance of a TechnicalInformation object, containing
335   * detailed information about the technical background of this class,
336   * e.g., paper reference or book this class is based on.
337   *
338   * @return the technical information about this class
339   */
340  public TechnicalInformation getTechnicalInformation() {
341    TechnicalInformation        result;
342   
343    result = new TechnicalInformation(Type.INPROCEEDINGS);
344    result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore");
345    result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters");
346    result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning");
347    result.setValue(Field.YEAR, "2000");
348    result.setValue(Field.PAGES, "727-734");
349    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
350   
351    return result;
352  }
353
354  /**
355   * Returns default capabilities of the clusterer.
356   *
357   * @return      the capabilities of this clusterer
358   */
359  public Capabilities getCapabilities() {
360    Capabilities result = super.getCapabilities();
361    result.disableAll();
362    result.enable(Capability.NO_CLASS);
363
364    // attributes
365    result.enable(Capability.NUMERIC_ATTRIBUTES);
366    result.enable(Capability.DATE_ATTRIBUTES);
367    result.enable(Capability.MISSING_VALUES);
368
369    return result;
370  }
371 
372  /**
373   * Generates the X-Means clusterer.
374   *
375   * @param data set of instances serving as training data
376   * @throws Exception if the clusterer has not been
377   * generated successfully
378   */
379  public void buildClusterer(Instances data) throws Exception {
380
381    // can clusterer handle the data?
382    getCapabilities().testWithFail(data);
383   
384    if (m_MinNumClusters > m_MaxNumClusters) {
385      throw new Exception("XMeans: min number of clusters "
386          + "can't be greater than max number of clusters!");
387    }
388
389    m_NumSplits = 0;
390    m_NumSplitsDone = 0;
391    m_NumSplitsStillDone = 0;
392
393    // replace missing values
394    m_ReplaceMissingFilter = new ReplaceMissingValues();
395    m_ReplaceMissingFilter.setInputFormat(data);
396    m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter);
397   
398    // initialize random function
399    Random random0 = new Random(m_Seed);
400
401    // num of clusters to start with
402    m_NumClusters =  m_MinNumClusters;
403
404    // set distance function to default
405    if (m_DistanceF == null) {
406      m_DistanceF = new EuclideanDistance();
407    }
408
409    m_DistanceF.setInstances(m_Instances);
410    checkInstances();
411
412    if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile())
413      initDebugVectorsInput();
414
415    // make list of indexes for m_Instances
416    int[] allInstList = new int[m_Instances.numInstances()]; 
417    for (int i = 0; i < m_Instances.numInstances(); i++) {
418      allInstList[i] = i;
419    }
420   
421    // set model used (just for convenience)
422    m_Model = new Instances(m_Instances, 0);
423
424    // produce the starting centers
425    if (m_CenterInput != null) {
426      // read centers from file
427      m_ClusterCenters = new Instances(m_CenterInput);
428      m_NumClusters = m_ClusterCenters.numInstances();
429    }
430    else
431      // makes the first centers randomly
432      m_ClusterCenters = makeCentersRandomly(random0,
433                                             m_Instances, m_NumClusters);
434    PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");
435    for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {
436      PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));
437    }
438
439    PrCentersFD(D_PRINTCENTERS);
440
441    boolean finished = false;
442    Instances children; 
443
444    // builds up a KDTree
445    if (m_UseKDTree)
446      m_KDTree.setInstances(m_Instances);
447 
448    // loop counter of main loop
449    m_IterationCount = 0;
450
451    /**
452     * "finished" does get true as soon as:
453     * 1. number of clusters gets >= m_MaxClusters,
454     * 2. in the last round, none of the centers have been split
455     *
456     * if number of clusters is already >= m_MaxClusters
457     * part 1 (= Improve-Params) is done at least once.
458     */
459    while (!finished &&
460           !stopIteration(m_IterationCount, m_MaxIterations)) {
461     
462      /* ====================================================================
463       * 1. Improve-Params                 
464       *    conventional K-means
465       */
466
467
468      PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:");
469      PrCentersFD(D_FOLLOWSPLIT);
470
471      PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount + 
472          ". time");
473      m_IterationCount++;
474
475      // prepare to converge
476      boolean converged = false;
477
478      // initialize assignments to -1
479      m_ClusterAssignments = initAssignments(m_Instances.numInstances());
480      // stores a list of indexes of instances belonging to each center
481      int[][] instOfCent = new int[m_ClusterCenters.numInstances()][];
482
483      // KMeans loop counter
484      int kMeansIteration = 0;
485
486      // converge in conventional K-means ----------------------------------
487      PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:");
488      while (!converged && 
489             !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) {
490       
491        kMeansIteration++;
492        converged = true;
493       
494        // assign instances to centers -------------------------------------
495        converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
496                                    m_ClusterCenters, 
497                                    instOfCent,
498                                    allInstList, 
499                                    m_ClusterAssignments,
500                                    kMeansIteration);
501       
502        PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:");
503        PrCentersFD(D_FOLLOWSPLIT);
504        // compute new centers = centers of mass of points
505        converged = recomputeCenters(m_ClusterCenters, // clusters
506                                     instOfCent,       // their instances
507                                     m_Model);         // model information
508        PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:");
509        PrCentersFD(D_FOLLOWSPLIT);
510      }
511      PFD(D_FOLLOWSPLIT, "");
512      PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means");
513
514      /** =====================================================================
515       * 2. Improve-Structure
516       */
517
518      // BIC before the split: distortion of the centres
519      m_Mle = distortion(instOfCent, m_ClusterCenters);
520      m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle);
521      PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic);
522
523      int currNumCent = m_ClusterCenters.numInstances();
524      Instances splitCenters = new Instances(m_ClusterCenters, 
525                                             currNumCent * 2);
526     
527      // store BIC values of parent and children
528      double[] pbic = new double [currNumCent];
529      double[] cbic = new double [currNumCent];
530           
531      // split each center
532      for (int i = 0; i < currNumCent
533           // this could help to optimize the algorithm
534           //        && currNumCent + numSplits <= m_MaxNumClusters
535           ; 
536           i++) {
537       
538        PFD(D_FOLLOWSPLIT, "\nsplit center " + i +
539                      " " + m_ClusterCenters.instance(i));
540        Instance currCenter = m_ClusterCenters.instance(i);
541        int[] currInstList = instOfCent[i];
542        int currNumInst = instOfCent[i].length;
543       
544        // not enough instances; then continue with the next center
545        if (currNumInst <= 2) {
546          pbic[i] = Double.MAX_VALUE;
547          cbic[i] = 0.0;
548          // add center itself as dummy
549          splitCenters.add(currCenter);
550          splitCenters.add(currCenter);
551          continue;
552        }
553       
554        // split centers  ----------------------------------------------
555        double variance = m_Mle[i] / (double)currNumInst;
556        children = splitCenter(random0, currCenter, variance, m_Model);
557       
558        // initialize assignments to -1
559        int[] oneCentAssignments = initAssignments(currNumInst);
560        int[][] instOfChCent = new int [2][]; // todo maybe split didn't work
561       
562        // converge the children  --------------------------------------
563        converged = false;
564        int kMeansForChildrenIteration = 0;
565        PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i);
566        while (!converged && 
567          !stopKMeansIteration(kMeansForChildrenIteration, 
568                               m_MaxKMeansForChildren)) {
569          kMeansForChildrenIteration++;
570         
571          converged =
572            assignToCenters(children, instOfChCent,
573                            currInstList, oneCentAssignments);
574
575          if (!converged) {       
576            recomputeCentersFast(children, instOfChCent, m_Model);
577          }
578        } 
579
580        // store new centers for later decision if they are taken
581        splitCenters.add(children.instance(0));
582        splitCenters.add(children.instance(1));
583
584        PFD(D_FOLLOWSPLIT, "\nconverged cildren ");
585        PFD(D_FOLLOWSPLIT, " " + children.instance(0));
586        PFD(D_FOLLOWSPLIT, " " + children.instance(1));
587
588        // compare parent and children model by their BIC-value
589        pbic[i] = calculateBIC(currInstList, currCenter,  m_Mle[i], m_Model);
590        double[] chMLE = distortion(instOfChCent, children);
591        cbic[i] = calculateBIC(instOfChCent, children, chMLE);
592
593      } // end of loop over clusters
594
595      // decide which one to split and make new list of cluster centers
596      Instances newClusterCenters = null;
597      newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor,
598                                                 splitCenters);
599      /**
600       * Compare with before Improve-Structure
601       */
602      int newNumClusters = newClusterCenters.numInstances();
603      if (newNumClusters != m_NumClusters) {
604       
605        PFD(D_FOLLOWSPLIT, "Compare with non-split");
606
607        // initialize assignments to -1
608        int[] newClusterAssignments = 
609          initAssignments(m_Instances.numInstances());
610       
611        // stores a list of indexes of instances belonging to each center
612        int[][] newInstOfCent = new int[newClusterCenters.numInstances()][];
613       
614        // assign instances to centers -------------------------------------
615        converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
616                                    newClusterCenters, 
617                                    newInstOfCent,
618                                    allInstList, 
619                                    newClusterAssignments,
620                                    m_IterationCount);
621       
622        double[] newMle = distortion(newInstOfCent, newClusterCenters);
623        double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle);
624        PFD(D_FOLLOWSPLIT, "newBic " + newBic);
625        if (newBic > m_Bic) {
626          PFD(D_FOLLOWSPLIT, "*** decide for new clusters");
627          m_Bic = newBic;
628          m_ClusterCenters = newClusterCenters;
629          m_ClusterAssignments = newClusterAssignments;
630        } else {
631          PFD(D_FOLLOWSPLIT, "*** keep old clusters");
632        }
633      }
634
635      newNumClusters = m_ClusterCenters.numInstances();
636      // decide if finished: max num cluster reached
637      // or last centers where not split at all
638      if ((newNumClusters >= m_MaxNumClusters) 
639          || (newNumClusters == m_NumClusters)) {
640        finished = true;
641      }
642      m_NumClusters = newNumClusters;
643    }
644  }
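  /*
   * Condensed outline (added for readability, not part of the original
   * source) of the main loop implemented in buildClusterer() above:
   *
   *   while (!finished && !stopIteration(iteration, m_MaxIterations)) {
   *     // 1. Improve-Params: run conventional k-means until the assignments
   *     //    no longer change or m_MaxKMeans iterations are reached
   *     // 2. Improve-Structure: split every center, converge the two children
   *     //    with a local k-means, and compare parent vs. children via BIC
   *     // 3. accept the split configuration only if its overall BIC improves;
   *     //    stop when m_MaxNumClusters is reached or no center was split
   *   }
   */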
645
646  /**
647   * Checks for nominal attributes in the dataset.
648   * Class attribute is ignored.
649   * @param data the data to check
650   * @return false if no nominal attributes are present
651   */
652  public boolean checkForNominalAttributes(Instances data) {
653
654    int i = 0;
655    while (i < data.numAttributes()) {
656      if ((i != data.classIndex()) && data.attribute(i++).isNominal()) {
657        return true;
658      }
659    }
660    return false;
661  }
662
663  /**
664   * Set array of int, used to store assignments, to -1.
665   * @param ass integer array used for storing assignments
666   * @return integer array used for storing assignments
667   */
668  protected int[] initAssignments(int[] ass) {
669    for (int i = 0; i < ass.length; i++)
670      ass[i] = -1;
671    return ass;
672  }   
673 
674  /**
675   * Creates and initializes integer array, used to store assignments.
676   * @param numInstances length of array used for assignments
677   * @return integer array used for storing assignments
678   */
679  protected int[] initAssignments(int numInstances) {
680    int[] ass = new int[numInstances];
681    for (int i = 0; i < numInstances; i++)
682      ass[i] = -1;
683    return ass;
684  }   
685 
686  /**
687   * Creates and initializes boolean array.
688   * @param len length of new array
689   * @return the new array
690   */
691  boolean[] initBoolArray(int len) {
692    boolean[] boolArray = new boolean [len];
693    for (int i = 0; i < len; i++) {
694      boolArray[i] = false;
695    }
696    return boolArray;
697  }
698
699  /**
700   * Returns new center list.
701   *
702   * The following steps 1. and 2. both take care that the number of centers
703   * does not exceed maxCenters.
704   *
705   * 1. Compares the BIC values of parent and children and takes as the
706   * new centers whichever wins (= the BIC value is larger).
707   *
708   * 2. If in 1. none of the children are chosen
709   * and the cutoff factor is > 0, the
710   * cutoff factor is taken as the percentage of "best" centers that are
711   * still split.
712   * @param pbic array of the parents' BIC values
713   * @param cbic array of the children's BIC values
714   * @param cutoffFactor cutoff factor
715   * @param splitCenters all children
716   * @return the new centers
717   */
718  protected Instances newCentersAfterSplit(double[] pbic, 
719                                         double[] cbic,
720                                         double cutoffFactor,
721                                         Instances splitCenters) {
722
723    // store if split won
724    boolean splitPerCutoff = false;
725    boolean takeSomeAway = false;
726    boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances());
727    int numToSplit = 0;
728    Instances newCenters = null;
729   
730    // how many would be split, because the children have a better bic value
731    for (int i = 0; i < cbic.length; i++) {
732      if (cbic[i] > pbic[i]) {
733        // decide for splitting ----------------------------------------
734        splitWon[i] = true; numToSplit++;
735        PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children");
736      }
737      else {
738        // decide for parents and finished stays true  -----------------
739        PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent");
740      }
741    }
742
743    // no splits yet so split per cutoff factor
744    if ((numToSplit == 0) && (cutoffFactor > 0)) {
745      splitPerCutoff = true;
746     
747      // how many to split per cutoff factor
748      numToSplit = (int) 
749        ((double) m_ClusterCenters.numInstances() * m_CutOffFactor); 
750    }
751
752    // prepare indexes of values in ascending order 
753    double[] diff = new double [m_NumClusters];
754    for (int j = 0; j < diff.length; j++) {
755      diff[j] = pbic[j] - cbic[j];
756    }   
757    int[] sortOrder = Utils.sort(diff);
758   
759    // check if maxNumClusters would be exceeded
760    int possibleToSplit = m_MaxNumClusters - m_NumClusters; 
761
762    if (possibleToSplit > numToSplit) {
763      // still enough possible, do the whole amount
764      possibleToSplit = numToSplit;
765    }
766    else
767      takeSomeAway = true;
768
769    // prepare for splitting the one that are supposed to be split
770    if (splitPerCutoff) {
771      for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0);
772           j++) {
773        splitWon[sortOrder[j]] = true;
774      }
775      m_NumSplitsStillDone += possibleToSplit;
776    } 
777    else {
778      // take some splits away if max number of clusters would be exceeded
779      if (takeSomeAway) {
780        int count = 0;
781        int j = 0;
782        for (;j < splitWon.length && count < possibleToSplit; j++){
783          if (splitWon[sortOrder[j]] == true) count++;
784        }
785       
786        while (j < splitWon.length) {
787          splitWon[sortOrder[j]] = false;
788          j++;
789        }
790      }
791    }
792   
793    // finally split
794    if (possibleToSplit > 0) 
795      newCenters = newCentersAfterSplit(splitWon, splitCenters);
796    else
797      newCenters = m_ClusterCenters;
798    return newCenters;
799  }
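  /*
   * Worked example (added for illustration, not part of the original source).
   * Case 1: pbic = {10, 12} and cbic = {11, 9}. Only center 0 has
   * cbic > pbic, so splitWon = {true, false} and center 0 is split
   * (provided m_MaxNumClusters allows one more cluster).
   * Case 2: pbic = {10, 12} and cbic = {9, 11.5}; no child wins, so with
   * m_CutOffFactor = 0.5 numToSplit becomes (int)(2 * 0.5) = 1 and the
   * center with the smallest difference pbic[i] - cbic[i] (center 1,
   * diff = 0.5) is split anyway, since its cbic is > 0.
   */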
800
801  /**
802   * Returns new centers. Depending on splitWon: if true takes children, if
803   * false takes parent = current center.
804   *
805   * @param splitWon
806   *          array of boolean to indicate to take split or not
807   * @param splitCenters
808   *          list of splitted centers
809   * @return the new centers
810   */
811  protected Instances newCentersAfterSplit(boolean[] splitWon,
812      Instances splitCenters) {
813    Instances newCenters = new Instances(splitCenters, 0);
814
815    int sIndex = 0;
816    for (int i = 0; i < splitWon.length; i++) {
817      if (splitWon[i]) {
818        m_NumSplitsDone++;
819        newCenters.add(splitCenters.instance(sIndex++));
820        newCenters.add(splitCenters.instance(sIndex++));
821      } else {
822        sIndex++;
823        sIndex++;
824        newCenters.add(m_ClusterCenters.instance(i));
825      }
826    }
827    return newCenters;
828  }
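  /*
   * Example (added for illustration, not part of the original source): with
   * splitWon = {true, false} and splitCenters containing the four children
   * {c0a, c0b, c1a, c1b}, the result is {c0a, c0b, parent1}, i.e. center 0
   * is replaced by its two children and center 1 is kept as it was.
   */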
829
830  /**
831   * Controls that counter does not exceed max iteration value. Special function
832   * for kmeans iterations.
833   *
834   * @param iterationCount
835   *          current value of counter
836   * @param max
837   *          maximum value for counter
838   * @return true if iteration should be stopped
839   */ 
840  protected boolean stopKMeansIteration(int iterationCount, int max) {
841    boolean stopIterate = false;
842    if (max >= 0) 
843      stopIterate = (iterationCount >= max);
844    if (stopIterate) 
845      m_KMeansStopped++;
846    return stopIterate;
847  }
848
849  /**
850   * Checks if iterationCount has to be checked and if yes
851   * (this means max is > 0) compares it with max.
852   *
853   * @param iterationCount the current iteration count
854   * @param max the maximum number of iterations
855   * @return true if maximum has been reached
856   */ 
857  protected boolean stopIteration(int iterationCount, int max) {
858    boolean stopIterate = false;
859    if (max >= 0) 
860      stopIterate = (iterationCount >= max);
861    return stopIterate;
862  }
863
864  /**
865   * Recompute the new centers. New cluster center is center of mass of its
866   * instances. Returns true if cluster stays the same.
867   * @param centers the input and output centers
868   * @param instOfCent the instances to the centers
869   * @param model data model information
870   * @return true if converged.
871   */
872   protected boolean recomputeCenters(Instances centers,         
873                                   int[][] instOfCent, 
874                                   Instances model) {
875    boolean converged = true;
876   
877    for (int i = 0; i < centers.numInstances(); i++) {
878      double val;
879      for (int j = 0; j < model.numAttributes(); j++) {
880        val = meanOrMode(m_Instances, instOfCent[i], j);
881
884        if (converged && m_ClusterCenters.instance(i).value(j) != val) 
885          converged = false;
886        if (!converged)
887          m_ClusterCenters.instance(i).setValue(j, val);
888      }
889    }
890    return converged;
891  }
892
893  /**
894   * Recompute the new centers - 2nd version
895   * Same as recomputeCenters, but does not check if center stays the same.
896   *
897   * @param centers the input center and output centers
898   * @param instOfCentIndexes the indexes of the instances to the centers
899   * @param model data model information
900   */
901  protected void recomputeCentersFast(Instances centers,         
902                                    int[][] instOfCentIndexes, 
903                                    Instances model   
904                                    ) {
905    for (int i = 0; i < centers.numInstances(); i++) {
906      double val;
907      for (int j = 0; j < model.numAttributes(); j++) {
908        val = meanOrMode(m_Instances, instOfCentIndexes[i], j);
909        centers.instance(i).setValue(j, val);
910      }
911    }
912  }
913
914  /**
915   * Computes Mean Or Mode of one attribute on a subset of m_Instances.
916   * The subset is defined by an index list.
917   * @param instances all instances
918   * @param instList the indexes of the instances the mean is computed from
919   * @param attIndex the index of the attribute
920   * @return mean value
921   */
922  protected double meanOrMode(Instances instances, 
923                            int[] instList, 
924                            int attIndex) {
925    double result, found;
926    int[] counts;
927    int numInst = instList.length;
928
929    if (instances.attribute(attIndex).isNumeric()) {
930      result = found = 0;
931      for (int j = 0; j < numInst; j++) {
932        Instance currInst = instances.instance(instList[j]);
933        if (!currInst.isMissing(attIndex)) {
934          found += currInst.weight();
935          result += currInst.weight() * 
936            currInst.value(attIndex);
937        }
938      }
939      if (Utils.eq(found, 0)) {
940        return 0;
941      } else {
942        return result / found;
943      }
944    } else if (instances.attribute(attIndex).isNominal()) {
945      counts = new int[instances.attribute(attIndex).numValues()];
946      for (int j = 0; j < numInst; j++) {
947        Instance currInst = instances.instance(instList[j]);
948        if (!currInst.isMissing(attIndex)) {
949          counts[(int) currInst.value(attIndex)] += currInst.weight();
950        }
951      }
952      return (double)Utils.maxIndex(counts);
953    } else {
954      return 0;
955    }
956  }
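  /*
   * Example (added for illustration, not part of the original source): for a
   * numeric attribute with values {2.0, 4.0} and instance weights {1, 3} the
   * method returns (1*2.0 + 3*4.0) / (1 + 3) = 3.5; for a nominal attribute it
   * returns the index of the most frequent (weighted) value, and 0 if all
   * values are missing.
   */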
957
958 
959  /**
960   * Assigns instances to centers.
961   *
962   * @param tree KDTree on all instances
963   * @param centers all the input centers
964   * @param instOfCent the instances to each center
965   * @param allInstList list of all instances
966   * @param assignments assignments of instances to centers
967   * @param iterationCount the number of iteration
968   * @return true if converged
969   * @throws Exception if something goes wrong
970   */
971  protected boolean assignToCenters(KDTree tree,
972                                  Instances centers, 
973                                  int[][] instOfCent, 
974                                  int[] allInstList,
975                                  int[] assignments,
976                                  int iterationCount) throws Exception {
977   
978    boolean converged = true;
979    if (tree != null) {
980      // using KDTree structure for assigning
981      converged = assignToCenters(tree,
982                                  centers, 
983                                  instOfCent,
984                                  assignments,
985                                  iterationCount);
986    } else {
987      converged = assignToCenters(centers, 
988                                  instOfCent,
989                                  allInstList, 
990                                  assignments);
991    }
992    return converged;
993  }
994
995  /**
996   * Assign instances to centers using KDtree.
997   * First part of conventional K-Means, returns true if new assignment
998   * is the same as the last one.
999   *
1000   * @param kdtree KDTree on all instances
1001   * @param centers all the input centers
1002   * @param instOfCent the instances to each center
1003   * @param assignments assignments of instances to centers
1004   * @param iterationCount the number of iteration
1005   * @return true if converged
1006   * @throws Exception in case instances are not assigned to cluster
1007   */
1008  protected boolean assignToCenters(KDTree kdtree,
1009                                  Instances centers, 
1010                                  int[][] instOfCent, 
1011                                  int[] assignments,
1012                                  int iterationCount) throws Exception {
1013
1014    int numCent = centers.numInstances();
1015    int numInst = m_Instances.numInstances(); 
1016    int[] oldAssignments = new int[numInst];
1017   
1018    // WARNING:  assignments is "input/output-parameter"
1019    // should not be null
1020    if (assignments == null) {
1021      assignments = new int[numInst];
1022      for (int i = 0; i < numInst; i++) {
1023        assignments[i] = -1;
1024      }
1025    }
1026   
1027    // WARNING:  instOfCent is "input/output-parameter"
1028    // should not be null
1029    if (instOfCent == null) {
1030      instOfCent = new int [numCent][];
1031    }
1032   
1033    // save old assignments
1034    for (int i = 0; i < assignments.length; i++) {
1035      oldAssignments[i] = assignments[i];
1036    }
1037   
1038    // use tree to get new assignments
1039    kdtree.centerInstances(centers, assignments,
1040                           Math.pow(.8, iterationCount));       
1041    boolean converged = true;
1042 
1043    // compare with previous assignment
1044    for (int i = 0; converged && (i < assignments.length); i++) {
1045      converged = (oldAssignments[i] == assignments[i]);
1046      if (assignments[i] == -1) 
1047        throw new Exception("Instance " + i + 
1048                            " has not been assigned to cluster.");
1049    }
1050
1051    if (!converged) {
1052      int[] numInstOfCent = new int[numCent];
1053      for (int i = 0; i < numCent; i++)
1054        numInstOfCent[i] = 0;
1055
1056      // count num of assignments per center
1057      for (int i = 0; i < numInst; i++)
1058        numInstOfCent[assignments[i]]++;
1059     
1060      // prepare instancelists per center
1061      for (int i = 0; i < numCent; i++){
1062        instOfCent[i] = new int[numInstOfCent[i]];
1063      }
1064      // write instance lists per center
1065      for (int i = 0; i < numCent; i++) {
1066        int index = -1;   
1067        for (int j = 0; j < numInstOfCent[i]; j++) {
1068          index = nextAssignedOne(i, index, assignments);
1069          instOfCent[i][j] = index;
1070        }
1071      }
1072    }
1073 
1074    return converged;
1075  }
1076
1077  /**
1078   * Assign instances to centers.
1079   * Part of conventional K-Means, returns true if new assignment
1080   * is the same as the last one.
1081   *
1082   * @param centers all the input centers
1083   * @param instOfCent the instances to each center
1084   * @param allInstList list of all indexes
1085   * @param assignments assignments of instances to centers
1086   * @return true if converged
1087   * @throws Exception if something goes wrong
1088   */
1089  protected boolean assignToCenters(Instances centers, 
1090                                  int[][] instOfCent,
1091                                  int[] allInstList,
1092                                  int[] assignments) 
1093    throws Exception {
1094   
1095    // todo: undecided situations
1096    boolean converged = true; // true if new assignment is the same
1097                              // as the old one
1098
1099    int numInst = allInstList.length; 
1100    int numCent = centers.numInstances();
1101    int[] numInstOfCent = new int [numCent];
1102    for (int i = 0; i < numCent; i++) numInstOfCent[i] = 0;
1103
1104    // WARNING:  assignments is "input/output-parameter"
1105    // should not be null
1106    if (assignments == null) {
1107      assignments = new int[numInst];
1108      for (int i = 0; i < numInst; i++) {
1109        assignments[i] = -1;
1110      }
1111    }
1112
1113    // WARNING: instOfCent is "input/output-parameter"
1114    // should not be null
1115    if (instOfCent == null) {
1116      instOfCent = new int [numCent][];
1117    }
1118
1119    // set assignments
1120    for (int i = 0; i < numInst; i++) {
1121      Instance inst = m_Instances.instance(allInstList[i]);
1122      int newC = clusterProcessedInstance(inst, centers);
1123     
1124      if (converged && newC != assignments[i]) {
1125        converged = false;
1126      }
1127
1128      numInstOfCent[newC]++; 
1129      if (!converged)
1130        assignments[i] = newC;
1131    }
1132
1133    // the following is only done
1134    // if assignments are not the same, because too much effort
1135    if (!converged) {
1136      PFD(D_FOLLOWSPLIT, "assignToCenters -> it has NOT converged");
1137      for (int i = 0; i < numCent; i++) {
1138        instOfCent[i] = new int [numInstOfCent[i]];
1139      }
1140
1141      for (int i = 0; i < numCent; i++) {
1142        int index = -1;   
1143        for (int j = 0; j < numInstOfCent[i]; j++) {
1144          index = nextAssignedOne(i, index, assignments);
1145          instOfCent[i][j] = allInstList[index];
1146        }
1147      }
1148    }
1149    else
1150      PFD(D_FOLLOWSPLIT, "assignToCenters -> it has converged");
1151
1152    return converged;
1153  }
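  /*
   * A minimal sketch (not part of the original source) of the convergence
   * criterion used by the assignment steps above: the step has converged
   * exactly when no instance changes its nearest center, i.e. when the new
   * assignment array equals the old one element-wise. A hypothetical helper:
   *
   *   static boolean sameAssignments(int[] oldAssignments, int[] newAssignments) {
   *     for (int i = 0; i < oldAssignments.length; i++)
   *       if (oldAssignments[i] != newAssignments[i])
   *         return false;
   *     return true;
   *   }
   */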
1154
1155  /**
1156   * Searches along the assignment array for the next entry of the center
1157   * in question.
1158   * @param cent index of the center
1159   * @param lastIndex index to start searching
1160   * @param assignments assignments
1161   * @return index of the next instance that is assigned to center cent
1162   */
1163  protected int nextAssignedOne(int cent, int lastIndex, 
1164                              int[] assignments) {
1165    int len = assignments.length;
1166    int index = lastIndex + 1;
1167    while (index < len) {
1168      if (assignments[index] == cent) {
1169        return (index);
1170      }
1171      index++;
1172    }
1173    return (-1);
1174  }
1175
1176  /**
1177   * Split centers in their region. Generates a random vector of
1178   * length = sqrt(variance) and
1179   * adds it to and subtracts it from the center vector to get two new centers.
1180   *
1181   * @param random random function
1182   * @param center the center that is split here
1183   * @param variance variance of the cluster
1184   * @param model data model valid
1185   * @return a pair of new centers
1186   * @throws Exception something in AlgVector goes wrong
1187   */
1188  protected Instances splitCenter(Random random,
1189                                Instance center,
1190                                double variance,
1191                                Instances model) throws Exception {
1192    m_NumSplits++;
1193    AlgVector r = null;
1194    Instances children = new Instances(model, 2);
1195
1196    if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) {
1197      Instance nextVector = getNextDebugVectorsInstance(model);
1198      PFD(D_RANDOMVECTOR, "Random Vector from File " + nextVector);
1199      r = new AlgVector(nextVector);
1200    }
1201    else {
1202      // random vector of length = variance
1203      r = new AlgVector(model, random);
1204    }
1205    r.changeLength(Math.pow(variance, 0.5));
1206    PFD(D_RANDOMVECTOR, "random vector *variance "+ r);
1207   
1208    // add random vector to center
1209    AlgVector c = new AlgVector(center);
1210    AlgVector c2 = (AlgVector) c.clone();
1211    c = c.add(r);
1212    Instance newCenter = c.getAsInstance(model, random);
1213    children.add(newCenter);
1214    PFD(D_FOLLOWSPLIT, "first child "+ newCenter);
1215   
1216    // subtract random vector from center
1217    c2 = c2.substract(r);
1218    newCenter = c2.getAsInstance(model, random);
1219    children.add(newCenter);
1220    PFD(D_FOLLOWSPLIT, "second child "+ newCenter);
1221
1222    return children;
1223  }
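  /*
   * Sketch (added for illustration, not part of the original source) of what
   * splitCenter() computes: given a center c and the cluster's estimated
   * variance sigma^2 = m_Mle[i] / numInst, a random direction r is scaled so
   * that |r| = sqrt(sigma^2), and the two children are
   *
   *   child1 = c + r
   *   child2 = c - r
   *
   * so the pair straddles the old center at a distance of one standard
   * deviation on either side.
   */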
1224
1225  /**
1226   * Split centers in their region.
1227   * (*Alternative version of splitCenter()*)
1228   *
1229   * @param random the random number generator
1230   * @param instances the instances of the region
1231   * @param model the model for the centers
1232   * (should be the same as that of instances)
1233   * @return a pair of new centers
1234   */
1235  protected Instances splitCenters(Random random,
1236                                 Instances instances,
1237                                 Instances model) {
1238    Instances children = new Instances(model, 2);
1239    int instIndex = Math.abs(random.nextInt()) % instances.numInstances();
1240    children.add(instances.instance(instIndex));
1241    int instIndex2 = instIndex;
1242    int count = 0;
1243    while ((instIndex2 == instIndex) && count < 10) {
1244      count++;
1245      instIndex2 = Math.abs(random.nextInt()) % instances.numInstances();
1246    }
1247    children.add(instances.instance(instIndex2));
1248   
1249    return children;
1250  }
1251
1252  /**
1253   * Generates new centers randomly. Used for starting centers.
1254   *
1255   * @param random0 random number generator
1256   * @param model data model of the instances
1257   * @param numClusters number of clusters
1258   * @return new centers
1259   */
1260  protected Instances makeCentersRandomly(Random random0,
1261                                        Instances model,
1262                                        int numClusters) {
1263    Instances clusterCenters = new Instances(model, numClusters);
1264    m_NumClusters = numClusters;
1265
1266    // makes the new centers randomly
1267    for (int i = 0; i < numClusters; i++) {
1268      int instIndex = Math.abs(random0.nextInt()) % m_Instances.numInstances();
1269      clusterCenters.add(m_Instances.instance(instIndex));
1270    }
1271    return clusterCenters;
1272  }
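  /*
   * Note (added for illustration, not part of the original source): the
   * starting centers are simply numClusters training instances drawn uniformly
   * at random (with replacement, so duplicates are possible), e.g. for
   * numClusters = 2 the centers could be m_Instances.instance(17) and
   * m_Instances.instance(42) (indices purely hypothetical).
   */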
1273
1274  /**
1275   * Returns the BIC-value for the given center and instances.
1276   * @param instList The indices of the instances that belong to the center
1277   * @param center the center.
1278   * @param mle maximum likelihood
1279   * @param model the data model
1280   * @return the BIC value
1281   */   
1282  protected double calculateBIC(int[] instList, Instance center,
1283                              double mle, Instances model) {
1284    int[][] w1 = new int[1][instList.length];
1285    for (int i = 0; i < instList.length; i++) {
1286      w1[0][i] = instList[i];
1287    }
1288    double[] m = {mle};
1289    Instances w2 = new Instances(model, 1);
1290    w2.add(center);
1291    return calculateBIC(w1, w2, m);
1292    }
1293 
1294  /**
1295   * Calculates the BIC for the given set of centers and instances.
1296   * @param instOfCent The instances that belong to their respective centers
1297   * @param centers the centers
1298   * @param mle maximum likelihood
1299   * @return The BIC for the input.
1300   */
1301  protected double calculateBIC(int[][] instOfCent, Instances centers,
1302                              double[] mle) {
1303    double loglike = 0.0;
1304    int numInstTotal = 0;
1305    int numCenters = centers.numInstances();
1306    int numDimensions = centers.numAttributes();
1307    int numParameters = (numCenters - 1) + //probabilities
1308      numCenters * numDimensions + //means
1309      numCenters; // variance params
1310    for (int i = 0; i < centers.numInstances(); i++) {
1311      loglike += logLikelihoodEstimate(instOfCent[i].length, centers.instance(i),
1312                                       mle[i], centers.numInstances() * 2);
1313      numInstTotal += instOfCent[i].length;
1314    }
1315    /* diff
1316       thats how we did it
1317    loglike -= ((centers.numAttributes() + 1.0) * centers.numInstances() * 1)
1318      * Math.log(count);
1319      */
1320    loglike -= numInstTotal * Math.log(numInstTotal);
1321    //System.out.println ("numInstTotal " + numInstTotal +
1322    //                    "calculateBIC res " + loglike);
1323    loglike -= (numParameters / 2.0) * Math.log(numInstTotal);
1324    //System.out.println ("numParam " +
1325    //                     + numParameters +
1326    //                  " calculateBIC res " + loglike);
1327    return loglike;
1328  }
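  /*
   * The score computed above, written out (added for readability, not part of
   * the original source). With K centers, M attributes, R = total number of
   * instances and R_i instances per center:
   *
   *   BIC = sum_i logLikelihoodEstimate(R_i, center_i, mle_i, 2K)
   *         - R * log(R)
   *         - (p / 2) * log(R)
   *
   * where p = (K - 1) + K * M + K is the number of free parameters
   * (mixing probabilities, means, and one variance per center).
   */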
1329 
1330  /**
1331   * Calculates the log-likelihood of the data for the given model, taken
1332   * at the maximum likelihood point.
1333   *
1334   * @param numInst number of instances that belong to the center
1335   * @param center the center
1336   * @param distortion distortion
1337   * @param numCent number of centers
1338   * @return the likelihood estimate
1339   */
1340  protected double logLikelihoodEstimate(int numInst, 
1341                                       Instance center, 
1342                                       double distortion, 
1343                                       int numCent) {
1344    // R(n) num of instances of the center -> numInst
1345    // K num of centers -> not used
1346    //
1347    //todo take the diff comments away
1348    double loglike = 0;
1349    /* if is new */
1350    if (numInst > 1) {
1351      /* diff variance is new */
1352      //
1353      // distortion = Sum over instances x of the center(x-center)
1354      // different to paper; sum should be squared
1355      //
1356      // (Sum of distances to center) / R(n) - 1.0
1357      // different to paper; should be R(n)-K
1358      double variance =  distortion / (numInst - 1.0); 
1359 
1360      //
1361      //  -R(n)/2 * log(pi*2)
1362      //
1363      double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0);
1364      /* diff
1365         thats how we had it
1366         double p2 = -((ni * center.numAttributes()) / 2) * distortion;
1367      */
1368      //
1369      // -(R(n)*M)/2 * log(variance)
1370      //
1371      double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance);
1372     
1373      /* diff
1374         thats how we had it, the difference is a bug in x-means
1375         double p3 = - (numInst - numCent) / 2;
1376      */
1377      //
1378      // -(R(n)-1)/2
1379      //
1380      double p3 = - (numInst - 1.0) / 2.0;
1381     
1382      //
1383      // R(n)*log(R(n))
1384      //
1385      double p4 = numInst * Math.log(numInst);
1386     
1387      /* diff x-means doesn't have this part
1388         double p5 = - numInst * Math.log(numInstTotal);
1389      */
1390     
1391      /*
1392        loglike = -(ni / 2) * Math.log(Math.PI * 2)
1393        - (ni * center.numAttributes()) / 2.0) * logdistortion
1394        - (ni - k) / 2.0
1395        + ni * Math.log(ni)
1396        - ni * Math.log(r);
1397      */
1398      loglike = p1 + p2 + p3 + p4; // diff + p5;
1399      //the log(r) is something that can be reused.
1400      //as is the log(2 PI), these could provide extra speed up later on.
1401      //since distortion is so expensive to compute, I only do that once.
1402    }
1403    return loglike;
1404  }
1405 
1406  /**
1407   * Calculates the maximum likelihood estimate for the variance.
1408   * @param instOfCent indices of instances to each center
1409   * @param centers the centers
1410   * @return the list of distortions.
1411   */
1412  protected double[] distortion(int[][] instOfCent, Instances centers) {
1413    double[] distortion = new double[centers.numInstances()];
1414    for (int i = 0; i < centers.numInstances(); i++) {
1415      distortion[i] = 0.0;
1416      for (int j = 0; j < instOfCent[i].length; j++) {
1417        distortion[i] += m_DistanceF.distance(m_Instances
1418            .instance(instOfCent[i][j]), centers.instance(i));
1419      }
1420    }
1421    /*
1422     * diff not done in x-means res *= 1.0 / (count - centers.numInstances());
1423     */
1424    return distortion;
1425  }
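  /*
   * In symbols (added for readability, not part of the original source):
   * distortion[i] = sum over all instances x assigned to center i of
   * distance(x, center_i) under m_DistanceF, i.e. the un-normalised
   * within-cluster scatter that is later divided by the cluster size to
   * estimate the variance.
   */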
1426 
1427  /**
1428   * Clusters an instance.
1429   *
1430   * @param instance
1431   *          the instance to assign a cluster to.
1432   * @param centers
1433   *          the centers to cluster the instance to.
1434   * @return a cluster index.
1435   */
1436  protected int clusterProcessedInstance(Instance instance, Instances centers) {
1437   
1438    double minDist = Integer.MAX_VALUE;
1439    int bestCluster = 0;
1440    for (int i = 0; i < centers.numInstances(); i++) {
1441      double dist = m_DistanceF.distance(instance, centers.instance(i));
1442
1443      if (dist < minDist) {
1444        minDist = dist;
1445        bestCluster = i;
1446      }
1447    }
1448
1449    return bestCluster;
1450  }
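  /*
   * Example (added for illustration, not part of the original source): with a
   * single numeric attribute, centers at 0.0 and 10.0 and an instance with
   * value 3.0, the instance is closer to the first center under the configured
   * distance function, so the method returns cluster index 0.
   */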
1451 
1452  /**
1453   * Clusters an instance that has been through the filters.
1454   *
1455   * @param instance
1456   *          the instance to assign a cluster to
1457   * @return a cluster number
1458   */
1459  protected int clusterProcessedInstance(Instance instance) {
1460    double minDist = Integer.MAX_VALUE;
1461    int bestCluster = 0;
1462    for (int i = 0; i < m_NumClusters; i++) {
1463      double dist = m_DistanceF
1464          .distance(instance, m_ClusterCenters.instance(i));
1465      if (dist < minDist) {
1466        minDist = dist;
1467        bestCluster = i;
1468      }
1469    }
1470    return bestCluster;
1471  }
1472
1473  /**
1474   * Classifies a given instance.
1475   *
1476   * @param instance the instance to be assigned to a cluster
1477   * @return the number of the assigned cluster as an integer
1478   * if the class is enumerated, otherwise the predicted value
1479   * @throws Exception if instance could not be classified
1480   * successfully
1481   */
1482  public int clusterInstance(Instance instance) throws Exception {
1483    m_ReplaceMissingFilter.input(instance);
1484    Instance inst = m_ReplaceMissingFilter.output();
1485
1486    return clusterProcessedInstance(inst);
1487  }
1488
1489
1490  /**
1491   * Returns the number of clusters.
1492   *
1493   * @return the number of clusters generated for a training dataset.
1494   */
1495  public int numberOfClusters() {
1496    return m_NumClusters;
1497  }
1498
1499
1500  /**
1501   * Returns an enumeration describing the available options.
1502   * @return an enumeration of all the available options
1503   **/
1504  public Enumeration listOptions() {
1505    Vector result = new Vector();
1506   
1507    result.addElement(new Option(
1508        "\tmaximum number of overall iterations\n"
1509        + "\t(default 1).", 
1510        "I", 1, "-I <num>"));
1511   
1512    result.addElement(new Option(
1513        "\tmaximum number of iterations in the kMeans loop in\n"
1514        + "\tthe Improve-Parameter part \n"
1515        + "\t(default 1000).", 
1516        "M", 1, "-M <num>"));
1517   
1518    result.addElement(new Option(
1519        "\tmaximum number of iterations in the kMeans loop\n"
1520        + "\tfor the splitted centroids in the Improve-Structure part \n"
1521        + "\t(default 1000).",
1522        "J", 1, "-J <num>"));
1523   
1524    result.addElement(new Option(
1525        "\tminimum number of clusters\n"
1526        + "\t(default 2).", 
1527        "L", 1, "-L <num>"));
1528   
1529    result.addElement(new Option(
1530        "\tmaximum number of clusters\n"
1531        + "\t(default 4).",
1532        "H", 1, "-H <num>"));
1533   
1534    result.addElement(new Option(
1535        "\tdistance value for binary attributes\n"
1536        + "\t(default 1.0).",
1537        "B", 1, "-B <value>"));
1538   
1539    result.addElement(new Option(
1540        "\tUses the KDTree internally\n"
1541        + "\t(default no).",
1542        "use-kdtree", 0, "-use-kdtree"));
1543   
1544    result.addElement(new Option(
1545        "\tFull class name of KDTree class to use, followed\n"
1546        + "\tby scheme options.\n"
1547        + "\teg: \"weka.core.neighboursearch.kdtrees.KDTree -P\"\n"
1548        + "\t(default no KDTree class used).",
1549        "K", 1, "-K <KDTree class specification>"));
1550   
1551    result.addElement(new Option(
1552        "\tcutoff factor, takes the given percentage of the splitted \n"
1553        + "\tcentroids if none of the children win\n"
1554        + "\t(default 0.0).",
1555        "C", 1, "-C <value>"));
1556   
1557    result.addElement(new Option(
1558        "\tFull class name of Distance function class to use, followed\n"
1559        + "\tby scheme options.\n" +
1560        "\t(default weka.core.EuclideanDistance).",
1561        "D", 1, "-D <distance function class specification>"));
1562   
1563    result.addElement(new Option(
1564        "\tfile to read starting centers from (ARFF format).",
1565        "N", 1, "-N <file name>"));
1566   
1567    result.addElement(new Option(
1568        "\tfile to write centers to (ARFF format).",
1569        "O", 1, "-O <file name>"));
1570   
1571    result.addElement(new Option(
1572        "\tThe debug level.\n"
1573        + "\t(default 0)",
1574        "U", 1, "-U <int>"));
1575   
1576    result.addElement(new Option(
1577        "\tThe debug vectors file.",
1578        "Y", 1, "-Y <file name>"));
1579   
1580    Enumeration en = super.listOptions();
1581    while (en.hasMoreElements())
1582      result.addElement(en.nextElement());
1583   
1584    return result.elements();
1585  }
1586
1587  /**
1588   * Returns the tip text for this property.
1589   * @return tip text for this property
1590   */
1591  public String minNumClustersTipText() {
1592    return "set minimum number of clusters";
1593  }
1594
1595  /**
1596   * Sets the minimum number of clusters to generate.
1597   *
1598   * @param n the minimum number of clusters to generate
1599   */
1600  public void setMinNumClusters(int n) {
1601    m_MinNumClusters = n;
1602  }
1603
1604  /**
1605   * Gets the minimum number of clusters to generate.
1606   * @return the minimum number of clusters to generate
1607   */
1608  public int getMinNumClusters() {
1609    return m_MinNumClusters;
1610  }
1611
1612  /**
1613   * Returns the tip text for this property.
1614   * @return tip text for this property
1615   */
1616  public String maxNumClustersTipText() {
1617    return "set maximum number of clusters";
1618  }
1619
1620  /**
1621   * Sets the maximum number of clusters to generate.
1622   * @param n the maximum number of clusters to generate
1623   */
1624  public void setMaxNumClusters(int n) {
1625    if (n >= m_MinNumClusters) {
1626      m_MaxNumClusters = n;
1627    }
1628  }
1629 
1630  /**
1631   * Gets the maximum number of clusters to generate.
1632   * @return the maximum number of clusters to generate
1633   */
1634  public int getMaxNumClusters() {
1635    return m_MaxNumClusters;
1636  }
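
  /*
   * A configuration sketch for the cluster-number bounds (the -L and -H
   * options): setMaxNumClusters() silently ignores values below the
   * current minimum, so the minimum is set first. The values 3 and 12
   * are placeholders.
   *
   *   XMeans xmeans = new XMeans();
   *   xmeans.setMinNumClusters(3);    // lower bound of the search
   *   xmeans.setMaxNumClusters(12);   // upper bound of the search
   */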
1637
1638  /**
1639   * Returns the tip text for this property.
1640   * @return tip text for this property
1641   */
1642  public String maxIterationsTipText() {
1643    return "the maximum number of iterations to perform";
1644  }
1645
1646  /**
1647   * Sets the maximum number of iterations to perform.
1648   * @param i the number of iterations
1649   * @throws Exception if i is less than 1
1650   */
1651  public void setMaxIterations(int i) throws Exception {
1652    if (i < 1)
1653      throw new Exception("Only positive values for iteration number" +
1654                           " allowed (Option I)."); 
1655    m_MaxIterations = i;
1656  }
1657
1658  /**
1659   * Gets the maximum number of iterations.
1660   * @return the number of iterations
1661   */
1662  public int getMaxIterations() {
1663    return  m_MaxIterations;
1664  }
1665
1666  /**
1667   * Returns the tip text for this property.
1668   * @return tip text for this property
1669   */
1670  public String maxKMeansTipText() {
1671    return "the maximum number of iterations to perform in KMeans";
1672  }
1673
1674  /**
1675   * Set the maximum number of iterations to perform in KMeans.
1676   * @param i the number of iterations
1677   */
1678  public void setMaxKMeans(int i) {
1679    m_MaxKMeans = i;
1680    m_MaxKMeansForChildren = i;
1681  }
1682
1683  /**
1684   * Gets the maximum number of iterations in KMeans.
1685   * @return the number of iterations
1686   */
1687  public int getMaxKMeans() {
1688    return  m_MaxKMeans;
1689  }
1690
1691  /**
1692   * Returns the tip text for this property.
1693   * @return tip text for this property
1694   */
1695  public String maxKMeansForChildrenTipText() {
1696    return "the maximum number of KMeans iterations performed on the child centers";
1697  }
1698
1699  /**
1700   * Sets the maximum number of KMeans iterations performed
1701   * on the child centers.
1702   * @param i the number of iterations
1703   */
1704  public void setMaxKMeansForChildren(int i) {
1705    m_MaxKMeansForChildren = i;
1706  }
1707
1708  /**
1709   * Gets the maximum number of KMeans iterations performed on the child centers.
1710   * @return the number of iterations
1711   */
1712  public int getMaxKMeansForChildren() {
1713    return  m_MaxKMeansForChildren;
1714  }
1715
1716  /**
1717   * Returns the tip text for this property.
1718   * @return tip text for this property
1719   */
1720  public String cutOffFactorTipText() {
1721    return "the cut-off factor to use";
1722  }
1723
1724  /**
1725   * Sets a new cutoff factor.
1726   * @param i the new cutoff factor
1727   */
1728  public void setCutOffFactor(double i) {
1729    m_CutOffFactor = i;
1730  }
1731
1732  /**
1733   * Gets the cutoff factor.
1734   * @return the cutoff factor
1735   */
1736  public double getCutOffFactor() {
1737    return  m_CutOffFactor;
1738  }
1739
1740  /**
1741   * Returns the tip text for this property.
1742   * @return tip text for this property suitable for
1743   * displaying in the explorer/experimenter gui
1744   */
1745  public String binValueTipText() {
1746    return "Set the value that represents true in the new attributes.";
1747  }
1748 
1749  /**
1750   * Gets the value that represents true in a new numeric attribute.
1751   * (False is always represented by 0.0.)
1752   * @return the value that represents true in a new numeric attribute
1753   */
1754  public double getBinValue() {
1755    return m_BinValue;
1756  }
1757
1758  /**
1759   * Sets the distance value between true and false of binary attributes,
1760   * and between "same" and "different" of nominal attributes.
1761   * @param value the distance
1762   */
1763  public void setBinValue(double value) {
1764    m_BinValue = value;
1765  }
1766
1767  /**
1768   * Returns the tip text for this property.
1769   *
1770   * @return            tip text for this property suitable for
1771   *                    displaying in the explorer/experimenter gui
1772   */
1773  public String distanceFTipText() {
1774    return "The distance function to use.";
1775  }
1776
1777  /**
1778   * Sets the distance function to use.
1779   * @param distanceF the distance function with all options set
1780   */
1781  public void setDistanceF(DistanceFunction distanceF) {
1782    m_DistanceF = distanceF;
1783  }
1784
1785  /**
1786   * Gets the distance function.
1787   * @return the distance function
1788   */
1789  public DistanceFunction getDistanceF() {
1790    return m_DistanceF;
1791  }
1792
1793  /**
1794   * Gets the distance function specification string, which contains the
1795   * class name of the distance function class and any options to it.
1796   *
1797   * @return the distance function specification string
1798   */
1799  protected String getDistanceFSpec() {
1800   
1801    DistanceFunction d = getDistanceF();
1802    if (d instanceof OptionHandler) {
1803      return d.getClass().getName() + " "
1804        + Utils.joinOptions(((OptionHandler) d).getOptions());
1805    }
1806    return d.getClass().getName();
1807  }
1808
1809  /**
1810   * Returns the tip text for this property.
1811   *
1812   * @return            tip text for this property suitable for
1813   *                    displaying in the explorer/experimenter gui
1814   */
1815  public String debugVectorsFileTipText() {
1816    return "The file containing the debug vectors (only for debugging!).";
1817  }
1818 
1819  /**
1820   * Sets the file that has the random vectors stored.
1821   * Only used for debugging reasons.
1822   * @param value the file to read the random vectors from
1823   */
1824  public void setDebugVectorsFile(File value) {
1825    m_DebugVectorsFile = value;
1826  }
1827
1828  /**
1829   * Gets the file name for a file that has the random vectors stored.
1830   * Only used for debugging purposes.
1831   * @return the file to read the vectors from
1832   */
1833  public File getDebugVectorsFile() {
1834    return m_DebugVectorsFile;
1835  }
1836 
1837  /**
1838   * Initialises the debug vector input.
1839   * @throws Exception if there is an error
1840   * opening the debug input file.
1841   */
1842  public void initDebugVectorsInput() throws Exception {
1843    m_DebugVectorsInput = 
1844      new BufferedReader(new FileReader(m_DebugVectorsFile));
1845    m_DebugVectors = new Instances(m_DebugVectorsInput);
1846    m_DebugVectorsIndex = 0;
1847  }
1848
1849  /**
1850   * Read an instance from debug vectors file.
1851   * @param model the data model for the instance.
1852   * @throws Exception if there are no more debug vectors
1853   * left in m_DebugVectors.
1854   * @return the next debug vector.
1855   */
1856  public Instance getNextDebugVectorsInstance(Instances model) 
1857    throws Exception {
1858    if (m_DebugVectorsIndex >= m_DebugVectors.numInstances())
1859      throw new Exception("no more prefabricated Vectors");
1860    Instance nex = m_DebugVectors.instance(m_DebugVectorsIndex);
1861    nex.setDataset(model);
1862    m_DebugVectorsIndex++;
1863    return nex;
1864  }
1865
1866  /**
1867   * Returns the tip text for this property.
1868   *
1869   * @return            tip text for this property suitable for
1870   *                    displaying in the explorer/experimenter gui
1871   */
1872  public String inputCenterFileTipText() {
1873    return "The file to read the list of centers from.";
1874  }
1875
1876  /**
1877   * Sets the file to read the list of centers from.
1878   *
1879   * @param value the file to read centers from
1880   */
1881  public void setInputCenterFile(File value) {
1882    m_InputCenterFile = value;
1883  }
1884 
1885  /**
1886   * Gets the file to read the list of centers from.
1887   *
1888   * @return the file to read the centers from
1889   */
1890  public File getInputCenterFile() {
1891    return m_InputCenterFile;
1892  }
1893
1894  /**
1895   * Returns the tip text for this property.
1896   *
1897   * @return            tip text for this property suitable for
1898   *                    displaying in the explorer/experimenter gui
1899   */
1900  public String outputCenterFileTipText() {
1901    return "The file to write the list of centers to.";
1902  }
1903   
1904  /**
1905   * Sets file to write the list of centers to.
1906   *
1907   * @param value file to write centers to
1908   */
1909  public void setOutputCenterFile(File value) {
1910    m_OutputCenterFile = value;
1911  }
1912
1913  /**
1914   * Gets the file to write the list of centers to.
1915   *
1916   * @return filename of the file to write centers to
1917   */
1918  public File getOutputCenterFile() {
1919    return m_OutputCenterFile;
1920  }
1921
1922  /**
1923   * Returns the tip text for this property.
1924   *
1925   * @return            tip text for this property suitable for
1926   *                    displaying in the explorer/experimenter gui
1927   */
1928  public String KDTreeTipText() {
1929    return "The KDTree to use.";
1930  }
1931   
1932  /**
1933   * Sets the KDTree class.
1934   * @param k a KDTree object with all options set
1935   */
1936  public void setKDTree(KDTree k) {
1937    m_KDTree = k;
1938  }
1939
1940  /**
1941   * Gets the KDTree class.
1942   *
1943   * @return the configured KDTree
1944   */
1945  public KDTree getKDTree() {
1946    return m_KDTree;
1947  }
1948
1949  /**
1950   * Returns the tip text for this property.
1951   *
1952   * @return            tip text for this property suitable for
1953   *                    displaying in the explorer/experimenter gui
1954   */
1955  public String useKDTreeTipText() {
1956    return "Whether to use the KDTree.";
1957  }
1958   
1959  /**
1960   * Sets whether to use the KDTree or not.
1961   *
1962   * @param value       if true the KDTree is used
1963   */
1964  public void setUseKDTree(boolean value) {
1965    m_UseKDTree = value;
1966  }
1967
1968  /**
1969   * Gets whether the KDTree is used or not.
1970   *
1971   * @return            true if KDTrees are used
1972   */
1973  public boolean getUseKDTree() {
1974    return m_UseKDTree;
1975  }
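
  /*
   * A sketch of enabling the KDTree-based search: a default tree is used
   * here, but a pre-configured weka.core.neighboursearch.KDTree instance
   * could be passed to setKDTree() as well (this mirrors the -use-kdtree
   * and -K options).
   *
   *   XMeans xmeans = new XMeans();
   *   xmeans.setUseKDTree(true);
   *   xmeans.setKDTree(new KDTree());
   */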
1976
1977  /**
1978   * Gets the KDTree specification string, which contains the class name of
1979   * the KDTree class and any options to the KDTree.
1980   *
1981   * @return the KDTree string.
1982   */
1983  protected String getKDTreeSpec() {
1984   
1985    KDTree c = getKDTree();
1986    if (c instanceof OptionHandler) {
1987      return c.getClass().getName() + " "
1988        + Utils.joinOptions(((OptionHandler)c).getOptions());
1989    }
1990    return c.getClass().getName();
1991  }
1992
1993  /**
1994   * Returns the tip text for this property.
1995   *
1996   * @return            tip text for this property suitable for
1997   *                    displaying in the explorer/experimenter gui
1998   */
1999  public String debugLevelTipText() {
2000    return "The debug level to use.";
2001  }
2002
2003  /**
2004   * Sets the debug level.
2005   * A debug level of 0 means no output.
2006   * @param d the debug level
2007   */
2008  public void setDebugLevel(int d) {
2009    m_DebugLevel = d;
2010  }
2011
2012  /**
2013   * Gets the debug level.
2014   * @return debug level
2015   */
2016  public int getDebugLevel() {
2017    return m_DebugLevel;
2018  }
2019
2020  /**
2021   * Checks the instances.
2022   * Currently no checks are performed; the distance function check is disabled.
2023   */
2024  protected void checkInstances () {
2025   
2026   // m_DistanceF.checkInstances();
2027  }
2028 
2029  /**
2030   * Parses a given list of options. <p/>
2031   *
2032   <!-- options-start -->
2033   * Valid options are: <p/>
2034   *
2035   * <pre> -I &lt;num&gt;
2036   *  maximum number of overall iterations
2037   *  (default 1).</pre>
2038   *
2039   * <pre> -M &lt;num&gt;
2040   *  maximum number of iterations in the kMeans loop in
2041   *  the Improve-Parameter part
2042   *  (default 1000).</pre>
2043   *
2044   * <pre> -J &lt;num&gt;
2045   *  maximum number of iterations in the kMeans loop
2046   *  for the split centroids in the Improve-Structure part
2047   *  (default 1000).</pre>
2048   *
2049   * <pre> -L &lt;num&gt;
2050   *  minimum number of clusters
2051   *  (default 2).</pre>
2052   *
2053   * <pre> -H &lt;num&gt;
2054   *  maximum number of clusters
2055   *  (default 4).</pre>
2056   *
2057   * <pre> -B &lt;value&gt;
2058   *  distance value for binary attributes
2059   *  (default 1.0).</pre>
2060   *
2061   * <pre> -use-kdtree
2062   *  Uses the KDTree internally
2063   *  (default no).</pre>
2064   *
2065   * <pre> -K &lt;KDTree class specification&gt;
2066   *  Full class name of KDTree class to use, followed
2067   *  by scheme options.
2068   *  eg: "weka.core.neighboursearch.KDTree -P"
2069   *  (default no KDTree class used).</pre>
2070   *
2071   * <pre> -C &lt;value&gt;
2072   *  cutoff factor, takes the given percentage of the split
2073   *  centroids if none of the children win
2074   *  (default 0.0).</pre>
2075   *
2076   * <pre> -D &lt;distance function class specification&gt;
2077   *  Full class name of Distance function class to use, followed
2078   *  by scheme options.
2079   *  (default weka.core.EuclideanDistance).</pre>
2080   *
2081   * <pre> -N &lt;file name&gt;
2082   *  file to read starting centers from (ARFF format).</pre>
2083   *
2084   * <pre> -O &lt;file name&gt;
2085   *  file to write centers to (ARFF format).</pre>
2086   *
2087   * <pre> -U &lt;int&gt;
2088   *  The debug level.
2089   *  (default 0)</pre>
2090   *
2091   * <pre> -Y &lt;file name&gt;
2092   *  The debug vectors file.</pre>
2093   *
2094   * <pre> -S &lt;num&gt;
2095   *  Random number seed.
2096   *  (default 10)</pre>
2097   *
2098   <!-- options-end -->
2099   *
2100   * @param options the list of options as an array of strings
2101   * @throws Exception if an option is not supported
2102   */
2103  public void setOptions(String[] options)
2104    throws Exception {
2105   
2106    String      optionString;
2107    String      funcString;
2108
2109    optionString = Utils.getOption('I', options);
2110    if (optionString.length() != 0)
2111      setMaxIterations(Integer.parseInt(optionString));
2112    else
2113      setMaxIterations(1);
2114   
2115    optionString = Utils.getOption('M', options);
2116    if (optionString.length() != 0)
2117      setMaxKMeans(Integer.parseInt(optionString));
2118    else
2119      setMaxKMeans(1000);
2120   
2121    optionString = Utils.getOption('J', options);
2122    if (optionString.length() != 0)
2123      setMaxKMeansForChildren(Integer.parseInt(optionString));
2124    else
2125      setMaxKMeansForChildren(1000);
2126     
2127    optionString = Utils.getOption('L', options);
2128    if (optionString.length() != 0)
2129      setMinNumClusters(Integer.parseInt(optionString));
2130    else
2131      setMinNumClusters(2);
2132     
2133    optionString = Utils.getOption('H', options);
2134    if (optionString.length() != 0)
2135      setMaxNumClusters(Integer.parseInt(optionString));
2136    else
2137      setMaxNumClusters(4);
2138   
2139    optionString = Utils.getOption('B', options);
2140    if (optionString.length() != 0)
2141      setBinValue(Double.parseDouble(optionString));
2142    else
2143      setBinValue(1.0);
2144
2145    setUseKDTree(Utils.getFlag("use-kdtree", options));
2146   
2147    if (getUseKDTree()) {
2148      funcString = Utils.getOption('K', options);
2149      if (funcString.length() != 0) {
2150        String[] funcSpec = Utils.splitOptions(funcString);
2151        if (funcSpec.length == 0) {
2152          throw new Exception("Invalid function specification string");
2153        }
2154        String funcName = funcSpec[0];
2155        funcSpec[0] = "";
2156        setKDTree((KDTree) Utils.forName(KDTree.class, funcName, funcSpec));
2157      }
2158      else {
2159        setKDTree(new KDTree());
2160      }
2161    }
2162    else {
2163      setKDTree(new KDTree());
2164    }
2165
2166    optionString = Utils.getOption('C', options);
2167    if (optionString.length() != 0)
2168      setCutOffFactor(Double.parseDouble(optionString));
2169    else
2170      setCutOffFactor(0.0);
2171   
2172    funcString = Utils.getOption('D', options);
2173    if (funcString.length() != 0) {
2174      String[] funcSpec = Utils.splitOptions(funcString);
2175      if (funcSpec.length == 0) {
2176        throw new Exception("Invalid function specification string");
2177      }
2178      String funcName = funcSpec[0];
2179      funcSpec[0] = "";
2180      setDistanceF((DistanceFunction) Utils.forName(DistanceFunction.class,
2181                                                    funcName, funcSpec));
2182    }
2183    else {
2184      setDistanceF(new EuclideanDistance());
2185    }
2186
2187    optionString  = Utils.getOption('N', options);
2188    if (optionString.length() != 0) {
2189      setInputCenterFile(new File(optionString));
2190      m_CenterInput = 
2191        new BufferedReader(new FileReader(optionString));
2192    }
2193    else {
2194      setInputCenterFile(new File(System.getProperty("user.dir")));
2195      m_CenterInput = null;
2196    }
2197
2198    optionString  = Utils.getOption('O', options);
2199    if (optionString.length() != 0) {
2200      setOutputCenterFile(new File(optionString));
2201      m_CenterOutput = new PrintWriter(new FileOutputStream(optionString));
2202    }
2203    else {
2204      setOutputCenterFile(new File(System.getProperty("user.dir")));
2205      m_CenterOutput = null;
2206    }
2207
2208    optionString = Utils.getOption('U', options);
2209    int debugLevel = 0;
2210    if (optionString.length() != 0) {
2211      try {
2212        debugLevel = Integer.parseInt(optionString);
2213      } catch (NumberFormatException e) {
2214        throw new Exception(optionString +
2215                            " is an illegal value for option -U");
2216      }
2217    }
2218    setDebugLevel(debugLevel);
2219
2220    optionString  = Utils.getOption('Y', options);
2221    if (optionString.length() != 0) {
2222      setDebugVectorsFile(new File(optionString));
2223    }
2224    else {
2225      setDebugVectorsFile(new File(System.getProperty("user.dir")));
2226      m_DebugVectorsInput = null;
2227      m_DebugVectors      = null;
2228    }
2229   
2230    super.setOptions(options);
2231  }
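
  /*
   * A sketch of configuring the clusterer from an option string, as the
   * command line would; Utils.splitOptions() turns the string into the
   * array expected by setOptions(). The option values are placeholders.
   *
   *   XMeans xmeans = new XMeans();
   *   xmeans.setOptions(Utils.splitOptions("-L 2 -H 8 -use-kdtree"));
   */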
2232 
2233  /**
2234   * Gets the current settings of XMeans.
2235   * @return an array of strings suitable for passing to setOptions
2236   */
2237  public String[] getOptions() {
2238    int         i;
2239    Vector      result;
2240    String[]    options;
2241
2242    result = new Vector();
2243
2244    result.add("-I");
2245    result.add("" + getMaxIterations());
2246   
2247    result.add("-M");
2248    result.add("" + getMaxKMeans());
2249   
2250    result.add("-J");
2251    result.add("" + getMaxKMeansForChildren());
2252   
2253    result.add("-L");
2254    result.add("" + getMinNumClusters());
2255   
2256    result.add("-H");
2257    result.add("" + getMaxNumClusters());
2258   
2259    result.add("-B");
2260    result.add("" + getBinValue());
2261   
2262    if (getUseKDTree()) {
2263      result.add("-use-kdtree");
2264      result.add("-K");
2265      result.add("" + getKDTreeSpec());
2266    }
2267   
2268    result.add("-C");
2269    result.add("" + getCutOffFactor());
2270   
2271    if (getDistanceF() != null) {
2272      result.add("-D");
2273      result.add("" + getDistanceFSpec());
2274    }
2275   
2276    if (getInputCenterFile().exists() && getInputCenterFile().isFile()) {
2277      result.add("-N");
2278      result.add("" + getInputCenterFile());
2279    }
2280   
2281    if (getOutputCenterFile().exists() && getOutputCenterFile().isFile()) {
2282      result.add("-O");
2283      result.add("" + getOutputCenterFile());
2284    }
2285   
2286    int dL = getDebugLevel();
2287    if (dL > 0) {
2288      result.add("-U");
2289      result.add("" + getDebugLevel());
2290    }
2291
2292    if (getDebugVectorsFile().exists() && getDebugVectorsFile().isFile()) {
2293      result.add("-Y");
2294      result.add("" + getDebugVectorsFile());
2295    }
2296   
2297    options = super.getOptions();
2298    for (i = 0; i < options.length; i++)
2299      result.add(options[i]);
2300
2301    return (String[]) result.toArray(new String[result.size()]);         
2302  }
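
  /*
   * Since getOptions() returns an array that setOptions() accepts, the
   * current configuration can be copied to a fresh clusterer; "xmeans"
   * below stands for an already configured instance.
   *
   *   XMeans copy = new XMeans();
   *   copy.setOptions(xmeans.getOptions());
   */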
2303
2304  /**
2305   * Return a string describing this clusterer.
2306   * @return a description of the clusterer as a string
2307   */
2308  public String toString() {
2309    StringBuffer temp = new StringBuffer();
2310
2311    temp.append("\nXMeans\n======\n");
2312
2313    temp.append("Requested iterations            : " + m_MaxIterations + "\n");
2314    temp.append("Iterations performed            : " + m_IterationCount+ "\n");
2315    if (m_KMeansStopped > 0) {
2316      temp.append("kMeans did not converge\n");
2317      temp.append("  but was stopped by max-loops " 
2318          + m_KMeansStopped + " times (max kMeans-iter)\n");
2319    }
2320    temp.append("Splits prepared                 : " + m_NumSplits + "\n");
2321    temp.append("Splits performed                : " + m_NumSplitsDone + "\n");
2322    temp.append("Cutoff factor                   : " + m_CutOffFactor + "\n");
2323    double perc;
2324    if (m_NumSplitsDone > 0)
2325      perc = (((double)m_NumSplitsStillDone)/((double) m_NumSplitsDone))
2326             * 100.0;
2327    else
2328      perc = 0.0;
2329    temp.append("Percentage of splits accepted \n" +
2330                "by cutoff factor                : " 
2331                + Utils.doubleToString(perc,2) + " %\n");
2332    temp.append("------\n");
2333
2336    temp.append("\nCluster centers                 : " + m_NumClusters + " centers\n");
2337    for (int i = 0; i < m_NumClusters; i++) {
2338      temp.append("\nCluster "+i+"\n           ");
2339      for (int j = 0; j < m_ClusterCenters.numAttributes(); j++) {
2340        if (m_ClusterCenters.attribute(j).isNominal()) {
2341          temp.append(" "+m_ClusterCenters.attribute(j).
2342                      value((int)m_ClusterCenters.instance(i).value(j)));
2343        } else {
2344          temp.append(" "+m_ClusterCenters.instance(i).value(j));
2345        }
2346      }
2347    }
2348    if (m_Mle != null)
2349      temp.append("\n\nDistortion: " + 
2350                  Utils.doubleToString(Utils.sum(m_Mle),6) + "\n");
2351    temp.append("BIC-Value : " + Utils.doubleToString(m_Bic,6) + "\n");
2352    return temp.toString();
2353  }
2354
2355  /**
2356   * Print centers for debug.
2357   * @param debugLevel the debug level at which the centers are printed
2358   */
2359  protected void PrCentersFD(int debugLevel) {
2360    if (debugLevel == m_DebugLevel) {
2361      for (int i = 0; i < m_ClusterCenters.numInstances(); i++) {
2362        System.out.println(m_ClusterCenters.instance(i));
2363      }
2364    }
2365  }
2366
2367  /**
2368   * Tests whether the given debug level is active.
2369   * @param debugLevel the debug level to test against
2370   * @return true if the given level equals the current debug level
2371   */
2372  protected boolean TFD(int debugLevel) {
2373    return (debugLevel == m_DebugLevel);
2374  }
2375
2376  /**
2377   * Does debug printouts.
2378   * @param debugLevel the debug level at which the output is printed
2379   * @param output string that is printed
2380   */
2381  protected void PFD(int debugLevel, String output) {
2382    if (debugLevel == m_DebugLevel)
2383      System.out.println(output);
2384  }
2385  /**
2386   * Does debug printouts.
2387   * @param output string that is printed
2388   */
2389  protected void PFD_CURR(String output) {
2390    if (m_CurrDebugFlag)
2391      System.out.println(output);
2392  }
2393 
2394  /**
2395   * Returns the revision string.
2396   *
2397   * @return            the revision
2398   */
2399  public String getRevision() {
2400    return RevisionUtils.extract("$Revision: 5488 $");
2401  }
2402
2403  /**
2404   * Main method for testing this class.
2405   * @param argv should contain options
2406   */
2407  public static void main(String[] argv) {
2408    runClusterer(new XMeans(), argv);
2409  }
2410}