source: branches/MetisMQI/src/main/java/weka/clusterers/XMeans.java @ 38

Last change on this file since 38 was 29, checked in by gnappo, 14 years ago

Taggata versione per la demo e aggiunto branch.

File size: 69.5 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    XMeans.java
19 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.clusterers;
24
25import weka.core.AlgVector;
26import weka.core.Capabilities;
27import weka.core.DistanceFunction;
28import weka.core.EuclideanDistance;
29import weka.core.Instance;
30import weka.core.Instances;
31import weka.core.RevisionUtils;
32import weka.core.neighboursearch.KDTree;
33import weka.core.Option;
34import weka.core.OptionHandler;
35import weka.core.TechnicalInformation;
36import weka.core.TechnicalInformationHandler;
37import weka.core.Utils;
38import weka.core.Capabilities.Capability;
39import weka.core.TechnicalInformation.Field;
40import weka.core.TechnicalInformation.Type;
41import weka.filters.Filter;
42import weka.filters.unsupervised.attribute.ReplaceMissingValues;
43
44import java.io.BufferedReader;
45import java.io.File;
46import java.io.FileOutputStream;
47import java.io.FileReader;
48import java.io.PrintWriter;
49import java.io.Reader;
50import java.util.Enumeration;
51import java.util.Random;
52import java.util.Vector;
53
54/**
55 <!-- globalinfo-start -->
56 * Cluster data using the X-means algorithm.<br/>
57 * <br/>
58 * X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. The decision between the children of each center and itself is done comparing the BIC-values of the two structures.<br/>
59 * <br/>
60 * For more information see:<br/>
61 * <br/>
62 * Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth International Conference on Machine Learning, 727-734, 2000.
63 * <p/>
64 <!-- globalinfo-end -->
65 *
66 <!-- technical-bibtex-start -->
67 * BibTeX:
68 * <pre>
69 * &#64;inproceedings{Pelleg2000,
70 *    author = {Dan Pelleg and Andrew W. Moore},
71 *    booktitle = {Seventeenth International Conference on Machine Learning},
72 *    pages = {727-734},
73 *    publisher = {Morgan Kaufmann},
74 *    title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters},
75 *    year = {2000}
76 * }
77 * </pre>
78 * <p/>
79 <!-- technical-bibtex-end -->
80 *
81 <!-- options-start -->
82 * Valid options are: <p/>
83 *
84 * <pre> -I &lt;num&gt;
85 *  maximum number of overall iterations
86 *  (default 1).</pre>
87 *
88 * <pre> -M &lt;num&gt;
89 *  maximum number of iterations in the kMeans loop in
90 *  the Improve-Parameter part
91 *  (default 1000).</pre>
92 *
93 * <pre> -J &lt;num&gt;
94 *  maximum number of iterations in the kMeans loop
95 *  for the splitted centroids in the Improve-Structure part
96 *  (default 1000).</pre>
97 *
98 * <pre> -L &lt;num&gt;
99 *  minimum number of clusters
100 *  (default 2).</pre>
101 *
102 * <pre> -H &lt;num&gt;
103 *  maximum number of clusters
104 *  (default 4).</pre>
105 *
106 * <pre> -B &lt;value&gt;
107 *  distance value for binary attributes
108 *  (default 1.0).</pre>
109 *
110 * <pre> -use-kdtree
111 *  Uses the KDTree internally
112 *  (default no).</pre>
113 *
114 * <pre> -K &lt;KDTree class specification&gt;
115 *  Full class name of KDTree class to use, followed
116 *  by scheme options.
117 *  eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
118 *  (default no KDTree class used).</pre>
119 *
120 * <pre> -C &lt;value&gt;
121 *  cutoff factor, takes the given percentage of the splitted
122 *  centroids if none of the children win
123 *  (default 0.0).</pre>
124 *
125 * <pre> -D &lt;distance function class specification&gt;
126 *  Full class name of Distance function class to use, followed
127 *  by scheme options.
128 *  (default weka.core.EuclideanDistance).</pre>
129 *
130 * <pre> -N &lt;file name&gt;
131 *  file to read starting centers from (ARFF format).</pre>
132 *
133 * <pre> -O &lt;file name&gt;
134 *  file to write centers to (ARFF format).</pre>
135 *
136 * <pre> -U &lt;int&gt;
137 *  The debug level.
138 *  (default 0)</pre>
139 *
140 * <pre> -Y &lt;file name&gt;
141 *  The debug vectors file.</pre>
142 *
143 * <pre> -S &lt;num&gt;
144 *  Random number seed.
145 *  (default 10)</pre>
146 *
147 <!-- options-end -->
148 *
149 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
150 * @author Mark Hall (mhall@cs.waikato.ac.nz)
151 * @author Malcolm Ware (mfw4@cs.waikato.ac.nz)
152 * @version $Revision: 5488 $
153 * @see RandomizableClusterer
154 */
155public class XMeans 
156  extends RandomizableClusterer
157  implements TechnicalInformationHandler {
158
159  /*
160   * major TODOS:
161   *
162   * make BIC-Score replaceable by other scores
163   */
164
165  /** for serialization. */
166  private static final long serialVersionUID = -7941793078404132616L;
167 
168  /** training instances. */
169  protected Instances m_Instances = null;
170
171  /** model information, should increase readability. */
172  protected Instances m_Model = null;
173 
174  /** replace missing values in training instances. */
175  protected ReplaceMissingValues m_ReplaceMissingFilter;
176
177  /**
178   * Distance value between true and false of binary attributes and
179   * "same" and "different" of nominal attributes (default = 1.0).
180   */
181  protected double m_BinValue = 1.0;
182
183  /** BIC-Score of the current model. */
184  protected double m_Bic = Double.MIN_VALUE;
185
186  /** Distortion.  */
187  protected double[] m_Mle = null;
188
189  /** maximum overall iterations. */
190  protected int m_MaxIterations = 1;
191
192  /**
193   * maximum iterations to perform Kmeans part
194   * if negative, iterations are not checked.
195   */
196  protected int m_MaxKMeans = 1000;
197
198  /** see above, but for kMeans of splitted clusters.
199   */
200  protected int m_MaxKMeansForChildren = 1000;
201
202  /** The actual number of clusters. */
203  protected int m_NumClusters = 2;
204
205  /** min number of clusters to generate. */
206  protected int m_MinNumClusters = 2;
207
208  /** max number of clusters to generate. */
209  protected int m_MaxNumClusters = 4;
210
211  /** the distance function used. */
212  protected DistanceFunction m_DistanceF = new EuclideanDistance();
213
214  /** cluster centers. */
215  protected Instances m_ClusterCenters;
216
217  /** file name of the output file for the cluster centers. */
218  protected File m_InputCenterFile = new File(System.getProperty("user.dir"));
219
220  /* --> DebugVectors - USED FOR DEBUGGING */
221  /** input file for the random vectors --> USED FOR DEBUGGING. */
222  protected Reader m_DebugVectorsInput = null;
223  /** the index for the current debug vector. */
224  protected int m_DebugVectorsIndex = 0;
225  /** all the debug vectors. */
226  protected Instances m_DebugVectors = null;
227
228  /** file name of the input file for the random vectors. */
229  protected File m_DebugVectorsFile = new File(System.getProperty("user.dir"));
230
231  /** input file for the cluster centers. */
232  protected Reader m_CenterInput = null;
233   
234  /** file name of the output file for the cluster centers. */
235  protected File m_OutputCenterFile = new File(System.getProperty("user.dir"));
236 
237  /** output file for the cluster centers. */
238  protected PrintWriter m_CenterOutput = null;
239   
240  /**
241   * temporary variable holding cluster assignments while iterating.
242   */
243  protected int[] m_ClusterAssignments;
244
245  /** cutoff factor - percentage of splits done in Improve-Structure part
246     only relevant, if all children lost. */ 
247  protected double m_CutOffFactor = 0.5;
248
249  /** Index in ranges for LOW. */
250  public static int R_LOW = 0;
251  /** Index in ranges for HIGH. */
252  public static int R_HIGH = 1;
253  /** Index in ranges for WIDTH. */
254  public static int R_WIDTH = 2;
255
256  /**
257   * KDTrees class if KDTrees are used.
258   */
259  protected KDTree m_KDTree = new KDTree();
260 
261  /** whether to use the KDTree (the KDTree is only initialized to be
262   * configurable from the GUI). */
263  protected boolean m_UseKDTree = false;
264
265  /** counts iterations done in main loop. */
266  protected int m_IterationCount = 0;
267
268  /** counter to say how often kMeans was stopped by loop counter. */
269  protected int m_KMeansStopped = 0;
270
271  /** Number of splits prepared. */
272  protected int m_NumSplits = 0;
273
274  /** Number of splits accepted (including cutoff factor decisions). */
275  protected int m_NumSplitsDone = 0;
276
277  /** Number of splits accepted just because of cutoff factor. */
278  protected int m_NumSplitsStillDone = 0;
279
280  /**
281   * level of debug output, 0 is no output.
282   */
283  protected int m_DebugLevel = 0;
284 
285  /** print the centers. */
286  public static int D_PRINTCENTERS = 1;
287  /** follows the splitting of the centers. */
288  public static int D_FOLLOWSPLIT = 2;
289  /** have a closer look at converge children. */
290  public static int D_CONVCHCLOSER = 3;
291  /** check on random vectors. */
292  public static int D_RANDOMVECTOR = 4;
293  /** check on kdtree. */
294  public static int D_KDTREE = 5;
295  /** follow iterations. */
296  public static int D_ITERCOUNT = 6;
297  /** functions were maybe misused.  */
298  public static int D_METH_MISUSE = 80; 
299  /** for current debug.  */
300  public static int D_CURR = 88;
301  /** general debugging. */
302  public static int D_GENERAL = 99;
303
304  /** Flag: I'm debugging. */
305  public boolean m_CurrDebugFlag = true;
306
307  /**
308   * the default constructor.
309   */
310  public XMeans() {
311    super();
312   
313    m_SeedDefault = 10;
314    setSeed(m_SeedDefault);
315  }
316 
317  /**
318   * Returns a string describing this clusterer.
319   * @return a description of the evaluator suitable for
320   * displaying in the explorer/experimenter gui
321   */
322  public String globalInfo() {
323    return 
324        "Cluster data using the X-means algorithm.\n\n" 
325      + "X-Means is K-Means extended by an Improve-Structure part In this "
326      + "part of the algorithm the centers are attempted to be split in "
327      + "its region. The decision between the children of each center and "
328      + "itself is done comparing the BIC-values of the two structures.\n\n"
329      + "For more information see:\n\n"
330      + getTechnicalInformation().toString();
331  }
332
333  /**
334   * Returns an instance of a TechnicalInformation object, containing
335   * detailed information about the technical background of this class,
336   * e.g., paper reference or book this class is based on.
337   *
338   * @return the technical information about this class
339   */
340  public TechnicalInformation getTechnicalInformation() {
341    TechnicalInformation        result;
342   
343    result = new TechnicalInformation(Type.INPROCEEDINGS);
344    result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore");
345    result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters");
346    result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning");
347    result.setValue(Field.YEAR, "2000");
348    result.setValue(Field.PAGES, "727-734");
349    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
350   
351    return result;
352  }
353
354  /**
355   * Returns default capabilities of the clusterer.
356   *
357   * @return      the capabilities of this clusterer
358   */
359  public Capabilities getCapabilities() {
360    Capabilities result = super.getCapabilities();
361    result.disableAll();
362    result.enable(Capability.NO_CLASS);
363
364    // attributes
365    result.enable(Capability.NUMERIC_ATTRIBUTES);
366    result.enable(Capability.DATE_ATTRIBUTES);
367    result.enable(Capability.MISSING_VALUES);
368
369    return result;
370  }
371 
372  /**
373   * Generates the X-Means clusterer.
374   *
375   * @param data set of instances serving as training data
376   * @throws Exception if the clusterer has not been
377   * generated successfully
378   */
379  public void buildClusterer(Instances data) throws Exception {
380
381    // can clusterer handle the data?
382    getCapabilities().testWithFail(data);
383   
384    if (m_MinNumClusters > m_MaxNumClusters) {
385      throw new Exception("XMeans: min number of clusters "
386          + "can't be greater than max number of clusters!");
387    }
388
389    m_NumSplits = 0;
390    m_NumSplitsDone = 0;
391    m_NumSplitsStillDone = 0;
392
393    // replace missing values
394    m_ReplaceMissingFilter = new ReplaceMissingValues();
395    m_ReplaceMissingFilter.setInputFormat(data);
396    m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter);
397   
398    // initialize random function
399    Random random0 = new Random(m_Seed);
400
401    // num of clusters to start with
402    m_NumClusters =  m_MinNumClusters;
403
404    // set distance function to default
405    if (m_DistanceF == null) {
406      m_DistanceF = new EuclideanDistance();
407    }
408
409    m_DistanceF.setInstances(m_Instances);
410    checkInstances();
411
412    if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile())
413      initDebugVectorsInput();
414
415    // make list of indexes for m_Instances
416    int[] allInstList = new int[m_Instances.numInstances()]; 
417    for (int i = 0; i < m_Instances.numInstances(); i++) {
418      allInstList[i] = i;
419    }
420   
421    // set model used (just for convenience)
422    m_Model = new Instances(m_Instances, 0);
423
424    // produce the starting centers
425    if (m_CenterInput != null) {
426      // read centers from file
427      m_ClusterCenters = new Instances(m_CenterInput);
428      m_NumClusters = m_ClusterCenters.numInstances();
429    }
430    else
431      // makes the first centers randomly
432      m_ClusterCenters = makeCentersRandomly(random0,
433                                             m_Instances, m_NumClusters);
434    PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");
435    for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {
436      PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));
437    }
438
439    PrCentersFD(D_PRINTCENTERS);
440
441    boolean finished = false;
442    Instances children; 
443
444    // builds up a KDTree
445    if (m_UseKDTree)
446      m_KDTree.setInstances(m_Instances);
447 
448    // loop counter of main loop
449    m_IterationCount = 0;
450
451    /**
452     * "finished" does get true as soon as:
453     * 1. number of clusters gets >= m_MaxClusters,
454     * 2. in the last round, none of the centers have been split
455     *
456     * if number of clusters is already >= m_MaxClusters
457     * part 1 (= Improve-Params) is done at least once.
458     */
459    while (!finished &&
460           !stopIteration(m_IterationCount, m_MaxIterations)) {
461     
462      /* ====================================================================
463       * 1. Improve-Params                 
464       *    conventional K-means
465       */
466
467
468      PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:");
469      PrCentersFD(D_FOLLOWSPLIT);
470
471      PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount + 
472          ". time");
473      m_IterationCount++;
474
475      // prepare to converge
476      boolean converged = false;
477
478      // initialize assignments to -1
479      m_ClusterAssignments = initAssignments(m_Instances.numInstances());
480      // stores a list of indexes of instances belonging to each center
481      int[][] instOfCent = new int[m_ClusterCenters.numInstances()][];
482
483      // KMeans loop counter
484      int kMeansIteration = 0;
485
486      // converge in conventional K-means ----------------------------------
487      PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:");
488      while (!converged && 
489             !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) {
490       
491        kMeansIteration++;
492        converged = true;
493       
494        // assign instances to centers -------------------------------------
495        converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
496                                    m_ClusterCenters, 
497                                    instOfCent,
498                                    allInstList, 
499                                    m_ClusterAssignments,
500                                    kMeansIteration);
501       
502        PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:");
503        PrCentersFD(D_FOLLOWSPLIT);
504        // compute new centers = centers of mass of points
505        converged = recomputeCenters(m_ClusterCenters, // clusters
506                                     instOfCent,       // their instances
507                                     m_Model);         // model information
508      PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:");
509      PrCentersFD(D_FOLLOWSPLIT);
510      }
511      PFD(D_FOLLOWSPLIT, "");
512      PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means");
513
514      /** =====================================================================
515       * 2. Improve-Structur
516       */
517
518      // BIC before split distortioning the centres
519      m_Mle = distortion(instOfCent, m_ClusterCenters);
520      m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle);
521      PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic);
522
523      int currNumCent = m_ClusterCenters.numInstances();
524      Instances splitCenters = new Instances(m_ClusterCenters, 
525                                             currNumCent * 2);
526     
527      // store BIC values of parent and children
528      double[] pbic = new double [currNumCent];
529      double[] cbic = new double [currNumCent];
530           
531      // split each center
532      for (int i = 0; i < currNumCent
533           // this could help to optimize the algorithm
534           //        && currNumCent + numSplits <= m_MaxNumClusters
535           ; 
536           i++) {
537       
538        PFD(D_FOLLOWSPLIT, "\nsplit center " + i +
539                      " " + m_ClusterCenters.instance(i));
540        Instance currCenter = m_ClusterCenters.instance(i);
541        int[] currInstList = instOfCent[i];
542        int currNumInst = instOfCent[i].length;
543       
544        // not enough instances; than continue with next
545        if (currNumInst <= 2) {
546          pbic[i] = Double.MAX_VALUE;
547          cbic[i] = 0.0;
548          // add center itself as dummy
549          splitCenters.add(currCenter);
550          splitCenters.add(currCenter);
551          continue;
552        }
553       
554        // split centers  ----------------------------------------------
555        double variance = m_Mle[i] / (double)currNumInst;
556        children = splitCenter(random0, currCenter, variance, m_Model);
557       
558        // initialize assignments to -1
559        int[] oneCentAssignments = initAssignments(currNumInst);
560        int[][] instOfChCent = new int [2][]; // todo maybe split didn't work
561       
562        // converge the children  --------------------------------------
563        converged = false;
564        int kMeansForChildrenIteration = 0;
565        PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i);
566        while (!converged && 
567          !stopKMeansIteration(kMeansForChildrenIteration, 
568                               m_MaxKMeansForChildren)) {
569          kMeansForChildrenIteration++;
570         
571          converged =
572            assignToCenters(children, instOfChCent,
573                            currInstList, oneCentAssignments);
574
575          if (!converged) {       
576            recomputeCentersFast(children, instOfChCent, m_Model);
577          }
578        } 
579
580        // store new centers for later decision if they are taken
581        splitCenters.add(children.instance(0));
582        splitCenters.add(children.instance(1));
583
584        PFD(D_FOLLOWSPLIT, "\nconverged cildren ");
585        PFD(D_FOLLOWSPLIT, " " + children.instance(0));
586        PFD(D_FOLLOWSPLIT, " " + children.instance(1));
587
588        // compare parent and children model by their BIC-value
589        pbic[i] = calculateBIC(currInstList, currCenter,  m_Mle[i], m_Model);
590        double[] chMLE = distortion(instOfChCent, children);
591        cbic[i] = calculateBIC(instOfChCent, children, chMLE);
592
593      } // end of loop over clusters
594
595      // decide which one to split and make new list of cluster centers
596      Instances newClusterCenters = null;
597      newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor,
598                                                 splitCenters);
599      /**
600       * Compare with before Improve-Structure
601       */
602      int newNumClusters = newClusterCenters.numInstances();
603      if (newNumClusters != m_NumClusters) {
604       
605        PFD(D_FOLLOWSPLIT, "Compare with non-split");
606
607        // initialize assignments to -1
608        int[] newClusterAssignments = 
609          initAssignments(m_Instances.numInstances());
610       
611        // stores a list of indexes of instances belonging to each center
612        int[][] newInstOfCent = new int[newClusterCenters.numInstances()][];
613       
614        // assign instances to centers -------------------------------------
615        converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
616                                    newClusterCenters, 
617                                    newInstOfCent,
618                                    allInstList, 
619                                    newClusterAssignments,
620                                    m_IterationCount);
621       
622        double[] newMle = distortion(newInstOfCent, newClusterCenters);
623        double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle);
624        PFD(D_FOLLOWSPLIT, "newBic " + newBic);
625        if (newBic > m_Bic) {
626          PFD(D_FOLLOWSPLIT, "*** decide for new clusters");
627          m_Bic = newBic;
628          m_ClusterCenters = newClusterCenters;
629          m_ClusterAssignments = newClusterAssignments;
630        } else {
631          PFD(D_FOLLOWSPLIT, "*** keep old clusters");
632        }
633      }
634
635      newNumClusters = m_ClusterCenters.numInstances();
636      // decide if finished: max num cluster reached
637      // or last centers where not split at all
638      if ((newNumClusters >= m_MaxNumClusters) 
639          || (newNumClusters == m_NumClusters)) {
640        finished = true;
641      }
642      m_NumClusters = newNumClusters;
643    }
644  }
645
646  /**
647   * Checks for nominal attributes in the dataset.
648   * Class attribute is ignored.
649   * @param data the data to check
650   * @return false if no nominal attributes are present
651   */
652  public boolean checkForNominalAttributes(Instances data) {
653
654    int i = 0;
655    while (i < data.numAttributes()) {
656      if ((i != data.classIndex()) && data.attribute(i++).isNominal()) {
657        return true;
658      }
659    }
660    return false;
661  }
662
663  /**
664   * Set array of int, used to store assignments, to -1.
665   * @param ass integer array used for storing assignments
666   * @return integer array used for storing assignments
667   */
668  protected int[] initAssignments(int[] ass) {
669    for (int i = 0; i < ass.length; i++)
670      ass[i] = -1;
671    return ass;
672  }   
673 
674  /**
675   * Creates and initializes integer array, used to store assignments.
676   * @param numInstances length of array used for assignments
677   * @return integer array used for storing assignments
678   */
679  protected int[] initAssignments(int numInstances) {
680    int[] ass = new int[numInstances];
681    for (int i = 0; i < numInstances; i++)
682      ass[i] = -1;
683    return ass;
684  }   
685 
686  /**
687   * Creates and initializes boolean array.
688   * @param len length of new array
689   * @return the new array
690   */
691  boolean[] initBoolArray(int len) {
692    boolean[] boolArray = new boolean [len];
693    for (int i = 0; i < len; i++) {
694      boolArray[i] = false;
695    }
696    return boolArray;
697  }
698
699  /**
700   * Returns new center list.
701   *
702   * The following steps 1. and 2. both take care that the number of centers
703   * does not exceed maxCenters.
704   *
705   * 1. Compare BIC values of parent and children and takes the one as
706   * new centers which do win (= BIC-value is smaller).
707   *
708   * 2. If in 1. none of the children are chosen
709   *    && and cutoff factor is > 0
710   * cutoff factor is taken as the percentage of "best" centers that are
711   * still taken.
712   * @param pbic array of parents BIC-values
713   * @param cbic array of childrens BIC-values
714   * @param cutoffFactor cutoff factor
715   * @param splitCenters all children
716   * @return the new centers
717   */
718  protected Instances newCentersAfterSplit(double[] pbic, 
719                                         double[] cbic,
720                                         double cutoffFactor,
721                                         Instances splitCenters) {
722
723    // store if split won
724    boolean splitPerCutoff = false;
725    boolean takeSomeAway = false;
726    boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances());
727    int numToSplit = 0;
728    Instances newCenters = null;
729   
730    // how many would be split, because the children have a better bic value
731    for (int i = 0; i < cbic.length; i++) {
732      if (cbic[i] > pbic[i]) {
733        // decide for splitting ----------------------------------------
734        splitWon[i] = true; numToSplit++;
735        PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children");
736      }
737      else {
738        // decide for parents and finished stays true  -----------------
739        PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent");
740      }
741    }
742
743    // no splits yet so split per cutoff factor
744    if ((numToSplit == 0) && (cutoffFactor > 0)) {
745      splitPerCutoff = true;
746     
747      // how many to split per cutoff factor
748      numToSplit = (int) 
749        ((double) m_ClusterCenters.numInstances() * m_CutOffFactor); 
750    }
751
752    // prepare indexes of values in ascending order 
753    double[] diff = new double [m_NumClusters];
754    for (int j = 0; j < diff.length; j++) {
755      diff[j] = pbic[j] - cbic[j];
756    }   
757    int[] sortOrder = Utils.sort(diff);
758   
759    // check if maxNumClusters would be exceeded
760    int possibleToSplit = m_MaxNumClusters - m_NumClusters; 
761
762    if (possibleToSplit > numToSplit) {
763      // still enough possible, do the whole amount
764      possibleToSplit = numToSplit;
765    }
766    else
767      takeSomeAway = true;
768
769    // prepare for splitting the one that are supposed to be split
770    if (splitPerCutoff) {
771      for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0);
772           j++) {
773        splitWon[sortOrder[j]] = true;
774      }
775      m_NumSplitsStillDone += possibleToSplit;
776    } 
777    else {
778      // take some splits away if max number of clusters would be exceeded
779      if (takeSomeAway) {
780        int count = 0;
781        int j = 0;
782        for (;j < splitWon.length && count < possibleToSplit; j++){
783          if (splitWon[sortOrder[j]] == true) count++;
784        }
785       
786        while (j < splitWon.length) {
787          splitWon[sortOrder[j]] = false;
788          j++;
789        }
790      }
791    }
792   
793    // finally split
794    if (possibleToSplit > 0) 
795      newCenters = newCentersAfterSplit(splitWon, splitCenters);
796    else
797      newCenters = m_ClusterCenters;
798    return newCenters;
799  }
800
801  /**
802   * Returns new centers. Depending on splitWon: if true takes children, if
803   * false takes parent = current center.
804   *
805   * @param splitWon
806   *          array of boolean to indicate to take split or not
807   * @param splitCenters
808   *          list of splitted centers
809   * @return the new centers
810   */
811  protected Instances newCentersAfterSplit(boolean[] splitWon,
812      Instances splitCenters) {
813    Instances newCenters = new Instances(splitCenters, 0);
814
815    int sIndex = 0;
816    for (int i = 0; i < splitWon.length; i++) {
817      if (splitWon[i]) {
818        m_NumSplitsDone++;
819        newCenters.add(splitCenters.instance(sIndex++));
820        newCenters.add(splitCenters.instance(sIndex++));
821      } else {
822        sIndex++;
823        sIndex++;
824        newCenters.add(m_ClusterCenters.instance(i));
825      }
826    }
827    return newCenters;
828  }
829
830  /**
831   * Controls that counter does not exceed max iteration value. Special function
832   * for kmeans iterations.
833   *
834   * @param iterationCount
835   *          current value of counter
836   * @param max
837   *          maximum value for counter
838   * @return true if iteration should be stopped
839   */ 
840  protected boolean stopKMeansIteration(int iterationCount, int max) {
841    boolean stopIterate = false;
842    if (max >= 0) 
843      stopIterate = (iterationCount >= max);
844    if (stopIterate) 
845      m_KMeansStopped++;
846    return stopIterate;
847  }
848
849  /**
850   * Checks if iterationCount has to be checked and if yes
851   * (this means max is > 0) compares it with max.
852   *
853   * @param iterationCount the current iteration count
854   * @param max the maximum number of iterations
855   * @return true if maximum has been reached
856   */ 
857  protected boolean stopIteration(int iterationCount, int max) {
858    boolean stopIterate = false;
859    if (max >= 0) 
860      stopIterate = (iterationCount >= max);
861    return stopIterate;
862  }
863
864  /**
865   * Recompute the new centers. New cluster center is center of mass of its
866   * instances. Returns true if cluster stays the same.
867   * @param centers the input and output centers
868   * @param instOfCent the instances to the centers
869   * @param model data model information
870   * @return true if converged.
871   */
872   protected boolean recomputeCenters(Instances centers,         
873                                   int[][] instOfCent, 
874                                   Instances model) {
875    boolean converged = true;
876   
877    for (int i = 0; i < centers.numInstances(); i++) {
878      double val;
879      for (int j = 0; j < model.numAttributes(); j++) {
880        val = meanOrMode(m_Instances, instOfCent[i], j);
881
882        for (int k = 0; k < instOfCent[i].length; k++)
883
884        if (converged && m_ClusterCenters.instance(i).value(j) != val) 
885          converged = false;
886        if (!converged)
887          m_ClusterCenters.instance(i).setValue(j, val);
888      }
889    }
890    return converged;
891  }
892
893  /**
894   * Recompute the new centers - 2nd version
895   * Same as recomputeCenters, but does not check if center stays the same.
896   *
897   * @param centers the input center and output centers
898   * @param instOfCentIndexes the indexes of the instances to the centers
899   * @param model data model information
900   */
901  protected void recomputeCentersFast(Instances centers,         
902                                    int[][] instOfCentIndexes, 
903                                    Instances model   
904                                    ) {
905    for (int i = 0; i < centers.numInstances(); i++) {
906      double val;
907      for (int j = 0; j < model.numAttributes(); j++) {
908        val = meanOrMode(m_Instances, instOfCentIndexes[i], j);
909        centers.instance(i).setValue(j, val);
910      }
911    }
912  }
913
914  /**
915   * Computes Mean Or Mode of one attribute on a subset of m_Instances.
916   * The subset is defined by an index list.
917   * @param instances all instances
918   * @param instList the indexes of the instances the mean is computed from
919   * @param attIndex the index of the attribute
920   * @return mean value
921   */
922  protected double meanOrMode(Instances instances, 
923                            int[] instList, 
924                            int attIndex) {
925    double result, found;
926    int[] counts;
927    int numInst = instList.length;
928
929    if (instances.attribute(attIndex).isNumeric()) {
930      result = found = 0;
931      for (int j = 0; j < numInst; j++) {
932        Instance currInst = instances.instance(instList[j]);
933        if (!currInst.isMissing(attIndex)) {
934          found += currInst.weight();
935          result += currInst.weight() * 
936            currInst.value(attIndex);
937        }
938      }
939      if (Utils.eq(found, 0)) {
940        return 0;
941      } else {
942        return result / found;
943      }
944    } else if (instances.attribute(attIndex).isNominal()) {
945      counts = new int[instances.attribute(attIndex).numValues()];
946      for (int j = 0; j < numInst; j++) {
947        Instance currInst = instances.instance(instList[j]);
948        if (!currInst.isMissing(attIndex)) {
949          counts[(int) currInst.value(attIndex)] += currInst.weight();
950        }
951      }
952      return (double)Utils.maxIndex(counts);
953    } else {
954      return 0;
955    }
956  }
957
958 
959  /**
960   * Assigns instances to centers.
961   *
962   * @param tree KDTree on all instances
963   * @param centers all the input centers
964   * @param instOfCent the instances to each center
965   * @param allInstList list of all instances
966   * @param assignments assignments of instances to centers
967   * @param iterationCount the number of iteration
968   * @return true if converged
969   * @throws Exception is something goes wrong
970   */
971  protected boolean assignToCenters(KDTree tree,
972                                  Instances centers, 
973                                  int[][] instOfCent, 
974                                  int[] allInstList,
975                                  int[] assignments,
976                                  int iterationCount) throws Exception {
977   
978    boolean converged = true;
979    if (tree != null) {
980      // using KDTree structure for assigning
981      converged = assignToCenters(tree,
982                                  centers, 
983                                  instOfCent,
984                                  assignments,
985                                  iterationCount);
986    } else {
987      converged = assignToCenters(centers, 
988                                  instOfCent,
989                                  allInstList, 
990                                  assignments);
991    }
992    return converged;
993  }
994
995  /**
996   * Assign instances to centers using KDtree.
997   * First part of conventionell K-Means, returns true if new assignment
998   * is the same as the last one.
999   *
1000   * @param kdtree KDTree on all instances
1001   * @param centers all the input centers
1002   * @param instOfCent the instances to each center
1003   * @param assignments assignments of instances to centers
1004   * @param iterationCount the number of iteration
1005   * @return true if converged
1006   * @throws Exception in case instances are not assigned to cluster
1007   */
1008  protected boolean assignToCenters(KDTree kdtree,
1009                                  Instances centers, 
1010                                  int[][] instOfCent, 
1011                                  int[] assignments,
1012                                  int iterationCount) throws Exception {
1013
1014    int numCent = centers.numInstances();
1015    int numInst = m_Instances.numInstances(); 
1016    int[] oldAssignments = new int[numInst];
1017   
1018    // WARNING:  assignments is "input/output-parameter"
1019    // should not be null
1020    if (assignments == null) {
1021      assignments = new int[numInst];
1022      for (int i = 0; i < numInst; i++) {
1023        assignments[0] = -1;
1024      }
1025    }
1026   
1027    // WARNING:  instOfCent is "input/output-parameter"
1028    // should not be null
1029    if (instOfCent == null) {
1030      instOfCent = new int [numCent][];
1031    }
1032   
1033    // save old assignments
1034    for (int i = 0; i < assignments.length; i++) {
1035      oldAssignments[i] = assignments[i];
1036    }
1037   
1038    // use tree to get new assignments
1039    kdtree.centerInstances(centers, assignments,
1040                           Math.pow(.8, iterationCount));       
1041    boolean converged = true;
1042 
1043    // compare with previous assignment
1044    for (int i = 0; converged && (i < assignments.length); i++) {
1045      converged = (oldAssignments[i] == assignments[i]);
1046      if (assignments[i] == -1) 
1047        throw new Exception("Instance " + i + 
1048                            " has not been assigned to cluster.");
1049    }
1050
1051    if (!converged) {
1052      int[] numInstOfCent = new int[numCent];
1053      for (int i = 0; i < numCent; i++)
1054        numInstOfCent[i] = 0;
1055
1056      // count num of assignments per center
1057      for (int i = 0; i < numInst; i++)
1058        numInstOfCent[assignments[i]]++;
1059     
1060      // prepare instancelists per center
1061      for (int i = 0; i < numCent; i++){
1062        instOfCent[i] = new int[numInstOfCent[i]];
1063      }
1064      // write instance lists per center
1065      for (int i = 0; i < numCent; i++) {
1066        int index = -1;   
1067        for (int j = 0; j < numInstOfCent[i]; j++) {
1068          index = nextAssignedOne(i, index, assignments);
1069          instOfCent[i][j] = index;
1070        }
1071      }
1072    }
1073 
1074    return converged;
1075  }
1076
1077  /**
1078   * Assign instances to centers.
1079   * Part of conventionell K-Means, returns true if new assignment
1080   * is the same as the last one.
1081   *
1082   * @param centers all the input centers
1083   * @param instOfCent the instances to each center
1084   * @param allInstList list of all indexes
1085   * @param assignments assignments of instances to centers
1086   * @return true if converged
1087   * @throws Exception if something goes wrong
1088   */
1089  protected boolean assignToCenters(Instances centers, 
1090                                  int[][] instOfCent,
1091                                  int[] allInstList,
1092                                  int[] assignments) 
1093    throws Exception {
1094   
1095    // todo: undecided situations
1096    boolean converged = true; // true if new assignment is the same
1097                              // as the old one
1098
1099    int numInst = allInstList.length; 
1100    int numCent = centers.numInstances();
1101    int[] numInstOfCent = new int [numCent];
1102    for (int i = 0; i < numCent; i++) numInstOfCent[i] = 0;
1103
1104    // WARNING:  assignments is "input/output-parameter"
1105    // should not be null
1106    if (assignments == null) {
1107      assignments = new int[numInst];
1108      for (int i = 0; i < numInst; i++) {
1109        assignments[i] = -1;
1110      }
1111    }
1112
1113    // WARNING: instOfCent is "input/output-parameter"
1114    // should not be null
1115    if (instOfCent == null) {
1116      instOfCent = new int [numCent][];
1117    }
1118
1119    // set assignments
1120    for (int i = 0; i < numInst; i++) {
1121      Instance inst = m_Instances.instance(allInstList[i]);
1122      int newC = clusterProcessedInstance(inst, centers);
1123     
1124      if (converged && newC != assignments[i]) {
1125        converged = false;
1126      }
1127
1128      numInstOfCent[newC]++; 
1129      if (!converged)
1130        assignments[i] = newC;
1131    }
1132
1133    // the following is only done
1134    // if assignments are not the same, because too much effort
1135    if (!converged) {
1136      PFD(D_FOLLOWSPLIT, "assignToCenters -> it has NOT converged");
1137      for (int i = 0; i < numCent; i++) {
1138        instOfCent[i] = new int [numInstOfCent[i]];
1139      }
1140
1141      for (int i = 0; i < numCent; i++) {
1142        int index = -1;   
1143        for (int j = 0; j < numInstOfCent[i]; j++) {
1144          index = nextAssignedOne(i, index, assignments);
1145          instOfCent[i][j] = allInstList[index];
1146        }
1147      }
1148    }
1149    else
1150      PFD(D_FOLLOWSPLIT, "assignToCenters -> it has converged");
1151
1152    return converged;
1153  }
1154
1155  /**
1156   * Searches along the assignment array for the next entry of the center
1157   * in question.
1158   * @param cent index of the center
1159   * @param lastIndex index to start searching
1160   * @param assignments assignments
1161   * @return index of the instance the center cent is assigned to
1162   */
1163  protected int nextAssignedOne(int cent, int lastIndex, 
1164                              int[] assignments) {
1165    int len = assignments.length;
1166    int index = lastIndex + 1;
1167    while (index < len) {
1168      if (assignments[index] == cent) {
1169        return (index);
1170      }
1171      index++;
1172    }
1173    return (-1);
1174  }
1175
1176  /**
1177   * Split centers in their region. Generates random vector of
1178   * length = variance and
1179   * adds and substractsx to cluster vector to get two new clusters.
1180   *
1181   * @param random random function
1182   * @param center the center that is split here
1183   * @param variance variance of the cluster
1184   * @param model data model valid
1185   * @return a pair of new centers
1186   * @throws Exception something in AlgVector goes wrong
1187   */
1188  protected Instances splitCenter(Random random,
1189                                Instance center,
1190                                double variance,
1191                                Instances model) throws Exception {
1192    m_NumSplits++;
1193    AlgVector r = null;
1194    Instances children = new Instances(model, 2);
1195
1196    if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) {
1197      Instance nextVector = getNextDebugVectorsInstance(model);
1198      PFD(D_RANDOMVECTOR, "Random Vector from File " + nextVector);
1199      r = new AlgVector(nextVector);
1200    }
1201    else {
1202      // random vector of length = variance
1203      r = new AlgVector(model, random);
1204    }
1205    r.changeLength(Math.pow(variance, 0.5));
1206    PFD(D_RANDOMVECTOR, "random vector *variance "+ r);
1207   
1208    // add random vector to center
1209    AlgVector c = new AlgVector(center);
1210    AlgVector c2 = (AlgVector) c.clone();
1211    c = c.add(r);
1212    Instance newCenter = c.getAsInstance(model, random);
1213    children.add(newCenter);
1214    PFD(D_FOLLOWSPLIT, "first child "+ newCenter);
1215   
1216    // substract random vector to center
1217    c2 = c2.substract(r);
1218    newCenter = c2.getAsInstance(model, random);
1219    children.add(newCenter);
1220    PFD(D_FOLLOWSPLIT, "second child "+ newCenter);
1221
1222    return children;
1223  }
1224
1225  /**
1226   * Split centers in their region.
1227   * (*Alternative version of splitCenter()*)
1228   *
1229   * @param random the random number generator
1230   * @param instances of the region
1231   * @param model the model for the centers
1232   * (should be the same as that of instances)
1233   * @return a pair of new centers
1234   */
1235  protected Instances splitCenters(Random random,
1236                                 Instances instances,
1237                                 Instances model) {
1238    Instances children = new Instances(model, 2);
1239    int instIndex = Math.abs(random.nextInt()) % instances.numInstances();
1240    children.add(instances.instance(instIndex));
1241    int instIndex2 = instIndex;
1242    int count = 0;
1243    while ((instIndex2 == instIndex) && count < 10) {
1244      count++;
1245      instIndex2 = Math.abs(random.nextInt()) % instances.numInstances();
1246    }
1247    children.add(instances.instance(instIndex2));
1248   
1249    return children;
1250  }
1251
1252  /**
1253   * Generates new centers randomly. Used for starting centers.
1254   *
1255   * @param random0 random number generator
1256   * @param model data model of the instances
1257   * @param numClusters number of clusters
1258   * @return new centers
1259   */
1260  protected Instances makeCentersRandomly(Random random0,
1261                                        Instances model,
1262                                        int numClusters) {
1263    Instances clusterCenters = new Instances(model, numClusters);
1264    m_NumClusters = numClusters;
1265
1266    // makes the new centers randomly
1267    for (int i = 0; i < numClusters; i++) {
1268      int instIndex = Math.abs(random0.nextInt()) % m_Instances.numInstances();
1269      clusterCenters.add(m_Instances.instance(instIndex));
1270    }
1271    return clusterCenters;
1272  }
1273
1274  /**
1275   * Returns the BIC-value for the given center and instances.
1276   * @param instList The indices of the instances that belong to the center
1277   * @param center the center.
1278   * @param mle maximum likelihood
1279   * @param model the data model
1280   * @return the BIC value
1281   */   
1282  protected double calculateBIC(int[] instList, Instance center,
1283                              double mle, Instances model) {
1284    int[][] w1 = new int[1][instList.length];
1285    for (int i = 0; i < instList.length; i++) {
1286      w1[0][i] = instList[i];
1287    }
1288    double[] m = {mle};
1289    Instances w2 = new Instances(model, 1);
1290    w2.add(center);
1291    return calculateBIC(w1, w2, m);
1292    }
1293 
1294  /**
1295   * Calculates the BIC for the given set of centers and instances.
1296   * @param instOfCent The instances that belong to their respective centers
1297   * @param centers the centers
1298   * @param mle maximum likelihood
1299   * @return The BIC for the input.
1300   */
1301  protected double calculateBIC(int[][] instOfCent, Instances centers,
1302                              double[] mle) {
1303    double loglike = 0.0;
1304    int numInstTotal = 0;
1305    int numCenters = centers.numInstances();
1306    int numDimensions = centers.numAttributes();
1307    int numParameters = (numCenters - 1) + //probabilities
1308      numCenters * numDimensions + //means
1309      numCenters; // variance params
1310    for (int i = 0; i < centers.numInstances(); i++) {
1311      loglike += logLikelihoodEstimate(instOfCent[i].length, centers.instance(i),
1312                                       mle[i], centers.numInstances() * 2);
1313      numInstTotal += instOfCent[i].length;
1314    }
1315    /* diff
1316       thats how we did it
1317    loglike -= ((centers.numAttributes() + 1.0) * centers.numInstances() * 1)
1318      * Math.log(count);
1319      */
1320    loglike -= numInstTotal * Math.log(numInstTotal);
1321    //System.out.println ("numInstTotal " + numInstTotal +
1322    //                    "calculateBIC res " + loglike);
1323    loglike -= (numParameters / 2.0) * Math.log(numInstTotal);
1324    //System.out.println ("numParam " +
1325    //                     + numParameters +
1326    //                  " calculateBIC res " + loglike);
1327    return loglike;
1328  }
1329 
1330  /**
1331   * Calculates the log-likelihood of the data for the given model, taken
1332   * at the maximum likelihood point.
1333   *
1334   * @param numInst number of instances that belong to the center
1335   * @param center the center
1336   * @param distortion distortion
1337   * @param numCent number of centers
1338   * @return the likelihood estimate
1339   */
1340  protected double logLikelihoodEstimate(int numInst, 
1341                                       Instance center, 
1342                                       double distortion, 
1343                                       int numCent) {
1344    // R(n) num of instances of the center -> numInst
1345    // K num of centers -> not used
1346    //
1347    //todo take the diff comments away
1348    double loglike = 0;
1349    /* if is new */
1350    if (numInst > 1) {
1351      /* diff variance is new */
1352      //
1353      // distortion = Sum over instances x of the center(x-center)
1354      // different to paper; sum should be squared
1355      //
1356      // (Sum of distances to center) / R(n) - 1.0
1357      // different to paper; should be R(n)-K
1358      double variance =  distortion / (numInst - 1.0); 
1359 
1360      //
1361      //  -R(n)/2 * log(pi*2)
1362      //
1363      double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0);
1364      /* diff
1365         thats how we had it
1366         double p2 = -((ni * center.numAttributes()) / 2) * distortion;
1367      */
1368      //
1369      // -(R(n)*M)/2 * log(variance)
1370      //
1371      double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance);
1372     
1373      /* diff
1374         thats how we had it, the difference is a bug in x-means
1375         double p3 = - (numInst - numCent) / 2;
1376      */
1377      //
1378      // -(R(n)-1)/2
1379      //
1380      double p3 = - (numInst - 1.0) / 2.0;
1381     
1382      //
1383      // R(n)*log(R(n))
1384      //
1385      double p4 = numInst * Math.log(numInst);
1386     
1387      /* diff x-means doesn't have this part
1388         double p5 = - numInst * Math.log(numInstTotal);
1389      */
1390     
1391      /*
1392        loglike = -(ni / 2) * Math.log(Math.PI * 2)
1393        - (ni * center.numAttributes()) / 2.0) * logdistortion
1394        - (ni - k) / 2.0
1395        + ni * Math.log(ni)
1396        - ni * Math.log(r);
1397      */
1398      loglike = p1 + p2 + p3 + p4; // diff + p5;
1399      //the log(r) is something that can be reused.
1400      //as is the log(2 PI), these could provide extra speed up later on.
1401      //since distortion is so expensive to compute, I only do that once.
1402    }
1403    return loglike;
1404  }
1405 
1406  /**
1407   * Calculates the maximum likelihood estimate for the variance.
1408   * @param instOfCent indices of instances to each center
1409   * @param centers the centers
1410   * @return the list of distortions distortion.
1411   */
1412  protected double[] distortion(int[][] instOfCent, Instances centers) {
1413    double[] distortion = new double[centers.numInstances()];
1414    for (int i = 0; i < centers.numInstances(); i++) {
1415      distortion[i] = 0.0;
1416      for (int j = 0; j < instOfCent[i].length; j++) {
1417        distortion[i] += m_DistanceF.distance(m_Instances
1418            .instance(instOfCent[i][j]), centers.instance(i));
1419      }
1420    }
1421    /*
1422     * diff not done in x-means res *= 1.0 / (count - centers.numInstances());
1423     */
1424    return distortion;
1425  }
1426 
1427  /**
1428   * Clusters an instance.
1429   *
1430   * @param instance
1431   *          the instance to assign a cluster to.
1432   * @param centers
1433   *          the centers to cluster the instance to.
1434   * @return a cluster index.
1435   */
1436  protected int clusterProcessedInstance(Instance instance, Instances centers) {
1437   
1438    double minDist = Integer.MAX_VALUE;
1439    int bestCluster = 0;
1440    for (int i = 0; i < centers.numInstances(); i++) {
1441      double dist = m_DistanceF.distance(instance, centers.instance(i));
1442
1443      if (dist < minDist) {
1444        minDist = dist;
1445        bestCluster = i;
1446      }
1447    }
1448    ;
1449    return bestCluster;
1450  }
1451 
1452  /**
1453   * Clusters an instance that has been through the filters.
1454   *
1455   * @param instance
1456   *          the instance to assign a cluster to
1457   * @return a cluster number
1458   */
1459  protected int clusterProcessedInstance(Instance instance) {
1460    double minDist = Integer.MAX_VALUE;
1461    int bestCluster = 0;
1462    for (int i = 0; i < m_NumClusters; i++) {
1463      double dist = m_DistanceF
1464          .distance(instance, m_ClusterCenters.instance(i));
1465      if (dist < minDist) {
1466        minDist = dist;
1467        bestCluster = i;
1468      }
1469    }
1470    return bestCluster;
1471  }
1472
1473  /**
1474   * Classifies a given instance.
1475   *
1476   * @param instance the instance to be assigned to a cluster
1477   * @return the number of the assigned cluster as an integer
1478   * if the class is enumerated, otherwise the predicted value
1479   * @throws Exception if instance could not be classified
1480   * successfully
1481   */
1482  public int clusterInstance(Instance instance) throws Exception {
1483    m_ReplaceMissingFilter.input(instance);
1484    Instance inst = m_ReplaceMissingFilter.output();
1485
1486    return clusterProcessedInstance(inst);
1487  }
1488
1489
1490  /**
1491   * Returns the number of clusters.
1492   *
1493   * @return the number of clusters generated for a training dataset.
1494   */
1495  public int numberOfClusters() {
1496    return m_NumClusters;
1497  }
1498
1499
1500  /**
1501   * Returns an enumeration describing the available options.
1502   * @return an enumeration of all the available options
1503   **/
1504  public Enumeration listOptions() {
1505    Vector result = new Vector();
1506   
1507    result.addElement(new Option(
1508        "\tmaximum number of overall iterations\n"
1509        + "\t(default 1).", 
1510        "I", 1, "-I <num>"));
1511   
1512    result.addElement(new Option(
1513        "\tmaximum number of iterations in the kMeans loop in\n"
1514        + "\tthe Improve-Parameter part \n"
1515        + "\t(default 1000).", 
1516        "M", 1, "-M <num>"));
1517   
1518    result.addElement(new Option(
1519        "\tmaximum number of iterations in the kMeans loop\n"
1520        + "\tfor the splitted centroids in the Improve-Structure part \n"
1521        + "\t(default 1000).",
1522        "J", 1, "-J <num>"));
1523   
1524    result.addElement(new Option(
1525        "\tminimum number of clusters\n"
1526        + "\t(default 2).", 
1527        "L", 1, "-L <num>"));
1528   
1529    result.addElement(new Option(
1530        "\tmaximum number of clusters\n"
1531        + "\t(default 4).",
1532        "H", 1, "-H <num>"));
1533   
1534    result.addElement(new Option(
1535        "\tdistance value for binary attributes\n"
1536        + "\t(default 1.0).",
1537        "B", 1, "-B <value>"));
1538   
1539    result.addElement(new Option(
1540        "\tUses the KDTree internally\n"
1541        + "\t(default no).",
1542        "use-kdtree", 0, "-use-kdtree"));
1543   
1544    result.addElement(new Option(
1545        "\tFull class name of KDTree class to use, followed\n"
1546        + "\tby scheme options.\n"
1547        + "\teg: \"weka.core.neighboursearch.kdtrees.KDTree -P\"\n"
1548        + "\t(default no KDTree class used).",
1549        "K", 1, "-K <KDTree class specification>"));
1550   
1551    result.addElement(new Option(
1552        "\tcutoff factor, takes the given percentage of the splitted \n"
1553        + "\tcentroids if none of the children win\n"
1554        + "\t(default 0.0).",
1555        "C", 1, "-C <value>"));
1556   
1557    result.addElement(new Option(
1558        "\tFull class name of Distance function class to use, followed\n"
1559        + "\tby scheme options.\n" +
1560        "\t(default weka.core.EuclideanDistance).",
1561        "D", 1, "-D <distance function class specification>"));
1562   
1563    result.addElement(new Option(
1564        "\tfile to read starting centers from (ARFF format).",
1565        "N", 1, "-N <file name>"));
1566   
1567    result.addElement(new Option(
1568        "\tfile to write centers to (ARFF format).",
1569        "O", 1, "-O <file name>"));
1570   
1571    result.addElement(new Option(
1572        "\tThe debug level.\n"
1573        + "\t(default 0)",
1574        "U", 1, "-U <int>"));
1575   
1576    result.addElement(new Option(
1577        "\tThe debug vectors file.",
1578        "Y", 1, "-Y <file name>"));
1579   
1580    Enumeration en = super.listOptions();
1581    while (en.hasMoreElements())
1582      result.addElement(en.nextElement());
1583   
1584    return result.elements();
1585  }
1586
1587  /**
1588   * Returns the tip text for this property.
1589   * @return tip text for this property
1590   */
1591  public String minNumClustersTipText() {
1592    return "set minimum number of clusters";
1593  }
1594
1595  /**
1596   * Sets the minimum number of clusters to generate.
1597   *
1598   * @param n the minimum number of clusters to generate
1599   */
1600  public void setMinNumClusters(int n) {
1601    m_MinNumClusters = n;
1602  }
1603
1604  /**
1605   * Gets the minimum number of clusters to generate.
1606   * @return the minimum number of clusters to generate
1607   */
1608  public int getMinNumClusters() {
1609    return m_MinNumClusters;
1610  }
1611
1612  /**
1613   * Returns the tip text for this property.
1614   * @return tip text for this property
1615   */
1616  public String maxNumClustersTipText() {
1617    return "set maximum number of clusters";
1618  }
1619
1620  /**
1621   * Sets the maximum number of clusters to generate.
1622   * @param n the maximum number of clusters to generate
1623   */
1624  public void setMaxNumClusters(int n) {
1625    if (n >= m_MinNumClusters) {
1626      m_MaxNumClusters = n;
1627    }
1628  }
1629 
1630  /**
1631   * Gets the maximum number of clusters to generate.
1632   * @return the maximum number of clusters to generate
1633   */
1634  public int getMaxNumClusters() {
1635    return m_MaxNumClusters;
1636  }
1637
1638  /**
1639   * Returns the tip text for this property.
1640   * @return tip text for this property
1641   */
1642  public String maxIterationsTipText() {
1643    return "the maximum number of iterations to perform";
1644  }
1645
1646  /**
1647   * Sets the maximum number of iterations to perform.
1648   * @param i the number of iterations
1649   * @throws Exception if i is less than 1
1650   */
1651  public void setMaxIterations(int i) throws Exception {
1652    if (i < 0) 
1653      throw new Exception("Only positive values for iteration number" +
1654                           " allowed (Option I)."); 
1655    m_MaxIterations = i;
1656  }
1657
1658  /**
1659   * Gets the maximum number of iterations.
1660   * @return the number of iterations
1661   */
1662  public int getMaxIterations() {
1663    return  m_MaxIterations;
1664  }
1665
1666  /**
1667   * Returns the tip text for this property.
1668   * @return tip text for this property
1669   */
1670  public String maxKMeansTipText() {
1671    return "the maximum number of iterations to perform in KMeans";
1672  }
1673
1674  /**
1675   * Set the maximum number of iterations to perform in KMeans.
1676   * @param i the number of iterations
1677   */
1678  public void setMaxKMeans(int i) {
1679    m_MaxKMeans = i;
1680    m_MaxKMeansForChildren = i;
1681  }
1682
1683  /**
1684   * Gets the maximum number of iterations in KMeans.
1685   * @return the number of iterations
1686   */
1687  public int getMaxKMeans() {
1688    return  m_MaxKMeans;
1689  }
1690
1691  /**
1692   * Returns the tip text for this property.
1693   * @return tip text for this property
1694   */
1695  public String maxKMeansForChildrenTipText() {
1696    return "the maximum number of iterations KMeans that is performed on the child centers";
1697  }
1698
1699  /**
1700   * Sets the maximum number of iterations KMeans that is performed
1701   * on the child centers.
1702   * @param i the number of iterations
1703   */
1704  public void setMaxKMeansForChildren(int i) {
1705    m_MaxKMeansForChildren = i;
1706  }
1707
1708  /**
1709   * Gets the maximum number of iterations in KMeans.
1710   * @return the number of iterations
1711   */
1712  public int getMaxKMeansForChildren() {
1713    return  m_MaxKMeansForChildren;
1714  }
1715
1716  /**
1717   * Returns the tip text for this property.
1718   * @return tip text for this property
1719   */
1720  public String cutOffFactorTipText() {
1721    return "the cut-off factor to use";
1722  }
1723
1724  /**
1725   * Sets a new cutoff factor.
1726   * @param i the new cutoff factor
1727   */
1728  public void setCutOffFactor(double i) {
1729    m_CutOffFactor = i;
1730  }
1731
1732  /**
1733   * Gets the cutoff factor.
1734   * @return the cutoff factor
1735   */
1736  public double getCutOffFactor() {
1737    return  m_CutOffFactor;
1738  }
1739
1740  /**
1741   * Returns the tip text for this property.
1742   * @return tip text for this property suitable for
1743   * displaying in the explorer/experimenter gui
1744   */
1745  public String binValueTipText() {
1746    return "Set the value that represents true in the new attributes.";
1747  }
1748 
1749  /**
1750   * Gets value that represents true in a new numeric attribute.
1751   * (False is always represented by 0.0.)
1752   * @return the value that represents true in a new numeric attribute
1753   */
1754  public double getBinValue() {
1755    return m_BinValue;
1756  }
1757
1758  /**
1759   * Sets the distance value between true and false of binary attributes.
1760   * and  "same" and "different" of nominal attributes   
1761   * @param value the distance
1762   */
1763  public void setBinValue(double value) {
1764    m_BinValue = value;
1765  }
1766
1767  /**
1768   * Returns the tip text for this property.
1769   *
1770   * @return            tip text for this property suitable for
1771   *                    displaying in the explorer/experimenter gui
1772   */
1773  public String distanceFTipText() {
1774    return "The distance function to use.";
1775  }
1776
1777  /**
1778   * gets the "binary" distance value.
1779   * @param distanceF the distance function with all options set
1780   */
1781  public void setDistanceF(DistanceFunction distanceF) {
1782    m_DistanceF = distanceF;
1783  }
1784
1785  /**
1786   * Gets the distance function.
1787   * @return the distance function
1788   */
1789  public DistanceFunction getDistanceF() {
1790    return m_DistanceF;
1791  }
1792
1793  /**
1794   * Gets the distance function specification string, which contains the
1795   * class name of the distance function class and any options to it.
1796   *
1797   * @return the distance function specification string
1798   */
1799  protected String getDistanceFSpec() {
1800   
1801    DistanceFunction d = getDistanceF();
1802    if (d instanceof OptionHandler) {
1803      return d.getClass().getName() + " "
1804        + Utils.joinOptions(((OptionHandler) d).getOptions());
1805    }
1806    return d.getClass().getName();
1807  }
1808
1809  /**
1810   * Returns the tip text for this property.
1811   *
1812   * @return            tip text for this property suitable for
1813   *                    displaying in the explorer/experimenter gui
1814   */
1815  public String debugVectorsFileTipText() {
1816    return "The file containing the debug vectors (only for debugging!).";
1817  }
1818 
1819  /**
1820   * Sets the file that has the random vectors stored.
1821   * Only used for debugging reasons.
1822   * @param value the file to read the random vectors from
1823   */
1824  public void setDebugVectorsFile(File value) {
1825    m_DebugVectorsFile = value;
1826  }
1827
1828  /**
1829   * Gets the file name for a file that has the random vectors stored.
1830   * Only used for debugging purposes.
1831   * @return the file to read the vectors from
1832   */
1833  public File getDebugVectorsFile() {
1834    return m_DebugVectorsFile;
1835  }
1836 
1837  /**
1838   * Initialises the debug vector input.
1839   * @throws Exception if there is error
1840   * opening the debug input file.
1841   */
1842  public void initDebugVectorsInput() throws Exception {
1843    m_DebugVectorsInput = 
1844      new BufferedReader(new FileReader(m_DebugVectorsFile));
1845    m_DebugVectors = new Instances(m_DebugVectorsInput);
1846    m_DebugVectorsIndex = 0;
1847  }
1848
1849  /**
1850   * Read an instance from debug vectors file.
1851   * @param model the data model for the instance.
1852   * @throws Exception if there are no debug vector
1853   * in m_DebugVectors.
1854   * @return the next debug vector.
1855   */
1856  public Instance getNextDebugVectorsInstance(Instances model) 
1857    throws Exception {
1858    if (m_DebugVectorsIndex >= m_DebugVectors.numInstances())
1859      throw new Exception("no more prefabricated Vectors");
1860    Instance nex = m_DebugVectors.instance(m_DebugVectorsIndex);
1861    nex.setDataset(model);
1862    m_DebugVectorsIndex++;
1863    return nex;
1864  }
1865
1866  /**
1867   * Returns the tip text for this property.
1868   *
1869   * @return            tip text for this property suitable for
1870   *                    displaying in the explorer/experimenter gui
1871   */
1872  public String inputCenterFileTipText() {
1873    return "The file to read the list of centers from.";
1874  }
1875
1876  /**
1877   * Sets the file to read the list of centers from.
1878   *
1879   * @param value the file to read centers from
1880   */
1881  public void setInputCenterFile(File value) {
1882    m_InputCenterFile = value;
1883  }
1884 
1885  /**
1886   * Gets the file to read the list of centers from.
1887   *
1888   * @return the file to read the centers from
1889   */
1890  public File getInputCenterFile() {
1891    return m_InputCenterFile;
1892  }
1893
1894  /**
1895   * Returns the tip text for this property.
1896   *
1897   * @return            tip text for this property suitable for
1898   *                    displaying in the explorer/experimenter gui
1899   */
1900  public String outputCenterFileTipText() {
1901    return "The file to write the list of centers to.";
1902  }
1903   
1904  /**
1905   * Sets file to write the list of centers to.
1906   *
1907   * @param value file to write centers to
1908   */
1909  public void setOutputCenterFile(File value) {
1910    m_OutputCenterFile = value;
1911  }
1912
1913  /**
1914   * Gets the file to write the list of centers to.
1915   *
1916   * @return filename of the file to write centers to
1917   */
1918  public File getOutputCenterFile() {
1919    return m_OutputCenterFile;
1920  }
1921
1922  /**
1923   * Returns the tip text for this property.
1924   *
1925   * @return            tip text for this property suitable for
1926   *                    displaying in the explorer/experimenter gui
1927   */
1928  public String KDTreeTipText() {
1929    return "The KDTree to use.";
1930  }
1931   
1932  /**
1933   * Sets the KDTree class.
1934   * @param k a KDTree object with all options set
1935   */
1936  public void setKDTree(KDTree k) {
1937    m_KDTree = k;
1938  }
1939
1940  /**
1941   * Gets the KDTree class.
1942   *
1943   * @return the configured KDTree
1944   */
1945  public KDTree getKDTree() {
1946    return m_KDTree;
1947  }
1948
1949  /**
1950   * Returns the tip text for this property.
1951   *
1952   * @return            tip text for this property suitable for
1953   *                    displaying in the explorer/experimenter gui
1954   */
1955  public String useKDTreeTipText() {
1956    return "Whether to use the KDTree.";
1957  }
1958   
1959  /**
1960   * Sets whether to use the KDTree or not.
1961   *
1962   * @param value       if true the KDTree is used
1963   */
1964  public void setUseKDTree(boolean value) {
1965    m_UseKDTree = value;
1966  }
1967
1968  /**
1969   * Gets whether the KDTree is used or not.
1970   *
1971   * @return            true if KDTrees are used
1972   */
1973  public boolean getUseKDTree() {
1974    return m_UseKDTree;
1975  }
1976
1977  /**
1978   * Gets the KDTree specification string, which contains the class name of
1979   * the KDTree class and any options to the KDTree.
1980   *
1981   * @return the KDTree string.
1982   */
1983  protected String getKDTreeSpec() {
1984   
1985    KDTree c = getKDTree();
1986    if (c instanceof OptionHandler) {
1987      return c.getClass().getName() + " "
1988        + Utils.joinOptions(((OptionHandler)c).getOptions());
1989    }
1990    return c.getClass().getName();
1991  }
1992
1993  /**
1994   * Returns the tip text for this property.
1995   *
1996   * @return            tip text for this property suitable for
1997   *                    displaying in the explorer/experimenter gui
1998   */
1999  public String debugLevelTipText() {
2000    return "The debug level to use.";
2001  }
2002
2003  /**
2004   * Sets the debug level.
2005   * debug level = 0, means no output
2006   * @param d debuglevel
2007   */
2008  public void setDebugLevel(int d) {
2009    m_DebugLevel = d;
2010  }
2011
2012  /**
2013   * Gets the debug level.
2014   * @return debug level
2015   */
2016  public int getDebugLevel() {
2017    return m_DebugLevel;
2018  }
2019
2020  /**
2021   * Checks the instances.
2022   * No checks in this KDTree but it calls the check of the distance function.
2023   */
2024  protected void checkInstances () {
2025   
2026   // m_DistanceF.checkInstances();
2027  }
2028 
2029  /**
2030   * Parses a given list of options. <p/>
2031   *
2032   <!-- options-start -->
2033   * Valid options are: <p/>
2034   *
2035   * <pre> -I &lt;num&gt;
2036   *  maximum number of overall iterations
2037   *  (default 1).</pre>
2038   *
2039   * <pre> -M &lt;num&gt;
2040   *  maximum number of iterations in the kMeans loop in
2041   *  the Improve-Parameter part
2042   *  (default 1000).</pre>
2043   *
2044   * <pre> -J &lt;num&gt;
2045   *  maximum number of iterations in the kMeans loop
2046   *  for the splitted centroids in the Improve-Structure part
2047   *  (default 1000).</pre>
2048   *
2049   * <pre> -L &lt;num&gt;
2050   *  minimum number of clusters
2051   *  (default 2).</pre>
2052   *
2053   * <pre> -H &lt;num&gt;
2054   *  maximum number of clusters
2055   *  (default 4).</pre>
2056   *
2057   * <pre> -B &lt;value&gt;
2058   *  distance value for binary attributes
2059   *  (default 1.0).</pre>
2060   *
2061   * <pre> -use-kdtree
2062   *  Uses the KDTree internally
2063   *  (default no).</pre>
2064   *
2065   * <pre> -K &lt;KDTree class specification&gt;
2066   *  Full class name of KDTree class to use, followed
2067   *  by scheme options.
2068   *  eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
2069   *  (default no KDTree class used).</pre>
2070   *
2071   * <pre> -C &lt;value&gt;
2072   *  cutoff factor, takes the given percentage of the splitted
2073   *  centroids if none of the children win
2074   *  (default 0.0).</pre>
2075   *
2076   * <pre> -D &lt;distance function class specification&gt;
2077   *  Full class name of Distance function class to use, followed
2078   *  by scheme options.
2079   *  (default weka.core.EuclideanDistance).</pre>
2080   *
2081   * <pre> -N &lt;file name&gt;
2082   *  file to read starting centers from (ARFF format).</pre>
2083   *
2084   * <pre> -O &lt;file name&gt;
2085   *  file to write centers to (ARFF format).</pre>
2086   *
2087   * <pre> -U &lt;int&gt;
2088   *  The debug level.
2089   *  (default 0)</pre>
2090   *
2091   * <pre> -Y &lt;file name&gt;
2092   *  The debug vectors file.</pre>
2093   *
2094   * <pre> -S &lt;num&gt;
2095   *  Random number seed.
2096   *  (default 10)</pre>
2097   *
2098   <!-- options-end -->
2099   *
2100   * @param options the list of options as an array of strings
2101   * @throws Exception if an option is not supported
2102   */
2103  public void setOptions(String[] options)
2104    throws Exception {
2105   
2106    String      optionString;
2107    String      funcString;
2108
2109    optionString = Utils.getOption('I', options);
2110    if (optionString.length() != 0)
2111      setMaxIterations(Integer.parseInt(optionString));
2112    else
2113      setMaxIterations(1);
2114   
2115    optionString = Utils.getOption('M', options);
2116    if (optionString.length() != 0)
2117      setMaxKMeans(Integer.parseInt(optionString));
2118    else
2119      setMaxKMeans(1000);
2120   
2121    optionString = Utils.getOption('J', options);
2122    if (optionString.length() != 0)
2123      setMaxKMeansForChildren(Integer.parseInt(optionString));
2124    else
2125      setMaxKMeansForChildren(1000);
2126     
2127    optionString = Utils.getOption('L', options);
2128    if (optionString.length() != 0)
2129      setMinNumClusters(Integer.parseInt(optionString));
2130    else
2131      setMinNumClusters(2);
2132     
2133    optionString = Utils.getOption('H', options);
2134    if (optionString.length() != 0)
2135      setMaxNumClusters(Integer.parseInt(optionString));
2136    else
2137      setMaxNumClusters(4);
2138   
2139    optionString = Utils.getOption('B', options);
2140    if (optionString.length() != 0)
2141      setBinValue(Double.parseDouble(optionString));
2142    else
2143      setBinValue(1.0);
2144
2145    setUseKDTree(Utils.getFlag("use-kdtree", options));
2146   
2147    if (getUseKDTree()) {
2148      funcString = Utils.getOption('K', options);
2149      if (funcString.length() != 0) {
2150        String[] funcSpec = Utils.splitOptions(funcString);
2151        if (funcSpec.length == 0) {
2152          throw new Exception("Invalid function specification string");
2153        }
2154        String funcName = funcSpec[0];
2155        funcSpec[0] = "";
2156        setKDTree((KDTree) Utils.forName(KDTree.class, funcName, funcSpec));
2157      }
2158      else {
2159        setKDTree(new KDTree());
2160      }
2161    }
2162    else {
2163      setKDTree(new KDTree());
2164    }
2165
2166    optionString = Utils.getOption('C', options);
2167    if (optionString.length() != 0)
2168      setCutOffFactor(Double.parseDouble(optionString));
2169    else
2170      setCutOffFactor(0.0);
2171   
2172    funcString = Utils.getOption('D', options);
2173    if (funcString.length() != 0) {
2174      String[] funcSpec = Utils.splitOptions(funcString);
2175      if (funcSpec.length == 0) {
2176        throw new Exception("Invalid function specification string");
2177      }
2178      String funcName = funcSpec[0];
2179      funcSpec[0] = "";
2180      setDistanceF((DistanceFunction) Utils.forName(DistanceFunction.class,
2181                                                    funcName, funcSpec));
2182    }
2183    else {
2184      setDistanceF(new EuclideanDistance());
2185    }
2186
2187    optionString  = Utils.getOption('N', options);
2188    if (optionString.length() != 0) {
2189      setInputCenterFile(new File(optionString));
2190      m_CenterInput = 
2191        new BufferedReader(new FileReader(optionString));
2192    }
2193    else {
2194      setInputCenterFile(new File(System.getProperty("user.dir")));
2195      m_CenterInput = null;
2196    }
2197
2198    optionString  = Utils.getOption('O', options);
2199    if (optionString.length() != 0) {
2200      setOutputCenterFile(new File(optionString));
2201      m_CenterOutput = new PrintWriter(new FileOutputStream(optionString));
2202    }
2203    else {
2204      setOutputCenterFile(new File(System.getProperty("user.dir")));
2205      m_CenterOutput = null;
2206    }
2207
2208    optionString = Utils.getOption('U', options);
2209    int debugLevel = 0;
2210    if (optionString.length() != 0) {
2211      try {
2212        debugLevel = Integer.parseInt(optionString);
2213      } catch (NumberFormatException e) {
2214        throw new Exception(optionString +
2215                            "is an illegal value for option -U"); 
2216      }
2217    }
2218    setDebugLevel(debugLevel);
2219
2220    optionString  = Utils.getOption('Y', options);
2221    if (optionString.length() != 0) {
2222      setDebugVectorsFile(new File(optionString));
2223    }
2224    else {
2225      setDebugVectorsFile(new File(System.getProperty("user.dir")));
2226      m_DebugVectorsInput = null;
2227      m_DebugVectors      = null;
2228    }
2229   
2230    super.setOptions(options);
2231  }
2232 
2233  /**
2234   * Gets the current settings of SimpleKMeans.
2235   * @return an array of strings suitable for passing to setOptions
2236   */
2237  public String[] getOptions() {
2238    int         i;
2239    Vector      result;
2240    String[]    options;
2241
2242    result = new Vector();
2243
2244    result.add("-I");
2245    result.add("" + getMaxIterations());
2246   
2247    result.add("-M");
2248    result.add("" + getMaxKMeans());
2249   
2250    result.add("-J");
2251    result.add("" + getMaxKMeansForChildren());
2252   
2253    result.add("-L");
2254    result.add("" + getMinNumClusters());
2255   
2256    result.add("-H");
2257    result.add("" + getMaxNumClusters());
2258   
2259    result.add("-B");
2260    result.add("" + getBinValue());
2261   
2262    if (getUseKDTree()) {
2263      result.add("-use-kdtree");
2264      result.add("-K");
2265      result.add("" + getKDTreeSpec());
2266    }
2267   
2268    result.add("-C");
2269    result.add("" + getCutOffFactor());
2270   
2271    if (getDistanceF() != null) {
2272      result.add("-D");
2273      result.add("" + getDistanceFSpec());
2274    }
2275   
2276    if (getInputCenterFile().exists() && getInputCenterFile().isFile()) {
2277      result.add("-N");
2278      result.add("" + getInputCenterFile());
2279    }
2280   
2281    if (getOutputCenterFile().exists() && getOutputCenterFile().isFile()) {
2282      result.add("-O");
2283      result.add("" + getOutputCenterFile());
2284    }
2285   
2286    int dL = getDebugLevel();
2287    if (dL > 0) {
2288      result.add("-U");
2289      result.add("" + getDebugLevel());
2290    }
2291
2292    if (getDebugVectorsFile().exists() && getDebugVectorsFile().isFile()) {
2293      result.add("-Y");
2294      result.add("" + getDebugVectorsFile());
2295    }
2296   
2297    options = super.getOptions();
2298    for (i = 0; i < options.length; i++)
2299      result.add(options[i]);
2300
2301    return (String[]) result.toArray(new String[result.size()]);         
2302  }
2303
2304  /**
2305   * Return a string describing this clusterer.
2306   * @return a description of the clusterer as a string
2307   */
2308  public String toString() {
2309    StringBuffer temp = new StringBuffer();
2310
2311    temp.append("\nXMeans\n======\n");
2312
2313    temp.append("Requested iterations            : " + m_MaxIterations + "\n");
2314    temp.append("Iterations performed            : " + m_IterationCount+ "\n");
2315    if (m_KMeansStopped > 0) {
2316      temp.append("kMeans did not converge\n");
2317      temp.append("  but was stopped by max-loops " 
2318          + m_KMeansStopped + " times (max kMeans-iter)\n");
2319    }
2320    temp.append("Splits prepared                 : " + m_NumSplits + "\n");
2321    temp.append("Splits performed                : " + m_NumSplitsDone + "\n");
2322    temp.append("Cutoff factor                   : " + m_CutOffFactor + "\n");
2323    double perc;
2324    if (m_NumSplitsDone > 0)
2325      perc = (((double)m_NumSplitsStillDone)/((double) m_NumSplitsDone))
2326             * 100.0;
2327    else
2328      perc = 0.0;
2329    temp.append("Percentage of splits accepted \n" +
2330                "by cutoff factor                : " 
2331                + Utils.doubleToString(perc,2) + " %\n");
2332    temp.append("------\n");
2333
2334    temp.append("Cutoff factor                   : " + m_CutOffFactor + "\n");
2335    temp.append("------\n");
2336    temp.append("\nCluster centers                 : " + m_NumClusters + " centers\n");
2337    for (int i = 0; i < m_NumClusters; i++) {
2338      temp.append("\nCluster "+i+"\n           ");
2339      for (int j = 0; j < m_ClusterCenters.numAttributes(); j++) {
2340        if (m_ClusterCenters.attribute(j).isNominal()) {
2341          temp.append(" "+m_ClusterCenters.attribute(j).
2342                      value((int)m_ClusterCenters.instance(i).value(j)));
2343        } else {
2344          temp.append(" "+m_ClusterCenters.instance(i).value(j));
2345        }
2346      }
2347    }
2348    if (m_Mle != null)
2349      temp.append("\n\nDistortion: " + 
2350                  Utils.doubleToString(Utils.sum(m_Mle),6) + "\n");
2351    temp.append("BIC-Value : " + Utils.doubleToString(m_Bic,6) + "\n");
2352    return temp.toString();
2353  }
2354
2355  /**
2356   * Print centers for debug.
2357   * @param debugLevel level that gives according messages
2358   */
2359  protected void PrCentersFD(int debugLevel) {
2360    if (debugLevel == m_DebugLevel) {
2361      for (int i = 0; i < m_ClusterCenters.numInstances(); i++) {
2362        System.out.println(m_ClusterCenters.instance(i));
2363      }
2364    }
2365  }
2366
2367  /**
2368   * Tests on debug status.
2369   * @param debugLevel level that gives according messages
2370   * @return true if debug level is set
2371   */
2372  protected boolean TFD(int debugLevel) {
2373    return (debugLevel == m_DebugLevel);
2374  }
2375
2376  /**
2377   * Does debug printouts.
2378   * @param debugLevel level that gives according messages
2379   * @param output string that is printed
2380   */
2381  protected void PFD(int debugLevel, String output) {
2382    if (debugLevel == m_DebugLevel)
2383      System.out.println(output);
2384  }
2385  /**
2386   * Does debug printouts.
2387   * @param output string that is printed
2388   */
2389  protected void PFD_CURR(String output) {
2390    if (m_CurrDebugFlag)
2391      System.out.println(output);
2392  }
2393 
2394  /**
2395   * Returns the revision string.
2396   *
2397   * @return            the revision
2398   */
2399  public String getRevision() {
2400    return RevisionUtils.extract("$Revision: 5488 $");
2401  }
2402
2403  /**
2404   * Main method for testing this class.
2405   * @param argv should contain options
2406   */
2407  public static void main(String[] argv) {
2408    runClusterer(new XMeans(), argv);
2409  }
2410}
Note: See TracBrowser for help on using the repository browser.