source: tags/MetisMQIDemo/src/main/java/weka/datagenerators/clusterers/BIRCHCluster.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 41.8 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    BIRCHCluster.java
19 *    Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.datagenerators.clusterers;
24
25import weka.core.Attribute;
26import weka.core.FastVector;
27import weka.core.Instance;
28import weka.core.DenseInstance;
29import weka.core.Instances;
30import weka.core.Option;
31import weka.core.RevisionHandler;
32import weka.core.RevisionUtils;
33import weka.core.SelectedTag;
34import weka.core.Tag;
35import weka.core.TechnicalInformation;
36import weka.core.TechnicalInformationHandler;
37import weka.core.Utils;
38import weka.core.TechnicalInformation.Field;
39import weka.core.TechnicalInformation.Type;
40import weka.datagenerators.ClusterGenerator;
41
42import java.io.Serializable;
43import java.util.Enumeration;
44import java.util.Random;
45import java.util.Vector;
46
47/**
48 <!-- globalinfo-start -->
49 * Cluster data generator designed for the BIRCH System<br/>
50 * <br/>
51 * Dataset is generated with instances in K clusters.<br/>
52 * Instances are 2-d data points.<br/>
53 * Each cluster is characterized by the number of data points in itits radius and its center. The location of the cluster centers isdetermined by the pattern parameter. Three patterns are currentlysupported grid, sine and random.<br/>
54 * <br/>
55 * For more information refer to:<br/>
56 * <br/>
57 * Tian Zhang, Raghu Ramakrishnan, Miron Livny: BIRCH: An Efficient Data Clustering Method for Very Large Databases. In: ACM SIGMOD International Conference on Management of Data, 103-114, 1996.
58 * <p/>
59 <!-- globalinfo-end -->
60 *
61 <!-- technical-bibtex-start -->
62 * BibTeX:
63 * <pre>
64 * &#64;inproceedings{Zhang1996,
65 *    author = {Tian Zhang and Raghu Ramakrishnan and Miron Livny},
66 *    booktitle = {ACM SIGMOD International Conference on Management of Data},
67 *    pages = {103-114},
68 *    publisher = {ACM Press},
69 *    title = {BIRCH: An Efficient Data Clustering Method for Very Large Databases},
70 *    year = {1996}
71 * }
72 * </pre>
73 * <p/>
74 <!-- technical-bibtex-end -->
75 *
76 <!-- options-start -->
77 * Valid options are: <p/>
78 *
79 * <pre> -h
80 *  Prints this help.</pre>
81 *
82 * <pre> -o &lt;file&gt;
83 *  The name of the output file, otherwise the generated data is
84 *  printed to stdout.</pre>
85 *
86 * <pre> -r &lt;name&gt;
87 *  The name of the relation.</pre>
88 *
89 * <pre> -d
90 *  Whether to print debug informations.</pre>
91 *
92 * <pre> -S
93 *  The seed for random function (default 1)</pre>
94 *
95 * <pre> -a &lt;num&gt;
96 *  The number of attributes (default 10).</pre>
97 *
98 * <pre> -c
99 *  Class Flag, if set, the cluster is listed in extra attribute.</pre>
100 *
101 * <pre> -b &lt;range&gt;
102 *  The indices for boolean attributes.</pre>
103 *
104 * <pre> -m &lt;range&gt;
105 *  The indices for nominal attributes.</pre>
106 *
107 * <pre> -k &lt;num&gt;
108 *  The number of clusters (default 4)</pre>
109 *
110 * <pre> -G
111 *  Set pattern to grid (default is random).
112 *  This flag cannot be used at the same time as flag I.
113 *  The pattern is random, if neither flag G nor flag I is set.</pre>
114 *
115 * <pre> -I
116 *  Set pattern to sine (default is random).
117 *  This flag cannot be used at the same time as flag I.
118 *  The pattern is random, if neither flag G nor flag I is set.</pre>
119 *
120 * <pre> -N &lt;num&gt;..&lt;num&gt;
121 *  The range of number of instances per cluster (default 1..50).
122 *  Lower number must be between 0 and 2500,
123 *  upper number must be between 50 and 2500.</pre>
124 *
125 * <pre> -R &lt;num&gt;..&lt;num&gt;
126 *  The range of radius per cluster (default 0.1..1.4142135623730951).
127 *  Lower number must be between 0 and SQRT(2),
128 *  upper number must be between SQRT(2) and SQRT(32).</pre>
129 *
130 * <pre> -M &lt;num&gt;
131 *  The distance multiplier (default 4.0).</pre>
132 *
133 * <pre> -C &lt;num&gt;
134 *  The number of cycles (default 4).</pre>
135 *
136 * <pre> -O
137 *  Flag for input order is ORDERED. If flag is not set then
138 *  input order is RANDOMIZED. RANDOMIZED is currently not
139 *  implemented, therefore is the input order always ORDERED.</pre>
140 *
141 * <pre> -P &lt;num&gt;
142 *  The noise rate in percent (default 0.0).
143 *  Can be between 0% and 30%. (Remark: The original
144 *  algorithm only allows noise up to 10%.)</pre>
145 *
146 <!-- options-end -->
147 *
148 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
149 * @author FracPete (fracpete at waikato dot ac dot nz)
150 * @version $Revision: 5987 $
151 */
152public class BIRCHCluster 
153  extends ClusterGenerator
154  implements TechnicalInformationHandler {
155
156  /** for serialization */
157  static final long serialVersionUID = -334820527230755027L;
158 
159  /** Number of Clusters the dataset should have */
160  protected int m_NumClusters;
161
162  /** minimal number of instances per cluster (option N)*/ 
163  private int m_MinInstNum;
164 
165  /** maximal number of instances per cluster (option N)*/ 
166  private int m_MaxInstNum;
167 
168  /** minimum radius (option R)*/ 
169  private double m_MinRadius;
170 
171  /** maximum radius (option R)*/ 
172  private double m_MaxRadius;
173 
174  /**  Constant set for choice of pattern. (option G)*/
175  public static final int GRID = 0;
176  /**  Constant set for choice of pattern. (option I)*/
177  public static final int SINE = 1;
178  /**  Constant set for choice of pattern. (default)*/
179  public static final int RANDOM = 2;
180  /** the pattern tags */
181  public static final Tag[] TAGS_PATTERN = {
182    new Tag(GRID,   "Grid"),
183    new Tag(SINE,   "Sine"),
184    new Tag(RANDOM, "Random")
185  };
186 
187  /** pattern (changed with options G or S)*/ 
188  private int m_Pattern;
189 
190  /** distance multiplier (option M)*/
191  private double m_DistMult;
192
193  /** number of cycles (option C)*/
194  private int m_NumCycles;
195
196  /**  Constant set for input order (option O)*/
197  public static final int ORDERED = 0;
198  /**  Constant set for input order (default)*/
199  public static final int RANDOMIZED = 1;
200  /** the input order tags */
201  public static final Tag[] TAGS_INPUTORDER = {
202    new Tag(ORDERED,    "ordered"),
203    new Tag(RANDOMIZED, "randomized")
204  };
205
206  /** input order (changed with option O)*/ 
207  private int m_InputOrder;
208
209  /** noise rate in percent (option P,  between 0 and 30)*/ 
210  private double m_NoiseRate;
211
212  /** cluster list */
213  private FastVector m_ClusterList;
214
215  // following are used for pattern is GRID
216  /** grid size*/
217  private int m_GridSize;
218
219  /** grid width*/
220  private double m_GridWidth;
221 
222  /**
223   * class to represent cluster
224   */
225  private class Cluster 
226    implements Serializable, RevisionHandler {
227
228    /** for serialization */
229    static final long serialVersionUID = -8336901069823498140L;   
230   
231    /** number of instances for this cluster */
232    private int m_InstNum;
233
234    /** radius of cluster
235     *   variance is radius ** 2 / 2 */
236    private double m_Radius;
237
238    /** center of cluster = array of Double values */
239    private double[] m_Center;
240
241    /**
242     * Constructor, used for pattern = RANDOM
243     *
244     * @param instNum the number of instances
245     * @param radius radius of the cluster
246     * @param random the random number generator to use
247     */
248    private Cluster(int instNum, double radius, Random random) {
249      m_InstNum = instNum;
250      m_Radius = radius;
251      m_Center = new double[getNumAttributes()];
252      for (int i = 0; i < getNumAttributes(); i++) {
253        m_Center[i] = random.nextDouble() * (double) m_NumClusters;
254      }
255    }
256
257    /**
258     * Constructor, used for pattern = GRID
259     *
260     * @param instNum the number of instances
261     * @param radius radius of the cluster
262     * @param gridVector vector for grid positions
263     * @param gridWidth factor for grid position
264     */
265      // center is defined in the constructor of cluster
266    private Cluster(int instNum,
267                    double radius,
268                    int[] gridVector,
269                    double gridWidth) {
270      m_InstNum = instNum;
271      m_Radius = radius;
272      m_Center = new double[getNumAttributes()];
273      for (int i = 0; i < getNumAttributes(); i++) {
274        m_Center[i] = ((double) gridVector[i] + 1.0) * gridWidth;
275      }
276     
277    }
278   
279    /**
280     * returns the number of instances
281     *
282     * @return the number of instances
283     */
284    private int getInstNum() { 
285      return m_InstNum; 
286    }
287   
288    /**
289     * returns the radius
290     *
291     * @return the radius
292     */
293    private double getRadius() { 
294      return m_Radius; 
295    }
296   
297    /**
298     * returns the variance
299     *
300     * @return the variance
301     */
302    private double getVariance() { 
303      return Math.pow(m_Radius, 2.0) / 2.0; 
304    }
305   
306    /**
307     * returns the standard deviation
308     *
309     * @return the standard deviation
310     */
311    private double getStdDev() { 
312      return (m_Radius / Math.pow(2.0, 0.5)); 
313    }
314   
315    /**
316     * returns the centers
317     *
318     * @return the centers
319     */
320    private double[] getCenter() { 
321      return m_Center; 
322    }
323   
324    /**
325     * returns the center value for a given dimension
326     *
327     * @param dimension the dimension to return the center for
328     * @return the center value for the given dimension
329     * @throws Exception if dimension invalid
330     */
331    private double getCenterValue(int dimension) throws Exception {
332      if (dimension >= m_Center.length)
333        throw new Exception("Current system has only " +
334                            m_Center.length + " dimensions.");
335      return m_Center[dimension];
336    }
337   
338    /**
339     * Returns the revision string.
340     *
341     * @return          the revision
342     */
343    public String getRevision() {
344      return RevisionUtils.extract("$Revision: 5987 $");
345    }
346  } // end class Cluster
347
348  /**
349   * class to represent Vector for placement of the center in space
350   */
351  private class GridVector 
352    implements Serializable, RevisionHandler {
353
354    /** for serialization */
355    static final long serialVersionUID = -1900309948991039522L;
356   
357    /** array of integer */
358    private int[] m_GridVector;
359
360    /**  one higher then the highest possible integer value
361     *  in any of the integers in the gridvector */
362    private int m_Base;
363
364    /** size of vector */
365    private int m_Size;
366
367    /**
368     * Constructor
369     *
370     * @param numDim number of dimensions = number of attributes
371     * @param base is one higher then the highest possible integer value
372     * in any of the integers in the gridvector
373     */
374    private GridVector(int numDim, int base) {
375      m_Size = numDim;
376      m_Base = base;
377      m_GridVector = new int [numDim];
378      for (int i = 0; i < numDim; i++)
379        m_GridVector[i] = 0;
380    }
381
382    /**
383     * returns the integer array
384     *
385     * @return the integer array
386     */
387    private int[] getGridVector() {
388      return m_GridVector;
389    }
390
391    /**
392     * Overflow has occurred when integer is zero.
393     *
394     *@param digit the input integer
395     *@return true if digit is 0
396     */
397    private boolean overflow(int digit) {
398      return (digit == 0);
399    }
400
401    /**
402     * Adds one to integer and sets to zero, if new value was
403     * equal m_Base.
404     *
405     *@param digit the input integer
406     *@return new integer object
407     */
408    private int addOne(int digit) {
409      int value = digit + 1;
410      if (value >= m_Base) value = 0;
411      return value;
412    }
413
414    /**
415     * add 1 to vector
416     */
417    private void addOne() {
418      m_GridVector[0] = addOne(m_GridVector[0]);
419      int i = 1;
420      while (overflow(m_GridVector[i - 1]) && i < m_Size) {
421        m_GridVector[i] = addOne(m_GridVector[i]);
422        i++;
423      }
424       
425    }
426   
427    /**
428     * Returns the revision string.
429     *
430     * @return          the revision
431     */
432    public String getRevision() {
433      return RevisionUtils.extract("$Revision: 5987 $");
434    }
435  } // end class GridVector
436 
437  /**
438   * initializes the generator with default values
439   */
440  public BIRCHCluster() {
441    super();
442
443    setNumClusters(defaultNumClusters());
444    setMinInstNum(defaultMinInstNum());
445    setMaxInstNum(defaultMaxInstNum());
446    setMinRadius(defaultMinRadius());
447    setMaxRadius(defaultMaxRadius());
448    setPattern(defaultPattern());
449    setDistMult(defaultDistMult());
450    setNumCycles(defaultNumCycles());
451    setInputOrder(defaultInputOrder());
452    setNoiseRate(defaultNoiseRate());
453  }
454 
455  /**
456   * Returns a string describing this data generator.
457   *
458   * @return a description of the data generator suitable for
459   * displaying in the explorer/experimenter gui
460   */
461  public String globalInfo() {
462    return 
463        "Cluster data generator designed for the BIRCH System\n\n"
464      + "Dataset is generated with instances in K clusters.\n"
465      + "Instances are 2-d data points.\n"
466      + "Each cluster is characterized by the number of data points in it"
467      + "its radius and its center. The location of the cluster centers is"
468      + "determined by the pattern parameter. Three patterns are currently"
469      + "supported grid, sine and random.\n\n"
470      + "For more information refer to:\n\n"
471      + getTechnicalInformation().toString();
472  }
473
474  /**
475   * Returns an instance of a TechnicalInformation object, containing
476   * detailed information about the technical background of this class,
477   * e.g., paper reference or book this class is based on.
478   *
479   * @return the technical information about this class
480   */
481  public TechnicalInformation getTechnicalInformation() {
482    TechnicalInformation        result;
483   
484    result = new TechnicalInformation(Type.INPROCEEDINGS);
485    result.setValue(Field.AUTHOR, "Tian Zhang and Raghu Ramakrishnan and Miron Livny");
486    result.setValue(Field.TITLE, "BIRCH: An Efficient Data Clustering Method for Very Large Databases");
487    result.setValue(Field.BOOKTITLE, "ACM SIGMOD International Conference on Management of Data");
488    result.setValue(Field.YEAR, "1996");
489    result.setValue(Field.PAGES, "103-114");
490    result.setValue(Field.PUBLISHER, "ACM Press");
491   
492    return result;
493  }
494
495  /**
496   * Returns an enumeration describing the available options.
497   *
498   * @return an enumeration of all the available options
499   */
500  public Enumeration listOptions() {
501    Vector result = enumToVector(super.listOptions());
502
503    result.addElement(new Option(
504          "\tThe number of clusters (default "
505          + defaultNumClusters() + ")",
506          "k", 1, "-k <num>"));
507
508    result.addElement(new Option(
509          "\tSet pattern to grid (default is random).\n"
510          + "\tThis flag cannot be used at the same time as flag I.\n"
511          + "\tThe pattern is random, if neither flag G nor flag I is set.",
512          "G", 0, "-G"));
513
514    result.addElement(new Option(
515          "\tSet pattern to sine (default is random).\n"
516          + "\tThis flag cannot be used at the same time as flag I.\n"
517          + "\tThe pattern is random, if neither flag G nor flag I is set.",
518          "I", 0, "-I"));
519
520    result.addElement(new Option(
521          "\tThe range of number of instances per cluster (default "
522          + defaultMinInstNum() + ".." + defaultMaxInstNum() + ").\n"
523          + "\tLower number must be between 0 and 2500,\n"
524          + "\tupper number must be between 50 and 2500.",
525          "N", 1, "-N <num>..<num>"));
526
527    result.addElement(new Option(
528          "\tThe range of radius per cluster (default "
529          + defaultMinRadius() + ".." + defaultMaxRadius() + ").\n"
530          + "\tLower number must be between 0 and SQRT(2), \n"
531          + "\tupper number must be between SQRT(2) and SQRT(32).",
532          "R", 1, "-R <num>..<num>"));
533
534    result.addElement(new Option(
535          "\tThe distance multiplier (default " 
536          + defaultDistMult() + ").",
537          "M", 1, "-M <num>"));
538
539    result.addElement(new Option(
540          "\tThe number of cycles (default "
541          + defaultNumCycles() + ").",
542          "C", 1, "-C <num>"));
543
544    result.addElement(new Option(
545          "\tFlag for input order is ORDERED. If flag is not set then \n"
546          + "\tinput order is RANDOMIZED. RANDOMIZED is currently not \n"
547          + "\timplemented, therefore is the input order always ORDERED.",
548          "O", 0, "-O"));
549
550    result.addElement(new Option(
551          "\tThe noise rate in percent (default " 
552          + defaultNoiseRate() + ").\n"
553          + "\tCan be between 0% and 30%. (Remark: The original \n"
554          + "\talgorithm only allows noise up to 10%.)",
555          "P", 1, "-P <num>"));
556
557    return result.elements();
558  }
559 
560  /**
561   * Parses a list of options for this object. <p/>
562   *
563   <!-- options-start -->
564   * Valid options are: <p/>
565   *
566   * <pre> -h
567   *  Prints this help.</pre>
568   *
569   * <pre> -o &lt;file&gt;
570   *  The name of the output file, otherwise the generated data is
571   *  printed to stdout.</pre>
572   *
573   * <pre> -r &lt;name&gt;
574   *  The name of the relation.</pre>
575   *
576   * <pre> -d
577   *  Whether to print debug informations.</pre>
578   *
579   * <pre> -S
580   *  The seed for random function (default 1)</pre>
581   *
582   * <pre> -a &lt;num&gt;
583   *  The number of attributes (default 10).</pre>
584   *
585   * <pre> -c
586   *  Class Flag, if set, the cluster is listed in extra attribute.</pre>
587   *
588   * <pre> -b &lt;range&gt;
589   *  The indices for boolean attributes.</pre>
590   *
591   * <pre> -m &lt;range&gt;
592   *  The indices for nominal attributes.</pre>
593   *
594   * <pre> -k &lt;num&gt;
595   *  The number of clusters (default 4)</pre>
596   *
597   * <pre> -G
598   *  Set pattern to grid (default is random).
599   *  This flag cannot be used at the same time as flag I.
600   *  The pattern is random, if neither flag G nor flag I is set.</pre>
601   *
602   * <pre> -I
603   *  Set pattern to sine (default is random).
604   *  This flag cannot be used at the same time as flag I.
605   *  The pattern is random, if neither flag G nor flag I is set.</pre>
606   *
607   * <pre> -N &lt;num&gt;..&lt;num&gt;
608   *  The range of number of instances per cluster (default 1..50).
609   *  Lower number must be between 0 and 2500,
610   *  upper number must be between 50 and 2500.</pre>
611   *
612   * <pre> -R &lt;num&gt;..&lt;num&gt;
613   *  The range of radius per cluster (default 0.1..1.4142135623730951).
614   *  Lower number must be between 0 and SQRT(2),
615   *  upper number must be between SQRT(2) and SQRT(32).</pre>
616   *
617   * <pre> -M &lt;num&gt;
618   *  The distance multiplier (default 4.0).</pre>
619   *
620   * <pre> -C &lt;num&gt;
621   *  The number of cycles (default 4).</pre>
622   *
623   * <pre> -O
624   *  Flag for input order is ORDERED. If flag is not set then
625   *  input order is RANDOMIZED. RANDOMIZED is currently not
626   *  implemented, therefore is the input order always ORDERED.</pre>
627   *
628   * <pre> -P &lt;num&gt;
629   *  The noise rate in percent (default 0.0).
630   *  Can be between 0% and 30%. (Remark: The original
631   *  algorithm only allows noise up to 10%.)</pre>
632   *
633   <!-- options-end -->
634   *
635   * @param options the list of options as an array of strings
636   * @throws Exception if an option is not supported
637   */
638  public void setOptions(String[] options) throws Exception {
639    String        tmpStr;
640   
641    super.setOptions(options);
642
643    tmpStr = Utils.getOption('k', options);
644    if (tmpStr.length() != 0)
645      setNumClusters(Integer.parseInt(tmpStr));
646    else
647      setNumClusters(defaultNumClusters());
648
649    tmpStr = Utils.getOption('N', options);
650    if (tmpStr.length() != 0)
651      setInstNums(tmpStr);
652    else
653      setInstNums(defaultMinInstNum() + ".." + defaultMaxInstNum());
654   
655    tmpStr = Utils.getOption('R', options);
656    if (tmpStr.length() != 0)
657      setRadiuses(tmpStr);
658    else
659      setRadiuses(defaultMinRadius() + ".." + defaultMaxRadius());
660
661    boolean grid = Utils.getFlag('G', options);
662    boolean sine = Utils.getFlag('I', options);
663
664    if (grid && sine)
665      throw new Exception("Flags -G and -I can only be set mutually exclusiv.");
666
667    setPattern(new SelectedTag(RANDOM, TAGS_PATTERN));
668    if (grid)
669      setPattern(new SelectedTag(GRID, TAGS_PATTERN));
670    if (sine)
671      setPattern(new SelectedTag(SINE, TAGS_PATTERN));
672
673    tmpStr= Utils.getOption('M', options);
674    if (tmpStr.length() != 0) {
675      if (!grid)
676        throw new Exception("Option M can only be used with GRID pattern.");
677      setDistMult(Double.parseDouble(tmpStr));
678    }
679    else {
680      setDistMult(defaultDistMult());
681    }
682
683    tmpStr = Utils.getOption('C', options);
684    if (tmpStr.length() != 0) {
685      if (!sine)
686        throw new Exception("Option C can only be used with SINE pattern.");
687      setNumCycles(Integer.parseInt(tmpStr));
688    } 
689    else {
690      setNumCycles(defaultNumCycles());
691    }
692
693    if (Utils.getFlag('O', options))
694      setInputOrder(new SelectedTag(ORDERED, TAGS_INPUTORDER));
695    else
696      setInputOrder(defaultInputOrder());
697
698    tmpStr = Utils.getOption('P', options);
699    if (tmpStr.length() != 0)
700      setNoiseRate(Double.parseDouble(tmpStr));
701    else
702      setNoiseRate(defaultNoiseRate());
703  }
704
705  /**
706   * Gets the current settings of the datagenerator BIRCHCluster.
707   *
708   * @return an array of strings suitable for passing to setOptions
709   */
710  public String[] getOptions() {
711    Vector        result;
712    String[]      options;
713    int           i;
714   
715    result  = new Vector();
716    options = super.getOptions();
717    for (i = 0; i < options.length; i++)
718      result.add(options[i]);
719   
720    result.add("-k");
721    result.add("" + getNumClusters());
722   
723    result.add("-N"); 
724    result.add("" + getInstNums());
725
726    result.add("-R"); 
727    result.add("" + getRadiuses());
728
729    if (m_Pattern == GRID) {
730      result.add("-G");
731     
732      result.add("-M"); 
733      result.add("" + getDistMult());
734    }
735
736    if (m_Pattern == SINE) {
737      result.add("-I");
738     
739      result.add("-C"); 
740      result.add("" + getNumCycles());
741    }
742
743    if (getOrderedFlag())
744      result.add("-O");
745
746    result.add("-P"); 
747    result.add("" + getNoiseRate());
748   
749    return (String[]) result.toArray(new String[result.size()]);
750  }
751
752  /**
753   * returns the default number of clusters
754   *
755   * @return the default number of clusters
756   */
757  protected int defaultNumClusters() {
758    return 4;
759  }
760
761  /**
762   * Sets the number of clusters the dataset should have.
763   * @param numClusters the new number of clusters
764   */
765  public void setNumClusters(int numClusters) { 
766    m_NumClusters = numClusters; 
767  }
768
769  /**
770   * Gets the number of clusters the dataset should have.
771   * @return the number of clusters the dataset should have
772   */
773  public int getNumClusters() { 
774    return m_NumClusters; 
775  }
776 
777  /**
778   * Returns the tip text for this property
779   *
780   * @return tip text for this property suitable for
781   *         displaying in the explorer/experimenter gui
782   */
783  public String numClustersTipText() {
784    return "The number of clusters to generate.";
785  }
786
787  /**
788   * Sets the upper and lower boundary for instances per cluster.
789   *
790   * @param fromTo  the string containing the upper and lower boundary for
791   *                instances per cluster separated by ..
792   */
793  protected void setInstNums(String fromTo) {
794    int i = fromTo.indexOf("..");
795    String from = fromTo.substring(0, i);
796    setMinInstNum(Integer.parseInt(from));
797    String to = fromTo.substring(i + 2, fromTo.length());
798    setMaxInstNum(Integer.parseInt(to));
799  }
800 
801  /**
802   * Gets the upper and lower boundary for instances per cluster.
803   *
804   * @return the string containing the upper and lower boundary for
805   * instances per cluster separated by ..
806   */
807  protected String getInstNums() {
808    String fromTo = "" 
809                    + getMinInstNum() + ".."
810                    + getMaxInstNum();
811    return fromTo;
812  }
813 
814  /**
815   * Returns the tip text for this property
816   *
817   * @return tip text for this property suitable for
818   *         displaying in the explorer/experimenter gui
819   */
820  protected String instNumsTipText() {
821    return "The upper and lowet boundary for instances per cluster.";
822  }
823
824  /**
825   * returns the default min number of instances
826   *
827   * @return the default min number of instances
828   */
829  protected int defaultMinInstNum() {
830    return 1;
831  }
832
833  /**
834   * Gets the lower boundary for instances per cluster.
835   *
836   * @return the the lower boundary for instances per cluster
837   */
838  public int getMinInstNum() { 
839    return m_MinInstNum; 
840  }
841 
842  /**
843   * Sets the lower boundary for instances per cluster.
844   *
845   * @param newMinInstNum new lower boundary for instances per cluster
846   */
847  public void setMinInstNum(int newMinInstNum) {
848    m_MinInstNum = newMinInstNum;
849  }
850 
851  /**
852   * Returns the tip text for this property
853   *
854   * @return tip text for this property suitable for
855   *         displaying in the explorer/experimenter gui
856   */
857  public String minInstNumTipText() {
858    return "The lower boundary for instances per cluster.";
859  }
860
861  /**
862   * returns the default max number of instances
863   *
864   * @return the default max number of instances
865   */
866  protected int defaultMaxInstNum() {
867    return 50;
868  }
869
870  /**
871   * Gets the upper boundary for instances per cluster.
872   *
873   * @return the upper boundary for instances per cluster
874   */
875  public int getMaxInstNum() { 
876    return m_MaxInstNum; 
877  }
878 
879  /**
880   * Sets the upper boundary for instances per cluster.
881   *
882   * @param newMaxInstNum new upper boundary for instances per cluster
883   */
884  public void setMaxInstNum(int newMaxInstNum) {
885    m_MaxInstNum = newMaxInstNum;
886  }
887 
888  /**
889   * Returns the tip text for this property
890   *
891   * @return tip text for this property suitable for
892   *         displaying in the explorer/experimenter gui
893   */
894  public String maxInstNumTipText() {
895    return "The upper boundary for instances per cluster.";
896  }
897
898  /**
899   * Sets the upper and lower boundary for the radius of the clusters.
900   *
901   * @param fromTo the string containing the upper and lower boundary for
902   * the radius  of the clusters, separated by ..
903   */
904  protected void setRadiuses(String fromTo) {
905    int i = fromTo.indexOf("..");
906    String from = fromTo.substring(0, i);
907    setMinRadius(Double.valueOf(from).doubleValue());
908    String to = fromTo.substring(i + 2, fromTo.length());
909    setMaxRadius(Double.valueOf(to).doubleValue());
910  }
911
912  /**
913   * Gets the upper and lower boundary for the radius of the clusters.
914   *
915   * @return the string containing the upper and lower boundary for
916   * the radius  of the clusters, separated by ..
917   */
918  protected String getRadiuses() {
919    String fromTo = "" 
920                    + Utils.doubleToString(getMinRadius(), 2) + ".."
921                    + Utils.doubleToString(getMaxRadius(), 2);
922    return fromTo;
923  }
924 
925  /**
926   * Returns the tip text for this property
927   *
928   * @return tip text for this property suitable for
929   *         displaying in the explorer/experimenter gui
930   */
931  protected String radiusesTipText() {
932    return "The upper and lower boundary for the radius of the clusters.";
933  }
934
935  /**
936   * returns the default min radius
937   *
938   * @return the default min radius
939   */
940  protected double defaultMinRadius() {
941    return 0.1;
942  }
943
944  /**
945   * Gets the lower boundary for the radiuses of the clusters.
946   *
947   * @return the lower boundary for the radiuses of the clusters
948   */
949  public double getMinRadius() { 
950    return m_MinRadius; 
951  }
952 
953  /**
954   * Sets the lower boundary for the radiuses of the clusters.
955   *
956   * @param newMinRadius new lower boundary for the radiuses of the clusters
957   */
958  public void setMinRadius(double newMinRadius) {
959    m_MinRadius = newMinRadius;
960  }
961 
962  /**
963   * Returns the tip text for this property
964   *
965   * @return tip text for this property suitable for
966   *         displaying in the explorer/experimenter gui
967   */
968  public String minRadiusTipText() {
969    return "The lower boundary for the radius of the clusters.";
970  }
971
972  /**
973   * returns the default max radius
974   *
975   * @return the default max radius
976   */
977  protected double defaultMaxRadius() {
978    return Math.sqrt(2.0);
979  }
980
981  /**
982   * Gets the upper boundary for the radiuses of the clusters.
983   *
984   * @return the upper boundary for the radiuses of the clusters
985   */
986  public double getMaxRadius() { 
987    return m_MaxRadius; 
988  }
989 
990  /**
991   * Sets the upper boundary for the radiuses of the clusters.
992   *
993   * @param newMaxRadius new upper boundary for the radiuses of the clusters
994   */
995  public void setMaxRadius(double newMaxRadius) {
996    m_MaxRadius = newMaxRadius;
997  }
998 
999  /**
1000   * Returns the tip text for this property
1001   *
1002   * @return tip text for this property suitable for
1003   *         displaying in the explorer/experimenter gui
1004   */
1005  public String maxRadiusTipText() {
1006    return "The upper boundary for the radius of the clusters.";
1007  }
1008
1009  /**
1010   * returns the default pattern
1011   *
1012   * @return the default pattern
1013   */
1014  protected SelectedTag defaultPattern() {
1015    return new SelectedTag(RANDOM, TAGS_PATTERN);
1016  }
1017 
1018  /**
1019   * Gets the pattern type.
1020   *
1021   * @return the current pattern type
1022   */
1023  public SelectedTag getPattern() { 
1024    return new SelectedTag(m_Pattern, TAGS_PATTERN);
1025  }
1026
1027  /**
1028   * Sets the pattern type.
1029   *
1030   * @param value new pattern type
1031   */
1032  public void setPattern(SelectedTag value) {
1033    if (value.getTags() == TAGS_PATTERN)
1034      m_Pattern = value.getSelectedTag().getID();
1035  }
1036 
1037  /**
1038   * Returns the tip text for this property
1039   *
1040   * @return tip text for this property suitable for
1041   *         displaying in the explorer/experimenter gui
1042   */
1043  public String patternTipText() {
1044    return "The pattern for generating the data.";
1045  }
1046
1047  /**
1048   * returns the default distance multiplier
1049   *
1050   * @return the default distance multiplier
1051   */
1052  protected double defaultDistMult() {
1053    return 4.0;
1054  }
1055
1056  /**
1057   * Gets the distance multiplier.
1058   *
1059   * @return the distance multiplier
1060   */
1061  public double getDistMult() { 
1062    return m_DistMult; 
1063  }
1064 
1065  /**
1066   * Sets the distance multiplier.
1067   *
1068   * @param newDistMult new distance multiplier
1069   */
1070  public void setDistMult(double newDistMult) {
1071    m_DistMult = newDistMult;
1072  }
1073 
1074  /**
1075   * Returns the tip text for this property
1076   *
1077   * @return tip text for this property suitable for
1078   *         displaying in the explorer/experimenter gui
1079   */
1080  public String distMultTipText() {
1081    return "The distance multiplier (in combination with the 'Grid' pattern).";
1082  }
1083
1084  /**
1085   * returns the default number of cycles
1086   *
1087   * @return the default number of cycles
1088   */
1089  protected int defaultNumCycles() {
1090    return 4;
1091  }
1092
1093  /**
1094   * Gets the number of cycles.
1095   *
1096   * @return the number of cycles
1097   */
1098  public int getNumCycles() { 
1099    return m_NumCycles; 
1100  }
1101 
1102  /**
1103   * Sets the the number of cycles.
1104   *
1105   * @param newNumCycles new number of cycles
1106   */
1107  public void setNumCycles(int newNumCycles) {
1108    m_NumCycles = newNumCycles;
1109  }
1110 
1111  /**
1112   * Returns the tip text for this property
1113   *
1114   * @return tip text for this property suitable for
1115   *         displaying in the explorer/experimenter gui
1116   */
1117  public String numCyclesTipText() {
1118    return "The number of cycles to use (in combination with the 'Sine' pattern).";
1119  }
1120
1121  /**
1122   * returns the default input order
1123   *
1124   * @return the default input order
1125   */
1126  protected SelectedTag defaultInputOrder() {
1127    return new SelectedTag(ORDERED, TAGS_INPUTORDER);  // TODO: the only one that is currently implemented, normally RANDOMIZED
1128  }
1129
1130  /**
1131   * Gets the input order.
1132   *
1133   * @return the current input order
1134   */
1135  public SelectedTag getInputOrder() { 
1136    return new SelectedTag(m_InputOrder, TAGS_INPUTORDER);
1137  }
1138
1139  /**
1140   * Sets the input order.
1141   *
1142   * @param value new input order
1143   */
1144  public void setInputOrder(SelectedTag value) {
1145    if (value.getTags() == TAGS_INPUTORDER)
1146      m_InputOrder = value.getSelectedTag().getID();
1147  }
1148 
1149  /**
1150   * Returns the tip text for this property
1151   *
1152   * @return tip text for this property suitable for
1153   *         displaying in the explorer/experimenter gui
1154   */
1155  public String inputOrderTipText() {
1156    return "The input order to use.";
1157  }
1158
1159  /**
1160   * Gets the ordered flag (option O).
1161   *
1162   * @return true if ordered flag is set
1163   */
1164  public boolean getOrderedFlag() { 
1165    return m_InputOrder == ORDERED; 
1166  }
1167
1168  /**
1169   * returns the default noise rate
1170   *
1171   * @return the default noise rate
1172   */
1173  protected double defaultNoiseRate() {
1174    return 0.0;
1175  }
1176
1177  /**
1178   * Gets the percentage of noise set.
1179   *
1180   * @return the percentage of noise set
1181   */
1182  public double getNoiseRate() { 
1183    return m_NoiseRate; 
1184  }
1185 
1186  /**
1187   * Sets the percentage of noise set.
1188   *
1189   * @param newNoiseRate new percentage of noise
1190   */
1191  public void setNoiseRate(double newNoiseRate) {
1192    m_NoiseRate = newNoiseRate;
1193  }
1194 
1195  /**
1196   * Returns the tip text for this property
1197   *
1198   * @return tip text for this property suitable for
1199   *         displaying in the explorer/experimenter gui
1200   */
1201  public String noiseRateTipText() {
1202    return "The noise rate to use.";
1203  }
1204
1205  /**
1206   * Gets the single mode flag.
1207   *
1208   * @return true if methode generateExample can be used.
1209   */
1210  public boolean getSingleModeFlag() { 
1211    return false; 
1212  }
1213 
1214  /**
1215   * Initializes the format for the dataset produced.
1216   *
1217   * @return the output data format
1218   * @throws Exception data format could not be defined
1219   */
1220
1221  public Instances defineDataFormat() throws Exception {
1222    Random random = new Random (getSeed());
1223    setRandom(random);
1224    Instances dataset;
1225    FastVector attributes = new FastVector(3);
1226    Attribute attribute;
1227    boolean classFlag = getClassFlag();
1228   
1229    FastVector classValues = null;
1230    if (classFlag) classValues = new FastVector (m_NumClusters);     
1231
1232    // define dataset
1233    for (int i = 0; i < getNumAttributes(); i++) {
1234      attribute = new Attribute("X" + i); 
1235      attributes.addElement(attribute);
1236    }
1237   
1238    if (classFlag) {
1239      for (int i = 0; i < m_NumClusters; i++)
1240        classValues.addElement("c" + i);
1241      attribute = new Attribute ("class", classValues); 
1242      attributes.addElement(attribute);
1243    }
1244
1245    dataset = new Instances(getRelationNameToUse(), attributes, 0);
1246    if (classFlag) 
1247      dataset.setClassIndex(getNumAttributes());
1248
1249    // set dataset format of this class
1250    Instances format = new Instances(dataset, 0);
1251    setDatasetFormat(format);
1252
1253    m_ClusterList = defineClusters(random);
1254
1255    //System.out.println("dataset" + dataset.numAttributes());
1256    return dataset; 
1257  }
1258
1259  /**
1260   * Generate an example of the dataset.
1261   * @return the instance generated
1262   * @throws Exception if format not defined or generating <br/>
1263   * examples one by one is not possible, because voting is chosen
1264   */
1265
1266  public Instance generateExample() throws Exception {
1267    throw new Exception("Examples cannot be generated" +
1268                                           " one by one.");
1269  }
1270
1271  /**
1272   * Generate all examples of the dataset.
1273   * @return the instance generated
1274   * @throws Exception if format not defined
1275   */
1276
1277  public Instances generateExamples() throws Exception {
1278    Random random = getRandom();
1279    Instances data = getDatasetFormat();
1280    if (data == null) throw new Exception("Dataset format not defined.");
1281
1282    // generate examples
1283    if (getOrderedFlag())
1284      data = generateExamples(random, data);
1285    else
1286      throw new Exception("RANDOMIZED is not yet implemented.");
1287 
1288    return (data);
1289  }
1290
1291  /**
1292   * Generate all examples of the dataset.
1293   *
1294   * @param random the random number generator to use
1295   * @param format the dataset format
1296   * @return the instance generated
1297   * @throws Exception if format not defined
1298   */
1299  public Instances generateExamples(Random random,
1300                                    Instances format) throws Exception {
1301    Instance example = null;
1302   
1303    if (format == null) 
1304      throw new Exception("Dataset format not defined.");
1305
1306    // generate examples for one cluster after another
1307    int cNum = 0;
1308    for (Enumeration enm = m_ClusterList.elements();
1309         enm.hasMoreElements(); cNum++) {
1310      Cluster cl  = (Cluster) enm.nextElement();
1311      double stdDev = cl.getStdDev();
1312      int instNum = cl.getInstNum();
1313      double[] center = cl.getCenter();
1314      String cName = "c" + cNum;
1315
1316      for (int i = 0; i < instNum; i++) {
1317        // generate example
1318        example = generateInstance(
1319                    format, random, stdDev, center, cName);
1320       
1321        if (example != null)
1322          example.setDataset(format);
1323        format.add(example);
1324      }
1325    }
1326
1327    return (format);
1328  }
1329
1330  /**
1331   * Generate an example of the dataset.
1332   *
1333   * @param format the dataset format
1334   * @param randomG the random number generator
1335   * @param stdDev the standard deviation to use
1336   * @param center the centers
1337   * @param cName the class value
1338   * @return the instance generated
1339   * examples one by one is not possible, because voting is chosen
1340   */
1341  private Instance generateInstance (Instances format,
1342                                     Random randomG,
1343                                     double stdDev,
1344                                     double[] center,
1345                                     String cName) {
1346    Instance example;
1347    int numAtts = getNumAttributes();
1348    if (getClassFlag()) 
1349      numAtts++;
1350
1351    example = new DenseInstance(numAtts);
1352    example.setDataset(format);
1353       
1354    for (int i = 0; i < getNumAttributes(); i++)
1355      example.setValue(i, randomG.nextGaussian() * stdDev + center[i]); 
1356   
1357    if (getClassFlag())
1358      example.setClassValue(cName);
1359
1360    return example; 
1361  }
1362
1363 /**
1364   * Defines the clusters
1365   *
1366   * @param random random number generator
1367   * @return the cluster definitions
1368   * @throws Exception if defining fails
1369   */
1370  private FastVector defineClusters(Random random)
1371   throws Exception {
1372
1373    if (m_Pattern == GRID)
1374      return defineClustersGRID(random);
1375    else
1376      return defineClustersRANDOM(random);
1377  }
1378
1379  /**
1380   * Defines the clusters if pattern is GRID
1381   *
1382   * @param random random number generator
1383   * @return the defined clusters for GRID
1384   * @throws Exception if something goes wrong
1385   */
1386  private FastVector defineClustersGRID(Random random)
1387    throws Exception {
1388
1389    FastVector clusters = new FastVector(m_NumClusters);
1390    double diffInstNum = (double) (m_MaxInstNum - m_MinInstNum);
1391    double minInstNum = (double) m_MinInstNum;
1392    double diffRadius = m_MaxRadius - m_MinRadius;
1393    Cluster cluster;
1394
1395    // compute gridsize
1396    double gs = Math.pow(m_NumClusters, 1.0 / getNumAttributes());
1397   
1398    if (gs - ((double) ((int) gs))  > 0.0) {
1399      m_GridSize = (int) (gs + 1.0);
1400    } else { m_GridSize = (int) gs; }
1401
1402    // compute gridwidth
1403    m_GridWidth = ((m_MaxRadius + m_MinRadius) / 2) * m_DistMult;
1404
1405    //System.out.println("GridSize= " + m_GridSize);
1406    //System.out.println("GridWidth= " + m_GridWidth);
1407   
1408    // initialize gridvector with zeros
1409    GridVector gv = new GridVector(getNumAttributes(), m_GridSize);
1410
1411    for (int i = 0; i < m_NumClusters; i++) {
1412      int instNum = (int) (random.nextDouble() * diffInstNum
1413                                   + minInstNum);
1414      double radius = (random.nextDouble() * diffRadius) + m_MinRadius;
1415
1416      // center is defined in the constructor of cluster
1417      cluster = new Cluster(instNum, radius,
1418                            gv.getGridVector(), m_GridWidth);
1419      clusters.addElement((Object) cluster);
1420      gv.addOne();
1421    }
1422    return clusters;
1423  }
1424
1425 /**
1426   * Defines the clusters if pattern is RANDOM
1427   *
1428   * @param random random number generator
1429   * @return the cluster definitions
1430   * @throws Exception if something goes wrong
1431   */
1432  private FastVector defineClustersRANDOM(Random random)
1433    throws Exception {
1434
1435    FastVector clusters = new FastVector(m_NumClusters);
1436    double diffInstNum = (double) (m_MaxInstNum - m_MinInstNum);
1437    double minInstNum = (double) m_MinInstNum;
1438    double diffRadius = m_MaxRadius - m_MinRadius;
1439    Cluster cluster;
1440
1441    for (int i = 0; i < m_NumClusters; i++) {
1442      int instNum = (int) (random.nextDouble() * diffInstNum
1443                                   + minInstNum);
1444      double radius = (random.nextDouble() * diffRadius) + m_MinRadius;
1445
1446      // center is defined in the constructor of cluster
1447      cluster = new Cluster(instNum, radius, random);
1448      clusters.addElement((Object) cluster);
1449    }
1450    return clusters;
1451  }
1452
1453
1454  /**
1455   * Compiles documentation about the data generation after
1456   * the generation process
1457   *
1458   * @return string with additional information about generated dataset
1459   * @throws Exception no input structure has been defined
1460   */
1461  public String generateFinished() throws Exception {
1462    return "";
1463  }
1464 
1465  /**
1466   * Compiles documentation about the data generation before
1467   * the generation process
1468   *
1469   * @return string with additional information
1470   */
1471  public String generateStart() {
1472    StringBuffer docu = new StringBuffer();
1473
1474    int sumInst = 0;
1475    int cNum = 0;
1476    for (Enumeration enm = m_ClusterList.elements();
1477         enm.hasMoreElements(); cNum++) {
1478      Cluster cl  = (Cluster) enm.nextElement();
1479      docu.append("%\n");
1480      docu.append("% Cluster: c"+ cNum + "\n");
1481      docu.append("% ----------------------------------------------\n");
1482      docu.append("% StandardDeviation: "
1483                  + Utils.doubleToString(cl.getStdDev(), 2) + "\n");
1484      docu.append("% Number of instances: "
1485                  + cl.getInstNum() + "\n");
1486      sumInst += cl.getInstNum();
1487      double[] center = cl.getCenter();
1488      docu.append("% "); 
1489      for (int i = 0; i < center.length - 1; i++) {
1490        docu.append(Utils.doubleToString(center[i], 2) + ", ");
1491      }
1492      docu.append(Utils.doubleToString(center[center.length - 1], 2) + "\n");
1493    }
1494    docu.append("%\n% ----------------------------------------------\n"); 
1495    docu.append("% Total number of instances: " + sumInst + "\n");
1496    docu.append("%                            in " + cNum + " clusters\n");
1497    docu.append("% Pattern chosen           : ");
1498    if (m_Pattern == GRID) 
1499      docu.append(
1500          "GRID, " + "distance multiplier = " 
1501          + Utils.doubleToString(m_DistMult, 2) + "\n");
1502    else if (m_Pattern == SINE) 
1503      docu.append("SINE\n");
1504    else
1505      docu.append("RANDOM\n");
1506
1507    return docu.toString();
1508  }
1509 
1510  /**
1511   * Returns the revision string.
1512   *
1513   * @return            the revision
1514   */
1515  public String getRevision() {
1516    return RevisionUtils.extract("$Revision: 5987 $");
1517  }
1518
1519  /**
1520   * Main method for testing this class.
1521   *
1522   * @param args should contain arguments for the data producer:
1523   */
1524  public static void main(String[] args) {
1525    runDataGenerator(new BIRCHCluster(), args);
1526  }
1527}
1528
Note: See TracBrowser for help on using the repository browser.