1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * SubspaceCluster.java |
---|
19 | * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.datagenerators.clusterers; |
---|
24 | |
---|
25 | import weka.core.Attribute; |
---|
26 | import weka.core.FastVector; |
---|
27 | import weka.core.Instance; |
---|
28 | import weka.core.DenseInstance; |
---|
29 | import weka.core.Instances; |
---|
30 | import weka.core.Option; |
---|
31 | import weka.core.Range; |
---|
32 | import weka.core.RevisionUtils; |
---|
33 | import weka.core.Tag; |
---|
34 | import weka.core.Utils; |
---|
35 | import weka.datagenerators.ClusterDefinition; |
---|
36 | import weka.datagenerators.ClusterGenerator; |
---|
37 | |
---|
38 | import java.util.Enumeration; |
---|
39 | import java.util.Random; |
---|
40 | import java.util.Vector; |
---|
41 | |
---|
42 | /** |
---|
43 | <!-- globalinfo-start --> |
---|
44 | * A data generator that produces data points in hyperrectangular subspace clusters. |
---|
45 | * <p/> |
---|
46 | <!-- globalinfo-end --> |
---|
47 | * |
---|
48 | <!-- options-start --> |
---|
49 | * Valid options are: <p/> |
---|
50 | * |
---|
51 | * <pre> -h |
---|
52 | * Prints this help.</pre> |
---|
53 | * |
---|
54 | * <pre> -o <file> |
---|
55 | * The name of the output file, otherwise the generated data is |
---|
56 | * printed to stdout.</pre> |
---|
57 | * |
---|
58 | * <pre> -r <name> |
---|
59 | * The name of the relation.</pre> |
---|
60 | * |
---|
61 | * <pre> -d |
---|
62 | * Whether to print debug information.</pre> |
---|
63 | * |
---|
64 | * <pre> -S |
---|
65 | * The seed for random function (default 1)</pre> |
---|
66 | * |
---|
67 | * <pre> -a <num> |
---|
68 | * The number of attributes (default 1).</pre> |
---|
69 | * |
---|
70 | * <pre> -c |
---|
71 | * Class Flag, if set, the cluster is listed in extra attribute.</pre> |
---|
72 | * |
---|
73 | * <pre> -b <range> |
---|
74 | * The indices for boolean attributes.</pre> |
---|
75 | * |
---|
76 | * <pre> -m <range> |
---|
77 | * The indices for nominal attributes.</pre> |
---|
78 | * |
---|
79 | * <pre> -P <num> |
---|
80 | * The noise rate in percent (default 0.0). |
---|
81 | * Can be between 0% and 30%. (Remark: The original |
---|
82 | * algorithm only allows noise up to 10%.)</pre> |
---|
83 | * |
---|
84 | * <pre> -C <cluster-definition> |
---|
85 | * A cluster definition of class 'SubspaceClusterDefinition' |
---|
86 | * (definition needs to be quoted to be recognized as |
---|
87 | * a single argument).</pre> |
---|
88 | * |
---|
89 | * <pre> |
---|
90 | * Options specific to weka.datagenerators.clusterers.SubspaceClusterDefinition: |
---|
91 | * </pre> |
---|
92 | * |
---|
93 | * <pre> -A <range> |
---|
94 | * Generates randomly distributed instances in the cluster.</pre> |
---|
95 | * |
---|
96 | * <pre> -U <range> |
---|
97 | * Generates uniformly distributed instances in the cluster.</pre> |
---|
98 | * |
---|
99 | * <pre> -G <range> |
---|
100 | * Generates gaussian distributed instances in the cluster.</pre> |
---|
101 | * |
---|
102 | * <pre> -D <num>,<num> |
---|
103 | * The attribute min/max (-A and -U) or mean/stddev (-G) for |
---|
104 | * the cluster.</pre> |
---|
105 | * |
---|
106 | * <pre> -N <num>..<num> |
---|
107 | * The range of number of instances per cluster (default 1..50).</pre> |
---|
108 | * |
---|
109 | * <pre> -I |
---|
110 | * Uses integer instead of continuous values (default continuous).</pre> |
---|
111 | * |
---|
112 | <!-- options-end --> |
---|
113 | * |
---|
114 | * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) |
---|
115 | * @author FracPete (fracpete at waikato dot ac dot nz) |
---|
116 | * @version $Revision: 5987 $ |
---|
117 | */ |
---|
118 | public class SubspaceCluster |
---|
119 | extends ClusterGenerator { |
---|
120 | |
---|
/** for serialization */
static final long serialVersionUID = -3454999858505621128L;

/** noise rate in percent (option -P; the option help documents 0 to 30) */
protected double m_NoiseRate;

/** the cluster definitions; may be null or empty until getClusters() creates a default one */
protected ClusterDefinition[] m_Clusters;

/**
 * number of values per attribute (only relevant for nominal attributes);
 * setOptions() initializes every entry to 1
 */
protected int[] m_numValues;

/** store global min values (NOTE(review): not referenced in the visible part of this class) */
protected double[] m_globalMinValue;

/** store global max values (NOTE(review): not referenced in the visible part of this class) */
protected double[] m_globalMaxValue;

/** cluster type: uniform/random */
public static final int UNIFORM_RANDOM = 0;
/** cluster type: total uniform */
public static final int TOTAL_UNIFORM = 1;
/** cluster type: gaussian */
public static final int GAUSSIAN = 2;
/** the tags for the cluster types */
public static final Tag[] TAGS_CLUSTERTYPE = {
  new Tag(UNIFORM_RANDOM, "uniform/random"),
  new Tag(TOTAL_UNIFORM,  "total uniform"),
  new Tag(GAUSSIAN,       "gaussian")
};

/** cluster subtype: continuous */
public static final int CONTINUOUS = 0;
/** cluster subtype: integer */
public static final int INTEGER = 1;
/** the tags for the cluster subtypes */
public static final Tag[] TAGS_CLUSTERSUBTYPE = {
  new Tag(CONTINUOUS, "continuous"),
  new Tag(INTEGER,    "integer")
};
---|
161 | |
---|
162 | /** |
---|
163 | * initializes the generator, sets the number of clusters to 0, since user |
---|
164 | * has to specify them explicitly |
---|
165 | */ |
---|
166 | public SubspaceCluster() { |
---|
167 | super(); |
---|
168 | |
---|
169 | setNoiseRate(defaultNoiseRate()); |
---|
170 | } |
---|
171 | |
---|
172 | /** |
---|
173 | * Returns a string describing this data generator. |
---|
174 | * |
---|
175 | * @return a description of the data generator suitable for |
---|
176 | * displaying in the explorer/experimenter gui |
---|
177 | */ |
---|
178 | public String globalInfo() { |
---|
179 | return "A data generator that produces data points in " |
---|
180 | + "hyperrectangular subspace clusters."; |
---|
181 | } |
---|
182 | |
---|
183 | /** |
---|
184 | * Returns an enumeration describing the available options. |
---|
185 | * |
---|
186 | * @return an enumeration of all the available options |
---|
187 | */ |
---|
188 | public Enumeration listOptions() { |
---|
189 | Vector result = enumToVector(super.listOptions()); |
---|
190 | |
---|
191 | result.addElement(new Option( |
---|
192 | "\tThe noise rate in percent (default " |
---|
193 | + defaultNoiseRate() + ").\n" |
---|
194 | + "\tCan be between 0% and 30%. (Remark: The original \n" |
---|
195 | + "\talgorithm only allows noise up to 10%.)", |
---|
196 | "P", 1, "-P <num>")); |
---|
197 | |
---|
198 | result.addElement(new Option( |
---|
199 | "\tA cluster definition of class '" |
---|
200 | + SubspaceClusterDefinition.class.getName().replaceAll(".*\\.", "") + "'\n" |
---|
201 | + "\t(definition needs to be quoted to be recognized as \n" |
---|
202 | + "\ta single argument).", |
---|
203 | "C", 1, "-C <cluster-definition>")); |
---|
204 | |
---|
205 | result.addElement(new Option( |
---|
206 | "", "", 0, |
---|
207 | "\nOptions specific to " |
---|
208 | + SubspaceClusterDefinition.class.getName() + ":")); |
---|
209 | |
---|
210 | result.addAll( |
---|
211 | enumToVector(new SubspaceClusterDefinition(this).listOptions())); |
---|
212 | |
---|
213 | return result.elements(); |
---|
214 | } |
---|
215 | |
---|
216 | /** |
---|
217 | * Parses a list of options for this object. <p/> |
---|
218 | * |
---|
219 | <!-- options-start --> |
---|
220 | * Valid options are: <p/> |
---|
221 | * |
---|
222 | * <pre> -h |
---|
223 | * Prints this help.</pre> |
---|
224 | * |
---|
225 | * <pre> -o <file> |
---|
226 | * The name of the output file, otherwise the generated data is |
---|
227 | * printed to stdout.</pre> |
---|
228 | * |
---|
229 | * <pre> -r <name> |
---|
230 | * The name of the relation.</pre> |
---|
231 | * |
---|
232 | * <pre> -d |
---|
233 | * Whether to print debug information.</pre> |
---|
234 | * |
---|
235 | * <pre> -S |
---|
236 | * The seed for random function (default 1)</pre> |
---|
237 | * |
---|
238 | * <pre> -a <num> |
---|
239 | * The number of attributes (default 1).</pre> |
---|
240 | * |
---|
241 | * <pre> -c |
---|
242 | * Class Flag, if set, the cluster is listed in extra attribute.</pre> |
---|
243 | * |
---|
244 | * <pre> -b <range> |
---|
245 | * The indices for boolean attributes.</pre> |
---|
246 | * |
---|
247 | * <pre> -m <range> |
---|
248 | * The indices for nominal attributes.</pre> |
---|
249 | * |
---|
250 | * <pre> -P <num> |
---|
251 | * The noise rate in percent (default 0.0). |
---|
252 | * Can be between 0% and 30%. (Remark: The original |
---|
253 | * algorithm only allows noise up to 10%.)</pre> |
---|
254 | * |
---|
255 | * <pre> -C <cluster-definition> |
---|
256 | * A cluster definition of class 'SubspaceClusterDefinition' |
---|
257 | * (definition needs to be quoted to be recognized as |
---|
258 | * a single argument).</pre> |
---|
259 | * |
---|
260 | * <pre> |
---|
261 | * Options specific to weka.datagenerators.clusterers.SubspaceClusterDefinition: |
---|
262 | * </pre> |
---|
263 | * |
---|
264 | * <pre> -A <range> |
---|
265 | * Generates randomly distributed instances in the cluster.</pre> |
---|
266 | * |
---|
267 | * <pre> -U <range> |
---|
268 | * Generates uniformly distributed instances in the cluster.</pre> |
---|
269 | * |
---|
270 | * <pre> -G <range> |
---|
271 | * Generates gaussian distributed instances in the cluster.</pre> |
---|
272 | * |
---|
273 | * <pre> -D <num>,<num> |
---|
274 | * The attribute min/max (-A and -U) or mean/stddev (-G) for |
---|
275 | * the cluster.</pre> |
---|
276 | * |
---|
277 | * <pre> -N <num>..<num> |
---|
278 | * The range of number of instances per cluster (default 1..50).</pre> |
---|
279 | * |
---|
280 | * <pre> -I |
---|
281 | * Uses integer instead of continuous values (default continuous).</pre> |
---|
282 | * |
---|
283 | <!-- options-end --> |
---|
284 | * |
---|
285 | * @param options the list of options as an array of strings |
---|
286 | * @throws Exception if an option is not supported |
---|
287 | */ |
---|
288 | public void setOptions(String[] options) throws Exception { |
---|
289 | String tmpStr; |
---|
290 | SubspaceClusterDefinition cl; |
---|
291 | Vector list; |
---|
292 | int clCount; |
---|
293 | |
---|
294 | super.setOptions(options); |
---|
295 | |
---|
296 | m_numValues = new int[getNumAttributes()]; |
---|
297 | // numValues might be changed by a cluster definition |
---|
298 | // (only relevant for nominal data) |
---|
299 | for (int i = 0; i < getNumAttributes(); i++) |
---|
300 | m_numValues[i] = 1; |
---|
301 | |
---|
302 | tmpStr = Utils.getOption('P', options); |
---|
303 | if (tmpStr.length() != 0) |
---|
304 | setNoiseRate(Double.parseDouble(tmpStr)); |
---|
305 | else |
---|
306 | setNoiseRate(defaultNoiseRate()); |
---|
307 | |
---|
308 | // cluster definitions |
---|
309 | list = new Vector(); |
---|
310 | |
---|
311 | clCount = 0; |
---|
312 | do { |
---|
313 | tmpStr = Utils.getOption('C', options); |
---|
314 | if (tmpStr.length() != 0) { |
---|
315 | clCount++; |
---|
316 | cl = new SubspaceClusterDefinition(this); |
---|
317 | cl.setOptions(Utils.splitOptions(tmpStr)); |
---|
318 | list.add(cl); |
---|
319 | } |
---|
320 | } |
---|
321 | while (tmpStr.length() != 0); |
---|
322 | |
---|
323 | m_Clusters = (ClusterDefinition[]) |
---|
324 | list.toArray(new ClusterDefinition[list.size()]); |
---|
325 | // in case no cluster definition was provided, make sure that there's at |
---|
326 | // least one definition present -> see getClusters() |
---|
327 | getClusters(); |
---|
328 | } |
---|
329 | |
---|
330 | |
---|
331 | /** |
---|
332 | * Gets the current settings of the datagenerator. |
---|
333 | * |
---|
334 | * @return an array of strings suitable for passing to setOptions |
---|
335 | */ |
---|
336 | public String[] getOptions() { |
---|
337 | Vector result; |
---|
338 | String[] options; |
---|
339 | int i; |
---|
340 | |
---|
341 | result = new Vector(); |
---|
342 | options = super.getOptions(); |
---|
343 | for (i = 0; i < options.length; i++) |
---|
344 | result.add(options[i]); |
---|
345 | |
---|
346 | result.add("-P"); |
---|
347 | result.add("" + getNoiseRate()); |
---|
348 | |
---|
349 | for (i = 0; i < getClusters().length; i++) { |
---|
350 | result.add("-C"); |
---|
351 | result.add(Utils.joinOptions(getClusters()[i].getOptions())); |
---|
352 | } |
---|
353 | |
---|
354 | return (String[]) result.toArray(new String[result.size()]); |
---|
355 | } |
---|
356 | |
---|
357 | /** |
---|
358 | * returns the current cluster definitions, if necessary initializes them |
---|
359 | * |
---|
360 | * @return the current cluster definitions |
---|
361 | */ |
---|
362 | protected ClusterDefinition[] getClusters() { |
---|
363 | if ( (m_Clusters == null) || (m_Clusters.length == 0) ) { |
---|
364 | if (m_Clusters != null) |
---|
365 | System.out.println("NOTE: at least 1 cluster definition is necessary, " |
---|
366 | + "created default one."); |
---|
367 | m_Clusters = new ClusterDefinition[]{new SubspaceClusterDefinition(this)}; |
---|
368 | } |
---|
369 | |
---|
370 | return m_Clusters; |
---|
371 | } |
---|
372 | |
---|
373 | /** |
---|
374 | * returns the default number of attributes |
---|
375 | * |
---|
376 | * @return the default number of attributes |
---|
377 | */ |
---|
378 | protected int defaultNumAttributes() { |
---|
379 | return 1; |
---|
380 | } |
---|
381 | |
---|
382 | /** |
---|
383 | * Sets the number of attributes the dataset should have. |
---|
384 | * @param numAttributes the new number of attributes |
---|
385 | */ |
---|
386 | public void setNumAttributes(int numAttributes) { |
---|
387 | super.setNumAttributes(numAttributes); |
---|
388 | m_numValues = new int[getNumAttributes()]; |
---|
389 | } |
---|
390 | |
---|
391 | /** |
---|
392 | * Returns the tip text for this property |
---|
393 | * |
---|
394 | * @return tip text for this property suitable for |
---|
395 | * displaying in the explorer/experimenter gui |
---|
396 | */ |
---|
397 | public String numAttributesTipText() { |
---|
398 | return "The number of attributes the generated data will contain (Note: they must be covered by the cluster definitions!)"; |
---|
399 | } |
---|
400 | |
---|
401 | /** |
---|
402 | * returns the default noise rate |
---|
403 | * |
---|
404 | * @return the default noise rate |
---|
405 | */ |
---|
406 | protected double defaultNoiseRate() { |
---|
407 | return 0.0; |
---|
408 | } |
---|
409 | |
---|
410 | /** |
---|
411 | * Gets the percentage of noise set. |
---|
412 | * |
---|
413 | * @return the percentage of noise set |
---|
414 | */ |
---|
415 | public double getNoiseRate() { |
---|
416 | return m_NoiseRate; |
---|
417 | } |
---|
418 | |
---|
419 | /** |
---|
420 | * Sets the percentage of noise set. |
---|
421 | * |
---|
422 | * @param newNoiseRate new percentage of noise |
---|
423 | */ |
---|
424 | public void setNoiseRate(double newNoiseRate) { |
---|
425 | m_NoiseRate = newNoiseRate; |
---|
426 | } |
---|
427 | |
---|
428 | /** |
---|
429 | * Returns the tip text for this property |
---|
430 | * |
---|
431 | * @return tip text for this property suitable for |
---|
432 | * displaying in the explorer/experimenter gui |
---|
433 | */ |
---|
434 | public String noiseRateTipText() { |
---|
435 | return "The noise rate to use."; |
---|
436 | } |
---|
437 | |
---|
438 | /** |
---|
439 | * returns the currently set clusters |
---|
440 | * |
---|
441 | * @return the currently set clusters |
---|
442 | */ |
---|
443 | public ClusterDefinition[] getClusterDefinitions() { |
---|
444 | return getClusters(); |
---|
445 | } |
---|
446 | |
---|
447 | /** |
---|
448 | * sets the clusters to use |
---|
449 | * |
---|
450 | * @param value the clusters do use |
---|
451 | * @throws Exception if clusters are not the correct class |
---|
452 | */ |
---|
453 | public void setClusterDefinitions(ClusterDefinition[] value) |
---|
454 | throws Exception { |
---|
455 | |
---|
456 | String indexStr; |
---|
457 | |
---|
458 | indexStr = ""; |
---|
459 | m_Clusters = value; |
---|
460 | for (int i = 0; i < getClusters().length; i++) { |
---|
461 | if (!(getClusters()[i] instanceof SubspaceClusterDefinition)) { |
---|
462 | if (indexStr.length() != 0) |
---|
463 | indexStr += ","; |
---|
464 | indexStr += "" + (i+1); |
---|
465 | } |
---|
466 | getClusters()[i].setParent(this); |
---|
467 | getClusters()[i].setOptions(getClusters()[i].getOptions()); // for initializing! |
---|
468 | } |
---|
469 | |
---|
470 | // any wrong classes encountered? |
---|
471 | if (indexStr.length() != 0) |
---|
472 | throw new Exception("These cluster definitions are not '" |
---|
473 | + SubspaceClusterDefinition.class.getName() + "': " + indexStr); |
---|
474 | } |
---|
475 | |
---|
476 | /** |
---|
477 | * Returns the tip text for this property |
---|
478 | * |
---|
479 | * @return tip text for this property suitable for |
---|
480 | * displaying in the explorer/experimenter gui |
---|
481 | */ |
---|
482 | public String clusterDefinitionsTipText() { |
---|
483 | return "The clusters to use."; |
---|
484 | } |
---|
485 | |
---|
486 | /** |
---|
487 | * Checks, whether all attributes are covered by cluster definitions and |
---|
488 | * returns TRUE in that case. |
---|
489 | * |
---|
490 | * @return whether all attributes are covered |
---|
491 | */ |
---|
492 | protected boolean checkCoverage() { |
---|
493 | int i; |
---|
494 | int n; |
---|
495 | int[] count; |
---|
496 | Range r; |
---|
497 | String attrIndex; |
---|
498 | SubspaceClusterDefinition cl; |
---|
499 | |
---|
500 | // check whether all the attributes are covered |
---|
501 | count = new int[getNumAttributes()]; |
---|
502 | for (i = 0; i < getNumAttributes(); i++) { |
---|
503 | for (n = 0; n < getClusters().length; n++) { |
---|
504 | cl = (SubspaceClusterDefinition) getClusters()[n]; |
---|
505 | r = new Range(cl.getAttrIndexRange()); |
---|
506 | r.setUpper(getNumAttributes()); |
---|
507 | if (r.isInRange(i)) |
---|
508 | count[i]++; |
---|
509 | } |
---|
510 | } |
---|
511 | |
---|
512 | // list all indices that are not covered |
---|
513 | attrIndex = ""; |
---|
514 | for (i = 0; i < count.length; i++) { |
---|
515 | if (count[i] == 0) { |
---|
516 | if (attrIndex.length() != 0) |
---|
517 | attrIndex += ","; |
---|
518 | attrIndex += (i+1); |
---|
519 | } |
---|
520 | } |
---|
521 | |
---|
522 | if (attrIndex.length() != 0) |
---|
523 | throw new IllegalArgumentException( |
---|
524 | "The following attributes are not covered by a cluster " |
---|
525 | + "definition: " + attrIndex + "\n"); |
---|
526 | |
---|
527 | return true; |
---|
528 | } |
---|
529 | |
---|
530 | /** |
---|
531 | * Gets the single mode flag. |
---|
532 | * |
---|
533 | * @return true if methode generateExample can be used. |
---|
534 | */ |
---|
535 | public boolean getSingleModeFlag() { |
---|
536 | return false; |
---|
537 | } |
---|
538 | |
---|
539 | /** |
---|
540 | * Initializes the format for the dataset produced. |
---|
541 | * |
---|
542 | * @return the output data format |
---|
543 | * @throws Exception data format could not be defined |
---|
544 | */ |
---|
545 | |
---|
546 | public Instances defineDataFormat() throws Exception { |
---|
547 | |
---|
548 | // initialize |
---|
549 | setOptions(getOptions()); |
---|
550 | |
---|
551 | checkCoverage(); |
---|
552 | |
---|
553 | Random random = new Random (getSeed()); |
---|
554 | setRandom(random); |
---|
555 | Instances dataset; |
---|
556 | FastVector attributes = new FastVector(3); |
---|
557 | Attribute attribute; |
---|
558 | boolean classFlag = getClassFlag(); |
---|
559 | |
---|
560 | FastVector classValues = null; |
---|
561 | if (classFlag) |
---|
562 | classValues = new FastVector(getClusters().length); |
---|
563 | FastVector boolValues = new FastVector(2); |
---|
564 | boolValues.addElement("false"); |
---|
565 | boolValues.addElement("true"); |
---|
566 | FastVector nomValues = null; |
---|
567 | |
---|
568 | // define dataset |
---|
569 | for (int i = 0; i < getNumAttributes(); i++) { |
---|
570 | // define boolean attribute |
---|
571 | if (m_booleanCols.isInRange(i)) { |
---|
572 | attribute = new Attribute("B" + i, boolValues); |
---|
573 | } |
---|
574 | else if (m_nominalCols.isInRange(i)) { |
---|
575 | // define nominal attribute |
---|
576 | nomValues = new FastVector(m_numValues[i]); |
---|
577 | for (int j = 0; j < m_numValues[i]; j++) |
---|
578 | nomValues.addElement("value-" + j); |
---|
579 | attribute = new Attribute("N" + i, nomValues); |
---|
580 | } |
---|
581 | else { |
---|
582 | // numerical attribute |
---|
583 | attribute = new Attribute("X" + i); |
---|
584 | } |
---|
585 | attributes.addElement(attribute); |
---|
586 | } |
---|
587 | |
---|
588 | if (classFlag) { |
---|
589 | for (int i = 0; i < getClusters().length; i++) |
---|
590 | classValues.addElement("c" + i); |
---|
591 | attribute = new Attribute ("class", classValues); |
---|
592 | attributes.addElement(attribute); |
---|
593 | } |
---|
594 | |
---|
595 | dataset = new Instances(getRelationNameToUse(), attributes, 0); |
---|
596 | if (classFlag) |
---|
597 | dataset.setClassIndex(m_NumAttributes); |
---|
598 | |
---|
599 | // set dataset format of this class |
---|
600 | Instances format = new Instances(dataset, 0); |
---|
601 | setDatasetFormat(format); |
---|
602 | |
---|
603 | for (int i = 0; i < getClusters().length; i++) { |
---|
604 | SubspaceClusterDefinition cl = (SubspaceClusterDefinition) getClusters()[i]; |
---|
605 | cl.setNumInstances(random); |
---|
606 | cl.setParent(this); |
---|
607 | } |
---|
608 | |
---|
609 | return dataset; |
---|
610 | } |
---|
611 | |
---|
612 | /** |
---|
613 | * Returns true if attribute is boolean |
---|
614 | *@param index of the attribute |
---|
615 | *@return true if the attribute is boolean |
---|
616 | */ |
---|
617 | public boolean isBoolean(int index) { |
---|
618 | return m_booleanCols.isInRange(index); |
---|
619 | } |
---|
620 | |
---|
621 | /** |
---|
622 | * Returns true if attribute is nominal |
---|
623 | *@param index of the attribute |
---|
624 | *@return true if the attribute is nominal |
---|
625 | */ |
---|
626 | public boolean isNominal(int index) { |
---|
627 | return m_nominalCols.isInRange(index); |
---|
628 | } |
---|
629 | |
---|
630 | /** |
---|
631 | * returns array that stores the number of values for a nominal attribute. |
---|
632 | * |
---|
633 | * @return the array that stores the number of values for a nominal attribute |
---|
634 | */ |
---|
635 | public int[] getNumValues() { |
---|
636 | return m_numValues; |
---|
637 | } |
---|
638 | |
---|
639 | /** |
---|
640 | * Generate an example of the dataset. |
---|
641 | * @return the instance generated |
---|
642 | * @throws Exception if format not defined or generating <br/> |
---|
643 | * examples one by one is not possible, because voting is chosen |
---|
644 | */ |
---|
645 | |
---|
646 | public Instance generateExample() throws Exception { |
---|
647 | throw new Exception("Examples cannot be generated one by one."); |
---|
648 | } |
---|
649 | |
---|
650 | /** |
---|
651 | * Generate all examples of the dataset. |
---|
652 | * @return the instance generated |
---|
653 | * @throws Exception if format not defined |
---|
654 | */ |
---|
655 | |
---|
656 | public Instances generateExamples() throws Exception { |
---|
657 | Instances format = getDatasetFormat(); |
---|
658 | Instance example = null; |
---|
659 | |
---|
660 | if (format == null) |
---|
661 | throw new Exception("Dataset format not defined."); |
---|
662 | |
---|
663 | // generate examples for one cluster after another |
---|
664 | for (int cNum = 0; cNum < getClusters().length; cNum++) { |
---|
665 | SubspaceClusterDefinition cl = (SubspaceClusterDefinition) getClusters()[cNum]; |
---|
666 | |
---|
667 | //get the number of instances to create |
---|
668 | int instNum = cl.getNumInstances(); |
---|
669 | |
---|
670 | //class value is c + cluster number |
---|
671 | String cName = "c" + cNum; |
---|
672 | |
---|
673 | switch (cl.getClusterType().getSelectedTag().getID()) { |
---|
674 | case (UNIFORM_RANDOM): |
---|
675 | for (int i = 0; i < instNum; i++) { |
---|
676 | // generate example |
---|
677 | example = generateExample(format, getRandom(), cl, cName); |
---|
678 | if (example != null) |
---|
679 | format.add(example); |
---|
680 | } |
---|
681 | break; |
---|
682 | case (TOTAL_UNIFORM): |
---|
683 | // generate examples |
---|
684 | if (!cl.isInteger()) |
---|
685 | generateUniformExamples(format, instNum, cl, cName); |
---|
686 | else |
---|
687 | generateUniformIntegerExamples(format, instNum, cl, cName); |
---|
688 | break; |
---|
689 | case (GAUSSIAN): |
---|
690 | // generate examples |
---|
691 | generateGaussianExamples(format, instNum, getRandom(), cl, cName); |
---|
692 | break; |
---|
693 | } |
---|
694 | } |
---|
695 | |
---|
696 | return format; |
---|
697 | } |
---|
698 | |
---|
699 | /** |
---|
700 | * Generate an example of the dataset. |
---|
701 | * |
---|
702 | * @param format the dataset format |
---|
703 | * @param randomG the random number generator to use |
---|
704 | * @param cl the cluster definition |
---|
705 | * @param cName the class value |
---|
706 | * @return the generated instance |
---|
707 | */ |
---|
708 | private Instance generateExample( |
---|
709 | Instances format, Random randomG, SubspaceClusterDefinition cl, |
---|
710 | String cName) { |
---|
711 | |
---|
712 | boolean makeInteger = cl.isInteger(); |
---|
713 | int num = -1; |
---|
714 | Instance example = null; |
---|
715 | int numAtts = m_NumAttributes; |
---|
716 | if (getClassFlag()) numAtts++; |
---|
717 | |
---|
718 | example = new DenseInstance(numAtts); |
---|
719 | example.setDataset(format); |
---|
720 | boolean[] attributes = cl.getAttributes(); |
---|
721 | double[] minValue = cl.getMinValue(); |
---|
722 | double[] maxValue = cl.getMaxValue(); |
---|
723 | double value; |
---|
724 | |
---|
725 | int clusterI = -1; |
---|
726 | for (int i = 0; i < m_NumAttributes; i++) { |
---|
727 | if (attributes[i]) { |
---|
728 | clusterI++; |
---|
729 | num++; |
---|
730 | // boolean or nominal attribute |
---|
731 | if (isBoolean(i) || isNominal(i)) { |
---|
732 | |
---|
733 | if (minValue[clusterI] == maxValue[clusterI]) { |
---|
734 | value = minValue[clusterI]; |
---|
735 | } |
---|
736 | else { |
---|
737 | int numValues = (int)(maxValue[clusterI] - minValue[clusterI] + 1.0); |
---|
738 | value = randomG.nextInt(numValues); |
---|
739 | value += minValue[clusterI]; |
---|
740 | } |
---|
741 | } |
---|
742 | else { |
---|
743 | // numeric attribute |
---|
744 | value = randomG.nextDouble() * |
---|
745 | (maxValue[num] - minValue[num]) + minValue[num]; |
---|
746 | if (makeInteger) |
---|
747 | value = Math.round(value); |
---|
748 | } |
---|
749 | example.setValue(i, value); |
---|
750 | } |
---|
751 | else { |
---|
752 | example.setMissing(i); |
---|
753 | } |
---|
754 | } |
---|
755 | |
---|
756 | if (getClassFlag()) |
---|
757 | example.setClassValue(cName); |
---|
758 | |
---|
759 | return example; |
---|
760 | } |
---|
761 | |
---|
762 | /** |
---|
763 | * Generate examples for a uniform cluster dataset. |
---|
764 | * |
---|
765 | * @param format the dataset format |
---|
766 | * @param numInstances the number of instances to generator |
---|
767 | * @param cl the cluster definition |
---|
768 | * @param cName the class value |
---|
769 | */ |
---|
770 | private void generateUniformExamples( |
---|
771 | Instances format, int numInstances, SubspaceClusterDefinition cl, |
---|
772 | String cName) { |
---|
773 | |
---|
774 | Instance example = null; |
---|
775 | int numAtts = m_NumAttributes; |
---|
776 | if (getClassFlag()) numAtts++; |
---|
777 | |
---|
778 | example = new DenseInstance(numAtts); |
---|
779 | example.setDataset(format); |
---|
780 | boolean[] attributes = cl.getAttributes(); |
---|
781 | double[] minValue = cl.getMinValue(); |
---|
782 | double[] maxValue = cl.getMaxValue(); |
---|
783 | double[] diff = new double[minValue.length]; |
---|
784 | |
---|
785 | for (int i = 0; i < minValue.length; i++) |
---|
786 | diff[i] = (maxValue[i] - minValue[i]); |
---|
787 | |
---|
788 | for (int j = 0; j < numInstances; j++) { |
---|
789 | int num = -1; |
---|
790 | for (int i = 0; i < m_NumAttributes; i++) { |
---|
791 | if (attributes[i]) { |
---|
792 | num++; |
---|
793 | double value = minValue[num] + (diff[num] * (double)((double)j / (double)(numInstances - 1))); |
---|
794 | example.setValue(i, value); |
---|
795 | } |
---|
796 | else { |
---|
797 | example.setMissing(i); |
---|
798 | } |
---|
799 | } |
---|
800 | if (getClassFlag()) |
---|
801 | example.setClassValue(cName); |
---|
802 | format.add(example); |
---|
803 | } |
---|
804 | } |
---|
805 | |
---|
806 | /** |
---|
807 | * Generate examples for a uniform cluster dataset. |
---|
808 | * |
---|
809 | * @param format the dataset format |
---|
810 | * @param numInstances the number of instances to generator |
---|
811 | * @param cl the cluster definition |
---|
812 | * @param cName the class value |
---|
813 | */ |
---|
814 | private void generateUniformIntegerExamples( |
---|
815 | Instances format, int numInstances, SubspaceClusterDefinition cl, |
---|
816 | String cName) { |
---|
817 | |
---|
818 | Instance example = null; |
---|
819 | int numAtts = m_NumAttributes; |
---|
820 | if (getClassFlag()) numAtts++; |
---|
821 | |
---|
822 | example = new DenseInstance(numAtts); |
---|
823 | example.setDataset(format); |
---|
824 | boolean[] attributes = cl.getAttributes(); |
---|
825 | double[] minValue = cl.getMinValue(); |
---|
826 | double[] maxValue = cl.getMaxValue(); |
---|
827 | int[] minInt = new int[minValue.length]; |
---|
828 | int[] maxInt = new int[maxValue.length]; |
---|
829 | int[] intValue = new int[maxValue.length]; |
---|
830 | int[] numInt = new int[minValue.length]; |
---|
831 | |
---|
832 | int num = 1; |
---|
833 | for (int i = 0; i < minValue.length; i++) { |
---|
834 | minInt[i] = (int)Math.ceil(minValue[i]); |
---|
835 | maxInt[i] = (int)Math.floor(maxValue[i]); |
---|
836 | numInt[i] = (maxInt[i] - minInt[i] + 1); |
---|
837 | num = num * numInt[i]; |
---|
838 | } |
---|
839 | int numEach = numInstances / num; |
---|
840 | int rest = numInstances - numEach * num; |
---|
841 | |
---|
842 | // initialize with smallest values combination |
---|
843 | for (int i = 0; i < m_NumAttributes; i++) { |
---|
844 | if (attributes[i]) { |
---|
845 | example.setValue(i, (double)minInt[i]); |
---|
846 | intValue[i] = minInt[i]; |
---|
847 | } |
---|
848 | else { |
---|
849 | example.setMissing(i); |
---|
850 | } |
---|
851 | } |
---|
852 | if (getClassFlag()) |
---|
853 | example.setClassValue(cName); |
---|
854 | int added = 0; |
---|
855 | int attr = 0; |
---|
856 | // do while not added all |
---|
857 | do { |
---|
858 | // add all for one value combination |
---|
859 | for (int k = 0; k < numEach; k++) { |
---|
860 | format.add(example); |
---|
861 | example = (Instance) example.copy(); |
---|
862 | added++; |
---|
863 | } |
---|
864 | if (rest > 0) { |
---|
865 | format.add(example); |
---|
866 | example = (Instance) example.copy(); |
---|
867 | added++; |
---|
868 | rest--; |
---|
869 | } |
---|
870 | |
---|
871 | if (added >= numInstances) break; |
---|
872 | // switch to the next value combination |
---|
873 | boolean done = false; |
---|
874 | do { |
---|
875 | if (attributes[attr] && (intValue[attr] + 1 <= maxInt[attr])) { |
---|
876 | intValue[attr]++; |
---|
877 | done = true; |
---|
878 | } |
---|
879 | else { |
---|
880 | attr++; |
---|
881 | } |
---|
882 | } while (!done); |
---|
883 | |
---|
884 | example.setValue(attr, (double)intValue[attr]); |
---|
885 | } while (added < numInstances); |
---|
886 | } |
---|
887 | |
---|
888 | /** |
---|
889 | * Generate examples for a uniform cluster dataset. |
---|
890 | * |
---|
891 | * @param format the dataset format |
---|
892 | * @param numInstances the number of instances to generate |
---|
893 | * @param random the random number generator |
---|
894 | * @param cl the cluster definition |
---|
895 | * @param cName the class value |
---|
896 | */ |
---|
897 | private void generateGaussianExamples( |
---|
898 | Instances format, int numInstances, Random random, |
---|
899 | SubspaceClusterDefinition cl, String cName) { |
---|
900 | |
---|
901 | boolean makeInteger = cl.isInteger(); |
---|
902 | Instance example = null; |
---|
903 | int numAtts = m_NumAttributes; |
---|
904 | if (getClassFlag()) numAtts++; |
---|
905 | |
---|
906 | example = new DenseInstance(numAtts); |
---|
907 | example.setDataset(format); |
---|
908 | boolean[] attributes = cl.getAttributes(); |
---|
909 | double[] meanValue = cl.getMeanValue(); |
---|
910 | double[] stddevValue = cl.getStddevValue(); |
---|
911 | |
---|
912 | for (int j = 0; j < numInstances; j++) { |
---|
913 | int num = -1; |
---|
914 | for (int i = 0; i < m_NumAttributes; i++) { |
---|
915 | if (attributes[i]) { |
---|
916 | num++; |
---|
917 | double value = meanValue[num] + (random.nextGaussian() * stddevValue[num]); |
---|
918 | if (makeInteger) |
---|
919 | value = Math.round(value); |
---|
920 | example.setValue(i, value); |
---|
921 | } |
---|
922 | else { |
---|
923 | example.setMissing(i); |
---|
924 | } |
---|
925 | } |
---|
926 | if (getClassFlag()) |
---|
927 | example.setClassValue(cName); |
---|
928 | format.add(example); |
---|
929 | } |
---|
930 | } |
---|
931 | |
---|
932 | /** |
---|
933 | * Compiles documentation about the data generation after |
---|
934 | * the generation process |
---|
935 | * |
---|
936 | * @return string with additional information about generated dataset |
---|
937 | * @throws Exception no input structure has been defined |
---|
938 | */ |
---|
939 | public String generateFinished() throws Exception { |
---|
940 | return ""; |
---|
941 | } |
---|
942 | |
---|
943 | /** |
---|
944 | * Compiles documentation about the data generation before |
---|
945 | * the generation process |
---|
946 | * |
---|
947 | * @return string with additional information |
---|
948 | */ |
---|
949 | public String generateStart() { |
---|
950 | StringBuffer docu = new StringBuffer(); |
---|
951 | |
---|
952 | int sumInst = 0; |
---|
953 | for (int cNum = 0; cNum < getClusters().length; cNum++) { |
---|
954 | SubspaceClusterDefinition cl = (SubspaceClusterDefinition) getClusters()[cNum]; |
---|
955 | docu.append("%\n"); |
---|
956 | docu.append("% Cluster: c"+ cNum + " "); |
---|
957 | switch (cl.getClusterType().getSelectedTag().getID()) { |
---|
958 | case UNIFORM_RANDOM: |
---|
959 | docu.append("Uniform Random"); |
---|
960 | break; |
---|
961 | case TOTAL_UNIFORM: |
---|
962 | docu.append("Total Random"); |
---|
963 | break; |
---|
964 | case GAUSSIAN: |
---|
965 | docu.append("Gaussian"); |
---|
966 | break; |
---|
967 | } |
---|
968 | if (cl.isInteger()) { |
---|
969 | docu.append(" / INTEGER"); |
---|
970 | } |
---|
971 | |
---|
972 | docu.append("\n% ----------------------------------------------\n"); |
---|
973 | docu.append("%"+cl.attributesToString()); |
---|
974 | |
---|
975 | docu.append("\n% Number of Instances: " + cl.getInstNums() + "\n"); |
---|
976 | docu.append( "% Generated Number of Instances: " + cl.getNumInstances() + "\n"); |
---|
977 | sumInst += cl.getNumInstances(); |
---|
978 | } |
---|
979 | docu.append("%\n% ----------------------------------------------\n"); |
---|
980 | docu.append("% Total Number of Instances: " + sumInst + "\n"); |
---|
981 | docu.append("% in " + getClusters().length + " Cluster(s)\n%"); |
---|
982 | |
---|
983 | return docu.toString(); |
---|
984 | } |
---|
985 | |
---|
986 | /** |
---|
987 | * Returns the revision string. |
---|
988 | * |
---|
989 | * @return the revision |
---|
990 | */ |
---|
991 | public String getRevision() { |
---|
992 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
993 | } |
---|
994 | |
---|
995 | /** |
---|
996 | * Main method for testing this class. |
---|
997 | * |
---|
998 | * @param args should contain arguments for the data producer: |
---|
999 | */ |
---|
1000 | public static void main(String[] args) { |
---|
1001 | runDataGenerator(new SubspaceCluster(), args); |
---|
1002 | } |
---|
1003 | } |
---|