1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * Agrawal.java |
---|
19 | * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.datagenerators.classifiers.classification; |
---|
24 | |
---|
25 | import weka.core.Attribute; |
---|
26 | import weka.core.FastVector; |
---|
27 | import weka.core.Instance; |
---|
28 | import weka.core.DenseInstance; |
---|
29 | import weka.core.Instances; |
---|
30 | import weka.core.Option; |
---|
31 | import weka.core.RevisionUtils; |
---|
32 | import weka.core.SelectedTag; |
---|
33 | import weka.core.Tag; |
---|
34 | import weka.core.TechnicalInformation; |
---|
35 | import weka.core.TechnicalInformationHandler; |
---|
36 | import weka.core.Utils; |
---|
37 | import weka.core.TechnicalInformation.Field; |
---|
38 | import weka.core.TechnicalInformation.Type; |
---|
39 | import weka.datagenerators.ClassificationGenerator; |
---|
40 | |
---|
41 | import java.util.Enumeration; |
---|
42 | import java.util.Random; |
---|
43 | import java.util.Vector; |
---|
44 | |
---|
45 | /** |
---|
46 | <!-- globalinfo-start --> |
---|
47 | * Generates a people database and is based on the paper by Agrawal et al.:<br/> |
---|
48 | * R. Agrawal, T. Imielinski, A. Swami (1993). Database Mining: A Performance Perspective. IEEE Transactions on Knowledge and Data Engineering. 5(6):914-925. URL http://www.almaden.ibm.com/software/quest/Publications/ByDate.html. |
---|
49 | * <p/> |
---|
50 | <!-- globalinfo-end --> |
---|
51 | * |
---|
52 | <!-- technical-bibtex-start --> |
---|
53 | * BibTeX: |
---|
54 | * <pre> |
---|
55 | * @article{Agrawal1993, |
---|
56 | * author = {R. Agrawal and T. Imielinski and A. Swami}, |
---|
57 | * journal = {IEEE Transactions on Knowledge and Data Engineering}, |
---|
58 | * note = {Special issue on Learning and Discovery in Knowledge-Based Databases}, |
---|
59 | * number = {6}, |
---|
60 | * pages = {914-925}, |
---|
61 | * title = {Database Mining: A Performance Perspective}, |
---|
62 | * volume = {5}, |
---|
63 | * year = {1993}, |
---|
64 | * URL = {http://www.almaden.ibm.com/software/quest/Publications/ByDate.html}, |
---|
65 | * PDF = {http://www.almaden.ibm.com/software/quest/Publications/papers/tkde93.pdf} |
---|
66 | * } |
---|
67 | * </pre> |
---|
68 | * <p/> |
---|
69 | <!-- technical-bibtex-end --> |
---|
70 | * |
---|
71 | <!-- options-start --> |
---|
72 | * Valid options are: <p/> |
---|
73 | * |
---|
74 | * <pre> -h |
---|
75 | * Prints this help.</pre> |
---|
76 | * |
---|
77 | * <pre> -o <file> |
---|
78 | * The name of the output file, otherwise the generated data is |
---|
79 | * printed to stdout.</pre> |
---|
80 | * |
---|
81 | * <pre> -r <name> |
---|
82 | * The name of the relation.</pre> |
---|
83 | * |
---|
84 | * <pre> -d |
---|
85 | * Whether to print debug informations.</pre> |
---|
86 | * |
---|
87 | * <pre> -S |
---|
88 | * The seed for random function (default 1)</pre> |
---|
89 | * |
---|
90 | * <pre> -n <num> |
---|
91 | * The number of examples to generate (default 100)</pre> |
---|
92 | * |
---|
93 | * <pre> -F <num> |
---|
94 | * The function to use for generating the data. (default 1)</pre> |
---|
95 | * |
---|
96 | * <pre> -B |
---|
97 | * Whether to balance the class.</pre> |
---|
98 | * |
---|
99 | * <pre> -P <num> |
---|
100 | * The perturbation factor. (default 0.05)</pre> |
---|
101 | * |
---|
102 | <!-- options-end --> |
---|
103 | * |
---|
104 | * @author Richard Kirkby (rkirkby at cs dot waikato dot ac dot nz) |
---|
105 | * @author FracPete (fracpete at waikato dot ac dot nz) |
---|
106 | * @version $Revision: 5987 $ |
---|
107 | */ |
---|
108 | |
---|
109 | public class Agrawal |
---|
110 | extends ClassificationGenerator |
---|
111 | implements TechnicalInformationHandler { |
---|
112 | |
---|
113 | /** for serialization */ |
---|
114 | static final long serialVersionUID = 2254651939636143025L; |
---|
115 | |
---|
116 | /** |
---|
117 | * the interface for the class functions |
---|
118 | */ |
---|
119 | protected interface ClassFunction { |
---|
120 | /** |
---|
121 | * returns a class value based on the given inputs |
---|
122 | * @param salary the salary |
---|
123 | * @param commission the commission |
---|
124 | * @param age the age |
---|
125 | * @param elevel the education level |
---|
126 | * @param car |
---|
127 | * @param zipcode the zip code |
---|
128 | * @param hvalue |
---|
129 | * @param hyears |
---|
130 | * @param loan |
---|
131 | */ |
---|
132 | public long determineClass(double salary, double commission, int age, |
---|
133 | int elevel, int car, int zipcode, double hvalue, int hyears, |
---|
134 | double loan); |
---|
135 | } |
---|
136 | |
---|
137 | /** |
---|
138 | * built in functions are based on the paper (page 924), |
---|
139 | * which turn out to be functions pred20 thru pred29 in the public c code |
---|
140 | */ |
---|
141 | protected static ClassFunction[] builtInFunctions = { |
---|
142 | // function 1 |
---|
143 | new ClassFunction() { |
---|
144 | public long determineClass(double salary, double commission, |
---|
145 | int age, int elevel, int car, int zipcode, |
---|
146 | double hvalue, int hyears, double loan) { |
---|
147 | if (age < 40 || 60 <= age) |
---|
148 | return 0; |
---|
149 | else |
---|
150 | return 1; |
---|
151 | } |
---|
152 | }, |
---|
153 | // function 2 |
---|
154 | new ClassFunction() { |
---|
155 | public long determineClass(double salary, double commission, |
---|
156 | int age, int elevel, int car, int zipcode, |
---|
157 | double hvalue, int hyears, double loan) { |
---|
158 | if (age < 40) |
---|
159 | if (50000 <= salary && salary <= 100000) |
---|
160 | return 0; |
---|
161 | else |
---|
162 | return 1; |
---|
163 | else if (age < 60) // && age >= 40 |
---|
164 | if (75000 <= salary && salary <= 125000) |
---|
165 | return 0; |
---|
166 | else |
---|
167 | return 1; |
---|
168 | else // age >= 60 |
---|
169 | if (25000 <= salary && salary <= 75000) |
---|
170 | return 0; |
---|
171 | else |
---|
172 | return 1; |
---|
173 | } |
---|
174 | }, |
---|
175 | // function 3 |
---|
176 | new ClassFunction() { |
---|
177 | public long determineClass(double salary, double commission, |
---|
178 | int age, int elevel, int car, int zipcode, |
---|
179 | double hvalue, int hyears, double loan) { |
---|
180 | if (age < 40) |
---|
181 | if (elevel == 0 || elevel == 1) |
---|
182 | return 0; |
---|
183 | else |
---|
184 | return 1; |
---|
185 | else if (age < 60) // && age >= 40 |
---|
186 | if (elevel == 1 || elevel == 2 || elevel == 3) |
---|
187 | return 0; |
---|
188 | else |
---|
189 | return 1; |
---|
190 | else // age >= 60 |
---|
191 | if (elevel == 2 || elevel == 3 || elevel == 4) |
---|
192 | return 0; |
---|
193 | else |
---|
194 | return 1; |
---|
195 | } |
---|
196 | }, |
---|
197 | // function 4 |
---|
198 | new ClassFunction() { |
---|
199 | public long determineClass(double salary, double commission, |
---|
200 | int age, int elevel, int car, int zipcode, |
---|
201 | double hvalue, int hyears, double loan) { |
---|
202 | if (age < 40) |
---|
203 | if (elevel == 0 || elevel == 1) |
---|
204 | if (25000 <= salary && salary <= 75000) |
---|
205 | return 0; |
---|
206 | else |
---|
207 | return 1; |
---|
208 | else if (50000 <= salary && salary <= 100000) |
---|
209 | return 0; |
---|
210 | else |
---|
211 | return 1; |
---|
212 | else if (age < 60) // && age >= 40 |
---|
213 | if (elevel == 1 || elevel == 2 || elevel == 3) |
---|
214 | if (50000 <= salary && salary <= 100000) |
---|
215 | return 0; |
---|
216 | else |
---|
217 | return 1; |
---|
218 | else if (75000 <= salary && salary <= 125000) |
---|
219 | return 0; |
---|
220 | else |
---|
221 | return 1; |
---|
222 | else // age >= 60 |
---|
223 | if (elevel == 2 || elevel == 3 || elevel == 4) |
---|
224 | if (50000 <= salary && salary <= 100000) |
---|
225 | return 0; |
---|
226 | else |
---|
227 | return 1; |
---|
228 | else if (25000 <= salary && salary <= 75000) |
---|
229 | return 0; |
---|
230 | else |
---|
231 | return 1; |
---|
232 | } |
---|
233 | }, |
---|
234 | // function 5 |
---|
235 | new ClassFunction() { |
---|
236 | public long determineClass(double salary, double commission, |
---|
237 | int age, int elevel, int car, int zipcode, |
---|
238 | double hvalue, int hyears, double loan) { |
---|
239 | if (age < 40) |
---|
240 | if (50000 <= salary && salary <= 100000) |
---|
241 | if (100000 <= loan && loan <= 300000) |
---|
242 | return 0; |
---|
243 | else |
---|
244 | return 1; |
---|
245 | else if (200000 <= loan && loan <= 400000) |
---|
246 | return 0; |
---|
247 | else |
---|
248 | return 1; |
---|
249 | else if (age < 60) // && age >= 40 |
---|
250 | if (75000 <= salary && salary <= 125000) |
---|
251 | if (200000 <= loan && loan <= 400000) |
---|
252 | return 0; |
---|
253 | else |
---|
254 | return 1; |
---|
255 | else if (300000 <= loan && loan <= 500000) |
---|
256 | return 0; |
---|
257 | else |
---|
258 | return 1; |
---|
259 | else // age >= 60 |
---|
260 | if (25000 <= salary && salary <= 75000) |
---|
261 | if (300000 <= loan && loan <= 500000) |
---|
262 | return 0; |
---|
263 | else |
---|
264 | return 1; |
---|
265 | else if (100000 <= loan && loan <= 300000) |
---|
266 | return 0; |
---|
267 | else |
---|
268 | return 1; |
---|
269 | } |
---|
270 | }, |
---|
271 | // function 6 |
---|
272 | new ClassFunction() { |
---|
273 | public long determineClass(double salary, double commission, |
---|
274 | int age, int elevel, int car, int zipcode, |
---|
275 | double hvalue, int hyears, double loan) { |
---|
276 | double totalSalary = salary + commission; |
---|
277 | if (age < 40) |
---|
278 | if (50000 <= totalSalary && totalSalary <= 100000) |
---|
279 | return 0; |
---|
280 | else |
---|
281 | return 1; |
---|
282 | else if (age < 60) // && age >= 40 |
---|
283 | if (75000 <= totalSalary && totalSalary <= 125000) |
---|
284 | return 0; |
---|
285 | else |
---|
286 | return 1; |
---|
287 | else // age >= 60 |
---|
288 | if (25000 <= totalSalary && totalSalary <= 75000) |
---|
289 | return 0; |
---|
290 | else |
---|
291 | return 1; |
---|
292 | } |
---|
293 | }, |
---|
294 | // function 7 |
---|
295 | new ClassFunction() { |
---|
296 | public long determineClass(double salary, double commission, |
---|
297 | int age, int elevel, int car, int zipcode, |
---|
298 | double hvalue, int hyears, double loan) { |
---|
299 | double disposable = (2.0 * (salary + commission) / 3.0 |
---|
300 | - loan / 5.0 - 20000.0); |
---|
301 | return disposable > 0 ? 0 : 1; |
---|
302 | } |
---|
303 | }, |
---|
304 | // function 8 |
---|
305 | new ClassFunction() { |
---|
306 | public long determineClass(double salary, double commission, |
---|
307 | int age, int elevel, int car, int zipcode, |
---|
308 | double hvalue, int hyears, double loan) { |
---|
309 | double disposable = (2.0 * (salary + commission) / 3.0 |
---|
310 | - 5000.0 * (double) elevel - 20000.0); |
---|
311 | return disposable > 0 ? 0 : 1; |
---|
312 | } |
---|
313 | }, |
---|
314 | // function 9 |
---|
315 | new ClassFunction() { |
---|
316 | public long determineClass(double salary, double commission, |
---|
317 | int age, int elevel, int car, int zipcode, |
---|
318 | double hvalue, int hyears, double loan) { |
---|
319 | double disposable = (2.0 * (salary + commission) / 3.0 |
---|
320 | - 5000.0 * (double) elevel - loan / 5.0 - 10000.0); |
---|
321 | return disposable > 0 ? 0 : 1; |
---|
322 | } |
---|
323 | }, |
---|
324 | // function 10 |
---|
325 | new ClassFunction() { |
---|
326 | public long determineClass(double salary, double commission, |
---|
327 | int age, int elevel, int car, int zipcode, |
---|
328 | double hvalue, int hyears, double loan) { |
---|
329 | double equity = 0.0; |
---|
330 | if (hyears >= 20) |
---|
331 | equity = hvalue * ((double) hyears - 20.0) / 10.0; |
---|
332 | double disposable = (2.0 * (salary + commission) / 3.0 |
---|
333 | - 5000.0 * (double) elevel + equity / 5.0 - 10000.0); |
---|
334 | return disposable > 0 ? 0 : 1; |
---|
335 | } |
---|
336 | } |
---|
337 | }; |
---|
338 | |
---|
339 | /** function 1 */ |
---|
340 | public final static int FUNCTION_1 = 1; |
---|
341 | /** function 2 */ |
---|
342 | public final static int FUNCTION_2 = 2; |
---|
343 | /** function 3 */ |
---|
344 | public final static int FUNCTION_3 = 3; |
---|
345 | /** function 4 */ |
---|
346 | public final static int FUNCTION_4 = 4; |
---|
347 | /** function 5 */ |
---|
348 | public final static int FUNCTION_5 = 5; |
---|
349 | /** function 6 */ |
---|
350 | public final static int FUNCTION_6 = 6; |
---|
351 | /** function 7 */ |
---|
352 | public final static int FUNCTION_7 = 7; |
---|
353 | /** function 8 */ |
---|
354 | public final static int FUNCTION_8 = 8; |
---|
355 | /** function 9 */ |
---|
356 | public final static int FUNCTION_9 = 9; |
---|
357 | /** function 10 */ |
---|
358 | public final static int FUNCTION_10 = 10; |
---|
359 | /** the funtion tags */ |
---|
360 | public static final Tag[] FUNCTION_TAGS = { |
---|
361 | new Tag(FUNCTION_1, "Function 1"), |
---|
362 | new Tag(FUNCTION_2, "Function 2"), |
---|
363 | new Tag(FUNCTION_3, "Function 3"), |
---|
364 | new Tag(FUNCTION_4, "Function 4"), |
---|
365 | new Tag(FUNCTION_5, "Function 5"), |
---|
366 | new Tag(FUNCTION_6, "Function 6"), |
---|
367 | new Tag(FUNCTION_7, "Function 7"), |
---|
368 | new Tag(FUNCTION_8, "Function 8"), |
---|
369 | new Tag(FUNCTION_9, "Function 9"), |
---|
370 | new Tag(FUNCTION_10, "Function 10"), |
---|
371 | }; |
---|
372 | |
---|
373 | /** the function to use for generating the data */ |
---|
374 | protected int m_Function; |
---|
375 | |
---|
376 | /** whether to balance the class */ |
---|
377 | protected boolean m_BalanceClass; |
---|
378 | |
---|
379 | /** the perturabation fraction */ |
---|
380 | protected double m_PerturbationFraction; |
---|
381 | |
---|
382 | /** used for balancing the class */ |
---|
383 | protected boolean m_nextClassShouldBeZero; |
---|
384 | |
---|
385 | /** the last class label that was generated */ |
---|
386 | protected double m_lastLabel; |
---|
387 | |
---|
/**
 * Creates the generator and applies the default function, the default
 * class-balancing setting and the default perturbation fraction.
 */
public Agrawal() {
  super();

  setFunction(defaultFunction());
  setBalanceClass(defaultBalanceClass());
  setPerturbationFraction(defaultPerturbationFraction());
}
---|
398 | |
---|
399 | /** |
---|
400 | * Returns a string describing this data generator. |
---|
401 | * |
---|
402 | * @return a description of the data generator suitable for |
---|
403 | * displaying in the explorer/experimenter gui |
---|
404 | */ |
---|
405 | public String globalInfo() { |
---|
406 | return |
---|
407 | "Generates a people database and is based on the paper by Agrawal " |
---|
408 | + "et al.:\n" |
---|
409 | + getTechnicalInformation().toString(); |
---|
410 | } |
---|
411 | |
---|
412 | /** |
---|
413 | * Returns an instance of a TechnicalInformation object, containing |
---|
414 | * detailed information about the technical background of this class, |
---|
415 | * e.g., paper reference or book this class is based on. |
---|
416 | * |
---|
417 | * @return the technical information about this class |
---|
418 | */ |
---|
419 | public TechnicalInformation getTechnicalInformation() { |
---|
420 | TechnicalInformation result; |
---|
421 | |
---|
422 | result = new TechnicalInformation(Type.ARTICLE); |
---|
423 | result.setValue(Field.AUTHOR, "R. Agrawal and T. Imielinski and A. Swami"); |
---|
424 | result.setValue(Field.YEAR, "1993"); |
---|
425 | result.setValue(Field.TITLE, "Database Mining: A Performance Perspective"); |
---|
426 | result.setValue(Field.JOURNAL, "IEEE Transactions on Knowledge and Data Engineering"); |
---|
427 | result.setValue(Field.VOLUME, "5"); |
---|
428 | result.setValue(Field.NUMBER, "6"); |
---|
429 | result.setValue(Field.PAGES, "914-925"); |
---|
430 | result.setValue(Field.NOTE, "Special issue on Learning and Discovery in Knowledge-Based Databases"); |
---|
431 | result.setValue(Field.URL, "http://www.almaden.ibm.com/software/quest/Publications/ByDate.html"); |
---|
432 | result.setValue(Field.PDF, "http://www.almaden.ibm.com/software/quest/Publications/papers/tkde93.pdf"); |
---|
433 | |
---|
434 | return result; |
---|
435 | } |
---|
436 | |
---|
437 | /** |
---|
438 | * Returns an enumeration describing the available options. |
---|
439 | * |
---|
440 | * @return an enumeration of all the available options |
---|
441 | */ |
---|
442 | public Enumeration listOptions() { |
---|
443 | Vector result = enumToVector(super.listOptions()); |
---|
444 | |
---|
445 | result.add(new Option( |
---|
446 | "\tThe function to use for generating the data. (default " |
---|
447 | + defaultFunction().getSelectedTag().getID() + ")", |
---|
448 | "F", 1, "-F <num>")); |
---|
449 | |
---|
450 | result.add(new Option( |
---|
451 | "\tWhether to balance the class.", |
---|
452 | "B", 0, "-B")); |
---|
453 | |
---|
454 | result.add(new Option( |
---|
455 | "\tThe perturbation factor. (default " |
---|
456 | + defaultPerturbationFraction() + ")", |
---|
457 | "P", 1, "-P <num>")); |
---|
458 | |
---|
459 | return result.elements(); |
---|
460 | } |
---|
461 | |
---|
462 | /** |
---|
463 | * Parses a list of options for this object. <p/> |
---|
464 | * |
---|
465 | <!-- options-start --> |
---|
466 | * Valid options are: <p/> |
---|
467 | * |
---|
468 | * <pre> -h |
---|
469 | * Prints this help.</pre> |
---|
470 | * |
---|
471 | * <pre> -o <file> |
---|
472 | * The name of the output file, otherwise the generated data is |
---|
473 | * printed to stdout.</pre> |
---|
474 | * |
---|
475 | * <pre> -r <name> |
---|
476 | * The name of the relation.</pre> |
---|
477 | * |
---|
478 | * <pre> -d |
---|
479 | * Whether to print debug informations.</pre> |
---|
480 | * |
---|
481 | * <pre> -S |
---|
482 | * The seed for random function (default 1)</pre> |
---|
483 | * |
---|
484 | * <pre> -n <num> |
---|
485 | * The number of examples to generate (default 100)</pre> |
---|
486 | * |
---|
487 | * <pre> -F <num> |
---|
488 | * The function to use for generating the data. (default 1)</pre> |
---|
489 | * |
---|
490 | * <pre> -B |
---|
491 | * Whether to balance the class.</pre> |
---|
492 | * |
---|
493 | * <pre> -P <num> |
---|
494 | * The perturbation factor. (default 0.05)</pre> |
---|
495 | * |
---|
496 | <!-- options-end --> |
---|
497 | * |
---|
498 | * @param options the list of options as an array of strings |
---|
499 | * @throws Exception if an option is not supported |
---|
500 | */ |
---|
501 | public void setOptions(String[] options) throws Exception { |
---|
502 | String tmpStr; |
---|
503 | |
---|
504 | super.setOptions(options); |
---|
505 | |
---|
506 | tmpStr = Utils.getOption('F', options); |
---|
507 | if (tmpStr.length() != 0) |
---|
508 | setFunction(new SelectedTag(Integer.parseInt(tmpStr), FUNCTION_TAGS)); |
---|
509 | else |
---|
510 | setFunction(defaultFunction()); |
---|
511 | |
---|
512 | setBalanceClass(Utils.getFlag('B', options)); |
---|
513 | |
---|
514 | tmpStr = Utils.getOption('P', options); |
---|
515 | if (tmpStr.length() != 0) |
---|
516 | setPerturbationFraction(Double.parseDouble(tmpStr)); |
---|
517 | else |
---|
518 | setPerturbationFraction(defaultPerturbationFraction()); |
---|
519 | } |
---|
520 | |
---|
521 | /** |
---|
522 | * Gets the current settings of the datagenerator. |
---|
523 | * |
---|
524 | * @return an array of strings suitable for passing to setOptions |
---|
525 | */ |
---|
526 | public String[] getOptions() { |
---|
527 | Vector result; |
---|
528 | String[] options; |
---|
529 | int i; |
---|
530 | |
---|
531 | result = new Vector(); |
---|
532 | options = super.getOptions(); |
---|
533 | for (i = 0; i < options.length; i++) |
---|
534 | result.add(options[i]); |
---|
535 | |
---|
536 | result.add("-F"); |
---|
537 | result.add("" + m_Function); |
---|
538 | |
---|
539 | if (getBalanceClass()) |
---|
540 | result.add("-B"); |
---|
541 | |
---|
542 | result.add("-P"); |
---|
543 | result.add("" + getPerturbationFraction()); |
---|
544 | |
---|
545 | return (String[]) result.toArray(new String[result.size()]); |
---|
546 | } |
---|
547 | |
---|
548 | /** |
---|
549 | * returns the default function |
---|
550 | * |
---|
551 | * @return the default function |
---|
552 | */ |
---|
553 | protected SelectedTag defaultFunction() { |
---|
554 | return new SelectedTag(FUNCTION_1, FUNCTION_TAGS); |
---|
555 | } |
---|
556 | |
---|
557 | /** |
---|
558 | * Gets the function for generating the data. |
---|
559 | * |
---|
560 | * @return the function. |
---|
561 | * @see #FUNCTION_TAGS |
---|
562 | */ |
---|
563 | public SelectedTag getFunction() { |
---|
564 | return new SelectedTag(m_Function, FUNCTION_TAGS); |
---|
565 | } |
---|
566 | |
---|
567 | /** |
---|
568 | * Sets the function for generating the data. |
---|
569 | * |
---|
570 | * @param value the function. |
---|
571 | * @see #FUNCTION_TAGS |
---|
572 | */ |
---|
573 | public void setFunction(SelectedTag value) { |
---|
574 | if (value.getTags() == FUNCTION_TAGS) |
---|
575 | m_Function = value.getSelectedTag().getID(); |
---|
576 | } |
---|
577 | |
---|
578 | /** |
---|
579 | * Returns the tip text for this property |
---|
580 | * @return tip text for this property suitable for |
---|
581 | * displaying in the explorer/experimenter gui |
---|
582 | */ |
---|
583 | public String functionTipText() { |
---|
584 | return "The function to use for generating the data."; |
---|
585 | } |
---|
586 | |
---|
587 | /** |
---|
588 | * returns the default for balancing the class |
---|
589 | * |
---|
590 | * @return the default for balancing the class |
---|
591 | */ |
---|
592 | protected boolean defaultBalanceClass() { |
---|
593 | return false; |
---|
594 | } |
---|
595 | |
---|
596 | /** |
---|
597 | * Gets whether the class is balanced. |
---|
598 | * |
---|
599 | * @return whether the class is balanced. |
---|
600 | */ |
---|
601 | public boolean getBalanceClass() { |
---|
602 | return m_BalanceClass; |
---|
603 | } |
---|
604 | |
---|
605 | /** |
---|
606 | * Sets whether the class is balanced. |
---|
607 | * |
---|
608 | * @param value whether to balance the class. |
---|
609 | */ |
---|
610 | public void setBalanceClass(boolean value) { |
---|
611 | m_BalanceClass = value; |
---|
612 | } |
---|
613 | |
---|
614 | /** |
---|
615 | * Returns the tip text for this property |
---|
616 | * |
---|
617 | * @return tip text for this property suitable for |
---|
618 | * displaying in the explorer/experimenter gui |
---|
619 | */ |
---|
620 | public String balanceClassTipText() { |
---|
621 | return "Whether to balance the class."; |
---|
622 | } |
---|
623 | |
---|
624 | /** |
---|
625 | * returns the default perturbation fraction |
---|
626 | * |
---|
627 | * @return the default perturbation fraction |
---|
628 | */ |
---|
629 | protected double defaultPerturbationFraction() { |
---|
630 | return 0.05; |
---|
631 | } |
---|
632 | |
---|
633 | /** |
---|
634 | * Gets the perturbation fraction. |
---|
635 | * |
---|
636 | * @return the perturbation fraction. |
---|
637 | */ |
---|
638 | public double getPerturbationFraction() { |
---|
639 | return m_PerturbationFraction; |
---|
640 | } |
---|
641 | |
---|
642 | /** |
---|
643 | * Sets the perturbation fraction. |
---|
644 | * |
---|
645 | * @param value the perturbation fraction. |
---|
646 | */ |
---|
647 | public void setPerturbationFraction(double value) { |
---|
648 | if ( (value >= 0.0) && (value <= 1.0) ) |
---|
649 | m_PerturbationFraction = value; |
---|
650 | else |
---|
651 | throw new IllegalArgumentException( |
---|
652 | "Perturbation fraction must be in [0,1] (provided: " + value + ")!"); |
---|
653 | } |
---|
654 | |
---|
655 | /** |
---|
656 | * Returns the tip text for this property |
---|
657 | * |
---|
658 | * @return tip text for this property suitable for |
---|
659 | * displaying in the explorer/experimenter gui |
---|
660 | */ |
---|
661 | public String perturbationFractionTipText() { |
---|
662 | return "The perturbation fraction: 0 <= fraction <= 1."; |
---|
663 | } |
---|
664 | |
---|
665 | /** |
---|
666 | * Return if single mode is set for the given data generator |
---|
667 | * mode depends on option setting and or generator type. |
---|
668 | * |
---|
669 | * @return single mode flag |
---|
670 | * @throws Exception if mode is not set yet |
---|
671 | */ |
---|
672 | public boolean getSingleModeFlag() throws Exception { |
---|
673 | return true; |
---|
674 | } |
---|
675 | |
---|
676 | /** |
---|
677 | * Initializes the format for the dataset produced. |
---|
678 | * Must be called before the generateExample or generateExamples |
---|
679 | * methods are used. |
---|
680 | * Re-initializes the random number generator with the given seed. |
---|
681 | * |
---|
682 | * @return the format for the dataset |
---|
683 | * @throws Exception if the generating of the format failed |
---|
684 | * @see #getSeed() |
---|
685 | */ |
---|
686 | public Instances defineDataFormat() throws Exception { |
---|
687 | FastVector atts; |
---|
688 | FastVector attValues; |
---|
689 | int i; |
---|
690 | |
---|
691 | m_Random = new Random(getSeed()); |
---|
692 | m_nextClassShouldBeZero = true; |
---|
693 | m_lastLabel = Double.NaN; |
---|
694 | |
---|
695 | // number of examples is the same as given per option |
---|
696 | setNumExamplesAct(getNumExamples()); |
---|
697 | |
---|
698 | // set up attributes |
---|
699 | atts = new FastVector(); |
---|
700 | |
---|
701 | atts.addElement(new Attribute("salary")); |
---|
702 | |
---|
703 | atts.addElement(new Attribute("commission")); |
---|
704 | |
---|
705 | attValues = new FastVector(); |
---|
706 | atts.addElement(new Attribute("age")); |
---|
707 | |
---|
708 | attValues = new FastVector(); |
---|
709 | for (i = 0; i < 5; i++) |
---|
710 | attValues.addElement("" + i); |
---|
711 | atts.addElement(new Attribute("elevel", attValues)); |
---|
712 | |
---|
713 | attValues = new FastVector(); |
---|
714 | for (i = 1; i <= 20; i++) |
---|
715 | attValues.addElement("" + i); |
---|
716 | atts.addElement(new Attribute("car", attValues)); |
---|
717 | |
---|
718 | attValues = new FastVector(); |
---|
719 | for (i = 0; i < 9; i++) |
---|
720 | attValues.addElement("" + i); |
---|
721 | atts.addElement(new Attribute("zipcode", attValues)); |
---|
722 | |
---|
723 | atts.addElement(new Attribute("hvalue")); |
---|
724 | |
---|
725 | atts.addElement(new Attribute("hyears")); |
---|
726 | |
---|
727 | atts.addElement(new Attribute("loan")); |
---|
728 | |
---|
729 | attValues = new FastVector(); |
---|
730 | for (i = 0; i < 2; i++) |
---|
731 | attValues.addElement("" + i); |
---|
732 | atts.addElement(new Attribute("group", attValues)); |
---|
733 | |
---|
734 | // dataset |
---|
735 | m_DatasetFormat = new Instances(getRelationNameToUse(), atts, 0); |
---|
736 | |
---|
737 | return m_DatasetFormat; |
---|
738 | } |
---|
739 | |
---|
740 | /** |
---|
741 | * perturbs the given value |
---|
742 | * |
---|
743 | * @param val the value to perturb |
---|
744 | * @param min the minimum |
---|
745 | * @param max the maximum |
---|
746 | * @return the perturbed value |
---|
747 | */ |
---|
748 | protected double perturbValue(double val, double min, double max) { |
---|
749 | return perturbValue(val, max - min, min, max); |
---|
750 | } |
---|
751 | |
---|
752 | /** |
---|
753 | * perturbs the given value |
---|
754 | * |
---|
755 | * @param val the value to perturb |
---|
756 | * @param range the range for the perturbation |
---|
757 | * @param min the minimum |
---|
758 | * @param max the maximum |
---|
759 | * @return the perturbed value |
---|
760 | */ |
---|
761 | protected double perturbValue(double val, double range, |
---|
762 | double min, double max) { |
---|
763 | |
---|
764 | val += range * (2.0 * (getRandom().nextDouble() - 0.5)) |
---|
765 | * getPerturbationFraction(); |
---|
766 | |
---|
767 | if (val < min) |
---|
768 | val = min; |
---|
769 | else if (val > max) |
---|
770 | val = max; |
---|
771 | |
---|
772 | return val; |
---|
773 | } |
---|
774 | |
---|
775 | /** |
---|
776 | * Generates one example of the dataset. |
---|
777 | * |
---|
778 | * @return the generated example |
---|
779 | * @throws Exception if the format of the dataset is not yet defined |
---|
780 | * @throws Exception if the generator only works with generateExamples |
---|
781 | * which means in non single mode |
---|
782 | */ |
---|
783 | public Instance generateExample() throws Exception { |
---|
784 | Instance result; |
---|
785 | double salary; |
---|
786 | double commission; |
---|
787 | double hvalue; |
---|
788 | double loan; |
---|
789 | int age; |
---|
790 | int elevel; |
---|
791 | int car; |
---|
792 | int zipcode; |
---|
793 | int hyears; |
---|
794 | boolean desiredClassFound; |
---|
795 | double[] atts; |
---|
796 | Random random; |
---|
797 | ClassFunction classFunction; |
---|
798 | |
---|
799 | result = null; |
---|
800 | random = getRandom(); |
---|
801 | |
---|
802 | if (m_DatasetFormat == null) |
---|
803 | throw new Exception("Dataset format not defined."); |
---|
804 | |
---|
805 | salary = 0; |
---|
806 | commission = 0; |
---|
807 | hvalue = 0; |
---|
808 | loan = 0; |
---|
809 | age = 0; |
---|
810 | elevel = 0; |
---|
811 | car = 0; |
---|
812 | zipcode = 0; |
---|
813 | hyears = 0; |
---|
814 | desiredClassFound = false; |
---|
815 | classFunction = builtInFunctions[m_Function - 1]; |
---|
816 | |
---|
817 | while (!desiredClassFound) { |
---|
818 | // generate attributes |
---|
819 | salary = 20000.0 + 130000.0 * random.nextDouble(); |
---|
820 | commission = (salary >= 75000.0) ? |
---|
821 | 0 : (10000.0 + 65000.0 * random.nextDouble()); |
---|
822 | age = 20 + random.nextInt(61); |
---|
823 | elevel = random.nextInt(5); |
---|
824 | car = 1 + random.nextInt(20); |
---|
825 | zipcode = random.nextInt(9); |
---|
826 | hvalue = (9.0 - (double) zipcode) * 100000.0 |
---|
827 | * (0.5 + random.nextDouble()); |
---|
828 | hyears = 1 + random.nextInt(30); |
---|
829 | loan = random.nextDouble() * 500000.0; |
---|
830 | |
---|
831 | // determine class |
---|
832 | m_lastLabel = classFunction.determineClass(salary, commission, age, |
---|
833 | elevel, car, zipcode, hvalue, hyears, loan); |
---|
834 | if (!getBalanceClass()) { |
---|
835 | desiredClassFound = true; |
---|
836 | } |
---|
837 | else { |
---|
838 | // balance the classes |
---|
839 | if ( ( m_nextClassShouldBeZero && (m_lastLabel == 0)) |
---|
840 | || (!m_nextClassShouldBeZero && (m_lastLabel == 1)) ) { |
---|
841 | desiredClassFound = true; |
---|
842 | m_nextClassShouldBeZero = !m_nextClassShouldBeZero; |
---|
843 | } // else keep searching |
---|
844 | } |
---|
845 | } |
---|
846 | |
---|
847 | // perturb values |
---|
848 | if (getPerturbationFraction() > 0.0) { |
---|
849 | salary = perturbValue(salary, 20000, 150000); |
---|
850 | if (commission > 0) |
---|
851 | commission = perturbValue(commission, 10000, 75000); |
---|
852 | age = (int) Math.round(perturbValue(age, 20, 80)); |
---|
853 | hvalue = perturbValue( |
---|
854 | hvalue, (9.0 - (double) zipcode) * 100000.0, 0, 135000); |
---|
855 | hyears = (int) Math.round(perturbValue(hyears, 1, 30)); |
---|
856 | loan = perturbValue(loan, 0, 500000); |
---|
857 | } |
---|
858 | |
---|
859 | // create instance |
---|
860 | atts = new double[m_DatasetFormat.numAttributes()]; |
---|
861 | atts[0] = salary; |
---|
862 | atts[1] = commission; |
---|
863 | atts[2] = age; |
---|
864 | atts[3] = elevel; |
---|
865 | atts[4] = car - 1; |
---|
866 | atts[5] = zipcode; |
---|
867 | atts[6] = hvalue; |
---|
868 | atts[7] = hyears; |
---|
869 | atts[8] = loan; |
---|
870 | atts[9] = m_lastLabel; |
---|
871 | result = new DenseInstance(1.0, atts); |
---|
872 | result.setDataset(m_DatasetFormat); |
---|
873 | |
---|
874 | return result; |
---|
875 | } |
---|
876 | |
---|
877 | /** |
---|
878 | * Generates all examples of the dataset. Re-initializes the random number |
---|
879 | * generator with the given seed, before generating instances. |
---|
880 | * |
---|
881 | * @return the generated dataset |
---|
882 | * @throws Exception if the format of the dataset is not yet defined |
---|
883 | * @throws Exception if the generator only works with generateExample, |
---|
884 | * which means in single mode |
---|
885 | * @see #getSeed() |
---|
886 | */ |
---|
887 | public Instances generateExamples() throws Exception { |
---|
888 | Instances result; |
---|
889 | int i; |
---|
890 | |
---|
891 | result = new Instances(m_DatasetFormat, 0); |
---|
892 | m_Random = new Random(getSeed()); |
---|
893 | |
---|
894 | for (i = 0; i < getNumExamplesAct(); i++) |
---|
895 | result.add(generateExample()); |
---|
896 | |
---|
897 | return result; |
---|
898 | } |
---|
899 | |
---|
900 | /** |
---|
901 | * Generates a comment string that documentates the data generator. |
---|
902 | * By default this string is added at the beginning of the produced output |
---|
903 | * as ARFF file type, next after the options. |
---|
904 | * |
---|
905 | * @return string contains info about the generated rules |
---|
906 | */ |
---|
907 | public String generateStart () { |
---|
908 | return ""; |
---|
909 | } |
---|
910 | |
---|
911 | /** |
---|
912 | * Generates a comment string that documentats the data generator. |
---|
913 | * By default this string is added at the end of theproduces output |
---|
914 | * as ARFF file type. |
---|
915 | * |
---|
916 | * @return string contains info about the generated rules |
---|
917 | * @throws Exception if the generating of the documentaion fails |
---|
918 | */ |
---|
919 | public String generateFinished() throws Exception { |
---|
920 | return ""; |
---|
921 | } |
---|
922 | |
---|
923 | /** |
---|
924 | * Returns the revision string. |
---|
925 | * |
---|
926 | * @return the revision |
---|
927 | */ |
---|
928 | public String getRevision() { |
---|
929 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
930 | } |
---|
931 | |
---|
932 | /** |
---|
933 | * Main method for executing this class. |
---|
934 | * |
---|
935 | * @param args should contain arguments for the data producer: |
---|
936 | */ |
---|
937 | public static void main(String[] args) { |
---|
938 | runDataGenerator(new Agrawal(), args); |
---|
939 | } |
---|
940 | } |
---|