1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * CheckScheme.java |
---|
19 | * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.core; |
---|
24 | |
---|
25 | import java.util.Enumeration; |
---|
26 | import java.util.Random; |
---|
27 | import java.util.StringTokenizer; |
---|
28 | import java.util.Vector; |
---|
29 | |
---|
30 | /** |
---|
31 | * Abstract general class for testing schemes in Weka. Derived classes are |
---|
32 | * also used for JUnit tests. |
---|
33 | * |
---|
34 | * @author FracPete (fracpete at waikato dot ac dot nz) |
---|
35 | * @version $Revision: 5953 $ |
---|
36 | * @see TestInstances |
---|
37 | */ |
---|
38 | public abstract class CheckScheme |
---|
39 | extends Check { |
---|
40 | |
---|
41 | /** a class for postprocessing the test-data */ |
---|
42 | public static class PostProcessor |
---|
43 | implements RevisionHandler { |
---|
44 | |
---|
45 | /** |
---|
46 | * Provides a hook for derived classes to further modify the data. Currently, |
---|
47 | * the data is just passed through. |
---|
48 | * |
---|
49 | * @param data the data to process |
---|
50 | * @return the processed data |
---|
51 | */ |
---|
52 | public Instances process(Instances data) { |
---|
53 | return data; |
---|
54 | } |
---|
55 | |
---|
56 | /** |
---|
57 | * Returns the revision string. |
---|
58 | * |
---|
59 | * @return the revision |
---|
60 | */ |
---|
61 | public String getRevision() { |
---|
62 | return RevisionUtils.extract("$Revision: 5953 $"); |
---|
63 | } |
---|
64 | } |
---|
65 | |
---|
66 | /** The number of instances in the generated datasets (default 20) */ |
---|
67 | protected int m_NumInstances = 20; |
---|
68 | |
---|
69 | /** the number of nominal attributes (default 2) */ |
---|
70 | protected int m_NumNominal = 2; |
---|
71 | |
---|
72 | /** the number of numeric attributes (default 1) */ |
---|
73 | protected int m_NumNumeric = 1; |
---|
74 | |
---|
75 | /** the number of string attributes (default 1) */ |
---|
76 | protected int m_NumString = 1; |
---|
77 | |
---|
78 | /** the number of date attributes (default 1) */ |
---|
79 | protected int m_NumDate = 1; |
---|
80 | |
---|
81 | /** the number of relational attributes (default 1) */ |
---|
82 | protected int m_NumRelational = 1; |
---|
83 | |
---|
84 | /** the number of instances in relational attributes (applies also for bags |
---|
85 | * in multi-instance); default 10 */ |
---|
86 | protected int m_NumInstancesRelational = 10; |
---|
87 | |
---|
88 | /** the words used for generating String attributes/classes */ |
---|
89 | protected String[] m_Words = TestInstances.DEFAULT_WORDS; |
---|
90 | |
---|
91 | /** the word separator characters used for generating String attributes/classes */ |
---|
92 | protected String m_WordSeparators = TestInstances.DEFAULT_SEPARATORS; |
---|
93 | |
---|
94 | /** for post-processing the data even further; null means no post-processing */ |
---|
95 | protected PostProcessor m_PostProcessor = null; |
---|
96 | |
---|
97 | /** whether classpath problems occurred */ |
---|
98 | protected boolean m_ClasspathProblems = false; |
---|
99 | |
---|
100 | /** |
---|
101 | * Returns an enumeration describing the available options. |
---|
102 | * |
---|
103 | * @return an enumeration of all the available options. |
---|
104 | */ |
---|
105 | public Enumeration listOptions() { |
---|
106 | Vector<Option> result = new Vector<Option>(); |
---|
107 | |
---|
108 | Enumeration en = super.listOptions(); |
---|
109 | while (en.hasMoreElements()) |
---|
110 | result.addElement((Option)en.nextElement()); |
---|
111 | |
---|
112 | result.addElement(new Option( |
---|
113 | "\tThe number of instances in the datasets (default 20).", |
---|
114 | "N", 1, "-N <num>")); |
---|
115 | |
---|
116 | result.addElement(new Option( |
---|
117 | "\tThe number of nominal attributes (default 2).", |
---|
118 | "nominal", 1, "-nominal <num>")); |
---|
119 | |
---|
120 | result.addElement(new Option( |
---|
121 | "\tThe number of values for nominal attributes (default 1).", |
---|
122 | "nominal-values", 1, "-nominal-values <num>")); |
---|
123 | |
---|
124 | result.addElement(new Option( |
---|
125 | "\tThe number of numeric attributes (default 1).", |
---|
126 | "numeric", 1, "-numeric <num>")); |
---|
127 | |
---|
128 | result.addElement(new Option( |
---|
129 | "\tThe number of string attributes (default 1).", |
---|
130 | "string", 1, "-string <num>")); |
---|
131 | |
---|
132 | result.addElement(new Option( |
---|
133 | "\tThe number of date attributes (default 1).", |
---|
134 | "date", 1, "-date <num>")); |
---|
135 | |
---|
136 | result.addElement(new Option( |
---|
137 | "\tThe number of relational attributes (default 1).", |
---|
138 | "relational", 1, "-relational <num>")); |
---|
139 | |
---|
140 | result.addElement(new Option( |
---|
141 | "\tThe number of instances in relational/bag attributes (default 10).", |
---|
142 | "num-instances-relational", 1, "-num-instances-relational <num>")); |
---|
143 | |
---|
144 | result.addElement(new Option( |
---|
145 | "\tThe words to use in string attributes.", |
---|
146 | "words", 1, "-words <comma-separated-list>")); |
---|
147 | |
---|
148 | result.addElement(new Option( |
---|
149 | "\tThe word separators to use in string attributes.", |
---|
150 | "word-separators", 1, "-word-separators <chars>")); |
---|
151 | |
---|
152 | return result.elements(); |
---|
153 | } |
---|
154 | |
---|
155 | /** |
---|
156 | * Parses a given list of options. |
---|
157 | * |
---|
158 | * @param options the list of options as an array of strings |
---|
159 | * @throws Exception if an option is not supported |
---|
160 | */ |
---|
161 | public void setOptions(String[] options) throws Exception { |
---|
162 | String tmpStr; |
---|
163 | |
---|
164 | super.setOptions(options); |
---|
165 | |
---|
166 | tmpStr = Utils.getOption('N', options); |
---|
167 | if (tmpStr.length() != 0) |
---|
168 | setNumInstances(Integer.parseInt(tmpStr)); |
---|
169 | else |
---|
170 | setNumInstances(20); |
---|
171 | |
---|
172 | tmpStr = Utils.getOption("nominal", options); |
---|
173 | if (tmpStr.length() != 0) |
---|
174 | setNumNominal(Integer.parseInt(tmpStr)); |
---|
175 | else |
---|
176 | setNumNominal(2); |
---|
177 | |
---|
178 | tmpStr = Utils.getOption("numeric", options); |
---|
179 | if (tmpStr.length() != 0) |
---|
180 | setNumNumeric(Integer.parseInt(tmpStr)); |
---|
181 | else |
---|
182 | setNumNumeric(1); |
---|
183 | |
---|
184 | tmpStr = Utils.getOption("string", options); |
---|
185 | if (tmpStr.length() != 0) |
---|
186 | setNumString(Integer.parseInt(tmpStr)); |
---|
187 | else |
---|
188 | setNumString(1); |
---|
189 | |
---|
190 | tmpStr = Utils.getOption("date", options); |
---|
191 | if (tmpStr.length() != 0) |
---|
192 | setNumDate(Integer.parseInt(tmpStr)); |
---|
193 | else |
---|
194 | setNumDate(1); |
---|
195 | |
---|
196 | tmpStr = Utils.getOption("relational", options); |
---|
197 | if (tmpStr.length() != 0) |
---|
198 | setNumRelational(Integer.parseInt(tmpStr)); |
---|
199 | else |
---|
200 | setNumRelational(1); |
---|
201 | |
---|
202 | tmpStr = Utils.getOption("num-instances-relational", options); |
---|
203 | if (tmpStr.length() != 0) |
---|
204 | setNumInstancesRelational(Integer.parseInt(tmpStr)); |
---|
205 | else |
---|
206 | setNumInstancesRelational(10); |
---|
207 | |
---|
208 | tmpStr = Utils.getOption("words", options); |
---|
209 | if (tmpStr.length() != 0) |
---|
210 | setWords(tmpStr); |
---|
211 | else |
---|
212 | setWords(new TestInstances().getWords()); |
---|
213 | |
---|
214 | if (Utils.getOptionPos("word-separators", options) > -1) { |
---|
215 | tmpStr = Utils.getOption("word-separators", options); |
---|
216 | setWordSeparators(tmpStr); |
---|
217 | } |
---|
218 | else { |
---|
219 | setWordSeparators(TestInstances.DEFAULT_SEPARATORS); |
---|
220 | } |
---|
221 | } |
---|
222 | |
---|
223 | /** |
---|
224 | * Gets the current settings of the CheckClassifier. |
---|
225 | * |
---|
226 | * @return an array of strings suitable for passing to setOptions |
---|
227 | */ |
---|
228 | public String[] getOptions() { |
---|
229 | Vector<String> result; |
---|
230 | String[] options; |
---|
231 | int i; |
---|
232 | |
---|
233 | result = new Vector<String>(); |
---|
234 | |
---|
235 | options = super.getOptions(); |
---|
236 | for (i = 0; i < options.length; i++) |
---|
237 | result.add(options[i]); |
---|
238 | |
---|
239 | result.add("-N"); |
---|
240 | result.add("" + getNumInstances()); |
---|
241 | |
---|
242 | result.add("-nominal"); |
---|
243 | result.add("" + getNumNominal()); |
---|
244 | |
---|
245 | result.add("-numeric"); |
---|
246 | result.add("" + getNumNumeric()); |
---|
247 | |
---|
248 | result.add("-string"); |
---|
249 | result.add("" + getNumString()); |
---|
250 | |
---|
251 | result.add("-date"); |
---|
252 | result.add("" + getNumDate()); |
---|
253 | |
---|
254 | result.add("-relational"); |
---|
255 | result.add("" + getNumRelational()); |
---|
256 | |
---|
257 | result.add("-words"); |
---|
258 | result.add("" + getWords()); |
---|
259 | |
---|
260 | result.add("-word-separators"); |
---|
261 | result.add("" + getWordSeparators()); |
---|
262 | |
---|
263 | return (String[]) result.toArray(new String[result.size()]); |
---|
264 | } |
---|
265 | |
---|
266 | /** |
---|
267 | * sets the PostProcessor to use |
---|
268 | * |
---|
269 | * @param value the new PostProcessor |
---|
270 | * @see #m_PostProcessor |
---|
271 | */ |
---|
272 | public void setPostProcessor(PostProcessor value) { |
---|
273 | m_PostProcessor = value; |
---|
274 | } |
---|
275 | |
---|
276 | /** |
---|
277 | * returns the current PostProcessor, can be null |
---|
278 | * |
---|
279 | * @return the current PostProcessor |
---|
280 | */ |
---|
281 | public PostProcessor getPostProcessor() { |
---|
282 | return m_PostProcessor; |
---|
283 | } |
---|
284 | |
---|
285 | /** |
---|
286 | * returns TRUE if the classifier returned a "not in classpath" Exception |
---|
287 | * |
---|
288 | * @return true if CLASSPATH problems occurred |
---|
289 | */ |
---|
290 | public boolean hasClasspathProblems() { |
---|
291 | return m_ClasspathProblems; |
---|
292 | } |
---|
293 | |
---|
294 | /** |
---|
295 | * Begins the tests, reporting results to System.out. Implemented by derived classes. |
---|
296 | */ |
---|
297 | public abstract void doTests(); |
---|
298 | |
---|
299 | /** |
---|
300 | * Sets the number of instances to use in the datasets (some classifiers |
---|
301 | * might require more instances). |
---|
302 | * |
---|
303 | * @param value the number of instances to use |
---|
304 | */ |
---|
305 | public void setNumInstances(int value) { |
---|
306 | m_NumInstances = value; |
---|
307 | } |
---|
308 | |
---|
309 | /** |
---|
310 | * Gets the current number of instances to use for the datasets. |
---|
311 | * |
---|
312 | * @return the number of instances |
---|
313 | */ |
---|
314 | public int getNumInstances() { |
---|
315 | return m_NumInstances; |
---|
316 | } |
---|
317 | |
---|
318 | /** |
---|
319 | * sets the number of nominal attributes |
---|
320 | * |
---|
321 | * @param value the number of nominal attributes |
---|
322 | */ |
---|
323 | public void setNumNominal(int value) { |
---|
324 | m_NumNominal = value; |
---|
325 | } |
---|
326 | |
---|
327 | /** |
---|
328 | * returns the current number of nominal attributes |
---|
329 | * |
---|
330 | * @return the number of nominal attributes |
---|
331 | */ |
---|
332 | public int getNumNominal() { |
---|
333 | return m_NumNominal; |
---|
334 | } |
---|
335 | |
---|
336 | /** |
---|
337 | * sets the number of numeric attributes |
---|
338 | * |
---|
339 | * @param value the number of numeric attributes |
---|
340 | */ |
---|
341 | public void setNumNumeric(int value) { |
---|
342 | m_NumNumeric = value; |
---|
343 | } |
---|
344 | |
---|
345 | /** |
---|
346 | * returns the current number of numeric attributes |
---|
347 | * |
---|
348 | * @return the number of numeric attributes |
---|
349 | */ |
---|
350 | public int getNumNumeric() { |
---|
351 | return m_NumNumeric; |
---|
352 | } |
---|
353 | |
---|
354 | /** |
---|
355 | * sets the number of string attributes |
---|
356 | * |
---|
357 | * @param value the number of string attributes |
---|
358 | */ |
---|
359 | public void setNumString(int value) { |
---|
360 | m_NumString = value; |
---|
361 | } |
---|
362 | |
---|
363 | /** |
---|
364 | * returns the current number of string attributes |
---|
365 | * |
---|
366 | * @return the number of string attributes |
---|
367 | */ |
---|
368 | public int getNumString() { |
---|
369 | return m_NumString; |
---|
370 | } |
---|
371 | |
---|
372 | /** |
---|
373 | * sets the number of data attributes |
---|
374 | * |
---|
375 | * @param value the number of date attributes |
---|
376 | */ |
---|
377 | public void setNumDate(int value) { |
---|
378 | m_NumDate = value; |
---|
379 | } |
---|
380 | |
---|
381 | /** |
---|
382 | * returns the current number of date attributes |
---|
383 | * |
---|
384 | * @return the number of date attributes |
---|
385 | */ |
---|
386 | public int getNumDate() { |
---|
387 | return m_NumDate; |
---|
388 | } |
---|
389 | |
---|
390 | /** |
---|
391 | * sets the number of relational attributes |
---|
392 | * |
---|
393 | * @param value the number of relational attributes |
---|
394 | */ |
---|
395 | public void setNumRelational(int value) { |
---|
396 | m_NumRelational = value; |
---|
397 | } |
---|
398 | |
---|
399 | /** |
---|
400 | * returns the current number of relational attributes |
---|
401 | * |
---|
402 | * @return the number of relational attributes |
---|
403 | */ |
---|
404 | public int getNumRelational() { |
---|
405 | return m_NumRelational; |
---|
406 | } |
---|
407 | |
---|
408 | /** |
---|
409 | * sets the number of instances in relational/bag attributes to produce |
---|
410 | * |
---|
411 | * @param value the number of instances |
---|
412 | */ |
---|
413 | public void setNumInstancesRelational(int value) { |
---|
414 | m_NumInstancesRelational = value; |
---|
415 | } |
---|
416 | |
---|
417 | /** |
---|
418 | * returns the current number of instances in relational/bag attributes to produce |
---|
419 | * |
---|
420 | * @return the number of instances |
---|
421 | */ |
---|
422 | public int getNumInstancesRelational() { |
---|
423 | return m_NumInstancesRelational; |
---|
424 | } |
---|
425 | |
---|
426 | /** |
---|
427 | * turns the comma-separated list into an array |
---|
428 | * |
---|
429 | * @param value the list to process |
---|
430 | * @return the list as array |
---|
431 | */ |
---|
432 | protected static String[] listToArray(String value) { |
---|
433 | StringTokenizer tok; |
---|
434 | Vector<String> list; |
---|
435 | |
---|
436 | list = new Vector<String>(); |
---|
437 | tok = new StringTokenizer(value, ","); |
---|
438 | while (tok.hasMoreTokens()) |
---|
439 | list.add(tok.nextToken()); |
---|
440 | |
---|
441 | return (String[]) list.toArray(new String[list.size()]); |
---|
442 | } |
---|
443 | |
---|
444 | /** |
---|
445 | * turns the array into a comma-separated list |
---|
446 | * |
---|
447 | * @param value the array to process |
---|
448 | * @return the array as list |
---|
449 | */ |
---|
450 | protected static String arrayToList(String[] value) { |
---|
451 | String result; |
---|
452 | int i; |
---|
453 | |
---|
454 | result = ""; |
---|
455 | |
---|
456 | for (i = 0; i < value.length; i++) { |
---|
457 | if (i > 0) |
---|
458 | result += ","; |
---|
459 | result += value[i]; |
---|
460 | } |
---|
461 | |
---|
462 | return result; |
---|
463 | } |
---|
464 | |
---|
465 | /** |
---|
466 | * returns a string representation of the attribute type |
---|
467 | * |
---|
468 | * @param type the attribute type to get a string rerpresentation for |
---|
469 | * @return the string representation |
---|
470 | */ |
---|
471 | public static String attributeTypeToString(int type) { |
---|
472 | String result; |
---|
473 | |
---|
474 | switch (type) { |
---|
475 | case Attribute.NUMERIC: |
---|
476 | result = "numeric"; |
---|
477 | break; |
---|
478 | |
---|
479 | case Attribute.NOMINAL: |
---|
480 | result = "nominal"; |
---|
481 | break; |
---|
482 | |
---|
483 | case Attribute.STRING: |
---|
484 | result = "string"; |
---|
485 | break; |
---|
486 | |
---|
487 | case Attribute.DATE: |
---|
488 | result = "date"; |
---|
489 | break; |
---|
490 | |
---|
491 | case Attribute.RELATIONAL: |
---|
492 | result = "relational"; |
---|
493 | break; |
---|
494 | |
---|
495 | default: |
---|
496 | result = "???"; |
---|
497 | } |
---|
498 | |
---|
499 | return result; |
---|
500 | } |
---|
501 | |
---|
502 | /** |
---|
503 | * Sets the comma-separated list of words to use for generating strings. The |
---|
504 | * list must contain at least 2 words, otherwise an exception will be thrown. |
---|
505 | * |
---|
506 | * @param value the list of words |
---|
507 | * @throws IllegalArgumentException if not at least 2 words are provided |
---|
508 | */ |
---|
509 | public void setWords(String value) { |
---|
510 | if (listToArray(value).length < 2) |
---|
511 | throw new IllegalArgumentException("At least 2 words must be provided!"); |
---|
512 | |
---|
513 | m_Words = listToArray(value); |
---|
514 | } |
---|
515 | |
---|
516 | /** |
---|
517 | * returns the words used for assembling strings in a comma-separated list. |
---|
518 | * |
---|
519 | * @return the words as comma-separated list |
---|
520 | */ |
---|
521 | public String getWords() { |
---|
522 | return arrayToList(m_Words); |
---|
523 | } |
---|
524 | |
---|
525 | /** |
---|
526 | * sets the word separators (chars) to use for assembling strings. |
---|
527 | * |
---|
528 | * @param value the characters to use as separators |
---|
529 | */ |
---|
530 | public void setWordSeparators(String value) { |
---|
531 | m_WordSeparators = value; |
---|
532 | } |
---|
533 | |
---|
534 | /** |
---|
535 | * returns the word separators (chars) to use for assembling strings. |
---|
536 | * |
---|
537 | * @return the current separators |
---|
538 | */ |
---|
539 | public String getWordSeparators() { |
---|
540 | return m_WordSeparators; |
---|
541 | } |
---|
542 | |
---|
543 | /** |
---|
544 | * Compare two datasets to see if they differ. |
---|
545 | * |
---|
546 | * @param data1 one set of instances |
---|
547 | * @param data2 the other set of instances |
---|
548 | * @throws Exception if the datasets differ |
---|
549 | */ |
---|
550 | protected void compareDatasets(Instances data1, Instances data2) |
---|
551 | throws Exception { |
---|
552 | |
---|
553 | if (!data2.equalHeaders(data1)) { |
---|
554 | throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1)); |
---|
555 | } |
---|
556 | if (!(data2.numInstances() == data1.numInstances())) { |
---|
557 | throw new Exception("number of instances has changed"); |
---|
558 | } |
---|
559 | for (int i = 0; i < data2.numInstances(); i++) { |
---|
560 | Instance orig = data1.instance(i); |
---|
561 | Instance copy = data2.instance(i); |
---|
562 | for (int j = 0; j < orig.numAttributes(); j++) { |
---|
563 | if (orig.isMissing(j)) { |
---|
564 | if (!copy.isMissing(j)) { |
---|
565 | throw new Exception("instances have changed"); |
---|
566 | } |
---|
567 | } else if (orig.value(j) != copy.value(j)) { |
---|
568 | throw new Exception("instances have changed"); |
---|
569 | } |
---|
570 | if (orig.weight() != copy.weight()) { |
---|
571 | throw new Exception("instance weights have changed"); |
---|
572 | } |
---|
573 | } |
---|
574 | } |
---|
575 | } |
---|
576 | |
---|
577 | /** |
---|
578 | * Add missing values to a dataset. |
---|
579 | * |
---|
580 | * @param data the instances to add missing values to |
---|
581 | * @param level the level of missing values to add (if positive, this |
---|
582 | * is the probability that a value will be set to missing, if negative |
---|
583 | * all but one value will be set to missing (not yet implemented)) |
---|
584 | * @param predictorMissing if true, predictor attributes will be modified |
---|
585 | * @param classMissing if true, the class attribute will be modified |
---|
586 | */ |
---|
587 | protected void addMissing(Instances data, int level, |
---|
588 | boolean predictorMissing, boolean classMissing) { |
---|
589 | |
---|
590 | int classIndex = data.classIndex(); |
---|
591 | Random random = new Random(1); |
---|
592 | for (int i = 0; i < data.numInstances(); i++) { |
---|
593 | Instance current = data.instance(i); |
---|
594 | for (int j = 0; j < data.numAttributes(); j++) { |
---|
595 | if (((j == classIndex) && classMissing) || |
---|
596 | ((j != classIndex) && predictorMissing)) { |
---|
597 | if (Math.abs(random.nextInt()) % 100 < level) |
---|
598 | current.setMissing(j); |
---|
599 | } |
---|
600 | } |
---|
601 | } |
---|
602 | } |
---|
603 | |
---|
604 | /** |
---|
605 | * Provides a hook for derived classes to further modify the data. |
---|
606 | * |
---|
607 | * @param data the data to process |
---|
608 | * @return the processed data |
---|
609 | * @see #m_PostProcessor |
---|
610 | */ |
---|
611 | protected Instances process(Instances data) { |
---|
612 | if (getPostProcessor() == null) |
---|
613 | return data; |
---|
614 | else |
---|
615 | return getPostProcessor().process(data); |
---|
616 | } |
---|
617 | } |
---|