1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * CheckEstimator.java |
---|
19 | * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.estimators; |
---|
24 | |
---|
25 | import weka.core.Attribute; |
---|
26 | import weka.core.FastVector; |
---|
27 | import weka.core.Instance; |
---|
28 | import weka.core.Instances; |
---|
29 | import weka.core.Option; |
---|
30 | import weka.core.OptionHandler; |
---|
31 | import weka.core.RevisionHandler; |
---|
32 | import weka.core.RevisionUtils; |
---|
33 | import weka.core.TestInstances; |
---|
34 | import weka.core.Utils; |
---|
35 | import weka.core.WeightedInstancesHandler; |
---|
36 | |
---|
37 | import java.util.Enumeration; |
---|
38 | import java.util.Random; |
---|
39 | import java.util.Vector; |
---|
40 | |
---|
41 | /** |
---|
42 | * Class for examining the capabilities and finding problems with |
---|
43 | * estimators. If you implement an estimator using the WEKA libraries, |
---|
44 | * you should run the checks on it to ensure robustness and correct |
---|
45 | * operation. Passing all the tests of this object does not mean |
---|
46 | * bugs in the estimator don't exist, but this will help find some |
---|
47 | * common ones. <p/> |
---|
48 | * |
---|
49 | * Typical usage: <p/> |
---|
50 | * <code>java weka.estimators.CheckEstimator -W estimator_name |
---|
51 | * estimator_options </code><p/> |
---|
52 | * |
---|
53 | * This class uses code from the CheckEstimatorClass |
---|
54 | * ATTENTION! Current estimators can only |
---|
55 | * 1. split on a nominal class attribute |
---|
56 | * 2. build estimators for nominal and numeric attributes |
---|
57 | * 3. build estimators independently of the class type |
---|
58 | * The functionality to test on other class and attribute types |
---|
59 | * is left in big parts in the code. |
---|
60 | * |
---|
61 | * CheckEstimator reports on the following: |
---|
62 | * <ul> |
---|
63 | * <li> Estimator abilities |
---|
64 | * <ul> |
---|
65 | * <li> Possible command line options to the estimator </li> |
---|
66 | * <li> Whether the estimator can predict nominal, numeric, string, |
---|
67 | * date or relational class attributes. Warnings will be displayed if |
---|
68 | * performance is worse than ZeroR </li> |
---|
69 | * <li> Whether the estimator can be trained incrementally </li> |
---|
70 | * <li> Whether the estimator can build estimates for numeric attributes </li> |
---|
71 | * <li> Whether the estimator can handle nominal attributes </li> |
---|
72 | * <li> Whether the estimator can handle string attributes </li> |
---|
73 | * <li> Whether the estimator can handle date attributes </li> |
---|
74 | * <li> Whether the estimator can handle relational attributes </li> |
---|
75 | * <li> Whether the estimator can build estimates for multi-instance data </li> |
---|
76 | * <li> Whether the estimator can handle missing attribute values </li> |
---|
77 | * <li> Whether the estimator can handle missing class values </li> |
---|
78 | * <li> Whether a nominal estimator only handles 2 class problems </li> |
---|
79 | * <li> Whether the estimator can handle instance weights </li> |
---|
80 | * </ul> |
---|
81 | * </li> |
---|
82 | * <li> Correct functioning |
---|
83 | * <ul> |
---|
84 | * <li> Correct initialisation during addvalues (i.e. no result |
---|
85 | * changes when addValues called repeatedly) </li> |
---|
86 | * <li> Whether incremental training produces the same results |
---|
87 | * as during non-incremental training (which may or may not |
---|
88 | * be OK) </li> |
---|
89 | * <li> Whether the estimator alters the data passed to it |
---|
90 | * (number of instances, instance order, instance weights, etc) </li> |
---|
91 | * </ul> |
---|
92 | * </li> |
---|
93 | * <li> Degenerate cases |
---|
94 | * <ul> |
---|
95 | * <li> building estimator with zero training instances </li> |
---|
96 | * <li> all but one attribute value missing </li> |
---|
97 | * <li> all attribute values missing </li> |
---|
98 | * <li> all but one class values missing </li> |
---|
99 | * <li> all class values missing </li> |
---|
100 | * </ul> |
---|
101 | * </li> |
---|
102 | * </ul> |
---|
103 | * Running CheckEstimator with the debug option set will output the |
---|
104 | * training and test datasets for any failed tests.<p/> |
---|
105 | * |
---|
106 | * The <code>weka.estimators.AbstractEstimatorTest</code> uses this |
---|
107 | * class to test all the estimators. Any changes here, have to be |
---|
108 | * checked in that abstract test class, too. <p/> |
---|
109 | * |
---|
110 | <!-- options-start --> |
---|
111 | * Valid options are: <p/> |
---|
112 | * |
---|
113 | * <pre> -D |
---|
114 | * Turn on debugging output.</pre> |
---|
115 | * |
---|
116 | * <pre> -S |
---|
117 | * Silent mode - prints nothing to stdout.</pre> |
---|
118 | * |
---|
119 | * <pre> -N <num> |
---|
120 | * The number of instances in the datasets (default 100).</pre> |
---|
121 | * |
---|
122 | * <pre> -W |
---|
123 | * Full name of the estimator analysed. |
---|
124 | * eg: weka.estimators.NormalEstimator</pre> |
---|
125 | * |
---|
126 | * <pre> |
---|
127 | * Options specific to estimator weka.estimators.NormalEstimator: |
---|
128 | * </pre> |
---|
129 | * |
---|
130 | * <pre> -D |
---|
131 | * If set, estimator is run in debug mode and |
---|
132 | * may output additional info to the console</pre> |
---|
133 | * |
---|
134 | <!-- options-end --> |
---|
135 | * |
---|
136 | * Options after -- are passed to the designated estimator.<p/> |
---|
137 | * |
---|
138 | * @author Len Trigg (trigg@cs.waikato.ac.nz) |
---|
139 | * @author FracPete (fracpete at waikato dot ac dot nz) |
---|
140 | * @version $Revision: 4997 $ |
---|
141 | * @see TestInstances |
---|
142 | */ |
---|
143 | public class CheckEstimator implements OptionHandler, RevisionHandler { |
---|
144 | |
---|
145 | /* |
---|
146 | * Note about test methods: |
---|
147 | * - methods return array of booleans |
---|
148 | * - first index: success or not |
---|
149 | * - second index: acceptable or not (e.g., Exception is OK) |
---|
150 | * - in case the performance is worse than that of ZeroR both indices are true |
---|
151 | * |
---|
152 | * FracPete (fracpete at waikato dot ac dot nz) |
---|
153 | */ |
---|
154 | |
---|
155 | /** a class for postprocessing the test-data |
---|
156 | */ |
---|
157 | public class PostProcessor |
---|
158 | implements RevisionHandler { |
---|
159 | /** |
---|
160 | * Provides a hook for derived classes to further modify the data. Currently, |
---|
161 | * the data is just passed through. |
---|
162 | * |
---|
163 | * @param data the data to process |
---|
164 | * @return the processed data |
---|
165 | */ |
---|
166 | protected Instances process(Instances data) { |
---|
167 | return data; |
---|
168 | } |
---|
169 | |
---|
170 | /** |
---|
171 | * Returns the revision string. |
---|
172 | * |
---|
173 | * @return the revision |
---|
174 | */ |
---|
175 | public String getRevision() { |
---|
176 | return RevisionUtils.extract("$Revision: 4997 $"); |
---|
177 | } |
---|
178 | } |
---|
179 | |
---|
180 | /*** The estimator to be examined */ |
---|
181 | protected Estimator m_Estimator = (Estimator) new weka.estimators.NormalEstimator(0.000001); |
---|
182 | |
---|
183 | /** The options to be passed to the base estimator. */ |
---|
184 | protected String[] m_EstimatorOptions; |
---|
185 | |
---|
186 | /** The results of the analysis as a string */ |
---|
187 | protected String m_AnalysisResults; |
---|
188 | |
---|
189 | /** Debugging mode, gives extra output if true */ |
---|
190 | protected boolean m_Debug = false; |
---|
191 | |
---|
192 | /** Silent mode, for no output at all to stdout */ |
---|
193 | protected boolean m_Silent = false; |
---|
194 | |
---|
195 | /** The number of instances in the datasets */ |
---|
196 | protected int m_NumInstances = 100; |
---|
197 | |
---|
198 | /** for post-processing the data even further */ |
---|
199 | protected PostProcessor m_PostProcessor = null; |
---|
200 | |
---|
201 | /** whether classpath problems occurred */ |
---|
202 | protected boolean m_ClasspathProblems = false; |
---|
203 | |
---|
204 | /** |
---|
205 | * class that contains info about the attribute types the estimator can estimate |
---|
206 | * estimator work on one attribute only |
---|
207 | */ |
---|
208 | public static class AttrTypes |
---|
209 | implements RevisionHandler { |
---|
210 | |
---|
211 | boolean nominal = false; |
---|
212 | boolean numeric = false; |
---|
213 | boolean string = false; |
---|
214 | boolean date = false; |
---|
215 | boolean relational = false; |
---|
216 | |
---|
217 | AttrTypes() { |
---|
218 | } |
---|
219 | |
---|
220 | AttrTypes (AttrTypes newTypes) { |
---|
221 | nominal = newTypes.nominal; |
---|
222 | numeric = newTypes.numeric; |
---|
223 | string = newTypes.string; |
---|
224 | date = newTypes.date; |
---|
225 | relational = newTypes.relational; |
---|
226 | } |
---|
227 | |
---|
228 | AttrTypes (int type) { |
---|
229 | if (type == Attribute.NOMINAL) nominal = true; |
---|
230 | if (type == Attribute.NUMERIC) numeric = true; |
---|
231 | if (type == Attribute.STRING) string = true; |
---|
232 | if (type == Attribute.DATE) date = true; |
---|
233 | if (type == Attribute.RELATIONAL) relational = true; |
---|
234 | } |
---|
235 | |
---|
236 | int getSetType() throws Exception { |
---|
237 | int sum = 0; |
---|
238 | int type = -1; |
---|
239 | if (nominal) { sum ++; type = Attribute.NOMINAL; } |
---|
240 | if (numeric) { sum ++; type = Attribute.NUMERIC; } |
---|
241 | if (string) { sum ++; type = Attribute.STRING; } |
---|
242 | if (date) { sum ++; type = Attribute.DATE; } |
---|
243 | if (relational) { sum ++; type = Attribute.RELATIONAL; } |
---|
244 | if (sum > 1) |
---|
245 | throw new Exception("Expected to have only one type set used wrongly."); |
---|
246 | if (type < 0) |
---|
247 | throw new Exception("No type set."); |
---|
248 | return type; |
---|
249 | } |
---|
250 | |
---|
251 | boolean oneIsSet() { |
---|
252 | return (nominal || numeric || string || date || relational); |
---|
253 | } |
---|
254 | |
---|
255 | public Vector getVectorOfAttrTypes() { |
---|
256 | Vector attrs = new Vector(); |
---|
257 | if (nominal) attrs.add(new Integer(Attribute.NOMINAL)); |
---|
258 | if (numeric) attrs.add(new Integer(Attribute.NUMERIC)); |
---|
259 | if (string) attrs.add(new Integer(Attribute.STRING)); |
---|
260 | if (date) attrs.add(new Integer(Attribute.DATE)); |
---|
261 | if (relational) attrs.add(new Integer(Attribute.RELATIONAL)); |
---|
262 | return attrs; |
---|
263 | } |
---|
264 | |
---|
265 | /** |
---|
266 | * Returns the revision string. |
---|
267 | * |
---|
268 | * @return the revision |
---|
269 | */ |
---|
270 | public String getRevision() { |
---|
271 | return RevisionUtils.extract("$Revision: 4997 $"); |
---|
272 | } |
---|
273 | } |
---|
274 | |
---|
275 | /** |
---|
276 | * public class that contains info about the chosen attribute type |
---|
277 | * estimator work on one attribute only |
---|
278 | */ |
---|
279 | public static class EstTypes |
---|
280 | implements RevisionHandler { |
---|
281 | |
---|
282 | boolean incremental = false; |
---|
283 | boolean weighted = false; |
---|
284 | boolean supervised = false; |
---|
285 | |
---|
286 | /** |
---|
287 | * Constructor |
---|
288 | */ |
---|
289 | public EstTypes () { |
---|
290 | } |
---|
291 | |
---|
292 | /** |
---|
293 | * Constructor |
---|
294 | */ |
---|
295 | public EstTypes (boolean i, boolean w, boolean s) { |
---|
296 | incremental = i; |
---|
297 | weighted = w; |
---|
298 | supervised = s; |
---|
299 | } |
---|
300 | |
---|
301 | /** |
---|
302 | * Returns the revision string. |
---|
303 | * |
---|
304 | * @return the revision |
---|
305 | */ |
---|
306 | public String getRevision() { |
---|
307 | return RevisionUtils.extract("$Revision: 4997 $"); |
---|
308 | } |
---|
309 | } |
---|
310 | |
---|
311 | /** |
---|
312 | * Returns an enumeration describing the available options. |
---|
313 | * |
---|
314 | * @return an enumeration of all the available options. |
---|
315 | */ |
---|
316 | public Enumeration listOptions() { |
---|
317 | |
---|
318 | Vector newVector = new Vector(2); |
---|
319 | |
---|
320 | newVector.addElement(new Option( |
---|
321 | "\tTurn on debugging output.", |
---|
322 | "D", 0, "-D")); |
---|
323 | |
---|
324 | newVector.addElement(new Option( |
---|
325 | "\tSilent mode - prints nothing to stdout.", |
---|
326 | "S", 0, "-S")); |
---|
327 | |
---|
328 | newVector.addElement(new Option( |
---|
329 | "\tThe number of instances in the datasets (default 100).", |
---|
330 | "N", 1, "-N <num>")); |
---|
331 | |
---|
332 | newVector.addElement(new Option( |
---|
333 | "\tFull name of the estimator analysed.\n" |
---|
334 | +"\teg: weka.estimators.NormalEstimator", |
---|
335 | "W", 1, "-W")); |
---|
336 | |
---|
337 | if ((m_Estimator != null) |
---|
338 | && (m_Estimator instanceof OptionHandler)) { |
---|
339 | newVector.addElement(new Option("", "", 0, |
---|
340 | "\nOptions specific to estimator " |
---|
341 | + m_Estimator.getClass().getName() |
---|
342 | + ":")); |
---|
343 | Enumeration enu = ((OptionHandler)m_Estimator).listOptions(); |
---|
344 | while (enu.hasMoreElements()) |
---|
345 | newVector.addElement(enu.nextElement()); |
---|
346 | } |
---|
347 | |
---|
348 | return newVector.elements(); |
---|
349 | } |
---|
350 | |
---|
351 | /** |
---|
352 | * Parses a given list of options. |
---|
353 | * |
---|
354 | <!-- options-start --> |
---|
355 | * Valid options are: <p/> |
---|
356 | * |
---|
357 | * <pre> -D |
---|
358 | * Turn on debugging output.</pre> |
---|
359 | * |
---|
360 | * <pre> -S |
---|
361 | * Silent mode - prints nothing to stdout.</pre> |
---|
362 | * |
---|
363 | * <pre> -N <num> |
---|
364 | * The number of instances in the datasets (default 100).</pre> |
---|
365 | * |
---|
366 | * <pre> -W |
---|
367 | * Full name of the estimator analysed. |
---|
368 | * eg: weka.estimators.NormalEstimator</pre> |
---|
369 | * |
---|
370 | * <pre> |
---|
371 | * Options specific to estimator weka.estimators.NormalEstimator: |
---|
372 | * </pre> |
---|
373 | * |
---|
374 | * <pre> -D |
---|
375 | * If set, estimator is run in debug mode and |
---|
376 | * may output additional info to the console</pre> |
---|
377 | * |
---|
378 | <!-- options-end --> |
---|
379 | * |
---|
380 | * @param options the list of options as an array of strings |
---|
381 | * @throws Exception if an option is not supported |
---|
382 | */ |
---|
383 | public void setOptions(String[] options) throws Exception { |
---|
384 | String tmpStr; |
---|
385 | |
---|
386 | setDebug(Utils.getFlag('D', options)); |
---|
387 | |
---|
388 | setSilent(Utils.getFlag('S', options)); |
---|
389 | |
---|
390 | tmpStr = Utils.getOption('N', options); |
---|
391 | if (tmpStr.length() != 0) |
---|
392 | setNumInstances(Integer.parseInt(tmpStr)); |
---|
393 | else |
---|
394 | setNumInstances(100); |
---|
395 | |
---|
396 | tmpStr = Utils.getOption('W', options); |
---|
397 | if (tmpStr.length() == 0) |
---|
398 | throw new Exception("A estimator must be specified with the -W option."); |
---|
399 | setEstimator(Estimator.forName(tmpStr, Utils.partitionOptions(options))); |
---|
400 | } |
---|
401 | |
---|
402 | /** |
---|
403 | * Gets the current settings of the CheckEstimator. |
---|
404 | * |
---|
405 | * @return an array of strings suitable for passing to setOptions |
---|
406 | */ |
---|
407 | public String[] getOptions() { |
---|
408 | Vector result; |
---|
409 | String[] options; |
---|
410 | int i; |
---|
411 | |
---|
412 | result = new Vector(); |
---|
413 | |
---|
414 | if (getDebug()) |
---|
415 | result.add("-D"); |
---|
416 | |
---|
417 | if (getSilent()) |
---|
418 | result.add("-S"); |
---|
419 | |
---|
420 | result.add("-N"); |
---|
421 | result.add("" + getNumInstances()); |
---|
422 | |
---|
423 | if (getEstimator() != null) { |
---|
424 | result.add("-W"); |
---|
425 | result.add(getEstimator().getClass().getName()); |
---|
426 | } |
---|
427 | |
---|
428 | if ((m_Estimator != null) && (m_Estimator instanceof OptionHandler)) |
---|
429 | options = ((OptionHandler) m_Estimator).getOptions(); |
---|
430 | else |
---|
431 | options = new String[0]; |
---|
432 | |
---|
433 | if (options.length > 0) { |
---|
434 | result.add("--"); |
---|
435 | for (i = 0; i < options.length; i++) |
---|
436 | result.add(options[i]); |
---|
437 | } |
---|
438 | |
---|
439 | return (String[]) result.toArray(new String[result.size()]); |
---|
440 | } |
---|
441 | |
---|
442 | /** |
---|
443 | * sets the PostProcessor to use |
---|
444 | * |
---|
445 | * @param value the new PostProcessor |
---|
446 | * @see #m_PostProcessor |
---|
447 | */ |
---|
448 | public void setPostProcessor(PostProcessor value) { |
---|
449 | m_PostProcessor = value; |
---|
450 | } |
---|
451 | |
---|
452 | /** |
---|
453 | * returns the current PostProcessor, can be null |
---|
454 | * |
---|
455 | * @return the current PostProcessor |
---|
456 | */ |
---|
457 | public PostProcessor getPostProcessor() { |
---|
458 | return m_PostProcessor; |
---|
459 | } |
---|
460 | |
---|
461 | /** |
---|
462 | * returns TRUE if the estimator returned a "not in classpath" Exception |
---|
463 | * |
---|
464 | * @return true if CLASSPATH problems occurred |
---|
465 | */ |
---|
466 | public boolean hasClasspathProblems() { |
---|
467 | return m_ClasspathProblems; |
---|
468 | } |
---|
469 | |
---|
470 | /** |
---|
471 | * Begin the tests, reporting results to System.out |
---|
472 | */ |
---|
473 | public void doTests() { |
---|
474 | |
---|
475 | if (getEstimator() == null) { |
---|
476 | println("\n=== No estimator set ==="); |
---|
477 | return; |
---|
478 | } |
---|
479 | println("\n=== Check on Estimator: " |
---|
480 | + getEstimator().getClass().getName() |
---|
481 | + " ===\n"); |
---|
482 | |
---|
483 | m_ClasspathProblems = false; |
---|
484 | |
---|
485 | // Start tests with test for options |
---|
486 | canTakeOptions(); |
---|
487 | |
---|
488 | // test what type of estimator it is |
---|
489 | EstTypes estTypes = new EstTypes(); |
---|
490 | estTypes.incremental = incrementalEstimator()[0]; |
---|
491 | estTypes.weighted = weightedInstancesHandler()[0]; |
---|
492 | estTypes.supervised = supervisedEstimator()[0]; |
---|
493 | |
---|
494 | // in none of the estimators yet the functionality is depending on the class type |
---|
495 | // since this could change the basic structure taken from checkclassifiers is kept here |
---|
496 | int classType = Attribute.NOMINAL; |
---|
497 | AttrTypes attrTypes = testsPerClassType(classType, estTypes); |
---|
498 | |
---|
499 | |
---|
500 | // only nominal class can be split up so far |
---|
501 | canSplitUpClass(attrTypes, classType); |
---|
502 | } |
---|
503 | |
---|
504 | |
---|
505 | /** |
---|
506 | * Set debugging mode |
---|
507 | * |
---|
508 | * @param debug true if debug output should be printed |
---|
509 | */ |
---|
510 | public void setDebug(boolean debug) { |
---|
511 | m_Debug = debug; |
---|
512 | |
---|
513 | // disable silent mode, if necessary |
---|
514 | if (getDebug()) |
---|
515 | setSilent(false); |
---|
516 | } |
---|
517 | |
---|
518 | /** |
---|
519 | * Get whether debugging is turned on |
---|
520 | * |
---|
521 | * @return true if debugging output is on |
---|
522 | */ |
---|
523 | public boolean getDebug() { |
---|
524 | return m_Debug; |
---|
525 | } |
---|
526 | |
---|
527 | /** |
---|
528 | * Set slient mode, i.e., no output at all to stdout |
---|
529 | * |
---|
530 | * @param value whether silent mode is active or not |
---|
531 | */ |
---|
532 | public void setSilent(boolean value) { |
---|
533 | m_Silent = value; |
---|
534 | } |
---|
535 | |
---|
536 | /** |
---|
537 | * Get whether silent mode is turned on |
---|
538 | * |
---|
539 | * @return true if silent mode is on |
---|
540 | */ |
---|
541 | public boolean getSilent() { |
---|
542 | return m_Silent; |
---|
543 | } |
---|
544 | |
---|
545 | /** |
---|
546 | * Sets the number of instances to use in the datasets (some estimators |
---|
547 | * might require more instances). |
---|
548 | * |
---|
549 | * @param value the number of instances to use |
---|
550 | */ |
---|
551 | public void setNumInstances(int value) { |
---|
552 | m_NumInstances = value; |
---|
553 | } |
---|
554 | |
---|
555 | /** |
---|
556 | * Gets the current number of instances to use for the datasets. |
---|
557 | * |
---|
558 | * @return the number of instances |
---|
559 | */ |
---|
560 | public int getNumInstances() { |
---|
561 | return m_NumInstances; |
---|
562 | } |
---|
563 | |
---|
564 | /** |
---|
565 | * Set the estimator for boosting. |
---|
566 | * |
---|
567 | * @param newEstimator the Estimator to use. |
---|
568 | */ |
---|
569 | public void setEstimator(Estimator newEstimator) { |
---|
570 | m_Estimator = newEstimator; |
---|
571 | } |
---|
572 | |
---|
573 | /** |
---|
574 | * Get the estimator used as the estimator |
---|
575 | * |
---|
576 | * @return the estimator used as the estimator |
---|
577 | */ |
---|
578 | public Estimator getEstimator() { |
---|
579 | return m_Estimator; |
---|
580 | } |
---|
581 | |
---|
582 | /** |
---|
583 | * prints the given message to stdout, if not silent mode |
---|
584 | * |
---|
585 | * @param msg the text to print to stdout |
---|
586 | */ |
---|
587 | protected void print(Object msg) { |
---|
588 | if (!getSilent()) |
---|
589 | System.out.print(msg); |
---|
590 | } |
---|
591 | |
---|
592 | /** |
---|
593 | * prints the given message (+ LF) to stdout, if not silent mode |
---|
594 | * |
---|
595 | * @param msg the message to println to stdout |
---|
596 | */ |
---|
597 | protected void println(Object msg) { |
---|
598 | print(msg + "\n"); |
---|
599 | } |
---|
600 | |
---|
601 | /** |
---|
602 | * prints a LF to stdout, if not silent mode |
---|
603 | */ |
---|
604 | protected void println() { |
---|
605 | print("\n"); |
---|
606 | } |
---|
607 | |
---|
608 | /** |
---|
609 | * Run a battery of tests for a given class attribute type |
---|
610 | * |
---|
611 | * @param classType true if the class attribute should be numeric |
---|
612 | * @param estTypes types the estimator is, like incremental, weighted, supervised etc |
---|
613 | * @return attribute types estimator can work with |
---|
614 | */ |
---|
615 | protected AttrTypes testsPerClassType(int classType, EstTypes estTypes) { |
---|
616 | |
---|
617 | // in none of the estimators yet is the estimation depending on the class type |
---|
618 | // since this could change the basic structure taken from checkclassifiers is kept here |
---|
619 | |
---|
620 | // test A: simple test - if can estimate |
---|
621 | AttrTypes attrTypes = new AttrTypes(); |
---|
622 | AttrTypes at = new AttrTypes(Attribute.NOMINAL); |
---|
623 | attrTypes.nominal = canEstimate(at, estTypes.supervised, classType)[0]; |
---|
624 | at = new AttrTypes(Attribute.NUMERIC); |
---|
625 | attrTypes.numeric = canEstimate(at, estTypes.supervised, classType)[0]; |
---|
626 | attrTypes.string = false; |
---|
627 | attrTypes.date = false; |
---|
628 | attrTypes.relational = false; |
---|
629 | |
---|
630 | // if (!multiInstance) |
---|
631 | // PRel = canEstimate(false, false, false, false, true, classType)[0]; |
---|
632 | // else |
---|
633 | // PRel = false; |
---|
634 | |
---|
635 | // one of the attribute types succeeded |
---|
636 | |
---|
637 | if (attrTypes.oneIsSet()) { |
---|
638 | Vector attributesSet = attrTypes.getVectorOfAttrTypes(); |
---|
639 | |
---|
640 | // make tests for each attribute |
---|
641 | for (int i = 0; i < attributesSet.size(); i++) { |
---|
642 | AttrTypes workAttrTypes = new AttrTypes(((Integer) attributesSet.elementAt(i)).intValue()); |
---|
643 | |
---|
644 | // test B: weights change estimate or not |
---|
645 | if (estTypes.weighted) |
---|
646 | instanceWeights(workAttrTypes, classType); |
---|
647 | |
---|
648 | if (classType == Attribute.NOMINAL) { |
---|
649 | int numClasses = 4; |
---|
650 | canHandleNClasses(workAttrTypes, numClasses); |
---|
651 | } |
---|
652 | |
---|
653 | // tests with class not the last attribute and the attribute not the first |
---|
654 | |
---|
655 | // if (!multiInstance) { |
---|
656 | int numAtt = 4; |
---|
657 | |
---|
658 | canHandleClassAsNthAttribute(workAttrTypes, numAtt, 0, classType, 1); |
---|
659 | |
---|
660 | //TODOTODOcanHandleAttrAsNthAttribute(workAttrTypes, numAtt, 2, classType); |
---|
661 | //} |
---|
662 | |
---|
663 | canHandleZeroTraining(workAttrTypes, classType); |
---|
664 | boolean handleMissingAttributes = canHandleMissing(workAttrTypes, |
---|
665 | classType, true, false, 20)[0]; |
---|
666 | if (handleMissingAttributes) |
---|
667 | canHandleMissing(workAttrTypes, classType, true, false, 100); |
---|
668 | |
---|
669 | boolean handleMissingClass = canHandleMissing(workAttrTypes, |
---|
670 | classType, |
---|
671 | false, true, 20)[0]; |
---|
672 | if (handleMissingClass) |
---|
673 | canHandleMissing(workAttrTypes, classType, false, true, 100); |
---|
674 | |
---|
675 | correctBuildInitialisation(workAttrTypes, classType); |
---|
676 | datasetIntegrity(workAttrTypes, classType, |
---|
677 | handleMissingAttributes, handleMissingClass); |
---|
678 | |
---|
679 | if (estTypes.incremental) |
---|
680 | incrementingEquality(workAttrTypes, classType); |
---|
681 | } |
---|
682 | } |
---|
683 | return attrTypes; |
---|
684 | } |
---|
685 | |
---|
686 | /** |
---|
687 | * Checks whether the scheme can take command line options. |
---|
688 | * |
---|
689 | * @return index 0 is true if the estimator can take options |
---|
690 | */ |
---|
691 | protected boolean[] canTakeOptions() { |
---|
692 | |
---|
693 | boolean[] result = new boolean[2]; |
---|
694 | |
---|
695 | print("options..."); |
---|
696 | if (m_Estimator instanceof OptionHandler) { |
---|
697 | println("yes"); |
---|
698 | if (m_Debug) { |
---|
699 | println("\n=== Full report ==="); |
---|
700 | Enumeration enu = ((OptionHandler)m_Estimator).listOptions(); |
---|
701 | while (enu.hasMoreElements()) { |
---|
702 | Option option = (Option) enu.nextElement(); |
---|
703 | print(option.synopsis() + "\n" |
---|
704 | + option.description() + "\n"); |
---|
705 | } |
---|
706 | println("\n"); |
---|
707 | } |
---|
708 | result[0] = true; |
---|
709 | } |
---|
710 | else { |
---|
711 | println("no"); |
---|
712 | result[0] = false; |
---|
713 | } |
---|
714 | |
---|
715 | return result; |
---|
716 | } |
---|
717 | |
---|
718 | /** |
---|
719 | * Checks whether the scheme can build models incrementally. |
---|
720 | * |
---|
721 | * @return index 0 is true if the estimator can train incrementally |
---|
722 | */ |
---|
723 | protected boolean[] incrementalEstimator() { |
---|
724 | |
---|
725 | boolean[] result = new boolean[2]; |
---|
726 | |
---|
727 | print("incremental estimator..."); |
---|
728 | if (m_Estimator instanceof IncrementalEstimator) { |
---|
729 | println("yes"); |
---|
730 | result[0] = true; |
---|
731 | } |
---|
732 | else { |
---|
733 | println("no"); |
---|
734 | result[0] = false; |
---|
735 | } |
---|
736 | |
---|
737 | return result; |
---|
738 | } |
---|
739 | |
---|
740 | /** |
---|
741 | * Checks whether the scheme says it can handle instance weights. |
---|
742 | * |
---|
743 | * @return true if the estimator handles instance weights |
---|
744 | */ |
---|
745 | protected boolean[] weightedInstancesHandler() { |
---|
746 | |
---|
747 | boolean[] result = new boolean[2]; |
---|
748 | |
---|
749 | print("weighted instances estimator..."); |
---|
750 | if (m_Estimator instanceof WeightedInstancesHandler) { |
---|
751 | println("yes"); |
---|
752 | result[0] = true; |
---|
753 | } |
---|
754 | else { |
---|
755 | println("no"); |
---|
756 | result[0] = false; |
---|
757 | } |
---|
758 | |
---|
759 | return result; |
---|
760 | } |
---|
761 | |
---|
762 | /** |
---|
763 | * Checks whether the estimator is supervised. |
---|
764 | * |
---|
765 | * @return true if the estimator handles instance weights |
---|
766 | */ |
---|
767 | protected boolean[] supervisedEstimator() { |
---|
768 | boolean[] result = new boolean[2]; |
---|
769 | result[0] = false; |
---|
770 | return result; |
---|
771 | } |
---|
772 | |
---|
773 | /** |
---|
774 | * Checks basic estimation of one attribute of the scheme, for simple non-troublesome |
---|
775 | * datasets. |
---|
776 | * |
---|
777 | * @param attrTypes the types the estimator can work with |
---|
778 | * @param classType the class type (NOMINAL, NUMERIC, etc.) |
---|
779 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
780 | * was acceptable |
---|
781 | */ |
---|
782 | protected boolean[] canEstimate(AttrTypes attrTypes, boolean supervised, int classType) { |
---|
783 | |
---|
784 | // supervised is ignored, no supervised estimators used yet |
---|
785 | |
---|
786 | print("basic estimation"); |
---|
787 | printAttributeSummary(attrTypes, classType); |
---|
788 | print("..."); |
---|
789 | FastVector accepts = new FastVector(); |
---|
790 | accepts.addElement("nominal"); |
---|
791 | accepts.addElement("numeric"); |
---|
792 | accepts.addElement("string"); |
---|
793 | accepts.addElement("date"); |
---|
794 | accepts.addElement("relational"); |
---|
795 | accepts.addElement("not in classpath"); |
---|
796 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
797 | numClasses = 2, missingLevel = 0; |
---|
798 | boolean attributeMissing = false, classMissing = false; |
---|
799 | int numAtts = 1, attrIndex = 0; |
---|
800 | |
---|
801 | return runBasicTest(attrTypes, numAtts, attrIndex, |
---|
802 | classType, |
---|
803 | missingLevel, attributeMissing, classMissing, |
---|
804 | numTrain, numTest, numClasses, |
---|
805 | accepts); |
---|
806 | } |
---|
807 | |
---|
808 | /** |
---|
809 | * Checks basic estimation of one attribute of the scheme, for simple non-troublesome |
---|
810 | * datasets. |
---|
811 | * |
---|
812 | * @param attrTypes the types the estimator can work with |
---|
813 | * @param classType the class type (NOMINAL, NUMERIC, etc.) |
---|
814 | */ |
---|
815 | protected void canSplitUpClass(AttrTypes attrTypes, int classType) { |
---|
816 | |
---|
817 | if (attrTypes.nominal) |
---|
818 | canSplitUpClass(Attribute.NOMINAL, classType); |
---|
819 | if (attrTypes.numeric) |
---|
820 | canSplitUpClass(Attribute.NUMERIC, classType); |
---|
821 | } |
---|
822 | |
---|
823 | /** |
---|
824 | * Checks basic estimation of one attribute of the scheme, for simple non-troublesome |
---|
825 | * datasets. |
---|
826 | * |
---|
827 | * @param attrType the type of the estimator |
---|
828 | * @param classType the class type (NOMINAL, NUMERIC, etc.) |
---|
829 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
830 | * was acceptable |
---|
831 | */ |
---|
832 | protected boolean[] canSplitUpClass(int attrType, int classType) { |
---|
833 | |
---|
834 | boolean[] result = new boolean[2]; |
---|
835 | |
---|
836 | FastVector accepts = new FastVector(); |
---|
837 | accepts.addElement("not in classpath"); |
---|
838 | |
---|
839 | // supervised is ignored, no supervised estimators used yet |
---|
840 | print("split per class type "); |
---|
841 | printAttributeSummary(attrType, Attribute.NOMINAL); |
---|
842 | print("..."); |
---|
843 | |
---|
844 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
845 | numClasses = 2; |
---|
846 | boolean attributeMissing = false, classMissing = false; |
---|
847 | int numAtts = 3, attrIndex = 0, classIndex = 1; |
---|
848 | Instances train = null; |
---|
849 | Vector test; |
---|
850 | Estimator estimator = null; |
---|
851 | boolean built = false; |
---|
852 | |
---|
853 | try { |
---|
854 | AttrTypes at = new AttrTypes(attrType); |
---|
855 | train = makeTestDataset(42, numTrain, numAtts, at, |
---|
856 | numClasses, classType, classIndex); |
---|
857 | |
---|
858 | // prepare training data set and test value list |
---|
859 | test = makeTestValueList(24, numTest, train, attrIndex, |
---|
860 | attrType); |
---|
861 | |
---|
862 | estimator = Estimator.makeCopies(getEstimator(), 1)[0]; |
---|
863 | } catch (Exception ex) { |
---|
864 | ex.printStackTrace(); |
---|
865 | throw new Error("Error setting up for tests: " + ex.getMessage()); |
---|
866 | } |
---|
867 | try { |
---|
868 | estimator.addValues(train, attrIndex, classType, classIndex); |
---|
869 | built = true; |
---|
870 | |
---|
871 | testWithTestValues(estimator, test); |
---|
872 | |
---|
873 | println("yes"); |
---|
874 | result[0] = true; |
---|
875 | } |
---|
876 | catch (Exception ex) { |
---|
877 | boolean acceptable = false; |
---|
878 | String msg; |
---|
879 | if (ex.getMessage() == null) |
---|
880 | msg = ""; |
---|
881 | else |
---|
882 | msg = ex.getMessage().toLowerCase(); |
---|
883 | if (msg.indexOf("not in classpath") > -1) |
---|
884 | m_ClasspathProblems = true; |
---|
885 | |
---|
886 | for (int i = 0; i < accepts.size(); i++) { |
---|
887 | if (msg.indexOf((String)accepts.elementAt(i)) >= 0) { |
---|
888 | acceptable = true; |
---|
889 | } |
---|
890 | } |
---|
891 | |
---|
892 | println("no" + (acceptable ? " (OK error message)" : "")); |
---|
893 | result[1] = acceptable; |
---|
894 | |
---|
895 | |
---|
896 | if (m_Debug) { |
---|
897 | println("\n=== Full Report ==="); |
---|
898 | print("Problem during"); |
---|
899 | if (built) { |
---|
900 | print(" testing"); |
---|
901 | } else { |
---|
902 | print(" training"); |
---|
903 | } |
---|
904 | println(": " + ex.getMessage() + "\n"); |
---|
905 | if (!acceptable) { |
---|
906 | if (accepts.size() > 0) { |
---|
907 | print("Error message doesn't mention "); |
---|
908 | for (int i = 0; i < accepts.size(); i++) { |
---|
909 | if (i != 0) { |
---|
910 | print(" or "); |
---|
911 | } |
---|
912 | print('"' + (String)accepts.elementAt(i) + '"'); |
---|
913 | } |
---|
914 | } |
---|
915 | println("here are the datasets:\n"); |
---|
916 | println("=== Train Dataset ===\n" |
---|
917 | + train.toString() + "\n"); |
---|
918 | println("=== Test Dataset ===\n" |
---|
919 | + test.toString() + "\n\n"); |
---|
920 | } |
---|
921 | |
---|
922 | } |
---|
923 | } |
---|
924 | return result; |
---|
925 | } |
---|
926 | |
---|
927 | /** |
---|
928 | * Checks whether nominal schemes can handle more than two classes. |
---|
929 | * If a scheme is only designed for two-class problems it should |
---|
930 | * throw an appropriate exception for multi-class problems. |
---|
931 | * |
---|
932 | * @param attrTypes attribute types the estimator excepts |
---|
933 | * @param numClasses the number of classes to test |
---|
934 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
935 | * was acceptable |
---|
936 | */ |
---|
937 | protected boolean[] canHandleNClasses(AttrTypes attrTypes, int numClasses) { |
---|
938 | |
---|
939 | print("more than two class problems"); |
---|
940 | printAttributeSummary(attrTypes, Attribute.NOMINAL); |
---|
941 | print("..."); |
---|
942 | |
---|
943 | FastVector accepts = new FastVector(); |
---|
944 | accepts.addElement("number"); |
---|
945 | accepts.addElement("class"); |
---|
946 | |
---|
947 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
948 | missingLevel = 0; |
---|
949 | boolean attributeMissing = false, classMissing = false; |
---|
950 | int numAttr = 1, attrIndex = 0; |
---|
951 | |
---|
952 | return runBasicTest(attrTypes, |
---|
953 | numAttr, attrIndex, |
---|
954 | Attribute.NOMINAL, |
---|
955 | missingLevel, attributeMissing, classMissing, |
---|
956 | numTrain, numTest, numClasses, |
---|
957 | accepts); |
---|
958 | } |
---|
959 | |
---|
960 | /** |
---|
961 | * Checks whether the scheme can handle class attributes as Nth attribute. |
---|
962 | * |
---|
963 | * @param attrTypes the attribute types the estimator accepts |
---|
964 | * @param numAtts of attributes |
---|
965 | * @param attrIndex the index of the attribute |
---|
966 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
967 | * @param classIndex the index of the class attribute (0-based, -1 means last attribute) |
---|
968 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
969 | * was acceptable |
---|
970 | * @see TestInstances#CLASS_IS_LAST |
---|
971 | */ |
---|
972 | protected boolean[] canHandleClassAsNthAttribute(AttrTypes attrTypes, |
---|
973 | int numAtts, |
---|
974 | int attrIndex, |
---|
975 | int classType, |
---|
976 | int classIndex) { |
---|
977 | |
---|
978 | if (classIndex == TestInstances.CLASS_IS_LAST) |
---|
979 | print("class attribute as last attribute"); |
---|
980 | else |
---|
981 | print("class attribute as " + (classIndex + 1) + ". attribute"); |
---|
982 | printAttributeSummary(attrTypes, classType); |
---|
983 | print("..."); |
---|
984 | FastVector accepts = new FastVector(); |
---|
985 | int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, |
---|
986 | missingLevel = 0; |
---|
987 | boolean attributeMissing = false, classMissing = false; |
---|
988 | |
---|
989 | return runBasicTest(attrTypes, |
---|
990 | numAtts, attrIndex, |
---|
991 | classType, classIndex, |
---|
992 | missingLevel, attributeMissing, classMissing, |
---|
993 | numTrain, numTest, numClasses, |
---|
994 | accepts); |
---|
995 | } |
---|
996 | |
---|
997 | /** |
---|
998 | * Checks whether the scheme can handle zero training instances. |
---|
999 | * |
---|
1000 | * @param attrTypes attribute types that can be estimated |
---|
1001 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1002 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
1003 | * was acceptable |
---|
1004 | */ |
---|
1005 | protected boolean[] canHandleZeroTraining(AttrTypes attrTypes, int classType) { |
---|
1006 | |
---|
1007 | print("handle zero training instances"); |
---|
1008 | printAttributeSummary(attrTypes, classType); |
---|
1009 | |
---|
1010 | print("..."); |
---|
1011 | FastVector accepts = new FastVector(); |
---|
1012 | accepts.addElement("train"); |
---|
1013 | accepts.addElement("value"); |
---|
1014 | int numTrain = 0, numTest = getNumInstances(), numClasses = 2, |
---|
1015 | missingLevel = 0; |
---|
1016 | boolean attributeMissing = false, classMissing = false; |
---|
1017 | int numAtts = 1; |
---|
1018 | int attrIndex = 0; |
---|
1019 | return runBasicTest( |
---|
1020 | attrTypes, numAtts, attrIndex, |
---|
1021 | classType, |
---|
1022 | missingLevel, attributeMissing, classMissing, |
---|
1023 | numTrain, numTest, numClasses, |
---|
1024 | accepts); |
---|
1025 | } |
---|
1026 | |
---|
1027 | /** |
---|
1028 | * Checks whether the scheme correctly initialises models when |
---|
1029 | * buildEstimator is called. This test calls buildEstimator with |
---|
1030 | * one training dataset and records performance on a test set. |
---|
1031 | * buildEstimator is then called on a training set with different |
---|
1032 | * structure, and then again with the original training set. The |
---|
1033 | * performance on the test set is compared with the original results |
---|
1034 | * and any performance difference noted as incorrect build initialisation. |
---|
1035 | * |
---|
1036 | * @param attrTypes attribute types that can be estimated |
---|
1037 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1038 | * @return index 0 is true if the test was passed, index 1 is true if the |
---|
1039 | * scheme performs worse than ZeroR, but without error (index 0 is |
---|
1040 | * false) |
---|
1041 | */ |
---|
1042 | protected boolean[] correctBuildInitialisation(AttrTypes attrTypes, |
---|
1043 | int classType) { |
---|
1044 | |
---|
1045 | boolean[] result = new boolean[2]; |
---|
1046 | |
---|
1047 | print("correct initialisation during buildEstimator"); |
---|
1048 | printAttributeSummary(attrTypes, classType); |
---|
1049 | |
---|
1050 | print("..."); |
---|
1051 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
1052 | numClasses = 2, missingLevel = 0; |
---|
1053 | boolean attributeMissing = false, classMissing = false; |
---|
1054 | |
---|
1055 | Instances train1 = null; |
---|
1056 | Instances test1 = null; |
---|
1057 | Instances train2 = null; |
---|
1058 | Instances test2 = null; |
---|
1059 | Estimator estimator = null; |
---|
1060 | Estimator estimator1 = null; |
---|
1061 | |
---|
1062 | boolean built = false; |
---|
1063 | int stage = 0; |
---|
1064 | int attrIndex1 = 1; |
---|
1065 | int attrIndex2 = 2; |
---|
1066 | |
---|
1067 | try { |
---|
1068 | |
---|
1069 | // Make two sets of train/test splits with different |
---|
1070 | // numbers of attributes |
---|
1071 | train1 = makeTestDataset(42, numTrain, 2, attrTypes, |
---|
1072 | numClasses, |
---|
1073 | classType); |
---|
1074 | train2 = makeTestDataset(84, numTrain, 3, attrTypes, |
---|
1075 | numClasses, |
---|
1076 | classType); |
---|
1077 | if (missingLevel > 0) { |
---|
1078 | addMissing(train1, missingLevel, attributeMissing, classMissing, attrIndex1); |
---|
1079 | addMissing(train2, missingLevel, attributeMissing, classMissing, attrIndex2); |
---|
1080 | } |
---|
1081 | |
---|
1082 | estimator = Estimator.makeCopies(getEstimator(), 1)[0]; |
---|
1083 | } catch (Exception ex) { |
---|
1084 | throw new Error("Error setting up for tests: " + ex.getMessage()); |
---|
1085 | } |
---|
1086 | try { |
---|
1087 | //TESTING?? |
---|
1088 | stage = 0; |
---|
1089 | estimator.addValues(train1, attrIndex1); |
---|
1090 | built = true; |
---|
1091 | |
---|
1092 | estimator1 = estimator.makeCopies(getEstimator(), 1)[0]; |
---|
1093 | |
---|
1094 | stage = 1; |
---|
1095 | built = false; |
---|
1096 | estimator.addValues(train2, attrIndex2); |
---|
1097 | built = true; |
---|
1098 | |
---|
1099 | stage = 2; |
---|
1100 | built = false; |
---|
1101 | estimator.addValues(train1, attrIndex1); |
---|
1102 | built = true; |
---|
1103 | |
---|
1104 | stage = 3; |
---|
1105 | if (!estimator.equals(estimator1)) { |
---|
1106 | if (m_Debug) { |
---|
1107 | println("\n=== Full report ===\n" |
---|
1108 | + "\nFirst build estimator\n"+ |
---|
1109 | estimator.toString() + "\n\n"); |
---|
1110 | println("\nSecond build estimator\n"+ |
---|
1111 | estimator.toString() + "\n\n"); |
---|
1112 | } |
---|
1113 | throw new Exception("Results differ between buildEstimator calls"); |
---|
1114 | } |
---|
1115 | println("yes"); |
---|
1116 | result[0] = true; |
---|
1117 | |
---|
1118 | if (false && m_Debug) { |
---|
1119 | println("\n=== Full report ===\n" |
---|
1120 | + "\nFirst buildEstimator()" |
---|
1121 | + "\n\n"); |
---|
1122 | println("\nSecond buildEstimator()" |
---|
1123 | + "\n\n"); |
---|
1124 | } |
---|
1125 | } |
---|
1126 | catch (Exception ex) { |
---|
1127 | String msg = ex.getMessage().toLowerCase(); |
---|
1128 | if (msg.indexOf("worse than zeror") >= 0) { |
---|
1129 | println("warning: performs worse than ZeroR"); |
---|
1130 | result[0] = true; |
---|
1131 | result[1] = true; |
---|
1132 | } else { |
---|
1133 | println("no"); |
---|
1134 | result[0] = false; |
---|
1135 | } |
---|
1136 | if (m_Debug) { |
---|
1137 | println("\n=== Full Report ==="); |
---|
1138 | print("Problem during"); |
---|
1139 | if (built) { |
---|
1140 | print(" testing"); |
---|
1141 | } else { |
---|
1142 | print(" training"); |
---|
1143 | } |
---|
1144 | switch (stage) { |
---|
1145 | case 0: |
---|
1146 | print(" of dataset 1"); |
---|
1147 | break; |
---|
1148 | case 1: |
---|
1149 | print(" of dataset 2"); |
---|
1150 | break; |
---|
1151 | case 2: |
---|
1152 | print(" of dataset 1 (2nd build)"); |
---|
1153 | break; |
---|
1154 | case 3: |
---|
1155 | print(", comparing results from builds of dataset 1"); |
---|
1156 | break; |
---|
1157 | } |
---|
1158 | println(": " + ex.getMessage() + "\n"); |
---|
1159 | println("here are the datasets:\n"); |
---|
1160 | println("=== Train1 Dataset ===\n" |
---|
1161 | + train1.toString() + "\n"); |
---|
1162 | println("=== Test1 Dataset ===\n" |
---|
1163 | + test1.toString() + "\n\n"); |
---|
1164 | println("=== Train2 Dataset ===\n" |
---|
1165 | + train2.toString() + "\n"); |
---|
1166 | println("=== Test2 Dataset ===\n" |
---|
1167 | + test2.toString() + "\n\n"); |
---|
1168 | } |
---|
1169 | } |
---|
1170 | |
---|
1171 | return result; |
---|
1172 | } |
---|
1173 | |
---|
1174 | /** |
---|
1175 | * Checks basic missing value handling of the scheme. If the missing |
---|
1176 | * values cause an exception to be thrown by the scheme, this will be |
---|
1177 | * recorded. |
---|
1178 | * |
---|
1179 | * @param attrTypes attribute types that can be estimated |
---|
1180 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1181 | * @param attributeMissing true if the missing values may be in |
---|
1182 | * the attributes |
---|
1183 | * @param classMissing true if the missing values may be in the class |
---|
1184 | * @param missingLevel the percentage of missing values |
---|
1185 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
1186 | * was acceptable |
---|
1187 | */ |
---|
1188 | protected boolean[] canHandleMissing(AttrTypes attrTypes, |
---|
1189 | int classType, |
---|
1190 | boolean attributeMissing, |
---|
1191 | boolean classMissing, |
---|
1192 | int missingLevel) { |
---|
1193 | |
---|
1194 | if (missingLevel == 100) |
---|
1195 | print("100% "); |
---|
1196 | print("missing"); |
---|
1197 | if (attributeMissing) { |
---|
1198 | print(" attribute"); |
---|
1199 | if (classMissing) |
---|
1200 | print(" and"); |
---|
1201 | } |
---|
1202 | if (classMissing) |
---|
1203 | print(" class"); |
---|
1204 | print(" values"); |
---|
1205 | printAttributeSummary(attrTypes, classType); |
---|
1206 | |
---|
1207 | print("..."); |
---|
1208 | FastVector accepts = new FastVector(); |
---|
1209 | accepts.addElement("missing"); |
---|
1210 | accepts.addElement("value"); |
---|
1211 | accepts.addElement("train"); |
---|
1212 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
1213 | numClasses = 2; |
---|
1214 | |
---|
1215 | int numAtts = 1, attrIndex = 0; |
---|
1216 | return runBasicTest(attrTypes, |
---|
1217 | numAtts, attrIndex, |
---|
1218 | classType, |
---|
1219 | missingLevel, attributeMissing, classMissing, |
---|
1220 | numTrain, numTest, numClasses, |
---|
1221 | accepts); |
---|
1222 | } |
---|
1223 | |
---|
1224 | /** |
---|
1225 | * Checks whether an incremental scheme produces the same model when |
---|
1226 | * trained incrementally as when batch trained. The model itself |
---|
1227 | * cannot be compared, so we compare the evaluation on test data |
---|
1228 | * for both models. It is possible to get a false positive on this |
---|
1229 | * test (likelihood depends on the estimator). |
---|
1230 | * |
---|
1231 | * @param attrTypes attribute types that can be estimated |
---|
1232 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1233 | * @return index 0 is true if the test was passed |
---|
1234 | */ |
---|
1235 | protected boolean[] incrementingEquality(AttrTypes attrTypes, |
---|
1236 | int classType) { |
---|
1237 | |
---|
1238 | print("incremental training produces the same results" |
---|
1239 | + " as batch training"); |
---|
1240 | printAttributeSummary(attrTypes, classType); |
---|
1241 | |
---|
1242 | print("..."); |
---|
1243 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
1244 | numClasses = 2, missingLevel = 0; |
---|
1245 | boolean attributeMissing = false, classMissing = false; |
---|
1246 | |
---|
1247 | boolean[] result = new boolean[2]; |
---|
1248 | Instances train = null; |
---|
1249 | Estimator [] estimators = null; |
---|
1250 | boolean built = false; |
---|
1251 | int attrIndex = 0; |
---|
1252 | Vector test; |
---|
1253 | try { |
---|
1254 | train = makeTestDataset(42, numTrain, 1, attrTypes, |
---|
1255 | numClasses, |
---|
1256 | classType |
---|
1257 | ); |
---|
1258 | |
---|
1259 | // prepare training data set and test value list |
---|
1260 | test = makeTestValueList(24, numTest, train, attrIndex, |
---|
1261 | attrTypes.getSetType()); |
---|
1262 | |
---|
1263 | if (missingLevel > 0) { |
---|
1264 | addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex); |
---|
1265 | } |
---|
1266 | estimators = Estimator.makeCopies(getEstimator(), 2); |
---|
1267 | estimators[0].addValues(train, attrIndex); |
---|
1268 | } catch (Exception ex) { |
---|
1269 | throw new Error("Error setting up for tests: " + ex.getMessage()); |
---|
1270 | } |
---|
1271 | try { |
---|
1272 | for (int i = 0; i < train.numInstances(); i++) { |
---|
1273 | ((IncrementalEstimator)estimators[1]).addValue(train.instance(i).value(attrIndex), 1.0); |
---|
1274 | } |
---|
1275 | built = true; |
---|
1276 | if (!estimators[0].equals(estimators[1])) { |
---|
1277 | println("no"); |
---|
1278 | result[0] = false; |
---|
1279 | |
---|
1280 | if (m_Debug) { |
---|
1281 | println("\n=== Full Report ==="); |
---|
1282 | println("Results differ between batch and " |
---|
1283 | + "incrementally built models.\n" |
---|
1284 | + "Depending on the estimator, this may be OK"); |
---|
1285 | println("Here are the results:\n"); |
---|
1286 | println("batch built results\n" + estimators[0].toString()); |
---|
1287 | println("incrementally built results\n" + estimators[1].toString()); |
---|
1288 | println("Here are the datasets:\n"); |
---|
1289 | println("=== Train Dataset ===\n" |
---|
1290 | + train.toString() + "\n"); |
---|
1291 | println("=== Test Dataset ===\n" |
---|
1292 | + test.toString() + "\n\n"); |
---|
1293 | } |
---|
1294 | } |
---|
1295 | else { |
---|
1296 | println("yes"); |
---|
1297 | result[0] = true; |
---|
1298 | } |
---|
1299 | } catch (Exception ex) { |
---|
1300 | result[0] = false; |
---|
1301 | |
---|
1302 | print("Problem during"); |
---|
1303 | if (built) |
---|
1304 | print(" testing"); |
---|
1305 | else |
---|
1306 | print(" training"); |
---|
1307 | println(": " + ex.getMessage() + "\n"); |
---|
1308 | } |
---|
1309 | |
---|
1310 | return result; |
---|
1311 | } |
---|
1312 | |
---|
1313 | |
---|
1314 | /** |
---|
1315 | * Checks whether the estimator can handle instance weights. |
---|
1316 | * This test compares the estimator performance on two datasets |
---|
1317 | * that are identical except for the training weights. If the |
---|
1318 | * results change, then the estimator must be using the weights. It |
---|
1319 | * may be possible to get a false positive from this test if the |
---|
1320 | * weight changes aren't significant enough to induce a change |
---|
1321 | * in estimator performance (but the weights are chosen to minimize |
---|
1322 | * the likelihood of this). |
---|
1323 | * |
---|
1324 | * @param attrTypes attribute types that can be estimated |
---|
1325 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1326 | * @return index 0 true if the test was passed |
---|
1327 | */ |
---|
1328 | protected boolean[] instanceWeights(AttrTypes attrTypes, |
---|
1329 | int classType) { |
---|
1330 | |
---|
1331 | print("estimator uses instance weights"); |
---|
1332 | printAttributeSummary(attrTypes, classType); |
---|
1333 | |
---|
1334 | print("..."); |
---|
1335 | |
---|
1336 | int numTrain = 2 * getNumInstances(), numTest = getNumInstances(), |
---|
1337 | numClasses = 2, missingLevel = 0; |
---|
1338 | boolean attributeMissing = false, classMissing = false; |
---|
1339 | |
---|
1340 | boolean[] result = new boolean[2]; |
---|
1341 | Instances train = null; |
---|
1342 | Vector test = null; |
---|
1343 | Estimator [] estimators = null; |
---|
1344 | |
---|
1345 | Vector resultProbsO = null; |
---|
1346 | Vector resultProbsW = null; |
---|
1347 | boolean built = false; |
---|
1348 | boolean evalFail = false; |
---|
1349 | int attrIndex = 0; |
---|
1350 | try { |
---|
1351 | train = makeTestDataset(42, numTrain, 1, |
---|
1352 | attrTypes, numClasses, |
---|
1353 | classType); |
---|
1354 | |
---|
1355 | // prepare training data set and test value list |
---|
1356 | test = makeTestValueList(24, numTest, train, attrIndex, |
---|
1357 | attrTypes.getSetType()); |
---|
1358 | |
---|
1359 | if (missingLevel > 0) { |
---|
1360 | addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex); |
---|
1361 | } |
---|
1362 | |
---|
1363 | estimators = Estimator.makeCopies(getEstimator(), 2); |
---|
1364 | |
---|
1365 | estimators[0].addValues(train, attrIndex); |
---|
1366 | resultProbsO = testWithTestValues(estimators[0], test); |
---|
1367 | |
---|
1368 | } catch (Exception ex) { |
---|
1369 | throw new Error("Error setting up for tests: " + ex.getMessage()); |
---|
1370 | } |
---|
1371 | try { |
---|
1372 | |
---|
1373 | // Now modify instance weights and re-built |
---|
1374 | for (int i = 0; i < train.numInstances(); i++) { |
---|
1375 | train.instance(i).setWeight(0); |
---|
1376 | } |
---|
1377 | Random random = new Random(1); |
---|
1378 | for (int i = 0; i < train.numInstances() / 2; i++) { |
---|
1379 | int inst = Math.abs(random.nextInt()) % train.numInstances(); |
---|
1380 | int weight = Math.abs(random.nextInt()) % 10 + 1; |
---|
1381 | train.instance(inst).setWeight(weight); |
---|
1382 | } |
---|
1383 | estimators[1].addValues(train, attrIndex); |
---|
1384 | resultProbsW = testWithTestValues(estimators[1], test); |
---|
1385 | |
---|
1386 | built = true; |
---|
1387 | if (resultProbsO.equals(resultProbsW)) { |
---|
1388 | // println("no"); |
---|
1389 | evalFail = true; |
---|
1390 | throw new Exception("evalFail"); |
---|
1391 | } |
---|
1392 | |
---|
1393 | println("yes"); |
---|
1394 | result[0] = true; |
---|
1395 | } catch (Exception ex) { |
---|
1396 | println("no"); |
---|
1397 | result[0] = false; |
---|
1398 | |
---|
1399 | if (m_Debug) { |
---|
1400 | println("\n=== Full Report ==="); |
---|
1401 | |
---|
1402 | if (evalFail) { |
---|
1403 | println("Results don't differ between non-weighted and " |
---|
1404 | + "weighted instance models."); |
---|
1405 | println("Here are the results:\n"); |
---|
1406 | println(probsToString(resultProbsO)); |
---|
1407 | } else { |
---|
1408 | print("Problem during"); |
---|
1409 | if (built) { |
---|
1410 | print(" testing"); |
---|
1411 | } else { |
---|
1412 | print(" training"); |
---|
1413 | } |
---|
1414 | println(": " + ex.getMessage() + "\n"); |
---|
1415 | } |
---|
1416 | println("Here are the datasets:\n"); |
---|
1417 | println("=== Train Dataset ===\n" |
---|
1418 | + train.toString() + "\n"); |
---|
1419 | println("=== Train Weights ===\n"); |
---|
1420 | for (int i = 0; i < train.numInstances(); i++) { |
---|
1421 | println(" " + (i + 1) |
---|
1422 | + " " + train.instance(i).weight()); |
---|
1423 | } |
---|
1424 | println("=== Test Dataset ===\n" |
---|
1425 | + test.toString() + "\n\n"); |
---|
1426 | println("(test weights all 1.0\n"); |
---|
1427 | } |
---|
1428 | } |
---|
1429 | |
---|
1430 | return result; |
---|
1431 | } |
---|
1432 | |
---|
1433 | /** |
---|
1434 | * Checks whether the scheme alters the training dataset during |
---|
1435 | * training. If the scheme needs to modify the training |
---|
1436 | * data it should take a copy of the training data. Currently checks |
---|
1437 | * for changes to header structure, number of instances, order of |
---|
1438 | * instances, instance weights. |
---|
1439 | * |
---|
1440 | * @param attrTypes attribute types that can be estimated |
---|
1441 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1442 | * @param attributeMissing true if we know the estimator can handle |
---|
1443 | * (at least) moderate missing attribute values |
---|
1444 | * @param classMissing true if we know the estimator can handle |
---|
1445 | * (at least) moderate missing class values |
---|
1446 | * @return index 0 is true if the test was passed |
---|
1447 | */ |
---|
1448 | protected boolean[] datasetIntegrity(AttrTypes attrTypes, |
---|
1449 | int classType, |
---|
1450 | boolean attributeMissing, |
---|
1451 | boolean classMissing) { |
---|
1452 | |
---|
1453 | Estimator estimator = null; |
---|
1454 | print("estimator doesn't alter original datasets"); |
---|
1455 | printAttributeSummary(attrTypes, classType); |
---|
1456 | print("..."); |
---|
1457 | int numTrain = getNumInstances(), numTest = getNumInstances(), |
---|
1458 | numClasses = 2, missingLevel = 100; |
---|
1459 | |
---|
1460 | boolean[] result = new boolean[2]; |
---|
1461 | Instances train = null; |
---|
1462 | boolean built = false; |
---|
1463 | try { |
---|
1464 | train = makeTestDataset(42, numTrain, 1, attrTypes, |
---|
1465 | numClasses, |
---|
1466 | classType); |
---|
1467 | int attrIndex = 0; |
---|
1468 | |
---|
1469 | if (missingLevel > 0) { |
---|
1470 | addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex); |
---|
1471 | } |
---|
1472 | estimator = Estimator.makeCopies(getEstimator(), 1)[0]; |
---|
1473 | } catch (Exception ex) { |
---|
1474 | throw new Error("Error setting up for tests: " + ex.getMessage()); |
---|
1475 | } |
---|
1476 | try { |
---|
1477 | Instances trainCopy = new Instances(train); |
---|
1478 | int attrIndex = 0; |
---|
1479 | estimator.addValues(trainCopy, attrIndex); |
---|
1480 | compareDatasets(train, trainCopy); |
---|
1481 | built = true; |
---|
1482 | |
---|
1483 | println("yes"); |
---|
1484 | result[0] = true; |
---|
1485 | } catch (Exception ex) { |
---|
1486 | println("no"); |
---|
1487 | result[0] = false; |
---|
1488 | |
---|
1489 | if (m_Debug) { |
---|
1490 | println("\n=== Full Report ==="); |
---|
1491 | print("Problem during"); |
---|
1492 | if (built) { |
---|
1493 | print(" testing"); |
---|
1494 | } else { |
---|
1495 | print(" training"); |
---|
1496 | } |
---|
1497 | println(": " + ex.getMessage() + "\n"); |
---|
1498 | println("Here are the datasets:\n"); |
---|
1499 | println("=== Train Dataset ===\n" |
---|
1500 | + train.toString() + "\n"); |
---|
1501 | } |
---|
1502 | } |
---|
1503 | |
---|
1504 | return result; |
---|
1505 | } |
---|
1506 | |
---|
1507 | /** |
---|
1508 | * Runs a text on the datasets with the given characteristics. |
---|
1509 | * |
---|
1510 | * @param attrTypes attribute types that can be estimated |
---|
1511 | * @param numAtts number of attributes |
---|
1512 | * @param attrIndex attribute index |
---|
1513 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1514 | * @param missingLevel the percentage of missing values |
---|
1515 | * @param attributeMissing true if the missing values may be in |
---|
1516 | * the attributes |
---|
1517 | * @param classMissing true if the missing values may be in the class |
---|
1518 | * @param numTrain the number of instances in the training set |
---|
1519 | * @param numTest the number of instaces in the test set |
---|
1520 | * @param numClasses the number of classes |
---|
1521 | * @param accepts the acceptable string in an exception |
---|
1522 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
1523 | * was acceptable |
---|
1524 | */ |
---|
1525 | protected boolean[] runBasicTest(AttrTypes attrTypes, |
---|
1526 | int numAtts, |
---|
1527 | int attrIndex, |
---|
1528 | int classType, |
---|
1529 | int missingLevel, |
---|
1530 | boolean attributeMissing, |
---|
1531 | boolean classMissing, |
---|
1532 | int numTrain, |
---|
1533 | int numTest, |
---|
1534 | int numClasses, |
---|
1535 | FastVector accepts) { |
---|
1536 | |
---|
1537 | return runBasicTest(attrTypes, |
---|
1538 | numAtts, |
---|
1539 | attrIndex, |
---|
1540 | classType, |
---|
1541 | TestInstances.CLASS_IS_LAST, |
---|
1542 | missingLevel, |
---|
1543 | attributeMissing, |
---|
1544 | classMissing, |
---|
1545 | numTrain, |
---|
1546 | numTest, |
---|
1547 | numClasses, |
---|
1548 | accepts); |
---|
1549 | } |
---|
1550 | |
---|
1551 | /** |
---|
1552 | * Runs a text on the datasets with the given characteristics. |
---|
1553 | * |
---|
1554 | * @param attrTypes attribute types that can be estimated |
---|
1555 | * @param numAtts number of attributes |
---|
1556 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1557 | * @param classIndex the attribute index of the class |
---|
1558 | * @param missingLevel the percentage of missing values |
---|
1559 | * @param attributeMissing true if the missing values may be in |
---|
1560 | * the attributes |
---|
1561 | * @param classMissing true if the missing values may be in the class |
---|
1562 | * @param numTrain the number of instances in the training set |
---|
1563 | * @param numTest the number of instaces in the test set |
---|
1564 | * @param numClasses the number of classes |
---|
1565 | * @param accepts the acceptable string in an exception |
---|
1566 | * @return index 0 is true if the test was passed, index 1 is true if test |
---|
1567 | * was acceptable |
---|
1568 | */ |
---|
1569 | protected boolean[] runBasicTest(AttrTypes attrTypes, |
---|
1570 | int numAtts, |
---|
1571 | int attrIndex, |
---|
1572 | int classType, |
---|
1573 | int classIndex, |
---|
1574 | int missingLevel, |
---|
1575 | boolean attributeMissing, |
---|
1576 | boolean classMissing, |
---|
1577 | int numTrain, |
---|
1578 | int numTest, |
---|
1579 | int numClasses, |
---|
1580 | FastVector accepts) { |
---|
1581 | |
---|
1582 | boolean[] result = new boolean[2]; |
---|
1583 | Instances train = null; |
---|
1584 | Vector test = null; |
---|
1585 | Estimator estimator = null; |
---|
1586 | boolean built = false; |
---|
1587 | |
---|
1588 | try { |
---|
1589 | train = makeTestDataset(42, numTrain, numAtts, attrTypes, |
---|
1590 | numClasses, |
---|
1591 | classType, |
---|
1592 | classIndex); |
---|
1593 | |
---|
1594 | // prepare training data set and test value list |
---|
1595 | if (numTrain > 0) { |
---|
1596 | test = makeTestValueList(24, numTest, train, attrIndex, |
---|
1597 | attrTypes.getSetType()); |
---|
1598 | |
---|
1599 | } else { |
---|
1600 | double min = -10.0; |
---|
1601 | double max = 8.0; |
---|
1602 | test = makeTestValueList(24, numTest, min, max, |
---|
1603 | attrTypes.getSetType()); |
---|
1604 | } |
---|
1605 | |
---|
1606 | if (missingLevel > 0) { |
---|
1607 | addMissing(train, missingLevel, attributeMissing, classMissing, attrIndex); |
---|
1608 | } |
---|
1609 | estimator = Estimator.makeCopies(getEstimator(), 1)[0]; |
---|
1610 | } catch (Exception ex) { |
---|
1611 | ex.printStackTrace(); |
---|
1612 | throw new Error("Error setting up for tests: " + ex.getMessage()); |
---|
1613 | } |
---|
1614 | try { |
---|
1615 | estimator.addValues(train, attrIndex); |
---|
1616 | built = true; |
---|
1617 | |
---|
1618 | testWithTestValues(estimator, test); |
---|
1619 | |
---|
1620 | println("yes"); |
---|
1621 | result[0] = true; |
---|
1622 | } |
---|
1623 | catch (Exception ex) { |
---|
1624 | boolean acceptable = false; |
---|
1625 | String msg; |
---|
1626 | if (ex.getMessage() == null) |
---|
1627 | msg = ""; |
---|
1628 | else |
---|
1629 | msg = ex.getMessage().toLowerCase(); |
---|
1630 | if (msg.indexOf("not in classpath") > -1) |
---|
1631 | m_ClasspathProblems = true; |
---|
1632 | |
---|
1633 | for (int i = 0; i < accepts.size(); i++) { |
---|
1634 | if (msg.indexOf((String)accepts.elementAt(i)) >= 0) { |
---|
1635 | acceptable = true; |
---|
1636 | } |
---|
1637 | } |
---|
1638 | |
---|
1639 | println("no" + (acceptable ? " (OK error message)" : "")); |
---|
1640 | result[1] = acceptable; |
---|
1641 | |
---|
1642 | |
---|
1643 | if (m_Debug) { |
---|
1644 | println("\n=== Full Report ==="); |
---|
1645 | print("Problem during"); |
---|
1646 | if (built) { |
---|
1647 | print(" testing"); |
---|
1648 | } else { |
---|
1649 | print(" training"); |
---|
1650 | } |
---|
1651 | println(": " + ex.getMessage() + "\n"); |
---|
1652 | if (!acceptable) { |
---|
1653 | if (accepts.size() > 0) { |
---|
1654 | print("Error message doesn't mention "); |
---|
1655 | for (int i = 0; i < accepts.size(); i++) { |
---|
1656 | if (i != 0) { |
---|
1657 | print(" or "); |
---|
1658 | } |
---|
1659 | print('"' + (String)accepts.elementAt(i) + '"'); |
---|
1660 | } |
---|
1661 | } |
---|
1662 | println("here are the datasets:\n"); |
---|
1663 | println("=== Train Dataset ===\n" |
---|
1664 | + train.toString() + "\n"); |
---|
1665 | println("=== Test Dataset ===\n" |
---|
1666 | + test.toString() + "\n\n"); |
---|
1667 | } |
---|
1668 | |
---|
1669 | } |
---|
1670 | } |
---|
1671 | return result; |
---|
1672 | } |
---|
1673 | |
---|
1674 | /** |
---|
1675 | * Compare two datasets to see if they differ. |
---|
1676 | * |
---|
1677 | * @param data1 one set of instances |
---|
1678 | * @param data2 the other set of instances |
---|
1679 | * @throws Exception if the datasets differ |
---|
1680 | */ |
---|
1681 | protected void compareDatasets(Instances data1, Instances data2) |
---|
1682 | throws Exception { |
---|
1683 | if (!data2.equalHeaders(data1)) { |
---|
1684 | throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1)); |
---|
1685 | } |
---|
1686 | if (!(data2.numInstances() == data1.numInstances())) { |
---|
1687 | throw new Exception("number of instances has changed"); |
---|
1688 | } |
---|
1689 | for (int i = 0; i < data2.numInstances(); i++) { |
---|
1690 | Instance orig = data1.instance(i); |
---|
1691 | Instance copy = data2.instance(i); |
---|
1692 | for (int j = 0; j < orig.numAttributes(); j++) { |
---|
1693 | if (orig.isMissing(j)) { |
---|
1694 | if (!copy.isMissing(j)) { |
---|
1695 | throw new Exception("instances have changed"); |
---|
1696 | } |
---|
1697 | } else if (orig.value(j) != copy.value(j)) { |
---|
1698 | throw new Exception("instances have changed"); |
---|
1699 | } |
---|
1700 | if (orig.weight() != copy.weight()) { |
---|
1701 | throw new Exception("instance weights have changed"); |
---|
1702 | } |
---|
1703 | } |
---|
1704 | } |
---|
1705 | } |
---|
1706 | |
---|
1707 | /** |
---|
1708 | * Add missing values to a dataset. |
---|
1709 | * |
---|
1710 | * @param data the instances to add missing values to |
---|
1711 | * @param level the level of missing values to add (if positive, this |
---|
1712 | * is the probability that a value will be set to missing, if negative |
---|
1713 | * all but one value will be set to missing (not yet implemented)) |
---|
1714 | * @param attributeMissing if true, attributes will be modified |
---|
1715 | * @param classMissing if true, the class attribute will be modified |
---|
1716 | * @param attrIndex index of the attribute |
---|
1717 | */ |
---|
1718 | protected void addMissing(Instances data, int level, |
---|
1719 | boolean attributeMissing, boolean classMissing, |
---|
1720 | int attrIndex) { |
---|
1721 | |
---|
1722 | int classIndex = data.classIndex(); |
---|
1723 | Random random = new Random(1); |
---|
1724 | for (int i = 0; i < data.numInstances(); i++) { |
---|
1725 | Instance current = data.instance(i); |
---|
1726 | |
---|
1727 | for (int j = 0; j < data.numAttributes(); j++) { |
---|
1728 | if (((j == classIndex) && classMissing) || |
---|
1729 | ((j == attrIndex) && attributeMissing)) { |
---|
1730 | if (Math.abs(random.nextInt()) % 100 < level) |
---|
1731 | current.setMissing(j); |
---|
1732 | } |
---|
1733 | } |
---|
1734 | } |
---|
1735 | } |
---|
1736 | |
---|
1737 | /** |
---|
1738 | * Make a simple set of instances, which can later be modified |
---|
1739 | * for use in specific tests. |
---|
1740 | * |
---|
1741 | * @param seed the random number seed |
---|
1742 | * @param numInstances the number of instances to generate |
---|
1743 | * @param numAttr the number of attributes |
---|
1744 | * @param attrTypes the attribute types |
---|
1745 | * @param numClasses the number of classes (if nominal class) |
---|
1746 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1747 | * @return the test dataset |
---|
1748 | * @throws Exception if the dataset couldn't be generated |
---|
1749 | * @see #process(Instances) |
---|
1750 | */ |
---|
1751 | protected Instances makeTestDataset(int seed, |
---|
1752 | int numInstances, |
---|
1753 | int numAttr, |
---|
1754 | AttrTypes attrTypes, |
---|
1755 | int numClasses, |
---|
1756 | int classType) |
---|
1757 | throws Exception { |
---|
1758 | |
---|
1759 | return makeTestDataset( |
---|
1760 | seed, |
---|
1761 | numInstances, |
---|
1762 | numAttr, |
---|
1763 | attrTypes, |
---|
1764 | numClasses, |
---|
1765 | classType, |
---|
1766 | TestInstances.CLASS_IS_LAST); |
---|
1767 | } |
---|
1768 | |
---|
1769 | |
---|
1770 | /** |
---|
1771 | * Make a simple set of instances with variable position of the class |
---|
1772 | * attribute, which can later be modified for use in specific tests. |
---|
1773 | * |
---|
1774 | * @param seed the random number seed |
---|
1775 | * @param numInstances the number of instances to generate |
---|
1776 | * @param numAttr the number of attributes to generate |
---|
1777 | * @param attrTypes the type of attrbute that is excepted |
---|
1778 | * @param numClasses the number of classes (if nominal class) |
---|
1779 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
1780 | * @param classIndex the index of the class (0-based, -1 as last) |
---|
1781 | * @return the test dataset |
---|
1782 | * @throws Exception if the dataset couldn't be generated |
---|
1783 | * @see TestInstances#CLASS_IS_LAST |
---|
1784 | * @see #process(Instances) |
---|
1785 | */ |
---|
1786 | protected Instances makeTestDataset(int seed, int numInstances, |
---|
1787 | int numAttr, AttrTypes attrTypes, |
---|
1788 | int numClasses, int classType, |
---|
1789 | int classIndex) |
---|
1790 | throws Exception { |
---|
1791 | |
---|
1792 | TestInstances dataset = new TestInstances(); |
---|
1793 | |
---|
1794 | dataset.setSeed(seed); |
---|
1795 | dataset.setNumInstances(numInstances); |
---|
1796 | dataset.setNumNominal (attrTypes.nominal ? numAttr : 0); |
---|
1797 | dataset.setNumNumeric (attrTypes.numeric ? numAttr : 0); |
---|
1798 | dataset.setNumString (attrTypes.string ? numAttr : 0); |
---|
1799 | dataset.setNumDate (attrTypes.date ? numAttr : 0); |
---|
1800 | dataset.setNumRelational(attrTypes.relational ? numAttr : 0); |
---|
1801 | dataset.setNumClasses(numClasses); |
---|
1802 | dataset.setClassType(classType); |
---|
1803 | dataset.setClassIndex(classIndex); |
---|
1804 | |
---|
1805 | return process(dataset.generate()); |
---|
1806 | } |
---|
1807 | |
---|
1808 | /** |
---|
1809 | * Make a simple set of values. Only one of the num'type' parameters should be larger 0. |
---|
1810 | * (just to make parameter similar to the makeTestDataset parameters) |
---|
1811 | * |
---|
1812 | * @param seed the random number seed |
---|
1813 | * @param numValues the number of values to generate |
---|
1814 | * @param data the dataset to make test examples for |
---|
1815 | * @param attrIndex index of the attribute |
---|
1816 | * @param attrType the class type (NUMERIC, NOMINAL, etc.) |
---|
1817 | * @throws Exception if the dataset couldn't be generated |
---|
1818 | * @see #process(Instances) |
---|
1819 | */ |
---|
1820 | protected Vector makeTestValueList(int seed, int numValues, |
---|
1821 | Instances data, int attrIndex, int attrType) |
---|
1822 | throws Exception { |
---|
1823 | |
---|
1824 | // get min max |
---|
1825 | double []minMax = getMinimumMaximum(data, attrIndex); |
---|
1826 | double minValue = minMax[0]; |
---|
1827 | double maxValue = minMax[1]; |
---|
1828 | |
---|
1829 | // make value list and put into a VECTOR |
---|
1830 | double range = maxValue - minValue; |
---|
1831 | Vector values = new Vector(numValues); |
---|
1832 | Random random = new Random(seed); |
---|
1833 | |
---|
1834 | if (attrType == Attribute.NOMINAL) { |
---|
1835 | for (int i = 0; i < numValues; i++) { |
---|
1836 | Double v = new Double((Math.abs(random.nextInt()) % (int)range)+ (int)minValue); |
---|
1837 | values.add(v); |
---|
1838 | } |
---|
1839 | } |
---|
1840 | if (attrType == Attribute.NUMERIC) { |
---|
1841 | for (int i = 0; i < numValues; i++) { |
---|
1842 | Double v = new Double(random.nextDouble() * range + minValue); |
---|
1843 | values.add(v); |
---|
1844 | } |
---|
1845 | } |
---|
1846 | return values; |
---|
1847 | } |
---|
1848 | |
---|
1849 | /** |
---|
1850 | * Make a simple set of values. Only one of the num'type' parameters should be larger 0. |
---|
1851 | * (just to make parameter similar to the makeTestDataset parameters) |
---|
1852 | * |
---|
1853 | * @param seed the random number seed |
---|
1854 | * @param numValues the number of values to generate |
---|
1855 | * @param minValue the minimal data value |
---|
1856 | * @param maxValue the maximal data value |
---|
1857 | * @param attrType the class type (NUMERIC, NOMINAL, etc.) |
---|
1858 | * @throws Exception if the dataset couldn't be generated |
---|
1859 | * @see #process(Instances) |
---|
1860 | */ |
---|
1861 | protected Vector makeTestValueList(int seed, int numValues, |
---|
1862 | double minValue, double maxValue, int attrType) |
---|
1863 | throws Exception { |
---|
1864 | |
---|
1865 | |
---|
1866 | // make value list and put into a VECTOR |
---|
1867 | double range = maxValue - minValue; |
---|
1868 | Vector values = new Vector(numValues); |
---|
1869 | Random random = new Random(seed); |
---|
1870 | |
---|
1871 | if (attrType == Attribute.NOMINAL) { |
---|
1872 | for (int i = 0; i < numValues; i++) { |
---|
1873 | Double v = new Double((Math.abs(random.nextInt()) % (int)range)+ (int)minValue); |
---|
1874 | values.add(v); |
---|
1875 | } |
---|
1876 | } |
---|
1877 | if (attrType == Attribute.NUMERIC) { |
---|
1878 | for (int i = 0; i < numValues; i++) { |
---|
1879 | Double v = new Double(random.nextDouble() * range + minValue); |
---|
1880 | values.add(v); |
---|
1881 | } |
---|
1882 | } |
---|
1883 | return values; |
---|
1884 | } |
---|
1885 | |
---|
1886 | /** |
---|
1887 | * Test with test values. |
---|
1888 | * |
---|
1889 | * @param est estimator to be tested |
---|
1890 | * @param test vector with test values |
---|
1891 | * |
---|
1892 | **/ |
---|
1893 | protected Vector testWithTestValues(Estimator est, Vector test) { |
---|
1894 | |
---|
1895 | Vector results = new Vector(); |
---|
1896 | for (int i = 0; i < test.size(); i++) { |
---|
1897 | double testValue = ((Double)(test.elementAt(i))).doubleValue(); |
---|
1898 | double prob = est.getProbability(testValue); |
---|
1899 | Double p = new Double(prob); |
---|
1900 | results.add(p); |
---|
1901 | } |
---|
1902 | return results; |
---|
1903 | } |
---|
1904 | |
---|
1905 | /** |
---|
1906 | * Gets the minimum and maximum of the values a the first attribute |
---|
1907 | * of the given data set |
---|
1908 | * |
---|
1909 | * @param inst the instance |
---|
1910 | * @param attrIndex the index of the attribut to find min and max |
---|
1911 | * @return the array with the minimum value on index 0 and the max on index 1 |
---|
1912 | */ |
---|
1913 | |
---|
1914 | protected double[] getMinimumMaximum(Instances inst, int attrIndex) { |
---|
1915 | double []minMax = new double[2]; |
---|
1916 | |
---|
1917 | try { |
---|
1918 | int num = getMinMax(inst, attrIndex, minMax); |
---|
1919 | } catch (Exception ex) { |
---|
1920 | ex.printStackTrace(); |
---|
1921 | System.out.println(ex.getMessage()); |
---|
1922 | } |
---|
1923 | return minMax; |
---|
1924 | // double minValue = minMax[0]; |
---|
1925 | // double maxValue = minMax[1]; |
---|
1926 | } |
---|
1927 | |
---|
1928 | /** |
---|
1929 | * Find the minimum and the maximum of the attribute and return it in |
---|
1930 | * the last parameter.. |
---|
1931 | * @param inst instances used to build the estimator |
---|
1932 | * @param attrIndex index of the attribute |
---|
1933 | * @param minMax the array to return minimum and maximum in |
---|
1934 | * @return number of not missing values |
---|
1935 | * @exception Exception if parameter minMax wasn't initialized properly |
---|
1936 | */ |
---|
1937 | public static int getMinMax(Instances inst, int attrIndex, double [] minMax) |
---|
1938 | throws Exception { |
---|
1939 | double min = Double.NaN; |
---|
1940 | double max = Double.NaN; |
---|
1941 | Instance instance = null; |
---|
1942 | int numNotMissing = 0; |
---|
1943 | if ((minMax == null) || (minMax.length < 2)) { |
---|
1944 | throw new Exception("Error in Program, privat method getMinMax"); |
---|
1945 | } |
---|
1946 | |
---|
1947 | Enumeration enumInst = inst.enumerateInstances(); |
---|
1948 | if (enumInst.hasMoreElements()) { |
---|
1949 | do { |
---|
1950 | instance = (Instance) enumInst.nextElement(); |
---|
1951 | } while (instance.isMissing(attrIndex) && (enumInst.hasMoreElements())); |
---|
1952 | |
---|
1953 | // add values if not missing |
---|
1954 | if (!instance.isMissing(attrIndex)) { |
---|
1955 | numNotMissing++; |
---|
1956 | min = instance.value(attrIndex); |
---|
1957 | max = instance.value(attrIndex); |
---|
1958 | } |
---|
1959 | while (enumInst.hasMoreElements()) { |
---|
1960 | instance = (Instance) enumInst.nextElement(); |
---|
1961 | if (!instance.isMissing(attrIndex)) { |
---|
1962 | numNotMissing++; |
---|
1963 | if (instance.value(attrIndex) < min) { |
---|
1964 | min = (instance.value(attrIndex)); |
---|
1965 | } else { |
---|
1966 | if (instance.value(attrIndex) > max) { |
---|
1967 | max = (instance.value(attrIndex)); |
---|
1968 | } |
---|
1969 | } |
---|
1970 | } |
---|
1971 | } |
---|
1972 | } |
---|
1973 | minMax[0] = min; |
---|
1974 | minMax[1] = max; |
---|
1975 | return numNotMissing; |
---|
1976 | } |
---|
1977 | |
---|
1978 | /** |
---|
1979 | * Print the probabilities after testing |
---|
1980 | * @param probs vector with probability values |
---|
1981 | * @return string with probability values printed |
---|
1982 | */ |
---|
1983 | private String probsToString(Vector probs) { |
---|
1984 | StringBuffer txt = new StringBuffer (" "); |
---|
1985 | for (int i = 0; i < probs.size(); i++) { |
---|
1986 | txt.append("" + ((Double)(probs.elementAt(i))).doubleValue() + " "); |
---|
1987 | } |
---|
1988 | return txt.toString(); |
---|
1989 | } |
---|
1990 | |
---|
1991 | /** |
---|
1992 | * Provides a hook for derived classes to further modify the data. |
---|
1993 | * |
---|
1994 | * @param data the data to process |
---|
1995 | * @return the processed data |
---|
1996 | * @see #m_PostProcessor |
---|
1997 | */ |
---|
1998 | protected Instances process(Instances data) { |
---|
1999 | if (getPostProcessor() == null) |
---|
2000 | return data; |
---|
2001 | else |
---|
2002 | return getPostProcessor().process(data); |
---|
2003 | } |
---|
2004 | |
---|
2005 | /** |
---|
2006 | * Print out a short summary string for the dataset characteristics |
---|
2007 | * |
---|
2008 | * @param attrTypes the attribute types used (NUMERIC, NOMINAL, etc.) |
---|
2009 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
2010 | */ |
---|
2011 | protected void printAttributeSummary(AttrTypes attrTypes, int classType) { |
---|
2012 | |
---|
2013 | String str = ""; |
---|
2014 | |
---|
2015 | if (attrTypes.numeric) |
---|
2016 | str += " numeric"; |
---|
2017 | |
---|
2018 | if (attrTypes.nominal) { |
---|
2019 | if (str.length() > 0) |
---|
2020 | str += " &"; |
---|
2021 | str += " nominal"; |
---|
2022 | } |
---|
2023 | |
---|
2024 | if (attrTypes.string) { |
---|
2025 | if (str.length() > 0) |
---|
2026 | str += " &"; |
---|
2027 | str += " string"; |
---|
2028 | } |
---|
2029 | |
---|
2030 | if (attrTypes.date) { |
---|
2031 | if (str.length() > 0) |
---|
2032 | str += " &"; |
---|
2033 | str += " date"; |
---|
2034 | } |
---|
2035 | |
---|
2036 | if (attrTypes.relational) { |
---|
2037 | if (str.length() > 0) |
---|
2038 | str += " &"; |
---|
2039 | str += " relational"; |
---|
2040 | } |
---|
2041 | |
---|
2042 | str += " attributes)"; |
---|
2043 | |
---|
2044 | switch (classType) { |
---|
2045 | case Attribute.NUMERIC: |
---|
2046 | str = " (numeric class," + str; |
---|
2047 | break; |
---|
2048 | case Attribute.NOMINAL: |
---|
2049 | str = " (nominal class," + str; |
---|
2050 | break; |
---|
2051 | case Attribute.STRING: |
---|
2052 | str = " (string class," + str; |
---|
2053 | break; |
---|
2054 | case Attribute.DATE: |
---|
2055 | str = " (date class," + str; |
---|
2056 | break; |
---|
2057 | case Attribute.RELATIONAL: |
---|
2058 | str = " (relational class," + str; |
---|
2059 | break; |
---|
2060 | } |
---|
2061 | |
---|
2062 | print(str); |
---|
2063 | } |
---|
2064 | |
---|
2065 | /** |
---|
2066 | * Print out a short summary string for the dataset characteristics |
---|
2067 | * |
---|
2068 | * @param attrType the attribute type (NUMERIC, NOMINAL, etc.) |
---|
2069 | * @param classType the class type (NUMERIC, NOMINAL, etc.) |
---|
2070 | */ |
---|
2071 | protected void printAttributeSummary(int attrType, int classType) { |
---|
2072 | |
---|
2073 | String str = ""; |
---|
2074 | |
---|
2075 | switch (attrType) { |
---|
2076 | case Attribute.NUMERIC: |
---|
2077 | str = " numeric" + str; |
---|
2078 | break; |
---|
2079 | case Attribute.NOMINAL: |
---|
2080 | str = " nominal" + str; |
---|
2081 | break; |
---|
2082 | case Attribute.STRING: |
---|
2083 | str = " string" + str; |
---|
2084 | break; |
---|
2085 | case Attribute.DATE: |
---|
2086 | str = " date" + str; |
---|
2087 | break; |
---|
2088 | case Attribute.RELATIONAL: |
---|
2089 | str = " relational" + str; |
---|
2090 | break; |
---|
2091 | } |
---|
2092 | str += " attribute(s))"; |
---|
2093 | |
---|
2094 | switch (classType) { |
---|
2095 | case Attribute.NUMERIC: |
---|
2096 | str = " (numeric class," + str; |
---|
2097 | break; |
---|
2098 | case Attribute.NOMINAL: |
---|
2099 | str = " (nominal class," + str; |
---|
2100 | break; |
---|
2101 | case Attribute.STRING: |
---|
2102 | str = " (string class," + str; |
---|
2103 | break; |
---|
2104 | case Attribute.DATE: |
---|
2105 | str = " (date class," + str; |
---|
2106 | break; |
---|
2107 | case Attribute.RELATIONAL: |
---|
2108 | str = " (relational class," + str; |
---|
2109 | break; |
---|
2110 | } |
---|
2111 | |
---|
2112 | print(str); |
---|
2113 | } |
---|
2114 | |
---|
2115 | /** |
---|
2116 | * Returns the revision string. |
---|
2117 | * |
---|
2118 | * @return the revision |
---|
2119 | */ |
---|
2120 | public String getRevision() { |
---|
2121 | return RevisionUtils.extract("$Revision: 4997 $"); |
---|
2122 | } |
---|
2123 | |
---|
2124 | /** |
---|
2125 | * Test method for this class |
---|
2126 | * |
---|
2127 | * @param args the commandline parameters |
---|
2128 | */ |
---|
2129 | public static void main(String [] args) { |
---|
2130 | try { |
---|
2131 | CheckEstimator check = new CheckEstimator(); |
---|
2132 | |
---|
2133 | try { |
---|
2134 | check.setOptions(args); |
---|
2135 | Utils.checkForRemainingOptions(args); |
---|
2136 | } catch (Exception ex) { |
---|
2137 | String result = ex.getMessage() + "\n\n" + check.getClass().getName().replaceAll(".*\\.", "") + " Options:\n\n"; |
---|
2138 | Enumeration enu = check.listOptions(); |
---|
2139 | while (enu.hasMoreElements()) { |
---|
2140 | Option option = (Option) enu.nextElement(); |
---|
2141 | result += option.synopsis() + "\n" + option.description() + "\n"; |
---|
2142 | } |
---|
2143 | throw new Exception(result); |
---|
2144 | } |
---|
2145 | |
---|
2146 | check.doTests(); |
---|
2147 | } catch (Exception ex) { |
---|
2148 | System.err.println(ex.getMessage()); |
---|
2149 | } |
---|
2150 | } |
---|
2151 | } |
---|
2152 | |
---|