1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * InstancesUtil.java |
---|
19 | * Copyright (C) 2004 Stijn Lievens |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.classifiers.misc.monotone; |
---|
24 | |
---|
25 | import weka.classifiers.Classifier; |
---|
26 | import weka.core.Instance; |
---|
27 | import weka.core.DenseInstance; |
---|
28 | import weka.core.Instances; |
---|
29 | import weka.core.RevisionHandler; |
---|
30 | import weka.core.RevisionUtils; |
---|
31 | import weka.core.Utils; |
---|
32 | import weka.estimators.DiscreteEstimator; |
---|
33 | |
---|
34 | import java.io.BufferedWriter; |
---|
35 | import java.io.IOException; |
---|
36 | import java.util.HashMap; |
---|
37 | import java.util.Iterator; |
---|
38 | import java.util.Map; |
---|
39 | import java.util.Random; |
---|
40 | |
---|
41 | /** |
---|
42 | * This class contains some methods for working with objects of |
---|
43 | * type <code> Instance </code> and <code> Instances, </code> not |
---|
44 | * provided by there respective classes. |
---|
45 | * <p> |
---|
46 | * This implementation is part of the master's thesis: "Studie |
---|
47 | * en implementatie van instantie-gebaseerde algoritmen voor gesuperviseerd |
---|
48 | * rangschikken", Stijn Lievens, Ghent University, 2004. |
---|
49 | * </p> |
---|
50 | * |
---|
51 | * @author Stijn Lievens (stijn.lievens@ugent.be) |
---|
52 | * @version $Revision: 5987 $ |
---|
53 | */ |
---|
54 | public class InstancesUtil |
---|
55 | implements RevisionHandler { |
---|
56 | |
---|
57 | /** |
---|
58 | * Compares two instances, ignoring the class attribute (if any) |
---|
59 | * |
---|
60 | * @param i1 the first instance |
---|
61 | * @param i2 the second instance |
---|
62 | * @return true if both instances are equal (ignoring the class |
---|
63 | * attribute), false otherwise |
---|
64 | */ |
---|
65 | public static boolean equalIgnoreClass(Instance i1, Instance i2) { |
---|
66 | int n = i1.numAttributes(); |
---|
67 | int classIndex = i1.classIndex(); |
---|
68 | if (i2.numAttributes() != n || classIndex != i2.classIndex()) { |
---|
69 | return false; |
---|
70 | } |
---|
71 | int i = 0; |
---|
72 | while(i < n && (i == classIndex |
---|
73 | || Utils.eq(i1.value(i), i2.value(i)))) { |
---|
74 | i++; |
---|
75 | } |
---|
76 | return i == n; |
---|
77 | } |
---|
78 | |
---|
79 | /** |
---|
80 | * Get the index of an instance in a set of instances, where |
---|
81 | * instances are compared ignoring the class attribute. |
---|
82 | * |
---|
83 | * @param instances the set of instances |
---|
84 | * @param instance to instance to be found in the given set of instances |
---|
85 | * @return the index of the first instance that equals the given instance |
---|
86 | * (ignoring the class attribute), -1 if the instance was not found |
---|
87 | */ |
---|
88 | public static int containsIgnoreClass(Instances instances, Instance instance) { |
---|
89 | double[] dd = instance.toDoubleArray(); |
---|
90 | int classIndex = instances.classIndex(); |
---|
91 | int n = instances.numAttributes(); |
---|
92 | Iterator it = |
---|
93 | new EnumerationIterator(instances.enumerateInstances()); |
---|
94 | int index = 0; |
---|
95 | while(it.hasNext()) { |
---|
96 | Instance tmp = (Instance) it.next(); |
---|
97 | int i = 0; |
---|
98 | while(i < n && |
---|
99 | (i == classIndex || Utils.eq(dd[i], tmp.value(i)))) { |
---|
100 | i++; |
---|
101 | } |
---|
102 | if (i == n) { |
---|
103 | return index; // found it |
---|
104 | } |
---|
105 | index++; |
---|
106 | } |
---|
107 | return -1; |
---|
108 | } |
---|
109 | |
---|
110 | /** |
---|
111 | * Find the next occurence of an instance, ignoring the class, |
---|
112 | * for which the index in the dataset is at least <code> index. </code> |
---|
113 | * |
---|
114 | * @param instances the set of instances to be searched |
---|
115 | * @param instance the instance to be found |
---|
116 | * @param index the minimum index that might be returned |
---|
117 | * @return the index of the first instance with index at |
---|
118 | * least <code> index </code> that equals the given instance |
---|
119 | * (ignoring the class attribute), -1 if the instance was not found |
---|
120 | */ |
---|
121 | public static int nextOccurenceIgnoreClass(Instances instances, Instance instance, int index) { |
---|
122 | double[] dd = instance.toDoubleArray(); |
---|
123 | int classIndex = instances.classIndex(); |
---|
124 | int n = instances.numAttributes(); |
---|
125 | int numInstances = instances.numInstances(); |
---|
126 | int currentIndex = index; |
---|
127 | while(currentIndex < numInstances) { |
---|
128 | Instance tmp = instances.instance(currentIndex); |
---|
129 | int i = 0; |
---|
130 | while(i < n && |
---|
131 | (i == classIndex || Utils.eq(dd[i], tmp.value(i)))) { |
---|
132 | i++; |
---|
133 | } |
---|
134 | if (i == n) { |
---|
135 | return currentIndex; // found it |
---|
136 | } |
---|
137 | currentIndex++; |
---|
138 | } |
---|
139 | return -1; // not present |
---|
140 | } |
---|
141 | |
---|
142 | /** |
---|
143 | * Check if all instances have the same class value. |
---|
144 | * |
---|
145 | * @param instances the instances to be checked for homogeneity |
---|
146 | * @return true if the instances have the same class value, false otherwise |
---|
147 | */ |
---|
148 | public static boolean isHomogeneous(Instances instances) { |
---|
149 | Iterator it = new EnumerationIterator(instances.enumerateInstances()); |
---|
150 | if (it.hasNext()) { |
---|
151 | double classValue = ((Instance) it.next()).classValue(); |
---|
152 | while(it.hasNext()) { |
---|
153 | if (((Instance) it.next()).classValue() != classValue) { |
---|
154 | return false; |
---|
155 | } |
---|
156 | } |
---|
157 | } |
---|
158 | return true; // empty or all identical |
---|
159 | } |
---|
160 | |
---|
161 | /** |
---|
162 | * Compares two instances in the data space, this is ignoring the class |
---|
163 | * attribute. An instance is strictly smaller than another instance |
---|
164 | * if the same holds for the <code> Coordinates </code> based on |
---|
165 | * these instances. |
---|
166 | * |
---|
167 | * @param i1 the first instance |
---|
168 | * @param i2 the second instance |
---|
169 | * @return <code> true </code> if the first instance is strictly smaller |
---|
170 | * than the second instance, <code> false </code> otherwise |
---|
171 | */ |
---|
172 | public static boolean strictlySmaller(Instance i1, Instance i2) { |
---|
173 | // XXX implementation can be done faster |
---|
174 | Coordinates c1 = new Coordinates(i1); |
---|
175 | Coordinates c2 = new Coordinates(i2); |
---|
176 | |
---|
177 | return c1.strictlySmaller(c2); |
---|
178 | } |
---|
179 | |
---|
180 | /** |
---|
181 | * Compares two instances in the data space, this is, ignoring the class |
---|
182 | * attribute. An instance is smaller or equal than another instance |
---|
183 | * if the same holds for the <code> Coordinates </code> based on |
---|
184 | * these instances. |
---|
185 | * |
---|
186 | * @param i1 the first instance |
---|
187 | * @param i2 the second instance |
---|
188 | * @return <code> true </code> if the first instance is smaller or equal |
---|
189 | * than the second instance, <code> false </code> otherwise |
---|
190 | */ |
---|
191 | public static boolean smallerOrEqual(Instance i1,Instance i2) { |
---|
192 | // XXX implementation can be done faster |
---|
193 | Coordinates c1 = new Coordinates(i1); |
---|
194 | Coordinates c2 = new Coordinates(i2); |
---|
195 | |
---|
196 | return c1.smallerOrEqual(c2); |
---|
197 | } |
---|
198 | |
---|
199 | /** |
---|
200 | * Checks if two instances are comparable in the data space, this is |
---|
201 | * ignoring the class attribute. Two instances are comparable if the |
---|
202 | * first is smaller or equal than the second, or the other way around. |
---|
203 | * |
---|
204 | * @param i1 the first instance |
---|
205 | * @param i2 the second instance |
---|
206 | * @return <code> true </code> if the given instances are comparable, |
---|
207 | * <code> false </code> otherwise |
---|
208 | * @throws IllegalArgumentException if the two instances don't have the |
---|
209 | * same length |
---|
210 | */ |
---|
211 | public static boolean comparable(Instance i1, Instance i2) throws IllegalArgumentException { |
---|
212 | // XXX maybe we should think about using 'equalHeaders' of Instance |
---|
213 | // to obtain a fool proof implementation |
---|
214 | Coordinates c1 = new Coordinates(i1); |
---|
215 | Coordinates c2 = new Coordinates(i2); |
---|
216 | |
---|
217 | return c1.smallerOrEqual(c2) || c2.smallerOrEqual(c1); |
---|
218 | } |
---|
219 | |
---|
220 | /** |
---|
221 | * Checks it two instances give rise to doubt. There is doubt between |
---|
222 | * two instances if their <code> Coordinates </code> are equal, but |
---|
223 | * their class value is different. |
---|
224 | * |
---|
225 | * @param i1 the first instance |
---|
226 | * @param i2 the second instance |
---|
227 | * @return <code> true </code> if there is doubt between the two |
---|
228 | * given instances, <code> false </code> otherwise |
---|
229 | */ |
---|
230 | public static boolean doubt(Instance i1, Instance i2) { |
---|
231 | // XXX use equalHeaders ? |
---|
232 | if (i1.classValue() == i2.classValue()) { |
---|
233 | return false; |
---|
234 | } |
---|
235 | Coordinates c1 = new Coordinates(i1); |
---|
236 | Coordinates c2 = new Coordinates(i2); |
---|
237 | |
---|
238 | return c1.equals(c2); |
---|
239 | } |
---|
240 | |
---|
241 | /** |
---|
242 | * Checks if two instances give rise to reversed preference. |
---|
243 | * Two instances give rise to reversed preference in the data space, |
---|
244 | * if their <code> Coordinates </code> are comparable but different, |
---|
245 | * and their class values are not related in the same way. |
---|
246 | * |
---|
247 | * @param i1 the first instance |
---|
248 | * @param i2 the second instance |
---|
249 | * @return <code> true </code> if <code> i1 </code> and <code> i2 </code> |
---|
250 | * give rise to reversed preference, <code> false </code> otherwise |
---|
251 | * @throws IllegalArgumentException if the two instances don't have |
---|
252 | * the same length |
---|
253 | */ |
---|
254 | public static boolean reversedPreference(Instance i1, Instance i2) throws IllegalArgumentException { |
---|
255 | // XXX should the implementation be made fool proof by use of |
---|
256 | // 'equalHeaders'? It can also be speeded up I think. |
---|
257 | |
---|
258 | if (i1.classValue() == i2.classValue()) { |
---|
259 | return false; |
---|
260 | } |
---|
261 | Coordinates c1 = new Coordinates(i1); |
---|
262 | Coordinates c2 = new Coordinates(i2); |
---|
263 | |
---|
264 | if (i1.classValue() > i2.classValue() && c1.strictlySmaller(c2)) { |
---|
265 | return true; |
---|
266 | } |
---|
267 | if (i2.classValue() > i1.classValue() && c2.strictlySmaller(c1)) { |
---|
268 | return true; |
---|
269 | } |
---|
270 | |
---|
271 | return false; |
---|
272 | } |
---|
273 | |
---|
274 | /** |
---|
275 | * Checks if the given data set is monotone. We say that a data set |
---|
276 | * is monotone if it contains doubt nor reversed preferences. |
---|
277 | * |
---|
278 | * @param instances the data set to be checked |
---|
279 | * @return <code> true </code> if the given data set if monotone, |
---|
280 | * <code> false </code> otherwise |
---|
281 | */ |
---|
282 | public static boolean isMonotone(Instances instances) { |
---|
283 | int n = instances.numInstances(); |
---|
284 | for (int i = 0; i < n; i++) { |
---|
285 | Instance i1 = instances.instance(i); |
---|
286 | for (int j = i + 1; j < n; j++) { |
---|
287 | if ( doubt(i1, instances.instance(j)) || |
---|
288 | reversedPreference(i1, instances.instance(j))) { |
---|
289 | return false; |
---|
290 | } |
---|
291 | } |
---|
292 | } |
---|
293 | return true; |
---|
294 | } |
---|
295 | |
---|
296 | /** |
---|
297 | * Test if a set of instances is quasi monotone. We say that a set |
---|
298 | * of instances <code> S </code> is quasi monotone with respect to |
---|
299 | * a set of instances <code> D </code> iff |
---|
300 | * <code> [x,y] \cap D \neq \emptyset \implies class(x) \leq class(y). |
---|
301 | * </code> This implies that <code> D </code> itself is monotone. |
---|
302 | * |
---|
303 | * @param ground the instances playing the role of <code> D </code> |
---|
304 | * @param other the instances playing the role of <code> S </code> |
---|
305 | * @return true if the instances are quasi monotone, false otherwise |
---|
306 | */ |
---|
307 | public static boolean isQuasiMonotone(Instances ground, Instances other) { |
---|
308 | if (!isMonotone(ground)) { |
---|
309 | return false; |
---|
310 | } |
---|
311 | Iterator it1 = new EnumerationIterator(ground.enumerateInstances()); |
---|
312 | while(it1.hasNext()) { |
---|
313 | Instance inst1 = (Instance) it1.next(); |
---|
314 | Iterator it2 = new EnumerationIterator(other.enumerateInstances()); |
---|
315 | while(it2.hasNext()) { |
---|
316 | Instance inst2 = (Instance) it2.next(); |
---|
317 | if (doubt(inst1, inst2) || reversedPreference(inst1, inst2)) { |
---|
318 | return false; |
---|
319 | } |
---|
320 | } |
---|
321 | } |
---|
322 | return true; |
---|
323 | } |
---|
324 | |
---|
325 | /** |
---|
326 | * Gather some statistics regarding reversed preferences. |
---|
327 | * |
---|
328 | * @param instances the instances to be examined |
---|
329 | * @return array of length 3; position 0 indicates the number of |
---|
330 | * couples that have reversed preference, position 1 the number of |
---|
331 | * couples that are comparable, and position 2 the total |
---|
332 | * number of couples |
---|
333 | * @see #reversedPreference(Instance, Instance) |
---|
334 | */ |
---|
335 | public static int[] nrOfReversedPreferences(Instances instances) { |
---|
336 | int[] stats = new int[3]; |
---|
337 | int n = instances.numInstances(); |
---|
338 | stats[0] = 0; |
---|
339 | stats[1] = 0; |
---|
340 | // number of couples |
---|
341 | stats[2] = n * (n - 1) / 2; |
---|
342 | for (int i = 0; i < n; i++) { |
---|
343 | Instance i1 = instances.instance(i); |
---|
344 | for (int j = i + 1; j < n; j++) { |
---|
345 | Instance j1 = instances.instance(j); |
---|
346 | if (comparable(i1, j1)) { |
---|
347 | stats[1]++; // comparable |
---|
348 | if (reversedPreference(i1, j1)) { |
---|
349 | stats[0]++; // reversed preference |
---|
350 | } |
---|
351 | } |
---|
352 | } |
---|
353 | } |
---|
354 | return stats; |
---|
355 | } |
---|
356 | |
---|
357 | /** |
---|
358 | * Find the number of stochastic reversed preferences in the dataset. |
---|
359 | * |
---|
360 | * @param instances the instances to be examined |
---|
361 | * @return an array of integers containing at position |
---|
362 | * <ul> |
---|
363 | * <li> 0: number of different coordinates, this is the size of S_X </li> |
---|
364 | * <li> 1: number of couples showing reversed preference:<br> |
---|
365 | * <code> x < y </code> and |
---|
366 | * <code> not (F_x leqstoch F_y) </code> </li> |
---|
367 | * <li> 2: number of couples having<br> |
---|
368 | * <code> x < y </code> and <code> F_y leqstoch F_x </code> |
---|
369 | * and <code> F_x neq F_y </code> </li> |
---|
370 | * <li> 3: number of couples that are comparable <br> |
---|
371 | * <code> |\{ (x,y)\in S_X \times S_x | x < y\}| </code> </li> |
---|
372 | * <li> 4: number of couples in S_X </li> |
---|
373 | * </ul> |
---|
374 | * @throws IllegalArgumentException if there are no instances with |
---|
375 | * a non-missing class value, or if the class is not set |
---|
376 | */ |
---|
377 | public static int[] nrStochasticReversedPreference(Instances instances) |
---|
378 | throws IllegalArgumentException { |
---|
379 | |
---|
380 | if (instances.classIndex() < 0) { |
---|
381 | throw new IllegalArgumentException("Class is not set"); |
---|
382 | } |
---|
383 | |
---|
384 | // copy the dataset |
---|
385 | Instances data = new Instances(instances); |
---|
386 | |
---|
387 | // new dataset where examples with missing class value are removed |
---|
388 | data.deleteWithMissingClass(); |
---|
389 | if (data.numInstances() == 0) { |
---|
390 | throw new IllegalArgumentException |
---|
391 | ("No instances with a class value!"); |
---|
392 | } |
---|
393 | |
---|
394 | // build the Map for the estimatedDistributions |
---|
395 | Map distributions = new HashMap(data.numInstances()/2); |
---|
396 | |
---|
397 | // cycle through all instances |
---|
398 | Iterator i = |
---|
399 | new EnumerationIterator(instances.enumerateInstances()); |
---|
400 | |
---|
401 | while (i.hasNext()) { |
---|
402 | Instance instance = (Instance) i.next(); |
---|
403 | Coordinates c = new Coordinates(instance); |
---|
404 | |
---|
405 | // get DiscreteEstimator from the map |
---|
406 | DiscreteEstimator df = |
---|
407 | (DiscreteEstimator) distributions.get(c); |
---|
408 | |
---|
409 | // if no DiscreteEstimator is present in the map, create one |
---|
410 | if (df == null) { |
---|
411 | df = new DiscreteEstimator(instances.numClasses(), 0); |
---|
412 | } |
---|
413 | df.addValue(instance.classValue(),instance.weight()); // update |
---|
414 | distributions.put(c,df); // put back in map |
---|
415 | } |
---|
416 | |
---|
417 | |
---|
418 | // build the map of cumulative distribution functions |
---|
419 | Map cumulativeDistributions = |
---|
420 | new HashMap(distributions.size()); |
---|
421 | |
---|
422 | // Cycle trough the map of discrete distributions, and create a new |
---|
423 | // one containing cumulative discrete distributions |
---|
424 | for (Iterator it=distributions.keySet().iterator(); |
---|
425 | it.hasNext();) { |
---|
426 | Coordinates c = (Coordinates) it.next(); |
---|
427 | DiscreteEstimator df = |
---|
428 | (DiscreteEstimator) distributions.get(c); |
---|
429 | cumulativeDistributions.put |
---|
430 | (c, new CumulativeDiscreteDistribution(df)); |
---|
431 | } |
---|
432 | int[] revPref = new int[5]; |
---|
433 | revPref[0] = cumulativeDistributions.size(); |
---|
434 | Iterator it = cumulativeDistributions.keySet().iterator(); |
---|
435 | while (it.hasNext()) { |
---|
436 | Coordinates c1 = (Coordinates) it.next(); |
---|
437 | CumulativeDiscreteDistribution cdf1 = |
---|
438 | (CumulativeDiscreteDistribution) |
---|
439 | cumulativeDistributions.get(c1); |
---|
440 | Iterator it2 = cumulativeDistributions.keySet().iterator(); |
---|
441 | while(it2.hasNext()) { |
---|
442 | Coordinates c2 = (Coordinates) it2.next(); |
---|
443 | CumulativeDiscreteDistribution cdf2 = |
---|
444 | (CumulativeDiscreteDistribution) |
---|
445 | cumulativeDistributions.get(c2); |
---|
446 | if (c2.equals(c1)) { |
---|
447 | continue; |
---|
448 | } |
---|
449 | |
---|
450 | revPref[4]++; |
---|
451 | |
---|
452 | if (c1.strictlySmaller(c2) == true) { |
---|
453 | revPref[3]++; //vergelijkbaar |
---|
454 | if (cdf1.stochasticDominatedBy(cdf2) == false ) { |
---|
455 | revPref[1]++; |
---|
456 | if (cdf2.stochasticDominatedBy(cdf1) == true) { |
---|
457 | revPref[2]++; |
---|
458 | } |
---|
459 | } |
---|
460 | } |
---|
461 | } |
---|
462 | } |
---|
463 | revPref[4] /= 2; |
---|
464 | return revPref; |
---|
465 | } |
---|
466 | |
---|
467 | /** |
---|
468 | * Counts the number of redundant pairs in the sense of OLM. |
---|
469 | * Two instances are redundant if they are comparable and have the same |
---|
470 | * class value. |
---|
471 | * |
---|
472 | * @param instances the instances to be checked |
---|
473 | * @return the number of redundant pairs in the given set of instances |
---|
474 | */ |
---|
475 | public static int nrOfRedundant(Instances instances) { |
---|
476 | int n = instances.numInstances(); |
---|
477 | int nrRedundant = 0; |
---|
478 | for (int i = 0; i < n; i++) { |
---|
479 | Instance i1 = instances.instance(i); |
---|
480 | for (int j = i + 1; j < n; j++) { |
---|
481 | Instance j1 = instances.instance(j); |
---|
482 | if (j1.classValue() == i1.classValue() && comparable(i1, j1) ) { |
---|
483 | nrRedundant++; |
---|
484 | } |
---|
485 | } |
---|
486 | } |
---|
487 | |
---|
488 | return nrRedundant; |
---|
489 | } |
---|
490 | |
---|
491 | /** |
---|
492 | * Calulates the total loss over the <code> instances </code>, |
---|
493 | * using the trained <code> classifier </code> and the |
---|
494 | * specified <code> lossFunction. </code> The instances |
---|
495 | * should not contain missing values in the class attribute. |
---|
496 | * |
---|
497 | * @param classifier the trained classifier to use |
---|
498 | * @param instances the test instances |
---|
499 | * @param lossFunction the loss function to use |
---|
500 | * @return the total loss of all the instances using the given classifier and loss function |
---|
501 | */ |
---|
502 | public static double totalLoss(Classifier classifier, Instances instances, |
---|
503 | NominalLossFunction lossFunction) { |
---|
504 | |
---|
505 | double loss = 0; |
---|
506 | int n = instances.numInstances(); |
---|
507 | for (int i = 0; i < n; i++) { |
---|
508 | try { |
---|
509 | loss += lossFunction.loss(instances.instance(i).classValue(), |
---|
510 | classifier.classifyInstance(instances.instance(i))); |
---|
511 | } catch (Exception e) { |
---|
512 | // what should we do here ?? |
---|
513 | } |
---|
514 | } |
---|
515 | return loss; |
---|
516 | } |
---|
517 | |
---|
518 | |
---|
519 | /** |
---|
520 | * Classify a set of instances using a given classifier. The class value |
---|
521 | * of the instances are set. |
---|
522 | * |
---|
523 | * @param instances the instances to be classified |
---|
524 | * @param classifier a built classifier |
---|
525 | * @throws Exception if one of the instances could no be classified |
---|
526 | */ |
---|
527 | public static void classifyInstances(Instances instances, Classifier classifier) |
---|
528 | throws Exception { |
---|
529 | |
---|
530 | Iterator it = new EnumerationIterator(instances.enumerateInstances()); |
---|
531 | while(it.hasNext()) { |
---|
532 | Instance instance = (Instance) it.next(); |
---|
533 | instance.setClassValue(classifier.classifyInstance(instance)); |
---|
534 | } |
---|
535 | } |
---|
536 | |
---|
537 | |
---|
538 | /** |
---|
539 | * Calculates the relation (poset) formed by the instances. |
---|
540 | * |
---|
541 | * @param instances the instances for which the poset is to be formed |
---|
542 | * @return a <code> BooleanBitMatrix </code> for which position |
---|
543 | * <code> bm.get(i,j) == true </code> iff <code> |
---|
544 | * InstancesUtil.strictlySmaller(instances.instance(i), |
---|
545 | * instances.instance(j)) == true </code> |
---|
546 | */ |
---|
547 | public static BooleanBitMatrix getBitMatrix(Instances instances) { |
---|
548 | int numInstances = instances.numInstances(); |
---|
549 | BooleanBitMatrix bm = |
---|
550 | new BooleanBitMatrix(numInstances, numInstances); |
---|
551 | for (int i = 0; i < numInstances; i++ ) { |
---|
552 | Instance instance1 = instances.instance(i); |
---|
553 | for (int j = 0; j < numInstances; j++) { |
---|
554 | Instance instance2 = instances.instance(j); |
---|
555 | if (InstancesUtil.strictlySmaller(instance1, instance2)) { |
---|
556 | bm.set(i, j); // arc from instance1 to instance2 |
---|
557 | } |
---|
558 | } |
---|
559 | } |
---|
560 | return bm; |
---|
561 | } |
---|
562 | |
---|
563 | /** |
---|
564 | * Calculatus the number of elements in the closed interval |
---|
565 | * <code> [low,up]. </code> If the class index is set, then |
---|
566 | * the class attribute does not play part in the calculations, |
---|
567 | * this is we work in the data space. The code also works with |
---|
568 | * numeric attributes, but is primarily intended for ordinal attributes. |
---|
569 | * |
---|
570 | * @param low the lower bound of the interval |
---|
571 | * @param up the upper bound of the interval |
---|
572 | * @return the size of the interval (in floating point format) |
---|
573 | * @throws IllegalArgumentException if the given instances do not |
---|
574 | * constitute an interval. |
---|
575 | */ |
---|
576 | public static double numberInInterval(Instance low, Instance up) |
---|
577 | throws IllegalArgumentException { |
---|
578 | |
---|
579 | Coordinates cLow = new Coordinates(low); |
---|
580 | Coordinates cUp = new Coordinates(up); |
---|
581 | if (cLow.smallerOrEqual(cUp) == false) { |
---|
582 | throw new IllegalArgumentException |
---|
583 | ("The given instances are not the bounds of an interval"); |
---|
584 | } |
---|
585 | double number = 1; |
---|
586 | int dim = cLow.dimension(); |
---|
587 | for (int i = 0; i < dim; i++) { |
---|
588 | number *= (cUp.getValue(i) - cLow.getValue(i) + 1); |
---|
589 | } |
---|
590 | return number; |
---|
591 | } |
---|
592 | |
---|
593 | |
---|
594 | /** |
---|
595 | * Calculatutes the number of vectors in the data space that are smaller |
---|
596 | * or equal than the given instance. |
---|
597 | * |
---|
598 | * @param instance the given instance |
---|
599 | * @return the number of vectors in the data space smaller or equal |
---|
600 | * than the given instance |
---|
601 | * @throws IllegalArgumentException if there are numeric attributes |
---|
602 | */ |
---|
603 | public static double numberOfSmallerVectors(Instance instance) |
---|
604 | throws IllegalArgumentException { |
---|
605 | |
---|
606 | double[] values = InstancesUtil.toDataDouble(instance); |
---|
607 | double nr = 1; |
---|
608 | |
---|
609 | for (int i = 0; i < values.length; i++) { |
---|
610 | if (instance.attribute(i).isNumeric()) { |
---|
611 | throw new IllegalArgumentException |
---|
612 | ("Numeric attributes are not supported"); |
---|
613 | } |
---|
614 | nr *= (values[i] + 1); |
---|
615 | } |
---|
616 | |
---|
617 | return nr; |
---|
618 | } |
---|
619 | |
---|
620 | /** |
---|
621 | * Calculatutes the number of vectors in the data space that are |
---|
622 | * greater or equal than the given instance. |
---|
623 | * |
---|
624 | * @param instance the given instance |
---|
625 | * @return the number of vectors in the data space greater of equal |
---|
626 | * than the given instance |
---|
627 | * @throws IllegalArgumentException if there are numeric attributes |
---|
628 | */ |
---|
629 | public static double numberOfGreaterVectors(Instance instance) |
---|
630 | throws IllegalArgumentException { |
---|
631 | |
---|
632 | double[] values = InstancesUtil.toDataDouble(instance); |
---|
633 | double nr = 1; |
---|
634 | |
---|
635 | for (int i = 0; i < values.length; i++) { |
---|
636 | if (instance.attribute(i).isNumeric()) { |
---|
637 | throw new IllegalArgumentException |
---|
638 | ("Numeric attributes are not supported"); |
---|
639 | } |
---|
640 | nr *= (instance.attribute(i).numValues() - values[i]); |
---|
641 | } |
---|
642 | |
---|
643 | return nr; |
---|
644 | } |
---|
645 | |
---|
646 | /** |
---|
647 | * Write the instances in ARFF-format to the indicated |
---|
648 | * <code> BufferedWriter </code>. |
---|
649 | * @param instances the instances to write |
---|
650 | * @param file the <code> BufferedWriter </code> to write to |
---|
651 | * @throws IOException if something goes wrong while writing the instances |
---|
652 | */ |
---|
653 | public static void write(Instances instances, BufferedWriter file) |
---|
654 | throws IOException{ |
---|
655 | |
---|
656 | file.write(instances.toString()); // XXX can probably be done better |
---|
657 | } |
---|
658 | |
---|
659 | |
---|
660 | /** |
---|
661 | * Return a histogram of the values for the specified attribute. |
---|
662 | * |
---|
663 | * @param instances the instances |
---|
664 | * @param attributeIndex the attribute to consider |
---|
665 | * @return a <code> DiscreteEstimator </code> where the <code>i</code>th |
---|
666 | * @throws IllegalArgumentException if the attribute at the specified |
---|
667 | * index is numeric |
---|
668 | */ |
---|
669 | public static DiscreteEstimator countValues(Instances instances, int attributeIndex) |
---|
670 | throws IllegalArgumentException{ |
---|
671 | |
---|
672 | int numValues = instances.attribute(attributeIndex).numValues(); |
---|
673 | if (numValues == 0) { |
---|
674 | throw new IllegalArgumentException |
---|
675 | ("Can't create histogram for numeric attribute"); |
---|
676 | } |
---|
677 | |
---|
678 | DiscreteEstimator de = new DiscreteEstimator(numValues, false); |
---|
679 | Iterator it = new EnumerationIterator(instances.enumerateInstances()); |
---|
680 | while (it.hasNext()) { |
---|
681 | Instance instance = (Instance) it.next(); |
---|
682 | if (!instance.isMissing(attributeIndex)) { |
---|
683 | de.addValue(instance.value(attributeIndex), instance.weight()); |
---|
684 | } |
---|
685 | } |
---|
686 | return de; |
---|
687 | } |
---|
688 | |
---|
689 | /** |
---|
690 | * Create, without replacement, a random subsample of the given size |
---|
691 | * from the given instances. |
---|
692 | * |
---|
693 | * @param instances the instances to sample from |
---|
694 | * @param size the requested size of the sample |
---|
695 | * @param random the random generator to use |
---|
696 | * @return a sample of the requested size, drawn from the given |
---|
697 | * instances without replacement |
---|
698 | * @throws IllegalArgumentException if the size exceeds the number |
---|
699 | * of instances |
---|
700 | */ |
---|
701 | public static Instances sampleWithoutReplacement( |
---|
702 | Instances instances, int size, Random random) { |
---|
703 | |
---|
704 | if (size > instances.numInstances()) { |
---|
705 | throw new IllegalArgumentException |
---|
706 | ("Size of requested sample exceeds number of instances"); |
---|
707 | } |
---|
708 | |
---|
709 | int numInstances = instances.numInstances(); |
---|
710 | int[] indices = new int[instances.numInstances()]; |
---|
711 | for (int i = 0; i < numInstances; i++) { |
---|
712 | indices[i] = i; |
---|
713 | } |
---|
714 | |
---|
715 | Instances sample = new Instances(instances, size); |
---|
716 | int index; |
---|
717 | for (int i = 0; i < size; i++) { |
---|
718 | index = random.nextInt(numInstances--); |
---|
719 | sample.add(instances.instance(indices[index])); |
---|
720 | swap(indices, index, numInstances); |
---|
721 | } |
---|
722 | return sample; |
---|
723 | } |
---|
724 | |
---|
725 | /** |
---|
726 | * Swaps two elements of the given array. |
---|
727 | * |
---|
728 | * @param aa the array |
---|
729 | * @param i the index of the first element |
---|
730 | * @param j the index of the second element |
---|
731 | */ |
---|
732 | final private static void swap(int[] aa, int i, int j) { |
---|
733 | int tmp = aa[i]; |
---|
734 | aa[i] = aa[j]; |
---|
735 | aa[j] = tmp; |
---|
736 | } |
---|
737 | /** |
---|
738 | * Generates a random sample of instances. Each attribute must be nominal, and the |
---|
739 | * class labels are not set. |
---|
740 | * |
---|
741 | * @param headerInfo Instances whose header information is used to determine how the |
---|
742 | * set of returned instances will look |
---|
743 | * @param numberOfExamples the desired size of the returned set |
---|
744 | * @param random the random number generator to use |
---|
745 | * @return a set of Instances containing the random sample. |
---|
746 | * @throws IllegalArgumentException if numeric attributes are given |
---|
747 | */ |
---|
748 | public static Instances generateRandomSample( |
---|
749 | Instances headerInfo, int numberOfExamples, Random random) |
---|
750 | throws IllegalArgumentException { |
---|
751 | |
---|
752 | int n = headerInfo.numAttributes(); |
---|
753 | double[] info = new double[n]; |
---|
754 | int classIndex = headerInfo.classIndex(); |
---|
755 | for (int i = 0; i < n; i++) { |
---|
756 | info[i] = headerInfo.attribute(i).numValues(); |
---|
757 | if (i != classIndex && info[i] == 0) { |
---|
758 | throw new IllegalArgumentException |
---|
759 | ("Numeric attributes are currently not supported"); |
---|
760 | } |
---|
761 | } |
---|
762 | Instances sample = new Instances(headerInfo, numberOfExamples); |
---|
763 | sample.setRelationName(headerInfo.relationName() + |
---|
764 | ".random.sample.of." + numberOfExamples); |
---|
765 | for (int i = 0; i < numberOfExamples; i++) { |
---|
766 | sample.add(randomSample(info, classIndex, random)); |
---|
767 | } |
---|
768 | return sample; |
---|
769 | } |
---|
770 | /** |
---|
771 | * Generates a random instance. |
---|
772 | * |
---|
773 | * @param info array that gives for each attribute the number of possible values |
---|
774 | * @param classIndex the index of the class attribute |
---|
775 | * @param random the random number generator used |
---|
776 | * @return a random instance |
---|
777 | */ |
---|
778 | private static Instance randomSample(double[] info, |
---|
779 | int classIndex, Random random) { |
---|
780 | |
---|
781 | double[] attValues = new double[info.length]; |
---|
782 | for (int i = 0; i < attValues.length; i++) { |
---|
783 | if (i != classIndex) { |
---|
784 | attValues[i] = random.nextInt( (int) info[i]); |
---|
785 | } |
---|
786 | } |
---|
787 | return new DenseInstance(1, attValues); |
---|
788 | } |
---|
789 | |
---|
790 | |
---|
791 | |
---|
792 | /** |
---|
793 | * Returns an array containing the attribute values (in internal floating |
---|
794 | * point format) of the given instance in data space, this is, the class |
---|
795 | * attribute (if any) is removed. |
---|
796 | * |
---|
797 | * @param instance the instance to get the attribute values from |
---|
798 | * @return array of doubles containing the attribute values |
---|
799 | */ |
---|
800 | public static double[] toDataDouble(Instance instance) { |
---|
801 | double[] vector = null; |
---|
802 | int classIndex = instance.classIndex(); |
---|
803 | if(classIndex >= 0) { |
---|
804 | vector = new double[instance.numAttributes() - 1]; |
---|
805 | } else { |
---|
806 | vector = new double[instance.numAttributes()]; |
---|
807 | } |
---|
808 | int index = 0; |
---|
809 | for (int i = 0; i < instance.numAttributes(); i++) { |
---|
810 | if(i != classIndex) { |
---|
811 | vector[index++] = instance.value(i); |
---|
812 | } |
---|
813 | } |
---|
814 | return vector; |
---|
815 | } |
---|
816 | |
---|
817 | /** |
---|
818 | * Computes the minimal extension for a given instance. |
---|
819 | * |
---|
820 | * @param instances the set of instances |
---|
821 | * @param instance the instance for which the minimal extension is to be |
---|
822 | * calculated |
---|
823 | * @return the value of the minimal extension, in internal floating point |
---|
824 | * format |
---|
825 | */ |
---|
826 | public static double minimalExtension(Instances instances, Instance instance) { |
---|
827 | return minimalExtension(instances, instance, 0); |
---|
828 | } |
---|
829 | |
---|
830 | /** |
---|
831 | * Computes the minimal extension of a given instance, but the |
---|
832 | * minimal value returned is <code> minValue. </code> This method |
---|
833 | * may have its applications when the training set is divided into |
---|
834 | * multiple Instances objects. |
---|
835 | * |
---|
836 | * @param instances the set of instances |
---|
837 | * @param instance the instance for which the minimal extension is to |
---|
838 | * be calculated |
---|
839 | * @param minValue a double indicating the minimal value that should |
---|
840 | * be returned |
---|
841 | * @return the label of the minimal extension, in internal floating point format |
---|
842 | */ |
---|
843 | public static double minimalExtension( |
---|
844 | Instances instances, Instance instance, double minValue) { |
---|
845 | |
---|
846 | double value = minValue; |
---|
847 | |
---|
848 | Iterator it = |
---|
849 | new EnumerationIterator(instances.enumerateInstances()); |
---|
850 | while(it.hasNext()) { |
---|
851 | Instance tmp = (Instance) it.next(); |
---|
852 | if (tmp.classValue() > value |
---|
853 | && InstancesUtil.smallerOrEqual(tmp, instance) ) { |
---|
854 | value = tmp.classValue(); |
---|
855 | } |
---|
856 | } |
---|
857 | return value; |
---|
858 | } |
---|
859 | |
---|
860 | /** |
---|
861 | * Computes the maximal extension for a given instance. |
---|
862 | * |
---|
863 | * @param instances the set of instances |
---|
864 | * @param instance the instance for which the minimal extension is to be |
---|
865 | * calculated |
---|
866 | * @return the value of the minimal extension, in internal floating point |
---|
867 | * format |
---|
868 | */ |
---|
869 | public static double maximalExtension(Instances instances, Instance instance) { |
---|
870 | return maximalExtension(instances, instance, instances.numClasses() - 1); |
---|
871 | } |
---|
872 | |
---|
873 | /** |
---|
874 | * Computes the maximal extension of a given instance, but the |
---|
875 | * maximal value returned is <code> maxValue. </code> This method |
---|
876 | * may have its applications when the training set is divided into |
---|
877 | * multiple Instances objects. |
---|
878 | * |
---|
879 | * @param instances the set of instances |
---|
880 | * @param instance the instance for which the maximal extension is to |
---|
881 | * be calculated |
---|
882 | * @param maxValue a double indicating the maximal value that should |
---|
883 | * be returned |
---|
884 | * @return the value of the minimal extension, in internal floating point |
---|
885 | * format |
---|
886 | */ |
---|
887 | public static double maximalExtension( |
---|
888 | Instances instances, Instance instance, double maxValue) { |
---|
889 | |
---|
890 | double value = maxValue; |
---|
891 | |
---|
892 | Iterator it = |
---|
893 | new EnumerationIterator(instances.enumerateInstances()); |
---|
894 | while(it.hasNext()) { |
---|
895 | Instance tmp = (Instance) it.next(); |
---|
896 | if (tmp.classValue() < value |
---|
897 | && InstancesUtil.smallerOrEqual(instance, tmp) ) { |
---|
898 | value = tmp.classValue(); |
---|
899 | } |
---|
900 | } |
---|
901 | return value; |
---|
902 | } |
---|
903 | |
---|
904 | /** |
---|
905 | * Returns the revision string. |
---|
906 | * |
---|
907 | * @return the revision |
---|
908 | */ |
---|
909 | public String getRevision() { |
---|
910 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
911 | } |
---|
912 | } |
---|