| 1 | /* |
|---|
| 2 | * This program is free software; you can redistribute it and/or modify |
|---|
| 3 | * it under the terms of the GNU General Public License as published by |
|---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 5 | * (at your option) any later version. |
|---|
| 6 | * |
|---|
| 7 | * This program is distributed in the hope that it will be useful, |
|---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | * GNU General Public License for more details. |
|---|
| 11 | * |
|---|
| 12 | * You should have received a copy of the GNU General Public License |
|---|
| 13 | * along with this program; if not, write to the Free Software |
|---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 15 | */ |
|---|
| 16 | |
|---|
| 17 | /* |
|---|
| 18 | * InstancesUtil.java |
|---|
| 19 | * Copyright (C) 2004 Stijn Lievens |
|---|
| 20 | * |
|---|
| 21 | */ |
|---|
| 22 | |
|---|
| 23 | package weka.classifiers.misc.monotone; |
|---|
| 24 | |
|---|
| 25 | import weka.classifiers.Classifier; |
|---|
| 26 | import weka.core.Instance; |
|---|
| 27 | import weka.core.DenseInstance; |
|---|
| 28 | import weka.core.Instances; |
|---|
| 29 | import weka.core.RevisionHandler; |
|---|
| 30 | import weka.core.RevisionUtils; |
|---|
| 31 | import weka.core.Utils; |
|---|
| 32 | import weka.estimators.DiscreteEstimator; |
|---|
| 33 | |
|---|
| 34 | import java.io.BufferedWriter; |
|---|
| 35 | import java.io.IOException; |
|---|
| 36 | import java.util.HashMap; |
|---|
| 37 | import java.util.Iterator; |
|---|
| 38 | import java.util.Map; |
|---|
| 39 | import java.util.Random; |
|---|
| 40 | |
|---|
| 41 | /** |
|---|
| 42 | * This class contains some methods for working with objects of |
|---|
| 43 | * type <code> Instance </code> and <code> Instances, </code> not |
|---|
| 44 | * provided by there respective classes. |
|---|
| 45 | * <p> |
|---|
| 46 | * This implementation is part of the master's thesis: "Studie |
|---|
| 47 | * en implementatie van instantie-gebaseerde algoritmen voor gesuperviseerd |
|---|
| 48 | * rangschikken", Stijn Lievens, Ghent University, 2004. |
|---|
| 49 | * </p> |
|---|
| 50 | * |
|---|
| 51 | * @author Stijn Lievens (stijn.lievens@ugent.be) |
|---|
| 52 | * @version $Revision: 5987 $ |
|---|
| 53 | */ |
|---|
| 54 | public class InstancesUtil |
|---|
| 55 | implements RevisionHandler { |
|---|
| 56 | |
|---|
| 57 | /** |
|---|
| 58 | * Compares two instances, ignoring the class attribute (if any) |
|---|
| 59 | * |
|---|
| 60 | * @param i1 the first instance |
|---|
| 61 | * @param i2 the second instance |
|---|
| 62 | * @return true if both instances are equal (ignoring the class |
|---|
| 63 | * attribute), false otherwise |
|---|
| 64 | */ |
|---|
| 65 | public static boolean equalIgnoreClass(Instance i1, Instance i2) { |
|---|
| 66 | int n = i1.numAttributes(); |
|---|
| 67 | int classIndex = i1.classIndex(); |
|---|
| 68 | if (i2.numAttributes() != n || classIndex != i2.classIndex()) { |
|---|
| 69 | return false; |
|---|
| 70 | } |
|---|
| 71 | int i = 0; |
|---|
| 72 | while(i < n && (i == classIndex |
|---|
| 73 | || Utils.eq(i1.value(i), i2.value(i)))) { |
|---|
| 74 | i++; |
|---|
| 75 | } |
|---|
| 76 | return i == n; |
|---|
| 77 | } |
|---|
| 78 | |
|---|
| 79 | /** |
|---|
| 80 | * Get the index of an instance in a set of instances, where |
|---|
| 81 | * instances are compared ignoring the class attribute. |
|---|
| 82 | * |
|---|
| 83 | * @param instances the set of instances |
|---|
| 84 | * @param instance to instance to be found in the given set of instances |
|---|
| 85 | * @return the index of the first instance that equals the given instance |
|---|
| 86 | * (ignoring the class attribute), -1 if the instance was not found |
|---|
| 87 | */ |
|---|
| 88 | public static int containsIgnoreClass(Instances instances, Instance instance) { |
|---|
| 89 | double[] dd = instance.toDoubleArray(); |
|---|
| 90 | int classIndex = instances.classIndex(); |
|---|
| 91 | int n = instances.numAttributes(); |
|---|
| 92 | Iterator it = |
|---|
| 93 | new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 94 | int index = 0; |
|---|
| 95 | while(it.hasNext()) { |
|---|
| 96 | Instance tmp = (Instance) it.next(); |
|---|
| 97 | int i = 0; |
|---|
| 98 | while(i < n && |
|---|
| 99 | (i == classIndex || Utils.eq(dd[i], tmp.value(i)))) { |
|---|
| 100 | i++; |
|---|
| 101 | } |
|---|
| 102 | if (i == n) { |
|---|
| 103 | return index; // found it |
|---|
| 104 | } |
|---|
| 105 | index++; |
|---|
| 106 | } |
|---|
| 107 | return -1; |
|---|
| 108 | } |
|---|
| 109 | |
|---|
| 110 | /** |
|---|
| 111 | * Find the next occurence of an instance, ignoring the class, |
|---|
| 112 | * for which the index in the dataset is at least <code> index. </code> |
|---|
| 113 | * |
|---|
| 114 | * @param instances the set of instances to be searched |
|---|
| 115 | * @param instance the instance to be found |
|---|
| 116 | * @param index the minimum index that might be returned |
|---|
| 117 | * @return the index of the first instance with index at |
|---|
| 118 | * least <code> index </code> that equals the given instance |
|---|
| 119 | * (ignoring the class attribute), -1 if the instance was not found |
|---|
| 120 | */ |
|---|
| 121 | public static int nextOccurenceIgnoreClass(Instances instances, Instance instance, int index) { |
|---|
| 122 | double[] dd = instance.toDoubleArray(); |
|---|
| 123 | int classIndex = instances.classIndex(); |
|---|
| 124 | int n = instances.numAttributes(); |
|---|
| 125 | int numInstances = instances.numInstances(); |
|---|
| 126 | int currentIndex = index; |
|---|
| 127 | while(currentIndex < numInstances) { |
|---|
| 128 | Instance tmp = instances.instance(currentIndex); |
|---|
| 129 | int i = 0; |
|---|
| 130 | while(i < n && |
|---|
| 131 | (i == classIndex || Utils.eq(dd[i], tmp.value(i)))) { |
|---|
| 132 | i++; |
|---|
| 133 | } |
|---|
| 134 | if (i == n) { |
|---|
| 135 | return currentIndex; // found it |
|---|
| 136 | } |
|---|
| 137 | currentIndex++; |
|---|
| 138 | } |
|---|
| 139 | return -1; // not present |
|---|
| 140 | } |
|---|
| 141 | |
|---|
| 142 | /** |
|---|
| 143 | * Check if all instances have the same class value. |
|---|
| 144 | * |
|---|
| 145 | * @param instances the instances to be checked for homogeneity |
|---|
| 146 | * @return true if the instances have the same class value, false otherwise |
|---|
| 147 | */ |
|---|
| 148 | public static boolean isHomogeneous(Instances instances) { |
|---|
| 149 | Iterator it = new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 150 | if (it.hasNext()) { |
|---|
| 151 | double classValue = ((Instance) it.next()).classValue(); |
|---|
| 152 | while(it.hasNext()) { |
|---|
| 153 | if (((Instance) it.next()).classValue() != classValue) { |
|---|
| 154 | return false; |
|---|
| 155 | } |
|---|
| 156 | } |
|---|
| 157 | } |
|---|
| 158 | return true; // empty or all identical |
|---|
| 159 | } |
|---|
| 160 | |
|---|
| 161 | /** |
|---|
| 162 | * Compares two instances in the data space, this is ignoring the class |
|---|
| 163 | * attribute. An instance is strictly smaller than another instance |
|---|
| 164 | * if the same holds for the <code> Coordinates </code> based on |
|---|
| 165 | * these instances. |
|---|
| 166 | * |
|---|
| 167 | * @param i1 the first instance |
|---|
| 168 | * @param i2 the second instance |
|---|
| 169 | * @return <code> true </code> if the first instance is strictly smaller |
|---|
| 170 | * than the second instance, <code> false </code> otherwise |
|---|
| 171 | */ |
|---|
| 172 | public static boolean strictlySmaller(Instance i1, Instance i2) { |
|---|
| 173 | // XXX implementation can be done faster |
|---|
| 174 | Coordinates c1 = new Coordinates(i1); |
|---|
| 175 | Coordinates c2 = new Coordinates(i2); |
|---|
| 176 | |
|---|
| 177 | return c1.strictlySmaller(c2); |
|---|
| 178 | } |
|---|
| 179 | |
|---|
| 180 | /** |
|---|
| 181 | * Compares two instances in the data space, this is, ignoring the class |
|---|
| 182 | * attribute. An instance is smaller or equal than another instance |
|---|
| 183 | * if the same holds for the <code> Coordinates </code> based on |
|---|
| 184 | * these instances. |
|---|
| 185 | * |
|---|
| 186 | * @param i1 the first instance |
|---|
| 187 | * @param i2 the second instance |
|---|
| 188 | * @return <code> true </code> if the first instance is smaller or equal |
|---|
| 189 | * than the second instance, <code> false </code> otherwise |
|---|
| 190 | */ |
|---|
| 191 | public static boolean smallerOrEqual(Instance i1,Instance i2) { |
|---|
| 192 | // XXX implementation can be done faster |
|---|
| 193 | Coordinates c1 = new Coordinates(i1); |
|---|
| 194 | Coordinates c2 = new Coordinates(i2); |
|---|
| 195 | |
|---|
| 196 | return c1.smallerOrEqual(c2); |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | /** |
|---|
| 200 | * Checks if two instances are comparable in the data space, this is |
|---|
| 201 | * ignoring the class attribute. Two instances are comparable if the |
|---|
| 202 | * first is smaller or equal than the second, or the other way around. |
|---|
| 203 | * |
|---|
| 204 | * @param i1 the first instance |
|---|
| 205 | * @param i2 the second instance |
|---|
| 206 | * @return <code> true </code> if the given instances are comparable, |
|---|
| 207 | * <code> false </code> otherwise |
|---|
| 208 | * @throws IllegalArgumentException if the two instances don't have the |
|---|
| 209 | * same length |
|---|
| 210 | */ |
|---|
| 211 | public static boolean comparable(Instance i1, Instance i2) throws IllegalArgumentException { |
|---|
| 212 | // XXX maybe we should think about using 'equalHeaders' of Instance |
|---|
| 213 | // to obtain a fool proof implementation |
|---|
| 214 | Coordinates c1 = new Coordinates(i1); |
|---|
| 215 | Coordinates c2 = new Coordinates(i2); |
|---|
| 216 | |
|---|
| 217 | return c1.smallerOrEqual(c2) || c2.smallerOrEqual(c1); |
|---|
| 218 | } |
|---|
| 219 | |
|---|
| 220 | /** |
|---|
| 221 | * Checks it two instances give rise to doubt. There is doubt between |
|---|
| 222 | * two instances if their <code> Coordinates </code> are equal, but |
|---|
| 223 | * their class value is different. |
|---|
| 224 | * |
|---|
| 225 | * @param i1 the first instance |
|---|
| 226 | * @param i2 the second instance |
|---|
| 227 | * @return <code> true </code> if there is doubt between the two |
|---|
| 228 | * given instances, <code> false </code> otherwise |
|---|
| 229 | */ |
|---|
| 230 | public static boolean doubt(Instance i1, Instance i2) { |
|---|
| 231 | // XXX use equalHeaders ? |
|---|
| 232 | if (i1.classValue() == i2.classValue()) { |
|---|
| 233 | return false; |
|---|
| 234 | } |
|---|
| 235 | Coordinates c1 = new Coordinates(i1); |
|---|
| 236 | Coordinates c2 = new Coordinates(i2); |
|---|
| 237 | |
|---|
| 238 | return c1.equals(c2); |
|---|
| 239 | } |
|---|
| 240 | |
|---|
| 241 | /** |
|---|
| 242 | * Checks if two instances give rise to reversed preference. |
|---|
| 243 | * Two instances give rise to reversed preference in the data space, |
|---|
| 244 | * if their <code> Coordinates </code> are comparable but different, |
|---|
| 245 | * and their class values are not related in the same way. |
|---|
| 246 | * |
|---|
| 247 | * @param i1 the first instance |
|---|
| 248 | * @param i2 the second instance |
|---|
| 249 | * @return <code> true </code> if <code> i1 </code> and <code> i2 </code> |
|---|
| 250 | * give rise to reversed preference, <code> false </code> otherwise |
|---|
| 251 | * @throws IllegalArgumentException if the two instances don't have |
|---|
| 252 | * the same length |
|---|
| 253 | */ |
|---|
| 254 | public static boolean reversedPreference(Instance i1, Instance i2) throws IllegalArgumentException { |
|---|
| 255 | // XXX should the implementation be made fool proof by use of |
|---|
| 256 | // 'equalHeaders'? It can also be speeded up I think. |
|---|
| 257 | |
|---|
| 258 | if (i1.classValue() == i2.classValue()) { |
|---|
| 259 | return false; |
|---|
| 260 | } |
|---|
| 261 | Coordinates c1 = new Coordinates(i1); |
|---|
| 262 | Coordinates c2 = new Coordinates(i2); |
|---|
| 263 | |
|---|
| 264 | if (i1.classValue() > i2.classValue() && c1.strictlySmaller(c2)) { |
|---|
| 265 | return true; |
|---|
| 266 | } |
|---|
| 267 | if (i2.classValue() > i1.classValue() && c2.strictlySmaller(c1)) { |
|---|
| 268 | return true; |
|---|
| 269 | } |
|---|
| 270 | |
|---|
| 271 | return false; |
|---|
| 272 | } |
|---|
| 273 | |
|---|
| 274 | /** |
|---|
| 275 | * Checks if the given data set is monotone. We say that a data set |
|---|
| 276 | * is monotone if it contains doubt nor reversed preferences. |
|---|
| 277 | * |
|---|
| 278 | * @param instances the data set to be checked |
|---|
| 279 | * @return <code> true </code> if the given data set if monotone, |
|---|
| 280 | * <code> false </code> otherwise |
|---|
| 281 | */ |
|---|
| 282 | public static boolean isMonotone(Instances instances) { |
|---|
| 283 | int n = instances.numInstances(); |
|---|
| 284 | for (int i = 0; i < n; i++) { |
|---|
| 285 | Instance i1 = instances.instance(i); |
|---|
| 286 | for (int j = i + 1; j < n; j++) { |
|---|
| 287 | if ( doubt(i1, instances.instance(j)) || |
|---|
| 288 | reversedPreference(i1, instances.instance(j))) { |
|---|
| 289 | return false; |
|---|
| 290 | } |
|---|
| 291 | } |
|---|
| 292 | } |
|---|
| 293 | return true; |
|---|
| 294 | } |
|---|
| 295 | |
|---|
| 296 | /** |
|---|
| 297 | * Test if a set of instances is quasi monotone. We say that a set |
|---|
| 298 | * of instances <code> S </code> is quasi monotone with respect to |
|---|
| 299 | * a set of instances <code> D </code> iff |
|---|
| 300 | * <code> [x,y] \cap D \neq \emptyset \implies class(x) \leq class(y). |
|---|
| 301 | * </code> This implies that <code> D </code> itself is monotone. |
|---|
| 302 | * |
|---|
| 303 | * @param ground the instances playing the role of <code> D </code> |
|---|
| 304 | * @param other the instances playing the role of <code> S </code> |
|---|
| 305 | * @return true if the instances are quasi monotone, false otherwise |
|---|
| 306 | */ |
|---|
| 307 | public static boolean isQuasiMonotone(Instances ground, Instances other) { |
|---|
| 308 | if (!isMonotone(ground)) { |
|---|
| 309 | return false; |
|---|
| 310 | } |
|---|
| 311 | Iterator it1 = new EnumerationIterator(ground.enumerateInstances()); |
|---|
| 312 | while(it1.hasNext()) { |
|---|
| 313 | Instance inst1 = (Instance) it1.next(); |
|---|
| 314 | Iterator it2 = new EnumerationIterator(other.enumerateInstances()); |
|---|
| 315 | while(it2.hasNext()) { |
|---|
| 316 | Instance inst2 = (Instance) it2.next(); |
|---|
| 317 | if (doubt(inst1, inst2) || reversedPreference(inst1, inst2)) { |
|---|
| 318 | return false; |
|---|
| 319 | } |
|---|
| 320 | } |
|---|
| 321 | } |
|---|
| 322 | return true; |
|---|
| 323 | } |
|---|
| 324 | |
|---|
| 325 | /** |
|---|
| 326 | * Gather some statistics regarding reversed preferences. |
|---|
| 327 | * |
|---|
| 328 | * @param instances the instances to be examined |
|---|
| 329 | * @return array of length 3; position 0 indicates the number of |
|---|
| 330 | * couples that have reversed preference, position 1 the number of |
|---|
| 331 | * couples that are comparable, and position 2 the total |
|---|
| 332 | * number of couples |
|---|
| 333 | * @see #reversedPreference(Instance, Instance) |
|---|
| 334 | */ |
|---|
| 335 | public static int[] nrOfReversedPreferences(Instances instances) { |
|---|
| 336 | int[] stats = new int[3]; |
|---|
| 337 | int n = instances.numInstances(); |
|---|
| 338 | stats[0] = 0; |
|---|
| 339 | stats[1] = 0; |
|---|
| 340 | // number of couples |
|---|
| 341 | stats[2] = n * (n - 1) / 2; |
|---|
| 342 | for (int i = 0; i < n; i++) { |
|---|
| 343 | Instance i1 = instances.instance(i); |
|---|
| 344 | for (int j = i + 1; j < n; j++) { |
|---|
| 345 | Instance j1 = instances.instance(j); |
|---|
| 346 | if (comparable(i1, j1)) { |
|---|
| 347 | stats[1]++; // comparable |
|---|
| 348 | if (reversedPreference(i1, j1)) { |
|---|
| 349 | stats[0]++; // reversed preference |
|---|
| 350 | } |
|---|
| 351 | } |
|---|
| 352 | } |
|---|
| 353 | } |
|---|
| 354 | return stats; |
|---|
| 355 | } |
|---|
| 356 | |
|---|
| 357 | /** |
|---|
| 358 | * Find the number of stochastic reversed preferences in the dataset. |
|---|
| 359 | * |
|---|
| 360 | * @param instances the instances to be examined |
|---|
| 361 | * @return an array of integers containing at position |
|---|
| 362 | * <ul> |
|---|
| 363 | * <li> 0: number of different coordinates, this is the size of S_X </li> |
|---|
| 364 | * <li> 1: number of couples showing reversed preference:<br> |
|---|
| 365 | * <code> x < y </code> and |
|---|
| 366 | * <code> not (F_x leqstoch F_y) </code> </li> |
|---|
| 367 | * <li> 2: number of couples having<br> |
|---|
| 368 | * <code> x < y </code> and <code> F_y leqstoch F_x </code> |
|---|
| 369 | * and <code> F_x neq F_y </code> </li> |
|---|
| 370 | * <li> 3: number of couples that are comparable <br> |
|---|
| 371 | * <code> |\{ (x,y)\in S_X \times S_x | x < y\}| </code> </li> |
|---|
| 372 | * <li> 4: number of couples in S_X </li> |
|---|
| 373 | * </ul> |
|---|
| 374 | * @throws IllegalArgumentException if there are no instances with |
|---|
| 375 | * a non-missing class value, or if the class is not set |
|---|
| 376 | */ |
|---|
| 377 | public static int[] nrStochasticReversedPreference(Instances instances) |
|---|
| 378 | throws IllegalArgumentException { |
|---|
| 379 | |
|---|
| 380 | if (instances.classIndex() < 0) { |
|---|
| 381 | throw new IllegalArgumentException("Class is not set"); |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | // copy the dataset |
|---|
| 385 | Instances data = new Instances(instances); |
|---|
| 386 | |
|---|
| 387 | // new dataset where examples with missing class value are removed |
|---|
| 388 | data.deleteWithMissingClass(); |
|---|
| 389 | if (data.numInstances() == 0) { |
|---|
| 390 | throw new IllegalArgumentException |
|---|
| 391 | ("No instances with a class value!"); |
|---|
| 392 | } |
|---|
| 393 | |
|---|
| 394 | // build the Map for the estimatedDistributions |
|---|
| 395 | Map distributions = new HashMap(data.numInstances()/2); |
|---|
| 396 | |
|---|
| 397 | // cycle through all instances |
|---|
| 398 | Iterator i = |
|---|
| 399 | new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 400 | |
|---|
| 401 | while (i.hasNext()) { |
|---|
| 402 | Instance instance = (Instance) i.next(); |
|---|
| 403 | Coordinates c = new Coordinates(instance); |
|---|
| 404 | |
|---|
| 405 | // get DiscreteEstimator from the map |
|---|
| 406 | DiscreteEstimator df = |
|---|
| 407 | (DiscreteEstimator) distributions.get(c); |
|---|
| 408 | |
|---|
| 409 | // if no DiscreteEstimator is present in the map, create one |
|---|
| 410 | if (df == null) { |
|---|
| 411 | df = new DiscreteEstimator(instances.numClasses(), 0); |
|---|
| 412 | } |
|---|
| 413 | df.addValue(instance.classValue(),instance.weight()); // update |
|---|
| 414 | distributions.put(c,df); // put back in map |
|---|
| 415 | } |
|---|
| 416 | |
|---|
| 417 | |
|---|
| 418 | // build the map of cumulative distribution functions |
|---|
| 419 | Map cumulativeDistributions = |
|---|
| 420 | new HashMap(distributions.size()); |
|---|
| 421 | |
|---|
| 422 | // Cycle trough the map of discrete distributions, and create a new |
|---|
| 423 | // one containing cumulative discrete distributions |
|---|
| 424 | for (Iterator it=distributions.keySet().iterator(); |
|---|
| 425 | it.hasNext();) { |
|---|
| 426 | Coordinates c = (Coordinates) it.next(); |
|---|
| 427 | DiscreteEstimator df = |
|---|
| 428 | (DiscreteEstimator) distributions.get(c); |
|---|
| 429 | cumulativeDistributions.put |
|---|
| 430 | (c, new CumulativeDiscreteDistribution(df)); |
|---|
| 431 | } |
|---|
| 432 | int[] revPref = new int[5]; |
|---|
| 433 | revPref[0] = cumulativeDistributions.size(); |
|---|
| 434 | Iterator it = cumulativeDistributions.keySet().iterator(); |
|---|
| 435 | while (it.hasNext()) { |
|---|
| 436 | Coordinates c1 = (Coordinates) it.next(); |
|---|
| 437 | CumulativeDiscreteDistribution cdf1 = |
|---|
| 438 | (CumulativeDiscreteDistribution) |
|---|
| 439 | cumulativeDistributions.get(c1); |
|---|
| 440 | Iterator it2 = cumulativeDistributions.keySet().iterator(); |
|---|
| 441 | while(it2.hasNext()) { |
|---|
| 442 | Coordinates c2 = (Coordinates) it2.next(); |
|---|
| 443 | CumulativeDiscreteDistribution cdf2 = |
|---|
| 444 | (CumulativeDiscreteDistribution) |
|---|
| 445 | cumulativeDistributions.get(c2); |
|---|
| 446 | if (c2.equals(c1)) { |
|---|
| 447 | continue; |
|---|
| 448 | } |
|---|
| 449 | |
|---|
| 450 | revPref[4]++; |
|---|
| 451 | |
|---|
| 452 | if (c1.strictlySmaller(c2) == true) { |
|---|
| 453 | revPref[3]++; //vergelijkbaar |
|---|
| 454 | if (cdf1.stochasticDominatedBy(cdf2) == false ) { |
|---|
| 455 | revPref[1]++; |
|---|
| 456 | if (cdf2.stochasticDominatedBy(cdf1) == true) { |
|---|
| 457 | revPref[2]++; |
|---|
| 458 | } |
|---|
| 459 | } |
|---|
| 460 | } |
|---|
| 461 | } |
|---|
| 462 | } |
|---|
| 463 | revPref[4] /= 2; |
|---|
| 464 | return revPref; |
|---|
| 465 | } |
|---|
| 466 | |
|---|
| 467 | /** |
|---|
| 468 | * Counts the number of redundant pairs in the sense of OLM. |
|---|
| 469 | * Two instances are redundant if they are comparable and have the same |
|---|
| 470 | * class value. |
|---|
| 471 | * |
|---|
| 472 | * @param instances the instances to be checked |
|---|
| 473 | * @return the number of redundant pairs in the given set of instances |
|---|
| 474 | */ |
|---|
| 475 | public static int nrOfRedundant(Instances instances) { |
|---|
| 476 | int n = instances.numInstances(); |
|---|
| 477 | int nrRedundant = 0; |
|---|
| 478 | for (int i = 0; i < n; i++) { |
|---|
| 479 | Instance i1 = instances.instance(i); |
|---|
| 480 | for (int j = i + 1; j < n; j++) { |
|---|
| 481 | Instance j1 = instances.instance(j); |
|---|
| 482 | if (j1.classValue() == i1.classValue() && comparable(i1, j1) ) { |
|---|
| 483 | nrRedundant++; |
|---|
| 484 | } |
|---|
| 485 | } |
|---|
| 486 | } |
|---|
| 487 | |
|---|
| 488 | return nrRedundant; |
|---|
| 489 | } |
|---|
| 490 | |
|---|
| 491 | /** |
|---|
| 492 | * Calulates the total loss over the <code> instances </code>, |
|---|
| 493 | * using the trained <code> classifier </code> and the |
|---|
| 494 | * specified <code> lossFunction. </code> The instances |
|---|
| 495 | * should not contain missing values in the class attribute. |
|---|
| 496 | * |
|---|
| 497 | * @param classifier the trained classifier to use |
|---|
| 498 | * @param instances the test instances |
|---|
| 499 | * @param lossFunction the loss function to use |
|---|
| 500 | * @return the total loss of all the instances using the given classifier and loss function |
|---|
| 501 | */ |
|---|
| 502 | public static double totalLoss(Classifier classifier, Instances instances, |
|---|
| 503 | NominalLossFunction lossFunction) { |
|---|
| 504 | |
|---|
| 505 | double loss = 0; |
|---|
| 506 | int n = instances.numInstances(); |
|---|
| 507 | for (int i = 0; i < n; i++) { |
|---|
| 508 | try { |
|---|
| 509 | loss += lossFunction.loss(instances.instance(i).classValue(), |
|---|
| 510 | classifier.classifyInstance(instances.instance(i))); |
|---|
| 511 | } catch (Exception e) { |
|---|
| 512 | // what should we do here ?? |
|---|
| 513 | } |
|---|
| 514 | } |
|---|
| 515 | return loss; |
|---|
| 516 | } |
|---|
| 517 | |
|---|
| 518 | |
|---|
| 519 | /** |
|---|
| 520 | * Classify a set of instances using a given classifier. The class value |
|---|
| 521 | * of the instances are set. |
|---|
| 522 | * |
|---|
| 523 | * @param instances the instances to be classified |
|---|
| 524 | * @param classifier a built classifier |
|---|
| 525 | * @throws Exception if one of the instances could no be classified |
|---|
| 526 | */ |
|---|
| 527 | public static void classifyInstances(Instances instances, Classifier classifier) |
|---|
| 528 | throws Exception { |
|---|
| 529 | |
|---|
| 530 | Iterator it = new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 531 | while(it.hasNext()) { |
|---|
| 532 | Instance instance = (Instance) it.next(); |
|---|
| 533 | instance.setClassValue(classifier.classifyInstance(instance)); |
|---|
| 534 | } |
|---|
| 535 | } |
|---|
| 536 | |
|---|
| 537 | |
|---|
| 538 | /** |
|---|
| 539 | * Calculates the relation (poset) formed by the instances. |
|---|
| 540 | * |
|---|
| 541 | * @param instances the instances for which the poset is to be formed |
|---|
| 542 | * @return a <code> BooleanBitMatrix </code> for which position |
|---|
| 543 | * <code> bm.get(i,j) == true </code> iff <code> |
|---|
| 544 | * InstancesUtil.strictlySmaller(instances.instance(i), |
|---|
| 545 | * instances.instance(j)) == true </code> |
|---|
| 546 | */ |
|---|
| 547 | public static BooleanBitMatrix getBitMatrix(Instances instances) { |
|---|
| 548 | int numInstances = instances.numInstances(); |
|---|
| 549 | BooleanBitMatrix bm = |
|---|
| 550 | new BooleanBitMatrix(numInstances, numInstances); |
|---|
| 551 | for (int i = 0; i < numInstances; i++ ) { |
|---|
| 552 | Instance instance1 = instances.instance(i); |
|---|
| 553 | for (int j = 0; j < numInstances; j++) { |
|---|
| 554 | Instance instance2 = instances.instance(j); |
|---|
| 555 | if (InstancesUtil.strictlySmaller(instance1, instance2)) { |
|---|
| 556 | bm.set(i, j); // arc from instance1 to instance2 |
|---|
| 557 | } |
|---|
| 558 | } |
|---|
| 559 | } |
|---|
| 560 | return bm; |
|---|
| 561 | } |
|---|
| 562 | |
|---|
| 563 | /** |
|---|
| 564 | * Calculatus the number of elements in the closed interval |
|---|
| 565 | * <code> [low,up]. </code> If the class index is set, then |
|---|
| 566 | * the class attribute does not play part in the calculations, |
|---|
| 567 | * this is we work in the data space. The code also works with |
|---|
| 568 | * numeric attributes, but is primarily intended for ordinal attributes. |
|---|
| 569 | * |
|---|
| 570 | * @param low the lower bound of the interval |
|---|
| 571 | * @param up the upper bound of the interval |
|---|
| 572 | * @return the size of the interval (in floating point format) |
|---|
| 573 | * @throws IllegalArgumentException if the given instances do not |
|---|
| 574 | * constitute an interval. |
|---|
| 575 | */ |
|---|
| 576 | public static double numberInInterval(Instance low, Instance up) |
|---|
| 577 | throws IllegalArgumentException { |
|---|
| 578 | |
|---|
| 579 | Coordinates cLow = new Coordinates(low); |
|---|
| 580 | Coordinates cUp = new Coordinates(up); |
|---|
| 581 | if (cLow.smallerOrEqual(cUp) == false) { |
|---|
| 582 | throw new IllegalArgumentException |
|---|
| 583 | ("The given instances are not the bounds of an interval"); |
|---|
| 584 | } |
|---|
| 585 | double number = 1; |
|---|
| 586 | int dim = cLow.dimension(); |
|---|
| 587 | for (int i = 0; i < dim; i++) { |
|---|
| 588 | number *= (cUp.getValue(i) - cLow.getValue(i) + 1); |
|---|
| 589 | } |
|---|
| 590 | return number; |
|---|
| 591 | } |
|---|
| 592 | |
|---|
| 593 | |
|---|
| 594 | /** |
|---|
| 595 | * Calculatutes the number of vectors in the data space that are smaller |
|---|
| 596 | * or equal than the given instance. |
|---|
| 597 | * |
|---|
| 598 | * @param instance the given instance |
|---|
| 599 | * @return the number of vectors in the data space smaller or equal |
|---|
| 600 | * than the given instance |
|---|
| 601 | * @throws IllegalArgumentException if there are numeric attributes |
|---|
| 602 | */ |
|---|
| 603 | public static double numberOfSmallerVectors(Instance instance) |
|---|
| 604 | throws IllegalArgumentException { |
|---|
| 605 | |
|---|
| 606 | double[] values = InstancesUtil.toDataDouble(instance); |
|---|
| 607 | double nr = 1; |
|---|
| 608 | |
|---|
| 609 | for (int i = 0; i < values.length; i++) { |
|---|
| 610 | if (instance.attribute(i).isNumeric()) { |
|---|
| 611 | throw new IllegalArgumentException |
|---|
| 612 | ("Numeric attributes are not supported"); |
|---|
| 613 | } |
|---|
| 614 | nr *= (values[i] + 1); |
|---|
| 615 | } |
|---|
| 616 | |
|---|
| 617 | return nr; |
|---|
| 618 | } |
|---|
| 619 | |
|---|
| 620 | /** |
|---|
| 621 | * Calculatutes the number of vectors in the data space that are |
|---|
| 622 | * greater or equal than the given instance. |
|---|
| 623 | * |
|---|
| 624 | * @param instance the given instance |
|---|
| 625 | * @return the number of vectors in the data space greater of equal |
|---|
| 626 | * than the given instance |
|---|
| 627 | * @throws IllegalArgumentException if there are numeric attributes |
|---|
| 628 | */ |
|---|
| 629 | public static double numberOfGreaterVectors(Instance instance) |
|---|
| 630 | throws IllegalArgumentException { |
|---|
| 631 | |
|---|
| 632 | double[] values = InstancesUtil.toDataDouble(instance); |
|---|
| 633 | double nr = 1; |
|---|
| 634 | |
|---|
| 635 | for (int i = 0; i < values.length; i++) { |
|---|
| 636 | if (instance.attribute(i).isNumeric()) { |
|---|
| 637 | throw new IllegalArgumentException |
|---|
| 638 | ("Numeric attributes are not supported"); |
|---|
| 639 | } |
|---|
| 640 | nr *= (instance.attribute(i).numValues() - values[i]); |
|---|
| 641 | } |
|---|
| 642 | |
|---|
| 643 | return nr; |
|---|
| 644 | } |
|---|
| 645 | |
|---|
| 646 | /** |
|---|
| 647 | * Write the instances in ARFF-format to the indicated |
|---|
| 648 | * <code> BufferedWriter </code>. |
|---|
| 649 | * @param instances the instances to write |
|---|
| 650 | * @param file the <code> BufferedWriter </code> to write to |
|---|
| 651 | * @throws IOException if something goes wrong while writing the instances |
|---|
| 652 | */ |
|---|
| 653 | public static void write(Instances instances, BufferedWriter file) |
|---|
| 654 | throws IOException{ |
|---|
| 655 | |
|---|
| 656 | file.write(instances.toString()); // XXX can probably be done better |
|---|
| 657 | } |
|---|
| 658 | |
|---|
| 659 | |
|---|
| 660 | /** |
|---|
| 661 | * Return a histogram of the values for the specified attribute. |
|---|
| 662 | * |
|---|
| 663 | * @param instances the instances |
|---|
| 664 | * @param attributeIndex the attribute to consider |
|---|
| 665 | * @return a <code> DiscreteEstimator </code> where the <code>i</code>th |
|---|
| 666 | * @throws IllegalArgumentException if the attribute at the specified |
|---|
| 667 | * index is numeric |
|---|
| 668 | */ |
|---|
| 669 | public static DiscreteEstimator countValues(Instances instances, int attributeIndex) |
|---|
| 670 | throws IllegalArgumentException{ |
|---|
| 671 | |
|---|
| 672 | int numValues = instances.attribute(attributeIndex).numValues(); |
|---|
| 673 | if (numValues == 0) { |
|---|
| 674 | throw new IllegalArgumentException |
|---|
| 675 | ("Can't create histogram for numeric attribute"); |
|---|
| 676 | } |
|---|
| 677 | |
|---|
| 678 | DiscreteEstimator de = new DiscreteEstimator(numValues, false); |
|---|
| 679 | Iterator it = new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 680 | while (it.hasNext()) { |
|---|
| 681 | Instance instance = (Instance) it.next(); |
|---|
| 682 | if (!instance.isMissing(attributeIndex)) { |
|---|
| 683 | de.addValue(instance.value(attributeIndex), instance.weight()); |
|---|
| 684 | } |
|---|
| 685 | } |
|---|
| 686 | return de; |
|---|
| 687 | } |
|---|
| 688 | |
|---|
| 689 | /** |
|---|
| 690 | * Create, without replacement, a random subsample of the given size |
|---|
| 691 | * from the given instances. |
|---|
| 692 | * |
|---|
| 693 | * @param instances the instances to sample from |
|---|
| 694 | * @param size the requested size of the sample |
|---|
| 695 | * @param random the random generator to use |
|---|
| 696 | * @return a sample of the requested size, drawn from the given |
|---|
| 697 | * instances without replacement |
|---|
| 698 | * @throws IllegalArgumentException if the size exceeds the number |
|---|
| 699 | * of instances |
|---|
| 700 | */ |
|---|
| 701 | public static Instances sampleWithoutReplacement( |
|---|
| 702 | Instances instances, int size, Random random) { |
|---|
| 703 | |
|---|
| 704 | if (size > instances.numInstances()) { |
|---|
| 705 | throw new IllegalArgumentException |
|---|
| 706 | ("Size of requested sample exceeds number of instances"); |
|---|
| 707 | } |
|---|
| 708 | |
|---|
| 709 | int numInstances = instances.numInstances(); |
|---|
| 710 | int[] indices = new int[instances.numInstances()]; |
|---|
| 711 | for (int i = 0; i < numInstances; i++) { |
|---|
| 712 | indices[i] = i; |
|---|
| 713 | } |
|---|
| 714 | |
|---|
| 715 | Instances sample = new Instances(instances, size); |
|---|
| 716 | int index; |
|---|
| 717 | for (int i = 0; i < size; i++) { |
|---|
| 718 | index = random.nextInt(numInstances--); |
|---|
| 719 | sample.add(instances.instance(indices[index])); |
|---|
| 720 | swap(indices, index, numInstances); |
|---|
| 721 | } |
|---|
| 722 | return sample; |
|---|
| 723 | } |
|---|
| 724 | |
|---|
| 725 | /** |
|---|
| 726 | * Swaps two elements of the given array. |
|---|
| 727 | * |
|---|
| 728 | * @param aa the array |
|---|
| 729 | * @param i the index of the first element |
|---|
| 730 | * @param j the index of the second element |
|---|
| 731 | */ |
|---|
| 732 | final private static void swap(int[] aa, int i, int j) { |
|---|
| 733 | int tmp = aa[i]; |
|---|
| 734 | aa[i] = aa[j]; |
|---|
| 735 | aa[j] = tmp; |
|---|
| 736 | } |
|---|
| 737 | /** |
|---|
| 738 | * Generates a random sample of instances. Each attribute must be nominal, and the |
|---|
| 739 | * class labels are not set. |
|---|
| 740 | * |
|---|
| 741 | * @param headerInfo Instances whose header information is used to determine how the |
|---|
| 742 | * set of returned instances will look |
|---|
| 743 | * @param numberOfExamples the desired size of the returned set |
|---|
| 744 | * @param random the random number generator to use |
|---|
| 745 | * @return a set of Instances containing the random sample. |
|---|
| 746 | * @throws IllegalArgumentException if numeric attributes are given |
|---|
| 747 | */ |
|---|
| 748 | public static Instances generateRandomSample( |
|---|
| 749 | Instances headerInfo, int numberOfExamples, Random random) |
|---|
| 750 | throws IllegalArgumentException { |
|---|
| 751 | |
|---|
| 752 | int n = headerInfo.numAttributes(); |
|---|
| 753 | double[] info = new double[n]; |
|---|
| 754 | int classIndex = headerInfo.classIndex(); |
|---|
| 755 | for (int i = 0; i < n; i++) { |
|---|
| 756 | info[i] = headerInfo.attribute(i).numValues(); |
|---|
| 757 | if (i != classIndex && info[i] == 0) { |
|---|
| 758 | throw new IllegalArgumentException |
|---|
| 759 | ("Numeric attributes are currently not supported"); |
|---|
| 760 | } |
|---|
| 761 | } |
|---|
| 762 | Instances sample = new Instances(headerInfo, numberOfExamples); |
|---|
| 763 | sample.setRelationName(headerInfo.relationName() + |
|---|
| 764 | ".random.sample.of." + numberOfExamples); |
|---|
| 765 | for (int i = 0; i < numberOfExamples; i++) { |
|---|
| 766 | sample.add(randomSample(info, classIndex, random)); |
|---|
| 767 | } |
|---|
| 768 | return sample; |
|---|
| 769 | } |
|---|
| 770 | /** |
|---|
| 771 | * Generates a random instance. |
|---|
| 772 | * |
|---|
| 773 | * @param info array that gives for each attribute the number of possible values |
|---|
| 774 | * @param classIndex the index of the class attribute |
|---|
| 775 | * @param random the random number generator used |
|---|
| 776 | * @return a random instance |
|---|
| 777 | */ |
|---|
| 778 | private static Instance randomSample(double[] info, |
|---|
| 779 | int classIndex, Random random) { |
|---|
| 780 | |
|---|
| 781 | double[] attValues = new double[info.length]; |
|---|
| 782 | for (int i = 0; i < attValues.length; i++) { |
|---|
| 783 | if (i != classIndex) { |
|---|
| 784 | attValues[i] = random.nextInt( (int) info[i]); |
|---|
| 785 | } |
|---|
| 786 | } |
|---|
| 787 | return new DenseInstance(1, attValues); |
|---|
| 788 | } |
|---|
| 789 | |
|---|
| 790 | |
|---|
| 791 | |
|---|
| 792 | /** |
|---|
| 793 | * Returns an array containing the attribute values (in internal floating |
|---|
| 794 | * point format) of the given instance in data space, this is, the class |
|---|
| 795 | * attribute (if any) is removed. |
|---|
| 796 | * |
|---|
| 797 | * @param instance the instance to get the attribute values from |
|---|
| 798 | * @return array of doubles containing the attribute values |
|---|
| 799 | */ |
|---|
| 800 | public static double[] toDataDouble(Instance instance) { |
|---|
| 801 | double[] vector = null; |
|---|
| 802 | int classIndex = instance.classIndex(); |
|---|
| 803 | if(classIndex >= 0) { |
|---|
| 804 | vector = new double[instance.numAttributes() - 1]; |
|---|
| 805 | } else { |
|---|
| 806 | vector = new double[instance.numAttributes()]; |
|---|
| 807 | } |
|---|
| 808 | int index = 0; |
|---|
| 809 | for (int i = 0; i < instance.numAttributes(); i++) { |
|---|
| 810 | if(i != classIndex) { |
|---|
| 811 | vector[index++] = instance.value(i); |
|---|
| 812 | } |
|---|
| 813 | } |
|---|
| 814 | return vector; |
|---|
| 815 | } |
|---|
| 816 | |
|---|
| 817 | /** |
|---|
| 818 | * Computes the minimal extension for a given instance. |
|---|
| 819 | * |
|---|
| 820 | * @param instances the set of instances |
|---|
| 821 | * @param instance the instance for which the minimal extension is to be |
|---|
| 822 | * calculated |
|---|
| 823 | * @return the value of the minimal extension, in internal floating point |
|---|
| 824 | * format |
|---|
| 825 | */ |
|---|
| 826 | public static double minimalExtension(Instances instances, Instance instance) { |
|---|
| 827 | return minimalExtension(instances, instance, 0); |
|---|
| 828 | } |
|---|
| 829 | |
|---|
| 830 | /** |
|---|
| 831 | * Computes the minimal extension of a given instance, but the |
|---|
| 832 | * minimal value returned is <code> minValue. </code> This method |
|---|
| 833 | * may have its applications when the training set is divided into |
|---|
| 834 | * multiple Instances objects. |
|---|
| 835 | * |
|---|
| 836 | * @param instances the set of instances |
|---|
| 837 | * @param instance the instance for which the minimal extension is to |
|---|
| 838 | * be calculated |
|---|
| 839 | * @param minValue a double indicating the minimal value that should |
|---|
| 840 | * be returned |
|---|
| 841 | * @return the label of the minimal extension, in internal floating point format |
|---|
| 842 | */ |
|---|
| 843 | public static double minimalExtension( |
|---|
| 844 | Instances instances, Instance instance, double minValue) { |
|---|
| 845 | |
|---|
| 846 | double value = minValue; |
|---|
| 847 | |
|---|
| 848 | Iterator it = |
|---|
| 849 | new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 850 | while(it.hasNext()) { |
|---|
| 851 | Instance tmp = (Instance) it.next(); |
|---|
| 852 | if (tmp.classValue() > value |
|---|
| 853 | && InstancesUtil.smallerOrEqual(tmp, instance) ) { |
|---|
| 854 | value = tmp.classValue(); |
|---|
| 855 | } |
|---|
| 856 | } |
|---|
| 857 | return value; |
|---|
| 858 | } |
|---|
| 859 | |
|---|
| 860 | /** |
|---|
| 861 | * Computes the maximal extension for a given instance. |
|---|
| 862 | * |
|---|
| 863 | * @param instances the set of instances |
|---|
| 864 | * @param instance the instance for which the minimal extension is to be |
|---|
| 865 | * calculated |
|---|
| 866 | * @return the value of the minimal extension, in internal floating point |
|---|
| 867 | * format |
|---|
| 868 | */ |
|---|
| 869 | public static double maximalExtension(Instances instances, Instance instance) { |
|---|
| 870 | return maximalExtension(instances, instance, instances.numClasses() - 1); |
|---|
| 871 | } |
|---|
| 872 | |
|---|
| 873 | /** |
|---|
| 874 | * Computes the maximal extension of a given instance, but the |
|---|
| 875 | * maximal value returned is <code> maxValue. </code> This method |
|---|
| 876 | * may have its applications when the training set is divided into |
|---|
| 877 | * multiple Instances objects. |
|---|
| 878 | * |
|---|
| 879 | * @param instances the set of instances |
|---|
| 880 | * @param instance the instance for which the maximal extension is to |
|---|
| 881 | * be calculated |
|---|
| 882 | * @param maxValue a double indicating the maximal value that should |
|---|
| 883 | * be returned |
|---|
| 884 | * @return the value of the minimal extension, in internal floating point |
|---|
| 885 | * format |
|---|
| 886 | */ |
|---|
| 887 | public static double maximalExtension( |
|---|
| 888 | Instances instances, Instance instance, double maxValue) { |
|---|
| 889 | |
|---|
| 890 | double value = maxValue; |
|---|
| 891 | |
|---|
| 892 | Iterator it = |
|---|
| 893 | new EnumerationIterator(instances.enumerateInstances()); |
|---|
| 894 | while(it.hasNext()) { |
|---|
| 895 | Instance tmp = (Instance) it.next(); |
|---|
| 896 | if (tmp.classValue() < value |
|---|
| 897 | && InstancesUtil.smallerOrEqual(instance, tmp) ) { |
|---|
| 898 | value = tmp.classValue(); |
|---|
| 899 | } |
|---|
| 900 | } |
|---|
| 901 | return value; |
|---|
| 902 | } |
|---|
| 903 | |
|---|
| 904 | /** |
|---|
| 905 | * Returns the revision string. |
|---|
| 906 | * |
|---|
| 907 | * @return the revision |
|---|
| 908 | */ |
|---|
| 909 | public String getRevision() { |
|---|
| 910 | return RevisionUtils.extract("$Revision: 5987 $"); |
|---|
| 911 | } |
|---|
| 912 | } |
|---|