source: src/main/java/weka/classifiers/misc/monotone/InstancesUtil.java @ 14

Last change on this file since 14 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 29.8 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    InstancesUtil.java
19 *    Copyright (C) 2004 Stijn Lievens
20 *
21 */
22
23package weka.classifiers.misc.monotone;
24
25import weka.classifiers.Classifier;
26import weka.core.Instance;
27import weka.core.DenseInstance;
28import weka.core.Instances;
29import weka.core.RevisionHandler;
30import weka.core.RevisionUtils;
31import weka.core.Utils;
32import weka.estimators.DiscreteEstimator;
33
34import java.io.BufferedWriter;
35import java.io.IOException;
36import java.util.HashMap;
37import java.util.Iterator;
38import java.util.Map;
39import java.util.Random;
40
41/**
42 * This class contains some methods for working with objects of
43 * type <code> Instance </code> and <code> Instances, </code> not
44 * provided by there respective classes.
45 * <p>
46 * This implementation is part of the master's thesis: "Studie
47 * en implementatie van instantie-gebaseerde algoritmen voor gesuperviseerd
48 * rangschikken", Stijn Lievens, Ghent University, 2004.
49 * </p>
50 *
51 * @author Stijn Lievens (stijn.lievens@ugent.be)
52 * @version $Revision: 5987 $
53 */
54public class InstancesUtil
55  implements RevisionHandler {
56
57  /**
58   * Compares two instances, ignoring the class attribute (if any)
59   *
60   * @param i1 the first instance
61   * @param i2 the second instance
62   * @return true if both instances are equal (ignoring the class
63   * attribute), false otherwise
64   */
65  public static boolean equalIgnoreClass(Instance i1, Instance i2) {
66    int n = i1.numAttributes();
67    int classIndex = i1.classIndex();
68    if (i2.numAttributes() != n || classIndex != i2.classIndex()) {
69      return false;
70    }
71    int i = 0;
72    while(i < n && (i == classIndex
73        || Utils.eq(i1.value(i), i2.value(i)))) {
74      i++;
75    }
76    return i == n;
77  }
78
79  /**
80   * Get the index of an instance in a set of instances, where
81   * instances are compared ignoring the class attribute.
82   *
83   * @param instances the set of instances
84   * @param instance to instance to be found in the given set of instances
85   * @return the index of the first instance that equals the given instance
86   * (ignoring the class attribute), -1 if the instance was not found
87   */
88  public static int containsIgnoreClass(Instances instances, Instance instance) {
89    double[] dd = instance.toDoubleArray();
90    int classIndex = instances.classIndex();
91    int n = instances.numAttributes();
92    Iterator it = 
93      new EnumerationIterator(instances.enumerateInstances());
94    int index = 0;
95    while(it.hasNext()) {
96      Instance tmp = (Instance) it.next();
97      int i = 0;
98      while(i < n && 
99          (i == classIndex || Utils.eq(dd[i], tmp.value(i)))) {
100        i++;
101      }
102      if (i == n) { 
103        return index;  // found it
104      }
105      index++;
106    }
107    return -1;
108  }
109
110  /**
111   * Find the next occurence of an instance, ignoring the class,
112   * for which the index in the dataset is at least <code> index. </code>
113   *
114   * @param instances the set of instances to be searched
115   * @param instance the instance to be found
116   * @param index the minimum index that might be returned
117   * @return the index of the first instance with index at
118   * least <code> index </code> that equals the given instance
119   * (ignoring the class attribute), -1 if the instance was not found
120   */
121  public static int nextOccurenceIgnoreClass(Instances instances, Instance instance, int index) {
122    double[] dd = instance.toDoubleArray();
123    int classIndex = instances.classIndex();
124    int n = instances.numAttributes();
125    int numInstances = instances.numInstances();
126    int currentIndex = index;
127    while(currentIndex < numInstances) {
128      Instance tmp = instances.instance(currentIndex);
129      int i = 0;
130      while(i < n && 
131          (i == classIndex || Utils.eq(dd[i], tmp.value(i)))) {
132        i++;
133      }
134      if (i == n) { 
135        return currentIndex;  // found it
136      }
137      currentIndex++;
138    }
139    return -1; // not present
140  }
141
142  /**
143   * Check if all instances have the same class value.
144   *
145   * @param instances the instances to be checked for homogeneity
146   * @return true if the instances have the same class value, false otherwise
147   */
148  public static boolean isHomogeneous(Instances instances) {
149    Iterator it = new EnumerationIterator(instances.enumerateInstances());
150    if (it.hasNext()) {
151      double classValue = ((Instance) it.next()).classValue();
152      while(it.hasNext()) {
153        if (((Instance) it.next()).classValue() != classValue) {
154          return false;
155        }
156      }
157    }
158    return true; // empty or all identical
159  }
160
161  /**
162   * Compares two instances in the data space, this is ignoring the class
163   * attribute.  An instance is strictly smaller than another instance
164   * if the same holds for the <code> Coordinates </code> based on
165   * these instances.
166   *
167   * @param i1 the first instance
168   * @param i2 the second instance
169   * @return <code> true </code> if the first instance is strictly smaller
170   * than the second instance, <code> false </code> otherwise
171   */
172  public static boolean strictlySmaller(Instance i1, Instance i2) {
173    // XXX implementation can be done faster
174    Coordinates c1 = new Coordinates(i1);
175    Coordinates c2 = new Coordinates(i2);
176
177    return c1.strictlySmaller(c2);
178  }
179
180  /**
181   * Compares two instances in the data space, this is, ignoring the class
182   * attribute.  An instance is smaller or equal than another instance
183   * if the same holds for the <code> Coordinates </code> based on
184   * these instances.
185   *
186   * @param i1 the first instance
187   * @param i2 the second instance
188   * @return <code> true </code> if the first instance is smaller or equal
189   * than the second instance, <code> false </code> otherwise
190   */
191  public static boolean smallerOrEqual(Instance i1,Instance i2) {
192    // XXX implementation can be done faster
193    Coordinates c1 = new Coordinates(i1);
194    Coordinates c2 = new Coordinates(i2);
195
196    return c1.smallerOrEqual(c2);
197  }
198
199  /**
200   * Checks if two instances are comparable in the data space, this is
201   * ignoring the class attribute.  Two instances are comparable if the
202   * first is smaller or equal than the second, or the other way around.
203   *
204   * @param i1 the first instance
205   * @param i2 the second instance
206   * @return <code> true </code> if the given instances are comparable,
207   * <code> false </code> otherwise
208   * @throws IllegalArgumentException if the two instances don't have the
209   * same length
210   */
211  public static boolean comparable(Instance i1, Instance i2) throws IllegalArgumentException {
212    // XXX maybe we should think about using 'equalHeaders' of Instance
213    // to obtain a fool proof implementation
214    Coordinates c1 = new Coordinates(i1);
215    Coordinates c2 = new Coordinates(i2);
216
217    return c1.smallerOrEqual(c2) || c2.smallerOrEqual(c1);
218  }
219
220  /**
221   * Checks it two instances give rise to doubt.  There is doubt between
222   * two instances if their <code> Coordinates </code> are equal, but
223   * their class value is different.
224   *
225   * @param i1 the first instance
226   * @param i2 the second instance
227   * @return <code> true </code> if there is doubt between the two
228   * given instances, <code> false </code> otherwise
229   */
230  public static boolean doubt(Instance i1, Instance i2) {
231    // XXX use equalHeaders ?
232    if (i1.classValue() == i2.classValue()) {
233      return false;
234    }
235    Coordinates c1 = new Coordinates(i1);
236    Coordinates c2 = new Coordinates(i2);
237
238    return c1.equals(c2); 
239  }
240
241  /**
242   *  Checks if two instances give rise to reversed preference. 
243   *  Two instances give rise to reversed preference in the data space,
244   *  if their <code> Coordinates </code> are comparable but different,
245   *  and their class values are not related in the same way.
246   * 
247   *  @param i1 the first instance
248   *  @param i2 the second instance
249   *  @return <code> true </code> if <code> i1 </code> and <code> i2 </code>
250   *  give rise to reversed preference, <code> false </code> otherwise
251   *  @throws IllegalArgumentException if the two instances don't have
252   *  the same length
253   */
254  public static boolean reversedPreference(Instance i1, Instance i2) throws IllegalArgumentException {
255    // XXX should the implementation be made fool proof by use of
256    // 'equalHeaders'?  It can also be speeded up I think.
257
258    if (i1.classValue() == i2.classValue()) {
259      return false;
260    }
261    Coordinates c1 = new Coordinates(i1);
262    Coordinates c2 = new Coordinates(i2);
263
264    if (i1.classValue() > i2.classValue() && c1.strictlySmaller(c2)) {
265      return true;
266    }
267    if (i2.classValue() > i1.classValue() && c2.strictlySmaller(c1)) {
268      return true;
269    }
270
271    return false;
272  }
273
274  /**
275   *  Checks if the given data set is monotone.  We say that a data set
276   *  is monotone if it contains doubt nor reversed preferences.
277   *
278   *  @param instances the data set to be checked
279   *  @return <code> true </code> if the given data set if monotone,
280   *  <code> false </code> otherwise
281   */ 
282  public static boolean isMonotone(Instances instances) {
283    int n = instances.numInstances();
284    for (int i = 0; i < n; i++) {
285      Instance i1 = instances.instance(i);
286      for (int j = i + 1; j < n; j++) {
287        if ( doubt(i1, instances.instance(j)) ||
288            reversedPreference(i1, instances.instance(j))) {
289          return false;
290        }
291      }
292    }
293    return true;
294  }
295
296  /**
297   * Test if a set of instances is quasi monotone.  We say that a set
298   * of instances <code> S </code> is quasi monotone with respect to
299   * a set of instances <code> D </code> iff
300   * <code> [x,y] \cap D \neq \emptyset \implies class(x) \leq class(y).
301   * </code>  This implies that <code> D </code> itself is monotone.
302   *
303   * @param ground the instances playing the role of <code> D </code>
304   * @param other the instances playing the role of <code> S </code>
305   * @return true if the instances are quasi monotone, false otherwise
306   */
307  public static boolean isQuasiMonotone(Instances ground, Instances other) {
308    if (!isMonotone(ground)) {
309      return false;
310    }
311    Iterator it1 = new EnumerationIterator(ground.enumerateInstances());
312    while(it1.hasNext()) {
313      Instance inst1 = (Instance) it1.next();
314      Iterator it2 = new EnumerationIterator(other.enumerateInstances());
315      while(it2.hasNext()) {
316        Instance inst2 = (Instance) it2.next();
317        if (doubt(inst1, inst2) || reversedPreference(inst1, inst2)) {
318          return false;
319        }
320      }
321    }
322    return true;
323  }
324
325  /**
326   * Gather some statistics regarding reversed preferences.
327   *
328   * @param instances the instances to be examined
329   * @return array of length 3; position 0 indicates the number of
330   *  couples that have reversed preference, position 1 the number of
331   *  couples that are comparable, and position 2 the total
332   *  number of couples
333   * @see #reversedPreference(Instance, Instance)
334   */ 
335  public static int[] nrOfReversedPreferences(Instances instances) {
336    int[] stats = new int[3];
337    int n = instances.numInstances();
338    stats[0] = 0;
339    stats[1] = 0;
340    // number of couples
341    stats[2] = n * (n - 1) / 2; 
342    for (int i = 0; i < n; i++) {
343      Instance i1 = instances.instance(i);
344      for (int j = i + 1; j < n; j++) {
345        Instance j1 = instances.instance(j);
346        if (comparable(i1, j1)) {
347          stats[1]++;  // comparable
348          if (reversedPreference(i1, j1)) {
349            stats[0]++; // reversed preference
350          }
351        }
352      }
353    }
354    return stats;
355  }
356
357  /**
358   * Find the number of stochastic reversed preferences in the dataset.
359   *
360   * @param instances the instances to be examined
361   * @return an array of integers containing at position
362   * <ul>
363   * <li> 0: number of different coordinates, this is the size of S_X </li>
364   * <li> 1: number of couples showing reversed preference:<br>
365   * <code> x &lt; y </code> and
366   *        <code> not (F_x leqstoch F_y) </code> </li>
367   * <li> 2: number of couples having<br>
368   *   <code> x &lt; y </code> and <code> F_y leqstoch F_x </code>
369   *                                  and <code> F_x neq F_y </code> </li>
370   * <li> 3: number of couples that are comparable <br>
371   *          <code> |\{ (x,y)\in S_X \times S_x | x &lt; y\}| </code> </li>
372   * <li> 4: number of couples in S_X </li>
373   * </ul>
374   * @throws IllegalArgumentException if there are no instances with
375   * a non-missing class value, or if the class is not set
376   */
377  public static int[] nrStochasticReversedPreference(Instances instances)
378    throws IllegalArgumentException {
379
380    if (instances.classIndex() < 0) {
381      throw new IllegalArgumentException("Class is not set");
382    }
383
384    // copy the dataset
385    Instances data = new Instances(instances);
386
387    // new dataset where examples with missing class value are removed
388    data.deleteWithMissingClass();
389    if (data.numInstances() == 0) {
390      throw new IllegalArgumentException
391      ("No instances with a class value!");
392    }
393
394    // build the Map for the estimatedDistributions
395    Map distributions = new HashMap(data.numInstances()/2);
396
397    // cycle through all instances
398    Iterator i = 
399      new EnumerationIterator(instances.enumerateInstances());
400
401    while (i.hasNext()) { 
402      Instance instance = (Instance) i.next();
403      Coordinates c = new Coordinates(instance);
404
405      // get DiscreteEstimator from the map
406      DiscreteEstimator df = 
407        (DiscreteEstimator) distributions.get(c);
408
409      // if no DiscreteEstimator is present in the map, create one
410      if (df == null) {
411        df = new DiscreteEstimator(instances.numClasses(), 0);
412      }
413      df.addValue(instance.classValue(),instance.weight()); // update
414      distributions.put(c,df); // put back in map
415    }
416
417
418    // build the map of cumulative distribution functions
419    Map cumulativeDistributions = 
420      new HashMap(distributions.size());
421
422    // Cycle trough the map of discrete distributions, and create a new
423    // one containing cumulative discrete distributions
424    for (Iterator it=distributions.keySet().iterator();
425    it.hasNext();) {
426      Coordinates c = (Coordinates) it.next();
427      DiscreteEstimator df = 
428        (DiscreteEstimator) distributions.get(c);
429      cumulativeDistributions.put
430      (c, new CumulativeDiscreteDistribution(df));
431    }
432    int[] revPref = new int[5]; 
433    revPref[0] = cumulativeDistributions.size();
434    Iterator it = cumulativeDistributions.keySet().iterator();
435    while (it.hasNext()) {
436      Coordinates c1 = (Coordinates) it.next();
437      CumulativeDiscreteDistribution cdf1 = 
438        (CumulativeDiscreteDistribution) 
439        cumulativeDistributions.get(c1);
440      Iterator it2 = cumulativeDistributions.keySet().iterator();
441      while(it2.hasNext()) {
442        Coordinates c2 = (Coordinates) it2.next();
443        CumulativeDiscreteDistribution cdf2 = 
444          (CumulativeDiscreteDistribution)
445          cumulativeDistributions.get(c2);
446        if (c2.equals(c1)) {
447          continue;
448        }
449
450        revPref[4]++;
451
452        if (c1.strictlySmaller(c2) == true) {
453          revPref[3]++; //vergelijkbaar
454          if (cdf1.stochasticDominatedBy(cdf2) == false ) {
455            revPref[1]++;
456            if (cdf2.stochasticDominatedBy(cdf1) == true) {
457              revPref[2]++;
458            }
459          }
460        } 
461      }
462    }
463    revPref[4] /= 2; 
464    return revPref;
465  }
466
467  /**
468   * Counts the number of redundant pairs in the sense of OLM.
469   * Two instances are redundant if they are comparable and have the same
470   * class value.
471   *
472   * @param instances the instances to be checked
473   * @return the number of redundant pairs in the given set of instances
474   */
475  public static int nrOfRedundant(Instances instances) {
476    int n = instances.numInstances();
477    int nrRedundant = 0;
478    for (int i = 0; i < n; i++) {
479      Instance i1 = instances.instance(i);
480      for (int j = i + 1; j < n; j++) {
481        Instance j1 = instances.instance(j);
482        if (j1.classValue() == i1.classValue() && comparable(i1, j1) ) {
483          nrRedundant++; 
484        }
485      }
486    }
487
488    return nrRedundant;
489  }
490
491  /**
492   * Calulates the total loss over the <code> instances </code>,
493   * using the trained <code> classifier </code> and the
494   * specified <code> lossFunction. </code>  The instances
495   * should not contain missing values in the class attribute.
496   *
497   * @param classifier the trained classifier to use
498   * @param instances the test instances
499   * @param lossFunction the loss function to use
500   * @return the total loss of all the instances using the given classifier and loss function
501   */
502  public static double totalLoss(Classifier classifier, Instances instances, 
503      NominalLossFunction lossFunction) {
504
505    double loss = 0;
506    int n = instances.numInstances();
507    for (int i = 0; i < n; i++) {
508      try {
509        loss += lossFunction.loss(instances.instance(i).classValue(), 
510            classifier.classifyInstance(instances.instance(i)));
511      } catch (Exception e) { 
512        // what should we do here ??
513      }
514    }
515    return loss;
516  }
517
518
519  /**
520   * Classify a set of instances using a given classifier.  The class value
521   * of the instances are set.
522   *
523   * @param instances the instances to be classified
524   * @param classifier a built classifier
525   * @throws Exception if one of the instances could no be classified
526   */
527  public static void classifyInstances(Instances instances, Classifier classifier)
528    throws Exception {
529   
530    Iterator it = new EnumerationIterator(instances.enumerateInstances());
531    while(it.hasNext()) {
532      Instance instance = (Instance) it.next();
533      instance.setClassValue(classifier.classifyInstance(instance));
534    }
535  }
536
537
538  /**
539   * Calculates the relation (poset) formed by the instances.
540   *
541   * @param instances the instances for which the poset is to be formed
542   * @return a <code> BooleanBitMatrix </code> for which position
543   * <code> bm.get(i,j) == true </code> iff <code>
544   * InstancesUtil.strictlySmaller(instances.instance(i),
545   * instances.instance(j)) == true </code>
546   */
547  public static BooleanBitMatrix getBitMatrix(Instances instances) {
548    int numInstances = instances.numInstances();
549    BooleanBitMatrix bm = 
550      new BooleanBitMatrix(numInstances, numInstances);
551    for (int i = 0; i < numInstances; i++ ) {
552      Instance instance1 = instances.instance(i);
553      for (int j = 0; j < numInstances; j++) {
554        Instance instance2 = instances.instance(j);
555        if (InstancesUtil.strictlySmaller(instance1, instance2)) {
556          bm.set(i, j); // arc from instance1 to instance2
557        }
558      }
559    }
560    return bm;
561  }
562
563  /**
564   * Calculatus the number of elements in the closed interval
565   * <code> [low,up]. </code>  If the class index is set, then
566   * the class attribute does not play part in the calculations,
567   * this is we work in the data space.  The code also works with
568   * numeric attributes, but is primarily intended for ordinal attributes.
569   *
570   * @param low the lower bound of the interval
571   * @param up the upper bound of the interval
572   * @return the size of the interval (in floating point format)
573   * @throws IllegalArgumentException if the given instances do not
574   * constitute an interval.
575   */
576  public static double numberInInterval(Instance low, Instance up)
577    throws IllegalArgumentException {
578   
579    Coordinates cLow = new Coordinates(low);
580    Coordinates cUp = new Coordinates(up);
581    if (cLow.smallerOrEqual(cUp) == false) {
582      throw new IllegalArgumentException
583      ("The given instances are not the bounds of an interval");
584    }
585    double number = 1;
586    int dim = cLow.dimension();
587    for (int i = 0; i < dim; i++) {
588      number *= (cUp.getValue(i) - cLow.getValue(i) + 1);
589    }
590    return number;
591  }
592
593
594  /**
595   * Calculatutes the number of vectors in the data space that are smaller
596   * or equal than the given instance.
597   *
598   * @param instance the given instance
599   * @return the number of vectors in the data space smaller or equal
600   * than the given instance
601   * @throws IllegalArgumentException if there are numeric attributes
602   */
603  public static double numberOfSmallerVectors(Instance instance) 
604    throws IllegalArgumentException {
605   
606    double[] values = InstancesUtil.toDataDouble(instance);
607    double nr = 1;
608
609    for (int i = 0; i < values.length; i++) {
610      if (instance.attribute(i).isNumeric()) {
611        throw new IllegalArgumentException
612        ("Numeric attributes are not supported"); 
613      }
614      nr *= (values[i] + 1);
615    }
616
617    return nr;
618  }
619
620  /**
621   * Calculatutes the number of vectors in the data space that are
622   * greater or equal than the given instance.
623   *
624   * @param instance the given instance
625   * @return the number of vectors in the data space greater of equal
626   * than the given instance
627   * @throws IllegalArgumentException if there are numeric attributes
628   */
629  public static double numberOfGreaterVectors(Instance instance) 
630    throws IllegalArgumentException {
631   
632    double[] values = InstancesUtil.toDataDouble(instance);
633    double nr = 1;
634
635    for (int i = 0; i < values.length; i++) {
636      if (instance.attribute(i).isNumeric()) {
637        throw new IllegalArgumentException
638        ("Numeric attributes are not supported"); 
639      }
640      nr *= (instance.attribute(i).numValues() - values[i]);
641    }
642
643    return nr;
644  }
645
646  /**
647   * Write the instances in ARFF-format to the indicated
648   * <code> BufferedWriter </code>.
649   * @param instances the instances to write
650   * @param file the <code> BufferedWriter </code> to write to
651   * @throws IOException if something goes wrong while writing the instances
652   */
653  public static void write(Instances instances, BufferedWriter file)
654    throws IOException{
655   
656    file.write(instances.toString()); // XXX can probably be done better
657  }
658
659
660  /**
661   * Return a histogram of the values for the specified attribute.
662   *
663   * @param instances the instances
664   * @param attributeIndex the attribute to consider
665   * @return a <code> DiscreteEstimator </code> where the <code>i</code>th
666   * @throws IllegalArgumentException if the attribute at the specified
667   * index is numeric
668   */
669  public static DiscreteEstimator countValues(Instances instances, int attributeIndex) 
670    throws IllegalArgumentException{
671   
672    int numValues = instances.attribute(attributeIndex).numValues();
673    if (numValues == 0) {
674      throw new IllegalArgumentException
675      ("Can't create histogram for numeric attribute");
676    }
677
678    DiscreteEstimator de = new DiscreteEstimator(numValues, false);
679    Iterator it = new EnumerationIterator(instances.enumerateInstances());
680    while (it.hasNext()) {
681      Instance instance = (Instance) it.next();
682      if (!instance.isMissing(attributeIndex)) {
683        de.addValue(instance.value(attributeIndex), instance.weight());
684      }
685    }
686    return de;
687  }
688
689  /**
690   * Create, without replacement, a random subsample of the given size
691   * from the given instances.
692   *
693   * @param instances the instances to sample from
694   * @param size the requested size of the sample
695   * @param random the random generator to use
696   * @return a sample of the requested size, drawn from the given
697   * instances without replacement
698   * @throws IllegalArgumentException if the size exceeds the number
699   * of instances
700   */
701  public static Instances sampleWithoutReplacement(
702      Instances instances, int size, Random random) {
703   
704    if (size > instances.numInstances()) {
705      throw new IllegalArgumentException
706      ("Size of requested sample exceeds number of instances");
707    }
708
709    int numInstances = instances.numInstances();
710    int[] indices = new int[instances.numInstances()];
711    for (int i = 0; i < numInstances; i++) {
712      indices[i] = i;
713    }
714
715    Instances sample = new Instances(instances, size);
716    int index;
717    for (int i = 0; i < size; i++) {
718      index = random.nextInt(numInstances--);
719      sample.add(instances.instance(indices[index]));
720      swap(indices, index, numInstances);
721    }
722    return sample;
723  }
724
725  /**
726   * Swaps two elements of the given array.
727   *
728   * @param aa the array
729   * @param i the index of the first element
730   * @param j the index of the second element
731   */
732  final private static void swap(int[] aa, int i, int j) {
733    int tmp = aa[i];
734    aa[i] = aa[j];
735    aa[j] = tmp;
736  }
737  /**
738   * Generates a random sample of instances.  Each attribute must be nominal, and the
739   * class labels are not set.
740   *
741   * @param headerInfo Instances whose header information is used to determine how the
742   * set of returned instances will look
743   * @param numberOfExamples the desired size of the returned set
744   * @param random the random number generator to use
745   * @return a set of Instances containing the random sample.
746   * @throws IllegalArgumentException if numeric attributes are given
747   */
748  public static Instances generateRandomSample(
749      Instances headerInfo, int numberOfExamples, Random random) 
750    throws IllegalArgumentException {
751   
752    int n = headerInfo.numAttributes();
753    double[] info = new double[n];
754    int classIndex = headerInfo.classIndex();
755    for (int i = 0; i < n; i++) {
756      info[i] = headerInfo.attribute(i).numValues();
757      if (i != classIndex && info[i] == 0) {
758        throw new IllegalArgumentException
759        ("Numeric attributes are currently not supported");
760      }
761    }
762    Instances sample = new Instances(headerInfo, numberOfExamples);
763    sample.setRelationName(headerInfo.relationName() + 
764        ".random.sample.of." + numberOfExamples);
765    for (int i = 0; i < numberOfExamples; i++) {
766      sample.add(randomSample(info, classIndex, random));
767    }
768    return sample;
769  }
770  /**
771   * Generates a random instance.
772   *
773   * @param info array that gives for each attribute the number of possible values
774   * @param classIndex the index of the class attribute
775   * @param random the random number generator used
776   * @return a random instance
777   */
778  private static Instance randomSample(double[] info, 
779      int classIndex, Random random) {
780   
781    double[] attValues = new double[info.length];
782    for (int i = 0; i < attValues.length; i++) {
783      if (i != classIndex) {
784        attValues[i] = random.nextInt( (int) info[i]); 
785      }
786    }
787    return new DenseInstance(1, attValues);
788  }
789
790
791
792  /**
793   * Returns an array containing the attribute values (in internal floating
794   * point format) of the given instance in data space, this is, the class
795   * attribute (if any) is removed.
796   *
797   * @param instance the instance to get the attribute values from
798   * @return array of doubles containing the attribute values
799   */
800  public static double[] toDataDouble(Instance instance) {
801    double[] vector = null;
802    int classIndex = instance.classIndex();
803    if(classIndex >= 0) {
804      vector = new double[instance.numAttributes() - 1];
805    } else {
806      vector = new double[instance.numAttributes()];
807    }
808    int index = 0;
809    for (int i = 0; i < instance.numAttributes(); i++) {
810      if(i != classIndex) {
811        vector[index++] = instance.value(i);
812      }
813    }
814    return vector;
815  }
816
817  /**
818   * Computes the minimal extension for a given instance.
819   *
820   * @param instances the set of instances
821   * @param instance the instance for which the minimal extension is to be
822   * calculated
823   * @return the value of the minimal extension, in internal floating point
824   * format
825   */
826  public static double minimalExtension(Instances instances, Instance instance) {
827    return minimalExtension(instances, instance, 0);
828  }
829
830  /**
831   * Computes the minimal extension of a given instance, but the
832   * minimal value returned is <code> minValue. </code>  This method
833   * may have its applications when the training set is divided into
834   * multiple Instances objects.
835   *
836   * @param instances the set of instances
837   * @param instance the instance for which the minimal extension is to
838   * be calculated
839   * @param minValue a double indicating the minimal value that should
840   * be returned
841   * @return the label of the minimal extension, in internal floating point format
842   */
843  public static double minimalExtension(
844      Instances instances, Instance instance, double minValue) {
845   
846    double value = minValue;
847
848    Iterator it = 
849      new EnumerationIterator(instances.enumerateInstances());
850    while(it.hasNext()) {
851      Instance tmp = (Instance) it.next();
852      if (tmp.classValue() > value
853          && InstancesUtil.smallerOrEqual(tmp, instance) ) {
854        value = tmp.classValue();
855      }
856    }
857    return value;
858  }
859
860  /**
861   * Computes the maximal extension for a given instance.
862   *
863   * @param instances the set of instances
864   * @param instance the instance for which the minimal extension is to be
865   * calculated
866   * @return the value of the minimal extension, in internal floating point
867   * format
868   */
869  public static double maximalExtension(Instances instances, Instance instance) {
870    return maximalExtension(instances, instance, instances.numClasses() - 1);
871  }
872
873  /**
874   * Computes the maximal extension of a given instance, but the
875   * maximal value returned is <code> maxValue. </code>  This method
876   * may have its applications when the training set is divided into
877   * multiple Instances objects.
878   *
879   * @param instances the set of instances
880   * @param instance the instance for which the maximal extension is to
881   * be calculated
882   * @param maxValue a double indicating the maximal value that should
883   * be returned
884   * @return the value of the minimal extension, in internal floating point
885   * format
886   */
887  public static double maximalExtension(
888      Instances instances, Instance instance, double maxValue) {
889   
890    double value = maxValue;
891
892    Iterator it = 
893      new EnumerationIterator(instances.enumerateInstances());
894    while(it.hasNext()) {
895      Instance tmp = (Instance) it.next();
896      if (tmp.classValue() < value
897          && InstancesUtil.smallerOrEqual(instance, tmp) ) {
898        value = tmp.classValue();
899      }
900    }
901    return value;
902  }
903 
904  /**
905   * Returns the revision string.
906   *
907   * @return            the revision
908   */
909  public String getRevision() {
910    return RevisionUtils.extract("$Revision: 5987 $");
911  }
912}
Note: See TracBrowser for help on using the repository browser.