source: branches/MetisMQI/src/main/java/weka/filters/supervised/instance/Resample.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 19.9 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Resample.java
19 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.supervised.instance;
24
25import weka.core.Capabilities;
26import weka.core.Instance;
27import weka.core.Instances;
28import weka.core.Option;
29import weka.core.OptionHandler;
30import weka.core.RevisionUtils;
31import weka.core.Utils;
32import weka.core.Capabilities.Capability;
33import weka.filters.Filter;
34import weka.filters.SupervisedFilter;
35
36import java.util.Collections;
37import java.util.Enumeration;
38import java.util.Random;
39import java.util.Vector;
40
41/**
42 <!-- globalinfo-start -->
43 * Produces a random subsample of a dataset using either sampling with replacement or without replacement.<br/>
44 * The original dataset must fit entirely in memory. The number of instances in the generated dataset may be specified. The dataset must have a nominal class attribute. If not, use the unsupervised version. The filter can be made to maintain the class distribution in the subsample, or to bias the class distribution toward a uniform distribution. When used in batch mode (i.e. in the FilteredClassifier), subsequent batches are NOT resampled.
45 * <p/>
46 <!-- globalinfo-end -->
47 *
48 <!-- options-start -->
49 * Valid options are: <p/>
50 *
51 * <pre> -S &lt;num&gt;
52 *  Specify the random number seed (default 1)</pre>
53 *
54 * <pre> -Z &lt;num&gt;
55 *  The size of the output dataset, as a percentage of
56 *  the input dataset (default 100)</pre>
57 *
58 * <pre> -B &lt;num&gt;
59 *  Bias factor towards uniform class distribution.
60 *  0 = distribution in input data -- 1 = uniform distribution.
61 *  (default 0)</pre>
62 *
63 * <pre> -no-replacement
64 *  Disables replacement of instances
65 *  (default: with replacement)</pre>
66 *
67 * <pre> -V
68 *  Inverts the selection - only available with '-no-replacement'.</pre>
69 *
70 <!-- options-end -->
71 *
72 * @author Len Trigg (len@reeltwo.com)
73 * @author FracPete (fracpete at waikato dot ac dot nz)
74 * @version $Revision: 5492 $
75 */
76public class Resample
77  extends Filter
78  implements SupervisedFilter, OptionHandler {
79 
80  /** for serialization. */
81  static final long serialVersionUID = 7079064953548300681L;
82
83  /** The subsample size, percent of original set, default 100%. */
84  protected double m_SampleSizePercent = 100;
85 
86  /** The random number generator seed. */
87  protected int m_RandomSeed = 1;
88 
89  /** The degree of bias towards uniform (nominal) class distribution. */
90  protected double m_BiasToUniformClass = 0;
91
92  /** Whether to perform sampling with replacement or without. */
93  protected boolean m_NoReplacement = false;
94
95  /** Whether to invert the selection (only if instances are drawn WITHOUT
96   * replacement).
97   * @see #m_NoReplacement */
98  protected boolean m_InvertSelection = false;
99
100  /**
101   * Returns a string describing this filter.
102   *
103   * @return a description of the filter suitable for
104   * displaying in the explorer/experimenter gui
105   */
106  public String globalInfo() {
107    return 
108        "Produces a random subsample of a dataset using either sampling "
109      + "with replacement or without replacement.\n"
110      + "The original dataset must "
111      + "fit entirely in memory. The number of instances in the generated "
112      + "dataset may be specified. The dataset must have a nominal class "
113      + "attribute. If not, use the unsupervised version. The filter can be "
114      + "made to maintain the class distribution in the subsample, or to bias "
115      + "the class distribution toward a uniform distribution. When used in batch "
116      + "mode (i.e. in the FilteredClassifier), subsequent batches are NOT resampled.";
117  }
118
119  /**
120   * Returns an enumeration describing the available options.
121   *
122   * @return an enumeration of all the available options.
123   */
124  public Enumeration listOptions() {
125    Vector result = new Vector();
126
127    result.addElement(new Option(
128        "\tSpecify the random number seed (default 1)",
129        "S", 1, "-S <num>"));
130
131    result.addElement(new Option(
132        "\tThe size of the output dataset, as a percentage of\n"
133        +"\tthe input dataset (default 100)",
134        "Z", 1, "-Z <num>"));
135
136    result.addElement(new Option(
137        "\tBias factor towards uniform class distribution.\n"
138        +"\t0 = distribution in input data -- 1 = uniform distribution.\n"
139        +"\t(default 0)",
140        "B", 1, "-B <num>"));
141
142    result.addElement(new Option(
143        "\tDisables replacement of instances\n"
144        +"\t(default: with replacement)",
145        "no-replacement", 0, "-no-replacement"));
146
147    result.addElement(new Option(
148        "\tInverts the selection - only available with '-no-replacement'.",
149        "V", 0, "-V"));
150
151    return result.elements();
152  }
153
154
155  /**
156   * Parses a given list of options. <p/>
157   *
158   <!-- options-start -->
159   * Valid options are: <p/>
160   *
161   * <pre> -S &lt;num&gt;
162   *  Specify the random number seed (default 1)</pre>
163   *
164   * <pre> -Z &lt;num&gt;
165   *  The size of the output dataset, as a percentage of
166   *  the input dataset (default 100)</pre>
167   *
168   * <pre> -B &lt;num&gt;
169   *  Bias factor towards uniform class distribution.
170   *  0 = distribution in input data -- 1 = uniform distribution.
171   *  (default 0)</pre>
172   *
173   * <pre> -no-replacement
174   *  Disables replacement of instances
175   *  (default: with replacement)</pre>
176   *
177   * <pre> -V
178   *  Inverts the selection - only available with '-no-replacement'.</pre>
179   *
180   <!-- options-end -->
181   *
182   * @param options the list of options as an array of strings
183   * @throws Exception if an option is not supported
184   */
185  public void setOptions(String[] options) throws Exception {
186    String      tmpStr;
187   
188    tmpStr = Utils.getOption('S', options);
189    if (tmpStr.length() != 0)
190      setRandomSeed(Integer.parseInt(tmpStr));
191    else
192      setRandomSeed(1);
193
194    tmpStr = Utils.getOption('B', options);
195    if (tmpStr.length() != 0)
196      setBiasToUniformClass(Double.parseDouble(tmpStr));
197    else
198      setBiasToUniformClass(0);
199
200    tmpStr = Utils.getOption('Z', options);
201    if (tmpStr.length() != 0)
202      setSampleSizePercent(Double.parseDouble(tmpStr));
203    else
204      setSampleSizePercent(100);
205
206    setNoReplacement(Utils.getFlag("no-replacement", options));
207
208    if (getNoReplacement())
209      setInvertSelection(Utils.getFlag('V', options));
210
211    if (getInputFormat() != null) {
212      setInputFormat(getInputFormat());
213    }
214  }
215
216  /**
217   * Gets the current settings of the filter.
218   *
219   * @return an array of strings suitable for passing to setOptions
220   */
221  public String [] getOptions() {
222    Vector<String>      result;
223
224    result = new Vector<String>();
225
226    result.add("-B");
227    result.add("" + getBiasToUniformClass());
228
229    result.add("-S");
230    result.add("" + getRandomSeed());
231
232    result.add("-Z");
233    result.add("" + getSampleSizePercent());
234
235    if (getNoReplacement()) {
236      result.add("-no-replacement");
237      if (getInvertSelection())
238        result.add("-V");
239    }
240   
241    return result.toArray(new String[result.size()]);
242  }
243   
244  /**
245   * Returns the tip text for this property.
246   *
247   * @return tip text for this property suitable for
248   * displaying in the explorer/experimenter gui
249   */
250  public String biasToUniformClassTipText() {
251    return "Whether to use bias towards a uniform class. A value of 0 leaves the class "
252      + "distribution as-is, a value of 1 ensures the class distribution is "
253      + "uniform in the output data.";
254  }
255   
256  /**
257   * Gets the bias towards a uniform class. A value of 0 leaves the class
258   * distribution as-is, a value of 1 ensures the class distributions are
259   * uniform in the output data.
260   *
261   * @return the current bias
262   */
263  public double getBiasToUniformClass() {
264    return m_BiasToUniformClass;
265  }
266 
267  /**
268   * Sets the bias towards a uniform class. A value of 0 leaves the class
269   * distribution as-is, a value of 1 ensures the class distributions are
270   * uniform in the output data.
271   *
272   * @param newBiasToUniformClass the new bias value, between 0 and 1.
273   */
274  public void setBiasToUniformClass(double newBiasToUniformClass) {
275    m_BiasToUniformClass = newBiasToUniformClass;
276  }
277   
278  /**
279   * Returns the tip text for this property.
280   *
281   * @return tip text for this property suitable for
282   * displaying in the explorer/experimenter gui
283   */
284  public String randomSeedTipText() {
285    return "Sets the random number seed for subsampling.";
286  }
287 
288  /**
289   * Gets the random number seed.
290   *
291   * @return the random number seed.
292   */
293  public int getRandomSeed() {
294    return m_RandomSeed;
295  }
296 
297  /**
298   * Sets the random number seed.
299   *
300   * @param newSeed the new random number seed.
301   */
302  public void setRandomSeed(int newSeed) {
303    m_RandomSeed = newSeed;
304  }
305   
306  /**
307   * Returns the tip text for this property.
308   *
309   * @return tip text for this property suitable for
310   * displaying in the explorer/experimenter gui
311   */
312  public String sampleSizePercentTipText() {
313    return "The subsample size as a percentage of the original set.";
314  }
315 
316  /**
317   * Gets the subsample size as a percentage of the original set.
318   *
319   * @return the subsample size
320   */
321  public double getSampleSizePercent() {
322    return m_SampleSizePercent;
323  }
324 
325  /**
326   * Sets the size of the subsample, as a percentage of the original set.
327   *
328   * @param newSampleSizePercent the subsample set size, between 0 and 100.
329   */
330  public void setSampleSizePercent(double newSampleSizePercent) {
331    m_SampleSizePercent = newSampleSizePercent;
332  }
333 
334  /**
335   * Returns the tip text for this property.
336   *
337   * @return tip text for this property suitable for
338   * displaying in the explorer/experimenter gui
339   */
340  public String noReplacementTipText() {
341    return "Disables the replacement of instances.";
342  }
343
344  /**
345   * Gets whether instances are drawn with or without replacement.
346   *
347   * @return true if the replacement is disabled
348   */
349  public boolean getNoReplacement() {
350    return m_NoReplacement;
351  }
352 
353  /**
354   * Sets whether instances are drawn with or with out replacement.
355   *
356   * @param value if true then the replacement of instances is disabled
357   */
358  public void setNoReplacement(boolean value) {
359    m_NoReplacement = value;
360  }
361 
362  /**
363   * Returns the tip text for this property.
364   *
365   * @return tip text for this property suitable for
366   * displaying in the explorer/experimenter gui
367   */
368  public String invertSelectionTipText() {
369    return "Inverts the selection (only if instances are drawn WITHOUT replacement).";
370  }
371
372  /**
373   * Gets whether selection is inverted (only if instances are drawn WIHTOUT
374   * replacement).
375   *
376   * @return true if the replacement is disabled
377   * @see #m_NoReplacement
378   */
379  public boolean getInvertSelection() {
380    return m_InvertSelection;
381  }
382 
383  /**
384   * Sets whether the selection is inverted (only if instances are drawn WIHTOUT
385   * replacement).
386   *
387   * @param value if true then selection is inverted
388   */
389  public void setInvertSelection(boolean value) {
390    m_InvertSelection = value;
391  }
392
393  /**
394   * Returns the Capabilities of this filter.
395   *
396   * @return            the capabilities of this object
397   * @see               Capabilities
398   */
399  public Capabilities getCapabilities() {
400    Capabilities result = super.getCapabilities();
401    result.disableAll();
402
403    // attributes
404    result.enableAllAttributes();
405    result.enable(Capability.MISSING_VALUES);
406   
407    // class
408    result.enable(Capability.NOMINAL_CLASS);
409   
410    return result;
411  }
412 
413  /**
414   * Sets the format of the input instances.
415   *
416   * @param instanceInfo an Instances object containing the input
417   * instance structure (any instances contained in the object are
418   * ignored - only the structure is required).
419   * @return true if the outputFormat may be collected immediately
420   * @throws Exception if the input format can't be set
421   * successfully
422   */
423  public boolean setInputFormat(Instances instanceInfo) 
424       throws Exception {
425
426    super.setInputFormat(instanceInfo);
427    setOutputFormat(instanceInfo);
428    return true;
429  }
430
431  /**
432   * Input an instance for filtering. Filter requires all
433   * training instances be read before producing output.
434   *
435   * @param instance the input instance
436   * @return true if the filtered instance may now be
437   * collected with output().
438   * @throws IllegalStateException if no input structure has been defined
439   */
440  public boolean input(Instance instance) {
441
442    if (getInputFormat() == null) {
443      throw new IllegalStateException("No input instance format defined");
444    }
445    if (m_NewBatch) {
446      resetQueue();
447      m_NewBatch = false;
448    }
449    if (isFirstBatchDone()) {
450      push(instance);
451      return true;
452    } else {
453      bufferInput(instance);
454      return false;
455    }
456  }
457
458  /**
459   * Signify that this batch of input to the filter is finished.
460   * If the filter requires all instances prior to filtering,
461   * output() may now be called to retrieve the filtered instances.
462   *
463   * @return true if there are instances pending output
464   * @throws IllegalStateException if no input structure has been defined
465   */
466  public boolean batchFinished() {
467
468    if (getInputFormat() == null) {
469      throw new IllegalStateException("No input instance format defined");
470    }
471
472    if (!isFirstBatchDone()) {
473      // Do the subsample, and clear the input instances.
474      createSubsample();
475    }
476    flushInput();
477
478    m_NewBatch = true;
479    m_FirstBatchDone = true;
480    return (numPendingOutput() != 0);
481  }
482
483  /**
484   * creates the subsample with replacement.
485   *
486   * @param random              the random number generator to use
487   * @param origSize            the original size of the dataset
488   * @param sampleSize          the size to generate
489   * @param actualClasses       the number of classes found in the data
490   * @param classIndices        the indices where classes start
491   */
492  public void createSubsampleWithReplacement(Random random, int origSize, 
493      int sampleSize, int actualClasses, int[] classIndices) {
494   
495    for (int i = 0; i < sampleSize; i++) {
496      int index = 0;
497      if (random.nextDouble() < m_BiasToUniformClass) {
498        // Pick a random class (of those classes that actually appear)
499        int cIndex = random.nextInt(actualClasses);
500        for (int j = 0, k = 0; j < classIndices.length - 1; j++) {
501          if ((classIndices[j] != classIndices[j + 1]) && (k++ >= cIndex)) {
502            // Pick a random instance of the designated class
503            index =   classIndices[j] 
504                    + random.nextInt(classIndices[j + 1] - classIndices[j]);
505            break;
506          }
507        }
508      }
509      else {
510        index = random.nextInt(origSize);
511      }
512      push((Instance) getInputFormat().instance(index).copy());
513    }
514  }
515
516  /**
517   * creates the subsample without replacement.
518   *
519   * @param random              the random number generator to use
520   * @param origSize            the original size of the dataset
521   * @param sampleSize          the size to generate
522   * @param actualClasses       the number of classes found in the data
523   * @param classIndices        the indices where classes start
524   */
525  public void createSubsampleWithoutReplacement(Random random, int origSize, 
526      int sampleSize, int actualClasses, int[] classIndices) {
527   
528    if (sampleSize > origSize) {
529      sampleSize = origSize;
530      System.err.println(
531          "Resampling without replacement can only use percentage <=100% - "
532          + "Using full dataset!");
533    }
534
535    Vector<Integer>[] indices = new Vector[classIndices.length - 1];
536    Vector<Integer>[] indicesNew = new Vector[classIndices.length - 1];
537
538    // generate list of all indices to draw from
539    for (int i = 0; i < classIndices.length - 1; i++) {
540      indices[i] = new Vector<Integer>(classIndices[i + 1] - classIndices[i]);
541      indicesNew[i] = new Vector<Integer>(indices[i].capacity());
542      for (int n = classIndices[i]; n < classIndices[i + 1]; n++)
543        indices[i].add(n);
544    }
545
546    // draw X samples
547    int currentSize = origSize;
548    for (int i = 0; i < sampleSize; i++) {
549      int index = 0;
550      if (random.nextDouble() < m_BiasToUniformClass) {
551        // Pick a random class (of those classes that actually appear)
552        int cIndex = random.nextInt(actualClasses);
553        for (int j = 0, k = 0; j < classIndices.length - 1; j++) {
554          if ((classIndices[j] != classIndices[j + 1]) && (k++ >= cIndex)) {
555            // no more indices for this class left, try again
556            if (indices[j].size() == 0) {
557              i--;
558              break;
559            }
560            // Pick a random instance of the designated class
561            index = random.nextInt(indices[j].size());
562            indicesNew[j].add(indices[j].get(index));
563            indices[j].remove(index);
564            break;
565          }
566        }
567      }
568      else {
569        index = random.nextInt(currentSize);
570        for (int n = 0; n < actualClasses; n++) {
571          if (index < indices[n].size()) {
572            indicesNew[n].add(indices[n].get(index));
573            indices[n].remove(index);
574            break;
575          }
576          else {
577            index -= indices[n].size();
578          }
579        }
580        currentSize--;
581      }
582    }
583
584    // sort indices
585    if (getInvertSelection()) {
586      indicesNew = indices;
587    }
588    else {
589      for (int i = 0; i < indicesNew.length; i++)
590        Collections.sort(indicesNew[i]);
591    }
592
593    // add to ouput
594    for (int i = 0; i < indicesNew.length; i++) {
595      for (int n = 0; n < indicesNew[i].size(); n++)
596        push((Instance) getInputFormat().instance(indicesNew[i].get(n)).copy());
597    }
598
599    // clean up
600    for (int i = 0; i < indices.length; i++) {
601      indices[i].clear();
602      indicesNew[i].clear();
603    }
604    indices = null;
605    indicesNew = null;
606  }
607
608  /**
609   * Creates a subsample of the current set of input instances. The output
610   * instances are pushed onto the output queue for collection.
611   */
612  protected void createSubsample() {
613    int origSize = getInputFormat().numInstances();
614    int sampleSize = (int) (origSize * m_SampleSizePercent / 100);
615
616    // Subsample that takes class distribution into consideration
617
618    // Sort according to class attribute.
619    getInputFormat().sort(getInputFormat().classIndex());
620   
621    // Create an index of where each class value starts
622    int[] classIndices = new int [getInputFormat().numClasses() + 1];
623    int currentClass = 0;
624    classIndices[currentClass] = 0;
625    for (int i = 0; i < getInputFormat().numInstances(); i++) {
626      Instance current = getInputFormat().instance(i);
627      if (current.classIsMissing()) {
628        for (int j = currentClass + 1; j < classIndices.length; j++) {
629          classIndices[j] = i;
630        }
631        break;
632      } else if (current.classValue() != currentClass) {
633        for (int j = currentClass + 1; j <= current.classValue(); j++) {
634          classIndices[j] = i;
635        }         
636        currentClass = (int) current.classValue();
637      }
638    }
639    if (currentClass <= getInputFormat().numClasses()) {
640      for (int j = currentClass + 1; j < classIndices.length; j++) {
641        classIndices[j] = getInputFormat().numInstances();
642      }
643    }
644   
645    int actualClasses = 0;
646    for (int i = 0; i < classIndices.length - 1; i++) {
647      if (classIndices[i] != classIndices[i + 1]) {
648        actualClasses++;
649      }
650    }
651
652    // Create the new sample
653    Random random = new Random(m_RandomSeed);
654
655    // Convert pending input instances
656    if (getNoReplacement())
657      createSubsampleWithoutReplacement(
658          random, origSize, sampleSize, actualClasses, classIndices);
659    else
660      createSubsampleWithReplacement(
661          random, origSize, sampleSize, actualClasses, classIndices);
662  }
663 
664  /**
665   * Returns the revision string.
666   *
667   * @return            the revision
668   */
669  public String getRevision() {
670    return RevisionUtils.extract("$Revision: 5492 $");
671  }
672 
673  /**
674   * Main method for testing this class.
675   *
676   * @param argv should contain arguments to the filter:
677   * use -h for help
678   */
679  public static void main(String [] argv) {
680    runFilter(new Resample(), argv);
681  }
682}
Note: See TracBrowser for help on using the repository browser.