source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/instance/ReservoirSample.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 10.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    ReservoirSample.java
19 *    Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.unsupervised.instance;
24
25import java.util.Enumeration;
26import java.util.Random;
27import java.util.Vector;
28
29import weka.core.Capabilities;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.core.Option;
33import weka.core.OptionHandler;
34import weka.core.RevisionUtils;
35import weka.core.Utils;
36import weka.core.Capabilities.Capability;
37import weka.filters.Filter;
38import weka.filters.StreamableFilter;
39import weka.filters.UnsupervisedFilter;
40
41/**
42 <!-- globalinfo-start -->
43 * Produces a random subsample of a dataset using the reservoir sampling Algorithm "R" by Vitter. The original data set does not have to fit into main memory, but the reservoir does.
44 * <p/>
45 <!-- globalinfo-end -->
46 *
47 <!-- technical-bibtex-start -->
48 * BibTeX:
49 * <pre>
50 * &#64;article{Vitter1985,
51 *    author = {J. S. Vitter},
52 *    journal = {ACM Transactions on Mathematical Software},
53 *    number = {1}
54 *    volume = {11}
55 *    pages = {37-57},
56 *    title = {Random Sampling with a Reservoir},
57 *    year = {1985}
58 * }
59 * </pre>
60 * </p>
61 <!-- options-start -->
62 * Valid options are: <p/>
63 *
64 * <pre> -S &lt;num&gt;
65 *  Specify the random number seed (default 1)</pre>
66 *
67 * <pre> -Z &lt;num&gt;
68 *  The size of the output dataset - number of instances
69 *  (default 100)</pre>
70 *
71 <!-- options-end -->
72 *
73 * @author Mark Hall (mhall{[at]}pentaho{[dot]}org)
74 * @version $Revision: 5563 $
75 */
76public class ReservoirSample 
77  extends Filter
78  implements UnsupervisedFilter, OptionHandler, StreamableFilter {
79 
80  /** for serialization */
81  static final long serialVersionUID = 3119607037607101160L;
82
83  /** The subsample size, number of instances% */
84  protected int m_SampleSize = 100;
85
86  /** Holds the sub-sample (reservoir) */
87  protected Instance[] m_subSample;
88
89  /** The current instance being processed */
90  protected int m_currentInst;
91 
92  /** The random number generator seed */
93  protected int m_RandomSeed = 1;
94
95  /** The random number generator */
96  protected Random m_random;
97 
98  /**
99   * Returns a string describing this filter
100   * @return a description of the classifier suitable for
101   * displaying in the explorer/experimenter gui
102   */
103  public String globalInfo() {
104    return 
105        "Produces a random subsample of a dataset using the reservoir sampling "
106      + "Algorithm \"R\" by Vitter. The original data set does not have to fit "
107      + "into main memory, but the reservoir does. ";
108  }
109
110  /**
111   * Returns an enumeration describing the available options.
112   *
113   * @return an enumeration of all the available options.
114   */
115  public Enumeration listOptions() {
116    Vector result = new Vector();
117
118    result.addElement(new Option(
119        "\tSpecify the random number seed (default 1)",
120        "S", 1, "-S <num>"));
121
122    result.addElement(new Option(
123        "\tThe size of the output dataset - number of instances\n"
124        +"\t(default 100)",
125        "Z", 1, "-Z <num>"));
126
127    return result.elements();
128  }
129
130
131  /**
132   * Parses a given list of options. <p/>
133   *
134   <!-- options-start -->
135   * Valid options are: <p/>
136   *
137   * <pre> -S &lt;num&gt;
138   *  Specify the random number seed (default 1)</pre>
139   *
140   * <pre> -Z &lt;num&gt;
141   *  The size of the output dataset - number of instances
142   *  (default 100)</pre>
143   *
144   <!-- options-end -->
145   *
146   * @param options the list of options as an array of strings
147   * @throws Exception if an option is not supported
148   */
149  public void setOptions(String[] options) throws Exception {
150    String      tmpStr;
151   
152    tmpStr = Utils.getOption('S', options);
153    if (tmpStr.length() != 0) {
154      setRandomSeed(Integer.parseInt(tmpStr));
155    } else {
156      setRandomSeed(1);
157    }
158
159    tmpStr = Utils.getOption('Z', options);
160    if (tmpStr.length() != 0) {
161      setSampleSize(Integer.parseInt(tmpStr));
162    } else {
163      setSampleSize(100);
164    }
165  }
166
167  /**
168   * Gets the current settings of the filter.
169   *
170   * @return an array of strings suitable for passing to setOptions
171   */
172  public String [] getOptions() {
173    Vector<String>      result;
174
175    result = new Vector<String>();
176
177    result.add("-S");
178    result.add("" + getRandomSeed());
179
180    result.add("-Z");
181    result.add("" + getSampleSize());
182   
183    return result.toArray(new String[result.size()]);
184  }
185
186  /**
187   * Returns the tip text for this property
188   *
189   * @return tip text for this property suitable for
190   * displaying in the explorer/experimenter gui
191   */
192  public String randomSeedTipText() {
193    return "The seed used for random sampling.";
194  }
195
196  /**
197   * Gets the random number seed.
198   *
199   * @return the random number seed.
200   */
201  public int getRandomSeed() {
202    return m_RandomSeed;
203  }
204 
205  /**
206   * Sets the random number seed.
207   *
208   * @param newSeed the new random number seed.
209   */
210  public void setRandomSeed(int newSeed) {
211    m_RandomSeed = newSeed;
212  }
213 
214  /**
215   * Returns the tip text for this property
216   *
217   * @return tip text for this property suitable for
218   * displaying in the explorer/experimenter gui
219   */
220  public String sampleSizeTipText() {
221    return "Size of the subsample (reservoir). i.e. the number of instances.";
222  }
223
224  /**
225   * Gets the subsample size.
226   *
227   * @return the subsample size
228   */
229  public int getSampleSize() {
230    return m_SampleSize;
231  }
232 
233  /**
234   * Sets the size of the subsample.
235   *
236   * @param newSampleSize size of the subsample.
237   */
238  public void setSampleSize(int newSampleSize) {
239    m_SampleSize = newSampleSize;
240  }
241 
242 
243  /**
244   * Returns the Capabilities of this filter.
245   *
246   * @return            the capabilities of this object
247   * @see               Capabilities
248   */
249  public Capabilities getCapabilities() {
250    Capabilities result = super.getCapabilities();
251    result.disableAll();
252
253    // attributes
254    result.enableAllAttributes();
255    result.enable(Capability.MISSING_VALUES);
256   
257    // class
258    result.enableAllClasses();
259    result.enable(Capability.MISSING_CLASS_VALUES);
260    result.enable(Capability.NO_CLASS);
261   
262    return result;
263  }
264
265  /**
266   * Sets the format of the input instances.
267   *
268   * @param instanceInfo an Instances object containing the input
269   * instance structure (any instances contained in the object are
270   * ignored - only the structure is required).
271   * @return true if the outputFormat may be collected immediately
272   * @throws Exception if the input format can't be set
273   * successfully
274   */
275  public boolean setInputFormat(Instances instanceInfo) 
276       throws Exception {
277
278    super.setInputFormat(instanceInfo);
279    setOutputFormat(instanceInfo);
280
281    m_subSample = new Instance[m_SampleSize];
282    m_currentInst = 0;
283    m_random = new Random(m_RandomSeed);
284
285    return true;
286  }
287
288  /**
289   * Decides whether the current instance gets retained in the
290   * reservoir.
291   *
292   * @param instance the Instance to potentially retain
293   */
294  protected void processInstance(Instance instance) {
295    if (m_currentInst < m_SampleSize) {
296      m_subSample[m_currentInst] = (Instance)instance.copy();
297    } else {
298      double r = m_random.nextDouble();
299      if (r < ((double)m_SampleSize / (double)m_currentInst)) {
300        r = m_random.nextDouble();
301        int replace = (int)((double)m_SampleSize * r);
302        m_subSample[replace] = (Instance)instance.copy();
303      }
304    }
305    m_currentInst++;
306  }
307
308  /**
309   * Input an instance for filtering. Filter requires all
310   * training instances be read before producing output.
311   *
312   * @param instance the input instance
313   * @return true if the filtered instance may now be
314   * collected with output().
315   * @throws IllegalStateException if no input structure has been defined
316   */
317  public boolean input(Instance instance) {
318
319    if (getInputFormat() == null) {
320      throw new IllegalStateException("No input instance format defined");
321    }
322    if (m_NewBatch) {
323      resetQueue();
324      m_NewBatch = false;
325    }
326    if (isFirstBatchDone()) {
327      push(instance);
328      return true;
329    } else {
330      //      bufferInput(instance);
331      copyValues(instance, false);
332      processInstance(instance);
333      return false;
334    }
335  }
336
337  /**
338   * Signify that this batch of input to the filter is finished.
339   * If the filter requires all instances prior to filtering,
340   * output() may now be called to retrieve the filtered instances.
341   *
342   * @return true if there are instances pending output
343   * @throws IllegalStateException if no input structure has been defined
344   */
345  public boolean batchFinished() {
346
347    if (getInputFormat() == null) {
348      throw new IllegalStateException("No input instance format defined");
349    }
350
351    if (!isFirstBatchDone()) {
352      // Do the subsample, and clear the input instances.
353      createSubsample();
354    }
355    flushInput();
356
357    m_NewBatch = true;
358    m_FirstBatchDone = true;
359    return (numPendingOutput() != 0);
360  }
361 
362  /**
363   * Creates a subsample of the current set of input instances. The output
364   * instances are pushed onto the output queue for collection.
365   */
366  protected void createSubsample() {
367
368    for (int i = 0; i < m_SampleSize; i++) {
369      if (m_subSample[i] != null) {
370        Instance copy = (Instance) m_subSample[i].copy();
371        push(copy);
372      } else {
373        // less data in the original than was asked for
374        // as a sample.
375        break;
376      }
377    }
378    m_subSample = null;
379  }
380 
381  /**
382   * Returns the revision string.
383   *
384   * @return            the revision
385   */
386  public String getRevision() {
387    return RevisionUtils.extract("$Revision: 5563 $");
388  }
389 
390  /**
391   * Main method for testing this class.
392   *
393   * @param argv should contain arguments to the filter:
394   * use -h for help
395   */
396  public static void main(String [] argv) {
397    runFilter(new ReservoirSample(), argv);
398  }
399}
400
Note: See TracBrowser for help on using the repository browser.