source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/AddNoise.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 18.0 KB
Line 
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
16
/*
 *    AddNoise.java
 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 */
21
22package weka.filters.unsupervised.attribute;
23
24import weka.core.Capabilities;
25import weka.core.Instance; 
26import weka.core.DenseInstance;
27import weka.core.Instances;
28import weka.core.Option;
29import weka.core.OptionHandler;
30import weka.core.RevisionUtils;
31import weka.core.SingleIndex;
32import weka.core.Utils;
33import weka.core.Capabilities.Capability;
34import weka.filters.Filter;
35import weka.filters.UnsupervisedFilter;
36
37import java.util.Enumeration;
38import java.util.Random;
39import java.util.Vector;
40
41/**
42 <!-- globalinfo-start -->
43 * An instance filter that changes a percentage of a given attributes values. The attribute must be nominal. Missing value can be treated as value itself.
44 * <p/>
45 <!-- globalinfo-end -->
46 *
47 <!-- options-start -->
48 * Valid options are: <p/>
49 *
50 * <pre> -C &lt;col&gt;
51 *  Index of the attribute to be changed
52 *  (default last attribute)</pre>
53 *
54 * <pre> -M
55 *  Treat missing values as an extra value
56 * </pre>
57 *
58 * <pre> -P &lt;num&gt;
59 *  Specify the percentage of noise introduced
60 *  to the data (default 10)</pre>
61 *
62 * <pre> -S &lt;num&gt;
63 *  Specify the random number seed (default 1)</pre>
64 *
65 <!-- options-end -->
66 *
67 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
68 * @version $Revision: 5987 $
69 */
70public class AddNoise 
71  extends Filter
72  implements UnsupervisedFilter, OptionHandler {
73 
74  /** for serialization */
75  static final long serialVersionUID = -8499673222857299082L;
76
77  /** The attribute's index setting. */
78  private SingleIndex m_AttIndex = new SingleIndex("last"); 
79
80  /** Flag if missing values are taken as value. */
81  private boolean m_UseMissing = false;
82
83  /** The subsample size, percent of original set, default 10% */
84  private int m_Percent = 10;
85 
86  /** The random number generator seed */
87  private int m_RandomSeed = 1;
88
89  /**
90   * Returns a string describing this filter
91   *
92   * @return a description of the filter suitable for
93   * displaying in the explorer/experimenter gui
94   */
95  public String globalInfo() {
96
97    return "An instance filter that changes a percentage of a given"
98           + " attributes values. The attribute must be nominal."
99           + " Missing value can be treated as value itself.";
100  }
101
102  /**
103   * Returns an enumeration describing the available options
104   *
105   * @return an enumeration of all the available options
106   */
107  public Enumeration listOptions() {
108
109    Vector newVector = new Vector(4);
110
111    newVector.addElement(new Option(
112              "\tIndex of the attribute to be changed \n"
113              +"\t(default last attribute)",
114              "C", 1, "-C <col>"));
115    newVector.addElement(new Option(
116              "\tTreat missing values as an extra value \n",
117              "M", 1, "-M"));
118    newVector.addElement(new Option(
119              "\tSpecify the percentage of noise introduced \n"
120              +"\tto the data (default 10)",
121              "P", 1, "-P <num>"));
122    newVector.addElement(new Option(
123              "\tSpecify the random number seed (default 1)",
124              "S", 1, "-S <num>"));
125
126    return newVector.elements();
127  }
128
129  /**
130   * Parses a given list of options. <p/>
131   *
132   <!-- options-start -->
133   * Valid options are: <p/>
134   *
135   * <pre> -C &lt;col&gt;
136   *  Index of the attribute to be changed
137   *  (default last attribute)</pre>
138   *
139   * <pre> -M
140   *  Treat missing values as an extra value
141   * </pre>
142   *
143   * <pre> -P &lt;num&gt;
144   *  Specify the percentage of noise introduced
145   *  to the data (default 10)</pre>
146   *
147   * <pre> -S &lt;num&gt;
148   *  Specify the random number seed (default 1)</pre>
149   *
150   <!-- options-end -->
151   *
152   * @param options the list of options as an array of strings
153   * @throws Exception if an option is not supported
154   */
155  public void setOptions(String[] options) throws Exception {
156
157    String indexString = Utils.getOption('C', options);
158    if (indexString.length() != 0) {
159      setAttributeIndex(indexString);
160    } else {
161      setAttributeIndex("last");
162    }
163
164    if (Utils.getFlag('M', options)) {
165      setUseMissing(true);
166    }
167
168    String percentString = Utils.getOption('P', options);
169    if (percentString.length() != 0) {
170      setPercent((int) Double.valueOf(percentString).doubleValue());
171    } else {
172      setPercent(10);
173    }
174
175    String seedString = Utils.getOption('S', options);
176    if (seedString.length() != 0) {
177      setRandomSeed(Integer.parseInt(seedString));
178    } else {
179      setRandomSeed(1);
180    }
181
182  }
183
184  /**
185   * Gets the current settings of the filter.
186   *
187   * @return an array of strings suitable for passing to setOptions
188   */
189  public String [] getOptions() {
190
191    String [] options = new String [7];
192    int current = 0;
193
194    options[current++] = "-C"; options[current++] = "" + getAttributeIndex();
195
196    if (getUseMissing()) {
197      options[current++] = "-M";
198    }
199
200    options[current++] = "-P"; options[current++] = "" + getPercent();
201
202    options[current++] = "-S"; options[current++] = "" + getRandomSeed();
203
204    while (current < options.length) {
205      options[current++] = "";
206    }
207    return options;
208  }
209   
210  /**
211   * Returns the tip text for this property
212   *
213   * @return tip text for this property suitable for
214   * displaying in the explorer/experimenter gui
215   */
216  public String useMissingTipText() {
217
218    return "Flag to set if missing values are used.";
219  }
220
221  /**
222   * Gets the flag if missing values are treated as extra values.
223   *
224   * @return the flag missing values.
225   */
226  public boolean getUseMissing() {
227
228    return m_UseMissing;
229  }
230
231  /**
232   * Sets the flag if missing values are treated as extra values.
233   *
234   * @param newUseMissing the new flag value.
235   */
236  public void setUseMissing(boolean newUseMissing) {
237
238    m_UseMissing = newUseMissing;
239  }
240
241  /**
242   * Returns the tip text for this property
243   *
244   * @return tip text for this property suitable for
245   * displaying in the explorer/experimenter gui
246   */
247  public String randomSeedTipText() {
248
249    return "Random number seed.";
250  }
251
252  /**
253   * Gets the random number seed.
254   *
255   * @return the random number seed.
256   */
257  public int getRandomSeed() {
258
259    return m_RandomSeed;
260  }
261 
262  /**
263   * Sets the random number seed.
264   *
265   * @param newSeed the new random number seed.
266   */
267  public void setRandomSeed(int newSeed) {
268
269    m_RandomSeed = newSeed;
270  }
271 
272  /**
273   * Returns the tip text for this property
274   *
275   * @return tip text for this property suitable for
276   * displaying in the explorer/experimenter gui
277   */
278  public String percentTipText() {
279
280    return "Percentage of introduced noise to data.";
281  }
282
283  /**
284   * Gets the size of noise data as a percentage of the original set.
285   *
286   * @return the noise data size
287   */
288  public int getPercent() {
289
290    return m_Percent;
291  }
292 
293  /**
294   * Sets the size of noise data, as a percentage of the original set.
295   *
296   * @param newPercent the subsample set size, between 0 and 100.
297   */
298  public void setPercent(int newPercent) {
299
300    m_Percent = newPercent;
301  }
302 
303  /**
304   * Returns the tip text for this property
305   *
306   * @return tip text for this property suitable for
307   * displaying in the explorer/experimenter gui
308   */
309  public String attributeIndexTipText() {
310
311    return "Index of the attribute that is to changed.";
312  }
313
314  /**
315   * Get the index of the attribute used.
316   *
317   * @return the index of the attribute
318   */
319  public String getAttributeIndex() {
320
321    return m_AttIndex.getSingleIndex();
322  }
323
324  /**
325   * Sets index of the attribute used.
326   *
327   * @param attIndex the index of the attribute
328   */
329  public void setAttributeIndex(String attIndex) {
330   
331    m_AttIndex.setSingleIndex(attIndex);
332  }
333
334  /**
335   * Returns the Capabilities of this filter.
336   *
337   * @return            the capabilities of this object
338   * @see               Capabilities
339   */
340  public Capabilities getCapabilities() {
341    Capabilities result = super.getCapabilities();
342    result.disableAll();
343
344    // attributes
345    result.enableAllAttributes();
346    result.enable(Capability.MISSING_VALUES);
347   
348    // class
349    result.enableAllClasses();
350    result.enable(Capability.MISSING_CLASS_VALUES);
351    result.enable(Capability.NO_CLASS);
352   
353    return result;
354  }
355
356  /**
357   * Sets the format of the input instances.
358   *
359   * @param instanceInfo an Instances object containing the input
360   * instance structure (any instances contained in the object are
361   * ignored - only the structure is required).
362   * @return true if the outputFormat may be collected immediately
363   * @throws Exception if the input format can't be set
364   * successfully
365   */
366  public boolean setInputFormat(Instances instanceInfo) 
367       throws Exception {
368
369    super.setInputFormat(instanceInfo);
370    // set input format
371    //m_InputFormat = new Instances(instanceInfo, 0);
372    m_AttIndex.setUpper(getInputFormat().numAttributes() - 1);
373    // set index of attribute to be changed
374
375    // test if nominal
376    if (!getInputFormat().attribute(m_AttIndex.getIndex()).isNominal()) {
377      throw new Exception("Adding noise is not possible:"
378                          + "Chosen attribute is numeric.");
379      }
380
381    // test if two values are given
382    if ((getInputFormat().attribute(m_AttIndex.getIndex()).numValues() < 2)
383        && (!m_UseMissing)) {
384      throw new Exception("Adding noise is not possible:"
385                          + "Chosen attribute has less than two values.");
386    }
387 
388    setOutputFormat(getInputFormat());
389    m_NewBatch = true; 
390    return false;
391  }
392
393  /**
394   * Input an instance for filtering.
395   *
396   * @param instance the input instance
397   * @return true if the filtered instance may now be
398   * collected with output().
399   * @throws Exception if the input format was not set
400   */
401  public boolean input(Instance instance) throws Exception {
402
403    // check if input format is defined
404    if (getInputFormat() == null) {
405      throw new Exception("No input instance format defined");
406    }
407   
408    if (m_NewBatch) {
409      resetQueue();
410      m_NewBatch = false;
411    }
412
413    if (isFirstBatchDone()) {
414      push(instance);
415      return true;
416    } else {
417      bufferInput(instance);
418      return false;
419    }
420  }
421
422  /**
423   * Signify that this batch of input to the filter is finished.
424   * If the filter requires all instances prior to filtering,
425   * output() may now be called to retrieve the filtered instances.
426   *
427   * @return true if there are instances pending output
428   * @throws Exception if no input structure has been defined
429   */
430  public boolean batchFinished() throws Exception {
431
432    if (getInputFormat() == null) {
433      throw new Exception("No input instance format defined");
434    }
435
436    // Do the subsample, and clear the input instances.
437    addNoise (getInputFormat(), m_RandomSeed, m_Percent, m_AttIndex.getIndex(), 
438              m_UseMissing);
439
440    for(int i=0; i<getInputFormat().numInstances(); i++) {
441      push ((Instance)getInputFormat().instance(i).copy());
442    }
443
444    flushInput();
445
446    m_NewBatch = true;
447    m_FirstBatchDone = true;
448    return (numPendingOutput() != 0);
449  }
450
451  /**
452   * add noise to the dataset
453   *
454   * a given percentage of the instances are changed in the  way, that
455   * a set of instances are randomly selected using seed. The attribute
456   * given by its index is changed from its current value to one of the
457   * other possibly ones, also randomly. This is done with leaving the
458   * apportion the same. 
459   * if m_UseMissing is true, missing value is  used as a value of its own
460   * @param instances is the dataset
461   * @param seed used for random function
462   * @param percent percentage of instances that are changed
463   * @param attIndex index of the attribute changed
464   * @param useMissing if true missing values are treated as extra value
465   */
466  public void addNoise (Instances instances, 
467                         int seed, 
468                         int percent,
469                         int attIndex,
470                         boolean useMissing) {
471    int indexList [];
472    int partition_count [];
473    int partition_max [];
474    double splitPercent = (double) percent; // percentage used for splits
475
476    // fill array with the indexes
477    indexList = new int [instances.numInstances()];
478    for (int i=0; i<instances.numInstances(); i++) {
479      indexList[i] = i;
480      }
481
482    // randomize list of indexes
483    Random random = new Random(seed);
484    for (int i=instances.numInstances()-1; i>=0; i--) {
485      int hValue = indexList[i];
486      int hIndex = (int)(random.nextDouble()*(double) i);
487      indexList[i] = indexList[hIndex];
488      indexList[hIndex] = hValue;
489      }
490 
491    // initialize arrays that are used to count instances
492    // of each value and to keep the amount of instances of that value
493    // that has to be changed
494    // this is done for the missing values in the two variables
495    // missing_count and missing_max
496    int numValues = instances.attribute(attIndex).numValues();
497
498    partition_count = new int[numValues];
499    partition_max = new int[numValues];
500    int missing_count = 0;;
501    int missing_max = 0;;
502
503    for (int i = 0; i < numValues; i++) {
504      partition_count[i] = 0;
505      partition_max[i] = 0;
506      }
507
508    // go through the dataset and count all occurrences of values
509    // and all missing values using temporarily .._max arrays and
510    // variable missing_max
511    for (Enumeration e = instances.enumerateInstances();
512         e.hasMoreElements();) {
513      Instance instance = (Instance) e.nextElement(); 
514      if (instance.isMissing(attIndex)) {
515        missing_max++;
516      }
517      else {
518        int j = (int) instance.value(attIndex);
519        partition_max[(int) instance.value(attIndex)]++; 
520      }
521    }
522     
523    // use given percentage to calculate
524    // how many have to be changed per split and
525    // how many of the missing values
526    if (!useMissing) {
527      missing_max = missing_count;
528    } else {
529      missing_max = (int) (((double)missing_max/100) * splitPercent + 0.5);
530    }
531    int sum_max = missing_max;
532    for (int i=0; i<numValues; i++) {
533      partition_max[i]=(int) (((double)partition_max[i]/100) * splitPercent
534                              + 0.5);
535      sum_max = sum_max + partition_max[i];
536      }
537
538    // initialize sum_count to zero, use this variable to see if
539    // everything is done already
540    int sum_count = 0;
541 
542    // add noise
543    // using the randomized index-array
544    //
545    Random randomValue = new Random (seed);
546    int numOfValues = instances.attribute(attIndex).numValues();
547    for(int i=0; i<instances.numInstances(); i++) {
548       if (sum_count >= sum_max) { break; } // finished
549       Instance currInstance = instances.instance(indexList[i]);
550       // if value is missing then...
551       if (currInstance.isMissing(attIndex)) {
552         if (missing_count < missing_max) {
553           changeValueRandomly (randomValue, 
554                                numOfValues,
555                                attIndex, 
556                                currInstance,
557                                useMissing); 
558           missing_count++;
559           sum_count++;
560         }
561         
562       } else {
563         int vIndex = (int) currInstance.value(attIndex);
564         if (partition_count[vIndex] < partition_max[vIndex]) {
565           changeValueRandomly (randomValue,
566                                numOfValues,
567                                attIndex,     
568                                currInstance, 
569                                useMissing);           
570           partition_count[vIndex]++;
571           sum_count++;
572         }
573       }
574    }
575
576  }
577
578  /**
579   * method to set a new value
580   *
581   * @param r random function
582   * @param numOfValues
583   * @param instance
584   * @param useMissing
585   */
586  private void changeValueRandomly(Random r, int numOfValues,
587                                   int indexOfAtt, 
588                                   Instance instance, 
589                                   boolean useMissing) {
590    int currValue;
591
592    // get current value
593    // if value is missing set current value to number of values
594    // whiche is the highest possible value plus one
595    if (instance.isMissing(indexOfAtt)) {
596      currValue = numOfValues;
597    } else {
598      currValue = (int) instance.value(indexOfAtt);
599    }
600
601    // with only two possible values it is easier
602    if ((numOfValues == 2) && (!instance.isMissing(indexOfAtt))) {
603        instance.setValue(indexOfAtt, (double) ((currValue+1)% 2));
604    } else {
605      // get randomly a new value not equal to the current value
606      // if missing values are used as values they must be treated
607      // in a special way
608      while (true) {
609          int newValue;
610        if (useMissing) {
611          newValue = (int) (r.nextDouble() * (double) (numOfValues + 1));
612        } else {
613          newValue = (int) (r.nextDouble() * (double) numOfValues);
614        }
615        // have we found a new value?
616        if (newValue != currValue) { 
617          // the value 1 above the highest possible value (=numOfValues)
618          // is used as missing value
619          if (newValue == numOfValues) { instance.setMissing(indexOfAtt); }
620          else { instance.setValue(indexOfAtt, (double) newValue); }
621          break;
622        }
623      }
624    }
625  }
626 
627  /**
628   * Returns the revision string.
629   *
630   * @return            the revision
631   */
632  public String getRevision() {
633    return RevisionUtils.extract("$Revision: 5987 $");
634  }
635
636  /**
637   * Main method for testing this class.
638   *
639   * @param argv should contain arguments to the filter:
640   * use -h for help
641   */
642  public static void main(String [] argv) {
643    runFilter(new AddNoise(), argv);
644  }
645}
Note: See TracBrowser for help on using the repository browser.