source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/RemoveUseless.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 9.8 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RemoveUseless.java
19 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.unsupervised.attribute;
24
25import weka.core.AttributeStats;
26import weka.core.Capabilities;
27import weka.core.Instance; 
28import weka.core.DenseInstance;
29import weka.core.Instances;
30import weka.core.Option;
31import weka.core.OptionHandler;
32import weka.core.RevisionUtils;
33import weka.core.Utils;
34import weka.core.Capabilities.Capability;
35import weka.filters.Filter;
36import weka.filters.UnsupervisedFilter;
37
38import java.util.Enumeration;
39import java.util.Vector;
40
41/**
42 <!-- globalinfo-start -->
43 * This filter removes attributes that do not vary at all or that vary too much. All constant attributes are deleted automatically, along with any that exceed the maximum percentage of variance parameter. The maximum variance test is only applied to nominal attributes.
44 * <p/>
45 <!-- globalinfo-end -->
46 *
47 <!-- options-start -->
48 * Valid options are: <p/>
49 *
50 * <pre> -M &lt;max variance %&gt;
51 *  Maximum variance percentage allowed (default 99)</pre>
52 *
53 <!-- options-end -->
54 *
55 * @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
56 * @version $Revision: 5987 $
57 */
58public class RemoveUseless 
59  extends Filter
60  implements UnsupervisedFilter, OptionHandler {
61 
62  /** for serialization */
63  static final long serialVersionUID = -8659417851407640038L;
64
65  /** The filter used to remove attributes */
66  protected Remove m_removeFilter = null;
67
68  /** The type of attribute to delete */
69  protected double m_maxVariancePercentage = 99.0;
70
71  /**
72   * Returns the Capabilities of this filter.
73   *
74   * @return            the capabilities of this object
75   * @see               Capabilities
76   */
77  public Capabilities getCapabilities() {
78    Capabilities result = super.getCapabilities();
79
80    // attributes
81    result.enable(Capability.NOMINAL_ATTRIBUTES);
82    result.enable(Capability.NUMERIC_ATTRIBUTES);
83    result.enable(Capability.DATE_ATTRIBUTES);
84    result.enable(Capability.STRING_ATTRIBUTES);
85    result.enable(Capability.MISSING_VALUES);
86   
87    // class
88    result.enableAllClasses();
89    result.enable(Capability.MISSING_CLASS_VALUES);
90    result.enable(Capability.NO_CLASS);
91   
92    return result;
93  }
94
95  /**
96   * Sets the format of the input instances.
97   *
98   * @param instanceInfo an Instances object containing the input instance
99   * structure (any instances contained in the object are ignored - only the
100   * structure is required).
101   * @return true if the outputFormat may be collected immediately
102   * @throws Exception if the inputFormat can't be set successfully
103   */ 
104  public boolean setInputFormat(Instances instanceInfo) throws Exception {
105
106    super.setInputFormat(instanceInfo);
107    m_removeFilter = null;
108    return false;
109  }
110
111  /**
112   * Input an instance for filtering.
113   *
114   * @param instance the input instance
115   * @return true if the filtered instance may now be
116   * collected with output().
117   */
118  public boolean input(Instance instance) {
119
120    if (getInputFormat() == null) {
121      throw new IllegalStateException("No input instance format defined");
122    }
123    if (m_NewBatch) {
124      resetQueue();
125      m_NewBatch = false;
126    }
127    if (m_removeFilter != null) {
128      m_removeFilter.input(instance);
129      Instance processed = m_removeFilter.output();
130      processed.setDataset(getOutputFormat());
131      copyValues(processed, false, instance.dataset(), getOutputFormat());
132      push(processed);
133      return true;
134    }
135    bufferInput(instance);
136    return false;
137  }
138
139  /**
140   * Signify that this batch of input to the filter is finished.
141   *
142   * @return true if there are instances pending output
143   * @throws Exception if no input format defined
144   */ 
145  public boolean batchFinished() throws Exception {
146
147    if (getInputFormat() == null) {
148      throw new IllegalStateException("No input instance format defined");
149    }
150    if (m_removeFilter == null) {
151
152      // establish attributes to remove from first batch
153
154      Instances toFilter = getInputFormat();
155      int[] attsToDelete = new int[toFilter.numAttributes()];
156      int numToDelete = 0;
157      for(int i = 0; i < toFilter.numAttributes(); i++) {
158        if (i==toFilter.classIndex()) continue; // skip class
159        AttributeStats stats = toFilter.attributeStats(i);
160        if (stats.distinctCount < 2) {
161          // remove constant attributes
162          attsToDelete[numToDelete++] = i;
163        } else if (toFilter.attribute(i).isNominal()) {
164          // remove nominal attributes that vary too much
165          double variancePercent = (double) stats.distinctCount
166            / (double)(stats.totalCount - stats.missingCount) * 100.0;
167          if (variancePercent > m_maxVariancePercentage) {
168              attsToDelete[numToDelete++] = i;
169          }
170        }
171      }
172     
173      int[] finalAttsToDelete = new int[numToDelete];
174      System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete);
175     
176      m_removeFilter = new Remove();
177      m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
178      m_removeFilter.setInvertSelection(false);
179      m_removeFilter.setInputFormat(toFilter);
180     
181      for (int i = 0; i < toFilter.numInstances(); i++) {
182        m_removeFilter.input(toFilter.instance(i));
183      }
184      m_removeFilter.batchFinished();
185
186      Instance processed;
187      Instances outputDataset = m_removeFilter.getOutputFormat();
188   
189      // restore old relation name to hide attribute filter stamp
190      outputDataset.setRelationName(toFilter.relationName());
191   
192      setOutputFormat(outputDataset);
193      while ((processed = m_removeFilter.output()) != null) {
194        processed.setDataset(outputDataset);
195        push(processed);
196      }
197    }
198    flushInput();
199   
200    m_NewBatch = true;
201    return (numPendingOutput() != 0);
202  }
203
204  /**
205   * Returns an enumeration describing the available options.
206   *
207   * @return an enumeration of all the available options.
208   */
209  public Enumeration listOptions() {
210
211    Vector newVector = new Vector(1);
212
213    newVector.addElement(new Option(
214                                    "\tMaximum variance percentage allowed (default 99)",
215                                    "M", 1, "-M <max variance %>"));
216
217
218    return newVector.elements();
219  }
220
221  /**
222   * Parses a given list of options. <p/>
223   *
224   <!-- options-start -->
225   * Valid options are: <p/>
226   *
227   * <pre> -M &lt;max variance %&gt;
228   *  Maximum variance percentage allowed (default 99)</pre>
229   *
230   <!-- options-end -->
231   *
232   * @param options the list of options as an array of strings
233   * @throws Exception if an option is not supported
234   */
235  public void setOptions(String[] options) throws Exception {
236   
237    String mString = Utils.getOption('M', options);
238    if (mString.length() != 0) {
239      setMaximumVariancePercentageAllowed((int) Double.valueOf(mString).doubleValue());
240    } else {
241      setMaximumVariancePercentageAllowed(99.0);
242    }
243
244    if (getInputFormat() != null) {
245      setInputFormat(getInputFormat());
246    }
247  }
248
249  /**
250   * Gets the current settings of the filter.
251   *
252   * @return an array of strings suitable for passing to setOptions
253   */
254  public String [] getOptions() {
255
256    String [] options = new String [2];
257    int current = 0;
258
259    options[current++] = "-M";
260    options[current++] = "" + getMaximumVariancePercentageAllowed();
261   
262    while (current < options.length) {
263      options[current++] = "";
264    }
265    return options;
266  }
267
268  /**
269   * Returns a string describing this filter
270   *
271   * @return a description of the filter suitable for
272   * displaying in the explorer/experimenter gui
273   */
274  public String globalInfo() {
275    return 
276        "This filter removes attributes that do not vary at all or that vary "
277      + "too much. All constant attributes are deleted automatically, along "
278      + "with any that exceed the maximum percentage of variance parameter. "
279      + "The maximum variance test is only applied to nominal attributes.";
280  }
281
282  /**
283   * Returns the tip text for this property
284   *
285   * @return tip text for this property suitable for
286   * displaying in the explorer/experimenter gui
287   */
288  public String maximumVariancePercentageAllowedTipText() {
289
290    return "Set the threshold for the highest variance allowed before a nominal attribute will be deleted."
291      + "Specifically, if (number_of_distinct_values / total_number_of_values * 100)"
292      + " is greater than this value then the attribute will be removed.";
293  }
294
295  /**
296   * Sets the maximum variance attributes are allowed to have before they are
297   * deleted by the filter.
298   *
299   * @param maxVariance the maximum variance allowed, specified as a percentage
300   */
301  public void setMaximumVariancePercentageAllowed(double maxVariance) {
302   
303    m_maxVariancePercentage = maxVariance;
304  }
305
306  /**
307   * Gets the maximum variance attributes are allowed to have before they are
308   * deleted by the filter.
309   *
310   * @return the maximum variance allowed, specified as a percentage
311   */
312  public double getMaximumVariancePercentageAllowed() {
313
314    return m_maxVariancePercentage;
315  }
316 
317  /**
318   * Returns the revision string.
319   *
320   * @return            the revision
321   */
322  public String getRevision() {
323    return RevisionUtils.extract("$Revision: 5987 $");
324  }
325
326  /**
327   * Main method for testing this class.
328   *
329   * @param argv should contain arguments to the filter: use -h for help
330   */
331  public static void main(String [] argv) {
332    runFilter(new RemoveUseless(), argv);
333  }
334}
Note: See TracBrowser for help on using the repository browser.