source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/instance/RemoveFrequentValues.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 19.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RemoveFrequentValues.java
19 *    Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.unsupervised.instance;
24
25import weka.core.Attribute;
26import weka.core.AttributeStats;
27import weka.core.Capabilities;
28import weka.core.FastVector;
29import weka.core.Instance;
30import weka.core.Instances;
31import weka.core.Option;
32import weka.core.OptionHandler;
33import weka.core.RevisionUtils;
34import weka.core.SingleIndex;
35import weka.core.UnsupportedAttributeTypeException;
36import weka.core.Utils;
37import weka.core.Capabilities.Capability;
38import weka.filters.Filter;
39import weka.filters.UnsupervisedFilter;
40
41import java.util.Arrays;
42import java.util.Enumeration;
43import java.util.HashSet;
44import java.util.Iterator;
45import java.util.Vector;
46
47/**
48 <!-- globalinfo-start -->
49 * Determines which values (frequent or infrequent ones) of an (nominal) attribute are retained and filters the instances accordingly. In case of values with the same frequency, they are kept in the way they appear in the original instances object. E.g. if you have the values "1,2,3,4" with the frequencies "10,5,5,3" and you chose to keep the 2 most common values, the values "1,2" would be returned, since the value "2" comes before "3", even though they have the same frequency.
50 * <p/>
51 <!-- globalinfo-end -->
52 *
53 <!-- options-start -->
54 * Valid options are: <p/>
55 *
56 * <pre> -C &lt;num&gt;
57 *  Choose attribute to be used for selection.</pre>
58 *
59 * <pre> -N &lt;num&gt;
60 *  Number of values to retain for the sepcified attribute,
61 *  i.e. the ones with the most instances (default 2).</pre>
62 *
63 * <pre> -L
64 *  Instead of values with the most instances the ones with the
65 *  least are retained.
66 * </pre>
67 *
68 * <pre> -H
69 *  When selecting on nominal attributes, removes header
70 *  references to excluded values.</pre>
71 *
72 * <pre> -V
73 *  Invert matching sense.</pre>
74 *
75 <!-- options-end -->
76 *
77 * @author FracPete (fracpete at waikato dot ac dot nz)
78 * @version $Revision: 5499 $
79 */
80public class RemoveFrequentValues 
81   extends Filter
82   implements OptionHandler, UnsupervisedFilter {
83 
84   /** for serialization */
85   static final long serialVersionUID = -2447432930070059511L;
86
87   /** The attribute's index setting. */
88   private SingleIndex m_AttIndex = new SingleIndex("last"); 
89
90   /** the number of values to retain. */
91   protected int m_NumValues = 2; 
92   
93   /** whether to retain values with least instances instead of most. */
94   protected boolean m_LeastValues = false;
95   
96   /** whether to invert the matching sense. */
97   protected boolean m_Invert = false;
98   
99   /** Modify header for nominal attributes? */
100   protected boolean m_ModifyHeader = false;
101   
102   /** If m_ModifyHeader, stores a mapping from old to new indexes */
103   protected int [] m_NominalMapping;
104   
105   /** contains the values to retain */
106   protected HashSet m_Values = null;
107   
108   /**
109    * Returns a string describing this filter
110    * @return a description of the classifier suitable for
111    * displaying in the explorer/experimenter gui
112    */
113   public String globalInfo() {
114     return 
115         "Determines which values (frequent or infrequent ones) of an "
116       + "(nominal) attribute are retained and filters the instances "
117       + "accordingly. In case of values with the same frequency, they are "
118       + "kept in the way they appear in the original instances object. E.g. "
119       + "if you have the values \"1,2,3,4\" with the frequencies \"10,5,5,3\" "
120       + "and you chose to keep the 2 most common values, the values \"1,2\" "
121       + "would be returned, since the value \"2\" comes before \"3\", even "
122       + "though they have the same frequency.";
123   }
124
125   /**
126    * Returns an enumeration describing the available options.
127    *
128    * @return an enumeration of all the available options.
129    */
130   public Enumeration listOptions() {
131      Vector newVector = new Vector(5);
132
133      newVector.addElement(new Option(
134                "\tChoose attribute to be used for selection.",
135                "C", 1, "-C <num>"));
136      newVector.addElement(new Option(
137                  "\tNumber of values to retain for the sepcified attribute, \n"
138                + "\ti.e. the ones with the most instances (default 2).",
139                "N", 1, "-N <num>"));
140      newVector.addElement(new Option(
141                  "\tInstead of values with the most instances the ones with the \n"
142                + "\tleast are retained.\n",
143                "L", 0, "-L"));
144      newVector.addElement(new Option(
145                  "\tWhen selecting on nominal attributes, removes header\n"
146                 + "\treferences to excluded values.",
147                 "H", 0, "-H"));
148      newVector.addElement(new Option(
149                 "\tInvert matching sense.",
150                "V", 0, "-V"));
151
152      return newVector.elements();
153   }
154
155   /**
156    * Parses a given list of options. <p/>
157    *
158    <!-- options-start -->
159    * Valid options are: <p/>
160    *
161    * <pre> -C &lt;num&gt;
162    *  Choose attribute to be used for selection.</pre>
163    *
164    * <pre> -N &lt;num&gt;
165    *  Number of values to retain for the sepcified attribute,
166    *  i.e. the ones with the most instances (default 2).</pre>
167    *
168    * <pre> -L
169    *  Instead of values with the most instances the ones with the
170    *  least are retained.
171    * </pre>
172    *
173    * <pre> -H
174    *  When selecting on nominal attributes, removes header
175    *  references to excluded values.</pre>
176    *
177    * <pre> -V
178    *  Invert matching sense.</pre>
179    *
180    <!-- options-end -->
181    *
182    * @param options the list of options as an array of strings
183    * @throws Exception if an option is not supported
184    */
185   public void setOptions(String[] options) throws Exception {
186      String attIndex = Utils.getOption('C', options);
187      if (attIndex.length() != 0) {
188         setAttributeIndex(attIndex);
189      } else {
190         setAttributeIndex("last");
191      }
192     
193      String numValues = Utils.getOption('N', options);
194      if (numValues.length() != 0) {
195         setNumValues(Integer.parseInt(numValues));
196      } else {
197         setNumValues(2);
198      }
199     
200      setUseLeastValues(Utils.getFlag('L', options));
201
202      setModifyHeader(Utils.getFlag('H', options));
203
204      setInvertSelection(Utils.getFlag('V', options));
205     
206      if (getInputFormat() != null) {
207         setInputFormat(getInputFormat());
208      }
209   }
210
211   /**
212    * Gets the current settings of the filter.
213    *
214    * @return an array of strings suitable for passing to setOptions
215    */
216   public String[] getOptions() {
217      String [] options = new String [7];
218      int current = 0;
219
220      options[current++] = "-C";
221      options[current++] = "" + (getAttributeIndex());
222      options[current++] = "-N";
223      options[current++] = "" + (getNumValues());
224      if (getUseLeastValues()) {
225        options[current++] = "-H";
226      }
227      if (getModifyHeader()) {
228         options[current++] = "-H";
229      }
230      if (getInvertSelection()) {
231         options[current++] = "-V";
232      }
233      while (current < options.length) {
234        options[current++] = "";
235      }
236      return options;
237   }
238
239   /**
240    * Returns the tip text for this property
241    * @return tip text for this property suitable for
242    * displaying in the explorer/experimenter gui
243    */
244   public String attributeIndexTipText() {
245     return "Choose attribute to be used for selection (default last).";
246   }
247
248   /**
249    * Get the index of the attribute used.
250    *
251    * @return the index of the attribute
252    */
253   public String getAttributeIndex() {
254     return m_AttIndex.getSingleIndex();
255   }
256
257   /**
258    * Sets index of the attribute used.
259    *
260    * @param attIndex the index of the attribute
261    */
262   public void setAttributeIndex(String attIndex) {
263     m_AttIndex.setSingleIndex(attIndex);
264   }
265
266   /**
267    * Returns the tip text for this property
268    *
269    * @return tip text for this property suitable for
270    * displaying in the explorer/experimenter gui
271    */
272   public String numValuesTipText() {
273      return "The number of values to retain.";
274   }
275
276   /**
277    * Gets how many values are retained
278    *
279    * @return how many values are retained
280    */
281   public int getNumValues() {
282      return m_NumValues;
283   }
284
285   /**
286    * Sets how many values are retained 
287    *
288    * @param numValues the number of values to retain
289    */
290   public void setNumValues(int numValues) {
291      m_NumValues = numValues;
292   }
293
294   /**
295    * Returns the tip text for this property
296    *
297    * @return tip text for this property suitable for
298    * displaying in the explorer/experimenter gui
299    */
300   public String useLeastValuesTipText() {
301      return "Retains values with least instance instead of most.";
302   }
303
304   /**
305    * Gets whether to use values with least or most instances
306    *
307    * @return true if values with least instances are retained
308    */
309   public boolean getUseLeastValues() {
310      return m_LeastValues;
311   }
312
313   /**
314    * Sets whether to use values with least or most instances 
315    *
316    * @param leastValues whether values with least or most instances are retained
317    */
318   public void setUseLeastValues(boolean leastValues) {
319      m_LeastValues = leastValues;
320   }
321
322   /**
323    * Returns the tip text for this property
324    * @return tip text for this property suitable for
325    * displaying in the explorer/experimenter gui
326    */
327   public String modifyHeaderTipText() {
328      return "When selecting on nominal attributes, removes header references to "
329      + "excluded values.";
330   }
331   
332   /**
333    * Gets whether the header will be modified when selecting on nominal
334    * attributes.
335    *
336    * @return true if so.
337    */
338   public boolean getModifyHeader() {
339      return m_ModifyHeader;
340   }
341   
342   /**
343    * Sets whether the header will be modified when selecting on nominal
344    * attributes.
345    *
346    * @param newModifyHeader true if so.
347    */
348   public void setModifyHeader(boolean newModifyHeader) {
349      m_ModifyHeader = newModifyHeader;
350   }
351   
352   /**
353    * Returns the tip text for this property
354    *
355    * @return tip text for this property suitable for
356    * displaying in the explorer/experimenter gui
357    */
358   public String invertSelectionTipText() {
359      return "Invert matching sense.";
360   }
361
362   /**
363    * Get whether the supplied columns are to be removed or kept
364    *
365    * @return true if the supplied columns will be kept
366    */
367   public boolean getInvertSelection() {
368      return m_Invert;
369   }
370
371   /**
372    * Set whether selected values should be removed or kept. If true the
373    * selected values are kept and unselected values are deleted.
374    *
375    * @param invert the new invert setting
376    */
377   public void setInvertSelection(boolean invert) {
378      m_Invert = invert;
379   }
380
381   /**
382    * Returns true if selection attribute is nominal.
383    *
384    * @return true if selection attribute is nominal
385    */
386   public boolean isNominal() {
387      if (getInputFormat() == null) {
388         return false;
389      } else {
390         return getInputFormat().attribute(m_AttIndex.getIndex()).isNominal();
391      }
392   }
393   
394   /**
395    * determines the values to retain, it is always at least 1
396    * and up to the maximum number of distinct values
397    *
398    * @param inst the Instances to determine the values from which are kept 
399    */
400   public void determineValues(Instances inst) {
401      int                                       i;
402      AttributeStats            stats;
403      int                                       attIdx;
404      int                                       min;
405      int                                       max;
406      int                                       count;
407
408      m_AttIndex.setUpper(inst.numAttributes() - 1);
409      attIdx = m_AttIndex.getIndex();
410     
411      // init names
412      m_Values = new HashSet();
413     
414      if (inst == null)
415         return;
416     
417      // number of values to retain
418      stats = inst.attributeStats(attIdx);
419      if (m_Invert)
420         count = stats.nominalCounts.length - m_NumValues;
421      else
422         count = m_NumValues;
423      // out of bounds? -> fix
424      if (count < 1)
425         count = 1;  // at least one value!
426      if (count > stats.nominalCounts.length)
427         count = stats.nominalCounts.length;  // at max the existing values
428     
429      // determine min/max occurences
430      Arrays.sort(stats.nominalCounts);
431      if (m_LeastValues) {
432         min = stats.nominalCounts[0];
433         max = stats.nominalCounts[count - 1];
434      }
435      else {
436         min = stats.nominalCounts[(stats.nominalCounts.length - 1) - count + 1];
437         max = stats.nominalCounts[stats.nominalCounts.length - 1];
438      }
439     
440      // add values if they are inside min/max (incl. borders) and not more than count
441      stats = inst.attributeStats(attIdx);
442      for (i = 0; i < stats.nominalCounts.length; i++) {
443         if ( (stats.nominalCounts[i] >= min) && (stats.nominalCounts[i] <= max) && (m_Values.size() < count) )
444            m_Values.add(inst.attribute(attIdx).value(i));
445      }
446   }
447   
448   /**
449    * modifies the header of the Instances and returns the format w/o
450    * any instances
451    *
452    * @param instanceInfo the instances structure to modify
453    * @return the new structure (w/o instances!)
454    */
455   protected Instances modifyHeader(Instances instanceInfo) {
456      instanceInfo = new Instances(getInputFormat(), 0); // copy before modifying
457      Attribute oldAtt = instanceInfo.attribute(m_AttIndex.getIndex());
458      int [] selection = new int[m_Values.size()];
459      Iterator iter = m_Values.iterator();
460      int i = 0;
461      while (iter.hasNext()) {
462         selection[i] = oldAtt.indexOfValue(iter.next().toString());
463         i++;
464      }
465      FastVector newVals = new FastVector();
466      for (i = 0; i < selection.length; i++) {
467         newVals.addElement(oldAtt.value(selection[i]));
468      }
469      instanceInfo.deleteAttributeAt(m_AttIndex.getIndex());
470      instanceInfo.insertAttributeAt(new Attribute(oldAtt.name(), newVals),
471            m_AttIndex.getIndex());
472      m_NominalMapping = new int [oldAtt.numValues()];
473      for (i = 0; i < m_NominalMapping.length; i++) {
474         boolean found = false;
475         for (int j = 0; j < selection.length; j++) {
476            if (selection[j] == i) {
477               m_NominalMapping[i] = j;
478               found = true;
479               break;
480            }
481         }
482         if (!found) {
483            m_NominalMapping[i] = -1;
484         }
485      }
486     
487      return instanceInfo;
488   }
489
490   /**
491    * Returns the Capabilities of this filter.
492    *
493    * @return            the capabilities of this object
494    * @see               Capabilities
495    */
496   public Capabilities getCapabilities() {
497     Capabilities result = super.getCapabilities();
498     result.disableAll();
499
500     // attributes
501     result.enableAllAttributes();
502     result.enable(Capability.MISSING_VALUES);
503     
504     // class
505     result.enableAllClasses();
506     result.enable(Capability.MISSING_CLASS_VALUES);
507     result.enable(Capability.NO_CLASS);
508     
509     return result;
510   }
511
512   /**
513    * Sets the format of the input instances.
514    *
515    * @param instanceInfo an Instances object containing the input instance
516    * structure (any instances contained in the object are ignored - only the
517    * structure is required).
518    * @return true if the outputFormat can be collected immediately
519    * @throws UnsupportedAttributeTypeException if the specified attribute
520    *         is not nominal.
521    */
522   public boolean setInputFormat(Instances instanceInfo) throws Exception {
523      super.setInputFormat(instanceInfo);
524     
525      m_AttIndex.setUpper(instanceInfo.numAttributes() - 1);
526
527      if (!isNominal())
528         throw new UnsupportedAttributeTypeException("Can only handle nominal attributes.");
529     
530      m_Values = null;
531         
532      return false;
533   }
534   
535   /**
536    * Set the output format. Takes the currently defined Values to retain and
537    * m_InputFormat and calls setOutputFormat(Instances) appropriately.
538    * Those instances that have a value to retain are "push"ed to the output.
539    */
540   protected void setOutputFormat() {
541      Instances      instances;
542      int            i;
543      Instance       instance;
544     
545      if (m_Values == null) {
546         setOutputFormat(null);
547         return;
548      }
549     
550      // get structure
551      if (getModifyHeader())
552         instances = modifyHeader(getInputFormat());
553      else
554         instances = new Instances(getInputFormat(), 0);
555      setOutputFormat(instances);
556     
557      // remove instances with unwanted values, for the others change the values
558      // value if m_ModifyHeader is set
559      for (i = 0; i < getInputFormat().numInstances(); i++) {
560         instance = getInputFormat().instance(i);
561     
562         if (m_Values.contains(instance.stringValue(m_AttIndex.getIndex()))) {
563            if (getModifyHeader()) {
564               instance.setValue(m_AttIndex.getIndex(),
565                     m_NominalMapping[(int)instance.value(m_AttIndex.getIndex())]);
566            }
567            push(instance);
568         }
569      }
570   }
571   
572   /**
573    * Input an instance for filtering. Ordinarily the instance is processed
574    * and made available for output immediately. Some filters require all
575    * instances be read before producing output.
576    *
577    * @param instance the input instance
578    * @return true if the filtered instance may now be
579    * collected with output().
580    * @throws IllegalStateException if no input format has been set.
581    */
582   public boolean input(Instance instance) {
583      if (getInputFormat() == null) {
584         throw new IllegalStateException("No input instance format defined");
585      }
586     
587      if (m_NewBatch) {
588         resetQueue();
589         m_NewBatch = false;
590      }
591
592      if (isFirstBatchDone()) {
593        push(instance);
594        return true;
595      } 
596      else {
597        bufferInput(instance);
598        return false;
599      }
600   }
601
602   /**
603    * Signifies that this batch of input to the filter is finished. If the
604    * filter requires all instances prior to filtering, output() may now
605    * be called to retrieve the filtered instances.
606    *
607    * @return true if there are instances pending output
608    * @throws IllegalStateException if no input structure has been defined
609    */
610   public boolean batchFinished() {
611      if (getInputFormat() == null) {
612         throw new IllegalStateException("No input instance format defined");
613      }
614
615      // process input
616      if (m_Values == null) {
617         determineValues(getInputFormat());
618         setOutputFormat();
619      } 
620      flushInput();
621     
622      m_NewBatch = true;
623      m_FirstBatchDone = true;
624
625      return (numPendingOutput() != 0);
626   }
627   
628   /**
629    * Returns the revision string.
630    *
631    * @return           the revision
632    */
633   public String getRevision() {
634     return RevisionUtils.extract("$Revision: 5499 $");
635   }
636   
637   /**
638    * Main method for testing this class.
639    *
640    * @param argv should contain arguments to the filter:
641    * use -h for help
642    */
643   public static void main(String[] argv) {
644      runFilter(new RemoveFrequentValues(), argv);
645   }
646}
Note: See TracBrowser for help on using the repository browser.