source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/RELAGGS.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 17.4 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * RELAGGS.java
19 * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.filters.unsupervised.attribute;
23
24import weka.core.Attribute;
25import weka.core.AttributeStats;
26import weka.core.Capabilities;
27import weka.core.FastVector;
28import weka.core.Instance; 
29import weka.core.DenseInstance;
30import weka.core.Instances;
31import weka.core.Option;
32import weka.core.Range;
33import weka.core.RevisionUtils;
34import weka.core.TechnicalInformation;
35import weka.core.TechnicalInformationHandler;
36import weka.core.Utils;
37import weka.core.Capabilities.Capability;
38import weka.core.TechnicalInformation.Field;
39import weka.core.TechnicalInformation.Type;
40import weka.filters.SimpleBatchFilter;
41
42import java.util.Enumeration;
43import java.util.Hashtable;
44import java.util.Vector;
45
46/**
47 <!-- globalinfo-start -->
48 * A propositionalization filter inspired by the RELAGGS algorithm.<br/>
49 * It processes all relational attributes that fall into the user defined range (all others are skipped, i.e., not added to the output). Currently, the filter only processes one level of nesting.<br/>
50 * The class attribute is not touched.<br/>
51 * <br/>
52 * For more information see:<br/>
53 * <br/>
54 * M.-A. Krogel, S. Wrobel: Facets of Aggregation Approaches to Propositionalization. In: Work-in-Progress Track at the Thirteenth International Conference on Inductive Logic Programming (ILP), 2003.
55 * <p/>
56 <!-- globalinfo-end -->
57 *
58 <!-- technical-bibtex-start -->
59 * BibTeX:
60 * <pre>
61 * &#64;inproceedings{Krogel2003,
62 *    author = {M.-A. Krogel and S. Wrobel},
63 *    booktitle = {Work-in-Progress Track at the Thirteenth International Conference on Inductive Logic Programming (ILP)},
64 *    editor = {T. Horvath and A. Yamamoto},
65 *    title = {Facets of Aggregation Approaches to Propositionalization},
66 *    year = {2003},
67 *    PDF = {http://kd.cs.uni-magdeburg.de/\~krogel/papers/aggs.pdf}
68 * }
69 * </pre>
70 * <p/>
71 <!-- technical-bibtex-end -->
72 *
73 <!-- options-start -->
74 * Valid options are: <p/>
75 *
76 * <pre> -D
77 *  Turns on output of debugging information.</pre>
78 *
79 * <pre> -R &lt;index1,index2-index4,...&gt;
80 *  Specify list of relational attributes to process.
81 *  (default: select all relational attributes)</pre>
82 *
83 * <pre> -V
84 *  Inverts the matching sense of the selection.</pre>
85 *
86 * <pre> -C &lt;num&gt;
87 *  Max. cardinality of nominal attributes. If a nominal attribute
88 *  has more values than this upper limit, then it will be skipped.
89 *  (default: 20)</pre>
90 *
91 <!-- options-end -->
92 *
93 * @author  fracpete (fracpete at waikato dot ac dot nz)
94 * @version $Revision: 5987 $
95 */
96public class RELAGGS
97  extends SimpleBatchFilter
98  implements TechnicalInformationHandler {
99
  /** Version identifier used for serialization. */
  private static final long serialVersionUID = -3333791375278589231L;

  /**
   * The max. cardinality for nominal attributes: nominal attributes inside a
   * relational attribute that have more values than this are skipped
   * (initial value: 20).
   */
  protected int m_MaxCardinality = 20;

  /**
   * The range of attributes to process (only relational ones will be
   * processed); initially covers "first-last".
   */
  protected Range m_SelectedRange = new Range("first-last");

  /**
   * Stores the attribute statistics computed in process(), keyed by
   * <code>instance_index-att_index-att_index_in_rel_att &lt;-&gt; AttributeStats</code>,
   * i.e., one entry per instance, relational attribute and attribute inside
   * that relational attribute.
   */
  protected Hashtable<String,AttributeStats> m_AttStats = new Hashtable<String,AttributeStats>();
112 
113  /**
114   * Returns a string describing this filter
115   *
116   * @return            a description of the filter suitable for
117   *                    displaying in the explorer/experimenter gui
118   */
119  public String globalInfo() {
120    return 
121        "A propositionalization filter inspired by the RELAGGS algorithm.\n"
122      + "It processes all relational attributes that fall into the user defined "
123      + "range (all others are skipped, i.e., not added to the output). "
124      + "Currently, the filter only processes one level of nesting.\n"
125      + "The class attribute is not touched.\n"
126      + "\n"
127      + "For more information see:\n\n"
128      + getTechnicalInformation().toString();
129  }
130
131  /**
132   * Returns an instance of a TechnicalInformation object, containing
133   * detailed information about the technical background of this class,
134   * e.g., paper reference or book this class is based on.
135   *
136   * @return            the technical information about this class
137   */
138  public TechnicalInformation getTechnicalInformation() {
139    TechnicalInformation        result;
140   
141    result = new TechnicalInformation(Type.INPROCEEDINGS);
142    result.setValue(Field.AUTHOR, "M.-A. Krogel and S. Wrobel");
143    result.setValue(Field.TITLE, "Facets of Aggregation Approaches to Propositionalization");
144    result.setValue(Field.BOOKTITLE, "Work-in-Progress Track at the Thirteenth International Conference on Inductive Logic Programming (ILP)");
145    result.setValue(Field.EDITOR, "T. Horvath and A. Yamamoto");
146    result.setValue(Field.YEAR, "2003");
147    result.setValue(Field.PDF, "http://kd.cs.uni-magdeburg.de/~krogel/papers/aggs.pdf");
148   
149    return result;
150  }
151
152  /**
153   * Returns an enumeration describing the available options.
154   *
155   * @return            an enumeration of all the available options.
156   */
157  public Enumeration listOptions() {
158    Vector              result;
159    Enumeration         en;
160
161    result = new Vector();
162
163    en = super.listOptions();
164    while (en.hasMoreElements())
165      result.addElement(en.nextElement());
166
167    result.addElement(new Option(
168        "\tSpecify list of string attributes to convert to words.\n"
169        + "\t(default: select all relational attributes)",
170        "R", 1, "-R <index1,index2-index4,...>"));
171
172    result.addElement(new Option(
173        "\tInverts the matching sense of the selection.",
174        "V", 0, "-V"));
175
176    result.addElement(new Option(
177        "\tMax. cardinality of nominal attributes. If a nominal attribute\n"
178        + "\thas more values than this upper limit, then it will be skipped.\n"
179        + "\t(default: 20)",
180        "C", 1, "-C <num>"));
181
182    return result.elements();
183  }
184
185  /**
186   * Parses the options for this object. <p/>
187   *
188   <!-- options-start -->
189   * Valid options are: <p/>
190   *
191   * <pre> -D
192   *  Turns on output of debugging information.</pre>
193   *
194   * <pre> -R &lt;index1,index2-index4,...&gt;
195   *  Specify list of string attributes to convert to words.
196   *  (default: select all relational attributes)</pre>
197   *
198   * <pre> -V
199   *  Inverts the matching sense of the selection.</pre>
200   *
201   * <pre> -C &lt;num&gt;
202   *  Max. cardinality of nominal attributes. If a nominal attribute
203   *  has more values than this upper limit, then it will be skipped.
204   *  (default: 20)</pre>
205   *
206   <!-- options-end -->
207   *
208   * @param options     the options to use
209   * @throws Exception  if setting of options fails
210   */
211  public void setOptions(String[] options) throws Exception {
212    String      tmpStr;
213
214    tmpStr = Utils.getOption('R', options);
215    if (tmpStr.length() != 0)
216      setSelectedRange(tmpStr);
217    else
218      setSelectedRange("first-last");
219
220    setInvertSelection(Utils.getFlag('V', options));
221   
222    tmpStr = Utils.getOption('C', options);
223    if (tmpStr.length() != 0)
224      setMaxCardinality(Integer.parseInt(tmpStr));
225    else
226      setMaxCardinality(20);
227
228    super.setOptions(options);
229  }
230
231  /**
232   * Gets the current settings of the classifier.
233   *
234   * @return            an array of strings suitable for passing to setOptions
235   */
236  public String[] getOptions() {
237    int                 i;
238    Vector<String>      result;
239    String[]            options;
240
241    result = new Vector<String>();
242
243    options = super.getOptions();
244    for (i = 0; i < options.length; i++)
245      result.add(options[i]);
246
247    result.add("-R"); 
248    result.add(getSelectedRange().getRanges());
249
250    if (getInvertSelection())
251      result.add("-V");
252   
253    result.add("-C");
254    result.add("" + getMaxCardinality());
255   
256    return result.toArray(new String[result.size()]);     
257  }
258
259  /**
260   * Returns the tip text for this property
261   *
262   * @return            tip text for this property suitable for
263   *                    displaying in the explorer/experimenter gui
264   */
265  public String maxCardinalityTipText() {
266    return "The maximum number of values a nominal attribute can have before it's skipped.";
267  }
268
269  /**
270   * Sets the maximum number of values allowed for nominal attributes, before
271   * they're skipped.
272   *
273   * @param value       the maximum value.
274   */
275  public void setMaxCardinality(int value) {
276    m_MaxCardinality = value;
277  }
278 
279  /**
280   * Gets the maximum number of values allowed for nominal attributes, before
281   * they're skipped.
282   *
283   * @return            the maximum number.
284   */
285  public int getMaxCardinality() {
286    return m_MaxCardinality;
287  }
288
289  /**
290   * Returns the tip text for this property
291   *
292   * @return            tip text for this property suitable for
293   *                    displaying in the explorer/experimenter gui
294   */
295  public String attributeIndicesTipText() {
296    return 
297        "Specify range of attributes to act on; "
298      + "this is a comma separated list of attribute indices, with "
299      + "\"first\" and \"last\" valid values; Specify an inclusive "
300      + "range with \"-\"; eg: \"first-3,5,6-10,last\".";
301  }
302 
303  /**
304   * Set the range of attributes to process.
305   *
306   * @param value       the new range.
307   */
308  public void setSelectedRange(String value) {
309    m_SelectedRange = new Range(value);
310  }
311
312  /**
313   * Gets the current range selection.
314   *
315   * @return            current selection.
316   */
317  public Range getSelectedRange() {
318    return m_SelectedRange;
319  }
320
321  /**
322   * Returns the tip text for this property
323   *
324   * @return            tip text for this property suitable for
325   *                    displaying in the explorer/experimenter gui
326   */
327  public String invertSelectionTipText() {
328    return 
329        "Set attribute selection mode. If false, only selected "
330      + "attributes in the range will be worked on; if "
331      + "true, only non-selected attributes will be processed.";
332  }
333
334  /**
335   * Sets whether selected columns should be processed or skipped.
336   *
337   * @param value       the new invert setting
338   */
339  public void setInvertSelection(boolean value) {
340    m_SelectedRange.setInvert(value);
341  }
342
343  /**
344   * Gets whether the supplied columns are to be processed or skipped
345   *
346   * @return            true if the supplied columns will be kept
347   */
348  public boolean getInvertSelection() {
349    return m_SelectedRange.getInvert();
350  }
351
352  /**
353   * Returns the Capabilities of this filter.
354   *
355   * @return            the capabilities of this object
356   * @see               Capabilities
357   */
358  public Capabilities getCapabilities() {
359    Capabilities result = super.getCapabilities();
360    result.disableAll();
361
362    // attributes
363    result.enable(Capability.NOMINAL_ATTRIBUTES);
364    result.enable(Capability.NUMERIC_ATTRIBUTES);
365    result.enable(Capability.DATE_ATTRIBUTES);
366    result.enable(Capability.RELATIONAL_ATTRIBUTES);
367    result.enable(Capability.MISSING_VALUES);
368   
369    // class
370    result.enable(Capability.NOMINAL_CLASS);
371    result.enable(Capability.NUMERIC_CLASS);
372    result.enable(Capability.DATE_CLASS);
373    result.enable(Capability.MISSING_CLASS_VALUES);
374    result.enable(Capability.NO_CLASS);
375   
376    return result;
377  }
378
379  /**
380   * Determines the output format based on the input format and returns
381   * this. In case the output format cannot be returned immediately, i.e.,
382   * immediateOutputFormat() returns false, then this method will be called
383   * from batchFinished().
384   *
385   * @param inputFormat     the input format to base the output format on
386   * @return                the output format
387   * @throws Exception      in case the determination goes wrong
388   * @see   #hasImmediateOutputFormat()
389   * @see   #batchFinished()
390   */
391  protected Instances determineOutputFormat(Instances inputFormat)
392      throws Exception {
393
394    Instances   result;
395    Instances   relFormat;
396    FastVector  atts;
397    int         i;
398    int         n;
399    int         m;
400    int         clsIndex;
401    Attribute   att;
402    String      prefix;
403
404    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
405   
406    atts     = new FastVector();
407    clsIndex = -1;
408    for (i = 0; i < inputFormat.numAttributes(); i++) {
409      // we don't process the class
410      if (i == inputFormat.classIndex()) {
411        clsIndex = atts.size();
412        atts.addElement(inputFormat.attribute(i).copy());
413        continue;
414      }
415     
416      if (!inputFormat.attribute(i).isRelationValued()) {
417        atts.addElement(inputFormat.attribute(i).copy());
418        continue;
419      }
420     
421      if (!m_SelectedRange.isInRange(i)) {
422        if (getDebug())
423          System.out.println(
424              "Attribute " + (i+1) + " (" + inputFormat.attribute(i).name() 
425              + ") skipped.");
426        continue;
427      }
428
429      // process relational attribute
430      prefix    = inputFormat.attribute(i).name() + "_";
431      relFormat = inputFormat.attribute(i).relation();
432      for (n = 0; n < relFormat.numAttributes(); n++) {
433        att = relFormat.attribute(n);
434       
435        if (att.isNumeric()) {
436          atts.addElement(new Attribute(prefix + att.name() + "_MIN"));
437          atts.addElement(new Attribute(prefix + att.name() + "_MAX"));
438          atts.addElement(new Attribute(prefix + att.name() + "_AVG"));
439          atts.addElement(new Attribute(prefix + att.name() + "_STDEV"));
440          atts.addElement(new Attribute(prefix + att.name() + "_SUM"));
441        }
442        else if (att.isNominal()) {
443          if (att.numValues() <= m_MaxCardinality) {
444            for (m = 0; m < att.numValues(); m++)
445              atts.addElement(new Attribute(prefix + att.name() + "_" + att.value(m) + "_CNT"));
446          }
447          else {
448            if (getDebug())
449              System.out.println(
450                  "Attribute " + (i+1) + "/" + (n+1) 
451                  + " (" + inputFormat.attribute(i).name() + "/" + att.name()
452                  + ") skipped, " + att.numValues() + " > " + m_MaxCardinality + ".");
453          }
454        }
455        else {
456          if (getDebug())
457            System.out.println(
458                "Attribute " + (i+1) + "/" + (n+1) 
459                + " (" + inputFormat.attribute(i).name() + "/" + att.name()
460                + ") skipped.");
461        }
462      }
463    }
464   
465    // generate new format
466    result = new Instances(inputFormat.relationName(), atts, 0);
467    result.setClassIndex(clsIndex);
468   
469    // neither string nor relational attributes need to be copied to the
470    // output:
471    initOutputLocators(result, new int[0]);
472   
473    return result;
474  }
475
476  /**
477   * Processes the given data (may change the provided dataset) and returns
478   * the modified version. This method is called in batchFinished().
479   *
480   * @param instances   the data to process
481   * @return            the modified data
482   * @throws Exception  in case the processing goes wrong
483   * @see               #batchFinished()
484   */
485  protected Instances process(Instances instances) throws Exception {
486    Instances           result;
487    Instance            inst;
488    Instance            newInst;
489    Instances           relInstances;
490    int                 k;
491    int                 l;
492    int                 i;
493    int                 n;
494    int                 m;
495    AttributeStats      stats;
496    Attribute           att;
497   
498    result = getOutputFormat();
499
500    // initialize attribute statistics
501    m_AttStats.clear();
502
503    // collect data for all relational attributes
504    for (i = 0; i < instances.numAttributes(); i++) {
505      if (i == instances.classIndex())
506        continue;
507
508      if (!instances.attribute(i).isRelationValued())
509        continue;
510
511      if (!m_SelectedRange.isInRange(i))
512        continue;
513
514      // compute statistics
515      for (k = 0; k < instances.numInstances(); k++) {
516        relInstances = instances.instance(k).relationalValue(i);
517
518        for (n = 0; n < relInstances.numAttributes(); n++) {
519          att   = relInstances.attribute(n);
520          stats = null;
521
522          if (    att.isNumeric() 
523              || (att.isNominal() && att.numValues() <= m_MaxCardinality) ) {
524            stats = relInstances.attributeStats(n);
525            m_AttStats.put(k + "-" + i + "-" + n, stats);
526          }
527        }
528      }
529    }
530   
531    // convert data
532    for (k = 0; k < instances.numInstances(); k++) {
533      inst    = instances.instance(k);
534      newInst = new DenseInstance(result.numAttributes());
535      newInst.setWeight(inst.weight());
536
537      l = 0;
538      for (i = 0; i < instances.numAttributes(); i++) {
539        if (!instances.attribute(i).isRelationValued()) {
540          newInst.setValue(l, inst.value(i));
541          l++;
542        }
543        else {
544          if (!m_SelectedRange.isInRange(i))
545            continue;
546         
547          // replace relational data with statistics
548          relInstances = inst.relationalValue(i);
549          for (n = 0; n < relInstances.numAttributes(); n++) {
550            att   = relInstances.attribute(n);
551            stats = (AttributeStats) m_AttStats.get(k + "-" + i + "-" + n);
552           
553            if (att.isNumeric()) {
554              newInst.setValue(l, stats.numericStats.min); l++;
555              newInst.setValue(l, stats.numericStats.max); l++;
556              newInst.setValue(l, stats.numericStats.mean); l++;
557              newInst.setValue(l, stats.numericStats.stdDev); l++;
558              newInst.setValue(l, stats.numericStats.sum); l++;
559            }
560            else if (att.isNominal() && att.numValues() <= m_MaxCardinality) {
561              for (m = 0; m < att.numValues(); m++) {
562                newInst.setValue(l, stats.nominalCounts[m]);
563                l++;
564              }
565            }
566          }
567        }
568      }
569     
570      result.add(newInst);
571    }
572   
573    return result;
574  }
575 
576  /**
577   * Returns the revision string.
578   *
579   * @return            the revision
580   */
581  public String getRevision() {
582    return RevisionUtils.extract("$Revision: 5987 $");
583  }
584
585  /**
586   * runs the filter with the given arguments
587   *
588   * @param args      the commandline arguments
589   */
590  public static void main(String[] args) {
591    runFilter(new RELAGGS(), args);
592  }
593}
Note: See TracBrowser for help on using the repository browser.