source: branches/MetisMQI/src/main/java/weka/filters/supervised/attribute/NominalToBinary.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 19.4 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    NominalToBinary.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23
24package weka.filters.supervised.attribute;
25
26import weka.core.Attribute;
27import weka.core.Capabilities;
28import weka.core.FastVector;
29import weka.core.Instance;
30import weka.core.DenseInstance;
31import weka.core.Instances;
32import weka.core.Option;
33import weka.core.OptionHandler;
34import weka.core.RevisionUtils;
35import weka.core.SparseInstance;
36import weka.core.TechnicalInformation;
37import weka.core.TechnicalInformationHandler;
38import weka.core.UnassignedClassException;
39import weka.core.Utils;
40import weka.core.Capabilities.Capability;
41import weka.core.TechnicalInformation.Field;
42import weka.core.TechnicalInformation.Type;
43import weka.filters.Filter;
44import weka.filters.SupervisedFilter;
45
46import java.util.Enumeration;
47import java.util.Vector;
48
49/**
50 <!-- globalinfo-start -->
51 * Converts all nominal attributes into binary numeric attributes. An attribute with k values is transformed into k binary attributes if the class is nominal (using the one-attribute-per-value approach). Binary attributes are left binary, if option '-A' is not given. If the class is numeric, k - 1 new binary attributes are generated in the manner described in "Classification and Regression Trees" by Breiman et al. (i.e. taking the average class value associated with each attribute value into account)<br/>
52 * <br/>
53 * For more information, see:<br/>
54 * <br/>
55 * L. Breiman, J.H. Friedman, R.A. Olshen, C.J. Stone (1984). Classification and Regression Trees. Wadsworth Inc.
56 * <p/>
57 <!-- globalinfo-end -->
58 *
59 <!-- technical-bibtex-start -->
60 * BibTeX:
61 * <pre>
62 * &#64;book{Breiman1984,
63 *    author = {L. Breiman and J.H. Friedman and R.A. Olshen and C.J. Stone},
64 *    publisher = {Wadsworth Inc},
65 *    title = {Classification and Regression Trees},
66 *    year = {1984},
67 *    ISBN = {0412048418}
68 * }
69 * </pre>
70 * <p/>
71 <!-- technical-bibtex-end -->
72 *
73 <!-- options-start -->
74 * Valid options are: <p/>
75 *
76 * <pre> -N
77 *  Sets if binary attributes are to be coded as nominal ones.</pre>
78 *
79 * <pre> -A
80 *  For each nominal value a new attribute is created,
81 *  not only if there are more than 2 values.</pre>
82 *
83 <!-- options-end -->
84 *
85 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
86 * @version $Revision: 5987 $
87 */
88public class NominalToBinary 
89  extends Filter
90  implements SupervisedFilter, OptionHandler, TechnicalInformationHandler {
91 
92  /** for serialization */
93  static final long serialVersionUID = -5004607029857673950L;
94
95  /** The sorted indices of the attribute values. */
96  private int[][] m_Indices = null;
97
98  /** Are the new attributes going to be nominal or numeric ones? */
99  private boolean m_Numeric = true;
100
101  /** Are all values transformed into new attributes? */
102  private boolean m_TransformAll = false;
103
104  /**
105   * Returns a string describing this filter
106   *
107   * @return a description of the filter suitable for
108   * displaying in the explorer/experimenter gui
109   */
110  public String globalInfo() {
111
112    return "Converts all nominal attributes into binary numeric attributes. An "
113      + "attribute with k values is transformed into k binary attributes if "
114      + "the class is nominal (using the one-attribute-per-value approach). "
115      + "Binary attributes are left binary, if option '-A' is not given."
116      + "If the class is numeric, k - 1 new binary attributes are generated "
117      + "in the manner described in \"Classification and Regression "
118      + "Trees\" by Breiman et al. (i.e. taking the average class value associated "
119      + "with each attribute value into account)\n\n"
120      + "For more information, see:\n\n"
121      + getTechnicalInformation().toString();
122  }
123
124  /**
125   * Returns an instance of a TechnicalInformation object, containing
126   * detailed information about the technical background of this class,
127   * e.g., paper reference or book this class is based on.
128   *
129   * @return the technical information about this class
130   */
131  public TechnicalInformation getTechnicalInformation() {
132    TechnicalInformation        result;
133   
134    result = new TechnicalInformation(Type.BOOK);
135    result.setValue(Field.AUTHOR, "L. Breiman and J.H. Friedman and R.A. Olshen and C.J. Stone");
136    result.setValue(Field.TITLE, "Classification and Regression Trees");
137    result.setValue(Field.YEAR, "1984");
138    result.setValue(Field.PUBLISHER, "Wadsworth Inc");
139    result.setValue(Field.ISBN, "0412048418");
140   
141    return result;
142  }
143
144  /**
145   * Returns the Capabilities of this filter.
146   *
147   * @return            the capabilities of this object
148   * @see               Capabilities
149   */
150  public Capabilities getCapabilities() {
151    Capabilities result = super.getCapabilities();
152    result.disableAll();
153
154    // attributes
155    result.enableAllAttributes();
156    result.enable(Capability.MISSING_VALUES);
157   
158    // class
159    result.enable(Capability.NUMERIC_CLASS);
160    result.enable(Capability.DATE_CLASS);
161    result.enable(Capability.NOMINAL_CLASS);
162   
163    return result;
164  }
165
166  /**
167   * Sets the format of the input instances.
168   *
169   * @param instanceInfo an Instances object containing the input
170   * instance structure (any instances contained in the object are
171   * ignored - only the structure is required).
172   * @return true if the outputFormat may be collected immediately
173   * @throws Exception if the input format can't be set
174   * successfully
175   */
176  public boolean setInputFormat(Instances instanceInfo) 
177       throws Exception {
178
179    super.setInputFormat(instanceInfo);
180    if (instanceInfo.classIndex() < 0) {
181      throw new UnassignedClassException("No class has been assigned to the instances");
182    }
183    setOutputFormat();
184    m_Indices = null;
185    if (instanceInfo.classAttribute().isNominal()) {
186      return true;
187    } else {
188      return false;
189    }
190  }
191
192  /**
193   * Input an instance for filtering. Filter requires all
194   * training instances be read before producing output.
195   *
196   * @param instance the input instance
197   * @return true if the filtered instance may now be
198   * collected with output().
199   * @throws IllegalStateException if no input format has been set
200   */
201  public boolean input(Instance instance) {
202
203    if (getInputFormat() == null) {
204      throw new IllegalStateException("No input instance format defined");
205    }
206    if (m_NewBatch) {
207      resetQueue();
208      m_NewBatch = false;
209    }
210    if ((m_Indices != null) || 
211        (getInputFormat().classAttribute().isNominal())) {
212      convertInstance(instance);
213      return true;
214    }
215    bufferInput(instance);
216    return false;
217  }
218
219  /**
220   * Signify that this batch of input to the filter is finished.
221   * If the filter requires all instances prior to filtering,
222   * output() may now be called to retrieve the filtered instances.
223   *
224   * @return true if there are instances pending output
225   * @throws IllegalStateException if no input structure has been defined
226   */
227  public boolean batchFinished() {
228
229    if (getInputFormat() == null) {
230      throw new IllegalStateException("No input instance format defined");
231    }
232    if ((m_Indices == null) && 
233        (getInputFormat().classAttribute().isNumeric())) {
234      computeAverageClassValues();
235      setOutputFormat();
236
237      // Convert pending input instances
238
239      for(int i = 0; i < getInputFormat().numInstances(); i++) {
240        convertInstance(getInputFormat().instance(i));
241      }
242    } 
243    flushInput();
244
245    m_NewBatch = true;
246    return (numPendingOutput() != 0);
247  }
248
249  /**
250   * Returns an enumeration describing the available options.
251   *
252   * @return an enumeration of all the available options.
253   */
254  public Enumeration listOptions() {
255
256    Vector newVector = new Vector(1);
257
258    newVector.addElement(new Option(
259        "\tSets if binary attributes are to be coded as nominal ones.",
260        "N", 0, "-N"));
261   
262    newVector.addElement(new Option(
263        "\tFor each nominal value a new attribute is created, \n"
264        + "\tnot only if there are more than 2 values.",
265        "A", 0, "-A"));
266
267    return newVector.elements();
268  }
269
270
271  /**
272   * Parses a given list of options. <p/>
273   *
274   <!-- options-start -->
275   * Valid options are: <p/>
276   *
277   * <pre> -N
278   *  Sets if binary attributes are to be coded as nominal ones.</pre>
279   *
280   * <pre> -A
281   *  For each nominal value a new attribute is created,
282   *  not only if there are more than 2 values.</pre>
283   *
284   <!-- options-end -->
285   *
286   * @param options the list of options as an array of strings
287   * @throws Exception if an option is not supported
288   */
289  public void setOptions(String[] options) throws Exception {
290
291    setBinaryAttributesNominal(Utils.getFlag('N', options));
292
293    setTransformAllValues(Utils.getFlag('A', options));
294
295    if (getInputFormat() != null)
296      setInputFormat(getInputFormat());
297  }
298
299  /**
300   * Gets the current settings of the filter.
301   *
302   * @return an array of strings suitable for passing to setOptions
303   */
304  public String [] getOptions() {
305
306    String [] options = new String [1];
307    int current = 0;
308
309    if (getBinaryAttributesNominal()) {
310      options[current++] = "-N";
311    }
312
313    if (getTransformAllValues()) {
314      options[current++] = "-A";
315    }
316
317    while (current < options.length) {
318      options[current++] = "";
319    }
320    return options;
321  }
322
323  /**
324   * Returns the tip text for this property
325   *
326   * @return tip text for this property suitable for
327   * displaying in the explorer/experimenter gui
328   */
329  public String binaryAttributesNominalTipText() {
330    return "Whether resulting binary attributes will be nominal.";
331  }
332
333  /**
334   * Gets if binary attributes are to be treated as nominal ones.
335   *
336   * @return true if binary attributes are to be treated as nominal ones
337   */
338  public boolean getBinaryAttributesNominal() {
339
340    return !m_Numeric;
341  }
342
343  /**
344   * Sets if binary attributes are to be treates as nominal ones.
345   *
346   * @param bool true if binary attributes are to be treated as nominal ones
347   */
348  public void setBinaryAttributesNominal(boolean bool) {
349
350    m_Numeric = !bool;
351  }
352
353  /**
354   * Returns the tip text for this property
355   *
356   * @return tip text for this property suitable for
357   * displaying in the explorer/experimenter gui
358   */
359  public String transformAllValuesTipText() {
360    return "Whether all nominal values are turned into new attributes, not only if there are more than 2.";
361  }
362
363  /**
364   * Gets if all nominal values are turned into new attributes, not only if
365   * there are more than 2.
366   *
367   * @return true all nominal values are transformed into new attributes
368   */
369  public boolean getTransformAllValues() {
370
371    return m_TransformAll;
372  }
373
374  /**
375   * Sets whether all nominal values are transformed into new attributes, not
376   * just if there are more than 2.
377   *
378   * @param bool true if all nominal value are transformed into new attributes
379   */
380  public void setTransformAllValues(boolean bool) {
381
382    m_TransformAll = bool;
383  }
384
385  /** Computes average class values for each attribute and value */
386  private void computeAverageClassValues() {
387
388    double totalCounts, sum;
389    Instance instance;
390    double [] counts;
391
392    double [][] avgClassValues = new double[getInputFormat().numAttributes()][0];
393    m_Indices = new int[getInputFormat().numAttributes()][0];
394    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
395      Attribute att = getInputFormat().attribute(j);
396      if (att.isNominal()) {
397        avgClassValues[j] = new double [att.numValues()];
398        counts = new double [att.numValues()];
399        for (int i = 0; i < getInputFormat().numInstances(); i++) {
400          instance = getInputFormat().instance(i);
401          if (!instance.classIsMissing() && 
402              (!instance.isMissing(j))) {
403            counts[(int)instance.value(j)] += instance.weight();
404            avgClassValues[j][(int)instance.value(j)] += 
405              instance.weight() * instance.classValue();
406          }
407        }
408        sum = Utils.sum(avgClassValues[j]);
409        totalCounts = Utils.sum(counts);
410        if (Utils.gr(totalCounts, 0)) {
411          for (int k = 0; k < att.numValues(); k++) {
412            if (Utils.gr(counts[k], 0)) {
413              avgClassValues[j][k] /= (double)counts[k];
414            } else {
415              avgClassValues[j][k] = sum / (double)totalCounts;
416            }
417          }
418        }
419        m_Indices[j] = Utils.sort(avgClassValues[j]);
420      }
421    }
422  }
423
424  /** Set the output format. */
425  private void setOutputFormat() {
426
427    if (getInputFormat().classAttribute().isNominal()) {
428      setOutputFormatNominal();
429    } else {
430      setOutputFormatNumeric();
431    }
432  }
433
434  /**
435   * Convert a single instance over. The converted instance is
436   * added to the end of the output queue.
437   *
438   * @param instance the instance to convert
439   */
440  private void convertInstance(Instance inst) {
441
442    if (getInputFormat().classAttribute().isNominal()) {
443      convertInstanceNominal(inst);
444    } else {
445      convertInstanceNumeric(inst);
446    }
447  }
448
449  /**
450   * Set the output format if the class is nominal.
451   */
452  private void setOutputFormatNominal() {
453
454    FastVector newAtts;
455    int newClassIndex;
456    StringBuffer attributeName;
457    Instances outputFormat;
458    FastVector vals;
459
460    // Compute new attributes
461
462    newClassIndex = getInputFormat().classIndex();
463    newAtts = new FastVector();
464    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
465      Attribute att = getInputFormat().attribute(j);
466      if ((!att.isNominal()) || 
467          (j == getInputFormat().classIndex())) {
468        newAtts.addElement(att.copy());
469      } else {
470        if ( (att.numValues() <= 2) && (!m_TransformAll) ) {
471          if (m_Numeric) {
472            newAtts.addElement(new Attribute(att.name()));
473          } else {
474            newAtts.addElement(att.copy());
475          }
476        } else {
477
478          if (j < getInputFormat().classIndex()) {
479            newClassIndex += att.numValues() - 1;
480          }
481
482          // Compute values for new attributes
483          for (int k = 0; k < att.numValues(); k++) {
484            attributeName = 
485              new StringBuffer(att.name() + "=");
486            attributeName.append(att.value(k));
487            if (m_Numeric) {
488              newAtts.
489                addElement(new Attribute(attributeName.toString()));
490            } else {
491              vals = new FastVector(2);
492              vals.addElement("f"); vals.addElement("t");
493              newAtts.
494                addElement(new Attribute(attributeName.toString(), vals));
495            }
496          }
497        }
498      }
499    }
500    outputFormat = new Instances(getInputFormat().relationName(),
501                                 newAtts, 0);
502    outputFormat.setClassIndex(newClassIndex);
503    setOutputFormat(outputFormat);
504  }
505
506  /**
507   * Set the output format if the class is numeric.
508   */
509  private void setOutputFormatNumeric() {
510
511    if (m_Indices == null) {
512      setOutputFormat(null);
513      return;
514    }
515    FastVector newAtts;
516    int newClassIndex;
517    StringBuffer attributeName;
518    Instances outputFormat;
519    FastVector vals;
520
521    // Compute new attributes
522
523    newClassIndex = getInputFormat().classIndex();
524    newAtts = new FastVector();
525    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
526      Attribute att = getInputFormat().attribute(j);
527      if ((!att.isNominal()) || 
528          (j == getInputFormat().classIndex())) {
529        newAtts.addElement(att.copy());
530      } else {
531        if (j < getInputFormat().classIndex())
532          newClassIndex += att.numValues() - 2;
533         
534        // Compute values for new attributes
535         
536        for (int k = 1; k < att.numValues(); k++) {
537          attributeName = 
538            new StringBuffer(att.name() + "=");
539          for (int l = k; l < att.numValues(); l++) {
540            if (l > k) {
541              attributeName.append(',');
542            }
543            attributeName.append(att.value(m_Indices[j][l]));
544          }
545          if (m_Numeric) {
546            newAtts.
547              addElement(new Attribute(attributeName.toString()));
548          } else {
549            vals = new FastVector(2);
550            vals.addElement("f"); vals.addElement("t");
551            newAtts.
552              addElement(new Attribute(attributeName.toString(), vals));
553          }
554        }
555      }
556    }
557    outputFormat = new Instances(getInputFormat().relationName(),
558                                 newAtts, 0);
559    outputFormat.setClassIndex(newClassIndex);
560    setOutputFormat(outputFormat);
561  }
562
563  /**
564   * Convert a single instance over if the class is nominal. The converted
565   * instance is added to the end of the output queue.
566   *
567   * @param instance the instance to convert
568   */
569  private void convertInstanceNominal(Instance instance) {
570
571    double [] vals = new double [outputFormatPeek().numAttributes()];
572    int attSoFar = 0;
573
574    for(int j = 0; j < getInputFormat().numAttributes(); j++) {
575      Attribute att = getInputFormat().attribute(j);
576      if ((!att.isNominal()) || (j == getInputFormat().classIndex())) {
577        vals[attSoFar] = instance.value(j);
578        attSoFar++;
579      } else {
580        if ( (att.numValues() <= 2) && (!m_TransformAll) ) {
581          vals[attSoFar] = instance.value(j);
582          attSoFar++;
583        } else {
584          if (instance.isMissing(j)) {
585            for (int k = 0; k < att.numValues(); k++) {
586              vals[attSoFar + k] = instance.value(j);
587            }
588          } else {
589            for (int k = 0; k < att.numValues(); k++) {
590              if (k == (int)instance.value(j)) {
591                vals[attSoFar + k] = 1;
592              } else {
593                vals[attSoFar + k] = 0;
594              }
595            }
596          }
597          attSoFar += att.numValues();
598        }
599      }
600    }
601    Instance inst = null;
602    if (instance instanceof SparseInstance) {
603      inst = new SparseInstance(instance.weight(), vals);
604    } else {
605      inst = new DenseInstance(instance.weight(), vals);
606    }
607    inst.setDataset(getOutputFormat());
608    copyValues(inst, false, instance.dataset(), getOutputFormat());
609    inst.setDataset(getOutputFormat());
610    push(inst);
611  }
612
613  /**
614   * Convert a single instance over if the class is numeric. The converted
615   * instance is added to the end of the output queue.
616   *
617   * @param instance the instance to convert
618   */
619  private void convertInstanceNumeric(Instance instance) {
620
621    double [] vals = new double [outputFormatPeek().numAttributes()];
622    int attSoFar = 0;
623
624    for(int j = 0; j < getInputFormat().numAttributes(); j++) {
625      Attribute att = getInputFormat().attribute(j);
626      if ((!att.isNominal()) || (j == getInputFormat().classIndex())) {
627        vals[attSoFar] = instance.value(j);
628        attSoFar++;
629      } else {
630        if (instance.isMissing(j)) {
631          for (int k = 0; k < att.numValues() - 1; k++) {
632            vals[attSoFar + k] = instance.value(j);
633          }
634        } else {
635          int k = 0;
636          while ((int)instance.value(j) != m_Indices[j][k]) {
637            vals[attSoFar + k] = 1;
638            k++;
639          }
640          while (k < att.numValues() - 1) {
641            vals[attSoFar + k] = 0;
642            k++;
643          }
644        }
645        attSoFar += att.numValues() - 1;
646      }
647    }
648    Instance inst = null;
649    if (instance instanceof SparseInstance) {
650      inst = new SparseInstance(instance.weight(), vals);
651    } else {
652      inst = new DenseInstance(instance.weight(), vals);
653    }
654    inst.setDataset(getOutputFormat());
655    copyValues(inst, false, instance.dataset(), getOutputFormat());
656    inst.setDataset(getOutputFormat());
657    push(inst);
658  }
659 
660  /**
661   * Returns the revision string.
662   *
663   * @return            the revision
664   */
665  public String getRevision() {
666    return RevisionUtils.extract("$Revision: 5987 $");
667  }
668
669  /**
670   * Main method for testing this class.
671   *
672   * @param argv should contain arguments to the filter:
673   * use -h for help
674   */
675  public static void main(String [] argv) {
676    runFilter(new NominalToBinary(), argv);
677  }
678}
Note: See TracBrowser for help on using the repository browser.