source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/Normalize.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 17.7 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Normalize.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.unsupervised.attribute;
24
25import weka.core.Capabilities;
26import weka.core.Instance; 
27import weka.core.DenseInstance;
28import weka.core.Instances;
29import weka.core.Option;
30import weka.core.OptionHandler;
31import weka.core.RevisionUtils;
32import weka.core.SparseInstance;
33import weka.core.Utils;
34import weka.core.Capabilities.Capability;
35import weka.filters.Sourcable;
36import weka.filters.UnsupervisedFilter;
37
38import java.util.Enumeration;
39import java.util.Vector;
40
41/**
42 <!-- globalinfo-start -->
43 * Normalizes all numeric values in the given dataset (apart from the class attribute, if set). The resulting values are by default in [0,1] for the data used to compute the normalization intervals. But with the scale and translation parameters one can change that, e.g., with scale = 2.0 and translation = -1.0 you get values in the range [-1,+1].
44 * <p/>
45 <!-- globalinfo-end -->
46 *
47 <!-- options-start -->
48 * Valid options are: <p/>
49 *
50 * <pre> -unset-class-temporarily
51 *  Unsets the class index temporarily before the filter is
52 *  applied to the data.
53 *  (default: no)</pre>
54 *
55 * <pre> -S &lt;num&gt;
56 *  The scaling factor for the output range.
57 *  (default: 1.0)</pre>
58 *
59 * <pre> -T &lt;num&gt;
60 *  The translation of the output range.
61 *  (default: 0.0)</pre>
62 *
63 <!-- options-end -->
64 *
65 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
66 * @author FracPete (fracpete at waikato dot ac dot nz)
67 * @version $Revision: 5987 $
68 */
69public class Normalize 
70  extends PotentialClassIgnorer
71  implements UnsupervisedFilter, Sourcable, OptionHandler {
72 
73  /** for serialization. */
74  static final long serialVersionUID = -8158531150984362898L;
75
76  /** The minimum values for numeric attributes. */
77  protected double[] m_MinArray;
78 
79  /** The maximum values for numeric attributes. */
80  protected double[] m_MaxArray;
81
82  /** The translation of the output range. */
83  protected double m_Translation = 0;
84 
85  /** The scaling factor of the output range. */
86  protected double m_Scale = 1.0;
87 
88  /**
89   * Returns a string describing this filter.
90   *
91   * @return            a description of the filter suitable for
92   *                    displaying in the explorer/experimenter gui
93   */
94  public String globalInfo() {
95    return 
96        "Normalizes all numeric values in the given dataset (apart from the "
97      + "class attribute, if set). The resulting values are by default "
98      + "in [0,1] for the data used to compute the normalization intervals. "
99      + "But with the scale and translation parameters one can change that, "
100      + "e.g., with scale = 2.0 and translation = -1.0 you get values in the "
101      + "range [-1,+1].";
102  }
103
104  /**
105   * Returns an enumeration describing the available options.
106   *
107   * @return            an enumeration of all the available options.
108   */
109  public Enumeration listOptions() {
110    Vector result = new Vector();
111
112    Enumeration en = super.listOptions();
113    while (en.hasMoreElements())
114      result.addElement(en.nextElement());
115
116    result.addElement(new Option(
117        "\tThe scaling factor for the output range.\n"
118        + "\t(default: 1.0)",
119        "S", 1, "-S <num>"));
120
121    result.addElement(new Option(
122        "\tThe translation of the output range.\n"
123        +"\t(default: 0.0)",
124        "T", 1,"-T <num>"));
125
126    return result.elements();
127  }
128
129
130  /**
131   * Parses a given list of options. <p/>
132   *
133   <!-- options-start -->
134   * Valid options are: <p/>
135   *
136   * <pre> -unset-class-temporarily
137   *  Unsets the class index temporarily before the filter is
138   *  applied to the data.
139   *  (default: no)</pre>
140   *
141   * <pre> -S &lt;num&gt;
142   *  The scaling factor for the output range.
143   *  (default: 1.0)</pre>
144   *
145   * <pre> -T &lt;num&gt;
146   *  The translation of the output range.
147   *  (default: 0.0)</pre>
148   *
149   <!-- options-end -->
150   *
151   * @param options the list of options as an array of strings
152   * @throws Exception if an option is not supported
153   */
154  public void setOptions(String[] options) throws Exception {
155    String      tmpStr;
156
157    tmpStr = Utils.getOption('S', options);
158    if (tmpStr.length() != 0)
159      setScale(Double.parseDouble(tmpStr));
160    else
161      setScale(1.0);
162   
163    tmpStr = Utils.getOption('T', options);
164    if (tmpStr.length() != 0)
165      setTranslation(Double.parseDouble(tmpStr));
166    else
167      setTranslation(0.0);
168
169    if (getInputFormat() != null)
170      setInputFormat(getInputFormat());
171  }
172
173  /**
174   * Gets the current settings of the filter.
175   *
176   * @return an array of strings suitable for passing to setOptions
177   */
178  public String[] getOptions() {
179    Vector<String>      result;
180   
181    result = new Vector<String>();
182
183    result.add("-S");
184    result.add("" + getScale());
185
186    result.add("-T");
187    result.add("" + getTranslation());
188   
189    return result.toArray(new String[result.size()]);
190  }
191
192  /**
193   * Returns the Capabilities of this filter.
194   *
195   * @return            the capabilities of this object
196   * @see               Capabilities
197   */
198  public Capabilities getCapabilities() {
199    Capabilities result = super.getCapabilities();
200    result.disableAll();
201
202    // attributes
203    result.enableAllAttributes();
204    result.enable(Capability.MISSING_VALUES);
205   
206    // class
207    result.enableAllClasses();
208    result.enable(Capability.MISSING_CLASS_VALUES);
209    result.enable(Capability.NO_CLASS);
210   
211    return result;
212  }
213
214  /**
215   * Sets the format of the input instances.
216   *
217   * @param instanceInfo        an Instances object containing the input
218   *                            instance structure (any instances contained in
219   *                            the object are ignored - only the structure is
220   *                            required).
221   * @return                    true if the outputFormat may be collected
222   *                            immediately
223   * @throws Exception          if the input format can't be set successfully
224   */
225  public boolean setInputFormat(Instances instanceInfo) 
226       throws Exception {
227
228    super.setInputFormat(instanceInfo);
229    setOutputFormat(instanceInfo);
230    m_MinArray = m_MaxArray = null;
231    return true;
232  }
233
234  /**
235   * Input an instance for filtering. Filter requires all
236   * training instances be read before producing output.
237   *
238   * @param instance    the input instance
239   * @return            true if the filtered instance may now be
240   *                    collected with output().
241   * @throws Exception  if an error occurs
242   * @throws IllegalStateException      if no input format has been set.
243   */
244  public boolean input(Instance instance) throws Exception {
245    if (getInputFormat() == null)
246      throw new IllegalStateException("No input instance format defined");
247   
248    if (m_NewBatch) {
249      resetQueue();
250      m_NewBatch = false;
251    }
252    if (m_MinArray == null) {
253      bufferInput(instance);
254      return false;
255    }
256    else {
257      convertInstance(instance);
258      return true;
259    }
260  }
261
262  /**
263   * Signify that this batch of input to the filter is finished.
264   * If the filter requires all instances prior to filtering,
265   * output() may now be called to retrieve the filtered instances.
266   *
267   * @return            true if there are instances pending output
268   * @throws Exception  if an error occurs
269   * @throws IllegalStateException      if no input structure has been defined
270   */
271  public boolean batchFinished() throws Exception {
272    if (getInputFormat() == null)
273      throw new IllegalStateException("No input instance format defined");
274
275    if (m_MinArray == null) {
276      Instances input = getInputFormat();
277      // Compute minimums and maximums
278      m_MinArray = new double[input.numAttributes()];
279      m_MaxArray = new double[input.numAttributes()];
280      for (int i = 0; i < input.numAttributes(); i++)
281        m_MinArray[i] = Double.NaN;
282
283      for (int j = 0; j < input.numInstances(); j++) {
284        double[] value = input.instance(j).toDoubleArray();
285        for (int i = 0; i < input.numAttributes(); i++) {
286          if (input.attribute(i).isNumeric() &&
287              (input.classIndex() != i)) {
288            if (!Utils.isMissingValue(value[i])) {
289              if (Double.isNaN(m_MinArray[i])) {
290                m_MinArray[i] = m_MaxArray[i] = value[i];
291              }
292              else {
293                if (value[i] < m_MinArray[i])
294                  m_MinArray[i] = value[i];
295                if (value[i] > m_MaxArray[i])
296                  m_MaxArray[i] = value[i];
297              }
298            }
299          }
300        } 
301      }
302
303      // Convert pending input instances
304      for (int i = 0; i < input.numInstances(); i++)
305        convertInstance(input.instance(i));
306    } 
307    // Free memory
308    flushInput();
309
310    m_NewBatch = true;
311    return (numPendingOutput() != 0);
312  }
313
314  /**
315   * Convert a single instance over. The converted instance is
316   * added to the end of the output queue.
317   *
318   * @param instance    the instance to convert
319   * @throws Exception  if conversion fails
320   */
321  protected void convertInstance(Instance instance) throws Exception {
322    Instance inst = null;
323    if (instance instanceof SparseInstance) {
324      double[] newVals = new double[instance.numAttributes()];
325      int[] newIndices = new int[instance.numAttributes()];
326      double[] vals = instance.toDoubleArray();
327      int ind = 0;
328      for (int j = 0; j < instance.numAttributes(); j++) {
329        double value;
330        if (instance.attribute(j).isNumeric() &&
331            (!Utils.isMissingValue(vals[j])) &&
332            (getInputFormat().classIndex() != j)) {
333          if (Double.isNaN(m_MinArray[j]) ||
334              (m_MaxArray[j] == m_MinArray[j])) {
335            value = 0;
336          }
337          else {
338            value = (vals[j] - m_MinArray[j]) / 
339              (m_MaxArray[j] - m_MinArray[j]) * m_Scale + m_Translation;
340            if (Double.isNaN(value)) {
341              throw new Exception("A NaN value was generated "
342                                  + "while normalizing " 
343                                  + instance.attribute(j).name());
344            }
345          }
346          if (value != 0.0) {
347            newVals[ind] = value;
348            newIndices[ind] = j;
349            ind++;
350          }
351        }
352        else {
353          value = vals[j];
354          if (value != 0.0) {
355            newVals[ind] = value;
356            newIndices[ind] = j;
357            ind++;
358          }
359        }
360      } 
361      double[] tempVals = new double[ind];
362      int[] tempInd = new int[ind];
363      System.arraycopy(newVals, 0, tempVals, 0, ind);
364      System.arraycopy(newIndices, 0, tempInd, 0, ind);
365      inst = new SparseInstance(instance.weight(), tempVals, tempInd,
366                                instance.numAttributes());
367    }
368    else {
369      double[] vals = instance.toDoubleArray();
370      for (int j = 0; j < getInputFormat().numAttributes(); j++) {
371        if (instance.attribute(j).isNumeric() &&
372            (!Utils.isMissingValue(vals[j])) &&
373            (getInputFormat().classIndex() != j)) {
374          if (Double.isNaN(m_MinArray[j]) ||
375              (m_MaxArray[j] == m_MinArray[j])) {
376            vals[j] = 0;
377          }
378          else {
379            vals[j] = (vals[j] - m_MinArray[j]) / 
380              (m_MaxArray[j] - m_MinArray[j]) * m_Scale + m_Translation;
381            if (Double.isNaN(vals[j])) {
382              throw new Exception("A NaN value was generated "
383                                  + "while normalizing " 
384                                  + instance.attribute(j).name());
385            }
386          }
387        }
388      } 
389      inst = new DenseInstance(instance.weight(), vals);
390    }
391    inst.setDataset(instance.dataset());
392    push(inst);
393  }
394 
395  /**
396   * Returns a string that describes the filter as source. The
397   * filter will be contained in a class with the given name (there may
398   * be auxiliary classes),
399   * and will contain two methods with these signatures:
400   * <pre><code>
401   * // converts one row
402   * public static Object[] filter(Object[] i);
403   * // converts a full dataset (first dimension is row index)
404   * public static Object[][] filter(Object[][] i);
405   * </code></pre>
406   * where the array <code>i</code> contains elements that are either
407   * Double, String, with missing values represented as null. The generated
408   * code is public domain and comes with no warranty.
409   *
410   * @param className   the name that should be given to the source class.
411   * @param data        the dataset used for initializing the filter
412   * @return            the object source described by a string
413   * @throws Exception  if the source can't be computed
414   */
415  public String toSource(String className, Instances data) throws Exception {
416    StringBuffer        result;
417    boolean[]           process;
418    int                 i;
419   
420    result = new StringBuffer();
421   
422    // determine what attributes were processed
423    process = new boolean[data.numAttributes()];
424    for (i = 0; i < data.numAttributes(); i++) 
425      process[i] = (data.attribute(i).isNumeric() && (i != data.classIndex()));
426 
427    result.append("class " + className + " {\n");
428    result.append("\n");
429    result.append("  /** lists which attributes will be processed */\n");
430    result.append("  protected final static boolean[] PROCESS = new boolean[]{" + Utils.arrayToString(process) + "};\n");
431    result.append("\n");
432    result.append("  /** the minimum values for numeric values */\n");
433    result.append("  protected final static double[] MIN = new double[]{" + Utils.arrayToString(m_MinArray).replaceAll("NaN", "Double.NaN") + "};\n");
434    result.append("\n");
435    result.append("  /** the maximum values for numeric values */\n");
436    result.append("  protected final static double[] MAX = new double[]{" + Utils.arrayToString(m_MaxArray) + "};\n");
437    result.append("\n");
438    result.append("  /** the scale factor */\n");
439    result.append("  protected final static double SCALE = " + m_Scale + ";\n");
440    result.append("\n");
441    result.append("  /** the translation */\n");
442    result.append("  protected final static double TRANSLATION = " + m_Translation + ";\n");
443    result.append("\n");
444    result.append("  /**\n");
445    result.append("   * filters a single row\n");
446    result.append("   * \n");
447    result.append("   * @param i the row to process\n");
448    result.append("   * @return the processed row\n");
449    result.append("   */\n");
450    result.append("  public static Object[] filter(Object[] i) {\n");
451    result.append("    Object[] result;\n");
452    result.append("\n");
453    result.append("    result = new Object[i.length];\n");
454    result.append("    for (int n = 0; n < i.length; n++) {\n");
455    result.append("      if (PROCESS[n] && (i[n] != null)) {\n");
456    result.append("        if (Double.isNaN(MIN[n]) || (MIN[n] == MAX[n]))\n");
457    result.append("          result[n] = 0;\n");
458    result.append("        else\n");
459    result.append("          result[n] = (((Double) i[n]) - MIN[n]) / (MAX[n] - MIN[n]) * SCALE + TRANSLATION;\n");
460    result.append("      }\n");
461    result.append("      else {\n");
462    result.append("        result[n] = i[n];\n");
463    result.append("      }\n");
464    result.append("    }\n");
465    result.append("\n");
466    result.append("    return result;\n");
467    result.append("  }\n");
468    result.append("\n");
469    result.append("  /**\n");
470    result.append("   * filters multiple rows\n");
471    result.append("   * \n");
472    result.append("   * @param i the rows to process\n");
473    result.append("   * @return the processed rows\n");
474    result.append("   */\n");
475    result.append("  public static Object[][] filter(Object[][] i) {\n");
476    result.append("    Object[][] result;\n");
477    result.append("\n");
478    result.append("    result = new Object[i.length][];\n");
479    result.append("    for (int n = 0; n < i.length; n++) {\n");
480    result.append("      result[n] = filter(i[n]);\n");
481    result.append("    }\n");
482    result.append("\n");
483    result.append("    return result;\n");
484    result.append("  }\n");
485    result.append("}\n");
486   
487    return result.toString();
488  }
489
490  /**
491   * Returns the calculated minimum values for the attributes in the data.
492   *
493   * @return            the array with the minimum values
494   */
495  public double[] getMinArray() {
496    return m_MinArray;
497  }
498
499  /**
500   * Returns the calculated maximum values for the attributes in the data.
501   *
502   * @return            the array with the maximum values
503   */
504  public double[] getMaxArray() {
505    return m_MaxArray;
506  }
507
508  /**
509   * Returns the tip text for this property.
510   *
511   * @return            tip text for this property suitable for
512   *                    displaying in the explorer/experimenter gui
513   */
514  public String scaleTipText() {
515    return "The factor for scaling the output range (default: 1).";
516  }
517
518  /**
519   * Get the scaling factor.
520   *
521   * @return            the factor
522   */
523  public double getScale() {
524    return m_Scale;
525  }
526
527  /**
528   * Sets the scaling factor.
529   *
530   * @param value       the scaling factor
531   */
532  public void setScale(double value) {
533    m_Scale = value;
534  }
535
536  /**
537   * Returns the tip text for this property.
538   *
539   * @return            tip text for this property suitable for
540   *                    displaying in the explorer/experimenter gui
541   */
542  public String translationTipText() {
543    return "The translation of the output range (default: 0).";
544  }
545
546  /**
547   * Get the translation.
548   *
549   * @return            the translation
550   */
551  public double getTranslation() {
552    return m_Translation;
553  }
554
555  /**
556   * Sets the translation.
557   *
558   * @param value       the translation
559   */
560  public void setTranslation(double value) {
561    m_Translation = value;
562  }
563 
564  /**
565   * Returns the revision string.
566   *
567   * @return            the revision
568   */
569  public String getRevision() {
570    return RevisionUtils.extract("$Revision: 5987 $");
571  }
572 
573  /**
574   * Main method for running this filter.
575   *
576   * @param args        should contain arguments to the filter, use -h for help
577   */
578  public static void main(String[] args) {
579    runFilter(new Normalize(), args);
580  }
581}
Note: See TracBrowser for help on using the repository browser.