source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/Standardize.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 12.1 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Standardize.java
19 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.unsupervised.attribute;
24
25import weka.core.Capabilities;
26import weka.core.Instance; 
27import weka.core.DenseInstance;
28import weka.core.Instances;
29import weka.core.RevisionUtils;
30import weka.core.SparseInstance;
31import weka.core.Utils;
32import weka.core.Capabilities.Capability;
33import weka.filters.Sourcable;
34import weka.filters.UnsupervisedFilter;
35
36/**
37 <!-- globalinfo-start -->
38 * Standardizes all numeric attributes in the given dataset to have zero mean and unit variance (apart from the class attribute, if set).
39 * <p/>
40 <!-- globalinfo-end -->
41 *
42 <!-- options-start -->
43 * Valid options are: <p/>
44 *
45 * <pre> -unset-class-temporarily
46 *  Unsets the class index temporarily before the filter is
47 *  applied to the data.
48 *  (default: no)</pre>
49 *
50 <!-- options-end -->
51 *
52 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
53 * @version $Revision: 5987 $
54 */
55public class Standardize 
56  extends PotentialClassIgnorer
57  implements UnsupervisedFilter, Sourcable {
58 
59  /** for serialization */
60  static final long serialVersionUID = -6830769026855053281L;
61
62  /** The means */
63  private double [] m_Means;
64 
65  /** The variances */
66  private double [] m_StdDevs;
67
68  /**
69   * Returns a string describing this filter
70   *
71   * @return a description of the filter suitable for
72   * displaying in the explorer/experimenter gui
73   */
74  public String globalInfo() {
75
76    return "Standardizes all numeric attributes in the given dataset "
77      + "to have zero mean and unit variance (apart from the class attribute, if set).";
78  }
79
80  /**
81   * Returns the Capabilities of this filter.
82   *
83   * @return            the capabilities of this object
84   * @see               Capabilities
85   */
86  public Capabilities getCapabilities() {
87    Capabilities result = super.getCapabilities();
88    result.disableAll();
89
90    // attributes
91    result.enableAllAttributes();
92    result.enable(Capability.MISSING_VALUES);
93   
94    // class
95    result.enableAllClasses();
96    result.enable(Capability.MISSING_CLASS_VALUES);
97    result.enable(Capability.NO_CLASS);
98   
99    return result;
100  }
101
102  /**
103   * Sets the format of the input instances.
104   *
105   * @param instanceInfo an Instances object containing the input
106   * instance structure (any instances contained in the object are
107   * ignored - only the structure is required).
108   * @return true if the outputFormat may be collected immediately
109   * @throws Exception if the input format can't be set
110   * successfully
111   */
112  public boolean setInputFormat(Instances instanceInfo) 
113       throws Exception {
114
115    super.setInputFormat(instanceInfo);
116    setOutputFormat(instanceInfo);
117    m_Means = m_StdDevs = null;
118    return true;
119  }
120
121  /**
122   * Input an instance for filtering. Filter requires all
123   * training instances be read before producing output.
124   *
125   * @param instance the input instance
126   * @return true if the filtered instance may now be
127   * collected with output().
128   * @throws IllegalStateException if no input format has been set.
129   */
130  public boolean input(Instance instance) throws Exception {
131
132    if (getInputFormat() == null) {
133      throw new IllegalStateException("No input instance format defined");
134    }
135    if (m_NewBatch) {
136      resetQueue();
137      m_NewBatch = false;
138    }
139    if (m_Means == null) {
140      bufferInput(instance);
141      return false;
142    } else {
143      convertInstance(instance);
144      return true;
145    }
146  }
147
148  /**
149   * Signify that this batch of input to the filter is finished.
150   * If the filter requires all instances prior to filtering,
151   * output() may now be called to retrieve the filtered instances.
152   *
153   * @return true if there are instances pending output
154   * @exception Exception if an error occurs
155   * @exception IllegalStateException if no input structure has been defined
156   */
157  public boolean batchFinished() throws Exception {
158
159    if (getInputFormat() == null) {
160      throw new IllegalStateException("No input instance format defined");
161    }
162    if (m_Means == null) {
163      Instances input = getInputFormat();
164      m_Means = new double[input.numAttributes()];
165      m_StdDevs = new double[input.numAttributes()];
166      for (int i = 0; i < input.numAttributes(); i++) {
167        if (input.attribute(i).isNumeric() &&
168            (input.classIndex() != i)) {
169          m_Means[i] = input.meanOrMode(i);
170          m_StdDevs[i] = Math.sqrt(input.variance(i));
171        }
172      }
173
174      // Convert pending input instances
175      for(int i = 0; i < input.numInstances(); i++) {
176        convertInstance(input.instance(i));
177      }
178    } 
179    // Free memory
180    flushInput();
181
182    m_NewBatch = true;
183    return (numPendingOutput() != 0);
184  }
185
186  /**
187   * Convert a single instance over. The converted instance is
188   * added to the end of the output queue.
189   *
190   * @param instance the instance to convert
191   * @exception Exception if an error occurs
192   */
193  private void convertInstance(Instance instance) throws Exception {
194 
195    Instance inst = null;
196    if (instance instanceof SparseInstance) {
197      double[] newVals = new double[instance.numAttributes()];
198      int[] newIndices = new int[instance.numAttributes()];
199      double[] vals = instance.toDoubleArray();
200      int ind = 0;
201      for (int j = 0; j < instance.numAttributes(); j++) {
202        double value;
203        if (instance.attribute(j).isNumeric() &&
204            (!Utils.isMissingValue(vals[j])) &&
205            (getInputFormat().classIndex() != j)) {
206         
207          // Just subtract the mean if the standard deviation is zero
208          if (m_StdDevs[j] > 0) { 
209            value = (vals[j] - m_Means[j]) / m_StdDevs[j];
210          } else {
211            value = vals[j] - m_Means[j];
212          }
213          if (Double.isNaN(value)) {
214            throw new Exception("A NaN value was generated "
215                                + "while standardizing attribute " 
216                                + instance.attribute(j).name());
217          }
218          if (value != 0.0) {
219            newVals[ind] = value;
220            newIndices[ind] = j;
221            ind++;
222          }
223        } else {
224          value = vals[j];
225          if (value != 0.0) {
226            newVals[ind] = value;
227            newIndices[ind] = j;
228            ind++;
229          }
230        }
231      } 
232      double[] tempVals = new double[ind];
233      int[] tempInd = new int[ind];
234      System.arraycopy(newVals, 0, tempVals, 0, ind);
235      System.arraycopy(newIndices, 0, tempInd, 0, ind);
236      inst = new SparseInstance(instance.weight(), tempVals, tempInd,
237                                instance.numAttributes());
238    } else {
239      double[] vals = instance.toDoubleArray();
240      for (int j = 0; j < getInputFormat().numAttributes(); j++) {
241        if (instance.attribute(j).isNumeric() &&
242            (!Utils.isMissingValue(vals[j])) &&
243            (getInputFormat().classIndex() != j)) {
244         
245          // Just subtract the mean if the standard deviation is zero
246          if (m_StdDevs[j] > 0) { 
247            vals[j] = (vals[j] - m_Means[j]) / m_StdDevs[j];
248          } else {
249            vals[j] = (vals[j] - m_Means[j]);
250          }
251          if (Double.isNaN(vals[j])) {
252            throw new Exception("A NaN value was generated "
253                                + "while standardizing attribute " 
254                                + instance.attribute(j).name());
255          }
256        }
257      } 
258      inst = new DenseInstance(instance.weight(), vals);
259    }
260    inst.setDataset(instance.dataset());
261    push(inst);
262  }
263 
264  /**
265   * Returns a string that describes the filter as source. The
266   * filter will be contained in a class with the given name (there may
267   * be auxiliary classes),
268   * and will contain two methods with these signatures:
269   * <pre><code>
270   * // converts one row
271   * public static Object[] filter(Object[] i);
272   * // converts a full dataset (first dimension is row index)
273   * public static Object[][] filter(Object[][] i);
274   * </code></pre>
275   * where the array <code>i</code> contains elements that are either
276   * Double, String, with missing values represented as null. The generated
277   * code is public domain and comes with no warranty.
278   *
279   * @param className   the name that should be given to the source class.
280   * @param data        the dataset used for initializing the filter
281   * @return            the object source described by a string
282   * @throws Exception  if the source can't be computed
283   */
284  public String toSource(String className, Instances data) throws Exception {
285    StringBuffer        result;
286    boolean[]           process;
287    int                 i;
288   
289    result = new StringBuffer();
290   
291    // determine what attributes were processed
292    process = new boolean[data.numAttributes()];
293    for (i = 0; i < data.numAttributes(); i++) {
294      process[i] = (data.attribute(i).isNumeric() && (i != data.classIndex()));
295    }
296   
297    result.append("class " + className + " {\n");
298    result.append("\n");
299    result.append("  /** lists which attributes will be processed */\n");
300    result.append("  protected final static boolean[] PROCESS = new boolean[]{" + Utils.arrayToString(process) + "};\n");
301    result.append("\n");
302    result.append("  /** the computed means */\n");
303    result.append("  protected final static double[] MEANS = new double[]{" + Utils.arrayToString(m_Means) + "};\n");
304    result.append("\n");
305    result.append("  /** the computed standard deviations */\n");
306    result.append("  protected final static double[] STDEVS = new double[]{" + Utils.arrayToString(m_StdDevs) + "};\n");
307    result.append("\n");
308    result.append("  /**\n");
309    result.append("   * filters a single row\n");
310    result.append("   * \n");
311    result.append("   * @param i the row to process\n");
312    result.append("   * @return the processed row\n");
313    result.append("   */\n");
314    result.append("  public static Object[] filter(Object[] i) {\n");
315    result.append("    Object[] result;\n");
316    result.append("\n");
317    result.append("    result = new Object[i.length];\n");
318    result.append("    for (int n = 0; n < i.length; n++) {\n");
319    result.append("      if (PROCESS[n] && (i[n] != null)) {\n");
320    result.append("        if (STDEVS[n] > 0)\n");
321    result.append("          result[n] = (((Double) i[n]) - MEANS[n]) / STDEVS[n];\n");
322    result.append("        else\n");
323    result.append("          result[n] = ((Double) i[n]) - MEANS[n];\n");
324    result.append("      }\n");
325    result.append("      else {\n");
326    result.append("        result[n] = i[n];\n");
327    result.append("      }\n");
328    result.append("    }\n");
329    result.append("\n");
330    result.append("    return result;\n");
331    result.append("  }\n");
332    result.append("\n");
333    result.append("  /**\n");
334    result.append("   * filters multiple rows\n");
335    result.append("   * \n");
336    result.append("   * @param i the rows to process\n");
337    result.append("   * @return the processed rows\n");
338    result.append("   */\n");
339    result.append("  public static Object[][] filter(Object[][] i) {\n");
340    result.append("    Object[][] result;\n");
341    result.append("\n");
342    result.append("    result = new Object[i.length][];\n");
343    result.append("    for (int n = 0; n < i.length; n++) {\n");
344    result.append("      result[n] = filter(i[n]);\n");
345    result.append("    }\n");
346    result.append("\n");
347    result.append("    return result;\n");
348    result.append("  }\n");
349    result.append("}\n");
350   
351    return result.toString();
352  }
353 
354  /**
355   * Returns the revision string.
356   *
357   * @return            the revision
358   */
359  public String getRevision() {
360    return RevisionUtils.extract("$Revision: 5987 $");
361  }
362
363  /**
364   * Main method for testing this class.
365   *
366   * @param argv should contain arguments to the filter:
367   * use -h for help
368   */
369  public static void main(String [] argv) {
370    runFilter(new Standardize(), argv);
371  }
372}
Note: See TracBrowser for help on using the repository browser.