source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/ReplaceMissingValues.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 14.1 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    ReplaceMissingValues.java
19 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23
24package weka.filters.unsupervised.attribute;
25
26import weka.core.Capabilities;
27import weka.core.Instance; 
28import weka.core.DenseInstance;
29import weka.core.DenseInstance;
30import weka.core.Instances;
31import weka.core.RevisionUtils;
32import weka.core.SparseInstance;
33import weka.core.Utils;
34import weka.core.Capabilities.Capability;
35import weka.filters.Sourcable;
36import weka.filters.UnsupervisedFilter;
37
38/**
39 <!-- globalinfo-start -->
40 * Replaces all missing values for nominal and numeric attributes in a dataset with the modes and means from the training data.
41 * <p/>
42 <!-- globalinfo-end -->
43 *
44 <!-- options-start -->
45 * Valid options are: <p/>
46 *
47 * <pre> -unset-class-temporarily
48 *  Unsets the class index temporarily before the filter is
49 *  applied to the data.
50 *  (default: no)</pre>
51 *
52 <!-- options-end -->
53 *
54 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
55 * @version $Revision: 5987 $
56 */
57public class ReplaceMissingValues 
58  extends PotentialClassIgnorer
59  implements UnsupervisedFilter, Sourcable {
60
61  /** for serialization */
62  static final long serialVersionUID = 8349568310991609867L;
63 
64  /** The modes and means */
65  private double[] m_ModesAndMeans = null;
66
67  /**
68   * Returns a string describing this filter
69   *
70   * @return a description of the filter suitable for
71   * displaying in the explorer/experimenter gui
72   */
73  public String globalInfo() {
74
75    return "Replaces all missing values for nominal and numeric attributes in a "
76      + "dataset with the modes and means from the training data.";
77  }
78
79  /**
80   * Returns the Capabilities of this filter.
81   *
82   * @return            the capabilities of this object
83   * @see               Capabilities
84   */
85  public Capabilities getCapabilities() {
86    Capabilities result = super.getCapabilities();
87    result.disableAll();
88
89    // attributes
90    result.enableAllAttributes();
91    result.enable(Capability.MISSING_VALUES);
92   
93    // class
94    result.enableAllClasses();
95    result.enable(Capability.MISSING_CLASS_VALUES);
96    result.enable(Capability.NO_CLASS);
97   
98    return result;
99  }
100
101  /**
102   * Sets the format of the input instances.
103   *
104   * @param instanceInfo an Instances object containing the input
105   * instance structure (any instances contained in the object are
106   * ignored - only the structure is required).
107   * @return true if the outputFormat may be collected immediately
108   * @throws Exception if the input format can't be set
109   * successfully
110   */
111  public boolean setInputFormat(Instances instanceInfo) 
112       throws Exception {
113
114    super.setInputFormat(instanceInfo);
115    setOutputFormat(instanceInfo);
116    m_ModesAndMeans = null;
117    return true;
118  }
119
120  /**
121   * Input an instance for filtering. Filter requires all
122   * training instances be read before producing output.
123   *
124   * @param instance the input instance
125   * @return true if the filtered instance may now be
126   * collected with output().
127   * @throws IllegalStateException if no input format has been set.
128   */
129  public boolean input(Instance instance) {
130
131    if (getInputFormat() == null) {
132      throw new IllegalStateException("No input instance format defined");
133    }
134    if (m_NewBatch) {
135      resetQueue();
136      m_NewBatch = false;
137    }
138    if (m_ModesAndMeans == null) {
139      bufferInput(instance);
140      return false;
141    } else {
142      convertInstance(instance);
143      return true;
144    }
145  }
146
147  /**
148   * Signify that this batch of input to the filter is finished.
149   * If the filter requires all instances prior to filtering,
150   * output() may now be called to retrieve the filtered instances.
151   *
152   * @return true if there are instances pending output
153   * @throws IllegalStateException if no input structure has been defined
154   */
155  public boolean batchFinished() {
156
157    if (getInputFormat() == null) {
158      throw new IllegalStateException("No input instance format defined");
159    }
160
161    if (m_ModesAndMeans == null) {
162      // Compute modes and means
163      double sumOfWeights =  getInputFormat().sumOfWeights();
164      double[][] counts = new double[getInputFormat().numAttributes()][];
165      for (int i = 0; i < getInputFormat().numAttributes(); i++) {
166        if (getInputFormat().attribute(i).isNominal()) {
167          counts[i] = new double[getInputFormat().attribute(i).numValues()];
168          if (counts[i].length > 0)
169            counts[i][0] = sumOfWeights;
170        }
171      }
172      double[] sums = new double[getInputFormat().numAttributes()];
173      for (int i = 0; i < sums.length; i++) {
174        sums[i] = sumOfWeights;
175      }
176      double[] results = new double[getInputFormat().numAttributes()];
177      for (int j = 0; j < getInputFormat().numInstances(); j++) {
178        Instance inst = getInputFormat().instance(j);
179        for (int i = 0; i < inst.numValues(); i++) {
180          if (!inst.isMissingSparse(i)) {
181            double value = inst.valueSparse(i);
182            if (inst.attributeSparse(i).isNominal()) {
183              if (counts[inst.index(i)].length > 0) {
184                counts[inst.index(i)][(int)value] += inst.weight();
185                counts[inst.index(i)][0] -= inst.weight();
186              }
187            } else if (inst.attributeSparse(i).isNumeric()) {
188              results[inst.index(i)] += inst.weight() * inst.valueSparse(i);
189            }
190          } else {
191            if (inst.attributeSparse(i).isNominal()) {
192              if (counts[inst.index(i)].length > 0) {
193                counts[inst.index(i)][0] -= inst.weight();
194              }
195            } else if (inst.attributeSparse(i).isNumeric()) {
196              sums[inst.index(i)] -= inst.weight();
197            }
198          }
199        }
200      }
201      m_ModesAndMeans = new double[getInputFormat().numAttributes()];
202      for (int i = 0; i < getInputFormat().numAttributes(); i++) {
203        if (getInputFormat().attribute(i).isNominal()) {
204          if (counts[i].length == 0)
205            m_ModesAndMeans[i] = Utils.missingValue();
206          else
207            m_ModesAndMeans[i] = (double)Utils.maxIndex(counts[i]);
208        } else if (getInputFormat().attribute(i).isNumeric()) {
209          if (Utils.gr(sums[i], 0)) {
210            m_ModesAndMeans[i] = results[i] / sums[i];
211          }
212        }
213      }
214
215      // Convert pending input instances
216      for(int i = 0; i < getInputFormat().numInstances(); i++) {
217        convertInstance(getInputFormat().instance(i));
218      }
219    } 
220    // Free memory
221    flushInput();
222
223    m_NewBatch = true;
224    return (numPendingOutput() != 0);
225  }
226
227  /**
228   * Convert a single instance over. The converted instance is
229   * added to the end of the output queue.
230   *
231   * @param instance the instance to convert
232   */
233  private void convertInstance(Instance instance) {
234 
235    Instance inst = null;
236    if (instance instanceof SparseInstance) {
237      double []vals = new double[instance.numValues()];
238      int []indices = new int[instance.numValues()];
239      int num = 0;
240      for (int j = 0; j < instance.numValues(); j++) {
241        if (instance.isMissingSparse(j) &&
242            (getInputFormat().classIndex() != instance.index(j)) &&
243            (instance.attributeSparse(j).isNominal() ||
244             instance.attributeSparse(j).isNumeric())) {
245          if (m_ModesAndMeans[instance.index(j)] != 0.0) {
246            vals[num] = m_ModesAndMeans[instance.index(j)];
247            indices[num] = instance.index(j);
248            num++;
249          } 
250        } else {
251          vals[num] = instance.valueSparse(j);
252          indices[num] = instance.index(j);
253          num++;
254        }
255      } 
256      if (num == instance.numValues()) {
257        inst = new SparseInstance(instance.weight(), vals, indices,
258                                  instance.numAttributes());
259      } else {
260        double []tempVals = new double[num];
261        int []tempInd = new int[num];
262        System.arraycopy(vals, 0, tempVals, 0, num);
263        System.arraycopy(indices, 0, tempInd, 0, num);
264        inst = new SparseInstance(instance.weight(), tempVals, tempInd,
265                                  instance.numAttributes());
266      }
267    } else {
268      double []vals = new double[getInputFormat().numAttributes()];
269      for (int j = 0; j < instance.numAttributes(); j++) {
270        if (instance.isMissing(j) &&
271            (getInputFormat().classIndex() != j) &&
272            (getInputFormat().attribute(j).isNominal() ||
273             getInputFormat().attribute(j).isNumeric())) {
274          vals[j] = m_ModesAndMeans[j]; 
275        } else {
276          vals[j] = instance.value(j);
277        }
278      } 
279      inst = new DenseInstance(instance.weight(), vals);
280    } 
281    inst.setDataset(instance.dataset());
282    push(inst);
283  }
284 
285  /**
286   * Returns a string that describes the filter as source. The
287   * filter will be contained in a class with the given name (there may
288   * be auxiliary classes),
289   * and will contain two methods with these signatures:
290   * <pre><code>
291   * // converts one row
292   * public static Object[] filter(Object[] i);
293   * // converts a full dataset (first dimension is row index)
294   * public static Object[][] filter(Object[][] i);
295   * </code></pre>
296   * where the array <code>i</code> contains elements that are either
297   * Double, String, with missing values represented as null. The generated
298   * code is public domain and comes with no warranty.
299   *
300   * @param className   the name that should be given to the source class.
301   * @param data        the dataset used for initializing the filter
302   * @return            the object source described by a string
303   * @throws Exception  if the source can't be computed
304   */
305  public String toSource(String className, Instances data) throws Exception {
306    StringBuffer        result;
307    boolean[]           numeric;
308    boolean[]           nominal;
309    String[]            modes;
310    double[]            means;
311    int                 i;
312   
313    result = new StringBuffer();
314   
315    // determine what attributes were processed
316    numeric = new boolean[data.numAttributes()];
317    nominal = new boolean[data.numAttributes()];
318    modes   = new String[data.numAttributes()];
319    means   = new double[data.numAttributes()];
320    for (i = 0; i < data.numAttributes(); i++) {
321      numeric[i] = (data.attribute(i).isNumeric() && (i != data.classIndex()));
322      nominal[i] = (data.attribute(i).isNominal() && (i != data.classIndex()));
323     
324      if (numeric[i])
325        means[i] = m_ModesAndMeans[i];
326      else
327        means[i] = Double.NaN;
328
329      if (nominal[i])
330        modes[i] = data.attribute(i).value((int) m_ModesAndMeans[i]);
331      else
332        modes[i] = null;
333    }
334   
335    result.append("class " + className + " {\n");
336    result.append("\n");
337    result.append("  /** lists which numeric attributes will be processed */\n");
338    result.append("  protected final static boolean[] NUMERIC = new boolean[]{" + Utils.arrayToString(numeric) + "};\n");
339    result.append("\n");
340    result.append("  /** lists which nominal attributes will be processed */\n");
341    result.append("  protected final static boolean[] NOMINAL = new boolean[]{" + Utils.arrayToString(nominal) + "};\n");
342    result.append("\n");
343    result.append("  /** the means */\n");
344    result.append("  protected final static double[] MEANS = new double[]{" + Utils.arrayToString(means).replaceAll("NaN", "Double.NaN") + "};\n");
345    result.append("\n");
346    result.append("  /** the modes */\n");
347    result.append("  protected final static String[] MODES = new String[]{");
348    for (i = 0; i < modes.length; i++) {
349      if (i > 0)
350        result.append(",");
351      if (nominal[i])
352        result.append("\"" + Utils.quote(modes[i]) + "\"");
353      else
354        result.append(modes[i]);
355    }
356    result.append("};\n");
357    result.append("\n");
358    result.append("  /**\n");
359    result.append("   * filters a single row\n");
360    result.append("   * \n");
361    result.append("   * @param i the row to process\n");
362    result.append("   * @return the processed row\n");
363    result.append("   */\n");
364    result.append("  public static Object[] filter(Object[] i) {\n");
365    result.append("    Object[] result;\n");
366    result.append("\n");
367    result.append("    result = new Object[i.length];\n");
368    result.append("    for (int n = 0; n < i.length; n++) {\n");
369    result.append("      if (i[n] == null) {\n");
370    result.append("        if (NUMERIC[n])\n");
371    result.append("          result[n] = MEANS[n];\n");
372    result.append("        else if (NOMINAL[n])\n");
373    result.append("          result[n] = MODES[n];\n");
374    result.append("        else\n");
375    result.append("          result[n] = i[n];\n");
376    result.append("      }\n");
377    result.append("      else {\n");
378    result.append("        result[n] = i[n];\n");
379    result.append("      }\n");
380    result.append("    }\n");
381    result.append("\n");
382    result.append("    return result;\n");
383    result.append("  }\n");
384    result.append("\n");
385    result.append("  /**\n");
386    result.append("   * filters multiple rows\n");
387    result.append("   * \n");
388    result.append("   * @param i the rows to process\n");
389    result.append("   * @return the processed rows\n");
390    result.append("   */\n");
391    result.append("  public static Object[][] filter(Object[][] i) {\n");
392    result.append("    Object[][] result;\n");
393    result.append("\n");
394    result.append("    result = new Object[i.length][];\n");
395    result.append("    for (int n = 0; n < i.length; n++) {\n");
396    result.append("      result[n] = filter(i[n]);\n");
397    result.append("    }\n");
398    result.append("\n");
399    result.append("    return result;\n");
400    result.append("  }\n");
401    result.append("}\n");
402   
403    return result.toString();
404  }
405 
406  /**
407   * Returns the revision string.
408   *
409   * @return            the revision
410   */
411  public String getRevision() {
412    return RevisionUtils.extract("$Revision: 5987 $");
413  }
414
415  /**
416   * Main method for testing this class.
417   *
418   * @param argv should contain arguments to the filter:
419   * use -h for help
420   */
421  public static void main(String [] argv) {
422    runFilter(new ReplaceMissingValues(), argv);
423  }
424}
Note: See TracBrowser for help on using the repository browser.