source: src/main/java/weka/experiment/DensityBasedClustererSplitEvaluator.java @ 9

Last change on this file since 9 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 18.8 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    DensityBasedClustererSplitEvaluator.java
19 *    Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23
24package weka.experiment;
25
26import weka.clusterers.ClusterEvaluation;
27import weka.clusterers.Clusterer;
28import weka.clusterers.AbstractClusterer;
29import weka.clusterers.AbstractDensityBasedClusterer;
30import weka.clusterers.DensityBasedClusterer;
31import weka.clusterers.EM;
32import weka.core.AdditionalMeasureProducer;
33import weka.core.Instances;
34import weka.core.Option;
35import weka.core.OptionHandler;
36import weka.core.RevisionHandler;
37import weka.core.RevisionUtils;
38import weka.core.Utils;
39import weka.filters.Filter;
40import weka.filters.unsupervised.attribute.Remove;
41
42import java.io.ObjectStreamClass;
43import java.io.Serializable;
44import java.util.Enumeration;
45import java.util.Vector;
46
/**
 * A SplitEvaluator that produces results for a density based clusterer. <p>
 *
 * -W classname <br>
 * Specify the full class name of the clusterer to evaluate. <p>
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}org)
 * @version $Revision: 5563 $
 */
56
57public class DensityBasedClustererSplitEvaluator 
58  implements SplitEvaluator,
59             OptionHandler,
60             AdditionalMeasureProducer,
61             RevisionHandler {
62
63  /** Remove the class column (if set) from the data */
64  protected boolean m_removeClassColumn = true;
65
66  /** The clusterer used for evaluation */
67  protected DensityBasedClusterer m_clusterer = new EM();
68
69  /** The names of any additional measures to look for in SplitEvaluators */
70  protected String [] m_additionalMeasures = null;
71
72  /** Array of booleans corresponding to the measures in m_AdditionalMeasures
73      indicating which of the AdditionalMeasures the current clusterer
74      can produce */
75  protected boolean [] m_doesProduce = null;
76
77  /** The number of additional measures that need to be filled in
78      after taking into account column constraints imposed by the final
79      destination for results */
80  protected int m_numberAdditionalMeasures = 0;
81
82  /** Holds the statistics for the most recent application of the clusterer */
83  protected String m_result = null;
84
85  /** The clusterer options (if any) */
86  protected String m_clustererOptions = "";
87
88  /** The clusterer version */
89  protected String m_clustererVersion = "";
90
91  /** The length of a key */
92  private static final int KEY_SIZE = 3;
93
94  /** The length of a result */
95  private static final int RESULT_SIZE = 6;
96
97 
/**
 * Creates a split evaluator for the default clusterer and caches that
 * clusterer's option string and version ID via {@link #updateOptions()}.
 */
public DensityBasedClustererSplitEvaluator() {
  this.updateOptions();
}
101
102  /**
103   * Returns a string describing this split evaluator
104   * @return a description of the split evaluator suitable for
105   * displaying in the explorer/experimenter gui
106   */
107  public String globalInfo() {
108    return " A SplitEvaluator that produces results for a density based clusterer. ";
109  }
110
111  /**
112   * Returns an enumeration describing the available options.
113   *
114   * @return an enumeration of all the available options.
115   */
116  public Enumeration listOptions() {
117
118    Vector newVector = new Vector(1);
119
120    newVector.addElement(new Option(
121                                    "\tThe full class name of the density based clusterer.\n"
122                                    +"\teg: weka.clusterers.EM", 
123                                    "W", 1, 
124                                    "-W <class name>"));
125
126    if ((m_clusterer != null) &&
127        (m_clusterer instanceof OptionHandler)) {
128      newVector.addElement(new Option(
129                                      "",
130                                      "", 0, "\nOptions specific to clusterer "
131                                      + m_clusterer.getClass().getName() + ":"));
132      Enumeration enu = ((OptionHandler)m_clusterer).listOptions();
133      while (enu.hasMoreElements()) {
134        newVector.addElement(enu.nextElement());
135      }
136    }
137    return newVector.elements();
138  }
139
140  /**
141   * Parses a given list of options. Valid options are:<p>
142   *
143   * -W classname <br>
144   * Specify the full class name of the clusterer to evaluate. <p>
145   *
146   * All option after -- will be passed to the classifier.
147   *
148   * @param options the list of options as an array of strings
149   * @exception Exception if an option is not supported
150   */
151  public void setOptions(String[] options) throws Exception {
152   
153    String cName = Utils.getOption('W', options);
154    if (cName.length() == 0) {
155      throw new Exception("A clusterer must be specified with"
156                          + " the -W option.");
157    }
158    // Do it first without options, so if an exception is thrown during
159    // the option setting, listOptions will contain options for the actual
160    // Classifier.
161    setClusterer((DensityBasedClusterer)AbstractClusterer.forName(cName, null));
162    if (getClusterer() instanceof OptionHandler) {
163      ((OptionHandler) getClusterer())
164        .setOptions(Utils.partitionOptions(options));
165      updateOptions();
166    }
167  }
168
169  /**
170   * Gets the current settings of the Classifier.
171   *
172   * @return an array of strings suitable for passing to setOptions
173   */
174  public String [] getOptions() {
175
176    String [] clustererOptions = new String [0];
177    if ((m_clusterer != null) && 
178        (m_clusterer instanceof OptionHandler)) {
179      clustererOptions = ((OptionHandler)m_clusterer).getOptions();
180    }
181   
182    String [] options = new String [clustererOptions.length + 3];
183    int current = 0;
184
185    if (getClusterer() != null) {
186      options[current++] = "-W";
187      options[current++] = getClusterer().getClass().getName();
188    }
189
190    options[current++] = "--";
191
192    System.arraycopy(clustererOptions, 0, options, current, 
193                     clustererOptions.length);
194    current += clustererOptions.length;
195    while (current < options.length) {
196      options[current++] = "";
197    }
198    return options;
199  }
200
201  /**
202   * Set a list of method names for additional measures to look for
203   * in Classifiers. This could contain many measures (of which only a
204   * subset may be produceable by the current Classifier) if an experiment
205   * is the type that iterates over a set of properties.
206   * @param additionalMeasures a list of method names
207   */
208  public void setAdditionalMeasures(String [] additionalMeasures) {
209    // System.err.println("ClassifierSplitEvaluator: setting additional measures");
210    m_additionalMeasures = additionalMeasures;
211   
212    // determine which (if any) of the additional measures this clusterer
213    // can produce
214    if (m_additionalMeasures != null && m_additionalMeasures.length > 0) {
215      m_doesProduce = new boolean [m_additionalMeasures.length];
216
217      if (m_clusterer instanceof AdditionalMeasureProducer) {
218        Enumeration en = ((AdditionalMeasureProducer)m_clusterer).
219          enumerateMeasures();
220        while (en.hasMoreElements()) {
221          String mname = (String)en.nextElement();
222          for (int j=0;j<m_additionalMeasures.length;j++) {
223            if (mname.compareToIgnoreCase(m_additionalMeasures[j]) == 0) {
224              m_doesProduce[j] = true;
225            }
226          }
227        }
228      }
229    } else {
230      m_doesProduce = null;
231    }
232  }
233
234  /**
235   * Returns an enumeration of any additional measure names that might be
236   * in the classifier
237   * @return an enumeration of the measure names
238   */
239  public Enumeration enumerateMeasures() {
240    Vector newVector = new Vector();
241    if (m_clusterer instanceof AdditionalMeasureProducer) {
242      Enumeration en = ((AdditionalMeasureProducer)m_clusterer).
243        enumerateMeasures();
244      while (en.hasMoreElements()) {
245        String mname = (String)en.nextElement();
246        newVector.addElement(mname);
247      }
248    }
249    return newVector.elements();
250  }
251
252  /**
253   * Returns the value of the named measure
254   * @param additionalMeasureName the name of the measure to query for its value
255   * @return the value of the named measure
256   * @exception IllegalArgumentException if the named measure is not supported
257   */
258  public double getMeasure(String additionalMeasureName) {
259    if (m_clusterer instanceof AdditionalMeasureProducer) {
260      return ((AdditionalMeasureProducer)m_clusterer).
261        getMeasure(additionalMeasureName);
262    } else {
263      throw new IllegalArgumentException("DensityBasedClustererSplitEvaluator: "
264                                         +"Can't return value for : "+additionalMeasureName
265                                         +". "+m_clusterer.getClass().getName()+" "
266                                         +"is not an AdditionalMeasureProducer");
267    }
268  }
269
270  /**
271   * Gets the data types of each of the key columns produced for a single run.
272   * The number of key fields must be constant
273   * for a given SplitEvaluator.
274   *
275   * @return an array containing objects of the type of each key column. The
276   * objects should be Strings, or Doubles.
277   */
278  public Object [] getKeyTypes() {
279
280    Object [] keyTypes = new Object[KEY_SIZE];
281    keyTypes[0] = "";
282    keyTypes[1] = "";
283    keyTypes[2] = "";
284    return keyTypes;
285  }
286
287  /**
288   * Gets the names of each of the key columns produced for a single run.
289   * The number of key fields must be constant
290   * for a given SplitEvaluator.
291   *
292   * @return an array containing the name of each key column
293   */
294  public String [] getKeyNames() {
295
296    String [] keyNames = new String[KEY_SIZE];
297    keyNames[0] = "Scheme";
298    keyNames[1] = "Scheme_options";
299    keyNames[2] = "Scheme_version_ID";
300    return keyNames;
301  }
302
303  /**
304   * Gets the key describing the current SplitEvaluator. For example
305   * This may contain the name of the classifier used for classifier
306   * predictive evaluation. The number of key fields must be constant
307   * for a given SplitEvaluator.
308   *
309   * @return an array of objects containing the key.
310   */
311  public Object [] getKey(){
312
313    Object [] key = new Object[KEY_SIZE];
314    key[0] = m_clusterer.getClass().getName();
315    key[1] = m_clustererOptions;
316    key[2] = m_clustererVersion;
317    return key;
318  }
319
320  /**
321   * Gets the data types of each of the result columns produced for a
322   * single run. The number of result fields must be constant
323   * for a given SplitEvaluator.
324   *
325   * @return an array containing objects of the type of each result column.
326   * The objects should be Strings, or Doubles.
327   */
328  public Object [] getResultTypes() {
329    int addm = (m_additionalMeasures != null) 
330      ? m_additionalMeasures.length 
331      : 0;
332    int overall_length = RESULT_SIZE+addm;
333
334    Object [] resultTypes = new Object[overall_length];
335    Double doub = new Double(0);
336    int current = 0;
337   
338    // number of training and testing instances
339    resultTypes[current++] = doub;
340    resultTypes[current++] = doub;
341   
342    // log liklihood
343    resultTypes[current++] = doub;
344    // number of clusters
345    resultTypes[current++] = doub;
346
347    // timing stats
348    resultTypes[current++] = doub;
349    resultTypes[current++] = doub;
350
351
352    //    resultTypes[current++] = "";
353
354    // add any additional measures
355    for (int i=0;i<addm;i++) {
356      resultTypes[current++] = doub;
357    }
358    if (current != overall_length) {
359      throw new Error("ResultTypes didn't fit RESULT_SIZE");
360    }
361    return resultTypes;
362  }
363
364  /**
365   * Gets the names of each of the result columns produced for a single run.
366   * The number of result fields must be constant
367   * for a given SplitEvaluator.
368   *
369   * @return an array containing the name of each result column
370   */
371  public String [] getResultNames() {
372    int addm = (m_additionalMeasures != null) 
373      ? m_additionalMeasures.length 
374      : 0;
375    int overall_length = RESULT_SIZE+addm;
376   
377    String [] resultNames = new String[overall_length];
378    int current = 0;
379    resultNames[current++] = "Number_of_training_instances";
380    resultNames[current++] = "Number_of_testing_instances";
381
382    // Basic performance stats
383    resultNames[current++] = "Log_likelihood";
384    resultNames[current++] = "Number_of_clusters";
385
386    // Timing stats
387    resultNames[current++] = "Time_training";
388    resultNames[current++] = "Time_testing";
389
390    // Classifier defined extras
391    //    resultNames[current++] = "Summary";
392    // add any additional measures
393    for (int i=0;i<addm;i++) {
394      resultNames[current++] = m_additionalMeasures[i];
395    }
396    if (current != overall_length) {
397      throw new Error("ResultNames didn't fit RESULT_SIZE");
398    }
399    return resultNames;
400  }
401
402  /**
403   * Gets the results for the supplied train and test datasets.
404   *
405   * @param train the training Instances.
406   * @param test the testing Instances.
407   * @return the results stored in an array. The objects stored in
408   * the array may be Strings, Doubles, or null (for the missing value).
409   * @exception Exception if a problem occurs while getting the results
410   */
411  public Object [] getResult(Instances train, Instances test) 
412    throws Exception {
413   
414    if (m_clusterer == null) {
415      throw new Exception("No clusterer has been specified");
416    }
417    int addm = (m_additionalMeasures != null) 
418      ? m_additionalMeasures.length 
419      : 0;
420    int overall_length = RESULT_SIZE+addm;
421
422    if (m_removeClassColumn && train.classIndex() != -1) {
423      // remove the class column from the training and testing data
424      Remove r = new Remove();
425      r.setAttributeIndicesArray(new int [] {train.classIndex()});
426      r.setInvertSelection(false);
427      r.setInputFormat(train);
428      train = Filter.useFilter(train, r);
429     
430      test = Filter.useFilter(test, r);
431    }
432    train.setClassIndex(-1);
433    test.setClassIndex(-1);
434     
435
436    ClusterEvaluation eval = new ClusterEvaluation();
437
438    Object [] result = new Object[overall_length];
439    long trainTimeStart = System.currentTimeMillis();
440    m_clusterer.buildClusterer(train);
441    double numClusters = m_clusterer.numberOfClusters();
442    eval.setClusterer(m_clusterer);
443    long trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
444    long testTimeStart = System.currentTimeMillis();
445    eval.evaluateClusterer(test);
446    long testTimeElapsed = System.currentTimeMillis() - testTimeStart;
447    //    m_result = eval.toSummaryString();
448
449    // The results stored are all per instance -- can be multiplied by the
450    // number of instances to get absolute numbers
451    int current = 0;
452    result[current++] = new Double(train.numInstances());
453    result[current++] = new Double(test.numInstances());
454
455    result[current++] = new Double(eval.getLogLikelihood());
456    result[current++] = new Double(numClusters);
457   
458    // Timing stats
459    result[current++] = new Double(trainTimeElapsed / 1000.0);
460    result[current++] = new Double(testTimeElapsed / 1000.0);
461   
462    for (int i=0;i<addm;i++) {
463      if (m_doesProduce[i]) {
464        try {
465          double dv = ((AdditionalMeasureProducer)m_clusterer).
466            getMeasure(m_additionalMeasures[i]);
467          Double value = new Double(dv);
468         
469          result[current++] = value;
470        } catch (Exception ex) {
471          System.err.println(ex);
472        }
473      } else {
474        result[current++] = null;
475      }
476    }
477   
478    if (current != overall_length) {
479      throw new Error("Results didn't fit RESULT_SIZE");
480    }
481    return result;
482  }
483
484  /**
485   * Returns the tip text for this property
486   * @return tip text for this property suitable for
487   * displaying in the explorer/experimenter gui
488   */
489  public String removeClassColumnTipText() {
490    return "Remove the class column (if set) from the data.";
491  }
492
493  /**
494   * Set whether the class column should be removed from the data.
495   *
496   * @param r true if the class column is to be removed.
497   */
498  public void setRemoveClassColumn(boolean r) {
499    m_removeClassColumn = r;
500  }
501
502  /**
503   * Get whether the class column is to be removed.
504   *
505   * @return true if the class column is to be removed.
506   */
507  public boolean getRemoveClassColumn() {
508    return m_removeClassColumn;
509  }
510 
511  /**
512   * Returns the tip text for this property
513   * @return tip text for this property suitable for
514   * displaying in the explorer/experimenter gui
515   */
516  public String clustererTipText() {
517    return "The density based clusterer to use.";
518  }
519
520  /**
521   * Get the value of clusterer
522   *
523   * @return Value of clusterer.
524   */
525  public DensityBasedClusterer getClusterer() {
526   
527    return m_clusterer;
528  }
529 
530  /**
531   * Sets the clusterer.
532   *
533   * @param newClusterer the new clusterer to use.
534   */
535  public void setClusterer(DensityBasedClusterer newClusterer) {
536   
537    m_clusterer = newClusterer;
538    updateOptions();
539  }
540
541
542  protected void updateOptions() {
543   
544    if (m_clusterer instanceof OptionHandler) {
545      m_clustererOptions = Utils.joinOptions(((OptionHandler)m_clusterer)
546                                             .getOptions());
547    } else {
548      m_clustererOptions = "";
549    }
550    if (m_clusterer instanceof Serializable) {
551      ObjectStreamClass obs = ObjectStreamClass.lookup(m_clusterer
552                                                       .getClass());
553      m_clustererVersion = "" + obs.getSerialVersionUID();
554    } else {
555      m_clustererVersion = "";
556    }
557  }
558
559  /**
560   * Set the Clusterer to use, given it's class name. A new clusterer will be
561   * instantiated.
562   *
563   * @param newClustererName the clusterer class name.
564   * @exception Exception if the class name is invalid.
565   */
566  public void setClustererName(String newClustererName) throws Exception {
567
568    try {
569      setClusterer((DensityBasedClusterer)Class.forName(newClustererName)
570                    .newInstance());
571    } catch (Exception ex) {
572      throw new Exception("Can't find Clusterer with class name: "
573                          + newClustererName);
574    }
575  }
576
577  /**
578   * Gets the raw output from the classifier
579   * @return the raw output from the classifier
580   */
581  public String getRawResultOutput() {
582    StringBuffer result = new StringBuffer();
583
584    if (m_clusterer == null) {
585      return "<null> clusterer";
586    }
587    result.append(toString());
588    result.append("Clustering model: \n"+m_clusterer.toString()+'\n');
589
590    // append the performance statistics
591    if (m_result != null) {
592      //      result.append(m_result);
593     
594      if (m_doesProduce != null) {
595        for (int i=0;i<m_doesProduce.length;i++) {
596          if (m_doesProduce[i]) {
597            try {
598              double dv = ((AdditionalMeasureProducer)m_clusterer).
599                getMeasure(m_additionalMeasures[i]);
600              Double value = new Double(dv);
601             
602              result.append(m_additionalMeasures[i]+" : "+value+'\n');
603            } catch (Exception ex) {
604              System.err.println(ex);
605            }
606          } 
607        }
608      }
609    }
610    return result.toString();
611  }
612
613  /**
614   * Returns a text description of the split evaluator.
615   *
616   * @return a text description of the split evaluator.
617   */
618  public String toString() {
619
620    String result = "DensityBasedClustererSplitEvaluator: ";
621    if (m_clusterer == null) {
622      return result + "<null> clusterer";
623    }
624    return result + m_clusterer.getClass().getName() + " " 
625      + m_clustererOptions + "(version " + m_clustererVersion + ")";
626  }
627 
628  /**
629   * Returns the revision string.
630   *
631   * @return            the revision
632   */
633  public String getRevision() {
634    return RevisionUtils.extract("$Revision: 5563 $");
635  }
636}
Note: See TracBrowser for help on using the repository browser.