1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * ClusterEvaluation.java |
---|
19 | * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.clusterers; |
---|
24 | |
---|
25 | import weka.core.Drawable; |
---|
26 | import weka.core.Instance; |
---|
27 | import weka.core.Instances; |
---|
28 | import weka.core.Option; |
---|
29 | import weka.core.OptionHandler; |
---|
30 | import weka.core.Range; |
---|
31 | import weka.core.RevisionHandler; |
---|
32 | import weka.core.RevisionUtils; |
---|
33 | import weka.core.Utils; |
---|
34 | import weka.core.converters.ConverterUtils.DataSource; |
---|
35 | import weka.filters.Filter; |
---|
36 | import weka.filters.unsupervised.attribute.Remove; |
---|
37 | |
---|
38 | import java.beans.BeanInfo; |
---|
39 | import java.beans.Introspector; |
---|
40 | import java.beans.MethodDescriptor; |
---|
41 | import java.io.BufferedWriter; |
---|
42 | import java.io.FileWriter; |
---|
43 | import java.io.Serializable; |
---|
44 | import java.lang.reflect.Method; |
---|
45 | import java.util.Enumeration; |
---|
46 | import java.util.Random; |
---|
47 | import java.util.Vector; |
---|
48 | |
---|
49 | /** |
---|
50 | * Class for evaluating clustering models.<p/> |
---|
51 | * |
---|
52 | * Valid options are: <p/> |
---|
53 | * |
---|
54 | * -t name of the training file <br/> |
---|
55 | * Specify the training file. <p/> |
---|
56 | * |
---|
57 | * -T name of the test file <br/> |
---|
58 | * Specify the test file to apply clusterer to. <p/> |
---|
59 | * |
---|
60 | * -d name of file to save clustering model to <br/> |
---|
61 | * Specify output file. <p/> |
---|
62 | * |
---|
63 | * -l name of file to load clustering model from <br/> |
---|
64 | * Specify input file. <p/> |
---|
65 | * |
---|
66 | * -p attribute range <br/> |
---|
67 | * Output predictions. Predictions are for the training file if only the |
---|
68 | * training file is specified, otherwise they are for the test file. The range |
---|
69 | * specifies attribute values to be output with the predictions. |
---|
70 | * Use '-p 0' for none. <p/> |
---|
71 | * |
---|
72 | * -x num folds <br/> |
---|
73 | * Set the number of folds for a cross validation of the training data. |
---|
74 | * Cross validation can only be done for distribution clusterers and will |
---|
75 | * be performed if the test file is missing. <p/> |
---|
76 | * |
---|
77 | * -s num <br/> |
---|
78 | * Sets the seed for randomizing the data for cross-validation. <p/> |
---|
79 | * |
---|
80 | * -c class <br/> |
---|
81 | * Set the class attribute. If set, then class based evaluation of clustering |
---|
82 | * is performed. <p/> |
---|
83 | * |
---|
84 | * -g name of graph file <br/> |
---|
85 | * Outputs the graph representation of the clusterer to the file. Only for |
---|
86 | * clusterers that implement the <code>weka.core.Drawable</code> interface. |
---|
87 | * <p/> |
---|
88 | * |
---|
89 | * @author Mark Hall (mhall@cs.waikato.ac.nz) |
---|
90 | * @version $Revision: 6021 $ |
---|
91 | * @see weka.core.Drawable |
---|
92 | */ |
---|
93 | public class ClusterEvaluation |
---|
94 | implements Serializable, RevisionHandler { |
---|
95 | |
---|
96 | /** for serialization */ |
---|
97 | static final long serialVersionUID = -830188327319128005L; |
---|
98 | |
---|
99 | /** the clusterer */ |
---|
100 | private Clusterer m_Clusterer; |
---|
101 | |
---|
102 | /** holds a string describing the results of clustering the training data */ |
---|
103 | private StringBuffer m_clusteringResults; |
---|
104 | |
---|
105 | /** holds the number of clusters found by the clusterer */ |
---|
106 | private int m_numClusters; |
---|
107 | |
---|
108 | /** holds the assignments of instances to clusters for a particular testing |
---|
109 | dataset */ |
---|
110 | private double[] m_clusterAssignments; |
---|
111 | |
---|
112 | /** holds the average log likelihood for a particular testing dataset |
---|
113 | if the clusterer is a DensityBasedClusterer */ |
---|
114 | private double m_logL; |
---|
115 | |
---|
116 | /** will hold the mapping of classes to clusters (for class based |
---|
117 | evaluation) */ |
---|
118 | private int[] m_classToCluster = null; |
---|
119 | |
---|
120 | /** |
---|
121 | * set the clusterer |
---|
122 | * @param clusterer the clusterer to use |
---|
123 | */ |
---|
124 | public void setClusterer(Clusterer clusterer) { |
---|
125 | m_Clusterer = clusterer; |
---|
126 | } |
---|
127 | |
---|
128 | /** |
---|
129 | * return the results of clustering. |
---|
130 | * @return a string detailing the results of clustering a data set |
---|
131 | */ |
---|
132 | public String clusterResultsToString() { |
---|
133 | return m_clusteringResults.toString(); |
---|
134 | } |
---|
135 | |
---|
136 | /** |
---|
137 | * Return the number of clusters found for the most recent call to |
---|
138 | * evaluateClusterer |
---|
139 | * @return the number of clusters found |
---|
140 | */ |
---|
141 | public int getNumClusters() { |
---|
142 | return m_numClusters; |
---|
143 | } |
---|
144 | |
---|
145 | /** |
---|
146 | * Return an array of cluster assignments corresponding to the most |
---|
147 | * recent set of instances clustered. |
---|
148 | * @return an array of cluster assignments |
---|
149 | */ |
---|
150 | public double[] getClusterAssignments() { |
---|
151 | return m_clusterAssignments; |
---|
152 | } |
---|
153 | |
---|
154 | /** |
---|
155 | * Return the array (ordered by cluster number) of minimum error class to |
---|
156 | * cluster mappings |
---|
157 | * @return an array of class to cluster mappings |
---|
158 | */ |
---|
159 | public int[] getClassesToClusters() { |
---|
160 | return m_classToCluster; |
---|
161 | } |
---|
162 | |
---|
163 | /** |
---|
164 | * Return the log likelihood corresponding to the most recent |
---|
165 | * set of instances clustered. |
---|
166 | * |
---|
167 | * @return a <code>double</code> value |
---|
168 | */ |
---|
169 | public double getLogLikelihood() { |
---|
170 | return m_logL; |
---|
171 | } |
---|
172 | |
---|
/**
 * Constructor. Sets defaults for each member variable. The default
 * clusterer is {@link SimpleKMeans}; the results buffer starts out empty
 * and no cluster assignments are available until an evaluation is run.
 */
public ClusterEvaluation () {
  setClusterer(new SimpleKMeans());
  m_clusteringResults = new StringBuffer();
  // m_clusterAssignments needs no explicit reset: the field default is null
}
---|
182 | |
---|
183 | /** |
---|
184 | * Evaluate the clusterer on a set of instances. Calculates clustering |
---|
185 | * statistics and stores cluster assigments for the instances in |
---|
186 | * m_clusterAssignments |
---|
187 | * |
---|
188 | * @param test the set of instances to cluster |
---|
189 | * @throws Exception if something goes wrong |
---|
190 | */ |
---|
191 | public void evaluateClusterer(Instances test) throws Exception { |
---|
192 | evaluateClusterer(test, ""); |
---|
193 | } |
---|
194 | |
---|
195 | /** |
---|
196 | * Evaluate the clusterer on a set of instances. Calculates clustering |
---|
197 | * statistics and stores cluster assigments for the instances in |
---|
198 | * m_clusterAssignments |
---|
199 | * |
---|
200 | * @param test the set of instances to cluster |
---|
201 | * @param testFileName the name of the test file for incremental testing, |
---|
202 | * if "" or null then not used |
---|
203 | * @throws Exception if something goes wrong |
---|
204 | */ |
---|
205 | public void evaluateClusterer(Instances test, String testFileName) throws Exception { |
---|
206 | int i = 0; |
---|
207 | int cnum; |
---|
208 | double loglk = 0.0; |
---|
209 | int cc = m_Clusterer.numberOfClusters(); |
---|
210 | m_numClusters = cc; |
---|
211 | double[] instanceStats = new double[cc]; |
---|
212 | Instances testRaw = null; |
---|
213 | boolean hasClass = (test.classIndex() >= 0); |
---|
214 | int unclusteredInstances = 0; |
---|
215 | Vector<Double> clusterAssignments = new Vector<Double>(); |
---|
216 | Filter filter = null; |
---|
217 | DataSource source = null; |
---|
218 | Instance inst; |
---|
219 | |
---|
220 | if (testFileName == null) |
---|
221 | testFileName = ""; |
---|
222 | |
---|
223 | // load data |
---|
224 | if (testFileName.length() != 0) |
---|
225 | source = new DataSource(testFileName); |
---|
226 | else |
---|
227 | source = new DataSource(test); |
---|
228 | testRaw = source.getStructure(test.classIndex()); |
---|
229 | |
---|
230 | // If class is set then do class based evaluation as well |
---|
231 | if (hasClass) { |
---|
232 | if (testRaw.classAttribute().isNumeric()) |
---|
233 | throw new Exception("ClusterEvaluation: Class must be nominal!"); |
---|
234 | |
---|
235 | filter = new Remove(); |
---|
236 | ((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1)); |
---|
237 | ((Remove) filter).setInvertSelection(false); |
---|
238 | filter.setInputFormat(testRaw); |
---|
239 | } |
---|
240 | |
---|
241 | i = 0; |
---|
242 | while (source.hasMoreElements(testRaw)) { |
---|
243 | // next instance |
---|
244 | inst = source.nextElement(testRaw); |
---|
245 | if (filter != null) { |
---|
246 | filter.input(inst); |
---|
247 | filter.batchFinished(); |
---|
248 | inst = filter.output(); |
---|
249 | } |
---|
250 | |
---|
251 | cnum = -1; |
---|
252 | try { |
---|
253 | if (m_Clusterer instanceof DensityBasedClusterer) { |
---|
254 | loglk += ((DensityBasedClusterer)m_Clusterer). |
---|
255 | logDensityForInstance(inst); |
---|
256 | cnum = m_Clusterer.clusterInstance(inst); |
---|
257 | clusterAssignments.add((double) cnum); |
---|
258 | } |
---|
259 | else { |
---|
260 | cnum = m_Clusterer.clusterInstance(inst); |
---|
261 | clusterAssignments.add((double) cnum); |
---|
262 | } |
---|
263 | } |
---|
264 | catch (Exception e) { |
---|
265 | clusterAssignments.add(-1.0); |
---|
266 | unclusteredInstances++; |
---|
267 | } |
---|
268 | |
---|
269 | if (cnum != -1) { |
---|
270 | instanceStats[cnum]++; |
---|
271 | } |
---|
272 | } |
---|
273 | |
---|
274 | double sum = Utils.sum(instanceStats); |
---|
275 | loglk /= sum; |
---|
276 | m_logL = loglk; |
---|
277 | m_clusterAssignments = new double [clusterAssignments.size()]; |
---|
278 | for (i = 0; i < clusterAssignments.size(); i++) { |
---|
279 | m_clusterAssignments[i] = clusterAssignments.get(i); |
---|
280 | } |
---|
281 | int numInstFieldWidth = (int)((Math.log(clusterAssignments.size())/Math.log(10))+1); |
---|
282 | |
---|
283 | m_clusteringResults.append(m_Clusterer.toString()); |
---|
284 | m_clusteringResults.append("Clustered Instances\n\n"); |
---|
285 | int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1); |
---|
286 | for (i = 0; i < cc; i++) { |
---|
287 | if (instanceStats[i] > 0) |
---|
288 | m_clusteringResults.append(Utils.doubleToString((double)i, |
---|
289 | clustFieldWidth, 0) |
---|
290 | + " " |
---|
291 | + Utils.doubleToString(instanceStats[i], |
---|
292 | numInstFieldWidth, 0) |
---|
293 | + " (" |
---|
294 | + Utils.doubleToString((instanceStats[i] / |
---|
295 | sum * 100.0) |
---|
296 | , 3, 0) + "%)\n"); |
---|
297 | } |
---|
298 | |
---|
299 | if (unclusteredInstances > 0) |
---|
300 | m_clusteringResults.append("\nUnclustered instances : " |
---|
301 | +unclusteredInstances); |
---|
302 | |
---|
303 | if (m_Clusterer instanceof DensityBasedClusterer) |
---|
304 | m_clusteringResults.append("\n\nLog likelihood: " |
---|
305 | + Utils.doubleToString(loglk, 1, 5) |
---|
306 | + "\n"); |
---|
307 | |
---|
308 | if (hasClass) { |
---|
309 | evaluateClustersWithRespectToClass(test, testFileName); |
---|
310 | } |
---|
311 | } |
---|
312 | |
---|
313 | /** |
---|
314 | * Evaluates cluster assignments with respect to actual class labels. |
---|
315 | * Assumes that m_Clusterer has been trained and tested on |
---|
316 | * inst (minus the class). |
---|
317 | * |
---|
318 | * @param inst the instances (including class) to evaluate with respect to |
---|
319 | * @param fileName the name of the test file for incremental testing, |
---|
320 | * if "" or null then not used |
---|
321 | * @throws Exception if something goes wrong |
---|
322 | */ |
---|
323 | private void evaluateClustersWithRespectToClass(Instances inst, String fileName) |
---|
324 | throws Exception { |
---|
325 | |
---|
326 | |
---|
327 | |
---|
328 | int numClasses = inst.classAttribute().numValues(); |
---|
329 | int[][] counts = new int [m_numClusters][numClasses]; |
---|
330 | int[] clusterTotals = new int[m_numClusters]; |
---|
331 | double[] best = new double[m_numClusters+1]; |
---|
332 | double[] current = new double[m_numClusters+1]; |
---|
333 | DataSource source = null; |
---|
334 | Instances instances = null; |
---|
335 | Instance instance = null; |
---|
336 | int i; |
---|
337 | int numInstances; |
---|
338 | |
---|
339 | |
---|
340 | if (fileName == null) |
---|
341 | fileName = ""; |
---|
342 | |
---|
343 | if (fileName.length() != 0) { |
---|
344 | source = new DataSource(fileName); |
---|
345 | } |
---|
346 | else |
---|
347 | source = new DataSource(inst); |
---|
348 | instances = source.getStructure(inst.classIndex()); |
---|
349 | |
---|
350 | i = 0; |
---|
351 | while (source.hasMoreElements(instances)) { |
---|
352 | instance = source.nextElement(instances); |
---|
353 | if (m_clusterAssignments[i] >= 0) { |
---|
354 | counts[(int)m_clusterAssignments[i]][(int)instance.classValue()]++; |
---|
355 | clusterTotals[(int)m_clusterAssignments[i]]++; |
---|
356 | } |
---|
357 | i++; |
---|
358 | } |
---|
359 | numInstances = i; |
---|
360 | |
---|
361 | best[m_numClusters] = Double.MAX_VALUE; |
---|
362 | mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0); |
---|
363 | |
---|
364 | m_clusteringResults.append("\n\nClass attribute: " |
---|
365 | +inst.classAttribute().name() |
---|
366 | +"\n"); |
---|
367 | m_clusteringResults.append("Classes to Clusters:\n"); |
---|
368 | String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0)); |
---|
369 | m_clusteringResults.append(matrixString).append("\n"); |
---|
370 | |
---|
371 | int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10)); |
---|
372 | // add the minimum error assignment |
---|
373 | for (i = 0; i < m_numClusters; i++) { |
---|
374 | if (clusterTotals[i] > 0) { |
---|
375 | m_clusteringResults.append("Cluster " |
---|
376 | +Utils.doubleToString((double)i,Cwidth,0)); |
---|
377 | m_clusteringResults.append(" <-- "); |
---|
378 | |
---|
379 | if (best[i] < 0) { |
---|
380 | m_clusteringResults.append("No class\n"); |
---|
381 | } else { |
---|
382 | m_clusteringResults. |
---|
383 | append(inst.classAttribute().value((int)best[i])).append("\n"); |
---|
384 | } |
---|
385 | } |
---|
386 | } |
---|
387 | m_clusteringResults.append("\nIncorrectly clustered instances :\t" |
---|
388 | +best[m_numClusters]+"\t" |
---|
389 | +(Utils.doubleToString((best[m_numClusters] / |
---|
390 | numInstances * |
---|
391 | 100.0), 8, 4)) |
---|
392 | +" %\n"); |
---|
393 | |
---|
394 | // copy the class assignments |
---|
395 | m_classToCluster = new int [m_numClusters]; |
---|
396 | for (i = 0; i < m_numClusters; i++) { |
---|
397 | m_classToCluster[i] = (int)best[i]; |
---|
398 | } |
---|
399 | } |
---|
400 | |
---|
401 | /** |
---|
402 | * Returns a "confusion" style matrix of classes to clusters assignments |
---|
403 | * @param counts the counts of classes for each cluster |
---|
404 | * @param clusterTotals total number of examples in each cluster |
---|
405 | * @param inst the training instances (with class) |
---|
406 | * @return the "confusion" style matrix as string |
---|
407 | * @throws Exception if matrix can't be generated |
---|
408 | */ |
---|
409 | private String toMatrixString(int[][] counts, int[] clusterTotals, |
---|
410 | Instances inst) |
---|
411 | throws Exception { |
---|
412 | StringBuffer ms = new StringBuffer(); |
---|
413 | |
---|
414 | int maxval = 0; |
---|
415 | for (int i = 0; i < m_numClusters; i++) { |
---|
416 | for (int j = 0; j < counts[i].length; j++) { |
---|
417 | if (counts[i][j] > maxval) { |
---|
418 | maxval = counts[i][j]; |
---|
419 | } |
---|
420 | } |
---|
421 | } |
---|
422 | |
---|
423 | int Cwidth = 1 + Math.max((int)(Math.log(maxval) / Math.log(10)), |
---|
424 | (int)(Math.log(m_numClusters) / Math.log(10))); |
---|
425 | |
---|
426 | ms.append("\n"); |
---|
427 | |
---|
428 | for (int i = 0; i < m_numClusters; i++) { |
---|
429 | if (clusterTotals[i] > 0) { |
---|
430 | ms.append(" ").append(Utils.doubleToString((double)i, Cwidth, 0)); |
---|
431 | } |
---|
432 | } |
---|
433 | ms.append(" <-- assigned to cluster\n"); |
---|
434 | |
---|
435 | for (int i = 0; i< counts[0].length; i++) { |
---|
436 | |
---|
437 | for (int j = 0; j < m_numClusters; j++) { |
---|
438 | if (clusterTotals[j] > 0) { |
---|
439 | ms.append(" ").append(Utils.doubleToString((double)counts[j][i], |
---|
440 | Cwidth, 0)); |
---|
441 | } |
---|
442 | } |
---|
443 | ms.append(" | ").append(inst.classAttribute().value(i)).append("\n"); |
---|
444 | } |
---|
445 | |
---|
446 | return ms.toString(); |
---|
447 | } |
---|
448 | |
---|
449 | /** |
---|
450 | * Finds the minimum error mapping of classes to clusters. Recursively |
---|
451 | * considers all possible class to cluster assignments. |
---|
452 | * |
---|
453 | * @param numClusters the number of clusters |
---|
454 | * @param lev the cluster being processed |
---|
455 | * @param counts the counts of classes in clusters |
---|
456 | * @param clusterTotals the total number of examples in each cluster |
---|
457 | * @param current the current path through the class to cluster assignment |
---|
458 | * tree |
---|
459 | * @param best the best assignment path seen |
---|
460 | * @param error accumulates the error for a particular path |
---|
461 | */ |
---|
462 | public static void mapClasses(int numClusters, int lev, int[][] counts, int[] clusterTotals, |
---|
463 | double[] current, double[] best, int error) { |
---|
464 | // leaf |
---|
465 | if (lev == numClusters) { |
---|
466 | if (error < best[numClusters]) { |
---|
467 | best[numClusters] = error; |
---|
468 | for (int i = 0; i < numClusters; i++) { |
---|
469 | best[i] = current[i]; |
---|
470 | } |
---|
471 | } |
---|
472 | } else { |
---|
473 | // empty cluster -- ignore |
---|
474 | if (clusterTotals[lev] == 0) { |
---|
475 | current[lev] = -1; // cluster ignored |
---|
476 | mapClasses(numClusters, lev+1, counts, clusterTotals, current, best, |
---|
477 | error); |
---|
478 | } else { |
---|
479 | // first try no class assignment to this cluster |
---|
480 | current[lev] = -1; // cluster assigned no class (ie all errors) |
---|
481 | mapClasses(numClusters, lev+1, counts, clusterTotals, current, best, |
---|
482 | error+clusterTotals[lev]); |
---|
483 | // now loop through the classes in this cluster |
---|
484 | for (int i = 0; i < counts[0].length; i++) { |
---|
485 | if (counts[lev][i] > 0) { |
---|
486 | boolean ok = true; |
---|
487 | // check to see if this class has already been assigned |
---|
488 | for (int j = 0; j < lev; j++) { |
---|
489 | if ((int)current[j] == i) { |
---|
490 | ok = false; |
---|
491 | break; |
---|
492 | } |
---|
493 | } |
---|
494 | if (ok) { |
---|
495 | current[lev] = i; |
---|
496 | mapClasses(numClusters, lev+1, counts, clusterTotals, current, best, |
---|
497 | (error + (clusterTotals[lev] - counts[lev][i]))); |
---|
498 | } |
---|
499 | } |
---|
500 | } |
---|
501 | } |
---|
502 | } |
---|
503 | } |
---|
504 | |
---|
505 | /** |
---|
506 | * Evaluates a clusterer with the options given in an array of |
---|
507 | * strings. It takes the string indicated by "-t" as training file, the |
---|
508 | * string indicated by "-T" as test file. |
---|
509 | * If the test file is missing, a stratified ten-fold |
---|
510 | * cross-validation is performed (distribution clusterers only). |
---|
511 | * Using "-x" you can change the number of |
---|
512 | * folds to be used, and using "-s" the random seed. |
---|
513 | * If the "-p" option is present it outputs the classification for |
---|
514 | * each test instance. If you provide the name of an object file using |
---|
515 | * "-l", a clusterer will be loaded from the given file. If you provide the |
---|
516 | * name of an object file using "-d", the clusterer built from the |
---|
517 | * training data will be saved to the given file. |
---|
518 | * |
---|
519 | * @param clusterer machine learning clusterer |
---|
520 | * @param options the array of string containing the options |
---|
521 | * @throws Exception if model could not be evaluated successfully |
---|
522 | * @return a string describing the results |
---|
523 | */ |
---|
524 | public static String evaluateClusterer(Clusterer clusterer, String[] options) |
---|
525 | throws Exception { |
---|
526 | |
---|
527 | int seed = 1, folds = 10; |
---|
528 | boolean doXval = false; |
---|
529 | Instances train = null; |
---|
530 | Random random; |
---|
531 | String trainFileName, testFileName, seedString, foldsString; |
---|
532 | String objectInputFileName, objectOutputFileName, attributeRangeString; |
---|
533 | String graphFileName; |
---|
534 | String[] savedOptions = null; |
---|
535 | boolean printClusterAssignments = false; |
---|
536 | Range attributesToOutput = null; |
---|
537 | StringBuffer text = new StringBuffer(); |
---|
538 | int theClass = -1; // class based evaluation of clustering |
---|
539 | boolean updateable = (clusterer instanceof UpdateableClusterer); |
---|
540 | DataSource source = null; |
---|
541 | Instance inst; |
---|
542 | |
---|
543 | if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) { |
---|
544 | |
---|
545 | // global info requested as well? |
---|
546 | boolean globalInfo = Utils.getFlag("synopsis", options) || |
---|
547 | Utils.getFlag("info", options); |
---|
548 | |
---|
549 | throw new Exception("Help requested." |
---|
550 | + makeOptionString(clusterer, globalInfo)); |
---|
551 | } |
---|
552 | |
---|
553 | try { |
---|
554 | // Get basic options (options the same for all clusterers |
---|
555 | //printClusterAssignments = Utils.getFlag('p', options); |
---|
556 | objectInputFileName = Utils.getOption('l', options); |
---|
557 | objectOutputFileName = Utils.getOption('d', options); |
---|
558 | trainFileName = Utils.getOption('t', options); |
---|
559 | testFileName = Utils.getOption('T', options); |
---|
560 | graphFileName = Utils.getOption('g', options); |
---|
561 | |
---|
562 | // Check -p option |
---|
563 | try { |
---|
564 | attributeRangeString = Utils.getOption('p', options); |
---|
565 | } |
---|
566 | catch (Exception e) { |
---|
567 | throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. " + |
---|
568 | "It now expects a parameter specifying a range of attributes " + |
---|
569 | "to list with the predictions. Use '-p 0' for none."); |
---|
570 | } |
---|
571 | if (attributeRangeString.length() != 0) { |
---|
572 | printClusterAssignments = true; |
---|
573 | if (!attributeRangeString.equals("0")) |
---|
574 | attributesToOutput = new Range(attributeRangeString); |
---|
575 | } |
---|
576 | |
---|
577 | if (trainFileName.length() == 0) { |
---|
578 | if (objectInputFileName.length() == 0) { |
---|
579 | throw new Exception("No training file and no object " |
---|
580 | + "input file given."); |
---|
581 | } |
---|
582 | |
---|
583 | if (testFileName.length() == 0) { |
---|
584 | throw new Exception("No training file and no test file given."); |
---|
585 | } |
---|
586 | } |
---|
587 | else { |
---|
588 | if ((objectInputFileName.length() != 0) |
---|
589 | && (printClusterAssignments == false)) { |
---|
590 | throw new Exception("Can't use both train and model file " |
---|
591 | + "unless -p specified."); |
---|
592 | } |
---|
593 | } |
---|
594 | |
---|
595 | seedString = Utils.getOption('s', options); |
---|
596 | |
---|
597 | if (seedString.length() != 0) { |
---|
598 | seed = Integer.parseInt(seedString); |
---|
599 | } |
---|
600 | |
---|
601 | foldsString = Utils.getOption('x', options); |
---|
602 | |
---|
603 | if (foldsString.length() != 0) { |
---|
604 | folds = Integer.parseInt(foldsString); |
---|
605 | doXval = true; |
---|
606 | } |
---|
607 | } |
---|
608 | catch (Exception e) { |
---|
609 | throw new Exception('\n' + e.getMessage() |
---|
610 | + makeOptionString(clusterer, false)); |
---|
611 | } |
---|
612 | |
---|
613 | try { |
---|
614 | if (trainFileName.length() != 0) { |
---|
615 | source = new DataSource(trainFileName); |
---|
616 | train = source.getStructure(); |
---|
617 | |
---|
618 | String classString = Utils.getOption('c',options); |
---|
619 | if (classString.length() != 0) { |
---|
620 | if (classString.compareTo("last") == 0) |
---|
621 | theClass = train.numAttributes(); |
---|
622 | else if (classString.compareTo("first") == 0) |
---|
623 | theClass = 1; |
---|
624 | else |
---|
625 | theClass = Integer.parseInt(classString); |
---|
626 | |
---|
627 | if (theClass != -1) { |
---|
628 | if (doXval || testFileName.length() != 0) |
---|
629 | throw new Exception("Can only do class based evaluation on the " |
---|
630 | +"training data"); |
---|
631 | |
---|
632 | if (objectInputFileName.length() != 0) |
---|
633 | throw new Exception("Can't load a clusterer and do class based " |
---|
634 | +"evaluation"); |
---|
635 | |
---|
636 | if (objectOutputFileName.length() != 0) |
---|
637 | throw new Exception( |
---|
638 | "Can't do class based evaluation and save clusterer"); |
---|
639 | } |
---|
640 | } |
---|
641 | else { |
---|
642 | // if the dataset defines a class attribute, use it |
---|
643 | if (train.classIndex() != -1) { |
---|
644 | theClass = train.classIndex() + 1; |
---|
645 | System.err.println( |
---|
646 | "Note: using class attribute from dataset, i.e., attribute #" |
---|
647 | + theClass); |
---|
648 | } |
---|
649 | } |
---|
650 | |
---|
651 | if (theClass != -1) { |
---|
652 | if (theClass < 1 || theClass > train.numAttributes()) |
---|
653 | throw new Exception("Class is out of range!"); |
---|
654 | |
---|
655 | if (!train.attribute(theClass - 1).isNominal()) |
---|
656 | throw new Exception("Class must be nominal!"); |
---|
657 | |
---|
658 | train.setClassIndex(theClass - 1); |
---|
659 | } |
---|
660 | } |
---|
661 | } |
---|
662 | catch (Exception e) { |
---|
663 | throw new Exception("ClusterEvaluation: " + e.getMessage() + '.'); |
---|
664 | } |
---|
665 | |
---|
666 | // Save options |
---|
667 | if (options != null) { |
---|
668 | savedOptions = new String[options.length]; |
---|
669 | System.arraycopy(options, 0, savedOptions, 0, options.length); |
---|
670 | } |
---|
671 | |
---|
672 | if (objectInputFileName.length() != 0) |
---|
673 | Utils.checkForRemainingOptions(options); |
---|
674 | |
---|
675 | // Set options for clusterer |
---|
676 | if (clusterer instanceof OptionHandler) |
---|
677 | ((OptionHandler)clusterer).setOptions(options); |
---|
678 | |
---|
679 | Utils.checkForRemainingOptions(options); |
---|
680 | |
---|
681 | Instances trainHeader = train; |
---|
682 | if (objectInputFileName.length() != 0) { |
---|
683 | // Load the clusterer from file |
---|
684 | // clusterer = (Clusterer) SerializationHelper.read(objectInputFileName); |
---|
685 | java.io.ObjectInputStream ois = |
---|
686 | new java.io.ObjectInputStream( |
---|
687 | new java.io.BufferedInputStream( |
---|
688 | new java.io.FileInputStream(objectInputFileName))); |
---|
689 | clusterer = (Clusterer) ois.readObject(); |
---|
690 | // try and get the training header |
---|
691 | try { |
---|
692 | trainHeader = (Instances) ois.readObject(); |
---|
693 | } catch (Exception ex) { |
---|
694 | // don't moan if we cant |
---|
695 | } |
---|
696 | } |
---|
697 | else { |
---|
698 | // Build the clusterer if no object file provided |
---|
699 | if (theClass == -1) { |
---|
700 | if (updateable) { |
---|
701 | clusterer.buildClusterer(source.getStructure()); |
---|
702 | while (source.hasMoreElements(train)) { |
---|
703 | inst = source.nextElement(train); |
---|
704 | ((UpdateableClusterer) clusterer).updateClusterer(inst); |
---|
705 | } |
---|
706 | ((UpdateableClusterer) clusterer).updateFinished(); |
---|
707 | } |
---|
708 | else { |
---|
709 | clusterer.buildClusterer(source.getDataSet()); |
---|
710 | } |
---|
711 | } |
---|
712 | else { |
---|
713 | Remove removeClass = new Remove(); |
---|
714 | removeClass.setAttributeIndices("" + theClass); |
---|
715 | removeClass.setInvertSelection(false); |
---|
716 | removeClass.setInputFormat(train); |
---|
717 | if (updateable) { |
---|
718 | Instances clusterTrain = Filter.useFilter(train, removeClass); |
---|
719 | clusterer.buildClusterer(clusterTrain); |
---|
720 | trainHeader = clusterTrain; |
---|
721 | while (source.hasMoreElements(train)) { |
---|
722 | inst = source.nextElement(train); |
---|
723 | removeClass.input(inst); |
---|
724 | removeClass.batchFinished(); |
---|
725 | Instance clusterTrainInst = removeClass.output(); |
---|
726 | ((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst); |
---|
727 | } |
---|
728 | ((UpdateableClusterer) clusterer).updateFinished(); |
---|
729 | } |
---|
730 | else { |
---|
731 | Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass); |
---|
732 | clusterer.buildClusterer(clusterTrain); |
---|
733 | trainHeader = clusterTrain; |
---|
734 | } |
---|
735 | ClusterEvaluation ce = new ClusterEvaluation(); |
---|
736 | ce.setClusterer(clusterer); |
---|
737 | ce.evaluateClusterer(train, trainFileName); |
---|
738 | |
---|
739 | return "\n\n=== Clustering stats for training data ===\n\n" + |
---|
740 | ce.clusterResultsToString(); |
---|
741 | } |
---|
742 | } |
---|
743 | |
---|
744 | /* Output cluster predictions only (for the test data if specified, |
---|
745 | otherwise for the training data */ |
---|
746 | if (printClusterAssignments) { |
---|
747 | return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput); |
---|
748 | } |
---|
749 | |
---|
750 | text.append(clusterer.toString()); |
---|
751 | text.append("\n\n=== Clustering stats for training data ===\n\n" |
---|
752 | + printClusterStats(clusterer, trainFileName)); |
---|
753 | |
---|
754 | if (testFileName.length() != 0) { |
---|
755 | // check header compatibility |
---|
756 | DataSource test = new DataSource(testFileName); |
---|
757 | Instances testStructure = test.getStructure(); |
---|
758 | if (!trainHeader.equalHeaders(testStructure)) { |
---|
759 | throw new Exception("Training and testing data are not compatible\n" + trainHeader.equalHeadersMsg(testStructure)); |
---|
760 | } |
---|
761 | |
---|
762 | text.append("\n\n=== Clustering stats for testing data ===\n\n" |
---|
763 | + printClusterStats(clusterer, testFileName)); |
---|
764 | } |
---|
765 | |
---|
766 | if ((clusterer instanceof DensityBasedClusterer) && |
---|
767 | (doXval == true) && |
---|
768 | (testFileName.length() == 0) && |
---|
769 | (objectInputFileName.length() == 0)) { |
---|
770 | // cross validate the log likelihood on the training data |
---|
771 | random = new Random(seed); |
---|
772 | random.setSeed(seed); |
---|
773 | train = source.getDataSet(); |
---|
774 | train.randomize(random); |
---|
775 | text.append( |
---|
776 | crossValidateModel( |
---|
777 | clusterer.getClass().getName(), train, folds, savedOptions, random)); |
---|
778 | } |
---|
779 | |
---|
780 | // Save the clusterer if an object output file is provided |
---|
781 | if (objectOutputFileName.length() != 0) { |
---|
782 | //SerializationHelper.write(objectOutputFileName, clusterer); |
---|
783 | saveClusterer(objectOutputFileName, clusterer, trainHeader); |
---|
784 | } |
---|
785 | |
---|
786 | // If classifier is drawable output string describing graph |
---|
787 | if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) { |
---|
788 | BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName)); |
---|
789 | writer.write(((Drawable) clusterer).graph()); |
---|
790 | writer.newLine(); |
---|
791 | writer.flush(); |
---|
792 | writer.close(); |
---|
793 | } |
---|
794 | |
---|
795 | return text.toString(); |
---|
796 | } |
---|
797 | |
---|
798 | private static void saveClusterer(String fileName, |
---|
799 | Clusterer clusterer, |
---|
800 | Instances header) throws Exception { |
---|
801 | java.io.ObjectOutputStream oos = |
---|
802 | new java.io.ObjectOutputStream( |
---|
803 | new java.io.BufferedOutputStream( |
---|
804 | new java.io.FileOutputStream(fileName))); |
---|
805 | |
---|
806 | oos.writeObject(clusterer); |
---|
807 | if (header != null) { |
---|
808 | oos.writeObject(header); |
---|
809 | } |
---|
810 | oos.flush(); |
---|
811 | oos.close(); |
---|
812 | } |
---|
813 | |
---|
814 | /** |
---|
815 | * Perform a cross-validation for DensityBasedClusterer on a set of instances. |
---|
816 | * |
---|
817 | * @param clusterer the clusterer to use |
---|
818 | * @param data the training data |
---|
819 | * @param numFolds number of folds of cross validation to perform |
---|
820 | * @param random random number seed for cross-validation |
---|
821 | * @return the cross-validated log-likelihood |
---|
822 | * @throws Exception if an error occurs |
---|
823 | */ |
---|
824 | public static double crossValidateModel(DensityBasedClusterer clusterer, |
---|
825 | Instances data, |
---|
826 | int numFolds, |
---|
827 | Random random) throws Exception { |
---|
828 | Instances train, test; |
---|
829 | double foldAv = 0;; |
---|
830 | data = new Instances(data); |
---|
831 | data.randomize(random); |
---|
832 | // double sumOW = 0; |
---|
833 | for (int i = 0; i < numFolds; i++) { |
---|
834 | // Build and test clusterer |
---|
835 | train = data.trainCV(numFolds, i, random); |
---|
836 | |
---|
837 | clusterer.buildClusterer(train); |
---|
838 | |
---|
839 | test = data.testCV(numFolds, i); |
---|
840 | |
---|
841 | for (int j = 0; j < test.numInstances(); j++) { |
---|
842 | try { |
---|
843 | foldAv += ((DensityBasedClusterer)clusterer). |
---|
844 | logDensityForInstance(test.instance(j)); |
---|
845 | // sumOW += test.instance(j).weight(); |
---|
846 | // double temp = Utils.sum(tempDist); |
---|
847 | } catch (Exception ex) { |
---|
848 | // unclustered instances |
---|
849 | } |
---|
850 | } |
---|
851 | } |
---|
852 | |
---|
853 | // return foldAv / sumOW; |
---|
854 | return foldAv / data.numInstances(); |
---|
855 | } |
---|
856 | |
---|
857 | /** |
---|
858 | * Performs a cross-validation |
---|
859 | * for a DensityBasedClusterer clusterer on a set of instances. |
---|
860 | * |
---|
861 | * @param clustererString a string naming the class of the clusterer |
---|
862 | * @param data the data on which the cross-validation is to be |
---|
863 | * performed |
---|
864 | * @param numFolds the number of folds for the cross-validation |
---|
865 | * @param options the options to the clusterer |
---|
866 | * @param random a random number generator |
---|
867 | * @return a string containing the cross validated log likelihood |
---|
868 | * @throws Exception if a clusterer could not be generated |
---|
869 | */ |
---|
870 | public static String crossValidateModel (String clustererString, |
---|
871 | Instances data, |
---|
872 | int numFolds, |
---|
873 | String[] options, |
---|
874 | Random random) |
---|
875 | throws Exception { |
---|
876 | Clusterer clusterer = null; |
---|
877 | String[] savedOptions = null; |
---|
878 | double CvAv = 0.0; |
---|
879 | StringBuffer CvString = new StringBuffer(); |
---|
880 | |
---|
881 | if (options != null) { |
---|
882 | savedOptions = new String[options.length]; |
---|
883 | } |
---|
884 | |
---|
885 | data = new Instances(data); |
---|
886 | |
---|
887 | // create clusterer |
---|
888 | try { |
---|
889 | clusterer = (Clusterer)Class.forName(clustererString).newInstance(); |
---|
890 | } |
---|
891 | catch (Exception e) { |
---|
892 | throw new Exception("Can't find class with name " |
---|
893 | + clustererString + '.'); |
---|
894 | } |
---|
895 | |
---|
896 | if (!(clusterer instanceof DensityBasedClusterer)) { |
---|
897 | throw new Exception(clustererString |
---|
898 | + " must be a distrinbution " |
---|
899 | + "clusterer."); |
---|
900 | } |
---|
901 | |
---|
902 | // Save options |
---|
903 | if (options != null) { |
---|
904 | System.arraycopy(options, 0, savedOptions, 0, options.length); |
---|
905 | } |
---|
906 | |
---|
907 | // Parse options |
---|
908 | if (clusterer instanceof OptionHandler) { |
---|
909 | try { |
---|
910 | ((OptionHandler)clusterer).setOptions(savedOptions); |
---|
911 | Utils.checkForRemainingOptions(savedOptions); |
---|
912 | } |
---|
913 | catch (Exception e) { |
---|
914 | throw new Exception("Can't parse given options in " |
---|
915 | + "cross-validation!"); |
---|
916 | } |
---|
917 | } |
---|
918 | CvAv = crossValidateModel((DensityBasedClusterer)clusterer, data, numFolds, random); |
---|
919 | |
---|
920 | CvString.append("\n" + numFolds |
---|
921 | + " fold CV Log Likelihood: " |
---|
922 | + Utils.doubleToString(CvAv, 6, 4) |
---|
923 | + "\n"); |
---|
924 | return CvString.toString(); |
---|
925 | } |
---|
926 | |
---|
927 | |
---|
928 | // =============== |
---|
929 | // Private methods |
---|
930 | // =============== |
---|
931 | /** |
---|
932 | * Print the cluster statistics for either the training |
---|
933 | * or the testing data. |
---|
934 | * |
---|
935 | * @param clusterer the clusterer to use for generating statistics. |
---|
936 | * @param fileName the file to load |
---|
937 | * @return a string containing cluster statistics. |
---|
938 | * @throws Exception if statistics can't be generated. |
---|
939 | */ |
---|
940 | private static String printClusterStats (Clusterer clusterer, |
---|
941 | String fileName) |
---|
942 | throws Exception { |
---|
943 | StringBuffer text = new StringBuffer(); |
---|
944 | int i = 0; |
---|
945 | int cnum; |
---|
946 | double loglk = 0.0; |
---|
947 | int cc = clusterer.numberOfClusters(); |
---|
948 | double[] instanceStats = new double[cc]; |
---|
949 | int unclusteredInstances = 0; |
---|
950 | |
---|
951 | if (fileName.length() != 0) { |
---|
952 | DataSource source = new DataSource(fileName); |
---|
953 | Instances structure = source.getStructure(); |
---|
954 | Instance inst; |
---|
955 | while (source.hasMoreElements(structure)) { |
---|
956 | inst = source.nextElement(structure); |
---|
957 | try { |
---|
958 | cnum = clusterer.clusterInstance(inst); |
---|
959 | |
---|
960 | if (clusterer instanceof DensityBasedClusterer) { |
---|
961 | loglk += ((DensityBasedClusterer)clusterer). |
---|
962 | logDensityForInstance(inst); |
---|
963 | // temp = Utils.sum(dist); |
---|
964 | } |
---|
965 | instanceStats[cnum]++; |
---|
966 | } |
---|
967 | catch (Exception e) { |
---|
968 | unclusteredInstances++; |
---|
969 | } |
---|
970 | i++; |
---|
971 | } |
---|
972 | |
---|
973 | /* |
---|
974 | // count the actual number of used clusters |
---|
975 | int count = 0; |
---|
976 | for (i = 0; i < cc; i++) { |
---|
977 | if (instanceStats[i] > 0) { |
---|
978 | count++; |
---|
979 | } |
---|
980 | } |
---|
981 | if (count > 0) { |
---|
982 | double[] tempStats = new double [count]; |
---|
983 | count=0; |
---|
984 | for (i=0;i<cc;i++) { |
---|
985 | if (instanceStats[i] > 0) { |
---|
986 | tempStats[count++] = instanceStats[i]; |
---|
987 | } |
---|
988 | } |
---|
989 | instanceStats = tempStats; |
---|
990 | cc = instanceStats.length; |
---|
991 | } */ |
---|
992 | |
---|
993 | int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1); |
---|
994 | int numInstFieldWidth = (int)((Math.log(i)/Math.log(10))+1); |
---|
995 | double sum = Utils.sum(instanceStats); |
---|
996 | loglk /= sum; |
---|
997 | text.append("Clustered Instances\n"); |
---|
998 | |
---|
999 | for (i = 0; i < cc; i++) { |
---|
1000 | if (instanceStats[i] > 0) { |
---|
1001 | text.append(Utils.doubleToString((double)i, |
---|
1002 | clustFieldWidth, 0) |
---|
1003 | + " " |
---|
1004 | + Utils.doubleToString(instanceStats[i], |
---|
1005 | numInstFieldWidth, 0) |
---|
1006 | + " (" |
---|
1007 | + Utils.doubleToString((instanceStats[i]/sum*100.0) |
---|
1008 | , 3, 0) + "%)\n"); |
---|
1009 | } |
---|
1010 | } |
---|
1011 | if (unclusteredInstances > 0) { |
---|
1012 | text.append("\nUnclustered Instances : "+unclusteredInstances); |
---|
1013 | } |
---|
1014 | |
---|
1015 | if (clusterer instanceof DensityBasedClusterer) { |
---|
1016 | text.append("\n\nLog likelihood: " |
---|
1017 | + Utils.doubleToString(loglk, 1, 5) |
---|
1018 | + "\n"); |
---|
1019 | } |
---|
1020 | } |
---|
1021 | |
---|
1022 | return text.toString(); |
---|
1023 | } |
---|
1024 | |
---|
1025 | |
---|
1026 | /** |
---|
1027 | * Print the cluster assignments for either the training |
---|
1028 | * or the testing data. |
---|
1029 | * |
---|
1030 | * @param clusterer the clusterer to use for cluster assignments |
---|
1031 | * @param trainFileName the train file |
---|
1032 | * @param testFileName an optional test file |
---|
1033 | * @param attributesToOutput the attributes to print |
---|
1034 | * @return a string containing the instance indexes and cluster assigns. |
---|
1035 | * @throws Exception if cluster assignments can't be printed |
---|
1036 | */ |
---|
1037 | private static String printClusterings (Clusterer clusterer, String trainFileName, |
---|
1038 | String testFileName, Range attributesToOutput) |
---|
1039 | throws Exception { |
---|
1040 | |
---|
1041 | StringBuffer text = new StringBuffer(); |
---|
1042 | int i = 0; |
---|
1043 | int cnum; |
---|
1044 | DataSource source = null; |
---|
1045 | Instance inst; |
---|
1046 | Instances structure; |
---|
1047 | |
---|
1048 | if (testFileName.length() != 0) |
---|
1049 | source = new DataSource(testFileName); |
---|
1050 | else |
---|
1051 | source = new DataSource(trainFileName); |
---|
1052 | |
---|
1053 | structure = source.getStructure(); |
---|
1054 | while (source.hasMoreElements(structure)) { |
---|
1055 | inst = source.nextElement(structure); |
---|
1056 | try { |
---|
1057 | cnum = clusterer.clusterInstance(inst); |
---|
1058 | |
---|
1059 | text.append(i + " " + cnum + " " |
---|
1060 | + attributeValuesString(inst, attributesToOutput) + "\n"); |
---|
1061 | } |
---|
1062 | catch (Exception e) { |
---|
1063 | /* throw new Exception('\n' + "Unable to cluster instance\n" |
---|
1064 | + e.getMessage()); */ |
---|
1065 | text.append(i + " Unclustered " |
---|
1066 | + attributeValuesString(inst, attributesToOutput) + "\n"); |
---|
1067 | } |
---|
1068 | i++; |
---|
1069 | } |
---|
1070 | |
---|
1071 | return text.toString(); |
---|
1072 | } |
---|
1073 | |
---|
1074 | /** |
---|
1075 | * Builds a string listing the attribute values in a specified range of indices, |
---|
1076 | * separated by commas and enclosed in brackets. |
---|
1077 | * |
---|
1078 | * @param instance the instance to print the values from |
---|
1079 | * @param attRange the range of the attributes to list |
---|
1080 | * @return a string listing values of the attributes in the range |
---|
1081 | */ |
---|
1082 | private static String attributeValuesString(Instance instance, Range attRange) { |
---|
1083 | StringBuffer text = new StringBuffer(); |
---|
1084 | if (attRange != null) { |
---|
1085 | boolean firstOutput = true; |
---|
1086 | attRange.setUpper(instance.numAttributes() - 1); |
---|
1087 | for (int i=0; i<instance.numAttributes(); i++) |
---|
1088 | if (attRange.isInRange(i)) { |
---|
1089 | if (firstOutput) text.append("("); |
---|
1090 | else text.append(","); |
---|
1091 | text.append(instance.toString(i)); |
---|
1092 | firstOutput = false; |
---|
1093 | } |
---|
1094 | if (!firstOutput) text.append(")"); |
---|
1095 | } |
---|
1096 | return text.toString(); |
---|
1097 | } |
---|
1098 | |
---|
1099 | /** |
---|
1100 | * Make up the help string giving all the command line options |
---|
1101 | * |
---|
1102 | * @param clusterer the clusterer to include options for |
---|
1103 | * @return a string detailing the valid command line options |
---|
1104 | */ |
---|
1105 | private static String makeOptionString (Clusterer clusterer, |
---|
1106 | boolean globalInfo) { |
---|
1107 | StringBuffer optionsText = new StringBuffer(""); |
---|
1108 | // General options |
---|
1109 | optionsText.append("\n\nGeneral options:\n\n"); |
---|
1110 | optionsText.append("-h or -help\n"); |
---|
1111 | optionsText.append("\tOutput help information.\n"); |
---|
1112 | optionsText.append("-synopsis or -info\n"); |
---|
1113 | optionsText.append("\tOutput synopsis for clusterer (use in conjunction " |
---|
1114 | + " with -h)\n"); |
---|
1115 | optionsText.append("-t <name of training file>\n"); |
---|
1116 | optionsText.append("\tSets training file.\n"); |
---|
1117 | optionsText.append("-T <name of test file>\n"); |
---|
1118 | optionsText.append("\tSets test file.\n"); |
---|
1119 | optionsText.append("-l <name of input file>\n"); |
---|
1120 | optionsText.append("\tSets model input file.\n"); |
---|
1121 | optionsText.append("-d <name of output file>\n"); |
---|
1122 | optionsText.append("\tSets model output file.\n"); |
---|
1123 | optionsText.append("-p <attribute range>\n"); |
---|
1124 | optionsText.append("\tOutput predictions. Predictions are for " |
---|
1125 | + "training file" |
---|
1126 | + "\n\tif only training file is specified," |
---|
1127 | + "\n\totherwise predictions are for the test file." |
---|
1128 | + "\n\tThe range specifies attribute values to be output" |
---|
1129 | + "\n\twith the predictions. Use '-p 0' for none.\n"); |
---|
1130 | optionsText.append("-x <number of folds>\n"); |
---|
1131 | optionsText.append("\tOnly Distribution Clusterers can be cross validated.\n"); |
---|
1132 | optionsText.append("-s <random number seed>\n"); |
---|
1133 | optionsText.append("\tSets the seed for randomizing the data in cross-validation\n"); |
---|
1134 | optionsText.append("-c <class index>\n"); |
---|
1135 | optionsText.append("\tSet class attribute. If supplied, class is ignored"); |
---|
1136 | optionsText.append("\n\tduring clustering but is used in a classes to"); |
---|
1137 | optionsText.append("\n\tclusters evaluation.\n"); |
---|
1138 | if (clusterer instanceof Drawable) { |
---|
1139 | optionsText.append("-g <name of graph file>\n"); |
---|
1140 | optionsText.append("\tOutputs the graph representation of the clusterer to the file.\n"); |
---|
1141 | } |
---|
1142 | |
---|
1143 | // Get scheme-specific options |
---|
1144 | if (clusterer instanceof OptionHandler) { |
---|
1145 | optionsText.append("\nOptions specific to " |
---|
1146 | + clusterer.getClass().getName() + ":\n\n"); |
---|
1147 | Enumeration enu = ((OptionHandler)clusterer).listOptions(); |
---|
1148 | |
---|
1149 | while (enu.hasMoreElements()) { |
---|
1150 | Option option = (Option)enu.nextElement(); |
---|
1151 | optionsText.append(option.synopsis() + '\n'); |
---|
1152 | optionsText.append(option.description() + "\n"); |
---|
1153 | } |
---|
1154 | } |
---|
1155 | |
---|
1156 | // Get global information (if available) |
---|
1157 | if (globalInfo) { |
---|
1158 | try { |
---|
1159 | String gi = getGlobalInfo(clusterer); |
---|
1160 | optionsText.append(gi); |
---|
1161 | } catch (Exception ex) { |
---|
1162 | // quietly ignore |
---|
1163 | } |
---|
1164 | } |
---|
1165 | |
---|
1166 | return optionsText.toString(); |
---|
1167 | } |
---|
1168 | |
---|
1169 | /** |
---|
1170 | * Return the global info (if it exists) for the supplied clusterer |
---|
1171 | * |
---|
1172 | * @param clusterer the clusterer to get the global info for |
---|
1173 | * @return the global info (synopsis) for the clusterer |
---|
1174 | * @throws Exception if there is a problem reflecting on the clusterer |
---|
1175 | */ |
---|
1176 | protected static String getGlobalInfo(Clusterer clusterer) throws Exception { |
---|
1177 | BeanInfo bi = Introspector.getBeanInfo(clusterer.getClass()); |
---|
1178 | MethodDescriptor[] methods; |
---|
1179 | methods = bi.getMethodDescriptors(); |
---|
1180 | Object[] args = {}; |
---|
1181 | String result = "\nSynopsis for " + clusterer.getClass().getName() |
---|
1182 | + ":\n\n"; |
---|
1183 | |
---|
1184 | for (int i = 0; i < methods.length; i++) { |
---|
1185 | String name = methods[i].getDisplayName(); |
---|
1186 | Method meth = methods[i].getMethod(); |
---|
1187 | if (name.equals("globalInfo")) { |
---|
1188 | String globalInfo = (String)(meth.invoke(clusterer, args)); |
---|
1189 | result += globalInfo; |
---|
1190 | break; |
---|
1191 | } |
---|
1192 | } |
---|
1193 | |
---|
1194 | return result; |
---|
1195 | } |
---|
1196 | |
---|
1197 | /** |
---|
1198 | * Tests whether the current evaluation object is equal to another |
---|
1199 | * evaluation object |
---|
1200 | * |
---|
1201 | * @param obj the object to compare against |
---|
1202 | * @return true if the two objects are equal |
---|
1203 | */ |
---|
1204 | public boolean equals(Object obj) { |
---|
1205 | if ((obj == null) || !(obj.getClass().equals(this.getClass()))) |
---|
1206 | return false; |
---|
1207 | |
---|
1208 | ClusterEvaluation cmp = (ClusterEvaluation) obj; |
---|
1209 | |
---|
1210 | if ((m_classToCluster != null) != (cmp.m_classToCluster != null)) return false; |
---|
1211 | if (m_classToCluster != null) { |
---|
1212 | for (int i = 0; i < m_classToCluster.length; i++) { |
---|
1213 | if (m_classToCluster[i] != cmp.m_classToCluster[i]) |
---|
1214 | return false; |
---|
1215 | } |
---|
1216 | } |
---|
1217 | |
---|
1218 | if ((m_clusterAssignments != null) != (cmp.m_clusterAssignments != null)) return false; |
---|
1219 | if (m_clusterAssignments != null) { |
---|
1220 | for (int i = 0; i < m_clusterAssignments.length; i++) { |
---|
1221 | if (m_clusterAssignments[i] != cmp.m_clusterAssignments[i]) |
---|
1222 | return false; |
---|
1223 | } |
---|
1224 | } |
---|
1225 | |
---|
1226 | if (Double.isNaN(m_logL) != Double.isNaN(cmp.m_logL)) return false; |
---|
1227 | if (!Double.isNaN(m_logL)) { |
---|
1228 | if (m_logL != cmp.m_logL) return false; |
---|
1229 | } |
---|
1230 | |
---|
1231 | if (m_numClusters != cmp.m_numClusters) return false; |
---|
1232 | |
---|
1233 | // TODO: better comparison? via members? |
---|
1234 | String clusteringResults1 = m_clusteringResults.toString().replaceAll("Elapsed time.*", ""); |
---|
1235 | String clusteringResults2 = cmp.m_clusteringResults.toString().replaceAll("Elapsed time.*", ""); |
---|
1236 | if (!clusteringResults1.equals(clusteringResults2)) return false; |
---|
1237 | |
---|
1238 | return true; |
---|
1239 | } |
---|
1240 | |
---|
1241 | /** |
---|
1242 | * Returns the revision string. |
---|
1243 | * |
---|
1244 | * @return the revision |
---|
1245 | */ |
---|
1246 | public String getRevision() { |
---|
1247 | return RevisionUtils.extract("$Revision: 6021 $"); |
---|
1248 | } |
---|
1249 | |
---|
1250 | /** |
---|
1251 | * Main method for testing this class. |
---|
1252 | * |
---|
1253 | * @param args the options |
---|
1254 | */ |
---|
1255 | public static void main (String[] args) { |
---|
1256 | try { |
---|
1257 | if (args.length == 0) { |
---|
1258 | throw new Exception("The first argument must be the name of a " |
---|
1259 | + "clusterer"); |
---|
1260 | } |
---|
1261 | |
---|
1262 | String ClustererString = args[0]; |
---|
1263 | args[0] = ""; |
---|
1264 | Clusterer newClusterer = AbstractClusterer.forName(ClustererString, null); |
---|
1265 | System.out.println(evaluateClusterer(newClusterer, args)); |
---|
1266 | } |
---|
1267 | catch (Exception e) { |
---|
1268 | System.out.println(e.getMessage()); |
---|
1269 | } |
---|
1270 | } |
---|
1271 | } |
---|