[4] | 1 | /* |
---|
| 2 | * This program is free software; you can redistribute it and/or modify |
---|
| 3 | * it under the terms of the GNU General Public License as published by |
---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
---|
| 5 | * (at your option) any later version. |
---|
| 6 | * |
---|
| 7 | * This program is distributed in the hope that it will be useful, |
---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 10 | * GNU General Public License for more details. |
---|
| 11 | * |
---|
| 12 | * You should have received a copy of the GNU General Public License |
---|
| 13 | * along with this program; if not, write to the Free Software |
---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
| 15 | */ |
---|
| 16 | |
---|
| 17 | /* |
---|
| 18 | * ClassificationViaClustering.java |
---|
| 19 | * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand |
---|
| 20 | */ |
---|
| 21 | |
---|
| 22 | package weka.classifiers.meta; |
---|
| 23 | |
---|
| 24 | import weka.classifiers.Classifier; |
---|
| 25 | import weka.classifiers.AbstractClassifier; |
---|
| 26 | import weka.classifiers.rules.ZeroR; |
---|
| 27 | import weka.clusterers.ClusterEvaluation; |
---|
| 28 | import weka.clusterers.Clusterer; |
---|
| 29 | import weka.clusterers.AbstractClusterer; |
---|
| 30 | import weka.clusterers.SimpleKMeans; |
---|
| 31 | import weka.core.Capabilities; |
---|
| 32 | import weka.core.Instance; |
---|
| 33 | import weka.core.DenseInstance; |
---|
| 34 | import weka.core.Instances; |
---|
| 35 | import weka.core.Option; |
---|
| 36 | import weka.core.OptionHandler; |
---|
| 37 | import weka.core.RevisionUtils; |
---|
| 38 | import weka.core.Utils; |
---|
| 39 | import weka.core.Capabilities.Capability; |
---|
| 40 | |
---|
| 41 | import java.util.Enumeration; |
---|
| 42 | import java.util.Vector; |
---|
| 43 | |
---|
| 44 | /** |
---|
| 45 | <!-- globalinfo-start --> |
---|
| 46 | * A simple meta-classifier that uses a clusterer for classification. For cluster algorithms that use a fixed number of clusterers, like SimpleKMeans, the user has to make sure that the number of clusters to generate are the same as the number of class labels in the dataset in order to obtain a useful model.<br/> |
---|
| 47 | * <br/> |
---|
| 48 | * Note: at prediction time, a missing value is returned if no cluster is found for the instance.<br/> |
---|
| 49 | * <br/> |
---|
| 50 | * The code is based on the 'clusters to classes' functionality of the weka.clusterers.ClusterEvaluation class by Mark Hall. |
---|
| 51 | * <p/> |
---|
| 52 | <!-- globalinfo-end --> |
---|
| 53 | * |
---|
| 54 | <!-- options-start --> |
---|
| 55 | * Valid options are: <p/> |
---|
| 56 | * |
---|
| 57 | * <pre> -D |
---|
| 58 | * If set, classifier is run in debug mode and |
---|
| 59 | * may output additional info to the console</pre> |
---|
| 60 | * |
---|
| 61 | * <pre> -W |
---|
| 62 | * Full name of clusterer. |
---|
| 63 | * (default: weka.clusterers.SimpleKMeans)</pre> |
---|
| 64 | * |
---|
| 65 | * <pre> |
---|
| 66 | * Options specific to clusterer weka.clusterers.SimpleKMeans: |
---|
| 67 | * </pre> |
---|
| 68 | * |
---|
| 69 | * <pre> -N <num> |
---|
| 70 | * number of clusters. |
---|
| 71 | * (default 2).</pre> |
---|
| 72 | * |
---|
| 73 | * <pre> -V |
---|
| 74 | * Display std. deviations for centroids. |
---|
| 75 | * </pre> |
---|
| 76 | * |
---|
| 77 | * <pre> -M |
---|
| 78 | * Replace missing values with mean/mode. |
---|
| 79 | * </pre> |
---|
| 80 | * |
---|
| 81 | * <pre> -S <num> |
---|
| 82 | * Random number seed. |
---|
| 83 | * (default 10)</pre> |
---|
| 84 | * |
---|
| 85 | <!-- options-end --> |
---|
| 86 | * |
---|
| 87 | * @author fracpete (fracpete at waikato dot ac dot nz) |
---|
| 88 | * @version $Revision: 5987 $ |
---|
| 89 | */ |
---|
| 90 | public class ClassificationViaClustering |
---|
| 91 | extends AbstractClassifier { |
---|
| 92 | |
---|
| 93 | /** for serialization */ |
---|
| 94 | private static final long serialVersionUID = -5687069451420259135L; |
---|
| 95 | |
---|
| 96 | /** the cluster algorithm used (template) */ |
---|
| 97 | protected Clusterer m_Clusterer; |
---|
| 98 | |
---|
| 99 | /** the actual cluster algorithm being used */ |
---|
| 100 | protected Clusterer m_ActualClusterer; |
---|
| 101 | |
---|
| 102 | /** the original training data header */ |
---|
| 103 | protected Instances m_OriginalHeader; |
---|
| 104 | |
---|
| 105 | /** the modified training data header */ |
---|
| 106 | protected Instances m_ClusteringHeader; |
---|
| 107 | |
---|
| 108 | /** the mapping between clusters and classes */ |
---|
| 109 | protected double[] m_ClustersToClasses; |
---|
| 110 | |
---|
| 111 | /** the default model */ |
---|
| 112 | protected Classifier m_ZeroR; |
---|
| 113 | |
---|
| 114 | /** |
---|
| 115 | * default constructor |
---|
| 116 | */ |
---|
| 117 | public ClassificationViaClustering() { |
---|
| 118 | super(); |
---|
| 119 | |
---|
| 120 | m_Clusterer = new SimpleKMeans(); |
---|
| 121 | } |
---|
| 122 | |
---|
| 123 | /** |
---|
| 124 | * Returns a string describing classifier |
---|
| 125 | * |
---|
| 126 | * @return a description suitable for displaying in the |
---|
| 127 | * explorer/experimenter gui |
---|
| 128 | */ |
---|
| 129 | public String globalInfo() { |
---|
| 130 | return |
---|
| 131 | "A simple meta-classifier that uses a clusterer for classification. " |
---|
| 132 | + "For cluster algorithms that use a fixed number of clusterers, like " |
---|
| 133 | + "SimpleKMeans, the user has to make sure that the number of clusters " |
---|
| 134 | + "to generate are the same as the number of class labels in the dataset " |
---|
| 135 | + "in order to obtain a useful model.\n" |
---|
| 136 | + "\n" |
---|
| 137 | + "Note: at prediction time, a missing value is returned if no cluster " |
---|
| 138 | + "is found for the instance.\n" |
---|
| 139 | + "\n" |
---|
| 140 | + "The code is based on the 'clusters to classes' functionality of the " |
---|
| 141 | + "weka.clusterers.ClusterEvaluation class by Mark Hall."; |
---|
| 142 | } |
---|
| 143 | |
---|
| 144 | /** |
---|
| 145 | * Gets an enumeration describing the available options. |
---|
| 146 | * |
---|
| 147 | * @return an enumeration of all the available options. |
---|
| 148 | */ |
---|
| 149 | public Enumeration listOptions(){ |
---|
| 150 | Vector result; |
---|
| 151 | Enumeration enm; |
---|
| 152 | |
---|
| 153 | result = new Vector(); |
---|
| 154 | |
---|
| 155 | enm = super.listOptions(); |
---|
| 156 | while (enm.hasMoreElements()) |
---|
| 157 | result.addElement(enm.nextElement()); |
---|
| 158 | |
---|
| 159 | result.addElement(new Option( |
---|
| 160 | "\tFull name of clusterer.\n" |
---|
| 161 | + "\t(default: " + defaultClustererString() +")", |
---|
| 162 | "W", 1, "-W")); |
---|
| 163 | |
---|
| 164 | result.addElement(new Option( |
---|
| 165 | "", |
---|
| 166 | "", 0, "\nOptions specific to clusterer " |
---|
| 167 | + m_Clusterer.getClass().getName() + ":")); |
---|
| 168 | enm = ((OptionHandler) m_Clusterer).listOptions(); |
---|
| 169 | while (enm.hasMoreElements()) |
---|
| 170 | result.addElement(enm.nextElement()); |
---|
| 171 | |
---|
| 172 | return result.elements(); |
---|
| 173 | } |
---|
| 174 | |
---|
| 175 | /** |
---|
| 176 | * returns the options of the current setup |
---|
| 177 | * |
---|
| 178 | * @return the current options |
---|
| 179 | */ |
---|
| 180 | public String[] getOptions(){ |
---|
| 181 | int i; |
---|
| 182 | Vector<String> result; |
---|
| 183 | String[] options; |
---|
| 184 | |
---|
| 185 | result = new Vector<String>(); |
---|
| 186 | |
---|
| 187 | result.add("-W"); |
---|
| 188 | result.add("" + getClusterer().getClass().getName()); |
---|
| 189 | |
---|
| 190 | options = super.getOptions(); |
---|
| 191 | for (i = 0; i < options.length; i++) |
---|
| 192 | result.add(options[i]); |
---|
| 193 | |
---|
| 194 | if (getClusterer() instanceof OptionHandler) { |
---|
| 195 | result.add("--"); |
---|
| 196 | options = ((OptionHandler) getClusterer()).getOptions(); |
---|
| 197 | for (i = 0; i < options.length; i++) |
---|
| 198 | result.add(options[i]); |
---|
| 199 | } |
---|
| 200 | |
---|
| 201 | return result.toArray(new String[result.size()]); |
---|
| 202 | } |
---|
| 203 | |
---|
| 204 | /** |
---|
| 205 | * Parses the options for this object. <p/> |
---|
| 206 | * |
---|
| 207 | <!-- options-start --> |
---|
| 208 | * Valid options are: <p/> |
---|
| 209 | * |
---|
| 210 | * <pre> -D |
---|
| 211 | * If set, classifier is run in debug mode and |
---|
| 212 | * may output additional info to the console</pre> |
---|
| 213 | * |
---|
| 214 | * <pre> -W |
---|
| 215 | * Full name of clusterer. |
---|
| 216 | * (default: weka.clusterers.SimpleKMeans)</pre> |
---|
| 217 | * |
---|
| 218 | * <pre> |
---|
| 219 | * Options specific to clusterer weka.clusterers.SimpleKMeans: |
---|
| 220 | * </pre> |
---|
| 221 | * |
---|
| 222 | * <pre> -N <num> |
---|
| 223 | * number of clusters. |
---|
| 224 | * (default 2).</pre> |
---|
| 225 | * |
---|
| 226 | * <pre> -V |
---|
| 227 | * Display std. deviations for centroids. |
---|
| 228 | * </pre> |
---|
| 229 | * |
---|
| 230 | * <pre> -M |
---|
| 231 | * Replace missing values with mean/mode. |
---|
| 232 | * </pre> |
---|
| 233 | * |
---|
| 234 | * <pre> -S <num> |
---|
| 235 | * Random number seed. |
---|
| 236 | * (default 10)</pre> |
---|
| 237 | * |
---|
| 238 | <!-- options-end --> |
---|
| 239 | * |
---|
| 240 | * @param options the options to use |
---|
| 241 | * @throws Exception if setting of options fails |
---|
| 242 | */ |
---|
| 243 | public void setOptions(String[] options) throws Exception { |
---|
| 244 | String tmpStr; |
---|
| 245 | |
---|
| 246 | super.setOptions(options); |
---|
| 247 | |
---|
| 248 | tmpStr = Utils.getOption('W', options); |
---|
| 249 | if (tmpStr.length() > 0) { |
---|
| 250 | // This is just to set the classifier in case the option |
---|
| 251 | // parsing fails. |
---|
| 252 | setClusterer(AbstractClusterer.forName(tmpStr, null)); |
---|
| 253 | setClusterer(AbstractClusterer.forName(tmpStr, Utils.partitionOptions(options))); |
---|
| 254 | } |
---|
| 255 | else { |
---|
| 256 | // This is just to set the classifier in case the option |
---|
| 257 | // parsing fails. |
---|
| 258 | setClusterer(AbstractClusterer.forName(defaultClustererString(), null)); |
---|
| 259 | setClusterer(AbstractClusterer.forName(defaultClustererString(), Utils.partitionOptions(options))); |
---|
| 260 | } |
---|
| 261 | } |
---|
| 262 | |
---|
| 263 | /** |
---|
| 264 | * String describing default clusterer. |
---|
| 265 | * |
---|
| 266 | * @return the classname |
---|
| 267 | */ |
---|
| 268 | protected String defaultClustererString() { |
---|
| 269 | return SimpleKMeans.class.getName(); |
---|
| 270 | } |
---|
| 271 | |
---|
| 272 | /** |
---|
| 273 | * Returns the tip text for this property |
---|
| 274 | * |
---|
| 275 | * @return tip text for this property suitable for |
---|
| 276 | * displaying in the explorer/experimenter gui |
---|
| 277 | */ |
---|
| 278 | public String clustererTipText() { |
---|
| 279 | return "The clusterer to be used."; |
---|
| 280 | } |
---|
| 281 | |
---|
| 282 | /** |
---|
| 283 | * Set the base clusterer. |
---|
| 284 | * |
---|
| 285 | * @param value the clusterer to use. |
---|
| 286 | */ |
---|
| 287 | public void setClusterer(Clusterer value) { |
---|
| 288 | m_Clusterer = value; |
---|
| 289 | } |
---|
| 290 | |
---|
| 291 | /** |
---|
| 292 | * Get the clusterer used as the base learner. |
---|
| 293 | * |
---|
| 294 | * @return the current clusterer |
---|
| 295 | */ |
---|
| 296 | public Clusterer getClusterer() { |
---|
| 297 | return m_Clusterer; |
---|
| 298 | } |
---|
| 299 | |
---|
| 300 | /** |
---|
| 301 | * Classifies the given test instance. |
---|
| 302 | * |
---|
| 303 | * @param instance the instance to be classified |
---|
| 304 | * @return the predicted most likely class for the instance or |
---|
| 305 | * Utils.missingValue() if no prediction is made |
---|
| 306 | * @throws Exception if an error occurred during the prediction |
---|
| 307 | */ |
---|
| 308 | public double classifyInstance(Instance instance) throws Exception { |
---|
| 309 | double result; |
---|
| 310 | double[] values; |
---|
| 311 | Instance newInst; |
---|
| 312 | int i; |
---|
| 313 | int n; |
---|
| 314 | |
---|
| 315 | if (m_ZeroR != null) { |
---|
| 316 | result = m_ZeroR.classifyInstance(instance); |
---|
| 317 | } |
---|
| 318 | else { |
---|
| 319 | if (m_ActualClusterer != null) { |
---|
| 320 | // build new instance |
---|
| 321 | values = new double[m_ClusteringHeader.numAttributes()]; |
---|
| 322 | n = 0; |
---|
| 323 | for (i = 0; i < instance.numAttributes(); i++) { |
---|
| 324 | if (i == instance.classIndex()) |
---|
| 325 | continue; |
---|
| 326 | values[n] = instance.value(i); |
---|
| 327 | n++; |
---|
| 328 | } |
---|
| 329 | newInst = new DenseInstance(instance.weight(), values); |
---|
| 330 | newInst.setDataset(m_ClusteringHeader); |
---|
| 331 | |
---|
| 332 | // determine cluster/class |
---|
| 333 | result = m_ClustersToClasses[m_ActualClusterer.clusterInstance(newInst)]; |
---|
| 334 | if (result == -1) |
---|
| 335 | result = Utils.missingValue(); |
---|
| 336 | } |
---|
| 337 | else { |
---|
| 338 | result = Utils.missingValue(); |
---|
| 339 | } |
---|
| 340 | } |
---|
| 341 | |
---|
| 342 | return result; |
---|
| 343 | } |
---|
| 344 | |
---|
| 345 | /** |
---|
| 346 | * Returns default capabilities of the classifier. |
---|
| 347 | * |
---|
| 348 | * @return the capabilities of this classifier |
---|
| 349 | */ |
---|
| 350 | public Capabilities getCapabilities() { |
---|
| 351 | Capabilities result; |
---|
| 352 | |
---|
| 353 | result = m_Clusterer.getCapabilities(); |
---|
| 354 | |
---|
| 355 | // class |
---|
| 356 | result.disableAllClasses(); |
---|
| 357 | result.disable(Capability.NO_CLASS); |
---|
| 358 | result.enable(Capability.NOMINAL_CLASS); |
---|
| 359 | |
---|
| 360 | return result; |
---|
| 361 | } |
---|
| 362 | |
---|
| 363 | /** |
---|
| 364 | * builds the classifier |
---|
| 365 | * |
---|
| 366 | * @param data the training instances |
---|
| 367 | * @throws Exception if something goes wrong |
---|
| 368 | */ |
---|
| 369 | public void buildClassifier(Instances data) throws Exception { |
---|
| 370 | Instances clusterData; |
---|
| 371 | ClusterEvaluation eval; |
---|
| 372 | int i; |
---|
| 373 | Instance instance; |
---|
| 374 | int[][] counts; |
---|
| 375 | int[] clusterTotals; |
---|
| 376 | double[] best; |
---|
| 377 | double[] current; |
---|
| 378 | double[] clusterAssignments; |
---|
| 379 | |
---|
| 380 | // can classifier handle the data? |
---|
| 381 | getCapabilities().testWithFail(data); |
---|
| 382 | |
---|
| 383 | // remove instances with missing class |
---|
| 384 | data = new Instances(data); |
---|
| 385 | data.deleteWithMissingClass(); |
---|
| 386 | |
---|
| 387 | // save original header (needed for clusters to classes output) |
---|
| 388 | m_OriginalHeader = new Instances(data, 0); |
---|
| 389 | |
---|
| 390 | // remove class attribute for clusterer |
---|
| 391 | clusterData = new Instances(data); |
---|
| 392 | clusterData.setClassIndex(-1); |
---|
| 393 | clusterData.deleteAttributeAt(m_OriginalHeader.classIndex()); |
---|
| 394 | m_ClusteringHeader = new Instances(clusterData, 0); |
---|
| 395 | |
---|
| 396 | if (m_ClusteringHeader.numAttributes() == 0) { |
---|
| 397 | System.err.println("Data contains only class attribute, defaulting to ZeroR model."); |
---|
| 398 | m_ZeroR = new ZeroR(); |
---|
| 399 | m_ZeroR.buildClassifier(data); |
---|
| 400 | } |
---|
| 401 | else { |
---|
| 402 | m_ZeroR = null; |
---|
| 403 | |
---|
| 404 | // build clusterer |
---|
| 405 | m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer); |
---|
| 406 | m_ActualClusterer.buildClusterer(clusterData); |
---|
| 407 | |
---|
| 408 | // evaluate clusterer on training set |
---|
| 409 | eval = new ClusterEvaluation(); |
---|
| 410 | eval.setClusterer(m_ActualClusterer); |
---|
| 411 | eval.evaluateClusterer(clusterData); |
---|
| 412 | clusterAssignments = eval.getClusterAssignments(); |
---|
| 413 | |
---|
| 414 | // determine classes-to-clusters mapping |
---|
| 415 | counts = new int [eval.getNumClusters()][m_OriginalHeader.numClasses()]; |
---|
| 416 | clusterTotals = new int[eval.getNumClusters()]; |
---|
| 417 | best = new double[eval.getNumClusters()+1]; |
---|
| 418 | current = new double[eval.getNumClusters()+1]; |
---|
| 419 | for (i = 0; i < data.numInstances(); i++) { |
---|
| 420 | instance = data.instance(i); |
---|
| 421 | counts[(int) clusterAssignments[i]][(int) instance.classValue()]++; |
---|
| 422 | clusterTotals[(int) clusterAssignments[i]]++; |
---|
| 423 | i++; |
---|
| 424 | } |
---|
| 425 | best[eval.getNumClusters()] = Double.MAX_VALUE; |
---|
| 426 | ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0); |
---|
| 427 | m_ClustersToClasses = new double[best.length]; |
---|
| 428 | System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length); |
---|
| 429 | } |
---|
| 430 | } |
---|
| 431 | |
---|
| 432 | /** |
---|
| 433 | * Returns a string representation of the classifier. |
---|
| 434 | * |
---|
| 435 | * @return a string representation of the classifier. |
---|
| 436 | */ |
---|
| 437 | public String toString() { |
---|
| 438 | StringBuffer result; |
---|
| 439 | int i; |
---|
| 440 | int n; |
---|
| 441 | boolean found; |
---|
| 442 | |
---|
| 443 | result = new StringBuffer(); |
---|
| 444 | |
---|
| 445 | // title |
---|
| 446 | result.append(this.getClass().getName().replaceAll(".*\\.", "") + "\n"); |
---|
| 447 | result.append(this.getClass().getName().replaceAll(".*\\.", "").replaceAll(".", "=") + "\n"); |
---|
| 448 | |
---|
| 449 | // model |
---|
| 450 | if (m_ActualClusterer != null) { |
---|
| 451 | // output clusterer |
---|
| 452 | result.append(m_ActualClusterer + "\n"); |
---|
| 453 | |
---|
| 454 | // clusters to classes |
---|
| 455 | result.append("Clusters to classes mapping:\n"); |
---|
| 456 | for (i = 0; i < m_ClustersToClasses.length - 1; i++) { |
---|
| 457 | result.append(" " + (i+1) + ". Cluster: "); |
---|
| 458 | if (m_ClustersToClasses[i] < 0) |
---|
| 459 | result.append("no class"); |
---|
| 460 | else |
---|
| 461 | result.append( |
---|
| 462 | m_OriginalHeader.classAttribute().value((int) m_ClustersToClasses[i]) |
---|
| 463 | + " (" + ((int) m_ClustersToClasses[i] + 1) + ")"); |
---|
| 464 | result.append("\n"); |
---|
| 465 | } |
---|
| 466 | result.append("\n"); |
---|
| 467 | |
---|
| 468 | // classes to clusters |
---|
| 469 | result.append("Classes to clusters mapping:\n"); |
---|
| 470 | for (i = 0; i < m_OriginalHeader.numClasses(); i++) { |
---|
| 471 | result.append( |
---|
| 472 | " " + (i+1) + ". Class (" |
---|
| 473 | + m_OriginalHeader.classAttribute().value(i) + "): "); |
---|
| 474 | |
---|
| 475 | found = false; |
---|
| 476 | for (n = 0; n < m_ClustersToClasses.length - 1; n++) { |
---|
| 477 | if (((int) m_ClustersToClasses[n]) == i) { |
---|
| 478 | found = true; |
---|
| 479 | result.append((n+1) + ". Cluster"); |
---|
| 480 | break; |
---|
| 481 | } |
---|
| 482 | } |
---|
| 483 | |
---|
| 484 | if (!found) |
---|
| 485 | result.append("no cluster"); |
---|
| 486 | |
---|
| 487 | result.append("\n"); |
---|
| 488 | } |
---|
| 489 | |
---|
| 490 | result.append("\n"); |
---|
| 491 | } |
---|
| 492 | else { |
---|
| 493 | result.append("no model built yet\n"); |
---|
| 494 | } |
---|
| 495 | |
---|
| 496 | return result.toString(); |
---|
| 497 | } |
---|
| 498 | |
---|
| 499 | /** |
---|
| 500 | * Returns the revision string. |
---|
| 501 | * |
---|
| 502 | * @return the revision |
---|
| 503 | */ |
---|
| 504 | public String getRevision() { |
---|
| 505 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
| 506 | } |
---|
| 507 | |
---|
| 508 | /** |
---|
| 509 | * Runs the classifier with the given options |
---|
| 510 | * |
---|
| 511 | * @param args the commandline options |
---|
| 512 | */ |
---|
| 513 | public static void main(String[] args) { |
---|
| 514 | runClassifier(new ClassificationViaClustering(), args); |
---|
| 515 | } |
---|
| 516 | } |
---|
| 517 | |
---|