[29] | 1 | /* |
---|
| 2 | * This program is free software; you can redistribute it and/or modify |
---|
| 3 | * it under the terms of the GNU General Public License as published by |
---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
---|
| 5 | * (at your option) any later version. |
---|
| 6 | * |
---|
| 7 | * This program is distributed in the hope that it will be useful, |
---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 10 | * GNU General Public License for more details. |
---|
| 11 | * |
---|
| 12 | * You should have received a copy of the GNU General Public License |
---|
| 13 | * along with this program; if not, write to the Free Software |
---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
| 15 | */ |
---|
| 16 | |
---|
| 17 | /* |
---|
| 18 | * Cobweb.java |
---|
| 19 | * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand |
---|
| 20 | * |
---|
| 21 | */ |
---|
| 22 | |
---|
| 23 | package weka.clusterers; |
---|
| 24 | |
---|
| 25 | import weka.core.AttributeStats; |
---|
| 26 | import weka.core.Capabilities; |
---|
| 27 | import weka.core.Drawable; |
---|
| 28 | import weka.core.FastVector; |
---|
| 29 | import weka.core.Instance; |
---|
| 30 | import weka.core.Instances; |
---|
| 31 | import weka.core.Option; |
---|
| 32 | import weka.core.RevisionHandler; |
---|
| 33 | import weka.core.RevisionUtils; |
---|
| 34 | import weka.core.TechnicalInformation; |
---|
| 35 | import weka.core.TechnicalInformationHandler; |
---|
| 36 | import weka.core.Utils; |
---|
| 37 | import weka.core.Capabilities.Capability; |
---|
| 38 | import weka.core.TechnicalInformation.Field; |
---|
| 39 | import weka.core.TechnicalInformation.Type; |
---|
| 40 | import weka.experiment.Stats; |
---|
| 41 | import weka.filters.Filter; |
---|
| 42 | import weka.filters.unsupervised.attribute.Add; |
---|
| 43 | |
---|
| 44 | import java.io.Serializable; |
---|
| 45 | import java.util.Enumeration; |
---|
| 46 | import java.util.Random; |
---|
| 47 | import java.util.Vector; |
---|
| 48 | |
---|
| 49 | /** |
---|
| 50 | <!-- globalinfo-start --> |
---|
| 51 | * Class implementing the Cobweb and Classit clustering algorithms.<br/> |
---|
| 52 | * <br/> |
---|
| 53 | * Note: the application of node operators (merging, splitting etc.) in terms of ordering and priority differs (and is somewhat ambiguous) between the original Cobweb and Classit papers. This algorithm always compares the best host, adding a new leaf, merging the two best hosts, and splitting the best host when considering where to place a new instance.<br/> |
---|
| 54 | * <br/> |
---|
| 55 | * For more information see:<br/> |
---|
| 56 | * <br/> |
---|
| 57 | * D. Fisher (1987). Knowledge acquisition via incremental conceptual clustering. Machine Learning. 2(2):139-172.<br/> |
---|
| 58 | * <br/> |
---|
| 59 | * J. H. Gennari, P. Langley, D. Fisher (1990). Models of incremental concept formation. Artificial Intelligence. 40:11-61. |
---|
| 60 | * <p/> |
---|
| 61 | <!-- globalinfo-end --> |
---|
| 62 | * |
---|
| 63 | <!-- technical-bibtex-start --> |
---|
| 64 | * BibTeX: |
---|
| 65 | * <pre> |
---|
| 66 | * @article{Fisher1987, |
---|
| 67 | * author = {D. Fisher}, |
---|
| 68 | * journal = {Machine Learning}, |
---|
| 69 | * number = {2}, |
---|
| 70 | * pages = {139-172}, |
---|
| 71 | * title = {Knowledge acquisition via incremental conceptual clustering}, |
---|
| 72 | * volume = {2}, |
---|
| 73 | * year = {1987} |
---|
| 74 | * } |
---|
| 75 | * |
---|
| 76 | * @article{Gennari1990, |
---|
| 77 | * author = {J. H. Gennari and P. Langley and D. Fisher}, |
---|
| 78 | * journal = {Artificial Intelligence}, |
---|
| 79 | * pages = {11-61}, |
---|
| 80 | * title = {Models of incremental concept formation}, |
---|
| 81 | * volume = {40}, |
---|
| 82 | * year = {1990} |
---|
| 83 | * } |
---|
| 84 | * </pre> |
---|
| 85 | * <p/> |
---|
| 86 | <!-- technical-bibtex-end --> |
---|
| 87 | * |
---|
| 88 | <!-- options-start --> |
---|
| 89 | * Valid options are: <p/> |
---|
| 90 | * |
---|
| 91 | * <pre> -A <acuity> |
---|
| 92 | * Acuity. |
---|
| 93 | * (default=1.0)</pre> |
---|
| 94 | * |
---|
| 95 | * <pre> -C <cutoff> |
---|
| 96 | * Cutoff. |
---|
| 97 | * (default=0.002)</pre> |
---|
| 98 | * |
---|
| 99 | * <pre> -S <num> |
---|
| 100 | * Random number seed. |
---|
| 101 | * (default 42)</pre> |
---|
| 102 | * |
---|
| 103 | <!-- options-end --> |
---|
| 104 | * |
---|
| 105 | * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a> |
---|
| 106 | * @version $Revision: 5488 $ |
---|
| 107 | * @see RandomizableClusterer |
---|
| 108 | * @see Drawable |
---|
| 109 | */ |
---|
| 110 | public class Cobweb |
---|
| 111 | extends RandomizableClusterer |
---|
| 112 | implements Drawable, TechnicalInformationHandler, UpdateableClusterer { |
---|
| 113 | |
---|
| 114 | /** for serialization */ |
---|
| 115 | static final long serialVersionUID = 928406656495092318L; |
---|
| 116 | |
---|
| 117 | /** |
---|
| 118 | * Inner class handling node operations for Cobweb. |
---|
| 119 | * |
---|
| 120 | * @see Serializable |
---|
| 121 | */ |
---|
| 122 | private class CNode |
---|
| 123 | implements Serializable, RevisionHandler { |
---|
| 124 | |
---|
| 125 | /** for serialization */ |
---|
| 126 | static final long serialVersionUID = 3452097436933325631L; |
---|
| 127 | /** |
---|
| 128 | * Within cluster attribute statistics |
---|
| 129 | */ |
---|
| 130 | private AttributeStats[] m_attStats; |
---|
| 131 | |
---|
| 132 | /** |
---|
| 133 | * Number of attributes |
---|
| 134 | */ |
---|
| 135 | private int m_numAttributes; |
---|
| 136 | |
---|
| 137 | /** |
---|
| 138 | * Instances at this node |
---|
| 139 | */ |
---|
| 140 | protected Instances m_clusterInstances = null; |
---|
| 141 | |
---|
| 142 | /** |
---|
| 143 | * Children of this node |
---|
| 144 | */ |
---|
| 145 | private FastVector m_children = null; |
---|
| 146 | |
---|
| 147 | /** |
---|
| 148 | * Total instances at this node |
---|
| 149 | */ |
---|
| 150 | private double m_totalInstances = 0.0; |
---|
| 151 | |
---|
| 152 | /** |
---|
| 153 | * Cluster number of this node |
---|
| 154 | */ |
---|
| 155 | private int m_clusterNum = -1; |
---|
| 156 | |
---|
| 157 | /** |
---|
| 158 | * Creates an empty <code>CNode</code> instance. |
---|
| 159 | * |
---|
| 160 | * @param numAttributes the number of attributes in the data |
---|
| 161 | */ |
---|
| 162 | public CNode(int numAttributes) { |
---|
| 163 | m_numAttributes = numAttributes; |
---|
| 164 | } |
---|
| 165 | |
---|
| 166 | /** |
---|
| 167 | * Creates a new leaf <code>CNode</code> instance. |
---|
| 168 | * |
---|
| 169 | * @param numAttributes the number of attributes in the data |
---|
| 170 | * @param leafInstance the instance to store at this leaf |
---|
| 171 | */ |
---|
| 172 | public CNode(int numAttributes, Instance leafInstance) { |
---|
| 173 | this(numAttributes); |
---|
| 174 | if (m_clusterInstances == null) { |
---|
| 175 | m_clusterInstances = new Instances(leafInstance.dataset(), 1); |
---|
| 176 | } |
---|
| 177 | m_clusterInstances.add(leafInstance); |
---|
| 178 | updateStats(leafInstance, false); |
---|
| 179 | } |
---|
| 180 | |
---|
| 181 | /** |
---|
| 182 | * Adds an instance to this cluster. |
---|
| 183 | * |
---|
| 184 | * @param newInstance the instance to add |
---|
| 185 | * @throws Exception if an error occurs |
---|
| 186 | */ |
---|
| 187 | protected void addInstance(Instance newInstance) throws Exception { |
---|
| 188 | // Add the instance to this cluster |
---|
| 189 | |
---|
| 190 | if (m_clusterInstances == null) { |
---|
| 191 | m_clusterInstances = new Instances(newInstance.dataset(), 1); |
---|
| 192 | m_clusterInstances.add(newInstance); |
---|
| 193 | updateStats(newInstance, false); |
---|
| 194 | return; |
---|
| 195 | } else if (m_children == null) { |
---|
| 196 | /* we are a leaf, so make our existing instance(s) into a child |
---|
| 197 | and then add the new instance as a child */ |
---|
| 198 | m_children = new FastVector(); |
---|
| 199 | CNode tempSubCluster = new CNode(m_numAttributes, |
---|
| 200 | m_clusterInstances.instance(0)); |
---|
| 201 | |
---|
| 202 | // System.out.println("Dumping "+m_clusterInstances.numInstances()); |
---|
| 203 | for (int i = 1; i < m_clusterInstances.numInstances(); i++) { |
---|
| 204 | tempSubCluster.m_clusterInstances. |
---|
| 205 | add(m_clusterInstances.instance(i)); |
---|
| 206 | tempSubCluster.updateStats(m_clusterInstances.instance(i), false); |
---|
| 207 | } |
---|
| 208 | m_children = new FastVector(); |
---|
| 209 | m_children.addElement(tempSubCluster); |
---|
| 210 | m_children.addElement(new CNode(m_numAttributes, newInstance)); |
---|
| 211 | |
---|
| 212 | m_clusterInstances.add(newInstance); |
---|
| 213 | updateStats(newInstance, false); |
---|
| 214 | |
---|
| 215 | // here is where we check against cutoff (also check cutoff |
---|
| 216 | // in findHost) |
---|
| 217 | if (categoryUtility() < m_cutoff) { |
---|
| 218 | // System.out.println("Cutting (leaf add) "); |
---|
| 219 | m_children = null; |
---|
| 220 | } |
---|
| 221 | return; |
---|
| 222 | } |
---|
| 223 | |
---|
| 224 | // otherwise, find the best host for this instance |
---|
| 225 | CNode bestHost = findHost(newInstance, false); |
---|
| 226 | if (bestHost != null) { |
---|
| 227 | // now add to the best host |
---|
| 228 | bestHost.addInstance(newInstance); |
---|
| 229 | } |
---|
| 230 | } |
---|
| 231 | |
---|
| 232 | /** |
---|
| 233 | * Temporarily adds a new instance to each of this nodes children |
---|
| 234 | * in turn and computes the category utility. |
---|
| 235 | * |
---|
| 236 | * @param newInstance the new instance to evaluate |
---|
| 237 | * @return an array of category utility values---the result of considering |
---|
| 238 | * each child in turn as a host for the new instance |
---|
| 239 | * @throws Exception if an error occurs |
---|
| 240 | */ |
---|
| 241 | private double[] cuScoresForChildren(Instance newInstance) |
---|
| 242 | throws Exception { |
---|
| 243 | // look for a host in existing children |
---|
| 244 | double[] categoryUtils = new double [m_children.size()]; |
---|
| 245 | |
---|
| 246 | // look for a home for this instance in the existing children |
---|
| 247 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 248 | CNode temp = (CNode) m_children.elementAt(i); |
---|
| 249 | // tentitively add the new instance to this child |
---|
| 250 | temp.updateStats(newInstance, false); |
---|
| 251 | categoryUtils[i] = categoryUtility(); |
---|
| 252 | |
---|
| 253 | // remove the new instance from this child |
---|
| 254 | temp.updateStats(newInstance, true); |
---|
| 255 | } |
---|
| 256 | return categoryUtils; |
---|
| 257 | } |
---|
| 258 | |
---|
| 259 | private double cuScoreForBestTwoMerged(CNode merged, |
---|
| 260 | CNode a, CNode b, |
---|
| 261 | Instance newInstance) |
---|
| 262 | throws Exception { |
---|
| 263 | |
---|
| 264 | double mergedCU = -Double.MAX_VALUE; |
---|
| 265 | // consider merging the best and second |
---|
| 266 | // best. |
---|
| 267 | merged.m_clusterInstances = new Instances(m_clusterInstances, 1); |
---|
| 268 | |
---|
| 269 | merged.addChildNode(a); |
---|
| 270 | merged.addChildNode(b); |
---|
| 271 | merged.updateStats(newInstance, false); // add new instance to stats |
---|
| 272 | // remove the best and second best nodes |
---|
| 273 | m_children.removeElementAt(m_children.indexOf(a)); |
---|
| 274 | m_children.removeElementAt(m_children.indexOf(b)); |
---|
| 275 | m_children.addElement(merged); |
---|
| 276 | mergedCU = categoryUtility(); |
---|
| 277 | // restore the status quo |
---|
| 278 | merged.updateStats(newInstance, true); |
---|
| 279 | m_children.removeElementAt(m_children.indexOf(merged)); |
---|
| 280 | m_children.addElement(a); |
---|
| 281 | m_children.addElement(b); |
---|
| 282 | return mergedCU; |
---|
| 283 | } |
---|
| 284 | |
---|
| 285 | /** |
---|
| 286 | * Finds a host for the new instance in this nodes children. Also |
---|
| 287 | * considers merging the two best hosts and splitting the best host. |
---|
| 288 | * |
---|
| 289 | * @param newInstance the instance to find a host for |
---|
| 290 | * @param structureFrozen true if the instance is not to be added to |
---|
| 291 | * the tree and instead the best potential host is to be returned |
---|
| 292 | * @return the best host |
---|
| 293 | * @throws Exception if an error occurs |
---|
| 294 | */ |
---|
| 295 | private CNode findHost(Instance newInstance, |
---|
| 296 | boolean structureFrozen) throws Exception { |
---|
| 297 | |
---|
| 298 | if (!structureFrozen) { |
---|
| 299 | updateStats(newInstance, false); |
---|
| 300 | } |
---|
| 301 | |
---|
| 302 | // look for a host in existing children and also consider as a new leaf |
---|
| 303 | double[] categoryUtils = cuScoresForChildren(newInstance); |
---|
| 304 | |
---|
| 305 | // make a temporary new leaf for this instance and get CU |
---|
| 306 | CNode newLeaf = new CNode(m_numAttributes, newInstance); |
---|
| 307 | m_children.addElement(newLeaf); |
---|
| 308 | double bestHostCU = categoryUtility(); |
---|
| 309 | CNode finalBestHost = newLeaf; |
---|
| 310 | |
---|
| 311 | // remove new leaf when seaching for best and second best nodes to |
---|
| 312 | // consider for merging and splitting |
---|
| 313 | m_children.removeElementAt(m_children.size()-1); |
---|
| 314 | |
---|
| 315 | // now determine the best host (and the second best) |
---|
| 316 | int best = 0; |
---|
| 317 | int secondBest = 0; |
---|
| 318 | for (int i = 0; i < categoryUtils.length; i++) { |
---|
| 319 | if (categoryUtils[i] > categoryUtils[secondBest]) { |
---|
| 320 | if (categoryUtils[i] > categoryUtils[best]) { |
---|
| 321 | secondBest = best; |
---|
| 322 | best = i; |
---|
| 323 | } else { |
---|
| 324 | secondBest = i; |
---|
| 325 | } |
---|
| 326 | } |
---|
| 327 | } |
---|
| 328 | |
---|
| 329 | CNode a = (CNode) m_children.elementAt(best); |
---|
| 330 | CNode b = (CNode) m_children.elementAt(secondBest); |
---|
| 331 | if (categoryUtils[best] > bestHostCU) { |
---|
| 332 | bestHostCU = categoryUtils[best]; |
---|
| 333 | finalBestHost = a; |
---|
| 334 | // System.out.println("Node is best"); |
---|
| 335 | } |
---|
| 336 | |
---|
| 337 | if (structureFrozen) { |
---|
| 338 | if (finalBestHost == newLeaf) { |
---|
| 339 | return null; // *this* node is the best host |
---|
| 340 | } else { |
---|
| 341 | return finalBestHost; |
---|
| 342 | } |
---|
| 343 | } |
---|
| 344 | |
---|
| 345 | double mergedCU = -Double.MAX_VALUE; |
---|
| 346 | CNode merged = new CNode(m_numAttributes); |
---|
| 347 | if (a != b) { |
---|
| 348 | mergedCU = cuScoreForBestTwoMerged(merged, a, b, newInstance); |
---|
| 349 | |
---|
| 350 | if (mergedCU > bestHostCU) { |
---|
| 351 | bestHostCU = mergedCU; |
---|
| 352 | finalBestHost = merged; |
---|
| 353 | } |
---|
| 354 | } |
---|
| 355 | |
---|
| 356 | // Consider splitting the best |
---|
| 357 | double splitCU = -Double.MAX_VALUE; |
---|
| 358 | double splitBestChildCU = -Double.MAX_VALUE; |
---|
| 359 | double splitPlusNewLeafCU = -Double.MAX_VALUE; |
---|
| 360 | double splitPlusMergeBestTwoCU = -Double.MAX_VALUE; |
---|
| 361 | if (a.m_children != null) { |
---|
| 362 | FastVector tempChildren = new FastVector(); |
---|
| 363 | |
---|
| 364 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 365 | CNode existingChild = (CNode)m_children.elementAt(i); |
---|
| 366 | if (existingChild != a) { |
---|
| 367 | tempChildren.addElement(existingChild); |
---|
| 368 | } |
---|
| 369 | } |
---|
| 370 | for (int i = 0; i < a.m_children.size(); i++) { |
---|
| 371 | CNode promotedChild = (CNode)a.m_children.elementAt(i); |
---|
| 372 | tempChildren.addElement(promotedChild); |
---|
| 373 | } |
---|
| 374 | // also add the new leaf |
---|
| 375 | tempChildren.addElement(newLeaf); |
---|
| 376 | |
---|
| 377 | FastVector saveStatusQuo = m_children; |
---|
| 378 | m_children = tempChildren; |
---|
| 379 | splitPlusNewLeafCU = categoryUtility(); // split + new leaf |
---|
| 380 | // remove the new leaf |
---|
| 381 | tempChildren.removeElementAt(tempChildren.size()-1); |
---|
| 382 | // now look for best and second best |
---|
| 383 | categoryUtils = cuScoresForChildren(newInstance); |
---|
| 384 | |
---|
| 385 | // now determine the best host (and the second best) |
---|
| 386 | best = 0; |
---|
| 387 | secondBest = 0; |
---|
| 388 | for (int i = 0; i < categoryUtils.length; i++) { |
---|
| 389 | if (categoryUtils[i] > categoryUtils[secondBest]) { |
---|
| 390 | if (categoryUtils[i] > categoryUtils[best]) { |
---|
| 391 | secondBest = best; |
---|
| 392 | best = i; |
---|
| 393 | } else { |
---|
| 394 | secondBest = i; |
---|
| 395 | } |
---|
| 396 | } |
---|
| 397 | } |
---|
| 398 | CNode sa = (CNode) m_children.elementAt(best); |
---|
| 399 | CNode sb = (CNode) m_children.elementAt(secondBest); |
---|
| 400 | splitBestChildCU = categoryUtils[best]; |
---|
| 401 | |
---|
| 402 | // now merge best and second best |
---|
| 403 | CNode mergedSplitChildren = new CNode(m_numAttributes); |
---|
| 404 | if (sa != sb) { |
---|
| 405 | splitPlusMergeBestTwoCU = |
---|
| 406 | cuScoreForBestTwoMerged(mergedSplitChildren, sa, sb, newInstance); |
---|
| 407 | } |
---|
| 408 | splitCU = (splitBestChildCU > splitPlusNewLeafCU) ? |
---|
| 409 | splitBestChildCU : splitPlusNewLeafCU; |
---|
| 410 | splitCU = (splitCU > splitPlusMergeBestTwoCU) ? |
---|
| 411 | splitCU : splitPlusMergeBestTwoCU; |
---|
| 412 | |
---|
| 413 | if (splitCU > bestHostCU) { |
---|
| 414 | bestHostCU = splitCU; |
---|
| 415 | finalBestHost = this; |
---|
| 416 | // tempChildren.removeElementAt(tempChildren.size()-1); |
---|
| 417 | } else { |
---|
| 418 | // restore the status quo |
---|
| 419 | m_children = saveStatusQuo; |
---|
| 420 | } |
---|
| 421 | } |
---|
| 422 | |
---|
| 423 | if (finalBestHost != this) { |
---|
| 424 | // can commit the instance to the set of instances at this node |
---|
| 425 | m_clusterInstances.add(newInstance); |
---|
| 426 | } else { |
---|
| 427 | m_numberSplits++; |
---|
| 428 | } |
---|
| 429 | |
---|
| 430 | if (finalBestHost == merged) { |
---|
| 431 | m_numberMerges++; |
---|
| 432 | m_children.removeElementAt(m_children.indexOf(a)); |
---|
| 433 | m_children.removeElementAt(m_children.indexOf(b)); |
---|
| 434 | m_children.addElement(merged); |
---|
| 435 | } |
---|
| 436 | |
---|
| 437 | if (finalBestHost == newLeaf) { |
---|
| 438 | finalBestHost = new CNode(m_numAttributes); |
---|
| 439 | m_children.addElement(finalBestHost); |
---|
| 440 | } |
---|
| 441 | |
---|
| 442 | if (bestHostCU < m_cutoff) { |
---|
| 443 | if (finalBestHost == this) { |
---|
| 444 | // splitting was the best, but since we are cutting all children |
---|
| 445 | // recursion is aborted and we still need to add the instance |
---|
| 446 | // to the set of instances at this node |
---|
| 447 | m_clusterInstances.add(newInstance); |
---|
| 448 | } |
---|
| 449 | m_children = null; |
---|
| 450 | finalBestHost = null; |
---|
| 451 | } |
---|
| 452 | |
---|
| 453 | if (finalBestHost == this) { |
---|
| 454 | // splitting is still the best, so downdate the stats as |
---|
| 455 | // we'll be recursively calling on this node |
---|
| 456 | updateStats(newInstance, true); |
---|
| 457 | } |
---|
| 458 | |
---|
| 459 | return finalBestHost; |
---|
| 460 | } |
---|
| 461 | |
---|
| 462 | /** |
---|
| 463 | * Adds the supplied node as a child of this node. All of the child's |
---|
| 464 | * instances are added to this nodes instances |
---|
| 465 | * |
---|
| 466 | * @param child the child to add |
---|
| 467 | */ |
---|
| 468 | protected void addChildNode(CNode child) { |
---|
| 469 | for (int i = 0; i < child.m_clusterInstances.numInstances(); i++) { |
---|
| 470 | Instance temp = child.m_clusterInstances.instance(i); |
---|
| 471 | m_clusterInstances.add(temp); |
---|
| 472 | updateStats(temp, false); |
---|
| 473 | } |
---|
| 474 | |
---|
| 475 | if (m_children == null) { |
---|
| 476 | m_children = new FastVector(); |
---|
| 477 | } |
---|
| 478 | m_children.addElement(child); |
---|
| 479 | } |
---|
| 480 | |
---|
| 481 | /** |
---|
| 482 | * Computes the utility of all children with respect to this node |
---|
| 483 | * |
---|
| 484 | * @return the category utility of the children with respect to this node. |
---|
| 485 | * @throws Exception if there are no children |
---|
| 486 | */ |
---|
| 487 | protected double categoryUtility() throws Exception { |
---|
| 488 | |
---|
| 489 | if (m_children == null) { |
---|
| 490 | throw new Exception("categoryUtility: No children!"); |
---|
| 491 | } |
---|
| 492 | |
---|
| 493 | double totalCU = 0; |
---|
| 494 | |
---|
| 495 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 496 | CNode child = (CNode) m_children.elementAt(i); |
---|
| 497 | totalCU += categoryUtilityChild(child); |
---|
| 498 | } |
---|
| 499 | |
---|
| 500 | totalCU /= (double)m_children.size(); |
---|
| 501 | return totalCU; |
---|
| 502 | } |
---|
| 503 | |
---|
| 504 | /** |
---|
| 505 | * Computes the utility of a single child with respect to this node |
---|
| 506 | * |
---|
| 507 | * @param child the child for which to compute the utility |
---|
| 508 | * @return the utility of the child with respect to this node |
---|
| 509 | * @throws Exception if something goes wrong |
---|
| 510 | */ |
---|
| 511 | protected double categoryUtilityChild(CNode child) throws Exception { |
---|
| 512 | |
---|
| 513 | double sum = 0; |
---|
| 514 | for (int i = 0; i < m_numAttributes; i++) { |
---|
| 515 | if (m_clusterInstances.attribute(i).isNominal()) { |
---|
| 516 | for (int j = 0; |
---|
| 517 | j < m_clusterInstances.attribute(i).numValues(); j++) { |
---|
| 518 | double x = child.getProbability(i, j); |
---|
| 519 | double y = getProbability(i, j); |
---|
| 520 | sum += (x * x) - (y * y); |
---|
| 521 | } |
---|
| 522 | } else { |
---|
| 523 | // numeric attribute |
---|
| 524 | sum += ((m_normal / child.getStandardDev(i)) - |
---|
| 525 | (m_normal / getStandardDev(i))); |
---|
| 526 | |
---|
| 527 | } |
---|
| 528 | } |
---|
| 529 | return (child.m_totalInstances / m_totalInstances) * sum; |
---|
| 530 | } |
---|
| 531 | |
---|
| 532 | /** |
---|
| 533 | * Returns the probability of a value of a nominal attribute in this node |
---|
| 534 | * |
---|
| 535 | * @param attIndex the index of the attribute |
---|
| 536 | * @param valueIndex the index of the value of the attribute |
---|
| 537 | * @return the probability |
---|
| 538 | * @throws Exception if the requested attribute is not nominal |
---|
| 539 | */ |
---|
| 540 | protected double getProbability(int attIndex, int valueIndex) |
---|
| 541 | throws Exception { |
---|
| 542 | |
---|
| 543 | if (!m_clusterInstances.attribute(attIndex).isNominal()) { |
---|
| 544 | throw new Exception("getProbability: attribute is not nominal"); |
---|
| 545 | } |
---|
| 546 | |
---|
| 547 | if (m_attStats[attIndex].totalCount <= 0) { |
---|
| 548 | return 0; |
---|
| 549 | } |
---|
| 550 | |
---|
| 551 | return (double) m_attStats[attIndex].nominalCounts[valueIndex] / |
---|
| 552 | (double) m_attStats[attIndex].totalCount; |
---|
| 553 | } |
---|
| 554 | |
---|
| 555 | /** |
---|
| 556 | * Returns the standard deviation of a numeric attribute |
---|
| 557 | * |
---|
| 558 | * @param attIndex the index of the attribute |
---|
| 559 | * @return the standard deviation |
---|
| 560 | * @throws Exception if an error occurs |
---|
| 561 | */ |
---|
| 562 | protected double getStandardDev(int attIndex) throws Exception { |
---|
| 563 | if (!m_clusterInstances.attribute(attIndex).isNumeric()) { |
---|
| 564 | throw new Exception("getStandardDev: attribute is not numeric"); |
---|
| 565 | } |
---|
| 566 | |
---|
| 567 | m_attStats[attIndex].numericStats.calculateDerived(); |
---|
| 568 | double stdDev = m_attStats[attIndex].numericStats.stdDev; |
---|
| 569 | if (Double.isNaN(stdDev) || Double.isInfinite(stdDev)) { |
---|
| 570 | return m_acuity; |
---|
| 571 | } |
---|
| 572 | |
---|
| 573 | return Math.max(m_acuity, stdDev); |
---|
| 574 | } |
---|
| 575 | |
---|
| 576 | /** |
---|
| 577 | * Update attribute stats using the supplied instance. |
---|
| 578 | * |
---|
| 579 | * @param updateInstance the instance for updating |
---|
| 580 | * @param delete true if the values of the supplied instance are |
---|
| 581 | * to be removed from the statistics |
---|
| 582 | */ |
---|
| 583 | protected void updateStats(Instance updateInstance, |
---|
| 584 | boolean delete) { |
---|
| 585 | |
---|
| 586 | if (m_attStats == null) { |
---|
| 587 | m_attStats = new AttributeStats[m_numAttributes]; |
---|
| 588 | for (int i = 0; i < m_numAttributes; i++) { |
---|
| 589 | m_attStats[i] = new AttributeStats(); |
---|
| 590 | if (m_clusterInstances.attribute(i).isNominal()) { |
---|
| 591 | m_attStats[i].nominalCounts = |
---|
| 592 | new int [m_clusterInstances.attribute(i).numValues()]; |
---|
| 593 | } else { |
---|
| 594 | m_attStats[i].numericStats = new Stats(); |
---|
| 595 | } |
---|
| 596 | } |
---|
| 597 | } |
---|
| 598 | for (int i = 0; i < m_numAttributes; i++) { |
---|
| 599 | if (!updateInstance.isMissing(i)) { |
---|
| 600 | double value = updateInstance.value(i); |
---|
| 601 | if (m_clusterInstances.attribute(i).isNominal()) { |
---|
| 602 | m_attStats[i].nominalCounts[(int)value] += (delete) ? |
---|
| 603 | (-1.0 * updateInstance.weight()) : |
---|
| 604 | updateInstance.weight(); |
---|
| 605 | m_attStats[i].totalCount += (delete) ? |
---|
| 606 | (-1.0 * updateInstance.weight()) : |
---|
| 607 | updateInstance.weight(); |
---|
| 608 | } else { |
---|
| 609 | if (delete) { |
---|
| 610 | m_attStats[i].numericStats.subtract(value, |
---|
| 611 | updateInstance.weight()); |
---|
| 612 | } else { |
---|
| 613 | m_attStats[i].numericStats.add(value, updateInstance.weight()); |
---|
| 614 | } |
---|
| 615 | } |
---|
| 616 | } |
---|
| 617 | } |
---|
| 618 | m_totalInstances += (delete) |
---|
| 619 | ? (-1.0 * updateInstance.weight()) |
---|
| 620 | : (updateInstance.weight()); |
---|
| 621 | } |
---|
| 622 | |
---|
| 623 | /** |
---|
| 624 | * Recursively assigns numbers to the nodes in the tree. |
---|
| 625 | * |
---|
| 626 | * @param cl_num an <code>int[]</code> value |
---|
| 627 | * @throws Exception if an error occurs |
---|
| 628 | */ |
---|
| 629 | private void assignClusterNums(int[] cl_num) throws Exception { |
---|
| 630 | if (m_children != null && m_children.size() < 2) { |
---|
| 631 | throw new Exception("assignClusterNums: tree not built correctly!"); |
---|
| 632 | } |
---|
| 633 | |
---|
| 634 | m_clusterNum = cl_num[0]; |
---|
| 635 | cl_num[0]++; |
---|
| 636 | if (m_children != null) { |
---|
| 637 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 638 | CNode child = (CNode) m_children.elementAt(i); |
---|
| 639 | child.assignClusterNums(cl_num); |
---|
| 640 | } |
---|
| 641 | } |
---|
| 642 | } |
---|
| 643 | |
---|
| 644 | /** |
---|
| 645 | * Recursively build a string representation of the Cobweb tree |
---|
| 646 | * |
---|
| 647 | * @param depth depth of this node in the tree |
---|
| 648 | * @param text holds the string representation |
---|
| 649 | */ |
---|
| 650 | protected void dumpTree(int depth, StringBuffer text) { |
---|
| 651 | |
---|
| 652 | if (depth == 0) |
---|
| 653 | determineNumberOfClusters(); |
---|
| 654 | |
---|
| 655 | if (m_children == null) { |
---|
| 656 | text.append("\n"); |
---|
| 657 | for (int j = 0; j < depth; j++) { |
---|
| 658 | text.append("| "); |
---|
| 659 | } |
---|
| 660 | text.append("leaf "+m_clusterNum+" [" |
---|
| 661 | +m_clusterInstances.numInstances()+"]"); |
---|
| 662 | } else { |
---|
| 663 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 664 | text.append("\n"); |
---|
| 665 | for (int j = 0; j < depth; j++) { |
---|
| 666 | text.append("| "); |
---|
| 667 | } |
---|
| 668 | text.append("node "+m_clusterNum+" [" |
---|
| 669 | +m_clusterInstances.numInstances() |
---|
| 670 | +"]"); |
---|
| 671 | ((CNode) m_children.elementAt(i)).dumpTree(depth+1, text); |
---|
| 672 | } |
---|
| 673 | } |
---|
| 674 | } |
---|
| 675 | |
---|
| 676 | /** |
---|
| 677 | * Returns the instances at this node as a string. Appends the cluster |
---|
| 678 | * number of the child that each instance belongs to. |
---|
| 679 | * |
---|
| 680 | * @return a <code>String</code> value |
---|
| 681 | * @throws Exception if an error occurs |
---|
| 682 | */ |
---|
| 683 | protected String dumpData() throws Exception { |
---|
| 684 | if (m_children == null) { |
---|
| 685 | return m_clusterInstances.toString(); |
---|
| 686 | } |
---|
| 687 | |
---|
| 688 | // construct instances string with cluster numbers attached |
---|
| 689 | CNode tempNode = new CNode(m_numAttributes); |
---|
| 690 | tempNode.m_clusterInstances = new Instances(m_clusterInstances, 1); |
---|
| 691 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 692 | tempNode.addChildNode((CNode)m_children.elementAt(i)); |
---|
| 693 | } |
---|
| 694 | Instances tempInst = tempNode.m_clusterInstances; |
---|
| 695 | tempNode = null; |
---|
| 696 | |
---|
| 697 | Add af = new Add(); |
---|
| 698 | af.setAttributeName("Cluster"); |
---|
| 699 | String labels = ""; |
---|
| 700 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 701 | CNode temp = (CNode)m_children.elementAt(i); |
---|
| 702 | labels += ("C"+temp.m_clusterNum); |
---|
| 703 | if (i < m_children.size()-1) { |
---|
| 704 | labels+=","; |
---|
| 705 | } |
---|
| 706 | } |
---|
| 707 | af.setNominalLabels(labels); |
---|
| 708 | af.setInputFormat(tempInst); |
---|
| 709 | tempInst = Filter.useFilter(tempInst, af); |
---|
| 710 | tempInst.setRelationName("Cluster "+m_clusterNum); |
---|
| 711 | |
---|
| 712 | int z = 0; |
---|
| 713 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 714 | CNode temp = (CNode)m_children.elementAt(i); |
---|
| 715 | for (int j = 0; j < temp.m_clusterInstances.numInstances(); j++) { |
---|
| 716 | tempInst.instance(z).setValue(m_numAttributes, (double)i); |
---|
| 717 | z++; |
---|
| 718 | } |
---|
| 719 | } |
---|
| 720 | return tempInst.toString(); |
---|
| 721 | } |
---|
| 722 | |
---|
| 723 | /** |
---|
| 724 | * Recursively generate the graph string for the Cobweb tree. |
---|
| 725 | * |
---|
| 726 | * @param text holds the graph string |
---|
| 727 | * @throws Exception if generation fails |
---|
| 728 | */ |
---|
| 729 | protected void graphTree(StringBuffer text) throws Exception { |
---|
| 730 | |
---|
| 731 | text.append("N"+m_clusterNum |
---|
| 732 | + " [label=\""+((m_children == null) |
---|
| 733 | ? "leaf " : "node ") |
---|
| 734 | +m_clusterNum+" " |
---|
| 735 | +" ("+m_clusterInstances.numInstances() |
---|
| 736 | +")\" " |
---|
| 737 | +((m_children == null) |
---|
| 738 | ? "shape=box style=filled " : "") |
---|
| 739 | +(m_saveInstances |
---|
| 740 | ? "data =\n"+dumpData() +"\n,\n" |
---|
| 741 | : "") |
---|
| 742 | + "]\n"); |
---|
| 743 | if (m_children != null) { |
---|
| 744 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 745 | CNode temp = (CNode)m_children.elementAt(i); |
---|
| 746 | text.append("N"+m_clusterNum |
---|
| 747 | +"->" |
---|
| 748 | +"N" + temp.m_clusterNum |
---|
| 749 | + "\n"); |
---|
| 750 | } |
---|
| 751 | |
---|
| 752 | for (int i = 0; i < m_children.size(); i++) { |
---|
| 753 | CNode temp = (CNode)m_children.elementAt(i); |
---|
| 754 | temp.graphTree(text); |
---|
| 755 | } |
---|
| 756 | } |
---|
| 757 | } |
---|
| 758 | |
---|
| 759 | /** |
---|
| 760 | * Returns the revision string. |
---|
| 761 | * |
---|
| 762 | * @return the revision |
---|
| 763 | */ |
---|
| 764 | public String getRevision() { |
---|
| 765 | return RevisionUtils.extract("$Revision: 5488 $"); |
---|
| 766 | } |
---|
| 767 | } |
---|
| 768 | |
---|
| 769 | /** |
---|
| 770 | * Normal constant. |
---|
| 771 | */ |
---|
| 772 | protected static final double m_normal = 1.0/(2 * Math.sqrt(Math.PI)); |
---|
| 773 | |
---|
| 774 | /** |
---|
| 775 | * Acuity (minimum standard deviation). |
---|
| 776 | */ |
---|
| 777 | protected double m_acuity = 1.0; |
---|
| 778 | |
---|
| 779 | /** |
---|
| 780 | * Cutoff (minimum category utility). |
---|
| 781 | */ |
---|
| 782 | protected double m_cutoff = 0.01 * Cobweb.m_normal; |
---|
| 783 | |
---|
| 784 | /** |
---|
| 785 | * Holds the root of the Cobweb tree. |
---|
| 786 | */ |
---|
| 787 | protected CNode m_cobwebTree = null; |
---|
| 788 | |
---|
| 789 | /** |
---|
| 790 | * Number of clusters (nodes in the tree). Must never be queried directly, |
---|
| 791 | * only via the method numberOfClusters(). Otherwise it's not guaranteed that |
---|
| 792 | * it contains the correct value. |
---|
| 793 | * |
---|
| 794 | * @see #numberOfClusters() |
---|
| 795 | * @see #m_numberOfClustersDetermined |
---|
| 796 | */ |
---|
| 797 | protected int m_numberOfClusters = -1; |
---|
| 798 | |
---|
| 799 | /** whether the number of clusters was already determined */ |
---|
| 800 | protected boolean m_numberOfClustersDetermined = false; |
---|
| 801 | |
---|
| 802 | /** the number of splits that happened */ |
---|
| 803 | protected int m_numberSplits; |
---|
| 804 | |
---|
| 805 | /** the number of merges that happened */ |
---|
| 806 | protected int m_numberMerges; |
---|
| 807 | |
---|
| 808 | /** |
---|
| 809 | * Output instances in graph representation of Cobweb tree (Allows |
---|
| 810 | * instances at nodes in the tree to be visualized in the Explorer). |
---|
| 811 | */ |
---|
| 812 | protected boolean m_saveInstances = false; |
---|
| 813 | |
---|
| 814 | /** |
---|
| 815 | * default constructor |
---|
| 816 | */ |
---|
| 817 | public Cobweb() { |
---|
| 818 | super(); |
---|
| 819 | |
---|
| 820 | m_SeedDefault = 42; |
---|
| 821 | setSeed(m_SeedDefault); |
---|
| 822 | } |
---|
| 823 | |
---|
| 824 | /** |
---|
| 825 | * Returns a string describing this clusterer |
---|
| 826 | * @return a description of the evaluator suitable for |
---|
| 827 | * displaying in the explorer/experimenter gui |
---|
| 828 | */ |
---|
| 829 | public String globalInfo() { |
---|
| 830 | return |
---|
| 831 | "Class implementing the Cobweb and Classit clustering algorithms.\n\n" |
---|
| 832 | + "Note: the application of node operators (merging, splitting etc.) in " |
---|
| 833 | + "terms of ordering and priority differs (and is somewhat ambiguous) " |
---|
| 834 | + "between the original Cobweb and Classit papers. This algorithm always " |
---|
| 835 | + "compares the best host, adding a new leaf, merging the two best hosts, " |
---|
| 836 | + "and splitting the best host when considering where to place a new " |
---|
| 837 | + "instance.\n\n" |
---|
| 838 | + "For more information see:\n\n" |
---|
| 839 | + getTechnicalInformation().toString(); |
---|
| 840 | } |
---|
| 841 | |
---|
| 842 | /** |
---|
| 843 | * Returns an instance of a TechnicalInformation object, containing |
---|
| 844 | * detailed information about the technical background of this class, |
---|
| 845 | * e.g., paper reference or book this class is based on. |
---|
| 846 | * |
---|
| 847 | * @return the technical information about this class |
---|
| 848 | */ |
---|
| 849 | public TechnicalInformation getTechnicalInformation() { |
---|
| 850 | TechnicalInformation result; |
---|
| 851 | TechnicalInformation additional; |
---|
| 852 | |
---|
| 853 | result = new TechnicalInformation(Type.ARTICLE); |
---|
| 854 | result.setValue(Field.AUTHOR, "D. Fisher"); |
---|
| 855 | result.setValue(Field.YEAR, "1987"); |
---|
| 856 | result.setValue(Field.TITLE, "Knowledge acquisition via incremental conceptual clustering"); |
---|
| 857 | result.setValue(Field.JOURNAL, "Machine Learning"); |
---|
| 858 | result.setValue(Field.VOLUME, "2"); |
---|
| 859 | result.setValue(Field.NUMBER, "2"); |
---|
| 860 | result.setValue(Field.PAGES, "139-172"); |
---|
| 861 | |
---|
| 862 | additional = result.add(Type.ARTICLE); |
---|
| 863 | additional.setValue(Field.AUTHOR, "J. H. Gennari and P. Langley and D. Fisher"); |
---|
| 864 | additional.setValue(Field.YEAR, "1990"); |
---|
| 865 | additional.setValue(Field.TITLE, "Models of incremental concept formation"); |
---|
| 866 | additional.setValue(Field.JOURNAL, "Artificial Intelligence"); |
---|
| 867 | additional.setValue(Field.VOLUME, "40"); |
---|
| 868 | additional.setValue(Field.PAGES, "11-61"); |
---|
| 869 | |
---|
| 870 | return result; |
---|
| 871 | } |
---|
| 872 | |
---|
| 873 | /** |
---|
| 874 | * Returns default capabilities of the clusterer. |
---|
| 875 | * |
---|
| 876 | * @return the capabilities of this clusterer |
---|
| 877 | */ |
---|
| 878 | public Capabilities getCapabilities() { |
---|
| 879 | Capabilities result = super.getCapabilities(); |
---|
| 880 | result.disableAll(); |
---|
| 881 | result.enable(Capability.NO_CLASS); |
---|
| 882 | |
---|
| 883 | // attributes |
---|
| 884 | result.enable(Capability.NOMINAL_ATTRIBUTES); |
---|
| 885 | result.enable(Capability.NUMERIC_ATTRIBUTES); |
---|
| 886 | result.enable(Capability.DATE_ATTRIBUTES); |
---|
| 887 | result.enable(Capability.MISSING_VALUES); |
---|
| 888 | |
---|
| 889 | // other |
---|
| 890 | result.setMinimumNumberInstances(0); |
---|
| 891 | |
---|
| 892 | return result; |
---|
| 893 | } |
---|
| 894 | |
---|
| 895 | /** |
---|
| 896 | * Builds the clusterer. |
---|
| 897 | * |
---|
| 898 | * @param data the training instances. |
---|
| 899 | * @throws Exception if something goes wrong. |
---|
| 900 | */ |
---|
| 901 | public void buildClusterer(Instances data) throws Exception { |
---|
| 902 | m_numberOfClusters = -1; |
---|
| 903 | m_cobwebTree = null; |
---|
| 904 | m_numberSplits = 0; |
---|
| 905 | m_numberMerges = 0; |
---|
| 906 | |
---|
| 907 | // can clusterer handle the data? |
---|
| 908 | getCapabilities().testWithFail(data); |
---|
| 909 | |
---|
| 910 | // randomize the instances |
---|
| 911 | data = new Instances(data); |
---|
| 912 | data.randomize(new Random(getSeed())); |
---|
| 913 | |
---|
| 914 | for (int i = 0; i < data.numInstances(); i++) { |
---|
| 915 | updateClusterer(data.instance(i)); |
---|
| 916 | } |
---|
| 917 | |
---|
| 918 | updateFinished(); |
---|
| 919 | } |
---|
| 920 | |
---|
| 921 | /** |
---|
| 922 | * Singals the end of the updating. |
---|
| 923 | */ |
---|
| 924 | public void updateFinished() { |
---|
| 925 | determineNumberOfClusters(); |
---|
| 926 | } |
---|
| 927 | |
---|
| 928 | /** |
---|
| 929 | * Classifies a given instance. |
---|
| 930 | * |
---|
| 931 | * @param instance the instance to be assigned to a cluster |
---|
| 932 | * @return the number of the assigned cluster as an interger |
---|
| 933 | * if the class is enumerated, otherwise the predicted value |
---|
| 934 | * @throws Exception if instance could not be classified |
---|
| 935 | * successfully |
---|
| 936 | */ |
---|
| 937 | public int clusterInstance(Instance instance) throws Exception { |
---|
| 938 | CNode host = m_cobwebTree; |
---|
| 939 | CNode temp = null; |
---|
| 940 | |
---|
| 941 | determineNumberOfClusters(); |
---|
| 942 | |
---|
| 943 | do { |
---|
| 944 | if (host.m_children == null) { |
---|
| 945 | temp = null; |
---|
| 946 | break; |
---|
| 947 | } |
---|
| 948 | |
---|
| 949 | host.updateStats(instance, false); |
---|
| 950 | temp = host.findHost(instance, true); |
---|
| 951 | host.updateStats(instance, true); |
---|
| 952 | |
---|
| 953 | if (temp != null) { |
---|
| 954 | host = temp; |
---|
| 955 | } |
---|
| 956 | } while (temp != null); |
---|
| 957 | |
---|
| 958 | return host.m_clusterNum; |
---|
| 959 | } |
---|
| 960 | |
---|
| 961 | /** |
---|
| 962 | * determines the number of clusters if necessary |
---|
| 963 | * |
---|
| 964 | * @see #m_numberOfClusters |
---|
| 965 | * @see #m_numberOfClustersDetermined |
---|
| 966 | */ |
---|
| 967 | protected void determineNumberOfClusters() { |
---|
| 968 | if ( !m_numberOfClustersDetermined |
---|
| 969 | && (m_cobwebTree != null) ) { |
---|
| 970 | int[] numClusts = new int [1]; |
---|
| 971 | numClusts[0] = 0; |
---|
| 972 | try { |
---|
| 973 | m_cobwebTree.assignClusterNums(numClusts); |
---|
| 974 | } |
---|
| 975 | catch (Exception e) { |
---|
| 976 | e.printStackTrace(); |
---|
| 977 | numClusts[0] = 0; |
---|
| 978 | } |
---|
| 979 | m_numberOfClusters = numClusts[0]; |
---|
| 980 | |
---|
| 981 | m_numberOfClustersDetermined = true; |
---|
| 982 | } |
---|
| 983 | } |
---|
| 984 | |
---|
| 985 | /** |
---|
| 986 | * Returns the number of clusters. |
---|
| 987 | * |
---|
| 988 | * @return the number of clusters |
---|
| 989 | */ |
---|
| 990 | public int numberOfClusters() { |
---|
| 991 | determineNumberOfClusters(); |
---|
| 992 | return m_numberOfClusters; |
---|
| 993 | } |
---|
| 994 | |
---|
| 995 | /** |
---|
| 996 | * Adds an instance to the clusterer. |
---|
| 997 | * |
---|
| 998 | * @param newInstance the instance to be added |
---|
| 999 | * @throws Exception if something goes wrong |
---|
| 1000 | */ |
---|
| 1001 | public void updateClusterer(Instance newInstance) throws Exception { |
---|
| 1002 | m_numberOfClustersDetermined = false; |
---|
| 1003 | |
---|
| 1004 | if (m_cobwebTree == null) { |
---|
| 1005 | m_cobwebTree = new CNode(newInstance.numAttributes(), newInstance); |
---|
| 1006 | } else { |
---|
| 1007 | m_cobwebTree.addInstance(newInstance); |
---|
| 1008 | } |
---|
| 1009 | } |
---|
| 1010 | |
---|
| 1011 | /** |
---|
| 1012 | * Adds an instance to the Cobweb tree. |
---|
| 1013 | * |
---|
| 1014 | * @param newInstance the instance to be added |
---|
| 1015 | * @throws Exception if something goes wrong |
---|
| 1016 | * @deprecated updateClusterer(Instance) should be used instead |
---|
| 1017 | * @see #updateClusterer(Instance) |
---|
| 1018 | */ |
---|
| 1019 | public void addInstance(Instance newInstance) throws Exception { |
---|
| 1020 | updateClusterer(newInstance); |
---|
| 1021 | } |
---|
| 1022 | |
---|
| 1023 | /** |
---|
| 1024 | * Returns an enumeration describing the available options. |
---|
| 1025 | * |
---|
| 1026 | * @return an enumeration of all the available options. |
---|
| 1027 | **/ |
---|
| 1028 | public Enumeration listOptions() { |
---|
| 1029 | Vector result = new Vector(); |
---|
| 1030 | |
---|
| 1031 | result.addElement(new Option( |
---|
| 1032 | "\tAcuity.\n" |
---|
| 1033 | +"\t(default=1.0)", |
---|
| 1034 | "A", 1,"-A <acuity>")); |
---|
| 1035 | |
---|
| 1036 | result.addElement(new Option( |
---|
| 1037 | "\tCutoff.\n" |
---|
| 1038 | +"\t(default=0.002)", |
---|
| 1039 | "C", 1,"-C <cutoff>")); |
---|
| 1040 | |
---|
| 1041 | Enumeration en = super.listOptions(); |
---|
| 1042 | while (en.hasMoreElements()) |
---|
| 1043 | result.addElement(en.nextElement()); |
---|
| 1044 | |
---|
| 1045 | return result.elements(); |
---|
| 1046 | } |
---|
| 1047 | |
---|
| 1048 | /** |
---|
| 1049 | * Parses a given list of options. <p/> |
---|
| 1050 | * |
---|
| 1051 | <!-- options-start --> |
---|
| 1052 | * Valid options are: <p/> |
---|
| 1053 | * |
---|
| 1054 | * <pre> -A <acuity> |
---|
| 1055 | * Acuity. |
---|
| 1056 | * (default=1.0)</pre> |
---|
| 1057 | * |
---|
| 1058 | * <pre> -C <cutoff> |
---|
| 1059 | * Cutoff. |
---|
| 1060 | * (default=0.002)</pre> |
---|
| 1061 | * |
---|
| 1062 | * <pre> -S <num> |
---|
| 1063 | * Random number seed. |
---|
| 1064 | * (default 42)</pre> |
---|
| 1065 | * |
---|
| 1066 | <!-- options-end --> |
---|
| 1067 | * |
---|
| 1068 | * @param options the list of options as an array of strings |
---|
| 1069 | * @throws Exception if an option is not supported |
---|
| 1070 | */ |
---|
| 1071 | public void setOptions(String[] options) throws Exception { |
---|
| 1072 | String optionString; |
---|
| 1073 | |
---|
| 1074 | optionString = Utils.getOption('A', options); |
---|
| 1075 | if (optionString.length() != 0) { |
---|
| 1076 | Double temp = new Double(optionString); |
---|
| 1077 | setAcuity(temp.doubleValue()); |
---|
| 1078 | } |
---|
| 1079 | else { |
---|
| 1080 | m_acuity = 1.0; |
---|
| 1081 | } |
---|
| 1082 | optionString = Utils.getOption('C', options); |
---|
| 1083 | if (optionString.length() != 0) { |
---|
| 1084 | Double temp = new Double(optionString); |
---|
| 1085 | setCutoff(temp.doubleValue()); |
---|
| 1086 | } |
---|
| 1087 | else { |
---|
| 1088 | m_cutoff = 0.01 * Cobweb.m_normal; |
---|
| 1089 | } |
---|
| 1090 | |
---|
| 1091 | super.setOptions(options); |
---|
| 1092 | } |
---|
| 1093 | |
---|
| 1094 | /** |
---|
| 1095 | * Returns the tip text for this property |
---|
| 1096 | * @return tip text for this property suitable for |
---|
| 1097 | * displaying in the explorer/experimenter gui |
---|
| 1098 | */ |
---|
| 1099 | public String acuityTipText() { |
---|
| 1100 | return "set the minimum standard deviation for numeric attributes"; |
---|
| 1101 | } |
---|
| 1102 | |
---|
| 1103 | /** |
---|
| 1104 | * set the acuity. |
---|
| 1105 | * @param a the acuity value |
---|
| 1106 | */ |
---|
| 1107 | public void setAcuity(double a) { |
---|
| 1108 | m_acuity = a; |
---|
| 1109 | } |
---|
| 1110 | |
---|
| 1111 | /** |
---|
| 1112 | * get the acuity value |
---|
| 1113 | * @return the acuity |
---|
| 1114 | */ |
---|
| 1115 | public double getAcuity() { |
---|
| 1116 | return m_acuity; |
---|
| 1117 | } |
---|
| 1118 | |
---|
| 1119 | /** |
---|
| 1120 | * Returns the tip text for this property |
---|
| 1121 | * @return tip text for this property suitable for |
---|
| 1122 | * displaying in the explorer/experimenter gui |
---|
| 1123 | */ |
---|
| 1124 | public String cutoffTipText() { |
---|
| 1125 | return "set the category utility threshold by which to prune nodes"; |
---|
| 1126 | } |
---|
| 1127 | |
---|
| 1128 | /** |
---|
| 1129 | * set the cutoff |
---|
| 1130 | * @param c the cutof |
---|
| 1131 | */ |
---|
| 1132 | public void setCutoff(double c) { |
---|
| 1133 | m_cutoff = c; |
---|
| 1134 | } |
---|
| 1135 | |
---|
| 1136 | /** |
---|
| 1137 | * get the cutoff |
---|
| 1138 | * @return the cutoff |
---|
| 1139 | */ |
---|
| 1140 | public double getCutoff() { |
---|
| 1141 | return m_cutoff; |
---|
| 1142 | } |
---|
| 1143 | |
---|
| 1144 | /** |
---|
| 1145 | * Returns the tip text for this property |
---|
| 1146 | * @return tip text for this property suitable for |
---|
| 1147 | * displaying in the explorer/experimenter gui |
---|
| 1148 | */ |
---|
| 1149 | public String saveInstanceDataTipText() { |
---|
| 1150 | return "save instance information for visualization purposes"; |
---|
| 1151 | } |
---|
| 1152 | |
---|
| 1153 | /** |
---|
| 1154 | * Get the value of saveInstances. |
---|
| 1155 | * |
---|
| 1156 | * @return Value of saveInstances. |
---|
| 1157 | */ |
---|
| 1158 | public boolean getSaveInstanceData() { |
---|
| 1159 | |
---|
| 1160 | return m_saveInstances; |
---|
| 1161 | } |
---|
| 1162 | |
---|
| 1163 | /** |
---|
| 1164 | * Set the value of saveInstances. |
---|
| 1165 | * |
---|
| 1166 | * @param newsaveInstances Value to assign to saveInstances. |
---|
| 1167 | */ |
---|
| 1168 | public void setSaveInstanceData(boolean newsaveInstances) { |
---|
| 1169 | |
---|
| 1170 | m_saveInstances = newsaveInstances; |
---|
| 1171 | } |
---|
| 1172 | |
---|
| 1173 | /** |
---|
| 1174 | * Gets the current settings of Cobweb. |
---|
| 1175 | * |
---|
| 1176 | * @return an array of strings suitable for passing to setOptions() |
---|
| 1177 | */ |
---|
| 1178 | public String[] getOptions() { |
---|
| 1179 | int i; |
---|
| 1180 | Vector<String> result; |
---|
| 1181 | String[] options; |
---|
| 1182 | |
---|
| 1183 | result = new Vector<String>(); |
---|
| 1184 | |
---|
| 1185 | result.add("-A"); |
---|
| 1186 | result.add("" + m_acuity); |
---|
| 1187 | result.add("-C"); |
---|
| 1188 | result.add("" + m_cutoff); |
---|
| 1189 | |
---|
| 1190 | options = super.getOptions(); |
---|
| 1191 | for (i = 0; i < options.length; i++) |
---|
| 1192 | result.add(options[i]); |
---|
| 1193 | |
---|
| 1194 | return result.toArray(new String[result.size()]); |
---|
| 1195 | } |
---|
| 1196 | |
---|
| 1197 | /** |
---|
| 1198 | * Returns a description of the clusterer as a string. |
---|
| 1199 | * |
---|
| 1200 | * @return a string describing the clusterer. |
---|
| 1201 | */ |
---|
| 1202 | public String toString() { |
---|
| 1203 | StringBuffer text = new StringBuffer(); |
---|
| 1204 | if (m_cobwebTree == null) { |
---|
| 1205 | return "Cobweb hasn't been built yet!"; |
---|
| 1206 | } |
---|
| 1207 | else { |
---|
| 1208 | m_cobwebTree.dumpTree(0, text); |
---|
| 1209 | return "Number of merges: " |
---|
| 1210 | + m_numberMerges+"\nNumber of splits: " |
---|
| 1211 | + m_numberSplits+"\nNumber of clusters: " |
---|
| 1212 | + numberOfClusters() +"\n"+text.toString()+"\n\n"; |
---|
| 1213 | |
---|
| 1214 | } |
---|
| 1215 | } |
---|
| 1216 | |
---|
| 1217 | /** |
---|
| 1218 | * Returns the type of graphs this class |
---|
| 1219 | * represents |
---|
| 1220 | * @return Drawable.TREE |
---|
| 1221 | */ |
---|
| 1222 | public int graphType() { |
---|
| 1223 | return Drawable.TREE; |
---|
| 1224 | } |
---|
| 1225 | |
---|
| 1226 | /** |
---|
| 1227 | * Generates the graph string of the Cobweb tree |
---|
| 1228 | * |
---|
| 1229 | * @return a <code>String</code> value |
---|
| 1230 | * @throws Exception if an error occurs |
---|
| 1231 | */ |
---|
| 1232 | public String graph() throws Exception { |
---|
| 1233 | StringBuffer text = new StringBuffer(); |
---|
| 1234 | |
---|
| 1235 | text.append("digraph CobwebTree {\n"); |
---|
| 1236 | m_cobwebTree.graphTree(text); |
---|
| 1237 | text.append("}\n"); |
---|
| 1238 | return text.toString(); |
---|
| 1239 | } |
---|
| 1240 | |
---|
| 1241 | /** |
---|
| 1242 | * Returns the revision string. |
---|
| 1243 | * |
---|
| 1244 | * @return the revision |
---|
| 1245 | */ |
---|
| 1246 | public String getRevision() { |
---|
| 1247 | return RevisionUtils.extract("$Revision: 5488 $"); |
---|
| 1248 | } |
---|
| 1249 | |
---|
| 1250 | /** |
---|
| 1251 | * Main method. |
---|
| 1252 | * |
---|
| 1253 | * @param argv the commandline options |
---|
| 1254 | */ |
---|
| 1255 | public static void main(String[] argv) { |
---|
| 1256 | runClusterer(new Cobweb(), argv); |
---|
| 1257 | } |
---|
| 1258 | } |
---|