source: src/main/java/weka/clusterers/FarthestFirst.java @ 20

Last change on this file since 20 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 16.8 KB
RevLine 
[4]1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    FarthestFirst.java
19 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20 *
21 */
22package weka.clusterers;
23
24import weka.core.Attribute;
25import weka.core.Capabilities;
26import weka.core.Instance;
27import weka.core.Instances;
28import weka.core.Option;
29import weka.core.RevisionUtils;
30import weka.core.TechnicalInformation;
31import weka.core.TechnicalInformationHandler;
32import weka.core.Utils;
33import weka.core.Capabilities.Capability;
34import weka.core.TechnicalInformation.Field;
35import weka.core.TechnicalInformation.Type;
36import weka.filters.Filter;
37import weka.filters.unsupervised.attribute.ReplaceMissingValues;
38
39import java.util.Enumeration;
40import java.util.Random;
41import java.util.Vector;
42
43/**
44 <!-- globalinfo-start -->
45 * Cluster data using the FarthestFirst algorithm.<br/>
46 * <br/>
47 * For more information see:<br/>
48 * <br/>
49 * Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/>
50 * <br/>
51 * Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/>
52 * <br/>
53 * Notes:<br/>
54 * - works as a fast simple approximate clusterer<br/>
55 * - modelled after SimpleKMeans, might be a useful initializer for it
56 * <p/>
57 <!-- globalinfo-end -->
58 *
59 <!-- technical-bibtex-start -->
60 * BibTeX:
61 * <pre>
62 * &#64;article{Hochbaum1985,
63 *    author = {Hochbaum and Shmoys},
64 *    journal = {Mathematics of Operations Research},
65 *    number = {2},
66 *    pages = {180-184},
67 *    title = {A best possible heuristic for the k-center problem},
68 *    volume = {10},
69 *    year = {1985}
70 * }
71 *
72 * &#64;inproceedings{Dasgupta2002,
73 *    author = {Sanjoy Dasgupta},
74 *    booktitle = {15th Annual Conference on Computational Learning Theory},
75 *    pages = {351-363},
76 *    publisher = {Springer},
77 *    title = {Performance Guarantees for Hierarchical Clustering},
78 *    year = {2002}
79 * }
80 * </pre>
81 * <p/>
82 <!-- technical-bibtex-end -->
83 *
84 <!-- options-start -->
85 * Valid options are: <p/>
86 *
87 * <pre> -N &lt;num&gt;
88 *  number of clusters. (default = 2).</pre>
89 *
90 * <pre> -S &lt;num&gt;
91 *  Random number seed.
92 *  (default 1)</pre>
93 *
94 <!-- options-end -->
95 *
96 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
97 * @version $Revision: 5987 $
98 * @see RandomizableClusterer
99 */
100public class FarthestFirst 
101  extends RandomizableClusterer
102  implements TechnicalInformationHandler {
103
104  //Todo: rewrite to be fully incremental
105  //      cleanup, like deleting m_instances
106
107  /** for serialization */
108  static final long serialVersionUID = 7499838100631329509L;
109 
110  /**
111   * training instances, not necessary to keep,
112   * could be replaced by m_ClusterCentroids where needed for header info
113   */
114  protected Instances m_instances;
115
116  /**
117   * replace missing values in training instances
118   */
119  protected ReplaceMissingValues m_ReplaceMissingFilter;
120
121  /**
122   * number of clusters to generate
123   */
124  protected int m_NumClusters = 2;
125
126  /**
127   * holds the cluster centroids
128   */
129  protected Instances m_ClusterCentroids;
130
131  /**
132   * attribute min values
133   */
134  private double [] m_Min;
135 
136  /**
137   * attribute max values
138   */
139  private double [] m_Max;
140
141  /**
142   * Returns a string describing this clusterer
143   * @return a description of the evaluator suitable for
144   * displaying in the explorer/experimenter gui
145   */
146  public String globalInfo() {
147    return "Cluster data using the FarthestFirst algorithm.\n\n"
148      + "For more information see:\n\n"
149      + getTechnicalInformation().toString() + "\n\n"
150      + "Notes:\n"
151      + "- works as a fast simple approximate clusterer\n"
152      + "- modelled after SimpleKMeans, might be a useful initializer for it";
153  }
154
155  /**
156   * Returns an instance of a TechnicalInformation object, containing
157   * detailed information about the technical background of this class,
158   * e.g., paper reference or book this class is based on.
159   *
160   * @return the technical information about this class
161   */
162  public TechnicalInformation getTechnicalInformation() {
163    TechnicalInformation        result;
164    TechnicalInformation        additional;
165   
166    result = new TechnicalInformation(Type.ARTICLE);
167    result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
168    result.setValue(Field.YEAR, "1985");
169    result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem");
170    result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
171    result.setValue(Field.VOLUME, "10");
172    result.setValue(Field.NUMBER, "2");
173    result.setValue(Field.PAGES, "180-184");
174   
175    additional = result.add(Type.INPROCEEDINGS);
176    additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
177    additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering");
178    additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory");
179    additional.setValue(Field.YEAR, "2002");
180    additional.setValue(Field.PAGES, "351-363");
181    additional.setValue(Field.PUBLISHER, "Springer");
182   
183    return result;
184  }
185
186  /**
187   * Returns default capabilities of the clusterer.
188   *
189   * @return      the capabilities of this clusterer
190   */
191  public Capabilities getCapabilities() {
192    Capabilities result = super.getCapabilities();
193    result.disableAll();
194    result.enable(Capability.NO_CLASS);
195
196    // attributes
197    result.enable(Capability.NOMINAL_ATTRIBUTES);
198    result.enable(Capability.NUMERIC_ATTRIBUTES);
199    result.enable(Capability.DATE_ATTRIBUTES);
200    result.enable(Capability.MISSING_VALUES);
201
202    return result;
203  }
204
205  /**
206   * Generates a clusterer. Has to initialize all fields of the clusterer
207   * that are not being set via options.
208   *
209   * @param data set of instances serving as training data
210   * @throws Exception if the clusterer has not been
211   * generated successfully
212   */
213  public void buildClusterer(Instances data) throws Exception {
214
215    // can clusterer handle the data?
216    getCapabilities().testWithFail(data);
217
218    //long start = System.currentTimeMillis();
219
220    m_ReplaceMissingFilter = new ReplaceMissingValues();
221    m_ReplaceMissingFilter.setInputFormat(data);
222    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);
223
224    initMinMax(m_instances);
225
226    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
227
228    int n = m_instances.numInstances();
229    Random r = new Random(getSeed());
230    boolean[] selected = new boolean[n];
231    double[] minDistance = new double[n];
232
233    for(int i = 0; i<n; i++) minDistance[i] = Double.MAX_VALUE;
234
235    int firstI = r.nextInt(n);
236    m_ClusterCentroids.add(m_instances.instance(firstI));
237    selected[firstI] = true;
238
239    updateMinDistance(minDistance,selected,m_instances,m_instances.instance(firstI));
240
241    if (m_NumClusters > n) m_NumClusters = n;
242
243    for(int i = 1; i < m_NumClusters; i++) {
244      int nextI =  farthestAway(minDistance, selected);
245      m_ClusterCentroids.add(m_instances.instance(nextI));
246      selected[nextI] = true;
247      updateMinDistance(minDistance,selected,m_instances,m_instances.instance(nextI));
248    }
249
250    m_instances = new Instances(m_instances,0);
251    //long end = System.currentTimeMillis();
252    //System.out.println("Clustering Time = " + (end-start));
253  }
254
255
256  protected void updateMinDistance(double[] minDistance, boolean[] selected, 
257                                   Instances data, Instance center) {
258    for(int i = 0; i<selected.length; i++) 
259      if (!selected[i]) {
260        double d = distance(center,data.instance(i));
261        if (d<minDistance[i]) 
262          minDistance[i] = d;
263      }
264  }
265
266  protected int farthestAway(double[] minDistance, boolean[] selected) {
267    double maxDistance = -1.0;
268    int maxI = -1;
269    for(int i = 0; i<selected.length; i++) 
270      if (!selected[i]) 
271        if (maxDistance < minDistance[i]) {
272          maxDistance = minDistance[i];
273          maxI = i;
274        }
275    return maxI;
276  }
277
278  protected void initMinMax(Instances data) {
279    m_Min = new double [data.numAttributes()];
280    m_Max = new double [data.numAttributes()];
281    for (int i = 0; i < data.numAttributes(); i++) {
282      m_Min[i] = m_Max[i] = Double.NaN;
283    }
284
285    for (int i = 0; i < data.numInstances(); i++) {
286      updateMinMax(data.instance(i));
287    }
288  }
289
290
291  /**
292   * Updates the minimum and maximum values for all the attributes
293   * based on a new instance.
294   *
295   * @param instance the new instance
296   */
297  private void updateMinMax(Instance instance) { 
298
299    for (int j = 0;j < instance.numAttributes(); j++) {
300      if (Double.isNaN(m_Min[j])) {
301        m_Min[j] = instance.value(j);
302        m_Max[j] = instance.value(j);
303      } else {
304        if (instance.value(j) < m_Min[j]) {
305          m_Min[j] = instance.value(j);
306        } else {
307          if (instance.value(j) > m_Max[j]) {
308            m_Max[j] = instance.value(j);
309          }
310        }
311      }
312    }
313  }
314
315
316  /**
317   * clusters an instance that has been through the filters
318   *
319   * @param instance the instance to assign a cluster to
320   * @return a cluster number
321   */
322  protected int clusterProcessedInstance(Instance instance) {
323    double minDist = Double.MAX_VALUE;
324    int bestCluster = 0;
325    for (int i = 0; i < m_NumClusters; i++) {
326      double dist = distance(instance, m_ClusterCentroids.instance(i));
327      if (dist < minDist) {
328        minDist = dist;
329        bestCluster = i;
330      }
331    }
332    return bestCluster;
333  }
334
335  /**
336   * Classifies a given instance.
337   *
338   * @param instance the instance to be assigned to a cluster
339   * @return the number of the assigned cluster as an integer
340   * if the class is enumerated, otherwise the predicted value
341   * @throws Exception if instance could not be classified
342   * successfully
343   */
344  public int clusterInstance(Instance instance) throws Exception {
345    m_ReplaceMissingFilter.input(instance);
346    m_ReplaceMissingFilter.batchFinished();
347    Instance inst = m_ReplaceMissingFilter.output();
348
349    return clusterProcessedInstance(inst);
350  }
351
352  /**
353   * Calculates the distance between two instances
354   *
355   * @param first the first instance
356   * @param second the second instance
357   * @return the distance between the two given instances, between 0 and 1
358   */         
359  protected double distance(Instance first, Instance second) { 
360
361    double distance = 0;
362    int firstI, secondI;
363
364    for (int p1 = 0, p2 = 0; 
365         p1 < first.numValues() || p2 < second.numValues();) {
366      if (p1 >= first.numValues()) {
367        firstI = m_instances.numAttributes();
368      } else {
369        firstI = first.index(p1); 
370      }
371      if (p2 >= second.numValues()) {
372        secondI = m_instances.numAttributes();
373      } else {
374        secondI = second.index(p2);
375      }
376      if (firstI == m_instances.classIndex()) {
377        p1++; continue;
378      } 
379      if (secondI == m_instances.classIndex()) {
380        p2++; continue;
381      } 
382      double diff;
383      if (firstI == secondI) {
384        diff = difference(firstI, 
385                          first.valueSparse(p1),
386                          second.valueSparse(p2));
387        p1++; p2++;
388      } else if (firstI > secondI) {
389        diff = difference(secondI, 
390                          0, second.valueSparse(p2));
391        p2++;
392      } else {
393        diff = difference(firstI, 
394                          first.valueSparse(p1), 0);
395        p1++;
396      }
397      distance += diff * diff;
398    }
399   
400    return Math.sqrt(distance / m_instances.numAttributes());
401  }
402
403  /**
404   * Computes the difference between two given attribute
405   * values.
406   */
407  protected double difference(int index, double val1, double val2) {
408
409    switch (m_instances.attribute(index).type()) {
410    case Attribute.NOMINAL:
411     
412      // If attribute is nominal
413      if (Utils.isMissingValue(val1) || 
414          Utils.isMissingValue(val2) ||
415          ((int)val1 != (int)val2)) {
416        return 1;
417      } else {
418        return 0;
419      }
420    case Attribute.NUMERIC:
421
422      // If attribute is numeric
423      if (Utils.isMissingValue(val1) || 
424          Utils.isMissingValue(val2)) {
425        if (Utils.isMissingValue(val1) && 
426            Utils.isMissingValue(val2)) {
427          return 1;
428        } else {
429          double diff;
430          if (Utils.isMissingValue(val2)) {
431            diff = norm(val1, index);
432          } else {
433            diff = norm(val2, index);
434          }
435          if (diff < 0.5) {
436            diff = 1.0 - diff;
437          }
438          return diff;
439        }
440      } else {
441        return norm(val1, index) - norm(val2, index);
442      }
443    default:
444      return 0;
445    }
446  }
447
448  /**
449   * Normalizes a given value of a numeric attribute.
450   *
451   * @param x the value to be normalized
452   * @param i the attribute's index
453   * @return the normalized value
454   */
455  protected double norm(double x, int i) {
456
457    if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i],m_Min[i])) {
458      return 0;
459    } else {
460      return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
461    }
462  }
463
464  /**
465   * Returns the number of clusters.
466   *
467   * @return the number of clusters generated for a training dataset.
468   * @throws Exception if number of clusters could not be returned
469   * successfully
470   */
471  public int numberOfClusters() throws Exception {
472    return m_NumClusters;
473  }
474
475  /**
476   * Returns an enumeration describing the available options.
477   *
478   * @return an enumeration of all the available options.
479   */
480  public Enumeration listOptions () {
481    Vector result = new Vector();
482   
483    result.addElement(new Option(
484        "\tnumber of clusters. (default = 2).", 
485        "N", 1, "-N <num>"));
486   
487    Enumeration en = super.listOptions();
488    while (en.hasMoreElements())
489      result.addElement(en.nextElement());
490   
491    return  result.elements();
492  }
493
494  /**
495   * Returns the tip text for this property
496   * @return tip text for this property suitable for
497   * displaying in the explorer/experimenter gui
498   */
499  public String numClustersTipText() {
500    return "set number of clusters";
501  }
502
503  /**
504   * set the number of clusters to generate
505   *
506   * @param n the number of clusters to generate
507   * @throws Exception if number of clusters is negative
508   */
509  public void setNumClusters(int n) throws Exception {
510    if (n < 0) {
511      throw new Exception("Number of clusters must be > 0");
512    }
513    m_NumClusters = n;
514  }
515
516  /**
517   * gets the number of clusters to generate
518   *
519   * @return the number of clusters to generate
520   */
521  public int getNumClusters() {
522    return m_NumClusters;
523  }
524
525  /**
526   * Parses a given list of options. <p/>
527   *
528   <!-- options-start -->
529   * Valid options are: <p/>
530   *
531   * <pre> -N &lt;num&gt;
532   *  number of clusters. (default = 2).</pre>
533   *
534   * <pre> -S &lt;num&gt;
535   *  Random number seed.
536   *  (default 1)</pre>
537   *
538   <!-- options-end -->
539   *
540   * @param options the list of options as an array of strings
541   * @throws Exception if an option is not supported
542   */
543  public void setOptions (String[] options)
544    throws Exception {
545
546    String optionString = Utils.getOption('N', options);
547
548    if (optionString.length() != 0) {
549      setNumClusters(Integer.parseInt(optionString));
550    }
551   
552    super.setOptions(options);
553  }
554
555  /**
556   * Gets the current settings of FarthestFirst
557   *
558   * @return an array of strings suitable for passing to setOptions()
559   */
560  public String[] getOptions () {
561    int         i;
562    Vector      result;
563    String[]    options;
564
565    result = new Vector();
566
567    result.add("-N");
568    result.add("" + getNumClusters());
569
570    options = super.getOptions();
571    for (i = 0; i < options.length; i++)
572      result.add(options[i]);
573
574    return (String[]) result.toArray(new String[result.size()]);         
575  }
576
577  /**
578   * return a string describing this clusterer
579   *
580   * @return a description of the clusterer as a string
581   */
582  public String toString() {
583    StringBuffer temp = new StringBuffer();
584
585    temp.append("\n FarthestFirst\n==============\n");
586
587    temp.append("\nCluster centroids:\n");
588    for (int i = 0; i < m_NumClusters; i++) {
589      temp.append("\nCluster "+i+"\n\t");
590      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
591        if (m_ClusterCentroids.attribute(j).isNominal()) {
592          temp.append(" "+m_ClusterCentroids.attribute(j).
593                      value((int)m_ClusterCentroids.instance(i).value(j)));
594        } else {
595          temp.append(" "+m_ClusterCentroids.instance(i).value(j));
596        }
597      }
598    }
599    temp.append("\n\n");
600    return temp.toString();
601  }
602 
603  /**
604   * Returns the revision string.
605   *
606   * @return            the revision
607   */
608  public String getRevision() {
609    return RevisionUtils.extract("$Revision: 5987 $");
610  }
611
612  /**
613   * Main method for testing this class.
614   *
615   * @param argv should contain the following arguments: <p>
616   * -t training file [-N number of clusters]
617   */
618  public static void main (String[] argv) {
619    runClusterer(new FarthestFirst(), argv);
620  }
621}
Note: See TracBrowser for help on using the repository browser.