source: src/main/java/weka/clusterers/DBScan.java @ 26

Last change on this file since 26 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 22.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    Copyright (C) 2004
19 *    & Matthias Schubert (schubert@dbs.ifi.lmu.de)
20 *    & Zhanna Melnikova-Albrecht (melnikov@cip.ifi.lmu.de)
21 *    & Rainer Holzmann (holzmann@cip.ifi.lmu.de)
22 */
23
24package weka.clusterers;
25
26import weka.clusterers.forOPTICSAndDBScan.DataObjects.DataObject;
27import weka.clusterers.forOPTICSAndDBScan.Databases.Database;
28import weka.core.Capabilities;
29import weka.core.Instance;
30import weka.core.Instances;
31import weka.core.Option;
32import weka.core.OptionHandler;
33import weka.core.RevisionUtils;
34import weka.core.TechnicalInformation;
35import weka.core.TechnicalInformationHandler;
36import weka.core.Utils;
37import weka.core.Capabilities.Capability;
38import weka.core.TechnicalInformation.Field;
39import weka.core.TechnicalInformation.Type;
40import weka.filters.Filter;
41import weka.filters.unsupervised.attribute.ReplaceMissingValues;
42
43import java.lang.reflect.Constructor;
44import java.lang.reflect.InvocationTargetException;
45import java.text.DecimalFormat;
46import java.util.Enumeration;
47import java.util.Iterator;
48import java.util.List;
49import java.util.Vector;
50
51/**
52 <!-- globalinfo-start -->
53 * Martin Ester, Hans-Peter Kriegel, Joerg Sander, Xiaowei Xu: A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise. In: Second International Conference on Knowledge Discovery and Data Mining, 226-231, 1996.
54 * <p/>
55 <!-- globalinfo-end -->
56 *
57 <!-- technical-bibtex-start -->
58 * BibTeX:
59 * <pre>
60 * &#64;inproceedings{Ester1996,
61 *    author = {Martin Ester and Hans-Peter Kriegel and Joerg Sander and Xiaowei Xu},
62 *    booktitle = {Second International Conference on Knowledge Discovery and Data Mining},
63 *    editor = {Evangelos Simoudis and Jiawei Han and Usama M. Fayyad},
64 *    pages = {226-231},
65 *    publisher = {AAAI Press},
66 *    title = {A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise},
67 *    year = {1996}
68 * }
69 * </pre>
70 * <p/>
71 <!-- technical-bibtex-end -->
72 *
73 <!-- options-start -->
74 * Valid options are: <p/>
75 *
76 * <pre> -E &lt;double&gt;
77 *  epsilon (default = 0.9)</pre>
78 *
79 * <pre> -M &lt;int&gt;
80 *  minPoints (default = 6)</pre>
81 *
82 * <pre> -I &lt;String&gt;
83 *  index (database) used for DBScan (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)</pre>
84 *
85 * <pre> -D &lt;String&gt;
86 *  distance-type (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject)</pre>
87 *
88 <!-- options-end -->
89 *
90 * @author Matthias Schubert (schubert@dbs.ifi.lmu.de)
91 * @author Zhanna Melnikova-Albrecht (melnikov@cip.ifi.lmu.de)
92 * @author Rainer Holzmann (holzmann@cip.ifi.lmu.de)
93 * @version $Revision: 5488 $
94 */
95public class DBScan 
96    extends AbstractClusterer
97    implements OptionHandler, TechnicalInformationHandler {
98
99    /** for serialization */
100    static final long serialVersionUID = -1666498248451219728L;
101 
102    /**
103     * Specifies the radius for a range-query
104     */
105    private double epsilon = 0.9;
106
107    /**
108     * Specifies the density (the range-query must contain at least minPoints DataObjects)
109     */
110    private int minPoints = 6;
111
112    /**
113     * Replace missing values in training instances
114     */
115    private ReplaceMissingValues replaceMissingValues_Filter;
116
117    /**
118     * Holds the number of clusters generated
119     */
120    private int numberOfGeneratedClusters;
121
122    /**
123     * Holds the distance-type that is used
124     * (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject)
125     */
126    private String database_distanceType = "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject";
127
128    /**
129     * Holds the type of the used database
130     * (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
131     */
132    private String database_Type = "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase";
133
134    /**
135     * The database that is used for DBScan
136     */
137    private Database database;
138
139    /**
140     * Holds the current clusterID
141     */
142    private int clusterID;
143
144    /**
145     * Counter for the processed instances
146     */
147    private int processed_InstanceID;
148
149    /**
150     * Holds the time-value (seconds) for the duration of the clustering-process
151     */
152    private double elapsedTime;
153
154    /**
155     * Returns default capabilities of the clusterer.
156     *
157     * @return      the capabilities of this clusterer
158     */
159    public Capabilities getCapabilities() {
160      Capabilities result = super.getCapabilities();
161      result.disableAll();
162      result.enable(Capability.NO_CLASS);
163
164      // attributes
165      result.enable(Capability.NOMINAL_ATTRIBUTES);
166      result.enable(Capability.NUMERIC_ATTRIBUTES);
167      result.enable(Capability.DATE_ATTRIBUTES);
168      result.enable(Capability.MISSING_VALUES);
169
170      return result;
171    }
172
173    // *****************************************************************************************************************
174    // constructors
175    // *****************************************************************************************************************
176
177    // *****************************************************************************************************************
178    // methods
179    // *****************************************************************************************************************
180
181    /**
182     * Generate Clustering via DBScan
183     * @param instances The instances that need to be clustered
184     * @throws java.lang.Exception If clustering was not successful
185     */
186    public void buildClusterer(Instances instances) throws Exception {
187        // can clusterer handle the data?
188        getCapabilities().testWithFail(instances);
189
190        long time_1 = System.currentTimeMillis();
191
192        processed_InstanceID = 0;
193        numberOfGeneratedClusters = 0;
194        clusterID = 0;
195
196        replaceMissingValues_Filter = new ReplaceMissingValues();
197        replaceMissingValues_Filter.setInputFormat(instances);
198        Instances filteredInstances = Filter.useFilter(instances, replaceMissingValues_Filter);
199
200        database = databaseForName(getDatabase_Type(), filteredInstances);
201        for (int i = 0; i < database.getInstances().numInstances(); i++) {
202            DataObject dataObject = dataObjectForName(getDatabase_distanceType(),
203                    database.getInstances().instance(i),
204                    Integer.toString(i),
205                    database);
206            database.insert(dataObject);
207        }
208        database.setMinMaxValues();
209
210        Iterator iterator = database.dataObjectIterator();
211        while (iterator.hasNext()) {
212            DataObject dataObject = (DataObject) iterator.next();
213            if (dataObject.getClusterLabel() == DataObject.UNCLASSIFIED) {
214                if (expandCluster(dataObject)) {
215                    clusterID++;
216                    numberOfGeneratedClusters++;
217                }
218            }
219        }
220
221        long time_2 = System.currentTimeMillis();
222        elapsedTime = (double) (time_2 - time_1) / 1000.0;
223    }
224
225    /**
226     * Assigns this dataObject to a cluster or remains it as NOISE
227     * @param dataObject The DataObject that needs to be assigned
228     * @return true, if the DataObject could be assigned, else false
229     */
230    private boolean expandCluster(DataObject dataObject) {
231        List seedList = database.epsilonRangeQuery(getEpsilon(), dataObject);
232        /** dataObject is NO coreObject */
233        if (seedList.size() < getMinPoints()) {
234            dataObject.setClusterLabel(DataObject.NOISE);
235            return false;
236        }
237
238        /** dataObject is coreObject */
239        for (int i = 0; i < seedList.size(); i++) {
240            DataObject seedListDataObject = (DataObject) seedList.get(i);
241            /** label this seedListDataObject with the current clusterID, because it is in epsilon-range */
242            seedListDataObject.setClusterLabel(clusterID);
243            if (seedListDataObject.equals(dataObject)) {
244                seedList.remove(i);
245                i--;
246            }
247        }
248
249        /** Iterate the seedList of the startDataObject */
250        for (int j = 0; j < seedList.size(); j++) {
251            DataObject seedListDataObject = (DataObject) seedList.get(j);
252            List seedListDataObject_Neighbourhood = database.epsilonRangeQuery(getEpsilon(), seedListDataObject);
253
254            /** seedListDataObject is coreObject */
255            if (seedListDataObject_Neighbourhood.size() >= getMinPoints()) {
256                for (int i = 0; i < seedListDataObject_Neighbourhood.size(); i++) {
257                    DataObject p = (DataObject) seedListDataObject_Neighbourhood.get(i);
258                    if (p.getClusterLabel() == DataObject.UNCLASSIFIED || p.getClusterLabel() == DataObject.NOISE) {
259                        if (p.getClusterLabel() == DataObject.UNCLASSIFIED) {
260                            seedList.add(p);
261                        }
262                        p.setClusterLabel(clusterID);
263                    }
264                }
265            }
266            seedList.remove(j);
267            j--;
268        }
269
270        return true;
271    }
272
273    /**
274     * Classifies a given instance.
275     *
276     * @param instance The instance to be assigned to a cluster
277     * @return int The number of the assigned cluster as an integer
278     * @throws java.lang.Exception If instance could not be clustered
279     * successfully
280     */
281    public int clusterInstance(Instance instance) throws Exception {
282        if (processed_InstanceID >= database.size()) processed_InstanceID = 0;
283        int cnum = (database.getDataObject(Integer.toString(processed_InstanceID++))).getClusterLabel();
284        if (cnum == DataObject.NOISE)
285            throw new Exception();
286        else
287            return cnum;
288    }
289
290    /**
291     * Returns the number of clusters.
292     *
293     * @return int The number of clusters generated for a training dataset.
294     * @throws java.lang.Exception if number of clusters could not be returned
295     * successfully
296     */
297    public int numberOfClusters() throws Exception {
298        return numberOfGeneratedClusters;
299    }
300
301    /**
302     * Returns an enumeration of all the available options..
303     *
304     * @return Enumeration An enumeration of all available options.
305     */
306    public Enumeration listOptions() {
307        Vector vector = new Vector();
308
309        vector.addElement(
310                new Option("\tepsilon (default = 0.9)",
311                        "E",
312                        1,
313                        "-E <double>"));
314        vector.addElement(
315                new Option("\tminPoints (default = 6)",
316                        "M",
317                        1,
318                        "-M <int>"));
319        vector.addElement(
320                new Option("\tindex (database) used for DBScan (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)",
321                        "I",
322                        1,
323                        "-I <String>"));
324        vector.addElement(
325                new Option("\tdistance-type (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject)",
326                        "D",
327                        1,
328                        "-D <String>"));
329        return vector.elements();
330    }
331
332    /**
333     * Sets the OptionHandler's options using the given list. All options
334     * will be set (or reset) during this call (i.e. incremental setting
335     * of options is not possible). <p/>
336     *
337     <!-- options-start -->
338     * Valid options are: <p/>
339     *
340     * <pre> -E &lt;double&gt;
341     *  epsilon (default = 0.9)</pre>
342     *
343     * <pre> -M &lt;int&gt;
344     *  minPoints (default = 6)</pre>
345     *
346     * <pre> -I &lt;String&gt;
347     *  index (database) used for DBScan (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)</pre>
348     *
349     * <pre> -D &lt;String&gt;
350     *  distance-type (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject)</pre>
351     *
352     <!-- options-end -->
353     *
354     * @param options The list of options as an array of strings
355     * @throws java.lang.Exception If an option is not supported
356     */
357    public void setOptions(String[] options) throws Exception {
358        String optionString = Utils.getOption('E', options);
359        if (optionString.length() != 0) {
360            setEpsilon(Double.parseDouble(optionString));
361        }
362
363        optionString = Utils.getOption('M', options);
364        if (optionString.length() != 0) {
365            setMinPoints(Integer.parseInt(optionString));
366        }
367
368        optionString = Utils.getOption('I', options);
369        if (optionString.length() != 0) {
370            setDatabase_Type(optionString);
371        }
372
373        optionString = Utils.getOption('D', options);
374        if (optionString.length() != 0) {
375            setDatabase_distanceType(optionString);
376        }
377    }
378
379    /**
380     * Gets the current option settings for the OptionHandler.
381     *
382     * @return String[] The list of current option settings as an array of strings
383     */
384    public String[] getOptions() {
385        String[] options = new String[8];
386        int current = 0;
387
388        options[current++] = "-E";
389        options[current++] = "" + getEpsilon();
390        options[current++] = "-M";
391        options[current++] = "" + getMinPoints();
392        options[current++] = "-I";
393        options[current++] = "" + getDatabase_Type();
394        options[current++] = "-D";
395        options[current++] = "" + getDatabase_distanceType();
396
397        return options;
398    }
399
400    /**
401     * Returns a new Class-Instance of the specified database
402     * @param database_Type String of the specified database
403     * @param instances Instances that were delivered from WEKA
404     * @return Database New constructed Database
405     */
406    public Database databaseForName(String database_Type, Instances instances) {
407        Object o = null;
408
409        Constructor co = null;
410        try {
411            co = (Class.forName(database_Type)).getConstructor(new Class[]{Instances.class});
412            o = co.newInstance(new Object[]{instances});
413        } catch (NoSuchMethodException e) {
414            e.printStackTrace();
415        } catch (SecurityException e) {
416            e.printStackTrace();
417        } catch (ClassNotFoundException e) {
418            e.printStackTrace();
419        } catch (InstantiationException e) {
420            e.printStackTrace();
421        } catch (IllegalAccessException e) {
422            e.printStackTrace();
423        } catch (InvocationTargetException e) {
424            e.printStackTrace();
425        }
426
427        return (Database) o;
428    }
429
430    /**
431     * Returns a new Class-Instance of the specified database
432     * @param database_distanceType String of the specified distance-type
433     * @param instance The original instance that needs to hold by this DataObject
434     * @param key Key for this DataObject
435     * @param database Link to the database
436     * @return DataObject New constructed DataObject
437     */
438    public DataObject dataObjectForName(String database_distanceType, Instance instance, String key, Database database) {
439        Object o = null;
440
441        Constructor co = null;
442        try {
443            co = (Class.forName(database_distanceType)).
444                    getConstructor(new Class[]{Instance.class, String.class, Database.class});
445            o = co.newInstance(new Object[]{instance, key, database});
446        } catch (NoSuchMethodException e) {
447            e.printStackTrace();
448        } catch (SecurityException e) {
449            e.printStackTrace();
450        } catch (ClassNotFoundException e) {
451            e.printStackTrace();
452        } catch (InstantiationException e) {
453            e.printStackTrace();
454        } catch (IllegalAccessException e) {
455            e.printStackTrace();
456        } catch (InvocationTargetException e) {
457            e.printStackTrace();
458        }
459
460        return (DataObject) o;
461    }
462
463    /**
464     * Sets a new value for minPoints
465     * @param minPoints MinPoints
466     */
467    public void setMinPoints(int minPoints) {
468        this.minPoints = minPoints;
469    }
470
471    /**
472     * Sets a new value for epsilon
473     * @param epsilon Epsilon
474     */
475    public void setEpsilon(double epsilon) {
476        this.epsilon = epsilon;
477    }
478
479    /**
480     * Returns the value of epsilon
481     * @return double Epsilon
482     */
483    public double getEpsilon() {
484        return epsilon;
485    }
486
487    /**
488     * Returns the value of minPoints
489     * @return int MinPoints
490     */
491    public int getMinPoints() {
492        return minPoints;
493    }
494
495    /**
496     * Returns the distance-type
497     * @return String Distance-type
498     */
499    public String getDatabase_distanceType() {
500        return database_distanceType;
501    }
502
503    /**
504     * Returns the type of the used index (database)
505     * @return String Index-type
506     */
507    public String getDatabase_Type() {
508        return database_Type;
509    }
510
511    /**
512     * Sets a new distance-type
513     * @param database_distanceType The new distance-type
514     */
515    public void setDatabase_distanceType(String database_distanceType) {
516        this.database_distanceType = database_distanceType;
517    }
518
519    /**
520     * Sets a new database-type
521     * @param database_Type The new database-type
522     */
523    public void setDatabase_Type(String database_Type) {
524        this.database_Type = database_Type;
525    }
526
527    /**
528     * Returns the tip text for this property
529     * @return tip text for this property suitable for
530     * displaying in the explorer/experimenter gui
531     */
532    public String epsilonTipText() {
533        return "radius of the epsilon-range-queries";
534    }
535
536    /**
537     * Returns the tip text for this property
538     * @return tip text for this property suitable for
539     * displaying in the explorer/experimenter gui
540     */
541    public String minPointsTipText() {
542        return "minimun number of DataObjects required in an epsilon-range-query";
543    }
544
545    /**
546     * Returns the tip text for this property
547     * @return tip text for this property suitable for
548     * displaying in the explorer/experimenter gui
549     */
550    public String database_TypeTipText() {
551        return "used database";
552    }
553
554    /**
555     * Returns the tip text for this property
556     * @return tip text for this property suitable for
557     * displaying in the explorer/experimenter gui
558     */
559    public String database_distanceTypeTipText() {
560        return "used distance-type";
561    }
562
563    /**
564     * Returns a string describing this DataMining-Algorithm
565     * @return String Information for the gui-explorer
566     */
567    public String globalInfo() {
568        return getTechnicalInformation().toString();
569    }
570
571    /**
572     * Returns an instance of a TechnicalInformation object, containing
573     * detailed information about the technical background of this class,
574     * e.g., paper reference or book this class is based on.
575     *
576     * @return the technical information about this class
577     */
578    public TechnicalInformation getTechnicalInformation() {
579      TechnicalInformation      result;
580     
581      result = new TechnicalInformation(Type.INPROCEEDINGS);
582      result.setValue(Field.AUTHOR, "Martin Ester and Hans-Peter Kriegel and Joerg Sander and Xiaowei Xu");
583      result.setValue(Field.TITLE, "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise");
584      result.setValue(Field.BOOKTITLE, "Second International Conference on Knowledge Discovery and Data Mining");
585      result.setValue(Field.EDITOR, "Evangelos Simoudis and Jiawei Han and Usama M. Fayyad");
586      result.setValue(Field.YEAR, "1996");
587      result.setValue(Field.PAGES, "226-231");
588      result.setValue(Field.PUBLISHER, "AAAI Press");
589     
590      return result;
591    }
592
593    /**
594     * Returns a description of the clusterer
595     *
596     * @return a string representation of the clusterer
597     */
598    public String toString() {
599        StringBuffer stringBuffer = new StringBuffer();
600        stringBuffer.append("DBScan clustering results\n" +
601                "========================================================================================\n\n");
602        stringBuffer.append("Clustered DataObjects: " + database.size() + "\n");
603        stringBuffer.append("Number of attributes: " + database.getInstances().numAttributes() + "\n");
604        stringBuffer.append("Epsilon: " + getEpsilon() + "; minPoints: " + getMinPoints() + "\n");
605        stringBuffer.append("Index: " + getDatabase_Type() + "\n");
606        stringBuffer.append("Distance-type: " + getDatabase_distanceType() + "\n");
607        stringBuffer.append("Number of generated clusters: " + numberOfGeneratedClusters + "\n");
608        DecimalFormat decimalFormat = new DecimalFormat(".##");
609        stringBuffer.append("Elapsed time: " + decimalFormat.format(elapsedTime) + "\n\n");
610
611        for (int i = 0; i < database.size(); i++) {
612            DataObject dataObject = database.getDataObject(Integer.toString(i));
613            stringBuffer.append("(" + Utils.doubleToString(Double.parseDouble(dataObject.getKey()),
614                    (Integer.toString(database.size()).length()), 0) + ".) "
615                    + Utils.padRight(dataObject.toString(), 69) + "  -->  " +
616                    ((dataObject.getClusterLabel() == DataObject.NOISE) ?
617                    "NOISE\n" : dataObject.getClusterLabel() + "\n"));
618        }
619        return stringBuffer.toString() + "\n";
620    }
621   
622    /**
623     * Returns the revision string.
624     *
625     * @return          the revision
626     */
627    public String getRevision() {
628      return RevisionUtils.extract("$Revision: 5488 $");
629    }
630
631    /**
632     * Main Method for testing DBScan
633     * @param args Valid parameters are: 'E' epsilon (default = 0.9); 'M' minPoints (default = 6);
634     *                                   'I' index-type (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase);
635     *                                   'D' distance-type (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject);
636     */
637    public static void main(String[] args) {
638        runClusterer(new DBScan(), args);
639    }
640}
Note: See TracBrowser for help on using the repository browser.