/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* FarthestFirst.java
* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
*
*/
package weka.clusterers;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
/**
* Cluster data using the FarthestFirst algorithm.
*
* For more information see:
*
* Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.
*
* Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.
*
* Notes:
* - works as a fast simple approximate clusterer
* - modelled after SimpleKMeans, might be a useful initializer for it
*
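 * A minimal usage sketch (illustrative only; "data" stands for any
 * weka.core.Instances object, and the calls follow the standard
 * weka.clusterers API):
 * <pre>
 * FarthestFirst ff = new FarthestFirst();
 * ff.setNumClusters(3);     // corresponds to option -N 3
 * ff.setSeed(42);           // corresponds to option -S 42
 * ff.buildClusterer(data);
 * int c = ff.clusterInstance(data.instance(0));   // nearest-centroid assignment
 * </pre>
 *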
 * BibTeX:
 * <pre>
 * &#64;article{Hochbaum1985,
 *    author = {Hochbaum and Shmoys},
 *    journal = {Mathematics of Operations Research},
 *    number = {2},
 *    pages = {180-184},
 *    title = {A best possible heuristic for the k-center problem},
 *    volume = {10},
 *    year = {1985}
 * }
 *
 * &#64;inproceedings{Dasgupta2002,
 *    author = {Sanjoy Dasgupta},
 *    booktitle = {15th Annual Conference on Computational Learning Theory},
 *    pages = {351-363},
 *    publisher = {Springer},
 *    title = {Performance Guarantees for Hierarchical Clustering},
 *    year = {2002}
 * }
 * </pre>
 *
 * Valid options are:
 *
 * <pre> -N &lt;num&gt;
 *  number of clusters. (default = 2).</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 5987 $
 * @see RandomizableClusterer
 */
public class FarthestFirst
  extends RandomizableClusterer
  implements TechnicalInformationHandler {

  // TODO: rewrite to be fully incremental
  //       cleanup, like deleting m_instances

  /** for serialization */
  static final long serialVersionUID = 7499838100631329509L;

  /**
   * training instances, not necessary to keep,
   * could be replaced by m_ClusterCentroids where needed for header info
   */
  protected Instances m_instances;

  /**
   * replace missing values in training instances
   */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate
   */
  protected int m_NumClusters = 2;

  /**
   * holds the cluster centroids
   */
  protected Instances m_ClusterCentroids;

  /**
   * attribute min values
   */
  private double[] m_Min;

  /**
   * attribute max values
   */
  private double[] m_Max;

  /**
   * Returns a string describing this clusterer
   * @return a description of the clusterer suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Cluster data using the FarthestFirst algorithm.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString() + "\n\n"
      + "Notes:\n"
      + "- works as a fast simple approximate clusterer\n"
      + "- modelled after SimpleKMeans, might be a useful initializer for it";
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;
    TechnicalInformation additional;

    result = new TechnicalInformation(Type.ARTICLE);
    result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
    result.setValue(Field.YEAR, "1985");
    result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem");
    result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
    result.setValue(Field.VOLUME, "10");
    result.setValue(Field.NUMBER, "2");
    result.setValue(Field.PAGES, "180-184");

    additional = result.add(Type.INPROCEEDINGS);
    additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
    additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering");
    additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory");
    additional.setValue(Field.YEAR, "2002");
    additional.setValue(Field.PAGES, "351-363");
    additional.setValue(Field.PUBLISHER, "Springer");

    return result;
  }

  /**
   * Returns default capabilities of the clusterer.
   *
   * @return the capabilities of this clusterer
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();
    result.enable(Capability.NO_CLASS);

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the clusterer has not been
   * generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    //long start = System.currentTimeMillis();

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    initMinMax(m_instances);

    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

    int n = m_instances.numInstances();
    Random r = new Random(getSeed());
    boolean[] selected = new boolean[n];
    double[] minDistance = new double[n];

    for(int i = 0; i
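    // Illustrative sketch of the farthest-first selection this method performs
    // (not the verbatim implementation): a random instance becomes the first
    // centroid, and each further centroid is the instance whose distance to its
    // nearest chosen centroid is largest. "distance(a, b)" below stands for this
    // class's min/max-normalized distance and is assumed, not shown here.
    //
    //   int first = r.nextInt(n);
    //   selected[first] = true;
    //   m_ClusterCentroids.add(m_instances.instance(first));
    //   for (int j = 0; j < n; j++)
    //     minDistance[j] = distance(m_instances.instance(j), m_instances.instance(first));
    //
    //   for (int c = 1; c < m_NumClusters && c < n; c++) {
    //     // next centroid = unselected instance with the largest minDistance
    //     int farthest = -1;
    //     for (int j = 0; j < n; j++)
    //       if (!selected[j] && (farthest < 0 || minDistance[j] > minDistance[farthest]))
    //         farthest = j;
    //     selected[farthest] = true;
    //     m_ClusterCentroids.add(m_instances.instance(farthest));
    //     // keep each instance's distance to its nearest chosen centroid up to date
    //     for (int j = 0; j < n; j++)
    //       minDistance[j] = Math.min(minDistance[j],
    //           distance(m_instances.instance(j), m_instances.instance(farthest)));
    //   }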
  /**
   * Parses a given list of options. Valid options are:
   *
   * <pre> -N &lt;num&gt;
   *  number of clusters. (default = 2).</pre>
   *
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 1)</pre>
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions (String[] options) throws Exception {
    String optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    super.setOptions(options);
  }

  /**
   * Gets the current settings of FarthestFirst
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    int       i;
    Vector    result;
    String[]  options;

    result = new Vector();

    result.add("-N");
    result.add("" + getNumClusters());

    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    return (String[]) result.toArray(new String[result.size()]);
  }
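  // Illustrative sketch of setting the same options from a single string
  // (weka.core.Utils.splitOptions is the standard tokenizer; the values are
  // arbitrary examples):
  //
  //   FarthestFirst ff = new FarthestFirst();
  //   ff.setOptions(Utils.splitOptions("-N 3 -S 42"));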
  /**
   * return a string describing this clusterer
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    StringBuffer temp = new StringBuffer();

    temp.append("\n FarthestFirst\n==============\n");

    temp.append("\nCluster centroids:\n");
    for (int i = 0; i < m_NumClusters; i++) {
      temp.append("\nCluster " + i + "\n\t");
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
        if (m_ClusterCentroids.attribute(j).isNominal()) {
          temp.append(" " + m_ClusterCentroids.attribute(j).
                      value((int) m_ClusterCentroids.instance(i).value(j)));
        } else {
          temp.append(" " + m_ClusterCentroids.instance(i).value(j));
        }
      }
    }
    temp.append("\n\n");
    return temp.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5987 $");
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments:
   * -t training file [-N number of clusters]
   */
  public static void main (String[] argv) {
    runClusterer(new FarthestFirst(), argv);
  }
}
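// Illustrative command line for the main method above (the ARFF path is a
// placeholder, not part of this distribution):
//
//   java weka.clusterers.FarthestFirst -t some-data.arff -N 3 -S 1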