Context Navigation

source: src/main/java/weka/clusterers/MakeDensityBasedClusterer.java @ 27

Last change on this file since 27 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 17.3 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* MakeDensityBasedClusterer.java
	19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22
	23	package weka.clusterers;
	24
	25	import weka.core.Capabilities;
	26	import weka.core.Instance;
	27	import weka.core.Instances;
	28	import weka.core.Option;
	29	import weka.core.OptionHandler;
	30	import weka.core.RevisionUtils;
	31	import weka.core.Utils;
	32	import weka.core.WeightedInstancesHandler;
	33	import weka.core.Capabilities.Capability;
	34	import weka.estimators.DiscreteEstimator;
	35	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
	36
	37	import java.util.Enumeration;
	38	import java.util.Vector;
	39
	40	/**
	41	<!-- globalinfo-start -->
	42	* Class for wrapping a Clusterer to make it return a distribution and density. Fits normal distributions and discrete distributions within each cluster produced by the wrapped clusterer. Supports the NumberOfClustersRequestable interface only if the wrapped Clusterer does.
	43	* <p/>
	44	<!-- globalinfo-end -->
	45	*
	46	<!-- options-start -->
	47	* Valid options are: <p/>
	48	*
	49	* <pre> -M <num>
	50	* minimum allowable standard deviation for normal density computation
	51	* (default 1e-6)</pre>
	52	*
	53	* <pre> -W <clusterer name>
	54	* Clusterer to wrap.
	55	* (default weka.clusterers.SimpleKMeans)</pre>
	56	*
	57	* <pre>
	58	* Options specific to clusterer weka.clusterers.SimpleKMeans:
	59	* </pre>
	60	*
	61	* <pre> -N <num>
	62	* number of clusters.
	63	* (default 2).</pre>
	64	*
	65	* <pre> -V
	66	* Display std. deviations for centroids.
	67	* </pre>
	68	*
	69	* <pre> -M
	70	* Replace missing values with mean/mode.
	71	* </pre>
	72	*
	73	* <pre> -S <num>
	74	* Random number seed.
	75	* (default 10)</pre>
	76	*
	77	<!-- options-end -->
	78	*
	79	* Options after "--" are passed on to the base clusterer.
	80	*
	81	* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
	82	* @author Mark Hall (mhall@cs.waikato.ac.nz)
	83	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
	84	* @version $Revision: 5488 $
	85	*/
	86	public class MakeDensityBasedClusterer
	87	extends AbstractDensityBasedClusterer
	88	implements NumberOfClustersRequestable,
	89	OptionHandler,
	90	WeightedInstancesHandler {
	91
	92	/** for serialization */
	93	static final long serialVersionUID = -5643302427972186631L;
	94
	95	/** holds training instances header information */
	96	private Instances m_theInstances;
	97	/** prior probabilities for the fitted clusters */
	98	private double [] m_priors;
	99	/** normal distributions fitted to each numeric attribute in each cluster */
	100	private double [][][] m_modelNormal;
	101	/** discrete distributions fitted to each discrete attribute in each cluster */
	102	private DiscreteEstimator [][] m_model;
	103	/** default minimum standard deviation */
	104	private double m_minStdDev = 1e-6;
	105	/** The clusterer being wrapped */
	106	private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();
	107	/** globally replace missing values */
	108	private ReplaceMissingValues m_replaceMissing;
	109
	110	/**
	111	* Default constructor.
	112	*
	113	*/
	114	public MakeDensityBasedClusterer() {
	115	super();
	116	}
	117
	118	/**
	119	* Contructs a MakeDensityBasedClusterer wrapping a given Clusterer.
	120	*
	121	* @param toWrap the clusterer to wrap around
	122	*/
	123	public MakeDensityBasedClusterer(Clusterer toWrap) {
	124
	125	setClusterer(toWrap);
	126	}
	127
	128	/**
	129	* Returns a string describing classifier
	130	* @return a description suitable for
	131	* displaying in the explorer/experimenter gui
	132	*/
	133	public String globalInfo() {
	134	return
	135	"Class for wrapping a Clusterer to make it return a distribution "
	136	+ "and density. Fits normal distributions and discrete distributions "
	137	+ "within each cluster produced by the wrapped clusterer. Supports the "
	138	+ "NumberOfClustersRequestable interface only if the wrapped Clusterer "
	139	+ "does.";
	140	}
	141
	142	/**
	143	* String describing default clusterer.
	144	*
	145	* @return the default clusterer classname
	146	*/
	147	protected String defaultClustererString() {
	148	return SimpleKMeans.class.getName();
	149	}
	150
	151	/**
	152	* Set the number of clusters to generate.
	153	*
	154	* @param n the number of clusters to generate
	155	* @throws Exception if the wrapped clusterer has not been set, or if
	156	* the wrapped clusterer does not implement this facility.
	157	*/
	158	public void setNumClusters(int n) throws Exception {
	159	if (m_wrappedClusterer == null) {
	160	throw new Exception("Can't set the number of clusters to generate - "
	161	+"no clusterer has been set yet.");
	162	}
	163	if (!(m_wrappedClusterer instanceof NumberOfClustersRequestable)) {
	164	throw new Exception("Can't set the number of clusters to generate - "
	165	+"wrapped clusterer does not support this facility.");
	166	}
	167
	168	((NumberOfClustersRequestable)m_wrappedClusterer).setNumClusters(n);
	169	}
	170
	171	/**
	172	* Returns default capabilities of the clusterer (i.e., of the wrapper
	173	* clusterer).
	174	*
	175	* @return the capabilities of this clusterer
	176	*/
	177	public Capabilities getCapabilities() {
	178	if (m_wrappedClusterer != null) {
	179	return m_wrappedClusterer.getCapabilities();
	180	}
	181	Capabilities result = super.getCapabilities();
	182	result.disableAll();
	183	result.enable(Capability.NO_CLASS);
	184
	185	return result;
	186	}
	187
	188	/**
	189	* Builds a clusterer for a set of instances.
	190	*
	191	* @param data the instances to train the clusterer with
	192	* @throws Exception if the clusterer hasn't been set or something goes wrong
	193	*/
	194	public void buildClusterer(Instances data) throws Exception {
	195	// can clusterer handle the data?
	196	getCapabilities().testWithFail(data);
	197
	198	m_replaceMissing = new ReplaceMissingValues();
	199	m_replaceMissing.setInputFormat(data);
	200	data = weka.filters.Filter.useFilter(data, m_replaceMissing);
	201
	202	m_theInstances = new Instances(data, 0);
	203	if (m_wrappedClusterer == null) {
	204	throw new Exception("No clusterer has been set");
	205	}
	206	m_wrappedClusterer.buildClusterer(data);
	207	m_model =
	208	new DiscreteEstimator[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
	209	m_modelNormal =
	210	new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()][2];
	211	double[][] weights = new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
	212	m_priors = new double[m_wrappedClusterer.numberOfClusters()];
	213	for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
	214	m_priors[i] = 1.0; // laplace correction
	215	for (int j = 0; j < data.numAttributes(); j++) {
	216	if (data.attribute(j).isNominal()) {
	217	m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(),
	218	true);
	219	}
	220	}
	221	}
	222
	223	Instance inst = null;
	224
	225	// Compute mean, etc.
	226	int[] clusterIndex = new int[data.numInstances()];
	227	for (int i = 0; i < data.numInstances(); i++) {
	228	inst = data.instance(i);
	229	int cluster = m_wrappedClusterer.clusterInstance(inst);
	230	m_priors[cluster] += inst.weight();
	231	for (int j = 0; j < data.numAttributes(); j++) {
	232	if (!inst.isMissing(j)) {
	233	if (data.attribute(j).isNominal()) {
	234	m_model[cluster][j].addValue(inst.value(j),inst.weight());
	235	} else {
	236	m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j);
	237	weights[cluster][j] += inst.weight();
	238	}
	239	}
	240	}
	241	clusterIndex[i] = cluster;
	242	}
	243
	244	for (int j = 0; j < data.numAttributes(); j++) {
	245	if (data.attribute(j).isNumeric()) {
	246	for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
	247	if (weights[i][j] > 0) {
	248	m_modelNormal[i][j][0] /= weights[i][j];
	249	}
	250	}
	251	}
	252	}
	253
	254	// Compute standard deviations
	255	for (int i = 0; i < data.numInstances(); i++) {
	256	inst = data.instance(i);
	257	for (int j = 0; j < data.numAttributes(); j++) {
	258	if (!inst.isMissing(j)) {
	259	if (data.attribute(j).isNumeric()) {
	260	double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j);
	261	m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff;
	262	}
	263	}
	264	}
	265	}
	266
	267	for (int j = 0; j < data.numAttributes(); j++) {
	268	if (data.attribute(j).isNumeric()) {
	269	for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
	270	if (weights[i][j] > 0) {
	271	m_modelNormal[i][j][1] =
	272	Math.sqrt(m_modelNormal[i][j][1] / weights[i][j]);
	273	} else if (weights[i][j] <= 0) {
	274	m_modelNormal[i][j][1] = Double.MAX_VALUE;
	275	}
	276	if (m_modelNormal[i][j][1] <= m_minStdDev) {
	277	m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;
	278	if (m_modelNormal[i][j][1] <= m_minStdDev) {
	279	m_modelNormal[i][j][1] = m_minStdDev;
	280	}
	281	}
	282	}
	283	}
	284	}
	285
	286	Utils.normalize(m_priors);
	287	}
	288
	289	/**
	290	* Returns the cluster priors.
	291	*
	292	* @return the cluster priors
	293	*/
	294	public double[] clusterPriors() {
	295
	296	double[] n = new double[m_priors.length];
	297
	298	System.arraycopy(m_priors, 0, n, 0, n.length);
	299	return n;
	300	}
	301
	302	/**
	303	* Computes the log of the conditional density (per cluster) for a given instance.
	304	*
	305	* @param inst the instance to compute the density for
	306	* @return an array containing the estimated densities
	307	* @throws Exception if the density could not be computed
	308	* successfully
	309	*/
	310	public double[] logDensityPerClusterForInstance(Instance inst) throws Exception {
	311
	312	int i, j;
	313	double logprob;
	314	double[] wghts = new double[m_wrappedClusterer.numberOfClusters()];
	315
	316	m_replaceMissing.input(inst);
	317	inst = m_replaceMissing.output();
	318
	319	for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
	320	logprob = 0;
	321	for (j = 0; j < inst.numAttributes(); j++) {
	322	if (!inst.isMissing(j)) {
	323	if (inst.attribute(j).isNominal()) {
	324	logprob += Math.log(m_model[i][j].getProbability(inst.value(j)));
	325	} else { // numeric attribute
	326	logprob += logNormalDens(inst.value(j),
	327	m_modelNormal[i][j][0],
	328	m_modelNormal[i][j][1]);
	329	}
	330	}
	331	}
	332	wghts[i] = logprob;
	333	}
	334	return wghts;
	335	}
	336
	337	/** Constant for normal distribution. */
	338	private static double m_normConst = 0.5 * Math.log(2 * Math.PI);
	339
	340	/**
	341	* Density function of normal distribution.
	342	* @param x input value
	343	* @param mean mean of distribution
	344	* @param stdDev standard deviation of distribution
	345	* @return the density
	346	*/
	347	private double logNormalDens (double x, double mean, double stdDev) {
	348
	349	double diff = x - mean;
	350
	351	return - (diff * diff / (2 * stdDev * stdDev)) - m_normConst - Math.log(stdDev);
	352	}
	353
	354	/**
	355	* Returns the number of clusters.
	356	*
	357	* @return the number of clusters generated for a training dataset.
	358	* @throws Exception if number of clusters could not be returned successfully
	359	*/
	360	public int numberOfClusters() throws Exception {
	361
	362	return m_wrappedClusterer.numberOfClusters();
	363	}
	364
	365	/**
	366	* Returns a description of the clusterer.
	367	*
	368	* @return a string containing a description of the clusterer
	369	*/
	370	public String toString() {
	371	if (m_priors == null) {
	372	return "No clusterer built yet!";
	373	}
	374
	375	StringBuffer text = new StringBuffer();
	376	text.append("MakeDensityBasedClusterer: \n\nWrapped clusterer: "
	377	+ m_wrappedClusterer.toString());
	378
	379	text.append("\nFitted estimators (with ML estimates of variance):\n");
	380
	381	for (int j = 0; j < m_priors.length; j++) {
	382	text.append("\nCluster: " + j + " Prior probability: "
	383	+ Utils.doubleToString(m_priors[j], 4) + "\n\n");
	384
	385	for (int i = 0; i < m_model[0].length; i++) {
	386	text.append("Attribute: " + m_theInstances.attribute(i).name() + "\n");
	387
	388	if (m_theInstances.attribute(i).isNominal()) {
	389	if (m_model[j][i] != null) {
	390	text.append(m_model[j][i].toString());
	391	}
	392	}
	393	else {
	394	text.append("Normal Distribution. Mean = "
	395	+ Utils.doubleToString(m_modelNormal[j][i][0], 4)
	396	+ " StdDev = "
	397	+ Utils.doubleToString(m_modelNormal[j][i][1], 4)
	398	+ "\n");
	399	}
	400	}
	401	}
	402
	403	return text.toString();
	404	}
	405
	406	/**
	407	* Returns the tip text for this property
	408	* @return tip text for this property suitable for
	409	* displaying in the explorer/experimenter gui
	410	*/
	411	public String clustererTipText() {
	412	return "the clusterer to wrap";
	413	}
	414
	415	/**
	416	* Sets the clusterer to wrap.
	417	*
	418	* @param toWrap the clusterer
	419	*/
	420	public void setClusterer(Clusterer toWrap) {
	421
	422	m_wrappedClusterer = toWrap;
	423	}
	424
	425	/**
	426	* Gets the clusterer being wrapped.
	427	*
	428	* @return the clusterer
	429	*/
	430	public Clusterer getClusterer() {
	431
	432	return m_wrappedClusterer;
	433	}
	434
	435	/**
	436	* Returns the tip text for this property
	437	* @return tip text for this property suitable for
	438	* displaying in the explorer/experimenter gui
	439	*/
	440	public String minStdDevTipText() {
	441	return "set minimum allowable standard deviation";
	442	}
	443
	444	/**
	445	* Set the minimum value for standard deviation when calculating
	446	* normal density. Reducing this value can help prevent arithmetic
	447	* overflow resulting from multiplying large densities (arising from small
	448	* standard deviations) when there are many singleton or near singleton
	449	* values.
	450	* @param m minimum value for standard deviation
	451	*/
	452	public void setMinStdDev(double m) {
	453	m_minStdDev = m;
	454	}
	455
	456	/**
	457	* Get the minimum allowable standard deviation.
	458	* @return the minumum allowable standard deviation
	459	*/
	460	public double getMinStdDev() {
	461	return m_minStdDev;
	462	}
	463
	464	/**
	465	* Returns an enumeration describing the available options..
	466	*
	467	* @return an enumeration of all the available options.
	468	*/
	469	public Enumeration listOptions() {
	470	Vector result = new Vector();
	471
	472	result.addElement(new Option(
	473	"\tminimum allowable standard deviation for normal density computation "
	474	+"\n\t(default 1e-6)"
	475	,"M",1,"-M <num>"));
	476
	477	result.addElement(new Option(
	478	"\tClusterer to wrap.\n"
	479	+ "\t(default " + defaultClustererString() + ")",
	480	"W", 1,"-W <clusterer name>"));
	481
	482	if ((m_wrappedClusterer != null) &&
	483	(m_wrappedClusterer instanceof OptionHandler)) {
	484	result.addElement(new Option(
	485	"",
	486	"", 0, "\nOptions specific to clusterer "
	487	+ m_wrappedClusterer.getClass().getName() + ":"));
	488	Enumeration enu = ((OptionHandler)m_wrappedClusterer).listOptions();
	489	while (enu.hasMoreElements()) {
	490	result.addElement(enu.nextElement());
	491	}
	492	}
	493
	494	return result.elements();
	495	}
	496
	497	/**
	498	* Parses a given list of options. <p/>
	499	*
	500	<!-- options-start -->
	501	* Valid options are: <p/>
	502	*
	503	* <pre> -M <num>
	504	* minimum allowable standard deviation for normal density computation
	505	* (default 1e-6)</pre>
	506	*
	507	* <pre> -W <clusterer name>
	508	* Clusterer to wrap.
	509	* (default weka.clusterers.SimpleKMeans)</pre>
	510	*
	511	* <pre>
	512	* Options specific to clusterer weka.clusterers.SimpleKMeans:
	513	* </pre>
	514	*
	515	* <pre> -N <num>
	516	* number of clusters.
	517	* (default 2).</pre>
	518	*
	519	* <pre> -V
	520	* Display std. deviations for centroids.
	521	* </pre>
	522	*
	523	* <pre> -M
	524	* Replace missing values with mean/mode.
	525	* </pre>
	526	*
	527	* <pre> -S <num>
	528	* Random number seed.
	529	* (default 10)</pre>
	530	*
	531	<!-- options-end -->
	532	*
	533	* @param options the list of options as an array of strings
	534	* @throws Exception if an option is not supported
	535	*/
	536	public void setOptions(String[] options) throws Exception {
	537
	538	String optionString = Utils.getOption('M', options);
	539	if (optionString.length() != 0)
	540	setMinStdDev((new Double(optionString)).doubleValue());
	541	else
	542	setMinStdDev(1e-6);
	543
	544	String wString = Utils.getOption('W', options);
	545	if (wString.length() == 0)
	546	wString = defaultClustererString();
	547	setClusterer(AbstractClusterer.forName(wString, Utils.partitionOptions(options)));
	548	}
	549
	550	/**
	551	* Gets the current settings of the clusterer.
	552	*
	553	* @return an array of strings suitable for passing to setOptions()
	554	*/
	555	public String[] getOptions() {
	556
	557	String [] clustererOptions = new String [0];
	558	if ((m_wrappedClusterer != null) &&
	559	(m_wrappedClusterer instanceof OptionHandler)) {
	560	clustererOptions = ((OptionHandler)m_wrappedClusterer).getOptions();
	561	}
	562	String [] options = new String [clustererOptions.length + 5];
	563	int current = 0;
	564
	565	options[current++] = "-M";
	566	options[current++] = ""+getMinStdDev();
	567
	568	if (getClusterer() != null) {
	569	options[current++] = "-W";
	570	options[current++] = getClusterer().getClass().getName();
	571	}
	572	options[current++] = "--";
	573
	574	System.arraycopy(clustererOptions, 0, options, current,
	575	clustererOptions.length);
	576	current += clustererOptions.length;
	577	while (current < options.length) {
	578	options[current++] = "";
	579	}
	580	return options;
	581	}
	582
	583	/**
	584	* Returns the revision string.
	585	*
	586	* @return the revision
	587	*/
	588	public String getRevision() {
	589	return RevisionUtils.extract("$Revision: 5488 $");
	590	}
	591
	592	/**
	593	* Main method for testing this class.
	594	*
	595	* @param argv the options
	596	*/
	597	public static void main(String [] argv) {
	598	runClusterer(new MakeDensityBasedClusterer(), argv);
	599	}
	600	}
	601

Note: See TracBrowser for help on using the repository browser.

Download in other formats: