Context Navigation

source: src/main/java/weka/classifiers/meta/ClassificationViaClustering.java @ 16

Last change on this file since 16 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 14.2 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* ClassificationViaClustering.java
	19	* Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
	20	*/
	21
	22	package weka.classifiers.meta;
	23
	24	import weka.classifiers.Classifier;
	25	import weka.classifiers.AbstractClassifier;
	26	import weka.classifiers.rules.ZeroR;
	27	import weka.clusterers.ClusterEvaluation;
	28	import weka.clusterers.Clusterer;
	29	import weka.clusterers.AbstractClusterer;
	30	import weka.clusterers.SimpleKMeans;
	31	import weka.core.Capabilities;
	32	import weka.core.Instance;
	33	import weka.core.DenseInstance;
	34	import weka.core.Instances;
	35	import weka.core.Option;
	36	import weka.core.OptionHandler;
	37	import weka.core.RevisionUtils;
	38	import weka.core.Utils;
	39	import weka.core.Capabilities.Capability;
	40
	41	import java.util.Enumeration;
	42	import java.util.Vector;
	43
	44	/**
	45	<!-- globalinfo-start -->
	46	* A simple meta-classifier that uses a clusterer for classification. For cluster algorithms that use a fixed number of clusterers, like SimpleKMeans, the user has to make sure that the number of clusters to generate are the same as the number of class labels in the dataset in order to obtain a useful model.<br/>
	47	* <br/>
	48	* Note: at prediction time, a missing value is returned if no cluster is found for the instance.<br/>
	49	* <br/>
	50	* The code is based on the 'clusters to classes' functionality of the weka.clusterers.ClusterEvaluation class by Mark Hall.
	51	* <p/>
	52	<!-- globalinfo-end -->
	53	*
	54	<!-- options-start -->
	55	* Valid options are: <p/>
	56	*
	57	* <pre> -D
	58	* If set, classifier is run in debug mode and
	59	* may output additional info to the console</pre>
	60	*
	61	* <pre> -W
	62	* Full name of clusterer.
	63	* (default: weka.clusterers.SimpleKMeans)</pre>
	64	*
	65	* <pre>
	66	* Options specific to clusterer weka.clusterers.SimpleKMeans:
	67	* </pre>
	68	*
	69	* <pre> -N <num>
	70	* number of clusters.
	71	* (default 2).</pre>
	72	*
	73	* <pre> -V
	74	* Display std. deviations for centroids.
	75	* </pre>
	76	*
	77	* <pre> -M
	78	* Replace missing values with mean/mode.
	79	* </pre>
	80	*
	81	* <pre> -S <num>
	82	* Random number seed.
	83	* (default 10)</pre>
	84	*
	85	<!-- options-end -->
	86	*
	87	* @author fracpete (fracpete at waikato dot ac dot nz)
	88	* @version $Revision: 5987 $
	89	*/
	90	public class ClassificationViaClustering
	91	extends AbstractClassifier {
	92
	93	/** for serialization */
	94	private static final long serialVersionUID = -5687069451420259135L;
	95
	96	/** the cluster algorithm used (template) */
	97	protected Clusterer m_Clusterer;
	98
	99	/** the actual cluster algorithm being used */
	100	protected Clusterer m_ActualClusterer;
	101
	102	/** the original training data header */
	103	protected Instances m_OriginalHeader;
	104
	105	/** the modified training data header */
	106	protected Instances m_ClusteringHeader;
	107
	108	/** the mapping between clusters and classes */
	109	protected double[] m_ClustersToClasses;
	110
	111	/** the default model */
	112	protected Classifier m_ZeroR;
	113
	114	/**
	115	* default constructor
	116	*/
	117	public ClassificationViaClustering() {
	118	super();
	119
	120	m_Clusterer = new SimpleKMeans();
	121	}
	122
	123	/**
	124	* Returns a string describing classifier
	125	*
	126	* @return a description suitable for displaying in the
	127	* explorer/experimenter gui
	128	*/
	129	public String globalInfo() {
	130	return
	131	"A simple meta-classifier that uses a clusterer for classification. "
	132	+ "For cluster algorithms that use a fixed number of clusterers, like "
	133	+ "SimpleKMeans, the user has to make sure that the number of clusters "
	134	+ "to generate are the same as the number of class labels in the dataset "
	135	+ "in order to obtain a useful model.\n"
	136	+ "\n"
	137	+ "Note: at prediction time, a missing value is returned if no cluster "
	138	+ "is found for the instance.\n"
	139	+ "\n"
	140	+ "The code is based on the 'clusters to classes' functionality of the "
	141	+ "weka.clusterers.ClusterEvaluation class by Mark Hall.";
	142	}
	143
	144	/**
	145	* Gets an enumeration describing the available options.
	146	*
	147	* @return an enumeration of all the available options.
	148	*/
	149	public Enumeration listOptions(){
	150	Vector result;
	151	Enumeration enm;
	152
	153	result = new Vector();
	154
	155	enm = super.listOptions();
	156	while (enm.hasMoreElements())
	157	result.addElement(enm.nextElement());
	158
	159	result.addElement(new Option(
	160	"\tFull name of clusterer.\n"
	161	+ "\t(default: " + defaultClustererString() +")",
	162	"W", 1, "-W"));
	163
	164	result.addElement(new Option(
	165	"",
	166	"", 0, "\nOptions specific to clusterer "
	167	+ m_Clusterer.getClass().getName() + ":"));
	168	enm = ((OptionHandler) m_Clusterer).listOptions();
	169	while (enm.hasMoreElements())
	170	result.addElement(enm.nextElement());
	171
	172	return result.elements();
	173	}
	174
	175	/**
	176	* returns the options of the current setup
	177	*
	178	* @return the current options
	179	*/
	180	public String[] getOptions(){
	181	int i;
	182	Vector<String> result;
	183	String[] options;
	184
	185	result = new Vector<String>();
	186
	187	result.add("-W");
	188	result.add("" + getClusterer().getClass().getName());
	189
	190	options = super.getOptions();
	191	for (i = 0; i < options.length; i++)
	192	result.add(options[i]);
	193
	194	if (getClusterer() instanceof OptionHandler) {
	195	result.add("--");
	196	options = ((OptionHandler) getClusterer()).getOptions();
	197	for (i = 0; i < options.length; i++)
	198	result.add(options[i]);
	199	}
	200
	201	return result.toArray(new String[result.size()]);
	202	}
	203
	204	/**
	205	* Parses the options for this object. <p/>
	206	*
	207	<!-- options-start -->
	208	* Valid options are: <p/>
	209	*
	210	* <pre> -D
	211	* If set, classifier is run in debug mode and
	212	* may output additional info to the console</pre>
	213	*
	214	* <pre> -W
	215	* Full name of clusterer.
	216	* (default: weka.clusterers.SimpleKMeans)</pre>
	217	*
	218	* <pre>
	219	* Options specific to clusterer weka.clusterers.SimpleKMeans:
	220	* </pre>
	221	*
	222	* <pre> -N <num>
	223	* number of clusters.
	224	* (default 2).</pre>
	225	*
	226	* <pre> -V
	227	* Display std. deviations for centroids.
	228	* </pre>
	229	*
	230	* <pre> -M
	231	* Replace missing values with mean/mode.
	232	* </pre>
	233	*
	234	* <pre> -S <num>
	235	* Random number seed.
	236	* (default 10)</pre>
	237	*
	238	<!-- options-end -->
	239	*
	240	* @param options the options to use
	241	* @throws Exception if setting of options fails
	242	*/
	243	public void setOptions(String[] options) throws Exception {
	244	String tmpStr;
	245
	246	super.setOptions(options);
	247
	248	tmpStr = Utils.getOption('W', options);
	249	if (tmpStr.length() > 0) {
	250	// This is just to set the classifier in case the option
	251	// parsing fails.
	252	setClusterer(AbstractClusterer.forName(tmpStr, null));
	253	setClusterer(AbstractClusterer.forName(tmpStr, Utils.partitionOptions(options)));
	254	}
	255	else {
	256	// This is just to set the classifier in case the option
	257	// parsing fails.
	258	setClusterer(AbstractClusterer.forName(defaultClustererString(), null));
	259	setClusterer(AbstractClusterer.forName(defaultClustererString(), Utils.partitionOptions(options)));
	260	}
	261	}
	262
	263	/**
	264	* String describing default clusterer.
	265	*
	266	* @return the classname
	267	*/
	268	protected String defaultClustererString() {
	269	return SimpleKMeans.class.getName();
	270	}
	271
	272	/**
	273	* Returns the tip text for this property
	274	*
	275	* @return tip text for this property suitable for
	276	* displaying in the explorer/experimenter gui
	277	*/
	278	public String clustererTipText() {
	279	return "The clusterer to be used.";
	280	}
	281
	282	/**
	283	* Set the base clusterer.
	284	*
	285	* @param value the clusterer to use.
	286	*/
	287	public void setClusterer(Clusterer value) {
	288	m_Clusterer = value;
	289	}
	290
	291	/**
	292	* Get the clusterer used as the base learner.
	293	*
	294	* @return the current clusterer
	295	*/
	296	public Clusterer getClusterer() {
	297	return m_Clusterer;
	298	}
	299
	300	/**
	301	* Classifies the given test instance.
	302	*
	303	* @param instance the instance to be classified
	304	* @return the predicted most likely class for the instance or
	305	* Utils.missingValue() if no prediction is made
	306	* @throws Exception if an error occurred during the prediction
	307	*/
	308	public double classifyInstance(Instance instance) throws Exception {
	309	double result;
	310	double[] values;
	311	Instance newInst;
	312	int i;
	313	int n;
	314
	315	if (m_ZeroR != null) {
	316	result = m_ZeroR.classifyInstance(instance);
	317	}
	318	else {
	319	if (m_ActualClusterer != null) {
	320	// build new instance
	321	values = new double[m_ClusteringHeader.numAttributes()];
	322	n = 0;
	323	for (i = 0; i < instance.numAttributes(); i++) {
	324	if (i == instance.classIndex())
	325	continue;
	326	values[n] = instance.value(i);
	327	n++;
	328	}
	329	newInst = new DenseInstance(instance.weight(), values);
	330	newInst.setDataset(m_ClusteringHeader);
	331
	332	// determine cluster/class
	333	result = m_ClustersToClasses[m_ActualClusterer.clusterInstance(newInst)];
	334	if (result == -1)
	335	result = Utils.missingValue();
	336	}
	337	else {
	338	result = Utils.missingValue();
	339	}
	340	}
	341
	342	return result;
	343	}
	344
	345	/**
	346	* Returns default capabilities of the classifier.
	347	*
	348	* @return the capabilities of this classifier
	349	*/
	350	public Capabilities getCapabilities() {
	351	Capabilities result;
	352
	353	result = m_Clusterer.getCapabilities();
	354
	355	// class
	356	result.disableAllClasses();
	357	result.disable(Capability.NO_CLASS);
	358	result.enable(Capability.NOMINAL_CLASS);
	359
	360	return result;
	361	}
	362
	363	/**
	364	* builds the classifier
	365	*
	366	* @param data the training instances
	367	* @throws Exception if something goes wrong
	368	*/
	369	public void buildClassifier(Instances data) throws Exception {
	370	Instances clusterData;
	371	ClusterEvaluation eval;
	372	int i;
	373	Instance instance;
	374	int[][] counts;
	375	int[] clusterTotals;
	376	double[] best;
	377	double[] current;
	378	double[] clusterAssignments;
	379
	380	// can classifier handle the data?
	381	getCapabilities().testWithFail(data);
	382
	383	// remove instances with missing class
	384	data = new Instances(data);
	385	data.deleteWithMissingClass();
	386
	387	// save original header (needed for clusters to classes output)
	388	m_OriginalHeader = new Instances(data, 0);
	389
	390	// remove class attribute for clusterer
	391	clusterData = new Instances(data);
	392	clusterData.setClassIndex(-1);
	393	clusterData.deleteAttributeAt(m_OriginalHeader.classIndex());
	394	m_ClusteringHeader = new Instances(clusterData, 0);
	395
	396	if (m_ClusteringHeader.numAttributes() == 0) {
	397	System.err.println("Data contains only class attribute, defaulting to ZeroR model.");
	398	m_ZeroR = new ZeroR();
	399	m_ZeroR.buildClassifier(data);
	400	}
	401	else {
	402	m_ZeroR = null;
	403
	404	// build clusterer
	405	m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
	406	m_ActualClusterer.buildClusterer(clusterData);
	407
	408	// evaluate clusterer on training set
	409	eval = new ClusterEvaluation();
	410	eval.setClusterer(m_ActualClusterer);
	411	eval.evaluateClusterer(clusterData);
	412	clusterAssignments = eval.getClusterAssignments();
	413
	414	// determine classes-to-clusters mapping
	415	counts = new int [eval.getNumClusters()][m_OriginalHeader.numClasses()];
	416	clusterTotals = new int[eval.getNumClusters()];
	417	best = new double[eval.getNumClusters()+1];
	418	current = new double[eval.getNumClusters()+1];
	419	for (i = 0; i < data.numInstances(); i++) {
	420	instance = data.instance(i);
	421	counts[(int) clusterAssignments[i]][(int) instance.classValue()]++;
	422	clusterTotals[(int) clusterAssignments[i]]++;
	423	i++;
	424	}
	425	best[eval.getNumClusters()] = Double.MAX_VALUE;
	426	ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0);
	427	m_ClustersToClasses = new double[best.length];
	428	System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length);
	429	}
	430	}
	431
	432	/**
	433	* Returns a string representation of the classifier.
	434	*
	435	* @return a string representation of the classifier.
	436	*/
	437	public String toString() {
	438	StringBuffer result;
	439	int i;
	440	int n;
	441	boolean found;
	442
	443	result = new StringBuffer();
	444
	445	// title
	446	result.append(this.getClass().getName().replaceAll(".*\\.", "") + "\n");
	447	result.append(this.getClass().getName().replaceAll(".*\\.", "").replaceAll(".", "=") + "\n");
	448
	449	// model
	450	if (m_ActualClusterer != null) {
	451	// output clusterer
	452	result.append(m_ActualClusterer + "\n");
	453
	454	// clusters to classes
	455	result.append("Clusters to classes mapping:\n");
	456	for (i = 0; i < m_ClustersToClasses.length - 1; i++) {
	457	result.append(" " + (i+1) + ". Cluster: ");
	458	if (m_ClustersToClasses[i] < 0)
	459	result.append("no class");
	460	else
	461	result.append(
	462	m_OriginalHeader.classAttribute().value((int) m_ClustersToClasses[i])
	463	+ " (" + ((int) m_ClustersToClasses[i] + 1) + ")");
	464	result.append("\n");
	465	}
	466	result.append("\n");
	467
	468	// classes to clusters
	469	result.append("Classes to clusters mapping:\n");
	470	for (i = 0; i < m_OriginalHeader.numClasses(); i++) {
	471	result.append(
	472	" " + (i+1) + ". Class ("
	473	+ m_OriginalHeader.classAttribute().value(i) + "): ");
	474
	475	found = false;
	476	for (n = 0; n < m_ClustersToClasses.length - 1; n++) {
	477	if (((int) m_ClustersToClasses[n]) == i) {
	478	found = true;
	479	result.append((n+1) + ". Cluster");
	480	break;
	481	}
	482	}
	483
	484	if (!found)
	485	result.append("no cluster");
	486
	487	result.append("\n");
	488	}
	489
	490	result.append("\n");
	491	}
	492	else {
	493	result.append("no model built yet\n");
	494	}
	495
	496	return result.toString();
	497	}
	498
	499	/**
	500	* Returns the revision string.
	501	*
	502	* @return the revision
	503	*/
	504	public String getRevision() {
	505	return RevisionUtils.extract("$Revision: 5987 $");
	506	}
	507
	508	/**
	509	* Runs the classifier with the given options
	510	*
	511	* @param args the commandline options
	512	*/
	513	public static void main(String[] args) {
	514	runClassifier(new ClassificationViaClustering(), args);
	515	}
	516	}
	517

Note: See TracBrowser for help on using the repository browser.

Download in other formats: