Context Navigation

ClusterMembership.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 17.0 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* ClusterMembership.java
19	* Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.filters.unsupervised.attribute;
24
25	import weka.clusterers.DensityBasedClusterer;
26	import weka.clusterers.AbstractDensityBasedClusterer;
27	import weka.core.Attribute;
28	import weka.core.Capabilities;
29	import weka.core.FastVector;
30	import weka.core.Instance;
31	import weka.core.DenseInstance;
32	import weka.core.Instances;
33	import weka.core.Option;
34	import weka.core.OptionHandler;
35	import weka.core.Range;
36	import weka.core.RevisionUtils;
37	import weka.core.Utils;
38	import weka.filters.Filter;
39	import weka.filters.UnsupervisedFilter;
40
41	import java.util.Enumeration;
42	import java.util.Vector;
43
44	/**
45	<!-- globalinfo-start -->
46	* A filter that uses a density-based clusterer to generate cluster membership values; filtered instances are composed of these values plus the class attribute (if set in the input data). If a (nominal) class attribute is set, the clusterer is run separately for each class. The class attribute (if set) and any user-specified attributes are ignored during the clustering operation
47	* <p/>
48	<!-- globalinfo-end -->
49	*
50	<!-- options-start -->
51	* Valid options are: <p/>
52	*
53	* <pre> -W <clusterer name>
54	* Full name of clusterer to use. eg:
55	* weka.clusterers.EM
56	* Additional options after the '--'.
57	* (default: weka.clusterers.EM)</pre>
58	*
59	* <pre> -I <att1,att2-att4,...>
60	* The range of attributes the clusterer should ignore.
61	* (the class attribute is automatically ignored)</pre>
62	*
63	<!-- options-end -->
64	*
65	* Options after the -- are passed on to the clusterer.
66	*
67	* @author Mark Hall (mhall@cs.waikato.ac.nz)
68	* @author Eibe Frank
69	* @version $Revision: 5987 $
70	*/
71	public class ClusterMembership
72	extends Filter
73	implements UnsupervisedFilter, OptionHandler {
74
75	/** for serialization */
76	static final long serialVersionUID = 6675702504667714026L;
77
78	/** The clusterer */
79	protected DensityBasedClusterer m_clusterer = new weka.clusterers.EM();
80
81	/** Array for storing the clusterers */
82	protected DensityBasedClusterer[] m_clusterers;
83
84	/** Range of attributes to ignore */
85	protected Range m_ignoreAttributesRange;
86
87	/** Filter for removing attributes */
88	protected Filter m_removeAttributes;
89
90	/** The prior probability for each class */
91	protected double[] m_priors;
92
93	/**
94	* Returns the Capabilities of this filter.
95	*
96	* @return the capabilities of this object
97	* @see Capabilities
98	*/
99	public Capabilities getCapabilities() {
100	Capabilities result = m_clusterer.getCapabilities();
101
102	result.setMinimumNumberInstances(0);
103
104	return result;
105	}
106
107	/**
108	* Returns the Capabilities of this filter, makes sure that the class is
109	* never set (for the clusterer).
110	*
111	* @param data the data to use for customization
112	* @return the capabilities of this object, based on the data
113	* @see #getCapabilities()
114	*/
115	public Capabilities getCapabilities(Instances data) {
116	Instances newData;
117
118	newData = new Instances(data, 0);
119	newData.setClassIndex(-1);
120
121	return super.getCapabilities(newData);
122	}
123
124	/**
125	* tests the data whether the filter can actually handle it
126	*
127	* @param instanceInfo the data to test
128	* @throws Exception if the test fails
129	*/
130	protected void testInputFormat(Instances instanceInfo) throws Exception {
131	getCapabilities(instanceInfo).testWithFail(removeIgnored(instanceInfo));
132	}
133
134	/**
135	* Sets the format of the input instances.
136	*
137	* @param instanceInfo an Instances object containing the input instance
138	* structure (any instances contained in the object are ignored - only the
139	* structure is required).
140	* @return true if the outputFormat may be collected immediately
141	* @throws Exception if the inputFormat can't be set successfully
142	*/
143	public boolean setInputFormat(Instances instanceInfo) throws Exception {
144
145	super.setInputFormat(instanceInfo);
146	m_removeAttributes = null;
147	m_priors = null;
148
149	return false;
150	}
151
152	/**
153	* filters all attributes that should be ignored
154	*
155	* @param data the data to filter
156	* @return the filtered data
157	* @throws Exception if filtering fails
158	*/
159	protected Instances removeIgnored(Instances data) throws Exception {
160	Instances result = data;
161
162	if (m_ignoreAttributesRange != null \|\| data.classIndex() >= 0) {
163	result = new Instances(data);
164	m_removeAttributes = new Remove();
165	String rangeString = "";
166	if (m_ignoreAttributesRange != null) {
167	rangeString += m_ignoreAttributesRange.getRanges();
168	}
169	if (data.classIndex() >= 0) {
170	if (rangeString.length() > 0) {
171	rangeString += "," + (data.classIndex() + 1);
172	} else {
173	rangeString = "" + (data.classIndex() + 1);
174	}
175	}
176	((Remove) m_removeAttributes).setAttributeIndices(rangeString);
177	((Remove) m_removeAttributes).setInvertSelection(false);
178	m_removeAttributes.setInputFormat(data);
179	result = Filter.useFilter(data, m_removeAttributes);
180	}
181
182	return result;
183	}
184
185	/**
186	* Signify that this batch of input to the filter is finished.
187	*
188	* @return true if there are instances pending output
189	* @throws IllegalStateException if no input structure has been defined
190	*/
191	public boolean batchFinished() throws Exception {
192
193	if (getInputFormat() == null) {
194	throw new IllegalStateException("No input instance format defined");
195	}
196
197	if (outputFormatPeek() == null) {
198	Instances toFilter = getInputFormat();
199	Instances[] toFilterIgnoringAttributes;
200
201	// Make subsets if class is nominal
202	if ((toFilter.classIndex() >= 0) && toFilter.classAttribute().isNominal()) {
203	toFilterIgnoringAttributes = new Instances[toFilter.numClasses()];
204	for (int i = 0; i < toFilter.numClasses(); i++) {
205	toFilterIgnoringAttributes[i] = new Instances(toFilter, toFilter.numInstances());
206	}
207	for (int i = 0; i < toFilter.numInstances(); i++) {
208	toFilterIgnoringAttributes[(int)toFilter.instance(i).classValue()].add(toFilter.instance(i));
209	}
210	m_priors = new double[toFilter.numClasses()];
211	for (int i = 0; i < toFilter.numClasses(); i++) {
212	toFilterIgnoringAttributes[i].compactify();
213	m_priors[i] = toFilterIgnoringAttributes[i].sumOfWeights();
214	}
215	Utils.normalize(m_priors);
216	} else {
217	toFilterIgnoringAttributes = new Instances[1];
218	toFilterIgnoringAttributes[0] = toFilter;
219	m_priors = new double[1];
220	m_priors[0] = 1;
221	}
222
223	// filter out attributes if necessary
224	for (int i = 0; i < toFilterIgnoringAttributes.length; i++)
225	toFilterIgnoringAttributes[i] = removeIgnored(toFilterIgnoringAttributes[i]);
226
227	// build the clusterers
228	if ((toFilter.classIndex() <= 0) \|\| !toFilter.classAttribute().isNominal()) {
229	m_clusterers = AbstractDensityBasedClusterer.makeCopies(m_clusterer, 1);
230	m_clusterers[0].buildClusterer(toFilterIgnoringAttributes[0]);
231	} else {
232	m_clusterers = AbstractDensityBasedClusterer.makeCopies(m_clusterer, toFilter.numClasses());
233	for (int i = 0; i < m_clusterers.length; i++) {
234	if (toFilterIgnoringAttributes[i].numInstances() == 0) {
235	m_clusterers[i] = null;
236	} else {
237	m_clusterers[i].buildClusterer(toFilterIgnoringAttributes[i]);
238	}
239	}
240	}
241
242	// create output dataset
243	FastVector attInfo = new FastVector();
244	for (int j = 0; j < m_clusterers.length; j++) {
245	if (m_clusterers[j] != null) {
246	for (int i = 0; i < m_clusterers[j].numberOfClusters(); i++) {
247	attInfo.addElement(new Attribute("pCluster_" + j + "_" + i));
248	}
249	}
250	}
251	if (toFilter.classIndex() >= 0) {
252	attInfo.addElement(toFilter.classAttribute().copy());
253	}
254	attInfo.trimToSize();
255	Instances filtered = new Instances(toFilter.relationName()+"_clusterMembership",
256	attInfo, 0);
257	if (toFilter.classIndex() >= 0) {
258	filtered.setClassIndex(filtered.numAttributes() - 1);
259	}
260	setOutputFormat(filtered);
261
262	// build new dataset
263	for (int i = 0; i < toFilter.numInstances(); i++) {
264	convertInstance(toFilter.instance(i));
265	}
266	}
267	flushInput();
268
269	m_NewBatch = true;
270	return (numPendingOutput() != 0);
271	}
272
273	/**
274	* Input an instance for filtering. Ordinarily the instance is processed
275	* and made available for output immediately. Some filters require all
276	* instances be read before producing output.
277	*
278	* @param instance the input instance
279	* @return true if the filtered instance may now be
280	* collected with output().
281	* @throws IllegalStateException if no input format has been defined.
282	*/
283	public boolean input(Instance instance) throws Exception {
284
285	if (getInputFormat() == null) {
286	throw new IllegalStateException("No input instance format defined");
287	}
288	if (m_NewBatch) {
289	resetQueue();
290	m_NewBatch = false;
291	}
292
293	if (outputFormatPeek() != null) {
294	convertInstance(instance);
295	return true;
296	}
297
298	bufferInput(instance);
299	return false;
300	}
301
302	/**
303	* Converts logs back to density values.
304	*
305	* @param j the index of the clusterer
306	* @param in the instance to convert the logs back
307	* @return the densities
308	* @throws Exception if something goes wrong
309	*/
310	protected double[] logs2densities(int j, Instance in) throws Exception {
311
312	double[] logs = m_clusterers[j].logJointDensitiesForInstance(in);
313
314	for (int i = 0; i < logs.length; i++) {
315	logs[i] += Math.log(m_priors[j]);
316	}
317	return logs;
318	}
319
320	/**
321	* Convert a single instance over. The converted instance is added to
322	* the end of the output queue.
323	*
324	* @param instance the instance to convert
325	* @throws Exception if something goes wrong
326	*/
327	protected void convertInstance(Instance instance) throws Exception {
328
329	// set up values
330	double [] instanceVals = new double[outputFormatPeek().numAttributes()];
331	double [] tempvals;
332	if (instance.classIndex() >= 0) {
333	tempvals = new double[outputFormatPeek().numAttributes() - 1];
334	} else {
335	tempvals = new double[outputFormatPeek().numAttributes()];
336	}
337	int pos = 0;
338	for (int j = 0; j < m_clusterers.length; j++) {
339	if (m_clusterers[j] != null) {
340	double [] probs;
341	if (m_removeAttributes != null) {
342	m_removeAttributes.input(instance);
343	probs = logs2densities(j, m_removeAttributes.output());
344	} else {
345	probs = logs2densities(j, instance);
346	}
347	System.arraycopy(probs, 0, tempvals, pos, probs.length);
348	pos += probs.length;
349	}
350	}
351	tempvals = Utils.logs2probs(tempvals);
352	System.arraycopy(tempvals, 0, instanceVals, 0, tempvals.length);
353	if (instance.classIndex() >= 0) {
354	instanceVals[instanceVals.length - 1] = instance.classValue();
355	}
356
357	push(new DenseInstance(instance.weight(), instanceVals));
358	}
359
360	/**
361	* Returns an enumeration describing the available options.
362	*
363	* @return an enumeration of all the available options.
364	*/
365	public Enumeration listOptions() {
366
367	Vector newVector = new Vector(2);
368
369	newVector.
370	addElement(new Option("\tFull name of clusterer to use. eg:\n"
371	+ "\t\tweka.clusterers.EM\n"
372	+ "\tAdditional options after the '--'.\n"
373	+ "\t(default: weka.clusterers.EM)",
374	"W", 1, "-W <clusterer name>"));
375
376	newVector.
377	addElement(new Option("\tThe range of attributes the clusterer should ignore."
378	+"\n\t(the class attribute is automatically ignored)",
379	"I", 1,"-I <att1,att2-att4,...>"));
380
381	return newVector.elements();
382	}
383
384	/**
385	* Parses a given list of options. <p/>
386	*
387	<!-- options-start -->
388	* Valid options are: <p/>
389	*
390	* <pre> -W <clusterer name>
391	* Full name of clusterer to use. eg:
392	* weka.clusterers.EM
393	* Additional options after the '--'.
394	* (default: weka.clusterers.EM)</pre>
395	*
396	* <pre> -I <att1,att2-att4,...>
397	* The range of attributes the clusterer should ignore.
398	* (the class attribute is automatically ignored)</pre>
399	*
400	<!-- options-end -->
401	*
402	* Options after the -- are passed on to the clusterer.
403	*
404	* @param options the list of options as an array of strings
405	* @throws Exception if an option is not supported
406	*/
407	public void setOptions(String[] options) throws Exception {
408
409	String clustererString = Utils.getOption('W', options);
410	if (clustererString.length() == 0)
411	clustererString = weka.clusterers.EM.class.getName();
412	setDensityBasedClusterer((DensityBasedClusterer)Utils.
413	forName(DensityBasedClusterer.class, clustererString,
414	Utils.partitionOptions(options)));
415
416	setIgnoredAttributeIndices(Utils.getOption('I', options));
417	Utils.checkForRemainingOptions(options);
418	}
419
420	/**
421	* Gets the current settings of the filter.
422	*
423	* @return an array of strings suitable for passing to setOptions
424	*/
425	public String [] getOptions() {
426
427	String [] clustererOptions = new String [0];
428	if ((m_clusterer != null) &&
429	(m_clusterer instanceof OptionHandler)) {
430	clustererOptions = ((OptionHandler)m_clusterer).getOptions();
431	}
432	String [] options = new String [clustererOptions.length + 5];
433	int current = 0;
434
435	if (!getIgnoredAttributeIndices().equals("")) {
436	options[current++] = "-I";
437	options[current++] = getIgnoredAttributeIndices();
438	}
439
440	if (m_clusterer != null) {
441	options[current++] = "-W";
442	options[current++] = getDensityBasedClusterer().getClass().getName();
443	}
444
445	options[current++] = "--";
446	System.arraycopy(clustererOptions, 0, options, current,
447	clustererOptions.length);
448	current += clustererOptions.length;
449
450	while (current < options.length) {
451	options[current++] = "";
452	}
453	return options;
454	}
455
456	/**
457	* Returns a string describing this filter
458	*
459	* @return a description of the filter suitable for
460	* displaying in the explorer/experimenter gui
461	*/
462	public String globalInfo() {
463
464	return "A filter that uses a density-based clusterer to generate cluster "
465	+ "membership values; filtered instances are composed of these values "
466	+ "plus the class attribute (if set in the input data). If a (nominal) "
467	+ "class attribute is set, the clusterer is run separately for each "
468	+ "class. The class attribute (if set) and any user-specified "
469	+ "attributes are ignored during the clustering operation";
470	}
471
472	/**
473	* Returns a description of this option suitable for display
474	* as a tip text in the gui.
475	*
476	* @return description of this option
477	*/
478	public String densityBasedClustererTipText() {
479	return "The clusterer that will generate membership values for the instances.";
480	}
481
482	/**
483	* Set the clusterer for use in filtering
484	*
485	* @param newClusterer the clusterer to use
486	*/
487	public void setDensityBasedClusterer(DensityBasedClusterer newClusterer) {
488	m_clusterer = newClusterer;
489	}
490
491	/**
492	* Get the clusterer used by this filter
493	*
494	* @return the clusterer used
495	*/
496	public DensityBasedClusterer getDensityBasedClusterer() {
497	return m_clusterer;
498	}
499
500	/**
501	* Returns the tip text for this property
502	*
503	* @return tip text for this property suitable for
504	* displaying in the explorer/experimenter gui
505	*/
506	public String ignoredAttributeIndicesTipText() {
507
508	return "The range of attributes to be ignored by the clusterer. eg: first-3,5,9-last";
509	}
510
511	/**
512	* Gets ranges of attributes to be ignored.
513	*
514	* @return a string containing a comma-separated list of ranges
515	*/
516	public String getIgnoredAttributeIndices() {
517
518	if (m_ignoreAttributesRange == null) {
519	return "";
520	} else {
521	return m_ignoreAttributesRange.getRanges();
522	}
523	}
524
525	/**
526	* Sets the ranges of attributes to be ignored. If provided string
527	* is null, no attributes will be ignored.
528	*
529	* @param rangeList a string representing the list of attributes.
530	* eg: first-3,5,6-last
531	* @throws IllegalArgumentException if an invalid range list is supplied
532	*/
533	public void setIgnoredAttributeIndices(String rangeList) {
534
535	if ((rangeList == null) \|\| (rangeList.length() == 0)) {
536	m_ignoreAttributesRange = null;
537	} else {
538	m_ignoreAttributesRange = new Range();
539	m_ignoreAttributesRange.setRanges(rangeList);
540	}
541	}
542
543	/**
544	* Returns the revision string.
545	*
546	* @return the revision
547	*/
548	public String getRevision() {
549	return RevisionUtils.extract("$Revision: 5987 $");
550	}
551
552	/**
553	* Main method for testing this class.
554	*
555	* @param argv should contain arguments to the filter: use -h for help
556	*/
557	public static void main(String [] argv) {
558	runFilter(new ClusterMembership(), argv);
559	}
560	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/ClusterMembership.java

Download in other formats: