Context Navigation

AddCluster.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 18.0 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* AddCluster.java
19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.filters.unsupervised.attribute;
24
25	import weka.clusterers.AbstractClusterer;
26	import weka.clusterers.Clusterer;
27	import weka.core.Attribute;
28	import weka.core.Capabilities;
29	import weka.core.FastVector;
30	import weka.core.Instance;
31	import weka.core.DenseInstance;
32	import weka.core.Instances;
33	import weka.core.Option;
34	import weka.core.OptionHandler;
35	import weka.core.Range;
36	import weka.core.RevisionUtils;
37	import weka.core.SparseInstance;
38	import weka.core.Utils;
39	import weka.core.WekaException;
40	import weka.filters.Filter;
41	import weka.filters.UnsupervisedFilter;
42
43	import java.io.File;
44	import java.io.FileInputStream;
45	import java.io.FileNotFoundException;
46	import java.io.ObjectInputStream;
47	import java.util.Enumeration;
48	import java.util.Vector;
49
50	/**
51	<!-- globalinfo-start -->
52	* A filter that adds a new nominal attribute representing the cluster assigned to each instance by the specified clustering algorithm.<br/>
53	* Either the clustering algorithm gets built with the first batch of data or one specifies are serialized clusterer model file to use instead.
54	* <p/>
55	<!-- globalinfo-end -->
56	*
57	<!-- options-start -->
58	* Valid options are: <p/>
59	*
60	* <pre> -W <clusterer specification>
61	* Full class name of clusterer to use, followed
62	* by scheme options. eg:
63	* "weka.clusterers.SimpleKMeans -N 3"
64	* (default: weka.clusterers.SimpleKMeans)</pre>
65	*
66	* <pre> -serialized <file>
67	* Instead of building a clusterer on the data, one can also provide
68	* a serialized model and use that for adding the clusters.</pre>
69	*
70	* <pre> -I <att1,att2-att4,...>
71	* The range of attributes the clusterer should ignore.
72	* </pre>
73	*
74	<!-- options-end -->
75	*
76	* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
77	* @author FracPete (fracpete at waikato dot ac dot nz)
78	* @version $Revision: 5987 $
79	*/
80	public class AddCluster
81	extends Filter
82	implements UnsupervisedFilter, OptionHandler {
83
84	/** for serialization. */
85	static final long serialVersionUID = 7414280611943807337L;
86
87	/** The clusterer used to do the cleansing. */
88	protected Clusterer m_Clusterer = new weka.clusterers.SimpleKMeans();
89
90	/** The file from which to load a serialized clusterer. */
91	protected File m_SerializedClustererFile = new File(System.getProperty("user.dir"));
92
93	/** The actual clusterer used to do the clustering. */
94	protected Clusterer m_ActualClusterer = null;
95
96	/** Range of attributes to ignore. */
97	protected Range m_IgnoreAttributesRange = null;
98
99	/** Filter for removing attributes. */
100	protected Filter m_removeAttributes = new Remove();
101
102	/**
103	* Returns the Capabilities of this filter, makes sure that the class is
104	* never set (for the clusterer).
105	*
106	* @param data the data to use for customization
107	* @return the capabilities of this object, based on the data
108	* @see #getCapabilities()
109	*/
110	public Capabilities getCapabilities(Instances data) {
111	Instances newData;
112
113	newData = new Instances(data, 0);
114	newData.setClassIndex(-1);
115
116	return super.getCapabilities(newData);
117	}
118
119	/**
120	* Returns the Capabilities of this filter.
121	*
122	* @return the capabilities of this object
123	* @see Capabilities
124	*/
125	public Capabilities getCapabilities() {
126	Capabilities result = m_Clusterer.getCapabilities();
127
128	result.setMinimumNumberInstances(0);
129
130	return result;
131	}
132
133	/**
134	* tests the data whether the filter can actually handle it.
135	*
136	* @param instanceInfo the data to test
137	* @throws Exception if the test fails
138	*/
139	protected void testInputFormat(Instances instanceInfo) throws Exception {
140	getCapabilities(instanceInfo).testWithFail(removeIgnored(instanceInfo));
141	}
142
143	/**
144	* Sets the format of the input instances.
145	*
146	* @param instanceInfo an Instances object containing the input instance
147	* structure (any instances contained in the object are ignored - only the
148	* structure is required).
149	* @return true if the outputFormat may be collected immediately
150	* @throws Exception if the inputFormat can't be set successfully
151	*/
152	public boolean setInputFormat(Instances instanceInfo) throws Exception {
153	super.setInputFormat(instanceInfo);
154
155	m_removeAttributes = null;
156
157	return false;
158	}
159
160	/**
161	* filters all attributes that should be ignored.
162	*
163	* @param data the data to filter
164	* @return the filtered data
165	* @throws Exception if filtering fails
166	*/
167	protected Instances removeIgnored(Instances data) throws Exception {
168	Instances result = data;
169
170	if (m_IgnoreAttributesRange != null \|\| data.classIndex() >= 0) {
171	m_removeAttributes = new Remove();
172	String rangeString = "";
173	if (m_IgnoreAttributesRange != null) {
174	rangeString += m_IgnoreAttributesRange.getRanges();
175	}
176	if (data.classIndex() >= 0) {
177	if (rangeString.length() > 0) {
178	rangeString += "," + (data.classIndex() + 1);
179	} else {
180	rangeString = "" + (data.classIndex() + 1);
181	}
182	}
183	((Remove) m_removeAttributes).setAttributeIndices(rangeString);
184	((Remove) m_removeAttributes).setInvertSelection(false);
185	m_removeAttributes.setInputFormat(data);
186	result = Filter.useFilter(data, m_removeAttributes);
187	}
188
189	return result;
190	}
191
192	/**
193	* Signify that this batch of input to the filter is finished.
194	*
195	* @return true if there are instances pending output
196	* @throws IllegalStateException if no input structure has been defined
197	*/
198	public boolean batchFinished() throws Exception {
199	if (getInputFormat() == null)
200	throw new IllegalStateException("No input instance format defined");
201
202	Instances toFilter = getInputFormat();
203
204	if (!isFirstBatchDone()) {
205	// filter out attributes if necessary
206	Instances toFilterIgnoringAttributes = removeIgnored(toFilter);
207
208	// serialized model or build clusterer from scratch?
209	File file = getSerializedClustererFile();
210	if (!file.isDirectory()) {
211	ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file));
212	m_ActualClusterer = (Clusterer) ois.readObject();
213	Instances header = null;
214	// let's see whether there's an Instances header stored as well
215	try {
216	header = (Instances) ois.readObject();
217	}
218	catch (Exception e) {
219	// ignored
220	}
221	ois.close();
222	// same dataset format?
223	if ((header != null) && (!header.equalHeaders(toFilterIgnoringAttributes)))
224	throw new WekaException(
225	"Training header of clusterer and filter dataset don't match:\n"
226	+ header.equalHeadersMsg(toFilterIgnoringAttributes));
227	}
228	else {
229	m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
230	m_ActualClusterer.buildClusterer(toFilterIgnoringAttributes);
231	}
232
233	// create output dataset with new attribute
234	Instances filtered = new Instances(toFilter, 0);
235	FastVector nominal_values = new FastVector(m_ActualClusterer.numberOfClusters());
236	for (int i = 0; i < m_ActualClusterer.numberOfClusters(); i++) {
237	nominal_values.addElement("cluster" + (i+1));
238	}
239	filtered.insertAttributeAt(new Attribute("cluster", nominal_values),
240	filtered.numAttributes());
241
242	setOutputFormat(filtered);
243	}
244
245	// build new dataset
246	for (int i=0; i<toFilter.numInstances(); i++) {
247	convertInstance(toFilter.instance(i));
248	}
249
250	flushInput();
251	m_NewBatch = true;
252	m_FirstBatchDone = true;
253
254	return (numPendingOutput() != 0);
255	}
256
257	/**
258	* Input an instance for filtering. Ordinarily the instance is processed
259	* and made available for output immediately. Some filters require all
260	* instances be read before producing output.
261	*
262	* @param instance the input instance
263	* @return true if the filtered instance may now be
264	* collected with output().
265	* @throws IllegalStateException if no input format has been defined.
266	*/
267	public boolean input(Instance instance) throws Exception {
268	if (getInputFormat() == null)
269	throw new IllegalStateException("No input instance format defined");
270
271	if (m_NewBatch) {
272	resetQueue();
273	m_NewBatch = false;
274	}
275
276	if (outputFormatPeek() != null) {
277	convertInstance(instance);
278	return true;
279	}
280
281	bufferInput(instance);
282	return false;
283	}
284
285	/**
286	* Convert a single instance over. The converted instance is added to
287	* the end of the output queue.
288	*
289	* @param instance the instance to convert
290	* @throws Exception if something goes wrong
291	*/
292	protected void convertInstance(Instance instance) throws Exception {
293	Instance original, processed;
294	original = instance;
295
296	// copy values
297	double[] instanceVals = new double[instance.numAttributes()+1];
298	for(int j = 0; j < instance.numAttributes(); j++) {
299	instanceVals[j] = original.value(j);
300	}
301	Instance filteredI = null;
302	if (m_removeAttributes != null) {
303	m_removeAttributes.input(instance);
304	filteredI = m_removeAttributes.output();
305	} else {
306	filteredI = instance;
307	}
308
309	// add cluster to end
310	try {
311	instanceVals[instance.numAttributes()] = m_ActualClusterer.clusterInstance(filteredI);
312	}
313	catch (Exception e) {
314	// clusterer couldn't cluster instance -> missing
315	instanceVals[instance.numAttributes()] = Utils.missingValue();
316	}
317
318	// create new instance
319	if (original instanceof SparseInstance) {
320	processed = new SparseInstance(original.weight(), instanceVals);
321	} else {
322	processed = new DenseInstance(original.weight(), instanceVals);
323	}
324
325	processed.setDataset(instance.dataset());
326	copyValues(processed, false, instance.dataset(), getOutputFormat());
327	processed.setDataset(getOutputFormat());
328
329	push(processed);
330	}
331
332	/**
333	* Returns an enumeration describing the available options.
334	*
335	* @return an enumeration of all the available options.
336	*/
337	public Enumeration listOptions() {
338	Vector result = new Vector();
339
340	result.addElement(new Option(
341	"\tFull class name of clusterer to use, followed\n"
342	+ "\tby scheme options. eg:\n"
343	+ "\t\t\"weka.clusterers.SimpleKMeans -N 3\"\n"
344	+ "\t(default: weka.clusterers.SimpleKMeans)",
345	"W", 1, "-W <clusterer specification>"));
346
347	result.addElement(new Option(
348	"\tInstead of building a clusterer on the data, one can also provide\n"
349	+ "\ta serialized model and use that for adding the clusters.",
350	"serialized", 1, "-serialized <file>"));
351
352	result.addElement(new Option(
353	"\tThe range of attributes the clusterer should ignore.\n",
354	"I", 1,"-I <att1,att2-att4,...>"));
355
356	return result.elements();
357	}
358
359
360	/**
361	* Parses a given list of options. <p/>
362	*
363	<!-- options-start -->
364	* Valid options are: <p/>
365	*
366	* <pre> -W <clusterer specification>
367	* Full class name of clusterer to use, followed
368	* by scheme options. eg:
369	* "weka.clusterers.SimpleKMeans -N 3"
370	* (default: weka.clusterers.SimpleKMeans)</pre>
371	*
372	* <pre> -serialized <file>
373	* Instead of building a clusterer on the data, one can also provide
374	* a serialized model and use that for adding the clusters.</pre>
375	*
376	* <pre> -I <att1,att2-att4,...>
377	* The range of attributes the clusterer should ignore.
378	* </pre>
379	*
380	<!-- options-end -->
381	*
382	* @param options the list of options as an array of strings
383	* @throws Exception if an option is not supported
384	*/
385	public void setOptions(String[] options) throws Exception {
386	String tmpStr;
387	String[] tmpOptions;
388	File file;
389	boolean serializedModel;
390
391	serializedModel = false;
392	tmpStr = Utils.getOption("serialized", options);
393	if (tmpStr.length() != 0) {
394	file = new File(tmpStr);
395	if (!file.exists())
396	throw new FileNotFoundException(
397	"File '" + file.getAbsolutePath() + "' not found!");
398	if (file.isDirectory())
399	throw new FileNotFoundException(
400	"'" + file.getAbsolutePath() + "' points to a directory not a file!");
401	setSerializedClustererFile(file);
402	serializedModel = true;
403	}
404	else {
405	setSerializedClustererFile(null);
406	}
407
408	if (!serializedModel) {
409	tmpStr = Utils.getOption('W', options);
410	if (tmpStr.length() == 0)
411	tmpStr = weka.clusterers.SimpleKMeans.class.getName();
412	tmpOptions = Utils.splitOptions(tmpStr);
413	if (tmpOptions.length == 0) {
414	throw new Exception("Invalid clusterer specification string");
415	}
416	tmpStr = tmpOptions[0];
417	tmpOptions[0] = "";
418	setClusterer(AbstractClusterer.forName(tmpStr, tmpOptions));
419	}
420
421	setIgnoredAttributeIndices(Utils.getOption('I', options));
422
423	Utils.checkForRemainingOptions(options);
424	}
425
426	/**
427	* Gets the current settings of the filter.
428	*
429	* @return an array of strings suitable for passing to setOptions
430	*/
431	public String[] getOptions() {
432	Vector<String> result;
433	File file;
434
435	result = new Vector<String>();
436
437	file = getSerializedClustererFile();
438	if ((file != null) && (!file.isDirectory())) {
439	result.add("-serialized");
440	result.add(file.getAbsolutePath());
441	}
442	else {
443	result.add("-W");
444	result.add(getClustererSpec());
445	}
446
447	if (!getIgnoredAttributeIndices().equals("")) {
448	result.add("-I");
449	result.add(getIgnoredAttributeIndices());
450	}
451
452	return result.toArray(new String[result.size()]);
453	}
454
455	/**
456	* Returns a string describing this filter.
457	*
458	* @return a description of the filter suitable for
459	* displaying in the explorer/experimenter gui
460	*/
461	public String globalInfo() {
462	return
463	"A filter that adds a new nominal attribute representing the cluster "
464	+ "assigned to each instance by the specified clustering algorithm.\n"
465	+ "Either the clustering algorithm gets built with the first batch of "
466	+ "data or one specifies are serialized clusterer model file to use "
467	+ "instead.";
468	}
469
470	/**
471	* Returns the tip text for this property.
472	*
473	* @return tip text for this property suitable for
474	* displaying in the explorer/experimenter gui
475	*/
476	public String clustererTipText() {
477	return "The clusterer to assign clusters with.";
478	}
479
480	/**
481	* Sets the clusterer to assign clusters with.
482	*
483	* @param clusterer The clusterer to be used (with its options set).
484	*/
485	public void setClusterer(Clusterer clusterer) {
486	m_Clusterer = clusterer;
487	}
488
489	/**
490	* Gets the clusterer used by the filter.
491	*
492	* @return The clusterer being used.
493	*/
494	public Clusterer getClusterer() {
495	return m_Clusterer;
496	}
497
498	/**
499	* Gets the clusterer specification string, which contains the class name of
500	* the clusterer and any options to the clusterer.
501	*
502	* @return the clusterer string.
503	*/
504	protected String getClustererSpec() {
505	Clusterer c = getClusterer();
506	if (c instanceof OptionHandler) {
507	return c.getClass().getName() + " "
508	+ Utils.joinOptions(((OptionHandler)c).getOptions());
509	}
510	return c.getClass().getName();
511	}
512
513	/**
514	* Returns the tip text for this property.
515	*
516	* @return tip text for this property suitable for
517	* displaying in the explorer/experimenter gui
518	*/
519	public String ignoredAttributeIndicesTipText() {
520	return "The range of attributes to be ignored by the clusterer. eg: first-3,5,9-last";
521	}
522
523	/**
524	* Gets ranges of attributes to be ignored.
525	*
526	* @return a string containing a comma-separated list of ranges
527	*/
528	public String getIgnoredAttributeIndices() {
529	if (m_IgnoreAttributesRange == null)
530	return "";
531	else
532	return m_IgnoreAttributesRange.getRanges();
533	}
534
535	/**
536	* Sets the ranges of attributes to be ignored. If provided string
537	* is null, no attributes will be ignored.
538	*
539	* @param rangeList a string representing the list of attributes.
540	* eg: first-3,5,6-last
541	* @throws IllegalArgumentException if an invalid range list is supplied
542	*/
543	public void setIgnoredAttributeIndices(String rangeList) {
544	if ((rangeList == null) \|\| (rangeList.length() == 0)) {
545	m_IgnoreAttributesRange = null;
546	} else {
547	m_IgnoreAttributesRange = new Range();
548	m_IgnoreAttributesRange.setRanges(rangeList);
549	}
550	}
551
552	/**
553	* Gets the file pointing to a serialized, built clusterer. If it is
554	* null or pointing to a directory it will not be used.
555	*
556	* @return the file the serialized, built clusterer is located in
557	*/
558	public File getSerializedClustererFile() {
559	return m_SerializedClustererFile;
560	}
561
562	/**
563	* Sets the file pointing to a serialized, built clusterer. If the
564	* argument is null, doesn't exist or pointing to a directory, then the
565	* value is ignored.
566	*
567	* @param value the file pointing to the serialized, built clusterer
568	*/
569	public void setSerializedClustererFile(File value) {
570	if ((value == null) \|\| (!value.exists()))
571	value = new File(System.getProperty("user.dir"));
572
573	m_SerializedClustererFile = value;
574	}
575
576	/**
577	* Returns the tip text for this property.
578	*
579	* @return tip text for this property suitable for
580	* displaying in the explorer/experimenter gui
581	*/
582	public String serializedClustererFileTipText() {
583	return "A file containing the serialized model of a built clusterer.";
584	}
585
586	/**
587	* Returns the revision string.
588	*
589	* @return the revision
590	*/
591	public String getRevision() {
592	return RevisionUtils.extract("$Revision: 5987 $");
593	}
594
595	/**
596	* Main method for testing this class.
597	*
598	* @param argv should contain arguments to the filter: use -h for help
599	*/
600	public static void main(String[] argv) {
601	runFilter(new AddCluster(), argv);
602	}
603	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/AddCluster.java

Download in other formats: