source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/AddCluster.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 18.0 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    AddCluster.java
19 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.filters.unsupervised.attribute;
24
25import weka.clusterers.AbstractClusterer;
26import weka.clusterers.Clusterer;
27import weka.core.Attribute;
28import weka.core.Capabilities;
29import weka.core.FastVector;
30import weka.core.Instance; 
31import weka.core.DenseInstance;
32import weka.core.Instances;
33import weka.core.Option;
34import weka.core.OptionHandler;
35import weka.core.Range;
36import weka.core.RevisionUtils;
37import weka.core.SparseInstance;
38import weka.core.Utils;
39import weka.core.WekaException;
40import weka.filters.Filter;
41import weka.filters.UnsupervisedFilter;
42
43import java.io.File;
44import java.io.FileInputStream;
45import java.io.FileNotFoundException;
46import java.io.ObjectInputStream;
47import java.util.Enumeration;
48import java.util.Vector;
49
50/**
51 <!-- globalinfo-start -->
52 * A filter that adds a new nominal attribute representing the cluster assigned to each instance by the specified clustering algorithm.<br/>
53 * Either the clustering algorithm gets built with the first batch of data or one specifies are serialized clusterer model file to use instead.
54 * <p/>
55 <!-- globalinfo-end -->
56 *
57 <!-- options-start -->
58 * Valid options are: <p/>
59 *
60 * <pre> -W &lt;clusterer specification&gt;
61 *  Full class name of clusterer to use, followed
62 *  by scheme options. eg:
63 *   "weka.clusterers.SimpleKMeans -N 3"
64 *  (default: weka.clusterers.SimpleKMeans)</pre>
65 *
66 * <pre> -serialized &lt;file&gt;
67 *  Instead of building a clusterer on the data, one can also provide
68 *  a serialized model and use that for adding the clusters.</pre>
69 *
70 * <pre> -I &lt;att1,att2-att4,...&gt;
71 *  The range of attributes the clusterer should ignore.
72 * </pre>
73 *
74 <!-- options-end -->
75 *
76 * @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
77 * @author FracPete (fracpete at waikato dot ac dot nz)
78 * @version $Revision: 5987 $
79 */
80public class AddCluster 
81  extends Filter
82  implements UnsupervisedFilter, OptionHandler {
83 
84  /** for serialization. */
85  static final long serialVersionUID = 7414280611943807337L;
86
87  /** The clusterer used to do the cleansing. */
88  protected Clusterer m_Clusterer = new weka.clusterers.SimpleKMeans();
89
90  /** The file from which to load a serialized clusterer. */
91  protected File m_SerializedClustererFile = new File(System.getProperty("user.dir"));
92 
93  /** The actual clusterer used to do the clustering. */
94  protected Clusterer m_ActualClusterer = null;
95
96  /** Range of attributes to ignore. */
97  protected Range m_IgnoreAttributesRange = null;
98
99  /** Filter for removing attributes. */
100  protected Filter m_removeAttributes = new Remove();
101
102  /**
103   * Returns the Capabilities of this filter, makes sure that the class is
104   * never set (for the clusterer).
105   *
106   * @param data        the data to use for customization
107   * @return            the capabilities of this object, based on the data
108   * @see               #getCapabilities()
109   */
110  public Capabilities getCapabilities(Instances data) {
111    Instances   newData;
112   
113    newData = new Instances(data, 0);
114    newData.setClassIndex(-1);
115   
116    return super.getCapabilities(newData);
117  }
118
119  /**
120   * Returns the Capabilities of this filter.
121   *
122   * @return            the capabilities of this object
123   * @see               Capabilities
124   */
125  public Capabilities getCapabilities() {
126    Capabilities result = m_Clusterer.getCapabilities();
127   
128    result.setMinimumNumberInstances(0);
129   
130    return result;
131  }
132 
133  /**
134   * tests the data whether the filter can actually handle it.
135   *
136   * @param instanceInfo        the data to test
137   * @throws Exception          if the test fails
138   */
139  protected void testInputFormat(Instances instanceInfo) throws Exception {
140    getCapabilities(instanceInfo).testWithFail(removeIgnored(instanceInfo));
141  }
142
143  /**
144   * Sets the format of the input instances.
145   *
146   * @param instanceInfo an Instances object containing the input instance
147   * structure (any instances contained in the object are ignored - only the
148   * structure is required).
149   * @return true if the outputFormat may be collected immediately
150   * @throws Exception if the inputFormat can't be set successfully
151   */ 
152  public boolean setInputFormat(Instances instanceInfo) throws Exception {
153    super.setInputFormat(instanceInfo);
154
155    m_removeAttributes = null;
156
157    return false;
158  }
159
160  /**
161   * filters all attributes that should be ignored.
162   *
163   * @param data        the data to filter
164   * @return            the filtered data
165   * @throws Exception  if filtering fails
166   */
167  protected Instances removeIgnored(Instances data) throws Exception {
168    Instances result = data;
169   
170    if (m_IgnoreAttributesRange != null || data.classIndex() >= 0) {
171      m_removeAttributes = new Remove();
172      String rangeString = "";
173      if (m_IgnoreAttributesRange != null) {
174        rangeString += m_IgnoreAttributesRange.getRanges();
175      }
176      if (data.classIndex() >= 0) {
177        if (rangeString.length() > 0) {
178          rangeString += "," + (data.classIndex() + 1);
179        } else {
180          rangeString = "" + (data.classIndex() + 1);
181        }
182      }
183      ((Remove) m_removeAttributes).setAttributeIndices(rangeString);
184      ((Remove) m_removeAttributes).setInvertSelection(false);
185      m_removeAttributes.setInputFormat(data);
186      result = Filter.useFilter(data, m_removeAttributes);
187    }
188   
189    return result;
190  }
191 
192  /**
193   * Signify that this batch of input to the filter is finished.
194   *
195   * @return true if there are instances pending output
196   * @throws IllegalStateException if no input structure has been defined
197   */ 
198  public boolean batchFinished() throws Exception {
199    if (getInputFormat() == null)
200      throw new IllegalStateException("No input instance format defined");
201
202    Instances toFilter = getInputFormat();
203   
204    if (!isFirstBatchDone()) {
205      // filter out attributes if necessary
206      Instances toFilterIgnoringAttributes = removeIgnored(toFilter);
207
208      // serialized model or build clusterer from scratch?
209      File file = getSerializedClustererFile();
210      if (!file.isDirectory()) {
211        ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file));
212        m_ActualClusterer = (Clusterer) ois.readObject();
213        Instances header = null;
214        // let's see whether there's an Instances header stored as well
215        try {
216          header = (Instances) ois.readObject();
217        }
218        catch (Exception e) {
219          // ignored
220        }
221        ois.close();
222        // same dataset format?
223        if ((header != null) && (!header.equalHeaders(toFilterIgnoringAttributes)))
224          throw new WekaException(
225              "Training header of clusterer and filter dataset don't match:\n"
226              + header.equalHeadersMsg(toFilterIgnoringAttributes));
227      }
228      else {
229        m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
230        m_ActualClusterer.buildClusterer(toFilterIgnoringAttributes);
231      }
232
233      // create output dataset with new attribute
234      Instances filtered = new Instances(toFilter, 0); 
235      FastVector nominal_values = new FastVector(m_ActualClusterer.numberOfClusters());
236      for (int i = 0; i < m_ActualClusterer.numberOfClusters(); i++) {
237        nominal_values.addElement("cluster" + (i+1)); 
238      }
239      filtered.insertAttributeAt(new Attribute("cluster", nominal_values),
240          filtered.numAttributes());
241
242      setOutputFormat(filtered);
243    }
244
245    // build new dataset
246    for (int i=0; i<toFilter.numInstances(); i++) {
247      convertInstance(toFilter.instance(i));
248    }
249   
250    flushInput();
251    m_NewBatch = true;
252    m_FirstBatchDone = true;
253
254    return (numPendingOutput() != 0);
255  }
256
257  /**
258   * Input an instance for filtering. Ordinarily the instance is processed
259   * and made available for output immediately. Some filters require all
260   * instances be read before producing output.
261   *
262   * @param instance the input instance
263   * @return true if the filtered instance may now be
264   * collected with output().
265   * @throws IllegalStateException if no input format has been defined.
266   */
267  public boolean input(Instance instance) throws Exception {
268    if (getInputFormat() == null)
269      throw new IllegalStateException("No input instance format defined");
270
271    if (m_NewBatch) {
272      resetQueue();
273      m_NewBatch = false;
274    }
275   
276    if (outputFormatPeek() != null) {
277      convertInstance(instance);
278      return true;
279    }
280
281    bufferInput(instance);
282    return false;
283  }
284
285  /**
286   * Convert a single instance over. The converted instance is added to
287   * the end of the output queue.
288   *
289   * @param instance the instance to convert
290   * @throws Exception if something goes wrong
291   */
292  protected void convertInstance(Instance instance) throws Exception {
293    Instance original, processed;
294    original = instance;
295
296    // copy values
297    double[] instanceVals = new double[instance.numAttributes()+1];
298    for(int j = 0; j < instance.numAttributes(); j++) {
299      instanceVals[j] = original.value(j);
300    }
301    Instance filteredI = null;
302    if (m_removeAttributes != null) {
303      m_removeAttributes.input(instance);
304      filteredI = m_removeAttributes.output();
305    } else {
306      filteredI = instance;
307    }
308
309    // add cluster to end
310    try {
311      instanceVals[instance.numAttributes()] = m_ActualClusterer.clusterInstance(filteredI);
312    }
313    catch (Exception e) {
314      // clusterer couldn't cluster instance -> missing
315      instanceVals[instance.numAttributes()] = Utils.missingValue();
316    }
317
318    // create new instance
319    if (original instanceof SparseInstance) {
320      processed = new SparseInstance(original.weight(), instanceVals);
321    } else {
322      processed = new DenseInstance(original.weight(), instanceVals);
323    }
324
325    processed.setDataset(instance.dataset());
326    copyValues(processed, false, instance.dataset(), getOutputFormat());
327    processed.setDataset(getOutputFormat());
328     
329    push(processed);
330  }
331
332  /**
333   * Returns an enumeration describing the available options.
334   *
335   * @return an enumeration of all the available options.
336   */
337  public Enumeration listOptions() {
338    Vector result = new Vector();
339   
340    result.addElement(new Option(
341        "\tFull class name of clusterer to use, followed\n"
342        + "\tby scheme options. eg:\n"
343        + "\t\t\"weka.clusterers.SimpleKMeans -N 3\"\n"
344        + "\t(default: weka.clusterers.SimpleKMeans)",
345        "W", 1, "-W <clusterer specification>"));
346
347    result.addElement(new Option(
348        "\tInstead of building a clusterer on the data, one can also provide\n"
349        + "\ta serialized model and use that for adding the clusters.",
350        "serialized", 1, "-serialized <file>"));
351   
352    result.addElement(new Option(
353        "\tThe range of attributes the clusterer should ignore.\n",
354        "I", 1,"-I <att1,att2-att4,...>"));
355
356    return result.elements();
357  }
358
359
360  /**
361   * Parses a given list of options. <p/>
362   *
363   <!-- options-start -->
364   * Valid options are: <p/>
365   *
366   * <pre> -W &lt;clusterer specification&gt;
367   *  Full class name of clusterer to use, followed
368   *  by scheme options. eg:
369   *   "weka.clusterers.SimpleKMeans -N 3"
370   *  (default: weka.clusterers.SimpleKMeans)</pre>
371   *
372   * <pre> -serialized &lt;file&gt;
373   *  Instead of building a clusterer on the data, one can also provide
374   *  a serialized model and use that for adding the clusters.</pre>
375   *
376   * <pre> -I &lt;att1,att2-att4,...&gt;
377   *  The range of attributes the clusterer should ignore.
378   * </pre>
379   *
380   <!-- options-end -->
381   *
382   * @param options the list of options as an array of strings
383   * @throws Exception if an option is not supported
384   */
385  public void setOptions(String[] options) throws Exception {
386    String      tmpStr;
387    String[]    tmpOptions;
388    File        file;
389    boolean     serializedModel;
390   
391    serializedModel = false;
392    tmpStr = Utils.getOption("serialized", options);
393    if (tmpStr.length() != 0) {
394      file = new File(tmpStr);
395      if (!file.exists())
396        throw new FileNotFoundException(
397            "File '" + file.getAbsolutePath() + "' not found!");
398      if (file.isDirectory())
399        throw new FileNotFoundException(
400            "'" + file.getAbsolutePath() + "' points to a directory not a file!");
401      setSerializedClustererFile(file);
402      serializedModel = true;
403    }
404    else {
405      setSerializedClustererFile(null);
406    }
407
408    if (!serializedModel) {
409      tmpStr = Utils.getOption('W', options);
410      if (tmpStr.length() == 0)
411        tmpStr = weka.clusterers.SimpleKMeans.class.getName();
412      tmpOptions = Utils.splitOptions(tmpStr);
413      if (tmpOptions.length == 0) {
414        throw new Exception("Invalid clusterer specification string");
415      }
416      tmpStr = tmpOptions[0];
417      tmpOptions[0] = "";
418      setClusterer(AbstractClusterer.forName(tmpStr, tmpOptions));
419    }
420       
421    setIgnoredAttributeIndices(Utils.getOption('I', options));
422
423    Utils.checkForRemainingOptions(options);
424  }
425
426  /**
427   * Gets the current settings of the filter.
428   *
429   * @return an array of strings suitable for passing to setOptions
430   */
431  public String[] getOptions() {
432    Vector<String>      result;
433    File                file;
434
435    result = new Vector<String>();
436
437    file = getSerializedClustererFile();
438    if ((file != null) && (!file.isDirectory())) {
439      result.add("-serialized");
440      result.add(file.getAbsolutePath());
441    }
442    else {
443      result.add("-W");
444      result.add(getClustererSpec());
445    }
446   
447    if (!getIgnoredAttributeIndices().equals("")) {
448      result.add("-I");
449      result.add(getIgnoredAttributeIndices());
450    }
451   
452    return result.toArray(new String[result.size()]);
453  }
454
455  /**
456   * Returns a string describing this filter.
457   *
458   * @return a description of the filter suitable for
459   * displaying in the explorer/experimenter gui
460   */
461  public String globalInfo() {
462    return 
463        "A filter that adds a new nominal attribute representing the cluster "
464      + "assigned to each instance by the specified clustering algorithm.\n"
465      + "Either the clustering algorithm gets built with the first batch of "
466      + "data or one specifies are serialized clusterer model file to use "
467      + "instead.";
468  }
469
470  /**
471   * Returns the tip text for this property.
472   *
473   * @return tip text for this property suitable for
474   * displaying in the explorer/experimenter gui
475   */
476  public String clustererTipText() {
477    return "The clusterer to assign clusters with.";
478  }
479
480  /**
481   * Sets the clusterer to assign clusters with.
482   *
483   * @param clusterer The clusterer to be used (with its options set).
484   */
485  public void setClusterer(Clusterer clusterer) {
486    m_Clusterer = clusterer;
487  }
488 
489  /**
490   * Gets the clusterer used by the filter.
491   *
492   * @return The clusterer being used.
493   */
494  public Clusterer getClusterer() {
495    return m_Clusterer;
496  }
497
498  /**
499   * Gets the clusterer specification string, which contains the class name of
500   * the clusterer and any options to the clusterer.
501   *
502   * @return the clusterer string.
503   */
504  protected String getClustererSpec() {
505    Clusterer c = getClusterer();
506    if (c instanceof OptionHandler) {
507      return c.getClass().getName() + " "
508        + Utils.joinOptions(((OptionHandler)c).getOptions());
509    }
510    return c.getClass().getName();
511  }
512
513  /**
514   * Returns the tip text for this property.
515   *
516   * @return tip text for this property suitable for
517   * displaying in the explorer/experimenter gui
518   */
519  public String ignoredAttributeIndicesTipText() {
520    return "The range of attributes to be ignored by the clusterer. eg: first-3,5,9-last";
521  }
522
523  /**
524   * Gets ranges of attributes to be ignored.
525   *
526   * @return a string containing a comma-separated list of ranges
527   */
528  public String getIgnoredAttributeIndices() {
529    if (m_IgnoreAttributesRange == null)
530      return "";
531    else
532      return m_IgnoreAttributesRange.getRanges();
533  }
534
535  /**
536   * Sets the ranges of attributes to be ignored. If provided string
537   * is null, no attributes will be ignored.
538   *
539   * @param rangeList a string representing the list of attributes.
540   * eg: first-3,5,6-last
541   * @throws IllegalArgumentException if an invalid range list is supplied
542   */
543  public void setIgnoredAttributeIndices(String rangeList) {
544    if ((rangeList == null) || (rangeList.length() == 0)) {
545      m_IgnoreAttributesRange = null;
546    } else {
547      m_IgnoreAttributesRange = new Range();
548      m_IgnoreAttributesRange.setRanges(rangeList);
549    }
550  }
551
552  /**
553   * Gets the file pointing to a serialized, built clusterer. If it is
554   * null or pointing to a directory it will not be used.
555   *
556   * @return            the file the serialized, built clusterer is located in
557   */
558  public File getSerializedClustererFile() {
559    return m_SerializedClustererFile;
560  }
561
562  /**
563   * Sets the file pointing to a serialized, built clusterer. If the
564   * argument is null, doesn't exist or pointing to a directory, then the
565   * value is ignored.
566   *
567   * @param value       the file pointing to the serialized, built clusterer
568   */
569  public void setSerializedClustererFile(File value) {
570    if ((value == null) || (!value.exists()))
571      value = new File(System.getProperty("user.dir"));
572
573    m_SerializedClustererFile = value;
574  }
575 
576  /**
577   * Returns the tip text for this property.
578   *
579   * @return            tip text for this property suitable for
580   *                    displaying in the explorer/experimenter gui
581   */
582  public String serializedClustererFileTipText() {
583    return "A file containing the serialized model of a built clusterer.";
584  }
585 
586  /**
587   * Returns the revision string.
588   *
589   * @return            the revision
590   */
591  public String getRevision() {
592    return RevisionUtils.extract("$Revision: 5987 $");
593  }
594
595  /**
596   * Main method for testing this class.
597   *
598   * @param argv should contain arguments to the filter: use -h for help
599   */
600  public static void main(String[] argv) {
601    runFilter(new AddCluster(), argv);
602  }
603}
Note: See TracBrowser for help on using the repository browser.