source: tags/MetisMQIDemo/src/main/java/weka/datagenerators/classifiers/classification/RDG1.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 34.2 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * RDG1.java
19 * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.datagenerators.classifiers.classification;
24
25import weka.core.Attribute;
26import weka.core.FastVector;
27import weka.core.Instance; 
28import weka.core.DenseInstance;
29import weka.core.Instances;
30import weka.core.Option;
31import weka.core.RevisionHandler;
32import weka.core.RevisionUtils;
33import weka.core.Utils;
34import weka.datagenerators.ClassificationGenerator;
35import weka.datagenerators.Test;
36
37import java.io.Serializable;
38import java.util.Enumeration;
39import java.util.Random;
40import java.util.Vector;
41
42/**
43 <!-- globalinfo-start -->
44 * A data generator that produces data randomly by producing a decision list.<br/>
45 * The decision list consists of rules.<br/>
46 * Instances are generated randomly one by one. If decision list fails to classify the current instance, a new rule according to this current instance is generated and added to the decision list.<br/>
47 * <br/>
48 * The option -V switches on voting, which means that at the end of the generation all instances are reclassified to the class value that is supported by the most rules.<br/>
49 * <br/>
50 * This data generator can generate 'boolean' attributes (= nominal with the values {true, false}) and numeric attributes. The rules can be 'A' or 'NOT A' for boolean values and 'B &lt; random_value' or 'B &gt;= random_value' for numeric values.
51 * <p/>
52 <!-- globalinfo-end -->
53 *
54 <!-- options-start -->
55 * Valid options are: <p/>
56 *
57 * <pre> -h
58 *  Prints this help.</pre>
59 *
60 * <pre> -o &lt;file&gt;
61 *  The name of the output file, otherwise the generated data is
62 *  printed to stdout.</pre>
63 *
64 * <pre> -r &lt;name&gt;
65 *  The name of the relation.</pre>
66 *
67 * <pre> -d
68 *  Whether to print debug informations.</pre>
69 *
70 * <pre> -S
71 *  The seed for random function (default 1)</pre>
72 *
73 * <pre> -n &lt;num&gt;
74 *  The number of examples to generate (default 100)</pre>
75 *
76 * <pre> -a &lt;num&gt;
77 *  The number of attributes (default 10).</pre>
78 *
79 * <pre> -c &lt;num&gt;
80 *  The number of classes (default 2)</pre>
81 *
82 * <pre> -R &lt;num&gt;
83 *  maximum size for rules (default 10) </pre>
84 *
85 * <pre> -M &lt;num&gt;
86 *  minimum size for rules (default 1) </pre>
87 *
88 * <pre> -I &lt;num&gt;
89 *  number of irrelevant attributes (default 0)</pre>
90 *
91 * <pre> -N &lt;num&gt;
92 *  number of numeric attributes (default 0)</pre>
93 *
94 * <pre> -V
95 *  switch on voting (default is no voting)</pre>
96 *
97 <!-- options-end -->
98 *
99 * Following an example of a generated dataset: <br/>
100 * <pre>
101 * %
102 * % weka.datagenerators.classifiers.classification.RDG1 -r expl -a 2 -c 3 -n 4 -N 1 -I 0 -M 2 -R 10 -S 2
103 * %
104 * relation expl
105 *
106 * attribute a0 {false,true}
107 * attribute a1 numeric
108 * attribute class {c0,c1,c2}
109 *
110 * data
111 *
112 * true,0.496823,c0
113 * false,0.743158,c1
114 * false,0.408285,c1
115 * false,0.993687,c2
116 * %
117 * % Number of attributes chosen as irrelevant = 0
118 * %
119 * % DECISIONLIST (number of rules = 3):
120 * % RULE 0:   c0 := a1 &lt; 0.986, a0
121 * % RULE 1:   c1 := a1 &lt; 0.95, not(a0)
122 * % RULE 2:   c2 := not(a0), a1 &gt;= 0.562
123 * </pre>
124 *
125 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
126 * @version $Revision: 5987 $
127 */
128public class RDG1 
129  extends ClassificationGenerator {
130
131  /** for serialization */
132  static final long serialVersionUID = 7751005204635320414L; 
133 
134  /**
135   * class to represent decisionlist
136   */
137  private class RuleList 
138    implements Serializable, RevisionHandler {
139
140    /** for serialization */
141    static final long serialVersionUID = 2830125413361938177L;
142   
143    /** rule list */
144    private FastVector m_RuleList = null;
145   
146    /** class */
147    double m_ClassValue = 0.0;
148
149    /**
150     * returns the class value
151     *
152     * @return the class value
153     */
154    public double getClassValue() { 
155      return m_ClassValue; 
156    }
157   
158    /**
159     * sets the class value
160     *
161     * @param newClassValue the new classvalue
162     */
163    public void setClassValue(double newClassValue) {
164      m_ClassValue = newClassValue;
165    }
166   
167    /**
168     * adds the given test to the list
169     *
170     * @param newTest the test to add
171     */
172    private void addTest (Test newTest) { 
173      if (m_RuleList == null)
174        m_RuleList = new FastVector();
175     
176      m_RuleList.addElement(newTest);
177    }
178   
179    /**
180     * classifies the given example
181     *
182     * @param example the instance to classify
183     * @return the classification
184     * @throws Exception if classification fails
185     */
186    private double classifyInstance (Instance example) throws Exception {
187      boolean passedAllTests = true;
188      for (Enumeration e = m_RuleList.elements(); 
189           passedAllTests && e.hasMoreElements(); ) {
190        Test test = (Test) e.nextElement();
191        passedAllTests = test.passesTest(example);
192      }
193      if (passedAllTests) return m_ClassValue;
194      else return -1.0;
195    }
196   
197    /**
198     * returns a string representation of the rule list
199     *
200     * @return the rule list as string
201     */
202    public String toString () {
203      StringBuffer str = new StringBuffer();
204      str = str.append("  c" + (int) m_ClassValue + " := ");
205      Enumeration e = m_RuleList.elements();
206      if (e.hasMoreElements()) {
207        Test test = (Test) e.nextElement();
208        str = str.append(test.toPrologString()); 
209      }
210      while (e.hasMoreElements()) {
211        Test test = (Test) e.nextElement();
212        str = str.append(", " + test.toPrologString());       
213      }
214      return str.toString();
215    } 
216   
217    /**
218     * Returns the revision string.
219     *
220     * @return          the revision
221     */
222    public String getRevision() {
223      return RevisionUtils.extract("$Revision: 5987 $");
224    }
225  } /*end class RuleList ******/
226
227  /** Number of attributes the dataset should have */
228  protected int m_NumAttributes;
229
230  /** Number of Classes the dataset should have */
231  protected int m_NumClasses;
232
233  /** maximum rule size*/ 
234  private int m_MaxRuleSize;
235 
236  /** minimum rule size*/ 
237  private int m_MinRuleSize;
238 
239  /** number of irrelevant attributes.*/
240  private int m_NumIrrelevant;
241
242  /** number of numeric attributes*/
243  private int m_NumNumeric;
244 
245  /** flag that stores if voting is wished*/ 
246  private boolean m_VoteFlag = false;
247
248   /** decision list */
249  private FastVector m_DecisionList = null;
250
251  /** array defines which attributes are irrelevant, with:
252   * true = attribute is irrelevant; false = attribute is not irrelevant*/
253  boolean[] m_AttList_Irr;
254
255  /**
256   * initializes the generator with default values
257   */
258  public RDG1() {
259    super();
260
261    setNumAttributes(defaultNumAttributes());
262    setNumClasses(defaultNumClasses());
263    setMaxRuleSize(defaultMaxRuleSize());
264    setMinRuleSize(defaultMinRuleSize());
265    setNumIrrelevant(defaultNumIrrelevant());
266    setNumNumeric(defaultNumNumeric());
267  }
268
269  /**
270   * Returns a string describing this data generator.
271   *
272   * @return a description of the data generator suitable for
273   * displaying in the explorer/experimenter gui
274   */
275  public String globalInfo() {
276    return
277        "A data generator that produces data randomly by producing a decision list.\n"
278      + "The decision list consists of rules.\n"
279      + "Instances are generated randomly one by one. If decision list fails "
280      + "to classify the current instance, a new rule according to this current "
281      + "instance is generated and added to the decision list.\n\n"
282      + "The option -V switches on voting, which means that at the end "
283      + "of the generation all instances are "
284      + "reclassified to the class value that is supported by the most rules.\n\n"
285      + "This data generator can generate 'boolean' attributes (= nominal with "
286      + "the values {true, false}) and numeric attributes. The rules can be "
287      + "'A' or 'NOT A' for boolean values and 'B < random_value' or "
288      + "'B >= random_value' for numeric values.";
289  }
290
291 /**
292   * Returns an enumeration describing the available options.
293   *
294   * @return an enumeration of all the available options
295   */
296  public Enumeration listOptions() {
297    Vector result = enumToVector(super.listOptions());
298
299    result.addElement(new Option(
300          "\tThe number of attributes (default " 
301          + defaultNumAttributes() + ").",
302          "a", 1, "-a <num>"));
303
304    result.addElement(new Option(
305        "\tThe number of classes (default " + defaultNumClasses() + ")",
306        "c", 1, "-c <num>"));
307
308    result.addElement(new Option(
309          "\tmaximum size for rules (default " 
310          + defaultMaxRuleSize() + ") ",
311          "R", 1, "-R <num>"));
312   
313    result.addElement(new Option(
314          "\tminimum size for rules (default " 
315          + defaultMinRuleSize() + ") ",
316          "M", 1, "-M <num>"));
317   
318    result.addElement(new Option(
319          "\tnumber of irrelevant attributes (default " 
320          + defaultNumIrrelevant() + ")",
321          "I", 1, "-I <num>"));
322   
323    result.addElement(new Option(
324          "\tnumber of numeric attributes (default "
325          + defaultNumNumeric() + ")",
326          "N", 1, "-N"));
327   
328    result.addElement(new Option(
329          "\tswitch on voting (default is no voting)",
330          "V", 1, "-V"));
331   
332    return result.elements();
333  }
334
335  /**
336   * Parses a list of options for this object. <p/>
337   *
338   <!-- options-start -->
339   * Valid options are: <p/>
340   *
341   * <pre> -h
342   *  Prints this help.</pre>
343   *
344   * <pre> -o &lt;file&gt;
345   *  The name of the output file, otherwise the generated data is
346   *  printed to stdout.</pre>
347   *
348   * <pre> -r &lt;name&gt;
349   *  The name of the relation.</pre>
350   *
351   * <pre> -d
352   *  Whether to print debug informations.</pre>
353   *
354   * <pre> -S
355   *  The seed for random function (default 1)</pre>
356   *
357   * <pre> -n &lt;num&gt;
358   *  The number of examples to generate (default 100)</pre>
359   *
360   * <pre> -a &lt;num&gt;
361   *  The number of attributes (default 10).</pre>
362   *
363   * <pre> -c &lt;num&gt;
364   *  The number of classes (default 2)</pre>
365   *
366   * <pre> -R &lt;num&gt;
367   *  maximum size for rules (default 10) </pre>
368   *
369   * <pre> -M &lt;num&gt;
370   *  minimum size for rules (default 1) </pre>
371   *
372   * <pre> -I &lt;num&gt;
373   *  number of irrelevant attributes (default 0)</pre>
374   *
375   * <pre> -N &lt;num&gt;
376   *  number of numeric attributes (default 0)</pre>
377   *
378   * <pre> -V
379   *  switch on voting (default is no voting)</pre>
380   *
381   <!-- options-end -->
382   *
383   * @param options the list of options as an array of strings
384   * @throws Exception if an option is not supported
385   */
386  public void setOptions(String[] options) throws Exception {
387    String      tmpStr;
388
389    super.setOptions(options);
390
391    tmpStr = Utils.getOption('a', options);
392    if (tmpStr.length() != 0)
393      setNumAttributes(Integer.parseInt(tmpStr));
394    else
395      setNumAttributes(defaultNumAttributes());
396
397    tmpStr = Utils.getOption('c', options);
398    if (tmpStr.length() != 0)
399      setNumClasses(Integer.parseInt(tmpStr));
400    else
401      setNumClasses(defaultNumClasses());
402
403    tmpStr = Utils.getOption('R', options);
404    if (tmpStr.length() != 0)
405      setMaxRuleSize(Integer.parseInt(tmpStr));
406    else 
407      setMaxRuleSize(defaultMaxRuleSize());
408
409    tmpStr = Utils.getOption('M', options);
410    if (tmpStr.length() != 0)
411      setMinRuleSize(Integer.parseInt(tmpStr));
412    else
413      setMinRuleSize(defaultMinRuleSize());
414
415    tmpStr = Utils.getOption('I', options);
416    if (tmpStr.length() != 0)
417      setNumIrrelevant(Integer.parseInt(tmpStr));
418    else
419      setNumIrrelevant(defaultNumIrrelevant());
420
421    if ((getNumAttributes() - getNumIrrelevant()) < getMinRuleSize())
422       throw new Exception("Possible rule size is below minimal rule size.");
423
424    tmpStr = Utils.getOption('N', options);
425    if (tmpStr.length() != 0)
426      setNumNumeric(Integer.parseInt(tmpStr));
427    else
428      setNumNumeric(defaultNumNumeric());
429
430    setVoteFlag(Utils.getFlag('V', options));
431  }
432
433  /**
434   * Gets the current settings of the datagenerator RDG1.
435   *
436   * @return an array of strings suitable for passing to setOptions
437   */
438  public String[] getOptions() {
439    Vector        result;
440    String[]      options;
441    int           i;
442   
443    result  = new Vector();
444    options = super.getOptions();
445    for (i = 0; i < options.length; i++)
446      result.add(options[i]);
447   
448    result.add("-a");
449    result.add("" + getNumAttributes());
450   
451    result.add("-c");
452    result.add("" + getNumClasses());
453
454    result.add("-N"); 
455    result.add("" + getNumNumeric());
456   
457    result.add("-I"); 
458    result.add("" + getNumIrrelevant());
459   
460    result.add("-M"); 
461    result.add("" + getMinRuleSize());
462   
463    result.add("-R"); 
464    result.add("" + getMaxRuleSize());
465   
466    if (getVoteFlag())
467      result.add("-V"); 
468
469    return (String[]) result.toArray(new String[result.size()]);
470  }
471
472  /**
473   * returns the default number of attributes
474   *
475   * @return the default number of attributes
476   */
477  protected int defaultNumAttributes() {
478    return 10;
479  }
480
481  /**
482   * Sets the number of attributes the dataset should have.
483   * @param numAttributes the new number of attributes
484   */
485  public void setNumAttributes(int numAttributes) {
486    m_NumAttributes = numAttributes;
487  }
488
489  /**
490   * Gets the number of attributes that should be produced.
491   * @return the number of attributes that should be produced
492   */
493  public int getNumAttributes() { 
494    return m_NumAttributes; 
495  }
496 
497  /**
498   * Returns the tip text for this property
499   *
500   * @return tip text for this property suitable for
501   *         displaying in the explorer/experimenter gui
502   */
503  public String numAttributesTipText() {
504    return "The number of attributes the generated data will contain.";
505  }
506
507  /**
508   * returns the default number of classes
509   *
510   * @return the default number of classes
511   */
512  protected int defaultNumClasses() {
513    return 2;
514  }
515
516  /**
517   * Sets the number of classes the dataset should have.
518   * @param numClasses the new number of classes
519   */
520  public void setNumClasses(int numClasses) { 
521    m_NumClasses = numClasses; 
522  }
523
524  /**
525   * Gets the number of classes the dataset should have.
526   * @return the number of classes the dataset should have
527   */
528  public int getNumClasses() { 
529    return m_NumClasses; 
530  }
531 
532  /**
533   * Returns the tip text for this property
534   *
535   * @return tip text for this property suitable for
536   *         displaying in the explorer/experimenter gui
537   */
538  public String numClassesTipText() {
539    return "The number of classes to generate.";
540  }
541
542  /**
543   * returns the default max size of rules
544   *
545   * @return the default max size of rules
546   */
547  protected int defaultMaxRuleSize() {
548    return 10;
549  }
550
551  /**
552   * Gets the maximum number of tests in rules.
553   *
554   * @return the maximum number of tests allowed in rules
555   */
556  public int getMaxRuleSize() { 
557    return m_MaxRuleSize; 
558  }
559 
560  /**
561   * Sets the maximum number of tests in rules.
562   *
563   * @param newMaxRuleSize new maximum number of tests allowed in rules.
564   */
565  public void setMaxRuleSize(int newMaxRuleSize) {
566    m_MaxRuleSize = newMaxRuleSize;
567  }
568 
569  /**
570   * Returns the tip text for this property
571   *
572   * @return tip text for this property suitable for
573   *         displaying in the explorer/experimenter gui
574   */
575  public String maxRuleSizeTipText() {
576    return "The maximum number of tests in rules.";
577  }
578
579  /**
580   * returns the default min size of rules
581   *
582   * @return the default min size of rules
583   */
584  protected int defaultMinRuleSize() {
585    return 1;
586  }
587
588  /**
589   * Gets the minimum number of tests in rules.
590   *
591   * @return the minimum number of tests allowed in rules
592   */
593  public int getMinRuleSize() { 
594    return m_MinRuleSize; 
595  }
596 
597  /**
598   * Sets the minimum number of tests in rules.
599   *
600   * @param newMinRuleSize new minimum number of test in rules.
601   */
602  public void setMinRuleSize(int newMinRuleSize) {
603    m_MinRuleSize = newMinRuleSize;
604  }
605 
606  /**
607   * Returns the tip text for this property
608   *
609   * @return tip text for this property suitable for
610   *         displaying in the explorer/experimenter gui
611   */
612  public String minRuleSizeTipText() {
613    return "The minimum number of tests in rules.";
614  }
615
616  /**
617   * returns the default number of irrelevant attributes
618   *
619   * @return the default number of irrelevant attributes
620   */
621  protected int defaultNumIrrelevant() {
622    return 0;
623  }
624
625  /**
626   * Gets the number of irrelevant attributes.
627   *
628   * @return the number of irrelevant attributes
629   */
630  public int getNumIrrelevant() { 
631    return m_NumIrrelevant; 
632  }
633 
634  /**
635   * Sets the number of irrelevant attributes.
636   *
637   * @param newNumIrrelevant the number of irrelevant attributes.
638   */
639  public void setNumIrrelevant(int newNumIrrelevant) {
640    m_NumIrrelevant = newNumIrrelevant;
641  }
642 
643  /**
644   * Returns the tip text for this property
645   *
646   * @return tip text for this property suitable for
647   *         displaying in the explorer/experimenter gui
648   */
649  public String numIrrelevantTipText() {
650    return "The number of irrelevant attributes.";
651  }
652
653  /**
654   * returns the default number of numeric attributes
655   *
656   * @return the default number of numeric attributes
657   */
658  protected int defaultNumNumeric() {
659    return 0;
660  }
661
662  /**
663   * Gets the number of numerical attributes.
664   *
665   * @return the number of numerical attributes.
666   */
667  public int getNumNumeric() { 
668    return m_NumNumeric; 
669  }
670 
671  /**
672   * Sets the number of numerical attributes.
673   *
674   * @param newNumNumeric the number of numerical attributes.
675   */
676  public void setNumNumeric(int newNumNumeric) { 
677    m_NumNumeric = newNumNumeric;
678  }
679 
680  /**
681   * Returns the tip text for this property
682   *
683   * @return tip text for this property suitable for
684   *         displaying in the explorer/experimenter gui
685   */
686  public String numNumericTipText() {
687    return "The number of numerical attributes.";
688  }
689
690  /**
691   * Gets the vote flag.
692   *
693   * @return voting flag.
694   */
695  public boolean getVoteFlag() { 
696    return m_VoteFlag; 
697  }
698 
699  /**
700   * Sets the vote flag.
701   *
702   * @param newVoteFlag boolean with the new setting of the vote flag.
703   */
704  public void setVoteFlag(boolean newVoteFlag) { 
705    m_VoteFlag = newVoteFlag; 
706  }
707 
708  /**
709   * Returns the tip text for this property
710   *
711   * @return tip text for this property suitable for
712   *         displaying in the explorer/experimenter gui
713   */
714  public String voteFlagTipText() {
715    return "Whether to use voting or not.";
716  }
717
718  /**
719   * Gets the single mode flag.
720   *
721   * @return true if methode generateExample can be used.
722   */
723  public boolean getSingleModeFlag() { 
724    return (!getVoteFlag()); 
725  }
726
727  /**
728   * Gets the array that defines which of the attributes
729   * are seen to be irrelevant.
730   *
731   * @return the array that defines the irrelevant attributes
732   */
733  public boolean[] getAttList_Irr() { 
734    return m_AttList_Irr; 
735  }
736 
737  /**
738   * Sets the array that defines which of the attributes
739   * are seen to be irrelevant.
740   *
741   * @param newAttList_Irr array that defines the irrelevant attributes.
742   */
743  public void setAttList_Irr(boolean[] newAttList_Irr) {
744    m_AttList_Irr = newAttList_Irr;
745  }
746 
747  /**
748   * Returns the tip text for this property
749   *
750   * @return tip text for this property suitable for
751   *         displaying in the explorer/experimenter gui
752   */
753  public String attList_IrrTipText() {
754    return "The array with the indices of the irrelevant attributes.";
755  }
756
757  /**
758   * Initializes the format for the dataset produced.
759   *
760   * @return the output data format
761   * @throws Exception data format could not be defined
762   */
763  public Instances defineDataFormat() throws Exception {
764    Instances dataset;
765    Random random = new Random (getSeed());
766    setRandom(random);
767
768    m_DecisionList = new FastVector();
769
770    // number of examples is the same as given per option
771    setNumExamplesAct(getNumExamples());
772
773    // define dataset
774    dataset = defineDataset(random);
775    return dataset; 
776  }
777
778  /**
779   * Generate an example of the dataset dataset.
780   * @return the instance generated
781   * @throws Exception if format not defined or generating <br/>
782   * examples one by one is not possible, because voting is chosen
783   */
784  public Instance generateExample() throws Exception {
785    Random random = getRandom();
786    Instances format = getDatasetFormat();
787
788    if (format == null) 
789      throw new Exception("Dataset format not defined.");
790    if (getVoteFlag()) 
791      throw new Exception("Examples cannot be generated one by one.");
792
793    // generate values for all attributes
794    format = generateExamples(1, random, format);
795
796    return format.lastInstance();
797  }
798
799  /**
800   * Generate all examples of the dataset.
801   * @return the instance generated
802   * @throws Exception if format not defined or generating <br/>
803   * examples one by one is not possible, because voting is chosen
804   */
805  public Instances generateExamples() throws Exception {
806    Random random = getRandom();
807    Instances format = getDatasetFormat();
808    if (format == null) 
809      throw new Exception("Dataset format not defined.");
810
811    // generate values for all attributes
812    format = generateExamples(getNumExamplesAct(), random, format);
813
814    // vote all examples, and set new class value
815    if (getVoteFlag())
816      format = voteDataset(format);
817
818    return format;
819  }
820
821  /**
822   * Generate all examples of the dataset.
823   * @param num the number of examples to generate
824   * @param random the random number generator to use
825   * @param format the dataset format
826   * @return the instance generated
827   * @throws Exception if format not defined or generating <br/>
828   * examples one by one is not possible, because voting is chosen
829   */
830  public Instances generateExamples(int num, 
831                                   Random random,
832                                   Instances format) throws Exception {
833
834    if (format == null) 
835      throw new Exception("Dataset format not defined.");
836   
837    // generate values for all attributes
838    for (int i = 0; i < num; i++)  {
839      // over all examples to be produced
840      Instance example = generateExample(random, format);
841
842      // set class of example using decision list
843      boolean classDefined = classifyExample(example);
844      if (!classDefined) {
845        // set class with newly generated rule
846        example = updateDecisionList(random, example);
847      }
848      example.setDataset(format);
849      format.add(example);
850    }
851
852    return (format);
853  }
854
855 /**
856   * Generates a new rule for the decision list.
857   * and classifies the new example
858   * @param random random number generator
859   * @param example example used to update decision list
860   * @return the classified example
861   * @throws Exception if dataset format not defined
862   */
863  private Instance updateDecisionList(Random random, Instance example)
864   throws Exception {
865
866    FastVector TestList;
867    Instances format = getDatasetFormat();
868    if (format == null) 
869      throw new Exception("Dataset format not defined.");
870
871    TestList = generateTestList(random, example);
872
873    int maxSize = getMaxRuleSize() < TestList.size() ? 
874                            getMaxRuleSize() : TestList.size();
875    int ruleSize = ((int) (random.nextDouble() * 
876                             (double) (maxSize - getMinRuleSize())))
877                                   + getMinRuleSize();
878
879    RuleList newRule = new RuleList();
880    for (int i=0; i < ruleSize; i++) {
881      int testIndex = (int) (random.nextDouble() * (double) TestList.size());
882      Test test = (Test) TestList.elementAt(testIndex);
883         
884      newRule.addTest(test);
885      TestList.removeElementAt(testIndex);
886    }
887    double newClassValue = 0.0;
888    if (m_DecisionList.size() > 0) {
889      RuleList r = (RuleList)(m_DecisionList.lastElement());
890      double oldClassValue = (double) 
891                        (r.getClassValue());
892      newClassValue = (double)((int)oldClassValue + 1)
893                               % getNumClasses();
894    }
895    newRule.setClassValue(newClassValue);
896    m_DecisionList.addElement(newRule);
897    example = (Instance)example.copy();
898    example.setDataset(format);
899    example.setClassValue(newClassValue);
900    return example;
901  }
902
903 /**
904   * Generates a new rule for the decision list
905   * and classifies the new example.
906   *
907   * @param random random number generator
908   * @param example the instance to classify
909   * @return a list of tests
910   * @throws Exception if dataset format not defined
911   */
912  private FastVector generateTestList(Random random, Instance example) 
913   throws Exception {
914
915    Instances format = getDatasetFormat();
916    if (format == null) 
917      throw new Exception("Dataset format not defined.");
918
919    int numTests = getNumAttributes() - getNumIrrelevant();
920    FastVector TestList = new FastVector(numTests);
921    boolean[] irrelevant = getAttList_Irr();
922
923    for (int i = 0; i < getNumAttributes(); i++) {
924      if (!irrelevant[i]) {
925        Test newTest = null;
926        Attribute att = example.attribute(i);
927        if (att.isNumeric()) {
928          double newSplit = random.nextDouble();
929          boolean newNot = newSplit < example.value(i);
930          newTest = new Test(i, newSplit, format, newNot);
931        } else {
932          newTest = new Test(i, example.value(i), format, false);
933        }
934      TestList.addElement (newTest);     
935      }
936    }
937   
938    return TestList;
939  }
940
941 /**
942   * Generates an example with its classvalue set to missing
943   * and binds it to the datasets.
944   *
945   * @param random random number generator
946   * @param format dataset the example gets bind to
947   * @return the generated example
948   * @throws Exception if attribute type not supported
949   */
950  private Instance generateExample(Random random, Instances format) 
951    throws Exception {     
952    double[] attributes;
953    Instance example;
954
955    attributes = new double[getNumAttributes() + 1];
956    for (int i = 0; i < getNumAttributes(); i++) {
957      double value = random.nextDouble();
958      if (format.attribute(i).isNumeric()) {
959        attributes[i] = value; 
960      } else {
961        if (format.attribute(i).isNominal())
962          attributes[i] = (value > 0.5) ? 1.0 : 0.0;
963        else
964          throw new Exception ("Attribute type is not supported.");
965      }
966    }
967    example = new DenseInstance(1.0, attributes);
968    example.setDataset(format);
969    example.setClassMissing();
970
971    return example; 
972  }
973
974 /**
975   * Tries to classify an example.
976   *
977   * @param example the example to classify
978   * @return true if it could be classified
979   * @throws Exception if something goes wrong
980   */
981  private boolean classifyExample(Instance example) throws Exception {
982    double classValue = -1.0; 
983
984    for (Enumeration e = m_DecisionList.elements(); 
985         e.hasMoreElements() && classValue < 0.0;) {
986      RuleList rl = (RuleList) e.nextElement();
987      classValue = rl.classifyInstance(example);   
988    }
989    if (classValue >= 0.0) {
990      example.setClassValue(classValue);
991      return true;
992    } 
993    else {
994      return false;
995    }
996  }
997
998 /**
999   * Classify example with maximum vote the following way.
1000   * With every rule in the decisionlist, it is evaluated if
1001   * the given instance could be the class of the rule.
1002   * Finally the class value that receives the highest number of votes
1003   * is assigned to the example.
1004   *
1005   * @param example example to be reclassified
1006   * @return instance with new class value
1007   * @throws Exception if classification fails
1008   */
1009  private Instance votedReclassifyExample(Instance example) throws Exception {
1010    int classVotes[] = new int [getNumClasses()]; 
1011    for (int i = 0; i < classVotes.length; i++) classVotes[i] = 0; 
1012
1013    for (Enumeration e = m_DecisionList.elements(); 
1014         e.hasMoreElements();) {
1015      RuleList rl = (RuleList) e.nextElement();
1016      int classValue = (int) rl.classifyInstance(example);
1017      if (classValue >= 0) classVotes[classValue]++; 
1018    }
1019    int maxVote = 0;
1020    int vote = -1;
1021    for (int i = 0; i < classVotes.length; i++) {
1022      if (classVotes[i] > maxVote) {
1023        maxVote = classVotes[i];
1024        vote = i; 
1025      }
1026    }
1027    if (vote >= 0)
1028      example.setClassValue((double) vote);
1029    else
1030      throw new Exception ("Error in instance classification.");
1031
1032    return example;
1033  }
1034
1035 /**
1036   * Returns a dataset header.
1037   * @param random random number generator
1038   * @return dataset header
1039   * @throws Exception if something goes wrong
1040   */
1041  private Instances defineDataset(Random random) throws Exception {
1042
1043    boolean[] attList_Irr;
1044    int[] attList_Num;
1045    FastVector attributes = new FastVector();
1046    Attribute attribute;
1047    FastVector nominalValues = new FastVector (2);
1048    nominalValues.addElement("false"); 
1049    nominalValues.addElement("true"); 
1050    FastVector classValues = new FastVector (getNumClasses());
1051    Instances dataset;
1052     
1053    // set randomly those attributes that are irrelevant
1054    attList_Irr = defineIrrelevant(random);
1055    setAttList_Irr(attList_Irr);
1056
1057    // set randomly those attributes that are numeric
1058    attList_Num = defineNumeric(random); 
1059
1060    // define dataset
1061    for (int i = 0; i < getNumAttributes(); i++) {
1062      if (attList_Num[i] == Attribute.NUMERIC)
1063        attribute = new Attribute("a" + i); 
1064      else
1065        attribute = new Attribute("a" + i, nominalValues); 
1066      attributes.addElement(attribute);
1067    }
1068    for (int i = 0; i < getNumClasses(); i++)
1069      classValues.addElement("c" + i);
1070    attribute = new Attribute ("class", classValues); 
1071    attributes.addElement(attribute);
1072
1073    dataset = new Instances(getRelationNameToUse(), attributes,
1074                            getNumExamplesAct());
1075    dataset.setClassIndex(getNumAttributes());
1076
1077    // set dataset format of this class
1078    Instances format = new Instances(dataset, 0);
1079    setDatasetFormat(format);
1080   
1081    return dataset; 
1082  } 
1083
1084 /**
1085   * Defines randomly the attributes as irrelevant.
1086   * Number of attributes to be set as irrelevant is either set
1087   * with a preceeding call of setNumIrrelevant() or is per default 0.
1088   *
1089   * @param random the random number generator to use
1090   * @return list of boolean values with one value for each attribute,
1091   * and each value set true or false according to if the corresponding
1092   * attribute was defined irrelevant or not
1093   */
1094  private boolean[] defineIrrelevant(Random random) {
1095
1096    boolean[] irr = new boolean [getNumAttributes()];
1097 
1098    // initialize
1099    for (int i = 0; i < irr.length; i++)
1100      irr[i] = false;
1101
1102    // set randomly
1103    int numIrr = 0;
1104    for (int i = 0; 
1105         (numIrr < getNumIrrelevant()) && (i < getNumAttributes() * 5);
1106          i++) {
1107      int maybeNext = (int) (random.nextDouble() * (double) irr.length);
1108      if (irr[maybeNext] == false) {
1109        irr [maybeNext] = true;
1110        numIrr++;
1111      }
1112    }
1113   
1114    return irr;
1115  }
1116
1117 /**
1118   * Chooses randomly the attributes that get datatyp numeric.
1119   * @param random the random number generator to use
1120   * @return list of integer values, with one value for each attribute,
1121   * and each value set to Attribut.NOMINAL or Attribut.NUMERIC
1122   */
1123  private int[] defineNumeric(Random random) {
1124   
1125    int[] num = new int [getNumAttributes()];
1126
1127    // initialize
1128    for (int i = 0; i < num.length; i++)
1129      num[i] = Attribute.NOMINAL;
1130
1131    int numNum = 0;
1132    for (int i = 0;
1133         (numNum < getNumNumeric()) && (i < getNumAttributes() * 5); i++) {
1134      int maybeNext = (int) (random.nextDouble() * (double) num.length);
1135      if (num[maybeNext] != Attribute.NUMERIC) {
1136        num[maybeNext] = Attribute.NUMERIC;
1137        numNum++;
1138      }
1139    }
1140   
1141    return num;
1142  }
1143
1144  /**
1145   * Generates a comment string that documentates the data generator.
1146   * By default this string is added at the beginning of the produced output
1147   * as ARFF file type, next after the options.
1148   *
1149   * @return string contains info about the generated rules
1150   */
1151  public String generateStart () {
1152    return "";
1153  }
1154
1155  /**
1156   * Compiles documentation about the data generation. This is the number of
1157   * irrelevant attributes and the decisionlist with all rules.
1158   * Considering that the decisionlist might get enhanced until
1159   * the last instance is generated, this method should be called at the
1160   * end of the data generation process.
1161   *
1162   * @return string with additional information about generated dataset
1163   * @throws Exception no input structure has been defined
1164   */
1165  public String generateFinished() throws Exception {
1166
1167    StringBuffer dLString = new StringBuffer();
1168
1169    // string for output at end of ARFF-File
1170    boolean[] attList_Irr = getAttList_Irr();
1171    Instances format = getDatasetFormat();
1172    dLString.append("%\n% Number of attributes chosen as irrelevant = " +
1173                    getNumIrrelevant() + "\n");
1174    for (int i = 0; i < attList_Irr.length; i++) {
1175      if (attList_Irr[i])
1176        dLString.append("% " + format.attribute(i).name() + "\n");
1177    }
1178
1179    dLString.append("%\n% DECISIONLIST (number of rules = " +
1180                    m_DecisionList.size() + "):\n");
1181     
1182    for (int i = 0; i < m_DecisionList.size(); i++) {
1183      RuleList rl = (RuleList) m_DecisionList.elementAt(i);
1184      dLString.append("% RULE " + i + ": " + rl.toString() + "\n");
1185    }
1186   
1187    return dLString.toString();
1188  }
1189
1190 /**
1191   * Resets the class values of all instances using voting.
1192   * For each instance the class value that satisfies the most rules
1193   * is choosen as new class value.
1194   *
1195   * @param dataset the dataset to work on
1196   * @return the changed instances
1197   * @throws Exception if something goes wrong
1198   */
1199  private Instances voteDataset(Instances dataset) throws Exception {
1200    for (int i = 0; i < dataset.numInstances(); i++) {
1201      Instance inst = dataset.firstInstance();
1202      inst = votedReclassifyExample(inst); 
1203      dataset.add(inst);
1204      dataset.delete(0);
1205    } 
1206
1207    return dataset;
1208  }
1209 
1210  /**
1211   * Returns the revision string.
1212   *
1213   * @return            the revision
1214   */
1215  public String getRevision() {
1216    return RevisionUtils.extract("$Revision: 5987 $");
1217  }
1218
1219  /**
1220   * Main method for testing this class.
1221   *
1222   * @param args should contain arguments for the data producer:
1223   */
1224  public static void main(String[] args) {
1225    runDataGenerator(new RDG1(), args);
1226  }
1227}
Note: See TracBrowser for help on using the repository browser.