source: branches/MetisMQI/src/main/java/weka/datagenerators/DataGenerator.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 21.2 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * DataGenerator.java
19 * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.datagenerators;
24
25import weka.core.Instance;
26import weka.core.Instances;
27import weka.core.Option;
28import weka.core.OptionHandler;
29import weka.core.Randomizable;
30import weka.core.RevisionHandler;
31import weka.core.Utils;
32
33import java.io.FileOutputStream;
34import java.io.PrintWriter;
35import java.io.Serializable;
36import java.io.StringWriter;
37import java.util.Enumeration;
38import java.util.HashSet;
39import java.util.Hashtable;
40import java.util.Random;
41import java.util.Vector;
42
43/**
44 * Abstract superclass for data generators that generate data for
45 * classifiers and clusterers.
46 *
47 * @author FracPete (fracpete at waikato dot ac dot nz)
48 * @version $Revision: 1.8 $
49 */
50public abstract class DataGenerator 
51  implements OptionHandler, Randomizable, Serializable, RevisionHandler {
52
53  /** for serialization */
54  private static final long serialVersionUID = -3698585946221802578L;
55
56  /** Debugging mode */
57  protected boolean m_Debug = false;
58
59  /** The format for the generated dataset */
60  protected Instances m_DatasetFormat = null;
61
62  /** Relation name the dataset should have */
63  protected String m_RelationName = "";
64
65  /** Number of instances that should be produced into the dataset
66   * this number is by default m_NumExamples,
67   * but can be reset by the generator
68   */
69  protected int m_NumExamplesAct;
70
71  /** default output (is printed to stdout after generation) */
72  protected transient StringWriter m_DefaultOutput = new StringWriter();
73
74  /** PrintWriter for outputting the generated data */
75  protected transient PrintWriter m_Output = new PrintWriter(m_DefaultOutput);
76
77  /** random number generator seed*/ 
78  protected int m_Seed;
79
80  /** random number generator*/ 
81  protected Random m_Random = null;
82
83  /** flag, that indicates whether the relationname is currently assembled */
84  protected boolean m_CreatingRelationName = false;
85
86  /** a black list for options not to be listed (for derived generators)
87   *  in the makeOptionString method
88   *  @see #makeOptionString(DataGenerator) */
89  protected static HashSet m_OptionBlacklist;
90  static {
91    m_OptionBlacklist = new HashSet();
92  }
93
94  /**
95   * initializes with default settings. <br/>
96   * Note: default values are set via a default&lt;name&gt; method. These
97   * default methods are also used in the listOptions method and in the
98   * setOptions method. Why? Derived generators can override the return value
99   * of these default methods, to avoid exceptions.
100   */
101  public DataGenerator() {
102    clearBlacklist();
103   
104    setNumExamplesAct(defaultNumExamplesAct());
105    setSeed(defaultSeed());
106  }
107
108  /**
109   * creates a vector out of the enumeration from the listOptions of the
110   * super class. Only a "convenience" method.
111   * @param enm     the Enumeration to dump into a vector
112   * @return        the elements of the enumeration in a vector
113   */
114  protected Vector enumToVector(Enumeration enm) {
115    Vector      result;
116
117    result = new Vector();
118
119    while (enm.hasMoreElements())
120      result.add(enm.nextElement());
121
122    return result;
123  }
124
125  /**
126   * Returns an enumeration describing the available options.
127   *
128   * @return an enumeration of all the available options
129   */
130  public Enumeration listOptions() {
131    Vector      result;
132
133    result = new Vector();
134
135    result.addElement(new Option(
136          "\tPrints this help.",
137          "h", 1, "-h"));
138
139    result.addElement(new Option(
140          "\tThe name of the output file, otherwise the generated data is\n"
141          + "\tprinted to stdout.",
142          "o", 1, "-o <file>"));
143
144    result.addElement(new Option(
145          "\tThe name of the relation.",
146          "r", 1, "-r <name>"));
147
148    result.addElement(new Option(
149          "\tWhether to print debug informations.",
150          "d", 0, "-d"));
151
152    result.addElement(new Option(
153          "\tThe seed for random function (default " 
154          + defaultSeed() + ")",
155          "S", 1, "-S"));
156
157    return result.elements();
158  }
159
160  /**
161   * Parses a list of options for this object. <p/>
162   *
163   * For list of valid options see class description. <p/>
164   *
165   * @param options the list of options as an array of strings
166   * @throws Exception if an option is not supported
167   */
168  public void setOptions(String[] options) throws Exception {
169    String        tmpStr;
170
171    // remove unwanted options
172    options = removeBlacklist(options);
173
174    tmpStr = Utils.getOption('r', options);
175    if (tmpStr.length() != 0)
176      setRelationName(Utils.unquote(tmpStr));
177    else
178      setRelationName("");
179
180    tmpStr = Utils.getOption('o', options);
181    if (tmpStr.length() != 0)
182      setOutput(new PrintWriter(new FileOutputStream(tmpStr)));
183    else if (getOutput() == null)
184      throw new Exception("No Output defined!");
185
186    setDebug(Utils.getFlag('d', options));
187   
188    tmpStr = Utils.getOption('S', options);
189    if (tmpStr.length() != 0)
190      setSeed(Integer.parseInt(tmpStr));
191    else
192      setSeed(defaultSeed());
193  }
194
195  /**
196   * Gets the current settings of the datagenerator RDG1. Removing of
197   * blacklisted options has to be done in the derived class, that defines
198   * the blacklist-entry.
199   *
200   * @return an array of strings suitable for passing to setOptions
201   * @see    #removeBlacklist(String[])
202   */
203  public String[] getOptions() {
204    Vector        result;
205
206    result = new Vector();
207
208    // to avoid endless loop
209    if (!m_CreatingRelationName) {
210      result.add("-r");
211      result.add(Utils.quote(getRelationNameToUse()));
212    }
213
214    if (getDebug())
215      result.add("-d");
216   
217    result.add("-S");
218    result.add("" + getSeed());
219
220    return (String[]) result.toArray(new String[result.size()]);
221  }
222
223  /**
224   * Initializes the format for the dataset produced.
225   * Must be called before the generateExample or generateExamples
226   * methods are used. Also sets a default relation name in case
227   * the current relation name is empty.
228   *
229   * @return the format for the dataset
230   * @throws Exception if the generating of the format failed
231   * @see #defaultRelationName()
232   */
233  public Instances defineDataFormat() throws Exception {
234    if (getRelationName().length() == 0)
235      setRelationName(defaultRelationName());
236
237    return m_DatasetFormat;
238  }
239
  /**
   * Generates one example of the dataset. Called by makeData once per
   * example when the generator runs in single mode.
   *
   * @return the generated example
   * @throws Exception if the format of the dataset is not yet defined, or
   * if the generator only works with generateExamples, which means in
   * non single mode
   */
  public abstract Instance generateExample() throws Exception;
249
  /**
   * Generates all examples of the dataset. Called by makeData when the
   * generator does not run in single mode.
   *
   * @return the generated dataset
   * @throws Exception if the format of the dataset is not yet defined, or
   * if the generator only works with generateExample, which means in
   * single mode
   */
  public abstract Instances generateExamples() throws Exception;
259
  /**
   * Generates a comment string that documents the data generator.
   * makeData prints this string, if not empty, as "Prologue" at the
   * beginning of the produced output, right after the commandline options.
   *
   * @return string contains info about the generated rules
   * @throws Exception if the generating of the documentation fails
   */
  public abstract String generateStart () throws Exception;
269
  /**
   * Generates a comment string that documents the data generator.
   * makeData prints this string, if not empty, as "Epilogue" at the end of
   * the produced output.
   *
   * @return string contains info about the generated rules
   * @throws Exception if the generating of the documentation fails
   */
  public abstract String generateFinished () throws Exception;
279
  /**
   * Returns whether single mode is set for the given data generator; the
   * mode depends on the option settings and/or the generator type. In single
   * mode makeData generates the examples one by one via generateExample,
   * otherwise all at once via generateExamples.
   *
   * @return single mode flag
   * @throws Exception if mode is not set yet
   */
  public abstract boolean getSingleModeFlag () throws Exception;
288
289  /**
290   * Sets the debug flag.
291   * @param debug the new debug flag
292   */
293  public void setDebug(boolean debug) { 
294    m_Debug = debug;
295  }
296
297  /**
298   * Gets the debug flag.
299   * @return the debug flag
300   */
301  public boolean getDebug() { 
302    return m_Debug; 
303  }
304 
  /**
   * Returns the tip text for the debug property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String debugTipText() {
    return "Whether the generator is run in debug mode or not.";
  }
314
315  /**
316   * Sets the relation name the dataset should have.
317   * @param relationName the new relation name
318   */
319  public void setRelationName(String relationName) {
320    m_RelationName = relationName;
321  }
322
323  /**
324   * returns a relation name based on the options
325   *
326   * @return a relation name based on the options
327   */
328  protected String defaultRelationName() {
329    StringBuffer    result;
330    String[]        options;
331    String          option;
332    int             i;
333
334    m_CreatingRelationName = true;
335
336    result = new StringBuffer(this.getClass().getName());
337
338    options = getOptions();
339    for (i = 0; i < options.length; i++) {
340      option = options[i].trim();
341      if (i > 0)
342        result.append("_");
343      result.append(option.replaceAll(" ", "_"));
344    }
345
346    m_CreatingRelationName = false;
347
348    return result.toString();
349  }
350
351  /**
352   * returns the relation name to use, i.e., in case the currently set
353   * relation name is empty, a generic one is returned. Must be used in
354   * defineDataFormat()
355   * @return the relation name
356   * @see #defaultRelationName()
357   * @see #defineDataFormat()
358   */
359  protected String getRelationNameToUse() {
360    String        result;
361
362    result = getRelationName();
363    if (result.length() == 0)
364      result = defaultRelationName();
365
366    return result;
367  }
368
369  /**
370   * Gets the relation name the dataset should have.
371   * @return the relation name the dataset should have
372   */
373  public String getRelationName() { 
374    return m_RelationName;
375  }
376 
  /**
   * Returns the tip text for the relationName property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String relationNameTipText() {
    return "The relation name of the generated data (if empty, a generic one will be supplied).";
  }
386
  /**
   * Returns the default number of actual examples, which is 0 in this base
   * class. Used by the constructor to initialize the generator.
   *
   * @return the default number of actual examples
   */
  protected int defaultNumExamplesAct() {
    return 0;
  }
395
396  /**
397   * Sets the number of examples the dataset should have.
398   * @param numExamplesAct the new number of examples
399   */
400  protected void setNumExamplesAct(int numExamplesAct) { 
401    m_NumExamplesAct = numExamplesAct;
402  }
403
404  /**
405   * Gets the number of examples the dataset should have.
406   * @return the number of examples the dataset should have
407   */
408  public int getNumExamplesAct() { 
409    return m_NumExamplesAct; 
410  }
411 
  /**
   * Returns the tip text for the numExamplesAct property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  protected String numExamplesActTipText() {
    return "The actual number of examples to generate.";
  }
421
422  /**
423   * Sets the print writer.
424   * @param newOutput the new print writer
425   */
426  public void setOutput(PrintWriter newOutput) {
427    m_Output        = newOutput;
428    m_DefaultOutput = null;
429  }
430
431  /**
432   * Gets the print writer.
433   * @return print writer object
434   */
435  public PrintWriter getOutput() { 
436    return m_Output; 
437  }
438
439  /**
440   * Gets the string writer, which is used for outputting to stdout.
441   * A workaround for the problem of closing stdout when closing the
442   * associated Printwriter.
443   * @return print string writer object
444   */
445  public StringWriter defaultOutput() { 
446    return m_DefaultOutput; 
447  }
448 
  /**
   * Returns the tip text for the output property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String outputTipText() {
    return "The output writer to use for printing the generated data.";
  }
458
459  /**
460   * Sets the format of the dataset that is to be generated.
461   * @param newFormat the new dataset format of the dataset
462   */
463  public void setDatasetFormat(Instances newFormat) {
464    m_DatasetFormat = new Instances(newFormat, 0);
465  }
466
467  /**
468   * Gets the format of the dataset that is to be generated.
469   * @return the dataset format of the dataset
470   */
471  public Instances getDatasetFormat() {
472    if (m_DatasetFormat != null)
473      return new Instances(m_DatasetFormat, 0);
474    else
475      return null;
476  }
477 
  /**
   * Returns the tip text for the dataset format property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String formatTipText() {
    return "The data format to use.";
  }
487
  /**
   * Returns the default seed, which is 1 in this base class. Used by the
   * constructor, by listOptions (help text) and by setOptions (in case no
   * -S option is supplied).
   *
   * @return the default seed
   */
  protected int defaultSeed() {
    return 1;
  }
496 
497  /**
498   * Gets the random number seed.
499   *
500   * @return the random number seed.
501   */
502  public int getSeed() { 
503    return m_Seed; 
504  }
505 
506  /**
507   * Sets the random number seed.
508   *
509   * @param newSeed the new random number seed.
510   */
511  public void setSeed(int newSeed) { 
512    m_Seed   = newSeed; 
513    m_Random = new Random(newSeed);
514  }
515 
  /**
   * Returns the tip text for the seed property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "The seed value for the random number generator.";
  }
525
526  /**
527   * Gets the random generator.
528   *
529   * @return the random generator
530   */
531  public Random getRandom() {
532    if (m_Random == null)
533      m_Random = new Random (getSeed());
534
535    return m_Random;
536  }
537 
538  /**
539   * Sets the random generator.
540   *
541   * @param newRandom is the random generator.
542   */
543  public void setRandom(Random newRandom) {
544    m_Random = newRandom;
545  }
546 
  /**
   * Returns the tip text for the random property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String randomTipText() {
    return "The random number generator to use.";
  }
556
557  /**
558   * Returns a string representing the dataset in the instance queue.
559   * @return the string representing the output data format
560   */
561  protected String toStringFormat() {
562    if (m_DatasetFormat == null)
563      return "";
564    return 
565      m_DatasetFormat.toString();
566  }
567
568  /**
569   * removes all entries from the options blacklist
570   */
571  protected static void clearBlacklist() {
572    m_OptionBlacklist.clear();
573  }
574 
575  /**
576   * adds the given option, e.g., for "-V" use "V", to the blacklist of options
577   * that are not to be output via the makeOptionString method
578   * @param option      the option to exclude from listing
579   * @see #makeOptionString(DataGenerator)
580   */
581  protected static void addToBlacklist(String option) {
582    m_OptionBlacklist.add(option);
583  }
584
585  /**
586   * checks, whether the given option is in the blacklist of options not to
587   * be output by makeOptionString
588   * @param option      the option to check
589   * @return true if the option is on the blacklist
590   * @see #makeOptionString(DataGenerator)
591   */
592  protected static boolean isOnBlacklist(String option) {
593    return m_OptionBlacklist.contains(option);
594  }
595
596  /**
597   * removes all the options from the options array that are blacklisted
598   *
599   * @param options the options to remove from the blacklist
600   * @return the processed options array
601   */
602  protected String[] removeBlacklist(String[] options) {
603    Enumeration     enm;
604    Hashtable       pool;
605    Option          option;
606
607    // retrieve options that are on blacklist
608    enm  = listOptions();
609    pool = new Hashtable();
610    while (enm.hasMoreElements()) {
611      option = (Option) enm.nextElement();
612      if (isOnBlacklist(option.name()))
613        pool.put(option.name(), option);
614    }
615
616    // remove options
617    enm = pool.keys();
618    while (enm.hasMoreElements()) {
619      option = (Option) pool.get(enm.nextElement());
620      try {
621        if (option.numArguments() == 0)
622          Utils.getFlag(option.name(), options);
623        else
624          Utils.getOption(option.name(), options);
625      }
626      catch (Exception e) {
627        e.printStackTrace();
628      }
629    }
630
631    return options;
632  }
633
634  /**
635   * returns all the options in a string
636   *
637   * @param generator the DataGenerator to return all the options for
638   * @return the assembled option string
639   */
640  protected static String makeOptionString(DataGenerator generator) {
641    StringBuffer    result;
642    Enumeration     enm;
643    Option          option;
644   
645    result = new StringBuffer();
646    result.append("\nData Generator options:\n\n");
647
648    enm = generator.listOptions();
649    while (enm.hasMoreElements()) {
650      option = (Option) enm.nextElement();
651      // skip option if on blacklist
652      if (isOnBlacklist(option.name()))
653        continue;
654      result.append(option.synopsis() + "\n" + option.description() + "\n");
655    }
656
657    return result.toString();
658  }
659
660  /**
661   * Calls the data generator.
662   *
663   * @param generator one of the data generators
664   * @param options options of the data generator
665   * @throws Exception if there was an error in the option list
666   */
667  public static void makeData(DataGenerator generator, String[] options) 
668    throws Exception {
669
670    boolean     printhelp;
671    Vector      unknown;
672    int         i;
673   
674    // help?
675    printhelp = (Utils.getFlag('h', options));
676
677    // read options
678    if (!printhelp) {
679      try {
680        options = generator.removeBlacklist(options);
681        generator.setOptions(options);
682       
683        // check for left-over options, but don't raise exception
684        unknown = new Vector();
685        for (i = 0; i < options.length; i++) {
686          if (options[i].length() != 0) 
687            unknown.add(options[i]);
688        }
689        if (unknown.size() > 0) {
690          System.out.print("Unknown options:");
691          for (i = 0; i < unknown.size(); i++)
692            System.out.print(" " + unknown.get(i));
693          System.out.println();
694        }
695      }
696      catch (Exception e) {
697        e.printStackTrace();
698        printhelp = true;
699      }
700    }
701   
702    if (printhelp) {
703      System.out.println(makeOptionString(generator));
704      return;
705    }
706   
707    // define dataset format
708    // computes actual number of examples to be produced
709    generator.setDatasetFormat(generator.defineDataFormat());
710
711    // get print writer
712    PrintWriter output = generator.getOutput();
713
714    // output of options
715    output.println("%");
716    output.println("% Commandline");
717    output.println("%");
718    output.println("% " + generator.getClass().getName() + " " 
719                      + Utils.joinOptions(generator.getOptions()));
720    output.println("%");
721
722    // comment at beginning of ARFF File
723    String commentAtStart = generator.generateStart();
724 
725    if (commentAtStart.length() > 0) {
726      output.println("%");
727      output.println("% Prologue");
728      output.println("%");
729      output.println(commentAtStart.trim());
730      output.println("%");
731    }
732
733    // ask data generator which mode
734    boolean singleMode = generator.getSingleModeFlag();
735
736    // start data producer
737    if (singleMode) {
738      // output of dataset header
739      output.println(generator.toStringFormat());
740      for (i = 0; i < generator.getNumExamplesAct(); i++)  {
741        // over all examples to be produced
742        Instance inst = generator.generateExample();
743        output.println(inst);
744      }
745    } 
746    else { // generator produces all instances at once
747      Instances dataset = generator.generateExamples();
748      // output of  dataset
749      output.println(dataset);     
750    }
751    // comment at end of ARFF File
752    String commentAtEnd = generator.generateFinished();
753 
754    if (commentAtEnd.length() > 0) {
755      output.println("%");
756      output.println("% Epilogue");
757      output.println("%");
758      output.println(commentAtEnd.trim());
759      output.println("%");
760    }
761   
762    output.flush();
763    output.close();
764
765    // print result to stdout?
766    if (generator.defaultOutput() != null)
767      System.out.println(generator.defaultOutput().toString());
768  }
769 
770  /**
771   * runs the datagenerator instance with the given options.
772   *
773   * @param datagenerator               the datagenerator to run
774   * @param options     the commandline options
775   */
776  protected static void runDataGenerator(DataGenerator datagenerator, String[] options) {
777    try {
778      DataGenerator.makeData(datagenerator, options);
779    } 
780    catch (Exception e) {
781      if (    (e.getMessage() != null)
782           && (e.getMessage().indexOf("Data Generator options") == -1) )
783        e.printStackTrace();
784      else
785        System.err.println(e.getMessage());
786    }
787  }
788}
Note: See TracBrowser for help on using the repository browser.