source: src/main/java/weka/associations/GeneralizedSequentialPatterns.java @ 11

Last change on this file since 11 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 18.1 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * GeneralizedSequentialPatterns.java
19 * Copyright (C) 2007 Sebastian Beer
20 *
21 */
22
23package weka.associations;
24
25import weka.associations.gsp.Element;
26import weka.associations.gsp.Sequence;
27import weka.core.Attribute;
28import weka.core.Capabilities;
29import weka.core.FastVector;
30import weka.core.Instances;
31import weka.core.Option;
32import weka.core.OptionHandler;
33import weka.core.RevisionUtils;
34import weka.core.TechnicalInformation;
35import weka.core.TechnicalInformationHandler;
36import weka.core.Utils;
37import weka.core.Capabilities.Capability;
38import weka.core.TechnicalInformation.Field;
39import weka.core.TechnicalInformation.Type;
40
41import java.text.SimpleDateFormat;
42import java.util.Date;
43import java.util.Enumeration;
44import java.util.Vector;
45
46/**
47 <!-- globalinfo-start -->
48 * Class implementing a GSP algorithm for discovering sequential patterns in a sequential data set.<br/>
49 * The attribute identifying the distinct data sequences contained in the set can be determined by the respective option. Furthermore, the set of output results can be restricted by specifying one or more attributes that have to be contained in each element/itemset of a sequence.<br/>
50 * <br/>
51 * For further information see:<br/>
52 * <br/>
53 * Ramakrishnan Srikant, Rakesh Agrawal (1996). Mining Sequential Patterns: Generalizations and Performance Improvements.
54 * <p/>
55 <!-- globalinfo-end -->
56 *
57 <!-- technical-bibtex-start -->
58 * BibTeX:
59 * <pre>
60 * &#64;proceedings{Srikant1996,
61 *    author = {Ramakrishnan Srikant and Rakesh Agrawal},
62 *    booktitle = {Advances in Database Technology EDBT '96},
63 *    publisher = {Springer},
64 *    title = {Mining Sequential Patterns: Generalizations and Performance Improvements},
65 *    year = {1996}
66 * }
67 * </pre>
68 * <p/>
69 <!-- technical-bibtex-end -->
70 *
71 <!-- options-start -->
72 * Valid options are: <p/>
73 *
74 * <pre> -D
75 *  If set, algorithm is run in debug mode and
76 *  may output additional info to the console</pre>
77 *
78 * <pre> -S &lt;minimum support threshold&gt;
79 *  The miminum support threshold.
80 *  (default: 0.9)</pre>
81 *
82 * <pre> -I &lt;attribute number representing the data sequence ID
83 *  The attribute number representing the data sequence ID.
84 *  (default: 0)</pre>
85 *
86 * <pre> -F &lt;attribute numbers used for result filtering
87 *  The attribute numbers used for result filtering.
88 *  (default: -1)</pre>
89 *
90 <!-- options-end -->
91 *
92 * @author  Sebastian Beer
93 * @version $Revision: 5444 $
94 */
95public class GeneralizedSequentialPatterns
96  extends AbstractAssociator
97  implements OptionHandler, TechnicalInformationHandler {
98
99  /** for serialization */
100  private static final long serialVersionUID = -4119691320812254676L;
101
102  /** the minimum support threshold */
103  protected double m_MinSupport; 
104
105  /** number indicating the attribute holding the data sequence ID */
106  protected int m_DataSeqID;
107
108  /** original sequential data set to be used for sequential patterns extraction */
109  protected Instances m_OriginalDataSet;
110 
111  /** all generated frequent sequences, i.e. sequential patterns */
112  protected FastVector m_AllSequentialPatterns;
113 
114  /** number of cycles performed until termination */
115  protected int m_Cycles;
116 
117  /** String indicating the starting time of an cycle. */
118  protected String m_CycleStart;
119 
120  /** String indicating the ending time of an cycle. */
121  protected String m_CycleEnd;
122 
123  /** String indicating the starting time of the algorithm. */
124  protected String m_AlgorithmStart;
125 
126  /** String containing the attribute numbers that are used for result
127   * filtering; -1 means no filtering */
128  protected String m_FilterAttributes;
129 
130  /** Vector containing the attribute numbers that are used for result
131   * filtering; -1 means no filtering */
132  protected FastVector m_FilterAttrVector;
133 
134  /** Whether the classifier is run in debug mode. */
135  protected boolean m_Debug = false;
136
137  /**
138   * Constructor.
139   */
140  public GeneralizedSequentialPatterns() {
141    resetOptions();
142  }
143
144  /**
145   * Returns global information about the algorithm.
146   *
147   * @return                    the global information
148   */
149  public String globalInfo() {
150    return 
151        "Class implementing a GSP algorithm for discovering sequential "
152      + "patterns in a sequential data set.\n" 
153      + "The attribute identifying the distinct data sequences contained in "
154      + "the set can be determined by the respective option. Furthermore, the "
155      + "set of output results can be restricted by specifying one or more "
156      + "attributes that have to be contained in each element/itemset of a "
157      + "sequence.\n\n" 
158      + "For further information see:\n\n" 
159      + getTechnicalInformation().toString();
160  }
161
162  /**
163   * Returns TechnicalInformation about the paper related to the algorithm.
164   *
165   * @return                    the TechnicalInformation
166   */
167  public TechnicalInformation getTechnicalInformation() {       
168    TechnicalInformation paper = new TechnicalInformation(Type.PROCEEDINGS);
169
170    paper.setValue(Field.AUTHOR, "Ramakrishnan Srikant and Rakesh Agrawal");
171    paper.setValue(Field.TITLE, "Mining Sequential Patterns: Generalizations and Performance Improvements");
172    paper.setValue(Field.BOOKTITLE, "Advances in Database Technology EDBT '96");
173    paper.setValue(Field.YEAR, "1996");
174    paper.setValue(Field.PUBLISHER, "Springer");
175
176    return paper;
177  }
178
179  /**
180   * Returns an enumeration of the available options.
181   *
182   * @return                    the available options
183   */
184  public Enumeration listOptions() {
185    Vector result = new Vector();
186
187    result.addElement(new Option(
188        "\tIf set, algorithm is run in debug mode and\n"
189        + "\tmay output additional info to the console",
190        "D", 0, "-D"));
191   
192    result.addElement(new Option(
193        "\tThe miminum support threshold.\n"
194        + "\t(default: 0.9)",
195        "S", 1, "-S <minimum support threshold>"));
196   
197    result.addElement(new Option(
198        "\tThe attribute number representing the data sequence ID.\n"
199        + "\t(default: 0)",
200        "I", 1, "-I <attribute number representing the data sequence ID"));
201
202    result.addElement(new Option(
203        "\tThe attribute numbers used for result filtering.\n"
204        + "\t(default: -1)",
205        "F", 1, "-F <attribute numbers used for result filtering"));
206
207    return result.elements();
208  }
209
210  /**
211   * Parses a given list of options. <p/>
212   *
213   <!-- options-start -->
214   * Valid options are: <p/>
215   *
216   * <pre> -D
217   *  If set, algorithm is run in debug mode and
218   *  may output additional info to the console</pre>
219   *
220   * <pre> -S &lt;minimum support threshold&gt;
221   *  The miminum support threshold.
222   *  (default: 0.9)</pre>
223   *
224   * <pre> -I &lt;attribute number representing the data sequence ID
225   *  The attribute number representing the data sequence ID.
226   *  (default: 0)</pre>
227   *
228   * <pre> -F &lt;attribute numbers used for result filtering
229   *  The attribute numbers used for result filtering.
230   *  (default: -1)</pre>
231   *
232   <!-- options-end -->
233   *
234   * @param options             the Array containing the options
235   */
236  public void setOptions(String[] options) throws Exception {
237    String      tmpStr;
238 
239    resetOptions();
240
241    setDebug(Utils.getFlag('D', options));
242   
243    tmpStr = Utils.getOption('S', options);
244    if (tmpStr.length() != 0)
245      setMinSupport(Double.parseDouble(tmpStr));
246
247    tmpStr = Utils.getOption('I', options);
248    if (tmpStr.length() != 0)
249      setDataSeqID(Integer.parseInt(tmpStr));
250
251    tmpStr = Utils.getOption('F', options);
252    if (tmpStr.length() != 0)
253      setFilterAttributes(tmpStr);
254  }
255
256  /**
257   * Returns an Array containing the current options settings.
258   *
259   * @return                    the Array containing the settings
260   */
261  public String[] getOptions() {
262    Vector<String>      result;
263   
264    result = new Vector<String>();
265
266    if (getDebug())
267      result.add("-D");
268   
269    result.add("-S");
270    result.add("" + getMinSupport());
271
272    result.add("-I");
273    result.add("" + getDataSeqID());
274   
275    result.add("-F");
276    result.add(getFilterAttributes());
277
278    return result.toArray(new String[result.size()]);
279  }
280
281  /**
282   * Resets the algorithm's options to the default values.
283   */
284  protected void resetOptions() {
285    m_MinSupport       = 0.9;
286    m_DataSeqID        = 0;
287    m_FilterAttributes = "-1";
288  }
289
290  /**
291   * Returns the Capabilities of the algorithm.
292   *
293   * @return                    the Capabilities
294   */
295  public Capabilities getCapabilities() {
296    Capabilities result = super.getCapabilities();
297    result.disableAll();
298
299    result.enable(Capability.NOMINAL_ATTRIBUTES);
300    result.enable(Capability.NO_CLASS);
301   
302    return result;
303  }
304
305  /**
306   * Extracts all sequential patterns out of a given sequential data set and
307   * prints out the results.
308   *
309   * @param data        the original data set
310   */
311  public void buildAssociations(Instances data) throws Exception {
312    // can associator handle the data?
313    getCapabilities().testWithFail(data);
314
315    m_AllSequentialPatterns = new FastVector();
316    m_Cycles                = 0;
317    m_FilterAttrVector      = new FastVector();
318    m_AlgorithmStart        = getTimeAndDate();
319    m_OriginalDataSet       = new Instances(data);
320   
321    extractFilterAttributes(m_FilterAttributes);
322    findFrequentSequences();
323  }
324
325  /**
326   * Calculates the total number of extracted frequent sequences.
327   *
328   * @return                    the total number of frequent sequences
329   */
330  protected int calcFreqSequencesTotal() {
331    int total = 0;
332    Enumeration allSeqPatternsEnum = m_AllSequentialPatterns.elements();
333
334    while (allSeqPatternsEnum.hasMoreElements()) {
335      FastVector kSequences = (FastVector) allSeqPatternsEnum.nextElement();
336      total += kSequences.size();
337    }
338
339    return total;
340  }
341
342  /**
343   * Extracts the data sequences out of the original data set according to
344   * their sequence id attribute, which is removed after extraction.
345   *
346   * @param originalDataSet     the original data set
347   * @param dataSeqID           the squence ID to use
348   * @return                    set of distinct data sequences
349   */
350  protected FastVector extractDataSequences (Instances originalDataSet, int dataSeqID) {
351    FastVector dataSequences = new FastVector();
352    int firstInstance = 0;
353    int lastInstance = 0;
354    Attribute seqIDAttribute = originalDataSet.attribute(dataSeqID);
355
356    for (int i = 0; i < seqIDAttribute.numValues(); i++) {
357      double sequenceID = originalDataSet.instance(firstInstance).value(dataSeqID);
358      while (lastInstance < originalDataSet.numInstances()
359          && sequenceID == originalDataSet.instance(lastInstance).value(dataSeqID)) {
360        lastInstance++;
361      }
362      Instances dataSequence = new Instances(originalDataSet, firstInstance, (lastInstance)-firstInstance);
363      dataSequence.deleteAttributeAt(dataSeqID);
364      dataSequences.addElement(dataSequence);
365      firstInstance = lastInstance;
366    }
367    return dataSequences;
368  }
369
370  /**
371   * Parses a given String containing attribute numbers which are used for
372   * result filtering.
373   *
374   * @param attrNumbers         the String of attribute numbers
375   */
376  public void extractFilterAttributes(String attrNumbers) {
377    String numbers = attrNumbers.trim();
378
379    while (!numbers.equals("")) {
380      int commaLoc = numbers.indexOf(',');
381
382      if (commaLoc != -1) {
383        String number = numbers.substring(0, commaLoc);
384        numbers = numbers.substring(commaLoc + 1).trim();
385        m_FilterAttrVector.addElement(Integer.decode(number));
386      } else {
387        m_FilterAttrVector.addElement(Integer.decode(numbers));
388        break;
389      }
390    }
391  }
392
393  /**
394   * The actual method for extracting frequent sequences.
395   *
396   * @throws CloneNotSupportedException
397   */
398  protected void findFrequentSequences() throws CloneNotSupportedException {
399    m_CycleStart = getTimeAndDate();
400    Instances originalDataSet = m_OriginalDataSet;
401    FastVector dataSequences = extractDataSequences(m_OriginalDataSet, m_DataSeqID);
402    long minSupportCount = Math.round(m_MinSupport * dataSequences.size());
403    FastVector kMinusOneSequences;
404    FastVector kSequences;
405
406    originalDataSet.deleteAttributeAt(0);
407    FastVector oneElements = Element.getOneElements(originalDataSet);
408    m_Cycles = 1;
409
410    kSequences = Sequence.oneElementsToSequences(oneElements);
411    Sequence.updateSupportCount(kSequences, dataSequences);
412    kSequences = Sequence.deleteInfrequentSequences(kSequences, minSupportCount);
413
414    m_CycleEnd = getTimeAndDate();
415
416    if (kSequences.size() == 0) {
417      return;
418    }
419    while (kSequences.size() > 0) {
420      m_CycleStart = getTimeAndDate();
421
422      m_AllSequentialPatterns.addElement(kSequences.copy());
423      kMinusOneSequences = kSequences;
424      kSequences = Sequence.aprioriGen(kMinusOneSequences);
425      Sequence.updateSupportCount(kSequences, dataSequences);
426      kSequences = Sequence.deleteInfrequentSequences(kSequences, minSupportCount);
427
428      m_CycleEnd = getTimeAndDate();
429     
430      if (getDebug())
431        System.out.println(
432            "Cycle " + m_Cycles + " from " + m_CycleStart + " to " + m_CycleEnd);
433     
434      m_Cycles++;
435    }
436  }
437
438  /**
439   * Returns the dataSeqID option tip text for the Weka GUI.
440   *
441   * @return                    the option tip text
442   */
443  public String dataSeqIDTipText() {
444    return "The attribute number representing the data sequence ID.";
445  }
446
447  /**
448   * Returns the attribute representing the data sequence ID.
449   *
450   * @return                    the data sequence ID
451   */
452  public int getDataSeqID() {
453    return m_DataSeqID;
454  }
455
456  /**
457   * Sets the attribute representing the data sequence ID.
458   *
459   * @param value               the data sequence ID to set
460   */
461  public void setDataSeqID(int value) {
462    m_DataSeqID = value;
463  }
464
465  /**
466   * Returns the filterAttributes option tip text for the Weka GUI.
467   *
468   * @return                    the option tip text
469   */
470  public String filterAttributesTipText() {
471    return 
472        "The attribute numbers (eg \"0, 1\") used for result filtering; only "
473      + "sequences containing the specified attributes in each of their "
474      + "elements/itemsets will be output; -1 prints all.";
475  }
476
477  /**
478   * Returns the String containing the attributes which are used for output
479   * filtering.
480   *
481   * @return                    the String containing the attributes
482   */
483  public String getFilterAttributes() {
484    return m_FilterAttributes;
485  }
486
487  /**
488   * Sets the String containing the attributes which are used for output
489   * filtering.
490   *
491   * @param value               the String containing the attributes
492   */
493  public void setFilterAttributes(String value) {
494    m_FilterAttributes = value;
495  }
496
497  /**
498   * Returns the minimum support option tip text for the Weka GUI.
499   *
500   * @return                    the option tip text
501   */
502  public String minSupportTipText() {
503    return "Minimum support threshold.";
504  }
505
506  /**
507   * Returns the minimum support threshold.
508   *
509   * @return                    the minimum support threshold
510   */
511  public double getMinSupport() {
512    return m_MinSupport;
513  }
514
515  /**
516   * Sets the minimum support threshold.
517   *
518   * @param value               the minimum support threshold
519   */
520  public void setMinSupport(double value) {
521    m_MinSupport = value;
522  }
523
524  /**
525   * Set debugging mode.
526   *
527   * @param value               true if debug output should be printed
528   */
529  public void setDebug(boolean value) {
530    m_Debug = value;
531  }
532
533  /**
534   * Get whether debugging is turned on.
535   *
536   * @return                    true if debugging output is on
537   */
538  public boolean getDebug() {
539    return m_Debug;
540  }
541 
542  /**
543   * Returns the tip text for this property
544   *
545   * @return                    tip text for this property suitable for
546   *                            displaying in the explorer/experimenter gui
547   */
548  public String debugTipText() {
549    return "If set to true, algorithm may output additional info to the console.";
550  }
551
552  /**
553   * Returns the current time and date.
554   *
555   * @return                    the time and date
556   */
557  protected String getTimeAndDate() {
558    SimpleDateFormat    dateFormat;
559   
560    dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
561   
562    return dateFormat.format(new Date());
563  }
564
565  /**
566   * Returns the time/date string the algorithm was started
567   *
568   * @return                    the time and date the algorithm was started
569   */
570  public String getAlgorithmStart() {
571    return m_AlgorithmStart;
572  }
573
574  /**
575   * Returns the time/date string the cycle was started
576   *
577   * @return                    the time and date the cycle was started
578   */
579  public String getCycleStart() {
580    return m_CycleStart;
581  }
582
583  /**
584   * Returns the time/date string the cycle ended
585   *
586   * @return                    the time and date the cycle ended
587   */
588  public String getCycleEnd() {
589    return m_CycleEnd;
590  }
591 
592  /**
593   * Returns a String containing the result information of the algorithm.
594   *
595   * @return                    the String containing the result information
596   */
597  public String toString() {
598    StringBuffer result = new StringBuffer();
599
600    result.append("GeneralizedSequentialPatterns\n");
601    result.append("=============================\n\n");
602    result.append("Number of cycles performed: " + (m_Cycles-1) + "\n");
603    result.append("Total number of frequent sequences: " + calcFreqSequencesTotal() + "\n\n");
604    result.append("Frequent Sequences Details (filtered):\n\n");
605    for (int i = 0; i < m_AllSequentialPatterns.size(); i++) {
606      result.append("- " + (i+1) + "-sequences\n\n");
607      FastVector kSequences = (FastVector) m_AllSequentialPatterns.elementAt(i);
608      result.append(Sequence.setOfSequencesToString(kSequences, m_OriginalDataSet, m_FilterAttrVector) + "\n");
609    }
610
611    return result.toString();
612  }
613 
614  /**
615   * Returns the revision string.
616   *
617   * @return            the revision
618   */
619  public String getRevision() {
620    return RevisionUtils.extract("$Revision: 5444 $");
621  }
622
623  /**
624   * Main method.
625   *
626   * @param args        commandline options, use -h for help
627   */
628  public static void main(String[] args) {
629    runAssociator(new GeneralizedSequentialPatterns(), args);
630  }
631}
Note: See TracBrowser for help on using the repository browser.