source: src/main/java/weka/associations/gsp/Sequence.java @ 4

Last change on this file since 4 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 17.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * Sequence.java
19 * Copyright (C) 2007 Sebastian Beer
20 *
21 */
22
23package weka.associations.gsp;
24
25import weka.core.FastVector;
26import weka.core.Instances;
27import weka.core.RevisionHandler;
28import weka.core.RevisionUtils;
29
30import java.io.Serializable;
31import java.util.Enumeration;
32
33/**
34 * Class representing a sequence of elements/itemsets.
35 *
36 * @author  Sebastian Beer
37 * @version $Revision: 1.2 $
38 */
39public class Sequence
40  implements Cloneable, Serializable, RevisionHandler {
41
42  /** for serialization */
43  private static final long serialVersionUID = -5001018056339156390L;
44
45  /** the support count of the Sequence */
46  protected int m_SupportCount;
47 
48  /** ordered list of the comprised elements/itemsets */
49  protected FastVector m_Elements;
50
51  /**
52   * Constructor.
53   */
54  public Sequence() {
55    m_SupportCount = 0;
56    m_Elements = new FastVector();
57  }
58
59  /**
60   * Constructor accepting a set of elements as parameter.
61   *
62   * @param elements            the Elements of the Sequence
63   */
64  public Sequence(FastVector elements) {
65    m_SupportCount = 0;
66    m_Elements = elements;
67  }
68
69  /**
70   * Constructor accepting an int value as parameter to set the support count.
71   *
72   * @param supportCount        the support count to set
73   */
74  public Sequence(int supportCount) {
75    m_SupportCount = supportCount;
76    m_Elements = new FastVector();
77  }
78
79  /**
80   * Generates all possible candidate k-Sequences and prunes the ones that
81   * contain an infrequent (k-1)-Sequence.
82   *
83   * @param kMinusOneSequences  the set of (k-1)-Sequences, used for verification
84   * @return                    the generated set of k-candidates
85   * @throws CloneNotSupportedException
86   */
87  public static FastVector aprioriGen(FastVector kMinusOneSequences) throws CloneNotSupportedException {
88    FastVector allCandidates = generateKCandidates(kMinusOneSequences);
89    FastVector prunedCandidates = pruneCadidates(allCandidates, kMinusOneSequences);
90
91    return prunedCandidates;
92  }
93
94  /**
95   * Deletes Sequences of a given set which don't meet the minimum support
96   * count threshold.
97   *
98   * @param sequences           the set Sequences to be checked
99   * @param minSupportCount     the minimum support count
100   * @return                    the set of Sequences after deleting
101   */
102  public static FastVector deleteInfrequentSequences(FastVector sequences, long minSupportCount) {
103    FastVector deletedSequences = new FastVector();
104    Enumeration seqEnum = sequences.elements();
105
106    while (seqEnum.hasMoreElements()) {
107      Sequence currentSeq = (Sequence) seqEnum.nextElement();
108      long curSupportCount = currentSeq.getSupportCount();
109
110      if (curSupportCount >= minSupportCount) {
111        deletedSequences.addElement(currentSeq);
112      }
113    }
114    return deletedSequences;
115  }
116
117  /**
118   * Generates candidate k-Sequences on the basis of a given (k-1)-Sequence set.
119   *
120   * @param kMinusOneSequences  the set of (k-1)-Sequences
121   * @return                    the set of candidate k-Sequences
122   * @throws CloneNotSupportedException
123   */
124  protected static FastVector generateKCandidates(FastVector kMinusOneSequences) throws CloneNotSupportedException {
125    FastVector candidates = new FastVector();
126    FastVector mergeResult = new FastVector();
127
128    for (int i = 0; i < kMinusOneSequences.size(); i++) {
129      for (int j = 0; j < kMinusOneSequences.size(); j++) {
130        Sequence originalSeq1 = (Sequence) kMinusOneSequences.elementAt(i);
131        Sequence seq1 = originalSeq1.clone();
132        Sequence originalSeq2 = (Sequence) kMinusOneSequences.elementAt(j);
133        Sequence seq2 = originalSeq2.clone();
134        Sequence subseq1 = seq1.deleteEvent("first");
135        Sequence subseq2 = seq2.deleteEvent("last");
136
137        if (subseq1.equals(subseq2)) {
138          //seq1 and seq2 are 1-sequences
139          if ((subseq1.getElements().size() == 0) && (subseq2.getElements().size() == 0)) {
140            if (i >= j) {
141              mergeResult = merge(seq1, seq2, true, true);
142            } else {
143              mergeResult = merge(seq1, seq2, true, false);
144            }
145            //seq1 and seq2 are k-sequences
146          } else {
147            mergeResult = merge(seq1, seq2, false, false);
148          }
149          candidates.appendElements(mergeResult);
150        }
151      }
152    }
153    return candidates;
154  }
155
156  /**
157   * Merges two Sequences in the course of candidate generation. Differentiates
158   * between merging 1-Sequences and k-Sequences, k > 1.
159   *
160   * @param seq1                Sequence at first position
161   * @param seq2                Sequence at second position
162   * @param oneElements         true, if 1-Elements should be merged, else false
163   * @param mergeElements       true, if two 1-Elements were not already merged
164   *                            (regardless of their position), else false
165   * @return                    set of resulting Sequences
166   */
167  protected static FastVector merge(Sequence seq1, Sequence seq2, boolean oneElements, boolean mergeElements) {
168    FastVector mergeResult = new FastVector();
169
170    //merge 1-sequences
171    if (oneElements) {
172      Element element1 = (Element) seq1.getElements().firstElement();
173      Element element2 = (Element) seq2.getElements().firstElement();
174      Element element3 = null;
175      if (mergeElements) {
176        for (int i = 0; i < element1.getEvents().length; i++) {
177          if (element1.getEvents()[i] > -1) {
178            if (element2.getEvents()[i] > -1) {
179              break;
180            } else {
181              element3 = Element.merge(element1, element2);
182            }
183          }
184        }
185      }
186      FastVector newElements1 = new FastVector();
187      //generate <{x}{y}>
188      newElements1.addElement(element1);
189      newElements1.addElement(element2);
190      mergeResult.addElement(new Sequence(newElements1));
191      //generate <{x,y}>
192      if (element3 != null) {
193        FastVector newElements2 = new FastVector();
194        newElements2.addElement(element3);
195        mergeResult.addElement(new Sequence(newElements2));
196      }
197
198      return mergeResult;
199      //merge k-sequences, k > 1
200    } else {
201      Element lastElementSeq1 = (Element) seq1.getElements().lastElement();
202      Element lastElementSeq2 = (Element) seq2.getElements().lastElement();
203      Sequence resultSeq = new Sequence();
204      FastVector resultSeqElements = resultSeq.getElements();
205
206      //if last two events/items belong to the same element/itemset
207      if (lastElementSeq2.containsOverOneEvent()) {
208        for (int i = 0; i < (seq1.getElements().size()-1); i++) {
209          resultSeqElements.addElement(seq1.getElements().elementAt(i));
210        }
211        resultSeqElements.addElement(Element.merge(lastElementSeq1, lastElementSeq2));
212        mergeResult.addElement(resultSeq);
213
214        return mergeResult;
215        //if last two events/items belong to different elements/itemsets
216      } else {
217        for (int i = 0; i < (seq1.getElements().size()); i++) {
218          resultSeqElements.addElement(seq1.getElements().elementAt(i));
219        }
220        resultSeqElements.addElement(lastElementSeq2);
221        mergeResult.addElement(resultSeq);
222
223        return mergeResult;
224      }
225    }
226  }
227
228  /**
229   * Converts a set of 1-Elements into a set of 1-Sequences.
230   *
231   * @param elements            the set of 1-Elements
232   * @return                    the set of 1-Sequences
233   */
234  public static FastVector oneElementsToSequences(FastVector elements) {
235    FastVector sequences = new FastVector();
236    Enumeration elementEnum = elements.elements();
237
238    while (elementEnum.hasMoreElements()) {
239      Sequence seq = new Sequence();
240      FastVector seqElements = seq.getElements();
241      seqElements.addElement(elementEnum.nextElement());
242      sequences.addElement(seq);
243    }
244    return sequences;
245  }
246
247  /**
248   * Prints a set of Sequences as String output.
249   *
250   * @param setOfSequences      the set of sequences
251   */
252  public static void printSetOfSequences(FastVector setOfSequences) {
253    Enumeration seqEnum = setOfSequences.elements();
254    int i = 1;
255
256    while(seqEnum.hasMoreElements()) {
257      Sequence seq = (Sequence) seqEnum.nextElement();
258      System.out.print("[" + i++ + "]" + " " + seq.toString());
259    }
260  }
261
262  /**
263   * Prunes a k-Sequence of a given candidate set if one of its (k-1)-Sequences
264   * is infrequent.
265   *
266   * @param allCandidates       the set of all potential k-Sequences
267   * @param kMinusOneSequences  the set of (k-1)-Sequences for verification
268   * @return                    the set of the pruned candidates
269   */
270  protected static FastVector pruneCadidates(FastVector allCandidates, FastVector kMinusOneSequences) {
271    FastVector prunedCandidates = new FastVector();
272    boolean isFrequent;
273    //for each candidate
274    for (int i = 0; i < allCandidates.size(); i++) {
275      Sequence candidate = (Sequence) allCandidates.elementAt(i);
276      isFrequent = true;
277      FastVector canElements = candidate.getElements();
278      //generate each possible (k-1)-sequence and verify if it's frequent
279      for (int j = 0; j < canElements.size(); j++) {
280        if(isFrequent) {
281          Element origElement = (Element) canElements.elementAt(j);
282          int[] origEvents = origElement.getEvents();
283
284          for (int k = 0; k < origEvents.length; k++) {
285            if (origEvents[k] > -1) {
286              int helpEvent = origEvents[k];
287              origEvents[k] = -1;
288
289              if (origElement.isEmpty()) {
290                canElements.removeElementAt(j);
291                //check if the (k-1)-sequence is contained in the set of kMinusOneSequences
292                int containedAt = kMinusOneSequences.indexOf(candidate);
293                if (containedAt != -1) {
294                  origEvents[k] = helpEvent;
295                  canElements.insertElementAt(origElement, j);
296                  break;
297                } else {
298                  isFrequent = false;
299                  break;
300                }
301              } else {
302                //check if the (k-1)-sequence is contained in the set of kMinusOneSequences
303                int containedAt = kMinusOneSequences.indexOf(candidate);
304                if (containedAt != -1) {
305                  origEvents[k] = helpEvent;
306                  continue;
307                } else {
308                  isFrequent = false;
309                  break;
310                }
311              }
312            }
313          }
314        } else {
315          break;
316        }
317      }
318      if (isFrequent) {
319        prunedCandidates.addElement(candidate);
320      }
321    }
322    return prunedCandidates;
323  }
324
325  /**
326   * Returns a String representation of a set of Sequences where the numeric
327   * value of each event/item is represented by its respective nominal value.
328   *
329   * @param setOfSequences      the set of Sequences
330   * @param dataSet             the corresponding data set containing the header
331   *                            information
332   * @param filterAttributes    the attributes to filter out
333   * @return                    the String representation
334   */
335  public static String setOfSequencesToString(FastVector setOfSequences, Instances dataSet, FastVector filterAttributes) {
336    StringBuffer resString = new StringBuffer();
337    Enumeration SequencesEnum = setOfSequences.elements();
338    int i = 1;
339    boolean printSeq;
340
341    while(SequencesEnum.hasMoreElements()) {
342      Sequence seq = (Sequence) SequencesEnum.nextElement();
343      Integer filterAttr = (Integer) filterAttributes.elementAt(0);
344      printSeq = true;
345
346      if (filterAttr.intValue() != -1) {
347        for (int j=0; j < filterAttributes.size(); j++) {
348          filterAttr = (Integer) filterAttributes.elementAt(j);
349          FastVector seqElements = seq.getElements();
350
351          if (printSeq) {
352            for (int k=0; k < seqElements.size(); k++) {
353              Element currentElement = (Element) seqElements.elementAt(k);
354              int[] currentEvents = currentElement.getEvents();
355
356              if (currentEvents[filterAttr.intValue()] != -1) {
357                continue;
358              } else {
359                printSeq = false;
360                break;
361              }
362            }
363          }
364        }
365      }
366      if (printSeq) {
367        resString.append("[" + i++ + "]" + " " + seq.toNominalString(dataSet));
368      }
369    }
370    return resString.toString();
371  }
372
373  /**
374   * Updates the support count of a set of Sequence candidates according to a
375   * given set of data sequences.
376   *
377   * @param candidates          the set of candidates
378   * @param dataSequences       the set of data sequences
379   */
380  public static void updateSupportCount(FastVector candidates, FastVector dataSequences) {
381    Enumeration canEnumeration = candidates.elements();
382
383    while(canEnumeration.hasMoreElements()){
384      Enumeration dataSeqEnumeration = dataSequences.elements();
385      Sequence candidate = (Sequence) canEnumeration.nextElement();
386
387      while(dataSeqEnumeration.hasMoreElements()) {
388        Instances dataSequence = (Instances) dataSeqEnumeration.nextElement();
389
390        if (candidate.isSubsequenceOf(dataSequence)) {
391          candidate.setSupportCount(candidate.getSupportCount() + 1);
392        }
393      }
394    }
395  }
396
397  /**
398   * Returns a deep clone of a Sequence.
399   *
400   * @return            the cloned Sequence
401   */
402  public Sequence clone() {
403    try {
404      Sequence clone = (Sequence) super.clone();
405
406      clone.setSupportCount(m_SupportCount);
407      FastVector cloneElements = new FastVector(m_Elements.size());
408
409      for (int i = 0; i < m_Elements.size(); i++) {
410        Element helpElement = (Element) m_Elements.elementAt(i);
411        cloneElements.addElement(helpElement.clone());
412      }
413      clone.setElements(cloneElements);
414
415      return clone;
416    } catch (CloneNotSupportedException exc) {
417      exc.printStackTrace();
418    }
419    return null;
420  }
421
422  /**
423   * Deletes either the first or the last event/item of a Sequence. If the
424   * deleted event/item is the only value in the Element, it is removed, as well.
425   *
426   * @param position            the position of the event/item (first or last)
427   * @return                    the Sequence with either the first or the last
428   *                            event/item deleted
429   */
430  protected Sequence deleteEvent(String position) {
431    Sequence cloneSeq = clone();
432
433    if (position.equals("first")) {
434      Element element = (Element) cloneSeq.getElements().firstElement();
435      element.deleteEvent("first");
436      if (element.isEmpty()) {
437        cloneSeq.getElements().removeElementAt(0);
438      }
439      return cloneSeq;
440    }
441    if (position.equals("last")) {
442      Element element = (Element) cloneSeq.getElements().lastElement();
443      element.deleteEvent("last");
444      if (element.isEmpty()) {
445        cloneSeq.getElements().removeElementAt(m_Elements.size()-1);
446      }
447      return cloneSeq;
448    }
449    return null;
450  }
451
452  /**
453   * Checks if two Sequences are equal.
454   *
455   * @return                    true, if the two Sequences are equal, else false
456   */
457  public boolean equals(Object obj) {
458    Sequence seq2 = (Sequence) obj;
459    FastVector seq2Elements = seq2.getElements();
460
461    for (int i = 0; i < m_Elements.size(); i++) {
462      Element thisElement = (Element) m_Elements.elementAt(i);
463      Element seq2Element = (Element) seq2Elements.elementAt(i);
464      if (!thisElement.equals(seq2Element)) {
465        return false;
466      }
467    }
468    return true;
469  }
470
471  /**
472   * Returns the Elements of the Sequence.
473   *
474   * @return                    the Elements
475   */
476  protected FastVector getElements() {
477    return m_Elements;
478  }
479
480  /**
481   * Returns the support count of the Sequence.
482   *
483   * @return                    the support count
484   */
485  protected int getSupportCount() {
486    return m_SupportCount;
487  }
488
489  /**
490   * Checks if the Sequence is subsequence of a given data sequence.
491   *
492   * @param dataSequence        the data sequence to verify against
493   * @return                    true, if the Sequnce is subsequence of the data
494   *                            sequence, else false
495   */
496  protected boolean isSubsequenceOf(Instances dataSequence) {
497    FastVector elements = getElements();
498    Enumeration elementEnum = elements.elements();
499    Element curElement = (Element) elementEnum.nextElement();
500
501    for (int i = 0; i < dataSequence.numInstances(); i++) {
502      if (curElement.isContainedBy(dataSequence.instance(i))) {
503        if (!elementEnum.hasMoreElements()) {
504          return true;
505        } else {
506          curElement = (Element) elementEnum.nextElement();
507          continue;
508        }
509      }
510    }
511    return false;
512  }
513
514  /**
515   * Sets the Elements of the Sequence.
516   *
517   * @param elements            the Elements to set
518   */
519  protected void setElements(FastVector elements) {
520    m_Elements = elements;
521  }
522
523  /**
524   * Sets the support count of the Sequence.
525   *
526   * @param supportCount        the support count to set
527   */
528  protected void setSupportCount(int supportCount) {
529    m_SupportCount = supportCount;
530  }
531
532  /**
533   * Returns a String representation of a Sequences where the numeric value
534   * of each event/item is represented by its respective nominal value.
535   *
536   * @param dataSet             the corresponding data set containing the header
537   *                            information
538   * @return                    the String representation
539   */
540  public String toNominalString(Instances dataSet) {
541    String result = "";
542
543    result += "<";
544
545    for (int i = 0; i < m_Elements.size(); i++) {
546      Element element = (Element) m_Elements.elementAt(i);
547      result += element.toNominalString(dataSet);
548    }
549    result += "> (" + getSupportCount() + ")\n";
550
551    return result;
552  }
553
554  /**
555   * Returns a String representation of a Sequence.
556   *
557   * @return                    the String representation
558   */
559  public String toString() {
560    String result = "";
561
562    result += "Sequence Output\n";
563    result += "------------------------------\n";
564    result += "Support Count: " + getSupportCount() + "\n";
565    result += "contained elements/itemsets:\n";
566
567    for (int i = 0; i < m_Elements.size(); i++) {
568      Element element = (Element) m_Elements.elementAt(i);
569      result += element.toString();
570    }
571    result += "\n\n";
572
573    return result;
574  }
575 
576  /**
577   * Returns the revision string.
578   *
579   * @return            the revision
580   */
581  public String getRevision() {
582    return RevisionUtils.extract("$Revision: 1.2 $");
583  }
584}
Note: See TracBrowser for help on using the repository browser.