source: src/main/java/weka/classifiers/rules/RuleStats.java @ 9

Last change on this file since 9 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 29.2 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RuleStats.java
19 *    Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.classifiers.rules;
23
24import weka.core.Attribute;
25import weka.core.FastVector;
26import weka.core.Instance;
27import weka.core.Instances;
28import weka.core.RevisionHandler;
29import weka.core.RevisionUtils;
30import weka.core.Utils;
31
32import java.io.Serializable;
33import java.util.Enumeration;
34import java.util.Random;
35
36/**
37 * This class implements the statistics functions used in the
38 * propositional rule learner, from the simpler ones like count of
39 * true/false positive/negatives, filter data based on the ruleset, etc.
40 * to the more sophisticated ones such as MDL calculation and rule
41 * variants generation for each rule in the ruleset. <p>
42 *
43 * Obviously the statistics functions listed above need the specific
44 * data and the specific ruleset, which are given in order to instantiate
45 * an object of this class. <p>
46 * 
47 * @author Xin Xu (xx5@cs.waikato.ac.nz)
48 * @version $Revision: 4608 $
49 */
50public class RuleStats 
51  implements Serializable, RevisionHandler {
52 
53  /** for serialization */
54  static final long serialVersionUID = -5708153367675298624L;
55
56  /** The data on which the stats calculation is based */
57  private Instances m_Data;
58
59  /** The specific ruleset in question */
60  private FastVector m_Ruleset;
61
62  /** The simple stats of each rule */
63  private FastVector m_SimpleStats;
64
65  /** The set of instances filtered by the ruleset */
66  private FastVector m_Filtered;
67   
68  /** The total number of possible conditions that could
69   *  appear in a rule */
70  private double m_Total;
71 
72  /** The redundancy factor in theory description length */     
73  private static double REDUNDANCY_FACTOR = 0.5;
74   
75  /** The theory weight in the MDL calculation */
76  private double MDL_THEORY_WEIGHT = 1.0;
77
78    /** The class distributions predicted by each rule */
79    private FastVector m_Distributions;
80
81  /** Default constructor */
82  public RuleStats(){
83    m_Data = null;
84    m_Ruleset = null;
85    m_SimpleStats = null;
86    m_Filtered = null;
87    m_Distributions = null;
88    m_Total = -1;
89  }
90
91   
92  /**
93   * Constructor that provides ruleset and data
94   *
95   * @param data the data
96   * @param rules the ruleset
97   */
98  public RuleStats(Instances data, FastVector rules){
99    this();
100    m_Data = data;
101    m_Ruleset = rules; 
102  }
103
104  /**
105   * Frees up memory after classifier has been built.
106   */
107  public void cleanUp() {
108    m_Data     = null;
109    m_Filtered = null;
110  }
111
112  /**
113   * Set the number of all conditions that could appear
114   * in a rule in this RuleStats object, if the number set
115   * is smaller than 0 (typically -1), then it calcualtes
116   * based on the data store
117   *
118   * @param total the set number
119   */
120  public void setNumAllConds(double total){
121    if(total < 0)
122      m_Total = numAllConditions(m_Data);   
123    else
124      m_Total = total;
125  }
126   
127  /**
128   * Set the data of the stats, overwriting the old one if any
129   *
130   * @param data the data to be set
131   */
132  public void setData(Instances data){
133    m_Data = data;
134  }
135   
136  /**
137   * Get the data of the stats
138   *
139   * @return the data
140   */
141  public Instances getData(){
142    return m_Data;
143  }
144
145   
146  /**
147   * Set the ruleset of the stats, overwriting the old one if any
148   *     
149   * @param rules the set of rules to be set
150   */
151  public void setRuleset(FastVector rules){
152    m_Ruleset = rules;
153  }
154
155   
156  /**
157   * Get the ruleset of the stats
158   *     
159   * @return the set of rules
160   */
161  public FastVector getRuleset(){
162    return m_Ruleset;
163  }
164
165  /**
166   * Get the size of the ruleset in the stats
167   *     
168   * @return the size of ruleset
169   */
170  public int getRulesetSize(){
171    return m_Ruleset.size();
172  }
173 
174  /**
175   * Get the simple stats of one rule, including 6 parameters:
176   * 0: coverage; 1:uncoverage; 2: true positive; 3: true negatives;
177   * 4: false positives; 5: false negatives
178   *
179   * @param index the index of the rule
180   * @return the stats
181   */
182  public double[] getSimpleStats(int index){
183    if((m_SimpleStats != null) && (index < m_SimpleStats.size()))
184      return (double[])m_SimpleStats.elementAt(index);
185       
186    return null;
187  }   
188
189
190  /**
191   * Get the data after filtering the given rule
192   *
193   * @param index the index of the rule
194   * @return the data covered and uncovered by the rule
195   */
196  public Instances[] getFiltered(int index){
197       
198    if((m_Filtered != null) && (index < m_Filtered.size()))
199      return (Instances[])m_Filtered.elementAt(index);
200       
201    return null;
202  }   
203
204    /**
205     * Get the class distribution predicted by the rule in
206     * given position
207     *
208     * @param index the position index of the rule
209     * @return the class distributions
210     */
211    public double[] getDistributions(int index){
212       
213        if((m_Distributions != null) && (index < m_Distributions.size()))
214            return (double[])m_Distributions.elementAt(index);
215       
216        return null;
217    }   
218   
219  /**
220   * Set the weight of theory in MDL calcualtion
221   *
222   * @param weight the weight to be set
223   */
224  public void setMDLTheoryWeight(double weight){
225    MDL_THEORY_WEIGHT = weight;
226  } 
227
228  /**
229   * Compute the number of all possible conditions that could
230   * appear in a rule of a given data.  For nominal attributes,
231   * it's the number of values that could appear; for numeric
232   * attributes, it's the number of values * 2, i.e. <= and >=
233   * are counted as different possible conditions.
234   *
235   * @param data the given data
236   * @return number of all conditions of the data
237   */
238  public static double numAllConditions(Instances data){
239    double total = 0;
240    Enumeration attEnum = data.enumerateAttributes();   
241    while(attEnum.hasMoreElements()){
242      Attribute att= (Attribute)attEnum.nextElement();
243      if(att.isNominal())
244        total += (double)att.numValues();
245      else
246        total += 2.0 * (double)data.numDistinctValues(att);     
247    }
248    return total;
249  }
250
251
252  /**
253   * Filter the data according to the ruleset and compute the basic
254   * stats: coverage/uncoverage, true/false positive/negatives of
255   * each rule
256   */
257  public void countData(){
258    if((m_Filtered != null) ||
259       (m_Ruleset == null)  ||
260       (m_Data == null))
261      return;
262       
263    int size = m_Ruleset.size();
264    m_Filtered = new FastVector(size);
265    m_SimpleStats = new FastVector(size);
266    m_Distributions = new FastVector(size);
267    Instances data = new Instances(m_Data);
268       
269    for(int i=0; i < size; i++){
270      double[] stats = new double[6];  // 6 statistics parameters
271      double[] classCounts = new double[m_Data.classAttribute().numValues()];
272      Instances[] filtered = computeSimpleStats(i, data, stats, classCounts);
273      m_Filtered.addElement(filtered);
274      m_SimpleStats.addElement(stats);
275      m_Distributions.addElement(classCounts);
276      data = filtered[1];  // Data not covered
277    }   
278  }
279   
280    /**
281     * Count data from the position index in the ruleset
282     * assuming that given data are not covered by the rules
283     * in position 0...(index-1), and the statistics of these
284     * rules are provided.<br>
285     * This procedure is typically useful when a temporary
286     * object of RuleStats is constructed in order to efficiently
287     * calculate the relative DL of rule in position index,
288     * thus all other stuff is not needed.
289     *
290     * @param index the given position
291     * @param uncovered the data not covered by rules before index
292     * @param prevRuleStats the provided stats of previous rules
293     */
294    public void countData(int index, Instances uncovered, 
295                          double[][] prevRuleStats){
296        if((m_Filtered != null) ||
297           (m_Ruleset == null))
298            return;
299       
300        int size = m_Ruleset.size();
301        m_Filtered = new FastVector(size);
302        m_SimpleStats = new FastVector(size);
303        Instances[] data = new Instances[2];
304        data[1] = uncovered;
305       
306        for(int i=0; i < index; i++){
307            m_SimpleStats.addElement(prevRuleStats[i]);
308            if(i+1 == index)
309                m_Filtered.addElement(data);
310            else
311                m_Filtered.addElement(new Object()); // Stuff sth.
312        }
313       
314        for(int j=index; j < size; j++){
315            double[] stats = new double[6];  // 6 statistics parameters
316            Instances[] filtered = computeSimpleStats(j, data[1], stats, null);
317            m_Filtered.addElement(filtered);
318            m_SimpleStats.addElement(stats);
319            data = filtered;  // Data not covered
320        }       
321    }
322   
323  /**
324   * Find all the instances in the dataset covered/not covered by
325   * the rule in given index, and the correponding simple statistics
326   * and predicted class distributions are stored in the given double array,
327   * which can be obtained by getSimpleStats() and getDistributions().<br>
328   *
329   * @param index the given index, assuming correct
330   * @param insts the dataset to be covered by the rule
331   * @param stats the given double array to hold stats, side-effected
332   * @param dist the given array to hold class distributions, side-effected
333   *             if null, the distribution is not necessary
334   * @return the instances covered and not covered by the rule
335   */
336  private Instances[] computeSimpleStats(int index, Instances insts, 
337                                         double[] stats, double[] dist){
338    Rule rule = (Rule)m_Ruleset.elementAt(index);
339       
340    Instances[] data = new Instances[2];
341    data[0] = new Instances(insts, insts.numInstances());
342    data[1] = new Instances(insts, insts.numInstances());
343
344    for(int i=0; i<insts.numInstances(); i++){
345      Instance datum = insts.instance(i);
346      double weight = datum.weight();
347      if(rule.covers(datum)){
348        data[0].add(datum);        // Covered by this rule
349        stats[0] += weight;        // Coverage
350        if((int)datum.classValue() == (int)rule.getConsequent())
351          stats[2] += weight;    // True positives
352        else
353          stats[4] += weight;    // False positives
354        if(dist != null)
355            dist[(int)datum.classValue()] += weight;
356      }
357      else{
358        data[1].add(datum);        // Not covered by this rule
359        stats[1] += weight; 
360        if((int)datum.classValue() != (int)rule.getConsequent())
361          stats[3] += weight;    // True negatives
362        else
363          stats[5] += weight;    // False negatives         
364      }
365    }
366   
367    return data;
368  }
369   
370   
371  /**
372   * Add a rule to the ruleset and update the stats
373   *
374   * @param lastRule the rule to be added
375   */
376  public void addAndUpdate(Rule lastRule){
377    if(m_Ruleset == null)
378      m_Ruleset = new FastVector();
379    m_Ruleset.addElement(lastRule);
380 
381    Instances data = (m_Filtered == null) ?
382      m_Data : ((Instances[])m_Filtered.lastElement())[1];
383    double[] stats = new double[6];
384    double[] classCounts = new double[m_Data.classAttribute().numValues()];
385    Instances[] filtered = 
386        computeSimpleStats(m_Ruleset.size()-1, data, stats, classCounts);
387   
388    if(m_Filtered == null)
389        m_Filtered = new FastVector(); 
390    m_Filtered.addElement(filtered);
391
392    if(m_SimpleStats == null)
393      m_SimpleStats = new FastVector(); 
394    m_SimpleStats.addElement(stats);
395
396    if(m_Distributions == null)
397        m_Distributions = new FastVector();
398    m_Distributions.addElement(classCounts);
399  }
400   
401   
402  /**
403   * Subset description length: <br>
404   * S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)
405   *
406   * Details see Quilan: "MDL and categorical theories (Continued)",ML95
407   *
408   * @param t the number of elements in a known set
409   * @param k the number of elements in a subset
410   * @param p the expected proportion of subset known by recipient
411   * @return the subset description length
412   */
413  public static double subsetDL(double t, double k, double p){
414    double rt = Utils.gr(p, 0.0) ? (- k*Utils.log2(p)) : 0.0;
415    rt -= (t-k)*Utils.log2(1-p);
416    return rt;
417  }
418   
419   
420  /**
421   * The description length of the theory for a given rule.  Computed as:<br>
422   *                 0.5* [||k||+ S(t, k, k/t)]<br>
423   * where k is the number of antecedents of the rule; t is the total
424   * possible antecedents that could appear in a rule; ||K|| is the
425   * universal prior for k , log2*(k) and S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)
426   * is the subset encoding length.<p>
427   *
428   * Details see Quilan: "MDL and categorical theories (Continued)",ML95
429   *
430   * @param index the index of the given rule (assuming correct)
431   * @return the theory DL, weighted if weight != 1.0
432   */
433  public double theoryDL(int index){
434       
435    double k = ((Rule)m_Ruleset.elementAt(index)).size();
436       
437    if(k == 0)
438      return 0.0;
439       
440    double tdl = Utils.log2(k);                     
441    if(k > 1)                           // Approximation
442      tdl += 2.0 * Utils.log2(tdl);   // of log2 star   
443    tdl += subsetDL(m_Total, k, k/m_Total);
444    //System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
445    return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
446  }
447     
448
449  /**
450   * The description length of data given the parameters of the data
451   * based on the ruleset. <p>
452   * Details see Quinlan: "MDL and categorical theories (Continued)",ML95<p>
453   *
454   * @param expFPOverErr expected FP/(FP+FN)
455   * @param cover coverage
456   * @param uncover uncoverage
457   * @param fp False Positive
458   * @param fn False Negative
459   * @return the description length
460   */
461  public static double dataDL(double expFPOverErr, double cover, 
462                              double uncover, double fp, double fn){
463    double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
464    double coverBits, uncoverBits; // What's the error?
465    double expErr;                 // Expected FP or FN
466
467    if(Utils.gr(cover, uncover)){
468      expErr = expFPOverErr*(fp+fn);
469      coverBits = subsetDL(cover, fp, expErr/cover);
470      uncoverBits = Utils.gr(uncover, 0.0) ? 
471        subsetDL(uncover, fn, fn/uncover) : 0.0;           
472    }
473    else{
474      expErr = (1.0-expFPOverErr)*(fp+fn);
475      coverBits = Utils.gr(cover, 0.0) ? 
476        subsetDL(cover, fp, fp/cover) : 0.0;
477      uncoverBits = subsetDL(uncover, fn, expErr/uncover);
478    }
479       
480    /*
481      System.err.println("!!!cover: " + cover + "|uncover" + uncover +
482      "|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
483      "|FPRate: "+expFPOverErr + "|expErr: "+expErr+
484      "|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
485    */
486    return (totalBits + coverBits + uncoverBits);
487  }
488
489   
490  /**
491   * Calculate the potential to decrease DL of the ruleset,
492   * i.e. the possible DL that could be decreased by deleting
493   * the rule whose index and simple statstics are given. 
494   * If there's no potentials (i.e. smOrEq 0 && error rate < 0.5),
495   * it returns NaN. <p>
496   *
497   * The way this procedure does is copied from original RIPPER
498   * implementation and is quite bizzare because it
499   * does not update the following rules' stats recursively
500   * any more when testing each rule, which means it assumes
501   * after deletion no data covered by the following rules (or
502   * regards the deleted rule as the last rule).  Reasonable
503   * assumption?<p>
504   *
505   * @param index the index of the rule in m_Ruleset to be deleted
506   * @param expFPOverErr expected FP/(FP+FN)
507   * @param rulesetStat the simple statistics of the ruleset, updated
508   *                    if the rule should be deleted
509   * @param ruleStat the simple statistics of the rule to be deleted
510   * @param checkErr whether check if error rate >= 0.5
511   * @return the potential DL that could be decreased
512   */
513  public double potential(int index, double expFPOverErr, 
514                          double[] rulesetStat, double[] ruleStat,
515                          boolean checkErr){
516    //System.out.println("!!!inside potential: ");
517    // Restore the stats if deleted
518    double pcov = rulesetStat[0] - ruleStat[0];
519    double puncov = rulesetStat[1] + ruleStat[0];
520    double pfp = rulesetStat[4] - ruleStat[4];
521    double pfn = rulesetStat[5] + ruleStat[2];
522
523    double dataDLWith = dataDL(expFPOverErr, rulesetStat[0], 
524                               rulesetStat[1], rulesetStat[4], 
525                               rulesetStat[5]);
526    double theoryDLWith = theoryDL(index);
527    double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);
528
529    double potential = dataDLWith + theoryDLWith - dataDLWithout;
530    double err = ruleStat[4] / ruleStat[0];
531    /*System.out.println("!!!"+dataDLWith +" | "+
532      theoryDLWith + " | "
533      +dataDLWithout+"|"+ruleStat[4] + " / " + ruleStat[0]);
534    */
535    boolean overErr = Utils.grOrEq(err, 0.5);
536    if(!checkErr)
537      overErr = false;
538       
539    if(Utils.grOrEq(potential, 0.0) || overErr){ 
540      // If deleted, update ruleset stats.  Other stats do not matter
541      rulesetStat[0] = pcov;
542      rulesetStat[1] = puncov;
543      rulesetStat[4] = pfp;
544      rulesetStat[5] = pfn;
545      return potential;
546    }
547    else
548      return Double.NaN;
549  }
550   
551   
552  /**
553   * Compute the minimal data description length of the ruleset
554   * if the rule in the given position is deleted.<br>
555   * The min_data_DL_if_deleted = data_DL_if_deleted - potential
556   *
557   * @param index the index of the rule in question
558   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
559   * @param checkErr whether check if error rate >= 0.5
560   * @return the minDataDL
561   */
562  public double minDataDLIfDeleted(int index, double expFPRate,
563                                   boolean checkErr){
564    //System.out.println("!!!Enter without: ");
565    double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
566    int more = m_Ruleset.size() - 1 - index; // How many rules after?
567    FastVector indexPlus = new FastVector(more); // Their stats
568       
569    // 0...(index-1) are OK     
570    for(int j=0; j<index; j++){
571      // Covered stats are cumulative
572      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
573      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
574      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
575    }
576       
577    // Recount data from index+1
578    Instances data = (index == 0) ? 
579      m_Data : ((Instances[])m_Filtered.elementAt(index-1))[1]; 
580    //System.out.println("!!!without: " + data.sumOfWeights());
581
582    for(int j=(index+1); j<m_Ruleset.size(); j++){
583      double[] stats = new double[6];
584      Instances[] split = computeSimpleStats(j, data, stats, null);
585      indexPlus.addElement(stats);
586      rulesetStat[0] += stats[0];
587      rulesetStat[2] += stats[2];
588      rulesetStat[4] += stats[4];         
589      data = split[1];
590    }
591    // Uncovered stats are those of the last rule
592    if(more > 0){
593      rulesetStat[1] = ((double[])indexPlus.lastElement())[1];
594      rulesetStat[3] = ((double[])indexPlus.lastElement())[3];
595      rulesetStat[5] = ((double[])indexPlus.lastElement())[5];
596    }
597    else if(index > 0){
598      rulesetStat[1] = 
599        ((double[])m_SimpleStats.elementAt(index-1))[1];
600      rulesetStat[3] =
601        ((double[])m_SimpleStats.elementAt(index-1))[3];
602      rulesetStat[5] = 
603        ((double[])m_SimpleStats.elementAt(index-1))[5];
604    }   
605    else{ // Null coverage
606      rulesetStat[1] = ((double[])m_SimpleStats.elementAt(0))[0] +
607        ((double[])m_SimpleStats.elementAt(0))[1];
608      rulesetStat[3] = ((double[])m_SimpleStats.elementAt(0))[3] +
609        ((double[])m_SimpleStats.elementAt(0))[4];
610      rulesetStat[5] = ((double[])m_SimpleStats.elementAt(0))[2] +
611        ((double[])m_SimpleStats.elementAt(0))[5];         
612    }
613       
614    // Potential
615    double potential = 0;
616    for(int k=index+1; k<m_Ruleset.size(); k++){
617      double[] ruleStat = (double[])indexPlus.elementAt(k-index-1);
618      double ifDeleted = potential(k, expFPRate, rulesetStat, 
619                                   ruleStat, checkErr);
620      if(!Double.isNaN(ifDeleted))
621        potential += ifDeleted;
622    }
623
624    // Data DL of the ruleset without the rule
625    // Note that ruleset stats has already been updated to reflect
626    // deletion if any potential               
627    double dataDLWithout = dataDL(expFPRate, rulesetStat[0], 
628                                  rulesetStat[1], rulesetStat[4], 
629                                  rulesetStat[5]);
630    //System.out.println("!!!without: "+dataDLWithout + " |potential: "+
631    //             potential);
632    // Why subtract potential again?  To reflect change of theory DL??
633    return (dataDLWithout - potential);
634  }   
635   
636   
637  /**
638   * Compute the minimal data description length of the ruleset
639   * if the rule in the given position is NOT deleted.<br>
640   * The min_data_DL_if_n_deleted = data_DL_if_n_deleted - potential
641   *
642   * @param index the index of the rule in question
643   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
644   * @param checkErr whether check if error rate >= 0.5
645   * @return the minDataDL
646   */
647  public double minDataDLIfExists(int index, double expFPRate,
648                                  boolean checkErr){
649    //  System.out.println("!!!Enter with: ");
650    double[] rulesetStat = new double[6]; // Stats of ruleset if rule exists
651    for(int j=0; j<m_SimpleStats.size(); j++){
652      // Covered stats are cumulative
653      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
654      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
655      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
656      if(j == m_SimpleStats.size()-1){ // Last rule
657        rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
658        rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
659        rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
660      }     
661    }
662       
663    // Potential
664    double potential = 0;
665    for(int k=index+1; k<m_SimpleStats.size(); k++){
666      double[] ruleStat = (double[])getSimpleStats(k);
667      double ifDeleted = potential(k, expFPRate, rulesetStat, 
668                                   ruleStat, checkErr);
669      if(!Double.isNaN(ifDeleted))
670        potential += ifDeleted;
671    }
672       
673    // Data DL of the ruleset without the rule
674    // Note that ruleset stats has already been updated to reflect deletion
675    // if any potential
676    double dataDLWith = dataDL(expFPRate, rulesetStat[0], 
677                               rulesetStat[1], rulesetStat[4], 
678                               rulesetStat[5]); 
679    //System.out.println("!!!with: "+dataDLWith + " |potential: "+
680    //             potential);
681    return (dataDLWith - potential);
682  }
683   
684   
685  /**
686   * The description length (DL) of the ruleset relative to if the
687   * rule in the given position is deleted, which is obtained by: <br>
688   * MDL if the rule exists - MDL if the rule does not exist <br>
689   * Note the minimal possible DL of the ruleset is calculated(i.e. some
690   * other rules may also be deleted) instead of the DL of the current
691   * ruleset.<p>
692   *
693   * @param index the given position of the rule in question
694   *              (assuming correct)
695   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
696   * @param checkErr whether check if error rate >= 0.5
697   * @return the relative DL
698   */
699  public double relativeDL(int index, double expFPRate, boolean checkErr){ 
700                 
701    return (minDataDLIfExists(index, expFPRate, checkErr) 
702            + theoryDL(index) - 
703            minDataDLIfDeleted(index, expFPRate, checkErr));
704  } 
705   
706   
707  /**
708   * Try to reduce the DL of the ruleset by testing removing the rules
709   * one by one in reverse order and update all the stats
710   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
711   * @param checkErr whether check if error rate >= 0.5
712   */
713  public void reduceDL(double expFPRate, boolean checkErr){
714       
715    boolean needUpdate = false;
716    double[] rulesetStat = new double[6];
717    for(int j=0; j<m_SimpleStats.size(); j++){
718      // Covered stats are cumulative
719      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
720      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
721      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
722      if(j == m_SimpleStats.size()-1){ // Last rule
723        rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
724        rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
725        rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
726      }     
727    }
728       
729    // Potential
730    for(int k=m_SimpleStats.size()-1; k>=0; k--){
731               
732      double[] ruleStat = (double[])m_SimpleStats.elementAt(k);
733
734      // rulesetStat updated
735      double ifDeleted = potential(k, expFPRate, rulesetStat, 
736                                   ruleStat, checkErr);
737      if(!Double.isNaN(ifDeleted)){ 
738        /*System.err.println("!!!deleted ("+k+"): save "+ifDeleted
739          +" | "+rulesetStat[0]
740          +" | "+rulesetStat[1]
741          +" | "+rulesetStat[4]
742          +" | "+rulesetStat[5]);
743        */
744       
745        if(k == (m_SimpleStats.size()-1))
746            removeLast();
747        else{
748            m_Ruleset.removeElementAt(k);
749            needUpdate = true;
750        }
751      }
752    }
753       
754    if(needUpdate){
755      m_Filtered = null;
756      m_SimpleStats = null;
757      countData();
758    }
759  }
760   
761  /**
762   * Remove the last rule in the ruleset as well as it's stats.
763   * It might be useful when the last rule was added for testing
764   * purpose and then the test failed
765   */
766  public void removeLast(){
767    int last = m_Ruleset.size()-1;
768    m_Ruleset.removeElementAt(last);
769    m_Filtered.removeElementAt(last);
770    m_SimpleStats.removeElementAt(last);       
771    if(m_Distributions != null)
772        m_Distributions.removeElementAt(last);
773  }
774
775  /**
776   * Static utility function to count the data covered by the
777   * rules after the given index in the given rules, and then
778   * remove them.  It returns the data not covered by the
779   * successive rules.
780   *
781   * @param data the data to be processed
782   * @param rules the ruleset
783   * @param index the given index
784   * @return the data after processing
785   */
786  public static Instances rmCoveredBySuccessives(Instances data, FastVector rules, int index){
787    Instances rt = new Instances(data, 0);
788
789    for(int i=0; i < data.numInstances(); i++){
790      Instance datum = data.instance(i);
791      boolean covered = false;     
792           
793      for(int j=index+1; j<rules.size();j++){
794        Rule rule = (Rule)rules.elementAt(j);
795        if(rule.covers(datum)){
796          covered = true;
797          break;
798        }
799      }
800
801      if(!covered)
802        rt.add(datum);
803    }   
804    return rt;
805  } 
806   
807  /**
808   * Stratify the given data into the given number of bags based on the class
809   * values.  It differs from the <code>Instances.stratify(int fold)</code>
810   * that before stratification it sorts the instances according to the
811   * class order in the header file.  It assumes no missing values in the class.
812   *
813   * @param data the given data
814   * @param folds the given number of folds
815   * @param rand the random object used to randomize the instances
816   * @return the stratified instances
817   */
818  public static final Instances stratify(Instances data, int folds, Random rand){
819    if(!data.classAttribute().isNominal())
820      return data;
821       
822    Instances result = new Instances(data, 0);
823    Instances[] bagsByClasses = new Instances[data.numClasses()];
824       
825    for(int i=0; i < bagsByClasses.length; i++)
826      bagsByClasses[i] = new Instances(data, 0);
827       
828    // Sort by class   
829    for(int j=0; j < data.numInstances(); j++){
830      Instance datum = data.instance(j);
831      bagsByClasses[(int)datum.classValue()].add(datum);
832    }
833       
834    // Randomize each class
835    for(int j=0; j < bagsByClasses.length; j++)
836      bagsByClasses[j].randomize(rand);
837       
838    for(int k=0; k < folds; k++){
839      int offset = k, bag = 0;
840    oneFold:
841      while (true){     
842        while(offset >= bagsByClasses[bag].numInstances()){
843          offset -= bagsByClasses[bag].numInstances();
844          if (++bag >= bagsByClasses.length)// Next bag
845            break oneFold;                 
846        }       
847           
848        result.add(bagsByClasses[bag].instance(offset));
849        offset += folds;                               
850      }
851    }
852       
853    return result;
854  }
855
856  /**
857   * Compute the combined DL of the ruleset in this class, i.e. theory
858   * DL and data DL.  Note this procedure computes the combined DL
859   * according to the current status of the ruleset in this class
860   *
861   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
862   * @param predicted the default classification if ruleset covers null
863   * @return the combined class
864   */
865  public double combinedDL(double expFPRate, double predicted){
866    double rt = 0;
867   
868    if(getRulesetSize() > 0) {
869      double[] stats = (double[])m_SimpleStats.lastElement();
870      for(int j=getRulesetSize()-2; j >= 0; j--){
871        stats[0] += getSimpleStats(j)[0];
872        stats[2] += getSimpleStats(j)[2];
873        stats[4] += getSimpleStats(j)[4];
874      }
875      rt += dataDL(expFPRate, stats[0], stats[1], 
876                   stats[4], stats[5]);     // Data DL     
877    }
878    else{ // Null coverage ruleset
879      double fn = 0.0;
880      for(int j=0; j < m_Data.numInstances(); j++)
881        if((int)m_Data.instance(j).classValue() == (int)predicted)
882          fn += m_Data.instance(j).weight();
883      rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);     
884    }     
885   
886    for(int i=0; i<getRulesetSize(); i++) // Theory DL
887      rt += theoryDL(i);     
888   
889    return rt;
890  }
891 
892  /**
893   * Patition the data into 2, first of which has (numFolds-1)/numFolds of
894   * the data and the second has 1/numFolds of the data
895   *
896   *
897   * @param data the given data
898   * @param numFolds the given number of folds
899   * @return the patitioned instances
900   */
901  public static final Instances[] partition(Instances data, int numFolds){
902    Instances[] rt = new Instances[2];
903    int splits = data.numInstances() * (numFolds - 1) / numFolds;
904   
905    rt[0] = new Instances(data, 0, splits);
906    rt[1] = new Instances(data, splits, data.numInstances()-splits);
907   
908    return rt;
909  } 
910 
911  /**
912   * Returns the revision string.
913   *
914   * @return            the revision
915   */
916  public String getRevision() {
917    return RevisionUtils.extract("$Revision: 4608 $");
918  }
919}
Note: See TracBrowser for help on using the repository browser.